From 849369d6c66d3054688672f97d31fceb8e8230fb Mon Sep 17 00:00:00 2001 From: root Date: Fri, 25 Dec 2015 04:40:36 +0000 Subject: initial_commit --- kernel/.gitignore | 6 + kernel/Kconfig.freezer | 2 + kernel/Kconfig.hz | 58 + kernel/Kconfig.locks | 202 + kernel/Kconfig.preempt | 54 + kernel/Makefile | 140 + kernel/acct.c | 665 +++ kernel/async.c | 301 + kernel/audit.c | 1508 +++++ kernel/audit.h | 170 + kernel/audit_tree.c | 957 ++++ kernel/audit_watch.c | 547 ++ kernel/auditfilter.c | 1383 +++++ kernel/auditsc.c | 2551 +++++++++ kernel/backtracetest.c | 91 + kernel/bounds.c | 21 + kernel/capability.c | 409 ++ kernel/cgroup.c | 5279 ++++++++++++++++++ kernel/cgroup_freezer.c | 397 ++ kernel/compat.c | 1193 ++++ kernel/configs.c | 99 + kernel/cpu.c | 690 +++ kernel/cpuset.c | 2615 +++++++++ kernel/crash_dump.c | 34 + kernel/cred.c | 863 +++ kernel/debug/Makefile | 6 + kernel/debug/debug_core.c | 971 ++++ kernel/debug/debug_core.h | 82 + kernel/debug/gdbstub.c | 1125 ++++ kernel/debug/kdb/.gitignore | 1 + kernel/debug/kdb/Makefile | 25 + kernel/debug/kdb/kdb_bp.c | 562 ++ kernel/debug/kdb/kdb_bt.c | 210 + kernel/debug/kdb/kdb_cmds | 35 + kernel/debug/kdb/kdb_debugger.c | 168 + kernel/debug/kdb/kdb_io.c | 826 +++ kernel/debug/kdb/kdb_keyboard.c | 212 + kernel/debug/kdb/kdb_main.c | 2937 ++++++++++ kernel/debug/kdb/kdb_private.h | 259 + kernel/debug/kdb/kdb_support.c | 927 ++++ kernel/delayacct.c | 184 + kernel/dma.c | 161 + kernel/elfcore.c | 28 + kernel/events/Makefile | 6 + kernel/events/core.c | 7430 +++++++++++++++++++++++++ kernel/events/hw_breakpoint.c | 659 +++ kernel/exec_domain.c | 195 + kernel/exit.c | 1875 +++++++ kernel/extable.c | 133 + kernel/fork.c | 1775 ++++++ kernel/freezer.c | 166 + kernel/futex.c | 2733 +++++++++ kernel/futex_compat.c | 200 + kernel/gcov/Kconfig | 49 + kernel/gcov/Makefile | 3 + kernel/gcov/base.c | 148 + kernel/gcov/fs.c | 790 +++ kernel/gcov/gcc_3_4.c | 447 ++ kernel/gcov/gcov.h | 128 + kernel/groups.c | 281 + kernel/hrtimer.c | 
1861 +++++++ kernel/hung_task.c | 223 + kernel/irq/Kconfig | 74 + kernel/irq/Makefile | 7 + kernel/irq/autoprobe.c | 185 + kernel/irq/chip.c | 700 +++ kernel/irq/debug.h | 45 + kernel/irq/devres.c | 94 + kernel/irq/dummychip.c | 59 + kernel/irq/generic-chip.c | 368 ++ kernel/irq/handle.c | 181 + kernel/irq/internals.h | 171 + kernel/irq/irqdesc.c | 466 ++ kernel/irq/manage.c | 1437 +++++ kernel/irq/migration.c | 81 + kernel/irq/pm.c | 131 + kernel/irq/proc.c | 486 ++ kernel/irq/resend.c | 80 + kernel/irq/settings.h | 142 + kernel/irq/spurious.c | 364 ++ kernel/irq_work.c | 166 + kernel/itimer.c | 298 + kernel/jump_label.c | 394 ++ kernel/kallsyms.c | 588 ++ kernel/kexec.c | 1569 ++++++ kernel/kfifo.c | 608 +++ kernel/kmod.c | 531 ++ kernel/kprobes.c | 2252 ++++++++ kernel/ksysfs.c | 221 + kernel/kthread.c | 443 ++ kernel/latencytop.c | 291 + kernel/lockdep.c | 4005 ++++++++++++++ kernel/lockdep_internals.h | 170 + kernel/lockdep_proc.c | 680 +++ kernel/lockdep_states.h | 9 + kernel/module.c | 3469 ++++++++++++ kernel/mutex-debug.c | 110 + kernel/mutex-debug.h | 55 + kernel/mutex.c | 500 ++ kernel/mutex.h | 48 + kernel/notifier.c | 586 ++ kernel/nsproxy.c | 280 + kernel/padata.c | 1135 ++++ kernel/panic.c | 464 ++ kernel/params.c | 924 ++++ kernel/pid.c | 570 ++ kernel/pid_namespace.c | 200 + kernel/pm_qos_params.c | 481 ++ kernel/posix-cpu-timers.c | 1632 ++++++ kernel/posix-timers.c | 1069 ++++ kernel/power/Kconfig | 342 ++ kernel/power/Makefile | 23 + kernel/power/block_io.c | 103 + kernel/power/console.c | 35 + kernel/power/consoleearlysuspend.c | 78 + kernel/power/cpufreq_earlysuspend.c | 67 + kernel/power/cpufreq_governor_chg.c | 138 + kernel/power/cpuhotplug_earlysuspend.c | 111 + kernel/power/earlysuspend.c | 187 + kernel/power/fbearlysuspend.c | 153 + kernel/power/hibernate.c | 1067 ++++ kernel/power/main.c | 453 ++ kernel/power/power.h | 271 + kernel/power/poweroff.c | 46 + kernel/power/process.c | 206 + kernel/power/snapshot.c | 2325 ++++++++ 
kernel/power/suspend.c | 335 ++ kernel/power/suspend_test.c | 188 + kernel/power/suspend_time.c | 111 + kernel/power/swap.c | 989 ++++ kernel/power/user.c | 486 ++ kernel/power/userwakelock.c | 219 + kernel/power/wakelock.c | 634 +++ kernel/printk.c | 1794 ++++++ kernel/profile.c | 631 +++ kernel/ptrace.c | 942 ++++ kernel/range.c | 159 + kernel/rcupdate.c | 294 + kernel/rcutiny.c | 324 ++ kernel/rcutiny_plugin.h | 1007 ++++ kernel/rcutorture.c | 1633 ++++++ kernel/rcutree.c | 2098 +++++++ kernel/rcutree.h | 470 ++ kernel/rcutree_plugin.h | 2010 +++++++ kernel/rcutree_trace.c | 515 ++ kernel/relay.c | 1365 +++++ kernel/res_counter.c | 191 + kernel/resource.c | 1132 ++++ kernel/rtmutex-debug.c | 238 + kernel/rtmutex-debug.h | 33 + kernel/rtmutex-tester.c | 417 ++ kernel/rtmutex.c | 1046 ++++ kernel/rtmutex.h | 26 + kernel/rtmutex_common.h | 126 + kernel/rwsem.c | 148 + kernel/sched.c | 9414 ++++++++++++++++++++++++++++++++ kernel/sched_autogroup.c | 275 + kernel/sched_autogroup.h | 41 + kernel/sched_clock.c | 350 ++ kernel/sched_cpupri.c | 204 + kernel/sched_cpupri.h | 37 + kernel/sched_debug.c | 508 ++ kernel/sched_fair.c | 4334 +++++++++++++++ kernel/sched_features.h | 74 + kernel/sched_idletask.c | 97 + kernel/sched_rt.c | 1859 +++++++ kernel/sched_stats.h | 336 ++ kernel/sched_stoptask.c | 104 + kernel/seccomp.c | 86 + kernel/semaphore.c | 263 + kernel/signal.c | 3120 +++++++++++ kernel/smp.c | 703 +++ kernel/softirq.c | 933 ++++ kernel/spinlock.c | 385 ++ kernel/srcu.c | 315 ++ kernel/stacktrace.c | 37 + kernel/stop_machine.c | 490 ++ kernel/sys.c | 1878 +++++++ kernel/sys_ni.c | 202 + kernel/sysctl.c | 3006 ++++++++++ kernel/sysctl_binary.c | 1519 ++++++ kernel/sysctl_check.c | 160 + kernel/taskstats.c | 711 +++ kernel/test_kprobes.c | 414 ++ kernel/time.c | 711 +++ kernel/time/Kconfig | 29 + kernel/time/Makefile | 9 + kernel/time/alarmtimer.c | 728 +++ kernel/time/clockevents.c | 341 ++ kernel/time/clocksource.c | 916 ++++ kernel/time/jiffies.c | 97 + 
kernel/time/ntp.c | 972 ++++ kernel/time/posix-clock.c | 445 ++ kernel/time/tick-broadcast.c | 701 +++ kernel/time/tick-common.c | 419 ++ kernel/time/tick-internal.h | 146 + kernel/time/tick-oneshot.c | 185 + kernel/time/tick-sched.c | 861 +++ kernel/time/timecompare.c | 193 + kernel/time/timeconv.c | 127 + kernel/time/timekeeping.c | 1137 ++++ kernel/time/timer_list.c | 301 + kernel/time/timer_stats.c | 425 ++ kernel/timeconst.pl | 378 ++ kernel/timer.c | 1792 ++++++ kernel/trace/Kconfig | 493 ++ kernel/trace/Makefile | 60 + kernel/trace/blktrace.c | 1813 ++++++ kernel/trace/ftrace.c | 4214 ++++++++++++++ kernel/trace/power-traces.c | 20 + kernel/trace/ring_buffer.c | 4067 ++++++++++++++ kernel/trace/ring_buffer_benchmark.c | 488 ++ kernel/trace/trace.c | 4669 ++++++++++++++++ kernel/trace/trace.h | 803 +++ kernel/trace/trace_branch.c | 411 ++ kernel/trace/trace_clock.c | 115 + kernel/trace/trace_entries.h | 284 + kernel/trace/trace_event_perf.c | 216 + kernel/trace/trace_events.c | 1764 ++++++ kernel/trace/trace_events_filter.c | 2014 +++++++ kernel/trace/trace_export.c | 173 + kernel/trace/trace_functions.c | 406 ++ kernel/trace/trace_functions_graph.c | 1479 +++++ kernel/trace/trace_irqsoff.c | 687 +++ kernel/trace/trace_kdb.c | 135 + kernel/trace/trace_kprobe.c | 1951 +++++++ kernel/trace/trace_mmiotrace.c | 374 ++ kernel/trace/trace_nop.c | 101 + kernel/trace/trace_output.c | 1308 +++++ kernel/trace/trace_output.h | 53 + kernel/trace/trace_printk.c | 344 ++ kernel/trace/trace_sched_switch.c | 249 + kernel/trace/trace_sched_wakeup.c | 626 +++ kernel/trace/trace_selftest.c | 931 ++++ kernel/trace/trace_selftest_dynamic.c | 13 + kernel/trace/trace_stack.c | 376 ++ kernel/trace/trace_stat.c | 388 ++ kernel/trace/trace_stat.h | 33 + kernel/trace/trace_syscalls.c | 690 +++ kernel/trace/trace_workqueue.c | 300 + kernel/tracepoint.c | 640 +++ kernel/tsacct.c | 154 + kernel/uid16.c | 230 + kernel/up.c | 21 + kernel/user-return-notifier.c | 44 + kernel/user.c | 200 + 
kernel/user_namespace.c | 137 + kernel/utsname.c | 120 + kernel/utsname_sysctl.c | 114 + kernel/wait.c | 289 + kernel/watchdog.c | 590 ++ kernel/workqueue.c | 3813 +++++++++++++ kernel/workqueue_sched.h | 9 + 253 files changed, 183218 insertions(+) create mode 100644 kernel/.gitignore create mode 100644 kernel/Kconfig.freezer create mode 100644 kernel/Kconfig.hz create mode 100644 kernel/Kconfig.locks create mode 100644 kernel/Kconfig.preempt create mode 100644 kernel/Makefile create mode 100644 kernel/acct.c create mode 100644 kernel/async.c create mode 100644 kernel/audit.c create mode 100644 kernel/audit.h create mode 100644 kernel/audit_tree.c create mode 100644 kernel/audit_watch.c create mode 100644 kernel/auditfilter.c create mode 100644 kernel/auditsc.c create mode 100644 kernel/backtracetest.c create mode 100644 kernel/bounds.c create mode 100644 kernel/capability.c create mode 100644 kernel/cgroup.c create mode 100644 kernel/cgroup_freezer.c create mode 100644 kernel/compat.c create mode 100644 kernel/configs.c create mode 100644 kernel/cpu.c create mode 100644 kernel/cpuset.c create mode 100644 kernel/crash_dump.c create mode 100644 kernel/cred.c create mode 100644 kernel/debug/Makefile create mode 100644 kernel/debug/debug_core.c create mode 100644 kernel/debug/debug_core.h create mode 100644 kernel/debug/gdbstub.c create mode 100644 kernel/debug/kdb/.gitignore create mode 100644 kernel/debug/kdb/Makefile create mode 100644 kernel/debug/kdb/kdb_bp.c create mode 100644 kernel/debug/kdb/kdb_bt.c create mode 100644 kernel/debug/kdb/kdb_cmds create mode 100644 kernel/debug/kdb/kdb_debugger.c create mode 100644 kernel/debug/kdb/kdb_io.c create mode 100644 kernel/debug/kdb/kdb_keyboard.c create mode 100644 kernel/debug/kdb/kdb_main.c create mode 100644 kernel/debug/kdb/kdb_private.h create mode 100644 kernel/debug/kdb/kdb_support.c create mode 100644 kernel/delayacct.c create mode 100644 kernel/dma.c create mode 100644 kernel/elfcore.c create mode 100644 
kernel/events/Makefile create mode 100644 kernel/events/core.c create mode 100644 kernel/events/hw_breakpoint.c create mode 100644 kernel/exec_domain.c create mode 100644 kernel/exit.c create mode 100644 kernel/extable.c create mode 100644 kernel/fork.c create mode 100644 kernel/freezer.c create mode 100644 kernel/futex.c create mode 100644 kernel/futex_compat.c create mode 100644 kernel/gcov/Kconfig create mode 100644 kernel/gcov/Makefile create mode 100644 kernel/gcov/base.c create mode 100644 kernel/gcov/fs.c create mode 100644 kernel/gcov/gcc_3_4.c create mode 100644 kernel/gcov/gcov.h create mode 100644 kernel/groups.c create mode 100644 kernel/hrtimer.c create mode 100644 kernel/hung_task.c create mode 100644 kernel/irq/Kconfig create mode 100644 kernel/irq/Makefile create mode 100644 kernel/irq/autoprobe.c create mode 100644 kernel/irq/chip.c create mode 100644 kernel/irq/debug.h create mode 100644 kernel/irq/devres.c create mode 100644 kernel/irq/dummychip.c create mode 100644 kernel/irq/generic-chip.c create mode 100644 kernel/irq/handle.c create mode 100644 kernel/irq/internals.h create mode 100644 kernel/irq/irqdesc.c create mode 100644 kernel/irq/manage.c create mode 100644 kernel/irq/migration.c create mode 100644 kernel/irq/pm.c create mode 100644 kernel/irq/proc.c create mode 100644 kernel/irq/resend.c create mode 100644 kernel/irq/settings.h create mode 100644 kernel/irq/spurious.c create mode 100644 kernel/irq_work.c create mode 100644 kernel/itimer.c create mode 100644 kernel/jump_label.c create mode 100644 kernel/kallsyms.c create mode 100644 kernel/kexec.c create mode 100644 kernel/kfifo.c create mode 100644 kernel/kmod.c create mode 100644 kernel/kprobes.c create mode 100644 kernel/ksysfs.c create mode 100644 kernel/kthread.c create mode 100644 kernel/latencytop.c create mode 100644 kernel/lockdep.c create mode 100644 kernel/lockdep_internals.h create mode 100644 kernel/lockdep_proc.c create mode 100644 kernel/lockdep_states.h create mode 
100644 kernel/module.c create mode 100644 kernel/mutex-debug.c create mode 100644 kernel/mutex-debug.h create mode 100644 kernel/mutex.c create mode 100644 kernel/mutex.h create mode 100644 kernel/notifier.c create mode 100644 kernel/nsproxy.c create mode 100644 kernel/padata.c create mode 100644 kernel/panic.c create mode 100644 kernel/params.c create mode 100644 kernel/pid.c create mode 100644 kernel/pid_namespace.c create mode 100644 kernel/pm_qos_params.c create mode 100644 kernel/posix-cpu-timers.c create mode 100644 kernel/posix-timers.c create mode 100644 kernel/power/Kconfig create mode 100644 kernel/power/Makefile create mode 100644 kernel/power/block_io.c create mode 100644 kernel/power/console.c create mode 100644 kernel/power/consoleearlysuspend.c create mode 100644 kernel/power/cpufreq_earlysuspend.c create mode 100644 kernel/power/cpufreq_governor_chg.c create mode 100644 kernel/power/cpuhotplug_earlysuspend.c create mode 100644 kernel/power/earlysuspend.c create mode 100644 kernel/power/fbearlysuspend.c create mode 100644 kernel/power/hibernate.c create mode 100644 kernel/power/main.c create mode 100644 kernel/power/power.h create mode 100644 kernel/power/poweroff.c create mode 100644 kernel/power/process.c create mode 100644 kernel/power/snapshot.c create mode 100644 kernel/power/suspend.c create mode 100644 kernel/power/suspend_test.c create mode 100644 kernel/power/suspend_time.c create mode 100644 kernel/power/swap.c create mode 100644 kernel/power/user.c create mode 100644 kernel/power/userwakelock.c create mode 100644 kernel/power/wakelock.c create mode 100644 kernel/printk.c create mode 100644 kernel/profile.c create mode 100644 kernel/ptrace.c create mode 100644 kernel/range.c create mode 100644 kernel/rcupdate.c create mode 100644 kernel/rcutiny.c create mode 100644 kernel/rcutiny_plugin.h create mode 100644 kernel/rcutorture.c create mode 100644 kernel/rcutree.c create mode 100644 kernel/rcutree.h create mode 100644 kernel/rcutree_plugin.h 
create mode 100644 kernel/rcutree_trace.c create mode 100644 kernel/relay.c create mode 100644 kernel/res_counter.c create mode 100644 kernel/resource.c create mode 100644 kernel/rtmutex-debug.c create mode 100644 kernel/rtmutex-debug.h create mode 100644 kernel/rtmutex-tester.c create mode 100644 kernel/rtmutex.c create mode 100644 kernel/rtmutex.h create mode 100644 kernel/rtmutex_common.h create mode 100644 kernel/rwsem.c create mode 100644 kernel/sched.c create mode 100644 kernel/sched_autogroup.c create mode 100644 kernel/sched_autogroup.h create mode 100644 kernel/sched_clock.c create mode 100644 kernel/sched_cpupri.c create mode 100644 kernel/sched_cpupri.h create mode 100644 kernel/sched_debug.c create mode 100644 kernel/sched_fair.c create mode 100644 kernel/sched_features.h create mode 100644 kernel/sched_idletask.c create mode 100644 kernel/sched_rt.c create mode 100644 kernel/sched_stats.h create mode 100644 kernel/sched_stoptask.c create mode 100644 kernel/seccomp.c create mode 100644 kernel/semaphore.c create mode 100644 kernel/signal.c create mode 100644 kernel/smp.c create mode 100644 kernel/softirq.c create mode 100644 kernel/spinlock.c create mode 100644 kernel/srcu.c create mode 100644 kernel/stacktrace.c create mode 100644 kernel/stop_machine.c create mode 100644 kernel/sys.c create mode 100644 kernel/sys_ni.c create mode 100644 kernel/sysctl.c create mode 100644 kernel/sysctl_binary.c create mode 100644 kernel/sysctl_check.c create mode 100644 kernel/taskstats.c create mode 100644 kernel/test_kprobes.c create mode 100644 kernel/time.c create mode 100644 kernel/time/Kconfig create mode 100644 kernel/time/Makefile create mode 100644 kernel/time/alarmtimer.c create mode 100644 kernel/time/clockevents.c create mode 100644 kernel/time/clocksource.c create mode 100644 kernel/time/jiffies.c create mode 100644 kernel/time/ntp.c create mode 100644 kernel/time/posix-clock.c create mode 100644 kernel/time/tick-broadcast.c create mode 100644 
kernel/time/tick-common.c create mode 100644 kernel/time/tick-internal.h create mode 100644 kernel/time/tick-oneshot.c create mode 100644 kernel/time/tick-sched.c create mode 100644 kernel/time/timecompare.c create mode 100644 kernel/time/timeconv.c create mode 100644 kernel/time/timekeeping.c create mode 100644 kernel/time/timer_list.c create mode 100644 kernel/time/timer_stats.c create mode 100644 kernel/timeconst.pl create mode 100644 kernel/timer.c create mode 100644 kernel/trace/Kconfig create mode 100644 kernel/trace/Makefile create mode 100644 kernel/trace/blktrace.c create mode 100644 kernel/trace/ftrace.c create mode 100644 kernel/trace/power-traces.c create mode 100644 kernel/trace/ring_buffer.c create mode 100644 kernel/trace/ring_buffer_benchmark.c create mode 100644 kernel/trace/trace.c create mode 100644 kernel/trace/trace.h create mode 100644 kernel/trace/trace_branch.c create mode 100644 kernel/trace/trace_clock.c create mode 100644 kernel/trace/trace_entries.h create mode 100644 kernel/trace/trace_event_perf.c create mode 100644 kernel/trace/trace_events.c create mode 100644 kernel/trace/trace_events_filter.c create mode 100644 kernel/trace/trace_export.c create mode 100644 kernel/trace/trace_functions.c create mode 100644 kernel/trace/trace_functions_graph.c create mode 100644 kernel/trace/trace_irqsoff.c create mode 100644 kernel/trace/trace_kdb.c create mode 100644 kernel/trace/trace_kprobe.c create mode 100644 kernel/trace/trace_mmiotrace.c create mode 100644 kernel/trace/trace_nop.c create mode 100644 kernel/trace/trace_output.c create mode 100644 kernel/trace/trace_output.h create mode 100644 kernel/trace/trace_printk.c create mode 100644 kernel/trace/trace_sched_switch.c create mode 100644 kernel/trace/trace_sched_wakeup.c create mode 100644 kernel/trace/trace_selftest.c create mode 100644 kernel/trace/trace_selftest_dynamic.c create mode 100644 kernel/trace/trace_stack.c create mode 100644 kernel/trace/trace_stat.c create mode 100644 
kernel/trace/trace_stat.h create mode 100644 kernel/trace/trace_syscalls.c create mode 100644 kernel/trace/trace_workqueue.c create mode 100644 kernel/tracepoint.c create mode 100644 kernel/tsacct.c create mode 100644 kernel/uid16.c create mode 100644 kernel/up.c create mode 100644 kernel/user-return-notifier.c create mode 100644 kernel/user.c create mode 100644 kernel/user_namespace.c create mode 100644 kernel/utsname.c create mode 100644 kernel/utsname_sysctl.c create mode 100644 kernel/wait.c create mode 100644 kernel/watchdog.c create mode 100644 kernel/workqueue.c create mode 100644 kernel/workqueue_sched.h (limited to 'kernel') diff --git a/kernel/.gitignore b/kernel/.gitignore new file mode 100644 index 00000000..ab4f1090 --- /dev/null +++ b/kernel/.gitignore @@ -0,0 +1,6 @@ +# +# Generated files +# +config_data.h +config_data.gz +timeconst.h diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer new file mode 100644 index 00000000..a3bb4cb5 --- /dev/null +++ b/kernel/Kconfig.freezer @@ -0,0 +1,2 @@ +config FREEZER + def_bool PM_SLEEP || CGROUP_FREEZER diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz new file mode 100644 index 00000000..94fabd53 --- /dev/null +++ b/kernel/Kconfig.hz @@ -0,0 +1,58 @@ +# +# Timer Interrupt Frequency Configuration +# + +choice + prompt "Timer frequency" + default HZ_250 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more + beneficial for servers and NUMA systems that do not need to have + a fast response for user interaction and that may experience bus + contention and cacheline bounces as a result of timer interrupts. + Note that the timer interrupt occurs on each processor in an SMP + environment leading to NR_CPUS * HZ number of timer interrupts + per second. 
+ + + config HZ_100 + bool "100 HZ" + help + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + + config HZ_250 + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance + while also showing good interactive responsiveness even + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, select 300Hz instead. + + config HZ_300 + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance + while also showing good interactive responsiveness even + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + + config HZ_1000 + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other + systems requiring fast interactive responses to events. + +endchoice + +config HZ + int + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 + default 1000 if HZ_1000 + +config SCHED_HRTICK + def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks new file mode 100644 index 00000000..5068e2a4 --- /dev/null +++ b/kernel/Kconfig.locks @@ -0,0 +1,202 @@ +# +# The ARCH_INLINE foo is necessary because select ignores "depends on" +# +config ARCH_INLINE_SPIN_TRYLOCK + bool + +config ARCH_INLINE_SPIN_TRYLOCK_BH + bool + +config ARCH_INLINE_SPIN_LOCK + bool + +config ARCH_INLINE_SPIN_LOCK_BH + bool + +config ARCH_INLINE_SPIN_LOCK_IRQ + bool + +config ARCH_INLINE_SPIN_LOCK_IRQSAVE + bool + +config ARCH_INLINE_SPIN_UNLOCK + bool + +config ARCH_INLINE_SPIN_UNLOCK_BH + bool + +config ARCH_INLINE_SPIN_UNLOCK_IRQ + bool + +config ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + bool + + +config ARCH_INLINE_READ_TRYLOCK + bool + +config ARCH_INLINE_READ_LOCK + bool + +config ARCH_INLINE_READ_LOCK_BH + bool + +config ARCH_INLINE_READ_LOCK_IRQ + bool + +config 
ARCH_INLINE_READ_LOCK_IRQSAVE + bool + +config ARCH_INLINE_READ_UNLOCK + bool + +config ARCH_INLINE_READ_UNLOCK_BH + bool + +config ARCH_INLINE_READ_UNLOCK_IRQ + bool + +config ARCH_INLINE_READ_UNLOCK_IRQRESTORE + bool + + +config ARCH_INLINE_WRITE_TRYLOCK + bool + +config ARCH_INLINE_WRITE_LOCK + bool + +config ARCH_INLINE_WRITE_LOCK_BH + bool + +config ARCH_INLINE_WRITE_LOCK_IRQ + bool + +config ARCH_INLINE_WRITE_LOCK_IRQSAVE + bool + +config ARCH_INLINE_WRITE_UNLOCK + bool + +config ARCH_INLINE_WRITE_UNLOCK_BH + bool + +config ARCH_INLINE_WRITE_UNLOCK_IRQ + bool + +config ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + bool + +# +# lock_* functions are inlined when: +# - DEBUG_SPINLOCK=n and GENERIC_LOCKBREAK=n and ARCH_INLINE_*LOCK=y +# +# trylock_* functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# +# unlock and unlock_irq functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# or +# - DEBUG_SPINLOCK=n and PREEMPT=n +# +# unlock_bh and unlock_irqrestore functions are inlined when: +# - DEBUG_SPINLOCK=n and ARCH_INLINE_*LOCK=y +# + +config INLINE_SPIN_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK + +config INLINE_SPIN_TRYLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_TRYLOCK_BH + +config INLINE_SPIN_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_SPIN_LOCK + +config INLINE_SPIN_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_BH + +config INLINE_SPIN_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_IRQ + +config INLINE_SPIN_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_SPIN_LOCK_IRQSAVE + +config INLINE_SPIN_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK) + +config INLINE_SPIN_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_BH + +config INLINE_SPIN_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_SPIN_UNLOCK_BH) 
+ +config INLINE_SPIN_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_SPIN_UNLOCK_IRQRESTORE + + +config INLINE_READ_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_TRYLOCK + +config INLINE_READ_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_READ_LOCK + +config INLINE_READ_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_BH + +config INLINE_READ_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_IRQ + +config INLINE_READ_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_READ_LOCK_IRQSAVE + +config INLINE_READ_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK) + +config INLINE_READ_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_BH + +config INLINE_READ_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_READ_UNLOCK_BH) + +config INLINE_READ_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_READ_UNLOCK_IRQRESTORE + + +config INLINE_WRITE_TRYLOCK + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_TRYLOCK + +config INLINE_WRITE_LOCK + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && ARCH_INLINE_WRITE_LOCK + +config INLINE_WRITE_LOCK_BH + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_BH + +config INLINE_WRITE_LOCK_IRQ + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_IRQ + +config INLINE_WRITE_LOCK_IRQSAVE + def_bool !DEBUG_SPINLOCK && !GENERIC_LOCKBREAK && \ + ARCH_INLINE_WRITE_LOCK_IRQSAVE + +config INLINE_WRITE_UNLOCK + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK) + +config INLINE_WRITE_UNLOCK_BH + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_BH + +config INLINE_WRITE_UNLOCK_IRQ + def_bool !DEBUG_SPINLOCK && (!PREEMPT || ARCH_INLINE_WRITE_UNLOCK_BH) + +config INLINE_WRITE_UNLOCK_IRQRESTORE + def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE + +config 
MUTEX_SPIN_ON_OWNER + def_bool SMP && !DEBUG_MUTEXES diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt new file mode 100644 index 00000000..bf987b95 --- /dev/null +++ b/kernel/Kconfig.preempt @@ -0,0 +1,54 @@ + +choice + prompt "Preemption Model" + default PREEMPT_NONE + +config PREEMPT_NONE + bool "No Forced Preemption (Server)" + help + This is the traditional Linux preemption model, geared towards + throughput. It will still provide good latencies most of the + time, but there are no guarantees and occasional longer delays + are possible. + + Select this option if you are building a kernel for a server or + scientific/computation system, or if you want to maximize the + raw processing power of the kernel, irrespective of scheduling + latencies. + +config PREEMPT_VOLUNTARY + bool "Voluntary Kernel Preemption (Desktop)" + help + This option reduces the latency of the kernel by adding more + "explicit preemption points" to the kernel code. These new + preemption points have been selected to reduce the maximum + latency of rescheduling, providing faster application reactions, + at the cost of slightly lower throughput. + + This allows reaction to interactive events by allowing a + low priority process to voluntarily preempt itself even if it + is in kernel mode executing a system call. This allows + applications to run more 'smoothly' even when the system is + under load. + + Select this if you are building a kernel for a desktop system. + +config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" + help + This option reduces the latency of the kernel by making + all kernel code (that is not executing in a critical section) + preemptible. This allows reaction to interactive events by + permitting a low priority process to be preempted involuntarily + even if it is in kernel mode executing a system call and would + otherwise not be about to reach a natural preemption point. 
+ This allows applications to run more 'smoothly' even when the + system is under load, at the cost of slightly lower throughput + and a slight runtime overhead to kernel code. + + Select this if you are building a kernel for a desktop or + embedded system with latency requirements in the milliseconds + range. + +endchoice + diff --git a/kernel/Makefile b/kernel/Makefile new file mode 100644 index 00000000..2d64cfcc --- /dev/null +++ b/kernel/Makefile @@ -0,0 +1,140 @@ +# +# Makefile for the linux kernel. +# + +obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ + cpu.o exit.o itimer.o time.o softirq.o resource.o \ + sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ + signal.o sys.o kmod.o workqueue.o pid.o \ + rcupdate.o extable.o params.o posix-timers.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ + hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ + notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ + async.o range.o jump_label.o +obj-y += groups.o + +ifdef CONFIG_FUNCTION_TRACER +# Do not trace debug files and internal ftrace files +CFLAGS_REMOVE_lockdep.o = -pg +CFLAGS_REMOVE_lockdep_proc.o = -pg +CFLAGS_REMOVE_mutex-debug.o = -pg +CFLAGS_REMOVE_rtmutex-debug.o = -pg +CFLAGS_REMOVE_cgroup-debug.o = -pg +CFLAGS_REMOVE_sched_clock.o = -pg +CFLAGS_REMOVE_irq_work.o = -pg +endif + +obj-$(CONFIG_FREEZER) += freezer.o +obj-$(CONFIG_PROFILING) += profile.o +obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o +obj-$(CONFIG_STACKTRACE) += stacktrace.o +obj-y += time/ +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +obj-$(CONFIG_LOCKDEP) += lockdep.o +ifeq ($(CONFIG_PROC_FS),y) +obj-$(CONFIG_LOCKDEP) += lockdep_proc.o +endif +obj-$(CONFIG_FUTEX) += futex.o +ifeq ($(CONFIG_COMPAT),y) +obj-$(CONFIG_FUTEX) += futex_compat.o +endif +obj-$(CONFIG_RT_MUTEXES) += rtmutex.o +obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o +obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o +obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o +obj-$(CONFIG_SMP) 
+= smp.o +ifneq ($(CONFIG_SMP),y) +obj-y += up.o +endif +obj-$(CONFIG_SMP) += spinlock.o +obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o +obj-$(CONFIG_PROVE_LOCKING) += spinlock.o +obj-$(CONFIG_UID16) += uid16.o +obj-$(CONFIG_MODULES) += module.o +obj-$(CONFIG_KALLSYMS) += kallsyms.o +obj-$(CONFIG_PM) += power/ +obj-$(CONFIG_FREEZER) += power/ +obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_KEXEC) += kexec.o +obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o +obj-$(CONFIG_COMPAT) += compat.o +obj-$(CONFIG_CGROUPS) += cgroup.o +obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o +obj-$(CONFIG_CPUSETS) += cpuset.o +obj-$(CONFIG_UTS_NS) += utsname.o +obj-$(CONFIG_USER_NS) += user_namespace.o +obj-$(CONFIG_PID_NS) += pid_namespace.o +obj-$(CONFIG_IKCONFIG) += configs.o +obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o +obj-$(CONFIG_SMP) += stop_machine.o +obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o +obj-$(CONFIG_AUDIT) += audit.o auditfilter.o +obj-$(CONFIG_AUDITSYSCALL) += auditsc.o +obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o +obj-$(CONFIG_AUDIT_TREE) += audit_tree.o +obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_KGDB) += debug/ +obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o +obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o +obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +obj-$(CONFIG_SECCOMP) += seccomp.o +obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o +obj-$(CONFIG_TREE_RCU) += rcutree.o +obj-$(CONFIG_TREE_PREEMPT_RCU) += rcutree.o +obj-$(CONFIG_TREE_RCU_TRACE) += rcutree_trace.o +obj-$(CONFIG_TINY_RCU) += rcutiny.o +obj-$(CONFIG_TINY_PREEMPT_RCU) += rcutiny.o +obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_SYSCTL) += utsname_sysctl.o +obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o +obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o +obj-$(CONFIG_TRACEPOINTS) += tracepoint.o +obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_BINFMT_ELF_FDPIC) += 
elfcore.o +obj-$(CONFIG_FUNCTION_TRACER) += trace/ +obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_X86_DS) += trace/ +obj-$(CONFIG_RING_BUFFER) += trace/ +obj-$(CONFIG_TRACEPOINTS) += trace/ +obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o + +obj-$(CONFIG_PERF_EVENTS) += events/ + +obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o +obj-$(CONFIG_PADATA) += padata.o +obj-$(CONFIG_CRASH_DUMP) += crash_dump.o + +ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) +# According to Alan Modra , the -fno-omit-frame-pointer is +# needed for x86 only. Why this used to be enabled for all architectures is beyond +# me. I suspect most platforms don't need this, but until we know that for sure +# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k +# to get a correct value for the wait-channel (WCHAN in ps). --davidm +CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer +endif + +$(obj)/configs.o: $(obj)/config_data.h + +# config_data.h contains the same information as ikconfig.h but gzipped. 
+# Info from config_data can be extracted from /proc/config* +targets += config_data.gz +$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE + $(call if_changed,gzip) + +quiet_cmd_ikconfiggz = IKCFG $@ + cmd_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") > $@ +targets += config_data.h +$(obj)/config_data.h: $(obj)/config_data.gz FORCE + $(call if_changed,ikconfiggz) + +$(obj)/time.o: $(obj)/timeconst.h + +quiet_cmd_timeconst = TIMEC $@ + cmd_timeconst = $(PERL) $< $(CONFIG_HZ) > $@ +targets += timeconst.h +$(obj)/timeconst.h: $(src)/timeconst.pl FORCE + $(call if_changed,timeconst) diff --git a/kernel/acct.c b/kernel/acct.c new file mode 100644 index 00000000..fa7eb3de --- /dev/null +++ b/kernel/acct.c @@ -0,0 +1,665 @@ +/* + * linux/kernel/acct.c + * + * BSD Process Accounting for Linux + * + * Author: Marco van Wieringen + * + * Some code based on ideas and code from: + * Thomas K. Dyas + * + * This file implements BSD-style process accounting. Whenever any + * process exits, an accounting record of type "struct acct" is + * written to the file specified with the acct() system call. It is + * up to user-level programs to do useful things with the accounting + * log. The kernel just provides the raw accounting information. + * + * (C) Copyright 1995 - 1997 Marco van Wieringen - ELM Consultancy B.V. + * + * Plugged two leaks. 1) It didn't return acct_file into the free_filps if + * the file happened to be read-only. 2) If the accounting was suspended + * due to the lack of space it happily allowed to reopen it and completely + * lost the old acct_file. 3/10/98, Al Viro. + * + * Now we silently close acct_file on attempt to reopen. Cleaned sys_acct(). + * XTerms and EMACS are manifestations of pure evil. 21/10/98, AV. + * + * Fixed a nasty interaction with sys_umount(). If the accounting + * was suspended we failed to stop it on umount(). Messy. 
+ * Another one: remount to readonly didn't stop accounting. + * Question: what should we do if we have CAP_SYS_ADMIN but not + * CAP_SYS_PACCT? Current code does the following: umount returns -EBUSY + * unless we are messing with the root. In that case we are getting a + * real mess with do_remount_sb(). 9/11/98, AV. + * + * Fixed a bunch of races (and pair of leaks). Probably not the best way, + * but this one obviously doesn't introduce deadlocks. Later. BTW, found + * one race (and leak) in BSD implementation. + * OK, that's better. ANOTHER race and leak in BSD variant. There always + * is one more bug... 10/11/98, AV. + * + * Oh, fsck... Oopsable SMP race in do_process_acct() - we must hold + * ->mmap_sem to walk the vma list of current->mm. Nasty, since it leaks + * a struct file opened for write. Fixed. 2/6/2000, AV. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* sector_div */ +#include + +/* + * These constants control the amount of freespace that suspend and + * resume the process accounting system, and the time delay between + * each check. + * Turned into sysctl-controllable parameters. AV, 12/11/98 + */ + +int acct_parm[3] = {4, 2, 30}; +#define RESUME (acct_parm[0]) /* >foo% free space - resume */ +#define SUSPEND (acct_parm[1]) /* needcheck = 1; +} + +/* + * Check the amount of free space and suspend/resume accordingly. 
+ */
+static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
+{
+	struct kstatfs sbuf;
+	int res;	/* value handed back: the accounting "active" state */
+	int act;	/* -1: should suspend, 1: should resume, 0: no change */
+	sector_t resume;
+	sector_t suspend;
+
+	/*
+	 * Without a file, or when no check has been requested, just report
+	 * the current active state.
+	 */
+	spin_lock(&acct_lock);
+	res = acct->active;
+	if (!file || !acct->needcheck)
+		goto out;
+	spin_unlock(&acct_lock);
+
+	/* May block */
+	if (vfs_statfs(&file->f_path, &sbuf))
+		return res;
+	/* Thresholds are SUSPEND% and RESUME% of the total block count. */
+	suspend = sbuf.f_blocks * SUSPEND;
+	resume = sbuf.f_blocks * RESUME;
+
+	sector_div(suspend, 100);
+	sector_div(resume, 100);
+
+	if (sbuf.f_bavail <= suspend)
+		act = -1;
+	else if (sbuf.f_bavail >= resume)
+		act = 1;
+	else
+		act = 0;
+
+	/*
+	 * If some joker switched acct->file under us we'd better be
+	 * silent and _not_ touch anything.
+	 */
+	spin_lock(&acct_lock);
+	if (file != acct->file) {
+		/*
+		 * Only the return value reflects the verdict (1 for resume,
+		 * 0 for suspend); with no verdict the state sampled before
+		 * the lock was dropped is returned.
+		 */
+		if (act)
+			res = act>0;
+		goto out;
+	}
+
+	if (acct->active) {
+		if (act < 0) {
+			acct->active = 0;
+			printk(KERN_INFO "Process accounting paused\n");
+		}
+	} else {
+		if (act > 0) {
+			acct->active = 1;
+			printk(KERN_INFO "Process accounting resumed\n");
+		}
+	}
+
+	/* Check done: clear the request and re-arm the timer. */
+	del_timer(&acct->timer);
+	acct->needcheck = 0;
+	acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+	add_timer(&acct->timer);
+	res = acct->active;
+out:
+	spin_unlock(&acct_lock);
+	return res;
+}
+
+/*
+ * Close the old accounting file (if currently open) and then replace
+ * it with file (if non-NULL).
+ *
+ * NOTE: acct_lock MUST be held on entry and exit.
+ */
+static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
+		struct pid_namespace *ns)
+{
+	struct file *old_acct = NULL;
+	struct pid_namespace *old_ns = NULL;
+
+	/* Detach the current file (if any) and remember it for later. */
+	if (acct->file) {
+		old_acct = acct->file;
+		old_ns = acct->ns;
+		del_timer(&acct->timer);
+		acct->active = 0;
+		acct->needcheck = 0;
+		acct->file = NULL;
+		acct->ns = NULL;
+		list_del(&acct->list);
+	}
+	/* Install the new file (if any), mark it active and arm the timer. */
+	if (file) {
+		acct->file = file;
+		acct->ns = ns;
+		acct->needcheck = 0;
+		acct->active = 1;
+		list_add(&acct->list, &acct_list);
+		/* It's been deleted if it was used before so this is safe */
+		setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
+		acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
+		add_timer(&acct->timer);
+	}
+	/*
+	 * Write a last record to the old file and close it.  acct_lock is
+	 * dropped around these calls and retaken before returning.
+	 */
+	if (old_acct) {
+		mnt_unpin(old_acct->f_path.mnt);
+		spin_unlock(&acct_lock);
+		do_acct_process(acct, old_ns, old_acct);
+		filp_close(old_acct, NULL);
+		spin_lock(&acct_lock);
+	}
+}
+
+/*
+ * Open @name for appending and make it the accounting file of the
+ * caller's active pid namespace.
+ *
+ * Returns 0 on success; the filp_open() error, -EACCES (not a regular
+ * file), -EIO (no write method) or -ENOMEM on failure.
+ */
+static int acct_on(char *name)
+{
+	struct file *file;
+	struct vfsmount *mnt;
+	struct pid_namespace *ns;
+	struct bsd_acct_struct *acct = NULL;
+
+	/* Difference from BSD - they don't do O_APPEND */
+	file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) {
+		filp_close(file, NULL);
+		return -EACCES;
+	}
+
+	if (!file->f_op->write) {
+		filp_close(file, NULL);
+		return -EIO;
+	}
+
+	/* Allocate before taking acct_lock; GFP_KERNEL is used here. */
+	ns = task_active_pid_ns(current);
+	if (ns->bacct == NULL) {
+		acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
+		if (acct == NULL) {
+			filp_close(file, NULL);
+			return -ENOMEM;
+		}
+	}
+
+	/*
+	 * Re-check under the lock: install our allocation only if the
+	 * namespace still has none; otherwise it is freed below.
+	 */
+	spin_lock(&acct_lock);
+	if (ns->bacct == NULL) {
+		ns->bacct = acct;
+		acct = NULL;
+	}
+
+	mnt = file->f_path.mnt;
+	mnt_pin(mnt);
+	acct_file_reopen(ns->bacct, file, ns);
+	spin_unlock(&acct_lock);
+
+	mntput(mnt); /* it's pinned, now give up active reference */
+	kfree(acct);	/* NULL if it was installed above */
+
+	return 0;
+}
+
+/**
+ * sys_acct - enable/disable process accounting
+ * @name: file name for 
accounting records or NULL to shutdown accounting
+ *
+ * Returns 0 for success or negative errno values for failure.
+ *
+ * sys_acct() is the only system call needed to implement process
+ * accounting. It takes the name of the file where accounting records
+ * should be written. If the filename is NULL, accounting will be
+ * shutdown.
+ */
+SYSCALL_DEFINE1(acct, const char __user *, name)
+{
+	int error = 0;
+
+	if (!capable(CAP_SYS_PACCT))
+		return -EPERM;
+
+	if (name) {
+		/* Copy the path in from user space and switch accounting on. */
+		char *tmp = getname(name);
+		if (IS_ERR(tmp))
+			return (PTR_ERR(tmp));
+		error = acct_on(tmp);
+		putname(tmp);
+	} else {
+		struct bsd_acct_struct *acct;
+
+		/* Nothing to shut down if this namespace never had accounting. */
+		acct = task_active_pid_ns(current)->bacct;
+		if (acct == NULL)
+			return 0;
+
+		spin_lock(&acct_lock);
+		acct_file_reopen(acct, NULL, NULL);
+		spin_unlock(&acct_lock);
+	}
+
+	return error;
+}
+
+/**
+ * acct_auto_close_mnt - turn off a filesystem's accounting if it is on
+ * @m: vfsmount being shut down
+ *
+ * If the accounting is turned on for a file in the subtree pointed
+ * to by m, turn accounting off.  Done when m is about to die.
+ */
+void acct_auto_close_mnt(struct vfsmount *m)
+{
+	struct bsd_acct_struct *acct;
+
+	spin_lock(&acct_lock);
+restart:
+	/*
+	 * acct_file_reopen() unlinks the entry and drops acct_lock for a
+	 * while, so the walk starts over after every match.
+	 */
+	list_for_each_entry(acct, &acct_list, list)
+		if (acct->file && acct->file->f_path.mnt == m) {
+			acct_file_reopen(acct, NULL, NULL);
+			goto restart;
+		}
+	spin_unlock(&acct_lock);
+}
+
+/**
+ * acct_auto_close - turn off a filesystem's accounting if it is on
+ * @sb: super block for the filesystem
+ *
+ * If the accounting is turned on for a file in the filesystem pointed
+ * to by sb, turn accounting off.
+ */ +void acct_auto_close(struct super_block *sb) +{ + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); +restart: + list_for_each_entry(acct, &acct_list, list) + if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { + acct_file_reopen(acct, NULL, NULL); + goto restart; + } + spin_unlock(&acct_lock); +} + +void acct_exit_ns(struct pid_namespace *ns) +{ + struct bsd_acct_struct *acct = ns->bacct; + + if (acct == NULL) + return; + + del_timer_sync(&acct->timer); + spin_lock(&acct_lock); + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); + + kfree(acct); +} + +/* + * encode an unsigned long into a comp_t + * + * This routine has been adopted from the encode_comp_t() function in + * the kern_acct.c file of the FreeBSD operating system. The encoding + * is a 13-bit fraction with a 3-bit (base 8) exponent. + */ + +#define MANTSIZE 13 /* 13 bit mantissa. */ +#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ +#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ + +static comp_t encode_comp_t(unsigned long value) +{ + int exp, rnd; + + exp = rnd = 0; + while (value > MAXFRACT) { + rnd = value & (1 << (EXPSIZE - 1)); /* Round up? */ + value >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ + exp++; + } + + /* + * If we need to round up, do it (and handle overflow correctly). + */ + if (rnd && (++value > MAXFRACT)) { + value >>= EXPSIZE; + exp++; + } + + /* + * Clean it up and polish it off. + */ + exp <<= MANTSIZE; /* Shift the exponent into place */ + exp += value; /* and add on the mantissa. */ + return exp; +} + +#if ACCT_VERSION==1 || ACCT_VERSION==2 +/* + * encode an u64 into a comp2_t (24 bits) + * + * Format: 5 bit base 2 exponent, 20 bits mantissa. + * The leading bit of the mantissa is not stored, but implied for + * non-zero exponents. + * Largest encodable value is 50 bits. + */ + +#define MANTSIZE2 20 /* 20 bit mantissa. */ +#define EXPSIZE2 5 /* 5 bit base 2 exponent. 
*/ +#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ +#define MAXEXP2 ((1 < (MAXFRACT2>>1)); + rnd = 0; + while (value > MAXFRACT2) { + rnd = value & 1; + value >>= 1; + exp++; + } + + /* + * If we need to round up, do it (and handle overflow correctly). + */ + if (rnd && (++value > MAXFRACT2)) { + value >>= 1; + exp++; + } + + if (exp > MAXEXP2) { + /* Overflow. Return largest representable number instead. */ + return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; + } else { + return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); + } +} +#endif + +#if ACCT_VERSION==3 +/* + * encode an u64 into a 32 bit IEEE float + */ +static u32 encode_float(u64 value) +{ + unsigned exp = 190; + unsigned u; + + if (value==0) return 0; + while ((s64)value > 0){ + value <<= 1; + exp--; + } + u = (u32)(value >> 40) & 0x7fffffu; + return u | (exp << 23); +} +#endif + +/* + * Write an accounting entry for an exiting process + * + * The acct_process() call is the workhorse of the process + * accounting system. The struct acct is built here and then written + * into the accounting file. This function should only be called from + * do_exit() or when switching to a different output file. + */ + +/* + * do_acct_process does all actual work. Caller holds the reference to file. + */ +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *file) +{ + struct pacct_struct *pacct = ¤t->signal->pacct; + acct_t ac; + mm_segment_t fs; + unsigned long flim; + u64 elapsed; + u64 run_time; + struct timespec uptime; + struct tty_struct *tty; + const struct cred *orig_cred; + + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); + + /* + * First check to see if there is enough free_space to continue + * the process accounting system. 
+ */ + if (!check_free_space(acct, file)) + goto out; + + /* + * Fill the accounting struct with the needed info as recorded + * by the different kernel functions. + */ + memset((caddr_t)&ac, 0, sizeof(acct_t)); + + ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; + strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); + + /* calculate run_time in nsec*/ + do_posix_clock_monotonic_gettime(&uptime); + run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; + run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC + + current->group_leader->start_time.tv_nsec; + /* convert nsec -> AHZ */ + elapsed = nsec_to_AHZ(run_time); +#if ACCT_VERSION==3 + ac.ac_etime = encode_float(elapsed); +#else + ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? + (unsigned long) elapsed : (unsigned long) -1l); +#endif +#if ACCT_VERSION==1 || ACCT_VERSION==2 + { + /* new enlarged etime field */ + comp2_t etime = encode_comp2_t(elapsed); + ac.ac_etime_hi = etime >> 16; + ac.ac_etime_lo = (u16) etime; + } +#endif + do_div(elapsed, AHZ); + ac.ac_btime = get_seconds() - elapsed; + /* we really need to bite the bullet and change layout */ + ac.ac_uid = orig_cred->uid; + ac.ac_gid = orig_cred->gid; +#if ACCT_VERSION==2 + ac.ac_ahz = AHZ; +#endif +#if ACCT_VERSION==1 || ACCT_VERSION==2 + /* backward-compatible 16 bit fields */ + ac.ac_uid16 = ac.ac_uid; + ac.ac_gid16 = ac.ac_gid; +#endif +#if ACCT_VERSION==3 + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); + rcu_read_unlock(); +#endif + + spin_lock_irq(¤t->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ + ac.ac_tty = tty ? 
old_encode_dev(tty_devnum(tty)) : 0; + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac.ac_flag = pacct->ac_flag; + ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_minflt = encode_comp_t(pacct->ac_minflt); + ac.ac_majflt = encode_comp_t(pacct->ac_majflt); + ac.ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(¤t->sighand->siglock); + ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ + ac.ac_rw = encode_comp_t(ac.ac_io / 1024); + ac.ac_swaps = encode_comp_t(0); + + /* + * Kernel segment override to datasegment and write it + * to the accounting file. + */ + fs = get_fs(); + set_fs(KERNEL_DS); + /* + * Accounting records are not subject to resource limits. + */ + flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + file->f_op->write(file, (char *)&ac, + sizeof(acct_t), &file->f_pos); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; + set_fs(fs); +out: + revert_creds(orig_cred); +} + +/** + * acct_collect - collect accounting information into pacct_struct + * @exitcode: task exit code + * @group_dead: not 0, if this thread is the last one in the process. 
+ */ +void acct_collect(long exitcode, int group_dead) +{ + struct pacct_struct *pacct = ¤t->signal->pacct; + unsigned long vsize = 0; + + if (group_dead && current->mm) { + struct vm_area_struct *vma; + down_read(¤t->mm->mmap_sem); + vma = current->mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + up_read(¤t->mm->mmap_sem); + } + + spin_lock_irq(¤t->sighand->siglock); + if (group_dead) + pacct->ac_mem = vsize / 1024; + if (thread_group_leader(current)) { + pacct->ac_exitcode = exitcode; + if (current->flags & PF_FORKNOEXEC) + pacct->ac_flag |= AFORK; + } + if (current->flags & PF_SUPERPRIV) + pacct->ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + pacct->ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + pacct->ac_flag |= AXSIG; + pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); + pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_minflt += current->min_flt; + pacct->ac_majflt += current->maj_flt; + spin_unlock_irq(¤t->sighand->siglock); +} + +static void acct_process_in_ns(struct pid_namespace *ns) +{ + struct file *file = NULL; + struct bsd_acct_struct *acct; + + acct = ns->bacct; + /* + * accelerate the common fastpath: + */ + if (!acct || !acct->file) + return; + + spin_lock(&acct_lock); + file = acct->file; + if (unlikely(!file)) { + spin_unlock(&acct_lock); + return; + } + get_file(file); + spin_unlock(&acct_lock); + + do_acct_process(acct, ns, file); + fput(file); +} + +/** + * acct_process - now just a wrapper around acct_process_in_ns, + * which in turn is a wrapper around do_acct_process. + * + * handles process accounting for an exiting task + */ +void acct_process(void) +{ + struct pid_namespace *ns; + + /* + * This loop is safe lockless, since current is still + * alive and holds its namespace, which in turn holds + * its parent. 
+ */ + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) + acct_process_in_ns(ns); +} diff --git a/kernel/async.c b/kernel/async.c new file mode 100644 index 00000000..cd9dbb91 --- /dev/null +++ b/kernel/async.c @@ -0,0 +1,301 @@ +/* + * async.c: Asynchronous function calls for boot performance + * + * (C) Copyright 2009 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + + +/* + +Goals and Theory of Operation + +The primary goal of this feature is to reduce the kernel boot time, +by doing various independent hardware delays and discovery operations +decoupled and not strictly serialized. + +More specifically, the asynchronous function call concept allows +certain operations (primarily during system boot) to happen +asynchronously, out of order, while these operations still +have their externally visible parts happen sequentially and in-order. +(not unlike how out-of-order CPUs retire their instructions in order) + +Key to the asynchronous function call implementation is the concept of +a "sequence cookie" (which, although it has an abstracted type, can be +thought of as a monotonically incrementing number). + +The async core will assign each scheduled event such a sequence cookie and +pass this to the called functions. + +The asynchronously called function should before doing a globally visible +operation, such as registering device numbers, call the +async_synchronize_cookie() function and pass in its own cookie. The +async_synchronize_cookie() function will make sure that all asynchronous +operations that were scheduled prior to the operation corresponding with the +cookie have completed. 
+
+Subsystem/driver initialization code that scheduled asynchronous probe
+functions, but which shares global resources with other drivers/subsystems
+that do not use the asynchronous call feature, need to do a full
+synchronization with the async_synchronize_full() function, before returning
+from their init function. This is to maintain strict ordering between the
+asynchronous and synchronous parts of the kernel.
+
+*/
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static async_cookie_t next_cookie = 1;
+
+#define MAX_WORK	32768
+
+static LIST_HEAD(async_pending);
+static LIST_HEAD(async_running);
+static DEFINE_SPINLOCK(async_lock);
+
+struct async_entry {
+	struct list_head	list;		/* link on async_pending or on *running */
+	struct work_struct	work;
+	async_cookie_t		cookie;
+	async_func_ptr		*func;
+	void			*data;
+	struct list_head	*running;	/* running list (domain) of this entry */
+};
+
+static DECLARE_WAIT_QUEUE_HEAD(async_done);
+
+static atomic_t entry_count;
+
+extern int initcall_debug;
+
+
+/*
+ * MUST be called with the lock held!
+ *
+ * Returns the cookie of the first entry on @running; if that list is
+ * empty, the cookie of the first pending entry that belongs to @running;
+ * otherwise next_cookie.
+ */
+static async_cookie_t  __lowest_in_progress(struct list_head *running)
+{
+	struct async_entry *entry;
+
+	if (!list_empty(running)) {
+		entry = list_first_entry(running,
+			struct async_entry, list);
+		return entry->cookie;
+	}
+
+	list_for_each_entry(entry, &async_pending, list)
+		if (entry->running == running)
+			return entry->cookie;
+
+	return next_cookie;	/* "infinity" value */
+}
+
+/* Locked wrapper around __lowest_in_progress(). */
+static async_cookie_t  lowest_in_progress(struct list_head *running)
+{
+	unsigned long flags;
+	async_cookie_t ret;
+
+	spin_lock_irqsave(&async_lock, flags);
+	ret = __lowest_in_progress(running);
+	spin_unlock_irqrestore(&async_lock, flags);
+	return ret;
+}
+
+/*
+ * pick the first pending entry and run it
+ *
+ * Work handler: @work is embedded in the async_entry to execute.
+ */
+static void async_run_entry_fn(struct work_struct *work)
+{
+	struct async_entry *entry =
+		container_of(work, struct async_entry, work);
+	unsigned long flags;
+	ktime_t calltime, delta, rettime;
+
+	/* 1) move self to the running queue */
+	spin_lock_irqsave(&async_lock, flags);
+	list_move_tail(&entry->list, entry->running);
+	spin_unlock_irqrestore(&async_lock, flags);
+
+	/* 2) run (and print duration) */
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+		printk("calling  %lli_%pF @ %i\n", (long long)entry->cookie,
+			entry->func, task_pid_nr(current));
+		calltime = ktime_get();
+	}
+	entry->func(entry->data, entry->cookie);
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+		rettime = ktime_get();
+		delta = ktime_sub(rettime, calltime);
+		/* ns >> 10 (divide by 1024) is reported as "usecs" */
+		printk("initcall %lli_%pF returned 0 after %lld usecs\n",
+			(long long)entry->cookie,
+			entry->func,
+			(long long)ktime_to_ns(delta) >> 10);
+	}
+
+	/* 3) remove self from the running queue */
+	spin_lock_irqsave(&async_lock, flags);
+	list_del(&entry->list);
+
+	/* 4) free the entry */
+	kfree(entry);
+	atomic_dec(&entry_count);
+
+	spin_unlock_irqrestore(&async_lock, flags);
+
+	/* 5) wake up any waiters */
+	wake_up(&async_done);
+}
+
+/*
+ * Allocate a cookie for @ptr and queue it on system_unbound_wq, tagged
+ * with the running list @running.  If the entry cannot be allocated or
+ * more than MAX_WORK entries are outstanding, @ptr is called directly
+ * instead.  Returns the cookie assigned to the call.
+ */
+static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct list_head *running)
+{
+	struct async_entry *entry;
+	unsigned long flags;
+	async_cookie_t newcookie;
+
+	/* allow irq-off callers */
+	entry = kzalloc(sizeof(struct async_entry), GFP_ATOMIC);
+
+	/*
+	 * If we're out of memory or if there's too much work
+	 * pending already, we execute synchronously.
+	 */
+	if (!entry || atomic_read(&entry_count) > MAX_WORK) {
+		kfree(entry);
+		spin_lock_irqsave(&async_lock, flags);
+		newcookie = next_cookie++;
+		spin_unlock_irqrestore(&async_lock, flags);
+
+		/* low on memory.. run synchronously */
+		ptr(data, newcookie);
+		return newcookie;
+	}
+	INIT_WORK(&entry->work, async_run_entry_fn);
+	entry->func = ptr;
+	entry->data = data;
+	entry->running = running;
+
+	/* Cookie assignment and insertion into async_pending share one lock hold. */
+	spin_lock_irqsave(&async_lock, flags);
+	newcookie = entry->cookie = next_cookie++;
+	list_add_tail(&entry->list, &async_pending);
+	atomic_inc(&entry_count);
+	spin_unlock_irqrestore(&async_lock, flags);
+
+	/* schedule for execution */
+	queue_work(system_unbound_wq, &entry->work);
+
+	return newcookie;
+}
+
+/**
+ * async_schedule - schedule a function for asynchronous execution
+ * @ptr: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
+async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+{
+	return __async_schedule(ptr, data, &async_running);
+}
+EXPORT_SYMBOL_GPL(async_schedule);
+
+/**
+ * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
+ * @ptr: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ * @running: running list for the domain
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * @running may be used in the async_synchronize_*_domain() functions
+ * to wait within a certain synchronization domain rather than globally.
+ * A synchronization domain is specified via the running queue @running to use.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
+async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+				     struct list_head *running)
+{
+	return __async_schedule(ptr, data, running);
+}
+EXPORT_SYMBOL_GPL(async_schedule_domain);
+
+/**
+ * async_synchronize_full - synchronize all asynchronous function calls
+ *
+ * This function waits until all asynchronous function calls have been done.
+ */
+void async_synchronize_full(void)
+{
+	/*
+	 * Wait again for as long as either the global running list or
+	 * the pending list still has entries after a wait returns.
+	 */
+	do {
+		async_synchronize_cookie(next_cookie);
+	} while (!list_empty(&async_running) || !list_empty(&async_pending));
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full);
+
+/**
+ * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
+ * @list: running list to synchronize on
+ *
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list have been done.
+ */
+void async_synchronize_full_domain(struct list_head *list)
+{
+	/* next_cookie is above every cookie handed out so far. */
+	async_synchronize_cookie_domain(next_cookie, list);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
+
+/**
+ * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ * @running: running list to synchronize on
+ *
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @running submitted
+ * prior to @cookie have been done.
+ */
+void async_synchronize_cookie_domain(async_cookie_t cookie,
+				     struct list_head *running)
+{
+	ktime_t starttime, delta, endtime;
+
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+		printk("async_waiting @ %i\n", task_pid_nr(current));
+		starttime = ktime_get();
+	}
+
+	/* Sleep until the lowest cookie in progress for @running reaches @cookie. */
+	wait_event(async_done, lowest_in_progress(running) >= cookie);
+
+	if (initcall_debug && system_state == SYSTEM_BOOTING) {
+		endtime = ktime_get();
+		delta = ktime_sub(endtime, starttime);
+
+		/* ns >> 10 (divide by 1024) is reported as "usec" */
+		printk("async_continuing @ %i after %lli usec\n",
+			task_pid_nr(current),
+			(long long)ktime_to_ns(delta) >> 10);
+	}
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
+
+/**
+ * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ *
+ * This function waits until all asynchronous function calls prior to @cookie
+ * have been done.
+ */
+void async_synchronize_cookie(async_cookie_t cookie)
+{
+	/* Same wait, on the global running list. */
+	async_synchronize_cookie_domain(cookie, &async_running);
+}
+EXPORT_SYMBOL_GPL(async_synchronize_cookie);
diff --git a/kernel/audit.c b/kernel/audit.c
new file mode 100644
index 00000000..93950031
--- /dev/null
+++ b/kernel/audit.c
@@ -0,0 +1,1508 @@
+/* audit.c -- Auditing support
+ * Gateway between the kernel (e.g., selinux) and the user-space audit daemon.
+ * System-call specific features have moved to auditsc.c
+ *
+ * Copyright 2003-2007 Red Hat Inc., Durham, North Carolina.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Written by Rickard E. (Rik) Faith + * + * Goals: 1) Integrate fully with Security Modules. + * 2) Minimal run-time overhead: + * a) Minimal when syscall auditing is disabled (audit_enable=0). + * b) Small when syscall auditing is enabled and no audit record + * is generated (defer as much work as possible to record + * generation time): + * i) context is allocated, + * ii) names from getname are stored without a copy, and + * iii) inode information stored from path_lookup. + * 3) Ability to disable syscall auditing at boot time (audit=0). + * 4) Usable by other parts of the kernel (if audit_log* is called, + * then a syscall record will be generated automatically for the + * current syscall). + * 5) Netlink interface to user-space. + * 6) Support low-overhead kernel-based filtering to minimize the + * information that must be passed to user-space. + * + * Example user-space utilities: http://people.redhat.com/sgrubb/audit/ + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "audit.h" + +/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED. + * (Initialization happens after skb_init is called.) */ +#define AUDIT_DISABLED -1 +#define AUDIT_UNINITIALIZED 0 +#define AUDIT_INITIALIZED 1 +static int audit_initialized; + +#define AUDIT_OFF 0 +#define AUDIT_ON 1 +#define AUDIT_LOCKED 2 +int audit_enabled; +int audit_ever_enabled; + +EXPORT_SYMBOL_GPL(audit_enabled); + +/* Default state when kernel boots without any parameters. */ +static int audit_default; + +/* If auditing cannot proceed, audit_failure selects what happens. 
*/ +static int audit_failure = AUDIT_FAIL_PRINTK; + +/* + * If audit records are to be written to the netlink socket, audit_pid + * contains the pid of the auditd process and audit_nlk_pid contains + * the pid to use to send netlink messages to that process. + */ +int audit_pid; +static int audit_nlk_pid; + +/* If audit_rate_limit is non-zero, limit the rate of sending audit records + * to that number per second. This prevents DoS attacks, but results in + * audit records being dropped. */ +static int audit_rate_limit; + +/* Number of outstanding audit_buffers allowed. */ +static int audit_backlog_limit = 64; +static int audit_backlog_wait_time = 60 * HZ; +static int audit_backlog_wait_overflow = 0; + +/* The identity of the user shutting down the audit system. */ +uid_t audit_sig_uid = -1; +pid_t audit_sig_pid = -1; +u32 audit_sig_sid = 0; + +/* Records can be lost in several ways: + 0) [suppressed in audit_alloc] + 1) out of memory in audit_log_start [kmalloc of struct audit_buffer] + 2) out of memory in audit_log_move [alloc_skb] + 3) suppressed due to audit_rate_limit + 4) suppressed due to audit_backlog_limit +*/ +static atomic_t audit_lost = ATOMIC_INIT(0); + +/* The netlink socket. */ +static struct sock *audit_sock; + +/* Hash for inode-based rules */ +struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + +/* The audit_freelist is a list of pre-allocated audit buffers (if more + * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of + * being placed on the freelist). */ +static DEFINE_SPINLOCK(audit_freelist_lock); +static int audit_freelist_count; +static LIST_HEAD(audit_freelist); + +static struct sk_buff_head audit_skb_queue; +/* queue of skbs to send to auditd when/if it comes back */ +static struct sk_buff_head audit_skb_hold_queue; +static struct task_struct *kauditd_task; +static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); +static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); + +/* Serialize requests from userspace. 
*/
+DEFINE_MUTEX(audit_cmd_mutex);
+
+/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
+ * audit records. Since printk uses a 1024 byte buffer, this buffer
+ * should be at least that large. */
+#define AUDIT_BUFSIZ 1024
+
+/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the
+ * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */
+#define AUDIT_MAXFREE (2*NR_CPUS)
+
+/* The audit_buffer is used when formatting an audit record. The caller
+ * locks briefly to get the record off the freelist or to allocate the
+ * buffer, and locks briefly to send the buffer to the netlink layer or
+ * to place it on a transmit queue. Multiple audit_buffers can be in
+ * use simultaneously. */
+struct audit_buffer {
+	struct list_head list;
+	struct sk_buff *skb; /* formatted skb ready to send */
+	struct audit_context *ctx; /* NULL or associated context */
+	gfp_t gfp_mask;
+};
+
+/* Argument handed to audit_send_reply_thread(): one skb and its target pid. */
+struct audit_reply {
+	int pid;
+	struct sk_buff *skb;
+};
+
+/* Stamp @pid into the netlink header of the record held by @ab (NULL ok). */
+static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
+{
+	if (ab) {
+		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+		nlh->nlmsg_pid = pid;
+	}
+}
+
+/*
+ * React to an audit failure according to audit_failure: stay silent,
+ * printk the message (rate limited), or panic. @message must not end in
+ * a newline; one is appended here.
+ */
+void audit_panic(const char *message)
+{
+	switch (audit_failure)
+	{
+	case AUDIT_FAIL_SILENT:
+		break;
+	case AUDIT_FAIL_PRINTK:
+		if (printk_ratelimit())
+			printk(KERN_ERR "audit: %s\n", message);
+		break;
+	case AUDIT_FAIL_PANIC:
+		/* test audit_pid since printk is always lossy, why bother? */
+		if (audit_pid)
+			panic("audit: %s\n", message);
+		break;
+	}
+}
+
+/*
+ * Returns 1 if another record may be sent under audit_rate_limit, 0 if it
+ * must be dropped. The message counter is reset once more than HZ jiffies
+ * have passed since the last reset.
+ */
+static inline int audit_rate_check(void)
+{
+	static unsigned long last_check = 0;
+	static int messages = 0;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long flags;
+	unsigned long now;
+	unsigned long elapsed;
+	int retval = 0;
+
+	if (!audit_rate_limit) return 1;
+
+	spin_lock_irqsave(&lock, flags);
+	if (++messages < audit_rate_limit) {
+		retval = 1;
+	} else {
+		now = jiffies;
+		elapsed = now - last_check;
+		if (elapsed > HZ) {
+			last_check = now;
+			messages = 0;
+			retval = 1;
+		}
+	}
+	spin_unlock_irqrestore(&lock, flags);
+
+	return retval;
+}
+
+/**
+ * audit_log_lost - conditionally log lost audit message event
+ * @message: the message stating reason for lost audit message
+ *	(no trailing newline; audit_panic() appends one)
+ *
+ * Emit at least 1 message per second, even if audit_rate_check is
+ * throttling.
+ * Always increment the lost messages counter.
+*/
+void audit_log_lost(const char *message)
+{
+	static unsigned long last_msg = 0;
+	static DEFINE_SPINLOCK(lock);
+	unsigned long flags;
+	unsigned long now;
+	int print;
+
+	atomic_inc(&audit_lost);
+
+	print = (audit_failure == AUDIT_FAIL_PANIC || !audit_rate_limit);
+
+	if (!print) {
+		spin_lock_irqsave(&lock, flags);
+		now = jiffies;
+		if (now - last_msg > HZ) {
+			print = 1;
+			last_msg = now;
+		}
+		spin_unlock_irqrestore(&lock, flags);
+	}
+
+	if (print) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING
+				"audit: audit_lost=%d audit_rate_limit=%d "
+				"audit_backlog_limit=%d\n",
+				atomic_read(&audit_lost),
+				audit_rate_limit,
+				audit_backlog_limit);
+		audit_panic(message);
+	}
+}
+
+/*
+ * Emit an AUDIT_CONFIG_CHANGE record describing a setting change.
+ * Returns 0, or the error from security_secid_to_secctx() when @sid
+ * cannot be translated (the record is then logged with res=0).
+ */
+static int audit_log_config_change(char *function_name, int new, int old,
+				   uid_t loginuid, u32 sessionid, u32 sid,
+				   int allow_changes)
+{
+	struct audit_buffer *ab;
+	int rc = 0;
+
+	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
+	audit_log_format(ab, "%s=%d old=%d auid=%u ses=%u", function_name, new,
+			 old, loginuid, sessionid);
+	if (sid) {
+		char *ctx = NULL;
+		u32 len;
+
+		rc = security_secid_to_secctx(sid, &ctx, &len);
+		if (rc) {
+			audit_log_format(ab, " sid=%u", sid);
+			allow_changes = 0; /* Something weird, deny request */
+		} else {
+			audit_log_format(ab, " subj=%s", ctx);
+			security_release_secctx(ctx, len);
+		}
+	}
+	audit_log_format(ab, " res=%d", allow_changes);
+	audit_log_end(ab);
+	return rc;
+}
+
+/*
+ * Log and, if permitted, apply a change of *@to_change to @new.
+ * Returns 0 on success, -EPERM when the configuration is locked, or the
+ * error reported while logging the change.
+ */
+static int audit_do_config_change(char *function_name, int *to_change,
+				  int new, uid_t loginuid, u32 sessionid,
+				  u32 sid)
+{
+	int allow_changes, rc = 0, old = *to_change;
+
+	/* check if we are locked */
+	if (audit_enabled == AUDIT_LOCKED)
+		allow_changes = 0;
+	else
+		allow_changes = 1;
+
+	if (audit_enabled != AUDIT_OFF) {
+		rc = audit_log_config_change(function_name, new, old, loginuid,
+					     sessionid, sid, allow_changes);
+		if (rc)
+			allow_changes = 0;
+	}
+
+	/* If we are allowed, make the change */
+	if (allow_changes == 1)
+		*to_change = new;
+	/* Not allowed, update reason */
+	else if (rc == 0)
+		rc = -EPERM;
+	return rc;
+}
+
+static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sessionid,
+				u32 sid)
+{
+	return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
+				      limit, loginuid, sessionid, sid);
+}
+
+static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sessionid,
+				   u32 sid)
+{
+	return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
+				      limit, loginuid, sessionid, sid);
+}
+
+static int audit_set_enabled(int state, uid_t loginuid, u32 sessionid, u32 sid)
+{
+	int rc;
+	if (state < AUDIT_OFF || state > AUDIT_LOCKED)
+		return -EINVAL;
+
+	rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
+				    loginuid, sessionid, sid);
+
+	if (!rc)
+		audit_ever_enabled |= !!state;
+
+	return rc;
+}
+
+static int audit_set_failure(int state, uid_t loginuid, u32 sessionid, u32 sid)
+{
+	if (state != AUDIT_FAIL_SILENT
+	    && state != AUDIT_FAIL_PRINTK
+	    && state != AUDIT_FAIL_PANIC)
+		return -EINVAL;
+
+	return audit_do_config_change("audit_failure", &audit_failure, state,
+				      loginuid, sessionid, sid);
+}
+
+/*
+ * Queue skbs to be sent to auditd when/if it comes back. These skbs should
+ * already have been sent via printk/syslog and so if these messages are dropped
+ * it is not a huge concern since we already passed the audit_log_lost()
+ * notification and stuff. This is just nice to get audit messages during
+ * boot before auditd is running or messages generated while auditd is stopped.
+ * This only holds messages if audit_default is set, aka booting with audit=1
+ * or building your kernel that way.
+ */
+static void audit_hold_skb(struct sk_buff *skb)
+{
+	if (audit_default &&
+	    skb_queue_len(&audit_skb_hold_queue) < audit_backlog_limit)
+		skb_queue_tail(&audit_skb_hold_queue, skb);
+	else
+		kfree_skb(skb);
+}
+
+/*
+ * For one reason or another this nlh isn't getting delivered to the userspace
+ * audit daemon, just send it to printk.
+ */
+static void audit_printk_skb(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
+	char *data = NLMSG_DATA(nlh);
+
+	if (nlh->nlmsg_type != AUDIT_EOE) {
+		if (printk_ratelimit())
+			printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, data);
+		else
+			/* no "\n" here: audit_panic() adds its own */
+			audit_log_lost("printk limit exceeded");
+	}
+
+	audit_hold_skb(skb);
+}
+
+/*
+ * Unicast @skb to auditd. On failure the daemon is considered gone:
+ * audit_pid is cleared and the skb is handed to audit_hold_skb().
+ */
+static void kauditd_send_skb(struct sk_buff *skb)
+{
+	int err;
+	/* take a reference in case we can't send it and we want to hold it */
+	skb_get(skb);
+	err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0);
+	if (err < 0) {
+		BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */
+		printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid);
+		/* no "\n" here: audit_panic() adds its own */
+		audit_log_lost("auditd disappeared");
+		audit_pid = 0;
+		/* we might get lucky and get this in the next auditd */
+		audit_hold_skb(skb);
+	} else
+		/* drop the extra reference if sent ok */
+		consume_skb(skb);
+}
+
+/* Main loop of the kauditd kernel thread: drain the queues towards auditd. */
+static int kauditd_thread(void *dummy)
+{
+	struct sk_buff *skb;
+
+	set_freezable();
+	while (!kthread_should_stop()) {
+		/*
+		 * if auditd just started drain the queue of messages already
+		 * sent to syslog/printk. remember loss here is ok. we already
+		 * called audit_log_lost() if it didn't go out normally. so the
+		 * race between the skb_dequeue and the next check for audit_pid
+		 * doesn't matter.
+		 *
+		 * if you ever find kauditd to be too slow we can get a perf win
+		 * by doing our own locking and keeping better track if there
+		 * are messages in this queue. I don't see the need now, but
+		 * in 5 years when I want to play with this again I'll see this
+		 * note and still have no friggin idea what i'm thinking today.
+		 */
+		if (audit_default && audit_pid) {
+			skb = skb_dequeue(&audit_skb_hold_queue);
+			if (unlikely(skb)) {
+				while (skb && audit_pid) {
+					kauditd_send_skb(skb);
+					skb = skb_dequeue(&audit_skb_hold_queue);
+				}
+			}
+		}
+
+		skb = skb_dequeue(&audit_skb_queue);
+		wake_up(&audit_backlog_wait);
+		if (skb) {
+			if (audit_pid)
+				kauditd_send_skb(skb);
+			else
+				audit_printk_skb(skb);
+		} else {
+			DECLARE_WAITQUEUE(wait, current);
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&kauditd_wait, &wait);
+
+			if (!skb_queue_len(&audit_skb_queue)) {
+				try_to_freeze();
+				schedule();
+			}
+
+			__set_current_state(TASK_RUNNING);
+			remove_wait_queue(&kauditd_wait, &wait);
+		}
+	}
+	return 0;
+}
+
+/* Look up task @pid and push its pending tty audit data; -ESRCH if absent. */
+static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
+{
+	struct task_struct *tsk;
+	int err;
+
+	rcu_read_lock();
+	tsk = find_task_by_vpid(pid);
+	if (!tsk) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+	get_task_struct(tsk);
+	rcu_read_unlock();
+	err = tty_audit_push_task(tsk, loginuid, sessionid);
+	put_task_struct(tsk);
+	return err;
+}
+
+/* Thread body: send every skb queued on @_dest to its pid, then free it. */
+int audit_send_list(void *_dest)
+{
+	struct audit_netlink_list *dest = _dest;
+	int pid = dest->pid;
+	struct sk_buff *skb;
+
+	/* wait for parent to finish and send an ACK */
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	while ((skb = __skb_dequeue(&dest->q)) != NULL)
+		netlink_unicast(audit_sock, skb, pid, 0);
+
+	kfree(dest);
+
+	return 0;
+}
+
+/* Build a netlink reply skb carrying @payload; returns NULL on failure. */
+struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+				 int multi, const void *payload, int size)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	void *data;
+	int flags = multi ? NLM_F_MULTI : 0;
+	int t = done ? NLMSG_DONE : type;
+
+	skb = nlmsg_new(size, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	nlh = NLMSG_NEW(skb, pid, seq, t, size, flags);
+	data = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	return skb;
+
+nlmsg_failure: /* Used by NLMSG_NEW */
+	if (skb)
+		kfree_skb(skb);
+	return NULL;
+}
+
+static int audit_send_reply_thread(void *arg)
+{
+	struct audit_reply *reply = (struct audit_reply *)arg;
+
+	/* wait for the requester to drop audit_cmd_mutex before replying */
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	/* Ignore failure. It'll only happen if the sender goes away,
+	   because our timeout is set to infinite. */
+	netlink_unicast(audit_sock, reply->skb, reply->pid, 0);
+	kfree(reply);
+	return 0;
+}
+/**
+ * audit_send_reply - send an audit reply message via netlink
+ * @pid: process id to send reply to
+ * @seq: sequence number
+ * @type: audit message type
+ * @done: done (last) flag
+ * @multi: multi-part message flag
+ * @payload: payload data
+ * @size: payload size
+ *
+ * Allocates an skb, builds the netlink message, and sends it to the pid.
+ * No failure notifications.
+ */
+static void audit_send_reply(int pid, int seq, int type, int done, int multi,
+			     const void *payload, int size)
+{
+	struct sk_buff *skb;
+	struct task_struct *tsk;
+	struct audit_reply *reply = kmalloc(sizeof(struct audit_reply),
+					    GFP_KERNEL);
+
+	if (!reply)
+		return;
+
+	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
+	if (!skb)
+		goto out;
+
+	reply->pid = pid;
+	reply->skb = skb;
+
+	/* on success the thread owns reply and skb */
+	tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply");
+	if (!IS_ERR(tsk))
+		return;
+	kfree_skb(skb);
+out:
+	kfree(reply);
+}
+
+/*
+ * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
+ * control messages.
+ */
+static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
+{
+	int err = 0;
+
+	switch (msg_type) {
+	case AUDIT_GET:
+	case AUDIT_LIST:
+	case AUDIT_LIST_RULES:
+	case AUDIT_SET:
+	case AUDIT_ADD:
+	case AUDIT_ADD_RULE:
+	case AUDIT_DEL:
+	case AUDIT_DEL_RULE:
+	case AUDIT_SIGNAL_INFO:
+	case AUDIT_TTY_GET:
+	case AUDIT_TTY_SET:
+	case AUDIT_TRIM:
+	case AUDIT_MAKE_EQUIV:
+		if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
+			err = -EPERM;
+		break;
+	case AUDIT_USER:
+	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+	case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
+		if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
+			err = -EPERM;
+		break;
+	default:  /* bad msg */
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+/*
+ * Start a record of @msg_type with the common "user pid= uid= auid= ses="
+ * prefix (plus subj=/ssid= when @sid is set). *@ab is set to NULL when
+ * auditing is disabled. Returns 0 or the secid translation error.
+ */
+static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
+				     u32 pid, u32 uid, uid_t auid, u32 ses,
+				     u32 sid)
+{
+	int rc = 0;
+	char *ctx = NULL;
+	u32 len;
+
+	if (!audit_enabled) {
+		*ab = NULL;
+		return rc;
+	}
+
+	*ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
+	audit_log_format(*ab, "user pid=%d uid=%u auid=%u ses=%u",
+			 pid, uid, auid, ses);
+	if (sid) {
+		rc = security_secid_to_secctx(sid, &ctx, &len);
+		if (rc)
+			audit_log_format(*ab, " ssid=%u", sid);
+		else {
+			audit_log_format(*ab, " subj=%s", ctx);
+			security_release_secctx(ctx, len);
+		}
+	}
+
+	return rc;
+}
+
+/*
+ * Handle one netlink message from userspace. Called with audit_cmd_mutex
+ * held (see audit_receive). Returns 0 or a negative errno.
+ */
+static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	u32 uid, pid, seq, sid;
+	void *data;
+	struct audit_status *status_get, status_set;
+	int err;
+	struct audit_buffer *ab;
+	u16 msg_type = nlh->nlmsg_type;
+	uid_t loginuid; /* loginuid of sender */
+	u32 sessionid;
+	struct audit_sig_info *sig_data;
+	char *ctx = NULL;
+	u32 len;
+
+	err = audit_netlink_ok(skb, msg_type);
+	if (err)
+		return err;
+
+	/* As soon as there's any sign of userspace auditd,
+	 * start kauditd to talk to it */
+	if (!kauditd_task)
+		kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
+	if (IS_ERR(kauditd_task)) {
+		err = PTR_ERR(kauditd_task);
+		kauditd_task = NULL;
+		return err;
+	}
+
+	pid = NETLINK_CREDS(skb)->pid;
+	uid = NETLINK_CREDS(skb)->uid;
+	loginuid = audit_get_loginuid(current);
+	sessionid = audit_get_sessionid(current);
+	security_task_getsecid(current, &sid);
+	seq = nlh->nlmsg_seq;
+	data = NLMSG_DATA(nlh);
+
+	switch (msg_type) {
+	case AUDIT_GET:
+		/* Zero first: the whole struct (including the otherwise
+		 * unset ->mask) is copied out to userspace. */
+		memset(&status_set, 0, sizeof(status_set));
+		status_set.enabled = audit_enabled;
+		status_set.failure = audit_failure;
+		status_set.pid = audit_pid;
+		status_set.rate_limit = audit_rate_limit;
+		status_set.backlog_limit = audit_backlog_limit;
+		status_set.lost = atomic_read(&audit_lost);
+		status_set.backlog = skb_queue_len(&audit_skb_queue);
+		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_GET, 0, 0,
+				 &status_set, sizeof(status_set));
+		break;
+	case AUDIT_SET:
+		/* nlmsg_len() is the payload length; nlh->nlmsg_len would
+		 * also count the netlink header and accept short payloads. */
+		if (nlmsg_len(nlh) < sizeof(struct audit_status))
+			return -EINVAL;
+		status_get = (struct audit_status *)data;
+		if (status_get->mask & AUDIT_STATUS_ENABLED) {
+			err = audit_set_enabled(status_get->enabled,
+						loginuid, sessionid, sid);
+			if (err < 0)
+				return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_FAILURE) {
+			err = audit_set_failure(status_get->failure,
+						loginuid, sessionid, sid);
+			if (err < 0)
+				return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_PID) {
+			int new_pid = status_get->pid;
+
+			if (audit_enabled != AUDIT_OFF)
+				audit_log_config_change("audit_pid", new_pid,
+							audit_pid, loginuid,
+							sessionid, sid, 1);
+
+			audit_pid = new_pid;
+			audit_nlk_pid = NETLINK_CB(skb).pid;
+		}
+		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
+			err = audit_set_rate_limit(status_get->rate_limit,
+						   loginuid, sessionid, sid);
+			if (err < 0)
+				return err;
+		}
+		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
+			err = audit_set_backlog_limit(status_get->backlog_limit,
+						      loginuid, sessionid, sid);
+		break;
+	case AUDIT_USER:
+	case AUDIT_FIRST_USER_MSG ... AUDIT_LAST_USER_MSG:
+	case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
+		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
+			return 0;
+
+		err = audit_filter_user(&NETLINK_CB(skb));
+		if (err == 1) {
+			err = 0;
+			if (msg_type == AUDIT_USER_TTY) {
+				err = audit_prepare_user_tty(pid, loginuid,
+							     sessionid);
+				if (err)
+					break;
+			}
+			audit_log_common_recv_msg(&ab, msg_type, pid, uid,
+						  loginuid, sessionid, sid);
+
+			if (msg_type != AUDIT_USER_TTY)
+				audit_log_format(ab, " msg='%.1024s'",
+						 (char *)data);
+			else {
+				int size;
+
+				audit_log_format(ab, " msg=");
+				size = nlmsg_len(nlh);
+				if (size > 0 &&
+				    ((unsigned char *)data)[size - 1] == '\0')
+					size--;
+				audit_log_n_untrustedstring(ab, data, size);
+			}
+			audit_set_pid(ab, pid);
+			audit_log_end(ab);
+		}
+		break;
+	case AUDIT_ADD:
+	case AUDIT_DEL:
+		if (nlmsg_len(nlh) < sizeof(struct audit_rule))
+			return -EINVAL;
+		if (audit_enabled == AUDIT_LOCKED) {
+			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
+						  uid, loginuid, sessionid, sid);
+
+			audit_log_format(ab, " audit_enabled=%d res=0",
+					 audit_enabled);
+			audit_log_end(ab);
+			return -EPERM;
+		}
+		/* fallthrough */
+	case AUDIT_LIST:
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
+					   uid, seq, data, nlmsg_len(nlh),
+					   loginuid, sessionid, sid);
+		break;
+	case AUDIT_ADD_RULE:
+	case AUDIT_DEL_RULE:
+		if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
+			return -EINVAL;
+		if (audit_enabled == AUDIT_LOCKED) {
+			audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
+						  uid, loginuid, sessionid, sid);
+
+			audit_log_format(ab, " audit_enabled=%d res=0",
+					 audit_enabled);
+			audit_log_end(ab);
+			return -EPERM;
+		}
+		/* fallthrough */
+	case AUDIT_LIST_RULES:
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
+					   uid, seq, data, nlmsg_len(nlh),
+					   loginuid, sessionid, sid);
+		break;
+	case AUDIT_TRIM:
+		audit_trim_trees();
+
+		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
+					  uid, loginuid, sessionid, sid);
+
+		audit_log_format(ab, " op=trim res=1");
+		audit_log_end(ab);
+		break;
+	case AUDIT_MAKE_EQUIV: {
+		void *bufp = data;
+		u32 sizes[2];
+		size_t msglen = nlmsg_len(nlh);
+		char *old, *new;
+
+		err = -EINVAL;
+		if (msglen < 2 * sizeof(u32))
+			break;
+		memcpy(sizes, bufp, 2 * sizeof(u32));
+		bufp += 2 * sizeof(u32);
+		msglen -= 2 * sizeof(u32);
+		old = audit_unpack_string(&bufp, &msglen, sizes[0]);
+		if (IS_ERR(old)) {
+			err = PTR_ERR(old);
+			break;
+		}
+		new = audit_unpack_string(&bufp, &msglen, sizes[1]);
+		if (IS_ERR(new)) {
+			err = PTR_ERR(new);
+			kfree(old);
+			break;
+		}
+		/* OK, here comes... */
+		err = audit_tag_tree(old, new);
+
+		audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
+					  uid, loginuid, sessionid, sid);
+
+		audit_log_format(ab, " op=make_equiv old=");
+		audit_log_untrustedstring(ab, old);
+		audit_log_format(ab, " new=");
+		audit_log_untrustedstring(ab, new);
+		audit_log_format(ab, " res=%d", !err);
+		audit_log_end(ab);
+		kfree(old);
+		kfree(new);
+		break;
+	}
+	case AUDIT_SIGNAL_INFO:
+		len = 0;
+		if (audit_sig_sid) {
+			err = security_secid_to_secctx(audit_sig_sid, &ctx, &len);
+			if (err)
+				return err;
+		}
+		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+		if (!sig_data) {
+			if (audit_sig_sid)
+				security_release_secctx(ctx, len);
+			return -ENOMEM;
+		}
+		sig_data->uid = audit_sig_uid;
+		sig_data->pid = audit_sig_pid;
+		if (audit_sig_sid) {
+			memcpy(sig_data->ctx, ctx, len);
+			security_release_secctx(ctx, len);
+		}
+		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
+				0, 0, sig_data, sizeof(*sig_data) + len);
+		kfree(sig_data);
+		break;
+	case AUDIT_TTY_GET: {
+		struct audit_tty_status s;
+		struct task_struct *tsk;
+		unsigned long flags;
+
+		rcu_read_lock();
+		tsk = find_task_by_vpid(pid);
+		if (tsk && lock_task_sighand(tsk, &flags)) {
+			s.enabled = tsk->signal->audit_tty != 0;
+			unlock_task_sighand(tsk, &flags);
+		} else
+			err = -ESRCH;
+		rcu_read_unlock();
+
+		if (!err)
+			audit_send_reply(NETLINK_CB(skb).pid, seq,
+					 AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
+		break;
+	}
+	case AUDIT_TTY_SET: {
+		struct audit_tty_status *s;
+		struct task_struct *tsk;
+		unsigned long flags;
+
+		/* payload length, not total message length (see AUDIT_SET) */
+		if (nlmsg_len(nlh) < sizeof(struct audit_tty_status))
+			return -EINVAL;
+		s = data;
+		if (s->enabled != 0 && s->enabled != 1)
+			return -EINVAL;
+		rcu_read_lock();
+		tsk = find_task_by_vpid(pid);
+		if (tsk && lock_task_sighand(tsk, &flags)) {
+			tsk->signal->audit_tty = s->enabled != 0;
+			unlock_task_sighand(tsk, &flags);
+		} else
+			err = -ESRCH;
+		rcu_read_unlock();
+		break;
+	}
+	default:
+		err = -EINVAL;
+		break;
+	}
+
+	return err < 0 ? err : 0;
+}
+
+/*
+ * Get message from skb. Each message is processed by audit_receive_msg.
+ * Malformed skbs with wrong length are discarded silently.
+ */
+static void audit_receive_skb(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh;
+	/*
+	 * len MUST be signed for NLMSG_NEXT to be able to dec it below 0
+	 * if the nlmsg_len was not aligned
+	 */
+	int len;
+	int err;
+
+	nlh = nlmsg_hdr(skb);
+	len = skb->len;
+
+	while (NLMSG_OK(nlh, len)) {
+		err = audit_receive_msg(skb, nlh);
+		/* if err or if this message says it wants a response */
+		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
+			netlink_ack(skb, nlh, err);
+
+		nlh = NLMSG_NEXT(nlh, len);
+	}
+}
+
+/* Receive messages from netlink socket. */
+static void audit_receive(struct sk_buff *skb)
+{
+	mutex_lock(&audit_cmd_mutex);
+	audit_receive_skb(skb);
+	mutex_unlock(&audit_cmd_mutex);
+}
+
+/* Initialize audit support at boot time. */
+static int __init audit_init(void)
+{
+	int i;
+
+	if (audit_initialized == AUDIT_DISABLED)
+		return 0;
+
+	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
+	       audit_default ? "enabled" : "disabled");
+	audit_sock = netlink_kernel_create(&init_net, NETLINK_AUDIT, 0,
+					   audit_receive, NULL, THIS_MODULE);
+	if (!audit_sock)
+		audit_panic("cannot initialize netlink socket");
+	else
+		audit_sock->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+
+	skb_queue_head_init(&audit_skb_queue);
+	skb_queue_head_init(&audit_skb_hold_queue);
+	audit_initialized = AUDIT_INITIALIZED;
+	audit_enabled = audit_default;
+	audit_ever_enabled |= !!audit_default;
+
+	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+		INIT_LIST_HEAD(&audit_inode_hash[i]);
+
+	return 0;
+}
+__initcall(audit_init);
+
+/* Process kernel command-line parameter at boot time. audit=0 or audit=1. */
+static int __init audit_enable(char *str)
+{
+	audit_default = !!simple_strtol(str, NULL, 0);
+	if (!audit_default)
+		audit_initialized = AUDIT_DISABLED;
+
+	printk(KERN_INFO "audit: %s", audit_default ? "enabled" : "disabled");
+
+	if (audit_initialized == AUDIT_INITIALIZED) {
+		audit_enabled = audit_default;
+		audit_ever_enabled |= !!audit_default;
+	} else if (audit_initialized == AUDIT_UNINITIALIZED) {
+		printk(" (after initialization)");
+	} else {
+		printk(" (until reboot)");
+	}
+	printk("\n");
+
+	return 1;
+}
+
+__setup("audit=", audit_enable);
+
+/* Release @ab's skb and return @ab to the freelist (or free it). NULL ok. */
+static void audit_buffer_free(struct audit_buffer *ab)
+{
+	unsigned long flags;
+
+	if (!ab)
+		return;
+
+	if (ab->skb)
+		kfree_skb(ab->skb);
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (audit_freelist_count > AUDIT_MAXFREE)
+		kfree(ab);
+	else {
+		audit_freelist_count++;
+		list_add(&ab->list, &audit_freelist);
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+}
+
+/*
+ * Take an audit_buffer from the freelist (or kmalloc one) and attach a
+ * fresh skb with an empty netlink header of @type. Returns NULL on
+ * allocation failure.
+ */
+static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx,
+						gfp_t gfp_mask, int type)
+{
+	unsigned long flags;
+	struct audit_buffer *ab = NULL;
+	struct nlmsghdr *nlh;
+
+	spin_lock_irqsave(&audit_freelist_lock, flags);
+	if (!list_empty(&audit_freelist)) {
+		ab = list_entry(audit_freelist.next,
+				struct audit_buffer, list);
+		list_del(&ab->list);
+		--audit_freelist_count;
+	}
+	spin_unlock_irqrestore(&audit_freelist_lock, flags);
+
+	if (!ab) {
+		ab = kmalloc(sizeof(*ab), gfp_mask);
+		if (!ab)
+			goto err;
+	}
+
+	ab->ctx = ctx;
+	ab->gfp_mask = gfp_mask;
+
+	ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask);
+	if (!ab->skb)
+		goto nlmsg_failure;
+
+	nlh = NLMSG_NEW(ab->skb, 0, 0, type, 0, 0);
+
+	return ab;
+
+nlmsg_failure: /* Used by NLMSG_NEW */
+	kfree_skb(ab->skb);
+	ab->skb = NULL;
+err:
+	audit_buffer_free(ab);
+	return NULL;
+}
+
+/**
+ * audit_serial - compute a serial number for the audit record
+ *
+ * Compute a serial number for the audit record. Audit records are
+ * written to user-space as soon as they are generated, so a complete
+ * audit record may be written in several pieces. The timestamp of the
+ * record and this serial number are used by the user-space tools to
+ * determine which pieces belong to the same audit record. The
+ * (timestamp,serial) tuple is unique for each syscall and is live from
+ * syscall entry to syscall exit.
+ *
+ * NOTE: Another possibility is to store the formatted records off the
+ * audit context (for those records that have a context), and emit them
+ * all at syscall exit. However, this could delay the reporting of
+ * significant errors until syscall exit (or never, if the system
+ * halts).
+ */
+unsigned int audit_serial(void)
+{
+	static DEFINE_SPINLOCK(serial_lock);
+	static unsigned int serial = 0;
+
+	unsigned long flags;
+	unsigned int ret;
+
+	spin_lock_irqsave(&serial_lock, flags);
+	do {
+		ret = ++serial;
+	} while (unlikely(!ret));	/* never hand out 0, even after wrap */
+	spin_unlock_irqrestore(&serial_lock, flags);
+
+	return ret;
+}
+
+/* Use the context's stamp when it has one, else the current time and a new serial. */
+static inline void audit_get_stamp(struct audit_context *ctx,
+				   struct timespec *t, unsigned int *serial)
+{
+	if (!ctx || !auditsc_get_stamp(ctx, t, serial)) {
+		*t = CURRENT_TIME;
+		*serial = audit_serial();
+	}
+}
+
+/* Obtain an audit buffer.
This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the tsk is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, tsk
+ * should be NULL. */
+
+/**
+ * audit_log_start - obtain an audit buffer
+ * @ctx: audit_context (may be NULL)
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ *
+ * Returns audit_buffer pointer on success or NULL on error.
+ *
+ * Obtain an audit buffer. This routine does locking to obtain the
+ * audit buffer, but then no locking is required for calls to
+ * audit_log_*format. If the task (ctx) is a task that is currently in a
+ * syscall, then the syscall is marked as auditable and an audit record
+ * will be written at syscall exit. If there is no associated task, then
+ * task context (ctx) should be NULL.
+ *
+ * While the send queue is over audit_backlog_limit, callers that may
+ * sleep (__GFP_WAIT) wait up to audit_backlog_wait_time for it to drain;
+ * otherwise the record is dropped and counted via audit_log_lost().
+ */
+struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
+				     int type)
+{
+	struct audit_buffer *ab = NULL;
+	struct timespec t;
+	unsigned int uninitialized_var(serial);
+	int reserve;
+	unsigned long timeout_start = jiffies;
+
+	if (audit_initialized != AUDIT_INITIALIZED)
+		return NULL;
+
+	if (unlikely(audit_filter_type(type)))
+		return NULL;
+
+	if (gfp_mask & __GFP_WAIT)
+		reserve = 0;
+	else
+		reserve = 5; /* Allow atomic callers to go up to five
+				entries over the normal backlog limit */
+
+	while (audit_backlog_limit
+	       && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) {
+		if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time
+		    && time_before(jiffies, timeout_start + audit_backlog_wait_time)) {
+
+			/* Wait for auditd to drain the queue a little */
+			DECLARE_WAITQUEUE(wait, current);
+			long sleep_time;
+
+			set_current_state(TASK_INTERRUPTIBLE);
+			add_wait_queue(&audit_backlog_wait, &wait);
+
+			if (audit_backlog_limit &&
+			    skb_queue_len(&audit_skb_queue) > audit_backlog_limit) {
+				/*
+				 * jiffies and audit_backlog_wait_time may have
+				 * moved since the time_before() test above;
+				 * never pass a zero or negative timeout to
+				 * schedule_timeout().
+				 */
+				sleep_time = timeout_start + audit_backlog_wait_time - jiffies;
+				if (sleep_time > 0)
+					schedule_timeout(sleep_time);
+			}
+
+			__set_current_state(TASK_RUNNING);
+			remove_wait_queue(&audit_backlog_wait, &wait);
+			continue;
+		}
+		if (audit_rate_check() && printk_ratelimit())
+			printk(KERN_WARNING
+			       "audit: audit_backlog=%d > "
+			       "audit_backlog_limit=%d\n",
+			       skb_queue_len(&audit_skb_queue),
+			       audit_backlog_limit);
+		audit_log_lost("backlog limit exceeded");
+		/* after an overflow, later callers use the overflow wait time */
+		audit_backlog_wait_time = audit_backlog_wait_overflow;
+		wake_up(&audit_backlog_wait);
+		return NULL;
+	}
+
+	ab = audit_buffer_alloc(ctx, gfp_mask, type);
+	if (!ab) {
+		audit_log_lost("out of memory in audit_log_start");
+		return NULL;
+	}
+
+	audit_get_stamp(ab->ctx, &t, &serial);
+
+	audit_log_format(ab, "audit(%lu.%03lu:%u): ",
+			 t.tv_sec, t.tv_nsec/1000000, serial);
+	return ab;
+}
+
+/**
+ * audit_expand - expand skb in the audit buffer
+ * @ab: audit_buffer
+ * @extra: space to add at tail of the skb
+ *
+ * Returns 0 (no space) on failed expansion, or available space if
+ * successful.
+ */
+static inline int audit_expand(struct audit_buffer *ab, int extra)
+{
+	struct sk_buff *skb = ab->skb;
+	int oldtail = skb_tailroom(skb);
+	int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask);
+	int newtail = skb_tailroom(skb);
+
+	if (ret < 0) {
+		audit_log_lost("out of memory in audit_expand");
+		return 0;
+	}
+
+	/* account for the extra tail room in the skb's truesize */
+	skb->truesize += newtail - oldtail;
+	return newtail;
+}
+
+/*
+ * Format an audit message into the audit buffer. If there isn't enough
+ * room in the audit buffer, more room will be allocated and vsnprint
+ * will be called a second time. Currently, we assume that a printk
+ * can't format message larger than 1024 bytes, so we don't either.
+ */
+static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
+			      va_list args)
+{
+	int len, avail;
+	struct sk_buff *skb;
+	va_list args2;
+
+	if (!ab)
+		return;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	if (avail == 0) {
+		avail = audit_expand(ab, AUDIT_BUFSIZ);
+		if (!avail)
+			goto out;
+	}
+	/* keep a copy: @args is consumed by the first vsnprintf() */
+	va_copy(args2, args);
+	len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args);
+	if (len >= avail) {
+		/* The printk buffer is 1024 bytes long, so if we get
+		 * here and AUDIT_BUFSIZ is at least 1024, then we can
+		 * log everything that printk could have logged. */
+		avail = audit_expand(ab,
+			max_t(unsigned, AUDIT_BUFSIZ, 1+len-avail));
+		if (!avail)
+			goto out_va_end;
+		len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
+	}
+	if (len > 0)
+		skb_put(skb, len);
+out_va_end:
+	/* every va_copy() needs a matching va_end(), also on failure */
+	va_end(args2);
+out:
+	return;
+}
+
+/**
+ * audit_log_format - format a message into the audit buffer.
+ * @ab: audit_buffer
+ * @fmt: format string
+ * @...: optional parameters matching @fmt string
+ *
+ * All the work is done in audit_log_vformat.
+ */
+void audit_log_format(struct audit_buffer *ab, const char *fmt, ...)
+{
+	va_list args;
+
+	if (!ab)
+		return;
+	va_start(args, fmt);
+	audit_log_vformat(ab, fmt, args);
+	va_end(args);
+}
+
+/**
+ * audit_log_n_hex - convert a buffer to hex and append it to the audit skb
+ * @ab: the audit_buffer
+ * @buf: buffer to convert to hex
+ * @len: length of @buf to be converted
+ *
+ * No return value; failure to expand is silently ignored.
+ *
+ * This function will take the passed buf and convert it into a string of
+ * ascii hex digits. The new string is placed onto the skb.
+ */
+void audit_log_n_hex(struct audit_buffer *ab, const unsigned char *buf,
+		size_t len)
+{
+	int i, avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+	static const unsigned char *hex = "0123456789ABCDEF";
+
+	if (!ab)
+		return;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = len<<1;
+	if (new_len >= avail) {
+		/* Round the buffer request up to the next multiple */
+		new_len = AUDIT_BUFSIZ*(((new_len-avail)/AUDIT_BUFSIZ) + 1);
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+
+	ptr = skb_tail_pointer(skb);
+	/* two output characters per input byte, most significant nibble first */
+	for (i=0; i<len; i++) {
+		*ptr++ = hex[(buf[i] & 0xF0)>>4]; /* Upper nibble */
+		*ptr++ = hex[buf[i] & 0x0F];	  /* Lower nibble */
+	}
+	*ptr = 0;
+	skb_put(skb, len << 1); /* new string is twice the old string */
+}
+
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+void audit_log_n_string(struct audit_buffer *ab, const char *string,
+			size_t slen)
+{
+	int avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+
+	if (!ab)
+		return;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = slen + 3;	/* enclosing quotes + null terminator */
+	if (new_len > avail) {
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+	ptr = skb_tail_pointer(skb);
+	*ptr++ = '"';
+	memcpy(ptr, string, slen);
+	ptr += slen;
+	*ptr++ = '"';
+	*ptr = 0;
+	skb_put(skb, slen + 2);	/* don't include null terminator */
+}
+
+/**
+ * audit_string_contains_control - does a string need to be logged in hex
+ * @string: string to be checked
+ * @len: max length of the string to check
+ *
+ * Returns 1 if any of the first @len bytes is a double quote or lies
+ * outside 0x21..0x7e, otherwise 0.
+ */
+int audit_string_contains_control(const char *string, size_t len)
+{
+	const unsigned char *p;
+	for (p = (const unsigned char *)string;
+	     p < (const unsigned char *)string + len; p++) {
+		if (*p == '"' || *p < 0x21 || *p > 0x7e)
+			return 1;
+	}
+	return 0;
+}
+
+/**
+ * audit_log_n_untrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @len: length of string (not
including trailing null)
+ * @string: string to be logged
+ *
+ * This code will escape a string that is passed to it if the string
+ * contains a control character, unprintable character, double quote mark,
+ * or a space. Unescaped strings will start and end with a double quote mark.
+ * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
+ */
+void audit_log_n_untrustedstring(struct audit_buffer *ab, const char *string,
+				 size_t len)
+{
+	if (audit_string_contains_control(string, len))
+		audit_log_n_hex(ab, string, len);
+	else
+		audit_log_n_string(ab, string, len);
+}
+
+/**
+ * audit_log_untrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_untrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+	audit_log_n_untrustedstring(ab, string, strlen(string));
+}
+
+/**
+ * audit_log_d_path - log the escaped d_path of a path
+ * @ab: audit_buffer; nothing is logged when it is NULL
+ * @prefix: optional text, logged after a single space, ahead of the path
+ * @path: path to resolve with d_path()
+ *
+ * Logs "<no_memory>" when the scratch buffer cannot be allocated and
+ * "<too_long>" when d_path() fails, so the record still carries a value.
+ */
+void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
+		      struct path *path)
+{
+	char *p, *pathname;
+
+	/* ab->gfp_mask is read below; bail out like the other helpers do */
+	if (!ab)
+		return;
+
+	if (prefix)
+		audit_log_format(ab, " %s", prefix);
+
+	/* We will allow 11 spaces for ' (deleted)' to be appended */
+	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
+	if (!pathname) {
+		audit_log_string(ab, "<no_memory>");
+		return;
+	}
+	p = d_path(path, pathname, PATH_MAX+11);
+	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
+		/* FIXME: can we save some information here? */
+		audit_log_string(ab, "<too_long>");
+	} else
+		audit_log_untrustedstring(ab, p);
+	kfree(pathname);
+}
+
+/* Append " key=" followed by the escaped @key, or "(null)" when @key is NULL. */
+void audit_log_key(struct audit_buffer *ab, char *key)
+{
+	audit_log_format(ab, " key=");
+	if (key)
+		audit_log_untrustedstring(ab, key);
+	else
+		audit_log_format(ab, "(null)");
+}
+
+/**
+ * audit_log_end - end one audit record
+ * @ab: the audit_buffer
+ *
+ * The netlink_* functions cannot be called inside an irq context, so
+ * the audit buffer is placed on a queue and a tasklet is scheduled to
+ * remove them from the queue outside the irq context.  May be called in
+ * any context.
+ */
+void audit_log_end(struct audit_buffer *ab)
+{
+	if (!ab)
+		return;
+	if (!audit_rate_check()) {
+		audit_log_lost("rate limit exceeded");
+	} else {
+		struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+		nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0);
+
+		if (audit_pid) {
+			skb_queue_tail(&audit_skb_queue, ab->skb);
+			wake_up_interruptible(&kauditd_wait);
+		} else {
+			audit_printk_skb(ab->skb);
+		}
+		/* the skb has been handed off; keep audit_buffer_free() away from it */
+		ab->skb = NULL;
+	}
+	audit_buffer_free(ab);
+}
+
+/**
+ * audit_log - Log an audit record
+ * @ctx: audit context
+ * @gfp_mask: type of allocation
+ * @type: audit message type
+ * @fmt: format string to use
+ * @...: variable parameters matching the format string
+ *
+ * This is a convenience function that calls audit_log_start,
+ * audit_log_vformat, and audit_log_end.  It may be called
+ * in any context.
+ */
+void audit_log(struct audit_context *ctx, gfp_t gfp_mask, int type,
+	       const char *fmt, ...)
+{ + struct audit_buffer *ab; + va_list args; + + ab = audit_log_start(ctx, gfp_mask, type); + if (ab) { + va_start(args, fmt); + audit_log_vformat(ab, fmt, args); + va_end(args); + audit_log_end(ab); + } +} + +EXPORT_SYMBOL(audit_log_start); +EXPORT_SYMBOL(audit_log_end); +EXPORT_SYMBOL(audit_log_format); +EXPORT_SYMBOL(audit_log); diff --git a/kernel/audit.h b/kernel/audit.h new file mode 100644 index 00000000..91e7071c --- /dev/null +++ b/kernel/audit.h @@ -0,0 +1,170 @@ +/* audit -- definition of audit_context structure and supporting types + * + * Copyright 2003-2004 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include + +/* 0 = no checking + 1 = put_count checking + 2 = verbose put_count checking +*/ +#define AUDIT_DEBUG 0 + +/* At task start time, the audit_state is set in the audit_context using + a per-task filter. At syscall entry, the audit_state is augmented by + the syscall filter. */ +enum audit_state { + AUDIT_DISABLED, /* Do not create per-task audit_context. + * No syscall-specific audit records can + * be generated. 
*/ + AUDIT_SETUP_CONTEXT, /* Create the per-task audit_context, + * but don't necessarily fill it in at + * syscall entry time (i.e., filter + * instead). */ + AUDIT_BUILD_CONTEXT, /* Create the per-task audit_context, + * and always fill it in at syscall + * entry time. This makes a full + * syscall record available if some + * other part of the kernel decides it + * should be recorded. */ + AUDIT_RECORD_CONTEXT /* Create the per-task audit_context, + * always fill it in at syscall entry + * time, and always write out the audit + * record at syscall exit time. */ +}; + +/* Rule lists */ +struct audit_watch; +struct audit_tree; +struct audit_chunk; + +struct audit_entry { + struct list_head list; + struct rcu_head rcu; + struct audit_krule rule; +}; + +#ifdef CONFIG_AUDIT +extern int audit_enabled; +extern int audit_ever_enabled; +#endif + +extern int audit_pid; + +#define AUDIT_INODE_BUCKETS 32 +extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; + +static inline int audit_hash_ino(u32 ino) +{ + return (ino & (AUDIT_INODE_BUCKETS-1)); +} + +extern int audit_match_class(int class, unsigned syscall); +extern int audit_comparator(const u32 left, const u32 op, const u32 right); +extern int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen); +extern struct sk_buff * audit_make_reply(int pid, int seq, int type, + int done, int multi, + const void *payload, int size); +extern void audit_panic(const char *message); + +struct audit_netlink_list { + int pid; + struct sk_buff_head q; +}; + +int audit_send_list(void *); + +extern int selinux_audit_rule_update(void); + +extern struct mutex audit_filter_mutex; +extern void audit_free_rule_rcu(struct rcu_head *); +extern struct list_head audit_filter_list[]; + +extern struct audit_entry *audit_dupe_rule(struct audit_krule *old); + +/* audit watch functions */ +#ifdef CONFIG_AUDIT_WATCH +extern void audit_put_watch(struct audit_watch *watch); +extern void audit_get_watch(struct audit_watch 
*watch); +extern int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op); +extern int audit_add_watch(struct audit_krule *krule, struct list_head **list); +extern void audit_remove_watch_rule(struct audit_krule *krule); +extern char *audit_watch_path(struct audit_watch *watch); +extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); +#else +#define audit_put_watch(w) {} +#define audit_get_watch(w) {} +#define audit_to_watch(k, p, l, o) (-EINVAL) +#define audit_add_watch(k, l) (-EINVAL) +#define audit_remove_watch_rule(k) BUG() +#define audit_watch_path(w) "" +#define audit_watch_compare(w, i, d) 0 + +#endif /* CONFIG_AUDIT_WATCH */ + +#ifdef CONFIG_AUDIT_TREE +extern struct audit_chunk *audit_tree_lookup(const struct inode *); +extern void audit_put_chunk(struct audit_chunk *); +extern int audit_tree_match(struct audit_chunk *, struct audit_tree *); +extern int audit_make_tree(struct audit_krule *, char *, u32); +extern int audit_add_tree_rule(struct audit_krule *); +extern int audit_remove_tree_rule(struct audit_krule *); +extern void audit_trim_trees(void); +extern int audit_tag_tree(char *old, char *new); +extern const char *audit_tree_path(struct audit_tree *); +extern void audit_put_tree(struct audit_tree *); +extern void audit_kill_trees(struct list_head *); +#else +#define audit_remove_tree_rule(rule) BUG() +#define audit_add_tree_rule(rule) -EINVAL +#define audit_make_tree(rule, str, op) -EINVAL +#define audit_trim_trees() (void)0 +#define audit_put_tree(tree) (void)0 +#define audit_tag_tree(old, new) -EINVAL +#define audit_tree_path(rule) "" /* never called */ +#define audit_kill_trees(list) BUG() +#endif + +extern char *audit_unpack_string(void **, size_t *, size_t); + +extern pid_t audit_sig_pid; +extern uid_t audit_sig_uid; +extern u32 audit_sig_sid; + +#ifdef CONFIG_AUDITSYSCALL +extern int __audit_signal_info(int sig, struct task_struct *t); +static inline int audit_signal_info(int sig, struct 
task_struct *t) +{ + if (unlikely((audit_pid && t->tgid == audit_pid) || + (audit_signals && !audit_dummy_context()))) + return __audit_signal_info(sig, t); + return 0; +} +extern void audit_filter_inodes(struct task_struct *, struct audit_context *); +extern struct list_head *audit_killed_trees(void); +#else +#define audit_signal_info(s,t) AUDIT_DISABLED +#define audit_filter_inodes(t,c) AUDIT_DISABLED +#endif + +extern struct mutex audit_cmd_mutex; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c new file mode 100644 index 00000000..e99dda04 --- /dev/null +++ b/kernel/audit_tree.c @@ -0,0 +1,957 @@ +#include "audit.h" +#include +#include +#include +#include +#include + +struct audit_tree; +struct audit_chunk; + +struct audit_tree { + atomic_t count; + int goner; + struct audit_chunk *root; + struct list_head chunks; + struct list_head rules; + struct list_head list; + struct list_head same_root; + struct rcu_head head; + char pathname[]; +}; + +struct audit_chunk { + struct list_head hash; + struct fsnotify_mark mark; + struct list_head trees; /* with root here */ + int dead; + int count; + atomic_long_t refs; + struct rcu_head head; + struct node { + struct list_head list; + struct audit_tree *owner; + unsigned index; /* index; upper bit indicates 'will prune' */ + } owners[]; +}; + +static LIST_HEAD(tree_list); +static LIST_HEAD(prune_list); + +/* + * One struct chunk is attached to each inode of interest. + * We replace struct chunk on tagging/untagging. + * Rules have pointer to struct audit_tree. + * Rules have struct list_head rlist forming a list of rules over + * the same tree. + * References to struct chunk are collected at audit_inode{,_child}() + * time and used in AUDIT_TREE rule matching. + * These references are dropped at the same time we are calling + * audit_free_names(), etc. 
+ * + * Cyclic lists galore: + * tree.chunks anchors chunk.owners[].list hash_lock + * tree.rules anchors rule.rlist audit_filter_mutex + * chunk.trees anchors tree.same_root hash_lock + * chunk.hash is a hash with middle bits of watch.inode as + * a hash function. RCU, hash_lock + * + * tree is refcounted; one reference for "some rules on rules_list refer to + * it", one for each chunk with pointer to it. + * + * chunk is refcounted by embedded fsnotify_mark + .refs (non-zero refcount + * of watch contributes 1 to .refs). + * + * node.index allows to get from node.list to containing chunk. + * MSB of that sucker is stolen to mark taggings that we might have to + * revert - several operations have very unpleasant cleanup logics and + * that makes a difference. Some. + */ + +static struct fsnotify_group *audit_tree_group; + +static struct audit_tree *alloc_tree(const char *s) +{ + struct audit_tree *tree; + + tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL); + if (tree) { + atomic_set(&tree->count, 1); + tree->goner = 0; + INIT_LIST_HEAD(&tree->chunks); + INIT_LIST_HEAD(&tree->rules); + INIT_LIST_HEAD(&tree->list); + INIT_LIST_HEAD(&tree->same_root); + tree->root = NULL; + strcpy(tree->pathname, s); + } + return tree; +} + +static inline void get_tree(struct audit_tree *tree) +{ + atomic_inc(&tree->count); +} + +static void __put_tree(struct rcu_head *rcu) +{ + struct audit_tree *tree = container_of(rcu, struct audit_tree, head); + kfree(tree); +} + +static inline void put_tree(struct audit_tree *tree) +{ + if (atomic_dec_and_test(&tree->count)) + call_rcu(&tree->head, __put_tree); +} + +/* to avoid bringing the entire thing in audit.h */ +const char *audit_tree_path(struct audit_tree *tree) +{ + return tree->pathname; +} + +static void free_chunk(struct audit_chunk *chunk) +{ + int i; + + for (i = 0; i < chunk->count; i++) { + if (chunk->owners[i].owner) + put_tree(chunk->owners[i].owner); + } + kfree(chunk); +} + +void audit_put_chunk(struct 
audit_chunk *chunk) +{ + if (atomic_long_dec_and_test(&chunk->refs)) + free_chunk(chunk); +} + +static void __put_chunk(struct rcu_head *rcu) +{ + struct audit_chunk *chunk = container_of(rcu, struct audit_chunk, head); + audit_put_chunk(chunk); +} + +static void audit_tree_destroy_watch(struct fsnotify_mark *entry) +{ + struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark); + call_rcu(&chunk->head, __put_chunk); +} + +static struct audit_chunk *alloc_chunk(int count) +{ + struct audit_chunk *chunk; + size_t size; + int i; + + size = offsetof(struct audit_chunk, owners) + count * sizeof(struct node); + chunk = kzalloc(size, GFP_KERNEL); + if (!chunk) + return NULL; + + INIT_LIST_HEAD(&chunk->hash); + INIT_LIST_HEAD(&chunk->trees); + chunk->count = count; + atomic_long_set(&chunk->refs, 1); + for (i = 0; i < count; i++) { + INIT_LIST_HEAD(&chunk->owners[i].list); + chunk->owners[i].index = i; + } + fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + return chunk; +} + +enum {HASH_SIZE = 128}; +static struct list_head chunk_hash_heads[HASH_SIZE]; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); + +static inline struct list_head *chunk_hash(const struct inode *inode) +{ + unsigned long n = (unsigned long)inode / L1_CACHE_BYTES; + return chunk_hash_heads + n % HASH_SIZE; +} + +/* hash_lock & entry->lock is held by caller */ +static void insert_hash(struct audit_chunk *chunk) +{ + struct fsnotify_mark *entry = &chunk->mark; + struct list_head *list; + + if (!entry->i.inode) + return; + list = chunk_hash(entry->i.inode); + list_add_rcu(&chunk->hash, list); +} + +/* called under rcu_read_lock */ +struct audit_chunk *audit_tree_lookup(const struct inode *inode) +{ + struct list_head *list = chunk_hash(inode); + struct audit_chunk *p; + + list_for_each_entry_rcu(p, list, hash) { + /* mark.inode may have gone NULL, but who cares? 
*/ + if (p->mark.i.inode == inode) { + atomic_long_inc(&p->refs); + return p; + } + } + return NULL; +} + +int audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree) +{ + int n; + for (n = 0; n < chunk->count; n++) + if (chunk->owners[n].owner == tree) + return 1; + return 0; +} + +/* tagging and untagging inodes with trees */ + +static struct audit_chunk *find_chunk(struct node *p) +{ + int index = p->index & ~(1U<<31); + p -= index; + return container_of(p, struct audit_chunk, owners[0]); +} + +static void untag_chunk(struct node *p) +{ + struct audit_chunk *chunk = find_chunk(p); + struct fsnotify_mark *entry = &chunk->mark; + struct audit_chunk *new = NULL; + struct audit_tree *owner; + int size = chunk->count - 1; + int i, j; + + fsnotify_get_mark(entry); + + spin_unlock(&hash_lock); + + if (size) + new = alloc_chunk(size); + + spin_lock(&entry->lock); + if (chunk->dead || !entry->i.inode) { + spin_unlock(&entry->lock); + if (new) + free_chunk(new); + goto out; + } + + owner = p->owner; + + if (!size) { + chunk->dead = 1; + spin_lock(&hash_lock); + list_del_init(&chunk->trees); + if (owner->root == chunk) + owner->root = NULL; + list_del_init(&p->list); + list_del_rcu(&chunk->hash); + spin_unlock(&hash_lock); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); + goto out; + } + + if (!new) + goto Fallback; + + fsnotify_duplicate_mark(&new->mark, entry); + if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) { + free_chunk(new); + goto Fallback; + } + + chunk->dead = 1; + spin_lock(&hash_lock); + list_replace_init(&chunk->trees, &new->trees); + if (owner->root == chunk) { + list_del_init(&owner->same_root); + owner->root = NULL; + } + + for (i = j = 0; j <= size; i++, j++) { + struct audit_tree *s; + if (&chunk->owners[j] == p) { + list_del_init(&p->list); + i--; + continue; + } + s = chunk->owners[j].owner; + new->owners[i].owner = s; + new->owners[i].index = chunk->owners[j].index - 
j + i; + if (!s) /* result of earlier fallback */ + continue; + get_tree(s); + list_replace_init(&chunk->owners[j].list, &new->owners[i].list); + } + + list_replace_rcu(&chunk->hash, &new->hash); + list_for_each_entry(owner, &new->trees, same_root) + owner->root = new; + spin_unlock(&hash_lock); + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); + goto out; + +Fallback: + // do the best we can + spin_lock(&hash_lock); + if (owner->root == chunk) { + list_del_init(&owner->same_root); + owner->root = NULL; + } + list_del_init(&p->list); + p->owner = NULL; + put_tree(owner); + spin_unlock(&hash_lock); + spin_unlock(&entry->lock); +out: + fsnotify_put_mark(entry); + spin_lock(&hash_lock); +} + +static int create_chunk(struct inode *inode, struct audit_tree *tree) +{ + struct fsnotify_mark *entry; + struct audit_chunk *chunk = alloc_chunk(1); + if (!chunk) + return -ENOMEM; + + entry = &chunk->mark; + if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { + free_chunk(chunk); + return -ENOSPC; + } + + spin_lock(&entry->lock); + spin_lock(&hash_lock); + if (tree->goner) { + spin_unlock(&hash_lock); + chunk->dead = 1; + spin_unlock(&entry->lock); + fsnotify_destroy_mark(entry); + fsnotify_put_mark(entry); + return 0; + } + chunk->owners[0].index = (1U << 31); + chunk->owners[0].owner = tree; + get_tree(tree); + list_add(&chunk->owners[0].list, &tree->chunks); + if (!tree->root) { + tree->root = chunk; + list_add(&tree->same_root, &chunk->trees); + } + insert_hash(chunk); + spin_unlock(&hash_lock); + spin_unlock(&entry->lock); + return 0; +} + +/* the first tagged inode becomes root of tree */ +static int tag_chunk(struct inode *inode, struct audit_tree *tree) +{ + struct fsnotify_mark *old_entry, *chunk_entry; + struct audit_tree *owner; + struct audit_chunk *chunk, *old; + struct node *p; + int n; + + old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); + if (!old_entry) + return create_chunk(inode, tree); + + 
old = container_of(old_entry, struct audit_chunk, mark); + + /* are we already there? */ + spin_lock(&hash_lock); + for (n = 0; n < old->count; n++) { + if (old->owners[n].owner == tree) { + spin_unlock(&hash_lock); + fsnotify_put_mark(old_entry); + return 0; + } + } + spin_unlock(&hash_lock); + + chunk = alloc_chunk(old->count + 1); + if (!chunk) { + fsnotify_put_mark(old_entry); + return -ENOMEM; + } + + chunk_entry = &chunk->mark; + + spin_lock(&old_entry->lock); + if (!old_entry->i.inode) { + /* old_entry is being shot, lets just lie */ + spin_unlock(&old_entry->lock); + fsnotify_put_mark(old_entry); + free_chunk(chunk); + return -ENOENT; + } + + fsnotify_duplicate_mark(chunk_entry, old_entry); + if (fsnotify_add_mark(chunk_entry, chunk_entry->group, chunk_entry->i.inode, NULL, 1)) { + spin_unlock(&old_entry->lock); + free_chunk(chunk); + fsnotify_put_mark(old_entry); + return -ENOSPC; + } + + /* even though we hold old_entry->lock, this is safe since chunk_entry->lock could NEVER have been grabbed before */ + spin_lock(&chunk_entry->lock); + spin_lock(&hash_lock); + + /* we now hold old_entry->lock, chunk_entry->lock, and hash_lock */ + if (tree->goner) { + spin_unlock(&hash_lock); + chunk->dead = 1; + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + + fsnotify_destroy_mark(chunk_entry); + + fsnotify_put_mark(chunk_entry); + fsnotify_put_mark(old_entry); + return 0; + } + list_replace_init(&old->trees, &chunk->trees); + for (n = 0, p = chunk->owners; n < old->count; n++, p++) { + struct audit_tree *s = old->owners[n].owner; + p->owner = s; + p->index = old->owners[n].index; + if (!s) /* result of fallback in untag */ + continue; + get_tree(s); + list_replace_init(&old->owners[n].list, &p->list); + } + p->index = (chunk->count - 1) | (1U<<31); + p->owner = tree; + get_tree(tree); + list_add(&p->list, &tree->chunks); + list_replace_rcu(&old->hash, &chunk->hash); + list_for_each_entry(owner, &chunk->trees, same_root) + owner->root = chunk; + 
old->dead = 1; + if (!tree->root) { + tree->root = chunk; + list_add(&tree->same_root, &chunk->trees); + } + spin_unlock(&hash_lock); + spin_unlock(&chunk_entry->lock); + spin_unlock(&old_entry->lock); + fsnotify_destroy_mark(old_entry); + fsnotify_put_mark(old_entry); /* pair to fsnotify_find mark_entry */ + fsnotify_put_mark(old_entry); /* and kill it */ + return 0; +} + +static void kill_rules(struct audit_tree *tree) +{ + struct audit_krule *rule, *next; + struct audit_entry *entry; + struct audit_buffer *ab; + + list_for_each_entry_safe(rule, next, &tree->rules, rlist) { + entry = container_of(rule, struct audit_entry, rule); + + list_del_init(&rule->rlist); + if (rule->tree) { + /* not a half-baked one */ + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + audit_log_format(ab, "op="); + audit_log_string(ab, "remove rule"); + audit_log_format(ab, " dir="); + audit_log_untrustedstring(ab, rule->tree->pathname); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=1", rule->listnr); + audit_log_end(ab); + rule->tree = NULL; + list_del_rcu(&entry->list); + list_del(&entry->rule.list); + call_rcu(&entry->rcu, audit_free_rule_rcu); + } + } +} + +/* + * finish killing struct audit_tree + */ +static void prune_one(struct audit_tree *victim) +{ + spin_lock(&hash_lock); + while (!list_empty(&victim->chunks)) { + struct node *p; + + p = list_entry(victim->chunks.next, struct node, list); + + untag_chunk(p); + } + spin_unlock(&hash_lock); + put_tree(victim); +} + +/* trim the uncommitted chunks from tree */ + +static void trim_marked(struct audit_tree *tree) +{ + struct list_head *p, *q; + spin_lock(&hash_lock); + if (tree->goner) { + spin_unlock(&hash_lock); + return; + } + /* reorder */ + for (p = tree->chunks.next; p != &tree->chunks; p = q) { + struct node *node = list_entry(p, struct node, list); + q = p->next; + if (node->index & (1U<<31)) { + list_del_init(p); + list_add(p, &tree->chunks); + } + } + + while 
(!list_empty(&tree->chunks)) { + struct node *node; + + node = list_entry(tree->chunks.next, struct node, list); + + /* have we run out of marked? */ + if (!(node->index & (1U<<31))) + break; + + untag_chunk(node); + } + if (!tree->root && !tree->goner) { + tree->goner = 1; + spin_unlock(&hash_lock); + mutex_lock(&audit_filter_mutex); + kill_rules(tree); + list_del_init(&tree->list); + mutex_unlock(&audit_filter_mutex); + prune_one(tree); + } else { + spin_unlock(&hash_lock); + } +} + +static void audit_schedule_prune(void); + +/* called with audit_filter_mutex */ +int audit_remove_tree_rule(struct audit_krule *rule) +{ + struct audit_tree *tree; + tree = rule->tree; + if (tree) { + spin_lock(&hash_lock); + list_del_init(&rule->rlist); + if (list_empty(&tree->rules) && !tree->goner) { + tree->root = NULL; + list_del_init(&tree->same_root); + tree->goner = 1; + list_move(&tree->list, &prune_list); + rule->tree = NULL; + spin_unlock(&hash_lock); + audit_schedule_prune(); + return 1; + } + rule->tree = NULL; + spin_unlock(&hash_lock); + return 1; + } + return 0; +} + +static int compare_root(struct vfsmount *mnt, void *arg) +{ + return mnt->mnt_root->d_inode == arg; +} + +void audit_trim_trees(void) +{ + struct list_head cursor; + + mutex_lock(&audit_filter_mutex); + list_add(&cursor, &tree_list); + while (cursor.next != &tree_list) { + struct audit_tree *tree; + struct path path; + struct vfsmount *root_mnt; + struct node *node; + int err; + + tree = container_of(cursor.next, struct audit_tree, list); + get_tree(tree); + list_del(&cursor); + list_add(&cursor, &tree->list); + mutex_unlock(&audit_filter_mutex); + + err = kern_path(tree->pathname, 0, &path); + if (err) + goto skip_it; + + root_mnt = collect_mounts(&path); + path_put(&path); + if (!root_mnt) + goto skip_it; + + spin_lock(&hash_lock); + list_for_each_entry(node, &tree->chunks, list) { + struct audit_chunk *chunk = find_chunk(node); + /* this could be NULL if the watch is dying else where... 
*/ + struct inode *inode = chunk->mark.i.inode; + node->index |= 1U<<31; + if (iterate_mounts(compare_root, inode, root_mnt)) + node->index &= ~(1U<<31); + } + spin_unlock(&hash_lock); + trim_marked(tree); + put_tree(tree); + drop_collected_mounts(root_mnt); +skip_it: + mutex_lock(&audit_filter_mutex); + } + list_del(&cursor); + mutex_unlock(&audit_filter_mutex); +} + +int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) +{ + + if (pathname[0] != '/' || + rule->listnr != AUDIT_FILTER_EXIT || + op != Audit_equal || + rule->inode_f || rule->watch || rule->tree) + return -EINVAL; + rule->tree = alloc_tree(pathname); + if (!rule->tree) + return -ENOMEM; + return 0; +} + +void audit_put_tree(struct audit_tree *tree) +{ + put_tree(tree); +} + +static int tag_mount(struct vfsmount *mnt, void *arg) +{ + return tag_chunk(mnt->mnt_root->d_inode, arg); +} + +/* called with audit_filter_mutex */ +int audit_add_tree_rule(struct audit_krule *rule) +{ + struct audit_tree *seed = rule->tree, *tree; + struct path path; + struct vfsmount *mnt; + int err; + + list_for_each_entry(tree, &tree_list, list) { + if (!strcmp(seed->pathname, tree->pathname)) { + put_tree(seed); + rule->tree = tree; + list_add(&rule->rlist, &tree->rules); + return 0; + } + } + tree = seed; + list_add(&tree->list, &tree_list); + list_add(&rule->rlist, &tree->rules); + /* do not set rule->tree yet */ + mutex_unlock(&audit_filter_mutex); + + err = kern_path(tree->pathname, 0, &path); + if (err) + goto Err; + mnt = collect_mounts(&path); + path_put(&path); + if (!mnt) { + err = -ENOMEM; + goto Err; + } + + get_tree(tree); + err = iterate_mounts(tag_mount, tree, mnt); + drop_collected_mounts(mnt); + + if (!err) { + struct node *node; + spin_lock(&hash_lock); + list_for_each_entry(node, &tree->chunks, list) + node->index &= ~(1U<<31); + spin_unlock(&hash_lock); + } else { + trim_marked(tree); + goto Err; + } + + mutex_lock(&audit_filter_mutex); + if (list_empty(&rule->rlist)) { + put_tree(tree); + 
return -ENOENT; + } + rule->tree = tree; + put_tree(tree); + + return 0; +Err: + mutex_lock(&audit_filter_mutex); + list_del_init(&tree->list); + list_del_init(&tree->rules); + put_tree(tree); + return err; +} + +int audit_tag_tree(char *old, char *new) +{ + struct list_head cursor, barrier; + int failed = 0; + struct path path1, path2; + struct vfsmount *tagged; + int err; + + err = kern_path(new, 0, &path2); + if (err) + return err; + tagged = collect_mounts(&path2); + path_put(&path2); + if (!tagged) + return -ENOMEM; + + err = kern_path(old, 0, &path1); + if (err) { + drop_collected_mounts(tagged); + return err; + } + + mutex_lock(&audit_filter_mutex); + list_add(&barrier, &tree_list); + list_add(&cursor, &barrier); + + while (cursor.next != &tree_list) { + struct audit_tree *tree; + int good_one = 0; + + tree = container_of(cursor.next, struct audit_tree, list); + get_tree(tree); + list_del(&cursor); + list_add(&cursor, &tree->list); + mutex_unlock(&audit_filter_mutex); + + err = kern_path(tree->pathname, 0, &path2); + if (!err) { + good_one = path_is_under(&path1, &path2); + path_put(&path2); + } + + if (!good_one) { + put_tree(tree); + mutex_lock(&audit_filter_mutex); + continue; + } + + failed = iterate_mounts(tag_mount, tree, tagged); + if (failed) { + put_tree(tree); + mutex_lock(&audit_filter_mutex); + break; + } + + mutex_lock(&audit_filter_mutex); + spin_lock(&hash_lock); + if (!tree->goner) { + list_del(&tree->list); + list_add(&tree->list, &tree_list); + } + spin_unlock(&hash_lock); + put_tree(tree); + } + + while (barrier.prev != &tree_list) { + struct audit_tree *tree; + + tree = container_of(barrier.prev, struct audit_tree, list); + get_tree(tree); + list_del(&tree->list); + list_add(&tree->list, &barrier); + mutex_unlock(&audit_filter_mutex); + + if (!failed) { + struct node *node; + spin_lock(&hash_lock); + list_for_each_entry(node, &tree->chunks, list) + node->index &= ~(1U<<31); + spin_unlock(&hash_lock); + } else { + trim_marked(tree); + } + 
+ put_tree(tree); + mutex_lock(&audit_filter_mutex); + } + list_del(&barrier); + list_del(&cursor); + mutex_unlock(&audit_filter_mutex); + path_put(&path1); + drop_collected_mounts(tagged); + return failed; +} + +/* + * That gets run when evict_chunk() ends up needing to kill audit_tree. + * Runs from a separate thread. + */ +static int prune_tree_thread(void *unused) +{ + mutex_lock(&audit_cmd_mutex); + mutex_lock(&audit_filter_mutex); + + while (!list_empty(&prune_list)) { + struct audit_tree *victim; + + victim = list_entry(prune_list.next, struct audit_tree, list); + list_del_init(&victim->list); + + mutex_unlock(&audit_filter_mutex); + + prune_one(victim); + + mutex_lock(&audit_filter_mutex); + } + + mutex_unlock(&audit_filter_mutex); + mutex_unlock(&audit_cmd_mutex); + return 0; +} + +static void audit_schedule_prune(void) +{ + kthread_run(prune_tree_thread, NULL, "audit_prune_tree"); +} + +/* + * ... and that one is done if evict_chunk() decides to delay until the end + * of syscall. Runs synchronously. 
+ */
+void audit_kill_trees(struct list_head *list)
+{
+	mutex_lock(&audit_cmd_mutex);
+	mutex_lock(&audit_filter_mutex);
+	/* drain @list one tree at a time */
+	while (!list_empty(list)) {
+		struct audit_tree *victim;
+
+		victim = list_entry(list->next, struct audit_tree, list);
+		kill_rules(victim);
+		list_del_init(&victim->list);
+		/* prune_one() is called with audit_filter_mutex dropped */
+		mutex_unlock(&audit_filter_mutex);
+
+		prune_one(victim);
+
+		mutex_lock(&audit_filter_mutex);
+	}
+
+	mutex_unlock(&audit_filter_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+}
+
+/*
+ * Here comes the stuff asynchronous to auditctl operations
+ */
+/*
+ * Retire @chunk unless it is already marked dead.  Every tree rooted at it
+ * (chunk->trees, linked through same_root) is marked goner and detached,
+ * then the chunk is unhashed and unlinked from its owners' chunk lists.
+ * Detached trees go onto the list returned by audit_killed_trees() when
+ * there is one; otherwise their rules are killed here, they are moved to
+ * prune_list and audit_schedule_prune() is called.
+ */
+static void evict_chunk(struct audit_chunk *chunk)
+{
+	struct audit_tree *owner;
+	struct list_head *postponed = audit_killed_trees();
+	int need_prune = 0;
+	int n;
+
+	if (chunk->dead)
+		return;
+	/* audit_filter_mutex is held to the end; hash_lock is dropped per tree */
+	chunk->dead = 1;
+	mutex_lock(&audit_filter_mutex);
+	spin_lock(&hash_lock);
+	while (!list_empty(&chunk->trees)) {
+		owner = list_entry(chunk->trees.next,
+				   struct audit_tree, same_root);
+		owner->goner = 1;
+		owner->root = NULL;
+		list_del_init(&owner->same_root);
+		spin_unlock(&hash_lock);	/* not held across kill_rules()/list_move() */
+		if (!postponed) {
+			kill_rules(owner);
+			list_move(&owner->list, &prune_list);
+			need_prune = 1;
+		} else {
+			list_move(&owner->list, postponed);
+		}
+		spin_lock(&hash_lock);
+	}
+	list_del_rcu(&chunk->hash);
+	for (n = 0; n < chunk->count; n++)
+		list_del_init(&chunk->owners[n].list);
+	spin_unlock(&hash_lock);
+	if (need_prune)
+		audit_schedule_prune();
+	mutex_unlock(&audit_filter_mutex);
+}
+/*
+ * handle_event callback of audit_tree_ops.  It BUG()s unconditionally;
+ * presumably it is never reached because audit_tree_send_event() always
+ * returns false - TODO confirm against the fsnotify core.
+ */
+static int audit_tree_handle_event(struct fsnotify_group *group,
+				   struct fsnotify_mark *inode_mark,
+				   struct fsnotify_mark *vfsmonut_mark,
+				   struct fsnotify_event *event)
+{
+	BUG();
+	return -EOPNOTSUPP;
+}
+/*
+ * freeing_mark callback of audit_tree_ops: evict the chunk that embeds
+ * @entry, then drop one reference on the mark.
+ */
+static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify_group *group)
+{
+	struct audit_chunk *chunk = container_of(entry, struct audit_chunk, mark);
+
+	evict_chunk(chunk);
+	fsnotify_put_mark(entry);
+}
+/* should_send_event callback of audit_tree_ops: always answers false. */
+static bool audit_tree_send_event(struct fsnotify_group *group, struct inode *inode,
+				  struct fsnotify_mark *inode_mark,
+				  struct fsnotify_mark *vfsmount_mark,
+				  __u32 mask, void *data, int data_type)
+{
+	return false;
+}
+/* fsnotify callbacks used for the audit tree group. */
+static const struct fsnotify_ops audit_tree_ops = {
+	.handle_event = audit_tree_handle_event,
+	.should_send_event = audit_tree_send_event,
+	.free_group_priv = NULL,
+	.free_event_priv = NULL,
+	.freeing_mark = audit_tree_freeing_mark,
+};
+/*
+ * Allocate audit_tree_group and initialise every chunk hash bucket.
+ * Registered through __initcall(); always returns 0.
+ */
+static int __init audit_tree_init(void)
+{
+	int i;
+	/* NOTE(review): if audit_panic() returns, the code below still runs - confirm */
+	audit_tree_group = fsnotify_alloc_group(&audit_tree_ops);
+	if (IS_ERR(audit_tree_group))
+		audit_panic("cannot initialize fsnotify group for rectree watches");
+
+	for (i = 0; i < HASH_SIZE; i++)
+		INIT_LIST_HEAD(&chunk_hash_heads[i]);
+
+	return 0;
+}
+__initcall(audit_tree_init);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
new file mode 100644
index 00000000..e6838693
--- /dev/null
+++ b/kernel/audit_watch.c
@@ -0,0 +1,547 @@
+/* audit_watch.c -- watching inodes
+ *
+ * Copyright 2003-2009 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "audit.h" + +/* + * Reference counting: + * + * audit_parent: lifetime is from audit_init_parent() to receipt of an FS_IGNORED + * event. Each audit_watch holds a reference to its associated parent. + * + * audit_watch: if added to lists, lifetime is from audit_init_watch() to + * audit_remove_watch(). Additionally, an audit_watch may exist + * temporarily to assist in searching existing filter data. Each + * audit_krule holds a reference to its associated watch. + */ + +struct audit_watch { + atomic_t count; /* reference count */ + dev_t dev; /* associated superblock device */ + char *path; /* insertion path */ + unsigned long ino; /* associated inode number */ + struct audit_parent *parent; /* associated parent */ + struct list_head wlist; /* entry in parent->watches list */ + struct list_head rules; /* anchor for krule->rlist */ +}; + +struct audit_parent { + struct list_head watches; /* anchor for audit_watch->wlist */ + struct fsnotify_mark mark; /* fsnotify mark on the inode */ +}; + +/* fsnotify handle. */ +static struct fsnotify_group *audit_watch_group; + +/* fsnotify events we care about. 
*/
+#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+			FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+
+/* Free a parent.  Its watches list should already be empty; warn if not. */
+static void audit_free_parent(struct audit_parent *parent)
+{
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+/*
+ * Callback handed to fsnotify_init_mark() by audit_init_parent(): recover
+ * the audit_parent that embeds @entry and free it.
+ */
+static void audit_watch_free_mark(struct fsnotify_mark *entry)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(entry, struct audit_parent, mark);
+	audit_free_parent(parent);
+}
+
+/* Take a reference on the parent's fsnotify mark.  A NULL parent is a no-op. */
+static void audit_get_parent(struct audit_parent *parent)
+{
+	if (likely(parent))
+		fsnotify_get_mark(&parent->mark);
+}
+
+/* Drop a reference on the parent's fsnotify mark.  A NULL parent is a no-op. */
+static void audit_put_parent(struct audit_parent *parent)
+{
+	if (likely(parent))
+		fsnotify_put_mark(&parent->mark);
+}
+
+/*
+ * Find and return the audit_parent on the given inode.  If found a reference
+ * is taken on this parent.
+ *
+ * Returns NULL when audit_watch_group has no mark on @inode.
+ */
+static inline struct audit_parent *audit_find_parent(struct inode *inode)
+{
+	struct audit_parent *parent = NULL;
+	struct fsnotify_mark *entry;
+
+	entry = fsnotify_find_inode_mark(audit_watch_group, inode);
+	if (entry)
+		parent = container_of(entry, struct audit_parent, mark);
+
+	return parent;
+}
+
+/* Take a reference on @watch. */
+void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+/*
+ * Drop a reference on @watch.  When the last reference goes away the watch
+ * and its path string are freed; by then it should have no parent and no
+ * rules attached (warn otherwise).
+ */
+void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+/*
+ * Unlink @watch from its parent's watches list, drop the watch's reference
+ * on the parent and then drop the watch's initial reference.
+ */
+static void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	audit_put_parent(watch->parent);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
+/* Return the path string stored in @watch (still owned by the watch). */
+char *audit_watch_path(struct audit_watch *watch)
+{
+	return watch->path;
+}
+
+/*
+ * Return non-zero when @watch refers to inode @ino on device @dev.  A watch
+ * whose ino is (unsigned long)-1 never matches.
+ */
+int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
+{
+	return (watch->ino != (unsigned long)-1) &&
+		(watch->ino == ino) &&
+		(watch->dev == dev);
+}
+
+/* Initialize a parent watch entry.
*/
+/*
+ * Allocates a zeroed audit_parent and adds its mark, with mask
+ * AUDIT_FS_WATCH, to audit_watch_group for the inode behind @path.
+ *
+ * Returns the new parent, or an ERR_PTR(): -ENOMEM if the allocation fails,
+ * otherwise the negative value returned by fsnotify_add_mark().
+ */
+static struct audit_parent *audit_init_parent(struct path *path)
+{
+	struct inode *inode = path->dentry->d_inode;
+	struct audit_parent *parent;
+	int ret;
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (unlikely(!parent))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&parent->watches);
+
+	fsnotify_init_mark(&parent->mark, audit_watch_free_mark);
+	parent->mark.mask = AUDIT_FS_WATCH;
+	ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0);
+	if (ret < 0) {
+		audit_free_parent(parent);
+		return ERR_PTR(ret);
+	}
+
+	return parent;
+}
+
+/*
+ * Initialize a watch entry.
+ *
+ * The new watch starts with a reference count of 1, an empty rules list and
+ * dev/ino set to -1.  @path is stored, not copied, and is kfree()d by
+ * audit_put_watch() when the last reference is dropped.  On failure
+ * ERR_PTR(-ENOMEM) is returned and @path is left for the caller to free.
+ */
+static struct audit_watch *audit_init_watch(char *path)
+{
+	struct audit_watch *watch;
+
+	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+	if (unlikely(!watch))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&watch->rules);
+	atomic_set(&watch->count, 1);
+	watch->path = path;
+	watch->dev = (dev_t)-1;
+	watch->ino = (unsigned long)-1;
+
+	return watch;
+}
+
+/*
+ * Translate a watch string to kernel representation.
+ *
+ * Returns -EOPNOTSUPP when no fsnotify group is available, and -EINVAL
+ * unless @path is absolute, does not end in '/', the rule is on the exit
+ * list, @op is Audit_equal and the rule has no inode field, watch or tree
+ * yet.  On success krule->watch holds two references: the initial one from
+ * audit_init_watch() and the one taken here for the rule.
+ */
+int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
+{
+	struct audit_watch *watch;
+
+	if (!audit_watch_group)
+		return -EOPNOTSUPP;
+
+	/* NOTE(review): path[len-1] relies on the caller passing len > 0 */
+	if (path[0] != '/' || path[len-1] == '/' ||
+	    krule->listnr != AUDIT_FILTER_EXIT ||
+	    op != Audit_equal ||
+	    krule->inode_f || krule->watch || krule->tree)
+		return -EINVAL;
+
+	watch = audit_init_watch(path);
+	if (IS_ERR(watch))
+		return PTR_ERR(watch);
+
+	audit_get_watch(watch);
+	krule->watch = watch;
+
+	return 0;
+}
+
+/* Duplicate the given audit watch.  The new watch's rules list is initialized
+ * to an empty list and wlist is undefined.
*/
+/*
+ * Returns the copy, or an ERR_PTR(-ENOMEM) if either the path string or the
+ * watch itself cannot be allocated.  The copy takes its own reference on the
+ * old watch's parent.
+ */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+	char *path;
+	struct audit_watch *new;
+
+	path = kstrdup(old->path, GFP_KERNEL);
+	if (unlikely(!path))
+		return ERR_PTR(-ENOMEM);
+
+	new = audit_init_watch(path);
+	if (IS_ERR(new)) {
+		/* audit_init_watch() did not take ownership of path */
+		kfree(path);
+		goto out;
+	}
+
+	new->dev = old->dev;
+	new->ino = old->ino;
+	audit_get_parent(old->parent);
+	new->parent = old->parent;
+
+out:
+	return new;
+}
+
+/*
+ * When auditing is enabled, emit an AUDIT_CONFIG_CHANGE record carrying the
+ * current task's loginuid and session id, @op, the watch path, the rule's
+ * filter key and its list number.
+ */
+static void audit_watch_log_rule_change(struct audit_krule *r, struct audit_watch *w, char *op)
+{
+	if (audit_enabled) {
+		struct audit_buffer *ab;
+		ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+		audit_log_format(ab, "auid=%u ses=%u op=",
+				 audit_get_loginuid(current),
+				 audit_get_sessionid(current));
+		audit_log_string(ab, op);
+		audit_log_format(ab, " path=");
+		audit_log_untrustedstring(ab, w->path);
+		audit_log_key(ab, r->filterkey);
+		audit_log_format(ab, " list=%d res=1", r->listnr);
+		audit_log_end(ab);
+	}
+}
+
+/*
+ * Update inode info in audit rules based on filesystem event.
+ *
+ * @parent:       parent whose watches are searched
+ * @dname:        name of the directory entry the event refers to
+ * @dev, @ino:    new device / inode number to record in the watch
+ * @invalidating: non-zero when the event invalidates the rules; the
+ *                inode-based filters are then run first
+ *
+ * The first watch whose path matches @dname is replaced by a copy carrying
+ * @dev/@ino, and each rule on it is replaced by a copy that points at the
+ * new watch.  Takes and releases audit_filter_mutex.
+ */
+static void audit_update_watch(struct audit_parent *parent,
+			       const char *dname, dev_t dev,
+			       unsigned long ino, unsigned invalidating)
+{
+	struct audit_watch *owatch, *nwatch, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *oentry, *nentry;
+
+	mutex_lock(&audit_filter_mutex);
+	/* Run all of the watches on this parent looking for the one that
+	 * matches the given dname */
+	list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
+		if (audit_compare_dname_path(dname, owatch->path, NULL))
+			continue;
+
+		/* If the update involves invalidating rules, do the inode-based
+		 * filtering now, so we don't omit records. */
+		if (invalidating && !audit_dummy_context())
+			audit_filter_inodes(current, current->audit_context);
+
+		/* updating ino will likely change which audit_hash_list we
+		 * are on so we need a new watch for the new list */
+		nwatch = audit_dupe_watch(owatch);
+		if (IS_ERR(nwatch)) {
+			mutex_unlock(&audit_filter_mutex);
+			audit_panic("error updating watch, skipping");
+			return;
+		}
+		nwatch->dev = dev;
+		nwatch->ino = ino;
+
+		list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
+
+			/* unhook the old rule from the watch and the hash */
+			oentry = container_of(r, struct audit_entry, rule);
+			list_del(&oentry->rule.rlist);
+			list_del_rcu(&oentry->list);
+
+			nentry = audit_dupe_rule(&oentry->rule);
+			if (IS_ERR(nentry)) {
+				/* no replacement: the rule is simply dropped */
+				list_del(&oentry->rule.list);
+				audit_panic("error updating watch, removing");
+			} else {
+				int h = audit_hash_ino((u32)ino);
+
+				/*
+				 * nentry->rule.watch == oentry->rule.watch so
+				 * we must drop that reference and set it to our
+				 * new watch.
+				 */
+				audit_put_watch(nentry->rule.watch);
+				audit_get_watch(nwatch);
+				nentry->rule.watch = nwatch;
+				list_add(&nentry->rule.rlist, &nwatch->rules);
+				list_add_rcu(&nentry->list, &audit_inode_hash[h]);
+				list_replace(&oentry->rule.list,
+					     &nentry->rule.list);
+			}
+
+			audit_watch_log_rule_change(r, owatch, "updated rules");
+
+			/* the old entry is freed through call_rcu() */
+			call_rcu(&oentry->rcu, audit_free_rule_rcu);
+		}
+
+		audit_remove_watch(owatch);
+		goto add_watch_to_parent; /* event applies to a single watch */
+	}
+	mutex_unlock(&audit_filter_mutex);
+	return;
+
+add_watch_to_parent:
+	list_add(&nwatch->wlist, &parent->watches);
+	mutex_unlock(&audit_filter_mutex);
+	return;
+}
+
+/* Remove all watches & rules associated with a parent that is going away.
*/
+/*
+ * Takes audit_filter_mutex, logs and unlinks every rule of every watch on
+ * @parent (the rule entries are freed through call_rcu()), removes the
+ * watches, and finally destroys the parent's fsnotify mark after the mutex
+ * has been released.
+ */
+static void audit_remove_parent_watches(struct audit_parent *parent)
+{
+	struct audit_watch *w, *nextw;
+	struct audit_krule *r, *nextr;
+	struct audit_entry *e;
+
+	mutex_lock(&audit_filter_mutex);
+	list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
+		list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
+			e = container_of(r, struct audit_entry, rule);
+			audit_watch_log_rule_change(r, w, "remove rule");
+			list_del(&r->rlist);
+			list_del(&r->list);
+			list_del_rcu(&e->list);
+			call_rcu(&e->rcu, audit_free_rule_rcu);
+		}
+		audit_remove_watch(w);
+	}
+	mutex_unlock(&audit_filter_mutex);
+
+	fsnotify_destroy_mark(&parent->mark);
+}
+
+/*
+ * Get path information necessary for adding watches.
+ *
+ * Looks up the parent directory of watch->path and, under the parent's
+ * i_mutex, the final path component.  If that component currently has an
+ * inode, watch->dev and watch->ino are updated from it; otherwise they are
+ * left untouched.
+ *
+ * Returns 0 with *parent set to the parent directory's path; the path
+ * reference obtained by the lookup is handed over to the caller, who must
+ * path_put() it.  On error a negative errno is returned and *parent is not
+ * written.
+ */
+static int audit_get_nd(struct audit_watch *watch, struct path *parent)
+{
+	struct nameidata nd;
+	struct dentry *d;
+	int err;
+
+	err = kern_path_parent(watch->path, &nd);
+	if (err)
+		return err;
+
+	/* only an ordinary last component is accepted */
+	if (nd.last_type != LAST_NORM) {
+		path_put(&nd.path);
+		return -EINVAL;
+	}
+
+	mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
+	if (IS_ERR(d)) {
+		mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+		path_put(&nd.path);
+		return PTR_ERR(d);
+	}
+	if (d->d_inode) {
+		/* update watch filter fields */
+		watch->dev = d->d_inode->i_sb->s_dev;
+		watch->ino = d->d_inode->i_ino;
+	}
+	mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+
+	*parent = nd.path;
+	dput(d);
+	return 0;
+}
+
+/* Associate the given rule with an existing parent.
+ * Caller must hold audit_filter_mutex.
*/ +static void audit_add_to_parent(struct audit_krule *krule, + struct audit_parent *parent) +{ + struct audit_watch *w, *watch = krule->watch; + int watch_found = 0; + + BUG_ON(!mutex_is_locked(&audit_filter_mutex)); + + list_for_each_entry(w, &parent->watches, wlist) { + if (strcmp(watch->path, w->path)) + continue; + + watch_found = 1; + + /* put krule's and initial refs to temporary watch */ + audit_put_watch(watch); + audit_put_watch(watch); + + audit_get_watch(w); + krule->watch = watch = w; + break; + } + + if (!watch_found) { + audit_get_parent(parent); + watch->parent = parent; + + list_add(&watch->wlist, &parent->watches); + } + list_add(&krule->rlist, &watch->rules); +} + +/* Find a matching watch entry, or add this one. + * Caller must hold audit_filter_mutex. */ +int audit_add_watch(struct audit_krule *krule, struct list_head **list) +{ + struct audit_watch *watch = krule->watch; + struct audit_parent *parent; + struct path parent_path; + int h, ret = 0; + + mutex_unlock(&audit_filter_mutex); + + /* Avoid calling path_lookup under audit_filter_mutex. 
*/ + ret = audit_get_nd(watch, &parent_path); + + /* caller expects mutex locked */ + mutex_lock(&audit_filter_mutex); + + if (ret) + return ret; + + /* either find an old parent or attach a new one */ + parent = audit_find_parent(parent_path.dentry->d_inode); + if (!parent) { + parent = audit_init_parent(&parent_path); + if (IS_ERR(parent)) { + ret = PTR_ERR(parent); + goto error; + } + } + + audit_add_to_parent(krule, parent); + + /* match get in audit_find_parent or audit_init_parent */ + audit_put_parent(parent); + + h = audit_hash_ino((u32)watch->ino); + *list = &audit_inode_hash[h]; +error: + path_put(&parent_path); + return ret; +} + +void audit_remove_watch_rule(struct audit_krule *krule) +{ + struct audit_watch *watch = krule->watch; + struct audit_parent *parent = watch->parent; + + list_del(&krule->rlist); + + if (list_empty(&watch->rules)) { + audit_remove_watch(watch); + + if (list_empty(&parent->watches)) { + audit_get_parent(parent); + fsnotify_destroy_mark(&parent->mark); + audit_put_parent(parent); + } + } +} + +static bool audit_watch_should_send_event(struct fsnotify_group *group, struct inode *inode, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + __u32 mask, void *data, int data_type) +{ + return true; +} + +/* Update watch data in audit rules based on fsnotify events. 
*/ +static int audit_watch_handle_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + struct fsnotify_mark *vfsmount_mark, + struct fsnotify_event *event) +{ + struct inode *inode; + __u32 mask = event->mask; + const char *dname = event->file_name; + struct audit_parent *parent; + + parent = container_of(inode_mark, struct audit_parent, mark); + + BUG_ON(group != audit_watch_group); + + switch (event->data_type) { + case (FSNOTIFY_EVENT_PATH): + inode = event->path.dentry->d_inode; + break; + case (FSNOTIFY_EVENT_INODE): + inode = event->inode; + break; + default: + BUG(); + inode = NULL; + break; + }; + + if (mask & (FS_CREATE|FS_MOVED_TO) && inode) + audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); + else if (mask & (FS_DELETE|FS_MOVED_FROM)) + audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); + else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) + audit_remove_parent_watches(parent); + + return 0; +} + +static const struct fsnotify_ops audit_watch_fsnotify_ops = { + .should_send_event = audit_watch_should_send_event, + .handle_event = audit_watch_handle_event, + .free_group_priv = NULL, + .freeing_mark = NULL, + .free_event_priv = NULL, +}; + +static int __init audit_watch_init(void) +{ + audit_watch_group = fsnotify_alloc_group(&audit_watch_fsnotify_ops); + if (IS_ERR(audit_watch_group)) { + audit_watch_group = NULL; + audit_panic("cannot create audit fsnotify group"); + } + return 0; +} +device_initcall(audit_watch_init); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c new file mode 100644 index 00000000..f8277c80 --- /dev/null +++ b/kernel/auditfilter.c @@ -0,0 +1,1383 @@ +/* auditfilter.c -- filtering of audit events + * + * Copyright 2003-2004 Red Hat, Inc. + * Copyright 2005 Hewlett-Packard Development Company, L.P. 
+ * Copyright 2005 IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "audit.h" + +/* + * Locking model: + * + * audit_filter_mutex: + * Synchronizes writes and blocking reads of audit's filterlist + * data. Rcu is used to traverse the filterlist and access + * contents of structs audit_entry, audit_watch and opaque + * LSM rules during filtering. If modified, these structures + * must be copied and replace their counterparts in the filterlist. + * An audit_parent struct is not accessed during filtering, so may + * be written directly provided audit_filter_mutex is held. 
+ */ + +/* Audit filter lists, defined in */ +struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_filter_list[0]), + LIST_HEAD_INIT(audit_filter_list[1]), + LIST_HEAD_INIT(audit_filter_list[2]), + LIST_HEAD_INIT(audit_filter_list[3]), + LIST_HEAD_INIT(audit_filter_list[4]), + LIST_HEAD_INIT(audit_filter_list[5]), +#if AUDIT_NR_FILTERS != 6 +#error Fix audit_filter_list initialiser +#endif +}; +static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = { + LIST_HEAD_INIT(audit_rules_list[0]), + LIST_HEAD_INIT(audit_rules_list[1]), + LIST_HEAD_INIT(audit_rules_list[2]), + LIST_HEAD_INIT(audit_rules_list[3]), + LIST_HEAD_INIT(audit_rules_list[4]), + LIST_HEAD_INIT(audit_rules_list[5]), +}; + +DEFINE_MUTEX(audit_filter_mutex); + +static inline void audit_free_rule(struct audit_entry *e) +{ + int i; + struct audit_krule *erule = &e->rule; + + /* some rules don't have associated watches */ + if (erule->watch) + audit_put_watch(erule->watch); + if (erule->fields) + for (i = 0; i < erule->field_count; i++) { + struct audit_field *f = &erule->fields[i]; + kfree(f->lsm_str); + security_audit_rule_free(f->lsm_rule); + } + kfree(erule->fields); + kfree(erule->filterkey); + kfree(e); +} + +void audit_free_rule_rcu(struct rcu_head *head) +{ + struct audit_entry *e = container_of(head, struct audit_entry, rcu); + audit_free_rule(e); +} + +/* Initialize an audit filterlist entry. */ +static inline struct audit_entry *audit_init_entry(u32 field_count) +{ + struct audit_entry *entry; + struct audit_field *fields; + + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (unlikely(!entry)) + return NULL; + + fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); + if (unlikely(!fields)) { + kfree(entry); + return NULL; + } + entry->rule.fields = fields; + + return entry; +} + +/* Unpack a filter field's string representation from user-space + * buffer. 
*/ +char *audit_unpack_string(void **bufp, size_t *remain, size_t len) +{ + char *str; + + if (!*bufp || (len == 0) || (len > *remain)) + return ERR_PTR(-EINVAL); + + /* Of the currently implemented string fields, PATH_MAX + * defines the longest valid length. + */ + if (len > PATH_MAX) + return ERR_PTR(-ENAMETOOLONG); + + str = kmalloc(len + 1, GFP_KERNEL); + if (unlikely(!str)) + return ERR_PTR(-ENOMEM); + + memcpy(str, *bufp, len); + str[len] = 0; + *bufp += len; + *remain -= len; + + return str; +} + +/* Translate an inode field to kernel respresentation. */ +static inline int audit_to_inode(struct audit_krule *krule, + struct audit_field *f) +{ + if (krule->listnr != AUDIT_FILTER_EXIT || + krule->watch || krule->inode_f || krule->tree || + (f->op != Audit_equal && f->op != Audit_not_equal)) + return -EINVAL; + + krule->inode_f = f; + return 0; +} + +static __u32 *classes[AUDIT_SYSCALL_CLASSES]; + +int __init audit_register_class(int class, unsigned *list) +{ + __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); + if (!p) + return -ENOMEM; + while (*list != ~0U) { + unsigned n = *list++; + if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) { + kfree(p); + return -EINVAL; + } + p[AUDIT_WORD(n)] |= AUDIT_BIT(n); + } + if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) { + kfree(p); + return -EINVAL; + } + classes[class] = p; + return 0; +} + +int audit_match_class(int class, unsigned syscall) +{ + if (unlikely(syscall >= AUDIT_BITMASK_SIZE * 32)) + return 0; + if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class])) + return 0; + return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall); +} + +#ifdef CONFIG_AUDITSYSCALL +static inline int audit_match_class_bits(int class, u32 *mask) +{ + int i; + + if (classes[class]) { + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (mask[i] & classes[class][i]) + return 0; + } + return 1; +} + +static int audit_match_signal(struct audit_entry *entry) +{ + struct audit_field *arch = 
entry->rule.arch_f; + + if (!arch) { + /* When arch is unspecified, we must check both masks on biarch + * as syscall number alone is ambiguous. */ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, + entry->rule.mask) && + audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, + entry->rule.mask)); + } + + switch(audit_classify_arch(arch->val)) { + case 0: /* native */ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL, + entry->rule.mask)); + case 1: /* 32bit on biarch */ + return (audit_match_class_bits(AUDIT_CLASS_SIGNAL_32, + entry->rule.mask)); + default: + return 1; + } +} +#endif + +/* Common user-space to kernel rule translation. */ +static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) +{ + unsigned listnr; + struct audit_entry *entry; + int i, err; + + err = -EINVAL; + listnr = rule->flags & ~AUDIT_FILTER_PREPEND; + switch(listnr) { + default: + goto exit_err; + case AUDIT_FILTER_USER: + case AUDIT_FILTER_TYPE: +#ifdef CONFIG_AUDITSYSCALL + case AUDIT_FILTER_ENTRY: + case AUDIT_FILTER_EXIT: + case AUDIT_FILTER_TASK: +#endif + ; + } + if (unlikely(rule->action == AUDIT_POSSIBLE)) { + printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n"); + goto exit_err; + } + if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS) + goto exit_err; + if (rule->field_count > AUDIT_MAX_FIELDS) + goto exit_err; + + err = -ENOMEM; + entry = audit_init_entry(rule->field_count); + if (!entry) + goto exit_err; + + entry->rule.flags = rule->flags & AUDIT_FILTER_PREPEND; + entry->rule.listnr = listnr; + entry->rule.action = rule->action; + entry->rule.field_count = rule->field_count; + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + entry->rule.mask[i] = rule->mask[i]; + + for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) { + int bit = AUDIT_BITMASK_SIZE * 32 - i - 1; + __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)]; + __u32 *class; + + if (!(*p & AUDIT_BIT(bit))) + continue; + *p &= ~AUDIT_BIT(bit); + class = classes[i]; + if (class) { + int j; + for (j = 0; j < 
AUDIT_BITMASK_SIZE; j++) + entry->rule.mask[j] |= class[j]; + } + } + + return entry; + +exit_err: + return ERR_PTR(err); +} + +static u32 audit_ops[] = +{ + [Audit_equal] = AUDIT_EQUAL, + [Audit_not_equal] = AUDIT_NOT_EQUAL, + [Audit_bitmask] = AUDIT_BIT_MASK, + [Audit_bittest] = AUDIT_BIT_TEST, + [Audit_lt] = AUDIT_LESS_THAN, + [Audit_gt] = AUDIT_GREATER_THAN, + [Audit_le] = AUDIT_LESS_THAN_OR_EQUAL, + [Audit_ge] = AUDIT_GREATER_THAN_OR_EQUAL, +}; + +static u32 audit_to_op(u32 op) +{ + u32 n; + for (n = Audit_equal; n < Audit_bad && audit_ops[n] != op; n++) + ; + return n; +} + + +/* Translate struct audit_rule to kernel's rule respresentation. + * Exists for backward compatibility with userspace. */ +static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) +{ + struct audit_entry *entry; + int err = 0; + int i; + + entry = audit_to_entry_common(rule); + if (IS_ERR(entry)) + goto exit_nofree; + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &entry->rule.fields[i]; + u32 n; + + n = rule->fields[i] & (AUDIT_NEGATE|AUDIT_OPERATORS); + + /* Support for legacy operators where + * AUDIT_NEGATE bit signifies != and otherwise assumes == */ + if (n & AUDIT_NEGATE) + f->op = Audit_not_equal; + else if (!n) + f->op = Audit_equal; + else + f->op = audit_to_op(n); + + entry->rule.vers_ops = (n & AUDIT_OPERATORS) ? 
2 : 1; + + f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); + f->val = rule->values[i]; + + err = -EINVAL; + if (f->op == Audit_bad) + goto exit_free; + + switch(f->type) { + default: + goto exit_free; + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + /* bit ops are only useful on syscall args */ + if (f->op == Audit_bitmask || f->op == Audit_bittest) + goto exit_free; + break; + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; + /* arch is only allowed to be = or != */ + case AUDIT_ARCH: + if (f->op != Audit_not_equal && f->op != Audit_equal) + goto exit_free; + entry->rule.arch_f = f; + break; + case AUDIT_PERM: + if (f->val & ~15) + goto exit_free; + break; + case AUDIT_FILETYPE: + if ((f->val & ~S_IFMT) > S_IFMT) + goto exit_free; + break; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; + } + } + + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; + +exit_nofree: + return entry; + +exit_free: + audit_free_rule(entry); + return ERR_PTR(err); +} + +/* Translate struct audit_rule_data to kernel's rule respresentation. 
*/ +static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, + size_t datasz) +{ + int err = 0; + struct audit_entry *entry; + void *bufp; + size_t remain = datasz - sizeof(struct audit_rule_data); + int i; + char *str; + + entry = audit_to_entry_common((struct audit_rule *)data); + if (IS_ERR(entry)) + goto exit_nofree; + + bufp = data->buf; + entry->rule.vers_ops = 2; + for (i = 0; i < data->field_count; i++) { + struct audit_field *f = &entry->rule.fields[i]; + + err = -EINVAL; + + f->op = audit_to_op(data->fieldflags[i]); + if (f->op == Audit_bad) + goto exit_free; + + f->type = data->fields[i]; + f->val = data->values[i]; + f->lsm_str = NULL; + f->lsm_rule = NULL; + switch(f->type) { + case AUDIT_PID: + case AUDIT_UID: + case AUDIT_EUID: + case AUDIT_SUID: + case AUDIT_FSUID: + case AUDIT_GID: + case AUDIT_EGID: + case AUDIT_SGID: + case AUDIT_FSGID: + case AUDIT_LOGINUID: + case AUDIT_PERS: + case AUDIT_MSGTYPE: + case AUDIT_PPID: + case AUDIT_DEVMAJOR: + case AUDIT_DEVMINOR: + case AUDIT_EXIT: + case AUDIT_SUCCESS: + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + break; + case AUDIT_ARCH: + entry->rule.arch_f = f; + break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = security_audit_rule_init(f->type, f->op, str, + (void **)&f->lsm_rule); + /* Keep currently invalid fields around in case they + * become valid after a policy reload. 
*/ + if (err == -EINVAL) { + printk(KERN_WARNING "audit rule for LSM " + "\'%s\' is invalid\n", str); + err = 0; + } + if (err) { + kfree(str); + goto exit_free; + } else + f->lsm_str = str; + break; + case AUDIT_WATCH: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = audit_to_watch(&entry->rule, str, f->val, f->op); + if (err) { + kfree(str); + goto exit_free; + } + break; + case AUDIT_DIR: + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + + err = audit_make_tree(&entry->rule, str, f->op); + kfree(str); + if (err) + goto exit_free; + break; + case AUDIT_INODE: + err = audit_to_inode(&entry->rule, f); + if (err) + goto exit_free; + break; + case AUDIT_FILTERKEY: + err = -EINVAL; + if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN) + goto exit_free; + str = audit_unpack_string(&bufp, &remain, f->val); + if (IS_ERR(str)) + goto exit_free; + entry->rule.buflen += f->val; + entry->rule.filterkey = str; + break; + case AUDIT_PERM: + if (f->val & ~15) + goto exit_free; + break; + case AUDIT_FILETYPE: + if ((f->val & ~S_IFMT) > S_IFMT) + goto exit_free; + break; + default: + goto exit_free; + } + } + + if (entry->rule.inode_f && entry->rule.inode_f->op == Audit_not_equal) + entry->rule.inode_f = NULL; + +exit_nofree: + return entry; + +exit_free: + audit_free_rule(entry); + return ERR_PTR(err); +} + +/* Pack a filter field's string representation into data block. */ +static inline size_t audit_pack_string(void **bufp, const char *str) +{ + size_t len = strlen(str); + + memcpy(*bufp, str, len); + *bufp += len; + + return len; +} + +/* Translate kernel rule respresentation to struct audit_rule. + * Exists for backward compatibility with userspace. 
*/ +static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule) +{ + struct audit_rule *rule; + int i; + + rule = kzalloc(sizeof(*rule), GFP_KERNEL); + if (unlikely(!rule)) + return NULL; + + rule->flags = krule->flags | krule->listnr; + rule->action = krule->action; + rule->field_count = krule->field_count; + for (i = 0; i < rule->field_count; i++) { + rule->values[i] = krule->fields[i].val; + rule->fields[i] = krule->fields[i].type; + + if (krule->vers_ops == 1) { + if (krule->fields[i].op == Audit_not_equal) + rule->fields[i] |= AUDIT_NEGATE; + } else { + rule->fields[i] |= audit_ops[krule->fields[i].op]; + } + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) rule->mask[i] = krule->mask[i]; + + return rule; +} + +/* Translate kernel rule respresentation to struct audit_rule_data. */ +static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) +{ + struct audit_rule_data *data; + void *bufp; + int i; + + data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); + if (unlikely(!data)) + return NULL; + memset(data, 0, sizeof(*data)); + + data->flags = krule->flags | krule->listnr; + data->action = krule->action; + data->field_count = krule->field_count; + bufp = data->buf; + for (i = 0; i < data->field_count; i++) { + struct audit_field *f = &krule->fields[i]; + + data->fields[i] = f->type; + data->fieldflags[i] = audit_ops[f->op]; + switch(f->type) { + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + data->buflen += data->values[i] = + audit_pack_string(&bufp, f->lsm_str); + break; + case AUDIT_WATCH: + data->buflen += data->values[i] = + audit_pack_string(&bufp, + audit_watch_path(krule->watch)); + break; + case AUDIT_DIR: + data->buflen += data->values[i] = + audit_pack_string(&bufp, + audit_tree_path(krule->tree)); + break; + case 
AUDIT_FILTERKEY: + data->buflen += data->values[i] = + audit_pack_string(&bufp, krule->filterkey); + break; + default: + data->values[i] = f->val; + } + } + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) data->mask[i] = krule->mask[i]; + + return data; +} + +/* Compare two rules in kernel format. Considered success if rules + * don't match. */ +static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b) +{ + int i; + + if (a->flags != b->flags || + a->listnr != b->listnr || + a->action != b->action || + a->field_count != b->field_count) + return 1; + + for (i = 0; i < a->field_count; i++) { + if (a->fields[i].type != b->fields[i].type || + a->fields[i].op != b->fields[i].op) + return 1; + + switch(a->fields[i].type) { + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + if (strcmp(a->fields[i].lsm_str, b->fields[i].lsm_str)) + return 1; + break; + case AUDIT_WATCH: + if (strcmp(audit_watch_path(a->watch), + audit_watch_path(b->watch))) + return 1; + break; + case AUDIT_DIR: + if (strcmp(audit_tree_path(a->tree), + audit_tree_path(b->tree))) + return 1; + break; + case AUDIT_FILTERKEY: + /* both filterkeys exist based on above type compare */ + if (strcmp(a->filterkey, b->filterkey)) + return 1; + break; + default: + if (a->fields[i].val != b->fields[i].val) + return 1; + } + } + + for (i = 0; i < AUDIT_BITMASK_SIZE; i++) + if (a->mask[i] != b->mask[i]) + return 1; + + return 0; +} + +/* Duplicate LSM field information. The lsm_rule is opaque, so must be + * re-initialized. 
*/ /*
+ * audit_dupe_lsm_field: @sf supplies the string to copy, @df receives the
+ * private copy in ->lsm_str and a freshly initialised ->lsm_rule.
+ * Returns -ENOMEM if the string cannot be duplicated; otherwise returns the
+ * security_audit_rule_init() result, with -EINVAL downgraded to 0 after a
+ * warning has been printed.
+ */
+static inline int audit_dupe_lsm_field(struct audit_field *df,
+					   struct audit_field *sf)
+{
+	int ret = 0;
+	char *lsm_str;
+
+	/* our own copy of lsm_str */
+	lsm_str = kstrdup(sf->lsm_str, GFP_KERNEL);
+	if (unlikely(!lsm_str))
+		return -ENOMEM;
+	df->lsm_str = lsm_str;
+
+	/* our own (refreshed) copy of lsm_rule */
+	ret = security_audit_rule_init(df->type, df->op, df->lsm_str,
+				       (void **)&df->lsm_rule);
+	/* Keep currently invalid fields around in case they
+	 * become valid after a policy reload. */
+	if (ret == -EINVAL) {
+		printk(KERN_WARNING "audit rule for LSM \'%s\' is "
+		       "invalid\n", df->lsm_str);
+		ret = 0;
+	}
+
+	return ret;
+}
+
+/* Duplicate an audit rule. This will be a deep copy with the exception
+ * of the watch - that pointer is carried over. The LSM specific fields
+ * will be updated in the copy. The point is to be able to replace the old
+ * rule with the new rule in the filterlist, then free the old rule.
+ * The rlist element is undefined; list manipulations are handled apart from
+ * the initial copy.
+ *
+ * Returns the new entry, or ERR_PTR() carrying -ENOMEM or the error reported
+ * by audit_dupe_lsm_field(). When @old has a watch, an extra reference is
+ * taken on it for the copy via audit_get_watch(). */
+struct audit_entry *audit_dupe_rule(struct audit_krule *old)
+{
+	u32 fcount = old->field_count;
+	struct audit_entry *entry;
+	struct audit_krule *new;
+	char *fk;
+	int i, err = 0;
+
+	entry = audit_init_entry(fcount);
+	if (unlikely(!entry))
+		return ERR_PTR(-ENOMEM);
+
+	new = &entry->rule;
+	new->vers_ops = old->vers_ops;
+	new->flags = old->flags;
+	new->listnr = old->listnr;
+	new->action = old->action;
+	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
+		new->mask[i] = old->mask[i];
+	new->prio = old->prio;
+	new->buflen = old->buflen;
+	new->inode_f = old->inode_f;	/* NOTE(review): pointer copied as-is; if it
+					 * points into old->fields it does not follow
+					 * the memcpy'd array below - confirm */
+	new->field_count = old->field_count;
+
+	/*
+	 * note that we are OK with not refcounting here; audit_match_tree()
+	 * never dereferences tree and we can't get false positives there
+	 * since we'd have to have rule gone from the list *and* removed
+	 * before the chunks found by lookup had been allocated, i.e. before
+	 * the beginning of list scan.
+	 */
+	new->tree = old->tree;
+	memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
+	/* NOTE(review): after this memcpy every new->fields[i].lsm_str/lsm_rule
+	 * still holds old's pointer until the loop below replaces it; if the loop
+	 * bails out part-way, audit_free_rule(entry) is handed those shared
+	 * pointers - confirm it cannot release state still owned by @old. */
+	/* deep copy this information, updating the lsm_rule fields, because
+	 * the originals will all be freed when the old rule is freed. */
+	for (i = 0; i < fcount; i++) {
+		switch (new->fields[i].type) {
+		case AUDIT_SUBJ_USER:
+		case AUDIT_SUBJ_ROLE:
+		case AUDIT_SUBJ_TYPE:
+		case AUDIT_SUBJ_SEN:
+		case AUDIT_SUBJ_CLR:
+		case AUDIT_OBJ_USER:
+		case AUDIT_OBJ_ROLE:
+		case AUDIT_OBJ_TYPE:
+		case AUDIT_OBJ_LEV_LOW:
+		case AUDIT_OBJ_LEV_HIGH:
+			err = audit_dupe_lsm_field(&new->fields[i],
+						       &old->fields[i]);
+			break;
+		case AUDIT_FILTERKEY:
+			fk = kstrdup(old->filterkey, GFP_KERNEL);
+			if (unlikely(!fk))
+				err = -ENOMEM;
+			else
+				new->filterkey = fk;
+		}	/* last case has no break; all other field types need no deep copy */
+		if (err) {
+			audit_free_rule(entry);
+			return ERR_PTR(err);
+		}
+	}
+
+	if (old->watch) {
+		audit_get_watch(old->watch);
+		new->watch = old->watch;
+	}
+
+	return entry;
+}
+
+/* Find an existing audit rule.
+ * Caller must hold audit_filter_mutex to prevent stale rule data.
+ *
+ * Returns the listed entry for which audit_compare_rule() reports no
+ * difference from @entry, or NULL. *p is set to the list that was searched
+ * for inode rules and plain filter-list rules; in the watch branch every
+ * inode hash bucket is walked and *p is left untouched. */
+static struct audit_entry *audit_find_rule(struct audit_entry *entry,
+					   struct list_head **p)
+{
+	struct audit_entry *e, *found = NULL;
+	struct list_head *list;
+	int h;
+
+	if (entry->rule.inode_f) {
+		h = audit_hash_ino(entry->rule.inode_f->val);
+		*p = list = &audit_inode_hash[h];
+	} else if (entry->rule.watch) {
+		/* we don't know the inode number, so must walk entire hash */
+		for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
+			list = &audit_inode_hash[h];
+			list_for_each_entry(e, list, list)
+				if (!audit_compare_rule(&entry->rule, &e->rule)) {
+					found = e;
+					goto out;
+				}
+		}
+		goto out;
+	} else {
+		*p = list = &audit_filter_list[entry->rule.listnr];
+	}
+
+	list_for_each_entry(e, list, list)
+		if (!audit_compare_rule(&entry->rule, &e->rule)) {
+			found = e;
+			goto out;
+		}
+
+out:
+	return found;
+}
+/* Priority counters handed out by audit_add_rule() to AUDIT_FILTER_EXIT rules:
+ * prepended rules take ++prio_high, appended rules take --prio_low. */
+static u64 prio_low = ~0ULL/2;
+static u64 prio_high = ~0ULL/2 - 1;
+
+/* Add rule to given filterlist if not a duplicate. 
*/ +static inline int audit_add_rule(struct audit_entry *entry) +{ + struct audit_entry *e; + struct audit_watch *watch = entry->rule.watch; + struct audit_tree *tree = entry->rule.tree; + struct list_head *list; + int err; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif + + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, &list); + if (e) { + mutex_unlock(&audit_filter_mutex); + err = -EEXIST; + /* normally audit_add_tree_rule() will free it on failure */ + if (tree) + audit_put_tree(tree); + goto error; + } + + if (watch) { + /* audit_filter_mutex is dropped and re-taken during this call */ + err = audit_add_watch(&entry->rule, &list); + if (err) { + mutex_unlock(&audit_filter_mutex); + goto error; + } + } + if (tree) { + err = audit_add_tree_rule(&entry->rule); + if (err) { + mutex_unlock(&audit_filter_mutex); + goto error; + } + } + + entry->rule.prio = ~0ULL; + if (entry->rule.listnr == AUDIT_FILTER_EXIT) { + if (entry->rule.flags & AUDIT_FILTER_PREPEND) + entry->rule.prio = ++prio_high; + else + entry->rule.prio = --prio_low; + } + + if (entry->rule.flags & AUDIT_FILTER_PREPEND) { + list_add(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); + list_add_rcu(&entry->list, list); + entry->rule.flags &= ~AUDIT_FILTER_PREPEND; + } else { + list_add_tail(&entry->rule.list, + &audit_rules_list[entry->rule.listnr]); + list_add_tail_rcu(&entry->list, list); + } +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules++; + + if (!audit_match_signal(entry)) + audit_signals++; +#endif + mutex_unlock(&audit_filter_mutex); + + return 0; + +error: + if (watch) + audit_put_watch(watch); /* tmp watch, matches initial get */ + return err; +} + +/* Remove an existing rule from filterlist. 
*/ +static inline int audit_del_rule(struct audit_entry *entry) +{ + struct audit_entry *e; + struct audit_watch *watch = entry->rule.watch; + struct audit_tree *tree = entry->rule.tree; + struct list_head *list; + int ret = 0; +#ifdef CONFIG_AUDITSYSCALL + int dont_count = 0; + + /* If either of these, don't count towards total */ + if (entry->rule.listnr == AUDIT_FILTER_USER || + entry->rule.listnr == AUDIT_FILTER_TYPE) + dont_count = 1; +#endif + + mutex_lock(&audit_filter_mutex); + e = audit_find_rule(entry, &list); + if (!e) { + mutex_unlock(&audit_filter_mutex); + ret = -ENOENT; + goto out; + } + + if (e->rule.watch) + audit_remove_watch_rule(&e->rule); + + if (e->rule.tree) + audit_remove_tree_rule(&e->rule); + + list_del_rcu(&e->list); + list_del(&e->rule.list); + call_rcu(&e->rcu, audit_free_rule_rcu); + +#ifdef CONFIG_AUDITSYSCALL + if (!dont_count) + audit_n_rules--; + + if (!audit_match_signal(entry)) + audit_signals--; +#endif + mutex_unlock(&audit_filter_mutex); + +out: + if (watch) + audit_put_watch(watch); /* match initial get */ + if (tree) + audit_put_tree(tree); /* that's the temporary one */ + + return ret; +} + +/* List rules using struct audit_rule. Exists for backward + * compatibility with userspace. */ +static void audit_list(int pid, int seq, struct sk_buff_head *q) +{ + struct sk_buff *skb; + struct audit_krule *r; + int i; + + /* This is a blocking read, so use audit_filter_mutex instead of rcu + * iterator to sync with list writers. 
*/ + for (i=0; ibuflen); + if (skb) + skb_queue_tail(q, skb); + kfree(data); + } + } + skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + if (skb) + skb_queue_tail(q, skb); +} + +/* Log rule additions and removals */ +static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid, + char *action, struct audit_krule *rule, + int res) +{ + struct audit_buffer *ab; + + if (!audit_enabled) + return; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); + if (!ab) + return; + audit_log_format(ab, "auid=%u ses=%u", loginuid, sessionid); + if (sid) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx(sid, &ctx, &len)) + audit_log_format(ab, " ssid=%u", sid); + else { + audit_log_format(ab, " subj=%s", ctx); + security_release_secctx(ctx, len); + } + } + audit_log_format(ab, " op="); + audit_log_string(ab, action); + audit_log_key(ab, rule->filterkey); + audit_log_format(ab, " list=%d res=%d", rule->listnr, res); + audit_log_end(ab); +} + +/** + * audit_receive_filter - apply all rules to the specified message type + * @type: audit message type + * @pid: target pid for netlink audit messages + * @uid: target uid for netlink audit messages + * @seq: netlink audit message sequence (serial) number + * @data: payload data + * @datasz: size of payload data + * @loginuid: loginuid of sender + * @sessionid: sessionid for netlink audit message + * @sid: SE Linux Security ID of sender + */ +int audit_receive_filter(int type, int pid, int uid, int seq, void *data, + size_t datasz, uid_t loginuid, u32 sessionid, u32 sid) +{ + struct task_struct *tsk; + struct audit_netlink_list *dest; + int err = 0; + struct audit_entry *entry; + + switch (type) { + case AUDIT_LIST: + case AUDIT_LIST_RULES: + /* We can't just spew out the rules here because we might fill + * the available socket buffer space and deadlock waiting for + * auditctl to read from it... 
which isn't ever going to + * happen if we're actually running in the context of auditctl + * trying to _send_ the stuff */ + + dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL); + if (!dest) + return -ENOMEM; + dest->pid = pid; + skb_queue_head_init(&dest->q); + + mutex_lock(&audit_filter_mutex); + if (type == AUDIT_LIST) + audit_list(pid, seq, &dest->q); + else + audit_list_rules(pid, seq, &dest->q); + mutex_unlock(&audit_filter_mutex); + + tsk = kthread_run(audit_send_list, dest, "audit_send_list"); + if (IS_ERR(tsk)) { + skb_queue_purge(&dest->q); + kfree(dest); + err = PTR_ERR(tsk); + } + break; + case AUDIT_ADD: + case AUDIT_ADD_RULE: + if (type == AUDIT_ADD) + entry = audit_rule_to_entry(data); + else + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + err = audit_add_rule(entry); + audit_log_rule_change(loginuid, sessionid, sid, "add rule", + &entry->rule, !err); + + if (err) + audit_free_rule(entry); + break; + case AUDIT_DEL: + case AUDIT_DEL_RULE: + if (type == AUDIT_DEL) + entry = audit_rule_to_entry(data); + else + entry = audit_data_to_entry(data, datasz); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + err = audit_del_rule(entry); + audit_log_rule_change(loginuid, sessionid, sid, "remove rule", + &entry->rule, !err); + + audit_free_rule(entry); + break; + default: + return -EINVAL; + } + + return err; +} + +int audit_comparator(u32 left, u32 op, u32 right) +{ + switch (op) { + case Audit_equal: + return (left == right); + case Audit_not_equal: + return (left != right); + case Audit_lt: + return (left < right); + case Audit_le: + return (left <= right); + case Audit_gt: + return (left > right); + case Audit_ge: + return (left >= right); + case Audit_bitmask: + return (left & right); + case Audit_bittest: + return ((left & right) == right); + default: + BUG(); + return 0; + } +} + +/* Compare given dentry name with last component in given path, + * return of 0 indicates a match. 
*/ +int audit_compare_dname_path(const char *dname, const char *path, + int *dirlen) +{ + int dlen, plen; + const char *p; + + if (!dname || !path) + return 1; + + dlen = strlen(dname); + plen = strlen(path); + if (plen < dlen) + return 1; + + /* disregard trailing slashes */ + p = path + plen - 1; + while ((*p == '/') && (p > path)) + p--; + + /* find last path component */ + p = p - dlen + 1; + if (p < path) + return 1; + else if (p > path) { + if (*--p != '/') + return 1; + else + p++; + } + + /* return length of path's directory component */ + if (dirlen) + *dirlen = p - path; + return strncmp(p, dname, dlen); +} + +static int audit_filter_user_rules(struct netlink_skb_parms *cb, + struct audit_krule *rule, + enum audit_state *state) +{ + int i; + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &rule->fields[i]; + int result = 0; + u32 sid; + + switch (f->type) { + case AUDIT_PID: + result = audit_comparator(cb->creds.pid, f->op, f->val); + break; + case AUDIT_UID: + result = audit_comparator(cb->creds.uid, f->op, f->val); + break; + case AUDIT_GID: + result = audit_comparator(cb->creds.gid, f->op, f->val); + break; + case AUDIT_LOGINUID: + result = audit_comparator(audit_get_loginuid(current), + f->op, f->val); + break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + if (f->lsm_rule) { + security_task_getsecid(current, &sid); + result = security_audit_rule_match(sid, + f->type, + f->op, + f->lsm_rule, + NULL); + } + break; + } + + if (!result) + return 0; + } + switch (rule->action) { + case AUDIT_NEVER: *state = AUDIT_DISABLED; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +int audit_filter_user(struct netlink_skb_parms *cb) +{ + enum audit_state state = AUDIT_DISABLED; + struct audit_entry *e; + int ret = 1; + + rcu_read_lock(); + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_USER], list) { + if 
(audit_filter_user_rules(cb, &e->rule, &state)) { + if (state == AUDIT_DISABLED) + ret = 0; + break; + } + } + rcu_read_unlock(); + + return ret; /* Audit by default */ +} + +int audit_filter_type(int type) +{ + struct audit_entry *e; + int result = 0; + + rcu_read_lock(); + if (list_empty(&audit_filter_list[AUDIT_FILTER_TYPE])) + goto unlock_and_return; + + list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TYPE], + list) { + int i; + for (i = 0; i < e->rule.field_count; i++) { + struct audit_field *f = &e->rule.fields[i]; + if (f->type == AUDIT_MSGTYPE) { + result = audit_comparator(type, f->op, f->val); + if (!result) + break; + } + } + if (result) + goto unlock_and_return; + } +unlock_and_return: + rcu_read_unlock(); + return result; +} + +static int update_lsm_rule(struct audit_krule *r) +{ + struct audit_entry *entry = container_of(r, struct audit_entry, rule); + struct audit_entry *nentry; + int err = 0; + + if (!security_audit_rule_known(r)) + return 0; + + nentry = audit_dupe_rule(r); + if (IS_ERR(nentry)) { + /* save the first error encountered for the + * return value */ + err = PTR_ERR(nentry); + audit_panic("error updating LSM filters"); + if (r->watch) + list_del(&r->rlist); + list_del_rcu(&entry->list); + list_del(&r->list); + } else { + if (r->watch || r->tree) + list_replace_init(&r->rlist, &nentry->rule.rlist); + list_replace_rcu(&entry->list, &nentry->list); + list_replace(&r->list, &nentry->rule.list); + } + call_rcu(&entry->rcu, audit_free_rule_rcu); + + return err; +} + +/* This function will re-initialize the lsm_rule field of all applicable rules. + * It will traverse the filter lists serarching for rules that contain LSM + * specific filter fields. When such a rule is found, it is copied, the + * LSM field is re-initialized, and the old rule is replaced with the + * updated rule. 
*/ +int audit_update_lsm_rules(void) +{ + struct audit_krule *r, *n; + int i, err = 0; + + /* audit_filter_mutex synchronizes the writers */ + mutex_lock(&audit_filter_mutex); + + for (i = 0; i < AUDIT_NR_FILTERS; i++) { + list_for_each_entry_safe(r, n, &audit_rules_list[i], list) { + int res = update_lsm_rule(r); + if (!err) + err = res; + } + } + mutex_unlock(&audit_filter_mutex); + + return err; +} diff --git a/kernel/auditsc.c b/kernel/auditsc.c new file mode 100644 index 00000000..00d79df0 --- /dev/null +++ b/kernel/auditsc.c @@ -0,0 +1,2551 @@ +/* auditsc.c -- System-call auditing support + * Handles all system-call specific auditing features. + * + * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. + * Copyright 2005 Hewlett-Packard Development Company, L.P. + * Copyright (C) 2005, 2006 IBM Corporation + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Written by Rickard E. (Rik) Faith + * + * Many of the ideas implemented here are from Stephen C. Tweedie, + * especially the idea of avoiding a copy by using getname. + * + * The method for actual interception of syscall entry and exit (not in + * this file -- see entry.S) is based on a GPL'd patch written by + * okir@suse.de and Copyright 2003 SuSE Linux AG. 
+ * + * POSIX message queue support added by George Wilson , + * 2006. + * + * The support of additional filter rules compares (>, <, >=, <=) was + * added by Dustin Kirkland , 2005. + * + * Modified by Amy Griffis to collect additional + * filesystem information. + * + * Subject and object context labeling support added by + * and for LSPP certification compliance. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "audit.h" + +/* AUDIT_NAMES is the number of slots we reserve in the audit_context + * for saving names from getname(). */ +#define AUDIT_NAMES 20 + +/* Indicates that audit should log the full pathname. */ +#define AUDIT_NAME_FULL -1 + +/* no execve audit message should be longer than this (userspace limits) */ +#define MAX_EXECVE_AUDIT_LEN 7500 + +/* number of audit rules */ +int audit_n_rules; + +/* determines whether we collect data for signals sent */ +int audit_signals; + +struct audit_cap_data { + kernel_cap_t permitted; + kernel_cap_t inheritable; + union { + unsigned int fE; /* effective bit of a file capability */ + kernel_cap_t effective; /* effective set of a process */ + }; +}; + +/* When fs/namei.c:getname() is called, we store the pointer in name and + * we don't let putname() free it (instead we free all of the saved + * pointers at syscall exit time). + * + * Further, in fs/namei.c:path_lookup() we store the inode and device. 
*/
+struct audit_names {
+	const char	*name;
+	int		name_len;	/* number of name's characters to log */
+	unsigned	name_put;	/* call __putname() for this name */
+	unsigned long	ino;
+	dev_t		dev;
+	umode_t		mode;
+	uid_t		uid;
+	gid_t		gid;
+	dev_t		rdev;
+	u32		osid;
+	struct audit_cap_data fcap;
+	unsigned int	fcap_ver;
+};
+/* Common header embedded as member 'd' at the start of every
+ * audit_aux_data_* record below; records are chained through ->next. */
+struct audit_aux_data {
+	struct audit_aux_data	*next;
+	int			type;
+};
+
+#define AUDIT_AUX_IPCPERM	0
+
+/* Number of target pids per aux struct. */
+#define AUDIT_AUX_PIDS	16
+
+struct audit_aux_data_execve {
+	struct audit_aux_data	d;
+	int argc;
+	int envc;
+	struct mm_struct *mm;
+};
+/* Parallel arrays with AUDIT_AUX_PIDS slots each; pid_count is presumably
+ * the number of slots in use - confirm against the code that fills it. */
+struct audit_aux_data_pids {
+	struct audit_aux_data	d;
+	pid_t			target_pid[AUDIT_AUX_PIDS];
+	uid_t			target_auid[AUDIT_AUX_PIDS];
+	uid_t			target_uid[AUDIT_AUX_PIDS];
+	unsigned int		target_sessionid[AUDIT_AUX_PIDS];
+	u32			target_sid[AUDIT_AUX_PIDS];
+	char 			target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
+	int			pid_count;
+};
+
+struct audit_aux_data_bprm_fcaps {
+	struct audit_aux_data	d;
+	struct audit_cap_data	fcap;
+	unsigned int		fcap_ver;
+	struct audit_cap_data	old_pcap;
+	struct audit_cap_data	new_pcap;
+};
+
+struct audit_aux_data_capset {
+	struct audit_aux_data	d;
+	pid_t			pid;
+	struct audit_cap_data	cap;
+};
+/* One node of the per-context list of chunk references: c[] has 31 slots,
+ * which is the literal the tree-ref helpers in this file count with. */
+struct audit_tree_refs {
+	struct audit_tree_refs *next;
+	struct audit_chunk *c[31];
+};
+
+/* The per-task audit context. 
*/
+struct audit_context {
+	int		    dummy;	/* must be the first element */
+	int		    in_syscall;	/* 1 if task is in a syscall */
+	enum audit_state    state, current_state;
+	unsigned int	    serial;     /* serial number for record */
+	int		    major;      /* syscall number */
+	struct timespec	    ctime;      /* time of syscall entry */
+	unsigned long	    argv[4];    /* syscall arguments */
+	long		    return_code;/* syscall return code */
+	u64		    prio;	/* priority of the rule that last matched;
+					 * a rule only wins if its prio is higher */
+	int		    return_valid; /* return code is valid */
+	int		    name_count;	/* number of names[] slots in use */
+	struct audit_names  names[AUDIT_NAMES];
+	char *		    filterkey;	/* key for rule that triggered record */
+	struct path	    pwd;
+	struct audit_context *previous; /* For nested syscalls */
+	struct audit_aux_data *aux;
+	struct audit_aux_data *aux_pids;
+	struct sockaddr_storage *sockaddr;
+	size_t sockaddr_len;
+				/* Save things to print about task_struct */
+	pid_t		    pid, ppid;
+	uid_t		    uid, euid, suid, fsuid;
+	gid_t		    gid, egid, sgid, fsgid;
+	unsigned long	    personality;
+	int		    arch;
+
+	pid_t		    target_pid;
+	uid_t		    target_auid;
+	uid_t		    target_uid;
+	unsigned int	    target_sessionid;
+	u32		    target_sid;
+	char		    target_comm[TASK_COMM_LEN];
+
+	struct audit_tree_refs *trees, *first_trees;
+	struct list_head killed_trees;
+	int tree_count;
+
+	int type;	/* audit_filter_rules() reads ->ipc only when this is
+			 * AUDIT_IPC; presumably it tags the union below */
+	union {
+		struct {
+			int nargs;
+			long args[6];
+		} socketcall;
+		struct {
+			uid_t			uid;
+			gid_t			gid;
+			mode_t			mode;
+			u32			osid;
+			int			has_perm;
+			uid_t			perm_uid;
+			gid_t			perm_gid;
+			mode_t			perm_mode;
+			unsigned long		qbytes;
+		} ipc;
+		struct {
+			mqd_t			mqdes;
+			struct mq_attr 		mqstat;
+		} mq_getsetattr;
+		struct {
+			mqd_t			mqdes;
+			int			sigev_signo;
+		} mq_notify;
+		struct {
+			mqd_t			mqdes;
+			size_t			msg_len;
+			unsigned int		msg_prio;
+			struct timespec		abs_timeout;
+		} mq_sendrecv;
+		struct {
+			int			oflag;
+			mode_t			mode;
+			struct mq_attr		attr;
+		} mq_open;
+		struct {
+			pid_t			pid;
+			struct audit_cap_data	cap;
+		} capset;
+		struct {
+			int			fd;
+			int			flags;
+		} mmap;
+	};
+	int fds[2];
+
+#if AUDIT_DEBUG
+	int		    put_count;
+	int		    ino_count;
+#endif
+};
+/* Map open(2)-style @flags to AUDIT_PERM_* bits (O_TRUNC or O_CREAT also
+ * counts as a write) and mask the result with @mask.
+ * NOTE(review): nothing in the visible part of this file calls this helper;
+ * audit_match_perm() uses ACC_MODE() directly - confirm which is intended. */
+static inline int open_arg(int flags, int mask)
+{
+	int n = ACC_MODE(flags);
+	if (flags & (O_TRUNC | O_CREAT))
+		n |= AUDIT_PERM_WRITE;
+	return n & mask;
+}
+/* Return non-zero when the syscall recorded in @ctx performs one of the
+ * access kinds requested in @mask (AUDIT_PERM_* bits); 0 when @ctx is NULL
+ * or the syscall class is not recognised. The case numbers are the values
+ * returned by audit_classify_syscall(). */
+static int audit_match_perm(struct audit_context *ctx, int mask)
+{
+	unsigned n;
+	if (unlikely(!ctx))
+		return 0;
+	n = ctx->major;
+
+	switch (audit_classify_syscall(ctx->arch, n)) {
+	case 0:	/* native */
+		if ((mask & AUDIT_PERM_WRITE) &&
+		     audit_match_class(AUDIT_CLASS_WRITE, n))
+			return 1;
+		if ((mask & AUDIT_PERM_READ) &&
+		     audit_match_class(AUDIT_CLASS_READ, n))
+			return 1;
+		if ((mask & AUDIT_PERM_ATTR) &&
+		     audit_match_class(AUDIT_CLASS_CHATTR, n))
+			return 1;
+		return 0;
+	case 1: /* 32bit on biarch */
+		if ((mask & AUDIT_PERM_WRITE) &&
+		     audit_match_class(AUDIT_CLASS_WRITE_32, n))
+			return 1;
+		if ((mask & AUDIT_PERM_READ) &&
+		     audit_match_class(AUDIT_CLASS_READ_32, n))
+			return 1;
+		if ((mask & AUDIT_PERM_ATTR) &&
+		     audit_match_class(AUDIT_CLASS_CHATTR_32, n))
+			return 1;
+		return 0;
+	case 2: /* open */
+		return mask & ACC_MODE(ctx->argv[1]);	/* flags are the 2nd syscall argument */
+	case 3: /* openat */
+		return mask & ACC_MODE(ctx->argv[2]);	/* flags are the 3rd syscall argument */
+	case 4: /* socketcall */
+		return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
+	case 5: /* execve */
+		return mask & AUDIT_PERM_EXEC;
+	default:
+		return 0;
+	}
+}
+/* @which packs two things: the S_IFMT bits give the wanted file type, the
+ * remaining bits give an index into ctx->names[]. Returns 1 when that name
+ * slot is in use, has an inode (ino != -1) and its mode has the wanted type;
+ * 0 otherwise or when @ctx is NULL. */
+static int audit_match_filetype(struct audit_context *ctx, int which)
+{
+	unsigned index = which & ~S_IFMT;
+	mode_t mode = which & S_IFMT;
+
+	if (unlikely(!ctx))
+		return 0;
+
+	if (index >= ctx->name_count)
+		return 0;
+	if (ctx->names[index].ino == -1)
+		return 0;
+	if ((ctx->names[index].mode ^ mode) & S_IFMT)
+		return 0;
+	return 1;
+}
+
+/*
+ * We keep a linked list of fixed-sized (31 pointer) arrays of audit_chunk *;
+ * ->first_trees points to its beginning, ->trees - to the current end of data.
+ * ->tree_count is the number of free entries in array pointed to by ->trees.
+ * Original condition is (NULL, NULL, 0); as soon as it grows we never revert to NULL,
+ * "empty" becomes (p, p, 31) afterwards. 
We don't shrink the list (and seriously, + * it's going to remain 1-element for almost any setup) until we free context itself. + * References in it _are_ dropped - at the same time we free/drop aux stuff. + */ + +#ifdef CONFIG_AUDIT_TREE +static void audit_set_auditable(struct audit_context *ctx) +{ + if (!ctx->prio) { + ctx->prio = 1; + ctx->current_state = AUDIT_RECORD_CONTEXT; + } +} + +static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk) +{ + struct audit_tree_refs *p = ctx->trees; + int left = ctx->tree_count; + if (likely(left)) { + p->c[--left] = chunk; + ctx->tree_count = left; + return 1; + } + if (!p) + return 0; + p = p->next; + if (p) { + p->c[30] = chunk; + ctx->trees = p; + ctx->tree_count = 30; + return 1; + } + return 0; +} + +static int grow_tree_refs(struct audit_context *ctx) +{ + struct audit_tree_refs *p = ctx->trees; + ctx->trees = kzalloc(sizeof(struct audit_tree_refs), GFP_KERNEL); + if (!ctx->trees) { + ctx->trees = p; + return 0; + } + if (p) + p->next = ctx->trees; + else + ctx->first_trees = ctx->trees; + ctx->tree_count = 31; + return 1; +} +#endif + +static void unroll_tree_refs(struct audit_context *ctx, + struct audit_tree_refs *p, int count) +{ +#ifdef CONFIG_AUDIT_TREE + struct audit_tree_refs *q; + int n; + if (!p) { + /* we started with empty chain */ + p = ctx->first_trees; + count = 31; + /* if the very first allocation has failed, nothing to do */ + if (!p) + return; + } + n = count; + for (q = p; q != ctx->trees; q = q->next, n = 31) { + while (n--) { + audit_put_chunk(q->c[n]); + q->c[n] = NULL; + } + } + while (n-- > ctx->tree_count) { + audit_put_chunk(q->c[n]); + q->c[n] = NULL; + } + ctx->trees = p; + ctx->tree_count = count; +#endif +} + +static void free_tree_refs(struct audit_context *ctx) +{ + struct audit_tree_refs *p, *q; + for (p = ctx->first_trees; p; p = q) { + q = p->next; + kfree(p); + } +} + +static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree) +{ +#ifdef 
CONFIG_AUDIT_TREE + struct audit_tree_refs *p; + int n; + if (!tree) + return 0; + /* full ones */ + for (p = ctx->first_trees; p != ctx->trees; p = p->next) { + for (n = 0; n < 31; n++) + if (audit_tree_match(p->c[n], tree)) + return 1; + } + /* partial */ + if (p) { + for (n = ctx->tree_count; n < 31; n++) + if (audit_tree_match(p->c[n], tree)) + return 1; + } +#endif + return 0; +} + +/* Determine if any context name data matches a rule's watch data */ +/* Compare a task_struct with an audit_rule. Return 1 on match, 0 + * otherwise. + * + * If task_creation is true, this is an explicit indication that we are + * filtering a task rule at task creation time. This and tsk == current are + * the only situations where tsk->cred may be accessed without an rcu read lock. + */ +static int audit_filter_rules(struct task_struct *tsk, + struct audit_krule *rule, + struct audit_context *ctx, + struct audit_names *name, + enum audit_state *state, + bool task_creation) +{ + const struct cred *cred; + int i, j, need_sid = 1; + u32 sid; + + cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation); + + for (i = 0; i < rule->field_count; i++) { + struct audit_field *f = &rule->fields[i]; + int result = 0; + + switch (f->type) { + case AUDIT_PID: + result = audit_comparator(tsk->pid, f->op, f->val); + break; + case AUDIT_PPID: + if (ctx) { + if (!ctx->ppid) + ctx->ppid = sys_getppid(); + result = audit_comparator(ctx->ppid, f->op, f->val); + } + break; + case AUDIT_UID: + result = audit_comparator(cred->uid, f->op, f->val); + break; + case AUDIT_EUID: + result = audit_comparator(cred->euid, f->op, f->val); + break; + case AUDIT_SUID: + result = audit_comparator(cred->suid, f->op, f->val); + break; + case AUDIT_FSUID: + result = audit_comparator(cred->fsuid, f->op, f->val); + break; + case AUDIT_GID: + result = audit_comparator(cred->gid, f->op, f->val); + break; + case AUDIT_EGID: + result = audit_comparator(cred->egid, f->op, f->val); + break; + case AUDIT_SGID: + 
result = audit_comparator(cred->sgid, f->op, f->val); + break; + case AUDIT_FSGID: + result = audit_comparator(cred->fsgid, f->op, f->val); + break; + case AUDIT_PERS: + result = audit_comparator(tsk->personality, f->op, f->val); + break; + case AUDIT_ARCH: + if (ctx) + result = audit_comparator(ctx->arch, f->op, f->val); + break; + + case AUDIT_EXIT: + if (ctx && ctx->return_valid) + result = audit_comparator(ctx->return_code, f->op, f->val); + break; + case AUDIT_SUCCESS: + if (ctx && ctx->return_valid) { + if (f->val) + result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); + else + result = audit_comparator(ctx->return_valid, f->op, AUDITSC_FAILURE); + } + break; + case AUDIT_DEVMAJOR: + if (name) + result = audit_comparator(MAJOR(name->dev), + f->op, f->val); + else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_DEVMINOR: + if (name) + result = audit_comparator(MINOR(name->dev), + f->op, f->val); + else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_INODE: + if (name) + result = (name->ino == f->val); + else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (audit_comparator(ctx->names[j].ino, f->op, f->val)) { + ++result; + break; + } + } + } + break; + case AUDIT_WATCH: + if (name) + result = audit_watch_compare(rule->watch, name->ino, name->dev); + break; + case AUDIT_DIR: + if (ctx) + result = match_tree_refs(ctx, rule->tree); + break; + case AUDIT_LOGINUID: + result = 0; + if (ctx) + result = audit_comparator(tsk->loginuid, f->op, f->val); + break; + case AUDIT_SUBJ_USER: + case AUDIT_SUBJ_ROLE: + case AUDIT_SUBJ_TYPE: + case AUDIT_SUBJ_SEN: + case AUDIT_SUBJ_CLR: + /* NOTE: this may return negative values indicating + a temporary error. 
We simply treat this as a + match for now to avoid losing information that + may be wanted. An error message will also be + logged upon error */ + if (f->lsm_rule) { + if (need_sid) { + security_task_getsecid(tsk, &sid); + need_sid = 0; + } + result = security_audit_rule_match(sid, f->type, + f->op, + f->lsm_rule, + ctx); + } + break; + case AUDIT_OBJ_USER: + case AUDIT_OBJ_ROLE: + case AUDIT_OBJ_TYPE: + case AUDIT_OBJ_LEV_LOW: + case AUDIT_OBJ_LEV_HIGH: + /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR + also applies here */ + if (f->lsm_rule) { + /* Find files that match */ + if (name) { + result = security_audit_rule_match( + name->osid, f->type, f->op, + f->lsm_rule, ctx); + } else if (ctx) { + for (j = 0; j < ctx->name_count; j++) { + if (security_audit_rule_match( + ctx->names[j].osid, + f->type, f->op, + f->lsm_rule, ctx)) { + ++result; + break; + } + } + } + /* Find ipc objects that match */ + if (!ctx || ctx->type != AUDIT_IPC) + break; + if (security_audit_rule_match(ctx->ipc.osid, + f->type, f->op, + f->lsm_rule, ctx)) + ++result; + } + break; + case AUDIT_ARG0: + case AUDIT_ARG1: + case AUDIT_ARG2: + case AUDIT_ARG3: + if (ctx) + result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); + break; + case AUDIT_FILTERKEY: + /* ignore this field for filtering */ + result = 1; + break; + case AUDIT_PERM: + result = audit_match_perm(ctx, f->val); + break; + case AUDIT_FILETYPE: + result = audit_match_filetype(ctx, f->val); + break; + } + + if (!result) + return 0; + } + + if (ctx) { + if (rule->prio <= ctx->prio) + return 0; + if (rule->filterkey) { + kfree(ctx->filterkey); + ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); + } + ctx->prio = rule->prio; + } + switch (rule->action) { + case AUDIT_NEVER: *state = AUDIT_DISABLED; break; + case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; + } + return 1; +} + +/* At process creation time, we can determine if system-call auditing is + * completely disabled for this task. 
 Since we only have the task
+ * structure at this point, we can only check uid and gid.
+ */
+static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
+{	/* Returns the state chosen by the first matching AUDIT_FILTER_TASK rule,
+	 * or AUDIT_BUILD_CONTEXT when none matches. *key is written only when
+	 * that state is AUDIT_RECORD_CONTEXT; the kstrdup() result is stored
+	 * unchecked, so it may be NULL. */
+	struct audit_entry *e;
+	enum audit_state   state;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
+		if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+				       &state, true)) {
+			if (state == AUDIT_RECORD_CONTEXT)
+				*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
+			rcu_read_unlock();
+			return state;
+		}
+	}
+	rcu_read_unlock();
+	return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall entry and exit time, this filter is called if the
+ * audit_state is not low enough that auditing cannot take place, but is
+ * also not high enough that we already know we have to write an audit
+ * record (i.e., the state is AUDIT_SETUP_CONTEXT or AUDIT_BUILD_CONTEXT).
+ *
+ * Returns AUDIT_DISABLED for tasks whose tgid equals audit_pid, the state of
+ * the first rule on @list that covers syscall ctx->major and matches (also
+ * stored in ctx->current_state), or AUDIT_BUILD_CONTEXT when nothing matches.
+ */
+static enum audit_state audit_filter_syscall(struct task_struct *tsk,
+					     struct audit_context *ctx,
+					     struct list_head *list)
+{
+	struct audit_entry *e;
+	enum audit_state state;
+
+	if (audit_pid && tsk->tgid == audit_pid)
+		return AUDIT_DISABLED;
+
+	rcu_read_lock();
+	if (!list_empty(list)) {
+		int word = AUDIT_WORD(ctx->major);	/* mask[] word holding this syscall */
+		int bit  = AUDIT_BIT(ctx->major);	/* bit for this syscall inside that word */
+
+		list_for_each_entry_rcu(e, list, list) {
+			if ((e->rule.mask[word] & bit) == bit &&
+			    audit_filter_rules(tsk, &e->rule, ctx, NULL,
+					       &state, false)) {
+				rcu_read_unlock();
+				ctx->current_state = state;
+				return state;
+			}
+		}
+	}
+	rcu_read_unlock();
+	return AUDIT_BUILD_CONTEXT;
+}
+
+/* At syscall exit time, this filter is called if any audit_names[] have been
+ * collected during syscall processing. We only check rules in sublists at hash
+ * buckets applicable to the inode numbers in audit_names[].
+ * Regarding audit_state, same rules apply as for audit_filter_syscall(). 
+ */ /*
+ * audit_filter_inodes: for each collected name, look up the inode hash bucket
+ * of its inode number and stop at the first rule that covers syscall
+ * ctx->major and matches; that rule's state is stored in ctx->current_state.
+ * Does nothing for tasks whose tgid equals audit_pid. Returns nothing.
+ */
+void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
+{
+	int i;
+	struct audit_entry *e;
+	enum audit_state state;
+
+	if (audit_pid && tsk->tgid == audit_pid)
+		return;
+
+	rcu_read_lock();
+	for (i = 0; i < ctx->name_count; i++) {
+		int word = AUDIT_WORD(ctx->major);
+		int bit  = AUDIT_BIT(ctx->major);
+		struct audit_names *n = &ctx->names[i];
+		int h = audit_hash_ino((u32)n->ino);
+		struct list_head *list = &audit_inode_hash[h];
+
+		if (list_empty(list))
+			continue;
+
+		list_for_each_entry_rcu(e, list, list) {
+			if ((e->rule.mask[word] & bit) == bit &&
+			    audit_filter_rules(tsk, &e->rule, ctx, n,
+				       &state, false)) {
+				rcu_read_unlock();
+				ctx->current_state = state;
+				return;
+			}
+		}
+	}
+	rcu_read_unlock();
+}
+/* Detach and return @tsk's audit context (NULL if it has none) after
+ * recording the syscall result in it. When the context is inside a syscall
+ * and not a dummy, the exit filters are run first so that current_state
+ * reflects them. */
+static inline struct audit_context *audit_get_context(struct task_struct *tsk,
+						      int return_valid,
+						      long return_code)
+{
+	struct audit_context *context = tsk->audit_context;
+
+	if (likely(!context))
+		return NULL;
+	context->return_valid = return_valid;
+
+	/*
+	 * we need to fix up the return code in the audit logs if the actual
+	 * return codes are later going to be fixed up by the arch specific
+	 * signal handlers
+	 *
+	 * This is actually a test for:
+	 * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
+	 * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
+	 *
+	 * but is faster than a bunch of ||
+	 */
+	if (unlikely(return_code <= -ERESTARTSYS) &&
+	    (return_code >= -ERESTART_RESTARTBLOCK) &&
+	    (return_code != -ENOIOCTLCMD))
+		context->return_code = -EINTR;
+	else
+		context->return_code  = return_code;
+
+	if (context->in_syscall && !context->dummy) {
+		audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
+		audit_filter_inodes(tsk, context);
+	}
+
+	tsk->audit_context = NULL;
+	return context;
+}
+/* Release the names collected in @context (only those flagged name_put are
+ * handed to __putname()), reset name_count and drop the pwd reference. */
+static inline void audit_free_names(struct audit_context *context)
+{
+	int i;
+
+#if AUDIT_DEBUG == 2
+	if (context->put_count + context->ino_count != context->name_count) {
+		printk(KERN_ERR "%s:%d(:%d): major=%d in_syscall=%d"
+		       " name_count=%d put_count=%d"
+		       " ino_count=%d [NOT freeing]\n",
+		       __FILE__, __LINE__,
+		       context->serial, context->major, context->in_syscall,
+		       context->name_count, context->put_count,
+		       context->ino_count);
+		for (i = 0; i < context->name_count; i++) {
+			printk(KERN_ERR "names[%d] = %p = %s\n", i,
+			       context->names[i].name,
+			       context->names[i].name ?: "(null)");
+		}
+		dump_stack();
+		return;
+	}
+#endif
+#if AUDIT_DEBUG
+	context->put_count  = 0;
+	context->ino_count  = 0;
+#endif
+
+	for (i = 0; i < context->name_count; i++) {
+		if (context->names[i].name && context->names[i].name_put)
+			__putname(context->names[i].name);
+	}
+	context->name_count = 0;
+	path_put(&context->pwd);
+	context->pwd.dentry = NULL;	/* cleared so a later call does not reuse the old path */
+	context->pwd.mnt = NULL;
+}
+/* Free both auxiliary record chains (->aux and ->aux_pids), leaving them NULL. */
+static inline void audit_free_aux(struct audit_context *context)
+{
+	struct audit_aux_data *aux;
+
+	while ((aux = context->aux)) {
+		context->aux = aux->next;
+		kfree(aux);
+	}
+	while ((aux = context->aux_pids)) {
+		context->aux_pids = aux->next;
+		kfree(aux);
+	}
+}
+/* Zero the whole context, then set its state; prio starts at the maximum
+ * for AUDIT_RECORD_CONTEXT and at 0 for every other state. */
+static inline void audit_zero_context(struct audit_context *context,
+				      enum audit_state state)
+{
+	memset(context, 0, sizeof(*context));
+	context->state      = state;
+	context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
+}
+/* Allocate and initialise a context in @state; returns NULL on allocation
+ * failure. */
+static inline struct audit_context *audit_alloc_context(enum audit_state state)
+{
+	struct audit_context *context;
+
+	if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+		return NULL;
+	audit_zero_context(context, state);
+	INIT_LIST_HEAD(&context->killed_trees);	/* must follow the memset in audit_zero_context() */
+	return context;
+}
+
+/**
+ * audit_alloc - allocate an audit context block for a task
+ * @tsk: task
+ *
+ * Filter on the task information and allocate a per-task audit context
+ * if necessary. Doing so turns on system call auditing for the
+ * specified task. This is called from copy_process, so no lock is
+ * needed. 
+ */ +int audit_alloc(struct task_struct *tsk) +{ + struct audit_context *context; + enum audit_state state; + char *key = NULL; + + if (likely(!audit_ever_enabled)) + return 0; /* Return if not auditing. */ + + state = audit_filter_task(tsk, &key); + if (likely(state == AUDIT_DISABLED)) + return 0; + + if (!(context = audit_alloc_context(state))) { + kfree(key); + audit_log_lost("out of memory in audit_alloc"); + return -ENOMEM; + } + context->filterkey = key; + + tsk->audit_context = context; + set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + return 0; +} + +static inline void audit_free_context(struct audit_context *context) +{ + struct audit_context *previous; + int count = 0; + + do { + previous = context->previous; + if (previous || (count && count < 10)) { + ++count; + printk(KERN_ERR "audit(:%d): major=%d name_count=%d:" + " freeing multiple contexts (%d)\n", + context->serial, context->major, + context->name_count, count); + } + audit_free_names(context); + unroll_tree_refs(context, NULL, 0); + free_tree_refs(context); + audit_free_aux(context); + kfree(context->filterkey); + kfree(context->sockaddr); + kfree(context); + context = previous; + } while (context); + if (count >= 10) + printk(KERN_ERR "audit: freed %d contexts\n", count); +} + +void audit_log_task_context(struct audit_buffer *ab) +{ + char *ctx = NULL; + unsigned len; + int error; + u32 sid; + + security_task_getsecid(current, &sid); + if (!sid) + return; + + error = security_secid_to_secctx(sid, &ctx, &len); + if (error) { + if (error != -EINVAL) + goto error_path; + return; + } + + audit_log_format(ab, " subj=%s", ctx); + security_release_secctx(ctx, len); + return; + +error_path: + audit_panic("error in audit_log_task_context"); + return; +} + +EXPORT_SYMBOL(audit_log_task_context); + +static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) +{ + char name[sizeof(tsk->comm)]; + struct mm_struct *mm = tsk->mm; + struct vm_area_struct *vma; + + /* tsk == current */ + + 
get_task_comm(name, tsk); + audit_log_format(ab, " comm="); + audit_log_untrustedstring(ab, name); + + if (mm) { + down_read(&mm->mmap_sem); + vma = mm->mmap; + while (vma) { + if ((vma->vm_flags & VM_EXECUTABLE) && + vma->vm_file) { + audit_log_d_path(ab, "exe=", + &vma->vm_file->f_path); + break; + } + vma = vma->vm_next; + } + up_read(&mm->mmap_sem); + } + audit_log_task_context(ab); +} + +static int audit_log_pid_context(struct audit_context *context, pid_t pid, + uid_t auid, uid_t uid, unsigned int sessionid, + u32 sid, char *comm) +{ + struct audit_buffer *ab; + char *ctx = NULL; + u32 len; + int rc = 0; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); + if (!ab) + return rc; + + audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, + uid, sessionid); + if (security_secid_to_secctx(sid, &ctx, &len)) { + audit_log_format(ab, " obj=(none)"); + rc = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + audit_log_format(ab, " ocomm="); + audit_log_untrustedstring(ab, comm); + audit_log_end(ab); + + return rc; +} + +/* + * to_send and len_sent accounting are very loose estimates. We aren't + * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being + * within about 500 bytes (next page boundary) + * + * why snprintf? an int is up to 12 digits long. if we just assumed when + * logging that a[%d]= was going to be 16 characters long we would be wasting + * space in every audit message. In one 7500 byte message we can log up to + * about 1000 min size arguments. That comes down to about 50% waste of space + * if we didn't do the snprintf to find out how long arg_num_len was. + */ +static int audit_log_single_execve_arg(struct audit_context *context, + struct audit_buffer **ab, + int arg_num, + size_t *len_sent, + const char __user *p, + char *buf) +{ + char arg_num_len_buf[12]; + const char __user *tmp_p = p; + /* how many digits are in arg_num? 
5 is the length of ' a=""' */ + size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5; + size_t len, len_left, to_send; + size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; + unsigned int i, has_cntl = 0, too_long = 0; + int ret; + + /* strnlen_user includes the null we don't want to send */ + len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; + + /* + * We just created this mm, if we can't find the strings + * we just copied into it something is _very_ wrong. Similar + * for strings that are too long, we should not have created + * any. + */ + if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + return -1; + } + + /* walk the whole argument looking for non-ascii chars */ + do { + if (len_left > MAX_EXECVE_AUDIT_LEN) + to_send = MAX_EXECVE_AUDIT_LEN; + else + to_send = len_left; + ret = copy_from_user(buf, tmp_p, to_send); + /* + * There is no reason for this copy to be short. We just + * copied them here, and the mm hasn't been exposed to user- + * space yet. + */ + if (ret) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + return -1; + } + buf[to_send] = '\0'; + has_cntl = audit_string_contains_control(buf, to_send); + if (has_cntl) { + /* + * hex messages get logged as 2 bytes, so we can only + * send half as much in each message + */ + max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; + break; + } + len_left -= to_send; + tmp_p += to_send; + } while (len_left > 0); + + len_left = len; + + if (len > max_execve_audit_len) + too_long = 1; + + /* rewalk the argument actually logging the message */ + for (i = 0; len_left > 0; i++) { + int room_left; + + if (len_left > max_execve_audit_len) + to_send = max_execve_audit_len; + else + to_send = len_left; + + /* do we have space left to send this argument in this ab? 
*/ + room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; + if (has_cntl) + room_left -= (to_send * 2); + else + room_left -= to_send; + if (room_left < 0) { + *len_sent = 0; + audit_log_end(*ab); + *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); + if (!*ab) + return 0; + } + + /* + * first record needs to say how long the original string was + * so we can be sure nothing was lost. + */ + if ((i == 0) && (too_long)) + audit_log_format(*ab, " a%d_len=%zu", arg_num, + has_cntl ? 2*len : len); + + /* + * normally arguments are small enough to fit and we already + * filled buf above when we checked for control characters + * so don't bother with another copy_from_user + */ + if (len >= max_execve_audit_len) + ret = copy_from_user(buf, p, to_send); + else + ret = 0; + if (ret) { + WARN_ON(1); + send_sig(SIGKILL, current, 0); + return -1; + } + buf[to_send] = '\0'; + + /* actually log it */ + audit_log_format(*ab, " a%d", arg_num); + if (too_long) + audit_log_format(*ab, "[%d]", i); + audit_log_format(*ab, "="); + if (has_cntl) + audit_log_n_hex(*ab, buf, to_send); + else + audit_log_string(*ab, buf); + + p += to_send; + len_left -= to_send; + *len_sent += arg_num_len; + if (has_cntl) + *len_sent += to_send * 2; + else + *len_sent += to_send; + } + /* include the null we didn't log */ + return len + 1; +} + +static void audit_log_execve_info(struct audit_context *context, + struct audit_buffer **ab, + struct audit_aux_data_execve *axi) +{ + int i; + size_t len, len_sent = 0; + const char __user *p; + char *buf; + + if (axi->mm != current->mm) + return; /* execve failed, no additional info */ + + p = (const char __user *)axi->mm->arg_start; + + audit_log_format(*ab, "argc=%d", axi->argc); + + /* + * we need some kernel buffer to hold the userspace args. Just + * allocate one big one rather than allocating one of the right size + * for every single argument inside audit_log_single_execve_arg() + * should be <8k allocation so should be pretty safe. 
+ */ + buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); + if (!buf) { + audit_panic("out of memory for argv string\n"); + return; + } + + for (i = 0; i < axi->argc; i++) { + len = audit_log_single_execve_arg(context, ab, i, + &len_sent, p, buf); + if (len <= 0) + break; + p += len; + } + kfree(buf); +} + +static void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap) +{ + int i; + + audit_log_format(ab, " %s=", prefix); + CAP_FOR_EACH_U32(i) { + audit_log_format(ab, "%08x", cap->cap[(_KERNEL_CAPABILITY_U32S-1) - i]); + } +} + +static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) +{ + kernel_cap_t *perm = &name->fcap.permitted; + kernel_cap_t *inh = &name->fcap.inheritable; + int log = 0; + + if (!cap_isclear(*perm)) { + audit_log_cap(ab, "cap_fp", perm); + log = 1; + } + if (!cap_isclear(*inh)) { + audit_log_cap(ab, "cap_fi", inh); + log = 1; + } + + if (log) + audit_log_format(ab, " cap_fe=%d cap_fver=%x", name->fcap.fE, name->fcap_ver); +} + +static void show_special(struct audit_context *context, int *call_panic) +{ + struct audit_buffer *ab; + int i; + + ab = audit_log_start(context, GFP_KERNEL, context->type); + if (!ab) + return; + + switch (context->type) { + case AUDIT_SOCKETCALL: { + int nargs = context->socketcall.nargs; + audit_log_format(ab, "nargs=%d", nargs); + for (i = 0; i < nargs; i++) + audit_log_format(ab, " a%d=%lx", i, + context->socketcall.args[i]); + break; } + case AUDIT_IPC: { + u32 osid = context->ipc.osid; + + audit_log_format(ab, "ouid=%u ogid=%u mode=%#o", + context->ipc.uid, context->ipc.gid, context->ipc.mode); + if (osid) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx(osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", osid); + *call_panic = 1; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + if (context->ipc.has_perm) { + audit_log_end(ab); + ab = audit_log_start(context, GFP_KERNEL, + AUDIT_IPC_SET_PERM); + 
audit_log_format(ab, + "qbytes=%lx ouid=%u ogid=%u mode=%#o", + context->ipc.qbytes, + context->ipc.perm_uid, + context->ipc.perm_gid, + context->ipc.perm_mode); + if (!ab) + return; + } + break; } + case AUDIT_MQ_OPEN: { + audit_log_format(ab, + "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld " + "mq_msgsize=%ld mq_curmsgs=%ld", + context->mq_open.oflag, context->mq_open.mode, + context->mq_open.attr.mq_flags, + context->mq_open.attr.mq_maxmsg, + context->mq_open.attr.mq_msgsize, + context->mq_open.attr.mq_curmsgs); + break; } + case AUDIT_MQ_SENDRECV: { + audit_log_format(ab, + "mqdes=%d msg_len=%zd msg_prio=%u " + "abs_timeout_sec=%ld abs_timeout_nsec=%ld", + context->mq_sendrecv.mqdes, + context->mq_sendrecv.msg_len, + context->mq_sendrecv.msg_prio, + context->mq_sendrecv.abs_timeout.tv_sec, + context->mq_sendrecv.abs_timeout.tv_nsec); + break; } + case AUDIT_MQ_NOTIFY: { + audit_log_format(ab, "mqdes=%d sigev_signo=%d", + context->mq_notify.mqdes, + context->mq_notify.sigev_signo); + break; } + case AUDIT_MQ_GETSETATTR: { + struct mq_attr *attr = &context->mq_getsetattr.mqstat; + audit_log_format(ab, + "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld " + "mq_curmsgs=%ld ", + context->mq_getsetattr.mqdes, + attr->mq_flags, attr->mq_maxmsg, + attr->mq_msgsize, attr->mq_curmsgs); + break; } + case AUDIT_CAPSET: { + audit_log_format(ab, "pid=%d", context->capset.pid); + audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable); + audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted); + audit_log_cap(ab, "cap_pe", &context->capset.cap.effective); + break; } + case AUDIT_MMAP: { + audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd, + context->mmap.flags); + break; } + } + audit_log_end(ab); +} + +static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) +{ + const struct cred *cred; + int i, call_panic = 0; + struct audit_buffer *ab; + struct audit_aux_data *aux; + const char *tty; + + /* tsk == current */ + 
context->pid = tsk->pid; + if (!context->ppid) + context->ppid = sys_getppid(); + cred = current_cred(); + context->uid = cred->uid; + context->gid = cred->gid; + context->euid = cred->euid; + context->suid = cred->suid; + context->fsuid = cred->fsuid; + context->egid = cred->egid; + context->sgid = cred->sgid; + context->fsgid = cred->fsgid; + context->personality = tsk->personality; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); + if (!ab) + return; /* audit_panic has been called */ + audit_log_format(ab, "arch=%x syscall=%d", + context->arch, context->major); + if (context->personality != PER_LINUX) + audit_log_format(ab, " per=%lx", context->personality); + if (context->return_valid) + audit_log_format(ab, " success=%s exit=%ld", + (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", + context->return_code); + + spin_lock_irq(&tsk->sighand->siglock); + if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) + tty = tsk->signal->tty->name; + else + tty = "(none)"; + spin_unlock_irq(&tsk->sighand->siglock); + + audit_log_format(ab, + " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" + " ppid=%d pid=%d auid=%u uid=%u gid=%u" + " euid=%u suid=%u fsuid=%u" + " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", + context->argv[0], + context->argv[1], + context->argv[2], + context->argv[3], + context->name_count, + context->ppid, + context->pid, + tsk->loginuid, + context->uid, + context->gid, + context->euid, context->suid, context->fsuid, + context->egid, context->sgid, context->fsgid, tty, + tsk->sessionid); + + + audit_log_task_info(ab, tsk); + audit_log_key(ab, context->filterkey); + audit_log_end(ab); + + for (aux = context->aux; aux; aux = aux->next) { + + ab = audit_log_start(context, GFP_KERNEL, aux->type); + if (!ab) + continue; /* audit_panic has been called */ + + switch (aux->type) { + + case AUDIT_EXECVE: { + struct audit_aux_data_execve *axi = (void *)aux; + audit_log_execve_info(context, &ab, axi); + break; } + + case AUDIT_BPRM_FCAPS: { + struct 
audit_aux_data_bprm_fcaps *axs = (void *)aux; + audit_log_format(ab, "fver=%x", axs->fcap_ver); + audit_log_cap(ab, "fp", &axs->fcap.permitted); + audit_log_cap(ab, "fi", &axs->fcap.inheritable); + audit_log_format(ab, " fe=%d", axs->fcap.fE); + audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted); + audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable); + audit_log_cap(ab, "old_pe", &axs->old_pcap.effective); + audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted); + audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable); + audit_log_cap(ab, "new_pe", &axs->new_pcap.effective); + break; } + + } + audit_log_end(ab); + } + + if (context->type) + show_special(context, &call_panic); + + if (context->fds[0] >= 0) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_FD_PAIR); + if (ab) { + audit_log_format(ab, "fd0=%d fd1=%d", + context->fds[0], context->fds[1]); + audit_log_end(ab); + } + } + + if (context->sockaddr_len) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_SOCKADDR); + if (ab) { + audit_log_format(ab, "saddr="); + audit_log_n_hex(ab, (void *)context->sockaddr, + context->sockaddr_len); + audit_log_end(ab); + } + } + + for (aux = context->aux_pids; aux; aux = aux->next) { + struct audit_aux_data_pids *axs = (void *)aux; + + for (i = 0; i < axs->pid_count; i++) + if (audit_log_pid_context(context, axs->target_pid[i], + axs->target_auid[i], + axs->target_uid[i], + axs->target_sessionid[i], + axs->target_sid[i], + axs->target_comm[i])) + call_panic = 1; + } + + if (context->target_pid && + audit_log_pid_context(context, context->target_pid, + context->target_auid, context->target_uid, + context->target_sessionid, + context->target_sid, context->target_comm)) + call_panic = 1; + + if (context->pwd.dentry && context->pwd.mnt) { + ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD); + if (ab) { + audit_log_d_path(ab, "cwd=", &context->pwd); + audit_log_end(ab); + } + } + for (i = 0; i < context->name_count; i++) { + struct audit_names *n = 
&context->names[i]; + + ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); + if (!ab) + continue; /* audit_panic has been called */ + + audit_log_format(ab, "item=%d", i); + + if (n->name) { + switch(n->name_len) { + case AUDIT_NAME_FULL: + /* log the full path */ + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, n->name); + break; + case 0: + /* name was specified as a relative path and the + * directory component is the cwd */ + audit_log_d_path(ab, "name=", &context->pwd); + break; + default: + /* log the name's directory component */ + audit_log_format(ab, " name="); + audit_log_n_untrustedstring(ab, n->name, + n->name_len); + } + } else + audit_log_format(ab, " name=(null)"); + + if (n->ino != (unsigned long)-1) { + audit_log_format(ab, " inode=%lu" + " dev=%02x:%02x mode=%#o" + " ouid=%u ogid=%u rdev=%02x:%02x", + n->ino, + MAJOR(n->dev), + MINOR(n->dev), + n->mode, + n->uid, + n->gid, + MAJOR(n->rdev), + MINOR(n->rdev)); + } + if (n->osid != 0) { + char *ctx = NULL; + u32 len; + if (security_secid_to_secctx( + n->osid, &ctx, &len)) { + audit_log_format(ab, " osid=%u", n->osid); + call_panic = 2; + } else { + audit_log_format(ab, " obj=%s", ctx); + security_release_secctx(ctx, len); + } + } + + audit_log_fcaps(ab, n); + + audit_log_end(ab); + } + + /* Send end of event record to help user space know we are finished */ + ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); + if (ab) + audit_log_end(ab); + if (call_panic) + audit_panic("error converting sid to string"); +} + +/** + * audit_free - free a per-task audit context + * @tsk: task whose audit context block to free + * + * Called from copy_process and do_exit + */ +void audit_free(struct task_struct *tsk) +{ + struct audit_context *context; + + context = audit_get_context(tsk, 0, 0); + if (likely(!context)) + return; + + /* Check for system calls that do not go through the exit + * function (e.g., exit_group), then free context block. 
+ * We use GFP_ATOMIC here because we might be doing this + * in the context of the idle thread */ + /* that can happen only if we are called from do_exit() */ + if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT) + audit_log_exit(context, tsk); + if (!list_empty(&context->killed_trees)) + audit_kill_trees(&context->killed_trees); + + audit_free_context(context); +} + +/** + * audit_syscall_entry - fill in an audit record at syscall entry + * @arch: architecture type + * @major: major syscall type (function) + * @a1: additional syscall register 1 + * @a2: additional syscall register 2 + * @a3: additional syscall register 3 + * @a4: additional syscall register 4 + * + * Fill in audit context at syscall entry. This only happens if the + * audit context was created when the task was created and the state or + * filters demand the audit context be built. If the state from the + * per-task filter or from the per-syscall filter is AUDIT_RECORD_CONTEXT, + * then the record will be written at syscall exit time (otherwise, it + * will only be written if another part of the kernel requests that it + * be written). + */ +void audit_syscall_entry(int arch, int major, + unsigned long a1, unsigned long a2, + unsigned long a3, unsigned long a4) +{ + struct task_struct *tsk = current; + struct audit_context *context = tsk->audit_context; + enum audit_state state; + + if (unlikely(!context)) + return; + + /* + * This happens only on certain architectures that make system + * calls in kernel_thread via the entry.S interface, instead of + * with direct calls. (If you are porting to a new + * architecture, hitting this condition can indicate that you + * got the _exit/_leave calls backward in entry.S.) + * + * i386 no + * x86_64 no + * ppc64 yes (see arch/powerpc/platforms/iseries/misc.S) + * + * This also happens with vm86 emulation in a non-nested manner + * (entries without exits), so this case must be caught. 
+ */ + if (context->in_syscall) { + struct audit_context *newctx; + +#if AUDIT_DEBUG + printk(KERN_ERR + "audit(:%d) pid=%d in syscall=%d;" + " entering syscall=%d\n", + context->serial, tsk->pid, context->major, major); +#endif + newctx = audit_alloc_context(context->state); + if (newctx) { + newctx->previous = context; + context = newctx; + tsk->audit_context = newctx; + } else { + /* If we can't alloc a new context, the best we + * can do is to leak memory (any pending putname + * will be lost). The only other alternative is + * to abandon auditing. */ + audit_zero_context(context, context->state); + } + } + BUG_ON(context->in_syscall || context->name_count); + + if (!audit_enabled) + return; + + context->arch = arch; + context->major = major; + context->argv[0] = a1; + context->argv[1] = a2; + context->argv[2] = a3; + context->argv[3] = a4; + + state = context->state; + context->dummy = !audit_n_rules; + if (!context->dummy && state == AUDIT_BUILD_CONTEXT) { + context->prio = 0; + state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); + } + if (likely(state == AUDIT_DISABLED)) + return; + + context->serial = 0; + context->ctime = CURRENT_TIME; + context->in_syscall = 1; + context->current_state = state; + context->ppid = 0; +} + +void audit_finish_fork(struct task_struct *child) +{ + struct audit_context *ctx = current->audit_context; + struct audit_context *p = child->audit_context; + if (!p || !ctx) + return; + if (!ctx->in_syscall || ctx->current_state != AUDIT_RECORD_CONTEXT) + return; + p->arch = ctx->arch; + p->major = ctx->major; + memcpy(p->argv, ctx->argv, sizeof(ctx->argv)); + p->ctime = ctx->ctime; + p->dummy = ctx->dummy; + p->in_syscall = ctx->in_syscall; + p->filterkey = kstrdup(ctx->filterkey, GFP_KERNEL); + p->ppid = current->pid; + p->prio = ctx->prio; + p->current_state = ctx->current_state; +} + +/** + * audit_syscall_exit - deallocate audit context after a system call + * @valid: success/failure flag + * 
@return_code: syscall return value
+ *
+ * Tear down after system call. If the audit context has been marked as
+ * auditable (either because of the AUDIT_RECORD_CONTEXT state from
+ * filtering, or because some other part of the kernel write an audit
+ * message), then write out the syscall information. In call cases,
+ * free the names stored from getname().
+ */
+void audit_syscall_exit(int valid, long return_code)
+{
+	struct task_struct *tsk = current;
+	struct audit_context *context;
+
+	context = audit_get_context(tsk, valid, return_code);	/* also detaches it from tsk */
+
+	if (likely(!context))
+		return;
+
+	if (context->in_syscall && context->current_state == AUDIT_RECORD_CONTEXT)
+		audit_log_exit(context, tsk);
+
+	context->in_syscall = 0;
+	context->prio = context->state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;	/* same prio rule as audit_zero_context() */
+
+	if (!list_empty(&context->killed_trees))
+		audit_kill_trees(&context->killed_trees);
+
+	if (context->previous) {	/* nested context: free it and reinstate the outer one */
+		struct audit_context *new_context = context->previous;
+		context->previous = NULL;	/* so audit_free_context() frees only this one */
+		audit_free_context(context);
+		tsk->audit_context = new_context;
+	} else {	/* keep the context; reset its per-syscall state for the next syscall */
+		audit_free_names(context);
+		unroll_tree_refs(context, NULL, 0);
+		audit_free_aux(context);
+		context->aux = NULL;
+		context->aux_pids = NULL;
+		context->target_pid = 0;
+		context->target_sid = 0;
+		context->sockaddr_len = 0;
+		context->type = 0;
+		context->fds[0] = -1;
+		if (context->state != AUDIT_RECORD_CONTEXT) {	/* the key is kept only while state is AUDIT_RECORD_CONTEXT */
+			kfree(context->filterkey);
+			context->filterkey = NULL;
+		}
+		tsk->audit_context = context;	/* re-attach what audit_get_context() detached */
+	}
+}
+/*
+ * handle_one - record a tree reference for a single inode
+ *
+ * Compiled to a no-op without CONFIG_AUDIT_TREE.  Otherwise, if the inode
+ * has fsnotify marks and audit_tree_lookup() returns a chunk, the chunk is
+ * stored in the current task's context via put_tree_ref().  If that fails,
+ * the ref storage is grown once and the store retried.  If growing fails, a
+ * warning is printed, the context is marked auditable, the chunk is put and
+ * the tree refs are unrolled to their state on entry.
+ */
+static inline void handle_one(const struct inode *inode)
+{
+#ifdef CONFIG_AUDIT_TREE
+	struct audit_context *context;
+	struct audit_tree_refs *p;
+	struct audit_chunk *chunk;
+	int count;
+	if (likely(hlist_empty(&inode->i_fsnotify_marks)))
+		return;
+	context = current->audit_context;
+	p = context->trees;	/* (p, count): rollback point for unroll_tree_refs() */
+	count = context->tree_count;
+	rcu_read_lock();
+	chunk = audit_tree_lookup(inode);
+	rcu_read_unlock();
+	if (!chunk)
+		return;
+	if (likely(put_tree_ref(context, chunk)))
+		return;
+	if (unlikely(!grow_tree_refs(context))) {
+		printk(KERN_WARNING "out of memory, audit has lost a tree reference\n");
+		audit_set_auditable(context);
+		audit_put_chunk(chunk);
+		unroll_tree_refs(context, p, count);
+		return;
+	}
+	put_tree_ref(context, chunk);	/* retry after growing; result not checked */
+#endif
+}
+/*
+ * handle_path - record tree references for a dentry and all its ancestors
+ *
+ * Compiled to a no-op without CONFIG_AUDIT_TREE.  Walks d_parent links from
+ * @dentry until a dentry that is its own parent, under rcu_read_lock() and a
+ * rename_lock sequence read.  The whole walk is redone (after unrolling the
+ * refs taken so far) when the sequence changed or after the ref storage was
+ * grown.  If growing fails, a warning is printed, the refs are unrolled and
+ * the context is marked auditable.
+ */
+static void handle_path(const struct dentry *dentry)
+{
+#ifdef CONFIG_AUDIT_TREE
+	struct audit_context *context;
+	struct audit_tree_refs *p;
+	const struct dentry *d, *parent;
+	struct audit_chunk *drop;
+	unsigned long seq;
+	int count;
+
+	context = current->audit_context;
+	p = context->trees;	/* (p, count): rollback point for unroll_tree_refs() */
+	count = context->tree_count;
+retry:
+	drop = NULL;
+	d = dentry;
+	rcu_read_lock();
+	seq = read_seqbegin(&rename_lock);
+	for(;;) {
+		struct inode *inode = d->d_inode;
+		if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) {
+			struct audit_chunk *chunk;
+			chunk = audit_tree_lookup(inode);
+			if (chunk) {
+				if (unlikely(!put_tree_ref(context, chunk))) {
+					drop = chunk;	/* could not be stored; dealt with after the walk */
+					break;
+				}
+			}
+		}
+		parent = d->d_parent;
+		if (parent == d)	/* a dentry that is its own parent ends the walk */
+			break;
+		d = parent;
+	}
+	if (unlikely(read_seqretry(&rename_lock, seq) || drop)) { /* in this order */
+		rcu_read_unlock();
+		if (!drop) {
+			/* just a race with rename */
+			unroll_tree_refs(context, p, count);
+			goto retry;
+		}
+		audit_put_chunk(drop);
+		if (grow_tree_refs(context)) {
+			/* OK, got more space */
+			unroll_tree_refs(context, p, count);
+			goto retry;
+		}
+		/* too bad */
+		printk(KERN_WARNING
+			"out of memory, audit has lost a tree reference\n");
+		unroll_tree_refs(context, p, count);
+		audit_set_auditable(context);
+		return;
+	}
+	rcu_read_unlock();
+#endif
+}
+
+/**
+ * audit_getname - add a name to the list
+ * @name: name to add
+ *
+ * Add a name to the list of audit names for this context.
+ * Called from fs/namei.c:getname().
+ */
+void __audit_getname(const char *name)
+{
+	struct audit_context *context = current->audit_context;	/* dereferenced below without a NULL check */
+
+	if (IS_ERR(name) || !name)	/* failed or missing names are not recorded */
+		return;
+
+	if (!context->in_syscall) {
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): ignoring getname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		dump_stack();
+#endif
+		return;
+	}
+	BUG_ON(context->name_count >= AUDIT_NAMES);	/* running out of names[] slots is fatal here */
+	context->names[context->name_count].name = name;
+	context->names[context->name_count].name_len = AUDIT_NAME_FULL;
+	context->names[context->name_count].name_put = 1;	/* audit_free_names() will __putname() it */
+	context->names[context->name_count].ino = (unsigned long)-1;	/* -1: no inode data recorded yet */
+	context->names[context->name_count].osid = 0;
+	++context->name_count;
+	if (!context->pwd.dentry)	/* take the pwd reference once per syscall */
+		get_fs_pwd(current->fs, &context->pwd);
+}
+
+/* audit_putname - intercept a putname request
+ * @name: name to intercept and delay for putname
+ *
+ * If we have stored the name from getname in the audit context,
+ * then we delay the putname until syscall exit.
+ * Called from include/linux/fs.h:putname().
+ */
+void audit_putname(const char *name)
+{
+	struct audit_context *context = current->audit_context;
+
+	BUG_ON(!context);
+	if (!context->in_syscall) {	/* not inside a syscall: release the name right away */
+#if AUDIT_DEBUG == 2
+		printk(KERN_ERR "%s:%d(:%d): __putname(%p)\n",
+		       __FILE__, __LINE__, context->serial, name);
+		if (context->name_count) {
+			int i;
+			for (i = 0; i < context->name_count; i++)
+				printk(KERN_ERR "name[%d] = %p = %s\n", i,
+				       context->names[i].name,
+				       context->names[i].name ?: "(null)");
+		}
+#endif
+		__putname(name);
+	}
+#if AUDIT_DEBUG
+	else {	/* debug-only accounting; no release happens on this path */
+		++context->put_count;
+		if (context->put_count > context->name_count) {
+			printk(KERN_ERR "%s:%d(:%d): major=%d"
+			       " in_syscall=%d putname(%p) name_count=%d"
+			       " put_count=%d\n",
+			       __FILE__, __LINE__,
+			       context->serial, context->major,
+			       context->in_syscall, name, context->name_count,
+			       context->put_count);
+			dump_stack();
+		}
+	}
+#endif
+}
+/*
+ * audit_inc_name_count - claim the next names[] slot in @context
+ * @inode: only used for the diagnostic printed when the table is full; may be NULL
+ *
+ * Returns 0 after incrementing name_count, or 1 (with a KERN_DEBUG message)
+ * when name_count has already reached AUDIT_NAMES.
+ */
+static int audit_inc_name_count(struct audit_context *context,
+				const struct inode *inode)
+{
+	if (context->name_count >= AUDIT_NAMES) {
+		if (inode)
+			printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
+			       "dev=%02x:%02x, inode=%lu\n",
+			       MAJOR(inode->i_sb->s_dev),
+			       MINOR(inode->i_sb->s_dev),
+			       inode->i_ino);
+
+		else
+			printk(KERN_DEBUG "name_count maxed, losing inode data\n");
+		return 1;
+	}
+	context->name_count++;
+#if AUDIT_DEBUG
+	context->ino_count++;
+#endif
+	return 0;
+}
+
+/*
+ * audit_copy_fcaps - record a file's capabilities in an audit_names entry
+ *
+ * Always clears the fcap fields first.  With a NULL @dentry that is all it
+ * does and 0 is returned.  Otherwise the result of get_vfs_caps_from_disk()
+ * is copied in; a non-zero return from it is passed back to the caller with
+ * the fields left cleared.
+ */
+static inline int audit_copy_fcaps(struct audit_names *name, const struct dentry *dentry)
+{
+	struct cpu_vfs_cap_data caps;
+	int rc;
+
+	memset(&name->fcap.permitted, 0, sizeof(kernel_cap_t));
+	memset(&name->fcap.inheritable, 0, sizeof(kernel_cap_t));
+	name->fcap.fE = 0;
+	name->fcap_ver = 0;
+
+	if (!dentry)
+		return 0;
+
+	rc = get_vfs_caps_from_disk(dentry, &caps);
+	if (rc)
+		return rc;
+
+	name->fcap.permitted = caps.permitted;
+	name->fcap.inheritable = caps.inheritable;
+	name->fcap.fE = !!(caps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE);	/* normalised to 0 or 1 */
+	name->fcap_ver = (caps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT;
+
+	return 0;
+}
+
+
+/* Copy inode data into an audit_names.
+ * @dentry may be NULL, in which case the file capabilities are only cleared.
+ * The return value of audit_copy_fcaps() is ignored. */
+static void audit_copy_inode(struct audit_names *name, const struct dentry *dentry,
+			     const struct inode *inode)
+{
+	name->ino = inode->i_ino;
+	name->dev = inode->i_sb->s_dev;
+	name->mode = inode->i_mode;
+	name->uid = inode->i_uid;
+	name->gid = inode->i_gid;
+	name->rdev = inode->i_rdev;
+	security_inode_getsecid(inode, &name->osid);
+	audit_copy_fcaps(name, dentry);
+}
+
+/**
+ * audit_inode - store the inode and device from a lookup
+ * @name: name being audited
+ * @dentry: dentry being audited
+ *
+ * Called from fs/namei.c:path_lookup().
+ *
+ * The inode data goes into the last or second-to-last names[] entry when
+ * that entry's name pointer equals @name; otherwise a new nameless entry is
+ * claimed, and nothing is recorded if the table is full.
+ */
+void __audit_inode(const char *name, const struct dentry *dentry)
+{
+	int idx;
+	struct audit_context *context = current->audit_context;
+	const struct inode *inode = dentry->d_inode;
+
+	if (!context->in_syscall)
+		return;
+	if (context->name_count
+	    && context->names[context->name_count-1].name
+	    && context->names[context->name_count-1].name == name)	/* pointer comparison, not strcmp() */
+		idx = context->name_count - 1;
+	else if (context->name_count > 1
+		 && context->names[context->name_count-2].name
+		 && context->names[context->name_count-2].name == name)
+		idx = context->name_count - 2;
+	else {
+		/* FIXME: how much do we care about inodes that have no
+		 * associated name? */
+		if (audit_inc_name_count(context, inode))
+			return;
+		idx = context->name_count - 1;
+		context->names[idx].name = NULL;
+	}
+	handle_path(dentry);
+	audit_copy_inode(&context->names[idx], dentry, inode);
+}
+
+/**
+ * audit_inode_child - collect inode info for created/removed objects
+ * @dentry: dentry being audited
+ * @parent: inode of dentry parent
+ *
+ * For syscalls that create or remove filesystem objects, audit_inode
+ * can only collect information for the filesystem object's parent.
+ * This call updates the audit context with the child's information.
+ * Syscalls that create a new filesystem object must be hooked after
+ * the object is created.
Syscalls that remove a filesystem object
+ * must be hooked prior, in order to capture the target inode during
+ * unsuccessful attempts.
+ *
+ * Does nothing unless the current context is in a syscall.  @dentry may
+ * be negative (no inode); the child record is then marked with an inode
+ * number of (unsigned long)-1.
+ */
+void __audit_inode_child(const struct dentry *dentry,
+			 const struct inode *parent)
+{
+	int idx;
+	struct audit_context *context = current->audit_context;
+	const char *found_parent = NULL, *found_child = NULL;
+	const struct inode *inode = dentry->d_inode;
+	const char *dname = dentry->d_name.name;
+	int dirlen = 0;
+
+	if (!context->in_syscall)
+		return;
+
+	if (inode)
+		handle_one(inode);
+
+	/* parent is more likely, look for it first */
+	for (idx = 0; idx < context->name_count; idx++) {
+		struct audit_names *n = &context->names[idx];
+
+		if (!n->name)
+			continue;
+
+		if (n->ino == parent->i_ino &&
+		    !audit_compare_dname_path(dname, n->name, &dirlen)) {
+			n->name_len = dirlen; /* update parent data in place */
+			found_parent = n->name;
+			goto add_names;
+		}
+	}
+
+	/* no matching parent, look for matching child */
+	for (idx = 0; idx < context->name_count; idx++) {
+		struct audit_names *n = &context->names[idx];
+
+		if (!n->name)
+			continue;
+
+		/* strcmp() is the more likely scenario */
+		if (!strcmp(dname, n->name) ||
+		     !audit_compare_dname_path(dname, n->name, &dirlen)) {
+			if (inode)
+				audit_copy_inode(n, NULL, inode);
+			else
+				n->ino = (unsigned long)-1;
+			found_child = n->name;
+			goto add_names;
+		}
+	}
+
+	/*
+	 * At most one of found_parent/found_child is set here, because each
+	 * search loop jumps out on its first hit.  Whatever was not found
+	 * gets a freshly claimed slot below.
+	 */
+add_names:
+	if (!found_parent) {
+		if (audit_inc_name_count(context, parent))
+			return;
+		idx = context->name_count - 1;
+		context->names[idx].name = NULL;
+		audit_copy_inode(&context->names[idx], NULL, parent);
+	}
+
+	if (!found_child) {
+		if (audit_inc_name_count(context, inode))
+			return;
+		idx = context->name_count - 1;
+
+		/* Re-use the name belonging to the slot for a matching parent
+		 * directory. All names for this context are relinquished in
+		 * audit_free_names() */
+		if (found_parent) {
+			context->names[idx].name = found_parent;
+			context->names[idx].name_len = AUDIT_NAME_FULL;
+			/* don't call __putname() */
+			context->names[idx].name_put = 0;
+		} else {
+			context->names[idx].name = NULL;
+		}
+
+		if (inode)
+			audit_copy_inode(&context->names[idx], NULL, inode);
+		else
+			context->names[idx].ino = (unsigned long)-1;
+	}
+}
+EXPORT_SYMBOL_GPL(__audit_inode_child);
+
+/**
+ * auditsc_get_stamp - get local copies of audit_context values
+ * @ctx: audit_context for the task
+ * @t: timespec to store time recorded in the audit_context
+ * @serial: serial value that is recorded in the audit_context
+ *
+ * Also sets the context as auditable.
+ *
+ * Returns 0 without touching @t or @serial when @ctx is not in a syscall,
+ * 1 otherwise.  A serial number is allocated on first use.
+ */
+int auditsc_get_stamp(struct audit_context *ctx,
+		       struct timespec *t, unsigned int *serial)
+{
+	if (!ctx->in_syscall)
+		return 0;
+	if (!ctx->serial)
+		ctx->serial = audit_serial();
+	t->tv_sec  = ctx->ctime.tv_sec;
+	t->tv_nsec = ctx->ctime.tv_nsec;
+	*serial    = ctx->serial;
+	/* only raise the state if no priority has been set yet */
+	if (!ctx->prio) {
+		ctx->prio = 1;
+		ctx->current_state = AUDIT_RECORD_CONTEXT;
+	}
+	return 1;
+}
+
+/* global counter which is incremented every time something logs in */
+static atomic_t session_id = ATOMIC_INIT(0);
+
+/**
+ * audit_set_loginuid - set a task's audit_context loginuid
+ * @task: task whose audit context is being modified
+ * @loginuid: loginuid value
+ *
+ * Returns 0.
+ *
+ * Called (set) from fs/proc/base.c::proc_loginuid_write().
+ */
+int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
+{
+	/* a new session id is drawn on every call, logged or not */
+	unsigned int sessionid = atomic_inc_return(&session_id);
+	struct audit_context *context = task->audit_context;
+
+	/* an AUDIT_LOGIN record is only emitted from inside a syscall */
+	if (context && context->in_syscall) {
+		struct audit_buffer *ab;
+
+		ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
+		if (ab) {
+			audit_log_format(ab, "login pid=%d uid=%u "
+				"old auid=%u new auid=%u"
+				" old ses=%u new ses=%u",
+				task->pid, task_uid(task),
+				task->loginuid, loginuid,
+				task->sessionid, sessionid);
+			audit_log_end(ab);
+		}
+	}
+	task->sessionid = sessionid;
+	task->loginuid = loginuid;
+	return 0;
+}
+
+/**
+ * __audit_mq_open - record audit data for a POSIX MQ open
+ * @oflag: open flag
+ * @mode: mode bits
+ * @attr: queue attributes
+ *
+ * @attr may be NULL, in which case zeroed attributes are recorded.
+ * The current task's audit context is dereferenced without a NULL check.
+ */
+void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
+{
+	struct audit_context *context = current->audit_context;
+
+	if (attr)
+		memcpy(&context->mq_open.attr, attr, sizeof(struct mq_attr));
+	else
+		memset(&context->mq_open.attr, 0, sizeof(struct mq_attr));
+
+	context->mq_open.oflag = oflag;
+	context->mq_open.mode = mode;
+
+	context->type = AUDIT_MQ_OPEN;
+}
+
+/**
+ * __audit_mq_sendrecv - record audit data for a POSIX MQ timed send/receive
+ * @mqdes: MQ descriptor
+ * @msg_len: Message length
+ * @msg_prio: Message priority
+ * @abs_timeout: Message timeout in absolute time
+ *
+ * @abs_timeout may be NULL, in which case a zeroed timeout is recorded.
+ */
+void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
+			const struct timespec *abs_timeout)
+{
+	struct audit_context *context = current->audit_context;
+	struct timespec *p = &context->mq_sendrecv.abs_timeout;
+
+	if (abs_timeout)
+		memcpy(p, abs_timeout, sizeof(struct timespec));
+	else
+		memset(p, 0, sizeof(struct timespec));
+
+	context->mq_sendrecv.mqdes = mqdes;
+	context->mq_sendrecv.msg_len = msg_len;
+	context->mq_sendrecv.msg_prio = msg_prio;
+
+	context->type = AUDIT_MQ_SENDRECV;
+}
+
+/**
+ * __audit_mq_notify - record audit data for a POSIX MQ notify
+ * @mqdes: MQ descriptor
+ * @notification: Notification event + * + */ + +void __audit_mq_notify(mqd_t mqdes, const struct sigevent *notification) +{ + struct audit_context *context = current->audit_context; + + if (notification) + context->mq_notify.sigev_signo = notification->sigev_signo; + else + context->mq_notify.sigev_signo = 0; + + context->mq_notify.mqdes = mqdes; + context->type = AUDIT_MQ_NOTIFY; +} + +/** + * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute + * @mqdes: MQ descriptor + * @mqstat: MQ flags + * + */ +void __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat) +{ + struct audit_context *context = current->audit_context; + context->mq_getsetattr.mqdes = mqdes; + context->mq_getsetattr.mqstat = *mqstat; + context->type = AUDIT_MQ_GETSETATTR; +} + +/** + * audit_ipc_obj - record audit data for ipc object + * @ipcp: ipc permissions + * + */ +void __audit_ipc_obj(struct kern_ipc_perm *ipcp) +{ + struct audit_context *context = current->audit_context; + context->ipc.uid = ipcp->uid; + context->ipc.gid = ipcp->gid; + context->ipc.mode = ipcp->mode; + context->ipc.has_perm = 0; + security_ipc_getsecid(ipcp, &context->ipc.osid); + context->type = AUDIT_IPC; +} + +/** + * audit_ipc_set_perm - record audit data for new ipc permissions + * @qbytes: msgq bytes + * @uid: msgq user id + * @gid: msgq group id + * @mode: msgq mode (permissions) + * + * Called only after audit_ipc_obj(). 
+ */ +void __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode) +{ + struct audit_context *context = current->audit_context; + + context->ipc.qbytes = qbytes; + context->ipc.perm_uid = uid; + context->ipc.perm_gid = gid; + context->ipc.perm_mode = mode; + context->ipc.has_perm = 1; +} + +int audit_bprm(struct linux_binprm *bprm) +{ + struct audit_aux_data_execve *ax; + struct audit_context *context = current->audit_context; + + if (likely(!audit_enabled || !context || context->dummy)) + return 0; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->argc = bprm->argc; + ax->envc = bprm->envc; + ax->mm = bprm->mm; + ax->d.type = AUDIT_EXECVE; + ax->d.next = context->aux; + context->aux = (void *)ax; + return 0; +} + + +/** + * audit_socketcall - record audit data for sys_socketcall + * @nargs: number of args + * @args: args array + * + */ +void audit_socketcall(int nargs, unsigned long *args) +{ + struct audit_context *context = current->audit_context; + + if (likely(!context || context->dummy)) + return; + + context->type = AUDIT_SOCKETCALL; + context->socketcall.nargs = nargs; + memcpy(context->socketcall.args, args, nargs * sizeof(unsigned long)); +} + +/** + * __audit_fd_pair - record audit data for pipe and socketpair + * @fd1: the first file descriptor + * @fd2: the second file descriptor + * + */ +void __audit_fd_pair(int fd1, int fd2) +{ + struct audit_context *context = current->audit_context; + context->fds[0] = fd1; + context->fds[1] = fd2; +} + +/** + * audit_sockaddr - record audit data for sys_bind, sys_connect, sys_sendto + * @len: data length in user space + * @a: data address in kernel space + * + * Returns 0 for success or NULL context or < 0 on error. 
+ */
+int audit_sockaddr(int len, void *a)
+{
+	struct audit_context *context = current->audit_context;
+
+	if (likely(!context || context->dummy))
+		return 0;
+
+	/* the buffer is allocated on first use and kept in the context */
+	if (!context->sockaddr) {
+		void *p = kmalloc(sizeof(struct sockaddr_storage), GFP_KERNEL);
+		if (!p)
+			return -ENOMEM;
+		context->sockaddr = p;
+	}
+
+	/*
+	 * NOTE(review): len is not checked against
+	 * sizeof(struct sockaddr_storage) here; presumably every caller
+	 * has already bounded it - confirm.
+	 */
+	context->sockaddr_len = len;
+	memcpy(context->sockaddr, a, len);
+	return 0;
+}
+
+/*
+ * Record the target of a ptrace request (pid, login uid, uid, session id,
+ * security id and comm of @t) in the current task's audit context.
+ * The context is dereferenced without a NULL check.
+ */
+void __audit_ptrace(struct task_struct *t)
+{
+	struct audit_context *context = current->audit_context;
+
+	context->target_pid = t->pid;
+	context->target_auid = audit_get_loginuid(t);
+	context->target_uid = task_uid(t);
+	context->target_sessionid = audit_get_sessionid(t);
+	security_task_getsecid(t, &context->target_sid);
+	memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
+}
+
+/**
+ * audit_signal_info - record signal info for shutting down audit subsystem
+ * @sig: signal value
+ * @t: task being signaled
+ *
+ * If the audit subsystem is being terminated, record the task (pid)
+ * and uid that is doing that.
+ */
+int __audit_signal_info(int sig, struct task_struct *t)
+{
+	struct audit_aux_data_pids *axp;
+	struct task_struct *tsk = current;
+	struct audit_context *ctx = tsk->audit_context;
+	uid_t uid = current_uid(), t_uid = task_uid(t);
+
+	/* the signal is aimed at the task registered as audit_pid */
+	if (audit_pid && t->tgid == audit_pid) {
+		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
+			audit_sig_pid = tsk->pid;
+			/* prefer the sender's login uid when one is set */
+			if (tsk->loginuid != -1)
+				audit_sig_uid = tsk->loginuid;
+			else
+				audit_sig_uid = uid;
+			security_task_getsecid(tsk, &audit_sig_sid);
+		}
+		if (!audit_signals || audit_dummy_context())
+			return 0;
+	}
+
+	/* optimize the common case by putting first signal recipient directly
+	 * in audit_context */
+	if (!ctx->target_pid) {
+		ctx->target_pid = t->tgid;
+		ctx->target_auid = audit_get_loginuid(t);
+		ctx->target_uid = t_uid;
+		ctx->target_sessionid = audit_get_sessionid(t);
+		security_task_getsecid(t, &ctx->target_sid);
+		memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
+		return 0;
+	}
+
+	/*
+	 * Further recipients go into AUDIT_OBJ_PID aux records; a new one is
+	 * pushed onto ctx->aux_pids when there is none yet or the head one
+	 * already holds AUDIT_AUX_PIDS entries.
+	 */
+	axp = (void *)ctx->aux_pids;
+	if (!axp || axp->pid_count == AUDIT_AUX_PIDS) {
+		axp = kzalloc(sizeof(*axp), GFP_ATOMIC);
+		if (!axp)
+			return -ENOMEM;
+
+		axp->d.type = AUDIT_OBJ_PID;
+		axp->d.next = ctx->aux_pids;
+		ctx->aux_pids = (void *)axp;
+	}
+	BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
+
+	axp->target_pid[axp->pid_count] = t->tgid;
+	axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
+	axp->target_uid[axp->pid_count] = t_uid;
+	axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
+	security_task_getsecid(t, &axp->target_sid[axp->pid_count]);
+	memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
+	axp->pid_count++;
+
+	return 0;
+}
+
+/**
+ * __audit_log_bprm_fcaps - store information about a loading bprm and relevant fcaps
+ * @bprm: pointer to the bprm being processed
+ * @new: the proposed new credentials
+ * @old: the old credentials
+ *
+ * Simply check if the proc already has the caps given by the file and if not
+ * store the priv escalation info for
later auditing at the end of the syscall + * + * -Eric + */ +int __audit_log_bprm_fcaps(struct linux_binprm *bprm, + const struct cred *new, const struct cred *old) +{ + struct audit_aux_data_bprm_fcaps *ax; + struct audit_context *context = current->audit_context; + struct cpu_vfs_cap_data vcaps; + struct dentry *dentry; + + ax = kmalloc(sizeof(*ax), GFP_KERNEL); + if (!ax) + return -ENOMEM; + + ax->d.type = AUDIT_BPRM_FCAPS; + ax->d.next = context->aux; + context->aux = (void *)ax; + + dentry = dget(bprm->file->f_dentry); + get_vfs_caps_from_disk(dentry, &vcaps); + dput(dentry); + + ax->fcap.permitted = vcaps.permitted; + ax->fcap.inheritable = vcaps.inheritable; + ax->fcap.fE = !!(vcaps.magic_etc & VFS_CAP_FLAGS_EFFECTIVE); + ax->fcap_ver = (vcaps.magic_etc & VFS_CAP_REVISION_MASK) >> VFS_CAP_REVISION_SHIFT; + + ax->old_pcap.permitted = old->cap_permitted; + ax->old_pcap.inheritable = old->cap_inheritable; + ax->old_pcap.effective = old->cap_effective; + + ax->new_pcap.permitted = new->cap_permitted; + ax->new_pcap.inheritable = new->cap_inheritable; + ax->new_pcap.effective = new->cap_effective; + return 0; +} + +/** + * __audit_log_capset - store information about the arguments to the capset syscall + * @pid: target pid of the capset call + * @new: the new credentials + * @old: the old (current) credentials + * + * Record the aguments userspace sent to sys_capset for later printing by the + * audit system if applicable + */ +void __audit_log_capset(pid_t pid, + const struct cred *new, const struct cred *old) +{ + struct audit_context *context = current->audit_context; + context->capset.pid = pid; + context->capset.cap.effective = new->cap_effective; + context->capset.cap.inheritable = new->cap_effective; + context->capset.cap.permitted = new->cap_permitted; + context->type = AUDIT_CAPSET; +} + +void __audit_mmap_fd(int fd, int flags) +{ + struct audit_context *context = current->audit_context; + context->mmap.fd = fd; + context->mmap.flags = flags; + 
context->type = AUDIT_MMAP; +} + +/** + * audit_core_dumps - record information about processes that end abnormally + * @signr: signal value + * + * If a process ends with a core dump, something fishy is going on and we + * should record the event for investigation. + */ +void audit_core_dumps(long signr) +{ + struct audit_buffer *ab; + u32 sid; + uid_t auid = audit_get_loginuid(current), uid; + gid_t gid; + unsigned int sessionid = audit_get_sessionid(current); + + if (!audit_enabled) + return; + + if (signr == SIGQUIT) /* don't care for those */ + return; + + ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); + current_uid_gid(&uid, &gid); + audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", + auid, uid, gid, sessionid); + security_task_getsecid(current, &sid); + if (sid) { + char *ctx = NULL; + u32 len; + + if (security_secid_to_secctx(sid, &ctx, &len)) + audit_log_format(ab, " ssid=%u", sid); + else { + audit_log_format(ab, " subj=%s", ctx); + security_release_secctx(ctx, len); + } + } + audit_log_format(ab, " pid=%d comm=", current->pid); + audit_log_untrustedstring(ab, current->comm); + audit_log_format(ab, " sig=%ld", signr); + audit_log_end(ab); +} + +struct list_head *audit_killed_trees(void) +{ + struct audit_context *ctx = current->audit_context; + if (likely(!ctx || !ctx->in_syscall)) + return NULL; + return &ctx->killed_trees; +} diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c new file mode 100644 index 00000000..a5e026bc --- /dev/null +++ b/kernel/backtracetest.c @@ -0,0 +1,91 @@ +/* + * Simple stack backtrace regression test module + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* Dump the stack from the caller's (process) context. */
+static void backtrace_test_normal(void)
+{
+	printk("Testing a backtrace from process context.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+
+	dump_stack();
+}
+
+/* Completed by the tasklet callback once it has dumped its stack. */
+static DECLARE_COMPLETION(backtrace_work);
+
+/* Tasklet body: dump the stack, then wake backtrace_test_irq(). */
+static void backtrace_test_irq_callback(unsigned long data)
+{
+	dump_stack();
+	complete(&backtrace_work);
+}
+
+static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+
+/* Schedule the tasklet and block until its callback has run. */
+static void backtrace_test_irq(void)
+{
+	printk("Testing a backtrace from irq context.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+
+	init_completion(&backtrace_work);
+	tasklet_schedule(&backtrace_tasklet);
+	wait_for_completion(&backtrace_work);
+}
+
+#ifdef CONFIG_STACKTRACE
+/* Save up to ARRAY_SIZE(entries) frames of the current stack and print them. */
+static void backtrace_test_saved(void)
+{
+	struct stack_trace trace;
+	unsigned long entries[8];
+
+	printk("Testing a saved backtrace.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+
+	trace.nr_entries = 0;
+	trace.max_entries = ARRAY_SIZE(entries);
+	trace.entries = entries;
+	trace.skip = 0;
+
+	save_stack_trace(&trace);
+	print_stack_trace(&trace, 0);
+}
+#else
+/* Without CONFIG_STACKTRACE the saved-trace test only reports being skipped. */
+static void backtrace_test_saved(void)
+{
+	printk("Saved backtrace test skipped.\n");
+}
+#endif
+
+/* Module init: run the three tests in sequence; always returns 0. */
+static int backtrace_regression_test(void)
+{
+	printk("====[ backtrace testing ]===========\n");
+
+	backtrace_test_normal();
+	backtrace_test_irq();
+	backtrace_test_saved();
+
+	printk("====[ end of backtrace testing ]====\n");
+	return 0;
+}
+
+/* Module exit: nothing to undo. */
+static void exitf(void)
+{
+}
+
+module_init(backtrace_regression_test);
+module_exit(exitf);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Arjan van de Ven ");
diff --git a/kernel/bounds.c b/kernel/bounds.c
new file mode 100644
index 00000000..0c9b8622
--- /dev/null
+++ b/kernel/bounds.c
@@ -0,0 +1,21 @@
+/*
+ * Generate definitions needed by the preprocessor.
+ * This code generates raw asm output which is post-processed
+ * to extract and format the required data.
+ */
+
+#define __GENERATING_BOUNDS_H
+/* Include headers that define the enum constants of interest */
+#include
+#include
+#include
+#include
+
+/*
+ * Never called at run time: it only exists so that each DEFINE() below
+ * ends up in the generated asm output described at the top of this file.
+ */
+void foo(void)
+{
+	/* The enum constants to put into include/generated/bounds.h */
+	DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
+	DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+	DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
+	/* End of constants */
+}
diff --git a/kernel/capability.c b/kernel/capability.c
new file mode 100644
index 00000000..283c529f
--- /dev/null
+++ b/kernel/capability.c
@@ -0,0 +1,409 @@
+/*
+ * linux/kernel/capability.c
+ *
+ * Copyright (C) 1997 Andrew Main
+ *
+ * Integrated into 2.1.97+, Andrew G. Morgan
+ * 30 May 2002: Cleanup, Robert M. Love
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Leveraged for setting/resetting capabilities
+ */
+
+const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
+
+EXPORT_SYMBOL(__cap_empty_set);
+
+/* Cleared by the "no_file_caps" boot parameter handled below. */
+int file_caps_enabled = 1;
+
+/* __setup handler for "no_file_caps"; @str is ignored. */
+static int __init file_caps_disable(char *str)
+{
+	file_caps_enabled = 0;
+	return 1;
+}
+__setup("no_file_caps", file_caps_disable);
+
+/*
+ * More recent versions of libcap are available from:
+ *
+ * http://www.kernel.org/pub/linux/libs/security/linux-privs/
+ */
+
+/*
+ * Log a KERN_INFO warning naming the current task; the static flag
+ * suppresses the message once it has been printed.
+ */
+static void warn_legacy_capability_use(void)
+{
+	static int warned;
+	if (!warned) {
+		char name[sizeof(current->comm)];
+
+		printk(KERN_INFO "warning: `%s' uses 32-bit capabilities"
+		       " (legacy support in use)\n",
+		       get_task_comm(name, current));
+		warned = 1;
+	}
+}
+
+/*
+ * Version 2 capabilities worked fine, but the linux/capability.h file
+ * that accompanied their introduction encouraged their use without
+ * the necessary user-space source code changes.
As such, we have
+ * created a version 3 with equivalent functionality to version 2, but
+ * with a header change to protect legacy source code from using
+ * version 2 when it wanted to use version 1. If your system has code
+ * that trips the following warning, it is using version 2 specific
+ * capabilities and may be doing so insecurely.
+ *
+ * The remedy is to either upgrade your version of libcap (to 2.10+,
+ * if the application is linked against it), or recompile your
+ * application with modern kernel headers and this warning will go
+ * away.
+ */
+
+static void warn_deprecated_v2(void)
+{
+	static int warned;
+
+	/* the static flag suppresses the message once it has been printed */
+	if (!warned) {
+		char name[sizeof(current->comm)];
+
+		printk(KERN_INFO "warning: `%s' uses deprecated v2"
+		       " capabilities in a way that may be insecure.\n",
+		       get_task_comm(name, current));
+		warned = 1;
+	}
+}
+
+/*
+ * Version check. On success, store the number of u32s in each capability
+ * flag array in *tocopy and return 0.
+ *
+ * Returns -EFAULT if the user header cannot be read or written.  For an
+ * unknown version, _KERNEL_CAPABILITY_VERSION is written back into the
+ * header and -EINVAL is returned; *tocopy is left untouched.
+ */
+static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
+{
+	__u32 version;
+
+	if (get_user(version, &header->version))
+		return -EFAULT;
+
+	switch (version) {
+	case _LINUX_CAPABILITY_VERSION_1:
+		warn_legacy_capability_use();
+		*tocopy = _LINUX_CAPABILITY_U32S_1;
+		break;
+	case _LINUX_CAPABILITY_VERSION_2:
+		warn_deprecated_v2();
+		/*
+		 * fall through - v3 is otherwise equivalent to v2.
+		 */
+	case _LINUX_CAPABILITY_VERSION_3:
+		*tocopy = _LINUX_CAPABILITY_U32S_3;
+		break;
+	default:
+		if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
+			return -EFAULT;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
+ * The only thing that can change the capabilities of the current
+ * process is the current process. As such, we can't be in this code
+ * at the same time as we are in the process of setting capabilities
+ * in this process. The net result is that we can limit our use of
+ * locks to when we are reading the caps of another process.
+ *
+ * Returns the result of security_capget(), or -ESRCH when @pid names no
+ * task.  A @pid of 0 or the caller's own pid selects the current task.
+ */
+static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
+				     kernel_cap_t *pIp, kernel_cap_t *pPp)
+{
+	int ret;
+
+	if (pid && (pid != task_pid_vnr(current))) {
+		struct task_struct *target;
+
+		/* the looked-up task is only used inside the RCU section */
+		rcu_read_lock();
+
+		target = find_task_by_vpid(pid);
+		if (!target)
+			ret = -ESRCH;
+		else
+			ret = security_capget(target, pEp, pIp, pPp);
+
+		rcu_read_unlock();
+	} else
+		ret = security_capget(current, pEp, pIp, pPp);
+
+	return ret;
+}
+
+/**
+ * sys_capget - get the capabilities of a given process.
+ * @header: pointer to struct that contains capability version and
+ *	target pid data
+ * @dataptr: pointer to struct that contains the effective, permitted,
+ *	and inheritable capabilities that are returned
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr)
+{
+	int ret = 0;
+	pid_t pid;
+	unsigned tocopy;
+	kernel_cap_t pE, pI, pP;
+
+	ret = cap_validate_magic(header, &tocopy);
+	/*
+	 * A NULL dataptr only probes the version: an unknown version is
+	 * reported as success (cap_validate_magic() has by then written the
+	 * kernel's version back into the header); anything else returns ret.
+	 */
+	if ((dataptr == NULL) || (ret != 0))
+		return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret;
+
+	if (get_user(pid, &header->pid))
+		return -EFAULT;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	ret = cap_get_target_pid(pid, &pE, &pI, &pP);
+	if (!ret) {
+		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
+		unsigned i;
+
+		for (i = 0; i < tocopy; i++) {
+			kdata[i].effective = pE.cap[i];
+			kdata[i].permitted = pP.cap[i];
+			kdata[i].inheritable = pI.cap[i];
+		}
+
+		/*
+		 * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
+		 * we silently drop the upper capabilities here. This
+		 * has the effect of making older libcap
+		 * implementations implicitly drop upper capability
+		 * bits when they perform a: capget/modify/capset
+		 * sequence.
+		 *
+		 * This behavior is considered fail-safe
+		 * behavior. Upgrading the application to a newer
+		 * version of libcap will enable access to the newer
+		 * capabilities.
+		 *
+		 * An alternative would be to return an error here
+		 * (-ERANGE), but that causes legacy applications to
+		 * unexpectedly fail; the capget/modify/capset aborts
+		 * before modification is attempted and the application
+		 * fails.
+		 */
+		if (copy_to_user(dataptr, kdata, tocopy
+				 * sizeof(struct __user_cap_data_struct))) {
+			return -EFAULT;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * sys_capset - set capabilities for a process or (*) a group of processes
+ * @header: pointer to struct that contains capability version and
+ *	target pid data
+ * @data: pointer to struct that contains the effective, permitted,
+ *	and inheritable capabilities
+ *
+ * Set capabilities for the current process only.  The ability to affect
+ * any other process(es) has been deprecated and removed.
+ *
+ * The restrictions on setting capabilities are specified as:
+ *
+ * I: any raised capabilities must be a subset of the old permitted
+ * P: any raised capabilities must be a subset of the old permitted
+ * E: must be set to a subset of new permitted
+ *
+ * Returns 0 on success and < 0 on error.
+ */
+SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data)
+{
+	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
+	unsigned i, tocopy, copybytes;
+	kernel_cap_t inheritable, permitted, effective;
+	struct cred *new;
+	int ret;
+	pid_t pid;
+
+	ret = cap_validate_magic(header, &tocopy);
+	if (ret != 0)
+		return ret;
+
+	if (get_user(pid, &header->pid))
+		return -EFAULT;
+
+	/* may only affect current now */
+	if (pid != 0 && pid != task_pid_vnr(current))
+		return -EPERM;
+
+	/* never read more from userspace than kdata can hold */
+	copybytes = tocopy * sizeof(struct __user_cap_data_struct);
+	if (copybytes > sizeof(kdata))
+		return -EFAULT;
+
+	if (copy_from_user(&kdata, data, copybytes))
+		return -EFAULT;
+
+	for (i = 0; i < tocopy; i++) {
+		effective.cap[i] = kdata[i].effective;
+		permitted.cap[i] = kdata[i].permitted;
+		inheritable.cap[i] = kdata[i].inheritable;
+	}
+	/* words the caller's ABI version does not supply are cleared */
+	while (i < _KERNEL_CAPABILITY_U32S) {
+		effective.cap[i] = 0;
+		permitted.cap[i] = 0;
+		inheritable.cap[i] = 0;
+		i++;
+	}
+
+	new = prepare_creds();
+	if (!new)
+		return -ENOMEM;
+
+	ret = security_capset(new, current_cred(),
+			      &effective, &inheritable, &permitted);
+	if (ret < 0)
+		goto error;
+
+	audit_log_capset(pid, new, current_cred());
+
+	return commit_creds(new);
+
+error:
+	/* the prepared credentials were never committed; drop them */
+	abort_creds(new);
+	return ret;
+}
+
+/**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */ +bool has_capability(struct task_struct *t, int cap) +{ + int ret = security_real_capable(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** + * has_capability - Does a task have a capability in a specific user ns + * @t: The task in question + * @ns: target user namespace + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to the specified user namespace, false if not. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_ns_capability(struct task_struct *t, + struct user_namespace *ns, int cap) +{ + int ret = security_real_capable(t, ns, cap); + + return (ret == 0); +} + +/** + * has_capability_noaudit - Does a task have a capability (unaudited) + * @t: The task in question + * @cap: The capability to be tested for + * + * Return true if the specified task has the given superior capability + * currently in effect to init_user_ns, false if not. Don't write an + * audit message for the check. + * + * Note that this does not set PF_SUPERPRIV on the task. + */ +bool has_capability_noaudit(struct task_struct *t, int cap) +{ + int ret = security_real_capable_noaudit(t, &init_user_ns, cap); + + return (ret == 0); +} + +/** + * capable - Determine if the current task has a superior capability in effect + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. 
+ */ +bool capable(int cap) +{ + return ns_capable(&init_user_ns, cap); +} +EXPORT_SYMBOL(capable); + +/** + * ns_capable - Determine if the current task has a superior capability in effect + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if the current task has the given superior capability currently + * available for use, false if not. + * + * This sets PF_SUPERPRIV on the task if the capability is available on the + * assumption that it's about to be used. + */ +bool ns_capable(struct user_namespace *ns, int cap) +{ + if (unlikely(!cap_valid(cap))) { + printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap); + BUG(); + } + + if (security_capable(ns, current_cred(), cap) == 0) { + current->flags |= PF_SUPERPRIV; + return true; + } + return false; +} +EXPORT_SYMBOL(ns_capable); + +/** + * task_ns_capable - Determine whether current task has a superior + * capability targeted at a specific task's user namespace. + * @t: The task whose user namespace is targeted. + * @cap: The capability in question. + * + * Return true if it does, false otherwise. + */ +bool task_ns_capable(struct task_struct *t, int cap) +{ + return ns_capable(task_cred_xxx(t, user)->user_ns, cap); +} +EXPORT_SYMBOL(task_ns_capable); + +/** + * nsown_capable - Check superior capability to one's own user_ns + * @cap: The capability in question + * + * Return true if the current task has the given superior capability + * targeted at its own user namespace. + */ +bool nsown_capable(int cap) +{ + return ns_capable(current_user_ns(), cap); +} diff --git a/kernel/cgroup.c b/kernel/cgroup.c new file mode 100644 index 00000000..5083a09a --- /dev/null +++ b/kernel/cgroup.c @@ -0,0 +1,5279 @@ +/* + * Generic process-grouping system. + * + * Based originally on the cpuset system, extracted by Paul Menage + * Copyright (C) 2006 Google, Inc + * + * Notifications support + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. 
Shutemov
+ *
+ * Copyright notices from the original cpuset code:
+ * --------------------------------------------------
+ * Copyright (C) 2003 BULL SA.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *
+ * Portions derived from Patrick Mochel's sysfs code.
+ * sysfs is Copyright (c) 2001-3 Patrick Mochel
+ *
+ * 2003-10-10 Written by Simon Derr.
+ * 2003-10-22 Updates by Stephen Hemminger.
+ * 2004 May-July Rework by Paul Jackson.
+ * ---------------------------------------------------
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file COPYING in the main directory of the Linux
+ * distribution for more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include /* TODO: replace with more sophisticated array */
+#include
+#include
+#include /* used in cgroup_attach_proc */
+
+#include
+
+static DEFINE_MUTEX(cgroup_mutex);
+
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
+#define SUBSYS(_x) &_x ## _subsys,
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
+#include
+};
+
+/* Size, in bytes, of the name[] buffer in struct cgroupfs_root. */
+#define MAX_CGROUP_ROOT_NAMELEN 64
+
+/*
+ * A cgroupfs_root represents the root of a cgroup hierarchy,
+ * and may be associated with a superblock to form an active
+ * hierarchy
+ */
+struct cgroupfs_root {
+	struct super_block *sb;
+
+	/*
+	 * The bitmask of subsystems intended to be attached to this
+	 * hierarchy
+	 */
+	unsigned long subsys_bits;
+
+	/* Unique id for this hierarchy. */
+	int hierarchy_id;
+
+	/* The bitmask of subsystems currently attached to this hierarchy */
+	unsigned long actual_subsys_bits;
+
+	/* A list running through the attached subsystems */
+	struct list_head subsys_list;
+
+	/* The root cgroup for this hierarchy */
+	struct cgroup top_cgroup;
+
+	/* Tracks how many cgroups are currently defined in hierarchy.*/
+	int number_of_cgroups;
+
+	/* A list running through the active hierarchies */
+	struct list_head root_list;
+
+	/* Hierarchy-specific flags */
+	unsigned long flags;
+
+	/* The path to use for release notifications. */
+	char release_agent_path[PATH_MAX];
+
+	/* The name for this hierarchy - may be empty */
+	char name[MAX_CGROUP_ROOT_NAMELEN];
+};
+
+/*
+ * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
+ * subsystems that are otherwise unattached - it never has more than a
+ * single cgroup, and all tasks are part of that cgroup.
+ */
+static struct cgroupfs_root rootnode;
+
+/*
+ * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
+ * cgroup_subsys->use_id != 0.
+ */
+#define CSS_ID_MAX	(65535)
+struct css_id {
+	/*
+	 * The css to which this ID points. This pointer is set to a valid
+	 * value after the cgroup is populated. If the cgroup is removed,
+	 * this will be NULL.
+	 * This pointer is expected to be RCU-safe because destroy()
+	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
+	 * css_tryget() should be used for avoiding race.
+	 */
+	struct cgroup_subsys_state __rcu *css;
+	/*
+	 * ID of this css.
+	 */
+	unsigned short id;
+	/*
+	 * Depth in the hierarchy to which this ID belongs.
+	 */
+	unsigned short depth;
+	/*
+	 * ID is freed by RCU. (and lookup routine is RCU safe.)
+	 */
+	struct rcu_head rcu_head;
+	/*
+	 * Hierarchy to which this CSS ID belongs.
+	 */
+	unsigned short stack[0]; /* Array of Length (depth+1) */
+};
+
+/*
+ * cgroup_event represents events which userspace wants to receive.
+ */
+struct cgroup_event {
+	/*
+	 * Cgroup which the event belongs to.
+	 */
+	struct cgroup *cgrp;
+	/*
+	 * Control file with which the event is associated.
+	 */
+	struct cftype *cft;
+	/*
+	 * eventfd to signal userspace about the event.
+	 */
+	struct eventfd_ctx *eventfd;
+	/*
+	 * Each of these is stored in a list by the cgroup.
+	 */
+	struct list_head list;
+	/*
+	 * All fields below are needed to unregister the event when
+	 * userspace closes eventfd.
+	 */
+	poll_table pt;
+	wait_queue_head_t *wqh;
+	wait_queue_t wait;
+	struct work_struct remove;
+};
+
+/* The list of hierarchy roots */
+
+static LIST_HEAD(roots);
+static int root_count;
+
+static DEFINE_IDA(hierarchy_ida);
+static int next_hierarchy_id;
+static DEFINE_SPINLOCK(hierarchy_id_lock);
+
+/* dummytop is a shorthand for the dummy hierarchy's top cgroup */
+#define dummytop (&rootnode.top_cgroup)
+
+/* This flag indicates whether tasks in the fork and exit paths should
+ * check for fork/exit handlers to call. This avoids us having to do
+ * extra work in the fork/exit path if none of the subsystems need to
+ * be called.
+ */ +static int need_forkexit_callback __read_mostly; + +#ifdef CONFIG_PROVE_LOCKING +int cgroup_lock_is_held(void) +{ + return lockdep_is_held(&cgroup_mutex); +} +#else /* #ifdef CONFIG_PROVE_LOCKING */ +int cgroup_lock_is_held(void) +{ + return mutex_is_locked(&cgroup_mutex); +} +#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ + +EXPORT_SYMBOL_GPL(cgroup_lock_is_held); + +/* convenient tests for these bits */ +inline int cgroup_is_removed(const struct cgroup *cgrp) +{ + return test_bit(CGRP_REMOVED, &cgrp->flags); +} + +/* bits in struct cgroupfs_root flags field */ +enum { + ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ +}; + +static int cgroup_is_releasable(const struct cgroup *cgrp) +{ + const int bits = + (1 << CGRP_RELEASABLE) | + (1 << CGRP_NOTIFY_ON_RELEASE); + return (cgrp->flags & bits) == bits; +} + +static int notify_on_release(const struct cgroup *cgrp) +{ + return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); +} + +static int clone_children(const struct cgroup *cgrp) +{ + return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); +} + +/* + * for_each_subsys() allows you to iterate on each subsystem attached to + * an active hierarchy + */ +#define for_each_subsys(_root, _ss) \ +list_for_each_entry(_ss, &_root->subsys_list, sibling) + +/* for_each_active_root() allows you to iterate across the active hierarchies */ +#define for_each_active_root(_root) \ +list_for_each_entry(_root, &roots, root_list) + +/* the list of cgroups eligible for automatic release. Protected by + * release_list_lock */ +static LIST_HEAD(release_list); +static DEFINE_SPINLOCK(release_list_lock); +static void cgroup_release_agent(struct work_struct *work); +static DECLARE_WORK(release_agent_work, cgroup_release_agent); +static void check_for_release(struct cgroup *cgrp); + +/* + * A queue for waiters to do rmdir() cgroup. A tasks will sleep when + * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some + * reference to css->refcnt. 
In general, this refcnt is expected to go down
+ * to zero soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+/*
+ * If CGRP_WAIT_ON_RMDIR was set on @cgrp, clear it and wake every task
+ * sleeping on cgroup_rmdir_waitq. Does nothing when the flag was clear.
+ */
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
+{
+	if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+		wake_up_all(&cgroup_rmdir_waitq);
+}
+
+/* Takes a reference on @css via css_get(). */
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+	css_get(css);
+}
+
+/*
+ * Wakes any rmdir waiter on @css's cgroup, then drops a reference on
+ * @css via css_put().
+ */
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+	cgroup_wakeup_rmdir_waiter(css->cgroup);
+	css_put(css);
+}
+
+/* Link structure for associating css_set objects with cgroups */
+struct cg_cgroup_link {
+	/*
+	 * List running through cg_cgroup_links associated with a
+	 * cgroup, anchored on cgroup->css_sets
+	 */
+	struct list_head cgrp_link_list;
+	struct cgroup *cgrp;
+	/*
+	 * List running through cg_cgroup_links pointing at a
+	 * single css_set object, anchored on css_set->cg_links
+	 */
+	struct list_head cg_link_list;
+	struct css_set *cg;
+};
+
+/* The default css_set - used by init and its children prior to any
+ * hierarchies being mounted. It contains a pointer to the root state
+ * for each subsystem. Also used to anchor the list of css_sets. Not
+ * reference-counted, to improve performance when child cgroups
+ * haven't been created.
+ */
+
+static struct css_set init_css_set;
+static struct cg_cgroup_link init_css_set_link;
+
+static int cgroup_init_idr(struct cgroup_subsys *ss,
+			   struct cgroup_subsys_state *css);
+
+/* css_set_lock protects the list of css_set objects, and the
+ * chain of tasks off each css_set. Nests outside task->alloc_lock
+ * due to cgroup_iter_start() */
+static DEFINE_RWLOCK(css_set_lock);
+static int css_set_count;
+
+/*
+ * hash table for cgroup groups. This improves the performance to find
+ * an existing css_set. This hash doesn't (currently) take into
+ * account cgroups in empty hierarchies.
+ */
+#define CSS_SET_HASH_BITS	7
+#define CSS_SET_TABLE_SIZE	(1 << CSS_SET_HASH_BITS)
+static struct hlist_head css_set_table[CSS_SET_TABLE_SIZE];
+
+/*
+ * Pick the css_set_table bucket for an array of subsystem state pointers:
+ * the CGROUP_SUBSYS_COUNT pointer values are summed, the sum is folded
+ * with its own upper bits, and hash_long() reduces it to
+ * CSS_SET_HASH_BITS bits.
+ */
+static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
+{
+	int i;
+	int index;
+	unsigned long tmp = 0UL;
+
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
+		tmp += (unsigned long)css[i];
+	tmp = (tmp >> 16) ^ tmp;
+
+	index = hash_long(tmp, CSS_SET_HASH_BITS);
+
+	return &css_set_table[index];
+}
+
+/*
+ * Work item that finishes tearing down a css_set. Under css_set_lock each
+ * cg_cgroup_link is unlinked from both of its lists and freed; whenever
+ * that drops a cgroup's count to zero the cgroup is checked for release
+ * and any rmdir waiter is woken. The css_set itself is freed last.
+ */
+static void free_css_set_work(struct work_struct *work)
+{
+	struct css_set *cg = container_of(work, struct css_set, work);
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
+
+	write_lock(&css_set_lock);
+	list_for_each_entry_safe(link, saved_link, &cg->cg_links,
+				 cg_link_list) {
+		struct cgroup *cgrp = link->cgrp;
+		list_del(&link->cg_link_list);
+		list_del(&link->cgrp_link_list);
+		if (atomic_dec_and_test(&cgrp->count)) {
+			check_for_release(cgrp);
+			cgroup_wakeup_rmdir_waiter(cgrp);
+		}
+		kfree(link);
+	}
+	write_unlock(&css_set_lock);
+
+	kfree(cg);
+}
+
+/*
+ * RCU callback registered by put_css_set(): hands the css_set over to
+ * free_css_set_work() through schedule_work().
+ */
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+	struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+
+	INIT_WORK(&cg->work, free_css_set_work);
+	schedule_work(&cg->work);
+}
+
+/* We don't maintain the lists running through each css_set to its
+ * task until after the first call to cgroup_iter_start(). This
+ * reduces the fork()/exit() overhead for people who have cgroups
+ * compiled into their kernel but not actually in use */
+static int use_task_css_set_links __read_mostly;
+
+/*
+ * refcounted get/put for css_set objects
+ */
+static inline void get_css_set(struct css_set *cg)
+{
+	atomic_inc(&cg->refcount);
+}
+
+static void put_css_set(struct css_set *cg)
+{
+	/*
+	 * Ensure that the refcount doesn't hit zero while any readers
+	 * can see it. Similar to atomic_dec_and_lock(), but for an
+	 * rwlock
+	 */
+	if (atomic_add_unless(&cg->refcount, -1, 1))
+		return;
+	write_lock(&css_set_lock);
+	if (!atomic_dec_and_test(&cg->refcount)) {
+		write_unlock(&css_set_lock);
+		return;
+	}
+
+	/* Last reference: unhash under the lock, free after a grace period */
+	hlist_del(&cg->hlist);
+	css_set_count--;
+
+	write_unlock(&css_set_lock);
+	call_rcu(&cg->rcu_head, free_css_set_rcu);
+}
+
+/*
+ * compare_css_sets - helper function for find_existing_css_set().
+ * @cg: candidate css_set being tested
+ * @old_cg: existing css_set for a task
+ * @new_cgrp: cgroup that's being entered by the task
+ * @template: desired set of css pointers in css_set (pre-calculated)
+ *
+ * Returns true if "cg" matches "old_cg" except for the hierarchy
+ * which "new_cgrp" belongs to, for which it should match "new_cgrp".
+ */
+static bool compare_css_sets(struct css_set *cg,
+			     struct css_set *old_cg,
+			     struct cgroup *new_cgrp,
+			     struct cgroup_subsys_state *template[])
+{
+	struct list_head *l1, *l2;
+
+	if (memcmp(template, cg->subsys, sizeof(cg->subsys))) {
+		/* Not all subsystems matched */
+		return false;
+	}
+
+	/*
+	 * Compare cgroup pointers in order to distinguish between
+	 * different cgroups in hierarchies with no subsystems. We
+	 * could get by with just this check alone (and skip the
+	 * memcmp above) but on most setups the memcmp check will
+	 * avoid the need for this more expensive check on almost all
+	 * candidates.
+	 */
+
+	l1 = &cg->cg_links;
+	l2 = &old_cg->cg_links;
+	while (1) {
+		struct cg_cgroup_link *cgl1, *cgl2;
+		struct cgroup *cg1, *cg2;
+
+		l1 = l1->next;
+		l2 = l2->next;
+		/* See if we reached the end - both lists are equal length. */
+		if (l1 == &cg->cg_links) {
+			BUG_ON(l2 != &old_cg->cg_links);
+			break;
+		} else {
+			BUG_ON(l2 == &old_cg->cg_links);
+		}
+		/* Locate the cgroups associated with these links. */
+		cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list);
+		cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list);
+		cg1 = cgl1->cgrp;
+		cg2 = cgl2->cgrp;
+		/* Hierarchies should be linked in the same order. */
+		BUG_ON(cg1->root != cg2->root);
+
+		/*
+		 * If this hierarchy is the hierarchy of the cgroup
+		 * that's changing, then we need to check that this
+		 * css_set points to the new cgroup; if it's any other
+		 * hierarchy, then this css_set should point to the
+		 * same cgroup as the old css_set.
+		 */
+		if (cg1->root == new_cgrp->root) {
+			if (cg1 != new_cgrp)
+				return false;
+		} else {
+			if (cg1 != cg2)
+				return false;
+		}
+	}
+	return true;
+}
+
+/*
+ * find_existing_css_set() is a helper for
+ * find_css_set(), and checks to see whether an existing
+ * css_set is suitable.
+ *
+ * oldcg: the cgroup group that we're using before the cgroup
+ * transition
+ *
+ * cgrp: the cgroup that we're moving into
+ *
+ * template: location in which to build the desired set of subsystem
+ * state objects for the new cgroup group
+ *
+ * Returns the matching css_set, or NULL if none matched. No reference is
+ * taken here; find_css_set() takes one on the returned set.
+ */
+static struct css_set *find_existing_css_set(
+	struct css_set *oldcg,
+	struct cgroup *cgrp,
+	struct cgroup_subsys_state *template[])
+{
+	int i;
+	struct cgroupfs_root *root = cgrp->root;
+	struct hlist_head *hhead;
+	struct hlist_node *node;
+	struct css_set *cg;
+
+	/*
+	 * Build the set of subsystem state objects that we want to see in the
+	 * new css_set. while subsystems can change globally, the entries here
+	 * won't change, so no need for locking.
+	 */
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		if (root->subsys_bits & (1UL << i)) {
+			/* Subsystem is in this hierarchy. So we want
+			 * the subsystem state from the new
+			 * cgroup */
+			template[i] = cgrp->subsys[i];
+		} else {
+			/* Subsystem is not in this hierarchy, so we
+			 * don't want to change the subsystem state */
+			template[i] = oldcg->subsys[i];
+		}
+	}
+
+	hhead = css_set_hash(template);
+	hlist_for_each_entry(cg, node, hhead, hlist) {
+		if (!compare_css_sets(cg, oldcg, cgrp, template))
+			continue;
+
+		/* This css_set matches what we need */
+		return cg;
+	}
+
+	/* No existing cgroup group matched */
+	return NULL;
+}
+
+/*
+ * Free every cg_cgroup_link still chained on @tmp through its
+ * cgrp_link_list field.
+ */
+static void free_cg_links(struct list_head *tmp)
+{
+	struct cg_cgroup_link *link;
+	struct cg_cgroup_link *saved_link;
+
+	list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
+		list_del(&link->cgrp_link_list);
+		kfree(link);
+	}
+}
+
+/*
+ * allocate_cg_links() allocates "count" cg_cgroup_link structures
+ * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
+ * success or a negative error. On failure the links allocated so far are
+ * freed again.
+ */
+static int allocate_cg_links(int count, struct list_head *tmp)
+{
+	struct cg_cgroup_link *link;
+	int i;
+	INIT_LIST_HEAD(tmp);
+	for (i = 0; i < count; i++) {
+		link = kmalloc(sizeof(*link), GFP_KERNEL);
+		if (!link) {
+			free_cg_links(tmp);
+			return -ENOMEM;
+		}
+		list_add(&link->cgrp_link_list, tmp);
+	}
+	return 0;
+}
+
+/**
+ * link_css_set - a helper function to link a css_set to a cgroup
+ * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links()
+ * @cg: the css_set to be linked
+ * @cgrp: the destination cgroup
+ *
+ * Consumes the first link on @tmp_cg_links and bumps @cgrp's count.
+ */
+static void link_css_set(struct list_head *tmp_cg_links,
+			 struct css_set *cg, struct cgroup *cgrp)
+{
+	struct cg_cgroup_link *link;
+
+	BUG_ON(list_empty(tmp_cg_links));
+	link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
+				cgrp_link_list);
+	link->cg = cg;
+	link->cgrp = cgrp;
+	atomic_inc(&cgrp->count);
+	list_move(&link->cgrp_link_list, &cgrp->css_sets);
+	/*
+	 * Always add links to the tail of the list so that the list
+	 * is sorted by order of hierarchy creation
+	 */
+	list_add_tail(&link->cg_link_list, &cg->cg_links);
+}
+
+/*
+ * find_css_set() takes an existing cgroup group and a
+ * cgroup object, and returns a css_set object that's
+ * equivalent to the old group, but with the given cgroup
+ * substituted into the appropriate hierarchy. Must be called with
+ * cgroup_mutex held
+ *
+ * Returns NULL if a new css_set or its links cannot be allocated. The
+ * returned css_set carries a reference for the caller.
+ */
+static struct css_set *find_css_set(
+	struct css_set *oldcg, struct cgroup *cgrp)
+{
+	struct css_set *res;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	struct list_head tmp_cg_links;
+
+	struct hlist_head *hhead;
+	struct cg_cgroup_link *link;
+
+	/* First see if we already have a cgroup group that matches
+	 * the desired set */
+	read_lock(&css_set_lock);
+	res = find_existing_css_set(oldcg, cgrp, template);
+	if (res)
+		get_css_set(res);
+	read_unlock(&css_set_lock);
+
+	if (res)
+		return res;
+
+	res = kmalloc(sizeof(*res), GFP_KERNEL);
+	if (!res)
+		return NULL;
+
+	/* Allocate all the cg_cgroup_link objects that we'll need */
+	if (allocate_cg_links(root_count, &tmp_cg_links) < 0) {
+		kfree(res);
+		return NULL;
+	}
+
+	atomic_set(&res->refcount, 1);
+	INIT_LIST_HEAD(&res->cg_links);
+	INIT_LIST_HEAD(&res->tasks);
+	INIT_HLIST_NODE(&res->hlist);
+
+	/* Copy the set of subsystem state objects generated in
+	 * find_existing_css_set() */
+	memcpy(res->subsys, template, sizeof(res->subsys));
+
+	write_lock(&css_set_lock);
+	/* Add reference counts and links from the new css_set. */
+	list_for_each_entry(link, &oldcg->cg_links, cg_link_list) {
+		struct cgroup *c = link->cgrp;
+		if (c->root == cgrp->root)
+			c = cgrp;
+		link_css_set(&tmp_cg_links, res, c);
+	}
+
+	/* One link per hierarchy was allocated; all must have been used */
+	BUG_ON(!list_empty(&tmp_cg_links));
+
+	css_set_count++;
+
+	/* Add this cgroup group to the hash table */
+	hhead = css_set_hash(res->subsys);
+	hlist_add_head(&res->hlist, hhead);
+
+	write_unlock(&css_set_lock);
+
+	return res;
+}
+
+/*
+ * Return the cgroup for "task" from the given hierarchy. Must be
+ * called with cgroup_mutex held.
+ */
+static struct cgroup *task_cgroup_from_root(struct task_struct *task,
+					    struct cgroupfs_root *root)
+{
+	struct cgroup *found = NULL;
+	struct cg_cgroup_link *lnk;
+	struct css_set *cset;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+	read_lock(&css_set_lock);
+	/*
+	 * The task itself is not locked: cgroup_mutex is held, so the
+	 * task cannot change groups. The only thing that can happen is
+	 * that it exits and its css is set back to init_css_set.
+	 */
+	cset = task->cgroups;
+	if (cset == &init_css_set) {
+		/* The default css_set maps to the top cgroup of @root */
+		found = &root->top_cgroup;
+		goto out_unlock;
+	}
+
+	/* Otherwise pick the linked cgroup that lives in @root */
+	list_for_each_entry(lnk, &cset->cg_links, cg_link_list) {
+		if (lnk->cgrp->root != root)
+			continue;
+		found = lnk->cgrp;
+		break;
+	}
+
+out_unlock:
+	read_unlock(&css_set_lock);
+	BUG_ON(!found);
+	return found;
+}
+
+/*
+ * There is one global cgroup mutex. We also require taking
+ * task_lock() when dereferencing a task's cgroup subsys pointers.
+ * See "The task_lock() exception", at the end of this comment.
+ *
+ * A task must hold cgroup_mutex to modify cgroups.
+ *
+ * Any task can increment and decrement the count field without lock.
+ * So in general, code holding cgroup_mutex can't rely on the count
+ * field not changing. However, if the count goes to zero, then only
+ * cgroup_attach_task() can increment it again. Because a count of zero
+ * means that no tasks are currently attached, therefore there is no
+ * way a task attached to that cgroup can fork (the other way to
+ * increment the count). So code holding cgroup_mutex can safely
+ * assume that if the count is zero, it will stay zero. Similarly, if
+ * a task holds cgroup_mutex on a cgroup with zero count, it
+ * knows that the cgroup won't be removed, as cgroup_rmdir()
+ * needs that mutex.
+ *
+ * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
+ * (usually) take cgroup_mutex. These are the two most performance
+ * critical pieces of code here. The exception occurs on cgroup_exit(),
+ * when a task in a notify_on_release cgroup exits.
Then cgroup_mutex
+ * is taken, and if the cgroup count is zero, a usermode call is made
+ * to the release agent with the name of the cgroup (path relative to
+ * the root of cgroup file system) as the argument.
+ *
+ * A cgroup can only be deleted if both its 'count' of using tasks
+ * is zero, and its list of 'children' cgroups is empty. Since all
+ * tasks in the system use _some_ cgroup, and since there is always at
+ * least one task in the system (init, pid == 1), therefore, top_cgroup
+ * always has either children cgroups and/or using tasks. So we don't
+ * need a special hack to ensure that top_cgroup cannot be deleted.
+ *
+ * The task_lock() exception
+ *
+ * The need for this exception arises from the action of
+ * cgroup_attach_task(), which overwrites one task's cgroup pointer with
+ * another. It does so using cgroup_mutex, however there are
+ * several performance critical places that need to reference
+ * task->cgroups without the expense of grabbing a system global
+ * mutex. Therefore except as noted below, when dereferencing or, as
+ * in cgroup_attach_task(), modifying a task's cgroups pointer we use
+ * task_lock(), which acts on a spinlock (task->alloc_lock) already in
+ * the task_struct routinely used for such matters.
+ *
+ * P.S. One more locking exception. RCU is used to guard the
+ * update of a task's cgroup pointer by cgroup_attach_task()
+ */
+
+/**
+ * cgroup_lock - lock out any changes to cgroup structures
+ *
+ * Acquires the global cgroup_mutex; pair with cgroup_unlock().
+ */
+void cgroup_lock(void)
+{
+	mutex_lock(&cgroup_mutex);
+}
+EXPORT_SYMBOL_GPL(cgroup_lock);
+
+/**
+ * cgroup_unlock - release lock on cgroup changes
+ *
+ * Undo the lock taken in a previous cgroup_lock() call.
+ */
+void cgroup_unlock(void)
+{
+	mutex_unlock(&cgroup_mutex);
+}
+EXPORT_SYMBOL_GPL(cgroup_unlock);
+
+/*
+ * A couple of forward declarations required, due to cyclic reference loop:
+ * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir ->
+ * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations
+ * -> cgroup_mkdir.
+ */
+
+static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static struct dentry *cgroup_lookup(struct inode *, struct dentry *, struct nameidata *);
+static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
+static int cgroup_populate_dir(struct cgroup *cgrp);
+static const struct inode_operations cgroup_dir_inode_operations;
+static const struct file_operations proc_cgroupstats_operations;
+
+static struct backing_dev_info cgroup_backing_dev_info = {
+	.name		= "cgroup",
+	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
+};
+
+static int alloc_css_id(struct cgroup_subsys *ss,
+			struct cgroup *parent, struct cgroup *child);
+
+/*
+ * Allocate an inode on @sb and fill in its number, @mode, the current
+ * task's fsuid/fsgid, the current time for all three timestamps, and the
+ * cgroup backing_dev_info. Returns NULL if new_inode() fails.
+ */
+static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
+{
+	struct inode *inode = new_inode(sb);
+
+	if (inode) {
+		inode->i_ino = get_next_ino();
+		inode->i_mode = mode;
+		inode->i_uid = current_fsuid();
+		inode->i_gid = current_fsgid();
+		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+		inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
+	}
+	return inode;
+}
+
+/*
+ * Call subsys's pre_destroy handler.
+ * This is called before css refcnt check.
+ *
+ * Stops at the first handler that returns non-zero and returns that
+ * value; returns 0 if every handler succeeded.
+ */
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
+{
+	struct cgroup_subsys *ss;
+	int ret = 0;
+
+	for_each_subsys(cgrp->root, ss)
+		if (ss->pre_destroy) {
+			ret = ss->pre_destroy(ss, cgrp);
+			if (ret)
+				break;
+		}
+
+	return ret;
+}
+
+/*
+ * Final put of a dentry's inode. For a directory this also destroys the
+ * cgroup stored in dentry->d_fsdata: every attached subsystem's destroy()
+ * is called under cgroup_mutex, the hierarchy's cgroup count is
+ * decremented, the superblock reference is dropped, and the cgroup is
+ * freed via kfree_rcu().
+ */
+static void cgroup_diput(struct dentry *dentry, struct inode *inode)
+{
+	/* is dentry a directory ? if so, kfree() associated cgroup */
+	if (S_ISDIR(inode->i_mode)) {
+		struct cgroup *cgrp = dentry->d_fsdata;
+		struct cgroup_subsys *ss;
+		BUG_ON(!(cgroup_is_removed(cgrp)));
+		/* It's possible for external users to be holding css
+		 * reference counts on a cgroup; css_put() needs to
+		 * be able to access the cgroup after decrementing
+		 * the reference count in order to know if it needs to
+		 * queue the cgroup to be handled by the release
+		 * agent */
+		synchronize_rcu();
+
+		mutex_lock(&cgroup_mutex);
+		/*
+		 * Release the subsystem state objects.
+		 */
+		for_each_subsys(cgrp->root, ss)
+			ss->destroy(ss, cgrp);
+
+		cgrp->root->number_of_cgroups--;
+		mutex_unlock(&cgroup_mutex);
+
+		/*
+		 * Drop the active superblock reference that we took when we
+		 * created the cgroup
+		 */
+		deactivate_super(cgrp->root->sb);
+
+		/*
+		 * if we're getting rid of the cgroup, refcount should ensure
+		 * that there are no pidlists left.
+		 */
+		BUG_ON(!list_empty(&cgrp->pidlists));
+
+		kfree_rcu(cgrp, rcu_head);
+	}
+	iput(inode);
+}
+
+/*
+ * Always returns 1.
+ * NOTE(review): presumably installed as the dentry d_delete operation so
+ * that unused dentries are dropped rather than cached - confirm where the
+ * dentry operations are set up.
+ */
+static int cgroup_delete(const struct dentry *d)
+{
+	return 1;
+}
+
+/*
+ * Delete dentry @d and rmdir it from its parent. A reference on the
+ * parent is held across the operation.
+ */
+static void remove_dir(struct dentry *d)
+{
+	struct dentry *parent = dget(d->d_parent);
+
+	d_delete(d);
+	simple_rmdir(parent->d_inode, d);
+	dput(parent);
+}
+
+/*
+ * Remove every entry under @dentry. The caller must hold the directory
+ * inode's i_mutex. Both d_lock spinlocks are dropped around the
+ * d_delete()/simple_unlink()/dput() calls, so the walk restarts from the
+ * head of d_subdirs after each entry.
+ */
+static void cgroup_clear_directory(struct dentry *dentry)
+{
+	struct list_head *node;
+
+	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+	spin_lock(&dentry->d_lock);
+	node = dentry->d_subdirs.next;
+	while (node != &dentry->d_subdirs) {
+		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
+
+		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+		list_del_init(node);
+		if (d->d_inode) {
+			/* This should never be called on a cgroup
+			 * directory with child cgroups */
+			BUG_ON(d->d_inode->i_mode & S_IFDIR);
+			dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			spin_unlock(&dentry->d_lock);
+			d_delete(d);
+			simple_unlink(dentry->d_inode, d);
+			dput(d);
+			spin_lock(&dentry->d_lock);
+		} else
+			spin_unlock(&d->d_lock);
+		node = dentry->d_subdirs.next;
+	}
+	spin_unlock(&dentry->d_lock);
+}
+
+/*
+ * NOTE : the dentry must have been dget()'ed
+ *
+ * Clears the directory's contents, unlinks the dentry from its parent's
+ * child list, then removes the directory itself.
+ */
+static void cgroup_d_remove_dir(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	cgroup_clear_directory(dentry);
+
+	parent = dentry->d_parent;
+	spin_lock(&parent->d_lock);
+	spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	list_del_init(&dentry->d_u.d_child);
+	spin_unlock(&dentry->d_lock);
+	spin_unlock(&parent->d_lock);
+	remove_dir(dentry);
+}
+
+/*
+ * Call with cgroup_mutex held. Drops reference counts on modules, including
+ * any duplicate ones that parse_cgroupfs_options took. If this function
+ * returns an error, no reference counts are touched.
+ *
+ * Returns -EBUSY if a subsystem to be added is bound to another hierarchy
+ * or if @root already has child cgroups; 0 on success.
+ */
+static int rebind_subsystems(struct cgroupfs_root *root,
+			      unsigned long final_bits)
+{
+	unsigned long added_bits, removed_bits;
+	struct cgroup *cgrp = &root->top_cgroup;
+	int i;
+
+	BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
+	removed_bits = root->actual_subsys_bits & ~final_bits;
+	added_bits = final_bits & ~root->actual_subsys_bits;
+	/* Check that any added subsystems are currently free */
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		unsigned long bit = 1UL << i;
+		struct cgroup_subsys *ss = subsys[i];
+		if (!(bit & added_bits))
+			continue;
+		/*
+		 * Nobody should tell us to do a subsys that doesn't exist:
+		 * parse_cgroupfs_options should catch that case and refcounts
+		 * ensure that subsystems won't disappear once selected.
+		 */
+		BUG_ON(ss == NULL);
+		if (ss->root != &rootnode) {
+			/* Subsystem isn't free */
+			return -EBUSY;
+		}
+	}
+
+	/* Currently we don't handle adding/removing subsystems when
+	 * any child cgroups exist. This is theoretically supportable
+	 * but involves complex error handling, so it's being left until
+	 * later */
+	if (root->number_of_cgroups > 1)
+		return -EBUSY;
+
+	/* Process each subsystem */
+	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+		struct cgroup_subsys *ss = subsys[i];
+		unsigned long bit = 1UL << i;
+		if (bit & added_bits) {
+			/* We're binding this subsystem to this hierarchy */
+			BUG_ON(ss == NULL);
+			BUG_ON(cgrp->subsys[i]);
+			BUG_ON(!dummytop->subsys[i]);
+			BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+			mutex_lock(&ss->hierarchy_mutex);
+			cgrp->subsys[i] = dummytop->subsys[i];
+			cgrp->subsys[i]->cgroup = cgrp;
+			list_move(&ss->sibling, &root->subsys_list);
+			ss->root = root;
+			if (ss->bind)
+				ss->bind(ss, cgrp);
+			mutex_unlock(&ss->hierarchy_mutex);
+			/* refcount was already taken, and we're keeping it */
+		} else if (bit & removed_bits) {
+			/* We're removing this subsystem */
+			BUG_ON(ss == NULL);
+			BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
+			BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+			mutex_lock(&ss->hierarchy_mutex);
+			if (ss->bind)
+				ss->bind(ss, dummytop);
+			dummytop->subsys[i]->cgroup = dummytop;
+			cgrp->subsys[i] = NULL;
+			subsys[i]->root = &rootnode;
+			list_move(&ss->sibling, &rootnode.subsys_list);
+			mutex_unlock(&ss->hierarchy_mutex);
+			/* subsystem is now free - drop reference on module */
+			module_put(ss->module);
+		} else if (bit & final_bits) {
+			/* Subsystem state should already exist */
+			BUG_ON(ss == NULL);
+			BUG_ON(!cgrp->subsys[i]);
+			/*
+			 * a refcount was taken, but we already had one, so
+			 * drop the extra reference.
+			 */
+			module_put(ss->module);
+#ifdef CONFIG_MODULE_UNLOAD
+			BUG_ON(ss->module && !module_refcount(ss->module));
+#endif
+		} else {
+			/* Subsystem state shouldn't exist */
+			BUG_ON(cgrp->subsys[i]);
+		}
+	}
+	root->subsys_bits = root->actual_subsys_bits = final_bits;
+	synchronize_rcu();
+
+	return 0;
+}
+
+/*
+ * Emit the mount options of the hierarchy behind @vfs into @seq: one
+ * ",<subsystem>" per attached subsystem, then ",noprefix",
+ * ",release_agent=<path>", ",clone_children" and ",name=<name>" as
+ * applicable. Takes cgroup_mutex for the duration. Always returns 0.
+ */
+static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
+{
+	struct cgroupfs_root *root = vfs->mnt_sb->s_fs_info;
+	struct cgroup_subsys *ss;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_subsys(root, ss)
+		seq_printf(seq, ",%s", ss->name);
+	if (test_bit(ROOT_NOPREFIX, &root->flags))
+		seq_puts(seq, ",noprefix");
+	if (strlen(root->release_agent_path))
+		seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+	if (clone_children(&root->top_cgroup))
+		seq_puts(seq, ",clone_children");
+	if (strlen(root->name))
+		seq_printf(seq, ",name=%s", root->name);
+	mutex_unlock(&cgroup_mutex);
+	return 0;
+}
+
+/* Parsed mount options, filled in by parse_cgroupfs_options() */
+struct cgroup_sb_opts {
+	unsigned long subsys_bits;
+	unsigned long flags;
+	char *release_agent;
+	bool clone_children;
+	char *name;
+	/* User explicitly requested empty subsystem */
+	bool none;
+
+	struct cgroupfs_root *new_root;
+
+};
+
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with cgroup_mutex held to protect the subsys[] array. This function takes
+ * refcounts on subsystems to be used, unless it returns error, in which case
+ * no refcounts are taken.
+ */ +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) +{ + char *token, *o = data; + bool all_ss = false, one_ss = false; + unsigned long mask = (unsigned long)-1; + int i; + bool module_pin_failed = false; + + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + +#ifdef CONFIG_CPUSETS + mask = ~(1UL << cpuset_subsys_id); +#endif + + memset(opts, 0, sizeof(*opts)); + + while ((token = strsep(&o, ",")) != NULL) { + if (!*token) + return -EINVAL; + if (!strcmp(token, "none")) { + /* Explicitly have no subsystems */ + opts->none = true; + continue; + } + if (!strcmp(token, "all")) { + /* Mutually exclusive option 'all' + subsystem name */ + if (one_ss) + return -EINVAL; + all_ss = true; + continue; + } + if (!strcmp(token, "noprefix")) { + set_bit(ROOT_NOPREFIX, &opts->flags); + continue; + } + if (!strcmp(token, "clone_children")) { + opts->clone_children = true; + continue; + } + if (!strncmp(token, "release_agent=", 14)) { + /* Specifying two release agents is forbidden */ + if (opts->release_agent) + return -EINVAL; + opts->release_agent = + kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); + if (!opts->release_agent) + return -ENOMEM; + continue; + } + if (!strncmp(token, "name=", 5)) { + const char *name = token + 5; + /* Can't specify an empty name */ + if (!strlen(name)) + return -EINVAL; + /* Must match [\w.-]+ */ + for (i = 0; i < strlen(name); i++) { + char c = name[i]; + if (isalnum(c)) + continue; + if ((c == '.') || (c == '-') || (c == '_')) + continue; + return -EINVAL; + } + /* Specifying two names is forbidden */ + if (opts->name) + return -EINVAL; + opts->name = kstrndup(name, + MAX_CGROUP_ROOT_NAMELEN - 1, + GFP_KERNEL); + if (!opts->name) + return -ENOMEM; + + continue; + } + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (strcmp(token, ss->name)) + continue; + if (ss->disabled) + continue; + + /* Mutually exclusive option 'all' + subsystem name */ + if 
(all_ss) + return -EINVAL; + set_bit(i, &opts->subsys_bits); + one_ss = true; + + break; + } + if (i == CGROUP_SUBSYS_COUNT) + return -ENOENT; + } + + /* + * If the 'all' option was specified select all the subsystems, + * otherwise if 'none', 'name=' and a subsystem name options + * were not specified, let's default to 'all' + */ + if (all_ss || (!one_ss && !opts->none && !opts->name)) { + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (ss->disabled) + continue; + set_bit(i, &opts->subsys_bits); + } + } + + /* Consistency checks */ + + /* + * Option noprefix was introduced just for backward compatibility + * with the old cpuset, so we allow noprefix only if mounting just + * the cpuset subsystem. + */ + if (test_bit(ROOT_NOPREFIX, &opts->flags) && + (opts->subsys_bits & mask)) + return -EINVAL; + + + /* Can't specify "none" and some subsystems */ + if (opts->subsys_bits && opts->none) + return -EINVAL; + + /* + * We either have to specify by name or by subsystems. (So all + * empty hierarchies must have a name). + */ + if (!opts->subsys_bits && !opts->name) + return -EINVAL; + + /* + * Grab references on all the modules we'll need, so the subsystems + * don't dance around before rebind_subsystems attaches them. This may + * take duplicate reference counts on a subsystem that's already used, + * but rebind_subsystems handles this case. + */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + if (!try_module_get(subsys[i]->module)) { + module_pin_failed = true; + break; + } + } + if (module_pin_failed) { + /* + * oops, one of the modules was going away. this means that we + * raced with a module_delete call, and to the user this is + * essentially a "subsystem doesn't exist" case. 
+ */ + for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + /* drop refcounts only on the ones we took */ + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + module_put(subsys[i]->module); + } + return -ENOENT; + } + + return 0; +} + +static void drop_parsed_module_refcounts(unsigned long subsys_bits) +{ + int i; + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & subsys_bits)) + continue; + module_put(subsys[i]->module); + } +} + +static int cgroup_remount(struct super_block *sb, int *flags, char *data) +{ + int ret = 0; + struct cgroupfs_root *root = sb->s_fs_info; + struct cgroup *cgrp = &root->top_cgroup; + struct cgroup_sb_opts opts; + + mutex_lock(&cgrp->dentry->d_inode->i_mutex); + mutex_lock(&cgroup_mutex); + + /* See what subsystems are wanted */ + ret = parse_cgroupfs_options(data, &opts); + if (ret) + goto out_unlock; + + /* Don't allow flags or name to change at remount */ + if (opts.flags != root->flags || + (opts.name && strcmp(opts.name, root->name))) { + ret = -EINVAL; + drop_parsed_module_refcounts(opts.subsys_bits); + goto out_unlock; + } + + ret = rebind_subsystems(root, opts.subsys_bits); + if (ret) { + drop_parsed_module_refcounts(opts.subsys_bits); + goto out_unlock; + } + + /* (re)populate subsystem files */ + cgroup_populate_dir(cgrp); + + if (opts.release_agent) + strcpy(root->release_agent_path, opts.release_agent); + out_unlock: + kfree(opts.release_agent); + kfree(opts.name); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + return ret; +} + +static const struct super_operations cgroup_ops = { + .statfs = simple_statfs, + .drop_inode = generic_delete_inode, + .show_options = cgroup_show_options, + .remount_fs = cgroup_remount, +}; + +static void init_cgroup_housekeeping(struct cgroup *cgrp) +{ + INIT_LIST_HEAD(&cgrp->sibling); + INIT_LIST_HEAD(&cgrp->children); + INIT_LIST_HEAD(&cgrp->css_sets); + 
INIT_LIST_HEAD(&cgrp->release_list); + INIT_LIST_HEAD(&cgrp->pidlists); + mutex_init(&cgrp->pidlist_mutex); + INIT_LIST_HEAD(&cgrp->event_list); + spin_lock_init(&cgrp->event_list_lock); +} + +static void init_cgroup_root(struct cgroupfs_root *root) +{ + struct cgroup *cgrp = &root->top_cgroup; + INIT_LIST_HEAD(&root->subsys_list); + INIT_LIST_HEAD(&root->root_list); + root->number_of_cgroups = 1; + cgrp->root = root; + cgrp->top_cgroup = cgrp; + init_cgroup_housekeeping(cgrp); +} + +static bool init_root_id(struct cgroupfs_root *root) +{ + int ret = 0; + + do { + if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) + return false; + spin_lock(&hierarchy_id_lock); + /* Try to allocate the next unused ID */ + ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, + &root->hierarchy_id); + if (ret == -ENOSPC) + /* Try again starting from 0 */ + ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); + if (!ret) { + next_hierarchy_id = root->hierarchy_id + 1; + } else if (ret != -EAGAIN) { + /* Can only get here if the 31-bit IDR is full ... 
*/
+			BUG_ON(ret);
+		}
+		spin_unlock(&hierarchy_id_lock);
+	} while (ret);
+	return true;
+}
+/*
+ * cgroup_test_super - decide whether an existing superblock fits the options
+ * @sb: candidate superblock; sb->s_fs_info is its cgroupfs_root
+ * @data: the struct cgroup_sb_opts being mounted
+ *
+ * Returns 1 when @sb matches the requested name and subsystem set, 0
+ * otherwise.
+ */
+static int cgroup_test_super(struct super_block *sb, void *data)
+{
+	struct cgroup_sb_opts *opts = data;
+	struct cgroupfs_root *root = sb->s_fs_info;
+
+	/* If we asked for a name then it must match */
+	if (opts->name && strcmp(opts->name, root->name))
+		return 0;
+
+	/*
+	 * If we asked for subsystems (or explicitly for no
+	 * subsystems) then they must match
+	 */
+	if ((opts->subsys_bits || opts->none)
+	    && (opts->subsys_bits != root->subsys_bits))
+		return 0;
+
+	return 1;
+}
+/*
+ * cgroup_root_from_opts - allocate a cgroupfs_root described by mount options
+ * @opts: parsed mount options
+ *
+ * Returns NULL when @opts names neither subsystems nor "none",
+ * ERR_PTR(-ENOMEM) when the allocation or init_root_id() fails, and
+ * otherwise a zero-allocated root that has a hierarchy id and carries the
+ * subsystem bits, flags, release agent, name and clone_children setting
+ * from @opts.
+ */
+static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
+{
+	struct cgroupfs_root *root;
+
+	if (!opts->subsys_bits && !opts->none)
+		return NULL;
+
+	root = kzalloc(sizeof(*root), GFP_KERNEL);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
+
+	if (!init_root_id(root)) {
+		kfree(root);
+		return ERR_PTR(-ENOMEM);
+	}
+	init_cgroup_root(root);
+
+	root->subsys_bits = opts->subsys_bits;
+	root->flags = opts->flags;
+	/* NOTE(review): strcpy() relies on the option parser bounding these lengths */
+	if (opts->release_agent)
+		strcpy(root->release_agent_path, opts->release_agent);
+	if (opts->name)
+		strcpy(root->name, opts->name);
+	if (opts->clone_children)
+		set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
+	return root;
+}
+/*
+ * cgroup_drop_root - give back a root's hierarchy id and free the root
+ * @root: root to release; NULL is accepted and ignored
+ *
+ * BUGs if @root has a zero hierarchy_id.
+ */
+static void cgroup_drop_root(struct cgroupfs_root *root)
+{
+	if (!root)
+		return;
+
+	BUG_ON(!root->hierarchy_id);
+	spin_lock(&hierarchy_id_lock);
+	ida_remove(&hierarchy_ida, root->hierarchy_id);
+	spin_unlock(&hierarchy_id_lock);
+	kfree(root);
+}
+/*
+ * cgroup_set_super - bind opts->new_root to a freshly created superblock
+ * @sb: the new superblock
+ * @data: the struct cgroup_sb_opts being mounted
+ *
+ * Returns -EINVAL when no new root was prepared, the error from
+ * set_anon_super() if that fails, and 0 otherwise.
+ */
+static int cgroup_set_super(struct super_block *sb, void *data)
+{
+	int ret;
+	struct cgroup_sb_opts *opts = data;
+
+	/* If we don't have a new root, we can't set up a new sb */
+	if (!opts->new_root)
+		return -EINVAL;
+
+	BUG_ON(!opts->subsys_bits && !opts->none);
+
+	ret = set_anon_super(sb, NULL);
+	if (ret)
+		return ret;
+
+	sb->s_fs_info = opts->new_root;
+	opts->new_root->sb = sb;
+
+	sb->s_blocksize = PAGE_CACHE_SIZE;
+	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+
sb->s_magic = CGROUP_SUPER_MAGIC; + sb->s_op = &cgroup_ops; + + return 0; +} + +static int cgroup_get_rootdir(struct super_block *sb) +{ + static const struct dentry_operations cgroup_dops = { + .d_iput = cgroup_diput, + .d_delete = cgroup_delete, + }; + + struct inode *inode = + cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); + struct dentry *dentry; + + if (!inode) + return -ENOMEM; + + inode->i_fop = &simple_dir_operations; + inode->i_op = &cgroup_dir_inode_operations; + /* directories start off with i_nlink == 2 (for "." entry) */ + inc_nlink(inode); + dentry = d_alloc_root(inode); + if (!dentry) { + iput(inode); + return -ENOMEM; + } + sb->s_root = dentry; + /* for everything else we want ->d_op set */ + sb->s_d_op = &cgroup_dops; + return 0; +} + +static struct dentry *cgroup_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, + void *data) +{ + struct cgroup_sb_opts opts; + struct cgroupfs_root *root; + int ret = 0; + struct super_block *sb; + struct cgroupfs_root *new_root; + + /* First find the desired set of subsystems */ + mutex_lock(&cgroup_mutex); + ret = parse_cgroupfs_options(data, &opts); + mutex_unlock(&cgroup_mutex); + if (ret) + goto out_err; + + /* + * Allocate a new cgroup root. We may not need it if we're + * reusing an existing hierarchy. 
+ */ + new_root = cgroup_root_from_opts(&opts); + if (IS_ERR(new_root)) { + ret = PTR_ERR(new_root); + goto drop_modules; + } + opts.new_root = new_root; + + /* Locate an existing or new sb for this hierarchy */ + sb = sget(fs_type, cgroup_test_super, cgroup_set_super, &opts); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + cgroup_drop_root(opts.new_root); + goto drop_modules; + } + + root = sb->s_fs_info; + BUG_ON(!root); + if (root == opts.new_root) { + /* We used the new root structure, so this is a new hierarchy */ + struct list_head tmp_cg_links; + struct cgroup *root_cgrp = &root->top_cgroup; + struct inode *inode; + struct cgroupfs_root *existing_root; + int i; + + BUG_ON(sb->s_root != NULL); + + ret = cgroup_get_rootdir(sb); + if (ret) + goto drop_new_super; + inode = sb->s_root->d_inode; + + mutex_lock(&inode->i_mutex); + mutex_lock(&cgroup_mutex); + + if (strlen(root->name)) { + /* Check for name clashes with existing mounts */ + for_each_active_root(existing_root) { + if (!strcmp(existing_root->name, root->name)) { + ret = -EBUSY; + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + } + } + + /* + * We're accessing css_set_count without locking + * css_set_lock here, but that's OK - it can only be + * increased by someone holding cgroup_lock, and + * that's us. The worst that can happen is that we + * have some link structures left over + */ + ret = allocate_cg_links(css_set_count, &tmp_cg_links); + if (ret) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + goto drop_new_super; + } + + ret = rebind_subsystems(root, root->subsys_bits); + if (ret == -EBUSY) { + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + free_cg_links(&tmp_cg_links); + goto drop_new_super; + } + /* + * There must be no failure case after here, since rebinding + * takes care of subsystems' refcounts, which are explicitly + * dropped in the failure exit path. 
+ */ + + /* EBUSY should be the only error here */ + BUG_ON(ret); + + list_add(&root->root_list, &roots); + root_count++; + + sb->s_root->d_fsdata = root_cgrp; + root->top_cgroup.dentry = sb->s_root; + + /* Link the top cgroup in this hierarchy into all + * the css_set objects */ + write_lock(&css_set_lock); + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { + struct hlist_head *hhead = &css_set_table[i]; + struct hlist_node *node; + struct css_set *cg; + + hlist_for_each_entry(cg, node, hhead, hlist) + link_css_set(&tmp_cg_links, cg, root_cgrp); + } + write_unlock(&css_set_lock); + + free_cg_links(&tmp_cg_links); + + BUG_ON(!list_empty(&root_cgrp->sibling)); + BUG_ON(!list_empty(&root_cgrp->children)); + BUG_ON(root->number_of_cgroups != 1); + + cgroup_populate_dir(root_cgrp); + mutex_unlock(&cgroup_mutex); + mutex_unlock(&inode->i_mutex); + } else { + /* + * We re-used an existing hierarchy - the new root (if + * any) is not needed + */ + cgroup_drop_root(opts.new_root); + /* no subsys rebinding, so refcounts don't change */ + drop_parsed_module_refcounts(opts.subsys_bits); + } + + kfree(opts.release_agent); + kfree(opts.name); + return dget(sb->s_root); + + drop_new_super: + deactivate_locked_super(sb); + drop_modules: + drop_parsed_module_refcounts(opts.subsys_bits); + out_err: + kfree(opts.release_agent); + kfree(opts.name); + return ERR_PTR(ret); +} + +static void cgroup_kill_sb(struct super_block *sb) { + struct cgroupfs_root *root = sb->s_fs_info; + struct cgroup *cgrp = &root->top_cgroup; + int ret; + struct cg_cgroup_link *link; + struct cg_cgroup_link *saved_link; + + BUG_ON(!root); + + BUG_ON(root->number_of_cgroups != 1); + BUG_ON(!list_empty(&cgrp->children)); + BUG_ON(!list_empty(&cgrp->sibling)); + + mutex_lock(&cgroup_mutex); + + /* Rebind all subsystems back to the default hierarchy */ + ret = rebind_subsystems(root, 0); + /* Shouldn't be able to fail ... 
*/ + BUG_ON(ret); + + /* + * Release all the links from css_sets to this hierarchy's + * root cgroup + */ + write_lock(&css_set_lock); + + list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, + cgrp_link_list) { + list_del(&link->cg_link_list); + list_del(&link->cgrp_link_list); + kfree(link); + } + write_unlock(&css_set_lock); + + if (!list_empty(&root->root_list)) { + list_del(&root->root_list); + root_count--; + } + + mutex_unlock(&cgroup_mutex); + + kill_litter_super(sb); + cgroup_drop_root(root); +} + +static struct file_system_type cgroup_fs_type = { + .name = "cgroup", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + +static struct kobject *cgroup_kobj; + +static inline struct cgroup *__d_cgrp(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +static inline struct cftype *__d_cft(struct dentry *dentry) +{ + return dentry->d_fsdata; +} + +/** + * cgroup_path - generate the path of a cgroup + * @cgrp: the cgroup in question + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Called with cgroup_mutex held or else with an RCU-protected cgroup + * reference. Writes path of cgroup into buf. Returns 0 on success, + * -errno on error. 
+ */ +int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) +{ + char *start; + struct dentry *dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); + + if (!dentry || cgrp == dummytop) { + /* + * Inactive subsystems have no dentry for their root + * cgroup + */ + strcpy(buf, "/"); + return 0; + } + + start = buf + buflen; + + *--start = '\0'; + for (;;) { + int len = dentry->d_name.len; + + if ((start -= len) < buf) + return -ENAMETOOLONG; + memcpy(start, dentry->d_name.name, len); + cgrp = cgrp->parent; + if (!cgrp) + break; + + dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); + if (!cgrp->parent) + continue; + if (--start < buf) + return -ENAMETOOLONG; + *start = '/'; + } + memmove(buf, start, buf + buflen - start); + return 0; +} +EXPORT_SYMBOL_GPL(cgroup_path); + +/* + * cgroup_task_migrate - move a task from one cgroup to another. + * + * 'guarantee' is set if the caller promises that a new css_set for the task + * will already exist. If not set, this function might sleep, and can fail with + * -ENOMEM. Otherwise, it can only fail with -ESRCH. + */ +static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, + struct task_struct *tsk, bool guarantee) +{ + struct css_set *oldcg; + struct css_set *newcg; + + /* + * get old css_set. we need to take task_lock and refcount it, because + * an exiting task can change its css_set to init_css_set and drop its + * old one without taking cgroup_mutex. + */ + task_lock(tsk); + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + + /* locate or allocate a new css_set for this task. */ + if (guarantee) { + /* we know the css_set we want already exists. 
*/ + struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; + read_lock(&css_set_lock); + newcg = find_existing_css_set(oldcg, cgrp, template); + BUG_ON(!newcg); + get_css_set(newcg); + read_unlock(&css_set_lock); + } else { + might_sleep(); + /* find_css_set will give us newcg already referenced. */ + newcg = find_css_set(oldcg, cgrp); + if (!newcg) { + put_css_set(oldcg); + return -ENOMEM; + } + } + put_css_set(oldcg); + + /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */ + task_lock(tsk); + if (tsk->flags & PF_EXITING) { + task_unlock(tsk); + put_css_set(newcg); + return -ESRCH; + } + rcu_assign_pointer(tsk->cgroups, newcg); + task_unlock(tsk); + + /* Update the css_set linked lists if we're using them */ + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_move(&tsk->cg_list, &newcg->tasks); + write_unlock(&css_set_lock); + + /* + * We just gained a reference on oldcg by taking it from the task. As + * trading it for newcg is protected by cgroup_mutex, we're safe to drop + * it here; it will be freed under RCU. + */ + put_css_set(oldcg); + + set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + return 0; +} + +/** + * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' + * @cgrp: the cgroup the task is attaching to + * @tsk: the task to be attached + * + * Call holding cgroup_mutex. May take task_lock of + * the task 'tsk' during call. 
+ */
+int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
+	int retval;
+	struct cgroup_subsys *ss, *failed_ss = NULL;
+	struct cgroup *oldcgrp;
+	struct cgroupfs_root *root = cgrp->root;
+	struct css_set *cg;
+
+	/* Nothing to do if the task is already in that cgroup */
+	oldcgrp = task_cgroup_from_root(tsk, root);
+	if (cgrp == oldcgrp)
+		return 0;
+
+	for_each_subsys(root, ss) {
+		if (ss->can_attach) {
+			retval = ss->can_attach(ss, cgrp, tsk);
+			if (retval) {
+				/*
+				 * Remember on which subsystem the can_attach()
+				 * failed, so that we only call cancel_attach()
+				 * against the subsystems whose can_attach()
+				 * succeeded. (See below)
+				 */
+				failed_ss = ss;
+				goto out;
+			}
+		}
+		if (ss->can_attach_task) {
+			retval = ss->can_attach_task(cgrp, tsk);
+			if (retval) {
+				failed_ss = ss;
+				goto out;
+			}
+		}
+	}
+
+	/*
+	 * Hold a reference on the task's current css_set across the
+	 * migration and the attach callbacks below.
+	 */
+	task_lock(tsk);
+	cg = tsk->cgroups;
+	get_css_set(cg);
+	task_unlock(tsk);
+
+	retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+	if (retval) {
+		/*
+		 * The migration failed (-ENOMEM or -ESRCH), so the success
+		 * path below never runs: drop the reference taken above
+		 * here, otherwise it is leaked.
+		 */
+		put_css_set(cg);
+		goto out;
+	}
+
+	for_each_subsys(root, ss) {
+		if (ss->pre_attach)
+			ss->pre_attach(cgrp);
+		if (ss->attach_task)
+			ss->attach_task(cgrp, tsk);
+		if (ss->attach)
+			ss->attach(ss, cgrp, oldcgrp, tsk);
+	}
+	set_bit(CGRP_RELEASABLE, &cgrp->flags);
+	/* put_css_set will not destroy cg until after an RCU grace period */
+	put_css_set(cg);
+
+	/*
+	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
+	 * is no longer empty.
+	 */
+	cgroup_wakeup_rmdir_waiter(cgrp);
+out:
+	if (retval) {
+		for_each_subsys(root, ss) {
+			if (ss == failed_ss)
+				/*
+				 * This subsystem was the one that failed the
+				 * can_attach() check earlier, so we don't need
+				 * to call cancel_attach() against it or any
+				 * remaining subsystems.
+				 */
+				break;
+			if (ss->cancel_attach)
+				ss->cancel_attach(ss, cgrp, tsk);
+		}
+	}
+	return retval;
+}
+
+/**
+ * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
+ * @from: attach to all cgroups of a given task
+ * @tsk: the task to be attached
+ *
+ * Takes cgroup_lock() for the duration of the walk over the active
+ * hierarchies and stops at the first hierarchy for which
+ * cgroup_attach_task() fails. Returns 0 or that first error.
+ */
+int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
+{
+	struct cgroupfs_root *root;
+	int retval = 0;
+
+	cgroup_lock();
+	for_each_active_root(root) {
+		struct cgroup *from_cg = task_cgroup_from_root(from, root);
+
+		retval = cgroup_attach_task(from_cg, tsk);
+		if (retval)
+			break;
+	}
+	cgroup_unlock();
+
+	return retval;
+}
+EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
+
+/*
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+	struct css_set *cg;	/* prefetched css_set */
+	struct list_head links;	/* chains the entry into the prefetch list */
+};
+/*
+ * css_set_check_fetched - is the css_set this move needs already prefetched?
+ * @cgrp: destination cgroup
+ * @tsk: not used by this function
+ * @cg: the css_set the task currently uses
+ * @newcg_list: list of struct cg_list_entry prefetched so far
+ *
+ * Looks up the already-existing css_set that corresponds to @cg moved into
+ * @cgrp. Returns true only if such a css_set exists and is present on
+ * @newcg_list. The reference taken during the lookup is dropped before
+ * returning in every case.
+ */
+static bool css_set_check_fetched(struct cgroup *cgrp,
+				  struct task_struct *tsk, struct css_set *cg,
+				  struct list_head *newcg_list)
+{
+	struct css_set *newcg;
+	struct cg_list_entry *cg_entry;
+	struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+	read_lock(&css_set_lock);
+	newcg = find_existing_css_set(cg, cgrp, template);
+	if (newcg)
+		get_css_set(newcg);
+	read_unlock(&css_set_lock);
+
+	/* doesn't exist at all? */
+	if (!newcg)
+		return false;
+	/* see if it's already in the list */
+	list_for_each_entry(cg_entry, newcg_list, links) {
+		if (cg_entry->cg == newcg) {
+			put_css_set(newcg);
+			return true;
+		}
+	}
+
+	/* not found */
+	put_css_set(newcg);
+	return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */ +static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg, + struct list_head *newcg_list) +{ + struct css_set *newcg; + struct cg_list_entry *cg_entry; + + /* ensure a new css_set will exist for this thread */ + newcg = find_css_set(cg, cgrp); + if (!newcg) + return -ENOMEM; + /* add it to the list */ + cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL); + if (!cg_entry) { + put_css_set(newcg); + return -ENOMEM; + } + cg_entry->cg = newcg; + list_add(&cg_entry->links, newcg_list); + return 0; +} + +/** + * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup + * @cgrp: the cgroup to attach to + * @leader: the threadgroup leader task_struct of the group to be attached + * + * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will + * take task_lock of each thread in leader's threadgroup individually in turn. + */ +int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) +{ + int retval, i, group_size; + struct cgroup_subsys *ss, *failed_ss = NULL; + bool cancel_failed_ss = false; + /* guaranteed to be initialized later, but the compiler needs this */ + struct cgroup *oldcgrp = NULL; + struct css_set *oldcg; + struct cgroupfs_root *root = cgrp->root; + /* threadgroup list cursor and array */ + struct task_struct *tsk; + struct flex_array *group; + /* + * we need to make sure we have css_sets for all the tasks we're + * going to move -before- we actually start moving them, so that in + * case we get an ENOMEM we can bail out before making any changes. + */ + struct list_head newcg_list; + struct cg_list_entry *cg_entry, *temp_nobe; + + /* + * step 0: in order to do expensive, possibly blocking operations for + * every thread, we cannot iterate the thread group list, since it needs + * rcu or tasklist locked. instead, build an array of all threads in the + * group - threadgroup_fork_lock prevents new threads from appearing, + * and if threads exit, this will just be an over-estimate. 
+ */ + group_size = get_nr_threads(leader); + /* flex_array supports very large thread-groups better than kmalloc. */ + group = flex_array_alloc(sizeof(struct task_struct *), group_size, + GFP_KERNEL); + if (!group) + return -ENOMEM; + /* pre-allocate to guarantee space while iterating in rcu read-side. */ + retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL); + if (retval) + goto out_free_group_list; + + /* prevent changes to the threadgroup list while we take a snapshot. */ + rcu_read_lock(); + if (!thread_group_leader(leader)) { + /* + * a race with de_thread from another thread's exec() may strip + * us of our leadership, making while_each_thread unsafe to use + * on this task. if this happens, there is no choice but to + * throw this task away and try again (from cgroup_procs_write); + * this is "double-double-toil-and-trouble-check locking". + */ + rcu_read_unlock(); + retval = -EAGAIN; + goto out_free_group_list; + } + /* take a reference on each task in the group to go in the array. */ + tsk = leader; + i = 0; + do { + /* as per above, nr_threads may decrease, but not increase. */ + BUG_ON(i >= group_size); + get_task_struct(tsk); + /* + * saying GFP_ATOMIC has no effect here because we did prealloc + * earlier, but it's good form to communicate our expectations. + */ + retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC); + BUG_ON(retval != 0); + i++; + } while_each_thread(leader, tsk); + /* remember the number of threads in the array for later. */ + group_size = i; + rcu_read_unlock(); + + /* + * step 1: check that we can legitimately attach to the cgroup. + */ + for_each_subsys(root, ss) { + if (ss->can_attach) { + retval = ss->can_attach(ss, cgrp, leader); + if (retval) { + failed_ss = ss; + goto out_cancel_attach; + } + } + /* a callback to be run on every thread in the threadgroup. */ + if (ss->can_attach_task) { + /* run on each task in the threadgroup. 
*/ + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + retval = ss->can_attach_task(cgrp, tsk); + if (retval) { + failed_ss = ss; + cancel_failed_ss = true; + goto out_cancel_attach; + } + } + } + } + + /* + * step 2: make sure css_sets exist for all threads to be migrated. + * we use find_css_set, which allocates a new one if necessary. + */ + INIT_LIST_HEAD(&newcg_list); + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + /* nothing to do if this task is already in the cgroup */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* get old css_set pointer */ + task_lock(tsk); + oldcg = tsk->cgroups; + get_css_set(oldcg); + task_unlock(tsk); + /* see if the new one for us is already in the list? */ + if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) { + /* was already there, nothing to do. */ + put_css_set(oldcg); + } else { + /* we don't already have it. get new one. */ + retval = css_set_prefetch(cgrp, oldcg, &newcg_list); + put_css_set(oldcg); + if (retval) + goto out_list_teardown; + } + } + + /* + * step 3: now that we're guaranteed success wrt the css_sets, proceed + * to move all tasks to the new cgroup, calling ss->attach_task for each + * one along the way. there are no failure cases after here, so this is + * the commit point. + */ + for_each_subsys(root, ss) { + if (ss->pre_attach) + ss->pre_attach(cgrp); + } + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + /* leave current thread as it is if it's already there */ + oldcgrp = task_cgroup_from_root(tsk, root); + if (cgrp == oldcgrp) + continue; + /* attach each task to each subsystem */ + for_each_subsys(root, ss) { + if (ss->attach_task) + ss->attach_task(cgrp, tsk); + } + /* if the thread is PF_EXITING, it can just get skipped. */ + retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); + BUG_ON(retval != 0 && retval != -ESRCH); + } + /* nothing is sensitive to fork() after this point. 
*/ + + /* + * step 4: do expensive, non-thread-specific subsystem callbacks. + * TODO: if ever a subsystem needs to know the oldcgrp for each task + * being moved, this call will need to be reworked to communicate that. + */ + for_each_subsys(root, ss) { + if (ss->attach) + ss->attach(ss, cgrp, oldcgrp, leader); + } + + /* + * step 5: success! and cleanup + */ + synchronize_rcu(); + cgroup_wakeup_rmdir_waiter(cgrp); + retval = 0; +out_list_teardown: + /* clean up the list of prefetched css_sets. */ + list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) { + list_del(&cg_entry->links); + put_css_set(cg_entry->cg); + kfree(cg_entry); + } +out_cancel_attach: + /* same deal as in cgroup_attach_task */ + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) { + if (cancel_failed_ss && ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + break; + } + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, leader); + } + } + /* clean up the array of referenced threads in the group. */ + for (i = 0; i < group_size; i++) { + tsk = flex_array_get_ptr(group, i); + put_task_struct(tsk); + } +out_free_group_list: + flex_array_free(group); + return retval; +} + +static int cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk) +{ + struct cgroup_subsys *ss; + int ret; + + for_each_subsys(cgrp->root, ss) { + if (ss->allow_attach) { + ret = ss->allow_attach(cgrp, tsk); + if (ret) + return ret; + } else { + return -EACCES; + } + } + + return 0; +} + +/* + * Find the task_struct of the task to attach by vpid and pass it along to the + * function to attach either it or all tasks in its threadgroup. Will take + * cgroup_mutex; may take task_lock of task. 
+ */ +static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) +{ + struct task_struct *tsk; + const struct cred *cred = current_cred(), *tcred; + int ret; + + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + + if (pid) { + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + cgroup_unlock(); + return -ESRCH; + } + if (threadgroup) { + /* + * RCU protects this access, since tsk was found in the + * tid map. a race with de_thread may cause group_leader + * to stop being the leader, but cgroup_attach_proc will + * detect it later. + */ + tsk = tsk->group_leader; + } else if (tsk->flags & PF_EXITING) { + /* optimization for the single-task-only case */ + rcu_read_unlock(); + cgroup_unlock(); + return -ESRCH; + } + + /* + * even if we're attaching all tasks in the thread group, we + * only need to check permissions on one of them. + */ + tcred = __task_cred(tsk); + if (cred->euid && + cred->euid != tcred->uid && + cred->euid != tcred->suid) { + /* + * if the default permission check fails, give each + * cgroup a chance to extend the permission check + */ + ret = cgroup_allow_attach(cgrp, tsk); + if (ret) { + rcu_read_unlock(); + cgroup_unlock(); + return ret; + } + } + get_task_struct(tsk); + rcu_read_unlock(); + } else { + if (threadgroup) + tsk = current->group_leader; + else + tsk = current; + get_task_struct(tsk); + } + + if (threadgroup) { + threadgroup_fork_write_lock(tsk); + ret = cgroup_attach_proc(cgrp, tsk); + threadgroup_fork_write_unlock(tsk); + } else { + ret = cgroup_attach_task(cgrp, tsk); + } + put_task_struct(tsk); + cgroup_unlock(); + return ret; +} + +static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +{ + return attach_task_by_pid(cgrp, pid, false); +} + +static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +{ + int ret; + do { + /* + * attach_proc fails with -EAGAIN if threadgroup leadership + * changes in the middle of the 
operation, in which case we need + * to find the task_struct for the new leader and start over. + */ + ret = attach_task_by_pid(cgrp, tgid, true); + } while (ret == -EAGAIN); + return ret; +} + +/** + * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. + * @cgrp: the cgroup to be checked for liveness + * + * On success, returns true; the lock should be later released with + * cgroup_unlock(). On failure returns false with no lock held. + */ +bool cgroup_lock_live_group(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_mutex); + if (cgroup_is_removed(cgrp)) { + mutex_unlock(&cgroup_mutex); + return false; + } + return true; +} +EXPORT_SYMBOL_GPL(cgroup_lock_live_group); + +static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + if (strlen(buffer) >= PATH_MAX) + return -EINVAL; + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + strcpy(cgrp->root->release_agent_path, buffer); + cgroup_unlock(); + return 0; +} + +static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *seq) +{ + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + seq_puts(seq, cgrp->root->release_agent_path); + seq_putc(seq, '\n'); + cgroup_unlock(); + return 0; +} + +/* A buffer size big enough for numbers or short strings */ +#define CGROUP_LOCAL_BUFFER_SIZE 64 + +static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) +{ + char buffer[CGROUP_LOCAL_BUFFER_SIZE]; + int retval = 0; + char *end; + + if (!nbytes) + return -EINVAL; + if (nbytes >= sizeof(buffer)) + return -E2BIG; + if (copy_from_user(buffer, userbuf, nbytes)) + return -EFAULT; + + buffer[nbytes] = 0; /* nul-terminate */ + if (cft->write_u64) { + u64 val = simple_strtoull(strstrip(buffer), &end, 0); + if (*end) + return -EINVAL; + retval = 
cft->write_u64(cgrp, cft, val); + } else { + s64 val = simple_strtoll(strstrip(buffer), &end, 0); + if (*end) + return -EINVAL; + retval = cft->write_s64(cgrp, cft, val); + } + if (!retval) + retval = nbytes; + return retval; +} + +static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, + struct file *file, + const char __user *userbuf, + size_t nbytes, loff_t *unused_ppos) +{ + char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; + int retval = 0; + size_t max_bytes = cft->max_write_len; + char *buffer = local_buffer; + + if (!max_bytes) + max_bytes = sizeof(local_buffer) - 1; + if (nbytes >= max_bytes) + return -E2BIG; + /* Allocate a dynamic buffer if we need one */ + if (nbytes >= sizeof(local_buffer)) { + buffer = kmalloc(nbytes + 1, GFP_KERNEL); + if (buffer == NULL) + return -ENOMEM; + } + if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { + retval = -EFAULT; + goto out; + } + + buffer[nbytes] = 0; /* nul-terminate */ + retval = cft->write_string(cgrp, cft, strstrip(buffer)); + if (!retval) + retval = nbytes; +out: + if (buffer != local_buffer) + kfree(buffer); + return retval; +} + +static ssize_t cgroup_file_write(struct file *file, const char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + + if (cgroup_is_removed(cgrp)) + return -ENODEV; + if (cft->write) + return cft->write(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_u64 || cft->write_s64) + return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + if (cft->write_string) + return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); + if (cft->trigger) { + int ret = cft->trigger(cgrp, (unsigned int)cft->private); + return ret ? 
ret : nbytes;
+	}
+	return -EINVAL;
+}
+/*
+ * cgroup_read_u64 - read handler for control files that provide ->read_u64
+ *
+ * Formats the value returned by cft->read_u64() as an unsigned decimal
+ * followed by a newline and copies it out with simple_read_from_buffer(),
+ * whose result is returned. @file is not used.
+ */
+static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
+			       struct file *file,
+			       char __user *buf, size_t nbytes,
+			       loff_t *ppos)
+{
+	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
+	u64 val = cft->read_u64(cgrp, cft);
+	int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
+
+	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+/*
+ * cgroup_read_s64 - read handler for control files that provide ->read_s64
+ *
+ * Signed counterpart of cgroup_read_u64(): formats the value returned by
+ * cft->read_s64() as a signed decimal followed by a newline. @file is not
+ * used.
+ */
+static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
+			       struct file *file,
+			       char __user *buf, size_t nbytes,
+			       loff_t *ppos)
+{
+	char tmp[CGROUP_LOCAL_BUFFER_SIZE];
+	s64 val = cft->read_s64(cgrp, cft);
+	int len = sprintf(tmp, "%lld\n", (long long) val);
+
+	return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
+}
+/*
+ * cgroup_file_read - ->read entry point for cgroup control files
+ *
+ * The cftype is taken from the file's dentry and the cgroup from the parent
+ * dentry. Returns -ENODEV if the cgroup has been removed. Otherwise the
+ * first handler present, in the order ->read, ->read_u64, ->read_s64, is
+ * used; -EINVAL is returned when the cftype has none of them.
+ */
+static ssize_t cgroup_file_read(struct file *file, char __user *buf,
+				   size_t nbytes, loff_t *ppos)
+{
+	struct cftype *cft = __d_cft(file->f_dentry);
+	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+	if (cgroup_is_removed(cgrp))
+		return -ENODEV;
+
+	if (cft->read)
+		return cft->read(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->read_u64)
+		return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos);
+	if (cft->read_s64)
+		return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos);
+	return -EINVAL;
+}
+
+/*
+ * seqfile ops/methods for returning structured data. Currently just
+ * supports string->u64 maps, but can be extended in future.
+ */
+/* State attached to the seq_file of an open read_map/read_seq_string file. */
+struct cgroup_seqfile_state {
+	struct cftype *cft;	/* control file type of the open file */
+	struct cgroup *cgroup;	/* cgroup the file belongs to */
+};
+/*
+ * cgroup_map_add - ->fill callback handed to cft->read_map()
+ *
+ * Emits one "key value" line into the seq_file stored in cb->state and
+ * returns the result of seq_printf().
+ */
+static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value)
+{
+	struct seq_file *sf = cb->state;
+	return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value);
+}
+/*
+ * cgroup_seqfile_show - seq_file show routine for cgroup control files
+ *
+ * Uses cft->read_map() with cgroup_map_add() as the fill callback when the
+ * cftype provides it, and cft->read_seq_string() otherwise. @arg is unused.
+ */
+static int cgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+	struct cgroup_seqfile_state *state = m->private;
+	struct cftype *cft = state->cft;
+	if (cft->read_map) {
+		struct cgroup_map_cb cb = {
+			.fill = cgroup_map_add,
+			.state = m,
+		};
+		return cft->read_map(state->cgroup, cft, &cb);
+	}
+	return cft->read_seq_string(state->cgroup, cft, m);
+}
+/*
+ * cgroup_seqfile_release - free the per-open state, then release the seq_file
+ *
+ * seq->private is the cgroup_seqfile_state allocated in cgroup_file_open().
+ */
+static int cgroup_seqfile_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	kfree(seq->private);
+	return single_release(inode, file);
+}
+/* File operations switched in by cgroup_file_open() for seq_file-backed files. */
+static const struct file_operations cgroup_seqfile_operations = {
+	.read = seq_read,
+	.write = cgroup_file_write,
+	.llseek = seq_lseek,
+	.release = cgroup_seqfile_release,
+};
+/*
+ * cgroup_file_open - ->open entry point for cgroup control files
+ *
+ * For cftypes with ->read_map or ->read_seq_string a cgroup_seqfile_state
+ * is allocated, file->f_op is replaced with cgroup_seqfile_operations and
+ * the file is opened through single_open(); the state is freed again if
+ * single_open() fails. Otherwise cft->open() is called when present.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int cgroup_file_open(struct inode *inode, struct file *file)
+{
+	int err;
+	struct cftype *cft;
+
+	err = generic_file_open(inode, file);
+	if (err)
+		return err;
+	cft = __d_cft(file->f_dentry);
+
+	if (cft->read_map || cft->read_seq_string) {
+		struct cgroup_seqfile_state *state =
+			kzalloc(sizeof(*state), GFP_USER);
+		if (!state)
+			return -ENOMEM;
+		state->cft = cft;
+		state->cgroup = __d_cgrp(file->f_dentry->d_parent);
+		file->f_op = &cgroup_seqfile_operations;
+		err = single_open(file, cgroup_seqfile_show, state);
+		if (err < 0)
+			kfree(state);
+	} else if (cft->open)
+		err = cft->open(inode, file);
+	else
+		err = 0;
+
+	return err;
+}
+/* ->release entry point: forwards to cft->release() when the cftype has one. */
+static int cgroup_file_release(struct inode *inode, struct file *file)
+{
+	struct cftype *cft = __d_cft(file->f_dentry);
+	if (cft->release)
+		return cft->release(inode, file);
+	return 0;
+}
+
+/*
+ * cgroup_rename - Only allow simple rename of directories in place.
+ */
+static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
+			    struct inode *new_dir, struct dentry *new_dentry)
+{
+	if (!S_ISDIR(old_dentry->d_inode->i_mode))
+		return -ENOTDIR;
+	if (new_dentry->d_inode)
+		return -EEXIST;
+	if (old_dir != new_dir)	/* moving to another parent is refused */
+		return -EIO;
+	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
+}
+
+static const struct file_operations cgroup_file_operations = {
+	.read = cgroup_file_read,
+	.write = cgroup_file_write,
+	.llseek = generic_file_llseek,
+	.open = cgroup_file_open,
+	.release = cgroup_file_release,
+};
+
+static const struct inode_operations cgroup_dir_inode_operations = {
+	.lookup = cgroup_lookup,
+	.mkdir = cgroup_mkdir,
+	.rmdir = cgroup_rmdir,
+	.rename = cgroup_rename,
+};
+/*
+ * ->lookup for cgroup directories: rejects names longer than NAME_MAX and
+ * otherwise hashes the dentry with no inode attached (d_add with NULL).
+ */
+static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
+{
+	if (dentry->d_name.len > NAME_MAX)
+		return ERR_PTR(-ENAMETOOLONG);
+	d_add(dentry, NULL);
+	return NULL;
+}
+
+/*
+ * Check if a file is a control file
+ *
+ * Returns the cftype stored on the dentry, or ERR_PTR(-EINVAL) when the
+ * inode does not use cgroup_file_operations.
+ */
+static inline struct cftype *__file_cft(struct file *file)
+{
+	if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations)
+		return ERR_PTR(-EINVAL);
+	return __d_cft(file->f_dentry);
+}
+/*
+ * Allocate an inode for @dentry and instantiate it. Directories get the
+ * cgroup directory inode operations and are returned with i_mutex held;
+ * regular files get cgroup_file_operations. Returns -ENOENT for a NULL
+ * dentry, -EEXIST when the dentry already has an inode and -ENOMEM when
+ * no inode could be allocated.
+ */
+static int cgroup_create_file(struct dentry *dentry, mode_t mode,
+				struct super_block *sb)
+{
+	struct inode *inode;
+
+	if (!dentry)
+		return -ENOENT;
+	if (dentry->d_inode)
+		return -EEXIST;
+
+	inode = cgroup_new_inode(mode, sb);
+	if (!inode)
+		return -ENOMEM;
+
+	if (S_ISDIR(mode)) {
+		inode->i_op = &cgroup_dir_inode_operations;
+		inode->i_fop = &simple_dir_operations;
+
+		/* start off with i_nlink == 2 (for "." entry) */
+		inc_nlink(inode);
+
+		/* start with the directory inode held, so that we can
+		 * populate it without racing with another mkdir */
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+	} else if (S_ISREG(mode)) {
+		inode->i_size = 0;
+		inode->i_fop = &cgroup_file_operations;
+	}
+	d_instantiate(dentry, inode);
+	dget(dentry);	/* Extra count - pin the dentry in core */
+	return 0;
+}
+
+/*
+ * cgroup_create_dir - create a directory for an object.
+ * @cgrp: the cgroup we create the directory for. It must have a valid
+ *        ->parent field. And we are going to fill its ->dentry field.
+ * @dentry: dentry of the new cgroup
+ * @mode: mode to set on new directory.
+ *
+ * Returns 0 or the error from cgroup_create_file(). The reference dropped
+ * by the final dput() is re-taken with dget() on the success path only.
+ */
+static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
+				mode_t mode)
+{
+	struct dentry *parent;
+	int error = 0;
+
+	parent = cgrp->parent->dentry;
+	error = cgroup_create_file(dentry, S_IFDIR | mode, cgrp->root->sb);
+	if (!error) {
+		dentry->d_fsdata = cgrp;
+		inc_nlink(parent->d_inode);
+		rcu_assign_pointer(cgrp->dentry, dentry);
+		dget(dentry);
+	}
+	dput(dentry);
+
+	return error;
+}
+
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write handler
+ */
+static mode_t cgroup_file_mode(const struct cftype *cft)
+{
+	mode_t mode = 0;
+
+	if (cft->mode)
+		return cft->mode;
+
+	if (cft->read || cft->read_u64 || cft->read_s64 ||
+	    cft->read_map || cft->read_seq_string)
+		mode |= S_IRUGO;
+
+	if (cft->write || cft->write_u64 || cft->write_s64 ||
+	    cft->write_string || cft->trigger)
+		mode |= S_IWUSR;
+
+	return mode;
+}
+/*
+ * Create one control file in the directory of @cgrp. The file name is
+ * the cftype name, prefixed with the subsystem name and a dot when
+ * @subsys is given and the hierarchy does not have ROOT_NOPREFIX set.
+ * The caller must hold the i_mutex of the cgroup directory.
+ */
+int cgroup_add_file(struct cgroup *cgrp,
+		       struct cgroup_subsys *subsys,
+		       const struct cftype *cft)
+{
+	struct dentry *dir = cgrp->dentry;
+	struct dentry *dentry;
+	int error;
+	mode_t mode;
+
+	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
+		strcpy(name, subsys->name);
+		strcat(name, ".");
+	}
+	strcat(name, cft->name);
+	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+	dentry = lookup_one_len(name, dir, strlen(name));
+	if (!IS_ERR(dentry)) {
+		mode = cgroup_file_mode(cft);
+		error = cgroup_create_file(dentry, mode | S_IFREG,
+						cgrp->root->sb);
+		if (!error)
+			dentry->d_fsdata = (void *)cft;	/* read back via __d_cft() */
+		dput(dentry);
+	} else
+		error = PTR_ERR(dentry);
+	return error;
+}
+EXPORT_SYMBOL_GPL(cgroup_add_file);
+/*
+ * Add @count control files from the @cft array, stopping at the first
+ * error. Files created before the failure are left in place.
+ */
+int cgroup_add_files(struct cgroup *cgrp,
+			struct cgroup_subsys *subsys,
+			const struct cftype cft[],
+			int count)
+{
+	int i, err;
+	for (i = 0; i < count; i++) {
+		err = cgroup_add_file(cgrp, subsys, &cft[i]);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cgroup_add_files);
+
+/**
+ * cgroup_task_count - count the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ *
+ * Return the number of tasks in the cgroup.
+ *
+ * The value is the sum of the refcounts of every css_set linked to the
+ * cgroup, read under css_set_lock.
+ */
+int cgroup_task_count(const struct cgroup *cgrp)
+{
+	int count = 0;
+	struct cg_cgroup_link *link;
+
+	read_lock(&css_set_lock);
+	list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
+		count += atomic_read(&link->cg->refcount);
+	}
+	read_unlock(&css_set_lock);
+	return count;
+}
+
+/*
+ * Advance a list_head iterator.
The iterator should be positioned at
+ * the start of a css_set
+ */
+static void cgroup_advance_iter(struct cgroup *cgrp,
+				struct cgroup_iter *it)
+{
+	struct list_head *l = it->cg_link;
+	struct cg_cgroup_link *link;
+	struct css_set *cg;
+
+	/* Advance to the next non-empty css_set */
+	do {
+		l = l->next;
+		if (l == &cgrp->css_sets) {
+			it->cg_link = NULL;	/* wrapped around: iteration is over */
+			return;
+		}
+		link = list_entry(l, struct cg_cgroup_link, cgrp_link_list);
+		cg = link->cg;
+	} while (list_empty(&cg->tasks));
+	it->cg_link = l;
+	it->task = cg->tasks.next;
+}
+
+/*
+ * To reduce the fork() overhead for systems that are not actually
+ * using their cgroups capability, we don't maintain the lists running
+ * through each css_set to its tasks until we see the list actually
+ * used - in other words after the first call to cgroup_iter_start().
+ *
+ * The tasklist_lock is not held here, as do_each_thread() and
+ * while_each_thread() are protected by RCU.
+ */
+static void cgroup_enable_task_cg_lists(void)
+{
+	struct task_struct *p, *g;
+	write_lock(&css_set_lock);
+	use_task_css_set_links = 1;
+	do_each_thread(g, p) {
+		task_lock(p);
+		/*
+		 * We should check if the process is exiting, otherwise
+		 * it will race with cgroup_exit() in that the list
+		 * entry won't be deleted though the process has exited.
+		 */
+		if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list))
+			list_add(&p->cg_list, &p->cgroups->tasks);
+		task_unlock(p);
+	} while_each_thread(g, p);
+	write_unlock(&css_set_lock);
+}
+/*
+ * Begin iterating over the tasks of @cgrp. Takes css_set_lock for
+ * reading; the lock stays held until cgroup_iter_end() is called.
+ */
+void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it)
+{
+	/*
+	 * The first time anyone tries to iterate across a cgroup,
+	 * we need to enable the list linking each css_set to its
+	 * tasks, and fix up all existing tasks.
+	 */
+	if (!use_task_css_set_links)
+		cgroup_enable_task_cg_lists();
+
+	read_lock(&css_set_lock);
+	it->cg_link = &cgrp->css_sets;
+	cgroup_advance_iter(cgrp, it);
+}
+/*
+ * Return the task the iterator currently points at and step the iterator
+ * forward; returns NULL once every css_set of the cgroup was walked.
+ */
+struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
+					struct cgroup_iter *it)
+{
+	struct task_struct *res;
+	struct list_head *l = it->task;
+	struct cg_cgroup_link *link;
+
+	/* If the iterator cg is NULL, we have no tasks */
+	if (!it->cg_link)
+		return NULL;
+	res = list_entry(l, struct task_struct, cg_list);
+	/* Advance iterator to find next entry */
+	l = l->next;
+	link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
+	if (l == &link->cg->tasks) {
+		/* We reached the end of this task list - move on to
+		 * the next cg_cgroup_link */
+		cgroup_advance_iter(cgrp, it);
+	} else {
+		it->task = l;
+	}
+	return res;
+}
+/* Finish an iteration: drops the css_set_lock taken by cgroup_iter_start(). */
+void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it)
+{
+	read_unlock(&css_set_lock);
+}
+/*
+ * Return 1 when @t1 started after @time and 0 when it started before.
+ * Equal start times are ordered by comparing the @t1 and @t2 pointers.
+ */
+static inline int started_after_time(struct task_struct *t1,
+				     struct timespec *time,
+				     struct task_struct *t2)
+{
+	int start_diff = timespec_compare(&t1->start_time, time);
+	if (start_diff > 0) {
+		return 1;
+	} else if (start_diff < 0) {
+		return 0;
+	} else {
+		/*
+		 * Arbitrarily, if two processes started at the same
+		 * time, we'll say that the lower pointer value
+		 * started first. Note that t2 may have exited by now
+		 * so this may not be a valid pointer any longer, but
+		 * that's fine - it still serves to distinguish
+		 * between two tasks started (effectively) simultaneously.
+		 */
+		return t1 > t2;
+	}
+}
+
+/*
+ * This function is a callback from heap_insert() and is used to order
+ * the heap.
+ * In this case we order the heap in descending task start time.
+ */
+static inline int started_after(void *p1, void *p2)
+{
+	struct task_struct *candidate = p1;
+	struct task_struct *reference = p2;
+
+	/* Ties on start time are broken inside started_after_time(). */
+	return started_after_time(candidate, &reference->start_time,
+				  reference);
+}
+
+/**
+ * cgroup_scan_tasks - iterate through all the tasks in a cgroup
+ * @scan: struct cgroup_scanner containing arguments for the scan
+ *
+ * Arguments include pointers to callback functions test_task() and
+ * process_task().
+ * Iterate through all the tasks in a cgroup, calling test_task() for each,
+ * and if it returns true, call process_task() for it also.
+ * The test_task pointer may be NULL, meaning always true (select all tasks).
+ * Effectively duplicates cgroup_iter_{start,next,end}()
+ * but does not lock css_set_lock for the call to process_task().
+ * The struct cgroup_scanner may be embedded in any structure of the caller's
+ * creation.
+ * It is guaranteed that process_task() will act on every task that
+ * is a member of the cgroup for the duration of this call. This
+ * function may or may not call process_task() for tasks that exit
+ * or move to a different cgroup during the call, or are forked or
+ * move into the cgroup during the call.
+ *
+ * Note that test_task() may be called with locks held, and may in some
+ * situations be called multiple times for the same task, so it should
+ * be cheap.
+ * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been
+ * pre-allocated and will be used for heap operations (and its "gt" member will
+ * be overwritten), else a temporary heap will be used (allocation of which
+ * may cause this function to fail).
+ */
+int cgroup_scan_tasks(struct cgroup_scanner *scan)
+{
+	int retval, i;
+	struct cgroup_iter it;
+	struct task_struct *p, *dropped;
+	/* Never dereference latest_task, since it's not refcounted */
+	struct task_struct *latest_task = NULL;
+	struct ptr_heap tmp_heap;
+	struct ptr_heap *heap;
+	struct timespec latest_time = { 0, 0 };
+
+	if (scan->heap) {
+		/* The caller supplied our heap and pre-allocated its memory */
+		heap = scan->heap;
+		heap->gt = &started_after;
+	} else {
+		/* We need to allocate our own heap memory */
+		heap = &tmp_heap;
+		retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+		if (retval)
+			/* cannot allocate the heap */
+			return retval;
+	}
+
+ again:
+	/*
+	 * Scan tasks in the cgroup, using the scanner's "test_task" callback
+	 * to determine which are of interest, and using the scanner's
+	 * "process_task" callback to process any of them that need an update.
+	 * Since we don't want to hold any locks during the task updates,
+	 * gather tasks to be processed in a heap structure.
+	 * The heap is sorted by descending task start time.
+	 * If the statically-sized heap fills up, we overflow tasks that
+	 * started later, and in future iterations only consider tasks that
+	 * started after the latest task in the previous pass. This
+	 * guarantees forward progress and that we don't miss any tasks.
+	 */
+	heap->size = 0;	/* every pass starts from an empty heap */
+	cgroup_iter_start(scan->cg, &it);
+	while ((p = cgroup_iter_next(scan->cg, &it))) {
+		/*
+		 * Only affect tasks that qualify per the caller's callback,
+		 * if he provided one
+		 */
+		if (scan->test_task && !scan->test_task(p, scan))
+			continue;
+		/*
+		 * Only process tasks that started after the last task
+		 * we processed
+		 */
+		if (!started_after_time(p, &latest_time, latest_task))
+			continue;
+		dropped = heap_insert(heap, p);
+		if (dropped == NULL) {
+			/*
+			 * The new task was inserted; the heap wasn't
+			 * previously full
+			 */
+			get_task_struct(p);
+		} else if (dropped != p) {
+			/*
+			 * The new task was inserted, and pushed out a
+			 * different task
+			 */
+			get_task_struct(p);
+			put_task_struct(dropped);
+		}
+		/*
+		 * Else the new task was newer than anything already in
+		 * the heap and wasn't inserted
+		 */
+	}
+	cgroup_iter_end(scan->cg, &it);
+
+	if (heap->size) {
+		for (i = 0; i < heap->size; i++) {
+			struct task_struct *q = heap->ptrs[i];
+			if (i == 0) {
+				/* ptrs[0] sets the cutoff used by the next pass */
+				latest_time = q->start_time;
+				latest_task = q;
+			}
+			/* Process the task per the caller's callback */
+			scan->process_task(q, scan);
+			put_task_struct(q);	/* pairs with get_task_struct() above */
+		}
+		/*
+		 * If we had to process any tasks at all, scan again
+		 * in case some of them were in the middle of forking
+		 * children that didn't get processed.
+		 * Not the most efficient way to do it, but it avoids
+		 * having to take callback_mutex in the fork path
+		 */
+		goto again;
+	}
+	if (heap == &tmp_heap)	/* a caller supplied heap is left alone */
+		heap_free(&tmp_heap);
+	return 0;
+}
+
+/*
+ * Stuff for reading the 'tasks'/'procs' files.
+ *
+ * Reading this file can return large amounts of data if a cgroup has
+ * *lots* of attached tasks. So it may need several calls to read(),
+ * but we cannot guarantee that the information we produce is correct
+ * unless we produce it entirely atomically.
+ *
+ */
+
+/*
+ * The following two functions "fix" the issue where there are more pids
+ * than kmalloc will give memory for; in such cases, we use vmalloc/vfree.
+ * TODO: replace with a kernel-wide solution to this problem
+ */
+#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2))
+static void *pidlist_allocate(int count)	/* room for count pid_t entries */
+{
+	if (PIDLIST_TOO_LARGE(count))
+		return vmalloc(count * sizeof(pid_t));
+	else
+		return kmalloc(count * sizeof(pid_t), GFP_KERNEL);
+}
+static void pidlist_free(void *p)	/* frees with vfree() or kfree() by address */
+{
+	if (is_vmalloc_addr(p))
+		vfree(p);
+	else
+		kfree(p);
+}
+static void *pidlist_resize(void *p, int newcount)	/* NULL on failure */
+{
+	void *newlist;
+	/* note: if new alloc fails, old p will still be valid either way */
+	if (is_vmalloc_addr(p)) {
+		newlist = vmalloc(newcount * sizeof(pid_t));
+		if (!newlist)
+			return NULL;
+		memcpy(newlist, p, newcount * sizeof(pid_t));
+		vfree(p);
+	} else {
+		newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL);
+	}
+	return newlist;
+}
+
+/*
+ * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries
+ * If the new stripped list is sufficiently smaller and there's enough memory
+ * to allocate a new buffer, will let go of the unneeded memory. Returns the
+ * number of unique elements.
+ *
+ * Only adjacent duplicates are removed, so the list has to be sorted first.
+ */
+/* is the size difference enough that we should re-allocate the array? */
+#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new))
+static int pidlist_uniq(pid_t **p, int length)
+{
+	int src, dest = 1;
+	pid_t *list = *p;
+	pid_t *newlist;
+
+	/*
+	 * we presume the 0th element is unique, so i starts at 1. trivial
+	 * edge cases first; no work needs to be done for either
+	 */
+	if (length == 0 || length == 1)
+		return length;
+	/* src and dest walk down the list; dest counts unique elements */
+	for (src = 1; src < length; src++) {
+		/* find next unique element */
+		while (list[src] == list[src-1]) {
+			src++;
+			if (src == length)
+				goto after;
+		}
+		/* dest always points to where the next unique element goes */
+		list[dest] = list[src];
+		dest++;
+	}
+after:
+	/*
+	 * if the length difference is large enough, we want to allocate a
+	 * smaller buffer to save memory. if this fails due to out of memory,
+	 * we'll just stay with what we've got.
+	 */
+	if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) {
+		newlist = pidlist_resize(list, dest);
+		if (newlist)
+			*p = newlist;
+	}
+	return dest;
+}
+/* sort() comparator: orders two pid_t values ascending by subtraction. */
+static int cmppid(const void *a, const void *b)
+{
+	return *(pid_t *)a - *(pid_t *)b;
+}
+
+/*
+ * find the appropriate pidlist for our purpose (given procs vs tasks)
+ * returns with the lock on that pidlist already held, and takes care
+ * of the use count, or returns NULL with no locks held if we're out of
+ * memory.
+ */
+static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp,
+						  enum cgroup_filetype type)
+{
+	struct cgroup_pidlist *l;
+	/* don't need task_nsproxy() if we're looking at ourself */
+	struct pid_namespace *ns = current->nsproxy->pid_ns;
+
+	/*
+	 * We can't drop the pidlist_mutex before taking the l->mutex in case
+	 * the last ref-holder is trying to remove l from the list at the same
+	 * time. Holding the pidlist_mutex precludes somebody taking whichever
+	 * list we find out from under us - compare release_pid_array().
+	 */
+	mutex_lock(&cgrp->pidlist_mutex);
+	list_for_each_entry(l, &cgrp->pidlists, links) {
+		if (l->key.type == type && l->key.ns == ns) {
+			/* make sure l doesn't vanish out from under us */
+			down_write(&l->mutex);
+			mutex_unlock(&cgrp->pidlist_mutex);
+			return l;
+		}
+	}
+	/* entry not found; create a new one */
+	l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL);
+	if (!l) {
+		mutex_unlock(&cgrp->pidlist_mutex);
+		return l;
+	}
+	init_rwsem(&l->mutex);
+	down_write(&l->mutex);
+	l->key.type = type;
+	l->key.ns = get_pid_ns(ns);
+	l->use_count = 0; /* don't increment here */
+	l->list = NULL;	/* ->length is filled in by pidlist_array_load() */
+	l->owner = cgrp;
+	list_add(&l->links, &cgrp->pidlists);
+	mutex_unlock(&cgrp->pidlist_mutex);
+	return l;
+}
+
+/*
+ * Load a cgroup's pidarray with either procs' tgids or tasks' pids
+ *
+ * On success *lp points at the pidlist, whose use count was raised by
+ * one, and 0 is returned. Returns -ENOMEM when the array or the pidlist
+ * cannot be allocated.
+ */
+static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
+			      struct cgroup_pidlist **lp)
+{
+	pid_t *array;
+	int length;
+	int pid, n = 0; /* used for populating the array */
+	struct cgroup_iter it;
+	struct task_struct *tsk;
+	struct cgroup_pidlist *l;
+
+	/*
+	 * If cgroup gets more users after we read count, we won't have
+	 * enough space - tough. This race is indistinguishable to the
+	 * caller from the case that the additional cgroup users didn't
+	 * show up until sometime later on.
+	 */
+	length = cgroup_task_count(cgrp);
+	array = pidlist_allocate(length);
+	if (!array)
+		return -ENOMEM;
+	/* now, populate the array */
+	cgroup_iter_start(cgrp, &it);
+	while ((tsk = cgroup_iter_next(cgrp, &it))) {
+		if (unlikely(n == length))
+			break;
+		/* get tgid or pid for procs or tasks file respectively */
+		if (type == CGROUP_FILE_PROCS)
+			pid = task_tgid_vnr(tsk);
+		else
+			pid = task_pid_vnr(tsk);
+		if (pid > 0) /* make sure to only use valid results */
+			array[n++] = pid;
+	}
+	cgroup_iter_end(cgrp, &it);
+	length = n;
+	/* now sort & (if procs) strip out duplicates */
+	sort(array, length, sizeof(pid_t), cmppid, NULL);
+	if (type == CGROUP_FILE_PROCS)
+		length = pidlist_uniq(&array, length);
+	l = cgroup_pidlist_find(cgrp, type);
+	if (!l) {
+		pidlist_free(array);
+		return -ENOMEM;
+	}
+	/* store array, freeing old if necessary - lock already held */
+	pidlist_free(l->list);
+	l->list = array;
+	l->length = length;
+	l->use_count++;
+	up_write(&l->mutex);
+	*lp = l;
+	return 0;
+}
+
+/**
+ * cgroupstats_build - build and fill cgroupstats
+ * @stats: cgroupstats to fill information into
+ * @dentry: A dentry entry belonging to the cgroup for which stats have
+ * been requested.
+ *
+ * Build and fill cgroupstats so that taskstats can export it to user
+ * space.
+ *
+ * Returns 0 on success, or -EINVAL when @dentry is not a directory on a
+ * cgroup superblock.
+ */
+int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
+{
+	int ret = -EINVAL;
+	struct cgroup *cgrp;
+	struct cgroup_iter it;
+	struct task_struct *tsk;
+
+	/*
+	 * Validate dentry by checking the superblock operations,
+	 * and make sure it's a directory.
+	 */
+	if (dentry->d_sb->s_op != &cgroup_ops ||
+	     !S_ISDIR(dentry->d_inode->i_mode))
+		 goto err;
+
+	ret = 0;
+	cgrp = dentry->d_fsdata;
+
+	cgroup_iter_start(cgrp, &it);
+	while ((tsk = cgroup_iter_next(cgrp, &it))) {
+		switch (tsk->state) {
+		case TASK_RUNNING:
+			stats->nr_running++;
+			break;
+		case TASK_INTERRUPTIBLE:
+			stats->nr_sleeping++;
+			break;
+		case TASK_UNINTERRUPTIBLE:
+			stats->nr_uninterruptible++;
+			break;
+		case TASK_STOPPED:
+			stats->nr_stopped++;
+			break;
+		default:
+			if (delayacct_is_task_waiting_on_io(tsk))
+				stats->nr_io_wait++;
+			break;
+		}
+	}
+	cgroup_iter_end(cgrp, &it);
+
+err:
+	return ret;
+}
+
+
+/*
+ * seq_file methods for the tasks/procs files. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->l->list array.
+ */
+
+static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
+{
+	/*
+	 * Initially we receive a position value that corresponds to
+	 * one more than the last pid shown (or 0 on the first call or
+	 * after a seek to the start). Use a binary-search to find the
+	 * next pid to display, if any
+	 */
+	struct cgroup_pidlist *l = s->private;
+	int index = 0, pid = *pos;
+	int *iter;
+
+	down_read(&l->mutex);	/* released in cgroup_pidlist_stop() */
+	if (pid) {
+		int end = l->length;
+
+		while (index < end) {
+			int mid = (index + end) / 2;
+			if (l->list[mid] == pid) {
+				index = mid;
+				break;
+			} else if (l->list[mid] <= pid)
+				index = mid + 1;
+			else
+				end = mid;
+		}
+	}
+	/* If we're off the end of the array, we're done */
+	if (index >= l->length)
+		return NULL;
+	/* Update the abstract position to be the actual pid that we found */
+	iter = l->list + index;
+	*pos = *iter;
+	return iter;
+}
+/* seq_file stop method: drops the read lock taken in cgroup_pidlist_start(). */
+static void cgroup_pidlist_stop(struct seq_file *s, void *v)
+{
+	struct cgroup_pidlist *l = s->private;
+	up_read(&l->mutex);
+}
+
+static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct cgroup_pidlist *l = s->private;
+	pid_t *p = v;
+	pid_t *end = l->list + l->length;
+	/*
+	 * Advance to the next pid in the array. If this goes off the
+	 * end, we're done
+	 */
+	p++;
+	if (p >= end) {
+		return NULL;
+	} else {
+		*pos = *p;
+		return p;
+	}
+}
+/* seq_file show method: prints the pid the iterator points at, one per line. */
+static int cgroup_pidlist_show(struct seq_file *s, void *v)
+{
+	return seq_printf(s, "%d\n", *(int *)v);
+}
+
+/*
+ * seq_operations functions for iterating on pidlists through seq_file -
+ * independent of whether it's tasks or procs
+ */
+static const struct seq_operations cgroup_pidlist_seq_operations = {
+	.start = cgroup_pidlist_start,
+	.stop = cgroup_pidlist_stop,
+	.next = cgroup_pidlist_next,
+	.show = cgroup_pidlist_show,
+};
+/* Drop one use of @l; the last user unlinks and frees the pidlist. */
+static void cgroup_release_pid_array(struct cgroup_pidlist *l)
+{
+	/*
+	 * the case where we're the last user of this particular pidlist will
+	 * have us remove it from the cgroup's list, which entails taking the
+	 * mutex. since in pidlist_find the pidlist->lock depends on cgroup->
+	 * pidlist_mutex, we have to take pidlist_mutex first.
+	 */
+	mutex_lock(&l->owner->pidlist_mutex);
+	down_write(&l->mutex);
+	BUG_ON(!l->use_count);
+	if (!--l->use_count) {
+		/* we're the last user if refcount is 0; remove and free */
+		list_del(&l->links);
+		mutex_unlock(&l->owner->pidlist_mutex);
+		pidlist_free(l->list);
+		put_pid_ns(l->key.ns);
+		up_write(&l->mutex);
+		kfree(l);
+		return;
+	}
+	mutex_unlock(&l->owner->pidlist_mutex);
+	up_write(&l->mutex);
+}
+/* ->release for tasks/procs files; only readers hold a pidlist reference. */
+static int cgroup_pidlist_release(struct inode *inode, struct file *file)
+{
+	struct cgroup_pidlist *l;
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+	/*
+	 * the seq_file will only be initialized if the file was opened for
+	 * reading; hence we check if it's not null only in that case.
+	 */
+	l = ((struct seq_file *)file->private_data)->private;
+	cgroup_release_pid_array(l);
+	return seq_release(inode, file);
+}
+
+static const struct file_operations cgroup_pidlist_operations = {
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.write = cgroup_file_write,
+	.release = cgroup_pidlist_release,
+};
+
+/*
+ * The following functions handle opens on a file that displays a pidlist
+ * (tasks or procs). Prepare an array of the process/thread IDs of whoever's
+ * in the cgroup.
+ */
+/*
+ * helper function for the two below it
+ *
+ * Loads the pidlist selected by type, switches the file over to
+ * cgroup_pidlist_operations and attaches the list to the seq_file. The
+ * pidlist reference is dropped again when seq_open() fails.
+ */
+static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type)
+{
+	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+	struct cgroup_pidlist *l;
+	int retval;
+
+	/* Nothing to do for write-only files */
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	/* have the array populated */
+	retval = pidlist_array_load(cgrp, type, &l);
+	if (retval)
+		return retval;
+	/* configure file information */
+	file->f_op = &cgroup_pidlist_operations;
+
+	retval = seq_open(file, &cgroup_pidlist_seq_operations);
+	if (retval) {
+		cgroup_release_pid_array(l);
+		return retval;
+	}
+	((struct seq_file *)file->private_data)->private = l;
+	return 0;
+}
+static int cgroup_tasks_open(struct inode *unused, struct file *file)	/* tasks file */
+{
+	return cgroup_pidlist_open(file, CGROUP_FILE_TASKS);
+}
+static int cgroup_procs_open(struct inode *unused, struct file *file)	/* procs file */
+{
+	return cgroup_pidlist_open(file, CGROUP_FILE_PROCS);
+}
+/* read_u64 handler of the notify_on_release file. */
+static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
+					    struct cftype *cft)
+{
+	return notify_on_release(cgrp);
+}
+/*
+ * write_u64 handler of the notify_on_release file: any non-zero value
+ * sets CGRP_NOTIFY_ON_RELEASE and zero clears it. CGRP_RELEASABLE is
+ * cleared on every write.
+ */
+static int cgroup_write_notify_on_release(struct cgroup *cgrp,
+					  struct cftype *cft,
+					  u64 val)
+{
+	clear_bit(CGRP_RELEASABLE, &cgrp->flags);
+	if (val)
+		set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+	else
+		clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+	return 0;
+}
+
+/*
+ * Unregister event and free resources.
+ *
+ * Gets called from workqueue.
+ */
+static void cgroup_event_remove(struct work_struct *work)
+{
+	struct cgroup_event *event = container_of(work, struct cgroup_event,
+			remove);
+	struct cgroup *cgrp = event->cgrp;
+
+	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
+
+	eventfd_ctx_put(event->eventfd);
+	kfree(event);
+	dput(cgrp->dentry);	/* pairs with the dget() done at registration */
+}
+
+/*
+ * Gets called on POLLHUP on eventfd when user closes it.
+ *
+ * Called with wqh->lock held and interrupts disabled.
+ */
+static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
+		int sync, void *key)
+{
+	unsigned long poll_bits = (unsigned long)key;
+	struct cgroup_event *ev;
+	struct cgroup *owner;
+
+	/* Anything other than a hang-up of the eventfd is ignored. */
+	if (!(poll_bits & POLLHUP))
+		return 0;
+
+	ev = container_of(wait, struct cgroup_event, wait);
+	owner = ev->cgrp;
+
+	/* Detach from the eventfd wait queue and from the cgroup's list. */
+	__remove_wait_queue(ev->wqh, &ev->wait);
+	spin_lock(&owner->event_list_lock);
+	list_del(&ev->list);
+	spin_unlock(&owner->event_list_lock);
+
+	/*
+	 * This runs in atomic context while cgroup_event_remove() may
+	 * sleep, so the teardown is deferred to a workqueue.
+	 */
+	schedule_work(&ev->remove);
+	return 0;
+}
+
+/*
+ * poll_table queueing callback: remembers the wait queue head of the
+ * eventfd in the event and hooks the event's wait entry onto it.
+ */
+static void cgroup_event_ptable_queue_proc(struct file *file,
+		wait_queue_head_t *wqh, poll_table *pt)
+{
+	struct cgroup_event *ev = container_of(pt, struct cgroup_event, pt);
+
+	ev->wqh = wqh;
+	add_wait_queue(wqh, &ev->wait);
+}
+
+/*
+ * Parse input and register new cgroup event handler.
+ *
+ * Input must be in format '<event_fd> <control_fd> <args>'.
+ * Interpretation of args is defined by control file implementation.
+ */ +static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + struct cgroup_event *event = NULL; + unsigned int efd, cfd; + struct file *efile = NULL; + struct file *cfile = NULL; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + event->cgrp = cgrp; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = eventfd_fget(efd); + if (IS_ERR(efile)) { + ret = PTR_ERR(efile); + goto fail; + } + + event->eventfd = eventfd_ctx_fileget(efile); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto fail; + } + + cfile = fget(cfd); + if (!cfile) { + ret = -EBADF; + goto fail; + } + + /* the process need read permission on control file */ + ret = file_permission(cfile, MAY_READ); + if (ret < 0) + goto fail; + + event->cft = __file_cft(cfile); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto fail; + } + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto fail; + } + + ret = event->cft->register_event(cgrp, event->cft, + event->eventfd, buffer); + if (ret) + goto fail; + + if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + ret = 0; + goto fail; + } + + /* + * Events should be removed after rmdir of cgroup directory, but before + * destroying subsystem state objects. Let's take reference to cgroup + * directory dentry to do that. 
+ */ + dget(cgrp->dentry); + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fput(cfile); + fput(efile); + + return 0; + +fail: + if (cfile) + fput(cfile); + + if (event && event->eventfd && !IS_ERR(event->eventfd)) + eventfd_ctx_put(event->eventfd); + + if (!IS_ERR_OR_NULL(efile)) + fput(efile); + + kfree(event); + + return ret; +} + +static u64 cgroup_clone_children_read(struct cgroup *cgrp, + struct cftype *cft) +{ + return clone_children(cgrp); +} + +static int cgroup_clone_children_write(struct cgroup *cgrp, + struct cftype *cft, + u64 val) +{ + if (val) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + else + clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + return 0; +} + +/* + * for the common functions, 'private' gives the type of file + */ +/* for hysterical raisins, we can't put this on the older files */ +#define CGROUP_FILE_GENERIC_PREFIX "cgroup." +static struct cftype files[] = { + { + .name = "tasks", + .open = cgroup_tasks_open, + .write_u64 = cgroup_tasks_write, + .release = cgroup_pidlist_release, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = CGROUP_FILE_GENERIC_PREFIX "procs", + .open = cgroup_procs_open, + .write_u64 = cgroup_procs_write, + .release = cgroup_pidlist_release, + .mode = S_IRUGO | S_IWUSR, + }, + { + .name = "notify_on_release", + .read_u64 = cgroup_read_notify_on_release, + .write_u64 = cgroup_write_notify_on_release, + }, + { + .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .write_string = cgroup_write_event_control, + .mode = S_IWUGO, + }, + { + .name = "cgroup.clone_children", + .read_u64 = cgroup_clone_children_read, + .write_u64 = cgroup_clone_children_write, + }, +}; + +static struct cftype cft_release_agent = { + .name = "release_agent", + .read_seq_string = cgroup_release_agent_show, + .write_string = cgroup_release_agent_write, + .max_write_len = PATH_MAX, +}; + +static int cgroup_populate_dir(struct cgroup *cgrp) +{ + int err; + struct 
cgroup_subsys *ss; + + /* First clear out any existing files */ + cgroup_clear_directory(cgrp->dentry); + + err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files)); + if (err < 0) + return err; + + if (cgrp == cgrp->top_cgroup) { + if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0) + return err; + } + + for_each_subsys(cgrp->root, ss) { + if (ss->populate && (err = ss->populate(ss, cgrp)) < 0) + return err; + } + /* This cgroup is ready now */ + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + /* + * Update id->css pointer and make this css visible from + * CSS ID functions. This pointer will be dereferened + * from RCU-read-side without locks. + */ + if (css->id) + rcu_assign_pointer(css->id->css, css); + } + + return 0; +} + +static void init_cgroup_css(struct cgroup_subsys_state *css, + struct cgroup_subsys *ss, + struct cgroup *cgrp) +{ + css->cgroup = cgrp; + atomic_set(&css->refcnt, 1); + css->flags = 0; + css->id = NULL; + if (cgrp == dummytop) + set_bit(CSS_ROOT, &css->flags); + BUG_ON(cgrp->subsys[ss->subsys_id]); + cgrp->subsys[ss->subsys_id] = css; +} + +static void cgroup_lock_hierarchy(struct cgroupfs_root *root) +{ + /* We need to take each hierarchy_mutex in a consistent order */ + int i; + + /* + * No worry about a race with rebind_subsystems that might mess up the + * locking order, since both parties are under cgroup_mutex. 
+ */ + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (ss->root == root) + mutex_lock(&ss->hierarchy_mutex); + } +} + +static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) +{ + int i; + + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + if (ss->root == root) + mutex_unlock(&ss->hierarchy_mutex); + } +} + +/* + * cgroup_create - create a cgroup + * @parent: cgroup that will be parent of the new cgroup + * @dentry: dentry of the new cgroup + * @mode: mode to set on new inode + * + * Must be called with the mutex on the parent inode held + */ +static long cgroup_create(struct cgroup *parent, struct dentry *dentry, + mode_t mode) +{ + struct cgroup *cgrp; + struct cgroupfs_root *root = parent->root; + int err = 0; + struct cgroup_subsys *ss; + struct super_block *sb = root->sb; + + cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + if (!cgrp) + return -ENOMEM; + + /* Grab a reference on the superblock so the hierarchy doesn't + * get deleted on unmount if there are child cgroups. This + * can be done outside cgroup_mutex, since the sb can't + * disappear while someone has an open control file on the + * fs */ + atomic_inc(&sb->s_active); + + mutex_lock(&cgroup_mutex); + + init_cgroup_housekeeping(cgrp); + + cgrp->parent = parent; + cgrp->root = parent->root; + cgrp->top_cgroup = parent->top_cgroup; + + if (notify_on_release(parent)) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + + if (clone_children(parent)) + set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags); + + for_each_subsys(root, ss) { + struct cgroup_subsys_state *css = ss->create(ss, cgrp); + + if (IS_ERR(css)) { + err = PTR_ERR(css); + goto err_destroy; + } + init_cgroup_css(css, ss, cgrp); + if (ss->use_id) { + err = alloc_css_id(ss, parent, cgrp); + if (err) + goto err_destroy; + } + /* At error, ->destroy() callback has to free assigned ID. 
*/ + if (clone_children(parent) && ss->post_clone) + ss->post_clone(ss, cgrp); + } + + cgroup_lock_hierarchy(root); + list_add(&cgrp->sibling, &cgrp->parent->children); + cgroup_unlock_hierarchy(root); + root->number_of_cgroups++; + + err = cgroup_create_dir(cgrp, dentry, mode); + if (err < 0) + goto err_remove; + + set_bit(CGRP_RELEASABLE, &parent->flags); + + /* The cgroup directory was pre-locked for us */ + BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); + + err = cgroup_populate_dir(cgrp); + /* If err < 0, we have a half-filled directory - oh well ;) */ + + mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); + + return 0; + + err_remove: + + cgroup_lock_hierarchy(root); + list_del(&cgrp->sibling); + cgroup_unlock_hierarchy(root); + root->number_of_cgroups--; + + err_destroy: + + for_each_subsys(root, ss) { + if (cgrp->subsys[ss->subsys_id]) + ss->destroy(ss, cgrp); + } + + mutex_unlock(&cgroup_mutex); + + /* Release the reference count that we took on the superblock */ + deactivate_super(sb); + + kfree(cgrp); + return err; +} + +static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct cgroup *c_parent = dentry->d_parent->d_fsdata; + + /* the vfs holds inode->i_mutex already */ + return cgroup_create(c_parent, dentry, mode | S_IFDIR); +} + +static int cgroup_has_css_refs(struct cgroup *cgrp) +{ + /* Check the reference count on each subsystem. Since we + * already established that there are no tasks in the + * cgroup, if the css refcount is also 1, then there should + * be no outstanding references, so the subsystem is safe to + * destroy. 
We scan across all subsystems rather than using + * the per-hierarchy linked list of mounted subsystems since + * we can be called via check_for_release() with no + * synchronization other than RCU, and the subsystem linked + * list isn't RCU-safe */ + int i; + /* + * We won't need to lock the subsys array, because the subsystems + * we're concerned about aren't going anywhere since our cgroup root + * has a reference on them. + */ + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + struct cgroup_subsys_state *css; + /* Skip subsystems not present or not in this hierarchy */ + if (ss == NULL || ss->root != cgrp->root) + continue; + css = cgrp->subsys[ss->subsys_id]; + /* When called from check_for_release() it's possible + * that by this point the cgroup has been removed + * and the css deleted. But a false-positive doesn't + * matter, since it can only happen if the cgroup + * has been deleted and hence no longer needs the + * release agent to be called anyway. */ + if (css && (atomic_read(&css->refcnt) > 1)) + return 1; + } + return 0; +} + +/* + * Atomically mark all (or else none) of the cgroup's CSS objects as + * CSS_REMOVED. Return true on success, or false if the cgroup has + * busy subsystems. Call with cgroup_mutex held + */ + +static int cgroup_clear_css_refs(struct cgroup *cgrp) +{ + struct cgroup_subsys *ss; + unsigned long flags; + bool failed = false; + local_irq_save(flags); + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + int refcnt; + while (1) { + /* We can only remove a CSS with a refcnt==1 */ + refcnt = atomic_read(&css->refcnt); + if (refcnt > 1) { + failed = true; + goto done; + } + BUG_ON(!refcnt); + /* + * Drop the refcnt to 0 while we check other + * subsystems. 
This will cause any racing + * css_tryget() to spin until we set the + * CSS_REMOVED bits or abort + */ + if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt) + break; + cpu_relax(); + } + } + done: + for_each_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (failed) { + /* + * Restore old refcnt if we previously managed + * to clear it from 1 to 0 + */ + if (!atomic_read(&css->refcnt)) + atomic_set(&css->refcnt, 1); + } else { + /* Commit the fact that the CSS is removed */ + set_bit(CSS_REMOVED, &css->flags); + } + } + local_irq_restore(flags); + return !failed; +} + +/* checks if all of the css_sets attached to a cgroup have a refcount of 0. + * Must be called with css_set_lock held */ +static int cgroup_css_sets_empty(struct cgroup *cgrp) +{ + struct cg_cgroup_link *link; + + list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + if (atomic_read(&cg->refcount) > 0) + return 0; + } + + return 1; +} + +static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) +{ + struct cgroup *cgrp = dentry->d_fsdata; + struct dentry *d; + struct cgroup *parent; + DEFINE_WAIT(wait); + struct cgroup_event *event, *tmp; + int ret; + + /* the vfs holds both inode->i_mutex already */ +again: + mutex_lock(&cgroup_mutex); + if (!cgroup_css_sets_empty(cgrp)) { + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + if (!list_empty(&cgrp->children)) { + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + mutex_unlock(&cgroup_mutex); + + /* + * In general, subsystem has no css->refcnt after pre_destroy(). But + * in racy cases, subsystem may have to get css->refcnt after + * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes + * make rmdir return -EBUSY too often. To avoid that, we use waitqueue + * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir + * and subsystem's reference count handling. 
Please see css_get/put + * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + + /* + * Call pre_destroy handlers of subsys. Notify subsystems + * that rmdir() request comes. + */ + ret = cgroup_call_pre_destroy(cgrp); + if (ret) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + return ret; + } + + mutex_lock(&cgroup_mutex); + parent = cgrp->parent; + if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); + if (!cgroup_clear_css_refs(cgrp)) { + mutex_unlock(&cgroup_mutex); + /* + * Because someone may call cgroup_wakeup_rmdir_waiter() before + * prepare_to_wait(), we need to check this flag. + */ + if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) + schedule(); + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + if (signal_pending(current)) + return -EINTR; + goto again; + } + /* NO css_tryget() can success after here. */ + finish_wait(&cgroup_rmdir_waitq, &wait); + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + + spin_lock(&release_list_lock); + set_bit(CGRP_REMOVED, &cgrp->flags); + if (!list_empty(&cgrp->release_list)) + list_del_init(&cgrp->release_list); + spin_unlock(&release_list_lock); + + cgroup_lock_hierarchy(cgrp->root); + /* delete this cgroup from parent->children */ + list_del_init(&cgrp->sibling); + cgroup_unlock_hierarchy(cgrp->root); + + d = dget(cgrp->dentry); + + cgroup_d_remove_dir(d); + dput(d); + + check_for_release(parent); + + /* + * Unregister events and notify userspace. 
+ * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del(&event->list); + remove_wait_queue(event->wqh, &event->wait); + eventfd_signal(event->eventfd, 1); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + + mutex_unlock(&cgroup_mutex); + return 0; +} + +static void __init cgroup_init_subsys(struct cgroup_subsys *ss) +{ + struct cgroup_subsys_state *css; + + printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + + /* Create the top cgroup state for this subsystem */ + list_add(&ss->sibling, &rootnode.subsys_list); + ss->root = &rootnode; + css = ss->create(ss, dummytop); + /* We don't handle early failures gracefully */ + BUG_ON(IS_ERR(css)); + init_cgroup_css(css, ss, dummytop); + + /* Update the init_css_set to contain a subsys + * pointer to this state - since the subsystem is + * newly registered, all tasks and hence the + * init_css_set is in the subsystem's top cgroup. */ + init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id]; + + need_forkexit_callback |= ss->fork || ss->exit; + + /* At system boot, before all subsystems have been + * registered, no tasks have been forked, so we don't + * need to invoke fork callbacks here. */ + BUG_ON(!list_empty(&init_task.tasks)); + + mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); + ss->active = 1; + + /* this function shouldn't be used with modular subsystems, since they + * need to register a subsys_id, among other things */ + BUG_ON(ss->module); +} + +/** + * cgroup_load_subsys: load and register a modular subsystem at runtime + * @ss: the subsystem to load + * + * This function should be called in a modular subsystem's initcall. 
If the + * subsystem is built as a module, it will be assigned a new subsys_id and set + * up for use. If the subsystem is built-in anyway, work is delegated to the + * simpler cgroup_init_subsys. + */ +int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) +{ + int i; + struct cgroup_subsys_state *css; + + /* check name and function validity */ + if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || + ss->create == NULL || ss->destroy == NULL) + return -EINVAL; + + /* + * we don't support callbacks in modular subsystems. this check is + * before the ss->module check for consistency; a subsystem that could + * be a module should still have no callbacks even if the user isn't + * compiling it as one. + */ + if (ss->fork || ss->exit) + return -EINVAL; + + /* + * an optionally modular subsystem is built-in: we want to do nothing, + * since cgroup_init_subsys will have already taken care of it. + */ + if (ss->module == NULL) { + /* a few sanity checks */ + BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + BUG_ON(subsys[ss->subsys_id] != ss); + return 0; + } + + /* + * need to register a subsys id before anything else - for example, + * init_cgroup_css needs it. + */ + mutex_lock(&cgroup_mutex); + /* find the first empty slot in the array */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + if (subsys[i] == NULL) + break; + } + if (i == CGROUP_SUBSYS_COUNT) { + /* maximum number of subsystems already registered! */ + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + /* assign ourselves the subsys_id */ + ss->subsys_id = i; + subsys[i] = ss; + + /* + * no ss->create seems to need anything important in the ss struct, so + * this can happen first (i.e. before the rootnode attachment). + */ + css = ss->create(ss, dummytop); + if (IS_ERR(css)) { + /* failure case - need to deassign the subsys[] slot. 
*/ + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return PTR_ERR(css); + } + + list_add(&ss->sibling, &rootnode.subsys_list); + ss->root = &rootnode; + + /* our new subsystem will be attached to the dummy hierarchy. */ + init_cgroup_css(css, ss, dummytop); + /* init_idr must be after init_cgroup_css because it sets css->id. */ + if (ss->use_id) { + int ret = cgroup_init_idr(ss, css); + if (ret) { + dummytop->subsys[ss->subsys_id] = NULL; + ss->destroy(ss, dummytop); + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return ret; + } + } + + /* + * Now we need to entangle the css into the existing css_sets. unlike + * in cgroup_init_subsys, there are now multiple css_sets, so each one + * will need a new pointer to it; done by iterating the css_set_table. + * furthermore, modifying the existing css_sets will corrupt the hash + * table state, so each changed css_set will need its hash recomputed. + * this is all done under the css_set_lock. + */ + write_lock(&css_set_lock); + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { + struct css_set *cg; + struct hlist_node *node, *tmp; + struct hlist_head *bucket = &css_set_table[i], *new_bucket; + + hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { + /* skip entries that we already rehashed */ + if (cg->subsys[ss->subsys_id]) + continue; + /* remove existing entry */ + hlist_del(&cg->hlist); + /* set new value */ + cg->subsys[ss->subsys_id] = css; + /* recompute hash and restore entry */ + new_bucket = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, new_bucket); + } + } + write_unlock(&css_set_lock); + + mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); + ss->active = 1; + + /* success! */ + mutex_unlock(&cgroup_mutex); + return 0; +} +EXPORT_SYMBOL_GPL(cgroup_load_subsys); + +/** + * cgroup_unload_subsys: unload a modular subsystem + * @ss: the subsystem to unload + * + * This function should be called in a modular subsystem's exitcall. 
When this + * function is invoked, the refcount on the subsystem's module will be 0, so + * the subsystem will not be attached to any hierarchy. + */ +void cgroup_unload_subsys(struct cgroup_subsys *ss) +{ + struct cg_cgroup_link *link; + struct hlist_head *hhead; + + BUG_ON(ss->module == NULL); + + /* + * we shouldn't be called if the subsystem is in use, and the use of + * try_module_get in parse_cgroupfs_options should ensure that it + * doesn't start being used while we're killing it off. + */ + BUG_ON(ss->root != &rootnode); + + mutex_lock(&cgroup_mutex); + /* deassign the subsys_id */ + BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); + subsys[ss->subsys_id] = NULL; + + /* remove subsystem from rootnode's list of subsystems */ + list_del_init(&ss->sibling); + + /* + * disentangle the css from all css_sets attached to the dummytop. as + * in loading, we need to pay our respects to the hashtable gods. + */ + write_lock(&css_set_lock); + list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + + hlist_del(&cg->hlist); + BUG_ON(!cg->subsys[ss->subsys_id]); + cg->subsys[ss->subsys_id] = NULL; + hhead = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, hhead); + } + write_unlock(&css_set_lock); + + /* + * remove subsystem's css from the dummytop and free it - need to free + * before marking as null because ss->destroy needs the cgrp->subsys + * pointer to find their state. note that this also takes care of + * freeing the css_id. + */ + ss->destroy(ss, dummytop); + dummytop->subsys[ss->subsys_id] = NULL; + + mutex_unlock(&cgroup_mutex); +} +EXPORT_SYMBOL_GPL(cgroup_unload_subsys); + +/** + * cgroup_init_early - cgroup initialization at system boot + * + * Initialize cgroups at system boot, and initialize any + * subsystems that request early init. 
+ */ +int __init cgroup_init_early(void) +{ + int i; + atomic_set(&init_css_set.refcount, 1); + INIT_LIST_HEAD(&init_css_set.cg_links); + INIT_LIST_HEAD(&init_css_set.tasks); + INIT_HLIST_NODE(&init_css_set.hlist); + css_set_count = 1; + init_cgroup_root(&rootnode); + root_count = 1; + init_task.cgroups = &init_css_set; + + init_css_set_link.cg = &init_css_set; + init_css_set_link.cgrp = dummytop; + list_add(&init_css_set_link.cgrp_link_list, + &rootnode.top_cgroup.css_sets); + list_add(&init_css_set_link.cg_link_list, + &init_css_set.cg_links); + + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) + INIT_HLIST_HEAD(&css_set_table[i]); + + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + BUG_ON(!ss->name); + BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); + BUG_ON(!ss->create); + BUG_ON(!ss->destroy); + if (ss->subsys_id != i) { + printk(KERN_ERR "cgroup: Subsys %s id == %d\n", + ss->name, ss->subsys_id); + BUG(); + } + + if (ss->early_init) + cgroup_init_subsys(ss); + } + return 0; +} + +/** + * cgroup_init - cgroup initialization + * + * Register cgroup filesystem and /proc file, and initialize + * any subsystems that didn't request early init. 
+ */ +int __init cgroup_init(void) +{ + int err; + int i; + struct hlist_head *hhead; + + err = bdi_init(&cgroup_backing_dev_info); + if (err) + return err; + + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (!ss->early_init) + cgroup_init_subsys(ss); + if (ss->use_id) + cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); + } + + /* Add init_css_set to the hash table */ + hhead = css_set_hash(init_css_set.subsys); + hlist_add_head(&init_css_set.hlist, hhead); + BUG_ON(!init_root_id(&rootnode)); + + cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); + if (!cgroup_kobj) { + err = -ENOMEM; + goto out; + } + + err = register_filesystem(&cgroup_fs_type); + if (err < 0) { + kobject_put(cgroup_kobj); + goto out; + } + + proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); + +out: + if (err) + bdi_destroy(&cgroup_backing_dev_info); + + return err; +} + +/* + * proc_cgroup_show() + * - Print task's cgroup paths into seq_file, one line for each hierarchy + * - Used for /proc//cgroup. + * - No need to task_lock(tsk) on this tsk->cgroup reference, as it + * doesn't really matter if tsk->cgroup changes after we read it, + * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it + * anyway. No need to check that tsk->cgroup != NULL, thanks to + * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks + * cgroup to top_cgroup. 
+ */ + +/* TODO: Use a proper seq_file iterator */ +static int proc_cgroup_show(struct seq_file *m, void *v) +{ + struct pid *pid; + struct task_struct *tsk; + char *buf; + int retval; + struct cgroupfs_root *root; + + retval = -ENOMEM; + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + goto out; + + retval = -ESRCH; + pid = m->private; + tsk = get_pid_task(pid, PIDTYPE_PID); + if (!tsk) + goto out_free; + + retval = 0; + + mutex_lock(&cgroup_mutex); + + for_each_active_root(root) { + struct cgroup_subsys *ss; + struct cgroup *cgrp; + int count = 0; + + seq_printf(m, "%d:", root->hierarchy_id); + for_each_subsys(root, ss) + seq_printf(m, "%s%s", count++ ? "," : "", ss->name); + if (strlen(root->name)) + seq_printf(m, "%sname=%s", count ? "," : "", + root->name); + seq_putc(m, ':'); + cgrp = task_cgroup_from_root(tsk, root); + retval = cgroup_path(cgrp, buf, PAGE_SIZE); + if (retval < 0) + goto out_unlock; + seq_puts(m, buf); + seq_putc(m, '\n'); + } + +out_unlock: + mutex_unlock(&cgroup_mutex); + put_task_struct(tsk); +out_free: + kfree(buf); +out: + return retval; +} + +static int cgroup_open(struct inode *inode, struct file *file) +{ + struct pid *pid = PROC_I(inode)->pid; + return single_open(file, proc_cgroup_show, pid); +} + +const struct file_operations proc_cgroup_operations = { + .open = cgroup_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* Display information about each subsystem and each hierarchy */ +static int proc_cgroupstats_show(struct seq_file *m, void *v) +{ + int i; + + seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. 
+ */ + mutex_lock(&cgroup_mutex); + for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; + seq_printf(m, "%s\t%d\t%d\t%d\n", + ss->name, ss->root->hierarchy_id, + ss->root->number_of_cgroups, !ss->disabled); + } + mutex_unlock(&cgroup_mutex); + return 0; +} + +static int cgroupstats_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_cgroupstats_show, NULL); +} + +static const struct file_operations proc_cgroupstats_operations = { + .open = cgroupstats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/** + * cgroup_fork - attach newly forked task to its parents cgroup. + * @child: pointer to task_struct of forking parent process. + * + * Description: A task inherits its parent's cgroup at fork(). + * + * A pointer to the shared css_set was automatically copied in + * fork.c by dup_task_struct(). However, we ignore that copy, since + * it was not made under the protection of RCU or cgroup_mutex, so + * might no longer be a valid cgroup pointer. cgroup_attach_task() might + * have already changed current->cgroups, allowing the previously + * referenced cgroup group to be removed and freed. + * + * At the point that cgroup_fork() is called, 'current' is the parent + * task, and the passed argument 'child' points to the child task. + */ +void cgroup_fork(struct task_struct *child) +{ + task_lock(current); + child->cgroups = current->cgroups; + get_css_set(child->cgroups); + task_unlock(current); + INIT_LIST_HEAD(&child->cg_list); +} + +/** + * cgroup_fork_callbacks - run fork callbacks + * @child: the new task + * + * Called on a new task very soon before adding it to the + * tasklist. No need to take any locks since no-one can + * be operating on this task. 
+ */ +void cgroup_fork_callbacks(struct task_struct *child) +{ + if (need_forkexit_callback) { + int i; + /* + * forkexit callbacks are only supported for builtin + * subsystems, and the builtin section of the subsys array is + * immutable, so we don't need to lock the subsys array here. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->fork) + ss->fork(ss, child); + } + } +} + +/** + * cgroup_post_fork - called on a new task after adding it to the task list + * @child: the task in question + * + * Adds the task to the list running through its css_set if necessary. + * Has to be after the task is visible on the task list in case we race + * with the first call to cgroup_iter_start() - to guarantee that the + * new task ends up on its list. + */ +void cgroup_post_fork(struct task_struct *child) +{ + if (use_task_css_set_links) { + write_lock(&css_set_lock); + task_lock(child); + if (list_empty(&child->cg_list)) + list_add(&child->cg_list, &child->cgroups->tasks); + task_unlock(child); + write_unlock(&css_set_lock); + } +} +/** + * cgroup_exit - detach cgroup from exiting task + * @tsk: pointer to task_struct of exiting process + * @run_callback: run exit callbacks? + * + * Description: Detach cgroup from @tsk and release it. + * + * Note that cgroups marked notify_on_release force every task in + * them to take the global cgroup_mutex mutex when exiting. + * This could impact scaling on very large systems. Be reluctant to + * use notify_on_release cgroups where very high task exit scaling + * is required on large systems. + * + * the_top_cgroup_hack: + * + * Set the exiting tasks cgroup to the root cgroup (top_cgroup). + * + * We call cgroup_exit() while the task is still competent to + * handle notify_on_release(), then leave the task attached to the + * root cgroup in each hierarchy for the remainder of its exit. 
+ * + * To do this properly, we would increment the reference count on + * top_cgroup, and near the very end of the kernel/exit.c do_exit() + * code we would add a second cgroup function call, to drop that + * reference. This would just create an unnecessary hot spot on + * the top_cgroup reference count, to no avail. + * + * Normally, holding a reference to a cgroup without bumping its + * count is unsafe. The cgroup could go away, or someone could + * attach us to a different cgroup, decrementing the count on + * the first cgroup that we never incremented. But in this case, + * top_cgroup isn't going away, and either task has PF_EXITING set, + * which wards off any cgroup_attach_task() attempts, or task is a failed + * fork, never visible to cgroup_attach_task. + */ +void cgroup_exit(struct task_struct *tsk, int run_callbacks) +{ + struct css_set *cg; + int i; + + /* + * Unlink from the css_set task list if necessary. + * Optimistically check cg_list before taking + * css_set_lock + */ + if (!list_empty(&tsk->cg_list)) { + write_lock(&css_set_lock); + if (!list_empty(&tsk->cg_list)) + list_del_init(&tsk->cg_list); + write_unlock(&css_set_lock); + } + + /* Reassign the task to the init_css_set. 
*/ + task_lock(tsk); + cg = tsk->cgroups; + tsk->cgroups = &init_css_set; + + if (run_callbacks && need_forkexit_callback) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + if (ss->exit) { + struct cgroup *old_cgrp = + rcu_dereference_raw(cg->subsys[i])->cgroup; + struct cgroup *cgrp = task_cgroup(tsk, i); + ss->exit(ss, cgrp, old_cgrp, tsk); + } + } + } + task_unlock(tsk); + + if (cg) + put_css_set(cg); +} + +/** + * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp + * @cgrp: the cgroup in question + * @task: the task in question + * + * See if @cgrp is a descendant of @task's cgroup in the appropriate + * hierarchy. + * + * If we are sending in dummytop, then presumably we are creating + * the top cgroup in the subsystem. + * + * Called only by the ns (nsproxy) cgroup. + */ +int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) +{ + int ret; + struct cgroup *target; + + if (cgrp == dummytop) + return 1; + + target = task_cgroup_from_root(task, cgrp->root); + while (cgrp != target && cgrp!= cgrp->top_cgroup) + cgrp = cgrp->parent; + ret = (cgrp == target); + return ret; +} + +static void check_for_release(struct cgroup *cgrp) +{ + /* All of these checks rely on RCU to keep the cgroup + * structure alive */ + if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) + && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { + /* Control Group is currently removeable. 
If it's not + * already queued for a userspace notification, queue + * it now */ + int need_schedule_work = 0; + spin_lock(&release_list_lock); + if (!cgroup_is_removed(cgrp) && + list_empty(&cgrp->release_list)) { + list_add(&cgrp->release_list, &release_list); + need_schedule_work = 1; + } + spin_unlock(&release_list_lock); + if (need_schedule_work) + schedule_work(&release_agent_work); + } +} + +/* Caller must verify that the css is not for root cgroup */ +void __css_get(struct cgroup_subsys_state *css, int count) +{ + atomic_add(count, &css->refcnt); + set_bit(CGRP_RELEASABLE, &css->cgroup->flags); +} +EXPORT_SYMBOL_GPL(__css_get); + +/* Caller must verify that the css is not for root cgroup */ +void __css_put(struct cgroup_subsys_state *css, int count) +{ + struct cgroup *cgrp = css->cgroup; + int val; + rcu_read_lock(); + val = atomic_sub_return(count, &css->refcnt); + if (val == 1) { + check_for_release(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); + } + rcu_read_unlock(); + WARN_ON_ONCE(val < 1); +} +EXPORT_SYMBOL_GPL(__css_put); + +/* + * Notify userspace when a cgroup is released, by running the + * configured release agent with the name of the cgroup (path + * relative to the root of cgroup file system) as the argument. + * + * Most likely, this user command will try to rmdir this cgroup. + * + * This races with the possibility that some other task will be + * attached to this cgroup before it is removed, or that some other + * user task will 'mkdir' a child cgroup of this cgroup. That's ok. + * The presumed 'rmdir' will fail quietly if this cgroup is no longer + * unused, and this cgroup will be reprieved from its death sentence, + * to continue to serve a useful existence. Next time it's released, + * we will get notified again, if it still has 'notify_on_release' set. + * + * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which + * means only wait until the task is successfully execve()'d. 
The + * separate release agent task is forked by call_usermodehelper(), + * then control in this thread returns here, without waiting for the + * release agent task. We don't bother to wait because the caller of + * this routine has no use for the exit status of the release agent + * task, so no sense holding our caller up for that. + */ +static void cgroup_release_agent(struct work_struct *work) +{ + BUG_ON(work != &release_agent_work); + mutex_lock(&cgroup_mutex); + spin_lock(&release_list_lock); + while (!list_empty(&release_list)) { + char *argv[3], *envp[3]; + int i; + char *pathbuf = NULL, *agentbuf = NULL; + struct cgroup *cgrp = list_entry(release_list.next, + struct cgroup, + release_list); + list_del_init(&cgrp->release_list); + spin_unlock(&release_list_lock); + pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!pathbuf) + goto continue_free; + if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) + goto continue_free; + agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); + if (!agentbuf) + goto continue_free; + + i = 0; + argv[i++] = agentbuf; + argv[i++] = pathbuf; + argv[i] = NULL; + + i = 0; + /* minimal command environment */ + envp[i++] = "HOME=/"; + envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; + envp[i] = NULL; + + /* Drop the lock while we invoke the usermode helper, + * since the exec could involve hitting disk and hence + * be a slow process */ + mutex_unlock(&cgroup_mutex); + call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + mutex_lock(&cgroup_mutex); + continue_free: + kfree(pathbuf); + kfree(agentbuf); + spin_lock(&release_list_lock); + } + spin_unlock(&release_list_lock); + mutex_unlock(&cgroup_mutex); +} + +static int __init cgroup_disable(char *str) +{ + int i; + char *token; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + /* + * cgroup_disable, being at boot time, can't know about module + * subsystems, so we don't worry about them. 
+ */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { + struct cgroup_subsys *ss = subsys[i]; + + if (!strcmp(token, ss->name)) { + ss->disabled = 1; + printk(KERN_INFO "Disabling %s control group" + " subsystem\n", ss->name); + break; + } + } + } + return 1; +} +__setup("cgroup_disable=", cgroup_disable); + +/* + * Functons for CSS ID. + */ + +/* + *To get ID other than 0, this should be called when !cgroup_is_removed(). + */ +unsigned short css_id(struct cgroup_subsys_state *css) +{ + struct css_id *cssid; + + /* + * This css_id() can return correct value when somone has refcnt + * on this or this is under rcu_read_lock(). Once css->id is allocated, + * it's unchanged until freed. + */ + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); + + if (cssid) + return cssid->id; + return 0; +} +EXPORT_SYMBOL_GPL(css_id); + +unsigned short css_depth(struct cgroup_subsys_state *css) +{ + struct css_id *cssid; + + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); + + if (cssid) + return cssid->depth; + return 0; +} +EXPORT_SYMBOL_GPL(css_depth); + +/** + * css_is_ancestor - test "root" css is an ancestor of "child" + * @child: the css to be tested. + * @root: the css supporsed to be an ancestor of the child. + * + * Returns true if "root" is an ancestor of "child" in its hierarchy. Because + * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * But, considering usual usage, the csses should be valid objects after test. + * Assuming that the caller will do some action to the child if this returns + * returns true, the caller must take "child";s reference count. + * If "child" is valid object and this returns true, "root" is valid, too. 
+ */ + +bool css_is_ancestor(struct cgroup_subsys_state *child, + const struct cgroup_subsys_state *root) +{ + struct css_id *child_id; + struct css_id *root_id; + bool ret = true; + + rcu_read_lock(); + child_id = rcu_dereference(child->id); + root_id = rcu_dereference(root->id); + if (!child_id + || !root_id + || (child_id->depth < root_id->depth) + || (child_id->stack[root_id->depth] != root_id->id)) + ret = false; + rcu_read_unlock(); + return ret; +} + +void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) +{ + struct css_id *id = css->id; + /* When this is called before css_id initialization, id can be NULL */ + if (!id) + return; + + BUG_ON(!ss->use_id); + + rcu_assign_pointer(id->css, NULL); + rcu_assign_pointer(css->id, NULL); + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, id->id); + spin_unlock(&ss->id_lock); + kfree_rcu(id, rcu_head); +} +EXPORT_SYMBOL_GPL(free_css_id); + +/* + * This is called by init or create(). Then, calls to this function are + * always serialized (By cgroup_mutex() at create()). + */ + +static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) +{ + struct css_id *newid; + int myid, error, size; + + BUG_ON(!ss->use_id); + + size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); + newid = kzalloc(size, GFP_KERNEL); + if (!newid) + return ERR_PTR(-ENOMEM); + /* get id */ + if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) { + error = -ENOMEM; + goto err_out; + } + spin_lock(&ss->id_lock); + /* Don't use 0. 
allocates an ID of 1-65535 */ + error = idr_get_new_above(&ss->idr, newid, 1, &myid); + spin_unlock(&ss->id_lock); + + /* Returns error when there are no free spaces for new ID.*/ + if (error) { + error = -ENOSPC; + goto err_out; + } + if (myid > CSS_ID_MAX) + goto remove_idr; + + newid->id = myid; + newid->depth = depth; + return newid; +remove_idr: + error = -ENOSPC; + spin_lock(&ss->id_lock); + idr_remove(&ss->idr, myid); + spin_unlock(&ss->id_lock); +err_out: + kfree(newid); + return ERR_PTR(error); + +} + +static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *rootcss) +{ + struct css_id *newid; + + spin_lock_init(&ss->id_lock); + idr_init(&ss->idr); + + newid = get_new_cssid(ss, 0); + if (IS_ERR(newid)) + return PTR_ERR(newid); + + newid->stack[0] = newid->id; + newid->css = rootcss; + rootcss->id = newid; + return 0; +} + +static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, + struct cgroup *child) +{ + int subsys_id, i, depth = 0; + struct cgroup_subsys_state *parent_css, *child_css; + struct css_id *child_id, *parent_id; + + subsys_id = ss->subsys_id; + parent_css = parent->subsys[subsys_id]; + child_css = child->subsys[subsys_id]; + parent_id = parent_css->id; + depth = parent_id->depth + 1; + + child_id = get_new_cssid(ss, depth); + if (IS_ERR(child_id)) + return PTR_ERR(child_id); + + for (i = 0; i < depth; i++) + child_id->stack[i] = parent_id->stack[i]; + child_id->stack[depth] = child_id->id; + /* + * child_id->css pointer will be set after this cgroup is available + * see cgroup_populate_dir() + */ + rcu_assign_pointer(child_css->id, child_id); + + return 0; +} + +/** + * css_lookup - lookup css by id + * @ss: cgroup subsys to be looked into. + * @id: the id + * + * Returns pointer to cgroup_subsys_state if there is valid one with id. + * NULL if not. 
Should be called under rcu_read_lock() + */ +struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) +{ + struct css_id *cssid = NULL; + + BUG_ON(!ss->use_id); + cssid = idr_find(&ss->idr, id); + + if (unlikely(!cssid)) + return NULL; + + return rcu_dereference(cssid->css); +} +EXPORT_SYMBOL_GPL(css_lookup); + +/** + * css_get_next - lookup next cgroup under specified hierarchy. + * @ss: pointer to subsystem + * @id: current position of iteration. + * @root: pointer to css. search tree under this. + * @foundid: position of found object. + * + * Search next css under the specified hierarchy of rootid. Calling under + * rcu_read_lock() is necessary. Returns NULL if it reaches the end. + */ +struct cgroup_subsys_state * +css_get_next(struct cgroup_subsys *ss, int id, + struct cgroup_subsys_state *root, int *foundid) +{ + struct cgroup_subsys_state *ret = NULL; + struct css_id *tmp; + int tmpid; + int rootid = css_id(root); + int depth = css_depth(root); + + if (!rootid) + return NULL; + + BUG_ON(!ss->use_id); + /* fill start point for scan */ + tmpid = id; + while (1) { + /* + * scan next entry from bitmap(tree), tmpid is updated after + * idr_get_next(). 
+ */ + spin_lock(&ss->id_lock); + tmp = idr_get_next(&ss->idr, &tmpid); + spin_unlock(&ss->id_lock); + + if (!tmp) + break; + if (tmp->depth >= depth && tmp->stack[depth] == rootid) { + ret = rcu_dereference(tmp->css); + if (ret) { + *foundid = tmpid; + break; + } + } + /* continue to scan from next id */ + tmpid = tmpid + 1; + } + return ret; +} + +/* + * get corresponding css from file open on cgroupfs directory + */ +struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +{ + struct cgroup *cgrp; + struct inode *inode; + struct cgroup_subsys_state *css; + + inode = f->f_dentry->d_inode; + /* check in cgroup filesystem dir */ + if (inode->i_op != &cgroup_dir_inode_operations) + return ERR_PTR(-EBADF); + + if (id < 0 || id >= CGROUP_SUBSYS_COUNT) + return ERR_PTR(-EINVAL); + + /* get cgroup */ + cgrp = __d_cgrp(f->f_dentry); + css = cgrp->subsys[id]; + return css ? css : ERR_PTR(-ENOENT); +} + +#ifdef CONFIG_CGROUP_DEBUG +static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss, + struct cgroup *cont) +{ + struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); + + if (!css) + return ERR_PTR(-ENOMEM); + + return css; +} + +static void debug_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + kfree(cont->subsys[debug_subsys_id]); +} + +static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) +{ + return atomic_read(&cont->count); +} + +static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) +{ + return cgroup_task_count(cont); +} + +static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) +{ + return (u64)(unsigned long)current->cgroups; +} + +static u64 current_css_set_refcount_read(struct cgroup *cont, + struct cftype *cft) +{ + u64 count; + + rcu_read_lock(); + count = atomic_read(&current->cgroups->refcount); + rcu_read_unlock(); + return count; +} + +static int current_css_set_cg_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + 
struct cg_cgroup_link *link; + struct css_set *cg; + + read_lock(&css_set_lock); + rcu_read_lock(); + cg = rcu_dereference(current->cgroups); + list_for_each_entry(link, &cg->cg_links, cg_link_list) { + struct cgroup *c = link->cgrp; + const char *name; + + if (c->dentry) + name = c->dentry->d_name.name; + else + name = "?"; + seq_printf(seq, "Root %d group %s\n", + c->root->hierarchy_id, name); + } + rcu_read_unlock(); + read_unlock(&css_set_lock); + return 0; +} + +#define MAX_TASKS_SHOWN_PER_CSS 25 +static int cgroup_css_links_read(struct cgroup *cont, + struct cftype *cft, + struct seq_file *seq) +{ + struct cg_cgroup_link *link; + + read_lock(&css_set_lock); + list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + struct task_struct *task; + int count = 0; + seq_printf(seq, "css_set %p\n", cg); + list_for_each_entry(task, &cg->tasks, cg_list) { + if (count++ > MAX_TASKS_SHOWN_PER_CSS) { + seq_puts(seq, " ...\n"); + break; + } else { + seq_printf(seq, " task %d\n", + task_pid_vnr(task)); + } + } + } + read_unlock(&css_set_lock); + return 0; +} + +static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +{ + return test_bit(CGRP_RELEASABLE, &cgrp->flags); +} + +static struct cftype debug_files[] = { + { + .name = "cgroup_refcount", + .read_u64 = cgroup_refcount_read, + }, + { + .name = "taskcount", + .read_u64 = debug_taskcount_read, + }, + + { + .name = "current_css_set", + .read_u64 = current_css_set_read, + }, + + { + .name = "current_css_set_refcount", + .read_u64 = current_css_set_refcount_read, + }, + + { + .name = "current_css_set_cg_links", + .read_seq_string = current_css_set_cg_links_read, + }, + + { + .name = "cgroup_css_links", + .read_seq_string = cgroup_css_links_read, + }, + + { + .name = "releasable", + .read_u64 = releasable_read, + }, +}; + +static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, debug_files, + ARRAY_SIZE(debug_files)); 
+} + +struct cgroup_subsys debug_subsys = { + .name = "debug", + .create = debug_create, + .destroy = debug_destroy, + .populate = debug_populate, + .subsys_id = debug_subsys_id, +}; +#endif /* CONFIG_CGROUP_DEBUG */ diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c new file mode 100644 index 00000000..a3f638ac --- /dev/null +++ b/kernel/cgroup_freezer.c @@ -0,0 +1,397 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> + +enum freezer_state { + CGROUP_THAWED = 0, + CGROUP_FREEZING, + CGROUP_FROZEN, +}; + +struct freezer { + struct cgroup_subsys_state css; + enum freezer_state state; + spinlock_t lock; /* protects _writes_ to state */ +}; + +static inline struct freezer *cgroup_freezer( + struct cgroup *cgroup) +{ + return container_of( + cgroup_subsys_state(cgroup, freezer_subsys_id), + struct freezer, css); +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return container_of(task_subsys_state(task, freezer_subsys_id), + struct freezer, css); +} + +static inline int __cgroup_freezing_or_frozen(struct task_struct *task) +{ + enum freezer_state state = task_freezer(task)->state; + return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN); +} + +int cgroup_freezing_or_frozen(struct task_struct *task) +{ + int result; + task_lock(task); + result = __cgroup_freezing_or_frozen(task); + task_unlock(task); + return result; +} + +/* + * cgroups_write_string() limits the 
size of freezer state strings to + * CGROUP_LOCAL_BUFFER_SIZE + */ +static const char *freezer_state_strs[] = { + "THAWED", + "FREEZING", + "FROZEN", +}; + +/* + * State diagram + * Transitions are caused by userspace writes to the freezer.state file. + * The values in parenthesis are state labels. The rest are edge labels. + * + * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN) + * ^ ^ | | + * | \_______THAWED_______/ | + * \__________________________THAWED____________/ + */ + +struct cgroup_subsys freezer_subsys; + +/* Locks taken and their ordering + * ------------------------------ + * cgroup_mutex (AKA cgroup_lock) + * freezer->lock + * css_set_lock + * task->alloc_lock (AKA task_lock) + * task->sighand->siglock + * + * cgroup code forces css_set_lock to be taken before task->alloc_lock + * + * freezer_create(), freezer_destroy(): + * cgroup_mutex [ by cgroup core ] + * + * freezer_can_attach(): + * cgroup_mutex (held by caller of can_attach) + * + * cgroup_freezing_or_frozen(): + * task->alloc_lock (to get task's cgroup) + * + * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): + * freezer->lock + * sighand->siglock (if the cgroup is freezing) + * + * freezer_read(): + * cgroup_mutex + * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock + * read_lock css_set_lock (cgroup iterator start) + * + * freezer_write() (freeze): + * cgroup_mutex + * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock + * read_lock css_set_lock (cgroup iterator start) + * sighand->siglock (fake signal delivery inside freeze_task()) + * + * freezer_write() (unfreeze): + * cgroup_mutex + * freezer->lock + * write_lock css_set_lock (cgroup iterator start) + * task->alloc_lock + * read_lock css_set_lock (cgroup iterator start) + * task->alloc_lock (inside thaw_process(), prevents race with refrigerator()) + * sighand->siglock + */ +static struct cgroup_subsys_state 
*freezer_create(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + spin_lock_init(&freezer->lock); + freezer->state = CGROUP_THAWED; + return &freezer->css; +} + +static void freezer_destroy(struct cgroup_subsys *ss, + struct cgroup *cgroup) +{ + kfree(cgroup_freezer(cgroup)); +} + +/* task is frozen or will freeze immediately when next it gets woken */ +static bool is_task_frozen_enough(struct task_struct *task) +{ + return frozen(task) || + (task_is_stopped_or_traced(task) && freezing(task)); +} + +/* + * The call to cgroup_lock() in the freezer.state write method prevents + * a write to that file racing against an attach, and hence the + * can_attach() result will remain valid until the attach completes. + */ +static int freezer_can_attach(struct cgroup_subsys *ss, + struct cgroup *new_cgroup, + struct task_struct *task) +{ + struct freezer *freezer; + + /* + * Anything frozen can't move or be moved to/from. + */ + + freezer = cgroup_freezer(new_cgroup); + if (freezer->state != CGROUP_THAWED) + return -EBUSY; + + return 0; +} + +static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ + rcu_read_lock(); + if (__cgroup_freezing_or_frozen(tsk)) { + rcu_read_unlock(); + return -EBUSY; + } + rcu_read_unlock(); + return 0; +} + +static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) +{ + struct freezer *freezer; + + /* + * No lock is needed, since the task isn't on tasklist yet, + * so it can't be moved to another cgroup, which means the + * freezer won't be removed and will be valid during this + * function call. Nevertheless, apply RCU read-side critical + * section to suppress RCU lockdep false positives. + */ + rcu_read_lock(); + freezer = task_freezer(task); + rcu_read_unlock(); + + /* + * The root cgroup is non-freezable, so we can skip the + * following check. 
+ */ + if (!freezer->css.cgroup->parent) + return; + + spin_lock_irq(&freezer->lock); + BUG_ON(freezer->state == CGROUP_FROZEN); + + /* Locking avoids race with FREEZING -> THAWED transitions. */ + if (freezer->state == CGROUP_FREEZING) + freeze_task(task, true); + spin_unlock_irq(&freezer->lock); +} + +/* + * caller must hold freezer->lock + */ +static void update_if_frozen(struct cgroup *cgroup, + struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int nfrozen = 0, ntotal = 0; + enum freezer_state old_state = freezer->state; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + ntotal++; + if (is_task_frozen_enough(task)) + nfrozen++; + } + + if (old_state == CGROUP_THAWED) { + BUG_ON(nfrozen > 0); + } else if (old_state == CGROUP_FREEZING) { + if (nfrozen == ntotal) + freezer->state = CGROUP_FROZEN; + } else { /* old_state == CGROUP_FROZEN */ + BUG_ON(nfrozen != ntotal); + } + + cgroup_iter_end(cgroup, &it); +} + +static int freezer_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct freezer *freezer; + enum freezer_state state; + + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + + freezer = cgroup_freezer(cgroup); + spin_lock_irq(&freezer->lock); + state = freezer->state; + if (state == CGROUP_FREEZING) { + /* We change from FREEZING to FROZEN lazily if the cgroup was + * only partially frozen when we exitted write. 
*/ + update_if_frozen(cgroup, freezer); + state = freezer->state; + } + spin_unlock_irq(&freezer->lock); + cgroup_unlock(); + + seq_puts(m, freezer_state_strs[state]); + seq_putc(m, '\n'); + return 0; +} + +static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + unsigned int num_cant_freeze_now = 0; + + freezer->state = CGROUP_FREEZING; + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + if (!freeze_task(task, true)) + continue; + if (is_task_frozen_enough(task)) + continue; + if (!freezing(task) && !freezer_should_skip(task)) + num_cant_freeze_now++; + } + cgroup_iter_end(cgroup, &it); + + return num_cant_freeze_now ? -EBUSY : 0; +} + +static void unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer) +{ + struct cgroup_iter it; + struct task_struct *task; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + thaw_process(task); + } + cgroup_iter_end(cgroup, &it); + + freezer->state = CGROUP_THAWED; +} + +static int freezer_change_state(struct cgroup *cgroup, + enum freezer_state goal_state) +{ + struct freezer *freezer; + int retval = 0; + + freezer = cgroup_freezer(cgroup); + + spin_lock_irq(&freezer->lock); + + update_if_frozen(cgroup, freezer); + if (goal_state == freezer->state) + goto out; + + switch (goal_state) { + case CGROUP_THAWED: + unfreeze_cgroup(cgroup, freezer); + break; + case CGROUP_FROZEN: + retval = try_to_freeze_cgroup(cgroup, freezer); + break; + default: + BUG(); + } +out: + spin_unlock_irq(&freezer->lock); + + return retval; +} + +static int freezer_write(struct cgroup *cgroup, + struct cftype *cft, + const char *buffer) +{ + int retval; + enum freezer_state goal_state; + + if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0) + goal_state = CGROUP_THAWED; + else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0) + goal_state = CGROUP_FROZEN; + else + return -EINVAL; 
+ + if (!cgroup_lock_live_group(cgroup)) + return -ENODEV; + retval = freezer_change_state(cgroup, goal_state); + cgroup_unlock(); + return retval; +} + +static struct cftype files[] = { + { + .name = "state", + .read_seq_string = freezer_read, + .write_string = freezer_write, + }, +}; + +static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup) +{ + if (!cgroup->parent) + return 0; + return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files)); +} + +struct cgroup_subsys freezer_subsys = { + .name = "freezer", + .create = freezer_create, + .destroy = freezer_destroy, + .populate = freezer_populate, + .subsys_id = freezer_subsys_id, + .can_attach = freezer_can_attach, + .can_attach_task = freezer_can_attach_task, + .pre_attach = NULL, + .attach_task = NULL, + .attach = NULL, + .fork = freezer_fork, + .exit = NULL, +}; diff --git a/kernel/compat.c b/kernel/compat.c new file mode 100644 index 00000000..3507c936 --- /dev/null +++ b/kernel/compat.c @@ -0,0 +1,1193 @@ +/* + * linux/kernel/compat.c + * + * Kernel compatibility routines for e.g. 32 bit syscall support + * on 64 bit kernels. + * + * Copyright (C) 2002-2003 Stephen Rothwell, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/linkage.h> +#include <linux/compat.h> +#include <linux/errno.h> +#include <linux/time.h> +#include <linux/signal.h> +#include <linux/sched.h> /* for MAX_SCHEDULE_TIMEOUT */ +#include <linux/syscalls.h> +#include <linux/unistd.h> +#include <linux/security.h> +#include <linux/timex.h> +#include <linux/migrate.h> +#include <linux/posix-timers.h> +#include <linux/times.h> +#include <linux/ptrace.h> +#include <linux/gfp.h> + +#include <asm/uaccess.h> + +/* + * Note that the native side is already converted to a timespec, because + * that's what we want anyway. 
+ */ +static int compat_get_timeval(struct timespec *o, + struct compat_timeval __user *i) +{ + long usec; + + if (get_user(o->tv_sec, &i->tv_sec) || + get_user(usec, &i->tv_usec)) + return -EFAULT; + o->tv_nsec = usec * 1000; + return 0; +} + +static int compat_put_timeval(struct compat_timeval __user *o, + struct timeval *i) +{ + return (put_user(i->tv_sec, &o->tv_sec) || + put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0; +} + +static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp) +{ + memset(txc, 0, sizeof(struct timex)); + + if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) || + __get_user(txc->modes, &utp->modes) || + __get_user(txc->offset, &utp->offset) || + __get_user(txc->freq, &utp->freq) || + __get_user(txc->maxerror, &utp->maxerror) || + __get_user(txc->esterror, &utp->esterror) || + __get_user(txc->status, &utp->status) || + __get_user(txc->constant, &utp->constant) || + __get_user(txc->precision, &utp->precision) || + __get_user(txc->tolerance, &utp->tolerance) || + __get_user(txc->time.tv_sec, &utp->time.tv_sec) || + __get_user(txc->time.tv_usec, &utp->time.tv_usec) || + __get_user(txc->tick, &utp->tick) || + __get_user(txc->ppsfreq, &utp->ppsfreq) || + __get_user(txc->jitter, &utp->jitter) || + __get_user(txc->shift, &utp->shift) || + __get_user(txc->stabil, &utp->stabil) || + __get_user(txc->jitcnt, &utp->jitcnt) || + __get_user(txc->calcnt, &utp->calcnt) || + __get_user(txc->errcnt, &utp->errcnt) || + __get_user(txc->stbcnt, &utp->stbcnt)) + return -EFAULT; + + return 0; +} + +static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc) +{ + if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) || + __put_user(txc->modes, &utp->modes) || + __put_user(txc->offset, &utp->offset) || + __put_user(txc->freq, &utp->freq) || + __put_user(txc->maxerror, &utp->maxerror) || + __put_user(txc->esterror, &utp->esterror) || + __put_user(txc->status, &utp->status) || + __put_user(txc->constant, 
&utp->constant) || + __put_user(txc->precision, &utp->precision) || + __put_user(txc->tolerance, &utp->tolerance) || + __put_user(txc->time.tv_sec, &utp->time.tv_sec) || + __put_user(txc->time.tv_usec, &utp->time.tv_usec) || + __put_user(txc->tick, &utp->tick) || + __put_user(txc->ppsfreq, &utp->ppsfreq) || + __put_user(txc->jitter, &utp->jitter) || + __put_user(txc->shift, &utp->shift) || + __put_user(txc->stabil, &utp->stabil) || + __put_user(txc->jitcnt, &utp->jitcnt) || + __put_user(txc->calcnt, &utp->calcnt) || + __put_user(txc->errcnt, &utp->errcnt) || + __put_user(txc->stbcnt, &utp->stbcnt) || + __put_user(txc->tai, &utp->tai)) + return -EFAULT; + return 0; +} + +asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + if (tv) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (compat_put_timeval(tv, &ktv)) + return -EFAULT; + } + if (tz) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + + return 0; +} + +asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv, + struct timezone __user *tz) +{ + struct timespec kts; + struct timezone ktz; + + if (tv) { + if (compat_get_timeval(&kts, tv)) + return -EFAULT; + } + if (tz) { + if (copy_from_user(&ktz, tz, sizeof(ktz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); +} + +int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) +{ + return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || + __get_user(ts->tv_sec, &cts->tv_sec) || + __get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0; +} + +int put_compat_timespec(const struct timespec *ts, struct compat_timespec __user *cts) +{ + return (!access_ok(VERIFY_WRITE, cts, sizeof(*cts)) || + __put_user(ts->tv_sec, &cts->tv_sec) || + __put_user(ts->tv_nsec, &cts->tv_nsec)) ? 
-EFAULT : 0; +} + +static long compat_nanosleep_restart(struct restart_block *restart) +{ + struct compat_timespec __user *rmtp; + struct timespec rmt; + mm_segment_t oldfs; + long ret; + + restart->nanosleep.rmtp = (struct timespec __user *) &rmt; + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = hrtimer_nanosleep_restart(restart); + set_fs(oldfs); + + if (ret) { + rmtp = restart->nanosleep.compat_rmtp; + + if (rmtp && put_compat_timespec(&rmt, rmtp)) + return -EFAULT; + } + + return ret; +} + +asmlinkage long compat_sys_nanosleep(struct compat_timespec __user *rqtp, + struct compat_timespec __user *rmtp) +{ + struct timespec tu, rmt; + mm_segment_t oldfs; + long ret; + + if (get_compat_timespec(&tu, rqtp)) + return -EFAULT; + + if (!timespec_valid(&tu)) + return -EINVAL; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = hrtimer_nanosleep(&tu, + rmtp ? (struct timespec __user *)&rmt : NULL, + HRTIMER_MODE_REL, CLOCK_MONOTONIC); + set_fs(oldfs); + + if (ret) { + struct restart_block *restart + = &current_thread_info()->restart_block; + + restart->fn = compat_nanosleep_restart; + restart->nanosleep.compat_rmtp = rmtp; + + if (rmtp && put_compat_timespec(&rmt, rmtp)) + return -EFAULT; + } + + return ret; +} + +static inline long get_compat_itimerval(struct itimerval *o, + struct compat_itimerval __user *i) +{ + return (!access_ok(VERIFY_READ, i, sizeof(*i)) || + (__get_user(o->it_interval.tv_sec, &i->it_interval.tv_sec) | + __get_user(o->it_interval.tv_usec, &i->it_interval.tv_usec) | + __get_user(o->it_value.tv_sec, &i->it_value.tv_sec) | + __get_user(o->it_value.tv_usec, &i->it_value.tv_usec))); +} + +static inline long put_compat_itimerval(struct compat_itimerval __user *o, + struct itimerval *i) +{ + return (!access_ok(VERIFY_WRITE, o, sizeof(*o)) || + (__put_user(i->it_interval.tv_sec, &o->it_interval.tv_sec) | + __put_user(i->it_interval.tv_usec, &o->it_interval.tv_usec) | + __put_user(i->it_value.tv_sec, &o->it_value.tv_sec) | + __put_user(i->it_value.tv_usec, 
&o->it_value.tv_usec))); +} + +asmlinkage long compat_sys_getitimer(int which, + struct compat_itimerval __user *it) +{ + struct itimerval kit; + int error; + + error = do_getitimer(which, &kit); + if (!error && put_compat_itimerval(it, &kit)) + error = -EFAULT; + return error; +} + +asmlinkage long compat_sys_setitimer(int which, + struct compat_itimerval __user *in, + struct compat_itimerval __user *out) +{ + struct itimerval kin, kout; + int error; + + if (in) { + if (get_compat_itimerval(&kin, in)) + return -EFAULT; + } else + memset(&kin, 0, sizeof(kin)); + + error = do_setitimer(which, &kin, out ? &kout : NULL); + if (error || !out) + return error; + if (put_compat_itimerval(out, &kout)) + return -EFAULT; + return 0; +} + +static compat_clock_t clock_t_to_compat_clock_t(clock_t x) +{ + return compat_jiffies_to_clock_t(clock_t_to_jiffies(x)); +} + +asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) +{ + if (tbuf) { + struct tms tms; + struct compat_tms tmp; + + do_sys_times(&tms); + /* Convert our struct tms to the compat version. */ + tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime); + tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime); + tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime); + tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime); + if (copy_to_user(tbuf, &tmp, sizeof(tmp))) + return -EFAULT; + } + force_successful_syscall_return(); + return compat_jiffies_to_clock_t(jiffies); +} + +#ifdef __ARCH_WANT_SYS_SIGPENDING + +/* + * Assumption: old_sigset_t and compat_old_sigset_t are both + * types that can be passed to put_user()/get_user(). 
+ */ + +asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set) +{ + old_sigset_t s; + long ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_sigpending((old_sigset_t __user *) &s); + set_fs(old_fs); + if (ret == 0) + ret = put_user(s, set); + return ret; +} + +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK + +/* + * sys_sigprocmask SIG_SETMASK sets the first (compat) word of the + * blocked set of signals to the supplied signal set + */ +static inline void compat_sig_setmask(sigset_t *blocked, compat_sigset_word set) +{ + memcpy(blocked->sig, &set, sizeof(set)); +} + +asmlinkage long compat_sys_sigprocmask(int how, + compat_old_sigset_t __user *nset, + compat_old_sigset_t __user *oset) +{ + old_sigset_t old_set, new_set; + sigset_t new_blocked; + + old_set = current->blocked.sig[0]; + + if (nset) { + if (get_user(new_set, nset)) + return -EFAULT; + new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); + + new_blocked = current->blocked; + + switch (how) { + case SIG_BLOCK: + sigaddsetmask(&new_blocked, new_set); + break; + case SIG_UNBLOCK: + sigdelsetmask(&new_blocked, new_set); + break; + case SIG_SETMASK: + compat_sig_setmask(&new_blocked, new_set); + break; + default: + return -EINVAL; + } + + set_current_blocked(&new_blocked); + } + + if (oset) { + if (put_user(old_set, oset)) + return -EFAULT; + } + + return 0; +} + +#endif + +asmlinkage long compat_sys_setrlimit(unsigned int resource, + struct compat_rlimit __user *rlim) +{ + struct rlimit r; + + if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) || + __get_user(r.rlim_cur, &rlim->rlim_cur) || + __get_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + + if (r.rlim_cur == COMPAT_RLIM_INFINITY) + r.rlim_cur = RLIM_INFINITY; + if (r.rlim_max == COMPAT_RLIM_INFINITY) + r.rlim_max = RLIM_INFINITY; + return do_prlimit(current, resource, &r, NULL); +} + +#ifdef COMPAT_RLIM_OLD_INFINITY + +asmlinkage long compat_sys_old_getrlimit(unsigned int resource, + struct compat_rlimit 
__user *rlim) +{ + struct rlimit r; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_old_getrlimit(resource, &r); + set_fs(old_fs); + + if (!ret) { + if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY) + r.rlim_cur = COMPAT_RLIM_INFINITY; + if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY) + r.rlim_max = COMPAT_RLIM_INFINITY; + + if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || + __put_user(r.rlim_cur, &rlim->rlim_cur) || + __put_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + } + return ret; +} + +#endif + +asmlinkage long compat_sys_getrlimit(unsigned int resource, + struct compat_rlimit __user *rlim) +{ + struct rlimit r; + int ret; + + ret = do_prlimit(current, resource, NULL, &r); + if (!ret) { + if (r.rlim_cur > COMPAT_RLIM_INFINITY) + r.rlim_cur = COMPAT_RLIM_INFINITY; + if (r.rlim_max > COMPAT_RLIM_INFINITY) + r.rlim_max = COMPAT_RLIM_INFINITY; + + if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) || + __put_user(r.rlim_cur, &rlim->rlim_cur) || + __put_user(r.rlim_max, &rlim->rlim_max)) + return -EFAULT; + } + return ret; +} + +int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru) +{ + if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) || + __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) || + __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) || + __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) || + __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) || + __put_user(r->ru_maxrss, &ru->ru_maxrss) || + __put_user(r->ru_ixrss, &ru->ru_ixrss) || + __put_user(r->ru_idrss, &ru->ru_idrss) || + __put_user(r->ru_isrss, &ru->ru_isrss) || + __put_user(r->ru_minflt, &ru->ru_minflt) || + __put_user(r->ru_majflt, &ru->ru_majflt) || + __put_user(r->ru_nswap, &ru->ru_nswap) || + __put_user(r->ru_inblock, &ru->ru_inblock) || + __put_user(r->ru_oublock, &ru->ru_oublock) || + __put_user(r->ru_msgsnd, &ru->ru_msgsnd) || + __put_user(r->ru_msgrcv, &ru->ru_msgrcv) || + __put_user(r->ru_nsignals, &ru->ru_nsignals) || 
+ __put_user(r->ru_nvcsw, &ru->ru_nvcsw) || + __put_user(r->ru_nivcsw, &ru->ru_nivcsw)) + return -EFAULT; + return 0; +} + +asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) +{ + struct rusage r; + int ret; + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + ret = sys_getrusage(who, (struct rusage __user *) &r); + set_fs(old_fs); + + if (ret) + return ret; + + if (put_compat_rusage(&r, ru)) + return -EFAULT; + + return 0; +} + +asmlinkage long +compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, + struct compat_rusage __user *ru) +{ + if (!ru) { + return sys_wait4(pid, stat_addr, options, NULL); + } else { + struct rusage r; + int ret; + unsigned int status; + mm_segment_t old_fs = get_fs(); + + set_fs (KERNEL_DS); + ret = sys_wait4(pid, + (stat_addr ? + (unsigned int __user *) &status : NULL), + options, (struct rusage __user *) &r); + set_fs (old_fs); + + if (ret > 0) { + if (put_compat_rusage(&r, ru)) + return -EFAULT; + if (stat_addr && put_user(status, stat_addr)) + return -EFAULT; + } + return ret; + } +} + +asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, + struct compat_siginfo __user *uinfo, int options, + struct compat_rusage __user *uru) +{ + siginfo_t info; + struct rusage ru; + long ret; + mm_segment_t old_fs = get_fs(); + + memset(&info, 0, sizeof(info)); + + set_fs(KERNEL_DS); + ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options, + uru ? 
(struct rusage __user *)&ru : NULL); + set_fs(old_fs); + + if ((ret < 0) || (info.si_signo == 0)) + return ret; + + if (uru) { + ret = put_compat_rusage(&ru, uru); + if (ret) + return ret; + } + + BUG_ON(info.si_code & __SI_MASK); + info.si_code |= __SI_CHLD; + return copy_siginfo_to_user32(uinfo, &info); +} + +static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr, + unsigned len, struct cpumask *new_mask) +{ + unsigned long *k; + + if (len < cpumask_size()) + memset(new_mask, 0, cpumask_size()); + else if (len > cpumask_size()) + len = cpumask_size(); + + k = cpumask_bits(new_mask); + return compat_get_bitmap(k, user_mask_ptr, len * 8); +} + +asmlinkage long compat_sys_sched_setaffinity(compat_pid_t pid, + unsigned int len, + compat_ulong_t __user *user_mask_ptr) +{ + cpumask_var_t new_mask; + int retval; + + if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) + return -ENOMEM; + + retval = compat_get_user_cpu_mask(user_mask_ptr, len, new_mask); + if (retval) + goto out; + + retval = sched_setaffinity(pid, new_mask); +out: + free_cpumask_var(new_mask); + return retval; +} + +asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len, + compat_ulong_t __user *user_mask_ptr) +{ + int ret; + cpumask_var_t mask; + + if ((len * BITS_PER_BYTE) < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(compat_ulong_t)-1)) + return -EINVAL; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + ret = sched_getaffinity(pid, mask); + if (ret == 0) { + size_t retlen = min_t(size_t, len, cpumask_size()); + + if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) + ret = -EFAULT; + else + ret = retlen; + } + free_cpumask_var(mask); + + return ret; +} + +int get_compat_itimerspec(struct itimerspec *dst, + const struct compat_itimerspec __user *src) +{ + if (get_compat_timespec(&dst->it_interval, &src->it_interval) || + get_compat_timespec(&dst->it_value, &src->it_value)) + return -EFAULT; + return 0; +} + +int 
put_compat_itimerspec(struct compat_itimerspec __user *dst,
+			  const struct itimerspec *src)
+{
+	if (put_compat_timespec(&src->it_interval, &dst->it_interval) ||
+	    put_compat_timespec(&src->it_value, &dst->it_value))
+		return -EFAULT;
+	return 0;
+}
+/* Compat timer_create: the converted sigevent is staged in user memory. */
+long compat_sys_timer_create(clockid_t which_clock,
+			     struct compat_sigevent __user *timer_event_spec,
+			     timer_t __user *created_timer_id)
+{
+	struct sigevent __user *event = NULL;
+
+	if (timer_event_spec) {
+		struct sigevent kevent;
+
+		event = compat_alloc_user_space(sizeof(*event));
+		if (get_compat_sigevent(&kevent, timer_event_spec) ||
+		    copy_to_user(event, &kevent, sizeof(*event)))
+			return -EFAULT;
+	}
+
+	return sys_timer_create(which_clock, event, created_timer_id);
+}
+/* Compat timer_settime: new is mandatory, old is written only if non-NULL. */
+long compat_sys_timer_settime(timer_t timer_id, int flags,
+			  struct compat_itimerspec __user *new,
+			  struct compat_itimerspec __user *old)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct itimerspec newts, oldts;
+
+	if (!new)
+		return -EINVAL;
+	if (get_compat_itimerspec(&newts, new))
+		return -EFAULT;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);	/* newts/oldts are kernel buffers cast to __user */
+	err = sys_timer_settime(timer_id, flags,
+				(struct itimerspec __user *) &newts,
+				(struct itimerspec __user *) &oldts);
+	set_fs(oldfs);
+	if (!err && old && put_compat_itimerspec(old, &oldts))
+		return -EFAULT;
+	return err;
+}
+/* Compat timer_gettime: fetch into a kernel itimerspec, then convert out. */
+long compat_sys_timer_gettime(timer_t timer_id,
+		struct compat_itimerspec __user *setting)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct itimerspec ts;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_timer_gettime(timer_id,
+				(struct itimerspec __user *) &ts);
+	set_fs(oldfs);
+	if (!err && put_compat_itimerspec(setting, &ts))
+		return -EFAULT;
+	return err;
+}
+/* Compat clock_settime: convert the timespec in, then sys_clock_settime(). */
+long compat_sys_clock_settime(clockid_t which_clock,
+		struct compat_timespec __user *tp)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec ts;
+
+	if (get_compat_timespec(&ts, tp))
+		return -EFAULT;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_clock_settime(which_clock,
+				(struct timespec
__user *) &ts);
+	set_fs(oldfs);
+	return err;
+}
+/* Compat clock_gettime: read into a kernel timespec, then convert out. */
+long compat_sys_clock_gettime(clockid_t which_clock,
+		struct compat_timespec __user *tp)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec ts;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_clock_gettime(which_clock,
+				(struct timespec __user *) &ts);
+	set_fs(oldfs);
+	if (!err && put_compat_timespec(&ts, tp))
+		return -EFAULT;
+	return err;
+}
+/* Compat clock_adjtime: timex is converted in, adjusted, and converted out. */
+long compat_sys_clock_adjtime(clockid_t which_clock,
+		struct compat_timex __user *utp)
+{
+	struct timex txc;
+	mm_segment_t oldfs;
+	int err, ret;
+
+	err = compat_get_timex(&txc, utp);
+	if (err)
+		return err;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+	set_fs(oldfs);
+
+	err = compat_put_timex(utp, &txc);	/* written back whatever ret is */
+	if (err)
+		return err;
+
+	return ret;
+}
+/* Compat clock_getres: tp may be NULL, in which case nothing is copied out. */
+long compat_sys_clock_getres(clockid_t which_clock,
+		struct compat_timespec __user *tp)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec ts;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = sys_clock_getres(which_clock,
+			       (struct timespec __user *) &ts);
+	set_fs(oldfs);
+	if (!err && tp && put_compat_timespec(&ts, tp))
+		return -EFAULT;
+	return err;
+}
+/* Restart handler installed by compat_sys_clock_nanosleep() and by itself. */
+static long compat_clock_nanosleep_restart(struct restart_block *restart)
+{
+	long err;
+	mm_segment_t oldfs;
+	struct timespec tu;
+	struct compat_timespec *rmtp = restart->nanosleep.compat_rmtp;
+
+	restart->nanosleep.rmtp = (struct timespec __user *) &tu;	/* kernel buffer */
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = clock_nanosleep_restart(restart);
+	set_fs(oldfs);
+
+	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+	    put_compat_timespec(&tu, rmtp))
+		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {	/* re-arm with the compat handler */
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->nanosleep.compat_rmtp = rmtp;
+	}
+	return err;
+}
+/* Compat clock_nanosleep: rqtp is converted in, rmtp is filled on restart. */
+long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
+				struct compat_timespec __user *rqtp,
+				struct compat_timespec __user *rmtp)
+{
+	long err;
+
mm_segment_t oldfs;
+	struct timespec in, out;
+	struct restart_block *restart;
+
+	if (get_compat_timespec(&in, rqtp))
+		return -EFAULT;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);	/* in/out are kernel buffers cast to __user */
+	err = sys_clock_nanosleep(which_clock, flags,
+				  (struct timespec __user *) &in,
+				  (struct timespec __user *) &out);
+	set_fs(oldfs);
+
+	if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
+	    put_compat_timespec(&out, rmtp))
+		return -EFAULT;
+
+	if (err == -ERESTART_RESTARTBLOCK) {	/* route the restart via the compat handler */
+		restart = &current_thread_info()->restart_block;
+		restart->fn = compat_clock_nanosleep_restart;
+		restart->nanosleep.compat_rmtp = rmtp;
+	}
+	return err;
+}
+
+/*
+ * We currently only need the following fields from the sigevent
+ * structure: sigev_value, sigev_signo, sigev_notify and (sometimes
+ * sigev_notify_thread_id).  The others are handled in user mode.
+ * We also assume that copying sigev_value.sival_int is sufficient
+ * to keep all the bits of sigev_value.sival_ptr intact.
+ */
+int get_compat_sigevent(struct sigevent *event,
+		const struct compat_sigevent __user *u_event)
+{
+	memset(event, 0, sizeof(*event));	/* fields not copied below stay zero */
+	return (!access_ok(VERIFY_READ, u_event, sizeof(*u_event)) ||
+		__get_user(event->sigev_value.sival_int,
+			&u_event->sigev_value.sival_int) ||
+		__get_user(event->sigev_signo, &u_event->sigev_signo) ||
+		__get_user(event->sigev_notify, &u_event->sigev_notify) ||
+		__get_user(event->sigev_notify_thread_id,
+			&u_event->sigev_notify_thread_id))
+		?
-EFAULT : 0;
+}
+/* Read bitmap_size bits of compat longs from umask into native longs at mask. */
+long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
+		       unsigned long bitmap_size)
+{
+	int i, j;
+	unsigned long m;
+	compat_ulong_t um;
+	unsigned long nr_compat_longs;
+
+	/* align bitmap up to nearest compat_long_t boundary */
+	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+	if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
+		return -EFAULT;
+
+	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+	for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+		m = 0;
+
+		for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+			/*
+			 * We don't want to read past the end of the userspace
+			 * bitmap. We must however ensure the end of the
+			 * kernel bitmap is zeroed.
+			 *
+			 * Test before decrementing: the counter is unsigned,
+			 * so a post-decrement at zero would wrap around.
+			 */
+			if (nr_compat_longs) {
+				nr_compat_longs--;
+				if (__get_user(um, umask))
+					return -EFAULT;
+			} else {
+				um = 0;
+			}
+
+			umask++;
+			/* shift as unsigned so bit 31 never lands in a sign bit */
+			m |= (unsigned long)um << (j * BITS_PER_COMPAT_LONG);
+		}
+		*mask++ = m;
+	}
+
+	return 0;
+}
+
+long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
+		       unsigned long bitmap_size)
+{
+	int i, j;
+	unsigned long m;
+	compat_ulong_t um;
+	unsigned long nr_compat_longs;
+
+	/* align bitmap up to nearest compat_long_t boundary */
+	bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+
+	if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
+		return -EFAULT;
+
+	nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
+
+	for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
+		m = *mask++;
+
+		for (j = 0; j < sizeof(m)/sizeof(um); j++) {
+			um = m;
+
+			/*
+			 * We dont want to write past the end of the userspace
+			 * bitmap.
+ */ + if (nr_compat_longs-- > 0) { + if (__put_user(um, umask)) + return -EFAULT; + } + + umask++; + m >>= 4*sizeof(um); + m >>= 4*sizeof(um); + } + } + + return 0; +} + +void +sigset_from_compat (sigset_t *set, compat_sigset_t *compat) +{ + switch (_NSIG_WORDS) { + case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); + case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); + case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); + case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); + } +} + +asmlinkage long +compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, + struct compat_siginfo __user *uinfo, + struct compat_timespec __user *uts, compat_size_t sigsetsize) +{ + compat_sigset_t s32; + sigset_t s; + struct timespec t; + siginfo_t info; + long ret; + + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t))) + return -EFAULT; + sigset_from_compat(&s, &s32); + + if (uts) { + if (get_compat_timespec(&t, uts)) + return -EFAULT; + } + + ret = do_sigtimedwait(&s, &info, uts ? &t : NULL); + + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user32(uinfo, &info)) + ret = -EFAULT; + } + + return ret; + +} + +asmlinkage long +compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + +#ifdef __ARCH_WANT_COMPAT_SYS_TIME + +/* compat_time_t is a 32 bit "long" and needs to get converted. 
*/
+/* Compat time(): return the seconds value and store it at tloc if non-NULL. */
+asmlinkage long compat_sys_time(compat_time_t __user * tloc)
+{
+	compat_time_t i;
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	i = tv.tv_sec;
+
+	if (tloc) {
+		if (put_user(i,tloc))
+			return -EFAULT;
+	}
+	force_successful_syscall_return();
+	return i;
+}
+/* Compat stime(): set the clock to *tptr seconds after the security check. */
+asmlinkage long compat_sys_stime(compat_time_t __user *tptr)
+{
+	struct timespec tv;
+	int err;
+
+	if (get_user(tv.tv_sec, tptr))
+		return -EFAULT;
+
+	tv.tv_nsec = 0;
+
+	err = security_settime(&tv, NULL);
+	if (err)
+		return err;
+
+	do_settimeofday(&tv);
+	return 0;
+}
+
+#endif /* __ARCH_WANT_COMPAT_SYS_TIME */
+
+#ifdef __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND
+asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat_size_t sigsetsize)
+{
+	sigset_t newset;
+	compat_sigset_t newset32;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.  */
+	if (sigsetsize != sizeof(sigset_t))
+		return -EINVAL;
+
+	if (copy_from_user(&newset32, unewset, sizeof(compat_sigset_t)))
+		return -EFAULT;
+	sigset_from_compat(&newset, &newset32);
+	sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP));	/* never blockable */
+
+	spin_lock_irq(&current->sighand->siglock);
+	current->saved_sigmask = current->blocked;	/* kept for set_restore_sigmask() */
+	current->blocked = newset;
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	current->state = TASK_INTERRUPTIBLE;
+	schedule();
+	set_restore_sigmask();
+	return -ERESTARTNOHAND;
+}
+#endif /* __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND */
+/* Compat adjtimex: timex is converted in, adjusted, and converted back out. */
+asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
+{
+	struct timex txc;
+	int err, ret;
+
+	err = compat_get_timex(&txc, utp);
+	if (err)
+		return err;
+
+	ret = do_adjtimex(&txc);
+
+	err = compat_put_timex(utp, &txc);	/* written back whatever ret is */
+	if (err)
+		return err;
+
+	return ret;
+}
+
+#ifdef CONFIG_NUMA
+asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
+		compat_uptr_t __user *pages32,
+		const int __user *nodes,
+		int __user *status,
+		int flags)
+{
+	const void __user * __user *pages;
+	int i;
+
+	pages =
compat_alloc_user_space(nr_pages * sizeof(void *));	/* NOTE(review): product is not overflow-checked here - confirm */
+	for (i = 0; i < nr_pages; i++) {	/* widen each 32-bit pointer in turn */
+		compat_uptr_t p;
+
+		if (get_user(p, pages32 + i) ||
+			put_user(compat_ptr(p), pages + i))
+			return -EFAULT;
+	}
+	return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
+}
+/* Compat migrate_pages: both node masks are widened into user-space staging. */
+asmlinkage long compat_sys_migrate_pages(compat_pid_t pid,
+			compat_ulong_t maxnode,
+			const compat_ulong_t __user *old_nodes,
+			const compat_ulong_t __user *new_nodes)
+{
+	unsigned long __user *old = NULL;
+	unsigned long __user *new = NULL;
+	nodemask_t tmp_mask;
+	unsigned long nr_bits;
+	unsigned long size;
+
+	nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES);
+	size = ALIGN(nr_bits, BITS_PER_LONG) / 8;	/* bytes per widened mask */
+	if (old_nodes) {
+		if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits))
+			return -EFAULT;
+		old = compat_alloc_user_space(new_nodes ? size * 2 : size);	/* room for both masks */
+		if (new_nodes)
+			new = old + size / sizeof(unsigned long);	/* second half of the area */
+		if (copy_to_user(old, nodes_addr(tmp_mask), size))
+			return -EFAULT;
+	}
+	if (new_nodes) {
+		if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits))
+			return -EFAULT;
+		if (new == NULL)	/* no old mask, so nothing was allocated yet */
+			new = compat_alloc_user_space(size);
+		if (copy_to_user(new, nodes_addr(tmp_mask), size))
+			return -EFAULT;
+	}
+	return sys_migrate_pages(pid, nr_bits + 1, old, new);
+}
+#endif
+/* Layout written to user space by compat_sys_sysinfo(). */
+struct compat_sysinfo {
+	s32 uptime;
+	u32 loads[3];
+	u32 totalram;
+	u32 freeram;
+	u32 sharedram;
+	u32 bufferram;
+	u32 totalswap;
+	u32 freeswap;
+	u16 procs;
+	u16 pad;
+	u32 totalhigh;
+	u32 freehigh;
+	u32 mem_unit;
+	char _f[20-2*sizeof(u32)-sizeof(int)];
+};
+/* Compat sysinfo: scale memory counters when they do not fit in 32 bits. */
+asmlinkage long
+compat_sys_sysinfo(struct compat_sysinfo __user *info)
+{
+	struct sysinfo s;
+
+	do_sysinfo(&s);
+
+	/* Check to see if any memory value is too large for 32-bit and scale
+	 *  down if needed
+	 */
+	if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+		int bitcount = 0;
+
+		while (s.mem_unit < PAGE_SIZE) {	/* grow the unit, count the doublings */
+			s.mem_unit <<= 1;
+			bitcount++;
+		}
+
+		s.totalram >>= bitcount;
+		s.freeram >>= bitcount;
+
s.sharedram >>= bitcount;
+		s.bufferram >>= bitcount;
+		s.totalswap >>= bitcount;
+		s.freeswap >>= bitcount;
+		s.totalhigh >>= bitcount;
+		s.freehigh >>= bitcount;
+	}
+
+	if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) ||	/* one range check, then unchecked stores */
+	    __put_user (s.uptime, &info->uptime) ||
+	    __put_user (s.loads[0], &info->loads[0]) ||
+	    __put_user (s.loads[1], &info->loads[1]) ||
+	    __put_user (s.loads[2], &info->loads[2]) ||
+	    __put_user (s.totalram, &info->totalram) ||
+	    __put_user (s.freeram, &info->freeram) ||
+	    __put_user (s.sharedram, &info->sharedram) ||
+	    __put_user (s.bufferram, &info->bufferram) ||
+	    __put_user (s.totalswap, &info->totalswap) ||
+	    __put_user (s.freeswap, &info->freeswap) ||
+	    __put_user (s.procs, &info->procs) ||
+	    __put_user (s.totalhigh, &info->totalhigh) ||
+	    __put_user (s.freehigh, &info->freehigh) ||
+	    __put_user (s.mem_unit, &info->mem_unit))
+		return -EFAULT;
+
+	return 0;
+}
+
+/*
+ * Allocate user-space memory for the duration of a single system call,
+ * in order to marshall parameters inside a compat thunk.
+ */
+void __user *compat_alloc_user_space(unsigned long len)
+{
+	void __user *ptr;
+
+	/* If len would occupy more than half of the entire compat space... */
+	if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
+		return NULL;
+
+	ptr = arch_compat_alloc_user_space(len);	/* the arch hook supplies the address */
+
+	if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))	/* must be writable user memory */
+		return NULL;
+
+	return ptr;
+}
+EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/configs.c b/kernel/configs.c
new file mode 100644
index 00000000..b4066b44
--- /dev/null
+++ b/kernel/configs.c
@@ -0,0 +1,99 @@
+/*
+ * kernel/configs.c
+ * Echo the kernel .config file used to build the kernel
+ *
+ * Copyright (C) 2002 Khalid Aziz
+ * Copyright (C) 2002 Randy Dunlap
+ * Copyright (C) 2002 Al Stone
+ * Copyright (C) 2002 Hewlett-Packard Company
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT.  See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include	/* NOTE(review): the header names on these six #include lines were lost in extraction */
+#include
+#include
+#include
+#include
+#include
+
+/**************************************************/
+/* the actual current config file                 */
+
+/*
+ * Define kernel_config_data and kernel_config_data_size, which contains the
+ * wrapped and compressed configuration file.
The file is first compressed
+ * with gzip and then bounded by two eight byte magic numbers to allow
+ * extraction from a binary kernel image:
+ *
+ *   IKCFG_ST
+ *   <image>
+ *   IKCFG_ED
+ */
+#define MAGIC_START	"IKCFG_ST"
+#define MAGIC_END	"IKCFG_ED"
+#include "config_data.h"
+
+/* Length of one magic marker, without its terminating NUL. */
+#define MAGIC_SIZE (sizeof(MAGIC_START) - 1)
+#define kernel_config_data_size \
+	(sizeof(kernel_config_data) - 1 - MAGIC_SIZE * 2)
+
+#ifdef CONFIG_IKCONFIG_PROC
+/* read() handler: serve the payload that sits between the two markers. */
+static ssize_t
+ikconfig_read_current(struct file *file, char __user *buf,
+		      size_t len, loff_t * offset)
+{
+	return simple_read_from_buffer(buf, len, offset,
+				       kernel_config_data + MAGIC_SIZE,
+				       kernel_config_data_size);
+}
+/* File operations for the "config.gz" proc entry created below. */
+static const struct file_operations ikconfig_file_ops = {
+	.owner = THIS_MODULE,
+	.read = ikconfig_read_current,
+	.llseek = default_llseek,
+};
+/* Module init: create the read-only "config.gz" proc entry and set its size. */
+static int __init ikconfig_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	/* create the current config file */
+	entry = proc_create("config.gz", S_IFREG | S_IRUGO, NULL,
+			    &ikconfig_file_ops);
+	if (!entry)
+		return -ENOMEM;
+
+	entry->size = kernel_config_data_size;
+
+	return 0;
+}
+/* Module exit: remove the proc entry created by ikconfig_init(). */
+static void __exit ikconfig_cleanup(void)
+{
+	remove_proc_entry("config.gz", NULL);
+}
+
+module_init(ikconfig_init);
+module_exit(ikconfig_cleanup);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Randy Dunlap");
+MODULE_DESCRIPTION("Echo the kernel .config file used to build the kernel");
+
+#endif /* CONFIG_IKCONFIG_PROC */
diff --git a/kernel/cpu.c b/kernel/cpu.c
new file mode 100644
index 00000000..eae3d9b3
--- /dev/null
+++ b/kernel/cpu.c
@@ -0,0 +1,690 @@
+/* CPU control.
+ * (C) 2001, 2002, 2003, 2004 Rusty Russell
+ *
+ * This code is licenced under the GPL.
+ */
+#include	/* NOTE(review): the header names on these 13 #include lines were lost in extraction */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_SMP
+/* Serializes the updates to cpu_online_mask, cpu_present_mask */
+static DEFINE_MUTEX(cpu_add_remove_lock);
+
+/*
+ * The following two APIs must be used when attempting
+ * to serialize the updates to cpu_online_mask, cpu_present_mask.
+ */
+void cpu_maps_update_begin(void)
+{
+	mutex_lock(&cpu_add_remove_lock);
+}
+
+void cpu_maps_update_done(void)
+{
+	mutex_unlock(&cpu_add_remove_lock);
+}
+/* Notifier chain invoked by __cpu_notify(). */
+static RAW_NOTIFIER_HEAD(cpu_chain);
+
+/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
+ * Should always be manipulated under cpu_add_remove_lock
+ */
+static int cpu_hotplug_disabled;
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Reader refcount plus active-writer bookkeeping for CPU hotplug. */
+static struct {
+	struct task_struct *active_writer;
+	struct mutex lock; /* Synchronizes accesses to refcount, */
+	/*
+	 * Also blocks the new readers during
+	 * an ongoing cpu hotplug operation.
+	 */
+	int refcount;
+} cpu_hotplug = {
+	.active_writer = NULL,
+	.lock = __MUTEX_INITIALIZER(cpu_hotplug.lock),
+	.refcount = 0,
+};
+/* Take a reader reference; a no-op when called by the active writer. */
+void get_online_cpus(void)
+{
+	might_sleep();
+	if (cpu_hotplug.active_writer == current)
+		return;
+	mutex_lock(&cpu_hotplug.lock);
+	cpu_hotplug.refcount++;
+	mutex_unlock(&cpu_hotplug.lock);
+
+}
+EXPORT_SYMBOL_GPL(get_online_cpus);
+/* Drop a reader reference; the last one out wakes a waiting writer. */
+void put_online_cpus(void)
+{
+	if (cpu_hotplug.active_writer == current)
+		return;
+	mutex_lock(&cpu_hotplug.lock);
+	if (!--cpu_hotplug.refcount && unlikely(cpu_hotplug.active_writer))
+		wake_up_process(cpu_hotplug.active_writer);
+	mutex_unlock(&cpu_hotplug.lock);
+
+}
+EXPORT_SYMBOL_GPL(put_online_cpus);
+
+/*
+ * This ensures that the hotplug operation can begin only when the
+ * refcount goes to zero.
+ *
+ * Note that during a cpu-hotplug operation, the new readers, if any,
+ * will be blocked by the cpu_hotplug.lock
+ *
+ * Since cpu_hotplug_begin() is always called after invoking
+ * cpu_maps_update_begin(), we can be sure that only one writer is active.
+ *
+ * Note that theoretically, there is a possibility of a livelock:
+ * - Refcount goes to zero, last reader wakes up the sleeping
+ *   writer.
+ * - Last reader unlocks the cpu_hotplug.lock.
+ * - A new reader arrives at this moment, bumps up the refcount.
+ * - The writer acquires the cpu_hotplug.lock, finds the refcount
+ *   non zero and goes to sleep again.
+ *
+ * However, this is very difficult to achieve in practice since
+ * get_online_cpus() is not an API which is called all that often.
+ *
+ */
+static void cpu_hotplug_begin(void)
+{
+	cpu_hotplug.active_writer = current;
+
+	for (;;) {
+		mutex_lock(&cpu_hotplug.lock);
+		if (likely(!cpu_hotplug.refcount))
+			break;	/* leaves with cpu_hotplug.lock held; see cpu_hotplug_done() */
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&cpu_hotplug.lock);
+		schedule();	/* sleep until put_online_cpus() wakes us */
+	}
+}
+/* End a hotplug operation: clear the writer and drop cpu_hotplug.lock. */
+static void cpu_hotplug_done(void)
+{
+	cpu_hotplug.active_writer = NULL;
+	mutex_unlock(&cpu_hotplug.lock);
+}
+
+#else /* #if CONFIG_HOTPLUG_CPU */
+static void cpu_hotplug_begin(void) {}
+static void cpu_hotplug_done(void) {}
+#endif	/* #else #if CONFIG_HOTPLUG_CPU */
+
+/* Need to know about CPUs going up/down?
*/
+int __ref register_cpu_notifier(struct notifier_block *nb)
+{
+	int ret;
+	cpu_maps_update_begin();
+	ret = raw_notifier_chain_register(&cpu_chain, nb);
+	cpu_maps_update_done();
+	return ret;
+}
+/* Run up to nr_to_call cpu_chain notifiers; the result is mapped to an errno. */
+static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
+			int *nr_calls)
+{
+	int ret;
+
+	ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
+					nr_calls);
+
+	return notifier_to_errno(ret);
+}
+/* Run the cpu_chain notifiers with nr_to_call = -1 and no call counting. */
+static int cpu_notify(unsigned long val, void *v)
+{
+	return __cpu_notify(val, v, -1, NULL);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/* Like cpu_notify(), but a notifier error is a BUG. */
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+	BUG_ON(cpu_notify(val, v));
+}
+EXPORT_SYMBOL(register_cpu_notifier);	/* NOTE(review): export sits inside CONFIG_HOTPLUG_CPU - confirm intended */
+/* Remove nb from cpu_chain under cpu_add_remove_lock. */
+void __ref unregister_cpu_notifier(struct notifier_block *nb)
+{
+	cpu_maps_update_begin();
+	raw_notifier_chain_unregister(&cpu_chain, nb);
+	cpu_maps_update_done();
+}
+EXPORT_SYMBOL(unregister_cpu_notifier);
+/* Warn about running tasks with non-zero CPU time that are still on cpu. */
+static inline void check_for_tasks(int cpu)
+{
+	struct task_struct *p;
+
+	write_lock_irq(&tasklist_lock);
+	for_each_process(p) {
+		if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
+		    (!cputime_eq(p->utime, cputime_zero) ||
+		     !cputime_eq(p->stime, cputime_zero)))
+			printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
+				"(state = %ld, flags = %x)\n",
+				p->comm, task_pid_nr(p), cpu,
+				p->state, p->flags);
+	}
+	write_unlock_irq(&tasklist_lock);
+}
+/* Argument bundle handed to take_cpu_down() through __stop_machine(). */
+struct take_cpu_down_param {
+	unsigned long mod;
+	void *hcpu;
+};
+
+/* Take this CPU down. */
+static int __ref take_cpu_down(void *_param)
+{
+	struct take_cpu_down_param *param = _param;
+	int err;
+
+	/* Ensure this CPU doesn't handle any more interrupts. */
+	err = __cpu_disable();
+	if (err < 0)
+		return err;
+
+	cpu_notify(CPU_DYING | param->mod, param->hcpu);
+	return 0;
+}
+
+/* Requires cpu_add_remove_lock to be held */
+static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
+{
+	int err, nr_calls = 0;
+	void *hcpu = (void *)(long)cpu;
+	unsigned long mod = tasks_frozen ?
CPU_TASKS_FROZEN : 0; + struct take_cpu_down_param tcd_param = { + .mod = mod, + .hcpu = hcpu, + }; + + if (num_online_cpus() == 1) + return -EBUSY; + + if (!cpu_online(cpu)) + return -EINVAL; + + cpu_hotplug_begin(); + + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); + printk("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + goto out_release; + } + + err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + if (err) { + /* CPU didn't die: tell everyone. Can't complain. */ + cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + + goto out_release; + } + BUG_ON(cpu_online(cpu)); + + /* + * The migration_call() CPU_DYING callback will have removed all + * runnable tasks from the cpu, there's only the idle task left now + * that the migration thread is done doing the stop_machine thing. + * + * Wait for the stop thread to go away. + */ + while (!idle_cpu(cpu)) + cpu_relax(); + + /* This actually kills the CPU. */ + __cpu_die(cpu); + + /* CPU is completely dead: tell everyone. Too late to complain. */ + cpu_notify_nofail(CPU_DEAD | mod, hcpu); + + check_for_tasks(cpu); + +out_release: + cpu_hotplug_done(); + if (!err) + cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); + return err; +} + +int __ref cpu_down(unsigned int cpu) +{ + int err; + + cpu_maps_update_begin(); + + if (cpu_hotplug_disabled) { + err = -EBUSY; + goto out; + } + + err = _cpu_down(cpu, 0); + +out: + cpu_maps_update_done(); + return err; +} +EXPORT_SYMBOL(cpu_down); +#endif /*CONFIG_HOTPLUG_CPU*/ + +/* Requires cpu_add_remove_lock to be held */ +static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) +{ + int ret, nr_calls = 0; + void *hcpu = (void *)(long)cpu; + unsigned long mod = tasks_frozen ? 
CPU_TASKS_FROZEN : 0;
+
+	if (cpu_online(cpu) || !cpu_present(cpu))
+		return -EINVAL;
+
+	cpu_hotplug_begin();
+	ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
+	if (ret) {
+		nr_calls--;	/* skip the notifier that failed */
+		printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
+				__func__, cpu);
+		goto out_notify;
+	}
+
+	/* Arch-specific enabling code. */
+	ret = __cpu_up(cpu);
+	if (ret != 0)
+		goto out_notify;
+	BUG_ON(!cpu_online(cpu));
+
+	/* The CPU is online: run the CPU_ONLINE notifiers. */
+	cpu_notify(CPU_ONLINE | mod, hcpu);
+
+out_notify:
+	if (ret != 0)	/* undo for the notifiers that saw CPU_UP_PREPARE */
+		__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
+	cpu_hotplug_done();
+
+	return ret;
+}
+/* Bring cpu online; -EINVAL if not possible, -EBUSY if hotplug is disabled. */
+int __cpuinit cpu_up(unsigned int cpu)
+{
+	int err = 0;
+
+#ifdef	CONFIG_MEMORY_HOTPLUG
+	int nid;
+	pg_data_t	*pgdat;
+#endif
+
+	if (!cpu_possible(cpu)) {
+		printk(KERN_ERR "can't online cpu %d because it is not "
+			"configured as may-hotadd at boot time\n", cpu);
+#if defined(CONFIG_IA64)
+		printk(KERN_ERR "please check additional_cpus= boot "
+				"parameter\n");
+#endif
+		return -EINVAL;
+	}
+
+#ifdef	CONFIG_MEMORY_HOTPLUG
+	nid = cpu_to_node(cpu);
+	if (!node_online(nid)) {	/* bring the CPU's memory node online first */
+		err = mem_online_node(nid);
+		if (err)
+			return err;
+	}
+
+	pgdat = NODE_DATA(nid);
+	if (!pgdat) {
+		printk(KERN_ERR
+			"Can't online cpu %d due to NULL pgdat\n", cpu);
+		return -ENOMEM;
+	}
+
+	if (pgdat->node_zonelists->_zonerefs->zone == NULL) {	/* zonelists not built yet */
+		mutex_lock(&zonelists_mutex);
+		build_all_zonelists(NULL);
+		mutex_unlock(&zonelists_mutex);
+	}
+#endif
+
+	cpu_maps_update_begin();
+
+	if (cpu_hotplug_disabled) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	err = _cpu_up(cpu, 0);
+
+out:
+	cpu_maps_update_done();
+	return err;
+}
+
+#ifdef CONFIG_PM_SLEEP_SMP
+static cpumask_var_t frozen_cpus;	/* CPUs taken down by disable_nonboot_cpus() */
+
+void __weak arch_disable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_disable_nonboot_cpus_end(void)
+{
+}
+/* Take down every online CPU except the first; they are recorded in frozen_cpus. */
+int disable_nonboot_cpus(void)
+{
+	int cpu, first_cpu, error = 0;
+
+	cpu_maps_update_begin();
+	first_cpu = cpumask_first(cpu_online_mask);
+	/*
+	 *
We take down all of the non-boot CPUs in one shot to avoid races
+	 * with the userspace trying to use the CPU hotplug at the same time
+	 */
+	cpumask_clear(frozen_cpus);
+	arch_disable_nonboot_cpus_begin();
+
+	printk("Disabling non-boot CPUs ...\n");
+	for_each_online_cpu(cpu) {
+		if (cpu == first_cpu)
+			continue;
+		error = _cpu_down(cpu, 1);	/* 1: tasks_frozen */
+		if (!error)
+			cpumask_set_cpu(cpu, frozen_cpus);
+		else {
+			printk(KERN_ERR "Error taking CPU%d down: %d\n",
+				cpu, error);
+			break;
+		}
+	}
+
+	arch_disable_nonboot_cpus_end();
+
+	if (!error) {
+		BUG_ON(num_online_cpus() > 1);
+		/* Make sure the CPUs won't be enabled by someone else */
+		cpu_hotplug_disabled = 1;
+	} else {
+		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
+	}
+	cpu_maps_update_done();
+	return error;
+}
+
+void __weak arch_enable_nonboot_cpus_begin(void)
+{
+}
+
+void __weak arch_enable_nonboot_cpus_end(void)
+{
+}
+/* Bring every CPU in frozen_cpus back up and re-enable CPU hotplug. */
+void __ref enable_nonboot_cpus(void)
+{
+	int cpu, error;
+
+	/* Allow everyone to use the CPU hotplug again */
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 0;
+	if (cpumask_empty(frozen_cpus))
+		goto out;
+
+	printk(KERN_INFO "Enabling non-boot CPUs ...\n");
+
+	arch_enable_nonboot_cpus_begin();
+
+	for_each_cpu(cpu, frozen_cpus) {
+		error = _cpu_up(cpu, 1);	/* 1: tasks_frozen */
+		if (!error) {
+			printk(KERN_INFO "CPU%d is up\n", cpu);
+			continue;
+		}
+		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);	/* keep going with the rest */
+	}
+
+	arch_enable_nonboot_cpus_end();
+
+	cpumask_clear(frozen_cpus);
+out:
+	cpu_maps_update_done();
+}
+/* Initcall: allocate the zero-initialised frozen_cpus mask. */
+static int alloc_frozen_cpus(void)
+{
+	if (!alloc_cpumask_var(&frozen_cpus, GFP_KERNEL|__GFP_ZERO))
+		return -ENOMEM;
+	return 0;
+}
+core_initcall(alloc_frozen_cpus);
+
+/*
+ * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
+ * hotplug when tasks are about to be frozen. Also, don't allow the freezer
+ * to continue until any currently running CPU hotplug operation gets
+ * completed.
+ * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
+ * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
+ * CPU hotplug path and released only after it is complete. Thus, we
+ * (and hence the freezer) will block here until any currently running CPU
+ * hotplug operation gets completed.
+ */
+void cpu_hotplug_disable_before_freeze(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 1;	/* cpu_up()/cpu_down() now return -EBUSY */
+	cpu_maps_update_done();
+}
+
+
+/*
+ * When tasks have been thawed, re-enable regular CPU hotplug (which had been
+ * disabled while beginning to freeze tasks).
+ */
+void cpu_hotplug_enable_after_thaw(void)
+{
+	cpu_maps_update_begin();
+	cpu_hotplug_disabled = 0;
+	cpu_maps_update_done();
+}
+
+/*
+ * When callbacks for CPU hotplug notifications are being executed, we must
+ * ensure that the state of the system with respect to the tasks being frozen
+ * or not, as reported by the notification, remains unchanged *throughout the
+ * duration* of the execution of the callbacks.
+ * Hence we need to prevent the freezer from racing with regular CPU hotplug.
+ *
+ * This synchronization is implemented by mutually excluding regular CPU
+ * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
+ * Hibernate notifications.
+ */ +static int +cpu_hotplug_pm_callback(struct notifier_block *nb, + unsigned long action, void *ptr) +{ + switch (action) { + + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + cpu_hotplug_disable_before_freeze(); + break; + + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + cpu_hotplug_enable_after_thaw(); + break; + + default: + return NOTIFY_DONE; + } + + return NOTIFY_OK; +} + + +int cpu_hotplug_pm_sync_init(void) +{ + pm_notifier(cpu_hotplug_pm_callback, 0); + return 0; +} +core_initcall(cpu_hotplug_pm_sync_init); + +#endif /* CONFIG_PM_SLEEP_SMP */ + +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). + */ +void __cpuinit notify_cpu_starting(unsigned int cpu) +{ + unsigned long val = CPU_STARTING; + +#ifdef CONFIG_PM_SLEEP_SMP + if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) + val = CPU_STARTING_FROZEN; +#endif /* CONFIG_PM_SLEEP_SMP */ + cpu_notify(val, (void *)(long)cpu); +} + +#endif /* CONFIG_SMP */ + +/* + * cpu_bit_bitmap[] is a special, "compressed" data structure that + * represents all NR_CPUS bits binary values of 1< 32 + MASK_DECLARE_8(32), MASK_DECLARE_8(40), + MASK_DECLARE_8(48), MASK_DECLARE_8(56), +#endif +}; +EXPORT_SYMBOL_GPL(cpu_bit_bitmap); + +const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; +EXPORT_SYMBOL(cpu_all_bits); + +#ifdef CONFIG_INIT_ALL_POSSIBLE +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly + = CPU_BITS_ALL; +#else +static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +#endif +const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); +EXPORT_SYMBOL(cpu_possible_mask); + +static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; +const struct cpumask 
*const cpu_online_mask = to_cpumask(cpu_online_bits);
+EXPORT_SYMBOL(cpu_online_mask);
+
+static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits);
+EXPORT_SYMBOL(cpu_present_mask);
+
+static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly;
+const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits);
+EXPORT_SYMBOL(cpu_active_mask);
+/* Set or clear cpu in the writable bitmap behind cpu_possible_mask. */
+void set_cpu_possible(unsigned int cpu, bool possible)
+{
+	if (possible)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits));
+}
+/* Set or clear cpu in the writable bitmap behind cpu_present_mask. */
+void set_cpu_present(unsigned int cpu, bool present)
+{
+	if (present)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits));
+}
+/* Set or clear cpu in the writable bitmap behind cpu_online_mask. */
+void set_cpu_online(unsigned int cpu, bool online)
+{
+	if (online)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits));
+}
+/* Set or clear cpu in the writable bitmap behind cpu_active_mask. */
+void set_cpu_active(unsigned int cpu, bool active)
+{
+	if (active)
+		cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits));
+	else
+		cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits));
+}
+/* Overwrite the whole present bitmap with a copy of src. */
+void init_cpu_present(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_present_bits), src);
+}
+/* Overwrite the whole possible bitmap with a copy of src. */
+void init_cpu_possible(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_possible_bits), src);
+}
+/* Overwrite the whole online bitmap with a copy of src. */
+void init_cpu_online(const struct cpumask *src)
+{
+	cpumask_copy(to_cpumask(cpu_online_bits), src);
+}
+/* Atomic notifier chain run by idle_notifier_call_chain(). */
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+/* Add n to the idle notifier chain. */
+void idle_notifier_register(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+/* Remove n from the idle notifier chain. */
+void idle_notifier_unregister(struct notifier_block *n)
+{
+	atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+/* Run every registered idle notifier with event val and a NULL payload. */
+void idle_notifier_call_chain(unsigned long val)
+{
+
atomic_notifier_call_chain(&idle_notifier, val, NULL); +} +EXPORT_SYMBOL_GPL(idle_notifier_call_chain); diff --git a/kernel/cpuset.c b/kernel/cpuset.c new file mode 100644 index 00000000..9c9b7545 --- /dev/null +++ b/kernel/cpuset.c @@ -0,0 +1,2615 @@ +/* + * kernel/cpuset.c + * + * Processor and Memory placement constraints for sets of tasks. + * + * Copyright (C) 2003 BULL SA. + * Copyright (C) 2004-2007 Silicon Graphics, Inc. + * Copyright (C) 2006 Google, Inc + * + * Portions derived from Patrick Mochel's sysfs code. + * sysfs is Copyright (c) 2001-3 Patrick Mochel + * + * 2003-10-10 Written by Simon Derr. + * 2003-10-22 Updates by Stephen Hemminger. + * 2004 May-July Rework by Paul Jackson. + * 2006 Rework by Paul Menage to use generic cgroups + * 2008 Rework of the scheduler domains and CPU hotplug handling + * by Max Krasnyansky + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file COPYING in the main directory of the Linux + * distribution for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Workqueue for cpuset related tasks. + * + * Using kevent workqueue may cause deadlock when memory_migrate + * is set. So we create a separate workqueue thread for cpuset. + */ +static struct workqueue_struct *cpuset_wq; + +/* + * Tracks how many cpusets are currently defined in system. + * When there is only one cpuset (the root cpuset) we can + * short circuit some hooks. + */ +int number_of_cpusets __read_mostly; + +/* Forward declare cgroup structures */ +struct cgroup_subsys cpuset_subsys; +struct cpuset; + +/* See "Frequency meter" comments, below. 
*/ + +struct fmeter { + int cnt; /* unprocessed events count */ + int val; /* most recent output value */ + time_t time; /* clock (secs) when val computed */ + spinlock_t lock; /* guards read or write of above */ +}; + +struct cpuset { + struct cgroup_subsys_state css; + + unsigned long flags; /* "unsigned long" so bitops work */ + cpumask_var_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ + nodemask_t mems_allowed; /* Memory Nodes allowed to tasks */ + + struct cpuset *parent; /* my parent */ + + struct fmeter fmeter; /* memory_pressure filter */ + + /* partition number for rebuild_sched_domains() */ + int pn; + + /* for custom sched domain */ + int relax_domain_level; + + /* used for walking a cpuset hierarchy */ + struct list_head stack_list; +}; + +/* Retrieve the cpuset for a cgroup */ +static inline struct cpuset *cgroup_cs(struct cgroup *cont) +{ + return container_of(cgroup_subsys_state(cont, cpuset_subsys_id), + struct cpuset, css); +} + +/* Retrieve the cpuset for a task */ +static inline struct cpuset *task_cs(struct task_struct *task) +{ + return container_of(task_subsys_state(task, cpuset_subsys_id), + struct cpuset, css); +} + +/* bits in struct cpuset flags field */ +typedef enum { + CS_CPU_EXCLUSIVE, + CS_MEM_EXCLUSIVE, + CS_MEM_HARDWALL, + CS_MEMORY_MIGRATE, + CS_SCHED_LOAD_BALANCE, + CS_SPREAD_PAGE, + CS_SPREAD_SLAB, +} cpuset_flagbits_t; + +/* convenient tests for these bits */ +static inline int is_cpu_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_CPU_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_exclusive(const struct cpuset *cs) +{ + return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); +} + +static inline int is_mem_hardwall(const struct cpuset *cs) +{ + return test_bit(CS_MEM_HARDWALL, &cs->flags); +} + +static inline int is_sched_load_balance(const struct cpuset *cs) +{ + return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); +} + +static inline int is_memory_migrate(const struct cpuset *cs) +{ + return 
test_bit(CS_MEMORY_MIGRATE, &cs->flags); +} + +static inline int is_spread_page(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_PAGE, &cs->flags); +} + +static inline int is_spread_slab(const struct cpuset *cs) +{ + return test_bit(CS_SPREAD_SLAB, &cs->flags); +} + +static struct cpuset top_cpuset = { + .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)), +}; + +/* + * There are two global mutexes guarding cpuset structures. The first + * is the main control groups cgroup_mutex, accessed via + * cgroup_lock()/cgroup_unlock(). The second is the cpuset-specific + * callback_mutex, below. They can nest. It is ok to first take + * cgroup_mutex, then nest callback_mutex. We also require taking + * task_lock() when dereferencing a task's cpuset pointer. See "The + * task_lock() exception", at the end of this comment. + * + * A task must hold both mutexes to modify cpusets. If a task + * holds cgroup_mutex, then it blocks others wanting that mutex, + * ensuring that it is the only task able to also acquire callback_mutex + * and be able to modify cpusets. It can perform various checks on + * the cpuset structure first, knowing nothing will change. It can + * also allocate memory while just holding cgroup_mutex. While it is + * performing these checks, various callback routines can briefly + * acquire callback_mutex to query cpusets. Once it is ready to make + * the changes, it takes callback_mutex, blocking everyone else. + * + * Calls to the kernel memory allocator can not be made while holding + * callback_mutex, as that would risk double tripping on callback_mutex + * from one of the callbacks into the cpuset code from within + * __alloc_pages(). + * + * If a task is only holding callback_mutex, then it has read-only + * access to cpusets. + * + * Now, the task_struct fields mems_allowed and mempolicy may be changed + * by other task, we use alloc_lock in the task_struct fields to protect + * them. 
+ * + * The cpuset_common_file_read() handlers only hold callback_mutex across + * small pieces of code, such as when reading out possibly multi-word + * cpumasks and nodemasks. + * + * Accessing a task's cpuset should be done in accordance with the + * guidelines for accessing subsystem state in kernel/cgroup.c + */ + +static DEFINE_MUTEX(callback_mutex); + +/* + * cpuset_buffer_lock protects both the cpuset_name and cpuset_nodelist + * buffers. They are statically allocated to prevent using excess stack + * when calling cpuset_print_task_mems_allowed(). + */ +#define CPUSET_NAME_LEN (128) +#define CPUSET_NODELIST_LEN (256) +static char cpuset_name[CPUSET_NAME_LEN]; +static char cpuset_nodelist[CPUSET_NODELIST_LEN]; +static DEFINE_SPINLOCK(cpuset_buffer_lock); + +/* + * This is ugly, but preserves the userspace API for existing cpuset + * users. If someone tries to mount the "cpuset" filesystem, we + * silently switch it to mount "cgroup" instead + */ +static struct dentry *cpuset_mount(struct file_system_type *fs_type, + int flags, const char *unused_dev_name, void *data) +{ + struct file_system_type *cgroup_fs = get_fs_type("cgroup"); + struct dentry *ret = ERR_PTR(-ENODEV); + if (cgroup_fs) { + char mountopts[] = + "cpuset,noprefix," + "release_agent=/sbin/cpuset_release_agent"; + ret = cgroup_fs->mount(cgroup_fs, flags, + unused_dev_name, mountopts); + put_filesystem(cgroup_fs); + } + return ret; +} + +static struct file_system_type cpuset_fs_type = { + .name = "cpuset", + .mount = cpuset_mount, +}; + +/* + * Return in pmask the portion of a cpusets's cpus_allowed that + * are online. If none are online, walk up the cpuset hierarchy + * until we find one that does have some online cpus. If we get + * all the way to the top and still haven't found any online cpus, + * return cpu_online_map. Or if passed a NULL cs from an exit'ing + * task, return cpu_online_map. + * + * One way or another, we guarantee to return some non-empty subset + * of cpu_online_map. 
+ * + * Call with callback_mutex held. + */ + +static void guarantee_online_cpus(const struct cpuset *cs, + struct cpumask *pmask) +{ + while (cs && !cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) + cs = cs->parent; + if (cs) + cpumask_and(pmask, cs->cpus_allowed, cpu_online_mask); + else + cpumask_copy(pmask, cpu_online_mask); + BUG_ON(!cpumask_intersects(pmask, cpu_online_mask)); +} + +/* + * Return in *pmask the portion of a cpusets's mems_allowed that + * are online, with memory. If none are online with memory, walk + * up the cpuset hierarchy until we find one that does have some + * online mems. If we get all the way to the top and still haven't + * found any online mems, return node_states[N_HIGH_MEMORY]. + * + * One way or another, we guarantee to return some non-empty subset + * of node_states[N_HIGH_MEMORY]. + * + * Call with callback_mutex held. + */ + +static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) +{ + while (cs && !nodes_intersects(cs->mems_allowed, + node_states[N_HIGH_MEMORY])) + cs = cs->parent; + if (cs) + nodes_and(*pmask, cs->mems_allowed, + node_states[N_HIGH_MEMORY]); + else + *pmask = node_states[N_HIGH_MEMORY]; + BUG_ON(!nodes_intersects(*pmask, node_states[N_HIGH_MEMORY])); +} + +/* + * update task's spread flag if cpuset's page/slab spread flag is set + * + * Called with callback_mutex/cgroup_mutex held + */ +static void cpuset_update_task_spread_flag(struct cpuset *cs, + struct task_struct *tsk) +{ + if (is_spread_page(cs)) + tsk->flags |= PF_SPREAD_PAGE; + else + tsk->flags &= ~PF_SPREAD_PAGE; + if (is_spread_slab(cs)) + tsk->flags |= PF_SPREAD_SLAB; + else + tsk->flags &= ~PF_SPREAD_SLAB; +} + +/* + * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q? + * + * One cpuset is a subset of another if all its allowed CPUs and + * Memory Nodes are a subset of the other, and its exclusive flags + * are only set if the other's are set. Call holding cgroup_mutex. 
+ */ + +static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) +{ + return cpumask_subset(p->cpus_allowed, q->cpus_allowed) && + nodes_subset(p->mems_allowed, q->mems_allowed) && + is_cpu_exclusive(p) <= is_cpu_exclusive(q) && + is_mem_exclusive(p) <= is_mem_exclusive(q); +} + +/** + * alloc_trial_cpuset - allocate a trial cpuset + * @cs: the cpuset that the trial cpuset duplicates + */ +static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +{ + struct cpuset *trial; + + trial = kmemdup(cs, sizeof(*cs), GFP_KERNEL); + if (!trial) + return NULL; + + if (!alloc_cpumask_var(&trial->cpus_allowed, GFP_KERNEL)) { + kfree(trial); + return NULL; + } + cpumask_copy(trial->cpus_allowed, cs->cpus_allowed); + + return trial; +} + +/** + * free_trial_cpuset - free the trial cpuset + * @trial: the trial cpuset to be freed + */ +static void free_trial_cpuset(struct cpuset *trial) +{ + free_cpumask_var(trial->cpus_allowed); + kfree(trial); +} + +/* + * validate_change() - Used to validate that any proposed cpuset change + * follows the structural rules for cpusets. + * + * If we replaced the flag and mask values of the current cpuset + * (cur) with those values in the trial cpuset (trial), would + * our various subset and exclusive rules still be valid? Presumes + * cgroup_mutex held. + * + * 'cur' is the address of an actual, in-use cpuset. Operations + * such as list traversal that depend on the actual address of the + * cpuset in the list must use cur below, not trial. + * + * 'trial' is the address of bulk structure copy of cur, with + * perhaps one or more of the fields cpus_allowed, mems_allowed, + * or flags changed to new, trial values. + * + * Return 0 if valid, -errno if not. 
+ */ + +static int validate_change(const struct cpuset *cur, const struct cpuset *trial) +{ + struct cgroup *cont; + struct cpuset *c, *par; + + /* Each of our child cpusets must be a subset of us */ + list_for_each_entry(cont, &cur->css.cgroup->children, sibling) { + if (!is_cpuset_subset(cgroup_cs(cont), trial)) + return -EBUSY; + } + + /* Remaining checks don't apply to root cpuset */ + if (cur == &top_cpuset) + return 0; + + par = cur->parent; + + /* We must be a subset of our parent cpuset */ + if (!is_cpuset_subset(trial, par)) + return -EACCES; + + /* + * If either I or some sibling (!= me) is exclusive, we can't + * overlap + */ + list_for_each_entry(cont, &par->css.cgroup->children, sibling) { + c = cgroup_cs(cont); + if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && + c != cur && + cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) + return -EINVAL; + if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) && + c != cur && + nodes_intersects(trial->mems_allowed, c->mems_allowed)) + return -EINVAL; + } + + /* Cpusets with tasks can't have empty cpus_allowed or mems_allowed */ + if (cgroup_task_count(cur->css.cgroup)) { + if (cpumask_empty(trial->cpus_allowed) || + nodes_empty(trial->mems_allowed)) { + return -ENOSPC; + } + } + + return 0; +} + +#ifdef CONFIG_SMP +/* + * Helper routine for generate_sched_domains(). + * Do cpusets a, b have overlapping cpus_allowed masks? 
+ */ +static int cpusets_overlap(struct cpuset *a, struct cpuset *b) +{ + return cpumask_intersects(a->cpus_allowed, b->cpus_allowed); +} + +static void +update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) +{ + if (dattr->relax_domain_level < c->relax_domain_level) + dattr->relax_domain_level = c->relax_domain_level; + return; +} + +static void +update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) +{ + LIST_HEAD(q); + + list_add(&c->stack_list, &q); + while (!list_empty(&q)) { + struct cpuset *cp; + struct cgroup *cont; + struct cpuset *child; + + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + + if (cpumask_empty(cp->cpus_allowed)) + continue; + + if (is_sched_load_balance(cp)) + update_domain_attr(dattr, cp); + + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &q); + } + } +} + +/* + * generate_sched_domains() + * + * This function builds a partial partition of the systems CPUs + * A 'partial partition' is a set of non-overlapping subsets whose + * union is a subset of that set. + * The output of this function needs to be passed to kernel/sched.c + * partition_sched_domains() routine, which will rebuild the scheduler's + * load balancing domains (sched domains) as specified by that partial + * partition. + * + * See "What is sched_load_balance" in Documentation/cgroups/cpusets.txt + * for a background explanation of this. + * + * Does not return errors, on the theory that the callers of this + * routine would rather not worry about failures to rebuild sched + * domains when operating in the severe memory shortage situations + * that could cause allocation failures below. + * + * Must be called with cgroup_lock held. + * + * The three key local variables below are: + * q - a linked-list queue of cpuset pointers, used to implement a + * top-down scan of all cpusets. 
This scan loads a pointer + * to each cpuset marked is_sched_load_balance into the + * array 'csa'. For our purposes, rebuilding the scheduler's + * sched domains, we can ignore !is_sched_load_balance cpusets. + * csa - (for CpuSet Array) Array of pointers to all the cpusets + * that need to be load balanced, for convenient iterative + * access by the subsequent code that finds the best partition, + * i.e. the set of domains (subsets) of CPUs such that the + * cpus_allowed of every cpuset marked is_sched_load_balance + * is a subset of one of these domains, while there are as + * many such domains as possible, each as small as possible. + * doms - Conversion of 'csa' to an array of cpumasks, for passing to + * the kernel/sched.c routine partition_sched_domains() in a + * convenient format, that can be easily compared to the prior + * value to determine what partition elements (sched domains) + * were changed (added or removed.) + * + * Finding the best partition (set of domains): + * The triple nested loops below over i, j, k scan over the + * load balanced cpusets (using the array of cpuset pointers in + * csa[]) looking for pairs of cpusets that have overlapping + * cpus_allowed, but which don't have the same 'pn' partition + * number and gives them the same partition number. It keeps + * looping on the 'restart' label until it can no longer find + * any such pairs. + * + * The union of the cpus_allowed masks from the set of + * all cpusets having the same 'pn' value then forms one + * element of the partition (one sched domain) to be passed to + * partition_sched_domains().
+ */ +static int generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) +{ + LIST_HEAD(q); /* queue of cpusets to be scanned */ + struct cpuset *cp; /* scans q */ + struct cpuset **csa; /* array of all cpuset ptrs */ + int csn; /* how many cpuset ptrs in csa so far */ + int i, j, k; /* indices for partition finding loops */ + cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + struct sched_domain_attr *dattr; /* attributes for custom domains */ + int ndoms = 0; /* number of sched domains in result */ + int nslot; /* next empty doms[] struct cpumask slot */ + + doms = NULL; + dattr = NULL; + csa = NULL; + + /* Special case for the 99% of systems with one, full, sched domain */ + if (is_sched_load_balance(&top_cpuset)) { + ndoms = 1; + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); + if (dattr) { + *dattr = SD_ATTR_INIT; + update_domain_attr_tree(dattr, &top_cpuset); + } + cpumask_copy(doms[0], top_cpuset.cpus_allowed); + + goto done; + } + + csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); + if (!csa) + goto done; + csn = 0; + + list_add(&top_cpuset.stack_list, &q); + while (!list_empty(&q)) { + struct cgroup *cont; + struct cpuset *child; /* scans child cpusets of cp */ + + cp = list_first_entry(&q, struct cpuset, stack_list); + list_del(q.next); + + if (cpumask_empty(cp->cpus_allowed)) + continue; + + /* + * All child cpusets contain a subset of the parent's cpus, so + * just skip them, and then we call update_domain_attr_tree() + * to calc relax_domain_level of the corresponding sched + * domain. 
+ */ + if (is_sched_load_balance(cp)) { + csa[csn++] = cp; + continue; + } + + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &q); + } + } + + for (i = 0; i < csn; i++) + csa[i]->pn = i; + ndoms = csn; + +restart: + /* Find the best partition (set of sched domains) */ + for (i = 0; i < csn; i++) { + struct cpuset *a = csa[i]; + int apn = a->pn; + + for (j = 0; j < csn; j++) { + struct cpuset *b = csa[j]; + int bpn = b->pn; + + if (apn != bpn && cpusets_overlap(a, b)) { + for (k = 0; k < csn; k++) { + struct cpuset *c = csa[k]; + + if (c->pn == bpn) + c->pn = apn; + } + ndoms--; /* one less element */ + goto restart; + } + } + } + + /* + * Now we know how many domains to create. + * Convert to and populate cpu masks. + */ + doms = alloc_sched_domains(ndoms); + if (!doms) + goto done; + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ + dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); + + for (nslot = 0, i = 0; i < csn; i++) { + struct cpuset *a = csa[i]; + struct cpumask *dp; + int apn = a->pn; + + if (apn < 0) { + /* Skip completed partitions */ + continue; + } + + dp = doms[nslot]; + + if (nslot == ndoms) { + static int warnings = 10; + if (warnings) { + printk(KERN_WARNING + "rebuild_sched_domains confused:" + " nslot %d, ndoms %d, csn %d, i %d," + " apn %d\n", + nslot, ndoms, csn, i, apn); + warnings--; + } + continue; + } + + cpumask_clear(dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + for (j = i; j < csn; j++) { + struct cpuset *b = csa[j]; + + if (apn == b->pn) { + cpumask_or(dp, dp, b->cpus_allowed); + if (dattr) + update_domain_attr_tree(dattr + nslot, b); + + /* Done with this partition */ + b->pn = -1; + } + } + nslot++; + } + BUG_ON(nslot != ndoms); + +done: + kfree(csa); + + /* + * Fallback to the default domain if kmalloc() failed. 
+ * See comments in partition_sched_domains(). + */ + if (doms == NULL) + ndoms = 1; + + *domains = doms; + *attributes = dattr; + return ndoms; +} + +/* + * Rebuild scheduler domains. + * + * Call with neither cgroup_mutex held nor within get_online_cpus(). + * Takes both cgroup_mutex and get_online_cpus(). + * + * Cannot be directly called from cpuset code handling changes + * to the cpuset pseudo-filesystem, because it cannot be called + * from code that already holds cgroup_mutex. + */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ + struct sched_domain_attr *attr; + cpumask_var_t *doms; + int ndoms; + + get_online_cpus(); + + /* Generate domain masks and attrs */ + cgroup_lock(); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + + put_online_cpus(); +} +#else /* !CONFIG_SMP */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ +} + +static int generate_sched_domains(cpumask_var_t **domains, + struct sched_domain_attr **attributes) +{ + *domains = NULL; + return 1; +} +#endif /* CONFIG_SMP */ + +static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); + +/* + * Rebuild scheduler domains, asynchronously via workqueue. + * + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * + * The rebuild_sched_domains() and partition_sched_domains() + * routines must nest cgroup_lock() inside get_online_cpus(), + * but such cpuset changes as these must nest that locking the + * other way, holding cgroup_lock() for much of the code. 
+ * + * So in order to avoid an ABBA deadlock, the cpuset code handling + * these user changes delegates the actual sched domain rebuilding + * to a separate workqueue thread, which ends up processing the + * above do_rebuild_sched_domains() function. + */ +static void async_rebuild_sched_domains(void) +{ + queue_work(cpuset_wq, &rebuild_sched_domains_work); +} + +/* + * Accomplishes the same scheduler domain rebuild as the above + * async_rebuild_sched_domains(), however it directly calls the + * rebuild routine synchronously rather than calling it via an + * asynchronous work thread. + * + * This can only be called from code that is not holding + * cgroup_mutex (not nested in a cgroup_lock() call.) + */ +void rebuild_sched_domains(void) +{ + do_rebuild_sched_domains(NULL); +} + +/** + * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's + * @tsk: task to test + * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner + * + * Call with cgroup_mutex held. May take callback_mutex during call. + * Called for each task in a cgroup by cgroup_scan_tasks(). + * Return nonzero if this tasks's cpus_allowed mask should be changed (in other + * words, if its mask is not equal to its cpuset's mask). + */ +static int cpuset_test_cpumask(struct task_struct *tsk, + struct cgroup_scanner *scan) +{ + return !cpumask_equal(&tsk->cpus_allowed, + (cgroup_cs(scan->cg))->cpus_allowed); +} + +/** + * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's + * @tsk: task to test + * @scan: struct cgroup_scanner containing the cgroup of the task + * + * Called by cgroup_scan_tasks() for each task in a cgroup whose + * cpus_allowed mask needs to be changed. + * + * We don't need to re-check for the cgroup/cpuset membership, since we're + * holding cgroup_lock() at this point. 
+ */ +static void cpuset_change_cpumask(struct task_struct *tsk, + struct cgroup_scanner *scan) +{ + set_cpus_allowed_ptr(tsk, ((cgroup_cs(scan->cg))->cpus_allowed)); +} + +/** + * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. + * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * + * Called with cgroup_mutex held + * + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * calling callback functions for each. + * + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. + */ +static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) +{ + struct cgroup_scanner scan; + + scan.cg = cs->css.cgroup; + scan.test_task = cpuset_test_cpumask; + scan.process_task = cpuset_change_cpumask; + scan.heap = heap; + cgroup_scan_tasks(&scan); +} + +/** + * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it + * @cs: the cpuset to consider + * @buf: buffer of cpu numbers written to this cpuset + */ +static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, + const char *buf) +{ + struct ptr_heap heap; + int retval; + int is_load_balanced; + + /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ + if (cs == &top_cpuset) + return -EACCES; + + /* + * An empty cpus_allowed is ok only if the cpuset has no tasks. + * Since cpulist_parse() fails on an empty mask, we special case + * that parsing. The validate_change() call ensures that cpusets + * with tasks have cpus. 
+ */ + if (!*buf) { + cpumask_clear(trialcs->cpus_allowed); + } else { + retval = cpulist_parse(buf, trialcs->cpus_allowed); + if (retval < 0) + return retval; + + if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask)) + return -EINVAL; + } + retval = validate_change(cs, trialcs); + if (retval < 0) + return retval; + + /* Nothing to do if the cpus didn't change */ + if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed)) + return 0; + + retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); + if (retval) + return retval; + + is_load_balanced = is_sched_load_balance(trialcs); + + mutex_lock(&callback_mutex); + cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); + mutex_unlock(&callback_mutex); + + /* + * Scan tasks in the cpuset, and update the cpumasks of any + * that need an update. + */ + update_tasks_cpumask(cs, &heap); + + heap_free(&heap); + + if (is_load_balanced) + async_rebuild_sched_domains(); + return 0; +} + +/* + * cpuset_migrate_mm + * + * Migrate memory region from one set of nodes to another. + * + * Temporarilly set tasks mems_allowed to target nodes of migration, + * so that the migration code can allocate pages on these nodes. + * + * Call holding cgroup_mutex, so current's cpuset won't change + * during this call, as manage_mutex holds off any cpuset_attach() + * calls. Therefore we don't need to take task_lock around the + * call to guarantee_online_mems(), as we know no one is changing + * our task's cpuset. + * + * While the mm_struct we are migrating is typically from some + * other task, the task_struct mems_allowed that we are hacking + * is for our current task, which must allocate new pages for that + * migrating memory region. 
+ */ + +static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, + const nodemask_t *to) +{ + struct task_struct *tsk = current; + + tsk->mems_allowed = *to; + + do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + + guarantee_online_mems(task_cs(tsk),&tsk->mems_allowed); +} + +/* + * cpuset_change_task_nodemask - change task's mems_allowed and mempolicy + * @tsk: the task to change + * @newmems: new nodes that the task will be set + * + * In order to avoid seeing no nodes if the old and new nodes are disjoint, + * we structure updates as setting all new allowed nodes, then clearing newly + * disallowed ones. + */ +static void cpuset_change_task_nodemask(struct task_struct *tsk, + nodemask_t *newmems) +{ +repeat: + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return; + if (current->flags & PF_EXITING) /* Let dying task have memory */ + return; + + task_lock(tsk); + nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); + + + /* + * ensure checking ->mems_allowed_change_disable after setting all new + * allowed nodes. + * + * the read-side task can see an nodemask with new allowed nodes and + * old allowed nodes. and if it allocates page when cpuset clears newly + * disallowed ones continuous, it can see the new allowed bits. + * + * And if setting all new allowed nodes is after the checking, setting + * all new allowed nodes and clearing newly disallowed ones will be done + * continuous, and the read-side task may find no node to alloc page. + */ + smp_mb(); + + /* + * Allocation of memory is very fast, we needn't sleep when waiting + * for the read-side. 
+ */ + while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { + task_unlock(tsk); + if (!task_curr(tsk)) + yield(); + goto repeat; + } + + /* + * ensure checking ->mems_allowed_change_disable before clearing all new + * disallowed nodes. + * + * if clearing newly disallowed bits before the checking, the read-side + * task may find no node to alloc page. + */ + smp_mb(); + + mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2); + tsk->mems_allowed = *newmems; + task_unlock(tsk); +} + +/* + * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy + * of it to cpuset's new mems_allowed, and migrate pages to new nodes if + * memory_migrate flag is set. Called with cgroup_mutex held. + */ +static void cpuset_change_nodemask(struct task_struct *p, + struct cgroup_scanner *scan) +{ + struct mm_struct *mm; + struct cpuset *cs; + int migrate; + const nodemask_t *oldmem = scan->data; + static nodemask_t newmems; /* protected by cgroup_mutex */ + + cs = cgroup_cs(scan->cg); + guarantee_online_mems(cs, &newmems); + + cpuset_change_task_nodemask(p, &newmems); + + mm = get_task_mm(p); + if (!mm) + return; + + migrate = is_memory_migrate(cs); + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) + cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); + mmput(mm); +} + +static void *cpuset_being_rebound; + +/** + * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. + * @cs: the cpuset in which each task's mems_allowed mask needs to be changed + * @oldmem: old mems_allowed of cpuset cs + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * + * Called with cgroup_mutex held + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. 
+ */ +static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, + struct ptr_heap *heap) +{ + struct cgroup_scanner scan; + + cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ + + scan.cg = cs->css.cgroup; + scan.test_task = NULL; + scan.process_task = cpuset_change_nodemask; + scan.heap = heap; + scan.data = (nodemask_t *)oldmem; + + /* + * The mpol_rebind_mm() call takes mmap_sem, which we couldn't + * take while holding tasklist_lock. Forks can happen - the + * mpol_dup() cpuset_being_rebound check will catch such forks, + * and rebind their vma mempolicies too. Because we still hold + * the global cgroup_mutex, we know that no other rebind effort + * will be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. Also migrate pages in each mm to new nodes. + */ + cgroup_scan_tasks(&scan); + + /* We're done rebinding vmas to this cpuset's new mems_allowed. */ + cpuset_being_rebound = NULL; +} + +/* + * Handle user request to change the 'mems' memory placement + * of a cpuset. Needs to validate the request, update the + * cpusets mems_allowed, and for each task in the cpuset, + * update mems_allowed and rebind task's mempolicy and any vma + * mempolicies and if the cpuset is marked 'memory_migrate', + * migrate the tasks pages to the new memory. + * + * Call with cgroup_mutex held. May take callback_mutex during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. 
+ */
+static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
+			   const char *buf)
+{
+	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);	/* holds the pre-change mask */
+	int retval;
+	struct ptr_heap heap;
+
+	if (!oldmem)
+		return -ENOMEM;	/* oldmem was not allocated: nothing to free */
+
+	/*
+	 * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
+	 * it's read-only
+	 */
+	if (cs == &top_cpuset) {
+		retval = -EACCES;
+		goto done;
+	}
+
+	/*
+	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+	 * Since nodelist_parse() fails on an empty mask, we special case
+	 * that parsing.  The validate_change() call ensures that cpusets
+	 * with tasks have memory.
+	 */
+	if (!*buf) {
+		nodes_clear(trialcs->mems_allowed);
+	} else {
+		retval = nodelist_parse(buf, trialcs->mems_allowed);
+		if (retval < 0)
+			goto done;
+
+		/* every requested node must be in node_states[N_HIGH_MEMORY] */
+		if (!nodes_subset(trialcs->mems_allowed,
+				node_states[N_HIGH_MEMORY])) {
+			retval =  -EINVAL;
+			goto done;
+		}
+	}
+	*oldmem = cs->mems_allowed;
+	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
+		retval = 0;		/* Too easy - nothing to do */
+		goto done;
+	}
+	retval = validate_change(cs, trialcs);
+	if (retval < 0)
+		goto done;
+
+	/* set the heap up now so update_tasks_nodemask() gets a non-NULL heap */
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (retval < 0)
+		goto done;
+
+	/* the new mask is published under callback_mutex */
+	mutex_lock(&callback_mutex);
+	cs->mems_allowed = trialcs->mems_allowed;
+	mutex_unlock(&callback_mutex);
+
+	update_tasks_nodemask(cs, oldmem, &heap);
+
+	heap_free(&heap);
+done:	/* common exit for every path after oldmem was allocated */
+	NODEMASK_FREE(oldmem);
+	return retval;
+}
+/*
+ * current_cpuset_is_being_rebound - is the caller's cpuset mid-rebind?
+ *
+ * Returns nonzero when the current task's cpuset is the one recorded in
+ * cpuset_being_rebound, zero otherwise.
+ */
+int current_cpuset_is_being_rebound(void)
+{
+	return task_cs(current) == cpuset_being_rebound;
+}
+/*
+ * update_relax_domain_level - set a cpuset's relax_domain_level
+ * @cs: the cpuset to update
+ * @val: the requested level
+ *
+ * With CONFIG_SMP, values below -1 or at/above sched_domain_level_max are
+ * rejected with -EINVAL.  If the level actually changes and the cpuset has
+ * cpus and is load balanced, an asynchronous sched domain rebuild is
+ * requested.  Returns 0 on success.
+ */
+static int update_relax_domain_level(struct cpuset *cs, s64 val)
+{
+#ifdef CONFIG_SMP
+	if (val < -1 || val >= sched_domain_level_max)
+		return -EINVAL;
+#endif
+
+	if (val != cs->relax_domain_level) {
+		cs->relax_domain_level = val;
+		if (!cpumask_empty(cs->cpus_allowed) &&
+		    is_sched_load_balance(cs))
+			async_rebuild_sched_domains();
+	}
+
+	return 0;
+}
+
+/*
+ * cpuset_change_flag - make a task's spread flags the same as its cpuset's
+ * @tsk: task to be
updated + * @scan: struct cgroup_scanner containing the cgroup of the task + * + * Called by cgroup_scan_tasks() for each task in a cgroup. + * + * We don't need to re-check for the cgroup/cpuset membership, since we're + * holding cgroup_lock() at this point. + */ +static void cpuset_change_flag(struct task_struct *tsk, + struct cgroup_scanner *scan) +{ + cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); +} + +/* + * update_tasks_flags - update the spread flags of tasks in the cpuset. + * @cs: the cpuset in which each task's spread flags needs to be changed + * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * + * Called with cgroup_mutex held + * + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * calling callback functions for each. + * + * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * if @heap != NULL. + */ +static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) +{ + struct cgroup_scanner scan; + + scan.cg = cs->css.cgroup; + scan.test_task = NULL; + scan.process_task = cpuset_change_flag; + scan.heap = heap; + cgroup_scan_tasks(&scan); +} + +/* + * update_flag - read a 0 or a 1 in a file and update associated flag + * bit: the bit to update (see cpuset_flagbits_t) + * cs: the cpuset to update + * turning_on: whether the flag is being set or cleared + * + * Call with cgroup_mutex held. 
+ */
+
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+		       int turning_on)
+{
+	struct cpuset *trial = alloc_trial_cpuset(cs);
+	struct ptr_heap heap;
+	int rebalance, respread;
+	int ret;
+
+	if (!trial)
+		return -ENOMEM;
+
+	/* Apply the requested change to the scratch copy only. */
+	if (!turning_on)
+		clear_bit(bit, &trial->flags);
+	else
+		set_bit(bit, &trial->flags);
+
+	ret = validate_change(cs, trial);
+	if (ret < 0)
+		goto free_trial;
+
+	ret = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (ret < 0)
+		goto free_trial;
+
+	/* Work out what differs before the live flags are overwritten. */
+	rebalance = is_sched_load_balance(cs) != is_sched_load_balance(trial);
+	respread = is_spread_slab(cs) != is_spread_slab(trial) ||
+		   is_spread_page(cs) != is_spread_page(trial);
+
+	/* The new flag word is published under callback_mutex. */
+	mutex_lock(&callback_mutex);
+	cs->flags = trial->flags;
+	mutex_unlock(&callback_mutex);
+
+	if (rebalance && !cpumask_empty(trial->cpus_allowed))
+		async_rebuild_sched_domains();
+
+	if (respread)
+		update_tasks_flags(cs, &heap);
+	heap_free(&heap);
+free_trial:
+	free_trial_cpuset(trial);
+	return ret;
+}
+
+/*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter.  There are four routines:
+ *   fmeter_init() - initialize a frequency meter.
+ *   fmeter_markevent() - called each time the event happens.
+ *   fmeter_getrate() - returns the recent rate of such events.
+ *   fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR).  The time unit
+ * is 1 second.  Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ * + * With an FM_COEF of 933, and a time base of 1 second, the filter + * has a half-life of 10 seconds, meaning that if the events quit + * happening, then the rate returned from the fmeter_getrate() + * will be cut in half each 10 seconds, until it converges to zero. + * + * It is not worth doing a real infinitely recursive filter. If more + * than FM_MAXTICKS ticks have elapsed since the last filter event, + * just compute FM_MAXTICKS ticks worth, by which point the level + * will be stable. + * + * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid + * arithmetic overflow in the fmeter_update() routine. + * + * Given the simple 32 bit integer arithmetic used, this meter works + * best for reporting rates between one per millisecond (msec) and + * one per 32 (approx) seconds. At constant rates faster than one + * per msec it maxes out at values just under 1,000,000. At constant + * rates between one per msec, and one per second it will stabilize + * to a value N*1000, where N is the rate of events per second. + * At constant rates between one per second and one per 32 seconds, + * it will be choppy, moving up on the seconds that have an event, + * and then decaying until the next event. At rates slower than + * about one in 32 seconds, it decays all the way back to zero between + * each event. 
+ */ + +#define FM_COEF 933 /* coefficient for half-life of 10 secs */ +#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ +#define FM_SCALE 1000 /* faux fixed point scale */ + +/* Initialize a frequency meter */ +static void fmeter_init(struct fmeter *fmp) +{ + fmp->cnt = 0; + fmp->val = 0; + fmp->time = 0; + spin_lock_init(&fmp->lock); +} + +/* Internal meter update - process cnt events and update value */ +static void fmeter_update(struct fmeter *fmp) +{ + time_t now = get_seconds(); + time_t ticks = now - fmp->time; + + if (ticks == 0) + return; + + ticks = min(FM_MAXTICKS, ticks); + while (ticks-- > 0) + fmp->val = (FM_COEF * fmp->val) / FM_SCALE; + fmp->time = now; + + fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; + fmp->cnt = 0; +} + +/* Process any previous ticks, then bump cnt by one (times scale). */ +static void fmeter_markevent(struct fmeter *fmp) +{ + spin_lock(&fmp->lock); + fmeter_update(fmp); + fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); + spin_unlock(&fmp->lock); +} + +/* Process any previous ticks, then return current value. */ +static int fmeter_getrate(struct fmeter *fmp) +{ + int val; + + spin_lock(&fmp->lock); + fmeter_update(fmp); + val = fmp->val; + spin_unlock(&fmp->lock); + return val; +} + +/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *tsk) +{ + struct cpuset *cs = cgroup_cs(cont); + + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + return -ENOSPC; + + /* + * Kthreads bound to specific cpus cannot be moved to a new cpuset; we + * cannot change their cpu affinity and isolating such threads by their + * set of allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. 
This prevents checking for success of + * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may + * be changed. + */ + if (tsk->flags & PF_THREAD_BOUND) + return -EINVAL; + + return 0; +} + +static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) +{ + return security_task_setscheduler(task); +} + +/* + * Protected by cgroup_lock. The nodemasks must be stored globally because + * dynamically allocating them is not allowed in pre_attach, and they must + * persist among pre_attach, attach_task, and attach. + */ +static cpumask_var_t cpus_attach; +static nodemask_t cpuset_attach_nodemask_from; +static nodemask_t cpuset_attach_nodemask_to; + +/* Set-up work for before attaching each task. */ +static void cpuset_pre_attach(struct cgroup *cont) +{ + struct cpuset *cs = cgroup_cs(cont); + + if (cs == &top_cpuset) + cpumask_copy(cpus_attach, cpu_possible_mask); + else + guarantee_online_cpus(cs, cpus_attach); + + guarantee_online_mems(cs, &cpuset_attach_nodemask_to); +} + +/* Per-thread attachment work. */ +static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk) +{ + int err; + struct cpuset *cs = cgroup_cs(cont); + + /* + * can_attach beforehand should guarantee that this doesn't fail. + * TODO: have a better way to handle failure here + */ + err = set_cpus_allowed_ptr(tsk, cpus_attach); + WARN_ON_ONCE(err); + + cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to); + cpuset_update_task_spread_flag(cs, tsk); +} + +static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct cgroup *oldcont, struct task_struct *tsk) +{ + struct mm_struct *mm; + struct cpuset *cs = cgroup_cs(cont); + struct cpuset *oldcs = cgroup_cs(oldcont); + + /* + * Change mm, possibly for multiple threads in a threadgroup. This is + * expensive and may sleep. 
+ */ + cpuset_attach_nodemask_from = oldcs->mems_allowed; + cpuset_attach_nodemask_to = cs->mems_allowed; + mm = get_task_mm(tsk); + if (mm) { + mpol_rebind_mm(mm, &cpuset_attach_nodemask_to); + if (is_memory_migrate(cs)) + cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from, + &cpuset_attach_nodemask_to); + mmput(mm); + } +} + +/* The various types of files and directories in a cpuset file system */ + +typedef enum { + FILE_MEMORY_MIGRATE, + FILE_CPULIST, + FILE_MEMLIST, + FILE_CPU_EXCLUSIVE, + FILE_MEM_EXCLUSIVE, + FILE_MEM_HARDWALL, + FILE_SCHED_LOAD_BALANCE, + FILE_SCHED_RELAX_DOMAIN_LEVEL, + FILE_MEMORY_PRESSURE_ENABLED, + FILE_MEMORY_PRESSURE, + FILE_SPREAD_PAGE, + FILE_SPREAD_SLAB, +} cpuset_filetype_t; + +static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; + + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + + switch (type) { + case FILE_CPU_EXCLUSIVE: + retval = update_flag(CS_CPU_EXCLUSIVE, cs, val); + break; + case FILE_MEM_EXCLUSIVE: + retval = update_flag(CS_MEM_EXCLUSIVE, cs, val); + break; + case FILE_MEM_HARDWALL: + retval = update_flag(CS_MEM_HARDWALL, cs, val); + break; + case FILE_SCHED_LOAD_BALANCE: + retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, val); + break; + case FILE_MEMORY_MIGRATE: + retval = update_flag(CS_MEMORY_MIGRATE, cs, val); + break; + case FILE_MEMORY_PRESSURE_ENABLED: + cpuset_memory_pressure_enabled = !!val; + break; + case FILE_MEMORY_PRESSURE: + retval = -EACCES; + break; + case FILE_SPREAD_PAGE: + retval = update_flag(CS_SPREAD_PAGE, cs, val); + break; + case FILE_SPREAD_SLAB: + retval = update_flag(CS_SPREAD_SLAB, cs, val); + break; + default: + retval = -EINVAL; + break; + } + cgroup_unlock(); + return retval; +} + +static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + cpuset_filetype_t type = cft->private; 
+ + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + retval = update_relax_domain_level(cs, val); + break; + default: + retval = -EINVAL; + break; + } + cgroup_unlock(); + return retval; +} + +/* + * Common handling for a write to a "cpus" or "mems" file. + */ +static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + int retval = 0; + struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *trialcs; + + if (!cgroup_lock_live_group(cgrp)) + return -ENODEV; + + trialcs = alloc_trial_cpuset(cs); + if (!trialcs) { + retval = -ENOMEM; + goto out; + } + + switch (cft->private) { + case FILE_CPULIST: + retval = update_cpumask(cs, trialcs, buf); + break; + case FILE_MEMLIST: + retval = update_nodemask(cs, trialcs, buf); + break; + default: + retval = -EINVAL; + break; + } + + free_trial_cpuset(trialcs); +out: + cgroup_unlock(); + return retval; +} + +/* + * These ascii lists should be read in a single call, by using a user + * buffer large enough to hold the entire map. If read in smaller + * chunks, there is no guarantee of atomicity. Since the display format + * used, list of ranges of sequential numbers, is variable length, + * and since these maps can change value dynamically, one could read + * gibberish by doing partial reads while a list was changing. + * A single large read to a buffer that crosses a page boundary is + * ok, because the result being copied to user land is not recomputed + * across a page fault. 
+ */ + +static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs) +{ + size_t count; + + mutex_lock(&callback_mutex); + count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed); + mutex_unlock(&callback_mutex); + + return count; +} + +static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) +{ + size_t count; + + mutex_lock(&callback_mutex); + count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed); + mutex_unlock(&callback_mutex); + + return count; +} + +static ssize_t cpuset_common_file_read(struct cgroup *cont, + struct cftype *cft, + struct file *file, + char __user *buf, + size_t nbytes, loff_t *ppos) +{ + struct cpuset *cs = cgroup_cs(cont); + cpuset_filetype_t type = cft->private; + char *page; + ssize_t retval = 0; + char *s; + + if (!(page = (char *)__get_free_page(GFP_TEMPORARY))) + return -ENOMEM; + + s = page; + + switch (type) { + case FILE_CPULIST: + s += cpuset_sprintf_cpulist(s, cs); + break; + case FILE_MEMLIST: + s += cpuset_sprintf_memlist(s, cs); + break; + default: + retval = -EINVAL; + goto out; + } + *s++ = '\n'; + + retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page); +out: + free_page((unsigned long)page); + return retval; +} + +static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) +{ + struct cpuset *cs = cgroup_cs(cont); + cpuset_filetype_t type = cft->private; + switch (type) { + case FILE_CPU_EXCLUSIVE: + return is_cpu_exclusive(cs); + case FILE_MEM_EXCLUSIVE: + return is_mem_exclusive(cs); + case FILE_MEM_HARDWALL: + return is_mem_hardwall(cs); + case FILE_SCHED_LOAD_BALANCE: + return is_sched_load_balance(cs); + case FILE_MEMORY_MIGRATE: + return is_memory_migrate(cs); + case FILE_MEMORY_PRESSURE_ENABLED: + return cpuset_memory_pressure_enabled; + case FILE_MEMORY_PRESSURE: + return fmeter_getrate(&cs->fmeter); + case FILE_SPREAD_PAGE: + return is_spread_page(cs); + case FILE_SPREAD_SLAB: + return is_spread_slab(cs); + default: + BUG(); + } + + /* Unreachable but makes gcc 
happy */ + return 0; +} + +static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) +{ + struct cpuset *cs = cgroup_cs(cont); + cpuset_filetype_t type = cft->private; + switch (type) { + case FILE_SCHED_RELAX_DOMAIN_LEVEL: + return cs->relax_domain_level; + default: + BUG(); + } + + /* Unrechable but makes gcc happy */ + return 0; +} + + +/* + * for the common functions, 'private' gives the type of file + */ + +static struct cftype files[] = { + { + .name = "cpus", + .read = cpuset_common_file_read, + .write_string = cpuset_write_resmask, + .max_write_len = (100U + 6 * NR_CPUS), + .private = FILE_CPULIST, + }, + + { + .name = "mems", + .read = cpuset_common_file_read, + .write_string = cpuset_write_resmask, + .max_write_len = (100U + 6 * MAX_NUMNODES), + .private = FILE_MEMLIST, + }, + + { + .name = "cpu_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_CPU_EXCLUSIVE, + }, + + { + .name = "mem_exclusive", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_EXCLUSIVE, + }, + + { + .name = "mem_hardwall", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEM_HARDWALL, + }, + + { + .name = "sched_load_balance", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SCHED_LOAD_BALANCE, + }, + + { + .name = "sched_relax_domain_level", + .read_s64 = cpuset_read_s64, + .write_s64 = cpuset_write_s64, + .private = FILE_SCHED_RELAX_DOMAIN_LEVEL, + }, + + { + .name = "memory_migrate", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_MIGRATE, + }, + + { + .name = "memory_pressure", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE, + .mode = S_IRUGO, + }, + + { + .name = "memory_spread_page", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_PAGE, + }, + + { + .name = "memory_spread_slab", + .read_u64 = 
cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_SPREAD_SLAB, + }, +}; + +static struct cftype cft_memory_pressure_enabled = { + .name = "memory_pressure_enabled", + .read_u64 = cpuset_read_u64, + .write_u64 = cpuset_write_u64, + .private = FILE_MEMORY_PRESSURE_ENABLED, +}; + +static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + int err; + + err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); + if (err) + return err; + /* memory_pressure_enabled is in root cpuset only */ + if (!cont->parent) + err = cgroup_add_file(cont, ss, + &cft_memory_pressure_enabled); + return err; +} + +/* + * post_clone() is called during cgroup_create() when the + * clone_children mount argument was specified. The cgroup + * can not yet have any tasks. + * + * Currently we refuse to set up the cgroup - thereby + * refusing the task to be entered, and as a result refusing + * the sys_unshare() or clone() which initiated it - if any + * sibling cpusets have exclusive cpus or mem. + * + * If this becomes a problem for some users who wish to + * allow that scenario, then cpuset_post_clone() could be + * changed to grant parent->cpus_allowed-sibling_cpus_exclusive + * (and likewise for mems) to the new cgroup. Called with cgroup_mutex + * held. 
+ */
+static void cpuset_post_clone(struct cgroup_subsys *ss,
+			      struct cgroup *cgroup)
+{
+	struct cgroup *parent = cgroup->parent;
+	struct cgroup *pos;
+	struct cpuset *self_cs, *parent_cs;
+
+	/* Any exclusive cpuset among the parent's children vetoes the copy. */
+	list_for_each_entry(pos, &parent->children, sibling) {
+		struct cpuset *sib = cgroup_cs(pos);
+
+		if (is_mem_exclusive(sib) || is_cpu_exclusive(sib))
+			return;
+	}
+
+	self_cs = cgroup_cs(cgroup);
+	parent_cs = cgroup_cs(parent);
+
+	/* Inherit the parent's memory nodes and cpus. */
+	mutex_lock(&callback_mutex);
+	self_cs->mems_allowed = parent_cs->mems_allowed;
+	cpumask_copy(self_cs->cpus_allowed, parent_cs->cpus_allowed);
+	mutex_unlock(&callback_mutex);
+}
+
+/*
+ * cpuset_create - create a cpuset
+ * ss:	cpuset cgroup subsystem
+ * cont:	control group that the new cpuset will be part of
+ *
+ * Returns the css of top_cpuset for the root cgroup, the css of a newly
+ * allocated cpuset otherwise, or ERR_PTR(-ENOMEM) if allocation fails.
+ */
+
+static struct cgroup_subsys_state *cpuset_create(
+	struct cgroup_subsys *ss,
+	struct cgroup *cont)
+{
+	struct cpuset *parent_cs;
+	struct cpuset *new_cs;
+
+	/* The root cgroup is served by the existing top_cpuset. */
+	if (!cont->parent)
+		return &top_cpuset.css;
+
+	parent_cs = cgroup_cs(cont->parent);
+
+	new_cs = kmalloc(sizeof(*new_cs), GFP_KERNEL);
+	if (!new_cs)
+		goto nomem;
+	if (!alloc_cpumask_var(&new_cs->cpus_allowed, GFP_KERNEL))
+		goto free_cs;
+
+	/* Only the two spread flags are taken over from the parent. */
+	new_cs->flags = 0;
+	if (is_spread_page(parent_cs))
+		set_bit(CS_SPREAD_PAGE, &new_cs->flags);
+	if (is_spread_slab(parent_cs))
+		set_bit(CS_SPREAD_SLAB, &new_cs->flags);
+	set_bit(CS_SCHED_LOAD_BALANCE, &new_cs->flags);
+
+	/* A new cpuset starts with no cpus and no memory nodes. */
+	cpumask_clear(new_cs->cpus_allowed);
+	nodes_clear(new_cs->mems_allowed);
+	fmeter_init(&new_cs->fmeter);
+	new_cs->relax_domain_level = -1;
+
+	new_cs->parent = parent_cs;
+	number_of_cpusets++;
+	return &new_cs->css;
+
+free_cs:
+	kfree(new_cs);
+nomem:
+	return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * If the cpuset being removed has its flag 'sched_load_balance'
+ * enabled, then simulate turning sched_load_balance off, which
+ * will call async_rebuild_sched_domains().
+ */ + +static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct cpuset *cs = cgroup_cs(cont); + + if (is_sched_load_balance(cs)) + update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); + + number_of_cpusets--; + free_cpumask_var(cs->cpus_allowed); + kfree(cs); +} + +struct cgroup_subsys cpuset_subsys = { + .name = "cpuset", + .create = cpuset_create, + .destroy = cpuset_destroy, + .can_attach = cpuset_can_attach, + .can_attach_task = cpuset_can_attach_task, + .pre_attach = cpuset_pre_attach, + .attach_task = cpuset_attach_task, + .attach = cpuset_attach, + .populate = cpuset_populate, + .post_clone = cpuset_post_clone, + .subsys_id = cpuset_subsys_id, + .early_init = 1, +}; + +/** + * cpuset_init - initialize cpusets at system boot + * + * Description: Initialize top_cpuset and the cpuset internal file system, + **/ + +int __init cpuset_init(void) +{ + int err = 0; + + if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) + BUG(); + + cpumask_setall(top_cpuset.cpus_allowed); + nodes_setall(top_cpuset.mems_allowed); + + fmeter_init(&top_cpuset.fmeter); + set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); + top_cpuset.relax_domain_level = -1; + + err = register_filesystem(&cpuset_fs_type); + if (err < 0) + return err; + + if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) + BUG(); + + number_of_cpusets = 1; + return 0; +} + +/** + * cpuset_do_move_task - move a given task to another cpuset + * @tsk: pointer to task_struct the task to move + * @scan: struct cgroup_scanner contained in its struct cpuset_hotplug_scanner + * + * Called by cgroup_scan_tasks() for each task in a cgroup. + * Return nonzero to stop the walk through the tasks. 
+ */ +static void cpuset_do_move_task(struct task_struct *tsk, + struct cgroup_scanner *scan) +{ + struct cgroup *new_cgroup = scan->data; + + cgroup_attach_task(new_cgroup, tsk); +} + +/** + * move_member_tasks_to_cpuset - move tasks from one cpuset to another + * @from: cpuset in which the tasks currently reside + * @to: cpuset to which the tasks will be moved + * + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. + * + * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * calling callback functions for each. + */ +static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) +{ + struct cgroup_scanner scan; + + scan.cg = from->css.cgroup; + scan.test_task = NULL; /* select all tasks in cgroup */ + scan.process_task = cpuset_do_move_task; + scan.heap = NULL; + scan.data = to->css.cgroup; + + if (cgroup_scan_tasks(&scan)) + printk(KERN_ERR "move_member_tasks_to_cpuset: " + "cgroup_scan_tasks failed\n"); +} + +/* + * If CPU and/or memory hotplug handlers, below, unplug any CPUs + * or memory nodes, we need to walk over the cpuset hierarchy, + * removing that CPU or node from all cpusets. If this removes the + * last CPU or node from a cpuset, then move the tasks in the empty + * cpuset to its next-highest non-empty parent. + * + * Called with cgroup_mutex held + * callback_mutex must not be held, as cpuset_attach() will take it. + */ +static void remove_tasks_in_empty_cpuset(struct cpuset *cs) +{ + struct cpuset *parent; + + /* + * The cgroup's css_sets list is in use if there are tasks + * in the cpuset; the list is empty if there are none; + * the cs->css.refcnt seems always 0. + */ + if (list_empty(&cs->css.cgroup->css_sets)) + return; + + /* + * Find its next-highest non-empty parent, (top cpuset + * has online cpus, so can't be empty). 
+ */ + parent = cs->parent; + while (cpumask_empty(parent->cpus_allowed) || + nodes_empty(parent->mems_allowed)) + parent = parent->parent; + + move_member_tasks_to_cpuset(cs, parent); +} + +/* + * Walk the specified cpuset subtree and look for empty cpusets. + * The tasks of such cpuset must be moved to a parent cpuset. + * + * Called with cgroup_mutex held. We take callback_mutex to modify + * cpus_allowed and mems_allowed. + * + * This walk processes the tree from top to bottom, completing one layer + * before dropping down to the next. It always processes a node before + * any of its children. + * + * For now, since we lack memory hot unplug, we'll never see a cpuset + * that has tasks along with an empty 'mems'. But if we did see such + * a cpuset, we'd handle it just like we do if its 'cpus' was empty. + */ +static void scan_for_empty_cpusets(struct cpuset *root) +{ + LIST_HEAD(queue); + struct cpuset *cp; /* scans cpusets being updated */ + struct cpuset *child; /* scans child cpusets of cp */ + struct cgroup *cont; + static nodemask_t oldmems; /* protected by cgroup_mutex */ + + list_add_tail((struct list_head *)&root->stack_list, &queue); + + while (!list_empty(&queue)) { + cp = list_first_entry(&queue, struct cpuset, stack_list); + list_del(queue.next); + list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { + child = cgroup_cs(cont); + list_add_tail(&child->stack_list, &queue); + } + + /* Continue past cpusets with all cpus, mems online */ + if (cpumask_subset(cp->cpus_allowed, cpu_active_mask) && + nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) + continue; + + oldmems = cp->mems_allowed; + + /* Remove offline cpus and mems from this cpuset. 
*/ + mutex_lock(&callback_mutex); + cpumask_and(cp->cpus_allowed, cp->cpus_allowed, + cpu_active_mask); + nodes_and(cp->mems_allowed, cp->mems_allowed, + node_states[N_HIGH_MEMORY]); + mutex_unlock(&callback_mutex); + + /* Move tasks from the empty cpuset to a parent */ + if (cpumask_empty(cp->cpus_allowed) || + nodes_empty(cp->mems_allowed)) + remove_tasks_in_empty_cpuset(cp); + else { + update_tasks_cpumask(cp, NULL); + update_tasks_nodemask(cp, &oldmems, NULL); + } + } +} + +/* + * The top_cpuset tracks what CPUs and Memory Nodes are online, + * period. This is necessary in order to make cpusets transparent + * (of no affect) on systems that are actively using CPU hotplug + * but making no active use of cpusets. + * + * This routine ensures that top_cpuset.cpus_allowed tracks + * cpu_active_mask on each CPU hotplug (cpuhp) event. + * + * Called within get_online_cpus(). Needs to call cgroup_lock() + * before calling generate_sched_domains(). + */ +void cpuset_update_active_cpus(void) +{ + struct sched_domain_attr *attr; + cpumask_var_t *doms; + int ndoms; + + cgroup_lock(); + mutex_lock(&callback_mutex); + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); + mutex_unlock(&callback_mutex); + scan_for_empty_cpusets(&top_cpuset); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); +} + +#ifdef CONFIG_MEMORY_HOTPLUG +/* + * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. + * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. + * See also the previous routine cpuset_track_online_cpus(). 
+ */ +static int cpuset_track_online_nodes(struct notifier_block *self, + unsigned long action, void *arg) +{ + static nodemask_t oldmems; /* protected by cgroup_mutex */ + + cgroup_lock(); + switch (action) { + case MEM_ONLINE: + oldmems = top_cpuset.mems_allowed; + mutex_lock(&callback_mutex); + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + mutex_unlock(&callback_mutex); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); + break; + case MEM_OFFLINE: + /* + * needn't update top_cpuset.mems_allowed explicitly because + * scan_for_empty_cpusets() will update it. + */ + scan_for_empty_cpusets(&top_cpuset); + break; + default: + break; + } + cgroup_unlock(); + + return NOTIFY_OK; +} +#endif + +/** + * cpuset_init_smp - initialize cpus_allowed + * + * Description: Finish top cpuset after cpu, node maps are initialized + **/ + +void __init cpuset_init_smp(void) +{ + cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + + hotplug_memory_notifier(cpuset_track_online_nodes, 10); + + cpuset_wq = create_singlethread_workqueue("cpuset"); + BUG_ON(!cpuset_wq); +} + +/** + * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. + * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. + * @pmask: pointer to struct cpumask variable to receive cpus_allowed set. + * + * Description: Returns the cpumask_var_t cpus_allowed of the cpuset + * attached to the specified @tsk. Guaranteed to return some non-empty + * subset of cpu_online_map, even if this means going outside the + * tasks cpuset. 
+ **/
+
+void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
+{
+	mutex_lock(&callback_mutex);
+	task_lock(tsk);		/* held while task_cs(tsk) is looked up and used */
+	guarantee_online_cpus(task_cs(tsk), pmask);
+	task_unlock(tsk);
+	mutex_unlock(&callback_mutex);
+}
+/*
+ * cpuset_cpus_allowed_fallback - pick a fallback cpu for @tsk
+ * @tsk: the task that needs a cpu
+ *
+ * Sets the task's allowed cpus from its cpuset, without taking
+ * callback_mutex, and returns an active cpu from the resulting mask.  If
+ * none is active, the task is allowed on every possible cpu and any active
+ * cpu is returned.
+ */
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+{
+	const struct cpuset *cs;
+	int cpu;
+
+	rcu_read_lock();
+	cs = task_cs(tsk);
+	if (cs)
+		do_set_cpus_allowed(tsk, cs->cpus_allowed);
+	rcu_read_unlock();
+
+	/*
+	 * We own tsk->cpus_allowed, nobody can change it under us.
+	 *
+	 * But we used cs && cs->cpus_allowed lockless and thus can
+	 * race with cgroup_attach_task() or update_cpumask() and get
+	 * the wrong tsk->cpus_allowed. However, both cases imply the
+	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+	 * which takes task_rq_lock().
+	 *
+	 * If we are called after it dropped the lock we must see all
+	 * changes in task_cs()->cpus_allowed. Otherwise we can temporarily
+	 * set any mask even if it is not right from task_cs() pov,
+	 * the pending set_cpus_allowed_ptr() will fix things.
+	 */
+
+	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+	if (cpu >= nr_cpu_ids) {
+		/*
+		 * Either tsk->cpus_allowed is wrong (see above) or it
+		 * is actually empty. The latter case is only possible
+		 * if we are racing with remove_tasks_in_empty_cpuset().
+		 * Like above we can temporarily set any mask and rely on
+		 * set_cpus_allowed_ptr() as synchronization point.
+		 */
+		do_set_cpus_allowed(tsk, cpu_possible_mask);
+		cpu = cpumask_any(cpu_active_mask);
+	}
+
+	return cpu;
+}
+/*
+ * cpuset_init_current_mems_allowed - allow the current task every node
+ *
+ * Sets all bits in current->mems_allowed.
+ */
+void cpuset_init_current_mems_allowed(void)
+{
+	nodes_setall(current->mems_allowed);
+}
+
+/**
+ * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
+ *
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk.
Guaranteed to return some non-empty
+ * subset of node_states[N_HIGH_MEMORY], even if this means going outside the
+ * tasks cpuset.
+ **/
+
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
+{
+	nodemask_t mask;
+
+	/* callback_mutex is taken first, then task_lock(); released in reverse. */
+	mutex_lock(&callback_mutex);
+	task_lock(tsk);
+	guarantee_online_mems(task_cs(tsk), &mask);
+	task_unlock(tsk);
+	mutex_unlock(&callback_mutex);
+
+	return mask;
+}
+
+/**
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
+ * @nodemask: the nodemask to be checked
+ *
+ * Are any of the nodes in the nodemask allowed in current->mems_allowed?
+ *
+ * Returns the value of nodes_intersects(*nodemask, current->mems_allowed).
+ */
+int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
+{
+	return nodes_intersects(*nodemask, current->mems_allowed);
+}
+
+/*
+ * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
+ * mem_hardwall ancestor to the specified cpuset. Call holding
+ * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall
+ * (an unusual configuration), then returns the root cpuset.
+ *
+ * The search starts at @cs itself: if @cs is already mem_exclusive or
+ * mem_hardwall, @cs is returned unchanged.
+ */
+static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
+{
+	/* Climb until a hardwalled cpuset is found or there is no parent left. */
+	while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && cs->parent)
+		cs = cs->parent;
+	return cs;
+}
+
+/**
+ * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
+ * set, yes, we can always allocate. If node is in our task's mems_allowed,
+ * yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
+ * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
+ * flag, yes. If it's not a __GFP_HARDWALL request and the task is exiting
+ * (PF_EXITING), yes.
+ * Otherwise, no.
+ *
+ * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
+ * cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
+ * might sleep, and might allow a node from an enclosing cpuset.
+ *
+ * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
+ * cpusets, and never sleeps.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first. By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
+ * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
+ * and do not allow allocations outside the current task's cpuset
+ * unless the task has been OOM killed and is marked TIF_MEMDIE.
+ * GFP_KERNEL allocations are not so marked, so can escape to the
+ * nearest enclosing hardwalled ancestor cpuset.
+ *
+ * Scanning up parent cpusets requires callback_mutex. The
+ * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
+ * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
+ * current task's mems_allowed came up empty on the first pass over
+ * the zonelist. So only GFP_KERNEL allocations, if all nodes in the
+ * cpuset are short of memory, might require taking the callback_mutex
+ * mutex.
+ *
+ * The first call here from mm/page_alloc:get_page_from_freelist()
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
+ * so no allocation on a node outside the cpuset is allowed (unless
+ * in interrupt, of course).
+ *
+ * The second pass through get_page_from_freelist() doesn't even call
+ * here for GFP_ATOMIC calls. For those calls, the __alloc_pages()
+ * variable 'wait' is not set, and the bit ALLOC_CPUSET is not set
+ * in alloc_flags. That logic and the checks below have the combined
+ * effect that:
+ *	in_interrupt - any node ok (current task context irrelevant)
+ *	GFP_ATOMIC   - any node ok
+ *	TIF_MEMDIE   - any node ok
+ *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
+ *	GFP_USER     - only nodes in current task's mems_allowed ok.
+ *
+ * Rule:
+ *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
+ *    pass in the __GFP_HARDWALL flag set in gfp_mask, which disables
+ *    the code that might scan up ancestor cpusets and sleep.
+ */
+int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+{
+	const struct cpuset *cs;	/* nearest hardwalled ancestor of current's cpuset */
+	int allowed;			/* is allocation on @node allowed? */
+
+	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+		return 1;
+	/* Only the non-hardwall path below takes callback_mutex. */
+	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
+	if (node_isset(node, current->mems_allowed))
+		return 1;
+	/*
+	 * Allow tasks that have access to memory reserves because they have
+	 * been OOM killed to get memory anywhere.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE)))
+		return 1;
+	if (gfp_mask & __GFP_HARDWALL)	/* If hardwall request, stop here */
+		return 0;
+
+	if (current->flags & PF_EXITING) /* Let dying task have memory */
+		return 1;
+
+	/* Not hardwall and node outside mems_allowed: scan up cpusets */
+	mutex_lock(&callback_mutex);
+
+	/* task_lock() is held only while task_cs(current) is dereferenced. */
+	task_lock(current);
+	cs = nearest_hardwall_ancestor(task_cs(current));
+	task_unlock(current);
+
+	/* cs->mems_allowed is read while callback_mutex is still held. */
+	allowed = node_isset(node, cs->mems_allowed);
+	mutex_unlock(&callback_mutex);
+	return allowed;
+}
+
+/*
+ * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
+ * set, yes, we can always allocate. If node is in our task's mems_allowed,
+ * yes. If the task has been OOM killed and has access to memory reserves as
+ * specified by the TIF_MEMDIE flag, yes.
+ * Otherwise, no.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first.
By the time any such + * calls get to this routine, we should just shut up and say 'yes'. + * + * Unlike the cpuset_node_allowed_softwall() variant, above, + * this variant requires that the node be in the current task's + * mems_allowed or that we're in interrupt. It does not scan up the + * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset. + * It never sleeps. + */ +int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) +{ + if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) + return 1; + if (node_isset(node, current->mems_allowed)) + return 1; + /* + * Allow tasks that have access to memory reserves because they have + * been OOM killed to get memory anywhere. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE))) + return 1; + return 0; +} + +/** + * cpuset_unlock - release lock on cpuset changes + * + * Undo the lock taken in a previous cpuset_lock() call. + */ + +void cpuset_unlock(void) +{ + mutex_unlock(&callback_mutex); +} + +/** + * cpuset_mem_spread_node() - On which node to begin search for a file page + * cpuset_slab_spread_node() - On which node to begin search for a slab page + * + * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for + * tasks in a cpuset with is_spread_page or is_spread_slab set), + * and if the memory allocation used cpuset_mem_spread_node() + * to determine on which node to start looking, as it will for + * certain page cache or slab cache pages such as used for file + * system buffers and inode caches, then instead of starting on the + * local node to look for a free page, rather spread the starting + * node around the tasks mems_allowed nodes. + * + * We don't have to worry about the returned node being offline + * because "it can't happen", and even if it did, it would be ok. + * + * The routines calling guarantee_online_mems() are careful to + * only set nodes in task->mems_allowed that are online. So it + * should not be possible for the following code to return an + * offline node. 
But if it did, that would be ok, as this routine + * is not returning the node where the allocation must be, only + * the node where the search should start. The zonelist passed to + * __alloc_pages() will include all nodes. If the slab allocator + * is passed an offline node, it will fall back to the local node. + * See kmem_cache_alloc_node(). + */ + +static int cpuset_spread_node(int *rotor) +{ + int node; + + node = next_node(*rotor, current->mems_allowed); + if (node == MAX_NUMNODES) + node = first_node(current->mems_allowed); + *rotor = node; + return node; +} + +int cpuset_mem_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); +} + +int cpuset_slab_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); +} + +EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); + +/** + * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? + * @tsk1: pointer to task_struct of some task. + * @tsk2: pointer to task_struct of some other task. + * + * Description: Return true if @tsk1's mems_allowed intersects the + * mems_allowed of @tsk2. Used by the OOM killer to determine if + * one of the task's memory usage might impact the memory available + * to the other. + **/ + +int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, + const struct task_struct *tsk2) +{ + return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); +} + +/** + * cpuset_print_task_mems_allowed - prints task's cpuset and mems_allowed + * @task: pointer to task_struct of some task. + * + * Description: Prints @task's name, cpuset name, and cached copy of its + * mems_allowed to the kernel log. Must hold task_lock(task) to allow + * dereferencing task_cs(task). + */ +void cpuset_print_task_mems_allowed(struct task_struct *tsk) +{ + struct dentry *dentry; + + dentry = task_cs(tsk)->css.cgroup->dentry; + spin_lock(&cpuset_buffer_lock); + snprintf(cpuset_name, CPUSET_NAME_LEN, + dentry ? 
(const char *)dentry->d_name.name : "/"); + nodelist_scnprintf(cpuset_nodelist, CPUSET_NODELIST_LEN, + tsk->mems_allowed); + printk(KERN_INFO "%s cpuset=%s mems_allowed=%s\n", + tsk->comm, cpuset_name, cpuset_nodelist); + spin_unlock(&cpuset_buffer_lock); +} + +/* + * Collection of memory_pressure is suppressed unless + * this flag is enabled by writing "1" to the special + * cpuset file 'memory_pressure_enabled' in the root cpuset. + */ + +int cpuset_memory_pressure_enabled __read_mostly; + +/** + * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. + * + * Keep a running average of the rate of synchronous (direct) + * page reclaim efforts initiated by tasks in each cpuset. + * + * This represents the rate at which some task in the cpuset + * ran low on memory on all nodes it was allowed to use, and + * had to enter the kernels page reclaim code in an effort to + * create more free memory by tossing clean pages or swapping + * or writing dirty pages. + * + * Display to user space in the per-cpuset read-only file + * "memory_pressure". Value displayed is an integer + * representing the recent rate of entry into the synchronous + * (direct) page reclaim by any task attached to the cpuset. + **/ + +void __cpuset_memory_pressure_bump(void) +{ + task_lock(current); + fmeter_markevent(&task_cs(current)->fmeter); + task_unlock(current); +} + +#ifdef CONFIG_PROC_PID_CPUSET +/* + * proc_cpuset_show() + * - Print tasks cpuset path into seq_file. + * - Used for /proc//cpuset. + * - No need to task_lock(tsk) on this tsk->cpuset reference, as it + * doesn't really matter if tsk->cpuset changes after we read it, + * and we take cgroup_mutex, keeping cpuset_attach() from changing it + * anyway. 
+ */
+static int proc_cpuset_show(struct seq_file *m, void *unused_v)
+{
+	struct pid *pid;
+	struct task_struct *tsk;
+	char *buf;
+	struct cgroup_subsys_state *css;
+	int retval;
+
+	/* retval is preset to the error returned if the following step fails. */
+	retval = -ENOMEM;
+	buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	retval = -ESRCH;
+	pid = m->private;
+	tsk = get_pid_task(pid, PIDTYPE_PID);
+	if (!tsk)
+		goto out_free;
+
+	/* NOTE(review): this -EINVAL is overwritten by cgroup_path() below before any use. */
+	retval = -EINVAL;
+	cgroup_lock();
+	css = task_subsys_state(tsk, cpuset_subsys_id);
+	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+	if (retval < 0)
+		goto out_unlock;
+	seq_puts(m, buf);
+	seq_putc(m, '\n');
+	/* Success falls through: the labels below undo the setup in reverse order. */
+out_unlock:
+	cgroup_unlock();
+	put_task_struct(tsk);
+out_free:
+	kfree(buf);
+out:
+	return retval;
+}
+
+/*
+ * Open handler: hands the inode's struct pid to single_open() as the data
+ * argument for proc_cpuset_show(), which reads its pid from m->private.
+ */
+static int cpuset_open(struct inode *inode, struct file *file)
+{
+	struct pid *pid = PROC_I(inode)->pid;
+	return single_open(file, proc_cpuset_show, pid);
+}
+
+const struct file_operations proc_cpuset_operations = {
+	.open		= cpuset_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+#endif /* CONFIG_PROC_PID_CPUSET */
+
+/*
+ * Display task mems_allowed in /proc/<pid>/status file.
+ *
+ * Emits two lines into @m: "Mems_allowed:" via seq_nodemask() and
+ * "Mems_allowed_list:" via seq_nodemask_list(), both from task->mems_allowed.
+ */
+void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
+{
+	seq_printf(m, "Mems_allowed:\t");
+	seq_nodemask(m, &task->mems_allowed);
+	seq_printf(m, "\n");
+	seq_printf(m, "Mems_allowed_list:\t");
+	seq_nodemask_list(m, &task->mems_allowed);
+	seq_printf(m, "\n");
+}
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 00000000..5f856902
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore.
It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence put + * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + +/* + * elfcorehdr= specifies the location of elf core header stored by the crashed + * kernel. This option will be passed by kexec loader to the capture kernel. + */ +static int __init setup_elfcorehdr(char *arg) +{ + char *end; + if (!arg) + return -EINVAL; + elfcorehdr_addr = memparse(arg, &end); + return end > arg ? 0 : -EINVAL; +} +early_param("elfcorehdr", setup_elfcorehdr); diff --git a/kernel/cred.c b/kernel/cred.c new file mode 100644 index 00000000..3a55ea4f --- /dev/null +++ b/kernel/cred.c @@ -0,0 +1,863 @@ +/* Task credentials management - see Documentation/security/credentials.txt + * + * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define kdebug(FMT, ...) \ + printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#else +#define kdebug(FMT, ...) 
\ + no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) +#endif + +static struct kmem_cache *cred_jar; + +/* + * The common credentials for the initial task's thread group + */ +#ifdef CONFIG_KEYS +static struct thread_group_cred init_tgcred = { + .usage = ATOMIC_INIT(2), + .tgid = 0, + .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock), +}; +#endif + +/* + * The initial credentials for the initial task + */ +struct cred init_cred = { + .usage = ATOMIC_INIT(4), +#ifdef CONFIG_DEBUG_CREDENTIALS + .subscribers = ATOMIC_INIT(2), + .magic = CRED_MAGIC, +#endif + .securebits = SECUREBITS_DEFAULT, + .cap_inheritable = CAP_EMPTY_SET, + .cap_permitted = CAP_FULL_SET, + .cap_effective = CAP_FULL_SET, + .cap_bset = CAP_FULL_SET, + .user = INIT_USER, + .user_ns = &init_user_ns, + .group_info = &init_groups, +#ifdef CONFIG_KEYS + .tgcred = &init_tgcred, +#endif +}; + +static inline void set_cred_subscribers(struct cred *cred, int n) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + atomic_set(&cred->subscribers, n); +#endif +} + +static inline int read_cred_subscribers(const struct cred *cred) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + return atomic_read(&cred->subscribers); +#else + return 0; +#endif +} + +static inline void alter_cred_subscribers(const struct cred *_cred, int n) +{ +#ifdef CONFIG_DEBUG_CREDENTIALS + struct cred *cred = (struct cred *) _cred; + + atomic_add(n, &cred->subscribers); +#endif +} + +/* + * Dispose of the shared task group credentials + */ +#ifdef CONFIG_KEYS +static void release_tgcred_rcu(struct rcu_head *rcu) +{ + struct thread_group_cred *tgcred = + container_of(rcu, struct thread_group_cred, rcu); + + BUG_ON(atomic_read(&tgcred->usage) != 0); + + key_put(tgcred->session_keyring); + key_put(tgcred->process_keyring); + kfree(tgcred); +} +#endif + +/* + * Release a set of thread group credentials. 
+ */ +static void release_tgcred(struct cred *cred) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred = cred->tgcred; + + if (atomic_dec_and_test(&tgcred->usage)) + call_rcu(&tgcred->rcu, release_tgcred_rcu); +#endif +} + +/* + * The RCU callback to actually dispose of a set of credentials + */ +static void put_cred_rcu(struct rcu_head *rcu) +{ + struct cred *cred = container_of(rcu, struct cred, rcu); + + kdebug("put_cred_rcu(%p)", cred); + +#ifdef CONFIG_DEBUG_CREDENTIALS + if (cred->magic != CRED_MAGIC_DEAD || + atomic_read(&cred->usage) != 0 || + read_cred_subscribers(cred) != 0) + panic("CRED: put_cred_rcu() sees %p with" + " mag %x, put %p, usage %d, subscr %d\n", + cred, cred->magic, cred->put_addr, + atomic_read(&cred->usage), + read_cred_subscribers(cred)); +#else + if (atomic_read(&cred->usage) != 0) + panic("CRED: put_cred_rcu() sees %p with usage %d\n", + cred, atomic_read(&cred->usage)); +#endif + + security_cred_free(cred); + key_put(cred->thread_keyring); + key_put(cred->request_key_auth); + release_tgcred(cred); + if (cred->group_info) + put_group_info(cred->group_info); + free_uid(cred->user); + kmem_cache_free(cred_jar, cred); +} + +/** + * __put_cred - Destroy a set of credentials + * @cred: The record to release + * + * Destroy a set of credentials on which no references remain. 
+ */ +void __put_cred(struct cred *cred) +{ + kdebug("__put_cred(%p{%d,%d})", cred, + atomic_read(&cred->usage), + read_cred_subscribers(cred)); + + BUG_ON(atomic_read(&cred->usage) != 0); +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(cred) != 0); + cred->magic = CRED_MAGIC_DEAD; + cred->put_addr = __builtin_return_address(0); +#endif + BUG_ON(cred == current->cred); + BUG_ON(cred == current->real_cred); + + call_rcu(&cred->rcu, put_cred_rcu); +} +EXPORT_SYMBOL(__put_cred); + +/* + * Clean up a task's credentials when it exits + */ +void exit_creds(struct task_struct *tsk) +{ + struct cred *cred; + + kdebug("exit_creds(%u,%p,%p,{%d,%d})", tsk->pid, tsk->real_cred, tsk->cred, + atomic_read(&tsk->cred->usage), + read_cred_subscribers(tsk->cred)); + + cred = (struct cred *) tsk->real_cred; + tsk->real_cred = NULL; + validate_creds(cred); + alter_cred_subscribers(cred, -1); + put_cred(cred); + + cred = (struct cred *) tsk->cred; + tsk->cred = NULL; + validate_creds(cred); + alter_cred_subscribers(cred, -1); + put_cred(cred); + + cred = (struct cred *) tsk->replacement_session_keyring; + if (cred) { + tsk->replacement_session_keyring = NULL; + validate_creds(cred); + put_cred(cred); + } +} + +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. 
+ */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + +/* + * Allocate blank credentials, such that the credentials can be filled in at a + * later date without risk of ENOMEM. + */ +struct cred *cred_alloc_blank(void) +{ + struct cred *new; + + new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + +#ifdef CONFIG_KEYS + new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); + if (!new->tgcred) { + kmem_cache_free(cred_jar, new); + return NULL; + } + atomic_set(&new->tgcred->usage, 1); +#endif + + atomic_set(&new->usage, 1); +#ifdef CONFIG_DEBUG_CREDENTIALS + new->magic = CRED_MAGIC; +#endif + + if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) + goto error; + + return new; + +error: + abort_creds(new); + return NULL; +} + +/** + * prepare_creds - Prepare a new set of credentials for modification + * + * Prepare a new set of task credentials for modification. A task's creds + * shouldn't generally be modified directly, therefore this function is used to + * prepare a new copy, which the caller then modifies and then commits by + * calling commit_creds(). + * + * Preparation involves making a copy of the objective creds for modification. + * + * Returns a pointer to the new creds-to-be if successful, NULL otherwise. + * + * Call commit_creds() or abort_creds() to clean up. 
+ */ +struct cred *prepare_creds(void) +{ + struct task_struct *task = current; + const struct cred *old; + struct cred *new; + + validate_process_creds(); + + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + kdebug("prepare_creds() alloc %p", new); + + old = task->cred; + memcpy(new, old, sizeof(struct cred)); + + atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); + get_group_info(new->group_info); + get_uid(new->user); + +#ifdef CONFIG_KEYS + key_get(new->thread_keyring); + key_get(new->request_key_auth); + atomic_inc(&new->tgcred->usage); +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + + if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + goto error; + validate_creds(new); + return new; + +error: + abort_creds(new); + return NULL; +} +EXPORT_SYMBOL(prepare_creds); + +/* + * Prepare credentials for current to perform an execve() + * - The caller must hold ->cred_guard_mutex + */ +struct cred *prepare_exec_creds(void) +{ + struct thread_group_cred *tgcred = NULL; + struct cred *new; + +#ifdef CONFIG_KEYS + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) + return NULL; +#endif + + new = prepare_creds(); + if (!new) { + kfree(tgcred); + return new; + } + +#ifdef CONFIG_KEYS + /* newly exec'd tasks don't get a thread keyring */ + key_put(new->thread_keyring); + new->thread_keyring = NULL; + + /* create a new per-thread-group creds for all this set of threads to + * share */ + memcpy(tgcred, new->tgcred, sizeof(struct thread_group_cred)); + + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + + /* inherit the session keyring; new process keyring */ + key_get(tgcred->session_keyring); + tgcred->process_keyring = NULL; + + release_tgcred(new); + new->tgcred = tgcred; +#endif + + return new; +} + +/* + * Copy credentials for the new process created by fork() + * + * We share if we can, but under some circumstances we have to generate a new + * set. 
+ * + * The new process gets the current process's subjective credentials as its + * objective and subjective credentials + */ +int copy_creds(struct task_struct *p, unsigned long clone_flags) +{ +#ifdef CONFIG_KEYS + struct thread_group_cred *tgcred; +#endif + struct cred *new; + int ret; + + p->replacement_session_keyring = NULL; + + if ( +#ifdef CONFIG_KEYS + !p->cred->thread_keyring && +#endif + clone_flags & CLONE_THREAD + ) { + p->real_cred = get_cred(p->cred); + get_cred(p->cred); + alter_cred_subscribers(p->cred, 2); + kdebug("share_creds(%p{%d,%d})", + p->cred, atomic_read(&p->cred->usage), + read_cred_subscribers(p->cred)); + atomic_inc(&p->cred->user->processes); + return 0; + } + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + if (clone_flags & CLONE_NEWUSER) { + ret = create_user_ns(new); + if (ret < 0) + goto error_put; + } + + /* cache user_ns in cred. Doesn't need a refcount because it will + * stay pinned by cred->user + */ + new->user_ns = new->user->user_ns; + +#ifdef CONFIG_KEYS + /* new threads get their own thread keyrings if their parent already + * had one */ + if (new->thread_keyring) { + key_put(new->thread_keyring); + new->thread_keyring = NULL; + if (clone_flags & CLONE_THREAD) + install_thread_keyring_to_cred(new); + } + + /* we share the process and session keyrings between all the threads in + * a process - this is slightly icky as we violate COW credentials a + * bit */ + if (!(clone_flags & CLONE_THREAD)) { + tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL); + if (!tgcred) { + ret = -ENOMEM; + goto error_put; + } + atomic_set(&tgcred->usage, 1); + spin_lock_init(&tgcred->lock); + tgcred->process_keyring = NULL; + tgcred->session_keyring = key_get(new->tgcred->session_keyring); + + release_tgcred(new); + new->tgcred = tgcred; + } +#endif + + atomic_inc(&new->user->processes); + p->cred = p->real_cred = get_cred(new); + alter_cred_subscribers(new, 2); + validate_creds(new); + return 0; + +error_put: + put_cred(new); + return 
ret; +} + +/** + * commit_creds - Install new credentials upon the current task + * @new: The credentials to be assigned + * + * Install a new set of credentials to the current task, using RCU to replace + * the old set. Both the objective and the subjective credentials pointers are + * updated. This function may not be called if the subjective credentials are + * in an overridden state. + * + * This function eats the caller's reference to the new credentials. + * + * Always returns 0 thus allowing this function to be tail-called at the end + * of, say, sys_setgid(). + */ +int commit_creds(struct cred *new) +{ + struct task_struct *task = current; + const struct cred *old = task->real_cred; + + kdebug("commit_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + + BUG_ON(task->cred != old); +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(old) < 2); + validate_creds(old); + validate_creds(new); +#endif + BUG_ON(atomic_read(&new->usage) < 1); + + get_cred(new); /* we will require a ref for the subj creds too */ + + /* dumpability changes */ + if (old->euid != new->euid || + old->egid != new->egid || + old->fsuid != new->fsuid || + old->fsgid != new->fsgid || + !cap_issubset(new->cap_permitted, old->cap_permitted)) { + if (task->mm) + set_dumpable(task->mm, suid_dumpable); + task->pdeath_signal = 0; + smp_wmb(); + } + + /* alter the thread keyring */ + if (new->fsuid != old->fsuid) + key_fsuid_changed(task); + if (new->fsgid != old->fsgid) + key_fsgid_changed(task); + + /* do it + * - What if a process setreuid()'s and this brings the + * new uid over his NPROC rlimit? We can check this now + * cheaply with the new uid cache, so if it matters + * we should be checking for it. 
-DaveM + */ + alter_cred_subscribers(new, 2); + if (new->user != old->user) + atomic_inc(&new->user->processes); + rcu_assign_pointer(task->real_cred, new); + rcu_assign_pointer(task->cred, new); + if (new->user != old->user) + atomic_dec(&old->user->processes); + alter_cred_subscribers(old, -2); + + /* send notifications */ + if (new->uid != old->uid || + new->euid != old->euid || + new->suid != old->suid || + new->fsuid != old->fsuid) + proc_id_connector(task, PROC_EVENT_UID); + + if (new->gid != old->gid || + new->egid != old->egid || + new->sgid != old->sgid || + new->fsgid != old->fsgid) + proc_id_connector(task, PROC_EVENT_GID); + + /* release the old obj and subj refs both */ + put_cred(old); + put_cred(old); + return 0; +} +EXPORT_SYMBOL(commit_creds); + +/** + * abort_creds - Discard a set of credentials and unlock the current task + * @new: The credentials that were going to be applied + * + * Discard a set of credentials that were under construction and unlock the + * current task. + */ +void abort_creds(struct cred *new) +{ + kdebug("abort_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + +#ifdef CONFIG_DEBUG_CREDENTIALS + BUG_ON(read_cred_subscribers(new) != 0); +#endif + BUG_ON(atomic_read(&new->usage) < 1); + put_cred(new); +} +EXPORT_SYMBOL(abort_creds); + +/** + * override_creds - Override the current process's subjective credentials + * @new: The credentials to be assigned + * + * Install a set of temporary override subjective credentials on the current + * process, returning the old set for later reversion. 
+ */ +const struct cred *override_creds(const struct cred *new) +{ + const struct cred *old = current->cred; + + kdebug("override_creds(%p{%d,%d})", new, + atomic_read(&new->usage), + read_cred_subscribers(new)); + + validate_creds(old); + validate_creds(new); + get_cred(new); + alter_cred_subscribers(new, 1); + rcu_assign_pointer(current->cred, new); + alter_cred_subscribers(old, -1); + + kdebug("override_creds() = %p{%d,%d}", old, + atomic_read(&old->usage), + read_cred_subscribers(old)); + return old; +} +EXPORT_SYMBOL(override_creds); + +/** + * revert_creds - Revert a temporary subjective credentials override + * @old: The credentials to be restored + * + * Revert a temporary set of override subjective credentials to an old set, + * discarding the override set. + */ +void revert_creds(const struct cred *old) +{ + const struct cred *override = current->cred; + + kdebug("revert_creds(%p{%d,%d})", old, + atomic_read(&old->usage), + read_cred_subscribers(old)); + + validate_creds(old); + validate_creds(override); + alter_cred_subscribers(old, 1); + rcu_assign_pointer(current->cred, old); + alter_cred_subscribers(override, -1); + put_cred(override); +} +EXPORT_SYMBOL(revert_creds); + +/* + * initialise the credentials stuff + */ +void __init cred_init(void) +{ + /* allocate a slab in which we can store credentials */ + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +} + +/** + * prepare_kernel_cred - Prepare a set of credentials for a kernel service + * @daemon: A userspace daemon to be used as a reference + * + * Prepare a set of credentials for a kernel service. This can then be used to + * override a task's own credentials so that work can be done on behalf of that + * task that requires a different subjective context. + * + * @daemon is used to provide a base for the security record, but can be NULL. 
+ * If @daemon is supplied, then the security data will be derived from that; + * otherwise they'll be set to 0 and no groups, full capabilities and no keys. + * + * The caller may change these controls afterwards if desired. + * + * Returns the new credentials or NULL if out of memory. + * + * Does not take, and does not return holding current->cred_replace_mutex. + */ +struct cred *prepare_kernel_cred(struct task_struct *daemon) +{ + const struct cred *old; + struct cred *new; + + new = kmem_cache_alloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + + kdebug("prepare_kernel_cred() alloc %p", new); + + if (daemon) + old = get_task_cred(daemon); + else + old = get_cred(&init_cred); + + validate_creds(old); + + *new = *old; + atomic_set(&new->usage, 1); + set_cred_subscribers(new, 0); + get_uid(new->user); + get_group_info(new->group_info); + +#ifdef CONFIG_KEYS + atomic_inc(&init_tgcred.usage); + new->tgcred = &init_tgcred; + new->request_key_auth = NULL; + new->thread_keyring = NULL; + new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; +#endif + +#ifdef CONFIG_SECURITY + new->security = NULL; +#endif + if (security_prepare_creds(new, old, GFP_KERNEL) < 0) + goto error; + + put_cred(old); + validate_creds(new); + return new; + +error: + put_cred(new); + put_cred(old); + return NULL; +} +EXPORT_SYMBOL(prepare_kernel_cred); + +/** + * set_security_override - Set the security ID in a set of credentials + * @new: The credentials to alter + * @secid: The LSM security ID to set + * + * Set the LSM security ID in a set of credentials so that the subjective + * security is overridden when an alternative set of credentials is used. 
+ */ +int set_security_override(struct cred *new, u32 secid) +{ + return security_kernel_act_as(new, secid); +} +EXPORT_SYMBOL(set_security_override); + +/** + * set_security_override_from_ctx - Set the security ID in a set of credentials + * @new: The credentials to alter + * @secctx: The LSM security context to generate the security ID from. + * + * Set the LSM security ID in a set of credentials so that the subjective + * security is overridden when an alternative set of credentials is used. The + * security ID is specified in string form as a security context to be + * interpreted by the LSM. + */ +int set_security_override_from_ctx(struct cred *new, const char *secctx) +{ + u32 secid; + int ret; + + ret = security_secctx_to_secid(secctx, strlen(secctx), &secid); + if (ret < 0) + return ret; + + return set_security_override(new, secid); +} +EXPORT_SYMBOL(set_security_override_from_ctx); + +/** + * set_create_files_as - Set the LSM file create context in a set of credentials + * @new: The credentials to alter + * @inode: The inode to take the context from + * + * Change the LSM file creation context in a set of credentials to be the same + * as the object context of the specified inode, so that the new inodes have + * the same MAC context as that inode. + */ +int set_create_files_as(struct cred *new, struct inode *inode) +{ + new->fsuid = inode->i_uid; + new->fsgid = inode->i_gid; + return security_kernel_create_files_as(new, inode); +} +EXPORT_SYMBOL(set_create_files_as); + +#ifdef CONFIG_DEBUG_CREDENTIALS + +bool creds_are_invalid(const struct cred *cred) +{ + if (cred->magic != CRED_MAGIC) + return true; +#ifdef CONFIG_SECURITY_SELINUX + /* + * cred->security == NULL if security_cred_alloc_blank() or + * security_prepare_creds() returned an error. 
+ */ + if (selinux_is_enabled() && cred->security) { + if ((unsigned long) cred->security < PAGE_SIZE) + return true; + if ((*(u32 *)cred->security & 0xffffff00) == + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8)) + return true; + } +#endif + return false; +} +EXPORT_SYMBOL(creds_are_invalid); + +/* + * dump invalid credentials + */ +static void dump_invalid_creds(const struct cred *cred, const char *label, + const struct task_struct *tsk) +{ + printk(KERN_ERR "CRED: %s credentials: %p %s%s%s\n", + label, cred, + cred == &init_cred ? "[init]" : "", + cred == tsk->real_cred ? "[real]" : "", + cred == tsk->cred ? "[eff]" : ""); + printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n", + cred->magic, cred->put_addr); + printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n", + atomic_read(&cred->usage), + read_cred_subscribers(cred)); + printk(KERN_ERR "CRED: ->*uid = { %d,%d,%d,%d }\n", + cred->uid, cred->euid, cred->suid, cred->fsuid); + printk(KERN_ERR "CRED: ->*gid = { %d,%d,%d,%d }\n", + cred->gid, cred->egid, cred->sgid, cred->fsgid); +#ifdef CONFIG_SECURITY + printk(KERN_ERR "CRED: ->security is %p\n", cred->security); + if ((unsigned long) cred->security >= PAGE_SIZE && + (((unsigned long) cred->security & 0xffffff00) != + (POISON_FREE << 24 | POISON_FREE << 16 | POISON_FREE << 8))) + printk(KERN_ERR "CRED: ->security {%x, %x}\n", + ((u32*)cred->security)[0], + ((u32*)cred->security)[1]); +#endif +} + +/* + * report use of invalid credentials + */ +void __invalid_creds(const struct cred *cred, const char *file, unsigned line) +{ + printk(KERN_ERR "CRED: Invalid credentials\n"); + printk(KERN_ERR "CRED: At %s:%u\n", file, line); + dump_invalid_creds(cred, "Specified", current); + BUG(); +} +EXPORT_SYMBOL(__invalid_creds); + +/* + * check the credentials on a process + */ +void __validate_process_creds(struct task_struct *tsk, + const char *file, unsigned line) +{ + if (tsk->cred == tsk->real_cred) { + if (unlikely(read_cred_subscribers(tsk->cred) < 2 || + 
creds_are_invalid(tsk->cred))) + goto invalid_creds; + } else { + if (unlikely(read_cred_subscribers(tsk->real_cred) < 1 || + read_cred_subscribers(tsk->cred) < 1 || + creds_are_invalid(tsk->real_cred) || + creds_are_invalid(tsk->cred))) + goto invalid_creds; + } + return; + +invalid_creds: + printk(KERN_ERR "CRED: Invalid process credentials\n"); + printk(KERN_ERR "CRED: At %s:%u\n", file, line); + + dump_invalid_creds(tsk->real_cred, "Real", tsk); + if (tsk->cred != tsk->real_cred) + dump_invalid_creds(tsk->cred, "Effective", tsk); + else + printk(KERN_ERR "CRED: Effective creds == Real creds\n"); + BUG(); +} +EXPORT_SYMBOL(__validate_process_creds); + +/* + * check creds for do_exit() + */ +void validate_creds_for_do_exit(struct task_struct *tsk) +{ + kdebug("validate_creds_for_do_exit(%p,%p{%d,%d})", + tsk->real_cred, tsk->cred, + atomic_read(&tsk->cred->usage), + read_cred_subscribers(tsk->cred)); + + __validate_process_creds(tsk, __FILE__, __LINE__); +} + +#endif /* CONFIG_DEBUG_CREDENTIALS */ diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile new file mode 100644 index 00000000..a85edc33 --- /dev/null +++ b/kernel/debug/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for the linux kernel debugger +# + +obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o +obj-$(CONFIG_KGDB_KDB) += kdb/ diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c new file mode 100644 index 00000000..5ee24d10 --- /dev/null +++ b/kernel/debug/debug_core.c @@ -0,0 +1,971 @@ +/* + * Kernel Debug Core + * + * Maintainer: Jason Wessel + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale + * Copyright (C) 2004 Pavel Machek + * Copyright (C) 2004-2006 Tom Rini + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2009 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. 
+ * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe , + * Tigran Aivazian + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "debug_core.h" + +static int kgdb_break_asap; + +struct debuggerinfo_struct kgdb_info[NR_CPUS]; + +/** + * kgdb_connected - Is a host GDB connected to us? + */ +int kgdb_connected; +EXPORT_SYMBOL_GPL(kgdb_connected); + +/* All the KGDB handlers are installed */ +int kgdb_io_module_registered; + +/* Guard for recursive entry */ +static int exception_level; + +struct kgdb_io *dbg_io_ops; +static DEFINE_SPINLOCK(kgdb_registration_lock); + +/* kgdb console driver is loaded */ +static int kgdb_con_registered; +/* determine if kgdb console output should be used */ +static int kgdb_use_con; +/* Flag for alternate operations for early debugging */ +bool dbg_is_early = true; +/* Next cpu to become the master debug core */ +int dbg_switch_cpu; + +/* Use kdb or gdbserver mode */ +int dbg_kdb_mode = 1; + +static int __init opt_kgdb_con(char *str) +{ + kgdb_use_con = 1; + return 0; +} + +early_param("kgdbcon", opt_kgdb_con); + +module_param(kgdb_use_con, int, 0644); + +/* + * Holds information about breakpoints in a kernel. These breakpoints are + * added and removed by gdb. + */ +static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = { + [0 ... 
KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED } +}; + +/* + * The CPU# of the active CPU, or -1 if none: + */ +atomic_t kgdb_active = ATOMIC_INIT(-1); +EXPORT_SYMBOL_GPL(kgdb_active); +static DEFINE_RAW_SPINLOCK(dbg_master_lock); +static DEFINE_RAW_SPINLOCK(dbg_slave_lock); + +/* + * We use NR_CPUs not PERCPU, in case kgdb is used to debug early + * bootup code (which might not have percpu set up yet): + */ +static atomic_t masters_in_kgdb; +static atomic_t slaves_in_kgdb; +static atomic_t kgdb_break_tasklet_var; +atomic_t kgdb_setting_breakpoint; + +struct task_struct *kgdb_usethread; +struct task_struct *kgdb_contthread; + +int kgdb_single_step; +static pid_t kgdb_sstep_pid; + +/* to keep track of the CPU which is doing the single stepping*/ +atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1); + +/* + * If you are debugging a problem where roundup (the collection of + * all other CPUs) is a problem [this should be extremely rare], + * then use the nokgdbroundup option to avoid roundup. 
In that case + * the other CPUs might interfere with your debugging context, so + * use this with care: + */ +static int kgdb_do_roundup = 1; + +static int __init opt_nokgdbroundup(char *str) +{ + kgdb_do_roundup = 0; + + return 0; +} + +early_param("nokgdbroundup", opt_nokgdbroundup); + +/* + * Finally, some KGDB code :-) + */ + +/* + * Weak aliases for breakpoint management, + * can be overridden by architectures when needed: + */ +int __weak kgdb_arch_set_breakpoint(struct kgdb_bkpt *bpt) +{ + int err; + + err = probe_kernel_read(bpt->saved_instr, (char *)bpt->bpt_addr, + BREAK_INSTR_SIZE); + if (err) + return err; + err = probe_kernel_write((char *)bpt->bpt_addr, + arch_kgdb_ops.gdb_bpt_instr, BREAK_INSTR_SIZE); + return err; +} + +int __weak kgdb_arch_remove_breakpoint(struct kgdb_bkpt *bpt) +{ + return probe_kernel_write((char *)bpt->bpt_addr, + (char *)bpt->saved_instr, BREAK_INSTR_SIZE); +} + +int __weak kgdb_validate_break_address(unsigned long addr) +{ + struct kgdb_bkpt tmp; + int err; + /* Validate setting the breakpoint and then removing it. If the + * remove fails, the kernel needs to emit a bad message because we + * are in deep trouble not being able to put things back the way we + * found them. 
+ */ + tmp.bpt_addr = addr; + err = kgdb_arch_set_breakpoint(&tmp); + if (err) + return err; + err = kgdb_arch_remove_breakpoint(&tmp); + if (err) + printk(KERN_ERR "KGDB: Critical breakpoint error, kernel " + "memory destroyed at: %lx", addr); + return err; +} + +unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int __weak kgdb_arch_init(void) +{ + return 0; +} + +int __weak kgdb_skipexception(int exception, struct pt_regs *regs) +{ + return 0; +} + +/* + * Some architectures need cache flushes when we set/clear a + * breakpoint: + */ +static void kgdb_flush_swbreak_addr(unsigned long addr) +{ + if (!CACHE_FLUSH_IS_SAFE) + return; + + if (current->mm && current->mm->mmap_cache) { + flush_cache_range(current->mm->mmap_cache, + addr, addr + BREAK_INSTR_SIZE); + } + /* Force flush instruction cache if it was outside the mm */ + flush_icache_range(addr, addr + BREAK_INSTR_SIZE); +} + +/* + * SW breakpoint management: + */ +int dbg_activate_sw_breakpoints(void) +{ + int error; + int ret = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_SET) + continue; + + error = kgdb_arch_set_breakpoint(&kgdb_break[i]); + if (error) { + ret = error; + printk(KERN_INFO "KGDB: BP install failed: %lx", + kgdb_break[i].bpt_addr); + continue; + } + + kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); + kgdb_break[i].state = BP_ACTIVE; + } + return ret; +} + +int dbg_set_sw_break(unsigned long addr) +{ + int err = kgdb_validate_break_address(addr); + int breakno = -1; + int i; + + if (err) + return err; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) + return -EEXIST; + } + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state == BP_REMOVED && + kgdb_break[i].bpt_addr == addr) { + breakno = i; + break; + } + } + + if (breakno == -1) { + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if 
(kgdb_break[i].state == BP_UNDEFINED) { + breakno = i; + break; + } + } + } + + if (breakno == -1) + return -E2BIG; + + kgdb_break[breakno].state = BP_SET; + kgdb_break[breakno].type = BP_BREAKPOINT; + kgdb_break[breakno].bpt_addr = addr; + + return 0; +} + +int dbg_deactivate_sw_breakpoints(void) +{ + int error; + int ret = 0; + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + continue; + error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); + if (error) { + printk(KERN_INFO "KGDB: BP remove failed: %lx\n", + kgdb_break[i].bpt_addr); + ret = error; + } + + kgdb_flush_swbreak_addr(kgdb_break[i].bpt_addr); + kgdb_break[i].state = BP_SET; + } + return ret; +} + +int dbg_remove_sw_break(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_SET) && + (kgdb_break[i].bpt_addr == addr)) { + kgdb_break[i].state = BP_REMOVED; + return 0; + } + } + return -ENOENT; +} + +int kgdb_isremovedbreak(unsigned long addr) +{ + int i; + + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if ((kgdb_break[i].state == BP_REMOVED) && + (kgdb_break[i].bpt_addr == addr)) + return 1; + } + return 0; +} + +int dbg_remove_all_break(void) +{ + int error; + int i; + + /* Clear memory breakpoints. */ + for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) { + if (kgdb_break[i].state != BP_ACTIVE) + goto setundefined; + error = kgdb_arch_remove_breakpoint(&kgdb_break[i]); + if (error) + printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n", + kgdb_break[i].bpt_addr); +setundefined: + kgdb_break[i].state = BP_UNDEFINED; + } + + /* Clear hardware breakpoints. */ + if (arch_kgdb_ops.remove_all_hw_break) + arch_kgdb_ops.remove_all_hw_break(); + + return 0; +} + +/* + * Return true if there is a valid kgdb I/O module. Also if no + * debugger is attached a message can be printed to the console about + * waiting for the debugger to attach. 
+ * + * The print_wait argument is only to be true when called from inside + * the core kgdb_handle_exception, because it will wait for the + * debugger to attach. + */ +static int kgdb_io_ready(int print_wait) +{ + if (!dbg_io_ops) + return 0; + if (kgdb_connected) + return 1; + if (atomic_read(&kgdb_setting_breakpoint)) + return 1; + if (print_wait) { +#ifdef CONFIG_KGDB_KDB + if (!dbg_kdb_mode) + printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n"); +#else + printk(KERN_CRIT "KGDB: Waiting for remote debugger\n"); +#endif + } + return 1; +} + +static int kgdb_reenter_check(struct kgdb_state *ks) +{ + unsigned long addr; + + if (atomic_read(&kgdb_active) != raw_smp_processor_id()) + return 0; + + /* Panic on recursive debugger calls: */ + exception_level++; + addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + dbg_deactivate_sw_breakpoints(); + + /* + * If the break point removed ok at the place exception + * occurred, try to recover and print a warning to the end + * user because the user planted a breakpoint in a place that + * KGDB needs in order to function. 
+ */ + if (dbg_remove_sw_break(addr) == 0) { + exception_level = 0; + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + dbg_activate_sw_breakpoints(); + printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n", + addr); + WARN_ON_ONCE(1); + + return 1; + } + dbg_remove_all_break(); + kgdb_skipexception(ks->ex_vector, ks->linux_regs); + + if (exception_level > 1) { + dump_stack(); + panic("Recursive entry to debugger"); + } + + printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n"); +#ifdef CONFIG_KGDB_KDB + /* Allow kdb to debug itself one level */ + return 0; +#endif + dump_stack(); + panic("Recursive entry to debugger"); + + return 1; +} + +static void dbg_touch_watchdogs(void) +{ + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); +} + +static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs, + int exception_state) +{ + unsigned long flags; + int sstep_tries = 100; + int error; + int cpu; + int trace_on = 0; + int online_cpus = num_online_cpus(); + + kgdb_info[ks->cpu].enter_kgdb++; + kgdb_info[ks->cpu].exception_state |= exception_state; + + if (exception_state == DCPU_WANT_MASTER) + atomic_inc(&masters_in_kgdb); + else + atomic_inc(&slaves_in_kgdb); + + if (arch_kgdb_ops.disable_hw_break) + arch_kgdb_ops.disable_hw_break(regs); + +acquirelock: + /* + * Interrupts will be restored by the 'trap return' code, except when + * single stepping. 
+ */ + local_irq_save(flags); + + cpu = ks->cpu; + kgdb_info[cpu].debuggerinfo = regs; + kgdb_info[cpu].task = current; + kgdb_info[cpu].ret_state = 0; + kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT; + + /* Make sure the above info reaches the primary CPU */ + smp_mb(); + + if (exception_level == 1) { + if (raw_spin_trylock(&dbg_master_lock)) + atomic_xchg(&kgdb_active, cpu); + goto cpu_master_loop; + } + + /* + * CPU will loop if it is a slave or request to become a kgdb + * master cpu and acquire the kgdb_active lock: + */ + while (1) { +cpu_loop: + if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) { + kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER; + goto cpu_master_loop; + } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) { + if (raw_spin_trylock(&dbg_master_lock)) { + atomic_xchg(&kgdb_active, cpu); + break; + } + } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) { + if (!raw_spin_is_locked(&dbg_slave_lock)) + goto return_normal; + } else { +return_normal: + /* Return to normal operation by executing any + * hw breakpoint fixup. + */ + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + if (trace_on) + tracing_on(); + kgdb_info[cpu].exception_state &= + ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); + kgdb_info[cpu].enter_kgdb--; + smp_mb__before_atomic_dec(); + atomic_dec(&slaves_in_kgdb); + dbg_touch_watchdogs(); + local_irq_restore(flags); + return 0; + } + cpu_relax(); + } + + /* + * For single stepping, try to only enter on the processor + * that was single stepping. To guard against a deadlock, the + * kernel will only try for the value of sstep_tries before + * giving up and continuing on. 
+ */ + if (atomic_read(&kgdb_cpu_doing_single_step) != -1 && + (kgdb_info[cpu].task && + kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { + atomic_set(&kgdb_active, -1); + raw_spin_unlock(&dbg_master_lock); + dbg_touch_watchdogs(); + local_irq_restore(flags); + + goto acquirelock; + } + + if (!kgdb_io_ready(1)) { + kgdb_info[cpu].ret_state = 1; + goto kgdb_restore; /* No I/O connection, resume the system */ + } + + /* + * Don't enter if we have hit a removed breakpoint. + */ + if (kgdb_skipexception(ks->ex_vector, ks->linux_regs)) + goto kgdb_restore; + + /* Call the I/O driver's pre_exception routine */ + if (dbg_io_ops->pre_exception) + dbg_io_ops->pre_exception(); + + /* + * Get the passive CPU lock which will hold all the non-primary + * CPU in a spin state while the debugger is active + */ + if (!kgdb_single_step) + raw_spin_lock(&dbg_slave_lock); + +#ifdef CONFIG_SMP + /* Signal the other CPUs to enter kgdb_wait() */ + if ((!kgdb_single_step) && kgdb_do_roundup) + kgdb_roundup_cpus(flags); +#endif + + /* + * Wait for the other CPUs to be notified and be waiting for us: + */ + while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) + + atomic_read(&slaves_in_kgdb)) != online_cpus) + cpu_relax(); + + /* + * At this point the primary processor is completely + * in the debugger and all secondary CPUs are quiescent + */ + dbg_deactivate_sw_breakpoints(); + kgdb_single_step = 0; + kgdb_contthread = current; + exception_level = 0; + trace_on = tracing_is_on(); + if (trace_on) + tracing_off(); + + while (1) { +cpu_master_loop: + if (dbg_kdb_mode) { + kgdb_connected = 1; + error = kdb_stub(ks); + if (error == -1) + continue; + kgdb_connected = 0; + } else { + error = gdb_serial_stub(ks); + } + + if (error == DBG_PASS_EVENT) { + dbg_kdb_mode = !dbg_kdb_mode; + } else if (error == DBG_SWITCH_CPU_EVENT) { + kgdb_info[dbg_switch_cpu].exception_state |= + DCPU_NEXT_MASTER; + goto cpu_loop; + } else { + kgdb_info[cpu].ret_state = error; + break; + } + } + + 
/* Call the I/O driver's post_exception routine */ + if (dbg_io_ops->post_exception) + dbg_io_ops->post_exception(); + + if (!kgdb_single_step) { + raw_spin_unlock(&dbg_slave_lock); + /* Wait till all the CPUs have quit from the debugger. */ + while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb)) + cpu_relax(); + } + +kgdb_restore: + if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { + int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step); + if (kgdb_info[sstep_cpu].task) + kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid; + else + kgdb_sstep_pid = 0; + } + if (arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + if (trace_on) + tracing_on(); + + kgdb_info[cpu].exception_state &= + ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); + kgdb_info[cpu].enter_kgdb--; + smp_mb__before_atomic_dec(); + atomic_dec(&masters_in_kgdb); + /* Free kgdb_active */ + atomic_set(&kgdb_active, -1); + raw_spin_unlock(&dbg_master_lock); + dbg_touch_watchdogs(); + local_irq_restore(flags); + + return kgdb_info[cpu].ret_state; +} + +/* + * kgdb_handle_exception() - main entry point from a kernel exception + * + * Locking hierarchy: + * interface locks, if any (begin_session) + * kgdb lock (kgdb_active) + */ +int +kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) +{ + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + + ks->cpu = raw_smp_processor_id(); + ks->ex_vector = evector; + ks->signo = signo; + ks->err_code = ecode; + ks->kgdb_usethreadid = 0; + ks->linux_regs = regs; + + if (kgdb_reenter_check(ks)) + return 0; /* Ouch, double exception ! 
*/ + if (kgdb_info[ks->cpu].enter_kgdb != 0) + return 0; + + return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER); +} + +int kgdb_nmicallback(int cpu, void *regs) +{ +#ifdef CONFIG_SMP + struct kgdb_state kgdb_var; + struct kgdb_state *ks = &kgdb_var; + + memset(ks, 0, sizeof(struct kgdb_state)); + ks->cpu = cpu; + ks->linux_regs = regs; + + if (kgdb_info[ks->cpu].enter_kgdb == 0 && + raw_spin_is_locked(&dbg_master_lock)) { + kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE); + return 0; + } +#endif + return 1; +} + +static void kgdb_console_write(struct console *co, const char *s, + unsigned count) +{ + unsigned long flags; + + /* If we're debugging, or KGDB has not connected, don't try + * and print. */ + if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode) + return; + + local_irq_save(flags); + gdbstub_msg_write(s, count); + local_irq_restore(flags); +} + +static struct console kgdbcons = { + .name = "kgdb", + .write = kgdb_console_write, + .flags = CON_PRINTBUFFER | CON_ENABLED, + .index = -1, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +static void sysrq_handle_dbg(int key) +{ + if (!dbg_io_ops) { + printk(KERN_CRIT "ERROR: No KGDB I/O module available\n"); + return; + } + if (!kgdb_connected) { +#ifdef CONFIG_KGDB_KDB + if (!dbg_kdb_mode) + printk(KERN_CRIT "KGDB or $3#33 for KDB\n"); +#else + printk(KERN_CRIT "Entering KGDB\n"); +#endif + } + + kgdb_breakpoint(); +} + +static struct sysrq_key_op sysrq_dbg_op = { + .handler = sysrq_handle_dbg, + .help_msg = "debug(G)", + .action_msg = "DEBUG", +}; +#endif + +static int kgdb_panic_event(struct notifier_block *self, + unsigned long val, + void *data) +{ + if (dbg_kdb_mode) + kdb_printf("PANIC: %s\n", (char *)data); + kgdb_breakpoint(); + return NOTIFY_DONE; +} + +static struct notifier_block kgdb_panic_event_nb = { + .notifier_call = kgdb_panic_event, + .priority = INT_MAX, +}; + +void __weak kgdb_arch_late(void) +{ +} + +void __init dbg_late_init(void) +{ + dbg_is_early = false; + if (kgdb_io_module_registered) + 
kgdb_arch_late(); + kdb_init(KDB_INIT_FULL); +} + +static void kgdb_register_callbacks(void) +{ + if (!kgdb_io_module_registered) { + kgdb_io_module_registered = 1; + kgdb_arch_init(); + if (!dbg_is_early) + kgdb_arch_late(); + atomic_notifier_chain_register(&panic_notifier_list, + &kgdb_panic_event_nb); +#ifdef CONFIG_MAGIC_SYSRQ + register_sysrq_key('g', &sysrq_dbg_op); +#endif + if (kgdb_use_con && !kgdb_con_registered) { + register_console(&kgdbcons); + kgdb_con_registered = 1; + } + } +} + +static void kgdb_unregister_callbacks(void) +{ + /* + * When this routine is called KGDB should unregister from the + * panic handler and clean up, making sure it is not handling any + * break exceptions at the time. + */ + if (kgdb_io_module_registered) { + kgdb_io_module_registered = 0; + atomic_notifier_chain_unregister(&panic_notifier_list, + &kgdb_panic_event_nb); + kgdb_arch_exit(); +#ifdef CONFIG_MAGIC_SYSRQ + unregister_sysrq_key('g', &sysrq_dbg_op); +#endif + if (kgdb_con_registered) { + unregister_console(&kgdbcons); + kgdb_con_registered = 0; + } + } +} + +/* + * There are times a tasklet needs to be used vs a compiled in + * break point so as to cause an exception outside a kgdb I/O module, + * such as is the case with kgdboe, where calling a breakpoint in the + * I/O driver itself would be fatal. 
+ */ +static void kgdb_tasklet_bpt(unsigned long ing) +{ + kgdb_breakpoint(); + atomic_set(&kgdb_break_tasklet_var, 0); +} + +static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); + +void kgdb_schedule_breakpoint(void) +{ + if (atomic_read(&kgdb_break_tasklet_var) || + atomic_read(&kgdb_active) != -1 || + atomic_read(&kgdb_setting_breakpoint)) + return; + atomic_inc(&kgdb_break_tasklet_var); + tasklet_schedule(&kgdb_tasklet_breakpoint); +} +EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint); + +static void kgdb_initial_breakpoint(void) +{ + kgdb_break_asap = 0; + + printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n"); + kgdb_breakpoint(); +} + +/** + * kgdb_register_io_module - register KGDB IO module + * @new_dbg_io_ops: the io ops vector + * + * Register it with the KGDB core. + */ +int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops) +{ + int err; + + spin_lock(&kgdb_registration_lock); + + if (dbg_io_ops) { + spin_unlock(&kgdb_registration_lock); + + printk(KERN_ERR "kgdb: Another I/O driver is already " + "registered with KGDB.\n"); + return -EBUSY; + } + + if (new_dbg_io_ops->init) { + err = new_dbg_io_ops->init(); + if (err) { + spin_unlock(&kgdb_registration_lock); + return err; + } + } + + dbg_io_ops = new_dbg_io_ops; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO "kgdb: Registered I/O driver %s.\n", + new_dbg_io_ops->name); + + /* Arm KGDB now. */ + kgdb_register_callbacks(); + + if (kgdb_break_asap) + kgdb_initial_breakpoint(); + + return 0; +} +EXPORT_SYMBOL_GPL(kgdb_register_io_module); + +/** + * kgdb_unregister_io_module - unregister KGDB IO module + * @old_dbg_io_ops: the io ops vector + * + * Unregister it with the KGDB core. + */ +void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops) +{ + BUG_ON(kgdb_connected); + + /* + * KGDB is no longer able to communicate out, so + * unregister our callbacks and reset state. 
+ */ + kgdb_unregister_callbacks(); + + spin_lock(&kgdb_registration_lock); + + WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops); + dbg_io_ops = NULL; + + spin_unlock(&kgdb_registration_lock); + + printk(KERN_INFO + "kgdb: Unregistered I/O driver %s, debugger disabled.\n", + old_dbg_io_ops->name); +} +EXPORT_SYMBOL_GPL(kgdb_unregister_io_module); + +int dbg_io_get_char(void) +{ + int ret = dbg_io_ops->read_char(); + if (ret == NO_POLL_CHAR) + return -1; + if (!dbg_kdb_mode) + return ret; + if (ret == 127) + return 8; + return ret; +} + +/** + * kgdb_breakpoint - generate breakpoint exception + * + * This function will generate a breakpoint exception. It is used at the + * beginning of a program to sync up with a debugger and can be used + * otherwise as a quick means to stop program execution and "break" into + * the debugger. + */ +void kgdb_breakpoint(void) +{ + atomic_inc(&kgdb_setting_breakpoint); + wmb(); /* Sync point before breakpoint */ + arch_kgdb_breakpoint(); + wmb(); /* Sync point after breakpoint */ + atomic_dec(&kgdb_setting_breakpoint); +} +EXPORT_SYMBOL_GPL(kgdb_breakpoint); + +static int __init opt_kgdb_wait(char *str) +{ + kgdb_break_asap = 1; + + kdb_init(KDB_INIT_EARLY); + if (kgdb_io_module_registered) + kgdb_initial_breakpoint(); + + return 0; +} + +early_param("kgdbwait", opt_kgdb_wait); diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h new file mode 100644 index 00000000..3494c28a --- /dev/null +++ b/kernel/debug/debug_core.h @@ -0,0 +1,82 @@ +/* + * Created by: Jason Wessel + * + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + */ + +#ifndef _DEBUG_CORE_H_ +#define _DEBUG_CORE_H_ +/* + * These are the private implementation headers between the kernel + * debugger core and the debugger front end code. 
+ */ + +/* kernel debug core data structures */ +struct kgdb_state { + int ex_vector; + int signo; + int err_code; + int cpu; + int pass_exception; + unsigned long thr_query; + unsigned long threadid; + long kgdb_usethreadid; + struct pt_regs *linux_regs; +}; + +/* Exception state values */ +#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */ +#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */ +#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */ +#define DCPU_SSTEP 0x8 /* CPU is single stepping */ + +struct debuggerinfo_struct { + void *debuggerinfo; + struct task_struct *task; + int exception_state; + int ret_state; + int irq_depth; + int enter_kgdb; +}; + +extern struct debuggerinfo_struct kgdb_info[]; + +/* kernel debug core break point routines */ +extern int dbg_remove_all_break(void); +extern int dbg_set_sw_break(unsigned long addr); +extern int dbg_remove_sw_break(unsigned long addr); +extern int dbg_activate_sw_breakpoints(void); +extern int dbg_deactivate_sw_breakpoints(void); + +/* polled character access to i/o module */ +extern int dbg_io_get_char(void); + +/* stub return value for switching between the gdbstub and kdb */ +#define DBG_PASS_EVENT -12345 +/* Switch from one cpu to another */ +#define DBG_SWITCH_CPU_EVENT -123456 +extern int dbg_switch_cpu; + +/* gdbstub interface functions */ +extern int gdb_serial_stub(struct kgdb_state *ks); +extern void gdbstub_msg_write(const char *s, int len); + +/* gdbstub functions used for kdb <-> gdbstub transition */ +extern int gdbstub_state(struct kgdb_state *ks, char *cmd); +extern int dbg_kdb_mode; + +#ifdef CONFIG_KGDB_KDB +extern int kdb_stub(struct kgdb_state *ks); +extern int kdb_parse(const char *cmdstr); +#else /* ! 
CONFIG_KGDB_KDB */ +static inline int kdb_stub(struct kgdb_state *ks) +{ + return DBG_PASS_EVENT; +} +#endif /* CONFIG_KGDB_KDB */ + +#endif /* _DEBUG_CORE_H_ */ diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c new file mode 100644 index 00000000..a11db956 --- /dev/null +++ b/kernel/debug/gdbstub.c @@ -0,0 +1,1125 @@ +/* + * Kernel Debug Core + * + * Maintainer: Jason Wessel + * + * Copyright (C) 2000-2001 VERITAS Software Corporation. + * Copyright (C) 2002-2004 Timesys Corporation + * Copyright (C) 2003-2004 Amit S. Kale + * Copyright (C) 2004 Pavel Machek + * Copyright (C) 2004-2006 Tom Rini + * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. + * Copyright (C) 2005-2009 Wind River Systems, Inc. + * Copyright (C) 2007 MontaVista Software, Inc. + * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar + * + * Contributors at various stages not listed above: + * Jason Wessel ( jason.wessel@windriver.com ) + * George Anzinger + * Anurekh Saxena (anurekh.saxena@timesys.com) + * Lake Stevens Instrument Division (Glenn Engel) + * Jim Kingdon, Cygnus Support. + * + * Original KGDB stub: David Grothe , + * Tigran Aivazian + * + * This file is licensed under the terms of the GNU General Public License + * version 2. This program is licensed "as is" without any warranty of any + * kind, whether express or implied. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "debug_core.h" + +#define KGDB_MAX_THREAD_QUERY 17 + +/* Our I/O buffers. */ +static char remcom_in_buffer[BUFMAX]; +static char remcom_out_buffer[BUFMAX]; + +/* Storage for the registers, in GDB format. 
*/ +static unsigned long gdb_regs[(NUMREGBYTES + + sizeof(unsigned long) - 1) / + sizeof(unsigned long)]; + +/* + * GDB remote protocol parser: + */ + +#ifdef CONFIG_KGDB_KDB +static int gdbstub_read_wait(void) +{ + int ret = -1; + int i; + + /* poll any additional I/O interfaces that are defined */ + while (ret < 0) + for (i = 0; kdb_poll_funcs[i] != NULL; i++) { + ret = kdb_poll_funcs[i](); + if (ret > 0) + break; + } + return ret; +} +#else +static int gdbstub_read_wait(void) +{ + int ret = dbg_io_ops->read_char(); + while (ret == NO_POLL_CHAR) + ret = dbg_io_ops->read_char(); + return ret; +} +#endif +/* scan for the sequence $<data>#<checksum> */ +static void get_packet(char *buffer) +{ + unsigned char checksum; + unsigned char xmitcsum; + int count; + char ch; + + do { + /* + * Spin and wait around for the start character, ignore all + * other characters: + */ + while ((ch = (gdbstub_read_wait())) != '$') + /* nothing */; + + kgdb_connected = 1; + checksum = 0; + xmitcsum = -1; + + count = 0; + + /* + * now, read until a # or end of buffer is found: + */ + while (count < (BUFMAX - 1)) { + ch = gdbstub_read_wait(); + if (ch == '#') + break; + checksum = checksum + ch; + buffer[count] = ch; + count = count + 1; + } + buffer[count] = 0; + + if (ch == '#') { + xmitcsum = hex_to_bin(gdbstub_read_wait()) << 4; + xmitcsum += hex_to_bin(gdbstub_read_wait()); + + if (checksum != xmitcsum) + /* failed checksum */ + dbg_io_ops->write_char('-'); + else + /* successful transfer */ + dbg_io_ops->write_char('+'); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + } + } while (checksum != xmitcsum); +} + +/* + * Send the packet in buffer. + * Check for gdb connection if asked for. + */ +static void put_packet(char *buffer) +{ + unsigned char checksum; + int count; + char ch; + + /* + * $<packet info>#<checksum>. 
+ */ + while (1) { + dbg_io_ops->write_char('$'); + checksum = 0; + count = 0; + + while ((ch = buffer[count])) { + dbg_io_ops->write_char(ch); + checksum += ch; + count++; + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + + /* Now see what we get in reply. */ + ch = gdbstub_read_wait(); + + if (ch == 3) + ch = gdbstub_read_wait(); + + /* If we get an ACK, we are done. */ + if (ch == '+') + return; + + /* + * If we get the start of another packet, this means + * that GDB is attempting to reconnect. We will NAK + * the packet being sent, and stop trying to send this + * packet. + */ + if (ch == '$') { + dbg_io_ops->write_char('-'); + if (dbg_io_ops->flush) + dbg_io_ops->flush(); + return; + } + } +} + +static char gdbmsgbuf[BUFMAX + 1]; + +void gdbstub_msg_write(const char *s, int len) +{ + char *bufptr; + int wcount; + int i; + + if (len == 0) + len = strlen(s); + + /* 'O'utput */ + gdbmsgbuf[0] = 'O'; + + /* Fill and send buffers... */ + while (len > 0) { + bufptr = gdbmsgbuf + 1; + + /* Calculate how many this time */ + if ((len << 1) > (BUFMAX - 2)) + wcount = (BUFMAX - 2) >> 1; + else + wcount = len; + + /* Pack in hex chars */ + for (i = 0; i < wcount; i++) + bufptr = pack_hex_byte(bufptr, s[i]); + *bufptr = '\0'; + + /* Move up */ + s += wcount; + len -= wcount; + + /* Write packet */ + put_packet(gdbmsgbuf); + } +} + +/* + * Convert the memory pointed to by mem into hex, placing result in + * buf. Return a pointer to the last char put in buf (null). May + * return an error. + */ +char *kgdb_mem2hex(char *mem, char *buf, int count) +{ + char *tmp; + int err; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory copy. Hex conversion will work against this one. 
+ */ + tmp = buf + count; + + err = probe_kernel_read(tmp, mem, count); + if (err) + return NULL; + while (count > 0) { + buf = pack_hex_byte(buf, *tmp); + tmp++; + count--; + } + *buf = 0; + + return buf; +} + +/* + * Convert the hex array pointed to by buf into binary to be placed in + * mem. Return a pointer to the character AFTER the last byte + * written. May return an error. + */ +int kgdb_hex2mem(char *buf, char *mem, int count) +{ + char *tmp_raw; + char *tmp_hex; + + /* + * We use the upper half of buf as an intermediate buffer for the + * raw memory that is converted from hex. + */ + tmp_raw = buf + count * 2; + + tmp_hex = tmp_raw - 1; + while (tmp_hex >= buf) { + tmp_raw--; + *tmp_raw = hex_to_bin(*tmp_hex--); + *tmp_raw |= hex_to_bin(*tmp_hex--) << 4; + } + + return probe_kernel_write(mem, tmp_raw, count); +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. + */ +int kgdb_hex2long(char **ptr, unsigned long *long_val) +{ + int hex_val; + int num = 0; + int negate = 0; + + *long_val = 0; + + if (**ptr == '-') { + negate = 1; + (*ptr)++; + } + while (**ptr) { + hex_val = hex_to_bin(**ptr); + if (hex_val < 0) + break; + + *long_val = (*long_val << 4) | hex_val; + num++; + (*ptr)++; + } + + if (negate) + *long_val = -*long_val; + + return num; +} + +/* + * Copy the binary array pointed to by buf into mem. Fix $, #, and + * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success. + * The input buf is overwitten with the result to write to mem. 
+ */ +static int kgdb_ebin2mem(char *buf, char *mem, int count) +{ + int size = 0; + char *c = buf; + + while (count-- > 0) { + c[size] = *buf++; + if (c[size] == 0x7d) + c[size] = *buf++ ^ 0x20; + size++; + } + + return probe_kernel_write(mem, c, size); +} + +#if DBG_MAX_REG_NUM > 0 +void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_get_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} + +void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) +{ + int i; + int idx = 0; + char *ptr = (char *)gdb_regs; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + dbg_set_reg(i, ptr + idx, regs); + idx += dbg_reg_def[i].size; + } +} +#endif /* DBG_MAX_REG_NUM > 0 */ + +/* Write memory due to an 'M' or 'X' packet. */ +static int write_mem_msg(int binary) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long addr; + unsigned long length; + int err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && + kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { + if (binary) + err = kgdb_ebin2mem(ptr, (char *)addr, length); + else + err = kgdb_hex2mem(ptr, (char *)addr, length); + if (err) + return err; + if (CACHE_FLUSH_IS_SAFE) + flush_icache_range(addr, addr + length); + return 0; + } + + return -EINVAL; +} + +static void error_packet(char *pkt, int error) +{ + error = -error; + pkt[0] = 'E'; + pkt[1] = hex_asc[(error / 10)]; + pkt[2] = hex_asc[(error % 10)]; + pkt[3] = '\0'; +} + +/* + * Thread ID accessors. We represent a flat TID space to GDB, where + * the per CPU idle threads (which under Linux all have PID 0) are + * remapped to negative TIDs. 
+ */ + +#define BUF_THREAD_ID_SIZE 8 + +static char *pack_threadid(char *pkt, unsigned char *id) +{ + unsigned char *limit; + int lzero = 1; + + limit = id + (BUF_THREAD_ID_SIZE / 2); + while (id < limit) { + if (!lzero || *id != 0) { + pkt = pack_hex_byte(pkt, *id); + lzero = 0; + } + id++; + } + + if (lzero) + pkt = pack_hex_byte(pkt, 0); + + return pkt; +} + +static void int_to_threadref(unsigned char *id, int value) +{ + put_unaligned_be32(value, id); +} + +static struct task_struct *getthread(struct pt_regs *regs, int tid) +{ + /* + * Non-positive TIDs are remapped to the cpu shadow information + */ + if (tid == 0 || tid == -1) + tid = -atomic_read(&kgdb_active) - 2; + if (tid < -1 && tid > -NR_CPUS - 2) { + if (kgdb_info[-tid - 2].task) + return kgdb_info[-tid - 2].task; + else + return idle_task(-tid - 2); + } + if (tid <= 0) { + printk(KERN_ERR "KGDB: Internal thread select error\n"); + dump_stack(); + return NULL; + } + + /* + * find_task_by_pid_ns() does not take the tasklist lock anymore + * but is nicely RCU locked - hence is a pretty resilient + * thing to use: + */ + return find_task_by_pid_ns(tid, &init_pid_ns); +} + + +/* + * Remap normal tasks to their real PID, + * CPU shadow threads are mapped to -CPU - 2 + */ +static inline int shadow_pid(int realpid) +{ + if (realpid) + return realpid; + + return -raw_smp_processor_id() - 2; +} + +/* + * All the functions that start with gdb_cmd are the various + * operations to implement the handlers for the gdbserial protocol + * where KGDB is communicating with an external debugger + */ + +/* Handle the '?' status packets */ +static void gdb_cmd_status(struct kgdb_state *ks) +{ + /* + * We know that this packet is only sent + * during initial connect. So to be safe, + * we clear out our breakpoints now in case + * GDB is reconnecting. 
+ */ + dbg_remove_all_break(); + + remcom_out_buffer[0] = 'S'; + pack_hex_byte(&remcom_out_buffer[1], ks->signo); +} + +static void gdb_get_regs_helper(struct kgdb_state *ks) +{ + struct task_struct *thread; + void *local_debuggerinfo; + int i; + + thread = kgdb_usethread; + if (!thread) { + thread = kgdb_info[ks->cpu].task; + local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; + } else { + local_debuggerinfo = NULL; + for_each_online_cpu(i) { + /* + * Try to find the task on some other + * or possibly this node if we do not + * find the matching task then we try + * to approximate the results. + */ + if (thread == kgdb_info[i].task) + local_debuggerinfo = kgdb_info[i].debuggerinfo; + } + } + + /* + * All threads that don't have debuggerinfo should be + * in schedule() sleeping, since all other CPUs + * are in kgdb_wait, and thus have debuggerinfo. + */ + if (local_debuggerinfo) { + pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo); + } else { + /* + * Pull stuff saved during switch_to; nothing + * else is accessible (or even particularly + * relevant). + * + * This should be enough for a stack trace. 
+ */ + sleeping_thread_to_gdb_regs(gdb_regs, thread); + } +} + +/* Handle the 'g' get registers request */ +static void gdb_cmd_getregs(struct kgdb_state *ks) +{ + gdb_get_regs_helper(ks); + kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES); +} + +/* Handle the 'G' set registers request */ +static void gdb_cmd_setregs(struct kgdb_state *ks) +{ + kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES); + + if (kgdb_usethread && kgdb_usethread != current) { + error_packet(remcom_out_buffer, -EINVAL); + } else { + gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); + } +} + +/* Handle the 'm' memory read bytes */ +static void gdb_cmd_memread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + unsigned long length; + unsigned long addr; + char *err; + + if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && + kgdb_hex2long(&ptr, &length) > 0) { + err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length); + if (!err) + error_packet(remcom_out_buffer, -EINVAL); + } else { + error_packet(remcom_out_buffer, -EINVAL); + } +} + +/* Handle the 'M' memory write bytes */ +static void gdb_cmd_memwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(0); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +#if DBG_MAX_REG_NUM > 0 +static char *gdb_hex_reg_helper(int regnum, char *out) +{ + int i; + int offset = 0; + + for (i = 0; i < regnum; i++) + offset += dbg_reg_def[i].size; + return kgdb_mem2hex((char *)gdb_regs + offset, out, + dbg_reg_def[i].size); +} + +/* Handle the 'p' individual regster get */ +static void gdb_cmd_reg_get(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + + kgdb_hex2long(&ptr, ®num); + if (regnum >= DBG_MAX_REG_NUM) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + gdb_get_regs_helper(ks); + gdb_hex_reg_helper(regnum, remcom_out_buffer); +} + +/* Handle the 'P' individual regster set */ 
+static void gdb_cmd_reg_set(struct kgdb_state *ks) +{ + unsigned long regnum; + char *ptr = &remcom_in_buffer[1]; + int i = 0; + + kgdb_hex2long(&ptr, ®num); + if (*ptr++ != '=' || + !(!kgdb_usethread || kgdb_usethread == current) || + !dbg_get_reg(regnum, gdb_regs, ks->linux_regs)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + memset(gdb_regs, 0, sizeof(gdb_regs)); + while (i < sizeof(gdb_regs) * 2) + if (hex_to_bin(ptr[i]) >= 0) + i++; + else + break; + i = i / 2; + kgdb_hex2mem(ptr, (char *)gdb_regs, i); + dbg_set_reg(regnum, gdb_regs, ks->linux_regs); + strcpy(remcom_out_buffer, "OK"); +} +#endif /* DBG_MAX_REG_NUM > 0 */ + +/* Handle the 'X' memory binary write bytes */ +static void gdb_cmd_binwrite(struct kgdb_state *ks) +{ + int err = write_mem_msg(1); + + if (err) + error_packet(remcom_out_buffer, err); + else + strcpy(remcom_out_buffer, "OK"); +} + +/* Handle the 'D' or 'k', detach or kill packets */ +static void gdb_cmd_detachkill(struct kgdb_state *ks) +{ + int error; + + /* The detach case */ + if (remcom_in_buffer[0] == 'D') { + error = dbg_remove_all_break(); + if (error < 0) { + error_packet(remcom_out_buffer, error); + } else { + strcpy(remcom_out_buffer, "OK"); + kgdb_connected = 0; + } + put_packet(remcom_out_buffer); + } else { + /* + * Assume the kill case, with no exit code checking, + * trying to force detach the debugger: + */ + dbg_remove_all_break(); + kgdb_connected = 0; + } +} + +/* Handle the 'R' reboot packets */ +static int gdb_cmd_reboot(struct kgdb_state *ks) +{ + /* For now, only honor R0 */ + if (strcmp(remcom_in_buffer, "R0") == 0) { + printk(KERN_CRIT "Executing emergency reboot\n"); + strcpy(remcom_out_buffer, "OK"); + put_packet(remcom_out_buffer); + + /* + * Execution should not return from + * machine_emergency_restart() + */ + machine_emergency_restart(); + kgdb_connected = 0; + + return 1; + } + return 0; +} + +/* Handle the 'q' query packets */ +static void gdb_cmd_query(struct kgdb_state *ks) +{ + struct 
task_struct *g; + struct task_struct *p; + unsigned char thref[BUF_THREAD_ID_SIZE]; + char *ptr; + int i; + int cpu; + int finished = 0; + + switch (remcom_in_buffer[1]) { + case 's': + case 'f': + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) + break; + + i = 0; + remcom_out_buffer[0] = 'm'; + ptr = remcom_out_buffer + 1; + if (remcom_in_buffer[1] == 'f') { + /* Each cpu is a shadow thread */ + for_each_online_cpu(cpu) { + ks->thr_query = 0; + int_to_threadref(thref, -cpu - 2); + ptr = pack_threadid(ptr, thref); + *(ptr++) = ','; + i++; + } + } + + do_each_thread(g, p) { + if (i >= ks->thr_query && !finished) { + int_to_threadref(thref, p->pid); + ptr = pack_threadid(ptr, thref); + *(ptr++) = ','; + ks->thr_query++; + if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0) + finished = 1; + } + i++; + } while_each_thread(g, p); + + *(--ptr) = '\0'; + break; + + case 'C': + /* Current thread id */ + strcpy(remcom_out_buffer, "QC"); + ks->threadid = shadow_pid(current->pid); + int_to_threadref(thref, ks->threadid); + pack_threadid(remcom_out_buffer + 2, thref); + break; + case 'T': + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) + break; + + ks->threadid = 0; + ptr = remcom_in_buffer + 17; + kgdb_hex2long(&ptr, &ks->threadid); + if (!getthread(ks->linux_regs, ks->threadid)) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + if ((int)ks->threadid > 0) { + kgdb_mem2hex(getthread(ks->linux_regs, + ks->threadid)->comm, + remcom_out_buffer, 16); + } else { + static char tmpstr[23 + BUF_THREAD_ID_SIZE]; + + sprintf(tmpstr, "shadowCPU%d", + (int)(-ks->threadid - 2)); + kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); + } + break; +#ifdef CONFIG_KGDB_KDB + case 'R': + if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) { + int len = strlen(remcom_in_buffer + 6); + + if ((len % 2) != 0) { + strcpy(remcom_out_buffer, "E01"); + break; + } + kgdb_hex2mem(remcom_in_buffer + 6, + remcom_out_buffer, len); + len = len / 2; + remcom_out_buffer[len++] = 0; + 
+ kdb_parse(remcom_out_buffer); + strcpy(remcom_out_buffer, "OK"); + } + break; +#endif + } +} + +/* Handle the 'H' task query packets */ +static void gdb_cmd_task(struct kgdb_state *ks) +{ + struct task_struct *thread; + char *ptr; + + switch (remcom_in_buffer[1]) { + case 'g': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_usethread = thread; + ks->kgdb_usethreadid = ks->threadid; + strcpy(remcom_out_buffer, "OK"); + break; + case 'c': + ptr = &remcom_in_buffer[2]; + kgdb_hex2long(&ptr, &ks->threadid); + if (!ks->threadid) { + kgdb_contthread = NULL; + } else { + thread = getthread(ks->linux_regs, ks->threadid); + if (!thread && ks->threadid > 0) { + error_packet(remcom_out_buffer, -EINVAL); + break; + } + kgdb_contthread = thread; + } + strcpy(remcom_out_buffer, "OK"); + break; + } +} + +/* Handle the 'T' thread query packets */ +static void gdb_cmd_thread(struct kgdb_state *ks) +{ + char *ptr = &remcom_in_buffer[1]; + struct task_struct *thread; + + kgdb_hex2long(&ptr, &ks->threadid); + thread = getthread(ks->linux_regs, ks->threadid); + if (thread) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, -EINVAL); +} + +/* Handle the 'z' or 'Z' breakpoint remove or set packets */ +static void gdb_cmd_break(struct kgdb_state *ks) +{ + /* + * Since GDB-5.3, it's been drafted that '0' is a software + * breakpoint, '1' is a hardware breakpoint, so let's do that. + */ + char *bpt_type = &remcom_in_buffer[1]; + char *ptr = &remcom_in_buffer[2]; + unsigned long addr; + unsigned long length; + int error = 0; + + if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') { + /* Unsupported */ + if (*bpt_type > '4') + return; + } else { + if (*bpt_type != '0' && *bpt_type != '1') + /* Unsupported. 
*/ + return; + } + + /* + * Test if this is a hardware breakpoint, and + * if we support it: + */ + if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)) + /* Unsupported. */ + return; + + if (*(ptr++) != ',') { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (!kgdb_hex2long(&ptr, &addr)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + if (*(ptr++) != ',' || + !kgdb_hex2long(&ptr, &length)) { + error_packet(remcom_out_buffer, -EINVAL); + return; + } + + if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') + error = dbg_set_sw_break(addr); + else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') + error = dbg_remove_sw_break(addr); + else if (remcom_in_buffer[0] == 'Z') + error = arch_kgdb_ops.set_hw_breakpoint(addr, + (int)length, *bpt_type - '0'); + else if (remcom_in_buffer[0] == 'z') + error = arch_kgdb_ops.remove_hw_breakpoint(addr, + (int) length, *bpt_type - '0'); + + if (error == 0) + strcpy(remcom_out_buffer, "OK"); + else + error_packet(remcom_out_buffer, error); +} + +/* Handle the 'C' signal / exception passing packets */ +static int gdb_cmd_exception_pass(struct kgdb_state *ks) +{ + /* C09 == pass exception + * C15 == detach kgdb, pass exception + */ + if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'c'; + + } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') { + + ks->pass_exception = 1; + remcom_in_buffer[0] = 'D'; + dbg_remove_all_break(); + kgdb_connected = 0; + return 1; + + } else { + gdbstub_msg_write("KGDB only knows signal 9 (pass)" + " and 15 (pass and disconnect)\n" + "Executing a continue without signal passing\n", 0); + remcom_in_buffer[0] = 'c'; + } + + /* Indicate fall through */ + return -1; +} + +/* + * This function performs all gdbserial command procesing + */ +int gdb_serial_stub(struct kgdb_state *ks) +{ + int error = 0; + int tmp; + + /* Initialize comm buffer and globals. 
*/ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + kgdb_usethread = kgdb_info[ks->cpu].task; + ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid); + ks->pass_exception = 0; + + if (kgdb_connected) { + unsigned char thref[BUF_THREAD_ID_SIZE]; + char *ptr; + + /* Reply to host that an exception has occurred */ + ptr = remcom_out_buffer; + *ptr++ = 'T'; + ptr = pack_hex_byte(ptr, ks->signo); + ptr += strlen(strcpy(ptr, "thread:")); + int_to_threadref(thref, shadow_pid(current->pid)); + ptr = pack_threadid(ptr, thref); + *ptr++ = ';'; + put_packet(remcom_out_buffer); + } + + while (1) { + error = 0; + + /* Clear the out buffer. */ + memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); + + get_packet(remcom_in_buffer); + + switch (remcom_in_buffer[0]) { + case '?': /* gdbserial status */ + gdb_cmd_status(ks); + break; + case 'g': /* return the value of the CPU registers */ + gdb_cmd_getregs(ks); + break; + case 'G': /* set the value of the CPU registers - return OK */ + gdb_cmd_setregs(ks); + break; + case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ + gdb_cmd_memread(ks); + break; + case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_memwrite(ks); + break; +#if DBG_MAX_REG_NUM > 0 + case 'p': /* pXX Return gdb register XX (in hex) */ + gdb_cmd_reg_get(ks); + break; + case 'P': /* PXX=aaaa Set gdb register XX to aaaa (in hex) */ + gdb_cmd_reg_set(ks); + break; +#endif /* DBG_MAX_REG_NUM > 0 */ + case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ + gdb_cmd_binwrite(ks); + break; + /* kill or detach. KGDB should treat this like a + * continue. 
+ */ + case 'D': /* Debugger detach */ + case 'k': /* Debugger detach via kill */ + gdb_cmd_detachkill(ks); + goto default_handle; + case 'R': /* Reboot */ + if (gdb_cmd_reboot(ks)) + goto default_handle; + break; + case 'q': /* query command */ + gdb_cmd_query(ks); + break; + case 'H': /* task related */ + gdb_cmd_task(ks); + break; + case 'T': /* Query thread status */ + gdb_cmd_thread(ks); + break; + case 'z': /* Break point remove */ + case 'Z': /* Break point set */ + gdb_cmd_break(ks); + break; +#ifdef CONFIG_KGDB_KDB + case '3': /* Escape into back into kdb */ + if (remcom_in_buffer[1] == '\0') { + gdb_cmd_detachkill(ks); + return DBG_PASS_EVENT; + } +#endif + case 'C': /* Exception passing */ + tmp = gdb_cmd_exception_pass(ks); + if (tmp > 0) + goto default_handle; + if (tmp == 0) + break; + /* Fall through on tmp < 0 */ + case 'c': /* Continue packet */ + case 's': /* Single step packet */ + if (kgdb_contthread && kgdb_contthread != current) { + /* Can't switch threads in kgdb */ + error_packet(remcom_out_buffer, -EINVAL); + break; + } + dbg_activate_sw_breakpoints(); + /* Fall through to default processing */ + default: +default_handle: + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + /* + * Leave cmd processing on error, detach, + * kill, continue, or single step. 
+ */ + if (error >= 0 || remcom_in_buffer[0] == 'D' || + remcom_in_buffer[0] == 'k') { + error = 0; + goto kgdb_exit; + } + + } + + /* reply to the request */ + put_packet(remcom_out_buffer); + } + +kgdb_exit: + if (ks->pass_exception) + error = 1; + return error; +} + +int gdbstub_state(struct kgdb_state *ks, char *cmd) +{ + int error; + + switch (cmd[0]) { + case 'e': + error = kgdb_arch_handle_exception(ks->ex_vector, + ks->signo, + ks->err_code, + remcom_in_buffer, + remcom_out_buffer, + ks->linux_regs); + return error; + case 's': + case 'c': + strcpy(remcom_in_buffer, cmd); + return 0; + case '?': + gdb_cmd_status(ks); + break; + case '\0': + strcpy(remcom_out_buffer, ""); + break; + } + dbg_io_ops->write_char('+'); + put_packet(remcom_out_buffer); + return 0; +} + +/** + * gdbstub_exit - Send an exit message to GDB + * @status: The exit code to report. + */ +void gdbstub_exit(int status) +{ + unsigned char checksum, ch, buffer[3]; + int loop; + + buffer[0] = 'W'; + buffer[1] = hex_asc_hi(status); + buffer[2] = hex_asc_lo(status); + + dbg_io_ops->write_char('$'); + checksum = 0; + + for (loop = 0; loop < 3; loop++) { + ch = buffer[loop]; + checksum += ch; + dbg_io_ops->write_char(ch); + } + + dbg_io_ops->write_char('#'); + dbg_io_ops->write_char(hex_asc_hi(checksum)); + dbg_io_ops->write_char(hex_asc_lo(checksum)); + + /* make sure the output is flushed, lest the bootloader clobber it */ + dbg_io_ops->flush(); +} diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore new file mode 100644 index 00000000..396d12ed --- /dev/null +++ b/kernel/debug/kdb/.gitignore @@ -0,0 +1 @@ +gen-kdb_cmds.c diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile new file mode 100644 index 00000000..d4fc58f4 --- /dev/null +++ b/kernel/debug/kdb/Makefile @@ -0,0 +1,25 @@ +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. 
+# +# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. +# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. +# + +CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p') +obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o +obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o + +clean-files := gen-kdb_cmds.c + +quiet_cmd_gen-kdb = GENKDB $@ + cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include "; print "\#include "} \ + /^\#/{next} \ + /^[ \t]*$$/{next} \ + {gsub(/"/, "\\\"", $$0); \ + print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \ + END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \ + $(filter-out %/Makefile,$^) > $@# + +$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile + $(call cmd,gen-kdb) diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c new file mode 100644 index 00000000..20059ef4 --- /dev/null +++ b/kernel/debug/kdb/kdb_bp.c @@ -0,0 +1,562 @@ +/* + * Kernel Debugger Architecture Independent Breakpoint Handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + +/* + * Table of kdb_breakpoints + */ +kdb_bp_t kdb_breakpoints[KDB_MAXBPT]; + +static void kdb_setsinglestep(struct pt_regs *regs) +{ + KDB_STATE_SET(DOING_SS); +} + +static char *kdb_rwtypes[] = { + "Instruction(i)", + "Instruction(Register)", + "Data Write", + "I/O", + "Data Access" +}; + +static char *kdb_bptype(kdb_bp_t *bp) +{ + if (bp->bp_type < 0 || bp->bp_type > 4) + return ""; + + return kdb_rwtypes[bp->bp_type]; +} + +static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp) +{ + int nextarg = *nextargp; + int diag; + + bp->bph_length = 1; + if ((argc + 1) != nextarg) { + if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) + bp->bp_type = BP_ACCESS_WATCHPOINT; + else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) + bp->bp_type = BP_WRITE_WATCHPOINT; + else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) + bp->bp_type = BP_HARDWARE_BREAKPOINT; + else + return KDB_ARGCOUNT; + + bp->bph_length = 1; + + nextarg++; + + if ((argc + 1) != nextarg) { + unsigned long len; + + diag = kdbgetularg((char *)argv[nextarg], + &len); + if (diag) + return diag; + + + if (len > 8) + return KDB_BADLENGTH; + + bp->bph_length = len; + nextarg++; + } + + if ((argc + 1) != nextarg) + return KDB_ARGCOUNT; + } + + *nextargp = nextarg; + return 0; +} + +static int _kdb_bp_remove(kdb_bp_t *bp) +{ + int ret = 1; + if (!bp->bp_installed) + return ret; + if (!bp->bp_type) + ret = dbg_remove_sw_break(bp->bp_addr); + else + ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr, + bp->bph_length, + bp->bp_type); + if (ret == 0) + bp->bp_installed = 0; + return ret; +} + +static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp) +{ + if (KDB_DEBUG(BP)) + kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs)); + + /* + * Setup single step + */ + kdb_setsinglestep(regs); + + /* + * Reset delay attribute + */ + 
bp->bp_delay = 0; + bp->bp_delayed = 1; +} + +static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) +{ + int ret; + /* + * Install the breakpoint, if it is not already installed. + */ + + if (KDB_DEBUG(BP)) + kdb_printf("%s: bp_installed %d\n", + __func__, bp->bp_installed); + if (!KDB_STATE(SSBPT)) + bp->bp_delay = 0; + if (bp->bp_installed) + return 1; + if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) { + if (KDB_DEBUG(BP)) + kdb_printf("%s: delayed bp\n", __func__); + kdb_handle_bp(regs, bp); + return 0; + } + if (!bp->bp_type) + ret = dbg_set_sw_break(bp->bp_addr); + else + ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr, + bp->bph_length, + bp->bp_type); + if (ret == 0) { + bp->bp_installed = 1; + } else { + kdb_printf("%s: failed to set breakpoint at 0x%lx\n", + __func__, bp->bp_addr); + return 1; + } + return 0; +} + +/* + * kdb_bp_install + * + * Install kdb_breakpoints prior to returning from the + * kernel debugger. This allows the kdb_breakpoints to be set + * upon functions that are used internally by kdb, such as + * printk(). This function is only called once per kdb session. + */ +void kdb_bp_install(struct pt_regs *regs) +{ + int i; + + for (i = 0; i < KDB_MAXBPT; i++) { + kdb_bp_t *bp = &kdb_breakpoints[i]; + + if (KDB_DEBUG(BP)) { + kdb_printf("%s: bp %d bp_enabled %d\n", + __func__, i, bp->bp_enabled); + } + if (bp->bp_enabled) + _kdb_bp_install(regs, bp); + } +} + +/* + * kdb_bp_remove + * + * Remove kdb_breakpoints upon entry to the kernel debugger. + * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ +void kdb_bp_remove(void) +{ + int i; + + for (i = KDB_MAXBPT - 1; i >= 0; i--) { + kdb_bp_t *bp = &kdb_breakpoints[i]; + + if (KDB_DEBUG(BP)) { + kdb_printf("%s: bp %d bp_enabled %d\n", + __func__, i, bp->bp_enabled); + } + if (bp->bp_enabled) + _kdb_bp_remove(bp); + } +} + + +/* + * kdb_printbp + * + * Internal function to format and print a breakpoint entry. 
+ * + * Parameters: + * None. + * Outputs: + * None. + * Returns: + * None. + * Locking: + * None. + * Remarks: + */ + +static void kdb_printbp(kdb_bp_t *bp, int i) +{ + kdb_printf("%s ", kdb_bptype(bp)); + kdb_printf("BP #%d at ", i); + kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT); + + if (bp->bp_enabled) + kdb_printf("\n is enabled"); + else + kdb_printf("\n is disabled"); + + kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n", + bp->bp_addr, bp->bp_type, bp->bp_installed); + + kdb_printf("\n"); +} + +/* + * kdb_bp + * + * Handle the bp commands. + * + * [bp|bph] [DATAR|DATAW] + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic if failure. + * Locking: + * None. + * Remarks: + * + * bp Set breakpoint on all cpus. Only use hardware assist if need. + * bph Set breakpoint on all cpus. Force hardware register + */ + +static int kdb_bp(int argc, const char **argv) +{ + int i, bpno; + kdb_bp_t *bp, *bp_check; + int diag; + char *symname = NULL; + long offset = 0ul; + int nextarg; + kdb_bp_t template = {0}; + + if (argc == 0) { + /* + * Display breakpoint table + */ + for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; + bpno++, bp++) { + if (bp->bp_free) + continue; + kdb_printbp(bp, bpno); + } + + return 0; + } + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr, + &offset, &symname); + if (diag) + return diag; + if (!template.bp_addr) + return KDB_BADINT; + + /* + * Find an empty bp structure to allocate + */ + for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) { + if (bp->bp_free) + break; + } + + if (bpno == KDB_MAXBPT) + return KDB_TOOMANYBPT; + + if (strcmp(argv[0], "bph") == 0) { + template.bp_type = BP_HARDWARE_BREAKPOINT; + diag = kdb_parsebp(argc, argv, &nextarg, &template); + if (diag) + return diag; + } else { + template.bp_type = BP_BREAKPOINT; + } + + /* + * Check 
for clashing breakpoints. + * + * Note, in this design we can't have hardware breakpoints + * enabled for both read and write on the same address. + */ + for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT; + i++, bp_check++) { + if (!bp_check->bp_free && + bp_check->bp_addr == template.bp_addr) { + kdb_printf("You already have a breakpoint at " + kdb_bfd_vma_fmt0 "\n", template.bp_addr); + return KDB_DUPBPT; + } + } + + template.bp_enabled = 1; + + /* + * Actually allocate the breakpoint found earlier + */ + *bp = template; + bp->bp_free = 0; + + kdb_printbp(bp, bpno); + + return 0; +} + +/* + * kdb_bc + * + * Handles the 'bc', 'be', and 'bd' commands + * + * [bd|bc|be] + * [bd|bc|be] * + * + * Parameters: + * argc Count of arguments in argv + * argv Space delimited command line arguments + * Outputs: + * None. + * Returns: + * Zero for success, a kdb diagnostic for failure + * Locking: + * None. + * Remarks: + */ +static int kdb_bc(int argc, const char **argv) +{ + unsigned long addr; + kdb_bp_t *bp = NULL; + int lowbp = KDB_MAXBPT; + int highbp = 0; + int done = 0; + int i; + int diag = 0; + + int cmd; /* KDBCMD_B? */ +#define KDBCMD_BC 0 +#define KDBCMD_BE 1 +#define KDBCMD_BD 2 + + if (strcmp(argv[0], "be") == 0) + cmd = KDBCMD_BE; + else if (strcmp(argv[0], "bd") == 0) + cmd = KDBCMD_BD; + else + cmd = KDBCMD_BC; + + if (argc != 1) + return KDB_ARGCOUNT; + + if (strcmp(argv[1], "*") == 0) { + lowbp = 0; + highbp = KDB_MAXBPT; + } else { + diag = kdbgetularg(argv[1], &addr); + if (diag) + return diag; + + /* + * For addresses less than the maximum breakpoint number, + * assume that the breakpoint number is desired. 
+ */ + if (addr < KDB_MAXBPT) { + bp = &kdb_breakpoints[addr]; + lowbp = highbp = addr; + highbp++; + } else { + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; + i++, bp++) { + if (bp->bp_addr == addr) { + lowbp = highbp = i; + highbp++; + break; + } + } + } + } + + /* + * Now operate on the set of breakpoints matching the input + * criteria (either '*' for all, or an individual breakpoint). + */ + for (bp = &kdb_breakpoints[lowbp], i = lowbp; + i < highbp; + i++, bp++) { + if (bp->bp_free) + continue; + + done++; + + switch (cmd) { + case KDBCMD_BC: + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " cleared\n", + i, bp->bp_addr); + + bp->bp_addr = 0; + bp->bp_free = 1; + + break; + case KDBCMD_BE: + bp->bp_enabled = 1; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " enabled", + i, bp->bp_addr); + + kdb_printf("\n"); + break; + case KDBCMD_BD: + if (!bp->bp_enabled) + break; + + bp->bp_enabled = 0; + + kdb_printf("Breakpoint %d at " + kdb_bfd_vma_fmt " disabled\n", + i, bp->bp_addr); + + break; + } + if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) { + bp->bp_delay = 0; + KDB_STATE_CLEAR(SSBPT); + } + } + + return (!done) ? KDB_BPTNOTFOUND : 0; +} + +/* + * kdb_ss + * + * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch) + * commands. + * + * ss + * ssb + * + * Parameters: + * argc Argument count + * argv Argument vector + * Outputs: + * None. + * Returns: + * KDB_CMD_SS[B] for success, a kdb error if failure. + * Locking: + * None. + * Remarks: + * + * Set the arch specific option to trigger a debug trap after the next + * instruction. + * + * For 'ssb', set the trace flag in the debug trap handler + * after printing the current insn and return directly without + * invoking the kdb command processor, until a branch instruction + * is encountered. 
+ */ + +static int kdb_ss(int argc, const char **argv) +{ + int ssb = 0; + + ssb = (strcmp(argv[0], "ssb") == 0); + if (argc != 0) + return KDB_ARGCOUNT; + /* + * Set trace flag and go. + */ + KDB_STATE_SET(DOING_SS); + if (ssb) { + KDB_STATE_SET(DOING_SSB); + return KDB_CMD_SSB; + } + return KDB_CMD_SS; +} + +/* Initialize the breakpoint table and register breakpoint commands. */ + +void __init kdb_initbptab(void) +{ + int i; + kdb_bp_t *bp; + + /* + * First time initialization. + */ + memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints)); + + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) + bp->bp_free = 1; + + kdb_register_repeat("bp", kdb_bp, "[]", + "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("bl", kdb_bp, "[]", + "Display breakpoints", 0, KDB_REPEAT_NO_ARGS); + if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT) + kdb_register_repeat("bph", kdb_bp, "[]", + "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("bc", kdb_bc, "", + "Clear Breakpoint", 0, KDB_REPEAT_NONE); + kdb_register_repeat("be", kdb_bc, "", + "Enable Breakpoint", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bd", kdb_bc, "", + "Disable Breakpoint", 0, KDB_REPEAT_NONE); + + kdb_register_repeat("ss", kdb_ss, "", + "Single Step", 1, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("ssb", kdb_ss, "", + "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS); + /* + * Architecture dependent initialization. + */ +} diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c new file mode 100644 index 00000000..2f62fe85 --- /dev/null +++ b/kernel/debug/kdb/kdb_bt.c @@ -0,0 +1,210 @@ +/* + * Kernel Debugger Architecture Independent Stack Traceback + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. 
+ * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + + +static void kdb_show_stack(struct task_struct *p, void *addr) +{ + int old_lvl = console_loglevel; + console_loglevel = 15; + kdb_trap_printk++; + kdb_set_current_task(p); + if (addr) { + show_stack((struct task_struct *)p, addr); + } else if (kdb_current_regs) { +#ifdef CONFIG_X86 + show_stack(p, &kdb_current_regs->sp); +#else + show_stack(p, NULL); +#endif + } else { + show_stack(p, NULL); + } + console_loglevel = old_lvl; + kdb_trap_printk--; +} + +/* + * kdb_bt + * + * This function implements the 'bt' command. Print a stack + * traceback. + * + * bt [] (addr-exp is for alternate stacks) + * btp Kernel stack for + * btt Kernel stack for task structure at + * + * bta [DRSTCZEUIMA] All useful processes, optionally + * filtered by state + * btc [] The current process on one cpu, + * default is all cpus + * + * bt refers to a address on the stack, that location + * is assumed to contain a return address. + * + * btt refers to the address of a struct task. + * + * Inputs: + * argc argument count + * argv argument vector + * Outputs: + * None. + * Returns: + * zero for success, a kdb diagnostic if error + * Locking: + * none. + * Remarks: + * Backtrack works best when the code uses frame pointers. But even + * without frame pointers we should get a reasonable trace. + * + * mds comes in handy when examining the stack to do a manual traceback or + * to get a starting point for bt . 
+ */ + +static int +kdb_bt1(struct task_struct *p, unsigned long mask, + int argcount, int btaprompt) +{ + char buffer[2]; + if (kdb_getarea(buffer[0], (unsigned long)p) || + kdb_getarea(buffer[0], (unsigned long)(p+1)-1)) + return KDB_BADADDR; + if (!kdb_task_state(p, mask)) + return 0; + kdb_printf("Stack traceback for pid %d\n", p->pid); + kdb_ps1(p); + kdb_show_stack(p, NULL); + if (btaprompt) { + kdb_getstr(buffer, sizeof(buffer), + "Enter to end, to continue:"); + if (buffer[0] == 'q') { + kdb_printf("\n"); + return 1; + } + } + touch_nmi_watchdog(); + return 0; +} + +int +kdb_bt(int argc, const char **argv) +{ + int diag; + int argcount = 5; + int btaprompt = 1; + int nextarg; + unsigned long addr; + long offset; + + kdbgetintenv("BTARGS", &argcount); /* Arguments to print */ + kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each + * proc in bta */ + + if (strcmp(argv[0], "bta") == 0) { + struct task_struct *g, *p; + unsigned long cpu; + unsigned long mask = kdb_task_state_string(argc ? 
argv[1] : + NULL); + if (argc == 0) + kdb_ps_suppressed(); + /* Run the active tasks first */ + for_each_online_cpu(cpu) { + p = kdb_curr_task(cpu); + if (kdb_bt1(p, mask, argcount, btaprompt)) + return 0; + } + /* Now the inactive tasks */ + kdb_do_each_thread(g, p) { + if (task_curr(p)) + continue; + if (kdb_bt1(p, mask, argcount, btaprompt)) + return 0; + } kdb_while_each_thread(g, p); + } else if (strcmp(argv[0], "btp") == 0) { + struct task_struct *p; + unsigned long pid; + if (argc != 1) + return KDB_ARGCOUNT; + diag = kdbgetularg((char *)argv[1], &pid); + if (diag) + return diag; + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (p) { + kdb_set_current_task(p); + return kdb_bt1(p, ~0UL, argcount, 0); + } + kdb_printf("No process with pid == %ld found\n", pid); + return 0; + } else if (strcmp(argv[0], "btt") == 0) { + if (argc != 1) + return KDB_ARGCOUNT; + diag = kdbgetularg((char *)argv[1], &addr); + if (diag) + return diag; + kdb_set_current_task((struct task_struct *)addr); + return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0); + } else if (strcmp(argv[0], "btc") == 0) { + unsigned long cpu = ~0; + struct task_struct *save_current_task = kdb_current_task; + char buf[80]; + if (argc > 1) + return KDB_ARGCOUNT; + if (argc == 1) { + diag = kdbgetularg((char *)argv[1], &cpu); + if (diag) + return diag; + } + /* Recursive use of kdb_parse, do not use argv after + * this point */ + argv = NULL; + if (cpu != ~0) { + if (cpu >= num_possible_cpus() || !cpu_online(cpu)) { + kdb_printf("no process for cpu %ld\n", cpu); + return 0; + } + sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + kdb_parse(buf); + return 0; + } + kdb_printf("btc: cpu status: "); + kdb_parse("cpu\n"); + for_each_online_cpu(cpu) { + sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu)); + kdb_parse(buf); + touch_nmi_watchdog(); + } + kdb_set_current_task(save_current_task); + return 0; + } else { + if (argc) { + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, + &offset, NULL); + if 
(diag) + return diag; + kdb_show_stack(kdb_current_task, (void *)addr); + return 0; + } else { + return kdb_bt1(kdb_current_task, ~0UL, argcount, 0); + } + } + + /* NOTREACHED */ + return 0; +} diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds new file mode 100644 index 00000000..56c88e4d --- /dev/null +++ b/kernel/debug/kdb/kdb_cmds @@ -0,0 +1,35 @@ +# Initial commands for kdb, alter to suit your needs. +# These commands are executed in kdb_init() context, no SMP, no +# processes. Commands that require process data (including stack or +# registers) are not reliable this early. set and bp commands should +# be safe. Global breakpoint commands affect each cpu as it is booted. + +# Standard debugging information for first level support, just type archkdb +# or archkdbcpu or archkdbshort at the kdb prompt. + +defcmd dumpcommon "" "Common kdb debugging" + set BTAPROMPT 0 + set LINES 10000 + -summary + -cpu + -ps + -dmesg 600 + -bt +endefcmd + +defcmd dumpall "" "First line debugging" + set BTSYMARG 1 + set BTARGS 9 + pid R + -dumpcommon + -bta +endefcmd + +defcmd dumpcpu "" "Same as dumpall but only tasks on cpus" + set BTSYMARG 1 + set BTARGS 9 + pid R + -dumpcommon + -btc +endefcmd + diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c new file mode 100644 index 00000000..dd0b1b7d --- /dev/null +++ b/kernel/debug/kdb/kdb_debugger.c @@ -0,0 +1,168 @@ +/* + * Created by: Jason Wessel + * + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. 
+ */ + +#include +#include +#include +#include "kdb_private.h" +#include "../debug_core.h" + +/* + * KDB interface to KGDB internals + */ +get_char_func kdb_poll_funcs[] = { + dbg_io_get_char, + NULL, + NULL, + NULL, + NULL, + NULL, +}; +EXPORT_SYMBOL_GPL(kdb_poll_funcs); + +int kdb_poll_idx = 1; +EXPORT_SYMBOL_GPL(kdb_poll_idx); + +int kdb_stub(struct kgdb_state *ks) +{ + int error = 0; + kdb_bp_t *bp; + unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs); + kdb_reason_t reason = KDB_REASON_OOPS; + kdb_dbtrap_t db_result = KDB_DB_NOBPT; + int i; + + if (KDB_STATE(REENTRY)) { + reason = KDB_REASON_SWITCH; + KDB_STATE_CLEAR(REENTRY); + addr = instruction_pointer(ks->linux_regs); + } + ks->pass_exception = 0; + if (atomic_read(&kgdb_setting_breakpoint)) + reason = KDB_REASON_KEYBOARD; + + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { + if ((bp->bp_enabled) && (bp->bp_addr == addr)) { + reason = KDB_REASON_BREAK; + db_result = KDB_DB_BPT; + if (addr != instruction_pointer(ks->linux_regs)) + kgdb_arch_set_pc(ks->linux_regs, addr); + break; + } + } + if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) { + for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) { + if (bp->bp_free) + continue; + if (bp->bp_addr == addr) { + bp->bp_delay = 1; + bp->bp_delayed = 1; + /* + * SSBPT is set when the kernel debugger must single step a + * task in order to re-establish an instruction breakpoint + * which uses the instruction replacement mechanism. It is + * cleared by any action that removes the need to single-step + * the breakpoint. 
+ */ + reason = KDB_REASON_BREAK; + db_result = KDB_DB_BPT; + KDB_STATE_SET(SSBPT); + break; + } + } + } + + if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 && + ks->signo == SIGTRAP) { + reason = KDB_REASON_SSTEP; + db_result = KDB_DB_BPT; + } + /* Set initial kdb state variables */ + KDB_STATE_CLEAR(KGDB_TRANS); + kdb_initial_cpu = atomic_read(&kgdb_active); + kdb_current_task = kgdb_info[ks->cpu].task; + kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo; + /* Remove any breakpoints as needed by kdb and clear single step */ + kdb_bp_remove(); + KDB_STATE_CLEAR(DOING_SS); + KDB_STATE_CLEAR(DOING_SSB); + KDB_STATE_SET(PAGER); + /* zero out any offline cpu data */ + for_each_present_cpu(i) { + if (!cpu_online(i)) { + kgdb_info[i].debuggerinfo = NULL; + kgdb_info[i].task = NULL; + } + } + if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) { + ks->pass_exception = 1; + KDB_FLAG_SET(CATASTROPHIC); + } + if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) { + KDB_STATE_CLEAR(SSBPT); + KDB_STATE_CLEAR(DOING_SS); + } else { + /* Start kdb main loop */ + error = kdb_main_loop(KDB_REASON_ENTER, reason, + ks->err_code, db_result, ks->linux_regs); + } + /* + * Upon exit from the kdb main loop setup break points and restart + * the system based on the requested continue state + */ + kdb_initial_cpu = -1; + kdb_current_task = NULL; + kdb_current_regs = NULL; + KDB_STATE_CLEAR(PAGER); + kdbnearsym_cleanup(); + if (error == KDB_CMD_KGDB) { + if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) { + /* + * This inteface glue which allows kdb to transition in into + * the gdb stub. In order to do this the '?' or '' gdb serial + * packet response is processed here. And then control is + * passed to the gdbstub. 
+ */ + if (KDB_STATE(DOING_KGDB)) + gdbstub_state(ks, "?"); + else + gdbstub_state(ks, ""); + KDB_STATE_CLEAR(DOING_KGDB); + KDB_STATE_CLEAR(DOING_KGDB2); + } + return DBG_PASS_EVENT; + } + kdb_bp_install(ks->linux_regs); + dbg_activate_sw_breakpoints(); + /* Set the exit state to a single step or a continue */ + if (KDB_STATE(DOING_SS)) + gdbstub_state(ks, "s"); + else + gdbstub_state(ks, "c"); + + KDB_FLAG_CLEAR(CATASTROPHIC); + + /* Invoke arch specific exception handling prior to system resume */ + kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e"); + if (ks->pass_exception) + kgdb_info[ks->cpu].ret_state = 1; + if (error == KDB_CMD_CPU) { + KDB_STATE_SET(REENTRY); + /* + * Force clear the single step bit because kdb emulates this + * differently vs the gdbstub + */ + kgdb_single_step = 0; + dbg_deactivate_sw_breakpoints(); + return DBG_SWITCH_CPU_EVENT; + } + return kgdb_info[ks->cpu].ret_state; +} + diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c new file mode 100644 index 00000000..96fdaac4 --- /dev/null +++ b/kernel/debug/kdb/kdb_io.c @@ -0,0 +1,826 @@ +/* + * Kernel Debugger Architecture Independent Console I/O handler + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + +#define CMD_BUFLEN 256 +char kdb_prompt_str[CMD_BUFLEN]; + +int kdb_trap_printk; + +static void kgdb_transition_check(char *buffer) +{ + int slen = strlen(buffer); + if (strncmp(buffer, "$?#3f", slen) != 0 && + strncmp(buffer, "$qSupported#37", slen) != 0 && + strncmp(buffer, "+$qSupported#37", slen) != 0) { + KDB_STATE_SET(KGDB_TRANS); + kdb_printf("%s", buffer); + } +} + +static int kdb_read_get_key(char *buffer, size_t bufsize) +{ +#define ESCAPE_UDELAY 1000 +#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */ + char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */ + char *ped = escape_data; + int escape_delay = 0; + get_char_func *f, *f_escape = NULL; + int key; + + for (f = &kdb_poll_funcs[0]; ; ++f) { + if (*f == NULL) { + /* Reset NMI watchdog once per poll loop */ + touch_nmi_watchdog(); + f = &kdb_poll_funcs[0]; + } + if (escape_delay == 2) { + *ped = '\0'; + ped = escape_data; + --escape_delay; + } + if (escape_delay == 1) { + key = *ped++; + if (!*ped) + --escape_delay; + break; + } + key = (*f)(); + if (key == -1) { + if (escape_delay) { + udelay(ESCAPE_UDELAY); + --escape_delay; + } + continue; + } + if (bufsize <= 2) { + if (key == '\r') + key = '\n'; + *buffer++ = key; + *buffer = '\0'; + return -1; + } + if (escape_delay == 0 && key == '\e') { + escape_delay = ESCAPE_DELAY; + ped = escape_data; + f_escape = f; + } + if (escape_delay) { + *ped++ = key; + if (f_escape != f) { + escape_delay = 2; + continue; + } + if (ped - escape_data == 1) { + /* \e */ + continue; + } else if (ped - escape_data == 2) { + /* \e */ + if (key != '[') + escape_delay = 2; + continue; + } else if (ped - escape_data == 3) { + /* \e[ */ + int mapkey = 0; + switch (key) { + case 'A': /* \e[A, up arrow */ + mapkey = 16; + break; + case 'B': /* \e[B, 
down arrow */ + mapkey = 14; + break; + case 'C': /* \e[C, right arrow */ + mapkey = 6; + break; + case 'D': /* \e[D, left arrow */ + mapkey = 2; + break; + case '1': /* dropthrough */ + case '3': /* dropthrough */ + /* \e[<1,3,4>], may be home, del, end */ + case '4': + mapkey = -1; + break; + } + if (mapkey != -1) { + if (mapkey > 0) { + escape_data[0] = mapkey; + escape_data[1] = '\0'; + } + escape_delay = 2; + } + continue; + } else if (ped - escape_data == 4) { + /* \e[<1,3,4> */ + int mapkey = 0; + if (key == '~') { + switch (escape_data[2]) { + case '1': /* \e[1~, home */ + mapkey = 1; + break; + case '3': /* \e[3~, del */ + mapkey = 4; + break; + case '4': /* \e[4~, end */ + mapkey = 5; + break; + } + } + if (mapkey > 0) { + escape_data[0] = mapkey; + escape_data[1] = '\0'; + } + escape_delay = 2; + continue; + } + } + break; /* A key to process */ + } + return key; +} + +/* + * kdb_read + * + * This function reads a string of characters, terminated by + * a newline, or by reaching the end of the supplied buffer, + * from the current kernel debugger console device. + * Parameters: + * buffer - Address of character buffer to receive input characters. + * bufsize - size, in bytes, of the character buffer + * Returns: + * Returns a pointer to the buffer containing the received + * character string. This string will be terminated by a + * newline character. + * Locking: + * No locks are required to be held upon entry to this + * function. It is not reentrant - it relies on the fact + * that while kdb is running on only one "master debug" cpu. + * Remarks: + * + * The buffer size must be >= 2. A buffer size of 2 means that the caller only + * wants a single key. + * + * An escape key could be the start of a vt100 control sequence such as \e[D + * (left arrow) or it could be a character in its own right. The standard + * method for detecting the difference is to wait for 2 seconds to see if there + * are any other characters. 
kdb is complicated by the lack of a timer service + * (interrupts are off), by multiple input sources and by the need to sometimes + * return after just one key. Escape sequence processing has to be done as + * states in the polling loop. + */ + +static char *kdb_read(char *buffer, size_t bufsize) +{ + char *cp = buffer; + char *bufend = buffer+bufsize-2; /* Reserve space for newline + * and null byte */ + char *lastchar; + char *p_tmp; + char tmp; + static char tmpbuffer[CMD_BUFLEN]; + int len = strlen(buffer); + int len_tmp; + int tab = 0; + int count; + int i; + int diag, dtab_count; + int key; + + + diag = kdbgetintenv("DTABCOUNT", &dtab_count); + if (diag) + dtab_count = 30; + + if (len > 0) { + cp += len; + if (*(buffer+len-1) == '\n') + cp--; + } + + lastchar = cp; + *cp = '\0'; + kdb_printf("%s", buffer); +poll_again: + key = kdb_read_get_key(buffer, bufsize); + if (key == -1) + return buffer; + if (key != 9) + tab = 0; + switch (key) { + case 8: /* backspace */ + if (cp > buffer) { + if (cp < lastchar) { + memcpy(tmpbuffer, cp, lastchar - cp); + memcpy(cp-1, tmpbuffer, lastchar - cp); + } + *(--lastchar) = '\0'; + --cp; + kdb_printf("\b%s \r", cp); + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } + break; + case 13: /* enter */ + *lastchar++ = '\n'; + *lastchar++ = '\0'; + kdb_printf("\n"); + return buffer; + case 4: /* Del */ + if (cp < lastchar) { + memcpy(tmpbuffer, cp+1, lastchar - cp - 1); + memcpy(cp, tmpbuffer, lastchar - cp - 1); + *(--lastchar) = '\0'; + kdb_printf("%s \r", cp); + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } + break; + case 1: /* Home */ + if (cp > buffer) { + kdb_printf("\r"); + kdb_printf(kdb_prompt_str); + cp = buffer; + } + break; + case 5: /* End */ + if (cp < lastchar) { + kdb_printf("%s", cp); + cp = lastchar; + } + break; + case 2: /* Left */ + if (cp > buffer) { + kdb_printf("\b"); + --cp; + } + break; + case 14: /* 
Down */ + memset(tmpbuffer, ' ', + strlen(kdb_prompt_str) + (lastchar-buffer)); + *(tmpbuffer+strlen(kdb_prompt_str) + + (lastchar-buffer)) = '\0'; + kdb_printf("\r%s\r", tmpbuffer); + *lastchar = (char)key; + *(lastchar+1) = '\0'; + return lastchar; + case 6: /* Right */ + if (cp < lastchar) { + kdb_printf("%c", *cp); + ++cp; + } + break; + case 16: /* Up */ + memset(tmpbuffer, ' ', + strlen(kdb_prompt_str) + (lastchar-buffer)); + *(tmpbuffer+strlen(kdb_prompt_str) + + (lastchar-buffer)) = '\0'; + kdb_printf("\r%s\r", tmpbuffer); + *lastchar = (char)key; + *(lastchar+1) = '\0'; + return lastchar; + case 9: /* Tab */ + if (tab < 2) + ++tab; + p_tmp = buffer; + while (*p_tmp == ' ') + p_tmp++; + if (p_tmp > cp) + break; + memcpy(tmpbuffer, p_tmp, cp-p_tmp); + *(tmpbuffer + (cp-p_tmp)) = '\0'; + p_tmp = strrchr(tmpbuffer, ' '); + if (p_tmp) + ++p_tmp; + else + p_tmp = tmpbuffer; + len = strlen(p_tmp); + count = kallsyms_symbol_complete(p_tmp, + sizeof(tmpbuffer) - + (p_tmp - tmpbuffer)); + if (tab == 2 && count > 0) { + kdb_printf("\n%d symbols are found.", count); + if (count > dtab_count) { + count = dtab_count; + kdb_printf(" But only first %d symbols will" + " be printed.\nYou can change the" + " environment variable DTABCOUNT.", + count); + } + kdb_printf("\n"); + for (i = 0; i < count; i++) { + if (kallsyms_symbol_next(p_tmp, i) < 0) + break; + kdb_printf("%s ", p_tmp); + *(p_tmp + len) = '\0'; + } + if (i >= dtab_count) + kdb_printf("..."); + kdb_printf("\n"); + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + } else if (tab != 2 && count > 0) { + len_tmp = strlen(p_tmp); + strncpy(p_tmp+len_tmp, cp, lastchar-cp+1); + len_tmp = strlen(p_tmp); + strncpy(cp, p_tmp+len, len_tmp-len + 1); + len = len_tmp - len; + kdb_printf("%s", cp); + cp += len; + lastchar += len; + } + kdb_nextline = 1; /* reset output line number */ + break; + default: + if (key >= 32 && lastchar < bufend) { + if (cp < lastchar) { + memcpy(tmpbuffer, cp, lastchar - cp); + memcpy(cp+1, 
tmpbuffer, lastchar - cp); + *++lastchar = '\0'; + *cp = key; + kdb_printf("%s\r", cp); + ++cp; + tmp = *cp; + *cp = '\0'; + kdb_printf(kdb_prompt_str); + kdb_printf("%s", buffer); + *cp = tmp; + } else { + *++lastchar = '\0'; + *cp++ = key; + /* The kgdb transition check will hide + * printed characters if we think that + * kgdb is connecting, until the check + * fails */ + if (!KDB_STATE(KGDB_TRANS)) + kgdb_transition_check(buffer); + else + kdb_printf("%c", key); + } + /* Special escape to kgdb */ + if (lastchar - buffer >= 5 && + strcmp(lastchar - 5, "$?#3f") == 0) { + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB); + return buffer; + } + if (lastchar - buffer >= 14 && + strcmp(lastchar - 14, "$qSupported#37") == 0) { + strcpy(buffer, "kgdb"); + KDB_STATE_SET(DOING_KGDB2); + return buffer; + } + } + break; + } + goto poll_again; +} + +/* + * kdb_getstr + * + * Print the prompt string and read a command from the + * input device. + * + * Parameters: + * buffer Address of buffer to receive command + * bufsize Size of buffer in bytes + * prompt Pointer to string to use as prompt string + * Returns: + * Pointer to command buffer. + * Locking: + * None. + * Remarks: + * For SMP kernels, the processor number will be + * substituted for %d, %x or %o in the prompt. + */ + +char *kdb_getstr(char *buffer, size_t bufsize, char *prompt) +{ + if (prompt && kdb_prompt_str != prompt) + strncpy(kdb_prompt_str, prompt, CMD_BUFLEN); + kdb_printf(kdb_prompt_str); + kdb_nextline = 1; /* Prompt and input resets line number */ + return kdb_read(buffer, bufsize); +} + +/* + * kdb_input_flush + * + * Get rid of any buffered console input. + * + * Parameters: + * none + * Returns: + * nothing + * Locking: + * none + * Remarks: + * Call this function whenever you want to flush input. If there is any + * outstanding input, it ignores all characters until there has been no + * data for approximately 1ms. 
+ */ + +static void kdb_input_flush(void) +{ + get_char_func *f; + int res; + int flush_delay = 1; + while (flush_delay) { + flush_delay--; +empty: + touch_nmi_watchdog(); + for (f = &kdb_poll_funcs[0]; *f; ++f) { + res = (*f)(); + if (res != -1) { + flush_delay = 1; + goto empty; + } + } + if (flush_delay) + mdelay(1); + } +} + +/* + * kdb_printf + * + * Print a string to the output device(s). + * + * Parameters: + * printf-like format and optional args. + * Returns: + * 0 + * Locking: + * None. + * Remarks: + * use 'kdbcons->write()' to avoid polluting 'log_buf' with + * kdb output. + * + * If the user is doing a cmd args | grep srch + * then kdb_grepping_flag is set. + * In that case we need to accumulate full lines (ending in \n) before + * searching for the pattern. + */ + +static char kdb_buffer[256]; /* A bit too big to go on stack */ +static char *next_avail = kdb_buffer; +static int size_avail; +static int suspend_grep; + +/* + * search arg1 to see if it contains arg2 + * (kdmain.c provides flags for ^pat and pat$) + * + * return 1 for found, 0 for not found + */ +static int kdb_search_string(char *searched, char *searchfor) +{ + char firstchar, *cp; + int len1, len2; + + /* not counting the newline at the end of "searched" */ + len1 = strlen(searched)-1; + len2 = strlen(searchfor); + if (len1 < len2) + return 0; + if (kdb_grep_leading && kdb_grep_trailing && len1 != len2) + return 0; + if (kdb_grep_leading) { + if (!strncmp(searched, searchfor, len2)) + return 1; + } else if (kdb_grep_trailing) { + if (!strncmp(searched+len1-len2, searchfor, len2)) + return 1; + } else { + firstchar = *searchfor; + cp = searched; + while ((cp = strchr(cp, firstchar))) { + if (!strncmp(cp, searchfor, len2)) + return 1; + cp++; + } + } + return 0; +} + +int vkdb_printf(const char *fmt, va_list ap) +{ + int diag; + int linecount; + int logging, saved_loglevel = 0; + int saved_trap_printk; + int got_printf_lock = 0; + int retlen = 0; + int fnd, len; + char *cp, *cp2, *cphold 
= NULL, replaced_byte = ' '; + char *moreprompt = "more> "; + struct console *c = console_drivers; + static DEFINE_SPINLOCK(kdb_printf_lock); + unsigned long uninitialized_var(flags); + + preempt_disable(); + saved_trap_printk = kdb_trap_printk; + kdb_trap_printk = 0; + + /* Serialize kdb_printf if multiple cpus try to write at once. + * But if any cpu goes recursive in kdb, just print the output, + * even if it is interleaved with any other text. + */ + if (!KDB_STATE(PRINTF_LOCK)) { + KDB_STATE_SET(PRINTF_LOCK); + spin_lock_irqsave(&kdb_printf_lock, flags); + got_printf_lock = 1; + atomic_inc(&kdb_event); + } else { + __acquire(kdb_printf_lock); + } + + diag = kdbgetintenv("LINES", &linecount); + if (diag || linecount <= 1) + linecount = 24; + + diag = kdbgetintenv("LOGGING", &logging); + if (diag) + logging = 0; + + if (!kdb_grepping_flag || suspend_grep) { + /* normally, every vsnprintf starts a new buffer */ + next_avail = kdb_buffer; + size_avail = sizeof(kdb_buffer); + } + vsnprintf(next_avail, size_avail, fmt, ap); + + /* + * If kdb_parse() found that the command was cmd xxx | grep yyy + * then kdb_grepping_flag is set, and kdb_grep_string contains yyy + * + * Accumulate the print data up to a newline before searching it. + * (vsnprintf does null-terminate the string that it generates) + */ + + /* skip the search if prints are temporarily unconditional */ + if (!suspend_grep && kdb_grepping_flag) { + cp = strchr(kdb_buffer, '\n'); + if (!cp) { + /* + * Special cases that don't end with newlines + * but should be written without one: + * The "[nn]kdb> " prompt should + * appear at the front of the buffer. + * + * The "[nn]more " prompt should also be + * (MOREPROMPT -> moreprompt) + * written * but we print that ourselves, + * we set the suspend_grep flag to make + * it unconditional. 
+ * + */ + if (next_avail == kdb_buffer) { + /* + * these should occur after a newline, + * so they will be at the front of the + * buffer + */ + cp2 = kdb_buffer; + len = strlen(kdb_prompt_str); + if (!strncmp(cp2, kdb_prompt_str, len)) { + /* + * We're about to start a new + * command, so we can go back + * to normal mode. + */ + kdb_grepping_flag = 0; + goto kdb_printit; + } + } + /* no newline; don't search/write the buffer + until one is there */ + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + goto kdb_print_out; + } + + /* + * The newline is present; print through it or discard + * it, depending on the results of the search. + */ + cp++; /* to byte after the newline */ + replaced_byte = *cp; /* remember what/where it was */ + cphold = cp; + *cp = '\0'; /* end the string for our search */ + + /* + * We now have a newline at the end of the string + * Only continue with this output if it contains the + * search string. + */ + fnd = kdb_search_string(kdb_buffer, kdb_grep_string); + if (!fnd) { + /* + * At this point the complete line at the start + * of kdb_buffer can be discarded, as it does + * not contain what the user is looking for. + * Shift the buffer left. + */ + *cphold = replaced_byte; + strcpy(kdb_buffer, cphold); + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + goto kdb_print_out; + } + /* + * at this point the string is a full line and + * should be printed, up to the null. + */ + } +kdb_printit: + + /* + * Write to all consoles. 
+ */ + retlen = strlen(kdb_buffer); + if (!dbg_kdb_mode && kgdb_connected) { + gdbstub_msg_write(kdb_buffer, retlen); + } else { + if (!dbg_io_ops->is_console) { + len = strlen(kdb_buffer); + cp = kdb_buffer; + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + while (c) { + c->write(c, kdb_buffer, retlen); + touch_nmi_watchdog(); + c = c->next; + } + } + if (logging) { + saved_loglevel = console_loglevel; + console_loglevel = 0; + printk(KERN_INFO "%s", kdb_buffer); + } + + if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n')) + kdb_nextline++; + + /* check for having reached the LINES number of printed lines */ + if (kdb_nextline == linecount) { + char buf1[16] = ""; +#if defined(CONFIG_SMP) + char buf2[32]; +#endif + + /* Watch out for recursion here. Any routine that calls + * kdb_printf will come back through here. And kdb_read + * uses kdb_printf to echo on serial consoles ... + */ + kdb_nextline = 1; /* In case of recursion */ + + /* + * Pause until cr. + */ + moreprompt = kdbgetenv("MOREPROMPT"); + if (moreprompt == NULL) + moreprompt = "more> "; + +#if defined(CONFIG_SMP) + if (strchr(moreprompt, '%')) { + sprintf(buf2, moreprompt, get_cpu()); + put_cpu(); + moreprompt = buf2; + } +#endif + + kdb_input_flush(); + c = console_drivers; + + if (!dbg_io_ops->is_console) { + len = strlen(moreprompt); + cp = moreprompt; + while (len--) { + dbg_io_ops->write_char(*cp); + cp++; + } + } + while (c) { + c->write(c, moreprompt, strlen(moreprompt)); + touch_nmi_watchdog(); + c = c->next; + } + + if (logging) + printk("%s", moreprompt); + + kdb_read(buf1, 2); /* '2' indicates to return + * immediately after getting one key. 
*/ + kdb_nextline = 1; /* Really set output line 1 */ + + /* empty and reset the buffer: */ + kdb_buffer[0] = '\0'; + next_avail = kdb_buffer; + size_avail = sizeof(kdb_buffer); + if ((buf1[0] == 'q') || (buf1[0] == 'Q')) { + /* user hit q or Q */ + KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */ + KDB_STATE_CLEAR(PAGER); + /* end of command output; back to normal mode */ + kdb_grepping_flag = 0; + kdb_printf("\n"); + } else if (buf1[0] == ' ') { + kdb_printf("\n"); + suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] == '\n') { + kdb_nextline = linecount - 1; + kdb_printf("\r"); + suspend_grep = 1; /* for this recursion */ + } else if (buf1[0] && buf1[0] != '\n') { + /* user hit something other than enter */ + suspend_grep = 1; /* for this recursion */ + kdb_printf("\nOnly 'q' or 'Q' are processed at more " + "prompt, input ignored\n"); + } else if (kdb_grepping_flag) { + /* user hit enter */ + suspend_grep = 1; /* for this recursion */ + kdb_printf("\n"); + } + kdb_input_flush(); + } + + /* + * For grep searches, shift the printed string left. + * replaced_byte contains the character that was overwritten with + * the terminating null, and cphold points to the null. + * Then adjust the notion of available space in the buffer. + */ + if (kdb_grepping_flag && !suspend_grep) { + *cphold = replaced_byte; + strcpy(kdb_buffer, cphold); + len = strlen(kdb_buffer); + next_avail = kdb_buffer + len; + size_avail = sizeof(kdb_buffer) - len; + } + +kdb_print_out: + suspend_grep = 0; /* end of what may have been a recursive call */ + if (logging) + console_loglevel = saved_loglevel; + if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) { + got_printf_lock = 0; + spin_unlock_irqrestore(&kdb_printf_lock, flags); + KDB_STATE_CLEAR(PRINTF_LOCK); + atomic_dec(&kdb_event); + } else { + __release(kdb_printf_lock); + } + kdb_trap_printk = saved_trap_printk; + preempt_enable(); + return retlen; +} + +int kdb_printf(const char *fmt, ...) 
+{
+	va_list ap;
+	int r;
+
+	va_start(ap, fmt);
+	r = vkdb_printf(fmt, ap);	/* all formatting and output is done by vkdb_printf */
+	va_end(ap);
+
+	return r;
+}
+EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 00000000..4bca6349
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,212 @@
+/*
+ * Kernel Debugger Architecture Dependent Console I/O handler
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.
+ *
+ * Copyright (c) 1999-2006 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (c) 2009 Wind River Systems, Inc.  All Rights Reserved.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+/* Keyboard Controller Registers on normal PCs. */
+
+#define KBD_STATUS_REG		0x64	/* Status register (R) */
+#define KBD_DATA_REG		0x60	/* Keyboard data register (R/W) */
+
+/* Status Register Bits */
+
+#define KBD_STAT_OBF 		0x01	/* Keyboard output buffer full */
+#define KBD_STAT_MOUSE_OBF	0x20	/* Mouse output buffer full */
+
+static int kbd_exists;	/* set to 0 or 1 by each call to kdb_get_kbd_char() */
+
+/*
+ * kdb_get_kbd_char - Poll the keyboard controller for one keypress.
+ *
+ * Check if the keyboard controller has a keypress for us.
+ * Some parts (Enter Release, LED change) are still blocking polled here,
+ * but hopefully they are all short.
+ *
+ * The shift, ctrl and caps lock state is held in function-local statics,
+ * so it carries over from one call to the next.
+ *
+ * Returns:
+ *	-1	nothing usable: the keyboard is flagged off (NO_I8042 or
+ *		NO_VT_CONSOLE) or both ports read 0xff, no data is pending,
+ *		the data came from the mouse, the scancode was a key
+ *		release or a shift/ctrl/caps lock key, or the key did not
+ *		translate to a printable character
+ *	8, 9	backspace, tab
+ *	1, 2, 4, 5, 6, 14, 16
+ *		Home, Left, Del, End, Right, Down, Up respectively
+ *	13	enter, after waiting for and consuming the next scancode
+ *	other	the low 8 bits of the keymap entry chosen by the current
+ *		shift/ctrl state
+ */
+int kdb_get_kbd_char(void)
+{
+	int scancode, scanstatus;
+	static int shift_lock;	/* CAPS LOCK state (0-off, 1-on) */
+	static int shift_key;	/* Shift next keypress */
+	static int ctrl_key;	/* Left ctrl currently held down (0-no, 1-yes) */
+	u_short keychar;
+
+	if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
+	    (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
+		kbd_exists = 0;
+		return -1;
+	}
+	kbd_exists = 1;
+
+	if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+		return -1;
+
+	/*
+	 * Fetch the scancode.  The status register is sampled again after
+	 * the data register has been read.
+	 */
+	scancode = inb(KBD_DATA_REG);
+	scanstatus = inb(KBD_STATUS_REG);
+
+	/*
+	 * Ignore mouse events.
+	 */
+	if (scanstatus & KBD_STAT_MOUSE_OBF)
+		return -1;
+
+	/*
+	 * Ignore release, trigger on make
+	 * (except for shift keys, where we want to
+	 * keep the shift state so long as the key is
+	 * held down).  Bit 0x80 of the scancode marks a release.
+	 */
+
+	if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
+		/*
+		 * Next key may use shift table
+		 */
+		if ((scancode & 0x80) == 0)
+			shift_key = 1;
+		else
+			shift_key = 0;
+		return -1;
+	}
+
+	if ((scancode&0x7f) == 0x1d) {
+		/*
+		 * Left ctrl key
+		 */
+		if ((scancode & 0x80) == 0)
+			ctrl_key = 1;
+		else
+			ctrl_key = 0;
+		return -1;
+	}
+
+	if ((scancode & 0x80) != 0)
+		return -1;
+
+	scancode &= 0x7f;
+
+	/*
+	 * Translate scancode
+	 */
+
+	if (scancode == 0x3a) {
+		/*
+		 * Toggle caps lock
+		 */
+		shift_lock ^= 1;
+
+#ifdef	KDB_BLINK_LED
+		kdb_toggleled(0x4);
+#endif
+		return -1;
+	}
+
+	if (scancode == 0x0e) {
+		/*
+		 * Backspace
+		 */
+		return 8;
+	}
+
+	/* Special Key: each one is reported as a fixed control code */
+	switch (scancode) {
+	case 0xF: /* Tab */
+		return 9;
+	case 0x53: /* Del */
+		return 4;
+	case 0x47: /* Home */
+		return 1;
+	case 0x4F: /* End */
+		return 5;
+	case 0x4B: /* Left */
+		return 2;
+	case 0x48: /* Up */
+		return 16;
+	case 0x50: /* Down */
+		return 14;
+	case 0x4D: /* Right */
+		return 6;
+	}
+
+	if (scancode == 0xe0)	/* NOTE(review): cannot match, bit 0x80 was masked off above */
+		return -1;
+
+	/*
+	 * For Japanese 86/106 keyboards
+	 * 	See comment in drivers/char/pc_keyb.c.
+	 * 	- Masahiro Adegawa
+	 */
+	if (scancode == 0x73)
+		scancode = 0x59;
+	else if (scancode == 0x7d)
+		scancode = 0x7c;
+
+	if (!shift_lock && !shift_key && !ctrl_key) {
+		keychar = plain_map[scancode];
+	} else if ((shift_lock || shift_key) && key_maps[1]) {
+		keychar = key_maps[1][scancode];
+	} else if (ctrl_key && key_maps[4]) {
+		keychar = key_maps[4][scancode];
+	} else {
+		keychar = 0x0020;
+		kdb_printf("Unknown state/scancode (%d)\n", scancode);
+	}
+	keychar &= 0x0fff;
+	if (keychar == '\t')
+		keychar = ' ';
+	switch (KTYP(keychar)) {
+	case KT_LETTER:
+	case KT_LATIN:
+		if (isprint(keychar))
+			break;		/* printable characters */
+		/* drop through */
+	case KT_SPEC:
+		if (keychar == K_ENTER)
+			break;
+		/* drop through */
+	default:
+		return -1;	/* ignore unprintables */
+	}
+
+	if ((scancode & 0x7f) == 0x1c) {
+		/*
+		 * enter key.  All done.  Absorb the release scancode.
+		 * This busy-waits until the controller has data again.
+		 */
+		while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
+			;
+
+		/*
+		 * Fetch the scancode
+		 */
+		scancode = inb(KBD_DATA_REG);
+		scanstatus = inb(KBD_STATUS_REG);
+
+		/* Keep reading for as long as the data is flagged as mouse data */
+		while (scanstatus & KBD_STAT_MOUSE_OBF) {
+			scancode = inb(KBD_DATA_REG);
+			scanstatus = inb(KBD_STATUS_REG);
+		}
+
+		if (scancode != 0x9c) {
+			/*
+			 * Wasn't an enter-release, why not?
+			 */
+			kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
+			       scancode, scanstatus);
+		}
+
+		return 13;
+	}
+
+	return keychar & 0xff;
+}
+EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 00000000..be14779b
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2937 @@
+/*
+ * Kernel Debugger Architecture Independent Main Code
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 1999-2004 Silicon Graphics, Inc.  All Rights Reserved.
+ * Copyright (C) 2000 Stephane Eranian + * Xscale (R) modifications copyright (C) 2003 Intel Corporation. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + +#define GREP_LEN 256 +char kdb_grep_string[GREP_LEN]; +int kdb_grepping_flag; +EXPORT_SYMBOL(kdb_grepping_flag); +int kdb_grep_leading; +int kdb_grep_trailing; + +/* + * Kernel debugger state flags + */ +int kdb_flags; +atomic_t kdb_event; + +/* + * kdb_lock protects updates to kdb_initial_cpu. Used to + * single thread processors through the kernel debugger. + */ +int kdb_initial_cpu = -1; /* cpu number that owns kdb */ +int kdb_nextline = 1; +int kdb_state; /* General KDB state */ + +struct task_struct *kdb_current_task; +EXPORT_SYMBOL(kdb_current_task); +struct pt_regs *kdb_current_regs; + +const char *kdb_diemsg; +static int kdb_go_count; +#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC +static unsigned int kdb_continue_catastrophic = + CONFIG_KDB_CONTINUE_CATASTROPHIC; +#else +static unsigned int kdb_continue_catastrophic; +#endif + +/* kdb_commands describes the available commands. */ +static kdbtab_t *kdb_commands; +#define KDB_BASE_CMD_MAX 50 +static int kdb_max_commands = KDB_BASE_CMD_MAX; +static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX]; +#define for_each_kdbcmd(cmd, num) \ + for ((cmd) = kdb_base_commands, (num) = 0; \ + num < kdb_max_commands; \ + num++, num == KDB_BASE_CMD_MAX ? 
cmd = kdb_commands : cmd++) + +typedef struct _kdbmsg { + int km_diag; /* kdb diagnostic */ + char *km_msg; /* Corresponding message text */ +} kdbmsg_t; + +#define KDBMSG(msgnum, text) \ + { KDB_##msgnum, text } + +static kdbmsg_t kdbmsgs[] = { + KDBMSG(NOTFOUND, "Command Not Found"), + KDBMSG(ARGCOUNT, "Improper argument count, see usage."), + KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, " + "8 is only allowed on 64 bit systems"), + KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"), + KDBMSG(NOTENV, "Cannot find environment variable"), + KDBMSG(NOENVVALUE, "Environment variable should have value"), + KDBMSG(NOTIMP, "Command not implemented"), + KDBMSG(ENVFULL, "Environment full"), + KDBMSG(ENVBUFFULL, "Environment buffer full"), + KDBMSG(TOOMANYBPT, "Too many breakpoints defined"), +#ifdef CONFIG_CPU_XSCALE + KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"), +#else + KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"), +#endif + KDBMSG(DUPBPT, "Duplicate breakpoint address"), + KDBMSG(BPTNOTFOUND, "Breakpoint not found"), + KDBMSG(BADMODE, "Invalid IDMODE"), + KDBMSG(BADINT, "Illegal numeric value"), + KDBMSG(INVADDRFMT, "Invalid symbolic address format"), + KDBMSG(BADREG, "Invalid register name"), + KDBMSG(BADCPUNUM, "Invalid cpu number"), + KDBMSG(BADLENGTH, "Invalid length field"), + KDBMSG(NOBP, "No Breakpoint exists"), + KDBMSG(BADADDR, "Invalid address"), +}; +#undef KDBMSG + +static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t); + + +/* + * Initial environment. This is all kept static and local to + * this file. We don't want to rely on the memory allocation + * mechanisms in the kernel, so we use a very limited allocate-only + * heap for new and altered environment variables. The entire + * environment is limited to a fixed number of entries (add more + * to __env[] if required) and a fixed amount of heap (add more to + * KDB_ENVBUFSIZE if required). 
+ */
+
+static char *__env[] = {
+#if defined(CONFIG_SMP)
+ "PROMPT=[%d]kdb> ",
+ "MOREPROMPT=[%d]more> ",
+#else
+ "PROMPT=kdb> ",
+ "MOREPROMPT=more> ",
+#endif
+ "RADIX=16",
+ "MDCOUNT=8",			/* lines of md output */
+ "BTARGS=9",			/* 9 possible args in bt */
+ KDB_PLATFORM_ENV,
+ "DTABCOUNT=30",
+ "NOSECT=1",
+ (char *)0,	/* this and the following NULL entries are free slots for kdb_set */
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+ (char *)0,
+};
+
+static const int __nenv = (sizeof(__env) / sizeof(char *));	/* number of slots in __env[] */
+/*
+ * kdb_curr_task - Return the task that curr_task() reports for a cpu.
+ * Parameters:
+ *	cpu	The cpu number to look up
+ * Returns:
+ *	The task_struct pointer from curr_task(cpu).  Where _TIF_MCA_INIT
+ *	is defined, a different task may be substituted for it.
+ */
+struct task_struct *kdb_curr_task(int cpu)
+{
+	struct task_struct *p = curr_task(cpu);
+#ifdef	_TIF_MCA_INIT
+	if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
+		p = krp->p;	/* NOTE(review): krp is not declared in this function; confirm where it comes from */
+#endif
+	return p;
+}
+
+/*
+ * kdbgetenv - This function will return the character string value of
+ *	an environment variable.
+ * Parameters:
+ *	match	A character string representing an environment variable.
+ * Returns:
+ *	NULL	No environment variable matches 'match'
+ *	char*	Pointer to string value of environment variable.  An entry
+ *		that matches but has no '=' yields the empty string.
+ */
+char *kdbgetenv(const char *match)
+{
+	char **ep = __env;
+	int matchlen = strlen(match);
+	int i;
+
+	for (i = 0; i < __nenv; i++) {
+		char *e = *ep++;
+
+		if (!e)
+			continue;	/* unused slot */
+
+		/* The name must match in full, not merely as a prefix */
+		if ((strncmp(match, e, matchlen) == 0)
+		 && ((e[matchlen] == '\0')
+		   || (e[matchlen] == '='))) {
+			char *cp = strchr(e, '=');
+			return cp ? ++cp : "";
+		}
+	}
+	return NULL;
+}
+
+/*
+ * kdballocenv - This function is used to allocate bytes for
+ *	environment entries.
+ * Parameters:
+ *	bytes	The number of bytes to allocate
+ * Returns:
+ *	A pointer to the allocated bytes, or NULL when the environment
+ *	buffer does not have that many bytes left.
+ * Remarks: + * We use a static environment buffer (envbuffer) to hold the values + * of dynamically generated environment variables (see kdb_set). Buffer + * space once allocated is never free'd, so over time, the amount of space + * (currently 512 bytes) will be exhausted if env variables are changed + * frequently. + */ +static char *kdballocenv(size_t bytes) +{ +#define KDB_ENVBUFSIZE 512 + static char envbuffer[KDB_ENVBUFSIZE]; + static int envbufsize; + char *ep = NULL; + + if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) { + ep = &envbuffer[envbufsize]; + envbufsize += bytes; + } + return ep; +} + +/* + * kdbgetulenv - This function will return the value of an unsigned + * long-valued environment variable. + * Parameters: + * match A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of the env variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + */ +static int kdbgetulenv(const char *match, unsigned long *value) +{ + char *ep; + + ep = kdbgetenv(match); + if (!ep) + return KDB_NOTENV; + if (strlen(ep) == 0) + return KDB_NOENVVALUE; + + *value = simple_strtoul(ep, NULL, 0); + + return 0; +} + +/* + * kdbgetintenv - This function will return the value of an + * integer-valued environment variable. + * Parameters: + * match A character string representing an integer-valued env variable + * Outputs: + * *value the integer representation of the environment variable 'match' + * Returns: + * Zero on success, a kdb diagnostic on failure. + */ +int kdbgetintenv(const char *match, int *value) +{ + unsigned long val; + int diag; + + diag = kdbgetulenv(match, &val); + if (!diag) + *value = (int) val; + return diag; +} + +/* + * kdbgetularg - This function will convert a numeric string into an + * unsigned long value. + * Parameters: + * arg A character string representing a numeric value + * Outputs: + * *value the unsigned long represntation of arg. 
+ * Returns: + * Zero on success, a kdb diagnostic on failure. + */ +int kdbgetularg(const char *arg, unsigned long *value) +{ + char *endp; + unsigned long val; + + val = simple_strtoul(arg, &endp, 0); + + if (endp == arg) { + /* + * Also try base 16, for us folks too lazy to type the + * leading 0x... + */ + val = simple_strtoul(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + +int kdbgetu64arg(const char *arg, u64 *value) +{ + char *endp; + u64 val; + + val = simple_strtoull(arg, &endp, 0); + + if (endp == arg) { + + val = simple_strtoull(arg, &endp, 16); + if (endp == arg) + return KDB_BADINT; + } + + *value = val; + + return 0; +} + +/* + * kdb_set - This function implements the 'set' command. Alter an + * existing environment variable or create a new one. + */ +int kdb_set(int argc, const char **argv) +{ + int i; + char *ep; + size_t varlen, vallen; + + /* + * we can be invoked two ways: + * set var=value argv[1]="var", argv[2]="value" + * set var = value argv[1]="var", argv[2]="=", argv[3]="value" + * - if the latter, shift 'em down. + */ + if (argc == 3) { + argv[2] = argv[3]; + argc--; + } + + if (argc != 2) + return KDB_ARGCOUNT; + + /* + * Check for internal variables + */ + if (strcmp(argv[1], "KDBDEBUG") == 0) { + unsigned int debugflags; + char *cp; + + debugflags = simple_strtoul(argv[2], &cp, 0); + if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) { + kdb_printf("kdb: illegal debug flags '%s'\n", + argv[2]); + return 0; + } + kdb_flags = (kdb_flags & + ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT)) + | (debugflags << KDB_DEBUG_FLAG_SHIFT); + + return 0; + } + + /* + * Tokenizer squashed the '=' sign. argv[1] is variable + * name, argv[2] = value. 
+ */ + varlen = strlen(argv[1]); + vallen = strlen(argv[2]); + ep = kdballocenv(varlen + vallen + 2); + if (ep == (char *)0) + return KDB_ENVBUFFULL; + + sprintf(ep, "%s=%s", argv[1], argv[2]); + + ep[varlen+vallen+1] = '\0'; + + for (i = 0; i < __nenv; i++) { + if (__env[i] + && ((strncmp(__env[i], argv[1], varlen) == 0) + && ((__env[i][varlen] == '\0') + || (__env[i][varlen] == '=')))) { + __env[i] = ep; + return 0; + } + } + + /* + * Wasn't existing variable. Fit into slot. + */ + for (i = 0; i < __nenv-1; i++) { + if (__env[i] == (char *)0) { + __env[i] = ep; + return 0; + } + } + + return KDB_ENVFULL; +} + +static int kdb_check_regs(void) +{ + if (!kdb_current_regs) { + kdb_printf("No current kdb registers." + " You may need to select another task\n"); + return KDB_BADREG; + } + return 0; +} + +/* + * kdbgetaddrarg - This function is responsible for parsing an + * address-expression and returning the value of the expression, + * symbol name, and offset to the caller. + * + * The argument may consist of a numeric value (decimal or + * hexidecimal), a symbol name, a register name (preceded by the + * percent sign), an environment variable with a numeric value + * (preceded by a dollar sign) or a simple arithmetic expression + * consisting of a symbol name, +/-, and a numeric constant value + * (offset). + * Parameters: + * argc - count of arguments in argv + * argv - argument vector + * *nextarg - index to next unparsed argument in argv[] + * regs - Register state at time of KDB entry + * Outputs: + * *value - receives the value of the address-expression + * *offset - receives the offset specified, if any + * *name - receives the symbol name, if any + * *nextarg - index to next unparsed argument in argv[] + * Returns: + * zero is returned on success, a kdb diagnostic code is + * returned on error. 
+ */ +int kdbgetaddrarg(int argc, const char **argv, int *nextarg, + unsigned long *value, long *offset, + char **name) +{ + unsigned long addr; + unsigned long off = 0; + int positive; + int diag; + int found = 0; + char *symname; + char symbol = '\0'; + char *cp; + kdb_symtab_t symtab; + + /* + * Process arguments which follow the following syntax: + * + * symbol | numeric-address [+/- numeric-offset] + * %register + * $environment-variable + */ + + if (*nextarg > argc) + return KDB_ARGCOUNT; + + symname = (char *)argv[*nextarg]; + + /* + * If there is no whitespace between the symbol + * or address and the '+' or '-' symbols, we + * remember the character and replace it with a + * null so the symbol/value can be properly parsed + */ + cp = strpbrk(symname, "+-"); + if (cp != NULL) { + symbol = *cp; + *cp++ = '\0'; + } + + if (symname[0] == '$') { + diag = kdbgetulenv(&symname[1], &addr); + if (diag) + return diag; + } else if (symname[0] == '%') { + diag = kdb_check_regs(); + if (diag) + return diag; + /* Implement register values with % at a later time as it is + * arch optional. + */ + return KDB_NOTIMP; + } else { + found = kdbgetsymval(symname, &symtab); + if (found) { + addr = symtab.sym_start; + } else { + diag = kdbgetularg(argv[*nextarg], &addr); + if (diag) + return diag; + } + } + + if (!found) + found = kdbnearsym(addr, &symtab); + + (*nextarg)++; + + if (name) + *name = symname; + if (value) + *value = addr; + if (offset && name && *name) + *offset = addr - symtab.sym_start; + + if ((*nextarg > argc) + && (symbol == '\0')) + return 0; + + /* + * check for +/- and offset + */ + + if (symbol == '\0') { + if ((argv[*nextarg][0] != '+') + && (argv[*nextarg][0] != '-')) { + /* + * Not our argument. Return. + */ + return 0; + } else { + positive = (argv[*nextarg][0] == '+'); + (*nextarg)++; + } + } else + positive = (symbol == '+'); + + /* + * Now there must be an offset! 
+ */ + if ((*nextarg > argc) + && (symbol == '\0')) { + return KDB_INVADDRFMT; + } + + if (!symbol) { + cp = (char *)argv[*nextarg]; + (*nextarg)++; + } + + diag = kdbgetularg(cp, &off); + if (diag) + return diag; + + if (!positive) + off = -off; + + if (offset) + *offset += off; + + if (value) + *value += off; + + return 0; +} + +static void kdb_cmderror(int diag) +{ + int i; + + if (diag >= 0) { + kdb_printf("no error detected (diagnostic is %d)\n", diag); + return; + } + + for (i = 0; i < __nkdb_err; i++) { + if (kdbmsgs[i].km_diag == diag) { + kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg); + return; + } + } + + kdb_printf("Unknown diag %d\n", -diag); +} + +/* + * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd' + * command which defines one command as a set of other commands, + * terminated by endefcmd. kdb_defcmd processes the initial + * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for + * the following commands until 'endefcmd'. + * Inputs: + * argc argument count + * argv argument vector + * Returns: + * zero for success, a kdb diagnostic if error + */ +struct defcmd_set { + int count; + int usable; + char *name; + char *usage; + char *help; + char **command; +}; +static struct defcmd_set *defcmd_set; +static int defcmd_set_count; +static int defcmd_in_progress; + +/* Forward references */ +static int kdb_exec_defcmd(int argc, const char **argv); + +static int kdb_defcmd2(const char *cmdstr, const char *argv0) +{ + struct defcmd_set *s = defcmd_set + defcmd_set_count - 1; + char **save_command = s->command; + if (strcmp(argv0, "endefcmd") == 0) { + defcmd_in_progress = 0; + if (!s->count) + s->usable = 0; + if (s->usable) + kdb_register(s->name, kdb_exec_defcmd, + s->usage, s->help, 0); + return 0; + } + if (!s->usable) + return KDB_NOTIMP; + s->command = kzalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB); + if (!s->command) { + kdb_printf("Could not allocate new kdb_defcmd table for %s\n", + cmdstr); + s->usable = 0; 
+ return KDB_NOTIMP; + } + memcpy(s->command, save_command, s->count * sizeof(*(s->command))); + s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB); + kfree(save_command); + return 0; +} + +static int kdb_defcmd(int argc, const char **argv) +{ + struct defcmd_set *save_defcmd_set = defcmd_set, *s; + if (defcmd_in_progress) { + kdb_printf("kdb: nested defcmd detected, assuming missing " + "endefcmd\n"); + kdb_defcmd2("endefcmd", "endefcmd"); + } + if (argc == 0) { + int i; + for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) { + kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name, + s->usage, s->help); + for (i = 0; i < s->count; ++i) + kdb_printf("%s", s->command[i]); + kdb_printf("endefcmd\n"); + } + return 0; + } + if (argc != 3) + return KDB_ARGCOUNT; + defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set), + GFP_KDB); + if (!defcmd_set) { + kdb_printf("Could not allocate new defcmd_set entry for %s\n", + argv[1]); + defcmd_set = save_defcmd_set; + return KDB_NOTIMP; + } + memcpy(defcmd_set, save_defcmd_set, + defcmd_set_count * sizeof(*defcmd_set)); + kfree(save_defcmd_set); + s = defcmd_set + defcmd_set_count; + memset(s, 0, sizeof(*s)); + s->usable = 1; + s->name = kdb_strdup(argv[1], GFP_KDB); + s->usage = kdb_strdup(argv[2], GFP_KDB); + s->help = kdb_strdup(argv[3], GFP_KDB); + if (s->usage[0] == '"') { + strcpy(s->usage, s->usage+1); + s->usage[strlen(s->usage)-1] = '\0'; + } + if (s->help[0] == '"') { + strcpy(s->help, s->help+1); + s->help[strlen(s->help)-1] = '\0'; + } + ++defcmd_set_count; + defcmd_in_progress = 1; + return 0; +} + +/* + * kdb_exec_defcmd - Execute the set of commands associated with this + * defcmd name. 
+ * Inputs:
+ *	argc	argument count, must be 0
+ *	argv	argument vector; argv[0] names the defcmd set to run
+ * Returns:
+ *	zero for success, a kdb diagnostic if error: KDB_ARGCOUNT when
+ *	arguments were given, KDB_NOTIMP when no set is named argv[0],
+ *	otherwise the first non-zero result returned by kdb_parse().
+ */
+static int kdb_exec_defcmd(int argc, const char **argv)
+{
+	int i, ret;
+	struct defcmd_set *s;
+	if (argc != 0)
+		return KDB_ARGCOUNT;
+	for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
+		if (strcmp(s->name, argv[0]) == 0)
+			break;
+	}
+	if (i == defcmd_set_count) {
+		kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
+			   argv[0]);
+		return KDB_NOTIMP;
+	}
+	for (i = 0; i < s->count; ++i) {
+		/* Recursive use of kdb_parse, do not use argv after
+		 * this point */
+		argv = NULL;
+		kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
+		ret = kdb_parse(s->command[i]);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+/* Command history: cmd_hist[] is used as a ring, see kdb_local() */
+#define KDB_CMD_HISTORY_COUNT	32
+#define CMD_BUFLEN		200	/* kdb_printf: max printline
+					 * size == 256 */
+static unsigned int cmd_head, cmd_tail;
+static unsigned int cmdptr;
+static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
+static char cmd_cur[CMD_BUFLEN];
+
+/*
+ * The "str" argument may point to something like  | grep xyz
+ *
+ * On success the pattern is copied to kdb_grep_string, kdb_grep_leading
+ * and kdb_grep_trailing record a leading '^' and a trailing '$', and
+ * kdb_grepping_flag is incremented.  The string is edited in place even
+ * though it is passed as const.  On any error a message is printed (or,
+ * for an empty pattern, nothing) and kdb_grepping_flag is left alone.
+ */
+static void parse_grep(const char *str)
+{
+	int	len;
+	char	*cp = (char *)str, *cp2;
+
+	/* sanity check: we should have been called with the '|' first */
+	if (*cp != '|')
+		return;
+	cp++;
+	while (isspace(*cp))
+		cp++;
+	if (strncmp(cp, "grep ", 5)) {
+		kdb_printf("invalid 'pipe', see grephelp\n");
+		return;
+	}
+	cp += 5;
+	while (isspace(*cp))
+		cp++;
+	cp2 = strchr(cp, '\n');
+	if (cp2)
+		*cp2 = '\0'; /* remove the trailing newline */
+	len = strlen(cp);
+	if (len == 0) {
+		kdb_printf("invalid 'pipe', see grephelp\n");
+		return;
+	}
+	/* now cp points to a nonzero length search string */
+	if (*cp == '"') {
+		/* allow it be "x y z" by removing the "'s - there must
+		   be two of them */
+		cp++;
+		cp2 = strchr(cp, '"');
+		if (!cp2) {
+			kdb_printf("invalid quoted string, see grephelp\n");
+			return;
+		}
+		*cp2 = '\0'; /* end the string where the 2nd " was */
+	}
+	kdb_grep_leading = 0;
+	if (*cp == '^') {
+		kdb_grep_leading = 1;
+		cp++;
+	}
+	len = strlen(cp);
+	kdb_grep_trailing = 0;
+	if (*(cp+len-1) == '$') {	/* with len == 0 this inspects the character before cp */
+		kdb_grep_trailing = 1;
+		*(cp+len-1) = '\0';
+	}
+	len = strlen(cp);
+	if (!len)
+		return;
+	if (len >= GREP_LEN) {
+		kdb_printf("search string too long\n");
+		return;
+	}
+	strcpy(kdb_grep_string, cp);
+	kdb_grepping_flag++;
+	return;
+}
+
+/*
+ * kdb_parse - Parse the command line, search the command table for a
+ *	matching command and invoke the command function.  This
+ *	function may be called recursively, if it is, the second call
+ *	will overwrite argv and cbuf.  It is the caller's
+ *	responsibility to save their argv if they recursively call
+ *	kdb_parse().
+ * Parameters:
+ *      cmdstr	The input command line to be parsed.
+ *		(There is no regs parameter, although earlier versions of
+ *		this comment listed one.)
+ * Returns:
+ *	Zero for success, a kdb diagnostic if failure.
+ * Remarks:
+ *	Limited to MAXARGC - 1 tokens.
+ *
+ *	argc, argv and cbuf are static, so an empty line (just a newline)
+ *	re-runs whatever the previous command left in them; each command's
+ *	cmd_repeat setting decides how much is left.
+ *
+ *	A command prefixed with '-' (and not followed by a digit) has
+ *	results greater than KDB_CMD_GO turned into success.
+ *
+ *	Real rudimentary tokenization. Basically only whitespace
+ *	is considered a token delimeter (but special consideration
+ *	is taken of the '=' sign as used by the 'set' command).
+ *
+ *	The algorithm used to tokenize the input string relies on
+ *	there being at least one whitespace (or otherwise useless)
+ *	character between tokens as the character immediately following
+ *	the token is altered in-place to a null-byte to terminate the
+ *	token string.
+ */
+
+#define MAXARGC	20
+
+int kdb_parse(const char *cmdstr)
+{
+	static char *argv[MAXARGC];
+	static int argc;
+	static char cbuf[CMD_BUFLEN+2];
+	char *cp;
+	char *cpp, quoted;
+	kdbtab_t *tp;
+	int i, escaped, ignore_errors = 0, check_grep;
+
+	/*
+	 * First tokenize the command string.
+	 */
+	cp = (char *)cmdstr;
+	kdb_grepping_flag = check_grep = 0;
+
+	if (KDB_FLAG(CMD_INTERRUPT)) {
+		/* Previous command was interrupted, newline must not
+		 * repeat the command */
+		KDB_FLAG_CLEAR(CMD_INTERRUPT);
+		KDB_STATE_SET(PAGER);
+		argc = 0;	/* no repeat */
+	}
+
+	if (*cp != '\n' && *cp != '\0') {
+		argc = 0;
+		cpp = cbuf;
+		while (*cp) {
+			/* skip whitespace */
+			while (isspace(*cp))
+				cp++;
+			if ((*cp == '\0') || (*cp == '\n') ||
+			    (*cp == '#' && !defcmd_in_progress))
+				break;
+			/* special case: check for | grep pattern */
+			if (*cp == '|') {
+				check_grep++;
+				break;
+			}
+			if (cpp >= cbuf + CMD_BUFLEN) {
+				kdb_printf("kdb_parse: command buffer "
+					   "overflow, command ignored\n%s\n",
+					   cmdstr);
+				return KDB_NOTFOUND;
+			}
+			if (argc >= MAXARGC - 1) {
+				kdb_printf("kdb_parse: too many arguments, "
+					   "command ignored\n%s\n", cmdstr);
+				return KDB_NOTFOUND;
+			}
+			argv[argc++] = cpp;
+			escaped = 0;
+			quoted = '\0';
+			/* Copy to next unquoted and unescaped
+			 * whitespace or '=' */
+			while (*cp && *cp != '\n' &&
+			       (escaped || quoted || !isspace(*cp))) {
+				if (cpp >= cbuf + CMD_BUFLEN)
+					break;
+				if (escaped) {
+					escaped = 0;
+					*cpp++ = *cp++;
+					continue;
+				}
+				if (*cp == '\\') {
+					escaped = 1;
+					++cp;
+					continue;
+				}
+				if (*cp == quoted)
+					quoted = '\0';
+				else if (*cp == '\'' || *cp == '"')
+					quoted = *cp;
+				*cpp = *cp++;
+				if (*cpp == '=' && !quoted)
+					break;
+				++cpp;
+			}
+			*cpp++ = '\0';	/* Squash a ws or '=' character */
+		}
+	}
+	if (!argc)
+		return 0;
+	if (check_grep)
+		parse_grep(cp);
+	if (defcmd_in_progress) {
+		int result = kdb_defcmd2(cmdstr, argv[0]);
+		if (!defcmd_in_progress) {
+			argc = 0;	/* avoid repeat on endefcmd */
+			*(argv[0]) = '\0';
+		}
+		return result;
+	}
+	if (argv[0][0] == '-' && argv[0][1] &&
+	    (argv[0][1] <  '0' || argv[0][1] > '9')) {
+		ignore_errors = 1;
+		++argv[0];
+	}
+
+	for_each_kdbcmd(tp, i) {
+		if (tp->cmd_name) {
+			/*
+			 * If this command is allowed to be abbreviated,
+			 * check to see if this is it.
+			 */
+
+			if (tp->cmd_minlen
+			 && (strlen(argv[0]) <= tp->cmd_minlen)) {
+				if (strncmp(argv[0],
+					    tp->cmd_name,
+					    tp->cmd_minlen) == 0) {
+					break;
+				}
+			}
+
+			if (strcmp(argv[0], tp->cmd_name) == 0)
+				break;
+		}
+	}
+
+	/*
+	 * If we don't find a command by this name, see if the first
+	 * few characters of this match any of the known commands.
+	 * e.g., md1c20 should match md.
+	 */
+	if (i == kdb_max_commands) {
+		for_each_kdbcmd(tp, i) {
+			if (tp->cmd_name) {
+				if (strncmp(argv[0],
+					    tp->cmd_name,
+					    strlen(tp->cmd_name)) == 0) {
+					break;
+				}
+			}
+		}
+	}
+
+	if (i < kdb_max_commands) {
+		int result;
+		KDB_STATE_SET(CMD);
+		result = (*tp->cmd_func)(argc-1, (const char **)argv);
+		if (result && ignore_errors && result > KDB_CMD_GO)
+			result = 0;
+		KDB_STATE_CLEAR(CMD);
+		/* Decide what an empty next line will repeat */
+		switch (tp->cmd_repeat) {
+		case KDB_REPEAT_NONE:
+			argc = 0;
+			if (argv[0])
+				*(argv[0]) = '\0';
+			break;
+		case KDB_REPEAT_NO_ARGS:
+			argc = 1;
+			if (argv[1])
+				*(argv[1]) = '\0';
+			break;
+		case KDB_REPEAT_WITH_ARGS:
+			break;
+		}
+		return result;
+	}
+
+	/*
+	 * If the input with which we were presented does not
+	 * map to an existing command, attempt to parse it as an
+	 * address argument and display the result.   Useful for
+	 * obtaining the address of a variable, or the nearest symbol
+	 * to an address contained in a register.
+	 */
+	{
+		unsigned long value;
+		char *name = NULL;
+		long offset;
+		int nextarg = 0;
+
+		if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
+				  &value, &offset, &name)) {
+			return KDB_NOTFOUND;
+		}
+
+		kdb_printf("%s = ", argv[0]);
+		kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
+		kdb_printf("\n");
+		return 0;
+	}
+}
+
+/*
+ * handle_ctrl_cmd - Handle the history keys CTRL_P (previous) and CTRL_N
+ *	(next) by moving cmdptr and copying that history entry to cmd_cur.
+ * Inputs:
+ *	cmd	The input line; only its first character is examined
+ * Returns:
+ *	1 if a history entry was copied to cmd_cur, 0 if the history is
+ *	empty (cmd_head == cmd_tail) or the key is not a history key.
+ */
+static int handle_ctrl_cmd(char *cmd)
+{
+#define CTRL_P	16
+#define CTRL_N	14
+
+	/* initial situation */
+	if (cmd_head == cmd_tail)
+		return 0;
+	switch (*cmd) {
+	case CTRL_P:
+		if (cmdptr != cmd_tail)
+			cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;	/* unsigned, so 0 wraps to the last slot */
+		strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+		return 1;
+	case CTRL_N:
+		if (cmdptr != cmd_head)
+			cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
+		strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * kdb_reboot - This function implements the 'reboot' command.  Reboot
+ *	the system immediately, or loop for ever on failure.
+ *	Neither argc nor argv is used.
+ */
+static int kdb_reboot(int argc, const char **argv)
+{
+	emergency_restart();
+	kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
+	while (1)
+		cpu_relax();
+	/* NOTREACHED */
+	return 0;
+}
+/*
+ * kdb_dumpregs - Print a register set through show_regs().
+ *	console_loglevel is raised to 15 and kdb_trap_printk is incremented
+ *	around the call; both are put back afterwards.
+ */
+static void kdb_dumpregs(struct pt_regs *regs)
+{
+	int old_lvl = console_loglevel;
+	console_loglevel = 15;
+	kdb_trap_printk++;
+	show_regs(regs);
+	kdb_trap_printk--;
+	kdb_printf("\n");
+	console_loglevel = old_lvl;
+}
+/*
+ * kdb_set_current_task - Make p the task that kdb commands operate on.
+ *	kdb_current_regs is set from KDB_TSKREGS() when kdb_task_has_cpu(p)
+ *	is true and to NULL otherwise.
+ */
+void kdb_set_current_task(struct task_struct *p)
+{
+	kdb_current_task = p;
+
+	if (kdb_task_has_cpu(p)) {
+		kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
+		return;
+	}
+	kdb_current_regs = NULL;
+}
+
+/*
+ * kdb_local - The main code for kdb.  This routine is invoked on a
+ *	specific processor, it is not global.  The main kdb() routine
+ *	ensures that only one processor at a time is in this routine.
+ *	This code is called with the real reason code on the first
+ *	entry to a kdb session, thereafter it is called with reason
+ *	SWITCH, even if the user goes back to the original cpu.
+ * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * regs The exception frame at time of fault/breakpoint. + * db_result Result code from the break or debug point. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + * KDB_CMD_GO User typed 'go'. + * KDB_CMD_CPU User switched to another cpu. + * KDB_CMD_SS Single step. + * KDB_CMD_SSB Single step until branch. + */ +static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs, + kdb_dbtrap_t db_result) +{ + char *cmdbuf; + int diag; + struct task_struct *kdb_current = + kdb_curr_task(raw_smp_processor_id()); + + KDB_DEBUG_STATE("kdb_local 1", reason); + kdb_go_count = 0; + if (reason == KDB_REASON_DEBUG) { + /* special case below */ + } else { + kdb_printf("\nEntering kdb (current=0x%p, pid %d) ", + kdb_current, kdb_current ? kdb_current->pid : 0); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", raw_smp_processor_id()); +#endif + } + + switch (reason) { + case KDB_REASON_DEBUG: + { + /* + * If re-entering kdb after a single step + * command, don't print the message. + */ + switch (db_result) { + case KDB_DB_BPT: + kdb_printf("\nEntering kdb (0x%p, pid %d) ", + kdb_current, kdb_current->pid); +#if defined(CONFIG_SMP) + kdb_printf("on processor %d ", raw_smp_processor_id()); +#endif + kdb_printf("due to Debug @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + break; + case KDB_DB_SSB: + /* + * In the midst of ssb command. Just return. 
+ */ + KDB_DEBUG_STATE("kdb_local 3", reason); + return KDB_CMD_SSB; /* Continue with SSB command */ + + break; + case KDB_DB_SS: + break; + case KDB_DB_SSBPT: + KDB_DEBUG_STATE("kdb_local 4", reason); + return 1; /* kdba_db_trap did the work */ + default: + kdb_printf("kdb: Bad result from kdba_db_trap: %d\n", + db_result); + break; + } + + } + break; + case KDB_REASON_ENTER: + if (KDB_STATE(KEYBOARD)) + kdb_printf("due to Keyboard Entry\n"); + else + kdb_printf("due to KDB_ENTER()\n"); + break; + case KDB_REASON_KEYBOARD: + KDB_STATE_SET(KEYBOARD); + kdb_printf("due to Keyboard Entry\n"); + break; + case KDB_REASON_ENTER_SLAVE: + /* drop through, slaves only get released via cpu switch */ + case KDB_REASON_SWITCH: + kdb_printf("due to cpu switch\n"); + break; + case KDB_REASON_OOPS: + kdb_printf("Oops: %s\n", kdb_diemsg); + kdb_printf("due to oops @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + kdb_dumpregs(regs); + break; + case KDB_REASON_NMI: + kdb_printf("due to NonMaskable Interrupt @ " + kdb_machreg_fmt "\n", + instruction_pointer(regs)); + kdb_dumpregs(regs); + break; + case KDB_REASON_SSTEP: + case KDB_REASON_BREAK: + kdb_printf("due to %s @ " kdb_machreg_fmt "\n", + reason == KDB_REASON_BREAK ? + "Breakpoint" : "SS trap", instruction_pointer(regs)); + /* + * Determine if this breakpoint is one that we + * are interested in. + */ + if (db_result != KDB_DB_BPT) { + kdb_printf("kdb: error return from kdba_bp_trap: %d\n", + db_result); + KDB_DEBUG_STATE("kdb_local 6", reason); + return 0; /* Not for us, dismiss it */ + } + break; + case KDB_REASON_RECURSE: + kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n", + instruction_pointer(regs)); + break; + default: + kdb_printf("kdb: unexpected reason code: %d\n", reason); + KDB_DEBUG_STATE("kdb_local 8", reason); + return 0; /* Not for us, dismiss it */ + } + + while (1) { + /* + * Initialize pager context. 
+ */ + kdb_nextline = 1; + KDB_STATE_CLEAR(SUPPRESS); + + cmdbuf = cmd_cur; + *cmdbuf = '\0'; + *(cmd_hist[cmd_head]) = '\0'; + + if (KDB_FLAG(ONLY_DO_DUMP)) { + /* kdb is off but a catastrophic error requires a dump. + * Take the dump and reboot. + * Turn on logging so the kdb output appears in the log + * buffer in the dump. + */ + const char *setargs[] = { "set", "LOGGING", "1" }; + kdb_set(2, setargs); + kdb_reboot(0, NULL); + /*NOTREACHED*/ + } + +do_full_getstr: +#if defined(CONFIG_SMP) + snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"), + raw_smp_processor_id()); +#else + snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT")); +#endif + if (defcmd_in_progress) + strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN); + + /* + * Fetch command from keyboard + */ + cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str); + if (*cmdbuf != '\n') { + if (*cmdbuf < 32) { + if (cmdptr == cmd_head) { + strncpy(cmd_hist[cmd_head], cmd_cur, + CMD_BUFLEN); + *(cmd_hist[cmd_head] + + strlen(cmd_hist[cmd_head])-1) = '\0'; + } + if (!handle_ctrl_cmd(cmdbuf)) + *(cmd_cur+strlen(cmd_cur)-1) = '\0'; + cmdbuf = cmd_cur; + goto do_full_getstr; + } else { + strncpy(cmd_hist[cmd_head], cmd_cur, + CMD_BUFLEN); + } + + cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT; + if (cmd_head == cmd_tail) + cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT; + } + + cmdptr = cmd_head; + diag = kdb_parse(cmdbuf); + if (diag == KDB_NOTFOUND) { + kdb_printf("Unknown kdb command: '%s'\n", cmdbuf); + diag = 0; + } + if (diag == KDB_CMD_GO + || diag == KDB_CMD_CPU + || diag == KDB_CMD_SS + || diag == KDB_CMD_SSB + || diag == KDB_CMD_KGDB) + break; + + if (diag) + kdb_cmderror(diag); + } + KDB_DEBUG_STATE("kdb_local 9", diag); + return diag; +} + + +/* + * kdb_print_state - Print the state data for the current processor + * for debugging. + * Inputs: + * text Identifies the debug point + * value Any integer value to be printed, e.g. reason code. 
+ */ +void kdb_print_state(const char *text, int value) +{ + kdb_printf("state: %s cpu %d value %d initial %d state %x\n", + text, raw_smp_processor_id(), value, kdb_initial_cpu, + kdb_state); +} + +/* + * kdb_main_loop - After initial setup and assignment of the + * controlling cpu, all cpus are in this loop. One cpu is in + * control and will issue the kdb prompt, the others will spin + * until 'go' or cpu switch. + * + * To get a consistent view of the kernel stacks for all + * processes, this routine is invoked from the main kdb code via + * an architecture specific routine. kdba_main_loop is + * responsible for making the kernel stacks consistent for all + * processes, there should be no difference between a blocked + * process and a running process as far as kdb is concerned. + * Inputs: + * reason The reason KDB was invoked + * error The hardware-defined error code + * reason2 kdb's current reason code. + * Initially error but can change + * according to kdb state. + * db_result Result code from break or debug point. + * regs The exception frame at time of fault/breakpoint. + * should always be valid. + * Returns: + * 0 KDB was invoked for an event which it wasn't responsible + * 1 KDB handled the event for which it was invoked. + */ +int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error, + kdb_dbtrap_t db_result, struct pt_regs *regs) +{ + int result = 1; + /* Stay in kdb() until 'go', 'ss[b]' or an error */ + while (1) { + /* + * All processors except the one that is in control + * will spin here. + */ + KDB_DEBUG_STATE("kdb_main_loop 1", reason); + while (KDB_STATE(HOLD_CPU)) { + /* state KDB is turned off by kdb_cpu to see if the + * other cpus are still live, each cpu in this loop + * turns it back on. 
+ */ + if (!KDB_STATE(KDB)) + KDB_STATE_SET(KDB); + } + + KDB_STATE_CLEAR(SUPPRESS); + KDB_DEBUG_STATE("kdb_main_loop 2", reason); + if (KDB_STATE(LEAVING)) + break; /* Another cpu said 'go' */ + /* Still using kdb, this processor is in control */ + result = kdb_local(reason2, error, regs, db_result); + KDB_DEBUG_STATE("kdb_main_loop 3", result); + + if (result == KDB_CMD_CPU) + break; + + if (result == KDB_CMD_SS) { + KDB_STATE_SET(DOING_SS); + break; + } + + if (result == KDB_CMD_SSB) { + KDB_STATE_SET(DOING_SS); + KDB_STATE_SET(DOING_SSB); + break; + } + + if (result == KDB_CMD_KGDB) { + if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2))) + kdb_printf("Entering please attach debugger " + "or use $D#44+ or $3#33\n"); + break; + } + if (result && result != 1 && result != KDB_CMD_GO) + kdb_printf("\nUnexpected kdb_local return code %d\n", + result); + KDB_DEBUG_STATE("kdb_main_loop 4", reason); + break; + } + if (KDB_STATE(DOING_SS)) + KDB_STATE_CLEAR(SSBPT); + + return result; +} + +/* + * kdb_mdr - This function implements the guts of the 'mdr', memory + * read command. + * mdr , + * Inputs: + * addr Start address + * count Number of bytes + * Returns: + * Always 0. Any errors are detected and printed by kdb_getarea. + */ +static int kdb_mdr(unsigned long addr, unsigned int count) +{ + unsigned char c; + while (count--) { + if (kdb_getarea(c, addr)) + return 0; + kdb_printf("%02x", c); + addr++; + } + kdb_printf("\n"); + return 0; +} + +/* + * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4', + * 'md8' 'mdr' and 'mds' commands. + * + * md|mds [ [ []]] + * mdWcN [ [ []]] + * where W = is the width (1, 2, 4 or 8) and N is the count. + * for eg., md1c20 reads 20 bytes, 1 at a time. 
+ *	mdr  <addr arg>,<byte count>
+ */
+/*
+ * kdb_md_line - Print one line of memory dump: the address, up to
+ *	'num' words formatted with 'fmtstr' and, for non-symbolic words,
+ *	a trailing column of printable characters.
+ * Inputs:
+ *	fmtstr		printf format for one word
+ *	addr		address of the first word
+ *	symbolic	non-zero to look each word up with kdbnearsym()
+ *	nosect		non-zero to omit the module/section detail line
+ *	bytesperword	width of each word read
+ *	num		maximum number of words on this line
+ *	repeat		number of words still wanted by the caller
+ *	phys		non-zero to read via kdb_getphysword()
+ */
+static void kdb_md_line(const char *fmtstr, unsigned long addr,
+			int symbolic, int nosect, int bytesperword,
+			int num, int repeat, int phys)
+{
+	/* print just one line of data */
+	kdb_symtab_t symtab;
+	char cbuf[32];
+	char *c = cbuf;
+	int i;
+	unsigned long word;
+
+	memset(cbuf, '\0', sizeof(cbuf));
+	if (phys)
+		kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
+	else
+		kdb_printf(kdb_machreg_fmt0 " ", addr);
+
+	for (i = 0; i < num && repeat--; i++) {
+		if (phys) {
+			if (kdb_getphysword(&word, addr, bytesperword))
+				break;
+		} else if (kdb_getword(&word, addr, bytesperword))
+			break;
+		kdb_printf(fmtstr, word);
+		if (symbolic)
+			kdbnearsym(word, &symtab);
+		else
+			memset(&symtab, 0, sizeof(symtab));
+		if (symtab.sym_name) {
+			kdb_symbol_print(word, &symtab, 0);
+			if (!nosect) {
+				kdb_printf("\n");
+				kdb_printf(" %s %s "
+					   kdb_machreg_fmt " "
+					   kdb_machreg_fmt " "
+					   kdb_machreg_fmt, symtab.mod_name,
+					   symtab.sec_name, symtab.sec_start,
+					   symtab.sym_start, symtab.sym_end);
+			}
+			addr += bytesperword;
+		} else {
+			union {
+				u64 word;
+				unsigned char c[8];
+			} wc;
+			unsigned char *cp;
+			/* Point cp at the first of the bytesperword bytes. */
+#ifdef	__BIG_ENDIAN
+			cp = wc.c + 8 - bytesperword;
+#else
+			cp = wc.c;
+#endif
+			wc.word = word;
+#define printable_char(c) \
+	({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
+			/*
+			 * Each case emits part of the word and falls through
+			 * to the next, so 8 emits 4+2+1+1 characters and
+			 * advances addr by the same amount.
+			 */
+			switch (bytesperword) {
+			case 8:
+				*c++ = printable_char(*cp++);
+				*c++ = printable_char(*cp++);
+				*c++ = printable_char(*cp++);
+				*c++ = printable_char(*cp++);
+				addr += 4;
+				/* fall through */
+			case 4:
+				*c++ = printable_char(*cp++);
+				*c++ = printable_char(*cp++);
+				addr += 2;
+				/* fall through */
+			case 2:
+				*c++ = printable_char(*cp++);
+				addr++;
+				/* fall through */
+			case 1:
+				*c++ = printable_char(*cp++);
+				addr++;
+				break;
+			}
+#undef printable_char
+		}
+	}
+	/* Pad over the words that were not printed, then the characters. */
+	kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
+		   " ", cbuf);
+}
+
+/*
+ * kdb_md - Entry point for the md family of commands.  argv[0] selects
+ *	the variant (md, mdW, mdWcN, mdr, mds, mdp).  With no arguments
+ *	the dump continues from where the previous one stopped, using the
+ *	saved radix, width and repeat count.
+ * Returns:
+ *	0 on success or interruption, otherwise a kdb diagnostic.
+ */
+static int kdb_md(int argc, const char **argv)
+{
+	/* State remembered between invocations for argument-less repeats. */
+	static unsigned long last_addr;
+	static int last_radix, last_bytesperword, last_repeat;
+	int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
+	int nosect = 0;
+	char fmtchar, fmtstr[64];
+	unsigned long addr;
+	unsigned long word;
+	long offset = 0;
+	int symbolic = 0;
+	int valid = 0;
+	int phys = 0;
+
+	kdbgetintenv("MDCOUNT", &mdcount);
+	kdbgetintenv("RADIX", &radix);
+	kdbgetintenv("BYTESPERWORD", &bytesperword);
+
+	/* Assume 'md <addr>' and start with environment values */
+	/* NOTE(review): a BYTESPERWORD of 0 would divide by zero here. */
+	repeat = mdcount * 16 / bytesperword;
+
+	if (strcmp(argv[0], "mdr") == 0) {
+		if (argc != 2)
+			return KDB_ARGCOUNT;
+		valid = 1;
+	} else if (isdigit(argv[0][2])) {
+		/* mdW or mdWcN: the digit is the word width. */
+		bytesperword = (int)(argv[0][2] - '0');
+		if (bytesperword == 0) {
+			bytesperword = last_bytesperword;
+			if (bytesperword == 0)
+				bytesperword = 4;
+		}
+		last_bytesperword = bytesperword;
+		repeat = mdcount * 16 / bytesperword;
+		if (!argv[0][3])
+			valid = 1;
+		else if (argv[0][3] == 'c' && argv[0][4]) {
+			char *p;
+			repeat = simple_strtoul(argv[0] + 4, &p, 10);
+			mdcount = ((repeat * bytesperword) + 15) / 16;
+			valid = !*p;
+		}
+		last_repeat = repeat;
+	} else if (strcmp(argv[0], "md") == 0)
+		valid = 1;
+	else if (strcmp(argv[0], "mds") == 0)
+		valid = 1;
+	else if (strcmp(argv[0], "mdp") == 0) {
+		phys = valid = 1;
+	}
+	if (!valid)
+		return KDB_NOTFOUND;
+
+	if (argc == 0) {
+		/* Continue the previous dump. */
+		if (last_addr == 0)
+			return KDB_ARGCOUNT;
+		addr = last_addr;
+		radix = last_radix;
+		bytesperword = last_bytesperword;
+		repeat = last_repeat;
+		mdcount = ((repeat * bytesperword) + 15) / 16;
+	}
+
+	if (argc) {
+		unsigned long val;
+		int diag, nextarg = 1;
+		diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
+				     &offset, NULL);
+		if (diag)
+			return diag;
+		if (argc > nextarg+2)
+			return KDB_ARGCOUNT;
+
+		/* Optional line count, then optional radix. */
+		if (argc >= nextarg) {
+			diag = kdbgetularg(argv[nextarg], &val);
+			if (!diag) {
+				mdcount = (int) val;
+				repeat = mdcount * 16 / bytesperword;
+			}
+		}
+		if (argc >= nextarg+1) {
+			diag = kdbgetularg(argv[nextarg+1], &val);
+			if (!diag)
+				radix = (int) val;
+		}
+	}
+
+	if (strcmp(argv[0], "mdr") == 0)
+		return kdb_mdr(addr, mdcount);
+
+	switch (radix) {
+	case 10:
+		fmtchar = 'd';
+		break;
+	case 16:
+		fmtchar = 'x';
+		break;
+	case 8:
+		fmtchar = 'o';
+		break;
+	default:
+		return KDB_BADRADIX;
+	}
+
+	last_radix = radix;
+
+	if (bytesperword > KDB_WORD_SIZE)
+		return KDB_BADWIDTH;
+
+	switch (bytesperword) {
+	case 8:
+		sprintf(fmtstr, "%%16.16l%c ", fmtchar);
+		break;
+	case 4:
+		sprintf(fmtstr, "%%8.8l%c ", fmtchar);
+		break;
+	case 2:
+		sprintf(fmtstr, "%%4.4l%c ", fmtchar);
+		break;
+	case 1:
+		sprintf(fmtstr, "%%2.2l%c ", fmtchar);
+		break;
+	default:
+		return KDB_BADWIDTH;
+	}
+
+	last_repeat = repeat;
+	last_bytesperword = bytesperword;
+
+	if (strcmp(argv[0], "mds") == 0) {
+		symbolic = 1;
+		/* Do not save these changes as last_*, they are temporary mds
+		 * overrides.
+		 */
+		bytesperword = KDB_WORD_SIZE;
+		repeat = mdcount;
+		kdbgetintenv("NOSECT", &nosect);
+	}
+
+	/* Round address down modulo BYTESPERWORD */
+
+	addr &= ~(bytesperword-1);
+
+	while (repeat > 0) {
+		unsigned long a;
+		int n, z, num = (symbolic ? 1 : (16 / bytesperword));
+
+		if (KDB_FLAG(CMD_INTERRUPT))
+			return 0;
+		/* Count the leading words (z) that read back as zero. */
+		for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
+			if (phys) {
+				if (kdb_getphysword(&word, a, bytesperword)
+						|| word)
+					break;
+			} else if (kdb_getword(&word, a, bytesperword) || word)
+				break;
+		}
+		n = min(num, repeat);
+		kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
+			    num, repeat, phys);
+		addr += bytesperword * n;
+		repeat -= n;
+		/* Convert the zero word count to whole lines, rounding up. */
+		z = (z + num - 1) / num;
+		if (z > 2) {
+			/* Skip all but the first and last all-zero lines. */
+			int s = num * (z-2);
+			kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
+				   " zero suppressed\n",
+				addr, addr + bytesperword * s - 1);
+			addr += bytesperword * s;
+			repeat -= s;
+		}
+	}
+	last_addr = addr;
+
+	return 0;
+}
+
+/*
+ * kdb_mm - This function implements the 'mm' command.
+ *	mm address-expression new-value
+ * Remarks:
+ *	mm works on machine words, mmW works on bytes.
+ */
+static int kdb_mm(int argc, const char **argv)
+{
+	int diag;
+	unsigned long addr;
+	long offset = 0;
+	unsigned long contents;
+	int nextarg;
+	int width;
+
+	/* Anything after "mm" must be a single width digit. */
+	if (argv[0][2] && !isdigit(argv[0][2]))
+		return KDB_NOTFOUND;
+
+	if (argc < 2)
+		return KDB_ARGCOUNT;
+
+	nextarg = 1;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
+	if (diag)
+		return diag;
+
+	if (nextarg > argc)
+		return KDB_ARGCOUNT;
+	diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
+	if (diag)
+		return diag;
+
+	/* Both expressions together must consume every argument. */
+	if (nextarg != argc + 1)
+		return KDB_ARGCOUNT;
+
+	width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
+	diag = kdb_putword(addr, contents, width);
+	if (diag)
+		return diag;
+
+	kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
+
+	return 0;
+}
+
+/*
+ * kdb_go - This function implements the 'go' command.
+ *	go [address-expression]
+ * Returns:
+ *	KDB_CMD_GO to leave kdb, 0 when a second 'go' is required after a
+ *	catastrophic error, or a kdb diagnostic on bad usage.
+ */
+static int kdb_go(int argc, const char **argv)
+{
+	unsigned long addr;
+	long offset;
+	int nextarg = 1;
+	int rc;
+
+	if (raw_smp_processor_id() != kdb_initial_cpu) {
+		kdb_printf("go must execute on the entry cpu, "
+			   "please use \"cpu %d\" and then execute go\n",
+			   kdb_initial_cpu);
+		return KDB_BADCPUNUM;
+	}
+
+	/* At most one argument is accepted. */
+	if (argc != 0 && argc != 1)
+		return KDB_ARGCOUNT;
+	if (argc == 1) {
+		/* The address is validated here but not used afterwards. */
+		rc = kdbgetaddrarg(argc, argv, &nextarg,
+				   &addr, &offset, NULL);
+		if (rc)
+			return rc;
+	}
+
+	if (!KDB_FLAG(CATASTROPHIC))
+		return KDB_CMD_GO;
+
+	kdb_printf("Catastrophic error detected\n");
+	kdb_printf("kdb_continue_catastrophic=%d, ",
+		kdb_continue_catastrophic);
+	/* Policy 0: the first 'go' only warns; a second one continues. */
+	if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
+		kdb_printf("type go a second time if you really want "
+			   "to continue\n");
+		return 0;
+	}
+	/* Policy 2: never continue, reboot instead. */
+	if (kdb_continue_catastrophic == 2) {
+		kdb_printf("forcing reboot\n");
+		kdb_reboot(0, NULL);
+	}
+	kdb_printf("attempting to continue\n");
+	return KDB_CMD_GO;
+}
+
+/*
+ * kdb_rd - This function implements the 'rd' command.
+ */ +static int kdb_rd(int argc, const char **argv) +{ + int len = kdb_check_regs(); +#if DBG_MAX_REG_NUM > 0 + int i; + char *rname; + int rsize; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (len) + return len; + + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + rsize = dbg_reg_def[i].size * 2; + if (rsize > 16) + rsize = 2; + if (len + strlen(dbg_reg_def[i].name) + 4 + rsize > 80) { + len = 0; + kdb_printf("\n"); + } + if (len) + len += kdb_printf(" "); + switch(dbg_reg_def[i].size * 8) { + case 8: + rname = dbg_get_reg(i, ®8, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %02x", rname, reg8); + break; + case 16: + rname = dbg_get_reg(i, ®16, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %04x", rname, reg16); + break; + case 32: + rname = dbg_get_reg(i, ®32, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %08x", rname, reg32); + break; + case 64: + rname = dbg_get_reg(i, ®64, kdb_current_regs); + if (!rname) + break; + len += kdb_printf("%s: %016llx", rname, reg64); + break; + default: + len += kdb_printf("%s: ??", dbg_reg_def[i].name); + } + } + kdb_printf("\n"); +#else + if (len) + return len; + + kdb_dumpregs(kdb_current_regs); +#endif + return 0; +} + +/* + * kdb_rm - This function implements the 'rm' (register modify) command. + * rm register-name new-contents + * Remarks: + * Allows register modification with the same restrictions as gdb + */ +static int kdb_rm(int argc, const char **argv) +{ +#if DBG_MAX_REG_NUM > 0 + int diag; + const char *rname; + int i; + u64 reg64; + u32 reg32; + u16 reg16; + u8 reg8; + + if (argc != 2) + return KDB_ARGCOUNT; + /* + * Allow presence or absence of leading '%' symbol. 
+ */ + rname = argv[1]; + if (*rname == '%') + rname++; + + diag = kdbgetu64arg(argv[2], ®64); + if (diag) + return diag; + + diag = kdb_check_regs(); + if (diag) + return diag; + + diag = KDB_BADREG; + for (i = 0; i < DBG_MAX_REG_NUM; i++) { + if (strcmp(rname, dbg_reg_def[i].name) == 0) { + diag = 0; + break; + } + } + if (!diag) { + switch(dbg_reg_def[i].size * 8) { + case 8: + reg8 = reg64; + dbg_set_reg(i, ®8, kdb_current_regs); + break; + case 16: + reg16 = reg64; + dbg_set_reg(i, ®16, kdb_current_regs); + break; + case 32: + reg32 = reg64; + dbg_set_reg(i, ®32, kdb_current_regs); + break; + case 64: + dbg_set_reg(i, ®64, kdb_current_regs); + break; + } + } + return diag; +#else + kdb_printf("ERROR: Register set currently not implemented\n"); + return 0; +#endif +} + +#if defined(CONFIG_MAGIC_SYSRQ) +/* + * kdb_sr - This function implements the 'sr' (SYSRQ key) command + * which interfaces to the soi-disant MAGIC SYSRQ functionality. + * sr + */ +static int kdb_sr(int argc, const char **argv) +{ + if (argc != 1) + return KDB_ARGCOUNT; + kdb_trap_printk++; + __handle_sysrq(*argv[1], false); + kdb_trap_printk--; + + return 0; +} +#endif /* CONFIG_MAGIC_SYSRQ */ + +/* + * kdb_ef - This function implements the 'regs' (display exception + * frame) command. This command takes an address and expects to + * find an exception frame at that address, formats and prints + * it. + * regs address-expression + * Remarks: + * Not done yet. + */ +static int kdb_ef(int argc, const char **argv) +{ + int diag; + unsigned long addr; + long offset; + int nextarg; + + if (argc != 1) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + show_regs((struct pt_regs *)addr); + return 0; +} + +#if defined(CONFIG_MODULES) +/* + * kdb_lsmod - This function implements the 'lsmod' command. Lists + * currently loaded kernel modules. + * Mostly taken from userland lsmod. 
+ */ +static int kdb_lsmod(int argc, const char **argv) +{ + struct module *mod; + + if (argc != 0) + return KDB_ARGCOUNT; + + kdb_printf("Module Size modstruct Used by\n"); + list_for_each_entry(mod, kdb_modules, list) { + + kdb_printf("%-20s%8u 0x%p ", mod->name, + mod->core_size, (void *)mod); +#ifdef CONFIG_MODULE_UNLOAD + kdb_printf("%4d ", module_refcount(mod)); +#endif + if (mod->state == MODULE_STATE_GOING) + kdb_printf(" (Unloading)"); + else if (mod->state == MODULE_STATE_COMING) + kdb_printf(" (Loading)"); + else + kdb_printf(" (Live)"); + kdb_printf(" 0x%p", mod->module_core); + +#ifdef CONFIG_MODULE_UNLOAD + { + struct module_use *use; + kdb_printf(" [ "); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); + kdb_printf("]\n"); + } +#endif + } + + return 0; +} + +#endif /* CONFIG_MODULES */ + +/* + * kdb_env - This function implements the 'env' command. Display the + * current environment variables. + */ + +static int kdb_env(int argc, const char **argv) +{ + int i; + + for (i = 0; i < __nenv; i++) { + if (__env[i]) + kdb_printf("%s\n", __env[i]); + } + + if (KDB_DEBUG(MASK)) + kdb_printf("KDBFLAGS=0x%x\n", kdb_flags); + + return 0; +} + +#ifdef CONFIG_PRINTK +/* + * kdb_dmesg - This function implements the 'dmesg' command to display + * the contents of the syslog buffer. 
+ * dmesg [lines] [adjust] + */ +static int kdb_dmesg(int argc, const char **argv) +{ + char *syslog_data[4], *start, *end, c = '\0', *p; + int diag, logging, logsize, lines = 0, adjust = 0, n; + + if (argc > 2) + return KDB_ARGCOUNT; + if (argc) { + char *cp; + lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + lines = 0; + if (argc > 1) { + adjust = simple_strtoul(argv[2], &cp, 0); + if (*cp || adjust < 0) + adjust = 0; + } + } + + /* disable LOGGING if set */ + diag = kdbgetintenv("LOGGING", &logging); + if (!diag && logging) { + const char *setargs[] = { "set", "LOGGING", "0" }; + kdb_set(2, setargs); + } + + /* syslog_data[0,1] physical start, end+1. syslog_data[2,3] + * logical start, end+1. */ + kdb_syslog_data(syslog_data); + if (syslog_data[2] == syslog_data[3]) + return 0; + logsize = syslog_data[1] - syslog_data[0]; + start = syslog_data[2]; + end = syslog_data[3]; +#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0]) + for (n = 0, p = start; p < end; ++p) { + c = *KDB_WRAP(p); + if (c == '\n') + ++n; + } + if (c != '\n') + ++n; + if (lines < 0) { + if (adjust >= n) + kdb_printf("buffer only contains %d lines, nothing " + "printed\n", n); + else if (adjust - lines >= n) + kdb_printf("buffer only contains %d lines, last %d " + "lines printed\n", n, n - adjust); + if (adjust) { + for (; start < end && adjust; ++start) { + if (*KDB_WRAP(start) == '\n') + --adjust; + } + if (start < end) + ++start; + } + for (p = start; p < end && lines; ++p) { + if (*KDB_WRAP(p) == '\n') + ++lines; + } + end = p; + } else if (lines > 0) { + int skip = n - (adjust + lines); + if (adjust >= n) { + kdb_printf("buffer only contains %d lines, " + "nothing printed\n", n); + skip = n; + } else if (skip < 0) { + lines += skip; + skip = 0; + kdb_printf("buffer only contains %d lines, first " + "%d lines printed\n", n, lines); + } + for (; start < end && skip; ++start) { + if (*KDB_WRAP(start) == '\n') + --skip; + } + for (p = start; p < end && lines; ++p) { + if 
(*KDB_WRAP(p) == '\n') + --lines; + } + end = p; + } + /* Do a line at a time (max 200 chars) to reduce protocol overhead */ + c = '\n'; + while (start != end) { + char buf[201]; + p = buf; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + while (start < end && (c = *KDB_WRAP(start)) && + (p - buf) < sizeof(buf)-1) { + ++start; + *p++ = c; + if (c == '\n') + break; + } + *p = '\0'; + kdb_printf("%s", buf); + } + if (c != '\n') + kdb_printf("\n"); + + return 0; +} +#endif /* CONFIG_PRINTK */ +/* + * kdb_cpu - This function implements the 'cpu' command. + * cpu [] + * Returns: + * KDB_CMD_CPU for success, a kdb diagnostic if error + */ +static void kdb_cpu_status(void) +{ + int i, start_cpu, first_print = 1; + char state, prev_state = '?'; + + kdb_printf("Currently on cpu %d\n", raw_smp_processor_id()); + kdb_printf("Available cpus: "); + for (start_cpu = -1, i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) { + state = 'F'; /* cpu is offline */ + } else { + state = ' '; /* cpu is responding to kdb */ + if (kdb_task_state_char(KDB_TSK(i)) == 'I') + state = 'I'; /* idle task */ + } + if (state != prev_state) { + if (prev_state != '?') { + if (!first_print) + kdb_printf(", "); + first_print = 0; + kdb_printf("%d", start_cpu); + if (start_cpu < i-1) + kdb_printf("-%d", i-1); + if (prev_state != ' ') + kdb_printf("(%c)", prev_state); + } + prev_state = state; + start_cpu = i; + } + } + /* print the trailing cpus, ignoring them if they are all offline */ + if (prev_state != 'F') { + if (!first_print) + kdb_printf(", "); + kdb_printf("%d", start_cpu); + if (start_cpu < i-1) + kdb_printf("-%d", i-1); + if (prev_state != ' ') + kdb_printf("(%c)", prev_state); + } + kdb_printf("\n"); +} + +static int kdb_cpu(int argc, const char **argv) +{ + unsigned long cpunum; + int diag; + + if (argc == 0) { + kdb_cpu_status(); + return 0; + } + + if (argc != 1) + return KDB_ARGCOUNT; + + diag = kdbgetularg(argv[1], &cpunum); + if (diag) + return diag; + + /* + * Validate cpunum + */ + if 
((cpunum > NR_CPUS) || !cpu_online(cpunum)) + return KDB_BADCPUNUM; + + dbg_switch_cpu = cpunum; + + /* + * Switch to other cpu + */ + return KDB_CMD_CPU; +} + +/* The user may not realize that ps/bta with no parameters does not print idle + * or sleeping system daemon processes, so tell them how many were suppressed. + */ +void kdb_ps_suppressed(void) +{ + int idle = 0, daemon = 0; + unsigned long mask_I = kdb_task_state_string("I"), + mask_M = kdb_task_state_string("M"); + unsigned long cpu; + const struct task_struct *p, *g; + for_each_online_cpu(cpu) { + p = kdb_curr_task(cpu); + if (kdb_task_state(p, mask_I)) + ++idle; + } + kdb_do_each_thread(g, p) { + if (kdb_task_state(p, mask_M)) + ++daemon; + } kdb_while_each_thread(g, p); + if (idle || daemon) { + if (idle) + kdb_printf("%d idle process%s (state I)%s\n", + idle, idle == 1 ? "" : "es", + daemon ? " and " : ""); + if (daemon) + kdb_printf("%d sleeping system daemon (state M) " + "process%s", daemon, + daemon == 1 ? "" : "es"); + kdb_printf(" suppressed,\nuse 'ps A' to see all.\n"); + } +} + +/* + * kdb_ps - This function implements the 'ps' command which shows a + * list of the active processes. + * ps [DRSTCZEUIMA] All processes, optionally filtered by state + */ +void kdb_ps1(const struct task_struct *p) +{ + int cpu; + unsigned long tmp; + + if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + return; + + cpu = kdb_process_cpu(p); + kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n", + (void *)p, p->pid, p->parent->pid, + kdb_task_has_cpu(p), kdb_process_cpu(p), + kdb_task_state_char(p), + (void *)(&p->thread), + p == kdb_curr_task(raw_smp_processor_id()) ? 
'*' : ' ', + p->comm); + if (kdb_task_has_cpu(p)) { + if (!KDB_TSK(cpu)) { + kdb_printf(" Error: no saved data for this cpu\n"); + } else { + if (KDB_TSK(cpu) != p) + kdb_printf(" Error: does not match running " + "process table (0x%p)\n", KDB_TSK(cpu)); + } + } +} + +static int kdb_ps(int argc, const char **argv) +{ + struct task_struct *g, *p; + unsigned long mask, cpu; + + if (argc == 0) + kdb_ps_suppressed(); + kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n", + (int)(2*sizeof(void *))+2, "Task Addr", + (int)(2*sizeof(void *))+2, "Thread"); + mask = kdb_task_state_string(argc ? argv[1] : NULL); + /* Run the active tasks first */ + for_each_online_cpu(cpu) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + p = kdb_curr_task(cpu); + if (kdb_task_state(p, mask)) + kdb_ps1(p); + } + kdb_printf("\n"); + /* Now the real tasks */ + kdb_do_each_thread(g, p) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + if (kdb_task_state(p, mask)) + kdb_ps1(p); + } kdb_while_each_thread(g, p); + + return 0; +} + +/* + * kdb_pid - This function implements the 'pid' command which switches + * the currently active process. + * pid [ | R] + */ +static int kdb_pid(int argc, const char **argv) +{ + struct task_struct *p; + unsigned long val; + int diag; + + if (argc > 1) + return KDB_ARGCOUNT; + + if (argc) { + if (strcmp(argv[1], "R") == 0) { + p = KDB_TSK(kdb_initial_cpu); + } else { + diag = kdbgetularg(argv[1], &val); + if (diag) + return KDB_BADINT; + + p = find_task_by_pid_ns((pid_t)val, &init_pid_ns); + if (!p) { + kdb_printf("No task with pid=%d\n", (pid_t)val); + return 0; + } + } + kdb_set_current_task(p); + } + kdb_printf("KDB current process is %s(pid=%d)\n", + kdb_current_task->comm, + kdb_current_task->pid); + + return 0; +} + +/* + * kdb_ll - This function implements the 'll' command which follows a + * linked list and executes an arbitrary command for each + * element. 
+ */ +static int kdb_ll(int argc, const char **argv) +{ + int diag = 0; + unsigned long addr; + long offset = 0; + unsigned long va; + unsigned long linkoffset; + int nextarg; + const char *command; + + if (argc != 3) + return KDB_ARGCOUNT; + + nextarg = 1; + diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL); + if (diag) + return diag; + + diag = kdbgetularg(argv[2], &linkoffset); + if (diag) + return diag; + + /* + * Using the starting address as + * the first element in the list, and assuming that + * the list ends with a null pointer. + */ + + va = addr; + command = kdb_strdup(argv[3], GFP_KDB); + if (!command) { + kdb_printf("%s: cannot duplicate command\n", __func__); + return 0; + } + /* Recursive use of kdb_parse, do not use argv after this point */ + argv = NULL; + + while (va) { + char buf[80]; + + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + + sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); + diag = kdb_parse(buf); + if (diag) + goto out; + + addr = va + linkoffset; + if (kdb_getword(&va, addr, sizeof(va))) + goto out; + } + +out: + kfree(command); + return diag; +} + +static int kdb_kgdb(int argc, const char **argv) +{ + return KDB_CMD_KGDB; +} + +/* + * kdb_help - This function implements the 'help' and '?' commands. + */ +static int kdb_help(int argc, const char **argv) +{ + kdbtab_t *kt; + int i; + + kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description"); + kdb_printf("-----------------------------" + "-----------------------------\n"); + for_each_kdbcmd(kt, i) { + if (kt->cmd_name) + kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name, + kt->cmd_usage, kt->cmd_help); + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + } + return 0; +} + +/* + * kdb_kill - This function implements the 'kill' commands. 
+ */ +static int kdb_kill(int argc, const char **argv) +{ + long sig, pid; + char *endp; + struct task_struct *p; + struct siginfo info; + + if (argc != 2) + return KDB_ARGCOUNT; + + sig = simple_strtol(argv[1], &endp, 0); + if (*endp) + return KDB_BADINT; + if (sig >= 0) { + kdb_printf("Invalid signal parameter.<-signal>\n"); + return 0; + } + sig = -sig; + + pid = simple_strtol(argv[2], &endp, 0); + if (*endp) + return KDB_BADINT; + if (pid <= 0) { + kdb_printf("Process ID must be large than 0.\n"); + return 0; + } + + /* Find the process. */ + p = find_task_by_pid_ns(pid, &init_pid_ns); + if (!p) { + kdb_printf("The specified process isn't found.\n"); + return 0; + } + p = p->group_leader; + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_USER; + info.si_pid = pid; /* same capabilities as process being signalled */ + info.si_uid = 0; /* kdb has root authority */ + kdb_send_sig_info(p, &info); + return 0; +} + +struct kdb_tm { + int tm_sec; /* seconds */ + int tm_min; /* minutes */ + int tm_hour; /* hours */ + int tm_mday; /* day of the month */ + int tm_mon; /* month */ + int tm_year; /* year */ +}; + +static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm) +{ + /* This will work from 1970-2099, 2100 is not a leap year */ + static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31, + 31, 30, 31, 30, 31 }; + memset(tm, 0, sizeof(*tm)); + tm->tm_sec = tv->tv_sec % (24 * 60 * 60); + tm->tm_mday = tv->tv_sec / (24 * 60 * 60) + + (2 * 365 + 1); /* shift base from 1970 to 1968 */ + tm->tm_min = tm->tm_sec / 60 % 60; + tm->tm_hour = tm->tm_sec / 60 / 60; + tm->tm_sec = tm->tm_sec % 60; + tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1)); + tm->tm_mday %= (4*365+1); + mon_day[1] = 29; + while (tm->tm_mday >= mon_day[tm->tm_mon]) { + tm->tm_mday -= mon_day[tm->tm_mon]; + if (++tm->tm_mon == 12) { + tm->tm_mon = 0; + ++tm->tm_year; + mon_day[1] = 28; + } + } + ++tm->tm_mday; +} + +/* + * Most of this code has been lifted from kernel/timer.c::sys_sysinfo(). 
+ * I cannot call that code directly from kdb, it has an unconditional + * cli()/sti() and calls routines that take locks which can stop the debugger. + */ +static void kdb_sysinfo(struct sysinfo *val) +{ + struct timespec uptime; + do_posix_clock_monotonic_gettime(&uptime); + memset(val, 0, sizeof(*val)); + val->uptime = uptime.tv_sec; + val->loads[0] = avenrun[0]; + val->loads[1] = avenrun[1]; + val->loads[2] = avenrun[2]; + val->procs = nr_threads-1; + si_meminfo(val); + + return; +} + +/* + * kdb_summary - This function implements the 'summary' command. + */ +static int kdb_summary(int argc, const char **argv) +{ + struct timespec now; + struct kdb_tm tm; + struct sysinfo val; + + if (argc) + return KDB_ARGCOUNT; + + kdb_printf("sysname %s\n", init_uts_ns.name.sysname); + kdb_printf("release %s\n", init_uts_ns.name.release); + kdb_printf("version %s\n", init_uts_ns.name.version); + kdb_printf("machine %s\n", init_uts_ns.name.machine); + kdb_printf("nodename %s\n", init_uts_ns.name.nodename); + kdb_printf("domainname %s\n", init_uts_ns.name.domainname); + kdb_printf("ccversion %s\n", __stringify(CCVERSION)); + + now = __current_kernel_time(); + kdb_gmtime(&now, &tm); + kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d " + "tz_minuteswest %d\n", + 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, + sys_tz.tz_minuteswest); + + kdb_sysinfo(&val); + kdb_printf("uptime "); + if (val.uptime > (24*60*60)) { + int days = val.uptime / (24*60*60); + val.uptime %= (24*60*60); + kdb_printf("%d day%s ", days, days == 1 ? 
"" : "s"); + } + kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60); + + /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */ + +#define LOAD_INT(x) ((x) >> FSHIFT) +#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100) + kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n", + LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]), + LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]), + LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2])); +#undef LOAD_INT +#undef LOAD_FRAC + /* Display in kilobytes */ +#define K(x) ((x) << (PAGE_SHIFT - 10)) + kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n" + "Buffers: %8lu kB\n", + val.totalram, val.freeram, val.bufferram); + return 0; +} + +/* + * kdb_per_cpu - This function implements the 'per_cpu' command. + */ +static int kdb_per_cpu(int argc, const char **argv) +{ + char fmtstr[64]; + int cpu, diag, nextarg = 1; + unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL; + + if (argc < 1 || argc > 3) + return KDB_ARGCOUNT; + + diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL); + if (diag) + return diag; + + if (argc >= 2) { + diag = kdbgetularg(argv[2], &bytesperword); + if (diag) + return diag; + } + if (!bytesperword) + bytesperword = KDB_WORD_SIZE; + else if (bytesperword > KDB_WORD_SIZE) + return KDB_BADWIDTH; + sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword)); + if (argc >= 3) { + diag = kdbgetularg(argv[3], &whichcpu); + if (diag) + return diag; + if (!cpu_online(whichcpu)) { + kdb_printf("cpu %ld is not online\n", whichcpu); + return KDB_BADCPUNUM; + } + } + + /* Most architectures use __per_cpu_offset[cpu], some use + * __per_cpu_offset(cpu), smp has no __per_cpu_offset. 
+ */ +#ifdef __per_cpu_offset +#define KDB_PCU(cpu) __per_cpu_offset(cpu) +#else +#ifdef CONFIG_SMP +#define KDB_PCU(cpu) __per_cpu_offset[cpu] +#else +#define KDB_PCU(cpu) 0 +#endif +#endif + for_each_online_cpu(cpu) { + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + + if (whichcpu != ~0UL && whichcpu != cpu) + continue; + addr = symaddr + KDB_PCU(cpu); + diag = kdb_getword(&val, addr, bytesperword); + if (diag) { + kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to " + "read, diag=%d\n", cpu, addr, diag); + continue; + } + kdb_printf("%5d ", cpu); + kdb_md_line(fmtstr, addr, + bytesperword == KDB_WORD_SIZE, + 1, bytesperword, 1, 1, 0); + } +#undef KDB_PCU + return 0; +} + +/* + * display help for the use of cmd | grep pattern + */ +static int kdb_grep_help(int argc, const char **argv) +{ + kdb_printf("Usage of cmd args | grep pattern:\n"); + kdb_printf(" Any command's output may be filtered through an "); + kdb_printf("emulated 'pipe'.\n"); + kdb_printf(" 'grep' is just a key word.\n"); + kdb_printf(" The pattern may include a very limited set of " + "metacharacters:\n"); + kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n"); + kdb_printf(" And if there are spaces in the pattern, you may " + "quote it:\n"); + kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\"" + " or \"^pat tern$\"\n"); + return 0; +} + +/* + * kdb_register_repeat - This function is used to register a kernel + * debugger command. + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * repeat Does the command auto repeat on enter? + * Returns: + * zero for success, one if a duplicate command. 
+ */ +#define kdb_command_extend 50 /* arbitrary */ +int kdb_register_repeat(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen, + kdb_repeat_t repeat) +{ + int i; + kdbtab_t *kp; + + /* + * Brute force method to determine duplicates + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + kdb_printf("Duplicate kdb command registered: " + "%s, func %p help %s\n", cmd, func, help); + return 1; + } + } + + /* + * Insert command into first available location in table + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name == NULL) + break; + } + + if (i >= kdb_max_commands) { + kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX + + kdb_command_extend) * sizeof(*new), GFP_KDB); + if (!new) { + kdb_printf("Could not allocate new kdb_command " + "table\n"); + return 1; + } + if (kdb_commands) { + memcpy(new, kdb_commands, + (kdb_max_commands - KDB_BASE_CMD_MAX) * sizeof(*new)); + kfree(kdb_commands); + } + memset(new + kdb_max_commands, 0, + kdb_command_extend * sizeof(*new)); + kdb_commands = new; + kp = kdb_commands + kdb_max_commands - KDB_BASE_CMD_MAX; + kdb_max_commands += kdb_command_extend; + } + + kp->cmd_name = cmd; + kp->cmd_func = func; + kp->cmd_usage = usage; + kp->cmd_help = help; + kp->cmd_flags = 0; + kp->cmd_minlen = minlen; + kp->cmd_repeat = repeat; + + return 0; +} +EXPORT_SYMBOL_GPL(kdb_register_repeat); + + +/* + * kdb_register - Compatibility register function for commands that do + * not need to specify a repeat state. Equivalent to + * kdb_register_repeat with KDB_REPEAT_NONE. + * Inputs: + * cmd Command name + * func Function to execute the command + * usage A simple usage string showing arguments + * help A simple help string describing command + * Returns: + * zero for success, one if a duplicate command. 
+ */ +int kdb_register(char *cmd, + kdb_func_t func, + char *usage, + char *help, + short minlen) +{ + return kdb_register_repeat(cmd, func, usage, help, minlen, + KDB_REPEAT_NONE); +} +EXPORT_SYMBOL_GPL(kdb_register); + +/* + * kdb_unregister - This function is used to unregister a kernel + * debugger command. It is generally called when a module which + * implements kdb commands is unloaded. + * Inputs: + * cmd Command name + * Returns: + * zero for success, one command not registered. + */ +int kdb_unregister(char *cmd) +{ + int i; + kdbtab_t *kp; + + /* + * find the command. + */ + for_each_kdbcmd(kp, i) { + if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) { + kp->cmd_name = NULL; + return 0; + } + } + + /* Couldn't find it. */ + return 1; +} +EXPORT_SYMBOL_GPL(kdb_unregister); + +/* Initialize the kdb command table. */ +static void __init kdb_inittab(void) +{ + int i; + kdbtab_t *kp; + + for_each_kdbcmd(kp, i) + kp->cmd_name = NULL; + + kdb_register_repeat("md", kdb_md, "", + "Display Memory Contents, also mdWcN, e.g. 
md8c1", 1, + KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mdr", kdb_md, " ", + "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mdp", kdb_md, " ", + "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mds", kdb_md, "", + "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("mm", kdb_mm, " ", + "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS); + kdb_register_repeat("go", kdb_go, "[]", + "Continue Execution", 1, KDB_REPEAT_NONE); + kdb_register_repeat("rd", kdb_rd, "", + "Display Registers", 0, KDB_REPEAT_NONE); + kdb_register_repeat("rm", kdb_rm, " ", + "Modify Registers", 0, KDB_REPEAT_NONE); + kdb_register_repeat("ef", kdb_ef, "", + "Display exception frame", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bt", kdb_bt, "[]", + "Stack traceback", 1, KDB_REPEAT_NONE); + kdb_register_repeat("btp", kdb_bt, "", + "Display stack for process ", 0, KDB_REPEAT_NONE); + kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]", + "Display stack all processes", 0, KDB_REPEAT_NONE); + kdb_register_repeat("btc", kdb_bt, "", + "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE); + kdb_register_repeat("btt", kdb_bt, "", + "Backtrace process given its struct task address", 0, + KDB_REPEAT_NONE); + kdb_register_repeat("ll", kdb_ll, " ", + "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE); + kdb_register_repeat("env", kdb_env, "", + "Show environment variables", 0, KDB_REPEAT_NONE); + kdb_register_repeat("set", kdb_set, "", + "Set environment variables", 0, KDB_REPEAT_NONE); + kdb_register_repeat("help", kdb_help, "", + "Display Help Message", 1, KDB_REPEAT_NONE); + kdb_register_repeat("?", kdb_help, "", + "Display Help Message", 0, KDB_REPEAT_NONE); + kdb_register_repeat("cpu", kdb_cpu, "", + "Switch to new cpu", 0, KDB_REPEAT_NONE); + kdb_register_repeat("kgdb", kdb_kgdb, "", + "Enter kgdb mode", 0, KDB_REPEAT_NONE); + kdb_register_repeat("ps", kdb_ps, "[|A]", + "Display active task list", 0, 
KDB_REPEAT_NONE); + kdb_register_repeat("pid", kdb_pid, "", + "Switch to another task", 0, KDB_REPEAT_NONE); + kdb_register_repeat("reboot", kdb_reboot, "", + "Reboot the machine immediately", 0, KDB_REPEAT_NONE); +#if defined(CONFIG_MODULES) + kdb_register_repeat("lsmod", kdb_lsmod, "", + "List loaded kernel modules", 0, KDB_REPEAT_NONE); +#endif +#if defined(CONFIG_MAGIC_SYSRQ) + kdb_register_repeat("sr", kdb_sr, "", + "Magic SysRq key", 0, KDB_REPEAT_NONE); +#endif +#if defined(CONFIG_PRINTK) + kdb_register_repeat("dmesg", kdb_dmesg, "[lines]", + "Display syslog buffer", 0, KDB_REPEAT_NONE); +#endif + kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"", + "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE); + kdb_register_repeat("kill", kdb_kill, "<-signal> ", + "Send a signal to a process", 0, KDB_REPEAT_NONE); + kdb_register_repeat("summary", kdb_summary, "", + "Summarize the system", 4, KDB_REPEAT_NONE); + kdb_register_repeat("per_cpu", kdb_per_cpu, " [] []", + "Display per_cpu variables", 3, KDB_REPEAT_NONE); + kdb_register_repeat("grephelp", kdb_grep_help, "", + "Display help on | grep", 0, KDB_REPEAT_NONE); +} + +/* Execute any commands defined in kdb_cmds. 
*/ +static void __init kdb_cmd_init(void) +{ + int i, diag; + for (i = 0; kdb_cmds[i]; ++i) { + diag = kdb_parse(kdb_cmds[i]); + if (diag) + kdb_printf("kdb command %s failed, kdb diag %d\n", + kdb_cmds[i], diag); + } + if (defcmd_in_progress) { + kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n"); + kdb_parse("endefcmd"); + } +} + +/* Initialize kdb_printf, breakpoint tables and kdb state */ +void __init kdb_init(int lvl) +{ + static int kdb_init_lvl = KDB_NOT_INITIALIZED; + int i; + + if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl) + return; + for (i = kdb_init_lvl; i < lvl; i++) { + switch (i) { + case KDB_NOT_INITIALIZED: + kdb_inittab(); /* Initialize Command Table */ + kdb_initbptab(); /* Initialize Breakpoints */ + break; + case KDB_INIT_EARLY: + kdb_cmd_init(); /* Build kdb_cmds tables */ + break; + } + } + kdb_init_lvl = lvl; +} diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h new file mode 100644 index 00000000..35d69ed1 --- /dev/null +++ b/kernel/debug/kdb/kdb_private.h @@ -0,0 +1,259 @@ +#ifndef _KDBPRIVATE_H +#define _KDBPRIVATE_H + +/* + * Kernel Debugger Architecture Independent Private Headers + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + */ + +#include +#include "../debug_core.h" + +/* Kernel Debugger Command codes. Must not overlap with error codes. 
*/ +#define KDB_CMD_GO (-1001) +#define KDB_CMD_CPU (-1002) +#define KDB_CMD_SS (-1003) +#define KDB_CMD_SSB (-1004) +#define KDB_CMD_KGDB (-1005) +#define KDB_CMD_KGDB2 (-1006) + +/* Internal debug flags */ +#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */ +#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */ +#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */ +#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */ +#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */ +#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */ +#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */ +#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */ + +#define KDB_DEBUG(flag) (kdb_flags & \ + (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT)) +#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \ + kdb_print_state(text, value) + +#if BITS_PER_LONG == 32 + +#define KDB_PLATFORM_ENV "BYTESPERWORD=4" + +#define kdb_machreg_fmt "0x%lx" +#define kdb_machreg_fmt0 "0x%08lx" +#define kdb_bfd_vma_fmt "0x%lx" +#define kdb_bfd_vma_fmt0 "0x%08lx" +#define kdb_elfw_addr_fmt "0x%x" +#define kdb_elfw_addr_fmt0 "0x%08x" +#define kdb_f_count_fmt "%d" + +#elif BITS_PER_LONG == 64 + +#define KDB_PLATFORM_ENV "BYTESPERWORD=8" + +#define kdb_machreg_fmt "0x%lx" +#define kdb_machreg_fmt0 "0x%016lx" +#define kdb_bfd_vma_fmt "0x%lx" +#define kdb_bfd_vma_fmt0 "0x%016lx" +#define kdb_elfw_addr_fmt "0x%x" +#define kdb_elfw_addr_fmt0 "0x%016x" +#define kdb_f_count_fmt "%ld" + +#endif + +/* + * KDB_MAXBPT describes the total number of breakpoints + * supported by this architecure. + */ +#define KDB_MAXBPT 16 + +/* Symbol table format returned by kallsyms. 
*/ +typedef struct __ksymtab { + unsigned long value; /* Address of symbol */ + const char *mod_name; /* Module containing symbol or + * "kernel" */ + unsigned long mod_start; + unsigned long mod_end; + const char *sec_name; /* Section containing symbol */ + unsigned long sec_start; + unsigned long sec_end; + const char *sym_name; /* Full symbol name, including + * any version */ + unsigned long sym_start; + unsigned long sym_end; + } kdb_symtab_t; +extern int kallsyms_symbol_next(char *prefix_name, int flag); +extern int kallsyms_symbol_complete(char *prefix_name, int max_len); + +/* Exported Symbols for kernel loadable modules to use. */ +extern int kdb_getarea_size(void *, unsigned long, size_t); +extern int kdb_putarea_size(unsigned long, void *, size_t); + +/* + * Like get_user and put_user, kdb_getarea and kdb_putarea take variable + * names, not pointers. The underlying *_size functions take pointers. + */ +#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x))) +#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x))) + +extern int kdb_getphysword(unsigned long *word, + unsigned long addr, size_t size); +extern int kdb_getword(unsigned long *, unsigned long, size_t); +extern int kdb_putword(unsigned long, unsigned long, size_t); + +extern int kdbgetularg(const char *, unsigned long *); +extern int kdbgetu64arg(const char *, u64 *); +extern char *kdbgetenv(const char *); +extern int kdbgetaddrarg(int, const char **, int*, unsigned long *, + long *, char **); +extern int kdbgetsymval(const char *, kdb_symtab_t *); +extern int kdbnearsym(unsigned long, kdb_symtab_t *); +extern void kdbnearsym_cleanup(void); +extern char *kdb_strdup(const char *str, gfp_t type); +extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int); + +/* Routine for debugging the debugger state. 
*/ +extern void kdb_print_state(const char *, int); + +extern int kdb_state; +#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */ +#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */ +#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */ +#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under + * kdb control */ +#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */ +#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */ +#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command, + * DOING_SS is also set */ +#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint + * after one ss, independent of + * DOING_SS */ +#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */ +#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */ +#define KDB_STATE_PAGER 0x00000400 /* pager is available */ +#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching + * back to initial cpu */ +#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */ +#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */ +#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */ +#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been + * adjusted */ +#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */ +#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via + * keyboard on this cpu */ +#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */ +#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */ +#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */ +#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */ +#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch + * specific use */ + +#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag) +#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag)) +#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag)) + +extern int kdb_nextline; /* Current number of lines displayed */ + 
+typedef struct _kdb_bp { + unsigned long bp_addr; /* Address breakpoint is present at */ + unsigned int bp_free:1; /* This entry is available */ + unsigned int bp_enabled:1; /* Breakpoint is active in register */ + unsigned int bp_type:4; /* Uses hardware register */ + unsigned int bp_installed:1; /* Breakpoint is installed */ + unsigned int bp_delay:1; /* Do delayed bp handling */ + unsigned int bp_delayed:1; /* Delayed breakpoint */ + unsigned int bph_length; /* HW break length */ +} kdb_bp_t; + +#ifdef CONFIG_KGDB_KDB +extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */]; + +/* The KDB shell command table */ +typedef struct _kdbtab { + char *cmd_name; /* Command name */ + kdb_func_t cmd_func; /* Function to execute command */ + char *cmd_usage; /* Usage String for this command */ + char *cmd_help; /* Help message for this command */ + short cmd_flags; /* Parsing flags */ + short cmd_minlen; /* Minimum legal # command + * chars required */ + kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? 
*/ +} kdbtab_t; + +extern int kdb_bt(int, const char **); /* KDB display back trace */ + +/* KDB breakpoint management functions */ +extern void kdb_initbptab(void); +extern void kdb_bp_install(struct pt_regs *); +extern void kdb_bp_remove(void); + +typedef enum { + KDB_DB_BPT, /* Breakpoint */ + KDB_DB_SS, /* Single-step trap */ + KDB_DB_SSB, /* Single step to branch */ + KDB_DB_SSBPT, /* Single step over breakpoint */ + KDB_DB_NOBPT /* Spurious breakpoint */ +} kdb_dbtrap_t; + +extern int kdb_main_loop(kdb_reason_t, kdb_reason_t, + int, kdb_dbtrap_t, struct pt_regs *); + +/* Miscellaneous functions and data areas */ +extern int kdb_grepping_flag; +extern char kdb_grep_string[]; +extern int kdb_grep_leading; +extern int kdb_grep_trailing; +extern char *kdb_cmds[]; +extern void kdb_syslog_data(char *syslog_data[]); +extern unsigned long kdb_task_state_string(const char *); +extern char kdb_task_state_char (const struct task_struct *); +extern unsigned long kdb_task_state(const struct task_struct *p, + unsigned long mask); +extern void kdb_ps_suppressed(void); +extern void kdb_ps1(const struct task_struct *p); +extern void kdb_print_nameval(const char *name, unsigned long val); +extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); +extern void kdb_meminfo_proc_show(void); +extern char *kdb_getstr(char *, size_t, char *); + +/* Defines for kdb_symbol_print */ +#define KDB_SP_SPACEB 0x0001 /* Space before string */ +#define KDB_SP_SPACEA 0x0002 /* Space after string */ +#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */ +#define KDB_SP_VALUE 0x0008 /* Print the value of the address */ +#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */ +#define KDB_SP_NEWLINE 0x0020 /* Newline after string */ +#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN) + +#define KDB_TSK(cpu) kgdb_info[cpu].task +#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo + +extern struct task_struct *kdb_curr_task(int); + +#define kdb_task_has_cpu(p) 
(task_curr(p)) + +/* Simplify coexistence with NPTL */ +#define kdb_do_each_thread(g, p) do_each_thread(g, p) +#define kdb_while_each_thread(g, p) while_each_thread(g, p) + +#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL) + +extern void *debug_kmalloc(size_t size, gfp_t flags); +extern void debug_kfree(void *); +extern void debug_kusage(void); + +extern void kdb_set_current_task(struct task_struct *); +extern struct task_struct *kdb_current_task; +#ifdef CONFIG_MODULES +extern struct list_head *kdb_modules; +#endif /* CONFIG_MODULES */ + +extern char kdb_prompt_str[]; + +#define KDB_WORD_SIZE ((int)sizeof(unsigned long)) + +#endif /* CONFIG_KGDB_KDB */ +#endif /* !_KDBPRIVATE_H */ diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c new file mode 100644 index 00000000..5532dd37 --- /dev/null +++ b/kernel/debug/kdb/kdb_support.c @@ -0,0 +1,927 @@ +/* + * Kernel Debugger Architecture Independent Support Functions + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved. + * 03/02/13 added new 2.5 kallsyms + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "kdb_private.h" + +/* + * kdbgetsymval - Return the address of the given symbol. 
+ * + * Parameters: + * symname Character string containing symbol name + * symtab Structure to receive results + * Returns: + * 0 Symbol not found, symtab zero filled + * 1 Symbol mapped to module/symbol/section, data in symtab + */ +int kdbgetsymval(const char *symname, kdb_symtab_t *symtab) +{ + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname, + symtab); + memset(symtab, 0, sizeof(*symtab)); + symtab->sym_start = kallsyms_lookup_name(symname); + if (symtab->sym_start) { + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: returns 1, " + "symtab->sym_start=0x%lx\n", + symtab->sym_start); + return 1; + } + if (KDB_DEBUG(AR)) + kdb_printf("kdbgetsymval: returns 0\n"); + return 0; +} +EXPORT_SYMBOL(kdbgetsymval); + +static char *kdb_name_table[100]; /* arbitrary size */ + +/* + * kdbnearsym - Return the name of the symbol with the nearest address + * less than 'addr'. + * + * Parameters: + * addr Address to check for symbol near + * symtab Structure to receive results + * Returns: + * 0 No sections contain this address, symtab zero filled + * 1 Address mapped to module/symbol/section, data in symtab + * Remarks: + * 2.6 kallsyms has a "feature" where it unpacks the name into a + * string. If that string is reused before the caller expects it + * then the caller sees its string change without warning. To + * avoid cluttering up the main kdb code with lots of kdb_strdup, + * tests and kfree calls, kdbnearsym maintains an LRU list of the + * last few unique strings. The list is sized large enough to + * hold active strings, no kdb caller of kdbnearsym makes more + * than ~20 later calls before using a saved value. 
+ */ +int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab) +{ + int ret = 0; + unsigned long symbolsize = 0; + unsigned long offset = 0; +#define knt1_size 128 /* must be >= kallsyms table size */ + char *knt1 = NULL; + + if (KDB_DEBUG(AR)) + kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab); + memset(symtab, 0, sizeof(*symtab)); + + if (addr < 4096) + goto out; + knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC); + if (!knt1) { + kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n", + addr); + goto out; + } + symtab->sym_name = kallsyms_lookup(addr, &symbolsize , &offset, + (char **)(&symtab->mod_name), knt1); + if (offset > 8*1024*1024) { + symtab->sym_name = NULL; + addr = offset = symbolsize = 0; + } + symtab->sym_start = addr - offset; + symtab->sym_end = symtab->sym_start + symbolsize; + ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0'; + + if (ret) { + int i; + /* Another 2.6 kallsyms "feature". Sometimes the sym_name is + * set but the buffer passed into kallsyms_lookup is not used, + * so it contains garbage. The caller has to work out which + * buffer needs to be saved. + * + * What was Rusty smoking when he wrote that code? 
+ */ + if (symtab->sym_name != knt1) { + strncpy(knt1, symtab->sym_name, knt1_size); + knt1[knt1_size-1] = '\0'; + } + for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { + if (kdb_name_table[i] && + strcmp(kdb_name_table[i], knt1) == 0) + break; + } + if (i >= ARRAY_SIZE(kdb_name_table)) { + debug_kfree(kdb_name_table[0]); + memcpy(kdb_name_table, kdb_name_table+1, + sizeof(kdb_name_table[0]) * + (ARRAY_SIZE(kdb_name_table)-1)); + } else { + debug_kfree(knt1); + knt1 = kdb_name_table[i]; + memcpy(kdb_name_table+i, kdb_name_table+i+1, + sizeof(kdb_name_table[0]) * + (ARRAY_SIZE(kdb_name_table)-i-1)); + } + i = ARRAY_SIZE(kdb_name_table) - 1; + kdb_name_table[i] = knt1; + symtab->sym_name = kdb_name_table[i]; + knt1 = NULL; + } + + if (symtab->mod_name == NULL) + symtab->mod_name = "kernel"; + if (KDB_DEBUG(AR)) + kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, " + "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret, + symtab->sym_start, symtab->mod_name, symtab->sym_name, + symtab->sym_name); + +out: + debug_kfree(knt1); + return ret; +} + +void kdbnearsym_cleanup(void) +{ + int i; + for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) { + if (kdb_name_table[i]) { + debug_kfree(kdb_name_table[i]); + kdb_name_table[i] = NULL; + } + } +} + +static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1]; + +/* + * kallsyms_symbol_complete + * + * Parameters: + * prefix_name prefix of a symbol name to lookup + * max_len maximum length that can be returned + * Returns: + * Number of symbols which match the given prefix. + * Notes: + * prefix_name is changed to contain the longest unique prefix that + * starts with this prefix (tab completion). 
+ */ +int kallsyms_symbol_complete(char *prefix_name, int max_len) +{ + loff_t pos = 0; + int prefix_len = strlen(prefix_name), prev_len = 0; + int i, number = 0; + const char *name; + + while ((name = kdb_walk_kallsyms(&pos))) { + if (strncmp(name, prefix_name, prefix_len) == 0) { + strcpy(ks_namebuf, name); + /* Work out the longest name that matches the prefix */ + if (++number == 1) { + prev_len = min_t(int, max_len-1, + strlen(ks_namebuf)); + memcpy(ks_namebuf_prev, ks_namebuf, prev_len); + ks_namebuf_prev[prev_len] = '\0'; + continue; + } + for (i = 0; i < prev_len; i++) { + if (ks_namebuf[i] != ks_namebuf_prev[i]) { + prev_len = i; + ks_namebuf_prev[i] = '\0'; + break; + } + } + } + } + if (prev_len > prefix_len) + memcpy(prefix_name, ks_namebuf_prev, prev_len+1); + return number; +} + +/* + * kallsyms_symbol_next + * + * Parameters: + * prefix_name prefix of a symbol name to lookup + * flag 0 means search from the head, 1 means continue search. + * Returns: + * 1 if a symbol matches the given prefix. + * 0 if no string found + */ +int kallsyms_symbol_next(char *prefix_name, int flag) +{ + int prefix_len = strlen(prefix_name); + static loff_t pos; + const char *name; + + if (!flag) + pos = 0; + + while ((name = kdb_walk_kallsyms(&pos))) { + if (strncmp(name, prefix_name, prefix_len) == 0) { + strncpy(prefix_name, name, strlen(name)+1); + return 1; + } + } + return 0; +} + +/* + * kdb_symbol_print - Standard method for printing a symbol name and offset. + * Inputs: + * addr Address to be printed. + * symtab Address of symbol data, if NULL this routine does its + * own lookup. + * punc Punctuation for string, bit field. + * Remarks: + * The string and its punctuation is only printed if the address + * is inside the kernel, except that the value is always printed + * when requested. 
+ */ +void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p, + unsigned int punc) +{ + kdb_symtab_t symtab, *symtab_p2; + if (symtab_p) { + symtab_p2 = (kdb_symtab_t *)symtab_p; + } else { + symtab_p2 = &symtab; + kdbnearsym(addr, symtab_p2); + } + if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE))) + return; + if (punc & KDB_SP_SPACEB) + kdb_printf(" "); + if (punc & KDB_SP_VALUE) + kdb_printf(kdb_machreg_fmt0, addr); + if (symtab_p2->sym_name) { + if (punc & KDB_SP_VALUE) + kdb_printf(" "); + if (punc & KDB_SP_PAREN) + kdb_printf("("); + if (strcmp(symtab_p2->mod_name, "kernel")) + kdb_printf("[%s]", symtab_p2->mod_name); + kdb_printf("%s", symtab_p2->sym_name); + if (addr != symtab_p2->sym_start) + kdb_printf("+0x%lx", addr - symtab_p2->sym_start); + if (punc & KDB_SP_SYMSIZE) + kdb_printf("/0x%lx", + symtab_p2->sym_end - symtab_p2->sym_start); + if (punc & KDB_SP_PAREN) + kdb_printf(")"); + } + if (punc & KDB_SP_SPACEA) + kdb_printf(" "); + if (punc & KDB_SP_NEWLINE) + kdb_printf("\n"); +} + +/* + * kdb_strdup - kdb equivalent of strdup, for disasm code. + * Inputs: + * str The string to duplicate. + * type Flags to kmalloc for the new string. + * Returns: + * Address of the new string, NULL if storage could not be allocated. + * Remarks: + * This is not in lib/string.c because it uses kmalloc which is not + * available when string.o is used in boot loaders. + */ +char *kdb_strdup(const char *str, gfp_t type) +{ + int n = strlen(str)+1; + char *s = kmalloc(n, type); + if (!s) + return NULL; + return strcpy(s, str); +} + +/* + * kdb_getarea_size - Read an area of data. The kdb equivalent of + * copy_from_user, with kdb messages for invalid addresses. + * Inputs: + * res Pointer to the area to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. 
+ */ +int kdb_getarea_size(void *res, unsigned long addr, size_t size) +{ + int ret = probe_kernel_read((char *)res, (char *)addr, size); + if (ret) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + ret = KDB_BADADDR; + } else { + KDB_STATE_CLEAR(SUPPRESS); + } + return ret; +} + +/* + * kdb_putarea_size - Write an area of data. The kdb equivalent of + * copy_to_user, with kdb messages for invalid addresses. + * Inputs: + * addr Address of the area to write to. + * res Pointer to the area holding the data. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_putarea_size(unsigned long addr, void *res, size_t size) +{ + int ret = probe_kernel_read((char *)addr, (char *)res, size); + if (ret) { + if (!KDB_STATE(SUPPRESS)) { + kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr); + KDB_STATE_SET(SUPPRESS); + } + ret = KDB_BADADDR; + } else { + KDB_STATE_CLEAR(SUPPRESS); + } + return ret; +} + +/* + * kdb_getphys - Read data from a physical address. Validate the + * address is in range, use kmap_atomic() to get data + * similar to kdb_getarea() - but for phys addresses + * Inputs: + * res Pointer to the word to receive the result + * addr Physical address of the area to copy + * size Size of the area + * Returns: + * 0 for success, < 0 for error. + */ +static int kdb_getphys(void *res, unsigned long addr, size_t size) +{ + unsigned long pfn; + void *vaddr; + struct page *page; + + pfn = (addr >> PAGE_SHIFT); + if (!pfn_valid(pfn)) + return 1; + page = pfn_to_page(pfn); + vaddr = kmap_atomic(page, KM_KDB); + memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size); + kunmap_atomic(vaddr, KM_KDB); + + return 0; +} + +/* + * kdb_getphysword + * Inputs: + * word Pointer to the word to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. 
+ */ +int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + *word = 0; /* Default value if addr or size is invalid */ + + switch (size) { + case 1: + diag = kdb_getphys(&w1, addr, sizeof(w1)); + if (!diag) + *word = w1; + break; + case 2: + diag = kdb_getphys(&w2, addr, sizeof(w2)); + if (!diag) + *word = w2; + break; + case 4: + diag = kdb_getphys(&w4, addr, sizeof(w4)); + if (!diag) + *word = w4; + break; + case 8: + if (size <= sizeof(*word)) { + diag = kdb_getphys(&w8, addr, sizeof(w8)); + if (!diag) + *word = w8; + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_getphysword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats + * data as numbers. + * Inputs: + * word Pointer to the word to receive the result. + * addr Address of the area to copy. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. + */ +int kdb_getword(unsigned long *word, unsigned long addr, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + *word = 0; /* Default value if addr or size is invalid */ + switch (size) { + case 1: + diag = kdb_getarea(w1, addr); + if (!diag) + *word = w1; + break; + case 2: + diag = kdb_getarea(w2, addr); + if (!diag) + *word = w2; + break; + case 4: + diag = kdb_getarea(w4, addr); + if (!diag) + *word = w4; + break; + case 8: + if (size <= sizeof(*word)) { + diag = kdb_getarea(w8, addr); + if (!diag) + *word = w8; + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_getword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_putword - Write a binary value. Unlike kdb_putarea, this + * treats data as numbers. + * Inputs: + * addr Address of the area to write to.. + * word The value to set. + * size Size of the area. + * Returns: + * 0 for success, < 0 for error. 
+ */ +int kdb_putword(unsigned long addr, unsigned long word, size_t size) +{ + int diag; + __u8 w1; + __u16 w2; + __u32 w4; + __u64 w8; + switch (size) { + case 1: + w1 = word; + diag = kdb_putarea(addr, w1); + break; + case 2: + w2 = word; + diag = kdb_putarea(addr, w2); + break; + case 4: + w4 = word; + diag = kdb_putarea(addr, w4); + break; + case 8: + if (size <= sizeof(word)) { + w8 = word; + diag = kdb_putarea(addr, w8); + break; + } + /* drop through */ + default: + diag = KDB_BADWIDTH; + kdb_printf("kdb_putword: bad width %ld\n", (long) size); + } + return diag; +} + +/* + * kdb_task_state_string - Convert a string containing any of the + * letters DRSTCZEUIMA to a mask for the process state field and + * return the value. If no argument is supplied, return the mask + * that corresponds to environment variable PS, DRSTCZEU by + * default. + * Inputs: + * s String to convert + * Returns: + * Mask for process state. + * Notes: + * The mask folds data from several sources into a single long value, so + * be careful not to overlap the bits. TASK_* bits are in the LSB, + * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there + * is no overlap between TASK_* and EXIT_* but that may not always be + * true, so EXIT_* bits are shifted left 16 bits before being stored in + * the mask. 
+ */ + +/* unrunnable is < 0 */ +#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1)) +#define RUNNING (1UL << (8*sizeof(unsigned long) - 2)) +#define IDLE (1UL << (8*sizeof(unsigned long) - 3)) +#define DAEMON (1UL << (8*sizeof(unsigned long) - 4)) + +unsigned long kdb_task_state_string(const char *s) +{ + long res = 0; + if (!s) { + s = kdbgetenv("PS"); + if (!s) + s = "DRSTCZEU"; /* default value for ps */ + } + while (*s) { + switch (*s) { + case 'D': + res |= TASK_UNINTERRUPTIBLE; + break; + case 'R': + res |= RUNNING; + break; + case 'S': + res |= TASK_INTERRUPTIBLE; + break; + case 'T': + res |= TASK_STOPPED; + break; + case 'C': + res |= TASK_TRACED; + break; + case 'Z': + res |= EXIT_ZOMBIE << 16; + break; + case 'E': + res |= EXIT_DEAD << 16; + break; + case 'U': + res |= UNRUNNABLE; + break; + case 'I': + res |= IDLE; + break; + case 'M': + res |= DAEMON; + break; + case 'A': + res = ~0UL; + break; + default: + kdb_printf("%s: unknown flag '%c' ignored\n", + __func__, *s); + break; + } + ++s; + } + return res; +} + +/* + * kdb_task_state_char - Return the character that represents the task state. + * Inputs: + * p struct task for the process + * Returns: + * One character to represent the task state. + */ +char kdb_task_state_char (const struct task_struct *p) +{ + int cpu; + char state; + unsigned long tmp; + + if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long))) + return 'E'; + + cpu = kdb_process_cpu(p); + state = (p->state == 0) ? 'R' : + (p->state < 0) ? 'U' : + (p->state & TASK_UNINTERRUPTIBLE) ? 'D' : + (p->state & TASK_STOPPED) ? 'T' : + (p->state & TASK_TRACED) ? 'C' : + (p->exit_state & EXIT_ZOMBIE) ? 'Z' : + (p->exit_state & EXIT_DEAD) ? 'E' : + (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; + if (p->pid == 0) { + /* Idle task. Is it really idle, apart from the kdb + * interrupt? 
*/ + if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { + if (cpu != kdb_initial_cpu) + state = 'I'; /* idle task */ + } + } else if (!p->mm && state == 'S') { + state = 'M'; /* sleeping system daemon */ + } + return state; +} + +/* + * kdb_task_state - Return true if a process has the desired state + * given by the mask. + * Inputs: + * p struct task for the process + * mask mask from kdb_task_state_string to select processes + * Returns: + * True if the process matches at least one criteria defined by the mask. + */ +unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask) +{ + char state[] = { kdb_task_state_char(p), '\0' }; + return (mask & kdb_task_state_string(state)) != 0; +} + +/* + * kdb_print_nameval - Print a name and its value, converting the + * value to a symbol lookup if possible. + * Inputs: + * name field name to print + * val value of field + */ +void kdb_print_nameval(const char *name, unsigned long val) +{ + kdb_symtab_t symtab; + kdb_printf(" %-11.11s ", name); + if (kdbnearsym(val, &symtab)) + kdb_symbol_print(val, &symtab, + KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE); + else + kdb_printf("0x%lx\n", val); +} + +/* Last ditch allocator for debugging, so we can still debug even when + * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned + * for space usage, not for speed. One smallish memory pool, the free + * chain is always in ascending address order to allow coalescing, + * allocations are done in brute force best fit. + */ + +struct debug_alloc_header { + u32 next; /* offset of next header from start of pool */ + u32 size; + void *caller; +}; + +/* The memory returned by this allocator must be aligned, which means + * so must the header size. Do not assume that sizeof(struct + * debug_alloc_header) is a multiple of the alignment, explicitly + * calculate the overhead of this header, including the alignment. + * The rest of this code must not use sizeof() on any header or + * pointer to a header. 
+ */ +#define dah_align 8 +#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align) + +static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */ +static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned; +static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max; + +/* Locking is awkward. The debug code is called from all contexts, + * including non maskable interrupts. A normal spinlock is not safe + * in NMI context. Try to get the debug allocator lock, if it cannot + * be obtained after a second then give up. If the lock could not be + * previously obtained on this cpu then only try once. + * + * sparse has no annotation for "this function _sometimes_ acquires a + * lock", so fudge the acquire/release notation. + */ +static DEFINE_SPINLOCK(dap_lock); +static int get_dap_lock(void) + __acquires(dap_lock) +{ + static int dap_locked = -1; + int count; + if (dap_locked == smp_processor_id()) + count = 1; + else + count = 1000; + while (1) { + if (spin_trylock(&dap_lock)) { + dap_locked = -1; + return 1; + } + if (!count--) + break; + udelay(1000); + } + dap_locked = smp_processor_id(); + __acquire(dap_lock); + return 0; +} + +void *debug_kmalloc(size_t size, gfp_t flags) +{ + unsigned int rem, h_offset; + struct debug_alloc_header *best, *bestprev, *prev, *h; + void *p = NULL; + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return NULL; + } + h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); + if (dah_first_call) { + h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead; + dah_first_call = 0; + } + size = ALIGN(size, dah_align); + prev = best = bestprev = NULL; + while (1) { + if (h->size >= size && (!best || h->size < best->size)) { + best = h; + bestprev = prev; + if (h->size == size) + break; + } + if (!h->next) + break; + prev = h; + h = (struct debug_alloc_header *)(debug_alloc_pool + h->next); + } + if (!best) + goto out; + rem = best->size - size; + /* The pool 
must always contain at least one header */ + if (best->next == 0 && bestprev == NULL && rem < dah_overhead) + goto out; + if (rem >= dah_overhead) { + best->size = size; + h_offset = ((char *)best - debug_alloc_pool) + + dah_overhead + best->size; + h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset); + h->size = rem - dah_overhead; + h->next = best->next; + } else + h_offset = best->next; + best->caller = __builtin_return_address(0); + dah_used += best->size; + dah_used_max = max(dah_used, dah_used_max); + if (bestprev) + bestprev->next = h_offset; + else + dah_first = h_offset; + p = (char *)best + dah_overhead; + memset(p, POISON_INUSE, best->size - 1); + *((char *)p + best->size - 1) = POISON_END; +out: + spin_unlock(&dap_lock); + return p; +} + +void debug_kfree(void *p) +{ + struct debug_alloc_header *h; + unsigned int h_offset; + if (!p) + return; + if ((char *)p < debug_alloc_pool || + (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) { + kfree(p); + return; + } + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return; /* memory leak, cannot be helped */ + } + h = (struct debug_alloc_header *)((char *)p - dah_overhead); + memset(p, POISON_FREE, h->size - 1); + *((char *)p + h->size - 1) = POISON_END; + h->caller = NULL; + dah_used -= h->size; + h_offset = (char *)h - debug_alloc_pool; + if (h_offset < dah_first) { + h->next = dah_first; + dah_first = h_offset; + } else { + struct debug_alloc_header *prev; + unsigned int prev_offset; + prev = (struct debug_alloc_header *)(debug_alloc_pool + + dah_first); + while (1) { + if (!prev->next || prev->next > h_offset) + break; + prev = (struct debug_alloc_header *) + (debug_alloc_pool + prev->next); + } + prev_offset = (char *)prev - debug_alloc_pool; + if (prev_offset + dah_overhead + prev->size == h_offset) { + prev->size += dah_overhead + h->size; + memset(h, POISON_FREE, dah_overhead - 1); + *((char *)h + dah_overhead - 1) = POISON_END; + h = prev; + 
h_offset = prev_offset; + } else { + h->next = prev->next; + prev->next = h_offset; + } + } + if (h_offset + dah_overhead + h->size == h->next) { + struct debug_alloc_header *next; + next = (struct debug_alloc_header *) + (debug_alloc_pool + h->next); + h->size += dah_overhead + next->size; + h->next = next->next; + memset(next, POISON_FREE, dah_overhead - 1); + *((char *)next + dah_overhead - 1) = POISON_END; + } + spin_unlock(&dap_lock); +} + +void debug_kusage(void) +{ + struct debug_alloc_header *h_free, *h_used; +#ifdef CONFIG_IA64 + /* FIXME: using dah for ia64 unwind always results in a memory leak. + * Fix that memory leak first, then set debug_kusage_one_time = 1 for + * all architectures. + */ + static int debug_kusage_one_time; +#else + static int debug_kusage_one_time = 1; +#endif + if (!get_dap_lock()) { + __release(dap_lock); /* we never actually got it */ + return; + } + h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first); + if (dah_first == 0 && + (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead || + dah_first_call)) + goto out; + if (!debug_kusage_one_time) + goto out; + debug_kusage_one_time = 0; + kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n", + __func__, dah_first); + if (dah_first) { + h_used = (struct debug_alloc_header *)debug_alloc_pool; + kdb_printf("%s: h_used %p size %d\n", __func__, h_used, + h_used->size); + } + do { + h_used = (struct debug_alloc_header *) + ((char *)h_free + dah_overhead + h_free->size); + kdb_printf("%s: h_used %p size %d caller %p\n", + __func__, h_used, h_used->size, h_used->caller); + h_free = (struct debug_alloc_header *) + (debug_alloc_pool + h_free->next); + } while (h_free->next); + h_used = (struct debug_alloc_header *) + ((char *)h_free + dah_overhead + h_free->size); + if ((char *)h_used - debug_alloc_pool != + sizeof(debug_alloc_pool_aligned)) + kdb_printf("%s: h_used %p size %d caller %p\n", + __func__, h_used, h_used->size, h_used->caller); +out: + 
spin_unlock(&dap_lock); +} + +/* Maintain a small stack of kdb_flags to allow recursion without disturbing + * the global kdb state. + */ + +static int kdb_flags_stack[4], kdb_flags_index; + +void kdb_save_flags(void) +{ + BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack)); + kdb_flags_stack[kdb_flags_index++] = kdb_flags; +} + +void kdb_restore_flags(void) +{ + BUG_ON(kdb_flags_index <= 0); + kdb_flags = kdb_flags_stack[--kdb_flags_index]; +} diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 00000000..ead9b610 --- /dev/null +++ b/kernel/delayacct.c @@ -0,0 +1,184 @@ +/* delayacct.c - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ */ + +#include +#include +#include +#include +#include +#include + +int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */ +struct kmem_cache *delayacct_cache; + +static int __init delayacct_setup_disable(char *str) +{ + delayacct_on = 0; + return 1; +} +__setup("nodelayacct", delayacct_setup_disable); + +void delayacct_init(void) +{ + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); + delayacct_tsk_init(&init_task); +} + +void __delayacct_tsk_init(struct task_struct *tsk) +{ + tsk->delays = kmem_cache_zalloc(delayacct_cache, GFP_KERNEL); + if (tsk->delays) + spin_lock_init(&tsk->delays->lock); +} + +/* + * Start accounting for a delay statistic using + * its starting timestamp (@start) + */ + +static inline void delayacct_start(struct timespec *start) +{ + do_posix_clock_monotonic_gettime(start); +} + +/* + * Finish delay accounting for a statistic using + * its timestamps (@start, @end), accumalator (@total) and @count + */ + +static void delayacct_end(struct timespec *start, struct timespec *end, + u64 *total, u32 *count) +{ + struct timespec ts; + s64 ns; + unsigned long flags; + + do_posix_clock_monotonic_gettime(end); + ts = timespec_sub(*end, *start); + ns = timespec_to_ns(&ts); + if (ns < 0) + return; + + spin_lock_irqsave(¤t->delays->lock, flags); + *total += ns; + (*count)++; + spin_unlock_irqrestore(¤t->delays->lock, flags); +} + +void __delayacct_blkio_start(void) +{ + delayacct_start(¤t->delays->blkio_start); +} + +void __delayacct_blkio_end(void) +{ + if (current->delays->flags & DELAYACCT_PF_SWAPIN) + /* Swapin block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); + else /* Other block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->blkio_delay, + ¤t->delays->blkio_count); +} + +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +{ + s64 tmp; + unsigned long t1; + unsigned long long 
t2, t3; + unsigned long flags; + struct timespec ts; + + /* Though tsk->delays accessed later, early exit avoids + * unnecessary returning of other data + */ + if (!tsk->delays) + goto done; + + tmp = (s64)d->cpu_run_real_total; + cputime_to_timespec(tsk->utime + tsk->stime, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; + + tmp = (s64)d->cpu_scaled_run_real_total; + cputime_to_timespec(tsk->utimescaled + tsk->stimescaled, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_scaled_run_real_total = + (tmp < (s64)d->cpu_scaled_run_real_total) ? 0 : tmp; + + /* + * No locking available for sched_info (and too expensive to add one) + * Mitigate by taking snapshot of values + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; + t3 = tsk->se.sum_exec_runtime; + + d->cpu_count += t1; + + tmp = (s64)d->cpu_delay_total + t2; + d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; + + tmp = (s64)d->cpu_run_virtual_total + t3; + d->cpu_run_virtual_total = + (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ + + spin_lock_irqsave(&tsk->delays->lock, flags); + tmp = d->blkio_delay_total + tsk->delays->blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; + tmp = d->swapin_delay_total + tsk->delays->swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + tmp = d->freepages_delay_total + tsk->delays->freepages_delay; + d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 
0 : tmp; + d->blkio_count += tsk->delays->blkio_count; + d->swapin_count += tsk->delays->swapin_count; + d->freepages_count += tsk->delays->freepages_count; + spin_unlock_irqrestore(&tsk->delays->lock, flags); + +done: + return 0; +} + +__u64 __delayacct_blkio_ticks(struct task_struct *tsk) +{ + __u64 ret; + unsigned long flags; + + spin_lock_irqsave(&tsk->delays->lock, flags); + ret = nsec_to_clock_t(tsk->delays->blkio_delay + + tsk->delays->swapin_delay); + spin_unlock_irqrestore(&tsk->delays->lock, flags); + return ret; +} + +void __delayacct_freepages_start(void) +{ + delayacct_start(¤t->delays->freepages_start); +} + +void __delayacct_freepages_end(void) +{ + delayacct_end(¤t->delays->freepages_start, + ¤t->delays->freepages_end, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); +} + diff --git a/kernel/dma.c b/kernel/dma.c new file mode 100644 index 00000000..f903189c --- /dev/null +++ b/kernel/dma.c @@ -0,0 +1,161 @@ +/* + * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. + * + * Written by Hennus Bergman, 1992. + * + * 1994/12/26: Changes by Alex Nash to fix a minor bug in /proc/dma. + * In the previous version the reported device could end up being wrong, + * if a device requested a DMA channel that was already in use. + * [It also happened to remove the sizeof(char *) == sizeof(int) + * assumption introduced because of those /proc/dma patches. -- Hennus] + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +/* A note on resource allocation: + * + * All drivers needing DMA channels, should allocate and release them + * through the public routines `request_dma()' and `free_dma()'. + * + * In order to avoid problems, all processes should allocate resources in + * the same sequence and release them in the reverse order. + * + * So, when allocating DMAs and IRQs, first allocate the IRQ, then the DMA. 
+ * When releasing them, first release the DMA, then release the IRQ. + * If you don't, you may cause allocation requests to fail unnecessarily. + * This doesn't really matter now, but it will once we get real semaphores + * in the kernel. + */ + + +DEFINE_SPINLOCK(dma_spin_lock); + +/* + * If our port doesn't define this it has no PC like DMA + */ + +#ifdef MAX_DMA_CHANNELS + + +/* Channel n is busy iff dma_chan_busy[n].lock != 0. + * DMA0 used to be reserved for DRAM refresh, but apparently not any more... + * DMA4 is reserved for cascading. + */ + +struct dma_chan { + int lock; + const char *device_id; +}; + +static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = { + [4] = { 1, "cascade" }, +}; + + +/** + * request_dma - request and reserve a system DMA channel + * @dmanr: DMA channel number + * @device_id: reserving device ID string, used in /proc/dma + */ +int request_dma(unsigned int dmanr, const char * device_id) +{ + if (dmanr >= MAX_DMA_CHANNELS) + return -EINVAL; + + if (xchg(&dma_chan_busy[dmanr].lock, 1) != 0) + return -EBUSY; + + dma_chan_busy[dmanr].device_id = device_id; + + /* old flag was 0, now contains 1 to indicate busy */ + return 0; +} /* request_dma */ + +/** + * free_dma - free a reserved system DMA channel + * @dmanr: DMA channel number + */ +void free_dma(unsigned int dmanr) +{ + if (dmanr >= MAX_DMA_CHANNELS) { + printk(KERN_WARNING "Trying to free DMA%d\n", dmanr); + return; + } + + if (xchg(&dma_chan_busy[dmanr].lock, 0) == 0) { + printk(KERN_WARNING "Trying to free free DMA%d\n", dmanr); + return; + } + +} /* free_dma */ + +#else + +int request_dma(unsigned int dmanr, const char *device_id) +{ + return -EINVAL; +} + +void free_dma(unsigned int dmanr) +{ +} + +#endif + +#ifdef CONFIG_PROC_FS + +#ifdef MAX_DMA_CHANNELS +static int proc_dma_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0 ; i < MAX_DMA_CHANNELS ; i++) { + if (dma_chan_busy[i].lock) { + seq_printf(m, "%2d: %s\n", i, + dma_chan_busy[i].device_id); + } + } + 
return 0; +} +#else +static int proc_dma_show(struct seq_file *m, void *v) +{ + seq_puts(m, "No DMA\n"); + return 0; +} +#endif /* MAX_DMA_CHANNELS */ + +static int proc_dma_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_dma_show, NULL); +} + +static const struct file_operations proc_dma_operations = { + .open = proc_dma_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_dma_init(void) +{ + proc_create("dma", 0, NULL, &proc_dma_operations); + return 0; +} + +__initcall(proc_dma_init); +#endif + +EXPORT_SYMBOL(request_dma); +EXPORT_SYMBOL(free_dma); +EXPORT_SYMBOL(dma_spin_lock); diff --git a/kernel/elfcore.c b/kernel/elfcore.c new file mode 100644 index 00000000..ff915efe --- /dev/null +++ b/kernel/elfcore.c @@ -0,0 +1,28 @@ +#include +#include +#include + +#include + + +Elf_Half __weak elf_core_extra_phdrs(void) +{ + return 0; +} + +int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + return 1; +} + +int __weak elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + return 1; +} + +size_t __weak elf_core_extra_data_size(void) +{ + return 0; +} diff --git a/kernel/events/Makefile b/kernel/events/Makefile new file mode 100644 index 00000000..1ce23d3d --- /dev/null +++ b/kernel/events/Makefile @@ -0,0 +1,6 @@ +ifdef CONFIG_FUNCTION_TRACER +CFLAGS_REMOVE_core.o = -pg +endif + +obj-y := core.o +obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o diff --git a/kernel/events/core.c b/kernel/events/core.c new file mode 100644 index 00000000..32a61513 --- /dev/null +++ b/kernel/events/core.c @@ -0,0 +1,7430 @@ +/* + * Performance events core code: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. 
+ * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +struct remote_function_call { + struct task_struct *p; + int (*func)(void *info); + void *info; + int ret; +}; + +static void remote_function(void *data) +{ + struct remote_function_call *tfc = data; + struct task_struct *p = tfc->p; + + if (p) { + tfc->ret = -EAGAIN; + if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + return; + } + + tfc->ret = tfc->func(tfc->info); +} + +/** + * task_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + * + * returns: @func return value, or + * -ESRCH - when the process isn't running + * -EAGAIN - when the process moved away + */ +static int +task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = p, + .func = func, + .info = info, + .ret = -ESRCH, /* No such (running) process */ + }; + + if (task_curr(p)) + smp_call_function_single(task_cpu(p), remote_function, &data, 1); + + return data.ret; +} + +/** + * cpu_function_call - call a function on the cpu + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func on the remote cpu. 
+ * + * returns: @func return value or -ENXIO when the cpu is offline + */ +static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +{ + struct remote_function_call data = { + .p = NULL, + .func = func, + .info = info, + .ret = -ENXIO, /* No such CPU */ + }; + + smp_call_function_single(cpu, remote_function, &data, 1); + + return data.ret; +} + +#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ + PERF_FLAG_FD_OUTPUT |\ + PERF_FLAG_PID_CGROUP) + +enum event_type_t { + EVENT_FLEXIBLE = 0x1, + EVENT_PINNED = 0x2, + EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, +}; + +/* + * perf_sched_events : >0 events exist + * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu + */ +struct jump_label_key perf_sched_events __read_mostly; +static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); + +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +/* Minimum for 512 kiB + 1 user control page */ +int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ + +/* + * max perf event sample rate + */ +#define DEFAULT_MAX_SAMPLE_RATE 100000 +int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; +static int max_samples_per_tick __read_mostly = + DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); + +int perf_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write) + return ret; + + max_samples_per_tick = 
DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); + + return 0; +} + +static atomic64_t perf_event_id; + +static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, + enum event_type_t event_type); + +static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, + enum event_type_t event_type, + struct task_struct *task); + +static void update_context_time(struct perf_event_context *ctx); +static u64 perf_event_time(struct perf_event *event); + +void __weak perf_event_print_debug(void) { } + +extern __weak const char *perf_pmu_name(void) +{ + return "pmu"; +} + +static inline u64 perf_clock(void) +{ + return local_clock(); +} + +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +#ifdef CONFIG_CGROUP_PERF + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. 
+ */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_subsys_state(task, perf_subsys_id), + struct perf_cgroup, css); +} + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + return !event->cgrp || event->cgrp == cpuctx->cgrp; +} + +static inline void perf_get_cgroup(struct perf_event *event) +{ + css_get(&event->cgrp->css); +} + +static inline void perf_put_cgroup(struct perf_event *event) +{ + css_put(&event->cgrp->css); +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{ + perf_put_cgroup(event); + event->cgrp = NULL; +} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return event->cgrp != NULL; +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + struct perf_cgroup_info *t; + + t = per_cpu_ptr(event->cgrp->info, event->cpu); + return t->time; +} + +static inline void __update_cgrp_time(struct perf_cgroup *cgrp) +{ + struct perf_cgroup_info *info; + u64 now; + + now = perf_clock(); + + info = this_cpu_ptr(cgrp->info); + + info->time += now - info->timestamp; + info->timestamp = now; +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ + struct perf_cgroup *cgrp_out = cpuctx->cgrp; + if (cgrp_out) + __update_cgrp_time(cgrp_out); +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ + struct perf_cgroup *cgrp; + + /* + * ensure we access cgroup data only when needed and + * when we know the cgroup is pinned (css_get) + */ + if (!is_cgroup_event(event)) + return; + + cgrp = perf_cgroup_from_task(current); + /* + * Do not update time when cgroup is not active + */ + if (cgrp == event->cgrp) + __update_cgrp_time(event->cgrp); +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ + struct 
perf_cgroup *cgrp; + struct perf_cgroup_info *info; + + /* + * ctx->lock held by caller + * ensure we do not access cgroup data + * unless we have the cgroup pinned (css_get) + */ + if (!task || !ctx->nr_cgroups) + return; + + cgrp = perf_cgroup_from_task(task); + info = this_cpu_ptr(cgrp->info); + info->timestamp = ctx->timestamp; +} + +#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */ +#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */ + +/* + * reschedule events based on the cgroup constraint of task. + * + * mode SWOUT : schedule out everything + * mode SWIN : schedule in based on cgroup for next + */ +void perf_cgroup_switch(struct task_struct *task, int mode) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + /* + * disable interrupts to avoid geting nr_cgroup + * changes via __perf_event_disable(). Also + * avoids preemption. + */ + local_irq_save(flags); + + /* + * we reschedule only in the presence of cgroup + * constrained events. + */ + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_pmu_disable(cpuctx->ctx.pmu); + + /* + * perf_cgroup_events says at least one + * context on this CPU has cgroup events. + * + * ctx->nr_cgroups reports the number of cgroup + * events for a context. 
+ */ + if (cpuctx->ctx.nr_cgroups > 0) { + + if (mode & PERF_CGROUP_SWOUT) { + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* + * must not be done before ctxswout due + * to event_filter_match() in event_sched_out() + */ + cpuctx->cgrp = NULL; + } + + if (mode & PERF_CGROUP_SWIN) { + WARN_ON_ONCE(cpuctx->cgrp); + /* set cgrp before ctxsw in to + * allow event_filter_match() to not + * have to pass task around + */ + cpuctx->cgrp = perf_cgroup_from_task(task); + cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); + } + } + + perf_pmu_enable(cpuctx->ctx.pmu); + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWOUT); +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ + perf_cgroup_switch(task, PERF_CGROUP_SWIN); +} + +static inline int perf_cgroup_connect(int fd, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + struct perf_cgroup *cgrp; + struct cgroup_subsys_state *css; + struct file *file; + int ret = 0, fput_needed; + + file = fget_light(fd, &fput_needed); + if (!file) + return -EBADF; + + css = cgroup_css_from_dir(file, perf_subsys_id); + if (IS_ERR(css)) { + ret = PTR_ERR(css); + goto out; + } + + cgrp = container_of(css, struct perf_cgroup, css); + event->cgrp = cgrp; + + /* must be done before we fput() the file */ + perf_get_cgroup(event); + + /* + * all events in a group must monitor + * the same cgroup because a task belongs + * to only one perf cgroup at a time + */ + if (group_leader && group_leader->cgrp != cgrp) { + perf_detach_cgroup(event); + ret = -EINVAL; + } +out: + fput_light(file, fput_needed); + return ret; +} + +static inline void +perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ + struct perf_cgroup_info *t; + t = per_cpu_ptr(event->cgrp->info, event->cpu); + event->shadow_ctx_time = now - t->timestamp; +} + +static inline void 
+perf_cgroup_defer_enabled(struct perf_event *event) +{ + /* + * when the current task's perf cgroup does not match + * the event's, we need to remember to call the + * perf_mark_enable() function the first time a task with + * a matching perf cgroup is scheduled in. + */ + if (is_cgroup_event(event) && !perf_cgroup_match(event)) + event->cgrp_defer_enabled = 1; +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + if (!event->cgrp_defer_enabled) + return; + + event->cgrp_defer_enabled = 0; + + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + sub->cgrp_defer_enabled = 0; + } + } +} +#else /* !CONFIG_CGROUP_PERF */ + +static inline bool +perf_cgroup_match(struct perf_event *event) +{ + return true; +} + +static inline void perf_detach_cgroup(struct perf_event *event) +{} + +static inline int is_cgroup_event(struct perf_event *event) +{ + return 0; +} + +static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event) +{ + return 0; +} + +static inline void update_cgrp_time_from_event(struct perf_event *event) +{ +} + +static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) +{ +} + +static inline void perf_cgroup_sched_out(struct task_struct *task) +{ +} + +static inline void perf_cgroup_sched_in(struct task_struct *task) +{ +} + +static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event, + struct perf_event_attr *attr, + struct perf_event *group_leader) +{ + return -EINVAL; +} + +static inline void +perf_cgroup_set_timestamp(struct task_struct *task, + struct perf_event_context *ctx) +{ +} + +void +perf_cgroup_switch(struct task_struct *task, struct task_struct *next) +{ +} + +static inline void 
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +{ +} + +static inline u64 perf_cgroup_event_time(struct perf_event *event) +{ + return 0; +} + +static inline void +perf_cgroup_defer_enabled(struct perf_event *event) +{ +} + +static inline void +perf_cgroup_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ +} +#endif + +void perf_pmu_disable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!(*count)++) + pmu->pmu_disable(pmu); +} + +void perf_pmu_enable(struct pmu *pmu) +{ + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!--(*count)) + pmu->pmu_enable(pmu); +} + +static DEFINE_PER_CPU(struct list_head, rotation_list); + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ +static void perf_pmu_rotate_start(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct list_head *head = &__get_cpu_var(rotation_list); + + WARN_ON(!irqs_disabled()); + + if (list_empty(&cpuctx->rotation_list)) + list_add(&cpuctx->rotation_list, head); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + kfree_rcu(ctx, rcu_head); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 
perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); +retry: + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + raw_spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { + raw_spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + raw_spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. 
+ */ +static struct perf_event_context * +perf_pin_task_context(struct task_struct *task, int ctxn) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, ctxn, &flags); + if (ctx) { + ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +static u64 perf_event_time(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + + return ctx ? ctx->time : 0; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + /* + * in cgroup mode, time_enabled represents + * the time the event was enabled AND active + * tasks were in the monitored cgroup. This is + * independent of the activity of the context as + * there may be a mix of cgroup and non-cgroup events. + * + * That is why we treat cgroup events differently + * here. 
+ */ + if (is_cgroup_event(event)) + run_end = perf_event_time(event); + else if (ctx->is_active) + run_end = ctx->time; + else + run_end = event->tstamp_stopped; + + event->total_time_enabled = run_end - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = perf_event_time(event); + + event->total_time_running = run_end - event->tstamp_running; + +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +static struct list_head * +ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) +{ + if (event->attr.pinned) + return &ctx->pinned_groups; + else + return &ctx->flexible_groups; +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; + + /* + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. 
+ */ + if (event->group_leader == event) { + struct list_head *list; + + if (is_software_event(event)) + event->group_flags |= PERF_GROUP_SOFTWARE; + + list = ctx_group_list(event, ctx); + list_add_tail(&event->group_entry, list); + } + + if (is_cgroup_event(event)) + ctx->nr_cgroups++; + + list_add_rcu(&event->event_entry, &ctx->event_list); + if (!ctx->nr_events) + perf_pmu_rotate_start(ctx->pmu); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Called at perf_event creation and when events are attached/detached from a + * group. + */ +static void perf_event__read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + event->read_size = size; +} + +static void perf_event__header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + perf_event__read_size(event); + + if (sample_type & PERF_SAMPLE_IP) + size += sizeof(data->ip); + + if (sample_type & PERF_SAMPLE_ADDR) + size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_PERIOD) + size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + size += event->read_size; + + event->header_size = size; +} + +static void perf_event__id_header_size(struct perf_event *event) +{ + struct perf_sample_data *data; + u64 sample_type = event->attr.sample_type; + u16 size = 0; + + if (sample_type & PERF_SAMPLE_TID) + size += sizeof(data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + size += sizeof(data->time); + + if (sample_type & PERF_SAMPLE_ID) + 
size += sizeof(data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + size += sizeof(data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + size += sizeof(data->cpu_entry); + + event->id_header_size = size; +} + +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader, *pos; + + /* + * We can have double attach due to group movement in perf_event_open. + */ + if (event->attach_state & PERF_ATTACH_GROUP) + return; + + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + + perf_event__header_size(group_leader); + + list_for_each_entry(pos, &group_leader->sibling_list, group_entry) + perf_event__header_size(pos); +} + +/* + * Remove an event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx; + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) + return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + + if (is_cgroup_event(event)) { + ctx->nr_cgroups--; + cpuctx = __get_cpu_context(ctx); + /* + * if there are no more cgroup events + * then clear cgrp to avoid stale pointer + * in update_cgrp_time_from_cpuctx() + */ + if (!ctx->nr_cgroups) + cpuctx->cgrp = NULL; + } + + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_rcu(&event->event_entry); + + if (event->group_leader == event) + list_del_init(&event->group_entry); + + update_group_times(event); + + /* + * If event was in error state, then keep it + * that way, otherwise bogus counts will be + * returned on read(). 
The only way to get out + * of error state is by explicit re-enabling + * of the event + */ + if (event->state > PERF_EVENT_STATE_OFF) + event->state = PERF_EVENT_STATE_OFF; +} + +static void perf_group_detach(struct perf_event *event) +{ + struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + goto out; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to whatever list we are on. + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + if (list) + list_move_tail(&sibling->group_entry, list); + sibling->group_leader = sibling; + + /* Inherit group flags from the previous leader */ + sibling->group_flags = event->group_flags; + } + +out: + perf_event__header_size(event->group_leader); + + list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry) + perf_event__header_size(tmp); +} + +static inline int +event_filter_match(struct perf_event *event) +{ + return (event->cpu == -1 || event->cpu == smp_processor_id()) + && perf_cgroup_match(event); +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + u64 delta; + /* + * An event which could not be activated because of + * filter mismatch still needs to have its timings + * maintained, otherwise bogus information is returned + * via read() for time_enabled, time_running: + */ + if (event->state == PERF_EVENT_STATE_INACTIVE + && 
!event_filter_match(event)) { + delta = tstamp - event->tstamp_stopped; + event->tstamp_running += delta; + event->tstamp_stopped = tstamp; + } + + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + int state = group_event->state; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static int __perf_remove_from_context(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + raw_spin_lock(&ctx->lock); + event_sched_out(event, cpuctx, ctx); + list_del_event(event, ctx); + raw_spin_unlock(&ctx->lock); + + return 0; +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. 
This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + lockdep_assert_held(&ctx->mutex); + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always successful. + */ + cpu_function_call(event->cpu, __perf_remove_from_context, event); + return; + } + +retry: + if (!task_function_call(task, __perf_remove_from_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. + */ + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since the task isn't running, its safe to remove the event, us + * holding the ctx->lock ensures the task won't get scheduled in. + */ + list_del_event(event, ctx); + raw_spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to disable a performance event + */ +static int __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + * + * Can trigger due to concurrent perf_event_context_sched_out() + * flipping contexts around. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return -EINVAL; + + raw_spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. 
+ */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Disable an event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + cpu_function_call(event->cpu, __perf_event_disable, event); + return; + } + +retry: + if (!task_function_call(task, __perf_event_disable, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + raw_spin_unlock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. 
+ */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + raw_spin_unlock_irq(&ctx->lock); +} + +static void perf_set_shadow_time(struct perf_event *event, + struct perf_event_context *ctx, + u64 tstamp) +{ + /* + * use the correct time source for the time snapshot + * + * We could get by without this by leveraging the + * fact that to get to this function, the caller + * has most likely already called update_context_time() + * and update_cgrp_time_xx() and thus both timestamps + * are identical (or very close). Given that tstamp is + * already adjusted for cgroup, we could say that: + * tstamp - ctx->timestamp + * is equivalent to + * tstamp - cgrp->timestamp. + * + * Then, in perf_output_read(), the calculation would + * work with no changes because: + * - event is guaranteed scheduled in + * - no scheduled out in between + * - thus the timestamp would be the same + * + * But this is a bit hairy. + * + * So instead, we have an explicit cgroup call to remain + * within the same time source all along. We believe it + * is cleaner and simpler to understand. + */ + if (is_cgroup_event(event)) + perf_cgroup_set_shadow_time(event, tstamp); + else + event->shadow_ctx_time = tstamp - ctx->timestamp; +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = smp_processor_id(); + + /* + * Unthrottle events, since we scheduled we might have missed several + * ticks already, also for a heavily scheduling task there is little + * guarantee it'll get a tick in a timely manner. 
+ */ + if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) { + perf_log_throttle(event, 1); + event->hw.interrupts = 0; + } + + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->add(event, PERF_EF_START)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += tstamp - event->tstamp_stopped; + + perf_set_shadow_time(event, ctx, tstamp); + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event, *partial_group = NULL; + struct pmu *pmu = group_event->pmu; + u64 now = ctx->time; + bool simulate = false; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + pmu->start_txn(pmu); + + if (event_sched_in(group_event, cpuctx, ctx)) { + pmu->cancel_txn(pmu); + return -EAGAIN; + } + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx)) { + partial_group = event; + goto group_error; + } + } + + if (!pmu->commit_txn(pmu)) + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + * The events up to the failed event are scheduled out normally, + * tstamp_stopped will be updated. + * + * The failed events and the remaining siblings need to have + * their timings updated as if they had gone thru event_sched_in() + * and event_sched_out(). This is required to get consistent timings + * across the group. 
This also takes care of the case where the group + * could never be scheduled by ensuring tstamp_stopped is set to mark + * the time the event was actually stopped, such that time delta + * calculation in update_event_times() is correct. + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + simulate = true; + + if (simulate) { + event->tstamp_running += now - event->tstamp_stopped; + event->tstamp_stopped = now; + } else { + event_sched_out(event, cpuctx, ctx); + } + } + event_sched_out(group_event, cpuctx, ctx); + + pmu->cancel_txn(pmu); + + return -EAGAIN; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (event->group_flags & PERF_GROUP_SOFTWARE) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. 
+ */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + u64 tstamp = perf_event_time(event); + + list_add_event(event, ctx); + perf_group_attach(event); + event->tstamp_enabled = tstamp; + event->tstamp_running = tstamp; + event->tstamp_stopped = tstamp; +} + +static void perf_event_context_sched_in(struct perf_event_context *ctx, + struct task_struct *tsk); + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static int __perf_install_in_context(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int err; + + /* + * In case we're installing a new context to an already running task, + * could also happen before perf_event_task_sched_in() on architectures + * which do context switches with IRQs enabled. + */ + if (ctx->task && !cpuctx->task_ctx) + perf_event_context_sched_in(ctx, ctx->task); + + raw_spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + /* + * update cgrp time only if current cgrp + * matches event->cgrp. Must be done before + * calling add_event_to_ctx() + */ + update_cgrp_time_from_event(event); + + add_event_to_ctx(event, ctx); + + if (!event_filter_match(event)) + goto unlock; + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. 
+ */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + +unlock: + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + lockdep_assert_held(&ctx->mutex); + + event->ctx = ctx; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always successful. + */ + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + +retry: + if (!task_function_call(task, __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * If we failed to find a running task, but find the context active now + * that we've acquired the ctx->lock, retry. + */ + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since the task isn't running, its safe to add the event, us holding + * the ctx->lock ensures the task won't get scheduled in. + */ + add_event_to_ctx(event, ctx); + raw_spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. 
+ * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + u64 tstamp = perf_event_time(event); + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = tstamp - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = tstamp - sub->total_time_enabled; + } +} + +/* + * Cross CPU call to enable a performance event + */ +static int __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + int err; + + if (WARN_ON_ONCE(!ctx->is_active)) + return -EINVAL; + + raw_spin_lock(&ctx->lock); + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + + /* + * set current task's cgroup time reference point + */ + perf_cgroup_set_timestamp(current, ctx); + + __perf_event_mark_enabled(event, ctx); + + if (!event_filter_match(event)) { + if (is_cgroup_event(event)) + perf_cgroup_defer_enabled(event); + goto unlock; + } + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. 
+ */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + if (event == leader) + err = group_sched_in(event, cpuctx, ctx); + else + err = event_sched_in(event, cpuctx, ctx); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + +unlock: + raw_spin_unlock(&ctx->lock); + + return 0; +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + cpu_function_call(event->cpu, __perf_event_enable, event); + return; + } + + raw_spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. 
+ */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + +retry: + if (!ctx->is_active) { + __perf_event_mark_enabled(event, ctx); + goto out; + } + + raw_spin_unlock_irq(&ctx->lock); + + if (!task_function_call(task, __perf_event_enable, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { + /* + * task could have been flipped by a concurrent + * perf_event_context_sched_out() + */ + task = ctx->task; + goto retry; + } + +out: + raw_spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit || !is_sampling_event(event)) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type) +{ + struct perf_event *event; + + raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + + if (!ctx->nr_active) + goto out; + + if (event_type & EVENT_PINNED) { + list_for_each_entry(event, &ctx->pinned_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + } + + if (event_type & EVENT_FLEXIBLE) { + list_for_each_entry(event, &ctx->flexible_groups, group_entry) + group_sched_out(event, cpuctx, ctx); + } +out: + perf_pmu_enable(ctx->pmu); + raw_spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. 
+ * If the number of enabled events is the same, then the set
+ * of enabled events should be the same, because these are both
+ * inherited contexts, therefore we can't access individual events
+ * in them directly with an fd; we can only enable/disable all
+ * events via prctl, or enable/disable all events in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_event_context *ctx1,
+			 struct perf_event_context *ctx2)
+{
+	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& !ctx1->pin_count && !ctx2->pin_count;	/* both pin counts must be zero */
+}
+/*
+ * Bring @event up to date and then swap its count and its
+ * enabled/running times with those of @next_event.  Does nothing
+ * unless @event has attr.inherit_stat set.
+ */
+static void __perf_event_sync_stat(struct perf_event *event,
+				     struct perf_event *next_event)
+{
+	u64 value;
+
+	if (!event->attr.inherit_stat)
+		return;
+
+	/*
+	 * Update the event value, we cannot use perf_event_read()
+	 * because we're in the middle of a context switch and have IRQs
+	 * disabled, which upsets smp_call_function_single(), however
+	 * we know the event must be on the current CPU, therefore we
+	 * don't need to use it.
+	 */
+	switch (event->state) {
+	case PERF_EVENT_STATE_ACTIVE:
+		event->pmu->read(event);
+		/* fall-through */
+
+	case PERF_EVENT_STATE_INACTIVE:
+		update_event_times(event);
+		break;
+
+	default:
+		break;
+	}
+
+	/*
+	 * In order to keep per-task stats reliable we need to flip the event
+	 * values when we flip the contexts.
+	 */
+	value = local64_read(&next_event->count);
+	value = local64_xchg(&event->count, value);	/* returns the old event->count */
+	local64_set(&next_event->count, value);
+
+	swap(event->total_time_enabled, next_event->total_time_enabled);
+	swap(event->total_time_running, next_event->total_time_running);
+
+	/*
+	 * Since we swizzled the values, update the user visible data too.
+	 */
+	perf_event_update_userpage(event);
+	perf_event_update_userpage(next_event);
+}
+/*
+ * Step from @pos to the following element of the list it is linked
+ * into through @member.
+ */
+#define list_next_entry(pos, member) \
+	list_entry(pos->member.next, typeof(*pos), member)
+/*
+ * Walk the event lists of @ctx and @next_ctx in lockstep and sync the
+ * stats of each pair of events, stopping as soon as either list ends.
+ * Does nothing when ctx->nr_stat is zero.
+ */
+static void perf_event_sync_stat(struct perf_event_context *ctx,
+				   struct perf_event_context *next_ctx)
+{
+	struct perf_event *event, *next_event;
+
+	if (!ctx->nr_stat)
+		return;
+
+	update_context_time(ctx);
+
+	event = list_first_entry(&ctx->event_list,
+				   struct perf_event, event_entry);
+
+	next_event = list_first_entry(&next_ctx->event_list,
+					struct perf_event, event_entry);
+
+	while (&event->event_entry != &ctx->event_list &&
+	       &next_event->event_entry != &next_ctx->event_list) {
+
+		__perf_event_sync_stat(event, next_event);
+
+		event = list_next_entry(event, event_entry);
+		next_event = list_next_entry(next_event, event_entry);
+	}
+}
+/*
+ * Handle the @ctxn-th perf context of @task when @task is switched out
+ * in favour of @next.  When both tasks carry equivalent clones of the
+ * same parent context the two contexts are swapped between the tasks;
+ * otherwise the context is scheduled out.
+ */
+static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+					 struct task_struct *next)
+{
+	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
+	struct perf_event_context *next_ctx;
+	struct perf_event_context *parent;
+	struct perf_cpu_context *cpuctx;
+	int do_switch = 1;
+
+	if (likely(!ctx))
+		return;
+
+	cpuctx = __get_cpu_context(ctx);
+	if (!cpuctx->task_ctx)
+		return;
+
+	rcu_read_lock();
+	parent = rcu_dereference(ctx->parent_ctx);
+	next_ctx = next->perf_event_ctxp[ctxn];
+	if (parent && next_ctx &&
+	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+		/*
+		 * Looks like the two contexts are clones, so we might be
+		 * able to optimize the context switch.  We lock both
+		 * contexts and check that they are clones under the
+		 * lock (including re-checking that neither has been
+		 * uncloned in the meantime).  It doesn't matter which
+		 * order we take the locks because no other cpu could
+		 * be trying to lock both of these tasks.
+		 */
+		raw_spin_lock(&ctx->lock);
+		raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+		if (context_equiv(ctx, next_ctx)) {
+			/*
+			 * XXX do we need a memory barrier of sorts
+			 * wrt rcu_dereference() of perf_event_ctxp
+			 */
+			task->perf_event_ctxp[ctxn] = next_ctx;
+			next->perf_event_ctxp[ctxn] = ctx;
+			ctx->task = next;
+			next_ctx->task = task;
+			do_switch = 0;	/* contexts swapped: no sched-out needed */
+
+			perf_event_sync_stat(ctx, next_ctx);
+		}
+		raw_spin_unlock(&next_ctx->lock);
+		raw_spin_unlock(&ctx->lock);
+	}
+	rcu_read_unlock();
+
+	if (do_switch) {
+		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+		cpuctx->task_ctx = NULL;
+	}
+}
+/*
+ * Iterate @ctxn over every per-task context index
+ * (0 .. perf_nr_task_contexts - 1).
+ */
+#define for_each_task_context_nr(ctxn)					\
+	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void __perf_event_task_sched_out(struct task_struct *task,
+				 struct task_struct *next)
+{
+	int ctxn;
+
+	for_each_task_context_nr(ctxn)
+		perf_event_context_sched_out(task, ctxn, next);
+
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch out PMU state.
+	 * cgroup event are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_out(task);
+}
+/*
+ * Schedule out the events of @ctx selected by @event_type and clear
+ * this cpu's task_ctx.  Does nothing when no task context is installed
+ * on this cpu; warns once and bails out when a different one is.
+ */
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+			       enum event_type_t event_type)
+{
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+	if (!cpuctx->task_ctx)
+		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
+	ctx_sched_out(ctx, cpuctx, event_type);
+	cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Schedule out the cpu context's own events selected by @event_type.
+ *
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			      enum event_type_t event_type)
+{
+	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
+}
+/*
+ * Try to schedule in every pinned group of @ctx that is not OFF/ERROR
+ * and passes event_filter_match().  A group that is still INACTIVE
+ * afterwards is put in ERROR state.
+ */
+static void
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+		    struct perf_cpu_context *cpuctx)
+{
+	struct perf_event *event;
+
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		if (event->state <= PERF_EVENT_STATE_OFF)
+			continue;
+		if (!event_filter_match(event))
+			continue;
+
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
+		if (group_can_go_on(event, cpuctx, 1))
+			group_sched_in(event, cpuctx, ctx);
+
+		/*
+		 * If this pinned group hasn't been scheduled,
+		 * put it in error state.
+		 */
+		if (event->state == PERF_EVENT_STATE_INACTIVE) {
+			update_group_times(event);
+			event->state = PERF_EVENT_STATE_ERROR;
+		}
+	}
+}
+/*
+ * Try to schedule in the flexible groups of @ctx in list order.  Once
+ * group_sched_in() fails for a group, can_add_hw is cleared and that
+ * value is what group_can_go_on() sees for all remaining groups.
+ */
+static void
+ctx_flexible_sched_in(struct perf_event_context *ctx,
+		      struct perf_cpu_context *cpuctx)
+{
+	struct perf_event *event;
+	int can_add_hw = 1;
+
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		/* Ignore events in OFF or ERROR state */
+		if (event->state <= PERF_EVENT_STATE_OFF)
+			continue;
+		/*
+		 * Listen to the 'cpu' scheduling filter constraint
+		 * of events:
+		 */
+		if (!event_filter_match(event))
+			continue;
+
+		/* may need to reset tstamp_enabled */
+		if (is_cgroup_event(event))
+			perf_cgroup_mark_enabled(event, ctx);
+
+		if (group_can_go_on(event, cpuctx, can_add_hw)) {
+			if (group_sched_in(event, cpuctx, ctx))
+				can_add_hw = 0;	/* non-zero return: this group did not go on */
+		}
+	}
+}
+/*
+ * Mark @ctx active and, if it has events, refresh its timestamp and
+ * schedule in the pinned and/or flexible groups selected by
+ * @event_type.  Runs under ctx->lock.  @task is only handed to
+ * perf_cgroup_set_timestamp().
+ */
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+	     struct perf_cpu_context *cpuctx,
+	     enum event_type_t event_type,
+	     struct task_struct *task)
+{
+	u64 now;
+
+	raw_spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	if (likely(!ctx->nr_events))
+		goto out;
+
+	now = perf_clock();
+	ctx->timestamp = now;
+	perf_cgroup_set_timestamp(task, ctx);
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	if (event_type & EVENT_PINNED)
+		ctx_pinned_sched_in(ctx, cpuctx);
+
+	/* Then walk through the lower prio flexible groups */
+	if (event_type & EVENT_FLEXIBLE)
+		ctx_flexible_sched_in(ctx, cpuctx);
+
+out:
+	raw_spin_unlock(&ctx->lock);
+}
+/*
+ * Schedule in the cpu context's own events selected by @event_type.
+ */
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+			     enum event_type_t event_type,
+			     struct task_struct *task)
+{
+	struct perf_event_context *ctx = &cpuctx->ctx;
+
+	ctx_sched_in(ctx, cpuctx, event_type, task);
+}
+/*
+ * Schedule in the events of @ctx selected by @event_type and install
+ * @ctx as this cpu's task context.  No-op if it is already installed.
+ */
+static void task_ctx_sched_in(struct perf_event_context *ctx,
+			      enum event_type_t event_type)
+{
+	struct perf_cpu_context *cpuctx;
+
+	cpuctx = __get_cpu_context(ctx);
+	if (cpuctx->task_ctx == ctx)
+		return;
+
+	ctx_sched_in(ctx, cpuctx, event_type, NULL);
+	cpuctx->task_ctx = ctx;
+}
+/*
+ * Install @ctx as this cpu's task context on behalf of @task, with the
+ * pmu disabled for the duration.  No-op if @ctx is already installed.
+ */
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+					struct task_struct *task)
+{
+	struct perf_cpu_context *cpuctx;
+
+	cpuctx = __get_cpu_context(ctx);
+	if (cpuctx->task_ctx == ctx)
+		return;
+
+	perf_pmu_disable(ctx->pmu);
+	/*
+	 * We want to keep the following priority order:
+	 * cpu pinned (that don't need to move), task pinned,
+	 * cpu flexible, task flexible.
+	 *
+	 * Hence the cpu flexible groups are taken off first and put
+	 * back only after the task pinned groups went on.
+	 */
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+
+	cpuctx->task_ctx = ctx;
+
+	/*
+	 * Since these rotations are per-cpu, we need to ensure the
+	 * cpu-context we got scheduled on is actually rotating.
+	 */
+	perf_pmu_rotate_start(ctx->pmu);
+	perf_pmu_enable(ctx->pmu);
+}
+
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void __perf_event_task_sched_in(struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (likely(!ctx))
+			continue;
+
+		perf_event_context_sched_in(ctx, task);
+	}
+	/*
+	 * if cgroup events exist on this CPU, then we need
+	 * to check if we have to switch in PMU state.
+	 * cgroup events are system-wide mode only
+	 */
+	if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+		perf_cgroup_sched_in(task);
+}
+/*
+ * Compute the sample period that matches the event's attr.sample_freq,
+ * given that @count events were observed over @nsec nanoseconds.  The
+ * operands are shifted down as needed so that both products fit in a
+ * u64 before the final division.
+ */
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
+{
+	u64 frequency = event->attr.sample_freq;
+	u64 sec = NSEC_PER_SEC;
+	u64 divisor, dividend;
+
+	int count_fls, nsec_fls, frequency_fls, sec_fls;
+
+	count_fls = fls64(count);
+	nsec_fls = fls64(nsec);
+	frequency_fls = fls64(frequency);
+	sec_fls = 30;	/* bit width used for @sec (10^9 < 2^30) */
+
+	/*
+	 * We got @count in @nsec, with a target of sample_freq HZ
+	 * the target period becomes:
+	 *
+	 *             @count * 10^9
+	 * period = -------------------
+	 *          @nsec * sample_freq
+	 *
+	 */
+
+	/*
+	 * Reduce accuracy by one bit such that @a and @b converge
+	 * to a similar magnitude.
+	 */
+#define REDUCE_FLS(a, b)		\
+do {					\
+	if (a##_fls > b##_fls) {	\
+		a >>= 1;		\
+		a##_fls--;		\
+	} else {			\
+		b >>= 1;		\
+		b##_fls--;		\
+	}				\
+} while (0)
+
+	/*
+	 * Reduce accuracy until either term fits in a u64, then proceed with
+	 * the other, so that finally we can do a u64/u64 division.
+	 */
+	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+		REDUCE_FLS(nsec, frequency);
+		REDUCE_FLS(sec, count);
+	}
+
+	if (count_fls + sec_fls > 64) {
+		divisor = nsec * frequency;
+
+		while (count_fls + sec_fls > 64) {
+			REDUCE_FLS(count, sec);
+			divisor >>= 1;	/* keep the ratio: halve the other side too */
+		}
+
+		dividend = count * sec;
+	} else {
+		dividend = count * sec;
+
+		while (nsec_fls + frequency_fls > 64) {
+			REDUCE_FLS(nsec, frequency);
+			dividend >>= 1;	/* keep the ratio: halve the other side too */
+		}
+
+		divisor = nsec * frequency;
+	}
+
+	if (!divisor)
+		return dividend;	/* never divide by zero */
+
+	return div64_u64(dividend, divisor);
+}
+/*
+ * Move hwc->sample_period an eighth of the way (rounded by the +7)
+ * towards the period computed for @count events in @nsec ns, never
+ * letting it become zero.  If more than 8 of the new periods are still
+ * left to count, the event is stopped, period_left is cleared and the
+ * event is restarted with PERF_EF_RELOAD.
+ */
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	s64 period, sample_period;
+	s64 delta;
+
+	period = perf_calculate_period(event, nsec, count);
+
+	delta = (s64)(period - hwc->sample_period);
+	delta = (delta + 7) / 8; /* low pass filter */
+
+	sample_period = hwc->sample_period + delta;
+
+	if (!sample_period)
+		sample_period = 1;
+
+	hwc->sample_period = sample_period;
+
+	if (local64_read(&hwc->period_left) > 8*sample_period) {
+		event->pmu->stop(event, PERF_EF_UPDATE);
+		local64_set(&hwc->period_left, 0);
+		event->pmu->start(event, PERF_EF_RELOAD);
+	}
+}
+/*
+ * For every ACTIVE event of @ctx that passes event_filter_match():
+ * clear its interrupt count, restart it if that count had reached
+ * MAX_INTERRUPTS, and - for events with attr.freq and a non-zero
+ * attr.sample_freq - re-tune the sample period from the number of
+ * events counted since the previous call.  @period is handed to
+ * perf_adjust_period() as the elapsed time in ns.
+ */
+static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
+{
+	struct perf_event *event;
+	struct hw_perf_event *hwc;
+	u64 interrupts, now;
+	s64 delta;
+
+	raw_spin_lock(&ctx->lock);
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
+		if (event->state != PERF_EVENT_STATE_ACTIVE)
+			continue;
+
+		if (!event_filter_match(event))
+			continue;
+
+		hwc = &event->hw;
+
+		interrupts = hwc->interrupts;
+		hwc->interrupts = 0;
+
+		/*
+		 * unthrottle events on the tick
+		 */
+		if (interrupts == MAX_INTERRUPTS) {
+			perf_log_throttle(event, 1);
+			event->pmu->start(event, 0);
+		}
+
+		if (!event->attr.freq || !event->attr.sample_freq)
+			continue;
+
+		event->pmu->read(event);
+		now = local64_read(&event->count);
+		delta = now - hwc->freq_count_stamp;
+
hwc->freq_count_stamp = now;
+
+		if (delta > 0)
+			perf_adjust_period(event, period, delta);
+	}
+	raw_spin_unlock(&ctx->lock);
+}
+
+/*
+ * Round-robin a context's events:
+ */
+static void rotate_ctx(struct perf_event_context *ctx)
+{
+	raw_spin_lock(&ctx->lock);
+
+	/*
+	 * Rotate the first entry last of non-pinned groups. Rotation might be
+	 * disabled by the inheritance code.
+	 */
+	if (!ctx->rotate_disable)
+		list_rotate_left(&ctx->flexible_groups);
+
+	raw_spin_unlock(&ctx->lock);
+}
+
+/*
+ * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
+ * because they're strictly cpu affine and rotate_start is called with IRQs
+ * disabled, while rotate_context is called from IRQ context.
+ *
+ * 'rotate' is set when the cpu context or the installed task context has
+ * more events than active events; 'remove' stays set only when neither
+ * has any events, in which case @cpuctx is taken off the rotation list.
+ */
+static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+{
+	u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
+	struct perf_event_context *ctx = NULL;
+	int rotate = 0, remove = 1;
+
+	if (cpuctx->ctx.nr_events) {
+		remove = 0;
+		if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+			rotate = 1;
+	}
+
+	ctx = cpuctx->task_ctx;
+	if (ctx && ctx->nr_events) {
+		remove = 0;
+		if (ctx->nr_events != ctx->nr_active)
+			rotate = 1;
+	}
+
+	perf_pmu_disable(cpuctx->ctx.pmu);
+	perf_ctx_adjust_freq(&cpuctx->ctx, interval);
+	if (ctx)
+		perf_ctx_adjust_freq(ctx, interval);
+
+	if (!rotate)
+		goto done;
+
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+	if (ctx)
+		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
+
+	rotate_ctx(&cpuctx->ctx);
+	if (ctx)
+		rotate_ctx(ctx);
+
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
+	if (ctx)
+		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
+
+done:
+	if (remove)
+		list_del_init(&cpuctx->rotation_list);
+
+	perf_pmu_enable(cpuctx->ctx.pmu);
+}
+/*
+ * Run perf_rotate_context() for every cpu context on this cpu's
+ * rotation_list whose jiffies_interval is 1 or divides the current
+ * jiffies value.  Warns if IRQs are not disabled.
+ */
+void perf_event_task_tick(void)
+{
+	struct list_head *head = &__get_cpu_var(rotation_list);
+	struct perf_cpu_context *cpuctx, *tmp;
+
+	WARN_ON(!irqs_disabled());
+
+	list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
+		if (cpuctx->jiffies_interval == 1 ||
+				!(jiffies %
cpuctx->jiffies_interval))
+			perf_rotate_context(cpuctx);
+	}
+}
+/*
+ * Consume the enable_on_exec flag of @event.  Returns 1 if the event
+ * was marked enabled as a result, 0 if the flag was not set or the
+ * event was already INACTIVE or above.
+ */
+static int event_enable_on_exec(struct perf_event *event,
+				struct perf_event_context *ctx)
+{
+	if (!event->attr.enable_on_exec)
+		return 0;
+
+	event->attr.enable_on_exec = 0;	/* one-shot: cleared on first use */
+	if (event->state >= PERF_EVENT_STATE_INACTIVE)
+		return 0;
+
+	__perf_event_mark_enabled(event, ctx);
+
+	return 1;
+}
+
+/*
+ * Enable all of a task's events that have been marked enable-on-exec.
+ * This expects task == current.
+ */
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
+{
+	struct perf_event *event;
+	unsigned long flags;
+	int enabled = 0;
+	int ret;
+
+	local_irq_save(flags);
+	if (!ctx || !ctx->nr_events)
+		goto out;
+
+	/*
+	 * We must ctxsw out cgroup events to avoid conflict
+	 * when invoking perf_event_context_sched_in() later on
+	 * in this function. Otherwise we end up trying to
+	 * ctxswin cgroup events which are already scheduled
+	 * in.
+	 */
+	perf_cgroup_sched_out(current);
+	task_ctx_sched_out(ctx, EVENT_ALL);
+
+	raw_spin_lock(&ctx->lock);
+
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		ret = event_enable_on_exec(event, ctx);
+		if (ret)
+			enabled = 1;
+	}
+
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		ret = event_enable_on_exec(event, ctx);
+		if (ret)
+			enabled = 1;
+	}
+
+	/*
+	 * Unclone this context if we enabled any event.
+	 */
+	if (enabled)
+		unclone_ctx(ctx);
+
+	raw_spin_unlock(&ctx->lock);
+
+	/*
+	 * Also calls ctxswin for cgroup events, if any:
+	 */
+	perf_event_context_sched_in(ctx, ctx->task);
+out:
+	local_irq_restore(flags);
+}
+
+/*
+ * Cross CPU call to read the hardware event
+ */
+static void __perf_event_read(void *info)
+{
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu.
If not it has been
+	 * scheduled out before the smp call arrived.  In that case
+	 * event->count would have been updated to a recent sample
+	 * when the event was scheduled out.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	raw_spin_lock(&ctx->lock);
+	if (ctx->is_active) {
+		update_context_time(ctx);
+		update_cgrp_time_from_event(event);
+	}
+	update_event_times(event);
+	if (event->state == PERF_EVENT_STATE_ACTIVE)
+		event->pmu->read(event);
+	raw_spin_unlock(&ctx->lock);
+}
+/*
+ * Current value of @event: its own count plus the accumulated
+ * child_count.
+ */
+static inline u64 perf_event_count(struct perf_event *event)
+{
+	return local64_read(&event->count) + atomic64_read(&event->child_count);
+}
+/*
+ * Bring @event up to date and return its value (see perf_event_count()).
+ * An ACTIVE event is refreshed through a synchronous cross-call to the
+ * cpu in event->oncpu; an INACTIVE one only has its times updated
+ * locally.  In any other state the stored count is returned as is.
+ */
+static u64 perf_event_read(struct perf_event *event)
+{
+	/*
+	 * If event is enabled and currently active on a CPU, update the
+	 * value in the event structure:
+	 */
+	if (event->state == PERF_EVENT_STATE_ACTIVE) {
+		smp_call_function_single(event->oncpu,
+					 __perf_event_read, event, 1);
+	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		struct perf_event_context *ctx = event->ctx;
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&ctx->lock, flags);
+		/*
+		 * may read while context is not active
+		 * (e.g., thread is blocked), in that case
+		 * we cannot update context time
+		 */
+		if (ctx->is_active) {
+			update_context_time(ctx);
+			update_cgrp_time_from_event(event);
+		}
+		update_event_times(event);
+		raw_spin_unlock_irqrestore(&ctx->lock, flags);
+	}
+
+	return perf_event_count(event);
+}
+
+/*
+ * Callchain support
+ */
+
+struct callchain_cpus_entries {
+	struct rcu_head			rcu_head;	/* for freeing via call_rcu() */
+	struct perf_callchain_entry	*cpu_entries[0];	/* indexed by cpu; sized at allocation time */
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
+static atomic_t nr_callchain_events;	/* users of the callchain buffers */
+static DEFINE_MUTEX(callchain_mutex);	/* held while allocating/releasing the buffers */
+struct callchain_cpus_entries *callchain_cpus_entries;
+
+/*
+ * Empty default implementations, declared __weak.
+ */
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+				  struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+				struct pt_regs
*regs)
+{
+}
+
+/*
+ * RCU callback: free every per-cpu callchain buffer and then the
+ * callchain_cpus_entries container that embeds @head.
+ */
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+	struct callchain_cpus_entries *entries;
+	int cpu;
+
+	entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+	for_each_possible_cpu(cpu)
+		kfree(entries->cpu_entries[cpu]);
+
+	kfree(entries);
+}
+
+/*
+ * Unpublish the callchain buffers and hand them to call_rcu() for freeing.
+ *
+ * callchain_cpus_entries is NULL when the initial allocation failed.  The
+ * last put_callchain_buffers() still ends up here in that case, so there
+ * may be nothing to release; passing &NULL->rcu_head to call_rcu() would
+ * be a NULL pointer dereference.
+ */
+static void release_callchain_buffers(void)
+{
+	struct callchain_cpus_entries *entries;
+
+	entries = callchain_cpus_entries;
+	if (!entries)
+		return;
+
+	rcu_assign_pointer(callchain_cpus_entries, NULL);
+	call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+/*
+ * Allocate the container plus one buffer of PERF_NR_CONTEXTS entries per
+ * possible cpu and publish the result in callchain_cpus_entries.
+ *
+ * Returns 0 on success.  On failure returns -ENOMEM with everything that
+ * was allocated freed again and nothing published.
+ */
+static int alloc_callchain_buffers(void)
+{
+	int cpu;
+	int size;
+	struct callchain_cpus_entries *entries;
+
+	/*
+	 * We can't use the percpu allocation API for data that can be
+	 * accessed from NMI. Use a temporary manual per cpu allocation
+	 * until that gets sorted out.
+	 */
+	size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
+
+	entries = kzalloc(size, GFP_KERNEL);
+	if (!entries)
+		return -ENOMEM;
+
+	size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
+
+	for_each_possible_cpu(cpu) {
+		entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+							 cpu_to_node(cpu));
+		if (!entries->cpu_entries[cpu])
+			goto fail;
+	}
+
+	rcu_assign_pointer(callchain_cpus_entries, entries);
+
+	return 0;
+
+fail:
+	/* slots never allocated are still NULL from kzalloc() */
+	for_each_possible_cpu(cpu)
+		kfree(entries->cpu_entries[cpu]);
+	kfree(entries);
+
+	return -ENOMEM;
+}
+
+/*
+ * Register one more user of the callchain buffers; the first user
+ * allocates them.
+ *
+ * Returns 0 on success, -EINVAL if the user count is found below 1 after
+ * the increment, or -ENOMEM if the buffers are not available.  The user
+ * count is not dropped here on failure.
+ */
+static int get_callchain_buffers(void)
+{
+	int err = 0;
+	int count;
+
+	mutex_lock(&callchain_mutex);
+
+	count = atomic_inc_return(&nr_callchain_events);
+	if (WARN_ON_ONCE(count < 1)) {
+		err = -EINVAL;
+		goto exit;
+	}
+
+	if (count > 1) {
+		/* If the allocation failed, give up */
+		if (!callchain_cpus_entries)
+			err = -ENOMEM;
+		goto exit;
+	}
+
+	/*
+	 * On failure alloc_callchain_buffers() has already freed its
+	 * partial allocation and published nothing, so there is nothing
+	 * to release here.
+	 */
+	err = alloc_callchain_buffers();
+exit:
+	mutex_unlock(&callchain_mutex);
+
+	return err;
+}
+
+static void put_callchain_buffers(void)
+{
+	if (atomic_dec_and_mutex_lock(&nr_callchain_events,
&callchain_mutex)) {
+		release_callchain_buffers();
+		mutex_unlock(&callchain_mutex);
+	}
+}
+/*
+ * Claim the recursion slot matching the current execution context:
+ * 3 in NMI, 2 in hard irq, 1 in softirq, 0 otherwise.
+ *
+ * Returns the slot index, or -1 if that slot is already claimed.
+ */
+static int get_recursion_context(int *recursion)
+{
+	int rctx;
+
+	if (in_nmi())
+		rctx = 3;
+	else if (in_irq())
+		rctx = 2;
+	else if (in_softirq())
+		rctx = 1;
+	else
+		rctx = 0;
+
+	if (recursion[rctx])
+		return -1;
+
+	recursion[rctx]++;
+	barrier();
+
+	return rctx;
+}
+/*
+ * Give back the slot @rctx obtained from get_recursion_context().
+ */
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+	barrier();
+	recursion[rctx]--;
+}
+/*
+ * Claim this cpu's callchain recursion slot and return the matching
+ * buffer entry.  *@rctx receives the slot, or -1 when it could not be
+ * claimed.  Returns NULL in that case and also when no buffers are
+ * published; in the latter case the slot in *@rctx is still held.
+ */
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+	int cpu;
+	struct callchain_cpus_entries *entries;
+
+	*rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+	if (*rctx == -1)
+		return NULL;
+
+	entries = rcu_dereference(callchain_cpus_entries);
+	if (!entries)
+		return NULL;
+
+	cpu = smp_processor_id();
+
+	return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+	put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+/*
+ * Record a callchain for @regs into this cpu's buffer entry: the kernel
+ * part when @regs is not user mode, followed by the user part when user
+ * registers are available.  Returns the entry, or NULL when no entry
+ * could be obtained.
+ */
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	int rctx;
+	struct perf_callchain_entry *entry;
+
+
+	entry = get_callchain_entry(&rctx);
+	if (rctx == -1)
+		return NULL;	/* no recursion slot was taken */
+
+	if (!entry)
+		goto exit_put;	/* slot taken but no buffers: release the slot */
+
+	entry->nr = 0;
+
+	if (!user_mode(regs)) {
+		perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+		perf_callchain_kernel(entry, regs);
+		if (current->mm)
+			regs = task_pt_regs(current);
+		else
+			regs = NULL;
+	}
+
+	if (regs) {
+		perf_callchain_store(entry, PERF_CONTEXT_USER);
+		perf_callchain_user(entry, regs);
+	}
+
+exit_put:
+	put_callchain_entry(rctx);
+
+	return entry;
+}
+
+/*
+ * Initialize the perf_event context in a task_struct:
+ * lock, mutex, the three event lists, and a refcount of 1.
+ */
+static void __perf_event_init_context(struct perf_event_context *ctx)
+{
+	raw_spin_lock_init(&ctx->lock);
+	mutex_init(&ctx->mutex);
+	INIT_LIST_HEAD(&ctx->pinned_groups);
+	INIT_LIST_HEAD(&ctx->flexible_groups);
+	INIT_LIST_HEAD(&ctx->event_list);
+	atomic_set(&ctx->refcount, 1);
+}
+
+static struct
perf_event_context *
+alloc_perf_context(struct pmu *pmu, struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+
+	ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+
+	__perf_event_init_context(ctx);
+	if (task) {
+		ctx->task = task;
+		get_task_struct(task);	/* the new context holds a task reference */
+	}
+	ctx->pmu = pmu;
+
+	return ctx;
+}
+/*
+ * Look up the task for @vpid (current when @vpid is 0), take a reference
+ * on it and check ptrace read access.
+ *
+ * Returns the task, ERR_PTR(-ESRCH) if there is no such task, or
+ * ERR_PTR(-EACCES) - with the reference dropped again - if access is
+ * denied.
+ */
+static struct task_struct *
+find_lively_task_by_vpid(pid_t vpid)
+{
+	struct task_struct *task;
+	int err;
+
+	rcu_read_lock();
+	if (!vpid)
+		task = current;
+	else
+		task = find_task_by_vpid(vpid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	/* Reuse ptrace permission checks for now. */
+	err = -EACCES;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto errout;
+
+	return task;
+errout:
+	put_task_struct(task);
+	return ERR_PTR(err);
+
+}
+
+/*
+ * Returns a matching context with refcount and pincount.
+ *
+ * With no @task the context embedded in @cpu's cpu context of @pmu is
+ * used; otherwise the task's context for pmu->task_ctx_nr, which is
+ * allocated and installed if the task does not have one yet.
+ * Errors are returned as ERR_PTR().
+ */
+static struct perf_event_context *
+find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
+{
+	struct perf_event_context *ctx;
+	struct perf_cpu_context *cpuctx;
+	unsigned long flags;
+	int ctxn, err;
+
+	if (!task) {
+		/* Must be root to operate on a CPU event: */
+		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EACCES);
+
+		/*
+		 * We could be clever and allow to attach a event to an
+		 * offline CPU and activate it when the CPU comes up, but
+		 * that's for later.
+		 */
+		if (!cpu_online(cpu))
+			return ERR_PTR(-ENODEV);
+
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		ctx = &cpuctx->ctx;
+		get_ctx(ctx);
+		++ctx->pin_count;
+
+		return ctx;
+	}
+
+	err = -EINVAL;
+	ctxn = pmu->task_ctx_nr;
+	if (ctxn < 0)
+		goto errout;
+
+retry:
+	ctx = perf_lock_task_context(task, ctxn, &flags);
+	if (ctx) {
+		unclone_ctx(ctx);
+		++ctx->pin_count;
+		raw_spin_unlock_irqrestore(&ctx->lock, flags);
+	}
+
+	if (!ctx) {
+		ctx = alloc_perf_context(pmu, task);
+		err = -ENOMEM;
+		if (!ctx)
+			goto errout;
+
+		get_ctx(ctx);
+
+		err = 0;
+		mutex_lock(&task->perf_event_mutex);
+		/*
+		 * If it has already passed perf_event_exit_task().
+		 * we must see PF_EXITING, it takes this mutex too.
+		 */
+		if (task->flags & PF_EXITING)
+			err = -ESRCH;
+		else if (task->perf_event_ctxp[ctxn])
+			err = -EAGAIN;	/* a context got installed meanwhile */
+		else {
+			++ctx->pin_count;
+			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+		}
+		mutex_unlock(&task->perf_event_mutex);
+
+		if (unlikely(err)) {
+			put_task_struct(task);	/* reference taken in alloc_perf_context() */
+			kfree(ctx);
+
+			if (err == -EAGAIN)
+				goto retry;
+			goto errout;
+		}
+	}
+
+	return ctx;
+
+errout:
+	return ERR_PTR(err);
+}
+
+static void perf_event_free_filter(struct perf_event *event);
+/*
+ * RCU callback: drop the pid namespace reference (if any), free the
+ * filter and free the event embedding @head.
+ */
+static void free_event_rcu(struct rcu_head *head)
+{
+	struct perf_event *event;
+
+	event = container_of(head, struct perf_event, rcu_head);
+	if (event->ns)
+		put_pid_ns(event->ns);
+	perf_event_free_filter(event);
+	kfree(event);
+}
+
+static void perf_buffer_put(struct perf_buffer *buffer);
+/*
+ * Tear down @event: for events without a parent undo the global
+ * accounting, then drop the buffer, cgroup and context references and
+ * free the event itself through call_rcu().
+ */
+static void free_event(struct perf_event *event)
+{
+	irq_work_sync(&event->pending);
+
+	if (!event->parent) {
+		if (event->attach_state & PERF_ATTACH_TASK)
+			jump_label_dec(&perf_sched_events);
+		if (event->attr.mmap || event->attr.mmap_data)
+			atomic_dec(&nr_mmap_events);
+		if (event->attr.comm)
+			atomic_dec(&nr_comm_events);
+		if (event->attr.task)
+			atomic_dec(&nr_task_events);
+		if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+			put_callchain_buffers();
+		if (is_cgroup_event(event)) {
+
atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+			jump_label_dec(&perf_sched_events);
+		}
+	}
+
+	if (event->buffer) {
+		perf_buffer_put(event->buffer);
+		event->buffer = NULL;
+	}
+
+	if (is_cgroup_event(event))
+		perf_detach_cgroup(event);
+
+	if (event->destroy)
+		event->destroy(event);
+
+	if (event->ctx)
+		put_ctx(event->ctx);
+
+	call_rcu(&event->rcu_head, free_event_rcu);	/* the kfree() happens in free_event_rcu() */
+}
+/*
+ * Disable @event, detach it from its group and remove it from its
+ * context, then free it.  Always returns 0.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+	struct perf_event_context *ctx = event->ctx;
+
+	/*
+	 * Remove from the PMU, can't get re-enabled since we got
+	 * here because the last ref went.
+	 */
+	perf_event_disable(event);
+
+	WARN_ON_ONCE(ctx->parent_ctx);
+	/*
+	 * There are two ways this annotation is useful:
+	 *
+	 *  1) there is a lock recursion from perf_event_exit_task
+	 *     see the comment there.
+	 *
+	 *  2) there is a lock-inversion with mmap_sem through
+	 *     perf_event_read_group(), which takes faults while
+	 *     holding ctx->mutex, however this is called after
+	 *     the last filedesc died, so there is no possibility
+	 *     to trigger the AB-BA case.
+	 */
+	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
+	raw_spin_lock_irq(&ctx->lock);
+	perf_group_detach(event);
+	list_del_event(event, ctx);
+	raw_spin_unlock_irq(&ctx->lock);
+	mutex_unlock(&ctx->mutex);
+
+	free_event(event);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(perf_event_release_kernel);
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct perf_event *event = file->private_data;
+	struct task_struct *owner;
+
+	file->private_data = NULL;
+
+	rcu_read_lock();
+	owner = ACCESS_ONCE(event->owner);
+	/*
+	 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
+	 * !owner it means the list deletion is complete and we can indeed
+	 * free this event, otherwise we need to serialize on
+	 * owner->perf_event_mutex.
+	 */
+	smp_read_barrier_depends();
+	if (owner) {
+		/*
+		 * Since delayed_put_task_struct() also drops the last
+		 * task reference we can safely take a new reference
+		 * while holding the rcu_read_lock().
+		 */
+		get_task_struct(owner);
+	}
+	rcu_read_unlock();
+
+	if (owner) {
+		mutex_lock(&owner->perf_event_mutex);
+		/*
+		 * We have to re-check the event->owner field, if it is cleared
+		 * we raced with perf_event_exit_task(), acquiring the mutex
+		 * ensured they're done, and we can proceed with freeing the
+		 * event.
+		 */
+		if (event->owner)
+			list_del_init(&event->owner_entry);
+		mutex_unlock(&owner->perf_event_mutex);
+		put_task_struct(owner);
+	}
+
+	return perf_event_release_kernel(event);
+}
+/*
+ * Return the value of @event summed with the values of all events on its
+ * child_list.  *@enabled and *@running receive the corresponding sums of
+ * total_time_enabled / total_time_running, including the event's
+ * child_total_time_* accumulators.  event->child_mutex is held while
+ * reading.
+ */
+u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
+{
+	struct perf_event *child;
+	u64 total = 0;
+
+	*enabled = 0;
+	*running = 0;
+
+	mutex_lock(&event->child_mutex);
+	total += perf_event_read(event);
+	*enabled += event->total_time_enabled +
+			atomic64_read(&event->child_total_time_enabled);
+	*running += event->total_time_running +
+			atomic64_read(&event->child_total_time_running);
+
+	list_for_each_entry(child, &event->child_list, child_list) {
+		total += perf_event_read(child);
+		*enabled += child->total_time_enabled;
+		*running += child->total_time_running;
+	}
+	mutex_unlock(&event->child_mutex);
+
+	return total;
+}
+EXPORT_SYMBOL_GPL(perf_event_read_value);
+/*
+ * Copy the values of the whole group of @event to @buf: first the
+ * leader record (number of events, optional enabled/running times, the
+ * leader's value, optional id), then one record per sibling (value,
+ * optional id).  Runs under the leader's ctx->mutex.
+ *
+ * Returns the number of bytes copied or -EFAULT.
+ */
+static int perf_event_read_group(struct perf_event *event,
+				   u64 read_format, char __user *buf)
+{
+	struct perf_event *leader = event->group_leader, *sub;
+	int n = 0, size = 0, ret = -EFAULT;
+	struct perf_event_context *ctx = leader->ctx;
+	u64 values[5];	/* at most: nr, time_enabled, time_running, value, id */
+	u64 count, enabled, running;
+
+	mutex_lock(&ctx->mutex);
+	count = perf_event_read_value(leader, &enabled, &running);
+
+	values[n++] = 1 + leader->nr_siblings;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = enabled;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = running;
+
values[n++] = count;
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_event_id(leader);
+
+	size = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, size))
+		goto unlock;	/* ret is still -EFAULT here */
+
+	ret = size;
+
+	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
+		n = 0;
+
+		values[n++] = perf_event_read_value(sub, &enabled, &running);
+		if (read_format & PERF_FORMAT_ID)
+			values[n++] = primary_event_id(sub);
+
+		size = n * sizeof(u64);
+
+		if (copy_to_user(buf + ret, values, size)) {
+			ret = -EFAULT;
+			goto unlock;
+		}
+
+		ret += size;
+	}
+unlock:
+	mutex_unlock(&ctx->mutex);
+
+	return ret;
+}
+/*
+ * Copy the value of the single event @event to @buf, followed by the
+ * enabled time, running time and id when @read_format asks for them.
+ *
+ * Returns the number of bytes copied or -EFAULT.
+ */
+static int perf_event_read_one(struct perf_event *event,
+				 u64 read_format, char __user *buf)
+{
+	u64 enabled, running;
+	u64 values[4];	/* at most: value, time_enabled, time_running, id */
+	int n = 0;
+
+	values[n++] = perf_event_read_value(event, &enabled, &running);
+	if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = enabled;
+	if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = running;
+	if (read_format & PERF_FORMAT_ID)
+		values[n++] = primary_event_id(event);
+
+	if (copy_to_user(buf, values, n * sizeof(u64)))
+		return -EFAULT;
+
+	return n * sizeof(u64);
+}
+
+/*
+ * Read the performance event - simple non blocking version for now
+ *
+ * Returns the number of bytes read, 0 for an event in error state,
+ * -ENOSPC if @count is smaller than event->read_size, or the error
+ * from the group/single read helper.
+ */
+static ssize_t
+perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
+{
+	u64 read_format = event->attr.read_format;
+	int ret;
+
+	/*
+	 * Return end-of-file for a read on a event that is in
+	 * error state (i.e. because it was pinned but it couldn't be
+	 * scheduled on to the CPU at some point).
+ */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < event->read_size) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_buffer *buffer; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (buffer) + events = atomic_xchg(&buffer->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + local64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. 
+ */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + int ret = 0; + u64 value; + + if (!is_sampling_event(event)) + return -EINVAL; + + if (copy_from_user(&value, arg, sizeof(value))) + return -EFAULT; + + if (!value) + return -EINVAL; + + raw_spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + raw_spin_unlock_irq(&ctx->lock); + + return ret; +} + +static const struct file_operations perf_fops; + +static struct perf_event *perf_fget_light(int fd, int *fput_needed) +{ + struct file *file; + + file = fget_light(fd, fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &perf_fops) { + fput_light(file, *fput_needed); + *fput_needed = 0; + return ERR_PTR(-EBADF); + } + + return file->private_data; +} + +static int perf_event_set_output(struct perf_event *event, + struct perf_event *output_event); +static int perf_event_set_filter(struct perf_event *event, void 
__user *arg); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + { + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { + output_event = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_event)) + return PTR_ERR(output_event); + } + + ret = perf_event_set_output(event, output_event); + if (output_event) + fput_light(output_event->filp, fput_needed); + + return ret; + } + + case PERF_EVENT_IOC_SET_FILTER: + return perf_event_set_filter(event, (void __user *)arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 0; 
+ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_buffer *buffer; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto unlock; + + userpg = buffer->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = perf_event_count(event); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= local64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static unsigned long perf_data_size(struct perf_buffer *buffer); + +static void +perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags) +{ + long max_size = perf_data_size(buffer); + + if (watermark) + buffer->watermark = min(max_size, watermark); + + if (!buffer->watermark) + buffer->watermark = max_size / 2; + + if (flags & PERF_BUFFER_WRITABLE) + buffer->writable = 1; + + atomic_set(&buffer->refcount, 1); +} + +#ifndef CONFIG_PERF_USE_VMALLOC + +/* + * Back perf_mmap() with regular GFP_KERNEL-0 pages. 
+ */ + +static struct page * +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) +{ + if (pgoff > buffer->nr_pages) + return NULL; + + if (pgoff == 0) + return virt_to_page(buffer->user_page); + + return virt_to_page(buffer->data_pages[pgoff - 1]); +} + +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; + + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); +} + +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct perf_buffer *buffer; + unsigned long size; + int i; + + size = sizeof(struct perf_buffer); + size += nr_pages * sizeof(void *); + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto fail; + + buffer->user_page = perf_mmap_alloc_page(cpu); + if (!buffer->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + buffer->data_pages[i] = perf_mmap_alloc_page(cpu); + if (!buffer->data_pages[i]) + goto fail_data_pages; + } + + buffer->nr_pages = nr_pages; + + perf_buffer_init(buffer, watermark, flags); + + return buffer; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)buffer->data_pages[i]); + + free_page((unsigned long)buffer->user_page); + +fail_user_page: + kfree(buffer); + +fail: + return NULL; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void perf_buffer_free(struct perf_buffer *buffer) +{ + int i; + + perf_mmap_free_page((unsigned long)buffer->user_page); + for (i = 0; i < buffer->nr_pages; i++) + perf_mmap_free_page((unsigned long)buffer->data_pages[i]); + kfree(buffer); +} + +static inline int page_order(struct perf_buffer *buffer) +{ + return 0; +} + +#else + +/* + * Back perf_mmap() with vmalloc memory. 
+ * + * Required for architectures that have d-cache aliasing issues. + */ + +static inline int page_order(struct perf_buffer *buffer) +{ + return buffer->page_order; +} + +static struct page * +perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff) +{ + if (pgoff > (1UL << page_order(buffer))) + return NULL; + + return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE); +} + +static void perf_mmap_unmark_page(void *addr) +{ + struct page *page = vmalloc_to_page(addr); + + page->mapping = NULL; +} + +static void perf_buffer_free_work(struct work_struct *work) +{ + struct perf_buffer *buffer; + void *base; + int i, nr; + + buffer = container_of(work, struct perf_buffer, work); + nr = 1 << page_order(buffer); + + base = buffer->user_page; + for (i = 0; i < nr + 1; i++) + perf_mmap_unmark_page(base + (i * PAGE_SIZE)); + + vfree(base); + kfree(buffer); +} + +static void perf_buffer_free(struct perf_buffer *buffer) +{ + schedule_work(&buffer->work); +} + +static struct perf_buffer * +perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags) +{ + struct perf_buffer *buffer; + unsigned long size; + void *all_buf; + + size = sizeof(struct perf_buffer); + size += sizeof(void *); + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto fail; + + INIT_WORK(&buffer->work, perf_buffer_free_work); + + all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE); + if (!all_buf) + goto fail_all_buf; + + buffer->user_page = all_buf; + buffer->data_pages[0] = all_buf + PAGE_SIZE; + buffer->page_order = ilog2(nr_pages); + buffer->nr_pages = 1; + + perf_buffer_init(buffer, watermark, flags); + + return buffer; + +fail_all_buf: + kfree(buffer); + +fail: + return NULL; +} + +#endif + +static unsigned long perf_data_size(struct perf_buffer *buffer) +{ + return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer)); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + 
struct perf_buffer *buffer; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto unlock; + + if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE)) + goto unlock; + + vmf->page = perf_mmap_to_page(buffer, vmf->pgoff); + if (!vmf->page) + goto unlock; + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static void perf_buffer_free_rcu(struct rcu_head *rcu_head) +{ + struct perf_buffer *buffer; + + buffer = container_of(rcu_head, struct perf_buffer, rcu_head); + perf_buffer_free(buffer); +} + +static struct perf_buffer *perf_buffer_get(struct perf_event *event) +{ + struct perf_buffer *buffer; + + rcu_read_lock(); + buffer = rcu_dereference(event->buffer); + if (buffer) { + if (!atomic_inc_not_zero(&buffer->refcount)) + buffer = NULL; + } + rcu_read_unlock(); + + return buffer; +} + +static void perf_buffer_put(struct perf_buffer *buffer) +{ + if (!atomic_dec_and_test(&buffer->refcount)) + return; + + call_rcu(&buffer->rcu_head, perf_buffer_free_rcu); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + unsigned long size = perf_data_size(event->buffer); + struct user_struct *user = event->mmap_user; + struct perf_buffer *buffer = event->buffer; + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->mmap_locked; + rcu_assign_pointer(event->buffer, NULL); + mutex_unlock(&event->mmap_mutex); + + perf_buffer_put(buffer); + free_uid(user); + } +} + +static const 
struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + struct perf_buffer *buffer; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0, flags = 0; + + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same buffer. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have buffer pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. 
+ */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->buffer) { + if (event->buffer->nr_pages == nr_pages) + atomic_inc(&event->buffer->refcount); + else + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = rlimit(RLIMIT_MEMLOCK); + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->buffer); + + if (vma->vm_flags & VM_WRITE) + flags |= PERF_BUFFER_WRITABLE; + + buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark, + event->cpu, flags); + if (!buffer) { + ret = -ENOMEM; + goto unlock; + } + rcu_assign_pointer(event->buffer, buffer); + + atomic_long_add(user_extra, &user->locked_vm); + event->mmap_locked = extra; + event->mmap_user = get_current_user(); + vma->vm_mm->locked_vm += event->mmap_locked; + +unlock: + if (!ret) + atomic_inc(&event->mmap_count); + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} 
+ +static const struct file_operations perf_fops = { + .llseek = no_llseek, + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +static void perf_pending_event(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +/* + * We assume there is only KVM supporting the callbacks. + * Later on, we might change it to a list if there is + * another virtualization implementation supporting the callbacks. 
+ */ +struct perf_guest_info_callbacks *perf_guest_cbs; + +int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = cbs; + return 0; +} +EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks); + +int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs) +{ + perf_guest_cbs = NULL; + return 0; +} +EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks); + +/* + * Output + */ +static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!buffer->writable) + return true; + + mask = perf_data_size(buffer) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->buffer->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + irq_work_queue(&handle->event->pending); + } else + perf_event_wakeup(handle->event); +} + +/* + * We need to ensure a later event_id doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_get_handle(struct perf_output_handle *handle) +{ + struct perf_buffer *buffer = handle->buffer; + + preempt_disable(); + local_inc(&buffer->nest); + handle->wakeup = local_read(&buffer->wakeup); +} + +static void perf_output_put_handle(struct perf_output_handle *handle) +{ + struct perf_buffer *buffer = handle->buffer; + unsigned long head; + +again: + head = local_read(&buffer->head); + + /* + * IRQ/NMI can happen here, which means we can miss a head update. + */ + + if (!local_dec_and_test(&buffer->nest)) + goto out; + + /* + * Publish the known good head. 
Rely on the full barrier implied + * by atomic_dec_and_test() order the buffer->head read and this + * write. + */ + buffer->user_page->data_head = head; + + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read buffer->head. + */ + if (unlikely(head != local_read(&buffer->head))) { + local_inc(&buffer->nest); + goto again; + } + + if (handle->wakeup != local_read(&buffer->wakeup)) + perf_output_wakeup(handle); + +out: + preempt_enable(); +} + +__always_inline void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + do { + unsigned long size = min_t(unsigned long, handle->size, len); + + memcpy(handle->addr, buf, size); + + len -= size; + handle->addr += size; + buf += size; + handle->size -= size; + if (!handle->size) { + struct perf_buffer *buffer = handle->buffer; + + handle->page++; + handle->page &= buffer->nr_pages - 1; + handle->addr = buffer->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(buffer); + } + } while (len); +} + +static void __perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + header->size += event->id_header_size; + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + } + + if (sample_type & PERF_SAMPLE_TIME) + data->time = perf_clock(); + + if (sample_type & PERF_SAMPLE_ID) + data->id = primary_event_id(event); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + data->stream_id = event->id; + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + } +} + +static void perf_event_header__init_id(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + if 
(event->attr.sample_id_all) + __perf_event_header__init_id(header, data, event); +} + +static void __perf_event__output_id_sample(struct perf_output_handle *handle, + struct perf_sample_data *data) +{ + u64 sample_type = data->type; + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); +} + +static void perf_event__output_id_sample(struct perf_event *event, + struct perf_output_handle *handle, + struct perf_sample_data *sample) +{ + if (event->attr.sample_id_all) + __perf_event__output_id_sample(handle, sample); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_buffer *buffer; + unsigned long tail, offset, head; + int have_lost; + struct perf_sample_data sample_data; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + buffer = rcu_dereference(event->buffer); + if (!buffer) + goto out; + + handle->buffer = buffer; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!buffer->nr_pages) + goto out; + + have_lost = local_read(&buffer->lost); + if (have_lost) { + lost_event.header.size = sizeof(lost_event); + perf_event_header__init_id(&lost_event.header, &sample_data, + event); + size += lost_event.header.size; + } + + perf_output_get_handle(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. 
+ */ + tail = ACCESS_ONCE(buffer->user_page->data_tail); + smp_rmb(); + offset = head = local_read(&buffer->head); + head += size; + if (unlikely(!perf_output_space(buffer, tail, offset, head))) + goto fail; + } while (local_cmpxchg(&buffer->head, offset, head) != offset); + + if (head - local_read(&buffer->wakeup) > buffer->watermark) + local_add(buffer->watermark, &buffer->wakeup); + + handle->page = offset >> (PAGE_SHIFT + page_order(buffer)); + handle->page &= buffer->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1); + handle->addr = buffer->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size; + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.id = event->id; + lost_event.lost = local_xchg(&buffer->lost, 0); + + perf_output_put(handle, lost_event); + perf_event__output_id_sample(event, handle, &sample_data); + } + + return 0; + +fail: + local_inc(&buffer->lost); + perf_output_put_handle(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_buffer *buffer = handle->buffer; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = local_inc_return(&buffer->events); + if (events >= wakeup_events) { + local_sub(wakeup_events, &buffer->events); + local_inc(&buffer->wakeup); + } + } + + perf_output_put_handle(handle); + rcu_read_unlock(); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event, + u64 enabled, u64 running) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = perf_event_count(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & 
PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event, + u64 enabled, u64 running) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = perf_event_count(leader); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = perf_event_count(sub); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\ + PERF_FORMAT_TOTAL_TIME_RUNNING) + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 enabled = 0, running = 0, now, ctx_time; + u64 read_format = event->attr.read_format; + + /* + * compute total_time_enabled, total_time_running + * based on snapshot values taken when the event + * was last scheduled in. 
+ * + * we cannot simply called update_context_time() + * because of locking issue as we are called in + * NMI context + */ + if (read_format & PERF_FORMAT_TOTAL_TIMES) { + now = perf_clock(); + ctx_time = event->shadow_ctx_time + now; + enabled = ctx_time - event->tstamp_enabled; + running = ctx_time - event->tstamp_running; + } + + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event, enabled, running); + else + perf_output_read_one(handle, event, enabled, running); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), 
+ .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header) + event->header_size; + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + __perf_event_header__init_id(header, data, event); + + if (sample_type & PERF_SAMPLE_IP) + data->ip = perf_instruction_pointer(regs); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + /* protect the callchain buffers */ + rcu_read_lock(); + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + goto exit; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); + +exit: + rcu_read_unlock(); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + event->read_size, + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + 
int ret; + + perf_event_header__init_id(&read_event.header, &sample, event); + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct task_struct *task = task_event->task; + int ret, size = task_event->event_id.header.size; + + perf_event_header__init_id(&task_event->event_id.header, &sample, event); + + ret = perf_output_begin(&handle, event, + task_event->event_id.header.size, 0, 0); + if (ret) + goto out; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + perf_output_put(&handle, task_event->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + task_event->event_id.header.size = size; +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event 
*event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct pmu *pmu; + int ctxn; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_task_ctx(&cpuctx->ctx, task_event); + + ctx = task_event->task_ctx; + if (!ctx) { + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + } + if (ctx) + perf_event_task_ctx(ctx, task_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? 
PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + .time = perf_clock(), + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int size = comm_event->event_id.header.size; + int ret; + + perf_event_header__init_id(&comm_event->event_id.header, &sample, event); + ret = perf_output_begin(&handle, event, + comm_event->event_id.header.size, 0, 0); + + if (ret) + goto out; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + comm_event->event_id.header.size = size; +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct 
perf_event_context *ctx; + char comm[TASK_COMM_LEN]; + unsigned int size; + struct pmu *pmu; + int ctxn; + + memset(comm, 0, sizeof(comm)); + strlcpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + perf_event_enable_on_exec(ctx); + } + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int size = mmap_event->event_id.header.size; + int ret; + + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); + ret = 
perf_output_begin(&handle, event, + mmap_event->event_id.header.size, 0, 0); + if (ret) + goto out; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +out: + mmap_event->event_id.header.size = size; +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event, + int executable) +{ + if (event->state < PERF_EVENT_STATE_INACTIVE) + return 0; + + if (!event_filter_match(event)) + return 0; + + if ((!executable && event->attr.mmap_data) || + (executable && event->attr.mmap)) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event, + int executable) +{ + struct perf_event *event; + + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event, executable)) + perf_event_mmap_output(event, mmap_event); + } +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + struct pmu *pmu; + int ctxn; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. 
+ */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_brk && + vma->vm_end >= vma->vm_mm->brk) { + name = strncpy(tmp, "[heap]", sizeof(tmp)); + goto got_name; + } else if (vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack) { + name = strncpy(tmp, "[stack]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + rcu_read_lock(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); + if (cpuctx->active_pmu != pmu) + goto next; + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, + vma->vm_flags & VM_EXEC); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto next; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_event_mmap_ctx(ctx, mmap_event, + vma->vm_flags & VM_EXEC); + } +next: + put_cpu_ptr(pmu->pmu_cpu_context); + } + rcu_read_unlock(); + + kfree(buf); +} + +void perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = PERF_RECORD_MISC_USER, + /* .size */ + }, + /* .pid */ + /* .tid */ + 
.start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + perf_event_header__init_id(&throttle_event.header, &sample, event); + + ret = perf_output_begin(&handle, event, + throttle_event.header.size, 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_event__output_id_sample(event, &handle, &sample); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + /* + * Non-sampling counters might still use the PMI to fold short + * hardware counters, ignore those. 
+ */ + if (unlikely(!is_sampling_event(event))) + return 0; + + if (unlikely(hwc->interrupts >= max_samples_per_tick)) { + if (throttle) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else + hwc->interrupts++; + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_time_stamp; + + hwc->freq_time_stamp = now; + + if (delta > 0 && delta < 2*TICK_NSEC) + perf_adjust_period(event, delta, hwc->last_period); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + event->pending_disable = 1; + irq_work_queue(&event->pending); + } + + if (event->overflow_handler) + event->overflow_handler(event, nmi, data, regs); + else + perf_event_output(event, nmi, data, regs); + + if (event->fasync && event->pending_kill) { + if (nmi) { + event->pending_wakeup = 1; + irq_work_queue(&event->pending); + } else + perf_event_wakeup(event); + } + + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +struct swevent_htable { + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; + + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; +}; + +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. 
+ */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = local64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (local64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, u64 overflow, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + + data->period = event->hw.last_period; + if (!overflow) + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. 
+ */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_event(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + local64_add(nr, &event->count); + + if (!regs) + return; + + if (!is_sampling_event(event)) + return; + + if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) + return perf_swevent_overflow(event, 1, nmi, data, regs); + + if (local64_add_negative(nr, &hwc->period_left)) + return; + + perf_swevent_overflow(event, 0, nmi, data, regs); +} + +static int perf_exclude_event(struct perf_event *event, + struct pt_regs *regs) +{ + if (event->hw.state & PERF_HES_STOPPED) + return 1; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 1; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 1; + } + + return 0; +} + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + if (event->attr.type != type) + return 0; + + if (event->attr.config != event_id) + return 0; + + if (perf_exclude_event(event, regs)) + return 0; + + return 1; +} + +static inline u64 swevent_hash(u64 type, u32 event_id) +{ + u64 val = event_id | (type << 32); + + return hash_64(val, SWEVENT_HLIST_BITS); +} + +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) +{ + u64 hash = swevent_hash(type, event_id); + + return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) +{ + struct swevent_hlist *hlist; + + hlist = rcu_dereference(swhash->swevent_hlist); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * 
+find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
+{
+	struct swevent_hlist *hlist;
+	u32 event_id = event->attr.config;
+	u64 type = event->attr.type;
+
+	/*
+	 * Event scheduling is always serialized against hlist allocation
+	 * and release. Which makes the protected version suitable here.
+	 * The context lock guarantees that.
+	 */
+	hlist = rcu_dereference_protected(swhash->swevent_hlist,
+					  lockdep_is_held(&event->ctx->lock));
+	if (!hlist)
+		return NULL;
+
+	return __find_swevent_head(hlist, type, event_id);
+}
+
+/*
+ * Deliver @nr occurrences of the software event (@type, @event_id) to
+ * every event hashed under that key on the current CPU.
+ *
+ * The hash table and its chains are only read under rcu_read_lock().
+ * A missing table (no bucket returned) means there is nothing to do.
+ */
+static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
+				    u64 nr, int nmi,
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
+{
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+	struct perf_event *event;
+	struct hlist_node *node;
+	struct hlist_head *head;
+
+	rcu_read_lock();
+	head = find_swevent_head_rcu(swhash, type, event_id);
+	if (!head)
+		goto end;
+
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+		if (perf_swevent_match(event, type, event_id, data, regs))
+			perf_swevent_event(event, nr, nmi, data, regs);
+	}
+end:
+	rcu_read_unlock();
+}
+
+/*
+ * Claim a recursion slot in this CPU's swevent table.  Returns whatever
+ * get_recursion_context() returns; callers in this file treat a negative
+ * value as "do not deliver".
+ */
+int perf_swevent_get_recursion_context(void)
+{
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+
+	return get_recursion_context(swhash->recursion);
+}
+EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
+
+/*
+ * Give back the recursion slot @rctx obtained from
+ * perf_swevent_get_recursion_context() on this CPU.
+ */
+inline void perf_swevent_put_recursion_context(int rctx)
+{
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+
+	put_recursion_context(swhash->recursion, rctx);
+}
+
+/*
+ * Entry point for PERF_TYPE_SOFTWARE events: account @nr occurrences of
+ * @event_id on the current CPU, with @addr recorded in the sample data.
+ *
+ * Preemption is disabled for the whole delivery so the per-cpu table
+ * stays the same one throughout.  When no recursion context can be
+ * obtained (rctx < 0) the event is dropped, but the exit still has to
+ * pass through preempt_enable_notrace(): returning directly would leave
+ * the preempt count raised by one for every dropped event.
+ */
+void __perf_sw_event(u32 event_id, u64 nr, int nmi,
+			    struct pt_regs *regs, u64 addr)
+{
+	struct perf_sample_data data;
+	int rctx;
+
+	preempt_disable_notrace();
+	rctx = perf_swevent_get_recursion_context();
+	if (rctx < 0)
+		goto fail;
+
+	perf_sample_data_init(&data, addr);
+
+	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
+
+	perf_swevent_put_recursion_context(rctx);
+fail:
+	preempt_enable_notrace();
+}
+
+static void perf_swevent_read(struct perf_event *event)
+{	/* Deliberately empty: this ->read callback performs no work. */
+}
+/*
+ * ->add callback of the perf_swevent pmu: link @event into the current
+ * CPU's swevent hash.
+ *
+ * Sampling events get their period bookkeeping re-armed first.  The hw
+ * state is set to 0 when PERF_EF_START is present in @flags and to 1
+ * otherwise.
+ *
+ * Returns 0 on success, or -EINVAL (after a one-time warning) when
+ * find_swevent_head() yields no bucket.
+ */
+static int perf_swevent_add(struct perf_event *event, int flags)
+{
+	struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
+	struct hw_perf_event *hwc = &event->hw;
+	struct hlist_head *head;
+
+	if (is_sampling_event(event)) {
+		hwc->last_period = hwc->sample_period;
+		perf_swevent_set_period(event);
+	}
+
+	hwc->state = !(flags & PERF_EF_START);
+
+	head = find_swevent_head(swhash, event);
+	if (WARN_ON_ONCE(!head))
+		return -EINVAL;
+
+	hlist_add_head_rcu(&event->hlist_entry, head);
+
+	return 0;
+}
+/* ->del callback: unlink @event from its hash chain; @flags is unused. */
+static void perf_swevent_del(struct perf_event *event, int flags)
+{
+	hlist_del_rcu(&event->hlist_entry);
+}
+/* ->start callback: clear the hw state so the event is no longer stopped. */
+static void perf_swevent_start(struct perf_event *event, int flags)
+{
+	event->hw.state = 0;
+}
+/* ->stop callback: mark the event PERF_HES_STOPPED; @flags is unused. */
+static void perf_swevent_stop(struct perf_event *event, int flags)
+{
+	event->hw.state = PERF_HES_STOPPED;
+}
+
+/* Deref the hlist from the update side */
+static inline struct swevent_hlist *
+swevent_hlist_deref(struct swevent_htable *swhash)
+{
+	return rcu_dereference_protected(swhash->swevent_hlist,
+					 lockdep_is_held(&swhash->hlist_mutex));
+}
+/*
+ * Detach @swhash's hash table, if it has one, and free it through
+ * kfree_rcu().  The lockdep annotation in swevent_hlist_deref() expects
+ * hlist_mutex to be held by the caller.
+ */
+static void swevent_hlist_release(struct swevent_htable *swhash)
+{
+	struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
+
+	if (!hlist)
+		return;
+
+	rcu_assign_pointer(swhash->swevent_hlist, NULL);
+	kfree_rcu(hlist, rcu_head);
+}
+/*
+ * Drop one reference on @cpu's swevent hash under hlist_mutex; the table
+ * is released when the count reaches zero.  @event is unused.
+ */
+static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
+{
+	struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
+
+	mutex_lock(&swhash->hlist_mutex);
+
+	if (!--swhash->hlist_refcount)
+		swevent_hlist_release(swhash);
+
+	mutex_unlock(&swhash->hlist_mutex);
+}
+/*
+ * Drop the hash references held for @event: on event->cpu only when the
+ * event is bound to a CPU (cpu != -1), otherwise on every possible CPU.
+ */
+static void swevent_hlist_put(struct perf_event *event)
+{
+	int cpu;
+
+	if (event->cpu != -1) {
+		swevent_hlist_put_cpu(event, event->cpu);
+		return;
+	}
+
+	for_each_possible_cpu(cpu)
+		swevent_hlist_put_cpu(event, cpu);
+}
+/*
+ * Take one reference on @cpu's swevent hash, allocating the table first
+ * when there is none and the CPU is online.  Returns 0, or -ENOMEM when
+ * that allocation fails (no reference is taken in that case).
+ */
+static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
+{
+	struct 
swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + int err = 0; + + mutex_lock(&swhash->hlist_mutex); + + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { + struct swevent_hlist *hlist; + + hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); + if (!hlist) { + err = -ENOMEM; + goto exit; + } + rcu_assign_pointer(swhash->swevent_hlist, hlist); + } + swhash->hlist_refcount++; +exit: + mutex_unlock(&swhash->hlist_mutex); + + return err; +} + +static int swevent_hlist_get(struct perf_event *event) +{ + int err; + int cpu, failed_cpu; + + if (event->cpu != -1) + return swevent_hlist_get_cpu(event, event->cpu); + + get_online_cpus(); + for_each_possible_cpu(cpu) { + err = swevent_hlist_get_cpu(event, cpu); + if (err) { + failed_cpu = cpu; + goto fail; + } + } + put_online_cpus(); + + return 0; +fail: + for_each_possible_cpu(cpu) { + if (cpu == failed_cpu) + break; + swevent_hlist_put_cpu(event, cpu); + } + + put_online_cpus(); + return err; +} + +struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + jump_label_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); +} + +static int perf_swevent_init(struct perf_event *event) +{ + int event_id = event->attr.config; + + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: + return -ENOENT; + + default: + break; + } + + if (event_id >= PERF_COUNT_SW_MAX) + return -ENOENT; + + if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return err; + + jump_label_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + + return 0; +} + +static struct pmu perf_swevent = { + .task_ctx_nr = perf_sw_context, + + .event_init = perf_swevent_init, + .add = perf_swevent_add, + .del = perf_swevent_del, + .start = perf_swevent_start, 
+	.stop		= perf_swevent_stop,
+	.read		= perf_swevent_read,
+};
+
+#ifdef CONFIG_EVENT_TRACING
+/*
+ * Returns 1 when @event has no filter or its filter accepts the raw
+ * record, 0 otherwise.  data->raw is dereferenced unconditionally, so
+ * the caller must have set it (perf_tp_event() does).
+ */
+static int perf_tp_filter_match(struct perf_event *event,
+				struct perf_sample_data *data)
+{
+	void *record = data->raw->data;
+
+	if (likely(!event->filter) || filter_match_preds(event->filter, record))
+		return 1;
+	return 0;
+}
+/*
+ * Returns 1 when a tracepoint hit should be delivered to @event: the
+ * event is not stopped, does not exclude the kernel, and passes its
+ * filter.  @regs is not consulted.
+ */
+static int perf_tp_event_match(struct perf_event *event,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
+{
+	if (event->hw.state & PERF_HES_STOPPED)
+		return 0;
+	/*
+	 * All tracepoints are from kernel-space.
+	 */
+	if (event->attr.exclude_kernel)
+		return 0;
+
+	if (!perf_tp_filter_match(event, data))
+		return 0;
+
+	return 1;
+}
+/*
+ * Deliver a tracepoint hit to every matching event chained on @head.
+ * The @entry_size bytes at @record become the sample's raw payload and
+ * @count is the number of occurrences accounted.
+ *
+ * The recursion context @rctx is released before returning.
+ * NOTE(review): presumably the caller acquired @rctx and holds off RCU
+ * readers for the chain walk -- confirm against the callers.
+ */
+void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
+		   struct pt_regs *regs, struct hlist_head *head, int rctx)
+{
+	struct perf_sample_data data;
+	struct perf_event *event;
+	struct hlist_node *node;
+
+	struct perf_raw_record raw = {
+		.size = entry_size,
+		.data = record,
+	};
+
+	perf_sample_data_init(&data, addr);
+	data.raw = &raw;
+
+	hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
+		if (perf_tp_event_match(event, &data, regs))
+			perf_swevent_event(event, count, 1, &data, regs);
+	}
+
+	perf_swevent_put_recursion_context(rctx);
+}
+EXPORT_SYMBOL_GPL(perf_tp_event);
+/* ->destroy hook installed by perf_tp_event_init(). */
+static void tp_perf_event_destroy(struct perf_event *event)
+{
+	perf_trace_destroy(event);
+}
+/*
+ * ->event_init callback of the tracepoint pmu.  Returns -ENOENT for any
+ * type other than PERF_TYPE_TRACEPOINT, the error from perf_trace_init()
+ * if that fails, and 0 once the destroy hook has been installed.
+ */
+static int perf_tp_event_init(struct perf_event *event)
+{
+	int err;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -ENOENT;
+
+	err = perf_trace_init(event);
+	if (err)
+		return err;
+
+	event->destroy = tp_perf_event_destroy;
+
+	return 0;
+}
+/* Tracepoint pmu: own init/add/del, start/stop/read shared with swevents. */
+static struct pmu perf_tracepoint = {
+	.task_ctx_nr	= perf_sw_context,
+
+	.event_init	= perf_tp_event_init,
+	.add		= perf_trace_add,
+	.del		= perf_trace_del,
+	.start		= perf_swevent_start,
+	.stop		= perf_swevent_stop,
+	.read		= perf_swevent_read,
+};
+
+static inline void perf_tp_register(void)
+{
+	perf_pmu_register(&perf_tracepoint, "tracepoint", 
PERF_TYPE_TRACEPOINT);
+}
+/*
+ * Install a filter on a tracepoint event from the user string at @arg.
+ *
+ * The string is copied in with strndup_user() (bounded by PAGE_SIZE),
+ * handed to ftrace_profile_set_filter() and the copy is freed again
+ * whatever the outcome.
+ *
+ * Returns -EINVAL for non-tracepoint events, the error encoded in the
+ * failed copy, or otherwise the result of ftrace_profile_set_filter().
+ */
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+	char *filter_str;
+	int ret;
+
+	if (event->attr.type != PERF_TYPE_TRACEPOINT)
+		return -EINVAL;
+
+	filter_str = strndup_user(arg, PAGE_SIZE);
+	if (IS_ERR(filter_str))
+		return PTR_ERR(filter_str);
+
+	ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
+
+	kfree(filter_str);
+	return ret;
+}
+/* Release whatever filter is attached to @event. */
+static void perf_event_free_filter(struct perf_event *event)
+{
+	ftrace_profile_free_filter(event);
+}
+
+#else
+/* Stubs used when CONFIG_EVENT_TRACING is not set. */
+static inline void perf_tp_register(void)
+{
+}
+
+static int perf_event_set_filter(struct perf_event *event, void __user *arg)
+{
+	return -ENOENT;
+}
+
+static void perf_event_free_filter(struct perf_event *event)
+{
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+void perf_bp_event(struct perf_event *bp, void *data)
+{
+	struct perf_sample_data sample;
+	struct pt_regs *regs = data;	/* @data is used as the register set */
+
+	perf_sample_data_init(&sample, bp->attr.bp_addr);
+
+	/* Deliver one hit unless the event is stopped or excluded. */
+	if (!bp->hw.state && !perf_exclude_event(bp, regs))
+		perf_swevent_event(bp, 1, 1, &sample, regs);
+}
+#endif
+
+/*
+ * hrtimer based swevent callback
+ */
+/*
+ * Timer handler for the event embedding @hrtimer.
+ *
+ * An event that is not ACTIVE stops the timer immediately.  Otherwise
+ * the count is refreshed through the pmu's ->read and, when interrupt
+ * registers are available and the event is not excluded (including the
+ * exclude_idle case for pid 0), an overflow is reported.  A non-zero
+ * return from perf_event_overflow() stops the timer; in every other case
+ * it is re-armed.  The timer is forwarded by the sample period, with a
+ * floor of 10000 ns, whether or not it is going to restart.
+ */
+static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
+{
+	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
+	struct pt_regs *regs;
+	struct perf_event *event;
+	u64 period;
+
+	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+	if (event->state != PERF_EVENT_STATE_ACTIVE)
+		return HRTIMER_NORESTART;
+
+	event->pmu->read(event);
+
+	perf_sample_data_init(&data, 0);
+	data.period = event->hw.last_period;
+	regs = get_irq_regs();
+
+	if (regs && !perf_exclude_event(event, regs)) {
+		if (!(event->attr.exclude_idle && current->pid == 0))
+			if (perf_event_overflow(event, 0, &data, regs))
+				ret = HRTIMER_NORESTART;
+	}
+
+	period = max_t(u64, 10000, event->hw.sample_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
+
+	return ret;
+}
+
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	s64 period;
+	/* Arm the sampling timer of @event; no-op for non-sampling events. */
+	if (!is_sampling_event(event))
+		return;
+	/*
+	 * A non-zero period_left is the time saved by the cancel helper:
+	 * resume with it (a negative value is replaced by 10000) and clear
+	 * it.  Otherwise start a full period, with a floor of 10000 ns.
+	 */
+	period = local64_read(&hwc->period_left);
+	if (period) {
+		if (period < 0)
+			period = 10000;
+
+		local64_set(&hwc->period_left, 0);
+	} else {
+		period = max_t(u64, 10000, hwc->sample_period);
+	}
+	__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL_PINNED, 0);
+}
+/*
+ * Stop the sampling timer of @event.  The time still remaining is stored
+ * in period_left before the timer is cancelled, so that a later start
+ * can resume from it.  No-op for non-sampling events.
+ */
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (is_sampling_event(event)) {
+		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+		local64_set(&hwc->period_left, ktime_to_ns(remaining));
+
+		hrtimer_cancel(&hwc->hrtimer);
+	}
+}
+/*
+ * One-time timer setup for a sampling event: a CLOCK_MONOTONIC,
+ * relative-mode hrtimer whose handler is perf_swevent_hrtimer().
+ * No-op for non-sampling events.
+ */
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (!is_sampling_event(event))
+		return;
+
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+
+	/*
+	 * Since hrtimers have a fixed rate, we can do a static freq->period
+	 * mapping and avoid the whole period adjust feedback stuff.
+	 */
+	if (event->attr.freq) {
+		long freq = event->attr.sample_freq;
+		/*
+		 * NOTE(review): freq == 0 would divide by zero here;
+		 * presumably ruled out by the is_sampling_event() check at
+		 * the top of this function -- confirm.
+		 */
+		event->attr.sample_period = NSEC_PER_SEC / freq;
+		hwc->sample_period = event->attr.sample_period;
+		local64_set(&hwc->period_left, hwc->sample_period);
+		event->attr.freq = 0;	/* from here on a fixed-period event */
+	}
+}
+
+/*
+ * Software event: cpu wall time clock
+ */
+/*
+ * Fold the local_clock() time elapsed since the previous snapshot into
+ * event->count and make "now" the new snapshot.
+ */
+static void cpu_clock_event_update(struct perf_event *event)
+{
+	s64 prev;
+	u64 now;
+
+	now = local_clock();
+	prev = local64_xchg(&event->hw.prev_count, now);
+	local64_add(now - prev, &event->count);
+}
+/* ->start: snapshot the clock, then arm the sampling timer. */
+static void cpu_clock_event_start(struct perf_event *event, int flags)
+{
+	local64_set(&event->hw.prev_count, local_clock());
+	perf_swevent_start_hrtimer(event);
+}
+/* ->stop: cancel the sampling timer, then account the elapsed time. */
+static void cpu_clock_event_stop(struct perf_event *event, int flags)
+{
+	perf_swevent_cancel_hrtimer(event);
+	cpu_clock_event_update(event);
+}
+/* ->add: start the event only when PERF_EF_START is set; always 0. */
+static int cpu_clock_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		cpu_clock_event_start(event, flags);
+
+	return 0;
+}
+/* ->del: same as stopping the event. */
+static void cpu_clock_event_del(struct perf_event *event, int flags)
+{
+	cpu_clock_event_stop(event, flags);
+}
+/* ->read: bring event->count up to date. */
+static void cpu_clock_event_read(struct perf_event *event)
+{
+	cpu_clock_event_update(event);
+}
+/*
+ * ->event_init: claims only PERF_TYPE_SOFTWARE events whose config is
+ * PERF_COUNT_SW_CPU_CLOCK (-ENOENT otherwise) and sets up their timer.
+ */
+static int cpu_clock_event_init(struct perf_event *event)
+{
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
+		return -ENOENT;
+
+	perf_swevent_init_hrtimer(event);
+
+	return 0;
+}
+
+static struct pmu perf_cpu_clock = {
+	.task_ctx_nr	= perf_sw_context,
+
+	.event_init	= cpu_clock_event_init,
+	.add		= cpu_clock_event_add,
+	.del		= cpu_clock_event_del,
+	.start		= cpu_clock_event_start,
+	.stop		= cpu_clock_event_stop,
+	.read		= cpu_clock_event_read,
+};
+
+/*
+ * Software event: task time clock
+ */
+/*
+ * Fold the time between the previous snapshot and @now into
+ * event->count and make @now the new snapshot.
+ */
+static void task_clock_event_update(struct perf_event *event, u64 now)
+{
+	u64 prev;
+	s64 delta;
+
+	prev = local64_xchg(&event->hw.prev_count, now);
+	delta = now - prev;
+	local64_add(delta, 
&event->count);
+}
+/* ->start: snapshot the context time, then arm the sampling timer. */
+static void task_clock_event_start(struct perf_event *event, int flags)
+{
+	local64_set(&event->hw.prev_count, event->ctx->time);
+	perf_swevent_start_hrtimer(event);
+}
+/*
+ * ->stop: cancel the sampling timer, then account the context time
+ * elapsed since the last snapshot.  @flags is not examined.
+ */
+static void task_clock_event_stop(struct perf_event *event, int flags)
+{
+	perf_swevent_cancel_hrtimer(event);
+	task_clock_event_update(event, event->ctx->time);
+}
+/* ->add: start the event only when PERF_EF_START is set; always 0. */
+static int task_clock_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		task_clock_event_start(event, flags);
+
+	return 0;
+}
+/* ->del: stop the event (the flag passed on is ignored by stop). */
+static void task_clock_event_del(struct perf_event *event, int flags)
+{
+	task_clock_event_stop(event, PERF_EF_UPDATE);
+}
+/*
+ * ->read: account up to the present moment.  The context time is
+ * extrapolated by the perf_clock() time elapsed since ctx->timestamp
+ * instead of being read as stored.
+ */
+static void task_clock_event_read(struct perf_event *event)
+{
+	u64 now = perf_clock();
+	u64 delta = now - event->ctx->timestamp;
+	u64 time = event->ctx->time + delta;
+
+	task_clock_event_update(event, time);
+}
+/*
+ * ->event_init: claims only PERF_TYPE_SOFTWARE events whose config is
+ * PERF_COUNT_SW_TASK_CLOCK (-ENOENT otherwise) and sets up their timer.
+ */
+static int task_clock_event_init(struct perf_event *event)
+{
+	if (event->attr.type != PERF_TYPE_SOFTWARE)
+		return -ENOENT;
+
+	if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
+		return -ENOENT;
+
+	perf_swevent_init_hrtimer(event);
+
+	return 0;
+}
+
+static struct pmu perf_task_clock = {
+	.task_ctx_nr	= perf_sw_context,
+
+	.event_init	= task_clock_event_init,
+	.add		= task_clock_event_add,
+	.del		= task_clock_event_del,
+	.start		= task_clock_event_start,
+	.stop		= task_clock_event_stop,
+	.read		= task_clock_event_read,
+};
+/* Do-nothing pmu callback with a void return. */
+static void perf_pmu_nop_void(struct pmu *pmu)
+{
+}
+/* Do-nothing pmu callback that reports success (0). */
+static int perf_pmu_nop_int(struct pmu *pmu)
+{
+	return 0;
+}
+/* Transaction start implemented as: disable the pmu. */
+static void perf_pmu_start_txn(struct pmu *pmu)
+{
+	perf_pmu_disable(pmu);
+}
+/* Transaction commit implemented as: re-enable the pmu; always 0. */
+static int perf_pmu_commit_txn(struct pmu *pmu)
+{
+	perf_pmu_enable(pmu);
+	return 0;
+}
+/* Transaction cancel implemented as: re-enable the pmu. */
+static void perf_pmu_cancel_txn(struct pmu *pmu)
+{
+	perf_pmu_enable(pmu);
+}
+
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */ +static void *find_pmu_context(int ctxn) +{ + struct pmu *pmu; + + if (ctxn < 0) + return NULL; + + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->task_ctx_nr == ctxn) + return pmu->pmu_cpu_context; + } + + return NULL; +} + +static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + + if (cpuctx->active_pmu == old_pmu) + cpuctx->active_pmu = pmu; + } +} + +static void free_pmu_context(struct pmu *pmu) +{ + struct pmu *i; + + mutex_lock(&pmus_lock); + /* + * Like a real lame refcount. + */ + list_for_each_entry(i, &pmus, entry) { + if (i->pmu_cpu_context == pmu->pmu_cpu_context) { + update_pmu_context(i, pmu); + goto out; + } + } + + free_percpu(pmu->pmu_cpu_context); +out: + mutex_unlock(&pmus_lock); +} +static struct idr pmu_idr; + +static ssize_t +type_show(struct device *dev, struct device_attribute *attr, char *page) +{ + struct pmu *pmu = dev_get_drvdata(dev); + + return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); +} + +static struct device_attribute pmu_dev_attrs[] = { + __ATTR_RO(type), + __ATTR_NULL, +}; + +static int pmu_bus_running; +static struct bus_type pmu_bus = { + .name = "event_source", + .dev_attrs = pmu_dev_attrs, +}; + +static void pmu_dev_release(struct device *dev) +{ + kfree(dev); +} + +static int pmu_dev_alloc(struct pmu *pmu) +{ + int ret = -ENOMEM; + + pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL); + if (!pmu->dev) + goto out; + + device_initialize(pmu->dev); + ret = dev_set_name(pmu->dev, "%s", pmu->name); + if (ret) + goto free_dev; + + dev_set_drvdata(pmu->dev, pmu); + pmu->dev->bus = &pmu_bus; + pmu->dev->release = pmu_dev_release; + ret = device_add(pmu->dev); + if (ret) + goto free_dev; + +out: + return ret; + +free_dev: + put_device(pmu->dev); + goto out; +} + +static struct lock_class_key cpuctx_mutex; + +int perf_pmu_register(struct pmu *pmu, char *name, int type) 
+{ + int cpu, ret; + + mutex_lock(&pmus_lock); + ret = -ENOMEM; + pmu->pmu_disable_count = alloc_percpu(int); + if (!pmu->pmu_disable_count) + goto unlock; + + pmu->type = -1; + if (!name) + goto skip_type; + pmu->name = name; + + if (type < 0) { + int err = idr_pre_get(&pmu_idr, GFP_KERNEL); + if (!err) + goto free_pdc; + + err = idr_get_new_above(&pmu_idr, pmu, PERF_TYPE_MAX, &type); + if (err) { + ret = err; + goto free_pdc; + } + } + pmu->type = type; + + if (pmu_bus_running) { + ret = pmu_dev_alloc(pmu); + if (ret) + goto free_idr; + } + +skip_type: + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); + if (pmu->pmu_cpu_context) + goto got_cpu_context; + + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); + if (!pmu->pmu_cpu_context) + goto free_dev; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx); + lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); + cpuctx->ctx.type = cpu_context; + cpuctx->ctx.pmu = pmu; + cpuctx->jiffies_interval = 1; + INIT_LIST_HEAD(&cpuctx->rotation_list); + cpuctx->active_pmu = pmu; + } + +got_cpu_context: + if (!pmu->start_txn) { + if (pmu->pmu_enable) { + /* + * If we have pmu_enable/pmu_disable calls, install + * transaction stubs that use that to try and batch + * hardware accesses. 
+ */ + pmu->start_txn = perf_pmu_start_txn; + pmu->commit_txn = perf_pmu_commit_txn; + pmu->cancel_txn = perf_pmu_cancel_txn; + } else { + pmu->start_txn = perf_pmu_nop_void; + pmu->commit_txn = perf_pmu_nop_int; + pmu->cancel_txn = perf_pmu_nop_void; + } + } + + if (!pmu->pmu_enable) { + pmu->pmu_enable = perf_pmu_nop_void; + pmu->pmu_disable = perf_pmu_nop_void; + } + + list_add_rcu(&pmu->entry, &pmus); + ret = 0; +unlock: + mutex_unlock(&pmus_lock); + + return ret; + +free_dev: + device_del(pmu->dev); + put_device(pmu->dev); + +free_idr: + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + +free_pdc: + free_percpu(pmu->pmu_disable_count); + goto unlock; +} + +void perf_pmu_unregister(struct pmu *pmu) +{ + mutex_lock(&pmus_lock); + list_del_rcu(&pmu->entry); + mutex_unlock(&pmus_lock); + + /* + * We dereference the pmu list under both SRCU and regular RCU, so + * synchronize against both of those. + */ + synchronize_srcu(&pmus_srcu); + synchronize_rcu(); + + free_percpu(pmu->pmu_disable_count); + if (pmu->type >= PERF_TYPE_MAX) + idr_remove(&pmu_idr, pmu->type); + device_del(pmu->dev); + put_device(pmu->dev); + free_pmu_context(pmu); +} + +struct pmu *perf_init_event(struct perf_event *event) +{ + struct pmu *pmu = NULL; + int idx; + int ret; + + idx = srcu_read_lock(&pmus_srcu); + + rcu_read_lock(); + pmu = idr_find(&pmu_idr, event->attr.type); + rcu_read_unlock(); + if (pmu) { + ret = pmu->event_init(event); + if (ret) + pmu = ERR_PTR(ret); + goto unlock; + } + + list_for_each_entry_rcu(pmu, &pmus, entry) { + ret = pmu->event_init(event); + if (!ret) + goto unlock; + + if (ret != -ENOENT) { + pmu = ERR_PTR(ret); + goto unlock; + } + } + pmu = ERR_PTR(-ENOENT); +unlock: + srcu_read_unlock(&pmus_srcu, idx); + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + struct perf_event *group_leader, + struct 
perf_event *parent_event, + perf_overflow_handler_t overflow_handler) +{ + struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + if ((unsigned)cpu >= nr_cpu_ids) { + if (!task || cpu != -1) + return ERR_PTR(-EINVAL); + } + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + init_irq_work(&event->pending, perf_pending_event); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (task) { + event->attach_state = PERF_ATTACH_TASK; +#ifdef CONFIG_HAVE_HW_BREAKPOINT + /* + * hw_breakpoint is a bit difficult here.. 
+ */ + if (attr->type == PERF_TYPE_BREAKPOINT) + event->hw.bp_target = task; +#endif + } + + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; + + event->overflow_handler = overflow_handler; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + local64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + pmu = perf_init_event(event); + +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + if (event->attach_state & PERF_ATTACH_TASK) + jump_label_inc(&perf_sched_events); + if (event->attr.mmap || event->attr.mmap_data) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) { + free_event(event); + return ERR_PTR(err); + } + } + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. 
+ */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +static int +perf_event_set_output(struct perf_event *event, struct perf_event *output_event) +{ + struct perf_buffer *buffer = NULL, *old_buffer = NULL; + int ret = -EINVAL; + + if (!output_event) + goto set; + + /* don't allow circular references */ + if (event == output_event) + goto out; + + /* + * Don't allow cross-cpu buffers + */ + if (output_event->cpu != event->cpu) + goto out; + + /* + * If its not a per-cpu buffer, it must be the same task. 
+ */ + if (output_event->cpu == -1 && output_event->ctx != event->ctx) + goto out; + +set: + mutex_lock(&event->mmap_mutex); + /* Can't redirect output if we've got an active mmap() */ + if (atomic_read(&event->mmap_count)) + goto unlock; + + if (output_event) { + /* get the buffer we want to redirect to */ + buffer = perf_buffer_get(output_event); + if (!buffer) + goto unlock; + } + + old_buffer = event->buffer; + rcu_assign_pointer(event->buffer, buffer); + ret = 0; +unlock: + mutex_unlock(&event->mmap_mutex); + + if (old_buffer) + perf_buffer_put(old_buffer); +out: + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event *event, *sibling; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + struct task_struct *task = NULL; + struct pmu *pmu; + int event_fd; + int move_group = 0; + int fput_needed = 0; + int err; + + /* for future expandability... */ + if (flags & ~PERF_FLAG_ALL) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * In cgroup mode, the pid argument is used to pass the fd + * opened to the cgroup directory in cgroupfs. The cpu argument + * designates the cpu on which to monitor threads from that + * cgroup. 
+ */ + if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1)) + return -EINVAL; + + event_fd = get_unused_fd_flags(O_RDWR); + if (event_fd < 0) + return event_fd; + + if (group_fd != -1) { + group_leader = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_leader)) { + err = PTR_ERR(group_leader); + goto err_fd; + } + group_file = group_leader->filp; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) + group_leader = NULL; + } + + if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) { + task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto err_group_fd; + } + } + + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_task; + } + + if (flags & PERF_FLAG_PID_CGROUP) { + err = perf_cgroup_connect(pid, event, &attr, group_leader); + if (err) + goto err_alloc; + /* + * one more event: + * - that has cgroup constraint on event->cpu + * - that may need work on context switch + */ + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + jump_label_inc(&perf_sched_events); + } + + /* + * Special case software events and allow them to be part of + * any hardware group. + */ + pmu = event->pmu; + + if (group_leader && + (is_software_event(event) != is_software_event(group_leader))) { + if (is_software_event(event)) { + /* + * If event and group_leader are not both a software + * event, and event is, then group leader is not. + * + * Allow the addition of software events to !software + * groups, this is safe because software events never + * fail to schedule. + */ + pmu = group_leader->pmu; + } else if (is_software_event(group_leader) && + (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. 
+ */ + move_group = 1; + } + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_alloc; + } + + if (task) { + put_task_struct(task); + task = NULL; + } + + /* + * Look up the group leader (we will attach this event to it): + */ + if (group_leader) { + err = -EINVAL; + + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (move_group) { + if (group_leader->ctx->type != ctx->type) + goto err_context; + } else { + if (group_leader->ctx != ctx) + goto err_context; + } + + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_context; + } + + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_context; + } + + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); + goto err_context; + } + + if (move_group) { + struct perf_event_context *gctx = group_leader->ctx; + + mutex_lock(&gctx->mutex); + perf_remove_from_context(group_leader); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_remove_from_context(sibling); + put_ctx(gctx); + } + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + + if (move_group) { + perf_install_in_context(ctx, group_leader, cpu); + get_ctx(ctx); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_install_in_context(ctx, sibling, cpu); + get_ctx(ctx); + } + } + + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + perf_unpin_context(ctx); + mutex_unlock(&ctx->mutex); + + event->owner = 
current; + + mutex_lock(&current->perf_event_mutex); + list_add_tail(&event->owner_entry, &current->perf_event_list); + mutex_unlock(&current->perf_event_mutex); + + /* + * Precalculate sample_data sizes + */ + perf_event__header_size(event); + perf_event__id_header_size(event); + + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ + fput_light(group_file, fput_needed); + fd_install(event_fd, event_file); + return event_fd; + +err_context: + perf_unpin_context(ctx); + put_ctx(ctx); +err_alloc: + free_event(event); +err_task: + if (task) + put_task_struct(task); +err_group_fd: + fput_light(group_file, fput_needed); +err_fd: + put_unused_fd(event_fd); + return err; +} + +/** + * perf_event_create_kernel_counter + * + * @attr: attributes of the counter to create + * @cpu: cpu in which the counter is bound + * @task: task to profile (NULL for percpu) + */ +struct perf_event * +perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, + struct task_struct *task, + perf_overflow_handler_t overflow_handler) +{ + struct perf_event_context *ctx; + struct perf_event *event; + int err; + + /* + * Get the target context (task or percpu): + */ + + event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err; + } + + ctx = find_get_context(event->pmu, task, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_free; + } + + event->filp = NULL; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + perf_unpin_context(ctx); + mutex_unlock(&ctx->mutex); + + return event; + +err_free: + free_event(event); +err: + return ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) 
+{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = perf_event_count(child_event); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->child_count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + if (child_event->parent) { + raw_spin_lock_irq(&child_ctx->lock); + perf_group_detach(child_event); + raw_spin_unlock_irq(&child_ctx->lock); + } + + perf_remove_from_context(child_event); + + /* + * It can happen that the parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped. + */ + if (child_event->parent) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +static void perf_event_exit_task_context(struct task_struct *child, int ctxn) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp[ctxn])) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. 
+ */ + child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + task_ctx_sched_out(child_ctx, EVENT_ALL); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + raw_spin_lock(&child_ctx->lock); + child->perf_event_ctxp[ctxn] = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + update_context_time(child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock(&child_ctx->mutex); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->pinned_groups) || + !list_empty(&child_ctx->flexible_groups)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * When a child task exits, feed back event values to parent events. 
+ */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *event, *tmp; + int ctxn; + + mutex_lock(&child->perf_event_mutex); + list_for_each_entry_safe(event, tmp, &child->perf_event_list, + owner_entry) { + list_del_init(&event->owner_entry); + + /* + * Ensure the list deletion is visible before we clear + * the owner, closes a race against perf_release() where + * we need to serialize on the owner->perf_event_mutex. + */ + smp_wmb(); + event->owner = NULL; + } + mutex_unlock(&child->perf_event_mutex); + + for_each_task_context_nr(ctxn) + perf_event_exit_task_context(child, ctxn); +} + +static void perf_free_event(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + return; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + perf_group_detach(event); + list_del_event(event, ctx); + free_event(event); +} + +/* + * free an unexposed, unused context as created by inheritance by + * perf_event_init_task below, used by fork() in case of fail. 
+ */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event, *tmp; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, + group_entry) + perf_free_event(event, ctx); + + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); + + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); + } +} + +void perf_event_delayed_put(struct task_struct *task) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + unsigned long flags; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, + child, + group_leader, parent_event, + NULL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. 
+ */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + local64_set(&hwc->period_left, sample_period); + } + + child_event->ctx = child_ctx; + child_event->overflow_handler = parent_event->overflow_handler; + + /* + * Precalculate sample_data sizes + */ + perf_event__header_size(child_event); + perf_event__id_header_size(child_event); + + /* + * Link it up in the child's context: + */ + raw_spin_lock_irqsave(&child_ctx->lock, flags); + add_event_to_ctx(child_event, child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, 
child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static int +inherit_task_group(struct perf_event *event, struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, int ctxn, + int *inherited_all) +{ + int ret; + struct perf_event_context *child_ctx; + + if (!event->attr.inherit) { + *inherited_all = 0; + return 0; + } + + child_ctx = child->perf_event_ctxp[ctxn]; + if (!child_ctx) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ + + child_ctx = alloc_perf_context(event->pmu, child); + if (!child_ctx) + return -ENOMEM; + + child->perf_event_ctxp[ctxn] = child_ctx; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + + if (ret) + *inherited_all = 0; + + return ret; +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_context(struct task_struct *child, int ctxn) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + unsigned long flags; + int ret = 0; + + if (likely(!parent->perf_event_ctxp[ctxn])) + return 0; + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent, ctxn); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. 
+ */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); + if (ret) + break; + } + + /* + * We can't hold ctx->lock when iterating the ->flexible_group list due + * to allocations, but we need to prevent rotation because + * rotate_ctx() will change the list from interrupt context. + */ + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 1; + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + + list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); + if (ret) + break; + } + + raw_spin_lock_irqsave(&parent_ctx->lock, flags); + parent_ctx->rotate_disable = 0; + + child_ctx = child->perf_event_ctxp[ctxn]; + + if (child_ctx && inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * + * Note that if the parent is a clone, the holding of + * parent_ctx->lock avoids it from being uncloned. 
+ */ + cloned_ctx = parent_ctx->parent_ctx; + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + raw_spin_unlock_irqrestore(&parent_ctx->lock, flags); + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + put_ctx(parent_ctx); + + return ret; +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + int ctxn, ret; + + memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp)); + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + for_each_task_context_nr(ctxn) { + ret = perf_event_init_context(child, ctxn); + if (ret) + return ret; + } + + return 0; +} + +static void __init perf_event_init_all_cpus(void) +{ + struct swevent_htable *swhash; + int cpu; + + for_each_possible_cpu(cpu) { + swhash = &per_cpu(swevent_htable, cpu); + mutex_init(&swhash->hlist_mutex); + INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); + } +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + if (swhash->hlist_refcount > 0) { + struct swevent_hlist *hlist; + + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); + WARN_ON(!hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); + } + mutex_unlock(&swhash->hlist_mutex); +} + +#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC +static void perf_pmu_rotate_stop(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + WARN_ON(!irqs_disabled()); + + list_del_init(&cpuctx->rotation_list); +} + +static void __perf_event_exit_context(void *__info) +{ + struct perf_event_context *ctx = __info; + struct perf_event *event, *tmp; + + perf_pmu_rotate_stop(ctx->pmu); + 
+ list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) + __perf_remove_from_context(event); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) + __perf_remove_from_context(event); +} + +static void perf_event_exit_cpu_context(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + mutex_unlock(&ctx->mutex); + } + srcu_read_unlock(&pmus_srcu, idx); +} + +static void perf_event_exit_cpu(int cpu) +{ + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + swevent_hlist_release(swhash); + mutex_unlock(&swhash->hlist_mutex); + + perf_event_exit_cpu_context(cpu); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int +perf_reboot(struct notifier_block *notifier, unsigned long val, void *v) +{ + int cpu; + + for_each_online_cpu(cpu) + perf_event_exit_cpu(cpu); + + return NOTIFY_OK; +} + +/* + * Run the perf reboot notifier at the very last possible moment so that + * the generic watchdog code runs as long as possible. 
+ */ +static struct notifier_block perf_reboot_notifier = { + .notifier_call = perf_reboot, + .priority = INT_MIN, +}; + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action & ~CPU_TASKS_FROZEN) { + + case CPU_UP_PREPARE: + case CPU_DOWN_FAILED: + perf_event_init_cpu(cpu); + break; + + case CPU_UP_CANCELED: + case CPU_DOWN_PREPARE: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +void __init perf_event_init(void) +{ + int ret; + + idr_init(&pmu_idr); + + perf_event_init_all_cpus(); + init_srcu_struct(&pmus_srcu); + perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE); + perf_pmu_register(&perf_cpu_clock, NULL, -1); + perf_pmu_register(&perf_task_clock, NULL, -1); + perf_tp_register(); + perf_cpu_notifier(perf_cpu_notify); + register_reboot_notifier(&perf_reboot_notifier); + + ret = init_hw_breakpoint(); + WARN(ret, "hw_breakpoint initialization failed with: %d", ret); +} + +static int __init perf_event_sysfs_init(void) +{ + struct pmu *pmu; + int ret; + + mutex_lock(&pmus_lock); + + ret = bus_register(&pmu_bus); + if (ret) + goto unlock; + + list_for_each_entry(pmu, &pmus, entry) { + if (!pmu->name || pmu->type < 0) + continue; + + ret = pmu_dev_alloc(pmu); + WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret); + } + pmu_bus_running = 1; + ret = 0; + +unlock: + mutex_unlock(&pmus_lock); + + return ret; +} +device_initcall(perf_event_sysfs_init); + +#ifdef CONFIG_CGROUP_PERF +static struct cgroup_subsys_state *perf_cgroup_create( + struct cgroup_subsys *ss, struct cgroup *cont) +{ + struct perf_cgroup *jc; + + jc = kzalloc(sizeof(*jc), GFP_KERNEL); + if (!jc) + return ERR_PTR(-ENOMEM); + + jc->info = alloc_percpu(struct perf_cgroup_info); + if (!jc->info) { + kfree(jc); + return ERR_PTR(-ENOMEM); + } + + return &jc->css; +} + +static void perf_cgroup_destroy(struct cgroup_subsys *ss, + struct 
cgroup *cont) +{ + struct perf_cgroup *jc; + jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + struct perf_cgroup, css); + free_percpu(jc->info); + kfree(jc); +} + +static int __perf_cgroup_move(void *info) +{ + struct task_struct *task = info; + perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + return 0; +} + +static void +perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) +{ + task_function_call(task, __perf_cgroup_move, task); +} + +static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. + */ + if (!(task->flags & PF_EXITING)) + return; + + perf_cgroup_attach_task(cgrp, task); +} + +struct cgroup_subsys perf_subsys = { + .name = "perf_event", + .subsys_id = perf_subsys_id, + .create = perf_cgroup_create, + .destroy = perf_cgroup_destroy, + .exit = perf_cgroup_exit, + .attach_task = perf_cgroup_attach_task, +}; +#endif /* CONFIG_CGROUP_PERF */ diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c new file mode 100644 index 00000000..086adf25 --- /dev/null +++ b/kernel/events/hw_breakpoint.c @@ -0,0 +1,659 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2007 Alan Stern + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) 2009, Frederic Weisbecker + * + * Thanks to Ingo Molnar for his many suggestions. + * + * Authors: Alan Stern + * K.Prasad + * Frederic Weisbecker + */ + +/* + * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility, + * using the CPU's debug registers. + * This file contains the arch-independent routines. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * Constraints data + */ + +/* Number of pinned cpu breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]); + +/* Number of pinned task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]); + +/* Number of non-pinned cpu/task breakpoints in a cpu */ +static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]); + +static int nr_slots[TYPE_MAX]; + +/* Keep track of the breakpoints attached to tasks */ +static LIST_HEAD(bp_task_head); + +static int constraints_initialized; + +/* Gather the number of total pinned and un-pinned bp in a cpuset */ +struct bp_busy_slots { + unsigned int pinned; + unsigned int flexible; +}; + +/* Serialize accesses to the above constraints */ +static DEFINE_MUTEX(nr_bp_mutex); + +__weak int hw_breakpoint_weight(struct perf_event *bp) +{ + return 1; +} + +static inline enum bp_type_idx find_slot_idx(struct perf_event *bp) +{ + if (bp->attr.bp_type & HW_BREAKPOINT_RW) + return TYPE_DATA; + + return TYPE_INST; +} + +/* + * Report the maximum number of pinned breakpoints a task + * have in this cpu + */ +static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) +{ + 
int i; + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); + + for (i = nr_slots[type] - 1; i >= 0; i--) { + if (tsk_pinned[i] > 0) + return i + 1; + } + + return 0; +} + +/* + * Count the number of breakpoints of the same type and same task. + * The given event must be not on the list. + */ +static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type) +{ + struct task_struct *tsk = bp->hw.bp_target; + struct perf_event *iter; + int count = 0; + + list_for_each_entry(iter, &bp_task_head, hw.bp_list) { + if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type) + count += hw_breakpoint_weight(iter); + } + + return count; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). + */ +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, + enum bp_type_idx type) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->hw.bp_target; + + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu, type); + else + slots->pinned += task_bp_pinned(bp, type); + slots->flexible = per_cpu(nr_bp_flexible[type], cpu); + + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned[type], cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu, type); + else + nr += task_bp_pinned(bp, type); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible[type], cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * For now, continue to consider flexible as pinned, until we can + * ensure no flexible event can ever be scheduled before a pinned event + * in a same cpu. 
+ */ +static void +fetch_this_slot(struct bp_busy_slots *slots, int weight) +{ + slots->pinned += weight; +} + +/* + * Add (enable) or remove (!enable) a pinned breakpoint of the given + * weight for the given task in the constraint table of one cpu. + */ +static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable, + enum bp_type_idx type, int weight) +{ + unsigned int *tsk_pinned; + int old_count = 0; + int old_idx = 0; + int idx = 0; + + old_count = task_bp_pinned(bp, type); + old_idx = old_count - 1; + idx = old_idx + weight; + + /* tsk_pinned[n] is the number of tasks having n+1 breakpoints */ + tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu); + if (enable) { + tsk_pinned[idx]++; + if (old_count > 0) + tsk_pinned[old_idx]--; + } else { + tsk_pinned[idx]--; + if (old_count > 0) + tsk_pinned[old_idx]++; + } +} + +/* + * Add/remove the given breakpoint in our constraint table + */ +static void +toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, + int weight) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->hw.bp_target; + + /* Pinned counter cpu profiling */ + if (!tsk) { + + if (enable) + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight; + else + per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight; + return; + } + + /* Pinned counter task profiling */ + + /* Unlink first so task_bp_pinned() does not count this breakpoint */ + if (!enable) + list_del(&bp->hw.bp_list); + + if (cpu >= 0) { + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } else { + for_each_online_cpu(cpu) + toggle_bp_task_slot(bp, cpu, enable, type, weight); + } + + if (enable) + list_add_tail(&bp->hw.bp_list, &bp_task_head); +} + +/* + * Function to perform processor-specific cleanup during unregistration + */ +__weak void arch_unregister_hw_breakpoint(struct perf_event *bp) +{ + /* + * A weak stub function here for those archs that don't define + * it inside arch/.../kernel/hw_breakpoint.c + */ +} + +/* + * Constraints to check before allowing this new breakpoint counter: + * + * == Non-pinned counter == (Considered as pinned for now) + * + * - If attached to a single cpu, 
check: + * + * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM + * + * -> If there are already non-pinned counters in this cpu, it means + * there is already a free slot for them. + * Otherwise, we check that the maximum number of per task + * breakpoints (for this cpu) plus the number of per cpu breakpoints + * (for this cpu) doesn't cover every register. + * + * - If attached to every cpu, check: + * + * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM + * + * -> This is roughly the same, except we check the number of per cpu + * bp for every cpu and we keep the max one. Same for the per task + * breakpoints. + * + * + * == Pinned counter == + * + * - If attached to a single cpu, check: + * + * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM + * + * -> Same checks as before. But now the nr_bp_flexible, if any, must keep + * one register at least (or they will never be fed). + * + * - If attached to every cpu, check: + * + * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM + */ +static int __reserve_bp_slot(struct perf_event *bp) +{ + struct bp_busy_slots slots = {0}; + enum bp_type_idx type; + int weight; + + /* We couldn't initialize breakpoint constraints on boot */ + if (!constraints_initialized) + return -ENOMEM; + + /* Basic checks */ + if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY || + bp->attr.bp_type == HW_BREAKPOINT_INVALID) + return -EINVAL; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + + fetch_bp_busy_slots(&slots, bp, type); + /* + * Simulate the addition of this breakpoint to the constraints + * and see the result. 
+ */ + fetch_this_slot(&slots, weight); + + /* Flexible counters need to keep at least one slot */ + if (slots.pinned + (!!slots.flexible) > nr_slots[type]) + return -ENOSPC; + + toggle_bp_slot(bp, true, type, weight); + + return 0; +} + +/* Reserve a slot for @bp with nr_bp_mutex held; returns 0 or a negative errno */ +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + + mutex_unlock(&nr_bp_mutex); + + return ret; +} + +static void __release_bp_slot(struct perf_event *bp) +{ + enum bp_type_idx type; + int weight; + + type = find_slot_idx(bp); + weight = hw_breakpoint_weight(bp); + toggle_bp_slot(bp, false, type, weight); +} + +/* Run the arch cleanup hook and give back the slot of @bp, under nr_bp_mutex */ +void release_bp_slot(struct perf_event *bp) +{ + mutex_lock(&nr_bp_mutex); + + arch_unregister_hw_breakpoint(bp); + __release_bp_slot(bp); + + mutex_unlock(&nr_bp_mutex); +} + +/* + * Allow the kernel debugger to reserve and release breakpoint slots + * without taking nr_bp_mutex, through the dbg_* variants of the reserve + * and release functions. Both variants return -1 without touching the + * constraint tables when nr_bp_mutex is currently locked. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} + +static int validate_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = arch_validate_hwbkpt_settings(bp); + if (ret) + return ret; + + if (arch_check_bp_in_kernelspace(bp)) { + if (bp->attr.exclude_kernel) + return -EINVAL; + /* + * Don't let unprivileged users set a breakpoint in the trap + * path to avoid trap recursion attacks. 
+ */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + } + + return 0; +} + +/* Reserve a slot for @bp, then validate it; the slot is given back on failure */ +int register_perf_hw_breakpoint(struct perf_event *bp) +{ + int ret; + + ret = reserve_bp_slot(bp); + if (ret) + return ret; + + ret = validate_hw_breakpoint(bp); + + /* if validation fails for any reason then release the reserved bp slot */ + if (ret) + release_bp_slot(bp); + + return ret; +} + +/** + * register_user_hw_breakpoint - register a hardware breakpoint for user space + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * @tsk: pointer to 'task_struct' of the process to which the address belongs + */ +struct perf_event * +register_user_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered, + struct task_struct *tsk) +{ + return perf_event_create_kernel_counter(attr, -1, tsk, triggered); +} +EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); + +/** + * modify_user_hw_breakpoint - modify a user-space hardware breakpoint + * @bp: the breakpoint structure to modify + * @attr: new breakpoint attributes (bp_addr, bp_type, bp_len and disabled are used) + * + * The breakpoint is disabled while its attributes are replaced. If the new + * settings fail validation, the old address, type and length are restored + * and the validation error is returned; otherwise 0 is returned. + */ +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +{ + u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; + int old_type = bp->attr.bp_type; + int err = 0; + + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + /* A breakpoint that stays disabled is not validated or re-enabled */ + if (attr->disabled) + goto end; + + err = validate_hw_breakpoint(bp); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; +} +EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); + +/** + * unregister_hw_breakpoint - unregister a 
user-space hardware breakpoint + * @bp: the breakpoint structure to unregister + */ +void unregister_hw_breakpoint(struct perf_event *bp) +{ + if (!bp) + return; + perf_event_release_kernel(bp); +} +EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); + +/** + * register_wide_hw_breakpoint - register a wide breakpoint in the kernel + * @attr: breakpoint attributes + * @triggered: callback to trigger when we hit the breakpoint + * + * @return a set of per_cpu pointers to perf events + */ +struct perf_event * __percpu * +register_wide_hw_breakpoint(struct perf_event_attr *attr, + perf_overflow_handler_t triggered) +{ + struct perf_event * __percpu *cpu_events, **pevent, *bp; + long err; + int cpu; + + cpu_events = alloc_percpu(typeof(*cpu_events)); + if (!cpu_events) + return (void __percpu __force *)ERR_PTR(-ENOMEM); + + get_online_cpus(); + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); + + *pevent = bp; + + if (IS_ERR(bp)) { + err = PTR_ERR(bp); + goto fail; + } + } + put_online_cpus(); + + return cpu_events; + +fail: + for_each_online_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + if (IS_ERR(*pevent)) + break; + unregister_hw_breakpoint(*pevent); + } + put_online_cpus(); + + free_percpu(cpu_events); + return (void __percpu __force *)ERR_PTR(err); +} +EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); + +/** + * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel + * @cpu_events: the per cpu set of events to unregister + */ +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) +{ + int cpu; + struct perf_event **pevent; + + for_each_possible_cpu(cpu) { + pevent = per_cpu_ptr(cpu_events, cpu); + unregister_hw_breakpoint(*pevent); + } + free_percpu(cpu_events); +} +EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint); + +static struct notifier_block hw_breakpoint_exceptions_nb = { + .notifier_call = hw_breakpoint_exceptions_notify, + /* we 
need to be notified first */ + .priority = 0x7fffffff +}; + +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static int hw_breakpoint_event_init(struct perf_event *bp) +{ + int err; + + if (bp->attr.type != PERF_TYPE_BREAKPOINT) + return -ENOENT; + + err = register_perf_hw_breakpoint(bp); + if (err) + return err; + + bp->destroy = bp_perf_event_destroy; + + return 0; +} + +static int hw_breakpoint_add(struct perf_event *bp, int flags) +{ + if (!(flags & PERF_EF_START)) + bp->hw.state = PERF_HES_STOPPED; + + return arch_install_hw_breakpoint(bp); +} + +static void hw_breakpoint_del(struct perf_event *bp, int flags) +{ + arch_uninstall_hw_breakpoint(bp); +} + +static void hw_breakpoint_start(struct perf_event *bp, int flags) +{ + bp->hw.state = 0; +} + +static void hw_breakpoint_stop(struct perf_event *bp, int flags) +{ + bp->hw.state = PERF_HES_STOPPED; +} + +static struct pmu perf_breakpoint = { + .task_ctx_nr = perf_sw_context, /* could eventually get its own */ + + .event_init = hw_breakpoint_event_init, + .add = hw_breakpoint_add, + .del = hw_breakpoint_del, + .start = hw_breakpoint_start, + .stop = hw_breakpoint_stop, + .read = hw_breakpoint_pmu_read, +}; + +/* + * Boot-time setup: record the number of slots of each type, allocate one + * nr_task_bp_pinned table per possible cpu and type, then register the + * breakpoint pmu and the die notifier. + * + * Returns the result of register_die_notifier(), or -ENOMEM when a table + * cannot be allocated (constraints_initialized then stays 0). + */ +int __init init_hw_breakpoint(void) +{ + unsigned int **task_bp_pinned; + int cpu, err_cpu; + int i; + + for (i = 0; i < TYPE_MAX; i++) + nr_slots[i] = hw_breakpoint_slots(i); + + for_each_possible_cpu(cpu) { + for (i = 0; i < TYPE_MAX; i++) { + task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu); + *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i], + GFP_KERNEL); + if (!*task_bp_pinned) + goto err_alloc; + } + } + + constraints_initialized = 1; + + perf_pmu_register(&perf_breakpoint, "breakpoint", PERF_TYPE_BREAKPOINT); + + return register_die_notifier(&hw_breakpoint_exceptions_nb); + + err_alloc: + /* + * Free the tables of every cpu walked so far, up to and including + * the cpu whose allocation failed: its earlier types are allocated, + * the remaining pointers are still NULL and kfree(NULL) is a no-op. + */ + for_each_possible_cpu(err_cpu) { + for (i = 0; i < TYPE_MAX; i++) + kfree(per_cpu(nr_task_bp_pinned[i], err_cpu)); + if (err_cpu == cpu) + break; + } + + return 
-ENOMEM; +} + + diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c new file mode 100644 index 00000000..0dbeae37 --- /dev/null +++ b/kernel/exec_domain.c @@ -0,0 +1,195 @@ +/* + * Handling of different ABIs (personalities). + * + * We group personalities into execution domains which have their + * own handlers for kernel entry points, signal mapping, etc... + * + * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static void default_handler(int, struct pt_regs *); + +static struct exec_domain *exec_domains = &default_exec_domain; +static DEFINE_RWLOCK(exec_domains_lock); + + +static unsigned long ident_map[32] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 +}; + +struct exec_domain default_exec_domain = { + .name = "Linux", /* name */ + .handler = default_handler, /* lcall7 causes a seg fault. */ + .pers_low = 0, /* PER_LINUX personality. */ + .pers_high = 0, /* PER_LINUX personality. */ + .signal_map = ident_map, /* Identity map signals. */ + .signal_invmap = ident_map, /* - both ways. 
*/ +}; + + +static void +default_handler(int segment, struct pt_regs *regp) +{ + set_personality(0); + + if (current_thread_info()->exec_domain->handler != default_handler) + current_thread_info()->exec_domain->handler(segment, regp); + else + send_sig(SIGSEGV, current, 1); +} + +static struct exec_domain * +lookup_exec_domain(unsigned int personality) +{ + unsigned int pers = personality(personality); + struct exec_domain *ep; + + read_lock(&exec_domains_lock); + for (ep = exec_domains; ep; ep = ep->next) { + if (pers >= ep->pers_low && pers <= ep->pers_high) + if (try_module_get(ep->module)) + goto out; + } + +#ifdef CONFIG_MODULES + read_unlock(&exec_domains_lock); + request_module("personality-%d", pers); + read_lock(&exec_domains_lock); + + for (ep = exec_domains; ep; ep = ep->next) { + if (pers >= ep->pers_low && pers <= ep->pers_high) + if (try_module_get(ep->module)) + goto out; + } +#endif + + ep = &default_exec_domain; +out: + read_unlock(&exec_domains_lock); + return (ep); +} + +int +register_exec_domain(struct exec_domain *ep) +{ + struct exec_domain *tmp; + int err = -EBUSY; + + if (ep == NULL) + return -EINVAL; + + if (ep->next != NULL) + return -EBUSY; + + write_lock(&exec_domains_lock); + for (tmp = exec_domains; tmp; tmp = tmp->next) { + if (tmp == ep) + goto out; + } + + ep->next = exec_domains; + exec_domains = ep; + err = 0; + +out: + write_unlock(&exec_domains_lock); + return (err); +} + +int +unregister_exec_domain(struct exec_domain *ep) +{ + struct exec_domain **epp; + + epp = &exec_domains; + write_lock(&exec_domains_lock); + for (epp = &exec_domains; *epp; epp = &(*epp)->next) { + if (ep == *epp) + goto unregister; + } + write_unlock(&exec_domains_lock); + return -EINVAL; + +unregister: + *epp = ep->next; + ep->next = NULL; + write_unlock(&exec_domains_lock); + return 0; +} + +int __set_personality(unsigned int personality) +{ + struct exec_domain *oep = current_thread_info()->exec_domain; + + current_thread_info()->exec_domain = 
lookup_exec_domain(personality); + current->personality = personality; + module_put(oep->module); + + return 0; +} + +#ifdef CONFIG_PROC_FS +static int execdomains_proc_show(struct seq_file *m, void *v) +{ + struct exec_domain *ep; + + read_lock(&exec_domains_lock); + for (ep = exec_domains; ep; ep = ep->next) + seq_printf(m, "%d-%d\t%-16s\t[%s]\n", + ep->pers_low, ep->pers_high, ep->name, + module_name(ep->module)); + read_unlock(&exec_domains_lock); + return 0; +} + +static int execdomains_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, execdomains_proc_show, NULL); +} + +static const struct file_operations execdomains_proc_fops = { + .open = execdomains_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init proc_execdomains_init(void) +{ + proc_create("execdomains", 0, NULL, &execdomains_proc_fops); + return 0; +} +module_init(proc_execdomains_init); +#endif + +SYSCALL_DEFINE1(personality, unsigned int, personality) +{ + unsigned int old = current->personality; + + if (personality != 0xffffffff) + set_personality(personality); + + return old; +} + + +EXPORT_SYMBOL(register_exec_domain); +EXPORT_SYMBOL(unregister_exec_domain); +EXPORT_SYMBOL(__set_personality); diff --git a/kernel/exit.c b/kernel/exit.c new file mode 100644 index 00000000..303bed29 --- /dev/null +++ b/kernel/exit.c @@ -0,0 +1,1875 @@ +/* + * linux/kernel/exit.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for audit_free() */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include 
+#include +#include +#include + +static void exit_mm(struct task_struct * tsk); + +static void __unhash_process(struct task_struct *p, bool group_dead) +{ + nr_threads--; + detach_pid(p, PIDTYPE_PID); + if (group_dead) { + detach_pid(p, PIDTYPE_PGID); + detach_pid(p, PIDTYPE_SID); + + list_del_rcu(&p->tasks); + list_del_init(&p->sibling); + __this_cpu_dec(process_counts); + } + list_del_rcu(&p->thread_group); +} + +/* + * This function expects the tasklist_lock write-locked. + */ +static void __exit_signal(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + bool group_dead = thread_group_leader(tsk); + struct sighand_struct *sighand; + struct tty_struct *uninitialized_var(tty); + + sighand = rcu_dereference_check(tsk->sighand, + rcu_read_lock_held() || + lockdep_tasklist_lock_is_held()); + spin_lock(&sighand->siglock); + + posix_cpu_timers_exit(tsk); + if (group_dead) { + posix_cpu_timers_exit_group(tsk); + tty = sig->tty; + sig->tty = NULL; + } else { + /* + * This can only happen if the caller is de_thread(). + * FIXME: this is the temporary hack, we should teach + * posix-cpu-timers to handle this case correctly. + */ + if (unlikely(has_group_leader_pid(tsk))) + posix_cpu_timers_exit_group(tsk); + + /* + * If there is any task waiting for the group exit + * then notify it: + */ + if (sig->notify_count > 0 && !--sig->notify_count) + wake_up_process(sig->group_exit_task); + + if (tsk == sig->curr_target) + sig->curr_target = next_thread(tsk); + /* + * Accumulate here the counters for all threads but the + * group leader as they die, so they can be added into + * the process-wide totals when those are taken. + * The group leader stays around as a zombie as long + * as there are other threads. When it gets reaped, + * the exit.c code will add its counts into these totals. + * We won't ever get here for the group leader, since it + * will have been the last reference on the signal_struct. 
+ */ + sig->utime = cputime_add(sig->utime, tsk->utime); + sig->stime = cputime_add(sig->stime, tsk->stime); + sig->gtime = cputime_add(sig->gtime, tsk->gtime); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; + } + + sig->nr_threads--; + __unhash_process(tsk, group_dead); + + /* + * Do this under ->siglock, we can race with another thread + * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. + */ + flush_sigqueue(&tsk->pending); + tsk->sighand = NULL; + spin_unlock(&sighand->siglock); + + __cleanup_sighand(sighand); + clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + if (group_dead) { + flush_sigqueue(&sig->shared_pending); + tty_kref_put(tty); + } +} + +static void delayed_put_task_struct(struct rcu_head *rhp) +{ + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + + perf_event_delayed_put(tsk); + trace_sched_process_free(tsk); + put_task_struct(tsk); +} + + +void release_task(struct task_struct * p) +{ + struct task_struct *leader; + int zap_leader; +repeat: + tracehook_prepare_release_task(p); + /* don't need to get the RCU readlock here - the process is dead and + * can't be modifying its own credentials. But shut RCU-lockdep up */ + rcu_read_lock(); + atomic_dec(&__task_cred(p)->user->processes); + rcu_read_unlock(); + + proc_flush_task(p); + + write_lock_irq(&tasklist_lock); + tracehook_finish_release_task(p); + __exit_signal(p); + + /* + * If we are the last non-leader member of the thread + * group, and the leader is zombie, then notify the + * group leader's parent process. (if it wants notification.) 
+ */ + zap_leader = 0; + leader = p->group_leader; + if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + BUG_ON(task_detached(leader)); + do_notify_parent(leader, leader->exit_signal); + /* + * If we were the last child thread and the leader has + * exited already, and the leader's parent ignores SIGCHLD, + * then we are the one who should release the leader. + * + * do_notify_parent() will have marked it self-reaping in + * that case. + */ + zap_leader = task_detached(leader); + + /* + * This maintains the invariant that release_task() + * only runs on a task in EXIT_DEAD, just for sanity. + */ + if (zap_leader) + leader->exit_state = EXIT_DEAD; + } + + write_unlock_irq(&tasklist_lock); + release_thread(p); + call_rcu(&p->rcu, delayed_put_task_struct); + + p = leader; + if (unlikely(zap_leader)) + goto repeat; +} + +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + * + * The caller must hold rcu lock or the tasklist lock. + */ +struct pid *session_of_pgrp(struct pid *pgrp) +{ + struct task_struct *p; + struct pid *sid = NULL; + + p = pid_task(pgrp, PIDTYPE_PGID); + if (p == NULL) + p = pid_task(pgrp, PIDTYPE_PID); + if (p != NULL) + sid = task_session(p); + + return sid; +} + +/* + * Determine if a process group is "orphaned", according to the POSIX + * definition in 2.2.2.52. Orphaned process groups are not to be affected + * by terminal-generated stop signals. Newly orphaned process groups are + * to receive a SIGHUP and a SIGCONT. + * + * "I ask you, have you ever known what it is to be an orphan?" 
+ */ +static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +{ + struct task_struct *p; + + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + if ((p == ignored_task) || + (p->exit_state && thread_group_empty(p)) || + is_global_init(p->real_parent)) + continue; + + if (task_pgrp(p->real_parent) != pgrp && + task_session(p->real_parent) == task_session(p)) + return 0; + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + + return 1; +} + +int is_current_pgrp_orphaned(void) +{ + int retval; + + read_lock(&tasklist_lock); + retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); + read_unlock(&tasklist_lock); + + return retval; +} + +static int has_stopped_jobs(struct pid *pgrp) +{ + int retval = 0; + struct task_struct *p; + + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { + if (!task_is_stopped(p)) + continue; + retval = 1; + break; + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); + return retval; +} + +/* + * Check to see if any process groups have become orphaned as + * a result of our exiting, and if they have any stopped jobs, + * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void +kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) +{ + struct pid *pgrp = task_pgrp(tsk); + struct task_struct *ignored_task = tsk; + + if (!parent) + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ + parent = tsk->real_parent; + else + /* reparent: our child is in a different pgrp than + * we are, and it was the only connection outside. 
+ */ + ignored_task = NULL; + + if (task_pgrp(parent) != pgrp && + task_session(parent) == task_session(tsk) && + will_become_orphaned_pgrp(pgrp, ignored_task) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); + } +} + +/** + * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to kthreadd so it + * isn't in the way of other processes and is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_kthreadd() gives the caller full capabilities. + */ +static void reparent_to_kthreadd(void) +{ + write_lock_irq(&tasklist_lock); + + ptrace_unlink(current); + /* Reparent to kthreadd */ + current->real_parent = current->parent = kthreadd_task; + list_move_tail(&current->sibling, &current->real_parent->children); + + /* Set the exit signal to SIGCHLD so the new parent is signalled on exit */ + current->exit_signal = SIGCHLD; + + if (task_nice(current) < 0) + set_user_nice(current, 0); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + memcpy(current->signal->rlim, init_task.signal->rlim, + sizeof(current->signal->rlim)); + + /* Take a reference on init_cred before installing it as our credentials */ + atomic_inc(&init_cred.usage); + commit_creds(&init_cred); + write_unlock_irq(&tasklist_lock); +} + +/* + * Move the group leader of the current task into session and process + * group @pid, skipping whichever of the two already matches. + */ +void __set_special_pids(struct pid *pid) +{ + struct task_struct *curr = current->group_leader; + + if (task_session(curr) != pid) + change_pid(curr, PIDTYPE_SID, pid); + + if (task_pgrp(curr) != pid) + change_pid(curr, PIDTYPE_PGID, pid); +} + +/* Same as __set_special_pids(), but takes tasklist_lock for writing itself */ +static void set_special_pids(struct pid *pid) +{ + write_lock_irq(&tasklist_lock); + __set_special_pids(pid); + write_unlock_irq(&tasklist_lock); +} + +/* + * Let kernel threads use this to say that they allow a certain signal. 
+ * Must not be used if kthread was cloned with CLONE_SIGHAND. + */ +int allow_signal(int sig) +{ + if (!valid_signal(sig) || sig < 1) + return -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + /* This is only needed for daemonize()'ed kthreads */ + sigdelset(¤t->blocked, sig); + /* + * Kernel threads handle their own signals. Let the signal code + * know it'll be handled, so that they don't get converted to + * SIGKILL or just silently dropped. + */ + current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +EXPORT_SYMBOL(allow_signal); + +int disallow_signal(int sig) +{ + if (!valid_signal(sig) || sig < 1) + return -EINVAL; + + spin_lock_irq(¤t->sighand->siglock); + current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +EXPORT_SYMBOL(disallow_signal); + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(const char *name, ...) +{ + va_list args; + sigset_t blocked; + + va_start(args, name); + vsnprintf(current->comm, sizeof(current->comm), name, args); + va_end(args); + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + /* + * We don't want to have TIF_FREEZE set if the system-wide hibernation + * or suspend transition begins right now. 
+ */ + current->flags |= (PF_NOFREEZE | PF_KTHREAD); + + if (current->nsproxy != &init_nsproxy) { + get_nsproxy(&init_nsproxy); + switch_task_namespaces(current, &init_nsproxy); + } + set_special_pids(&init_struct_pid); + proc_clear_tty(current); + + /* Block and flush all signals */ + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + /* Become as one with the init task */ + + daemonize_fs_struct(); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); + + reparent_to_kthreadd(); +} + +EXPORT_SYMBOL(daemonize); + +static void close_files(struct files_struct * files) +{ + int i, j; + struct fdtable *fdt; + + j = 0; + + /* + * It is safe to dereference the fd table without RCU or + * ->file_lock because this is the last reference to the + * files structure. But use RCU to shut RCU-lockdep up. + */ + rcu_read_lock(); + fdt = files_fdtable(files); + rcu_read_unlock(); + for (;;) { + unsigned long set; + i = j * __NFDBITS; + if (i >= fdt->max_fds) + break; + set = fdt->open_fds->fds_bits[j++]; + while (set) { + if (set & 1) { + struct file * file = xchg(&fdt->fd[i], NULL); + if (file) { + filp_close(file, files); + cond_resched(); + } + } + i++; + set >>= 1; + } + } +} + +struct files_struct *get_files_struct(struct task_struct *task) +{ + struct files_struct *files; + + task_lock(task); + files = task->files; + if (files) + atomic_inc(&files->count); + task_unlock(task); + + return files; +} + +void put_files_struct(struct files_struct *files) +{ + struct fdtable *fdt; + + if (atomic_dec_and_test(&files->count)) { + close_files(files); + /* + * Free the fd and fdset arrays if we expanded them. + * If the fdtable was embedded, pass files for freeing + * at the end of the RCU grace period. Otherwise, + * you can free files immediately. 
+ */ + rcu_read_lock(); + fdt = files_fdtable(files); + if (fdt != &files->fdtab) + kmem_cache_free(files_cachep, files); + free_fdtable(fdt); + rcu_read_unlock(); + } +} + +void reset_files_struct(struct files_struct *files) +{ + struct task_struct *tsk = current; + struct files_struct *old; + + old = tsk->files; + task_lock(tsk); + tsk->files = files; + task_unlock(tsk); + put_files_struct(old); +} + +void exit_files(struct task_struct *tsk) +{ + struct files_struct * files = tsk->files; + + if (files) { + task_lock(tsk); + tsk->files = NULL; + task_unlock(tsk); + put_files_struct(files); + } +} + +#ifdef CONFIG_MM_OWNER +/* + * A task is exiting. If it owned this mm, find a new owner for the mm. + */ +void mm_update_next_owner(struct mm_struct *mm) +{ + struct task_struct *c, *g, *p = current; + +retry: + /* + * If the exiting or execing task is not the owner, it's + * someone else's problem. + */ + if (mm->owner != p) + return; + /* + * The current owner is exiting/execing and there are no other + * candidates. Do not leave the mm pointing to a possibly + * freed task structure. + */ + if (atomic_read(&mm->mm_users) <= 1) { + mm->owner = NULL; + return; + } + + read_lock(&tasklist_lock); + /* + * Search in the children + */ + list_for_each_entry(c, &p->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search in the siblings + */ + list_for_each_entry(c, &p->real_parent->children, sibling) { + if (c->mm == mm) + goto assign_new_owner; + } + + /* + * Search through everything else. We should not get + * here often + */ + do_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + } while_each_thread(g, c); + + read_unlock(&tasklist_lock); + /* + * We found no owner yet mm_users > 1: this implies that we are + * most likely racing with swapoff (try_to_unuse()) or /proc or + * ptrace or page migration (get_task_mm()). Mark owner as NULL. 
+ */ + mm->owner = NULL; + return; + +assign_new_owner: + BUG_ON(c == p); + get_task_struct(c); + /* + * The task_lock protects c->mm from changing. + * We always want mm->owner->mm == mm + */ + task_lock(c); + /* + * Delay read_unlock() till we have the task_lock() + * to ensure that c does not slip away underneath us + */ + read_unlock(&tasklist_lock); + if (c->mm != mm) { + task_unlock(c); + put_task_struct(c); + goto retry; + } + mm->owner = c; + task_unlock(c); + put_task_struct(c); +} +#endif /* CONFIG_MM_OWNER */ + +/* + * Turn us into a lazy TLB process if we + * aren't already.. + */ +static void exit_mm(struct task_struct * tsk) +{ + struct mm_struct *mm = tsk->mm; + struct core_state *core_state; + + mm_release(tsk, mm); + if (!mm) + return; + /* + * Serialize with any possible pending coredump. + * We must hold mmap_sem around checking core_state + * and clearing tsk->mm. The core-inducing thread + * will increment ->nr_threads for each thread in the + * group with ->mm != NULL. + */ + down_read(&mm->mmap_sem); + core_state = mm->core_state; + if (core_state) { + struct core_thread self; + up_read(&mm->mmap_sem); + + self.task = tsk; + self.next = xchg(&core_state->dumper.next, &self); + /* + * Implies mb(), the result of xchg() must be visible + * to core_state->dumper. 
+ */ + if (atomic_dec_and_test(&core_state->nr_threads)) + complete(&core_state->startup); + + for (;;) { + set_task_state(tsk, TASK_UNINTERRUPTIBLE); + if (!self.task) /* see coredump_finish() */ + break; + schedule(); + } + __set_task_state(tsk, TASK_RUNNING); + down_read(&mm->mmap_sem); + } + atomic_inc(&mm->mm_count); + BUG_ON(mm != tsk->active_mm); + /* more a memory barrier than a real lock */ + task_lock(tsk); + tsk->mm = NULL; + up_read(&mm->mmap_sem); + enter_lazy_tlb(mm, current); + /* We don't want this task to be frozen prematurely */ + clear_freeze_flag(tsk); + if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&mm->oom_disable_count); + task_unlock(tsk); + mm_update_next_owner(mm); + mmput(mm); +} + +/* + * When we die, we re-parent all our children. + * Try to give them to another thread in our thread + * group, and if no such member exists, give it to + * the child reaper process (ie "init") in our pid + * space. + */ +static struct task_struct *find_new_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *thread; + + thread = father; + while_each_thread(father, thread) { + if (thread->flags & PF_EXITING) + continue; + if (unlikely(pid_ns->child_reaper == father)) + pid_ns->child_reaper = thread; + return thread; + } + + if (unlikely(pid_ns->child_reaper == father)) { + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) + panic("Attempted to kill init!"); + + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + /* + * We can not clear ->child_reaper or leave it alone. + * There may by stealth EXIT_DEAD tasks on ->children, + * forget_original_parent() must move them somewhere. + */ + pid_ns->child_reaper = init_pid_ns.child_reaper; + } + + return pid_ns->child_reaper; +} + +/* +* Any that need to be release_task'd are put on the @dead list. 
+ *
+ * @father is the exiting task, @p one of its children whose
+ * ->real_parent has already been switched by the caller.
+ */
+static void reparent_leader(struct task_struct *father, struct task_struct *p,
+				struct list_head *dead)
+{
+	list_move_tail(&p->sibling, &p->real_parent->children);
+
+	if (task_detached(p))
+		return;
+	/*
+	 * If this is a threaded reparent there is no need to
+	 * notify anyone anything has happened.
+	 */
+	if (same_thread_group(p->real_parent, father))
+		return;
+
+	/* We don't want people slaying init. */
+	p->exit_signal = SIGCHLD;
+
+	/* If it has exited notify the new parent about this child's death. */
+	if (!task_ptrace(p) &&
+	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+		do_notify_parent(p, p->exit_signal);
+		if (task_detached(p)) {
+			/* Nobody will wait for it: queue it for release_task(). */
+			p->exit_state = EXIT_DEAD;
+			list_move_tail(&p->sibling, dead);
+		}
+	}
+
+	kill_orphaned_pgrp(p, father);
+}
+
+static void forget_original_parent(struct task_struct *father)
+{	/*
+	 * Hand every child of @father (and each thread of that child) to
+	 * the task chosen by find_new_reaper(), send any requested
+	 * pdeath_signal, and release the children that reparent_leader()
+	 * put on the dead list once tasklist_lock has been dropped.
+	 */
+	struct task_struct *p, *n, *reaper;
+	LIST_HEAD(dead_children);
+
+	write_lock_irq(&tasklist_lock);
+	/*
+	 * Note that exit_ptrace() and find_new_reaper() might
+	 * drop tasklist_lock and reacquire it.
+	 */
+	exit_ptrace(father);
+	reaper = find_new_reaper(father);
+
+	list_for_each_entry_safe(p, n, &father->children, sibling) {
+		struct task_struct *t = p;
+		do {
+			t->real_parent = reaper;
+			if (t->parent == father) {
+				BUG_ON(task_ptrace(t));
+				t->parent = t->real_parent;
+			}
+			if (t->pdeath_signal)
+				group_send_sig_info(t->pdeath_signal,
+						    SEND_SIG_NOINFO, t);
+		} while_each_thread(p, t);
+		reparent_leader(father, p, &dead_children);
+	}
+	write_unlock_irq(&tasklist_lock);
+
+	BUG_ON(!list_empty(&father->children));
+
+	/* release_task() is called only after tasklist_lock is dropped. */
+	list_for_each_entry_safe(p, n, &dead_children, sibling) {
+		list_del_init(&p->sibling);
+		release_task(p);
+	}
+}
+
+/*
+ * Send signals to all our closest relatives so that they know
+ * to properly mourn us..
+ *
+ * @group_dead is nonzero when @tsk is the last live thread of its
+ * group (see the caller, do_exit()).
+ */
+static void exit_notify(struct task_struct *tsk, int group_dead)
+{
+	int signal;
+	void *cookie;
+
+	/*
+	 * This does two things:
+	 *
+	 * A. Make init inherit all the child processes
+	 * B. Check to see if any process groups have become orphaned
+	 *    as a result of our exiting, and if they have any stopped
+	 *    jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
+	 */
+	forget_original_parent(tsk);
+	exit_task_namespaces(tsk);
+
+	write_lock_irq(&tasklist_lock);
+	if (group_dead)
+		kill_orphaned_pgrp(tsk->group_leader, NULL);
+
+	/* Let father know we died
+	 *
+	 * Thread signals are configurable, but you aren't going to use
+	 * that to send signals to arbitrary processes.
+	 * That stops right now.
+	 *
+	 * If the parent exec id doesn't match the exec id we saved
+	 * when we started then we know the parent has changed security
+	 * domain.
+	 *
+	 * If our self_exec id doesn't match our parent_exec_id then
+	 * we have changed execution domain as these two values started
+	 * the same after a fork.
+	 */
+	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
+	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
+	     tsk->self_exec_id != tsk->parent_exec_id))
+		tsk->exit_signal = SIGCHLD;
+
+	signal = tracehook_notify_death(tsk, &cookie, group_dead);
+	if (signal >= 0)
+		signal = do_notify_parent(tsk, signal);
+
+	/* DEATH_REAP: self-reap below; otherwise stay a zombie. */
+	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
+
+	/* mt-exec, de_thread() is waiting for group leader */
+	if (unlikely(tsk->signal->notify_count < 0))
+		wake_up_process(tsk->signal->group_exit_task);
+	write_unlock_irq(&tasklist_lock);
+
+	tracehook_report_death(tsk, signal, cookie, group_dead);
+
+	/* If the process is dead, release it - nobody will wait for it */
+	if (signal == DEATH_REAP)
+		release_task(tsk);
+}
+
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static void check_stack_usage(void)
+{	/*
+	 * Debug aid: log a warning whenever the current task's unused
+	 * stack, as reported by stack_not_used(), sets a new low.
+	 */
+	static DEFINE_SPINLOCK(low_water_lock);
+	static int lowest_to_date = THREAD_SIZE;
+	unsigned long free;
+
+	free = stack_not_used(current);
+
+	/* Unlocked fast path; the record is re-checked under the lock. */
+	if (free >= lowest_to_date)
+		return;
+
+	spin_lock(&low_water_lock);
+	if (free < lowest_to_date) {
+		printk(KERN_WARNING "%s used greatest stack depth: %lu bytes "
+				"left\n",
+				current->comm, free);
+		lowest_to_date = free;
+	}
+	spin_unlock(&low_water_lock);
+}
+#else
+static inline void check_stack_usage(void) {}
+#endif
+
+NORET_TYPE void do_exit(long code)
+{	/*
+	 * Terminate the calling task with exit code @code. Tears down the
+	 * task's resources in the order written below, notifies relatives
+	 * via exit_notify(), then sets TASK_DEAD and schedules away.
+	 * Does not return.
+	 */
+	struct task_struct *tsk = current;
+	int group_dead;
+
+	profile_task_exit(tsk);
+
+	WARN_ON(atomic_read(&tsk->fs_excl));
+	WARN_ON(blk_needs_flush_plug(tsk));
+
+	if (unlikely(in_interrupt()))
+		panic("Aiee, killing interrupt handler!");
+	if (unlikely(!tsk->pid))
+		panic("Attempted to kill the idle task!");
+
+	/*
+	 * If do_exit is called because this process oopsed, it's possible
+	 * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
+	 * continuing. Amongst other possible reasons, this is to prevent
+	 * mm_release()->clear_child_tid() from writing to a user-controlled
+	 * kernel address.
+	 */
+	set_fs(USER_DS);
+
+	tracehook_report_exit(&code);
+
+	validate_creds_for_do_exit(tsk);
+
+	/*
+	 * We're taking recursive faults here in do_exit. Safest is to just
+	 * leave this task alone and wait for reboot.
+	 */
+	if (unlikely(tsk->flags & PF_EXITING)) {
+		printk(KERN_ALERT
+			"Fixing recursive fault but reboot is needed!\n");
+		/*
+		 * We can do this unlocked here. The futex code uses
+		 * this flag just to verify whether the pi state
+		 * cleanup has been done or not. In the worst case it
+		 * loops once more. We pretend that the cleanup was
+		 * done as there is no way to return. Either the
+		 * OWNER_DIED bit is set by now or we push the blocked
+		 * task into the wait for ever nirwana as well.
+		 */
+		tsk->flags |= PF_EXITPIDONE;
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		schedule();
+	}
+
+	exit_irq_thread();
+
+	exit_signals(tsk); /* sets PF_EXITING */
+	/*
+	 * tsk->flags are checked in the futex code to protect against
+	 * an exiting task cleaning up the robust pi futexes.
+	 */
+	smp_mb();
+	raw_spin_unlock_wait(&tsk->pi_lock);
+
+	if (unlikely(in_atomic()))
+		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
+				current->comm, task_pid_nr(current),
+				preempt_count());
+
+	acct_update_integrals(tsk);
+	/* sync mm's RSS info before statistics gathering */
+	if (tsk->mm)
+		sync_mm_rss(tsk, tsk->mm);
+	/* True when this was the last live thread of the group. */
+	group_dead = atomic_dec_and_test(&tsk->signal->live);
+	if (group_dead) {
+		hrtimer_cancel(&tsk->signal->real_timer);
+		exit_itimers(tsk->signal);
+		if (tsk->mm)
+			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
+	}
+	acct_collect(code, group_dead);
+	if (group_dead)
+		tty_audit_exit();
+	if (unlikely(tsk->audit_context))
+		audit_free(tsk);
+
+	tsk->exit_code = code;
+	taskstats_exit(tsk, group_dead);
+
+	exit_mm(tsk);
+
+	if (group_dead)
+		acct_process();
+	trace_sched_process_exit(tsk);
+
+	exit_sem(tsk);
+	exit_files(tsk);
+	exit_fs(tsk);
+	check_stack_usage();
+	exit_thread();
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 *
+	 * because of cgroup mode, must be called before cgroup_exit()
+	 */
+	perf_event_exit_task(tsk);
+
+	cgroup_exit(tsk, 1);
+
+	if (group_dead)
+		disassociate_ctty(1);
+
+	module_put(task_thread_info(tsk)->exec_domain->module);
+
+	proc_exit_connector(tsk);
+
+	/*
+	 * FIXME: do that only when needed, using sched_exit tracepoint
+	 */
+	ptrace_put_breakpoints(tsk);
+
+	exit_notify(tsk, group_dead);
+#ifdef CONFIG_NUMA
+	task_lock(tsk);
+	mpol_put(tsk->mempolicy);
+	tsk->mempolicy = NULL;
+	task_unlock(tsk);
+#endif
+#ifdef CONFIG_FUTEX
+	if (unlikely(current->pi_state_cache))
+		kfree(current->pi_state_cache);
+#endif
+	/*
+	 * Make sure we are holding no locks:
+	 */
+	debug_check_no_locks_held(tsk);
+	/*
+	 * We can do this unlocked here. The futex code uses this flag
+	 * just to verify whether the pi state cleanup has been done
+	 * or not. In the worst case it loops once more.
+	 */
+	tsk->flags |= PF_EXITPIDONE;
+
+	if (tsk->io_context)
+		exit_io_context(tsk);
+
+	if (tsk->splice_pipe)
+		__free_pipe_info(tsk->splice_pipe);
+
+	validate_creds_for_do_exit(tsk);
+
+	preempt_disable();
+	exit_rcu();
+	/* causes final put_task_struct in finish_task_switch(). */
+	tsk->state = TASK_DEAD;
+	schedule();
+	BUG();
+	/* Avoid "noreturn function does return". */
+	for (;;)
+		cpu_relax();	/* For when BUG is null */
+}
+
+EXPORT_SYMBOL_GPL(do_exit);
+
+NORET_TYPE void complete_and_exit(struct completion *comp, long code)
+{	/* Signal @comp, if one was given, then exit with @code. */
+	if (comp)
+		complete(comp);
+
+	do_exit(code);
+}
+
+EXPORT_SYMBOL(complete_and_exit);
+
+SYSCALL_DEFINE1(exit, int, error_code)
+{	/* Only the low 8 bits of error_code are kept, placed in bits 8-15. */
+	do_exit((error_code&0xff)<<8);
+}
+
+/*
+ * Take down every thread in the group. This is called by fatal signals
+ * as well as by sys_exit_group (below).
+ *
+ * If a group exit is already in progress, its recorded exit code
+ * replaces @exit_code.
+ */
+NORET_TYPE void
+do_group_exit(int exit_code)
+{
+	struct signal_struct *sig = current->signal;
+
+	BUG_ON(exit_code & 0x80); /* core dumps don't get here */
+
+	if (signal_group_exit(sig))
+		exit_code = sig->group_exit_code;
+	else if (!thread_group_empty(current)) {
+		struct sighand_struct *const sighand = current->sighand;
+		spin_lock_irq(&sighand->siglock);
+		if (signal_group_exit(sig))
+			/* Another thread got here before we took the lock. */
+			exit_code = sig->group_exit_code;
+		else {
+			sig->group_exit_code = exit_code;
+			sig->flags = SIGNAL_GROUP_EXIT;
+			zap_other_threads(current);
+		}
+		spin_unlock_irq(&sighand->siglock);
+	}
+
+	do_exit(exit_code);
+	/* NOTREACHED */
+}
+
+/*
+ * this kills every thread in the thread group. Note that any externally
+ * wait4()-ing process will get the correct exit code - even if this
+ * thread is not the thread group leader.
+ */
+SYSCALL_DEFINE1(exit_group, int, error_code)
+{
+	do_group_exit((error_code & 0xff) << 8);
+	/* NOTREACHED */
+	return 0;
+}
+
+struct wait_opts {
+	enum pid_type		wo_type;	/* PIDTYPE_MAX matches any child */
+	int			wo_flags;	/* W* option bits from the caller */
+	struct pid		*wo_pid;	/* pid to match when wo_type < PIDTYPE_MAX */
+
+	struct siginfo __user	*wo_info;	/* optional siginfo output */
+	int __user		*wo_stat;	/* optional wait status output */
+	struct rusage __user	*wo_rusage;	/* optional rusage output */
+
+	wait_queue_t		child_wait;	/* entry queued on ->wait_chldexit */
+	int			notask_error;	/* result if nothing is reaped */
+};
+
+static inline
+struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
+{	/* Every type except PIDTYPE_PID is looked up on the group leader. */
+	if (type != PIDTYPE_PID)
+		task = task->group_leader;
+	return task->pids[type].pid;
+}
+
+static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
+{	/* Nonzero if @p matches the pid selector in @wo. */
+	return wo->wo_type == PIDTYPE_MAX ||
+		task_pid_type(p, wo->wo_type) == wo->wo_pid;
+}
+
+static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+{	/* Returns 1 if @p passes both the pid and the clone-flag filters. */
+	if (!eligible_pid(wo, p))
+		return 0;
+	/* Wait for all children (clone and not) if __WALL is set;
+	 * otherwise, wait for clone children *only* if __WCLONE is
+	 * set; otherwise, wait for non-clone children *only*. (Note:
+	 * A "clone" child here is one that reports to its parent
+	 * using a signal other than SIGCHLD.) */
+	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
+	    && !(wo->wo_flags & __WALL))
+		return 0;
+
+	return 1;
+}
+
+static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
+				pid_t pid, uid_t uid, int why, int status)
+{	/*
+	 * Copy rusage and siginfo for @p out to user space without reaping
+	 * it. Drops the caller's reference on @p. Returns @pid on success,
+	 * otherwise the first error from getrusage() or put_user().
+	 */
+	struct siginfo __user *infop;
+	int retval = wo->wo_rusage
+		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+
+	put_task_struct(p);
+	infop = wo->wo_info;
+	if (infop) {
+		if (!retval)
+			retval = put_user(SIGCHLD, &infop->si_signo);
+		if (!retval)
+			retval = put_user(0, &infop->si_errno);
+		if (!retval)
+			retval = put_user((short)why, &infop->si_code);
+		if (!retval)
+			retval = put_user(pid, &infop->si_pid);
+		if (!retval)
+			retval = put_user(uid, &infop->si_uid);
+		if (!retval)
+			retval = put_user(status, &infop->si_status);
+	}
+	if (!retval)
+		retval = pid;
+	return retval;
+}
+
+/*
+ * Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
+ * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
+ * the lock and this task is uninteresting. If we return nonzero, we have
+ * released the lock and the system call should return.
+ */
+static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
+{
+	unsigned long state;
+	int retval, status, traced;
+	pid_t pid = task_pid_vnr(p);
+	uid_t uid = __task_cred(p)->uid;
+	struct siginfo __user *infop;
+
+	if (!likely(wo->wo_flags & WEXITED))
+		return 0;
+
+	/* WNOWAIT: report the exit status but leave the zombie in place. */
+	if (unlikely(wo->wo_flags & WNOWAIT)) {
+		int exit_code = p->exit_code;
+		int why;
+
+		get_task_struct(p);
+		read_unlock(&tasklist_lock);
+		if ((exit_code & 0x7f) == 0) {
+			why = CLD_EXITED;
+			status = exit_code >> 8;
+		} else {
+			why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
+			status = exit_code & 0x7f;
+		}
+		return wait_noreap_copyout(wo, p, pid, uid, why, status);
+	}
+
+	/*
+	 * Try to move the task's state to DEAD
+	 * only one thread is allowed to do this:
+	 */
+	state = xchg(&p->exit_state, EXIT_DEAD);
+	if (state != EXIT_ZOMBIE) {
+		BUG_ON(state != EXIT_DEAD);
+		return 0;
+	}
+
+	traced = ptrace_reparented(p);
+	/*
+	 * It can be ptraced but not reparented, check
+	 * !task_detached() to filter out sub-threads.
+	 */
+	if (likely(!traced) && likely(!task_detached(p))) {
+		struct signal_struct *psig;
+		struct signal_struct *sig;
+		unsigned long maxrss;
+		cputime_t tgutime, tgstime;
+
+		/*
+		 * The resource counters for the group leader are in its
+		 * own task_struct. Those for dead threads in the group
+		 * are in its signal_struct, as are those for the child
+		 * processes it has previously reaped. All these
+		 * accumulate in the parent's signal_struct c* fields.
+		 *
+		 * We don't bother to take a lock here to protect these
+		 * p->signal fields, because they are only touched by
+		 * __exit_signal, which runs with tasklist_lock
+		 * write-locked anyway, and so is excluded here. We do
+		 * need to protect the access to parent->signal fields,
+		 * as other threads in the parent group can be right
+		 * here reaping other children at the same time.
+		 *
+		 * We use thread_group_times() to get times for the thread
+		 * group, which consolidates times for all threads in the
+		 * group including the group leader.
+		 */
+		thread_group_times(p, &tgutime, &tgstime);
+		spin_lock_irq(&p->real_parent->sighand->siglock);
+		psig = p->real_parent->signal;
+		sig = p->signal;
+		psig->cutime =
+			cputime_add(psig->cutime,
+			cputime_add(tgutime,
+				    sig->cutime));
+		psig->cstime =
+			cputime_add(psig->cstime,
+			cputime_add(tgstime,
+				    sig->cstime));
+		psig->cgtime =
+			cputime_add(psig->cgtime,
+			cputime_add(p->gtime,
+			cputime_add(sig->gtime,
+				    sig->cgtime)));
+		psig->cmin_flt +=
+			p->min_flt + sig->min_flt + sig->cmin_flt;
+		psig->cmaj_flt +=
+			p->maj_flt + sig->maj_flt + sig->cmaj_flt;
+		psig->cnvcsw +=
+			p->nvcsw + sig->nvcsw + sig->cnvcsw;
+		psig->cnivcsw +=
+			p->nivcsw + sig->nivcsw + sig->cnivcsw;
+		psig->cinblock +=
+			task_io_get_inblock(p) +
+			sig->inblock + sig->cinblock;
+		psig->coublock +=
+			task_io_get_oublock(p) +
+			sig->oublock + sig->coublock;
+		maxrss = max(sig->maxrss, sig->cmaxrss);
+		if (psig->cmaxrss < maxrss)
+			psig->cmaxrss = maxrss;
+		task_io_accounting_add(&psig->ioac, &p->ioac);
+		task_io_accounting_add(&psig->ioac, &sig->ioac);
+		spin_unlock_irq(&p->real_parent->sighand->siglock);
+	}
+
+	/*
+	 * Now we are sure this task is interesting, and no other
+	 * thread can reap it because we set its state to EXIT_DEAD.
+	 */
+	read_unlock(&tasklist_lock);
+
+	retval = wo->wo_rusage
+		? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
+		? p->signal->group_exit_code : p->exit_code;
+	if (!retval && wo->wo_stat)
+		retval = put_user(status, wo->wo_stat);
+
+	infop = wo->wo_info;
+	if (!retval && infop)
+		retval = put_user(SIGCHLD, &infop->si_signo);
+	if (!retval && infop)
+		retval = put_user(0, &infop->si_errno);
+	if (!retval && infop) {
+		int why;
+
+		if ((status & 0x7f) == 0) {
+			why = CLD_EXITED;
+			status >>= 8;
+		} else {
+			why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+			status &= 0x7f;
+		}
+		retval = put_user((short)why, &infop->si_code);
+		if (!retval)
+			retval = put_user(status, &infop->si_status);
+	}
+	if (!retval && infop)
+		retval = put_user(pid, &infop->si_pid);
+	if (!retval && infop)
+		retval = put_user(uid, &infop->si_uid);
+	if (!retval)
+		retval = pid;
+
+	if (traced) {
+		write_lock_irq(&tasklist_lock);
+		/* We dropped tasklist, ptracer could die and untrace */
+		ptrace_unlink(p);
+		/*
+		 * If this is not a detached task, notify the parent.
+		 * If it's still not detached after that, don't release
+		 * it now.
+		 */
+		if (!task_detached(p)) {
+			do_notify_parent(p, p->exit_signal);
+			if (!task_detached(p)) {
+				p->exit_state = EXIT_ZOMBIE;
+				p = NULL;	/* skip release_task() below */
+			}
+		}
+		write_unlock_irq(&tasklist_lock);
+	}
+	if (p != NULL)
+		release_task(p);
+
+	return retval;
+}
+
+static int *task_stopped_code(struct task_struct *p, bool ptrace)
+{	/*
+	 * Return a pointer to the stop code for @p, or NULL if @p is not
+	 * in the stopped state that matches @ptrace.
+	 */
+	if (ptrace) {
+		if (task_is_stopped_or_traced(p))
+			return &p->exit_code;
+	} else {
+		if (p->signal->flags & SIGNAL_STOP_STOPPED)
+			return &p->signal->group_exit_code;
+	}
+	return NULL;
+}
+
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero. Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue. Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
+ */ +static int wait_task_stopped(struct wait_opts *wo, + int ptrace, struct task_struct *p) +{ + struct siginfo __user *infop; + int retval, exit_code, *p_code, why; + uid_t uid = 0; /* unneeded, required by compiler */ + pid_t pid; + + /* + * Traditionally we see ptrace'd stopped tasks regardless of options. + */ + if (!ptrace && !(wo->wo_flags & WUNTRACED)) + return 0; + + if (!task_stopped_code(p, ptrace)) + return 0; + + exit_code = 0; + spin_lock_irq(&p->sighand->siglock); + + p_code = task_stopped_code(p, ptrace); + if (unlikely(!p_code)) + goto unlock_sig; + + exit_code = *p_code; + if (!exit_code) + goto unlock_sig; + + if (!unlikely(wo->wo_flags & WNOWAIT)) + *p_code = 0; + + uid = task_uid(p); +unlock_sig: + spin_unlock_irq(&p->sighand->siglock); + if (!exit_code) + return 0; + + /* + * Now we are pretty sure this task is interesting. + * Make sure it doesn't get reaped out from under us while we + * give up the lock and then examine it below. We don't want to + * keep holding onto the tasklist_lock while we call getrusage and + * possibly take page faults for user memory. + */ + get_task_struct(p); + pid = task_pid_vnr(p); + why = ptrace ? CLD_TRAPPED : CLD_STOPPED; + read_unlock(&tasklist_lock); + + if (unlikely(wo->wo_flags & WNOWAIT)) + return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); + + retval = wo->wo_rusage + ? 
getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; + if (!retval && wo->wo_stat) + retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat); + + infop = wo->wo_info; + if (!retval && infop) + retval = put_user(SIGCHLD, &infop->si_signo); + if (!retval && infop) + retval = put_user(0, &infop->si_errno); + if (!retval && infop) + retval = put_user((short)why, &infop->si_code); + if (!retval && infop) + retval = put_user(exit_code, &infop->si_status); + if (!retval && infop) + retval = put_user(pid, &infop->si_pid); + if (!retval && infop) + retval = put_user(uid, &infop->si_uid); + if (!retval) + retval = pid; + put_task_struct(p); + + BUG_ON(!retval); + return retval; +} + +/* + * Handle do_wait work for one task in a live, non-stopped state. + * read_lock(&tasklist_lock) on entry. If we return zero, we still hold + * the lock and this task is uninteresting. If we return nonzero, we have + * released the lock and the system call should return. + */ +static int wait_task_continued(struct wait_opts *wo, struct task_struct *p) +{ + int retval; + pid_t pid; + uid_t uid; + + if (!unlikely(wo->wo_flags & WCONTINUED)) + return 0; + + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) + return 0; + + spin_lock_irq(&p->sighand->siglock); + /* Re-check with the lock held. */ + if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) { + spin_unlock_irq(&p->sighand->siglock); + return 0; + } + if (!unlikely(wo->wo_flags & WNOWAIT)) + p->signal->flags &= ~SIGNAL_STOP_CONTINUED; + uid = task_uid(p); + spin_unlock_irq(&p->sighand->siglock); + + pid = task_pid_vnr(p); + get_task_struct(p); + read_unlock(&tasklist_lock); + + if (!wo->wo_info) { + retval = wo->wo_rusage + ? 
getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+		put_task_struct(p);
+		if (!retval && wo->wo_stat)
+			retval = put_user(0xffff, wo->wo_stat);
+		if (!retval)
+			retval = pid;
+	} else {
+		retval = wait_noreap_copyout(wo, p, pid, uid,
+					     CLD_CONTINUED, SIGCONT);
+		BUG_ON(retval == 0);
+	}
+
+	return retval;
+}
+
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in ->notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then ->notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct wait_opts *wo, int ptrace,
+				struct task_struct *p)
+{
+	int ret = eligible_child(wo, p);
+	if (!ret)
+		return ret;
+
+	ret = security_task_wait(p);
+	if (unlikely(ret < 0)) {
+		/*
+		 * If we have not yet seen any eligible child,
+		 * then let this error code replace -ECHILD.
+		 * A permission error will give the user a clue
+		 * to look for security policy problems, rather
+		 * than for mysterious wait bugs.
+		 */
+		if (wo->notask_error)
+			wo->notask_error = ret;
+		return 0;
+	}
+
+	/* dead body doesn't have much to contribute */
+	if (unlikely(p->exit_state == EXIT_DEAD)) {
+		/*
+		 * But do not ignore this task until the tracer does
+		 * wait_task_zombie()->do_notify_parent().
+		 */
+		if (likely(!ptrace) && unlikely(ptrace_reparented(p)))
+			wo->notask_error = 0;
+		return 0;
+	}
+
+	/* slay zombie? */
+	if (p->exit_state == EXIT_ZOMBIE) {
+		/*
+		 * A zombie ptracee is only visible to its ptracer.
+		 * Notification and reaping will be cascaded to the real
+		 * parent when the ptracer detaches.
+		 */
+		if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+			/* it will become visible, clear notask_error */
+			wo->notask_error = 0;
+			return 0;
+		}
+
+		/* we don't reap group leaders with subthreads */
+		if (!delay_group_leader(p))
+			return wait_task_zombie(wo, p);
+
+		/*
+		 * Allow access to stopped/continued state via zombie by
+		 * falling through. Clearing of notask_error is complex.
+		 *
+		 * When !@ptrace:
+		 *
+		 * If WEXITED is set, notask_error should naturally be
+		 * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
+		 * so, if there are live subthreads, there are events to
+		 * wait for. If all subthreads are dead, it's still safe
+		 * to clear - this function will be called again in finite
+		 * amount of time once all the subthreads are released and
+		 * will then return without clearing.
+		 *
+		 * When @ptrace:
+		 *
+		 * Stopped state is per-task and thus can't change once the
+		 * target task dies. Only continued and exited can happen.
+		 * Clear notask_error if WCONTINUED | WEXITED.
+		 */
+		if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+			wo->notask_error = 0;
+	} else {
+		/*
+		 * If @p is ptraced by a task in its real parent's group,
+		 * hide group stop/continued state when looking at @p as
+		 * the real parent; otherwise, a single stop can be
+		 * reported twice as group and ptrace stops.
+		 *
+		 * If a ptracer wants to distinguish the two events for its
+		 * own children, it should create a separate process which
+		 * takes the role of real parent.
+		 */
+		if (likely(!ptrace) && task_ptrace(p) &&
+		    same_thread_group(p->parent, p->real_parent))
+			return 0;
+
+		/*
+		 * @p is alive and it's gonna stop, continue or exit, so
+		 * there always is something to wait for.
+		 */
+		wo->notask_error = 0;
+	}
+
+	/*
+	 * Wait for stopped. Depending on @ptrace, different stopped state
+	 * is used and the two don't interact with each other.
+	 */
+	ret = wait_task_stopped(wo, ptrace, p);
+	if (ret)
+		return ret;
+
+	/*
+	 * Wait for continued. There's only one continued state and the
+	 * ptracer can consume it which can confuse the real parent. Don't
+	 * use WCONTINUED from ptracer. You don't need or want it.
+	 */
+	return wait_task_continued(wo, p);
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in ->notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then
+ * ->notask_error is 0 if there were any eligible children,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
+{
+	struct task_struct *p;
+
+	list_for_each_entry(p, &tsk->children, sibling) {
+		int ret = wait_consider_task(wo, 0, p);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Counterpart of do_wait_thread() for tracees: walks @tsk->ptraced and
+ * calls wait_consider_task() with ptrace == 1 for each entry.
+ * Same return convention as do_wait_thread().
+ */
+static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
+{
+	struct task_struct *p;
+
+	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
+		int ret = wait_consider_task(wo, 1, p);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Wake function installed by do_wait() on its wait_chldexit entry.
+ * @key is the task passed by __wake_up_parent(). The wakeup is dropped
+ * (returns 0) when that task fails eligible_pid(), or when __WNOTHREAD
+ * is set and the task's ->parent is not the waiter stored in
+ * wait->private; otherwise default_wake_function() does the wakeup.
+ */
+static int child_wait_callback(wait_queue_t *wait, unsigned mode,
+				int sync, void *key)
+{
+	struct wait_opts *wo = container_of(wait, struct wait_opts,
+						child_wait);
+	struct task_struct *p = key;
+
+	if (!eligible_pid(wo, p))
+		return 0;
+
+	if ((wo->wo_flags & __WNOTHREAD) && wait->private != p->parent)
+		return 0;
+
+	return default_wake_function(wait, mode, sync, key);
+}
+
+/*
+ * Wake TASK_INTERRUPTIBLE sleepers on @parent's wait_chldexit queue,
+ * handing @p to the wake functions as the key.
+ */
+void __wake_up_parent(struct task_struct *p, struct task_struct *parent)
+{
+	__wake_up_sync_key(&parent->signal->wait_chldexit,
+				TASK_INTERRUPTIBLE, 1, p);
+}
+
+/*
+ * Common back end of waitid() and wait4().
+ *
+ * Queues wo->child_wait on current's wait_chldexit queue, then scans the
+ * children and tracees of current (and of every other thread in the group
+ * unless __WNOTHREAD is set) under tasklist_lock. If nothing was reported
+ * but something may still match (->notask_error == 0) and WNOHANG is not
+ * set, it sleeps interruptibly and rescans.
+ *
+ * Returns the nonzero value produced by do_wait_thread()/ptrace_do_wait(),
+ * or ->notask_error, or -ERESTARTSYS when a signal is pending.
+ */
+static long do_wait(struct wait_opts *wo)
+{
+	struct task_struct *tsk;
+	int retval;
+
+	trace_sched_process_wait(wo->wo_pid);
+
+	init_waitqueue_func_entry(&wo->child_wait, child_wait_callback);
+	wo->child_wait.private = current;
+	add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+repeat:
+	/*
+	 * If there is nothing that can match our criteria just get out.
+	 * We will clear ->notask_error to zero if we see any child that
+	 * might later match our criteria, even if we are not able to reap
+	 * it yet.
+	 */
+	wo->notask_error = -ECHILD;
+	if ((wo->wo_type < PIDTYPE_MAX) &&
+	   (!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
+		goto notask;
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	read_lock(&tasklist_lock);
+	tsk = current;
+	do {
+		retval = do_wait_thread(wo, tsk);
+		if (retval)
+			goto end;
+
+		retval = ptrace_do_wait(wo, tsk);
+		if (retval)
+			goto end;
+
+		if (wo->wo_flags & __WNOTHREAD)
+			break;
+	} while_each_thread(current, tsk);
+	read_unlock(&tasklist_lock);
+
+notask:
+	retval = wo->notask_error;
+	if (!retval && !(wo->wo_flags & WNOHANG)) {
+		retval = -ERESTARTSYS;
+		if (!signal_pending(current)) {
+			schedule();
+			goto repeat;
+		}
+	}
+end:
+	__set_current_state(TASK_RUNNING);
+	remove_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
+	return retval;
+}
+
+/*
+ * waitid(2): validate @options and @which, look up the struct pid for
+ * P_PID/P_PGID, and run do_wait() with results reported through @infop
+ * and @ru. A positive do_wait() result is reported as 0.
+ */
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+		infop, int, options, struct rusage __user *, ru)
+{
+	struct wait_opts wo;
+	struct pid *pid = NULL;
+	enum pid_type type;
+	long ret;
+
+	if (options & ~(WNOHANG|WNOWAIT|WEXITED|WSTOPPED|WCONTINUED))
+		return -EINVAL;
+	if (!(options & (WEXITED|WSTOPPED|WCONTINUED)))
+		return -EINVAL;
+
+	switch (which) {
+	case P_ALL:
+		type = PIDTYPE_MAX;
+		break;
+	case P_PID:
+		type = PIDTYPE_PID;
+		if (upid <= 0)
+			return -EINVAL;
+		break;
+	case P_PGID:
+		type = PIDTYPE_PGID;
+		if (upid <= 0)
+			return -EINVAL;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	if (type < PIDTYPE_MAX)
+		pid = find_get_pid(upid);
+
+	wo.wo_type	= type;
+	wo.wo_pid	= pid;
+	wo.wo_flags	= options;
+	wo.wo_info	= infop;
+	wo.wo_stat	= NULL;
+	wo.wo_rusage	= ru;
+	ret = do_wait(&wo);
+
+	if (ret > 0) {
+		ret = 0;
+	} else if (infop) {
+		/*
+		 * For a WNOHANG return, clear out all the fields
+		 * we would set so the user can easily tell the
+		 * difference.
+		 */
+		if (!ret)
+			ret = put_user(0, &infop->si_signo);
+		if (!ret)
+			ret = put_user(0, &infop->si_errno);
+		if (!ret)
+			ret = put_user(0, &infop->si_code);
+		if (!ret)
+			ret = put_user(0, &infop->si_pid);
+		if (!ret)
+			ret = put_user(0, &infop->si_uid);
+		if (!ret)
+			ret = put_user(0, &infop->si_status);
+	}
+
+	put_pid(pid);
+
+	/* avoid REGPARM breakage on x86: */
+	asmlinkage_protect(5, ret, which, upid, infop, options, ru);
+	return ret;
+}
+
+/*
+ * wait4(2): map @upid onto a (pid type, struct pid) pair and run do_wait()
+ * with WEXITED always set; status goes to @stat_addr, usage to @ru.
+ *
+ *   upid == -1  any child (PIDTYPE_MAX, no pid)
+ *   upid <  -1  process group -upid
+ *   upid ==  0  the caller's own process group
+ *   upid >   0  that specific pid
+ */
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+		int, options, struct rusage __user *, ru)
+{
+	struct wait_opts wo;
+	struct pid *pid = NULL;
+	enum pid_type type;
+	long ret;
+
+	if (options & ~(WNOHANG|WUNTRACED|WCONTINUED|
+			__WNOTHREAD|__WCLONE|__WALL))
+		return -EINVAL;
+
+	/* -INT_MIN is not representable; negating it below would overflow */
+	if (upid == INT_MIN)
+		return -ESRCH;
+
+	if (upid == -1)
+		type = PIDTYPE_MAX;
+	else if (upid < 0) {
+		type = PIDTYPE_PGID;
+		pid = find_get_pid(-upid);
+	} else if (upid == 0) {
+		type = PIDTYPE_PGID;
+		pid = get_task_pid(current, PIDTYPE_PGID);
+	} else /* upid > 0 */ {
+		type = PIDTYPE_PID;
+		pid = find_get_pid(upid);
+	}
+
+	wo.wo_type	= type;
+	wo.wo_pid	= pid;
+	wo.wo_flags	= options | WEXITED;
+	wo.wo_info	= NULL;
+	wo.wo_stat	= stat_addr;
+	wo.wo_rusage	= ru;
+	ret = do_wait(&wo);
+	put_pid(pid);
+
+	/* avoid REGPARM breakage on x86: */
+	asmlinkage_protect(4, ret, upid, stat_addr, options, ru);
+	return ret;
+}
+
+#ifdef __ARCH_WANT_SYS_WAITPID
+
+/*
+ * sys_waitpid() remains for compatibility. waitpid() should be
+ * implemented by calling sys_wait4() from libc.a.
+ */
+SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
+{
+	return sys_wait4(pid, stat_addr, options, NULL);
+}
+
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
new file mode 100644
index 00000000..5339705b
--- /dev/null
+++ b/kernel/extable.c
@@ -0,0 +1,133 @@
+/* Rewritten by Rusty Russell, on the backs of many others...
+   Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; either version 2 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+*/
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+/*
+ * mutex protecting text section modification (dynamic code patching).
+ * some users need to sleep (allocating memory...) while they hold this lock.
+ *
+ * NOT exported to modules - patching kernel text is a really delicate matter.
+ */
+DEFINE_MUTEX(text_mutex);
+
+extern struct exception_table_entry __start___ex_table[];
+extern struct exception_table_entry __stop___ex_table[];
+
+/*
+ * Sort the kernel's built-in exception table.
+ *
+ * Hands the range delimited by __start___ex_table and __stop___ex_table
+ * to sort_extable(). Marked __init, so it is init-section code.
+ * NOTE(review): the #include operands above were lost in extraction and
+ * need to be restored from the original tree.
+ */
+void __init sort_main_extable(void)
+{
+	sort_extable(__start___ex_table, __stop___ex_table);
+}
+
+/* Given an address, look for it in the exception tables.
*/
+/*
+ * The built-in table is searched first via search_extable(); only when
+ * that finds nothing is search_module_extables() consulted, and its
+ * result is returned as-is.
+ */
+const struct exception_table_entry *search_exception_tables(unsigned long addr)
+{
+	const struct exception_table_entry *e;
+
+	e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+	if (!e)
+		e = search_module_extables(addr);
+	return e;
+}
+
+/*
+ * Returns 1 if @addr lies in [_sinittext, _einittext), 0 otherwise.
+ * The end symbol is treated as exclusive, matching the _edata check in
+ * core_kernel_data().
+ */
+static inline int init_kernel_text(unsigned long addr)
+{
+	if (addr >= (unsigned long)_sinittext &&
+	    addr < (unsigned long)_einittext)
+		return 1;
+	return 0;
+}
+
+/*
+ * Returns 1 if @addr lies in [_stext, _etext), or - while system_state
+ * is still SYSTEM_BOOTING - in the init text range; 0 otherwise.
+ * The end symbol is treated as exclusive, matching the _edata check in
+ * core_kernel_data().
+ */
+int core_kernel_text(unsigned long addr)
+{
+	if (addr >= (unsigned long)_stext &&
+	    addr < (unsigned long)_etext)
+		return 1;
+
+	if (system_state == SYSTEM_BOOTING &&
+	    init_kernel_text(addr))
+		return 1;
+	return 0;
+}
+
+/**
+ * core_kernel_data - tell if addr points to kernel data
+ * @addr: address to test
+ *
+ * Returns true if @addr passed in is from the core kernel data
+ * section.
+ *
+ * Note: On some archs it may return true for core RODATA, and false
+ * for others. But will always be true for core RW data.
+ */
+int core_kernel_data(unsigned long addr)
+{
+	if (addr >= (unsigned long)_sdata &&
+	    addr < (unsigned long)_edata)
+		return 1;
+	return 0;
+}
+
+/*
+ * Returns 1 if @addr is core kernel text, module text, or init text
+ * (the last regardless of system_state); 0 otherwise.
+ */
+int __kernel_text_address(unsigned long addr)
+{
+	if (core_kernel_text(addr))
+		return 1;
+	if (is_module_text_address(addr))
+		return 1;
+	/*
+	 * There might be init symbols in saved stacktraces.
+	 * Give those symbols a chance to be printed in
+	 * backtraces (such as lockdep traces).
+	 *
+	 * Since we are after the module-symbols check, there's
+	 * no danger of address overlap:
+	 */
+	if (init_kernel_text(addr))
+		return 1;
+	return 0;
+}
+
+/*
+ * Returns 1 for core kernel text, otherwise the result of
+ * is_module_text_address(). Unlike __kernel_text_address() it has no
+ * unconditional init-text fallback.
+ */
+int kernel_text_address(unsigned long addr)
+{
+	if (core_kernel_text(addr))
+		return 1;
+	return is_module_text_address(addr);
+}
+
+/*
+ * On some architectures (PPC64, IA64) function pointers
+ * are actually only tokens to some data that then holds the
+ * real function address. As a result, to find if a function
+ * pointer is part of the kernel text, we need to do some
+ * special dereferencing first.
+ */ +int func_ptr_is_kernel_text(void *ptr) +{ + unsigned long addr; + addr = (unsigned long) dereference_function_descriptor(ptr); + if (core_kernel_text(addr)) + return 1; + return is_module_text_address(addr); +} diff --git a/kernel/fork.c b/kernel/fork.c new file mode 100644 index 00000000..fa6030d9 --- /dev/null +++ b/kernel/fork.c @@ -0,0 +1,1775 @@ +/* + * linux/kernel/fork.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * 'fork.c' contains the help-routines for the 'fork' system call + * (see also entry.S and others). + * Fork is rather simple, once you get the hang of it, but the memory + * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* + * Protected counters by write_lock_irq(&tasklist_lock) + */ +unsigned long total_forks; /* Handle normal Linux uptimes. */ +int nr_threads; /* The idle threads do not count.. 
*/
+
+int max_threads;		/* tunable limit on nr_threads */
+
+DEFINE_PER_CPU(unsigned long, process_counts) = 0;
+
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
+
+#ifdef CONFIG_PROVE_RCU
+int lockdep_tasklist_lock_is_held(void)
+{
+	return lockdep_is_held(&tasklist_lock);
+}
+EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held);
+#endif /* #ifdef CONFIG_PROVE_RCU */
+
+/* Sum the per-cpu process_counts over every possible CPU. */
+int nr_processes(void)
+{
+	int cpu;
+	int total = 0;
+
+	for_each_possible_cpu(cpu)
+		total += per_cpu(process_counts, cpu);
+
+	return total;
+}
+
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+# define alloc_task_struct_node(node)		\
+		kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk)			\
+		kmem_cache_free(task_struct_cachep, (tsk))
+static struct kmem_cache *task_struct_cachep;
+#endif
+
+#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
+/*
+ * Allocate THREAD_SIZE_ORDER pages on @node for a thread_info/stack.
+ * The pages are zeroed only under CONFIG_DEBUG_STACK_USAGE.
+ * Returns NULL when the page allocation fails. @tsk is unused here.
+ */
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+						  int node)
+{
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	gfp_t mask = GFP_KERNEL | __GFP_ZERO;
+#else
+	gfp_t mask = GFP_KERNEL;
+#endif
+	struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+	return page ? page_address(page) : NULL;
+}
+
+static inline void free_thread_info(struct thread_info *ti)
+{
+	free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+}
+#endif
+
+/* SLAB cache for signal_struct structures (tsk->signal) */
+static struct kmem_cache *signal_cachep;
+
+/* SLAB cache for sighand_struct structures (tsk->sighand) */
+struct kmem_cache *sighand_cachep;
+
+/* SLAB cache for files_struct structures (tsk->files) */
+struct kmem_cache *files_cachep;
+
+/* SLAB cache for fs_struct structures (tsk->fs) */
+struct kmem_cache *fs_cachep;
+
+/* SLAB cache for vm_area_struct structures */
+struct kmem_cache *vm_area_cachep;
+
+/* SLAB cache for mm_struct structures (tsk->mm) */
+static struct kmem_cache *mm_cachep;
+
+/* Notifier list called when a task struct is freed */
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+
+/*
+ * Adjust the NR_KERNEL_STACK counter of the zone holding @ti by @account
+ * (callers in this file pass +1 on allocation and -1 on free).
+ */
+static void account_kernel_stack(struct thread_info *ti, int account)
+{
+	struct zone *zone = page_zone(virt_to_page(ti));
+
+	mod_zone_page_state(zone, NR_KERNEL_STACK, account);
+}
+
+/*
+ * Release the memory behind @tsk: its ->dirties state, the kernel stack
+ * (after un-accounting it), and finally the task_struct itself.
+ */
+void free_task(struct task_struct *tsk)
+{
+	prop_local_destroy_single(&tsk->dirties);
+	account_kernel_stack(tsk->stack, -1);
+	free_thread_info(tsk->stack);
+	rt_mutex_debug_task_free(tsk);
+	ftrace_graph_exit_task(tsk);
+	free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
+static inline void free_signal_struct(struct signal_struct *sig)
+{
+	taskstats_tgid_free(sig);
+	sched_autogroup_exit(sig);
+	kmem_cache_free(signal_cachep, sig);
+}
+
+/* Drop one ->sigcnt reference; the last one frees @sig. */
+static inline void put_signal_struct(struct signal_struct *sig)
+{
+	if (atomic_dec_and_test(&sig->sigcnt))
+		free_signal_struct(sig);
+}
+
+/* Add @n to the chain that __put_task_struct() calls before freeing a task. */
+int task_free_register(struct notifier_block *n)
+{
+	return atomic_notifier_chain_register(&task_free_notifier, n);
+}
+EXPORT_SYMBOL(task_free_register);
+
+/* Remove @n from the task_free_notifier chain. */
+int task_free_unregister(struct notifier_block *n)
+{
+	return atomic_notifier_chain_unregister(&task_free_notifier, n);
+}
+EXPORT_SYMBOL(task_free_unregister);
+
+/*
+ * Final teardown of a task_struct. Warns if the task has not exited,
+ * still has a nonzero ->usage, or is current. Then drops creds, delay
+ * accounting and the signal_struct reference, runs the task_free_notifier
+ * chain and frees the task unless profile_handoff_task() took it over.
+ */
+void __put_task_struct(struct task_struct *tsk)
+{
+	WARN_ON(!tsk->exit_state);
+	WARN_ON(atomic_read(&tsk->usage));
+	WARN_ON(tsk == current);
+
+	exit_creds(tsk);
+	delayacct_tsk_free(tsk);
+	put_signal_struct(tsk->signal);
+
+	atomic_notifier_call_chain(&task_free_notifier, 0, tsk);
+	if (!profile_handoff_task(tsk))
+		free_task(tsk);
+}
+EXPORT_SYMBOL_GPL(__put_task_struct);
+
+/*
+ * macro override instead of weak attribute alias, to workaround
+ * gcc 4.1.0 and 4.1.1 bugs with weak attribute and empty functions.
+ */
+#ifndef arch_task_cache_init
+#define arch_task_cache_init()
+#endif
+
+/*
+ * Boot-time setup: create the task_struct slab (unless the arch has its
+ * own allocator), derive max_threads from @mempages, and seed init_task's
+ * RLIMIT_NPROC and RLIMIT_SIGPENDING from it.
+ */
+void __init fork_init(unsigned long mempages)
+{
+#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
+#ifndef ARCH_MIN_TASKALIGN
+#define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
+#endif
+	/* create a slab on which task_structs can be allocated */
+	task_struct_cachep =
+		kmem_cache_create("task_struct", sizeof(struct task_struct),
+			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+#endif
+
+	/* do the arch specific task caches init */
+	arch_task_cache_init();
+
+	/*
+	 * The default maximum number of threads is set to a safe
+	 * value: the thread structures can take up at most an eighth
+	 * of memory.
+	 */
+	max_threads = mempages / (8 * THREAD_SIZE / PAGE_SIZE);
+
+	/*
+	 * we need to allow at least 20 threads to boot a system
+	 */
+	if(max_threads < 20)
+		max_threads = 20;
+
+	init_task.signal->rlim[RLIMIT_NPROC].rlim_cur = max_threads/2;
+	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
+	init_task.signal->rlim[RLIMIT_SIGPENDING] =
+		init_task.signal->rlim[RLIMIT_NPROC];
+}
+
+/* Default (weak) implementation: plain structure copy, always succeeds. */
+int __attribute__((weak)) arch_dup_task_struct(struct task_struct *dst,
+					       struct task_struct *src)
+{
+	*dst = *src;
+	return 0;
+}
+
+/*
+ * Allocate a task_struct and kernel stack on @orig's fork node and copy
+ * @orig into them. The copy gets a fresh stack-end magic, a usage count
+ * of 2 and cleared resched/user-return-notifier state.
+ * Returns NULL on any allocation or setup failure, after freeing what
+ * was allocated so far.
+ */
+static struct task_struct *dup_task_struct(struct task_struct *orig)
+{
+	struct task_struct *tsk;
+	struct thread_info *ti;
+	unsigned long *stackend;
+	int node = tsk_fork_get_node(orig);
+	int err;
+
+	prepare_to_copy(orig);
+
+	tsk = alloc_task_struct_node(node);
+	if (!tsk)
+		return NULL;
+
+	ti = alloc_thread_info_node(tsk, node);
+	if (!ti) {
+		free_task_struct(tsk);
+		return NULL;
+	}
+
+	err = arch_dup_task_struct(tsk, orig);
+	if (err)
+		goto out;
+
+	tsk->stack = ti;
+
+	err = prop_local_init_single(&tsk->dirties);
+	if (err)
+		goto out;
+
+	setup_thread_stack(tsk, orig);
+	clear_user_return_notifier(tsk);
+	clear_tsk_need_resched(tsk);
+	stackend = end_of_stack(tsk);
+	*stackend = STACK_END_MAGIC;	/* for overflow detection */
+
+#ifdef CONFIG_CC_STACKPROTECTOR
+	tsk->stack_canary = get_random_int();
+#endif
+
+	/* One for us, one for whoever does the "release_task()" (usually parent) */
+	atomic_set(&tsk->usage,2);
+	atomic_set(&tsk->fs_excl, 0);
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+	tsk->btrace_seq = 0;
+#endif
+	tsk->splice_pipe = NULL;
+
+	account_kernel_stack(ti, 1);
+
+	return tsk;
+
+out:
+	free_thread_info(ti);
+	free_task_struct(tsk);
+	return NULL;
+}
+
+#ifdef CONFIG_MMU
+/*
+ * Copy every VMA of @oldmm (except VM_DONTCOPY ones) into @mm, linking
+ * each copy into @mm's list and rbtree and copying its page tables.
+ * Both mmap_sems are held for writing for the duration.
+ * Returns 0 on success, -ENOMEM when an allocation or the memory
+ * accounting check fails, or the error from ksm_fork(),
+ * khugepaged_fork() or copy_page_range().
+ */
+static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
+	struct rb_node **rb_link, *rb_parent;
+	int retval;
+	unsigned long charge;
+	struct mempolicy *pol;
+
+	down_write(&oldmm->mmap_sem);
+	flush_cache_dup_mm(oldmm);
+	/*
+	 * Not linked in yet - no deadlock potential:
+	 */
+	down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
+
+	mm->locked_vm = 0;
+	mm->mmap = NULL;
+	mm->mmap_cache = NULL;
+	mm->free_area_cache = oldmm->mmap_base;
+	mm->cached_hole_size = ~0UL;
+	mm->map_count = 0;
+	cpumask_clear(mm_cpumask(mm));
+	mm->mm_rb = RB_ROOT;
+	rb_link = &mm->mm_rb.rb_node;
+	rb_parent = NULL;
+	pprev = &mm->mmap;
+	retval = ksm_fork(mm, oldmm);
+	if (retval)
+		goto out;
+	retval = khugepaged_fork(mm, oldmm);
+	if (retval)
+		goto out;
+
+	prev = NULL;
+	for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
+		struct file *file;
+
+		if (mpnt->vm_flags & VM_DONTCOPY) {
+			long pages = vma_pages(mpnt);
+			mm->total_vm -= pages;
+			vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file,
+								-pages);
+			continue;
+		}
+		charge = 0;
+		if (mpnt->vm_flags & VM_ACCOUNT) {
+			/*
+			 * Page count must be as wide as the address range
+			 * (and as 'charge'); a narrower type truncates it
+			 * for very large mappings.
+			 */
+			unsigned long len = (mpnt->vm_end - mpnt->vm_start) >> PAGE_SHIFT;
+			if (security_vm_enough_memory(len))
+				goto fail_nomem;
+			charge = len;
+		}
+		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		if (!tmp)
+			goto fail_nomem;
+		*tmp = *mpnt;
+		INIT_LIST_HEAD(&tmp->anon_vma_chain);
+		pol = mpol_dup(vma_policy(mpnt));
+		retval = PTR_ERR(pol);
+		if (IS_ERR(pol))
+			goto fail_nomem_policy;
+		vma_set_policy(tmp, pol);
+		tmp->vm_mm = mm;
+		if (anon_vma_fork(tmp, mpnt))
+			goto fail_nomem_anon_vma_fork;
+		tmp->vm_flags &= ~VM_LOCKED;
+		tmp->vm_next = tmp->vm_prev = NULL;
+		file = tmp->vm_file;
+		if (file) {
+			struct inode *inode = file->f_path.dentry->d_inode;
+			struct address_space *mapping = file->f_mapping;
+
+			get_file(file);
+			if (tmp->vm_flags & VM_DENYWRITE)
+				atomic_dec(&inode->i_writecount);
+			mutex_lock(&mapping->i_mmap_mutex);
+			if (tmp->vm_flags & VM_SHARED)
+				mapping->i_mmap_writable++;
+			flush_dcache_mmap_lock(mapping);
+			/* insert tmp into the share list, just after mpnt */
+			vma_prio_tree_add(tmp, mpnt);
+			flush_dcache_mmap_unlock(mapping);
+			mutex_unlock(&mapping->i_mmap_mutex);
+		}
+
+		/*
+		 * Clear hugetlb-related page reserves for children. This only
+		 * affects MAP_PRIVATE mappings. Faults generated by the child
+		 * are not guaranteed to succeed, even if read-only
+		 */
+		if (is_vm_hugetlb_page(tmp))
+			reset_vma_resv_huge_pages(tmp);
+
+		/*
+		 * Link in the new vma and copy the page table entries.
+		 */
+		*pprev = tmp;
+		pprev = &tmp->vm_next;
+		tmp->vm_prev = prev;
+		prev = tmp;
+
+		__vma_link_rb(mm, tmp, rb_link, rb_parent);
+		rb_link = &tmp->vm_rb.rb_right;
+		rb_parent = &tmp->vm_rb;
+
+		mm->map_count++;
+		retval = copy_page_range(mm, oldmm, mpnt);
+
+		if (tmp->vm_ops && tmp->vm_ops->open)
+			tmp->vm_ops->open(tmp);
+
+		if (retval)
+			goto out;
+	}
+	/* a new mm has just been created */
+	arch_dup_mmap(oldmm, mm);
+	retval = 0;
+out:
+	up_write(&mm->mmap_sem);
+	flush_tlb_mm(oldmm);
+	up_write(&oldmm->mmap_sem);
+	return retval;
+fail_nomem_anon_vma_fork:
+	mpol_put(pol);
+fail_nomem_policy:
+	kmem_cache_free(vm_area_cachep, tmp);
+fail_nomem:
+	retval = -ENOMEM;
+	vm_unacct_memory(charge);
+	goto out;
+}
+
+static inline int mm_alloc_pgd(struct mm_struct * mm)
+{
+	mm->pgd = pgd_alloc(mm);
+	if (unlikely(!mm->pgd))
+		return -ENOMEM;
+	return 0;
+}
+
+static inline void mm_free_pgd(struct mm_struct * mm)
+{
+	pgd_free(mm, mm->pgd);
+}
+#else
+#define dup_mmap(mm, oldmm)	(0)
+#define mm_alloc_pgd(mm)	(0)
+#define mm_free_pgd(mm)
+#endif /* CONFIG_MMU */
+
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
+
+#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
+
+static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
+
+/*
+ * "coredump_filter=" boot parameter: parse the number, shift it by
+ * MMF_DUMP_FILTER_SHIFT and mask it into default_dump_filter.
+ */
+static int __init coredump_filter_setup(char *s)
+{
+	default_dump_filter =
+		(simple_strtoul(s, NULL, 0) << MMF_DUMP_FILTER_SHIFT) &
+		MMF_DUMP_FILTER_MASK;
+	return 1;
+}
+
+__setup("coredump_filter=", coredump_filter_setup);
+
+#include
+
+static void mm_init_aio(struct mm_struct *mm)
+{
+#ifdef CONFIG_AIO
+	spin_lock_init(&mm->ioctx_lock);
+	INIT_HLIST_HEAD(&mm->ioctx_list);
+#endif
+}
+
+/*
+ * Initialise the bookkeeping fields of an already allocated @mm and
+ * allocate its pgd. @p becomes the mm owner via mm_init_owner().
+ * Flags come from current->mm (masked by MMF_INIT_MASK) or, without
+ * one, from default_dump_filter.
+ * Returns @mm, or NULL after freeing @mm when the pgd allocation fails.
+ */
+static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
+{
+	atomic_set(&mm->mm_users, 1);
+	atomic_set(&mm->mm_count, 1);
+	init_rwsem(&mm->mmap_sem);
+	INIT_LIST_HEAD(&mm->mmlist);
+	mm->flags = (current->mm) ?
+		(current->mm->flags & MMF_INIT_MASK) : default_dump_filter;
+	mm->core_state = NULL;
+	mm->nr_ptes = 0;
+	memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
+	spin_lock_init(&mm->page_table_lock);
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	mm->cached_hole_size = ~0UL;
+	mm_init_aio(mm);
+	mm_init_owner(mm, p);
+	atomic_set(&mm->oom_disable_count, 0);
+
+	if (likely(!mm_alloc_pgd(mm))) {
+		mm->def_flags = 0;
+		mmu_notifier_mm_init(mm);
+		return mm;
+	}
+
+	free_mm(mm);
+	return NULL;
+}
+
+/*
+ * Allocate and initialize an mm_struct.
+ */
+struct mm_struct * mm_alloc(void)
+{
+	struct mm_struct * mm;
+
+	mm = allocate_mm();
+	if (!mm)
+		return NULL;
+
+	memset(mm, 0, sizeof(*mm));
+	mm_init_cpumask(mm);
+	return mm_init(mm, current);
+}
+
+/*
+ * Called when the last reference to the mm
+ * is dropped: either by a lazy thread or by
+ * mmput. Free the page directory and the mm.
+ */
+void __mmdrop(struct mm_struct *mm)
+{
+	BUG_ON(mm == &init_mm);
+	mm_free_pgd(mm);
+	destroy_context(mm);
+	mmu_notifier_mm_destroy(mm);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	VM_BUG_ON(mm->pmd_huge_pte);
+#endif
+	free_mm(mm);
+}
+EXPORT_SYMBOL_GPL(__mmdrop);
+
+/*
+ * Decrement the use count and release all resources for an mm.
+ */
+void mmput(struct mm_struct *mm)
+{
+	might_sleep();
+
+	/* Other users remain: nothing to tear down yet. */
+	if (!atomic_dec_and_test(&mm->mm_users))
+		return;
+
+	exit_aio(mm);
+	ksm_exit(mm);
+	khugepaged_exit(mm); /* must run before exit_mmap */
+	exit_mmap(mm);
+	set_mm_exe_file(mm, NULL);
+
+	/* Unhook from the global mm list only if we were ever on it. */
+	if (!list_empty(&mm->mmlist)) {
+		spin_lock(&mmlist_lock);
+		list_del(&mm->mmlist);
+		spin_unlock(&mmlist_lock);
+	}
+
+	put_swap_token(mm);
+	if (mm->binfmt)
+		module_put(mm->binfmt->module);
+	mmdrop(mm);
+}
+EXPORT_SYMBOL_GPL(mmput);
+
+/*
+ * Bookkeeping for vmas that map the executable. Such vmas are created
+ * during exec only, never through the mmap system call.
+ * The mm's mmap_sem must be held for writing (down_write()) by callers
+ * of both the add and the remove helper.
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+	mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+	/* Still mapped somewhere: keep the exe_file reference. */
+	if (--mm->num_exe_file_vmas != 0)
+		return;
+
+	if (!mm->exe_file)
+		return;
+
+	fput(mm->exe_file);
+	mm->exe_file = NULL;
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+	struct file *old_exe_file;
+
+	/* Take the new reference before dropping the old one. */
+	if (new_exe_file)
+		get_file(new_exe_file);
+
+	old_exe_file = mm->exe_file;
+	if (old_exe_file)
+		fput(old_exe_file);
+
+	mm->exe_file = new_exe_file;
+	mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+	struct file *file;
+
+	/*
+	 * mmap_sem is held for reading so that the removal of
+	 * VM_EXECUTABLE vmas cannot race with this lookup.
+	 */
+	down_read(&mm->mmap_sem);
+	file = mm->exe_file;
+	if (file)
+		get_file(file);
+	up_read(&mm->mmap_sem);
+
+	return file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+	/*
+	 * Runs during fork, before the new task is visible in /proc, so
+	 * the exe_file pointer can be stored without exe_file_lock.
+	 */
+	struct file *exe_file = get_mm_exe_file(oldmm);
+
+	newmm->exe_file = exe_file;
+}
+
+/**
+ * get_task_mm - acquire a reference to the task's mm
+ *
+ * Returns %NULL if the task has no mm.
Checks PF_KTHREAD (meaning
+ * this kernel workthread has transiently adopted a user mm with use_mm,
+ * to do its AIO) is not set and if so returns a reference to it, after
+ * bumping up the use count. User must release the mm via mmput()
+ * after use. Typically used by /proc and ptrace.
+ */
+struct mm_struct *get_task_mm(struct task_struct *task)
+{
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm) {
+		if (task->flags & PF_KTHREAD)
+			mm = NULL;
+		else
+			atomic_inc(&mm->mm_users);
+	}
+	task_unlock(task);
+	return mm;
+}
+EXPORT_SYMBOL_GPL(get_task_mm);
+
+/* Please note the differences between mmput and mm_release.
+ * mmput is called whenever we stop holding onto a mm_struct,
+ * error success whatever.
+ *
+ * mm_release is called after a mm_struct has been removed
+ * from the current process.
+ *
+ * This difference is important for error handling, when we
+ * only half set up a mm_struct for a new process and need to restore
+ * the old one. Because we mmput the new mm_struct before
+ * restoring the old one. . .
+ * Eric Biederman 10 January 1998
+ */
+void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+	struct completion *vfork_done = tsk->vfork_done;
+
+	/* Get rid of any futexes when releasing the mm */
+#ifdef CONFIG_FUTEX
+	if (unlikely(tsk->robust_list)) {
+		exit_robust_list(tsk);
+		tsk->robust_list = NULL;
+	}
+#ifdef CONFIG_COMPAT
+	if (unlikely(tsk->compat_robust_list)) {
+		compat_exit_robust_list(tsk);
+		tsk->compat_robust_list = NULL;
+	}
+#endif
+	if (unlikely(!list_empty(&tsk->pi_state_list)))
+		exit_pi_state_list(tsk);
+#endif
+
+	/* Get rid of any cached register state */
+	deactivate_mm(tsk, mm);
+
+	/* notify parent sleeping on vfork() */
+	if (vfork_done) {
+		tsk->vfork_done = NULL;
+		complete(vfork_done);
+	}
+
+	/*
+	 * If we're exiting normally, clear a user-space tid field if
+	 * requested. We leave this alone when dying by signal, to leave
+	 * the value intact in a core dump, and to save the unnecessary
+	 * trouble otherwise. Userland only wants this done for a sys_exit.
+	 */
+	if (tsk->clear_child_tid) {
+		if (!(tsk->flags & PF_SIGNALED) &&
+		    atomic_read(&mm->mm_users) > 1) {
+			/*
+			 * We don't check the error code - if userspace has
+			 * not set up a proper pointer then tough luck.
+			 */
+			put_user(0, tsk->clear_child_tid);
+			sys_futex(tsk->clear_child_tid, FUTEX_WAKE,
+					1, NULL, NULL, 0);
+		}
+		tsk->clear_child_tid = NULL;
+	}
+}
+
+/*
+ * Allocate a new mm structure and copy into it the contents of
+ * current->mm. @tsk is only handed to mm_init() and init_new_context()
+ * for the new mm.
+ *
+ * Returns the new mm, or NULL if current has no mm or any step fails.
+ */
+struct mm_struct *dup_mm(struct task_struct *tsk)
+{
+	struct mm_struct *mm, *oldmm = current->mm;
+	int err;
+
+	if (!oldmm)
+		return NULL;
+
+	mm = allocate_mm();
+	if (!mm)
+		goto fail_nomem;
+
+	memcpy(mm, oldmm, sizeof(*mm));
+	mm_init_cpumask(mm);
+
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	mm->pmd_huge_pte = NULL;
+#endif
+
+	if (!mm_init(mm, tsk))
+		goto fail_nomem;
+
+	if (init_new_context(tsk, mm))
+		goto fail_nocontext;
+
+	dup_mm_exe_file(oldmm, mm);
+
+	err = dup_mmap(mm, oldmm);
+	if (err)
+		goto free_pt;
+
+	mm->hiwater_rss = get_mm_rss(mm);
+	mm->hiwater_vm = mm->total_vm;
+
+	if (mm->binfmt && !try_module_get(mm->binfmt->module))
+		goto free_pt;
+
+	return mm;
+
+free_pt:
+	/* don't put binfmt in mmput, we haven't got module yet */
+	mm->binfmt = NULL;
+	mmput(mm);
+
+fail_nomem:
+	return NULL;
+
+fail_nocontext:
+	/*
+	 * If init_new_context() failed, we cannot use mmput() to free the mm
+	 * because it calls destroy_context()
+	 */
+	mm_free_pgd(mm);
+	free_mm(mm);
+	return NULL;
+}
+
+/*
+ * Give @tsk its mm: none when current has no mm, a shared reference to
+ * current->mm under CLONE_VM, otherwise a copy made by dup_mm().
+ * Also resets @tsk's fault and context-switch counters.
+ * Returns 0 on success or -ENOMEM when dup_mm() fails.
+ */
+static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct mm_struct * mm, *oldmm;
+	int retval;
+
+	tsk->min_flt = tsk->maj_flt = 0;
+	tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
+
+	tsk->mm = NULL;
+	tsk->active_mm = NULL;
+
+	/*
+	 * Are we cloning a kernel thread?
+	 *
+	 * We need to steal a active VM for that..
+	 */
+	oldmm = current->mm;
+	if (!oldmm)
+		return 0;
+
+	if (clone_flags & CLONE_VM) {
+		atomic_inc(&oldmm->mm_users);
+		mm = oldmm;
+		goto good_mm;
+	}
+
+	retval = -ENOMEM;
+	mm = dup_mm(tsk);
+	if (!mm)
+		goto fail_nomem;
+
+good_mm:
+	/* Initializing for Swap token stuff */
+	mm->token_priority = 0;
+	mm->last_interval = 0;
+	if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+		atomic_inc(&mm->oom_disable_count);
+
+	tsk->mm = mm;
+	tsk->active_mm = mm;
+	return 0;
+
+fail_nomem:
+	return retval;
+}
+
+/*
+ * Under CLONE_FS bump current->fs->users (refused with -EAGAIN while
+ * fs->in_exec is set); otherwise give @tsk a copy of current's fs_struct.
+ * Returns 0, -EAGAIN or -ENOMEM.
+ */
+static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
+{
+	struct fs_struct *fs = current->fs;
+	if (clone_flags & CLONE_FS) {
+		/* tsk->fs is already what we want */
+		spin_lock(&fs->lock);
+		if (fs->in_exec) {
+			spin_unlock(&fs->lock);
+			return -EAGAIN;
+		}
+		fs->users++;
+		spin_unlock(&fs->lock);
+		return 0;
+	}
+	tsk->fs = copy_fs_struct(fs);
+	if (!tsk->fs)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Under CLONE_FILES take another reference on current's files_struct;
+ * otherwise install a duplicate made by dup_fd() in @tsk.
+ * Returns 0, or the error dup_fd() stored when it failed.
+ */
+static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
+{
+	struct files_struct *oldf, *newf;
+	int error = 0;
+
+	/*
+	 * A background process may not have any files ...
+	 */
+	oldf = current->files;
+	if (!oldf)
+		goto out;
+
+	if (clone_flags & CLONE_FILES) {
+		atomic_inc(&oldf->count);
+		goto out;
+	}
+
+	newf = dup_fd(oldf, &error);
+	if (!newf)
+		goto out;
+
+	tsk->files = newf;
+	error = 0;
+out:
+	return error;
+}
+
+/*
+ * Set up @tsk's io_context (CONFIG_BLOCK only): shared with current
+ * under CLONE_IO, otherwise a fresh one inheriting the ioprio when the
+ * parent's ioprio is valid. Returns 0 or -ENOMEM.
+ */
+static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
+{
+#ifdef CONFIG_BLOCK
+	struct io_context *ioc = current->io_context;
+
+	if (!ioc)
+		return 0;
+	/*
+	 * Share io context with parent, if CLONE_IO is set
+	 */
+	if (clone_flags & CLONE_IO) {
+		tsk->io_context = ioc_task_link(ioc);
+		if (unlikely(!tsk->io_context))
+			return -ENOMEM;
+	} else if (ioprio_valid(ioc->ioprio)) {
+		tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
+		if (unlikely(!tsk->io_context))
+			return -ENOMEM;
+
+		tsk->io_context->ioprio = ioc->ioprio;
+	}
+#endif
+	return 0;
+}
+
+/*
+ * Under CLONE_SIGHAND take another reference on current's sighand;
+ * otherwise allocate a new sighand_struct for @tsk with a count of 1 and
+ * a copy of current's action table. Returns 0 or -ENOMEM.
+ */
+static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
+{
+	struct sighand_struct *sig;
+
+	if (clone_flags & CLONE_SIGHAND) {
+		atomic_inc(&current->sighand->count);
+		return 0;
+	}
+	sig = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
+	/* published even when NULL, so tsk->sighand never keeps a stale copy */
+	rcu_assign_pointer(tsk->sighand, sig);
+	if (!sig)
+		return -ENOMEM;
+	atomic_set(&sig->count, 1);
+	memcpy(sig->action, current->sighand->action, sizeof(sig->action));
+	return 0;
+}
+
+/* Drop one sighand reference; the last one cleans up signalfd and frees it. */
+void __cleanup_sighand(struct sighand_struct *sighand)
+{
+	if (atomic_dec_and_test(&sighand->count)) {
+		signalfd_cleanup(sighand);
+		kmem_cache_free(sighand_cachep, sighand);
+	}
+}
+
+
+/*
+ * Initialize POSIX timer handling for a thread group.
+ */
+static void posix_cpu_timers_init_group(struct signal_struct *sig)
+{
+	unsigned long cpu_limit;
+
+	/* Thread group counters. */
+	thread_group_cputime_init(sig);
+
+	cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur);
+	if (cpu_limit != RLIM_INFINITY) {
+		sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit);
+		sig->cputimer.running = 1;
+	}
+
+	/* The timer lists. */
+	INIT_LIST_HEAD(&sig->cpu_timers[0]);
+	INIT_LIST_HEAD(&sig->cpu_timers[1]);
+	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+}
+
+/*
+ * Nothing to do under CLONE_THREAD. Otherwise allocate a zeroed
+ * signal_struct for @tsk and initialise it, copying rlimits and the
+ * oom_* settings from current. Returns 0 or -ENOMEM.
+ */
+static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
+{
+	struct signal_struct *sig;
+
+	if (clone_flags & CLONE_THREAD)
+		return 0;
+
+	sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL);
+	tsk->signal = sig;
+	if (!sig)
+		return -ENOMEM;
+
+	sig->nr_threads = 1;
+	atomic_set(&sig->live, 1);
+	atomic_set(&sig->sigcnt, 1);
+	init_waitqueue_head(&sig->wait_chldexit);
+	if (clone_flags & CLONE_NEWPID)
+		sig->flags |= SIGNAL_UNKILLABLE;
+	sig->curr_target = tsk;
+	init_sigpending(&sig->shared_pending);
+	INIT_LIST_HEAD(&sig->posix_timers);
+
+	hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	sig->real_timer.function = it_real_fn;
+
+	task_lock(current->group_leader);
+	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
+	task_unlock(current->group_leader);
+
+	posix_cpu_timers_init_group(sig);
+
+	tty_audit_fork(sig);
+	sched_autogroup_fork(sig);
+
+#ifdef CONFIG_CGROUPS
+	init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
+	sig->oom_adj = current->signal->oom_adj;
+	sig->oom_score_adj = current->signal->oom_score_adj;
+	sig->oom_score_adj_min = current->signal->oom_score_adj_min;
+
+	mutex_init(&sig->cred_guard_mutex);
+
+	return 0;
+}
+
+/*
+ * Derive the child's ->flags: clear PF_SUPERPRIV and PF_WQ_WORKER, set
+ * PF_FORKNOEXEC and PF_STARTING, and clear the freeze flag.
+ * @clone_flags is not consulted.
+ */
+static void copy_flags(unsigned long clone_flags, struct task_struct *p)
+{
+	unsigned long new_flags = p->flags;
+
+	new_flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
+	new_flags |= PF_FORKNOEXEC;
+	new_flags |= PF_STARTING;
+	p->flags = new_flags;
+	clear_freeze_flag(p);
+}
+
+/* Record @tidptr as current's clear_child_tid; returns the caller's pid. */
+SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
+{
+	current->clear_child_tid = tidptr;
+
+	return task_pid_vnr(current);
+}
+
+/* Initialise @p's pi_lock and, with CONFIG_RT_MUTEXES, its PI waiter state. */
+static void rt_mutex_init_task(struct task_struct *p)
+{
+	raw_spin_lock_init(&p->pi_lock);
+#ifdef CONFIG_RT_MUTEXES
+	plist_head_init(&p->pi_waiters);
+	p->pi_blocked_on = NULL;
+#endif
+}
+
+#ifdef CONFIG_MM_OWNER
+void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
+{
+	mm->owner = p;
+}
+#endif /* CONFIG_MM_OWNER */
+
+/*
+ * Initialize POSIX timer handling for a single task.
+ */
+static void posix_cpu_timers_init(struct task_struct *tsk)
+{
+	tsk->cputime_expires.prof_exp = cputime_zero;
+	tsk->cputime_expires.virt_exp = cputime_zero;
+	tsk->cputime_expires.sched_exp = 0;
+	INIT_LIST_HEAD(&tsk->cpu_timers[0]);
+	INIT_LIST_HEAD(&tsk->cpu_timers[1]);
+	INIT_LIST_HEAD(&tsk->cpu_timers[2]);
+}
+
+/*
+ * This creates a new process as a copy of the old one,
+ * but does not actually start it yet.
+ *
+ * It copies the registers, and all the appropriate
+ * parts of the process environment (as per the clone
+ * flags). The actual kick-off is left to the caller.
+ */
+static struct task_struct *copy_process(unsigned long clone_flags,
+					unsigned long stack_start,
+					struct pt_regs *regs,
+					unsigned long stack_size,
+					int __user *child_tidptr,
+					struct pid *pid,
+					int trace)
+{
+	int retval;
+	struct task_struct *p;
+	int cgroup_callbacks_done = 0;
+
+	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Thread groups must share signals as well, and detached threads
+	 * can only be started up within the thread group.
+	 */
+	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Shared signal handlers imply shared VM. By way of the above,
+	 * thread groups also imply shared VM. Blocking this case allows
+	 * for various simplifications in other code.
+	 */
+	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Siblings of global init remain as zombies on exit since they are
+	 * not reaped by their parent (swapper). To solve this and to avoid
+	 * multi-rooted process trees, prevent global and container-inits
+ */ + if ((clone_flags & CLONE_PARENT) && + current->signal->flags & SIGNAL_UNKILLABLE) + return ERR_PTR(-EINVAL); + + retval = security_task_create(clone_flags); + if (retval) + goto fork_out; + + retval = -ENOMEM; + p = dup_task_struct(current); + if (!p) + goto fork_out; + + ftrace_graph_init_task(p); + + rt_mutex_init_task(p); + +#ifdef CONFIG_PROVE_LOCKING + DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); + DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); +#endif + retval = -EAGAIN; + if (atomic_read(&p->real_cred->user->processes) >= + task_rlimit(p, RLIMIT_NPROC)) { + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && + p->real_cred->user != INIT_USER) + goto bad_fork_free; + } + + retval = copy_creds(p, clone_flags); + if (retval < 0) + goto bad_fork_free; + + /* + * If multiple threads are within copy_process(), then this check + * triggers too late. This doesn't hurt, the check is only there + * to stop root fork bombs. + */ + retval = -EAGAIN; + if (nr_threads >= max_threads) + goto bad_fork_cleanup_count; + + if (!try_module_get(task_thread_info(p)->exec_domain->module)) + goto bad_fork_cleanup_count; + + p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ + copy_flags(clone_flags, p); + INIT_LIST_HEAD(&p->children); + INIT_LIST_HEAD(&p->sibling); + rcu_copy_process(p); + p->vfork_done = NULL; + spin_lock_init(&p->alloc_lock); + + init_sigpending(&p->pending); + + p->utime = cputime_zero; + p->stime = cputime_zero; + p->gtime = cputime_zero; + p->utimescaled = cputime_zero; + p->stimescaled = cputime_zero; +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + p->prev_utime = cputime_zero; + p->prev_stime = cputime_zero; +#endif +#if defined(SPLIT_RSS_COUNTING) + memset(&p->rss_stat, 0, sizeof(p->rss_stat)); +#endif + + p->default_timer_slack_ns = current->timer_slack_ns; + + task_io_accounting_init(&p->ioac); + acct_clear_integrals(p); + + posix_cpu_timers_init(p); + + do_posix_clock_monotonic_gettime(&p->start_time); + 
p->real_start_time = p->start_time; + monotonic_to_bootbased(&p->real_start_time); + p->io_context = NULL; + p->audit_context = NULL; + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_lock(current); + cgroup_fork(p); +#ifdef CONFIG_NUMA + p->mempolicy = mpol_dup(p->mempolicy); + if (IS_ERR(p->mempolicy)) { + retval = PTR_ERR(p->mempolicy); + p->mempolicy = NULL; + goto bad_fork_cleanup_cgroup; + } + mpol_fix_fork_child_flag(p); +#endif +#ifdef CONFIG_TRACE_IRQFLAGS + p->irq_events = 0; +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + p->hardirqs_enabled = 1; +#else + p->hardirqs_enabled = 0; +#endif + p->hardirq_enable_ip = 0; + p->hardirq_enable_event = 0; + p->hardirq_disable_ip = _THIS_IP_; + p->hardirq_disable_event = 0; + p->softirqs_enabled = 1; + p->softirq_enable_ip = _THIS_IP_; + p->softirq_enable_event = 0; + p->softirq_disable_ip = 0; + p->softirq_disable_event = 0; + p->hardirq_context = 0; + p->softirq_context = 0; +#endif +#ifdef CONFIG_LOCKDEP + p->lockdep_depth = 0; /* no locks held yet */ + p->curr_chain_key = 0; + p->lockdep_recursion = 0; +#endif + +#ifdef CONFIG_DEBUG_MUTEXES + p->blocked_on = NULL; /* not blocked yet */ +#endif +#ifdef CONFIG_CGROUP_MEM_RES_CTLR + p->memcg_batch.do_batch = 0; + p->memcg_batch.memcg = NULL; +#endif + + /* Perform scheduler related setup. Assign this task to a CPU. 
*/ + sched_fork(p); + + retval = perf_event_init_task(p); + if (retval) + goto bad_fork_cleanup_policy; + + if ((retval = audit_alloc(p))) + goto bad_fork_cleanup_policy; + /* copy all the process information */ + if ((retval = copy_semundo(clone_flags, p))) + goto bad_fork_cleanup_audit; + if ((retval = copy_files(clone_flags, p))) + goto bad_fork_cleanup_semundo; + if ((retval = copy_fs(clone_flags, p))) + goto bad_fork_cleanup_files; + if ((retval = copy_sighand(clone_flags, p))) + goto bad_fork_cleanup_fs; + if ((retval = copy_signal(clone_flags, p))) + goto bad_fork_cleanup_sighand; + if ((retval = copy_mm(clone_flags, p))) + goto bad_fork_cleanup_signal; + if ((retval = copy_namespaces(clone_flags, p))) + goto bad_fork_cleanup_mm; + if ((retval = copy_io(clone_flags, p))) + goto bad_fork_cleanup_namespaces; + retval = copy_thread(clone_flags, stack_start, stack_size, p, regs); + if (retval) + goto bad_fork_cleanup_io; + + if (pid != &init_struct_pid) { + retval = -ENOMEM; + pid = alloc_pid(p->nsproxy->pid_ns); + if (!pid) + goto bad_fork_cleanup_io; + } + + p->pid = pid_nr(pid); + p->tgid = p->pid; + if (clone_flags & CLONE_THREAD) + p->tgid = current->tgid; + + p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; + /* + * Clear TID on mm_release()? + */ + p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL; +#ifdef CONFIG_BLOCK + p->plug = NULL; +#endif +#ifdef CONFIG_FUTEX + p->robust_list = NULL; +#ifdef CONFIG_COMPAT + p->compat_robust_list = NULL; +#endif + INIT_LIST_HEAD(&p->pi_state_list); + p->pi_state_cache = NULL; +#endif + /* + * sigaltstack should be cleared when sharing the same VM + */ + if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM) + p->sas_ss_sp = p->sas_ss_size = 0; + + /* + * Syscall tracing and stepping should be turned off in the + * child regardless of CLONE_PTRACE. 
+ */ + user_disable_single_step(p); + clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); +#ifdef TIF_SYSCALL_EMU + clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#endif + clear_all_latency_tracing(p); + + /* ok, now we should be set up.. */ + p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL); + p->pdeath_signal = 0; + p->exit_state = 0; + + /* + * Ok, make it visible to the rest of the system. + * We dont wake it up yet. + */ + p->group_leader = p; + INIT_LIST_HEAD(&p->thread_group); + + /* Now that the task is set up, run cgroup callbacks if + * necessary. We need to run them before the task is visible + * on the tasklist. */ + cgroup_fork_callbacks(p); + cgroup_callbacks_done = 1; + + /* Need tasklist lock for parent etc handling! */ + write_lock_irq(&tasklist_lock); + + /* CLONE_PARENT re-uses the old parent */ + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { + p->real_parent = current->real_parent; + p->parent_exec_id = current->parent_exec_id; + } else { + p->real_parent = current; + p->parent_exec_id = current->self_exec_id; + } + + spin_lock(¤t->sighand->siglock); + + /* + * Process group and session signals need to be delivered to just the + * parent before the fork or both the parent and the child after the + * fork. Restart if a signal comes in before we add the new process to + * it's process group. + * A fatal signal pending means that current will exit, so the new + * thread can't slip out of an OOM kill (or normal SIGKILL). 
+ */ + recalc_sigpending(); + if (signal_pending(current)) { + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + retval = -ERESTARTNOINTR; + goto bad_fork_free_pid; + } + + if (clone_flags & CLONE_THREAD) { + current->signal->nr_threads++; + atomic_inc(¤t->signal->live); + atomic_inc(¤t->signal->sigcnt); + p->group_leader = current->group_leader; + list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); + } + + if (likely(p->pid)) { + tracehook_finish_clone(p, clone_flags, trace); + + if (thread_group_leader(p)) { + if (is_child_reaper(pid)) + p->nsproxy->pid_ns->child_reaper = p; + + p->signal->leader_pid = pid; + p->signal->tty = tty_kref_get(current->signal->tty); + attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); + attach_pid(p, PIDTYPE_SID, task_session(current)); + list_add_tail(&p->sibling, &p->real_parent->children); + list_add_tail_rcu(&p->tasks, &init_task.tasks); + __this_cpu_inc(process_counts); + } + attach_pid(p, PIDTYPE_PID, pid); + nr_threads++; + } + + total_forks++; + spin_unlock(¤t->sighand->siglock); + write_unlock_irq(&tasklist_lock); + proc_fork_connector(p); + cgroup_post_fork(p); + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); + perf_event_fork(p); + return p; + +bad_fork_free_pid: + if (pid != &init_struct_pid) + free_pid(pid); +bad_fork_cleanup_io: + if (p->io_context) + exit_io_context(p); +bad_fork_cleanup_namespaces: + if (unlikely(clone_flags & CLONE_NEWPID)) + pid_ns_release_proc(p->nsproxy->pid_ns); + exit_task_namespaces(p); +bad_fork_cleanup_mm: + if (p->mm) { + task_lock(p); + if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + atomic_dec(&p->mm->oom_disable_count); + task_unlock(p); + mmput(p->mm); + } +bad_fork_cleanup_signal: + if (!(clone_flags & CLONE_THREAD)) + free_signal_struct(p->signal); +bad_fork_cleanup_sighand: + __cleanup_sighand(p->sighand); +bad_fork_cleanup_fs: + exit_fs(p); /* blocking */ +bad_fork_cleanup_files: + exit_files(p); /* blocking */ 
+bad_fork_cleanup_semundo: + exit_sem(p); +bad_fork_cleanup_audit: + audit_free(p); +bad_fork_cleanup_policy: + perf_event_free_task(p); +#ifdef CONFIG_NUMA + mpol_put(p->mempolicy); +bad_fork_cleanup_cgroup: +#endif + if (clone_flags & CLONE_THREAD) + threadgroup_fork_read_unlock(current); + cgroup_exit(p, cgroup_callbacks_done); + delayacct_tsk_free(p); + module_put(task_thread_info(p)->exec_domain->module); +bad_fork_cleanup_count: + atomic_dec(&p->cred->user->processes); + exit_creds(p); +bad_fork_free: + free_task(p); +fork_out: + return ERR_PTR(retval); +} + +noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + return regs; +} + +static inline void init_idle_pids(struct pid_link *links) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&links[type].node); /* not really needed */ + links[type].pid = &init_struct_pid; + } +} + +struct task_struct * __cpuinit fork_idle(int cpu) +{ + struct task_struct *task; + struct pt_regs regs; + + task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, + &init_struct_pid, 0); + if (!IS_ERR(task)) { + init_idle_pids(task->pids); + init_idle(task, cpu); + } + + return task; +} + +/* + * Ok, this is the main fork-routine. + * + * It copies the process, and if successful kick-starts + * it and waits for it to finish using the VM if required. 
+ */
+long do_fork(unsigned long clone_flags,
+	      unsigned long stack_start,
+	      struct pt_regs *regs,
+	      unsigned long stack_size,
+	      int __user *parent_tidptr,
+	      int __user *child_tidptr)
+{
+	struct task_struct *p;
+	int trace = 0;	/* only set for user-mode callers, see below */
+	long nr;	/* task_pid_vnr() of the child, or PTR_ERR() on failure */
+
+	/*
+	 * Do some preliminary argument and permissions checking before we
+	 * actually start allocating stuff
+	 */
+	if (clone_flags & CLONE_NEWUSER) {
+		if (clone_flags & CLONE_THREAD)
+			return -EINVAL;
+		/* hopefully this check will go away when userns support is
+		 * complete
+		 */
+		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
+				!capable(CAP_SETGID))
+			return -EPERM;
+	}
+
+	/*
+	 * When called from kernel_thread, don't do user tracing stuff.
+	 */
+	if (likely(user_mode(regs)))
+		trace = tracehook_prepare_clone(clone_flags);
+
+	p = copy_process(clone_flags, stack_start, regs, stack_size,
+			 child_tidptr, NULL, trace);
+	/*
+	 * Do this prior waking up the new thread - the thread pointer
+	 * might get invalid after that point, if the thread exits quickly.
+	 */
+	if (!IS_ERR(p)) {
+		struct completion vfork;	/* on this stack frame; waited on below */
+
+		trace_sched_process_fork(current, p);
+
+		nr = task_pid_vnr(p);
+
+		if (clone_flags & CLONE_PARENT_SETTID)
+			put_user(nr, parent_tidptr);	/* result is not checked */
+
+		if (clone_flags & CLONE_VFORK) {
+			p->vfork_done = &vfork;
+			init_completion(&vfork);
+		}
+
+		audit_finish_fork(p);
+		tracehook_report_clone(regs, clone_flags, nr, p);
+
+		/*
+		 * We set PF_STARTING at creation in case tracing wants to
+		 * use this to distinguish a fully live task from one that
+		 * hasn't gotten to tracehook_report_clone() yet.  Now we
+		 * clear it and set the child going.
+		 */
+		p->flags &= ~PF_STARTING;
+
+		wake_up_new_task(p);
+
+		tracehook_report_clone_complete(trace, regs,
+						clone_flags, nr, p);
+
+		if (clone_flags & CLONE_VFORK) {
+			freezer_do_not_count();
+			wait_for_completion(&vfork);	/* block until p->vfork_done is completed */
+			freezer_count();
+			tracehook_report_vfork_done(p, nr);
+		}
+	} else {
+		nr = PTR_ERR(p);	/* copy_process() failed: hand back its error code */
+	}
+	return nr;
+}
+
+#ifndef ARCH_MIN_MMSTRUCT_ALIGN
+#define ARCH_MIN_MMSTRUCT_ALIGN 0
+#endif
+/* Passed to kmem_cache_create() for "sighand_cache": sets up the siglock and signalfd wait queue head. */
+static void sighand_ctor(void *data)
+{
+	struct sighand_struct *sighand = data;
+
+	spin_lock_init(&sighand->siglock);
+	init_waitqueue_head(&sighand->signalfd_wqh);
+}
+/* Create the sighand, signal, files, fs, mm_struct and vm_area_struct caches, then call mmap_init(). */
+void __init proc_caches_init(void)
+{
+	sighand_cachep = kmem_cache_create("sighand_cache",
+			sizeof(struct sighand_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
+			SLAB_NOTRACK, sighand_ctor);
+	signal_cachep = kmem_cache_create("signal_cache",
+			sizeof(struct signal_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	files_cachep = kmem_cache_create("files_cache",
+			sizeof(struct files_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	fs_cachep = kmem_cache_create("fs_cache",
+			sizeof(struct fs_struct), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	/*
+	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
+	 * whole struct cpumask for the OFFSTACK case. We could change
+	 * this to *only* allocate as much of it as required by the
+	 * maximum number of CPU's we can ever have.  The cpumask_allocation
+	 * is at the end of the structure, exactly for that reason.
+	 */
+	mm_cachep = kmem_cache_create("mm_struct",
+			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+	mmap_init();
+}
+
+/*
+ * Check constraints on flags passed to the unshare system call.
+ */ +static int check_unshare_flags(unsigned long unshare_flags) +{ + if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| + CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| + CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET)) + return -EINVAL; + /* + * Not implemented, but pretend it works if there is nothing to + * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND + * needs to unshare vm. + */ + if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { + /* FIXME: get_task_mm() increments ->mm_users */ + if (atomic_read(¤t->mm->mm_users) > 1) + return -EINVAL; + } + + return 0; +} + +/* + * Unshare the filesystem structure if it is being shared + */ +static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp) +{ + struct fs_struct *fs = current->fs; + + if (!(unshare_flags & CLONE_FS) || !fs) + return 0; + + /* don't need lock here; in the worst case we'll do useless copy */ + if (fs->users == 1) + return 0; + + *new_fsp = copy_fs_struct(fs); + if (!*new_fsp) + return -ENOMEM; + + return 0; +} + +/* + * Unshare file descriptor table if it is being shared + */ +static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp) +{ + struct files_struct *fd = current->files; + int error = 0; + + if ((unshare_flags & CLONE_FILES) && + (fd && atomic_read(&fd->count) > 1)) { + *new_fdp = dup_fd(fd, &error); + if (!*new_fdp) + return error; + } + + return 0; +} + +/* + * unshare allows a process to 'unshare' part of the process + * context which was originally shared using clone. copy_* + * functions used by do_fork() cannot be used here directly + * because they modify an inactive task_struct that is being + * constructed. Here we are modifying the current, active, + * task_struct. 
+ */
+SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
+{
+	struct fs_struct *fs, *new_fs = NULL;
+	struct files_struct *fd, *new_fd = NULL;
+	struct nsproxy *new_nsproxy = NULL;
+	int do_sysvsem = 0;
+	int err;
+
+	err = check_unshare_flags(unshare_flags);
+	if (err)
+		goto bad_unshare_out;
+
+	/*
+	 * If unsharing namespace, must also unshare filesystem information.
+	 */
+	if (unshare_flags & CLONE_NEWNS)
+		unshare_flags |= CLONE_FS;
+	/*
+	 * CLONE_NEWIPC must also detach from the undolist: after switching
+	 * to a new ipc namespace, the semaphore arrays from the old
+	 * namespace are unreachable.
+	 */
+	if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
+		do_sysvsem = 1;
+	if ((err = unshare_fs(unshare_flags, &new_fs)))
+		goto bad_unshare_out;
+	if ((err = unshare_fd(unshare_flags, &new_fd)))
+		goto bad_unshare_cleanup_fs;
+	if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
+			new_fs)))
+		goto bad_unshare_cleanup_fd;
+
+	if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
+		if (do_sysvsem) {
+			/*
+			 * CLONE_SYSVSEM is equivalent to sys_exit().
+			 */
+			exit_sem(current);
+		}
+
+		if (new_nsproxy) {
+			switch_task_namespaces(current, new_nsproxy);
+			new_nsproxy = NULL;	/* so the put_nsproxy() below is skipped */
+		}
+
+		task_lock(current);
+
+		if (new_fs) {
+			fs = current->fs;
+			spin_lock(&fs->lock);
+			current->fs = new_fs;
+			if (--fs->users)
+				new_fs = NULL;	/* old fs still has users: nothing to free below */
+			else
+				new_fs = fs;	/* we were the last user: free the old fs below */
+			spin_unlock(&fs->lock);
+		}
+
+		if (new_fd) {
+			fd = current->files;
+			current->files = new_fd;
+			new_fd = fd;	/* old table gets put_files_struct() below */
+		}
+
+		task_unlock(current);
+	}
+
+	if (new_nsproxy)	/* NOTE(review): looks unreachable, new_nsproxy is cleared above whenever it was set */
+		put_nsproxy(new_nsproxy);
+
+bad_unshare_cleanup_fd:
+	if (new_fd)
+		put_files_struct(new_fd);
+
+bad_unshare_cleanup_fs:
+	if (new_fs)
+		free_fs_struct(new_fs);
+
+bad_unshare_out:
+	return err;	/* 0 on success: the last helper called returned 0 */
+}
+
+/*
+ *	Helper to unshare the files of the current task.
+ *	We don't want to expose copy_files internals to
+ *	the exec layer of the kernel.
+ */ + +int unshare_files(struct files_struct **displaced) +{ + struct task_struct *task = current; + struct files_struct *copy = NULL; + int error; + + error = unshare_fd(CLONE_FILES, ©); + if (error || !copy) { + *displaced = NULL; + return error; + } + *displaced = task->files; + task_lock(task); + task->files = copy; + task_unlock(task); + return 0; +} diff --git a/kernel/freezer.c b/kernel/freezer.c new file mode 100644 index 00000000..7b01de98 --- /dev/null +++ b/kernel/freezer.c @@ -0,0 +1,166 @@ +/* + * kernel/freezer.c - Function to freeze a process + * + * Originally from kernel/power/process.c + */ + +#include +#include +#include +#include +#include + +/* + * freezing is complete, mark current process as frozen + */ +static inline void frozen_process(void) +{ + if (!unlikely(current->flags & PF_NOFREEZE)) { + current->flags |= PF_FROZEN; + smp_wmb(); + } + clear_freeze_flag(current); +} + +/* Refrigerator is place where frozen processes are stored :-). */ +void refrigerator(void) +{ + /* Hmm, should we be allowed to suspend when there are realtime + processes around? 
*/ + long save; + + task_lock(current); + if (freezing(current)) { + frozen_process(); + task_unlock(current); + } else { + task_unlock(current); + return; + } + save = current->state; + pr_debug("%s entered refrigerator\n", current->comm); + + spin_lock_irq(¤t->sighand->siglock); + recalc_sigpending(); /* We sent fake signal, clean it up */ + spin_unlock_irq(¤t->sighand->siglock); + + /* prevent accounting of that task to load */ + current->flags |= PF_FREEZING; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!frozen(current)) + break; + schedule(); + } + + /* Remove the accounting blocker */ + current->flags &= ~PF_FREEZING; + + pr_debug("%s left refrigerator\n", current->comm); + __set_current_state(save); +} +EXPORT_SYMBOL(refrigerator); + +static void fake_signal_wake_up(struct task_struct *p) +{ + unsigned long flags; + + spin_lock_irqsave(&p->sighand->siglock, flags); + signal_wake_up(p, 0); + spin_unlock_irqrestore(&p->sighand->siglock, flags); +} + +/** + * freeze_task - send a freeze request to given task + * @p: task to send the request to + * @sig_only: if set, the request will only be sent if the task has the + * PF_FREEZER_NOSIG flag unset + * Return value: 'false', if @sig_only is set and the task has + * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise + * + * The freeze request is sent by setting the tasks's TIF_FREEZE flag and + * either sending a fake signal to it or waking it up, depending on whether + * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task + * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its + * TIF_FREEZE flag will not be set. + */ +bool freeze_task(struct task_struct *p, bool sig_only) +{ + /* + * We first check if the task is freezing and next if it has already + * been frozen to avoid the race with frozen_process() which first marks + * the task as frozen and next clears its TIF_FREEZE. 
+ */ + if (!freezing(p)) { + smp_rmb(); + if (frozen(p)) + return false; + + if (!sig_only || should_send_signal(p)) + set_freeze_flag(p); + else + return false; + } + + if (should_send_signal(p)) { + fake_signal_wake_up(p); + /* + * fake_signal_wake_up() goes through p's scheduler + * lock and guarantees that TASK_STOPPED/TRACED -> + * TASK_RUNNING transition can't race with task state + * testing in try_to_freeze_tasks(). + */ + } else if (sig_only) { + return false; + } else { + wake_up_state(p, TASK_INTERRUPTIBLE); + } + + return true; +} + +void cancel_freezing(struct task_struct *p) +{ + unsigned long flags; + + if (freezing(p)) { + pr_debug(" clean up: %s\n", p->comm); + clear_freeze_flag(p); + spin_lock_irqsave(&p->sighand->siglock, flags); + recalc_sigpending_and_wake(p); + spin_unlock_irqrestore(&p->sighand->siglock, flags); + } +} + +static int __thaw_process(struct task_struct *p) +{ + if (frozen(p)) { + p->flags &= ~PF_FROZEN; + return 1; + } + clear_freeze_flag(p); + return 0; +} + +/* + * Wake up a frozen process + * + * task_lock() is needed to prevent the race with refrigerator() which may + * occur if the freezing of tasks fails. Namely, without the lock, if the + * freezing of tasks failed, thaw_tasks() might have run before a task in + * refrigerator() could call frozen_process(), in which case the task would be + * frozen and no one would thaw it. + */ +int thaw_process(struct task_struct *p) +{ + task_lock(p); + if (__thaw_process(p) == 1) { + task_unlock(p); + wake_up_process(p); + return 1; + } + task_unlock(p); + return 0; +} +EXPORT_SYMBOL(thaw_process); diff --git a/kernel/futex.c b/kernel/futex.c new file mode 100644 index 00000000..5e3a5347 --- /dev/null +++ b/kernel/futex.c @@ -0,0 +1,2733 @@ +/* + * Fast Userspace Mutexes (which I call "Futexes!"). 
+ * (C) Rusty Russell, IBM 2002 + * + * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar + * (C) Copyright 2003 Red Hat Inc, All Rights Reserved + * + * Removed page pinning, fix privately mapped COW pages and other cleanups + * (C) Copyright 2003, 2004 Jamie Lokier + * + * Robust futex support started by Ingo Molnar + * (C) Copyright 2006 Red Hat Inc, All Rights Reserved + * Thanks to Thomas Gleixner for suggestions, analysis and fixes. + * + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner + * + * PRIVATE futexes by Eric Dumazet + * Copyright (C) 2007 Eric Dumazet + * + * Requeue-PI support by Darren Hart + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * + * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly + * enough at me, Linus for the original (flawed) idea, Matthew + * Kirkwood for proof-of-concept implementation. + * + * "The futexes are also cursed." + * "But they come in a choice of three flavours!" + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "rtmutex_common.h" + +int __read_mostly futex_cmpxchg_enabled; + +#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) + +/* + * Futex flags used to encode options to functions and preserve them across + * restarts. + */ +#define FLAGS_SHARED 0x01 +#define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 + +/* + * Priority Inheritance state: + */ +struct futex_pi_state { + /* + * list of 'owned' pi_state instances - these have to be + * cleaned up in do_exit() if the task exits prematurely: + */ + struct list_head list; + + /* + * The PI object: + */ + struct rt_mutex pi_mutex; + + struct task_struct *owner; + atomic_t refcount; + + union futex_key key; +}; + +/** + * struct futex_q - The hashed futex queue entry, one per waiting task + * @list: priority-sorted list of tasks waiting on this futex + * @task: the task waiting on the futex + * @lock_ptr: the hash bucket lock + * @key: the key the futex is hashed on + * @pi_state: optional priority inheritance state + * @rt_waiter: rt_waiter storage for use with requeue_pi + * @requeue_pi_key: the requeue_pi target futex key + * @bitset: bitset for the optional bitmasked wakeup + * + * We use this hashed waitqueue, instead of a normal wait_queue_t, so + * we can wake only the relevant ones (hashed queues may be shared). + * + * A futex_q has a woken state, just like tasks have TASK_RUNNING. + * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0. + * The order of wakeup is always to make the first condition true, then + * the second. 
+ * + * PI futexes are typically woken before they are removed from the hash list via + * the rt_mutex code. See unqueue_me_pi(). + */ +struct futex_q { + struct plist_node list; + + struct task_struct *task; + spinlock_t *lock_ptr; + union futex_key key; + struct futex_pi_state *pi_state; + struct rt_mutex_waiter *rt_waiter; + union futex_key *requeue_pi_key; + u32 bitset; +}; + +static const struct futex_q futex_q_init = { + /* list gets initialized in queue_me()*/ + .key = FUTEX_KEY_INIT, + .bitset = FUTEX_BITSET_MATCH_ANY +}; + +/* + * Hash buckets are shared by all the futex_keys that hash to the same + * location. Each key may have multiple futex_q structures, one for each task + * waiting on a futex. + */ +struct futex_hash_bucket { + spinlock_t lock; + struct plist_head chain; +}; + +static struct futex_hash_bucket futex_queues[1<both.word, + (sizeof(key->both.word)+sizeof(key->both.ptr))/4, + key->both.offset); + return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)]; +} + +/* + * Return 1 if two futex_keys are equal, 0 otherwise. + */ +static inline int match_futex(union futex_key *key1, union futex_key *key2) +{ + return (key1 && key2 + && key1->both.word == key2->both.word + && key1->both.ptr == key2->both.ptr + && key1->both.offset == key2->both.offset); +} + +/* + * Take a reference to the resource addressed by a key. + * Can be called while holding spinlocks. + * + */ +static void get_futex_key_refs(union futex_key *key) +{ + if (!key->both.ptr) + return; + + switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { + case FUT_OFF_INODE: + ihold(key->shared.inode); + break; + case FUT_OFF_MMSHARED: + atomic_inc(&key->private.mm->mm_count); + break; + } +} + +/* + * Drop a reference to the resource addressed by a key. + * The hash bucket spinlock must not be held. 
+ */
+static void drop_futex_key_refs(union futex_key *key)
+{
+	if (!key->both.ptr) {
+		/* If we're here then we tried to put a key we failed to get */
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+	case FUT_OFF_INODE:
+		iput(key->shared.inode);
+		break;
+	case FUT_OFF_MMSHARED:
+		mmdrop(key->private.mm);
+		break;
+	}
+}
+
+/**
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr:	virtual address of the futex
+ * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @key:	address where result is stored.
+ * @rw:		mapping needs to be read/write (values: VERIFY_READ,
+ *		VERIFY_WRITE)
+ *
+ * Returns a negative error code or 0
+ * The key words are stored in *key on success.
+ *
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
+ * offset_within_page).  For private mappings, it's (uaddr, current->mm).
+ * We can usually work out the index without swapping in the page.
+ *
+ * lock_page() might sleep, the caller should not hold a spinlock.
+ */
+static int
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+{
+	unsigned long address = (unsigned long)uaddr;
+	struct mm_struct *mm = current->mm;
+	struct page *page, *page_head;
+	int err, ro = 0;	/* ro is set once we fall back to a read-only lookup */
+
+	/*
+	 * The futex address must be "naturally" aligned.
+	 */
+	key->both.offset = address % PAGE_SIZE;
+	if (unlikely((address % sizeof(u32)) != 0))
+		return -EINVAL;
+	address -= key->both.offset;	/* address is now page aligned */
+
+	/*
+	 * PROCESS_PRIVATE futexes are fast.
+	 * As the mm cannot disappear under us and the 'key' only needs
+	 * virtual address, we don't even have to find the underlying vma.
+	 * Note : We do have to check 'uaddr' is a valid user address,
+	 *        but access_ok() should be faster than find_vma()
+	 */
+	if (!fshared) {
+		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+			return -EFAULT;
+		key->private.mm = mm;
+		key->private.address = address;
+		get_futex_key_refs(key);
+		return 0;
+	}
+
+again:
+	err = get_user_pages_fast(address, 1, 1, &page);
+	/*
+	 * If write access is not required (eg. FUTEX_WAIT), try
+	 * and get read-only access.
+	 */
+	if (err == -EFAULT && rw == VERIFY_READ) {
+		err = get_user_pages_fast(address, 1, 0, &page);
+		ro = 1;
+	}
+	if (err < 0)
+		return err;
+	else
+		err = 0;	/* non-negative result: reset so success returns 0 */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+	page_head = page;
+	if (unlikely(PageTail(page))) {
+		put_page(page);
+		/* serialize against __split_huge_page_splitting() */
+		local_irq_disable();
+		if (likely(__get_user_pages_fast(address, 1, 1, &page) == 1)) {
+			page_head = compound_head(page);
+			/*
+			 * page_head is valid pointer but we must pin
+			 * it before taking the PG_lock and/or
+			 * PG_compound_lock. The moment we re-enable
+			 * irqs __split_huge_page_splitting() can
+			 * return and the head page can be freed from
+			 * under us. We can't take the PG_lock and/or
+			 * PG_compound_lock on a page that could be
+			 * freed from under us.
+			 */
+			if (page != page_head) {
+				get_page(page_head);
+				put_page(page);
+			}
+			local_irq_enable();
+		} else {
+			local_irq_enable();
+			goto again;
+		}
+	}
+#else
+	page_head = compound_head(page);
+	if (page != page_head) {
+		get_page(page_head);	/* move our reference from the tail page to the head */
+		put_page(page);
+	}
+#endif
+
+	lock_page(page_head);
+
+	/*
+	 * If page_head->mapping is NULL, then it cannot be a PageAnon
+	 * page; but it might be the ZERO_PAGE or in the gate area or
+	 * in a special mapping (all cases which we are happy to fail);
+	 * or it may have been a good file page when get_user_pages_fast
+	 * found it, but truncated or holepunched or subjected to
+	 * invalidate_complete_page2 before we got the page lock (also
+	 * cases which we are happy to fail).  And we hold a reference,
+	 * so refcount care in invalidate_complete_page's remove_mapping
+	 * prevents drop_caches from setting mapping to NULL beneath us.
+	 *
+	 * The case we do have to guard against is when memory pressure made
+	 * shmem_writepage move it from filecache to swapcache beneath us:
+	 * an unlikely race, but we do need to retry for page_head->mapping.
+	 */
+	if (!page_head->mapping) {
+		int shmem_swizzled = PageSwapCache(page_head);	/* sampled before the page is unlocked */
+		unlock_page(page_head);
+		put_page(page_head);
+		if (shmem_swizzled)
+			goto again;
+		return -EFAULT;
+	}
+
+	/*
+	 * Private mappings are handled in a simple way.
+	 *
+	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
+	 * it's a read-only handle, it's expected that futexes attach to
+	 * the object not the particular process.
+	 */
+	if (PageAnon(page_head)) {
+		/*
+		 * A RO anonymous page will never change and thus doesn't make
+		 * sense for futex operations.
+		 */
+		if (ro) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
+		key->private.mm = mm;
+		key->private.address = address;
+	} else {
+		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+		key->shared.inode = page_head->mapping->host;
+		key->shared.pgoff = page_head->index;
+	}
+
+	get_futex_key_refs(key);
+
+out:
+	unlock_page(page_head);
+	put_page(page_head);
+	return err;
+}
+/* Thin wrapper: releases a key's references via drop_futex_key_refs(). */
+static inline void put_futex_key(union futex_key *key)
+{
+	drop_futex_key_refs(key);
+}
+
+/**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
+ * @uaddr:	pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non-destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+static int fault_in_user_writeable(u32 __user *uaddr)
+{
+	struct mm_struct *mm = current->mm;
+	int ret;
+
+	down_read(&mm->mmap_sem);
+	ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+			       FAULT_FLAG_WRITE);
+	up_read(&mm->mmap_sem);
+
+	return ret < 0 ? ret : 0;	/* any non-negative result is reported as 0 */
+}
+
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb:		the hash bucket the futex_q's reside in
+ * @key:	the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+					union futex_key *key)
+{
+	struct futex_q *this;
+
+	plist_for_each_entry(this, &hb->chain, list) {
+		if (match_futex(&this->key, key))
+			return this;
+	}
+	return NULL;
+}
+
+/*
+ * Atomically compare-and-exchange the user space futex word at @uaddr
+ * with page faults disabled. Returns whatever
+ * futex_atomic_cmpxchg_inatomic() returns; the value found at @uaddr is
+ * stored in *@curval.
+ */
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+				      u32 uval, u32 newval)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
+	pagefault_enable();
+
+	return ret;
+}
+
+/*
+ * Read the user space futex word at @from into *@dest with page faults
+ * disabled. Returns 0 on success and -EFAULT if the copy failed.
+ */
+static int get_futex_value_locked(u32 *dest, u32 __user *from)
+{
+	int ret;
+
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
+	pagefault_enable();
+
+	return ret ? -EFAULT : 0;
+}
+
+
+/*
+ * PI code:
+ */
+
+/*
+ * Make sure current->pi_state_cache holds a pre-initialized pi_state.
+ * Allocates with GFP_KERNEL when the cache is empty.
+ * Returns 0 on success and -ENOMEM if the allocation failed.
+ */
+static int refill_pi_state_cache(void)
+{
+	struct futex_pi_state *pi_state;
+
+	if (likely(current->pi_state_cache))
+		return 0;
+
+	pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
+
+	if (!pi_state)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&pi_state->list);
+	/* pi_mutex gets initialized later */
+	pi_state->owner = NULL;
+	atomic_set(&pi_state->refcount, 1);
+	pi_state->key = FUTEX_KEY_INIT;
+
+	current->pi_state_cache = pi_state;
+
+	return 0;
+}
+
+/*
+ * Hand out the pi_state cached in current->pi_state_cache and empty the
+ * cache. Warns when the cache was not refilled beforehand.
+ */
+static struct futex_pi_state * alloc_pi_state(void)
+{
+	struct futex_pi_state *pi_state = current->pi_state_cache;
+
+	WARN_ON(!pi_state);
+	current->pi_state_cache = NULL;
+
+	return pi_state;
+}
+
+/*
+ * Drop a reference on @pi_state. When the last reference goes away the
+ * state is detached from its owner (if any) and either freed or, when
+ * current's cache is empty, recycled into current->pi_state_cache.
+ */
+static void free_pi_state(struct futex_pi_state *pi_state)
+{
+	if (!atomic_dec_and_test(&pi_state->refcount))
+		return;
+
+	/*
+	 * If pi_state->owner is NULL, the owner is most probably dying
+	 * and has cleaned up the pi_state already
+	 */
+	if (pi_state->owner) {
+		raw_spin_lock_irq(&pi_state->owner->pi_lock);
+		list_del_init(&pi_state->list);
+		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+
+		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+	}
+
+	if (current->pi_state_cache)
+		kfree(pi_state);
+	else {
+		/*
+		 * pi_state->list is already empty.
+		 * clear pi_state->owner.
+		 * refcount is at 0 - put it back to 1.
+		 */
+		pi_state->owner = NULL;
+		atomic_set(&pi_state->refcount, 1);
+		current->pi_state_cache = pi_state;
+	}
+}
+
+/*
+ * Look up the task based on what TID userspace gave us.
+ * We don't trust it.
+ *
+ * Returns the task with a reference held (the caller must
+ * put_task_struct() it) or NULL when no such task exists.
+ */
+static struct task_struct * futex_find_get_task(pid_t pid)
+{
+	struct task_struct *p;
+
+	rcu_read_lock();
+	p = find_task_by_vpid(pid);
+	if (p)
+		get_task_struct(p);
+
+	rcu_read_unlock();
+
+	return p;
+}
+
+/*
+ * This task is holding PI mutexes at exit time => bad.
+ * Kernel cleans up PI-state, but userspace is likely hosed.
+ * (Robust-futex cleanup is separate and might save the day for userspace.)
+ */
+void exit_pi_state_list(struct task_struct *curr)
+{
+	struct list_head *next, *head = &curr->pi_state_list;
+	struct futex_pi_state *pi_state;
+	struct futex_hash_bucket *hb;
+	union futex_key key = FUTEX_KEY_INIT;
+
+	if (!futex_cmpxchg_enabled)
+		return;
+	/*
+	 * We are a ZOMBIE and nobody can enqueue itself on
+	 * pi_state_list anymore, but we have to be careful
+	 * versus waiters unqueueing themselves:
+	 */
+	raw_spin_lock_irq(&curr->pi_lock);
+	while (!list_empty(head)) {
+
+		next = head->next;
+		pi_state = list_entry(next, struct futex_pi_state, list);
+		key = pi_state->key;
+		hb = hash_futex(&key);
+		raw_spin_unlock_irq(&curr->pi_lock);
+
+		spin_lock(&hb->lock);
+
+		raw_spin_lock_irq(&curr->pi_lock);
+		/*
+		 * We dropped the pi-lock, so re-check whether this
+		 * task still owns the PI-state:
+		 */
+		if (head->next != next) {
+			spin_unlock(&hb->lock);
+			continue;
+		}
+
+		WARN_ON(pi_state->owner != curr);
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
+		pi_state->owner = NULL;
+		raw_spin_unlock_irq(&curr->pi_lock);
+
+		rt_mutex_unlock(&pi_state->pi_mutex);
+
+		spin_unlock(&hb->lock);
+
+		raw_spin_lock_irq(&curr->pi_lock);
+	}
+	raw_spin_unlock_irq(&curr->pi_lock);
+}
+
+/*
+ * Find or create the pi_state for the futex identified by @key.
+ *
+ * If a waiter is already queued on @key in @hb, its pi_state is reused
+ * and its refcount bumped. Otherwise the owner TID taken from @uval is
+ * looked up and a pi_state from current's cache is attached to that task.
+ *
+ * Returns 0 with *@ps set on success, or:
+ *  -EINVAL - the queued waiter has no pi_state, or the TID in @uval does
+ *	      not match the pi_state owner
+ *  -ESRCH  - TID is 0 or no such task exists, or the task has finished
+ *	      its exit cleanup
+ *  -EAGAIN - the owner is exiting but has not finished its cleanup yet
+ *  -EPERM  - the TID belongs to a task without an mm (a kernel thread)
+ */
+static int
+lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+		union futex_key *key, struct futex_pi_state **ps)
+{
+	struct futex_pi_state *pi_state = NULL;
+	struct futex_q *this, *next;
+	struct plist_head *head;
+	struct task_struct *p;
+	pid_t pid = uval & FUTEX_TID_MASK;
+
+	head = &hb->chain;
+
+	plist_for_each_entry_safe(this, next, head, list) {
+		if (match_futex(&this->key, key)) {
+			/*
+			 * Another waiter already exists - bump up
+			 * the refcount and return its pi_state:
+			 */
+			pi_state = this->pi_state;
+			/*
+			 * Userspace might have messed up non-PI and PI futexes
+			 */
+			if (unlikely(!pi_state))
+				return -EINVAL;
+
+			WARN_ON(!atomic_read(&pi_state->refcount));
+
+			/*
+			 * When pi_state->owner is NULL then the owner died
+			 * and another waiter is on the fly. pi_state->owner
+			 * is fixed up by the task which acquires
+			 * pi_state->rt_mutex.
+			 *
+			 * We do not check for pid == 0 which can happen when
+			 * the owner died and robust_list_exit() cleared the
+			 * TID.
+			 */
+			if (pid && pi_state->owner) {
+				/*
+				 * Bail out if user space manipulated the
+				 * futex value.
+				 */
+				if (pid != task_pid_vnr(pi_state->owner))
+					return -EINVAL;
+			}
+
+			atomic_inc(&pi_state->refcount);
+			*ps = pi_state;
+
+			return 0;
+		}
+	}
+
+	/*
+	 * We are the first waiter - try to look up the real owner and attach
+	 * the new pi_state to it, but bail out when TID = 0
+	 */
+	if (!pid)
+		return -ESRCH;
+	p = futex_find_get_task(pid);
+	if (!p)
+		return -ESRCH;
+
+	/*
+	 * The TID comes from user space and may name any task. A task
+	 * without an mm is a kernel thread and can never own a user
+	 * space futex, so refuse to attach a pi_state to it.
+	 */
+	if (!p->mm) {
+		put_task_struct(p);
+		return -EPERM;
+	}
+
+	/*
+	 * We need to look at the task state flags to figure out,
+	 * whether the task is exiting. To protect against the do_exit
+	 * change of the task flags, we do this protected by
+	 * p->pi_lock:
+	 */
+	raw_spin_lock_irq(&p->pi_lock);
+	if (unlikely(p->flags & PF_EXITING)) {
+		/*
+		 * The task is on the way out. When PF_EXITPIDONE is
+		 * set, we know that the task has finished the
+		 * cleanup:
+		 */
+		int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+
+		raw_spin_unlock_irq(&p->pi_lock);
+		put_task_struct(p);
+		return ret;
+	}
+
+	pi_state = alloc_pi_state();
+
+	/*
+	 * Initialize the pi_mutex in locked state and make 'p'
+	 * the owner of it:
+	 */
+	rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
+
+	/* Store the key for possible exit cleanups: */
+	pi_state->key = *key;
+
+	WARN_ON(!list_empty(&pi_state->list));
+	list_add(&pi_state->list, &p->pi_state_list);
+	pi_state->owner = p;
+	raw_spin_unlock_irq(&p->pi_lock);
+
+	put_task_struct(p);
+
+	*ps = pi_state;
+
+	return 0;
+}
+
+/**
+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
+ * @uaddr:		the pi futex user address
+ * @hb:			the pi futex hash bucket
+ * @key:		the futex key associated with uaddr and hb
+ * @ps:			the pi_state pointer where we store the result of the
+ *			lookup
+ * @task:		the task to perform the atomic lock work for.  This will
+ *			be "current" except in the case of requeue pi.
+ * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Returns:
+ *  0 - ready to wait
+ *  1 - acquired the lock
+ * <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+				union futex_key *key,
+				struct futex_pi_state **ps,
+				struct task_struct *task, int set_waiters)
+{
+	int lock_taken, ret, ownerdied = 0;
+	u32 uval, newval, curval, vpid = task_pid_vnr(task);
+
+retry:
+	ret = lock_taken = 0;
+
+	/*
+	 * To avoid races, we attempt to take the lock here again
+	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
+	 * the locks. It will most likely not succeed.
+	 */
+	newval = vpid;
+	if (set_waiters)
+		newval |= FUTEX_WAITERS;
+
+	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
+		return -EFAULT;
+
+	/*
+	 * Detect deadlocks.
+	 */
+	if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
+		return -EDEADLK;
+
+	/*
+	 * Surprise - we got the lock. Just return to userspace:
+	 */
+	if (unlikely(!curval))
+		return 1;
+
+	uval = curval;
+
+	/*
+	 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
+	 * to wake at the next unlock.
+	 */
+	newval = curval | FUTEX_WAITERS;
+
+	/*
+	 * There are two cases, where a futex might have no owner (the
+	 * owner TID is 0): OWNER_DIED. We take over the futex in this
+	 * case. We also do an unconditional take over, when the owner
+	 * of the futex died.
+	 *
+	 * This is safe as we are protected by the hash bucket lock !
+	 */
+	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+		/* Keep the OWNER_DIED bit */
+		newval = (curval & ~FUTEX_TID_MASK) | vpid;
+		ownerdied = 0;
+		lock_taken = 1;
+	}
+
+	if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+		return -EFAULT;
+	if (unlikely(curval != uval))
+		goto retry;
+
+	/*
+	 * We took the lock due to owner died take over.
+	 */
+	if (unlikely(lock_taken))
+		return 1;
+
+	/*
+	 * We don't have the lock. Look up the PI state (or create it if
+	 * we are the first waiter):
+	 */
+	ret = lookup_pi_state(uval, hb, key, ps);
+
+	if (unlikely(ret)) {
+		switch (ret) {
+		case -ESRCH:
+			/*
+			 * No owner found for this futex. Check if the
+			 * OWNER_DIED bit is set to figure out whether
+			 * this is a robust futex or not.
+			 */
+			if (get_futex_value_locked(&curval, uaddr))
+				return -EFAULT;
+
+			/*
+			 * We simply start over in case of a robust
+			 * futex. The code above will take the futex
+			 * and return happy.
+			 */
+			if (curval & FUTEX_OWNER_DIED) {
+				ownerdied = 1;
+				goto retry;
+			}
+			/* fall through - not robust, report -ESRCH */
+		default:
+			break;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q:	The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+	struct futex_hash_bucket *hb;
+
+	if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+	    || WARN_ON(plist_node_empty(&q->list)))
+		return;
+
+	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+	plist_del(&q->list, &hb->chain);
+}
+
+/*
+ * Remove @q from its hash bucket, mark it as dequeued by clearing
+ * q->lock_ptr and wake the task that was waiting on it.
+ *
+ * The hash bucket lock must be held when this is called.
+ * Afterwards, the futex_q must not be accessed.
+ */
+static void wake_futex(struct futex_q *q)
+{
+	struct task_struct *p = q->task;
+
+	/*
+	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
+	 * a non-futex wake up happens on another CPU then the task
+	 * might exit and p would dereference a non-existing task
+	 * struct. Prevent this by holding a reference on p across the
+	 * wake up.
+	 */
+	get_task_struct(p);
+
+	__unqueue_futex(q);
+	/*
+	 * The waiting task can free the futex_q as soon as
+	 * q->lock_ptr = NULL is written, without taking any locks. A
+	 * memory barrier is required here to prevent the following
+	 * store to lock_ptr from getting ahead of the plist_del.
+	 */
+	smp_wmb();
+	q->lock_ptr = NULL;
+
+	wake_up_state(p, TASK_NORMAL);
+	put_task_struct(p);
+}
+
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
+{
+	struct task_struct *new_owner;
+	struct futex_pi_state *pi_state = this->pi_state;
+	u32 uninitialized_var(curval), newval;
+
+	if (!pi_state)
+		return -EINVAL;
+
+	/*
+	 * If current does not own the pi_state then the futex is
+	 * inconsistent and user space fiddled with the futex value.
+	 */
+	if (pi_state->owner != current)
+		return -EINVAL;
+
+	raw_spin_lock(&pi_state->pi_mutex.wait_lock);
+	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+	/*
+	 * It is possible that the next waiter (the one that brought
+	 * this owner to the kernel) timed out and is no longer
+	 * waiting on the lock.
+	 */
+	if (!new_owner)
+		new_owner = this->task;
+
+	/*
+	 * We pass it to the next owner. (The WAITERS bit is always
+	 * kept enabled while there is PI state around. We must also
+	 * preserve the owner died bit.)
+	 *
+	 * The user space value is only rewritten when the owner died
+	 * bit is clear. A faulting access yields -EFAULT, a value that
+	 * changed under us yields -EINVAL; in both cases the pi_state
+	 * is left untouched.
+	 */
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		int ret = 0;
+
+		newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+			ret = -EFAULT;
+		else if (curval != uval)
+			ret = -EINVAL;
+		if (ret) {
+			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+			return ret;
+		}
+	}
+
+	raw_spin_lock_irq(&pi_state->owner->pi_lock);
+	WARN_ON(list_empty(&pi_state->list));
+	list_del_init(&pi_state->list);
+	raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+
+	raw_spin_lock_irq(&new_owner->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
+	list_add(&pi_state->list, &new_owner->pi_state_list);
+	pi_state->owner = new_owner;
+	raw_spin_unlock_irq(&new_owner->pi_lock);
+
+	raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+	rt_mutex_unlock(&pi_state->pi_mutex);
+
+	return 0;
+}
+
+static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
+{
+	u32 uninitialized_var(oldval);
+
+	/*
+	 * There is no waiter, so we unlock the futex. The owner died
+	 * bit does not have to be preserved here. We are the owner.
+	 *
+	 * Returns 0 when the futex word was reset to 0, -EFAULT when the
+	 * user access faulted and -EAGAIN when the value no longer
+	 * matched @uval.
+	 */
+	if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+		return -EFAULT;
+	if (oldval != uval)
+		return -EAGAIN;
+
+	return 0;
+}
+
+/*
+ * Express the locking dependencies for lockdep:
+ *
+ * Both bucket locks are always taken lowest address first, the second
+ * one with SINGLE_DEPTH_NESTING. When hb1 and hb2 are the same bucket
+ * only one lock is taken; double_unlock_hb() below mirrors that.
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+	if (hb1 <= hb2) {
+		spin_lock(&hb1->lock);
+		if (hb1 < hb2)
+			spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+	} else { /* hb1 > hb2 */
+		spin_lock(&hb2->lock);
+		spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+	}
+}
+
+static inline void
+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+	spin_unlock(&hb1->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
+}
+
+/*
+ * Wake up waiters matching bitset queued on this futex (uaddr).
+ */
+static int
+futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
+{
+	struct futex_hash_bucket *hb;
+	struct futex_q *this, *next;
+	struct plist_head *head;
+	union futex_key key = FUTEX_KEY_INIT;
+	int ret;
+
+	if (!bitset)
+		return -EINVAL;
+
+	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
+	if (unlikely(ret != 0))
+		goto out;
+
+	hb = hash_futex(&key);
+	spin_lock(&hb->lock);
+	head = &hb->chain;
+
+	plist_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &key)) {
+			/* PI waiters must be woken via the PI unlock path. */
+			if (this->pi_state || this->rt_waiter) {
+				ret = -EINVAL;
+				break;
+			}
+
+			/* Check if one of the bits is set in both bitsets */
+			if (!(this->bitset & bitset))
+				continue;
+
+			wake_futex(this);
+			if (++ret >= nr_wake)
+				break;
+		}
+	}
+
+	spin_unlock(&hb->lock);
+	put_futex_key(&key);
+out:
+	return ret;
+}
+
+/*
+ * Atomically apply @op to the futex word at @uaddr2, wake up to @nr_wake
+ * waiters queued on @uaddr1 and, when the operation's comparison result
+ * is positive, up to @nr_wake2 waiters queued on @uaddr2.
+ *
+ * Returns the total number of woken waiters or a negative error code.
+ * -EINVAL is returned when a matching waiter is a PI or requeue-PI
+ * waiter: those must never be woken through wake_futex().
+ */
+static int
+futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
+	      int nr_wake, int nr_wake2, int op)
+{
+	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+	struct futex_hash_bucket *hb1, *hb2;
+	struct plist_head *head;
+	struct futex_q *this, *next;
+	int ret, op_ret;
+
+retry:
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+	if (unlikely(ret != 0))
+		goto out_put_key1;
+
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
+
+retry_private:
+	double_lock_hb(hb1, hb2);
+	op_ret = futex_atomic_op_inuser(op, uaddr2);
+	if (unlikely(op_ret < 0)) {
+
+		double_unlock_hb(hb1, hb2);
+
+#ifndef CONFIG_MMU
+		/*
+		 * we don't get EFAULT from MMU faults if we don't have an MMU,
+		 * but we might get them from range checking
+		 */
+		ret = op_ret;
+		goto out_put_keys;
+#endif
+
+		if (unlikely(op_ret != -EFAULT)) {
+			ret = op_ret;
+			goto out_put_keys;
+		}
+
+		ret = fault_in_user_writeable(uaddr2);
+		if (ret)
+			goto out_put_keys;
+
+		/* Private keys stay valid across the fault; just relock. */
+		if (!(flags & FLAGS_SHARED))
+			goto retry_private;
+
+		put_futex_key(&key2);
+		put_futex_key(&key1);
+		goto retry;
+	}
+
+	head = &hb1->chain;
+
+	plist_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &key1)) {
+			/* Same rule as futex_wake(): never wake a PI waiter. */
+			if (this->pi_state || this->rt_waiter) {
+				ret = -EINVAL;
+				goto out_unlock;
+			}
+			wake_futex(this);
+			if (++ret >= nr_wake)
+				break;
+		}
+	}
+
+	if (op_ret > 0) {
+		head = &hb2->chain;
+
+		op_ret = 0;
+		plist_for_each_entry_safe(this, next, head, list) {
+			if (match_futex (&this->key, &key2)) {
+				if (this->pi_state || this->rt_waiter) {
+					ret = -EINVAL;
+					goto out_unlock;
+				}
+				wake_futex(this);
+				if (++op_ret >= nr_wake2)
+					break;
+			}
+		}
+		ret += op_ret;
+	}
+
+out_unlock:
+	double_unlock_hb(hb1, hb2);
+out_put_keys:
+	put_futex_key(&key2);
+out_put_key1:
+	put_futex_key(&key1);
+out:
+	return ret;
+}
+
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q:		the futex_q to requeue
+ * @hb1:	the source hash_bucket
+ * @hb2:	the target hash_bucket
+ * @key2:	the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+		   struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+	/*
+	 * If key1 and key2 hash to the same bucket, no need to
+	 * requeue.
+	 */
+	if (likely(&hb1->chain != &hb2->chain)) {
+		plist_del(&q->list, &hb1->chain);
+		plist_add(&q->list, &hb2->chain);
+		q->lock_ptr = &hb2->lock;
+	}
+	/* The futex_q now refers to key2, so it needs its own reference. */
+	get_futex_key_refs(key2);
+	q->key = *key2;
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * @q:		the futex_q
+ * @key:	the key of the requeue target futex
+ * @hb:		the hash_bucket of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.  Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition.
Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later.  Must be called
+ * with both q->lock_ptr and hb->lock held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+			   struct futex_hash_bucket *hb)
+{
+	get_futex_key_refs(key);
+	q->key = *key;
+
+	__unqueue_futex(q);
+
+	WARN_ON(!q->rt_waiter);
+	q->rt_waiter = NULL;
+
+	q->lock_ptr = &hb->lock;
+
+	wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex:		the user address of the to futex
+ * @hb1:		the from futex hash bucket, must be locked by the caller
+ * @hb2:		the to futex hash bucket, must be locked by the caller
+ * @key1:		the from futex key
+ * @key2:		the to futex key
+ * @ps:			address to store the pi_state pointer
+ * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed.  If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * Returns:
+ *  0 - failed to acquire the lock atomically
+ *  1 - acquired the lock
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+				 struct futex_hash_bucket *hb1,
+				 struct futex_hash_bucket *hb2,
+				 union futex_key *key1, union futex_key *key2,
+				 struct futex_pi_state **ps, int set_waiters)
+{
+	struct futex_q *top_waiter = NULL;
+	u32 curval;
+	int ret;
+
+	/* Only checks that the futex word is readable; curval is unused. */
+	if (get_futex_value_locked(&curval, pifutex))
+		return -EFAULT;
+
+	/*
+	 * Find the top_waiter and determine if there are additional waiters.
+	 * If the caller intends to requeue more than 1 waiter to pifutex,
+	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+	 * as we have means to handle the possible fault.  If not, don't set
+	 * the bit unnecessarily as it will force the subsequent unlock to enter
+	 * the kernel.
+	 */
+	top_waiter = futex_top_waiter(hb1, key1);
+
+	/* There are no waiters, nothing for us to do. */
+	if (!top_waiter)
+		return 0;
+
+	/* Ensure we requeue to the expected futex. */
+	if (!match_futex(top_waiter->requeue_pi_key, key2))
+		return -EINVAL;
+
+	/*
+	 * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
+	 * the contended case or if set_waiters is 1.  The pi_state is returned
+	 * in ps in contended cases.
+	 */
+	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+				   set_waiters);
+	if (ret == 1)
+		requeue_pi_wake_futex(top_waiter, key2, hb2);
+
+	return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * @uaddr1:	source futex user address
+ * @flags:	futex flags (FLAGS_SHARED, etc.)
+ * @uaddr2:	target futex user address
+ * @nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue:	number of waiters to requeue (0-INT_MAX)
+ * @cmpval:	@uaddr1 expected value (or %NULL)
+ * @requeue_pi:	if we are attempting to requeue from a non-pi futex to a
+ *		pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Negative @nr_wake or @nr_requeue values are rejected with -EINVAL, and
+ * so is a requeue_pi request whose source and target are the same futex.
+ *
+ * Returns:
+ * >=0 - on success, the number of tasks requeued or woken
+ *  <0 - on error
+ */
+static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
+			 u32 __user *uaddr2, int nr_wake, int nr_requeue,
+			 u32 *cmpval, int requeue_pi)
+{
+	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+	int drop_count = 0, task_count = 0, ret;
+	struct futex_pi_state *pi_state = NULL;
+	struct futex_hash_bucket *hb1, *hb2;
+	struct plist_head *head1;
+	struct futex_q *this, *next;
+	u32 curval2;
+
+	/*
+	 * Both counts come straight from user space. Negative values make
+	 * no sense and would make the "task_count - nr_wake" arithmetic
+	 * below overflow.
+	 */
+	if (nr_wake < 0 || nr_requeue < 0)
+		return -EINVAL;
+
+	if (requeue_pi) {
+		/*
+		 * Requeue PI only works on two distinct uaddrs. This
+		 * check is only valid for private futexes; shared futexes
+		 * are caught by the key comparison further down.
+		 */
+		if (uaddr1 == uaddr2)
+			return -EINVAL;
+
+		/*
+		 * requeue_pi requires a pi_state, try to allocate it now
+		 * without any locks in case it fails.
+		 */
+		if (refill_pi_state_cache())
+			return -ENOMEM;
+		/*
+		 * requeue_pi must wake as many tasks as it can, up to nr_wake
+		 * + nr_requeue, since it acquires the rt_mutex prior to
+		 * returning to userspace, so as to not leave the rt_mutex with
+		 * waiters and no owner.  However, second and third wake-ups
+		 * cannot be predicted as they involve race conditions with the
+		 * first wake and a fault while looking up the pi_state.  Both
+		 * pthread_cond_signal() and pthread_cond_broadcast() should
+		 * use nr_wake=1.
+		 */
+		if (nr_wake != 1)
+			return -EINVAL;
+	}
+
+retry:
+	if (pi_state != NULL) {
+		/*
+		 * We will have to lookup the pi_state again, so free this one
+		 * to keep the accounting correct.
+		 */
+		free_pi_state(pi_state);
+		pi_state = NULL;
+	}
+
+	ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
+	if (unlikely(ret != 0))
+		goto out;
+	ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
+			    requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+	if (unlikely(ret != 0))
+		goto out_put_key1;
+
+	/*
+	 * Comparing the uaddrs above is not sufficient for shared
+	 * futexes: two different addresses can map the same futex.
+	 * Compare the keys as well.
+	 */
+	if (requeue_pi && match_futex(&key1, &key2)) {
+		ret = -EINVAL;
+		goto out_put_keys;
+	}
+
+	hb1 = hash_futex(&key1);
+	hb2 = hash_futex(&key2);
+
+retry_private:
+	double_lock_hb(hb1, hb2);
+
+	if (likely(cmpval != NULL)) {
+		u32 curval;
+
+		ret = get_futex_value_locked(&curval, uaddr1);
+
+		if (unlikely(ret)) {
+			double_unlock_hb(hb1, hb2);
+
+			/* Fault the page in outside the locks, then retry. */
+			ret = get_user(curval, uaddr1);
+			if (ret)
+				goto out_put_keys;
+
+			if (!(flags & FLAGS_SHARED))
+				goto retry_private;
+
+			put_futex_key(&key2);
+			put_futex_key(&key1);
+			goto retry;
+		}
+		if (curval != *cmpval) {
+			ret = -EAGAIN;
+			goto out_unlock;
+		}
+	}
+
+	if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+		/*
+		 * Attempt to acquire uaddr2 and wake the top waiter. If we
+		 * intend to requeue waiters, force setting the FUTEX_WAITERS
+		 * bit.  We force this here where we are able to easily handle
+		 * faults rather in the requeue loop below.
+		 */
+		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+						 &key2, &pi_state, nr_requeue);
+
+		/*
+		 * At this point the top_waiter has either taken uaddr2 or is
+		 * waiting on it.  If the former, then the pi_state will not
+		 * exist yet, look it up one more time to ensure we have a
+		 * reference to it.
+		 */
+		if (ret == 1) {
+			WARN_ON(pi_state);
+			drop_count++;
+			task_count++;
+			ret = get_futex_value_locked(&curval2, uaddr2);
+			if (!ret)
+				ret = lookup_pi_state(curval2, hb2, &key2,
+						      &pi_state);
+		}
+
+		switch (ret) {
+		case 0:
+			break;
+		case -EFAULT:
+			double_unlock_hb(hb1, hb2);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
+			ret = fault_in_user_writeable(uaddr2);
+			if (!ret)
+				goto retry;
+			goto out;
+		case -EAGAIN:
+			/* The owner was exiting, try again. */
+			double_unlock_hb(hb1, hb2);
+			put_futex_key(&key2);
+			put_futex_key(&key1);
+			cond_resched();
+			goto retry;
+		default:
+			goto out_unlock;
+		}
+	}
+
+	head1 = &hb1->chain;
+	plist_for_each_entry_safe(this, next, head1, list) {
+		if (task_count - nr_wake >= nr_requeue)
+			break;
+
+		if (!match_futex(&this->key, &key1))
+			continue;
+
+		/*
+		 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+		 * be paired with each other and no other futex ops.
+		 */
+		if ((requeue_pi && !this->rt_waiter) ||
+		    (!requeue_pi && this->rt_waiter)) {
+			ret = -EINVAL;
+			break;
+		}
+
+		/*
+		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
+		 * lock, we already woke the top_waiter.  If not, it will be
+		 * woken by futex_unlock_pi().
+		 */
+		if (++task_count <= nr_wake && !requeue_pi) {
+			wake_futex(this);
+			continue;
+		}
+
+		/* Ensure we requeue to the expected futex for requeue_pi. */
+		if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+			ret = -EINVAL;
+			break;
+		}
+
+		/*
+		 * Requeue nr_requeue waiters and possibly one more in the case
+		 * of requeue_pi if we couldn't acquire the lock atomically.
+		 */
+		if (requeue_pi) {
+			/* Prepare the waiter to take the rt_mutex. */
+			atomic_inc(&pi_state->refcount);
+			this->pi_state = pi_state;
+			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+							this->rt_waiter,
+							this->task, 1);
+			if (ret == 1) {
+				/* We got the lock. */
+				requeue_pi_wake_futex(this, &key2, hb2);
+				drop_count++;
+				continue;
+			} else if (ret) {
+				/* -EDEADLK */
+				this->pi_state = NULL;
+				free_pi_state(pi_state);
+				goto out_unlock;
+			}
+		}
+		requeue_futex(this, hb1, hb2, &key2);
+		drop_count++;
+	}
+
+out_unlock:
+	double_unlock_hb(hb1, hb2);
+
+	/*
+	 * drop_futex_key_refs() must be called outside the spinlocks. During
+	 * the requeue we moved futex_q's from the hash bucket at key1 to the
+	 * one at key2 and updated their key pointer.  We no longer need to
+	 * hold the references to key1.
+	 */
+	while (--drop_count >= 0)
+		drop_futex_key_refs(&key1);
+
+out_put_keys:
+	put_futex_key(&key2);
+out_put_key1:
+	put_futex_key(&key1);
+out:
+	if (pi_state != NULL)
+		free_pi_state(pi_state);
+	return ret ? ret : task_count;
+}
+
+/*
+ * Record the hash bucket lock for q->key in q->lock_ptr and take it.
+ * Returns the locked bucket.
+ *
+ * The key must be already stored in q->key.
+ */
+static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
+	__acquires(&hb->lock)
+{
+	struct futex_hash_bucket *hb;
+
+	hb = hash_futex(&q->key);
+	q->lock_ptr = &hb->lock;
+
+	spin_lock(&hb->lock);
+	return hb;
+}
+
+/* Release the hash bucket lock taken by queue_lock(). */
+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+	__releases(&hb->lock)
+{
+	spin_unlock(&hb->lock);
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q:	The futex_q to enqueue
+ * @hb:	The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me().  The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+	__releases(&hb->lock)
+{
+	int prio;
+
+	/*
+	 * The priority used to register this element is
+	 * - either the real thread-priority for the real-time threads
+	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
+	 * - or MAX_RT_PRIO for non-RT threads.
+	 * Thus, all RT-threads are woken first in priority order, and
+	 * the others are woken last, in FIFO order.
+	 */
+	prio = min(current->normal_prio, MAX_RT_PRIO);
+
+	plist_node_init(&q->list, prio);
+	plist_add(&q->list, &hb->chain);
+	q->task = current;
+	spin_unlock(&hb->lock);
+}
+
+/**
+ * unqueue_me() - Remove the futex_q from its futex_hash_bucket
+ * @q:	The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
+ * be paired with exactly one earlier call to queue_me().
+ *
+ * The key references held by @q are dropped in either case.
+ *
+ * Returns:
+ *   1 - if the futex_q was still queued (and we unqueued it)
+ *   0 - if the futex_q was already removed by the waking thread
+ */
+static int unqueue_me(struct futex_q *q)
+{
+	spinlock_t *lock_ptr;
+	int ret = 0;
+
+	/*
+	 * In the common case we don't take the spinlock, which is nice.
+	 * A NULL q->lock_ptr means the waker already unqueued us (see
+	 * wake_futex()).
+	 */
+retry:
+	lock_ptr = q->lock_ptr;
+	barrier();
+	if (lock_ptr != NULL) {
+		spin_lock(lock_ptr);
+		/*
+		 * q->lock_ptr can change between reading it and
+		 * spin_lock(), causing us to take the wrong lock.  This
+		 * corrects the race condition.
+		 *
+		 * Reasoning goes like this: if we have the wrong lock,
+		 * q->lock_ptr must have changed (maybe several times)
+		 * between reading it and the spin_lock().  It can
+		 * change again after the spin_lock() but only if it was
+		 * already changed before the spin_lock().  It cannot,
+		 * however, change back to the original value.  Therefore
+		 * we can detect whether we acquired the correct lock.
+		 */
+		if (unlikely(lock_ptr != q->lock_ptr)) {
+			spin_unlock(lock_ptr);
+			goto retry;
+		}
+		__unqueue_futex(q);
+
+		BUG_ON(q->pi_state);
+
+		spin_unlock(lock_ptr);
+		ret = 1;
+	}
+
+	drop_futex_key_refs(&q->key);
+	return ret;
+}
+
+/*
+ * PI futexes can not be requeued and must remove themselves from the
+ * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
+ * and dropped here. The reference on q->pi_state is released as well.
+ */
+static void unqueue_me_pi(struct futex_q *q)
+	__releases(q->lock_ptr)
+{
+	__unqueue_futex(q);
+
+	BUG_ON(!q->pi_state);
+	free_pi_state(q->pi_state);
+	q->pi_state = NULL;
+
+	spin_unlock(q->lock_ptr);
+}
+
+/*
+ * Fixup the pi_state owner with the new owner.
+ *
+ * Must be called with hash bucket lock held and mm->sem held for non
+ * private futexes.
+ */
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+				struct task_struct *newowner)
+{
+	u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+	struct futex_pi_state *pi_state = q->pi_state;
+	struct task_struct *oldowner = pi_state->owner;
+	u32 uval, uninitialized_var(curval), newval;
+	int ret;
+
+	/* Owner died? */
+	if (!pi_state->owner)
+		newtid |= FUTEX_OWNER_DIED;
+
+	/*
+	 * We are here either because we stole the rtmutex from the
+	 * previous highest priority waiter or we are the highest priority
+	 * waiter but failed to get the rtmutex the first time.
+	 * We have to replace the newowner TID in the user space variable.
+	 * This must be atomic as we have to preserve the owner died bit here.
+	 *
+	 * Note: We write the user space value _before_ changing the pi_state
+	 * because we can fault here. Imagine swapped out pages or a fork
+	 * that marked all the anonymous memory readonly for cow.
+	 *
+	 * Modifying pi_state _before_ the user space value would
+	 * leave the pi_state in an inconsistent state when we fault
+	 * here, because we need to drop the hash bucket lock to
+	 * handle the fault. This might be observed in the PID check
+	 * in lookup_pi_state.
+	 */
+retry:
+	if (get_futex_value_locked(&uval, uaddr))
+		goto handle_fault;
+
+	while (1) {
+		newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+		if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+			goto handle_fault;
+		if (curval == uval)
+			break;
+		uval = curval;
+	}
+
+	/*
+	 * We fixed up user space. Now we need to fix the pi_state
+	 * itself.
+	 */
+	if (pi_state->owner != NULL) {
+		raw_spin_lock_irq(&pi_state->owner->pi_lock);
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
+		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+	}
+
+	pi_state->owner = newowner;
+
+	raw_spin_lock_irq(&newowner->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
+	list_add(&pi_state->list, &newowner->pi_state_list);
+	raw_spin_unlock_irq(&newowner->pi_lock);
+	return 0;
+
+	/*
+	 * To handle the page fault we need to drop the hash bucket
+	 * lock here. That gives the other task (either the highest priority
+	 * waiter itself or the task which stole the rtmutex) the
+	 * chance to try the fixup of the pi_state. So once we are
+	 * back from handling the fault we need to check the pi_state
+	 * after reacquiring the hash bucket lock and before trying to
+	 * do another fixup. When the fixup has been done already we
+	 * simply return.
/**
 * fixup_owner() - Post lock pi_state and corner case management
 * @uaddr:	user address of the futex
 * @q:		futex_q (contains pi_state and access to the rt_mutex)
 * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
 *
 * After attempting to lock an rt_mutex, this function is called to cleanup
 * the pi_state owner as well as handle race conditions that may allow us to
 * acquire the lock. Must be called with the hb lock held.
 *
 * Three cases are handled, in this order:
 *  - @locked is set: make pi_state->owner point at current if it does not
 *    already;
 *  - @locked is clear but pi_state->owner is current: retry the rt_mutex
 *    with a trylock and, failing that, hand the pi_state to the rt_mutex
 *    owner (or next owner);
 *  - otherwise: only log if the rt_mutex unexpectedly claims we own it.
 *
 * Returns:
 *  1 - success, lock taken
 *  0 - success, lock not taken
 * <0 - on error (-EFAULT)
 */
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
	struct task_struct *owner;
	int ret = 0;

	if (locked) {
		/*
		 * Got the lock. We might not be the anticipated owner if we
		 * did a lock-steal - fix up the PI-state in that case:
		 */
		if (q->pi_state->owner != current)
			ret = fixup_pi_state_owner(uaddr, q, current);
		goto out;
	}

	/*
	 * Catch the rare case, where the lock was released when we were on the
	 * way back before we locked the hash bucket.
	 */
	if (q->pi_state->owner == current) {
		/*
		 * Try to get the rt_mutex now. This might fail as some other
		 * task acquired the rt_mutex after we removed ourself from the
		 * rt_mutex waiters list.
		 */
		if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
			/* ret is still 0 here, so "out" returns 1 (lock taken). */
			locked = 1;
			goto out;
		}

		/*
		 * pi_state is incorrect, some other task did a lock steal and
		 * we returned due to timeout or signal without taking the
		 * rt_mutex. Too late.
		 */
		raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
		/* No current owner: fall back to whoever is next in line. */
		if (!owner)
			owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
		raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
		ret = fixup_pi_state_owner(uaddr, q, owner);
		goto out;
	}

	/*
	 * Paranoia check. If we did not take the lock, then we should not be
	 * the owner of the rt_mutex.
	 */
	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
				"pi-state %p\n", ret,
				q->pi_state->pi_mutex.owner,
				q->pi_state->owner);

out:
	/* An error from the fixup wins; otherwise report whether we hold the lock. */
	return ret ? ret : locked;
}
+ */ + if (!timeout || timeout->task) + schedule(); + } + __set_current_state(TASK_RUNNING); +} + +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @flags: futex flags (FLAGS_SHARED, etc.) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + * 0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + u32 uval; + int ret; + + /* + * Access the page AFTER the hash-bucket is locked. + * Order is important: + * + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); + * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } + * + * The basic logical guarantee of a futex is that it blocks ONLY + * if cond(var) is known to be true at the time of blocking, for + * any cond. If we locked the hash-bucket after testing *uaddr, that + * would open a race condition where we could block indefinitely with + * cond(var) false, which would violate the guarantee. + * + * On the other hand, we insert q and release the hash-bucket only + * after testing *uaddr. This guarantees that futex_wait() will NOT + * absorb a wakeup if *uaddr does not match the desired values + * while the syscall executes. 
+ */ +retry: + ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ); + if (unlikely(ret != 0)) + return ret; + +retry_private: + *hb = queue_lock(q); + + ret = get_futex_value_locked(&uval, uaddr); + + if (ret) { + queue_unlock(q, *hb); + + ret = get_user(uval, uaddr); + if (ret) + goto out; + + if (!(flags & FLAGS_SHARED)) + goto retry_private; + + put_futex_key(&q->key); + goto retry; + } + + if (uval != val) { + queue_unlock(q, *hb); + ret = -EWOULDBLOCK; + } + +out: + if (ret) + put_futex_key(&q->key); + return ret; +} + +static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val, + ktime_t *abs_time, u32 bitset) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct restart_block *restart; + struct futex_hash_bucket *hb; + struct futex_q q = futex_q_init; + int ret; + + if (!bitset) + return -EINVAL; + q.bitset = bitset; + + if (abs_time) { + to = &timeout; + + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + +retry: + /* + * Prepare to wait on uaddr. On success, holds hb lock and increments + * q.key refs. + */ + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + if (ret) + goto out; + + /* queue_me and wait for wakeup, timeout, or a signal. */ + futex_wait_queue_me(hb, &q, to); + + /* If we were woken (and unqueued), we succeeded, whatever. */ + ret = 0; + /* unqueue_me() drops q.key ref */ + if (!unqueue_me(&q)) + goto out; + ret = -ETIMEDOUT; + if (to && !to->task) + goto out; + + /* + * We expect signal_pending(current), but we might be the + * victim of a spurious wakeup as well. 
/*
 * Userspace tried a 0 -> TID atomic transition of the futex value
 * and failed. The kernel side here does the whole locking operation:
 * if there are waiters then it will block, it does PI, etc. (Due to
 * races the kernel might see a 0 value of the futex too.)
 *
 * @uaddr:	user address of the PI futex
 * @flags:	futex flags; only FLAGS_SHARED is examined here
 * @detect:	not referenced in this function body
 * @time:	optional absolute CLOCK_REALTIME timeout, NULL for none
 * @trylock:	non-zero to attempt the rt_mutex once instead of blocking
 *
 * Returns 0 when the lock was acquired, a negative error otherwise.
 * -EINTR is converted to -ERESTARTNOINTR on the way out.
 */
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
			 ktime_t *time, int trylock)
{
	struct hrtimer_sleeper timeout, *to = NULL;
	struct futex_hash_bucket *hb;
	struct futex_q q = futex_q_init;
	int res, ret;

	if (refill_pi_state_cache())
		return -ENOMEM;

	if (time) {
		to = &timeout;
		hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
				      HRTIMER_MODE_ABS);
		hrtimer_init_sleeper(to, current);
		hrtimer_set_expires(&to->timer, *time);
	}

retry:
	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
	if (unlikely(ret != 0))
		goto out;

retry_private:
	hb = queue_lock(&q);

	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
	if (unlikely(ret)) {
		switch (ret) {
		case 1:
			/* We got the lock. */
			ret = 0;
			goto out_unlock_put_key;
		case -EFAULT:
			goto uaddr_faulted;
		case -EAGAIN:
			/*
			 * Task is exiting and we just wait for the
			 * exit to complete.
			 */
			queue_unlock(&q, hb);
			put_futex_key(&q.key);
			cond_resched();
			goto retry;
		default:
			goto out_unlock_put_key;
		}
	}

	/*
	 * Only actually queue now that the atomic ops are done:
	 */
	queue_me(&q, hb);

	WARN_ON(!q.pi_state);
	/*
	 * Block on the PI mutex:
	 */
	if (!trylock)
		ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
	else {
		ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
		/* Fixup the trylock return value: */
		ret = ret ? 0 : -EWOULDBLOCK;
	}

	/* queue_me() dropped the bucket lock; retake it for the fixup. */
	spin_lock(q.lock_ptr);
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
	 */
	res = fixup_owner(uaddr, &q, !ret);
	/*
	 * If fixup_owner() returned an error, propagate that.  If it acquired
	 * the lock, clear our -ETIMEDOUT or -EINTR.
	 */
	if (res)
		ret = (res < 0) ? res : 0;

	/*
	 * If fixup_owner() faulted and was unable to handle the fault, unlock
	 * it and return the fault to userspace.
	 */
	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
		rt_mutex_unlock(&q.pi_state->pi_mutex);

	/* Unqueue and drop the lock */
	unqueue_me_pi(&q);

	goto out_put_key;

out_unlock_put_key:
	queue_unlock(&q, hb);

out_put_key:
	put_futex_key(&q.key);
out:
	if (to)
		destroy_hrtimer_on_stack(&to->timer);
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
	/* Fault the page in without the bucket lock held, then start over. */
	queue_unlock(&q, hb);

	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out_put_key;

	/* Private futex: the key is still valid, only the locked part is redone. */
	if (!(flags & FLAGS_SHARED))
		goto retry_private;

	put_futex_key(&q.key);
	goto retry;
}
/*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
 * and do the rt-mutex unlock.
 *
 * @uaddr:	user address of the PI futex
 * @flags:	futex flags; only FLAGS_SHARED is examined here
 *
 * Returns 0 on success, -EFAULT if the user value cannot be accessed,
 * -EPERM if the TID stored in the futex is not the caller's, or any
 * error returned by get_futex_key(), wake_futex_pi() or
 * unlock_futex_pi().
 */
static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
	struct futex_hash_bucket *hb;
	struct futex_q *this, *next;
	struct plist_head *head;
	union futex_key key = FUTEX_KEY_INIT;
	u32 uval, vpid = task_pid_vnr(current);
	int ret;

retry:
	if (get_user(uval, uaddr))
		return -EFAULT;
	/*
	 * We release only a lock we actually own:
	 */
	if ((uval & FUTEX_TID_MASK) != vpid)
		return -EPERM;

	ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
	if (unlikely(ret != 0))
		goto out;

	hb = hash_futex(&key);
	spin_lock(&hb->lock);

	/*
	 * To avoid races, try to do the TID -> 0 atomic transition
	 * again. If it succeeds then we can return without waking
	 * anyone else up:
	 */
	if (!(uval & FUTEX_OWNER_DIED) &&
	    cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
		goto pi_faulted;
	/*
	 * Rare case: we managed to release the lock atomically,
	 * no need to wake anyone else up:
	 * (ret is still 0 from get_futex_key() at this point.)
	 */
	if (unlikely(uval == vpid))
		goto out_unlock;

	/*
	 * Ok, other tasks may need to be woken up - check waiters
	 * and do the wakeup if necessary:
	 */
	head = &hb->chain;

	plist_for_each_entry_safe(this, next, head, list) {
		if (!match_futex (&this->key, &key))
			continue;
		/* Only the first matching waiter is handled; the loop exits either way. */
		ret = wake_futex_pi(uaddr, uval, this);
		/*
		 * The atomic access to the futex value
		 * generated a pagefault, so retry the
		 * user-access and the wakeup:
		 */
		if (ret == -EFAULT)
			goto pi_faulted;
		goto out_unlock;
	}
	/*
	 * No waiters - kernel unlocks the futex:
	 */
	if (!(uval & FUTEX_OWNER_DIED)) {
		ret = unlock_futex_pi(uaddr, uval);
		if (ret == -EFAULT)
			goto pi_faulted;
	}

out_unlock:
	spin_unlock(&hb->lock);
	put_futex_key(&key);

out:
	return ret;

pi_faulted:
	/* Drop lock and key before faulting the page in, then redo everything. */
	spin_unlock(&hb->lock);
	put_futex_key(&key);

	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;

	return ret;
}
original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Returns + * 0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTNOINTR + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &hb->chain); + + /* Handle spurious wakeups gracefully */ + ret = -EWOULDBLOCK; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else if (signal_pending(current)) + ret = -ERESTARTNOINTR; + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initially wait on (non-pi) + * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be + * the same type, no requeueing from private to shared, etc. 
+ * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all + * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace. + * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue + * 3) signal + * 4) timeout + * + * If 3, cleanup and return -ERESTARTNOINTR. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, return -EWOULDBLOCK (restarting the syscall would do the same). + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Returns: + * 0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, + u32 val, ktime_t *abs_time, u32 bitset, + u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct rt_mutex_waiter rt_waiter; + struct rt_mutex *pi_mutex = NULL; + struct futex_hash_bucket *hb; + union futex_key key2 = FUTEX_KEY_INIT; + struct futex_q q = futex_q_init; + int res, ret; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ? 
+ CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; + + ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); + if (unlikely(ret != 0)) + goto out; + + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + q.requeue_pi_key = &key2; + + /* + * Prepare to wait on uaddr. On success, increments q.key (key1) ref + * count. + */ + ret = futex_wait_setup(uaddr, val, flags, &q, &hb); + if (ret) + goto out_key2; + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquisition by the requeue code. The + * futex_requeue dropped our key1 reference and incremented our key2 + * reference count. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current); + spin_unlock(q.lock_ptr); + } + } else { + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. 
+ */ + WARN_ON(!&q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); + + spin_lock(q.lock_ptr); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it + * acquired the lock, clear -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { + if (rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + /* + * We've already been requeued, but cannot restart by calling + * futex_lock_pi() directly. We could restart this syscall, but + * it would detect that the user space "val" changed and return + * -EWOULDBLOCK. Save the overhead of the restart and return + * -EWOULDBLOCK directly. + */ + ret = -EWOULDBLOCK; + } + +out_put_keys: + put_futex_key(&q.key); +out_key2: + put_futex_key(&key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + +/* + * Support for robust futexes: the kernel cleans up held futexes at + * thread exit time. + * + * Implementation: user-space maintains a per-thread list of locks it + * is holding. Upon do_exit(), the kernel carefully walks this list, + * and marks all locks that are owned by this thread with the + * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is + * always manipulated with the lock held, so the list is private and + * per-thread. 
Userspace also maintains a per-thread 'list_op_pending' + * field, to allow the kernel to clean up if the thread dies after + * acquiring the lock, but just before it could have added itself to + * the list. There can only be one such pending lock. + */ + +/** + * sys_set_robust_list() - Set the robust-futex list head of a task + * @head: pointer to the list-head + * @len: length of the list-head, as userspace expects + */ +SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head, + size_t, len) +{ + if (!futex_cmpxchg_enabled) + return -ENOSYS; + /* + * The kernel knows only one size for now: + */ + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->robust_list = head; + + return 0; +} + +/** + * sys_get_robust_list() - Get the robust-futex list head of a task + * @pid: pid of the process [zero for current task] + * @head_ptr: pointer to a list-head pointer, the kernel fills it in + * @len_ptr: pointer to a length field, the kernel fills in the header size + */ +SYSCALL_DEFINE3(get_robust_list, int, pid, + struct robust_list_head __user * __user *, head_ptr, + size_t __user *, len_ptr) +{ + struct robust_list_head __user *head; + unsigned long ret; + struct task_struct *p; + + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + rcu_read_lock(); + + ret = -ESRCH; + if (!pid) + p = current; + else { + p = find_task_by_vpid(pid); + if (!p) + goto err_unlock; + } + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ)) + goto err_unlock; + + head = p->robust_list; + rcu_read_unlock(); + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(head, head_ptr); + +err_unlock: + rcu_read_unlock(); + + return ret; +} + +/* + * Process a futex-list entry, check whether it's owned by the + * dying task, and do notification if so: + */ +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) +{ + u32 uval, uninitialized_var(nval), mval; + +retry: + if (get_user(uval, uaddr)) + return -1; + + if 
((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) { + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + } + if (nval != uval) + goto retry; + + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi && (uval & FUTEX_WAITERS)) + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + } + return 0; +} + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user * __user *head, + unsigned int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long __user *)head)) + return -EFAULT; + + *entry = (void __user *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. 
+ */ +void exit_robust_list(struct task_struct *curr) +{ + struct robust_list_head __user *head = curr->robust_list; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int uninitialized_var(next_pi); + unsigned long futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (fetch_robust_entry(&entry, &head->list.next, &pi)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) + return; + + next_entry = NULL; /* avoid warning with gcc */ + while (entry != &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); + /* + * A pending lock might already be on the list, so + * don't process it twice: + */ + if (entry != pending) + if (handle_futex_death((void __user *)entry + futex_offset, + curr, pi)) + return; + if (rc) + return; + entry = next_entry; + pi = next_pi; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } + + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); +} + +long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, + u32 __user *uaddr2, u32 val2, u32 val3) +{ + int cmd = op & FUTEX_CMD_MASK; + unsigned int flags = 0; + + if (!(op & FUTEX_PRIVATE_FLAG)) + flags |= FLAGS_SHARED; + + if (op & FUTEX_CLOCK_REALTIME) { + flags |= FLAGS_CLOCKRT; + if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + return -ENOSYS; + } + + switch (cmd) { + case FUTEX_LOCK_PI: + case FUTEX_UNLOCK_PI: + case FUTEX_TRYLOCK_PI: + case FUTEX_WAIT_REQUEUE_PI: + case 
FUTEX_CMP_REQUEUE_PI: + if (!futex_cmpxchg_enabled) + return -ENOSYS; + } + + switch (cmd) { + case FUTEX_WAIT: + val3 = FUTEX_BITSET_MATCH_ANY; + case FUTEX_WAIT_BITSET: + return futex_wait(uaddr, flags, val, timeout, val3); + case FUTEX_WAKE: + val3 = FUTEX_BITSET_MATCH_ANY; + case FUTEX_WAKE_BITSET: + return futex_wake(uaddr, flags, val, val3); + case FUTEX_REQUEUE: + return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0); + case FUTEX_CMP_REQUEUE: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0); + case FUTEX_WAKE_OP: + return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3); + case FUTEX_LOCK_PI: + return futex_lock_pi(uaddr, flags, val, timeout, 0); + case FUTEX_UNLOCK_PI: + return futex_unlock_pi(uaddr, flags); + case FUTEX_TRYLOCK_PI: + return futex_lock_pi(uaddr, flags, 0, timeout, 1); + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3, + uaddr2); + case FUTEX_CMP_REQUEUE_PI: + return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1); + } + return -ENOSYS; +} + + +SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, + struct timespec __user *, utime, u32 __user *, uaddr2, + u32, val3) +{ + struct timespec ts; + ktime_t t, *tp = NULL; + u32 val2 = 0; + int cmd = op & FUTEX_CMD_MASK; + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (copy_from_user(&ts, utime, sizeof(ts)) != 0) + return -EFAULT; + if (!timespec_valid(&ts)) + return -EINVAL; + + t = timespec_to_ktime(ts); + if (cmd == FUTEX_WAIT) + t = ktime_add_safe(ktime_get(), t); + tp = &t; + } + /* + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. + * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. 
+ */ + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (u32) (unsigned long) utime; + + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); +} + +static int __init futex_init(void) +{ + u32 curval; + int i; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non-functional ones will return + * -ENOSYS. + */ + if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) + futex_cmpxchg_enabled = 1; + + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + plist_head_init(&futex_queues[i].chain); + spin_lock_init(&futex_queues[i].lock); + } + + return 0; +} +__initcall(futex_init); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c new file mode 100644 index 00000000..a9642d52 --- /dev/null +++ b/kernel/futex_compat.c @@ -0,0 +1,200 @@ +/* + * linux/kernel/futex_compat.c + * + * Futex compatibililty routines. + * + * Copyright 2006, Red Hat, Inc., Ingo Molnar + */ + +#include +#include +#include +#include +#include + +#include + + +/* + * Fetch a robust-list pointer. 
Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t __user *head, unsigned int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + +static void __user *futex_uaddr(struct robust_list __user *entry, + compat_long_t futex_offset) +{ + compat_uptr_t base = ptr_to_compat(entry); + void __user *uaddr = compat_ptr(base + futex_offset); + + return uaddr; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. + */ +void compat_exit_robust_list(struct task_struct *curr) +{ + struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int uninitialized_var(next_pi); + compat_uptr_t uentry, next_uentry, upending; + compat_long_t futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pip)) + return; + + next_entry = NULL; /* avoid warning with gcc */ + while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); + /* + * A pending lock might already be on the list, so + * 
dont process it twice: + */ + if (entry != pending) { + void __user *uaddr = futex_uaddr(entry, futex_offset); + + if (handle_futex_death(uaddr, curr, pi)) + return; + } + if (rc) + return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } + if (pending) { + void __user *uaddr = futex_uaddr(pending, futex_offset); + + handle_futex_death(uaddr, curr, pip); + } +} + +asmlinkage long +compat_sys_set_robust_list(struct compat_robust_list_head __user *head, + compat_size_t len) +{ + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + if (unlikely(len != sizeof(*head))) + return -EINVAL; + + current->compat_robust_list = head; + + return 0; +} + +asmlinkage long +compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, + compat_size_t __user *len_ptr) +{ + struct compat_robust_list_head __user *head; + unsigned long ret; + struct task_struct *p; + + if (!futex_cmpxchg_enabled) + return -ENOSYS; + + rcu_read_lock(); + + ret = -ESRCH; + if (!pid) + p = current; + else { + p = find_task_by_vpid(pid); + if (!p) + goto err_unlock; + } + + ret = -EPERM; + if (!ptrace_may_access(p, PTRACE_MODE_READ)) + goto err_unlock; + + head = p->compat_robust_list; + rcu_read_unlock(); + + if (put_user(sizeof(*head), len_ptr)) + return -EFAULT; + return put_user(ptr_to_compat(head), head_ptr); + +err_unlock: + rcu_read_unlock(); + + return ret; +} + +asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, + struct compat_timespec __user *utime, u32 __user *uaddr2, + u32 val3) +{ + struct timespec ts; + ktime_t t, *tp = NULL; + int val2 = 0; + int cmd = op & FUTEX_CMD_MASK; + + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { + if (get_compat_timespec(&ts, utime)) + return -EFAULT; + if (!timespec_valid(&ts)) + return -EINVAL; + + t = timespec_to_ktime(ts); + if (cmd == FUTEX_WAIT) 
+ t = ktime_add_safe(ktime_get(), t); + tp = &t; + } + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) + val2 = (int) (unsigned long) utime; + + return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); +} diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig new file mode 100644 index 00000000..5bf924d8 --- /dev/null +++ b/kernel/gcov/Kconfig @@ -0,0 +1,49 @@ +menu "GCOV-based kernel profiling" + +config GCOV_KERNEL + bool "Enable gcov-based kernel profiling" + depends on DEBUG_FS + select CONSTRUCTORS + default n + ---help--- + This option enables gcov-based code profiling (e.g. for code coverage + measurements). + + If unsure, say N. + + Additionally specify CONFIG_GCOV_PROFILE_ALL=y to get profiling data + for the entire kernel. To enable profiling for specific files or + directories, add a line similar to the following to the respective + Makefile: + + For a single file (e.g. main.o): + GCOV_PROFILE_main.o := y + + For all files in one directory: + GCOV_PROFILE := y + + To exclude files from being profiled even when CONFIG_GCOV_PROFILE_ALL + is specified, use: + + GCOV_PROFILE_main.o := n + and: + GCOV_PROFILE := n + + Note that the debugfs filesystem has to be mounted to access + profiling data. + +config GCOV_PROFILE_ALL + bool "Profile entire Kernel" + depends on GCOV_KERNEL + depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + default n + ---help--- + This options activates profiling for the entire kernel. + + If unsure, say N. + + Note that a kernel compiled with profiling flags will be significantly + larger and run slower. Also be sure to exclude files from profiling + which are not linked to the kernel image to prevent linker errors. 
+ +endmenu diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile new file mode 100644 index 00000000..e97ca59e --- /dev/null +++ b/kernel/gcov/Makefile @@ -0,0 +1,3 @@ +ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"' + +obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c new file mode 100644 index 00000000..9b22d03c --- /dev/null +++ b/kernel/gcov/base.c @@ -0,0 +1,148 @@ +/* + * This code maintains a list of active profiling data structures. + * + * Copyright IBM Corp. 2009 + * Author(s): Peter Oberparleiter + * + * Uses gcc-internal data definitions. + * Based on the gcov-kernel patch by: + * Hubertus Franke + * Nigel Hinds + * Rajan Ravindran + * Peter Oberparleiter + * Paul Larson + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include +#include +#include +#include "gcov.h" + +static struct gcov_info *gcov_info_head; +static int gcov_events_enabled; +static DEFINE_MUTEX(gcov_lock); + +/* + * __gcov_init is called by gcc-generated constructor code for each object + * file compiled with -fprofile-arcs. + */ +void __gcov_init(struct gcov_info *info) +{ + static unsigned int gcov_version; + + mutex_lock(&gcov_lock); + if (gcov_version == 0) { + gcov_version = info->version; + /* + * Printing gcc's version magic may prove useful for debugging + * incompatibility reports. + */ + pr_info("version magic: 0x%x\n", gcov_version); + } + /* + * Add new profiling data structure to list and inform event + * listener. + */ + info->next = gcov_info_head; + gcov_info_head = info; + if (gcov_events_enabled) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} +EXPORT_SYMBOL(__gcov_init); + +/* + * These functions may be referenced by gcc-generated profiling code but serve + * no function for kernel profiling. + */ +void __gcov_flush(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_flush); + +void __gcov_merge_add(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. 
*/ +} +EXPORT_SYMBOL(__gcov_merge_add); + +void __gcov_merge_single(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_single); + +void __gcov_merge_delta(gcov_type *counters, unsigned int n_counters) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_merge_delta); + +/** + * gcov_enable_events - enable event reporting through gcov_event() + * + * Turn on reporting of profiling data load/unload-events through the + * gcov_event() callback. Also replay all previous events once. This function + * is needed because some events are potentially generated too early for the + * callback implementation to handle them initially. + */ +void gcov_enable_events(void) +{ + struct gcov_info *info; + + mutex_lock(&gcov_lock); + gcov_events_enabled = 1; + /* Perform event callback for previously registered entries. */ + for (info = gcov_info_head; info; info = info->next) + gcov_event(GCOV_ADD, info); + mutex_unlock(&gcov_lock); +} + +#ifdef CONFIG_MODULES +static inline int within(void *addr, void *start, unsigned long size) +{ + return ((addr >= start) && (addr < start + size)); +} + +/* Update list and generate events when modules are unloaded. */ +static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, + void *data) +{ + struct module *mod = data; + struct gcov_info *info; + struct gcov_info *prev; + + if (event != MODULE_STATE_GOING) + return NOTIFY_OK; + mutex_lock(&gcov_lock); + prev = NULL; + /* Remove entries located in module from linked list. 
*/ + for (info = gcov_info_head; info; info = info->next) { + if (within(info, mod->module_core, mod->core_size)) { + if (prev) + prev->next = info->next; + else + gcov_info_head = info->next; + if (gcov_events_enabled) + gcov_event(GCOV_REMOVE, info); + } else + prev = info; + } + mutex_unlock(&gcov_lock); + + return NOTIFY_OK; +} + +static struct notifier_block gcov_nb = { + .notifier_call = gcov_module_notifier, +}; + +static int __init gcov_init(void) +{ + return register_module_notifier(&gcov_nb); +} +device_initcall(gcov_init); +#endif /* CONFIG_MODULES */ diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c new file mode 100644 index 00000000..9bd0934f --- /dev/null +++ b/kernel/gcov/fs.c @@ -0,0 +1,790 @@ +/* + * This code exports profiling data as debugfs files to userspace. + * + * Copyright IBM Corp. 2009 + * Author(s): Peter Oberparleiter + * + * Uses gcc-internal data definitions. + * Based on the gcov-kernel patch by: + * Hubertus Franke + * Nigel Hinds + * Rajan Ravindran + * Peter Oberparleiter + * Paul Larson + * Yi CDL Yang + */ + +#define pr_fmt(fmt) "gcov: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gcov.h" + +/** + * struct gcov_node - represents a debugfs entry + * @list: list head for child node list + * @children: child nodes + * @all: list head for list of all nodes + * @parent: parent node + * @loaded_info: array of pointers to profiling data sets for loaded object + * files. + * @num_loaded: number of profiling data sets for loaded object files. + * @unloaded_info: accumulated copy of profiling data sets for unloaded + * object files. Used only when gcov_persist=1. + * @dentry: main debugfs entry, either a directory or data file + * @links: associated symbolic links + * @name: data file basename + * + * struct gcov_node represents an entity within the gcov/ subdirectory + * of debugfs. There are directory and data file nodes. 
The latter represent + * the actual synthesized data file plus any associated symbolic links which + * are needed by the gcov tool to work correctly. + */ +struct gcov_node { + struct list_head list; + struct list_head children; + struct list_head all; + struct gcov_node *parent; + struct gcov_info **loaded_info; + struct gcov_info *unloaded_info; + struct dentry *dentry; + struct dentry **links; + int num_loaded; + char name[0]; +}; + +static const char objtree[] = OBJTREE; +static const char srctree[] = SRCTREE; +static struct gcov_node root_node; +static struct dentry *reset_dentry; +static LIST_HEAD(all_head); +static DEFINE_MUTEX(node_lock); + +/* If non-zero, keep copies of profiling data for unloaded modules. */ +static int gcov_persist = 1; + +static int __init gcov_persist_setup(char *str) +{ + unsigned long val; + + if (strict_strtoul(str, 0, &val)) { + pr_warning("invalid gcov_persist parameter '%s'\n", str); + return 0; + } + gcov_persist = val; + pr_info("setting gcov_persist to %d\n", gcov_persist); + + return 1; +} +__setup("gcov_persist=", gcov_persist_setup); + +/* + * seq_file.start() implementation for gcov data files. Note that the + * gcov_iterator interface is designed to be more restrictive than seq_file + * (no start from arbitrary position, etc.), to simplify the iterator + * implementation. + */ +static void *gcov_seq_start(struct seq_file *seq, loff_t *pos) +{ + loff_t i; + + gcov_iter_start(seq->private); + for (i = 0; i < *pos; i++) { + if (gcov_iter_next(seq->private)) + return NULL; + } + return seq->private; +} + +/* seq_file.next() implementation for gcov data files. */ +static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos) +{ + struct gcov_iterator *iter = data; + + if (gcov_iter_next(iter)) + return NULL; + (*pos)++; + + return iter; +} + +/* seq_file.show() implementation for gcov data files. 
+ */
+static int gcov_seq_show(struct seq_file *seq, void *data)
+{
+	struct gcov_iterator *it = data;
+
+	/* Any failure reported by the iterator writer is surfaced as -EINVAL. */
+	return gcov_iter_write(it, seq) ? -EINVAL : 0;
+}
+
+/* seq_file.stop() implementation for gcov data files. */
+static void gcov_seq_stop(struct seq_file *seq, void *data)
+{
+	/* Intentionally empty. */
+}
+
+/* Iteration callbacks passed to seq_open() by gcov_seq_open(). */
+static const struct seq_operations gcov_seq_ops = {
+	.start	= gcov_seq_start,
+	.next	= gcov_seq_next,
+	.show	= gcov_seq_show,
+	.stop	= gcov_seq_stop,
+};
+
+/*
+ * Pick one profiling data set that represents the node: the first loaded
+ * data set if any object file is loaded, otherwise the saved copy of the
+ * unloaded data (which may be NULL).
+ */
+static struct gcov_info *get_node_info(struct gcov_node *node)
+{
+	return node->num_loaded > 0 ? node->loaded_info[0] :
+				      node->unloaded_info;
+}
+
+/*
+ * Build a freshly allocated data set holding the sum of everything that is
+ * attached to the node. Returns NULL if the initial duplicate could not be
+ * created.
+ */
+static struct gcov_info *get_accumulated_info(struct gcov_node *node)
+{
+	struct gcov_info *sum;
+	int next = 0;
+
+	/* Seed with the saved copy if present, else with the first loaded set. */
+	if (node->unloaded_info) {
+		sum = gcov_info_dup(node->unloaded_info);
+	} else {
+		sum = gcov_info_dup(node->loaded_info[0]);
+		next = 1;
+	}
+	if (!sum)
+		return NULL;
+	/* Fold in every loaded data set that was not used as the seed. */
+	while (next < node->num_loaded) {
+		gcov_info_add(sum, node->loaded_info[next]);
+		next++;
+	}
+
+	return sum;
+}
+
+/*
+ * open() implementation for gcov data files. Takes a private, accumulated
+ * copy of the node's profiling data under node_lock and attaches an iterator
+ * over that copy to the seq_file.
+ */
+static int gcov_seq_open(struct inode *inode, struct file *file)
+{
+	struct gcov_node *node = inode->i_private;
+	struct gcov_iterator *iter;
+	struct gcov_info *copy;
+	struct seq_file *seq;
+	int rc;
+
+	mutex_lock(&node_lock);
+	/*
+	 * Work on a copy of the data: this keeps reference tracking and
+	 * concurrent access simple and makes summing several data sets of
+	 * one node straightforward.
+	 */
+	copy = get_accumulated_info(node);
+	if (!copy) {
+		rc = -ENOMEM;
+		goto unlock;
+	}
+	iter = gcov_iter_new(copy);
+	if (!iter) {
+		rc = -ENOMEM;
+		goto free_copy;
+	}
+	rc = seq_open(file, &gcov_seq_ops);
+	if (rc)
+		goto free_iter;
+	seq = file->private_data;
+	seq->private = iter;
+	goto unlock;
+
+free_iter:
+	gcov_iter_free(iter);
+free_copy:
+	gcov_info_free(copy);
+unlock:
+	mutex_unlock(&node_lock);
+	return rc;
+}
+
+/*
+ * release() implementation for gcov data files. Frees the iterator and the
+ * data copy created by gcov_seq_open(), then releases the seq_file.
+ */
+static int gcov_seq_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *seq = file->private_data;
+	struct gcov_iterator *iter = seq->private;
+	/* Fetch the copy before the iterator that refers to it is freed. */
+	struct gcov_info *copy = gcov_iter_get_info(iter);
+
+	gcov_iter_free(iter);
+	gcov_info_free(copy);
+	seq_release(inode, file);
+
+	return 0;
+}
+
+/*
+ * Look up the node whose representative data set carries the given data
+ * file name. Needs to be called with node_lock held.
+ */
+static struct gcov_node *get_node_by_name(const char *name)
+{
+	struct gcov_node *cur;
+
+	list_for_each_entry(cur, &all_head, all) {
+		struct gcov_info *info = get_node_info(cur);
+
+		/* Nodes without any data set cannot match. */
+		if (!info)
+			continue;
+		if (!strcmp(info->filename, name))
+			return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Zero the saved copy (if any) and every loaded data set of the node.
+ */
+static void reset_node(struct gcov_node *node)
+{
+	int idx = 0;
+
+	if (node->unloaded_info)
+		gcov_info_reset(node->unloaded_info);
+	while (idx < node->num_loaded) {
+		gcov_info_reset(node->loaded_info[idx]);
+		idx++;
+	}
+}
+
+static void remove_node(struct gcov_node *node);
+
+/*
+ * write() implementation for gcov data files. Reset profiling data for the
+ * corresponding file. If all associated object files have been unloaded,
+ * remove the debug fs node as well.
+ */
+static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
+			      size_t len, loff_t *pos)
+{
+	struct seq_file *seq = file->private_data;
+	struct gcov_info *copy = gcov_iter_get_info(seq->private);
+	struct gcov_node *node;
+
+	mutex_lock(&node_lock);
+	node = get_node_by_name(copy->filename);
+	/* Loaded nodes get their counters zeroed, fully unloaded ones go away. */
+	if (node && node->num_loaded != 0)
+		reset_node(node);
+	else if (node)
+		remove_node(node);
+	/* The copy backing this open file is reset in either case. */
+	gcov_info_reset(copy);
+	mutex_unlock(&node_lock);
+
+	return len;
+}
+
+/*
+ * Build a newly allocated string from @path with everything from its last
+ * '.' onwards replaced by ".@ext"; if @dir is non-NULL the result is
+ * prefixed with "@dir/". Returns NULL if an allocation fails.
+ */
+static char *link_target(const char *dir, const char *path, const char *ext)
+{
+	char *stem = kstrdup(path, GFP_KERNEL);
+	char *dot;
+	char *result;
+
+	if (!stem)
+		return NULL;
+	/* Cut the old extension off, if there is one. */
+	dot = strrchr(stem, '.');
+	if (dot)
+		*dot = '\0';
+	result = dir ? kasprintf(GFP_KERNEL, "%s/%s.%s", dir, stem, ext) :
+		       kasprintf(GFP_KERNEL, "%s.%s", stem, ext);
+	kfree(stem);
+
+	return result;
+}
+
+/*
+ * Work out the symbolic link target for a gcov data file name and link
+ * type. Files below objtree are mapped into srctree or objtree depending on
+ * the link type; anything else is treated as an external build and only has
+ * its extension replaced.
+ */
+static char *get_link_target(const char *filename, const struct gcov_link *ext)
+{
+	size_t prefix_len = strlen(objtree);
+	const char *base;
+
+	/* External compilation. */
+	if (strncmp(filename, objtree, prefix_len) != 0)
+		return link_target(NULL, filename, ext->ext);
+	/* Step over the objtree prefix and the separator following it. */
+	base = (ext->dir == SRC_TREE) ? srctree : objtree;
+
+	return link_target(base, filename + prefix_len + 1, ext->ext);
+}
+
+#define SKEW_PREFIX	".tmp_"
+
+/*
+ * Strip a leading ".tmp_" from a basename, leaving other names untouched.
+ * Needed to compensate for filename skewing caused by the mod-versioning
+ * mechanism.
+ */
+static const char *deskew(const char *basename)
+{
+	const size_t skew_len = sizeof(SKEW_PREFIX) - 1;
+
+	if (strncmp(basename, SKEW_PREFIX, skew_len) != 0)
+		return basename;
+
+	return basename + skew_len;
+}
+
+/*
+ * Create one symbolic link per gcov_link[] entry next to the data file
+ * (usually .c and .gcno files, which the gcov tool expects in the same
+ * directory). On any failure all links created so far are removed again and
+ * node->links is left NULL.
+ */
+static void add_links(struct gcov_node *node, struct dentry *parent)
+{
+	char *target;
+	char *slash;
+	int count = 0;
+	int i;
+
+	/* gcov_link[] is terminated by an entry with a NULL extension. */
+	while (gcov_link[count].ext)
+		count++;
+	node->links = kcalloc(count, sizeof(struct dentry *), GFP_KERNEL);
+	if (!node->links)
+		return;
+	for (i = 0; i < count; i++) {
+		target = get_link_target(get_node_info(node)->filename,
+					 &gcov_link[i]);
+		if (!target)
+			goto out_err;
+		/* The link is named after the last path component. */
+		slash = strrchr(target, '/');
+		if (!slash)
+			goto out_err;
+		node->links[i] = debugfs_create_symlink(deskew(slash + 1),
+							parent, target);
+		if (!node->links[i])
+			goto out_err;
+		kfree(target);
+	}
+
+	return;
+out_err:
+	kfree(target);
+	/* Undo the links that were created before the failing one. */
+	for (i--; i >= 0; i--)
+		debugfs_remove(node->links[i]);
+	kfree(node->links);
+	node->links = NULL;
+}
+
+/* File operations of gcov data files in debugfs. */
+static const struct file_operations gcov_data_fops = {
+	.open		= gcov_seq_open,
+	.release	= gcov_seq_release,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= gcov_seq_write,
+};
+
+/* Basic initialization of a new node.
+ */
+static void init_node(struct gcov_node *node, struct gcov_info *info,
+		      const char *name, struct gcov_node *parent)
+{
+	INIT_LIST_HEAD(&node->list);
+	INIT_LIST_HEAD(&node->children);
+	INIT_LIST_HEAD(&node->all);
+	/*
+	 * Only nodes whose loaded_info array was allocated by the caller
+	 * (data file nodes) get an initial data set.
+	 */
+	if (node->loaded_info) {
+		node->loaded_info[0] = info;
+		node->num_loaded = 1;
+	}
+	node->parent = parent;
+	/* The caller must have sized node->name[] to hold @name. */
+	if (name)
+		strcpy(node->name, name);
+}
+
+/*
+ * Create a new node and associated debugfs entry. Needs to be called with
+ * node_lock held.
+ *
+ * @parent: node below which the new node is created
+ * @info:   profiling data set for a data file node, NULL for a directory node
+ * @name:   basename of the new node
+ *
+ * Returns the new node, or NULL if an allocation or the creation of the
+ * debugfs entry failed. Nothing stays allocated on failure.
+ */
+static struct gcov_node *new_node(struct gcov_node *parent,
+				  struct gcov_info *info, const char *name)
+{
+	struct gcov_node *node;
+
+	/* The name is stored inline after the node itself. */
+	node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
+	if (!node)
+		goto err_nomem;
+	if (info) {
+		node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
+					    GFP_KERNEL);
+		if (!node->loaded_info)
+			goto err_nomem;
+	}
+	init_node(node, info, name, parent);
+	/* Differentiate between gcov data file nodes and directory nodes. */
+	if (info) {
+		node->dentry = debugfs_create_file(deskew(node->name), 0600,
+					parent->dentry, node, &gcov_data_fops);
+	} else
+		node->dentry = debugfs_create_dir(node->name, parent->dentry);
+	if (!node->dentry) {
+		pr_warning("could not create file\n");
+		/*
+		 * loaded_info is a separate allocation; freeing only the
+		 * node would leak it.
+		 */
+		kfree(node->loaded_info);
+		kfree(node);
+		return NULL;
+	}
+	if (info)
+		add_links(node, parent->dentry);
+	list_add(&node->list, &parent->children);
+	list_add(&node->all, &all_head);
+
+	return node;
+
+err_nomem:
+	/* node is NULL or has no loaded_info array at this point. */
+	kfree(node);
+	pr_warning("out of memory\n");
+	return NULL;
+}
+
+/* Remove symbolic links associated with node. */
+static void remove_links(struct gcov_node *node)
+{
+	int i;
+
+	if (!node->links)
+		return;
+	/* node->links has one slot per gcov_link[] entry, see add_links(). */
+	for (i = 0; gcov_link[i].ext; i++)
+		debugfs_remove(node->links[i]);
+	kfree(node->links);
+	node->links = NULL;
+}
+
+/*
+ * Remove node from all lists and debugfs and release associated resources.
+ * Needs to be called with node_lock held.
+ */
+static void release_node(struct gcov_node *node)
+{
+	/* Unhook from the global list and from the parent's child list. */
+	list_del(&node->all);
+	list_del(&node->list);
+	debugfs_remove(node->dentry);
+	remove_links(node);
+	kfree(node->loaded_info);
+	if (node->unloaded_info)
+		gcov_info_free(node->unloaded_info);
+	kfree(node);
+}
+
+/*
+ * Release node, then walk up and release each ancestor that has become
+ * empty. Stops at the root node or at the first node that still has
+ * children. Needs to be called with node_lock held.
+ */
+static void remove_node(struct gcov_node *node)
+{
+	struct gcov_node *up;
+
+	for (; node != &root_node; node = up) {
+		if (!list_empty(&node->children))
+			break;
+		/* Remember the parent before the node is freed. */
+		up = node->parent;
+		release_node(node);
+	}
+}
+
+/*
+ * Return the child of @parent whose basename equals @name, or NULL if there
+ * is none. Needs to be called with node_lock held.
+ */
+static struct gcov_node *get_child_by_name(struct gcov_node *parent,
+					   const char *name)
+{
+	struct gcov_node *child;
+
+	list_for_each_entry(child, &parent->children, list) {
+		if (!strcmp(child->name, name))
+			return child;
+	}
+
+	return NULL;
+}
+
+/*
+ * write() implementation for reset file. Reset all profiling data to zero
+ * and remove nodes for which all associated object files are unloaded.
+ */
+static ssize_t reset_write(struct file *file, const char __user *addr,
+			   size_t len, loff_t *pos)
+{
+	struct gcov_node *cur;
+
+	mutex_lock(&node_lock);
+restart:
+	list_for_each_entry(cur, &all_head, all) {
+		if (cur->num_loaded > 0) {
+			reset_node(cur);
+			continue;
+		}
+		/* Only leaves without loaded data are removed here. */
+		if (!list_empty(&cur->children))
+			continue;
+		remove_node(cur);
+		/* Several nodes may have gone - restart loop. */
+		goto restart;
+	}
+	mutex_unlock(&node_lock);
+
+	return len;
+}
+
+/* read() implementation for reset file. Always reports end of file. */
+static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
+			  loff_t *pos)
+{
+	/* Allow read operation so that a recursive copy won't fail. */
+	return 0;
+}
+
+/* File operations of the "reset" control file. */
+static const struct file_operations gcov_reset_fops = {
+	.write	= reset_write,
+	.read	= reset_read,
+	.llseek	= noop_llseek,
+};
+
+/*
+ * Create a node for a given profiling data set and add it to all lists and
+ * debugfs.
Needs to be called with node_lock held. + */ +static void add_node(struct gcov_info *info) +{ + char *filename; + char *curr; + char *next; + struct gcov_node *parent; + struct gcov_node *node; + + filename = kstrdup(info->filename, GFP_KERNEL); + if (!filename) + return; + parent = &root_node; + /* Create directory nodes along the path. */ + for (curr = filename; (next = strchr(curr, '/')); curr = next + 1) { + if (curr == next) + continue; + *next = 0; + if (strcmp(curr, ".") == 0) + continue; + if (strcmp(curr, "..") == 0) { + if (!parent->parent) + goto err_remove; + parent = parent->parent; + continue; + } + node = get_child_by_name(parent, curr); + if (!node) { + node = new_node(parent, NULL, curr); + if (!node) + goto err_remove; + } + parent = node; + } + /* Create file node. */ + node = new_node(parent, info, curr); + if (!node) + goto err_remove; +out: + kfree(filename); + return; + +err_remove: + remove_node(parent); + goto out; +} + +/* + * Associate a profiling data set with an existing node. Needs to be called + * with node_lock held. + */ +static void add_info(struct gcov_node *node, struct gcov_info *info) +{ + struct gcov_info **loaded_info; + int num = node->num_loaded; + + /* + * Prepare new array. This is done first to simplify cleanup in + * case the new data set is incompatible, the node only contains + * unloaded data sets and there's not enough memory for the array. + */ + loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL); + if (!loaded_info) { + pr_warning("could not add '%s' (out of memory)\n", + info->filename); + return; + } + memcpy(loaded_info, node->loaded_info, + num * sizeof(struct gcov_info *)); + loaded_info[num] = info; + /* Check if the new data set is compatible. */ + if (num == 0) { + /* + * A module was unloaded, modified and reloaded. The new + * data set replaces the copy of the last one. 
+ */ + if (!gcov_info_is_compatible(node->unloaded_info, info)) { + pr_warning("discarding saved data for %s " + "(incompatible version)\n", info->filename); + gcov_info_free(node->unloaded_info); + node->unloaded_info = NULL; + } + } else { + /* + * Two different versions of the same object file are loaded. + * The initial one takes precedence. + */ + if (!gcov_info_is_compatible(node->loaded_info[0], info)) { + pr_warning("could not add '%s' (incompatible " + "version)\n", info->filename); + kfree(loaded_info); + return; + } + } + /* Overwrite previous array. */ + kfree(node->loaded_info); + node->loaded_info = loaded_info; + node->num_loaded = num + 1; +} + +/* + * Return the index of a profiling data set associated with a node. + */ +static int get_info_index(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + for (i = 0; i < node->num_loaded; i++) { + if (node->loaded_info[i] == info) + return i; + } + return -ENOENT; +} + +/* + * Save the data of a profiling data set which is being unloaded. + */ +static void save_info(struct gcov_node *node, struct gcov_info *info) +{ + if (node->unloaded_info) + gcov_info_add(node->unloaded_info, info); + else { + node->unloaded_info = gcov_info_dup(info); + if (!node->unloaded_info) { + pr_warning("could not save data for '%s' " + "(out of memory)\n", info->filename); + } + } +} + +/* + * Disassociate a profiling data set from a node. Needs to be called with + * node_lock held. + */ +static void remove_info(struct gcov_node *node, struct gcov_info *info) +{ + int i; + + i = get_info_index(node, info); + if (i < 0) { + pr_warning("could not remove '%s' (not found)\n", + info->filename); + return; + } + if (gcov_persist) + save_info(node, info); + /* Shrink array. */ + node->loaded_info[i] = node->loaded_info[node->num_loaded - 1]; + node->num_loaded--; + if (node->num_loaded > 0) + return; + /* Last loaded data set was removed. 
*/ + kfree(node->loaded_info); + node->loaded_info = NULL; + node->num_loaded = 0; + if (!node->unloaded_info) + remove_node(node); +} + +/* + * Callback to create/remove profiling files when code compiled with + * -fprofile-arcs is loaded/unloaded. + */ +void gcov_event(enum gcov_action action, struct gcov_info *info) +{ + struct gcov_node *node; + + mutex_lock(&node_lock); + node = get_node_by_name(info->filename); + switch (action) { + case GCOV_ADD: + if (node) + add_info(node, info); + else + add_node(info); + break; + case GCOV_REMOVE: + if (node) + remove_info(node, info); + else { + pr_warning("could not remove '%s' (not found)\n", + info->filename); + } + break; + } + mutex_unlock(&node_lock); +} + +/* Create debugfs entries. */ +static __init int gcov_fs_init(void) +{ + int rc = -EIO; + + init_node(&root_node, NULL, NULL, NULL); + /* + * /sys/kernel/debug/gcov will be parent for the reset control file + * and all profiling files. + */ + root_node.dentry = debugfs_create_dir("gcov", NULL); + if (!root_node.dentry) + goto err_remove; + /* + * Create reset file which resets all profiling counts when written + * to. + */ + reset_dentry = debugfs_create_file("reset", 0600, root_node.dentry, + NULL, &gcov_reset_fops); + if (!reset_dentry) + goto err_remove; + /* Replay previous events to get our fs hierarchy up-to-date. */ + gcov_enable_events(); + return 0; + +err_remove: + pr_err("init failed\n"); + if (root_node.dentry) + debugfs_remove(root_node.dentry); + + return rc; +} +device_initcall(gcov_fs_init); diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c new file mode 100644 index 00000000..ae5bb426 --- /dev/null +++ b/kernel/gcov/gcc_3_4.c @@ -0,0 +1,447 @@ +/* + * This code provides functions to handle gcc's profiling data format + * introduced with gcc 3.4. Future versions of gcc may change the gcov + * format (as happened before), so all format-specific information needs + * to be kept modular and easily exchangeable. 
+ * + * This file is based on gcc-internal definitions. Functions and data + * structures are defined to be compatible with gcc counterparts. + * For a better understanding, refer to gcc source: gcc/gcov-io.h. + * + * Copyright IBM Corp. 2009 + * Author(s): Peter Oberparleiter + * + * Uses gcc-internal data definitions. + */ + +#include +#include +#include +#include +#include +#include "gcov.h" + +/* Symbolic links to be created for each profiling data file. */ +const struct gcov_link gcov_link[] = { + { OBJ_TREE, "gcno" }, /* Link to .gcno file in $(objtree). */ + { 0, NULL}, +}; + +/* + * Determine whether a counter is active. Based on gcc magic. Doesn't change + * at run-time. + */ +static int counter_active(struct gcov_info *info, unsigned int type) +{ + return (1 << type) & info->ctr_mask; +} + +/* Determine number of active counters. Based on gcc magic. */ +static unsigned int num_counter_active(struct gcov_info *info) +{ + unsigned int i; + unsigned int result = 0; + + for (i = 0; i < GCOV_COUNTERS; i++) { + if (counter_active(info, i)) + result++; + } + return result; +} + +/** + * gcov_info_reset - reset profiling data to zero + * @info: profiling data set + */ +void gcov_info_reset(struct gcov_info *info) +{ + unsigned int active = num_counter_active(info); + unsigned int i; + + for (i = 0; i < active; i++) { + memset(info->counts[i].values, 0, + info->counts[i].num * sizeof(gcov_type)); + } +} + +/** + * gcov_info_is_compatible - check if profiling data can be added + * @info1: first profiling data set + * @info2: second profiling data set + * + * Returns non-zero if profiling data can be added, zero otherwise. + */ +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) +{ + return (info1->stamp == info2->stamp); +} + +/** + * gcov_info_add - add up profiling data + * @dest: profiling data set to which data is added + * @source: profiling data set which is added + * + * Adds profiling counts of @source to @dest. 
+ */
+void gcov_info_add(struct gcov_info *dest, struct gcov_info *source)
+{
+	/*
+	 * The number of active counters depends only on dest->ctr_mask,
+	 * which is not modified here, so compute it once instead of
+	 * rescanning the mask on every iteration.
+	 */
+	unsigned int active = num_counter_active(dest);
+	unsigned int i;
+	unsigned int j;
+
+	for (i = 0; i < active; i++) {
+		for (j = 0; j < dest->counts[i].num; j++) {
+			dest->counts[i].values[j] +=
+				source->counts[i].values[j];
+		}
+	}
+}
+
+/*
+ * Get size of function info entry. Based on gcc magic: the fixed part of
+ * struct gcov_fn_info is followed by one unsigned int per active counter,
+ * and the total is rounded up to the struct's alignment when that alignment
+ * is larger than an unsigned int.
+ */
+static size_t get_fn_size(struct gcov_info *info)
+{
+	size_t size;
+
+	size = sizeof(struct gcov_fn_info) + num_counter_active(info) *
+	       sizeof(unsigned int);
+	if (__alignof__(struct gcov_fn_info) > sizeof(unsigned int))
+		size = ALIGN(size, __alignof__(struct gcov_fn_info));
+	return size;
+}
+
+/*
+ * Get address of function info entry. Based on gcc magic: entries are
+ * get_fn_size() bytes apart, so the table is indexed by byte offset.
+ */
+static struct gcov_fn_info *get_fn_info(struct gcov_info *info, unsigned int fn)
+{
+	return (struct gcov_fn_info *)
+		((char *) info->functions + fn * get_fn_size(info));
+}
+
+/**
+ * gcov_info_dup - duplicate profiling data set
+ * @info: profiling data set to duplicate
+ *
+ * Copies the header fields, the filename, the function table and the value
+ * array of every active counter.
+ *
+ * Return newly allocated duplicate on success, %NULL on error.
+ */
+struct gcov_info *gcov_info_dup(struct gcov_info *info)
+{
+	struct gcov_info *dup;
+	unsigned int i;
+	unsigned int active;
+
+	/* Duplicate gcov_info. */
+	active = num_counter_active(info);
+	dup = kzalloc(sizeof(struct gcov_info) +
+		      sizeof(struct gcov_ctr_info) * active, GFP_KERNEL);
+	if (!dup)
+		return NULL;
+	dup->version		= info->version;
+	dup->stamp		= info->stamp;
+	dup->n_functions	= info->n_functions;
+	dup->ctr_mask		= info->ctr_mask;
+	/* Duplicate filename. */
+	dup->filename		= kstrdup(info->filename, GFP_KERNEL);
+	if (!dup->filename)
+		goto err_free;
+	/* Duplicate table of functions. */
+	dup->functions = kmemdup(info->functions, info->n_functions *
+				 get_fn_size(info), GFP_KERNEL);
+	if (!dup->functions)
+		goto err_free;
+	/* Duplicate counter arrays. */
+	for (i = 0; i < active ; i++) {
+		struct gcov_ctr_info *ctr = &info->counts[i];
+		size_t size = ctr->num * sizeof(gcov_type);
+
+		dup->counts[i].num = ctr->num;
+		dup->counts[i].merge = ctr->merge;
+		dup->counts[i].values = vmalloc(size);
+		if (!dup->counts[i].values)
+			goto err_free;
+		memcpy(dup->counts[i].values, ctr->values, size);
+	}
+	return dup;
+
+err_free:
+	/*
+	 * dup was zero-allocated, so members that were never set are still
+	 * NULL when gcov_info_free() hands them to kfree()/vfree().
+	 */
+	gcov_info_free(dup);
+	return NULL;
+}
+
+/**
+ * gcov_info_free - release memory for profiling data set duplicate
+ * @info: profiling data set duplicate to free
+ *
+ * Frees the value array of every active counter, the function table, the
+ * filename and the data set itself.
+ */
+void gcov_info_free(struct gcov_info *info)
+{
+	unsigned int active = num_counter_active(info);
+	unsigned int i;
+
+	for (i = 0; i < active ; i++)
+		vfree(info->counts[i].values);
+	kfree(info->functions);
+	kfree(info->filename);
+	kfree(info);
+}
+
+/**
+ * struct type_info - iterator helper array
+ * @ctr_type: counter type
+ * @offset: index of the first value of the current function for this type
+ *
+ * This array is needed to convert the in-memory data format into the in-file
+ * data format:
+ *
+ * In-memory:
+ *   for each counter type
+ *     for each function
+ *       values
+ *
+ * In-file:
+ *   for each function
+ *     for each counter type
+ *       values
+ *
+ * See gcc source gcc/gcov-io.h for more information on data organization.
+ */ +struct type_info { + int ctr_type; + unsigned int offset; +}; + +/** + * struct gcov_iterator - specifies current file position in logical records + * @info: associated profiling data + * @record: record type + * @function: function number + * @type: counter type + * @count: index into values array + * @num_types: number of counter types + * @type_info: helper array to get values-array offset for current function + */ +struct gcov_iterator { + struct gcov_info *info; + + int record; + unsigned int function; + unsigned int type; + unsigned int count; + + int num_types; + struct type_info type_info[0]; +}; + +static struct gcov_fn_info *get_func(struct gcov_iterator *iter) +{ + return get_fn_info(iter->info, iter->function); +} + +static struct type_info *get_type(struct gcov_iterator *iter) +{ + return &iter->type_info[iter->type]; +} + +/** + * gcov_iter_new - allocate and initialize profiling data iterator + * @info: profiling data set to be iterated + * + * Return file iterator on success, %NULL otherwise. 
+ */ +struct gcov_iterator *gcov_iter_new(struct gcov_info *info) +{ + struct gcov_iterator *iter; + + iter = kzalloc(sizeof(struct gcov_iterator) + + num_counter_active(info) * sizeof(struct type_info), + GFP_KERNEL); + if (iter) + iter->info = info; + + return iter; +} + +/** + * gcov_iter_free - release memory for iterator + * @iter: file iterator to free + */ +void gcov_iter_free(struct gcov_iterator *iter) +{ + kfree(iter); +} + +/** + * gcov_iter_get_info - return profiling data set for given file iterator + * @iter: file iterator + */ +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter) +{ + return iter->info; +} + +/** + * gcov_iter_start - reset file iterator to starting position + * @iter: file iterator + */ +void gcov_iter_start(struct gcov_iterator *iter) +{ + int i; + + iter->record = 0; + iter->function = 0; + iter->type = 0; + iter->count = 0; + iter->num_types = 0; + for (i = 0; i < GCOV_COUNTERS; i++) { + if (counter_active(iter->info, i)) { + iter->type_info[iter->num_types].ctr_type = i; + iter->type_info[iter->num_types++].offset = 0; + } + } +} + +/* Mapping of logical record number to actual file content. */ +#define RECORD_FILE_MAGIC 0 +#define RECORD_GCOV_VERSION 1 +#define RECORD_TIME_STAMP 2 +#define RECORD_FUNCTION_TAG 3 +#define RECORD_FUNCTON_TAG_LEN 4 +#define RECORD_FUNCTION_IDENT 5 +#define RECORD_FUNCTION_CHECK 6 +#define RECORD_COUNT_TAG 7 +#define RECORD_COUNT_LEN 8 +#define RECORD_COUNT 9 + +/** + * gcov_iter_next - advance file iterator to next logical record + * @iter: file iterator + * + * Return zero if new position is valid, non-zero if iterator has reached end. 
+ */ +int gcov_iter_next(struct gcov_iterator *iter) +{ + switch (iter->record) { + case RECORD_FILE_MAGIC: + case RECORD_GCOV_VERSION: + case RECORD_FUNCTION_TAG: + case RECORD_FUNCTON_TAG_LEN: + case RECORD_FUNCTION_IDENT: + case RECORD_COUNT_TAG: + /* Advance to next record */ + iter->record++; + break; + case RECORD_COUNT: + /* Advance to next count */ + iter->count++; + /* fall through */ + case RECORD_COUNT_LEN: + if (iter->count < get_func(iter)->n_ctrs[iter->type]) { + iter->record = RECORD_COUNT; + break; + } + /* Advance to next counter type */ + get_type(iter)->offset += iter->count; + iter->count = 0; + iter->type++; + /* fall through */ + case RECORD_FUNCTION_CHECK: + if (iter->type < iter->num_types) { + iter->record = RECORD_COUNT_TAG; + break; + } + /* Advance to next function */ + iter->type = 0; + iter->function++; + /* fall through */ + case RECORD_TIME_STAMP: + if (iter->function < iter->info->n_functions) + iter->record = RECORD_FUNCTION_TAG; + else + iter->record = -1; + break; + } + /* Check for EOF. */ + if (iter->record == -1) + return -EINVAL; + else + return 0; +} + +/** + * seq_write_gcov_u32 - write 32 bit number in gcov format to seq_file + * @seq: seq_file handle + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. + */ +static int seq_write_gcov_u32(struct seq_file *seq, u32 v) +{ + return seq_write(seq, &v, sizeof(v)); +} + +/** + * seq_write_gcov_u64 - write 64 bit number in gcov format to seq_file + * @seq: seq_file handle + * @v: value to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. 
+ */ +static int seq_write_gcov_u64(struct seq_file *seq, u64 v) +{ + u32 data[2]; + + data[0] = (v & 0xffffffffUL); + data[1] = (v >> 32); + return seq_write(seq, data, sizeof(data)); +} + +/** + * gcov_iter_write - write data for current pos to seq_file + * @iter: file iterator + * @seq: seq_file handle + * + * Return zero on success, non-zero otherwise. + */ +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) +{ + int rc = -EINVAL; + + switch (iter->record) { + case RECORD_FILE_MAGIC: + rc = seq_write_gcov_u32(seq, GCOV_DATA_MAGIC); + break; + case RECORD_GCOV_VERSION: + rc = seq_write_gcov_u32(seq, iter->info->version); + break; + case RECORD_TIME_STAMP: + rc = seq_write_gcov_u32(seq, iter->info->stamp); + break; + case RECORD_FUNCTION_TAG: + rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); + break; + case RECORD_FUNCTON_TAG_LEN: + rc = seq_write_gcov_u32(seq, 2); + break; + case RECORD_FUNCTION_IDENT: + rc = seq_write_gcov_u32(seq, get_func(iter)->ident); + break; + case RECORD_FUNCTION_CHECK: + rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); + break; + case RECORD_COUNT_TAG: + rc = seq_write_gcov_u32(seq, + GCOV_TAG_FOR_COUNTER(get_type(iter)->ctr_type)); + break; + case RECORD_COUNT_LEN: + rc = seq_write_gcov_u32(seq, + get_func(iter)->n_ctrs[iter->type] * 2); + break; + case RECORD_COUNT: + rc = seq_write_gcov_u64(seq, + iter->info->counts[iter->type]. + values[iter->count + get_type(iter)->offset]); + break; + } + return rc; +} diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h new file mode 100644 index 00000000..060073eb --- /dev/null +++ b/kernel/gcov/gcov.h @@ -0,0 +1,128 @@ +/* + * Profiling infrastructure declarations. + * + * This file is based on gcc-internal definitions. Data structures are + * defined to be compatible with gcc counterparts. For a better + * understanding, refer to gcc source: gcc/gcov-io.h. + * + * Copyright IBM Corp. 2009 + * Author(s): Peter Oberparleiter + * + * Uses gcc-internal data definitions. 
+ */ + +#ifndef GCOV_H +#define GCOV_H GCOV_H + +#include + +/* + * Profiling data types used for gcc 3.4 and above - these are defined by + * gcc and need to be kept as close to the original definition as possible to + * remain compatible. + */ +#define GCOV_COUNTERS 5 +#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) +#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) +#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) +#define GCOV_TAG_FOR_COUNTER(count) \ + (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) + +#if BITS_PER_LONG >= 64 +typedef long gcov_type; +#else +typedef long long gcov_type; +#endif + +/** + * struct gcov_fn_info - profiling meta data per function + * @ident: object file-unique function identifier + * @checksum: function checksum + * @n_ctrs: number of values per counter type belonging to this function + * + * This data is generated by gcc during compilation and doesn't change + * at run-time. + */ +struct gcov_fn_info { + unsigned int ident; + unsigned int checksum; + unsigned int n_ctrs[0]; +}; + +/** + * struct gcov_ctr_info - profiling data per counter type + * @num: number of counter values for this type + * @values: array of counter values for this type + * @merge: merge function for counter values of this type (unused) + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the values array. 
+ */ +struct gcov_ctr_info { + unsigned int num; + gcov_type *values; + void (*merge)(gcov_type *, unsigned int); +}; + +/** + * struct gcov_info - profiling data per object file + * @version: gcov version magic indicating the gcc version used for compilation + * @next: list head for a singly-linked list + * @stamp: time stamp + * @filename: name of the associated gcov data file + * @n_functions: number of instrumented functions + * @functions: function data + * @ctr_mask: mask specifying which counter types are active + * @counts: counter data per counter type + * + * This data is generated by gcc during compilation and doesn't change + * at run-time with the exception of the next pointer. + */ +struct gcov_info { + unsigned int version; + struct gcov_info *next; + unsigned int stamp; + const char *filename; + unsigned int n_functions; + const struct gcov_fn_info *functions; + unsigned int ctr_mask; + struct gcov_ctr_info counts[0]; +}; + +/* Base interface. */ +enum gcov_action { + GCOV_ADD, + GCOV_REMOVE, +}; + +void gcov_event(enum gcov_action action, struct gcov_info *info); +void gcov_enable_events(void); + +/* Iterator control. */ +struct seq_file; +struct gcov_iterator; + +struct gcov_iterator *gcov_iter_new(struct gcov_info *info); +void gcov_iter_free(struct gcov_iterator *iter); +void gcov_iter_start(struct gcov_iterator *iter); +int gcov_iter_next(struct gcov_iterator *iter); +int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq); +struct gcov_info *gcov_iter_get_info(struct gcov_iterator *iter); + +/* gcov_info control. 
*/ +void gcov_info_reset(struct gcov_info *info); +int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2); +void gcov_info_add(struct gcov_info *dest, struct gcov_info *source); +struct gcov_info *gcov_info_dup(struct gcov_info *info); +void gcov_info_free(struct gcov_info *info); + +struct gcov_link { + enum { + OBJ_TREE, + SRC_TREE, + } dir; + const char *ext; +}; +extern const struct gcov_link gcov_link[]; + +#endif /* GCOV_H */ diff --git a/kernel/groups.c b/kernel/groups.c new file mode 100644 index 00000000..1cc476d5 --- /dev/null +++ b/kernel/groups.c @@ -0,0 +1,281 @@ +/* + * Supplementary group IDs + */ +#include +#include +#include +#include +#include +#include + +/* init to 2 - one for init_task, one to ensure it is never freed */ +struct group_info init_groups = { .usage = ATOMIC_INIT(2) }; + +struct group_info *groups_alloc(int gidsetsize) +{ + struct group_info *group_info; + int nblocks; + int i; + + nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK; + /* Make sure we always allocate at least one indirect block pointer */ + nblocks = nblocks ? 
: 1; + group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER); + if (!group_info) + return NULL; + group_info->ngroups = gidsetsize; + group_info->nblocks = nblocks; + atomic_set(&group_info->usage, 1); + + if (gidsetsize <= NGROUPS_SMALL) + group_info->blocks[0] = group_info->small_block; + else { + for (i = 0; i < nblocks; i++) { + gid_t *b; + b = (void *)__get_free_page(GFP_USER); + if (!b) + goto out_undo_partial_alloc; + group_info->blocks[i] = b; + } + } + return group_info; + +out_undo_partial_alloc: + while (--i >= 0) { + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); + return NULL; +} + +EXPORT_SYMBOL(groups_alloc); + +void groups_free(struct group_info *group_info) +{ + if (group_info->blocks[0] != group_info->small_block) { + int i; + for (i = 0; i < group_info->nblocks; i++) + free_page((unsigned long)group_info->blocks[i]); + } + kfree(group_info); +} + +EXPORT_SYMBOL(groups_free); + +/* export the group_info to a user-space array */ +static int groups_to_user(gid_t __user *grouplist, + const struct group_info *group_info) +{ + int i; + unsigned int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); + + if (copy_to_user(grouplist, group_info->blocks[i], len)) + return -EFAULT; + + grouplist += NGROUPS_PER_BLOCK; + count -= cp_count; + } + return 0; +} + +/* fill a group_info from a user-space array - it must be allocated already */ +static int groups_from_user(struct group_info *group_info, + gid_t __user *grouplist) +{ + int i; + unsigned int count = group_info->ngroups; + + for (i = 0; i < group_info->nblocks; i++) { + unsigned int cp_count = min(NGROUPS_PER_BLOCK, count); + unsigned int len = cp_count * sizeof(*grouplist); + + if (copy_from_user(group_info->blocks[i], grouplist, len)) + return -EFAULT; + + grouplist += NGROUPS_PER_BLOCK; + count -= cp_count; + } + 
return 0; +} + +/* a simple Shell sort */ +static void groups_sort(struct group_info *group_info) +{ + int base, max, stride; + int gidsetsize = group_info->ngroups; + + for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1) + ; /* nothing */ + stride /= 3; + + while (stride) { + max = gidsetsize - stride; + for (base = 0; base < max; base++) { + int left = base; + int right = left + stride; + gid_t tmp = GROUP_AT(group_info, right); + + while (left >= 0 && GROUP_AT(group_info, left) > tmp) { + GROUP_AT(group_info, right) = + GROUP_AT(group_info, left); + right = left; + left -= stride; + } + GROUP_AT(group_info, right) = tmp; + } + stride /= 3; + } +} + +/* a simple bsearch */ +int groups_search(const struct group_info *group_info, gid_t grp) +{ + unsigned int left, right; + + if (!group_info) + return 0; + + left = 0; + right = group_info->ngroups; + while (left < right) { + unsigned int mid = (left+right)/2; + if (grp > GROUP_AT(group_info, mid)) + left = mid + 1; + else if (grp < GROUP_AT(group_info, mid)) + right = mid; + else + return 1; + } + return 0; +} + +/** + * set_groups - Change a group subscription in a set of credentials + * @new: The newly prepared set of credentials to alter + * @group_info: The group list to install + * + * Validate a group subscription and, if valid, insert it into a set + * of credentials. + */ +int set_groups(struct cred *new, struct group_info *group_info) +{ + put_group_info(new->group_info); + groups_sort(group_info); + get_group_info(group_info); + new->group_info = group_info; + return 0; +} + +EXPORT_SYMBOL(set_groups); + +/** + * set_current_groups - Change current's group subscription + * @group_info: The group list to impose + * + * Validate a group subscription and, if valid, impose it upon current's task + * security record. 
+ */ +int set_current_groups(struct group_info *group_info) +{ + struct cred *new; + int ret; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + ret = set_groups(new, group_info); + if (ret < 0) { + abort_creds(new); + return ret; + } + + return commit_creds(new); +} + +EXPORT_SYMBOL(set_current_groups); + +SYSCALL_DEFINE2(getgroups, int, gidsetsize, gid_t __user *, grouplist) +{ + const struct cred *cred = current_cred(); + int i; + + if (gidsetsize < 0) + return -EINVAL; + + /* no need to grab task_lock here; it cannot change */ + i = cred->group_info->ngroups; + if (gidsetsize) { + if (i > gidsetsize) { + i = -EINVAL; + goto out; + } + if (groups_to_user(grouplist, cred->group_info)) { + i = -EFAULT; + goto out; + } + } +out: + return i; +} + +/* + * SMP: Our groups are copy-on-write. We can set them safely + * without another task interfering. + */ + +SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) +{ + struct group_info *group_info; + int retval; + + if (!nsown_capable(CAP_SETGID)) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; + + group_info = groups_alloc(gidsetsize); + if (!group_info) + return -ENOMEM; + retval = groups_from_user(group_info, grouplist); + if (retval) { + put_group_info(group_info); + return retval; + } + + retval = set_current_groups(group_info); + put_group_info(group_info); + + return retval; +} + +/* + * Check whether we're fsgid/egid or in the supplemental group.. 
+ */ +int in_group_p(gid_t grp) +{ + const struct cred *cred = current_cred(); + int retval = 1; + + if (grp != cred->fsgid) + retval = groups_search(cred->group_info, grp); + return retval; +} + +EXPORT_SYMBOL(in_group_p); + +int in_egroup_p(gid_t grp) +{ + const struct cred *cred = current_cred(); + int retval = 1; + + if (grp != cred->egid) + retval = groups_search(cred->group_info, grp); + return retval; +} + +EXPORT_SYMBOL(in_egroup_p); diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c new file mode 100644 index 00000000..2043c08d --- /dev/null +++ b/kernel/hrtimer.c @@ -0,0 +1,1861 @@ +/* + * linux/kernel/hrtimer.c + * + * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * High-resolution kernel timers + * + * In contrast to the low-resolution timeout API implemented in + * kernel/timer.c, hrtimers provide finer resolution and accuracy + * depending on system configuration and capabilities. + * + * These timers are currently used for: + * - itimers + * - POSIX timers + * - nanosleep + * - precise in-kernel timing + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Credits: + * based on kernel/timer.c + * + * Help, testing, suggestions, bugfixes, improvements were + * provided by: + * + * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel + * et. al. + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +/* + * The timer bases: + * + * There are more clockids than hrtimer bases. Thus, we index + * into the timer bases by the hrtimer_base_type enum. When trying + * to reach a base using a clockid, hrtimer_clockid_to_base() + * is used to convert from clockid to the proper hrtimer_base_type. 
+ */ +DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = +{ + + .clock_base = + { + { + .index = HRTIMER_BASE_MONOTONIC, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_REALTIME, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + .resolution = KTIME_LOW_RES, + }, + { + .index = HRTIMER_BASE_BOOTTIME, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + .resolution = KTIME_LOW_RES, + }, + } +}; + +static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { + [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, + [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, + [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, +}; + +static inline int hrtimer_clockid_to_base(clockid_t clock_id) +{ + return hrtimer_clock_to_base_table[clock_id]; +} + + +/* + * Get the coarse grained time at the softirq based on xtime and + * wall_to_monotonic. + */ +static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) +{ + ktime_t xtim, mono, boot; + struct timespec xts, tom, slp; + + get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp); + + xtim = timespec_to_ktime(xts); + mono = ktime_add(xtim, timespec_to_ktime(tom)); + boot = ktime_add(mono, timespec_to_ktime(slp)); + base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; + base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; + base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; +} + +/* + * Functions and macros which are different for UP/SMP systems are kept in a + * single place + */ +#ifdef CONFIG_SMP + +/* + * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on the lists/queues. 
+ * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. + */ +static +struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, + unsigned long *flags) +{ + struct hrtimer_clock_base *base; + + for (;;) { + base = timer->base; + if (likely(base != NULL)) { + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + if (likely(base == timer->base)) + return base; + /* The timer has migrated to another CPU: */ + raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); + } + cpu_relax(); + } +} + + +/* + * Get the preferred target CPU for NOHZ + */ +static int hrtimer_get_target(int this_cpu, int pinned) +{ +#ifdef CONFIG_NO_HZ + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) + return get_nohz_timer_target(); +#endif + return this_cpu; +} + +/* + * With HIGHRES=y we do not migrate the timer when it is expiring + * before the next event on the target cpu because we cannot reprogram + * the target cpu hardware and we would cause it to fire late. + * + * Called with cpu_base->lock of target cpu held. + */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires; + + if (!new_base->cpu_base->hres_active) + return 0; + + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); + return expires.tv64 <= new_base->cpu_base->expires_next.tv64; +#else + return 0; +#endif +} + +/* + * Switch the timer base to the current CPU when possible. 
+ */ +static inline struct hrtimer_clock_base * +switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, + int pinned) +{ + struct hrtimer_clock_base *new_base; + struct hrtimer_cpu_base *new_cpu_base; + int this_cpu = smp_processor_id(); + int cpu = hrtimer_get_target(this_cpu, pinned); + int basenum = base->index; + +again: + new_cpu_base = &per_cpu(hrtimer_bases, cpu); + new_base = &new_cpu_base->clock_base[basenum]; + + if (base != new_base) { + /* + * We are trying to move timer to new_base. + * However we can't change timer's base while it is running, + * so we keep it on the same CPU. No hassle vs. reprogramming + * the event source in the high resolution case. The softirq + * code will take care of this when the timer function has + * completed. There is no conflict as we hold the lock until + * the timer is enqueued. + */ + if (unlikely(hrtimer_callback_running(timer))) + return base; + + /* See the comment in lock_timer_base() */ + timer->base = NULL; + raw_spin_unlock(&base->cpu_base->lock); + raw_spin_lock(&new_base->cpu_base->lock); + + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + raw_spin_unlock(&new_base->cpu_base->lock); + raw_spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; + } + timer->base = new_base; + } + return new_base; +} + +#else /* CONFIG_SMP */ + +static inline struct hrtimer_clock_base * +lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + struct hrtimer_clock_base *base = timer->base; + + raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); + + return base; +} + +# define switch_hrtimer_base(t, b, p) (b) + +#endif /* !CONFIG_SMP */ + +/* + * Functions for the union type storage format of ktime_t which are + * too large for inlining: + */ +#if BITS_PER_LONG < 64 +# ifndef CONFIG_KTIME_SCALAR +/** + * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable + * @kt: addend + * @nsec: the scalar nsec value to add + * + * 
Returns the sum of kt and nsec in ktime_t format + */ +ktime_t ktime_add_ns(const ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_add(kt, tmp); +} + +EXPORT_SYMBOL_GPL(ktime_add_ns); + +/** + * ktime_sub_ns - Subtract a scalar nanoseconds value from a ktime_t variable + * @kt: minuend + * @nsec: the scalar nsec value to subtract + * + * Returns the subtraction of @nsec from @kt in ktime_t format + */ +ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec) +{ + ktime_t tmp; + + if (likely(nsec < NSEC_PER_SEC)) { + tmp.tv64 = nsec; + } else { + unsigned long rem = do_div(nsec, NSEC_PER_SEC); + + tmp = ktime_set((long)nsec, rem); + } + + return ktime_sub(kt, tmp); +} + +EXPORT_SYMBOL_GPL(ktime_sub_ns); +# endif /* !CONFIG_KTIME_SCALAR */ + +/* + * Divide a ktime value by a nanosecond value + */ +u64 ktime_divns(const ktime_t kt, s64 div) +{ + u64 dclc; + int sft = 0; + + dclc = ktime_to_ns(kt); + /* Make sure the divisor is less than 2^32: */ + while (div >> 32) { + sft++; + div >>= 1; + } + dclc >>= sft; + do_div(dclc, (unsigned long) div); + + return dclc; +} +#endif /* BITS_PER_LONG < 64 */ + +/* + * Add two ktime values and do a safety check for overflow: + */ +ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) +{ + ktime_t res = ktime_add(lhs, rhs); + + /* + * We use KTIME_SEC_MAX here, the maximum timeout which we can + * return to user space in a timespec: + */ + if (res.tv64 < 0 || res.tv64 < lhs.tv64 || res.tv64 < rhs.tv64) + res = ktime_set(KTIME_SEC_MAX, 0); + + return res; +} + +EXPORT_SYMBOL_GPL(ktime_add_safe); + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr hrtimer_debug_descr; + +static void *hrtimer_debug_hint(void *addr) +{ + return ((struct hrtimer *) addr)->function; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ 
+static int hrtimer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_init(timer, &hrtimer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int hrtimer_fixup_activate(void *addr, enum debug_obj_state state) +{ + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + /* fall through */ + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int hrtimer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct hrtimer *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + hrtimer_cancel(timer); + debug_object_free(timer, &hrtimer_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr hrtimer_debug_descr = { + .name = "hrtimer", + .debug_hint = hrtimer_debug_hint, + .fixup_init = hrtimer_fixup_init, + .fixup_activate = hrtimer_fixup_activate, + .fixup_free = hrtimer_fixup_free, +}; + +static inline void debug_hrtimer_init(struct hrtimer *timer) +{ + debug_object_init(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_activate(struct hrtimer *timer) +{ + debug_object_activate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) +{ + debug_object_deactivate(timer, &hrtimer_debug_descr); +} + +static inline void debug_hrtimer_free(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode); + +void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_object_init_on_stack(timer, 
&hrtimer_debug_descr); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); + +void destroy_hrtimer_on_stack(struct hrtimer *timer) +{ + debug_object_free(timer, &hrtimer_debug_descr); +} + +#else /* !CONFIG_DEBUG_OBJECTS_TIMERS */ +static inline void debug_hrtimer_init(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer) { } +static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } +#endif /* CONFIG_DEBUG_OBJECTS_TIMERS */ + +static inline void +debug_init(struct hrtimer *timer, clockid_t clockid, + enum hrtimer_mode mode) +{ + debug_hrtimer_init(timer); + trace_hrtimer_init(timer, clockid, mode); +} + +static inline void debug_activate(struct hrtimer *timer) +{ + debug_hrtimer_activate(timer); + trace_hrtimer_start(timer); +} + +static inline void debug_deactivate(struct hrtimer *timer) +{ + debug_hrtimer_deactivate(timer); + trace_hrtimer_cancel(timer); +} + +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer enabled ? + */ +static int hrtimer_hres_enabled __read_mostly = 1; + +/* + * Enable / Disable high resolution mode (highres=on|off handler) + */ +static int __init setup_hrtimer_hres(char *str) +{ + if (!strcmp(str, "off")) + hrtimer_hres_enabled = 0; + else if (!strcmp(str, "on")) + hrtimer_hres_enabled = 1; + else + return 0; + return 1; +} + +__setup("highres=", setup_hrtimer_hres); + +/* + * hrtimer_is_hres_enabled - query, if the highres mode is enabled + */ +static inline int hrtimer_is_hres_enabled(void) +{ + return hrtimer_hres_enabled; +} + +/* + * Is the high resolution mode active ? 
+ */ +static inline int hrtimer_hres_active(void) +{ + return __this_cpu_read(hrtimer_bases.hres_active); +} + +/* + * Reprogram the event source with checking both queues for the + * next event + * Called with interrupts disabled and base->lock held + */ +static void +hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) +{ + int i; + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t expires, expires_next; + + expires_next.tv64 = KTIME_MAX; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + timer = container_of(next, struct hrtimer, node); + + expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + /* + * clock_was_set() has changed base->offset so the + * result might be negative. Fix it up to prevent a + * false positive in clockevents_program_event() + */ + if (expires.tv64 < 0) + expires.tv64 = 0; + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + } + + if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) + return; + + cpu_base->expires_next.tv64 = expires_next.tv64; + + if (cpu_base->expires_next.tv64 != KTIME_MAX) + tick_program_event(cpu_base->expires_next, 1); +} + +/* + * Shared reprogramming for clock_realtime and clock_monotonic + * + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. 
+ * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static int hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + int res; + + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + + /* + * When the callback is running, we do not reprogram the clock event + * device. The timer callback is either running on a different CPU or + * the callback is executed in the hrtimer_interrupt context. The + * reprogramming is handled either by the softirq, which called the + * callback or at the end of the hrtimer_interrupt. + */ + if (hrtimer_callback_running(timer)) + return 0; + + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Nothing wrong + * about that, just avoid to call into the tick code, which + * has now objections against negative expiry values. + */ + if (expires.tv64 < 0) + return -ETIME; + + if (expires.tv64 >= cpu_base->expires_next.tv64) + return 0; + + /* + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (cpu_base->hang_detected) + return 0; + + /* + * Clockevents returns -ETIME, when the event was in the past. + */ + res = tick_program_event(expires, 0); + if (!IS_ERR_VALUE(res)) + cpu_base->expires_next = expires; + return res; +} + +/* + * Initialize the high resolution related parts of cpu_base + */ +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +{ + base->expires_next.tv64 = KTIME_MAX; + base->hres_active = 0; +} + +/* + * When High resolution timers are active, try to reprogram. Note, that in case + * the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry + * check happens. 
The timer gets enqueued into the rbtree. The reprogramming + * and expiry check is done in the hrtimer_interrupt or in the softirq. + * + * Returns 1 when hrtimer_reprogram() returned non-zero and HRTIMER_SOFTIRQ + * was raised, 0 otherwise. With @wakeup set, the base lock is dropped + * around raise_softirq_irqoff(). + */ +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base, + int wakeup) +{ + if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) { + if (wakeup) { + raw_spin_unlock(&base->cpu_base->lock); + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + raw_spin_lock(&base->cpu_base->lock); + } else + __raise_softirq_irqoff(HRTIMER_SOFTIRQ); + + return 1; + } + + return 0; +} + +/* + * Retrigger next event is called after clock was set + * + * Does nothing unless high resolution mode is active on this CPU. + * + * Called with interrupts disabled via on_each_cpu() + */ +static void retrigger_next_event(void *arg) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct timespec realtime_offset, xtim, wtm, sleep; + + if (!hrtimer_hres_active()) + return; + + /* Optimized out for !HIGH_RES */ + get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep); + set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec); + + /* Adjust the CLOCK_REALTIME and CLOCK_BOOTTIME offsets */ + raw_spin_lock(&base->lock); + base->clock_base[HRTIMER_BASE_REALTIME].offset = + timespec_to_ktime(realtime_offset); + base->clock_base[HRTIMER_BASE_BOOTTIME].offset = + timespec_to_ktime(sleep); + + hrtimer_force_reprogram(base, 0); + raw_spin_unlock(&base->lock); +} + +/* + * Switch to high resolution mode + * + * Returns 1 when high resolution mode is active on this CPU (already, or + * after a successful switch) and 0 when tick_init_highres() failed. + */ +static int hrtimer_switch_to_hres(void) +{ + int i, cpu = smp_processor_id(); + struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); + unsigned long flags; + + if (base->hres_active) + return 1; + + local_irq_save(flags); + + if (tick_init_highres()) { + local_irq_restore(flags); + printk(KERN_WARNING "Could not switch to high resolution " + "mode on CPU %d\n", cpu); + return 0; + } + base->hres_active = 1; + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) + base->clock_base[i].resolution = KTIME_HIGH_RES; + + tick_setup_sched_timer(); + + /* "Retrigger" the interrupt
to get things going */ + retrigger_next_event(NULL); + local_irq_restore(flags); + return 1; +} + +#else /* no-op stubs */ + +static inline int hrtimer_hres_active(void) { return 0; } +static inline int hrtimer_is_hres_enabled(void) { return 0; } +static inline int hrtimer_switch_to_hres(void) { return 0; } +static inline void +hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } +static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base, + int wakeup) +{ + return 0; +} +static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } +static inline void retrigger_next_event(void *arg) { } + +#endif /* CONFIG_HIGH_RES_TIMERS */ + +/* + * Clock realtime was set + * + * Change the offset of the realtime clock vs. the monotonic + * clock. + * + * We might have to reprogram the high resolution timer interrupt. On + * SMP we call the architecture specific code to retrigger _all_ high + * resolution timer interrupts. On UP we just disable interrupts and + * call the high resolution interrupt code.
+ */ +void clock_was_set(void) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + /* Retrigger the CPU local events everywhere */ + on_each_cpu(retrigger_next_event, NULL, 1); +#endif + timerfd_clock_was_set(); +} + +/* + * During resume we might have to reprogram the high resolution timer + * interrupt (on the local CPU). Warns once if called with interrupts + * enabled: + */ +void hrtimers_resume(void) +{ + WARN_ONCE(!irqs_disabled(), + KERN_INFO "hrtimers_resume() called with IRQs enabled!"); + + retrigger_next_event(NULL); + timerfd_clock_was_set(); +} + +static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (timer->start_site) + return; /* start info already recorded, keep it */ + timer->start_site = __builtin_return_address(0); + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +#endif +} + +static inline void timer_stats_hrtimer_clear_start_info(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; /* lets the next start record new info */ +#endif +} + +static inline void timer_stats_account_hrtimer(struct hrtimer *timer) +{ +#ifdef CONFIG_TIMER_STATS + if (likely(!timer_stats_active)) + return; /* statistics collection is off */ + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + timer->function, timer->start_comm, 0); +#endif +} + +/* + * Counterpart to lock_hrtimer_base above: drops the cpu_base lock of + * @timer's base and restores the interrupt state saved in @flags. + */ +static inline +void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) +{ + raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); +} + +/** + * hrtimer_forward - forward the timer expiry + * @timer: hrtimer to forward + * @now: forward past this time + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire in the future. An @interval + * shorter than the resolution of the timer's clock base is raised to that + * resolution. + * Returns the number of overruns, or 0 (leaving the expiry untouched) when + * the timer has not yet expired relative to @now.
+ */ +u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) +{ + u64 orun = 1; + ktime_t delta; + + delta = ktime_sub(now, hrtimer_get_expires(timer)); + + if (delta.tv64 < 0) + return 0; /* expiry is still ahead of @now */ + + if (interval.tv64 < timer->base->resolution.tv64) + interval.tv64 = timer->base->resolution.tv64; + + if (unlikely(delta.tv64 >= interval.tv64)) { + s64 incr = ktime_to_ns(interval); + + orun = ktime_divns(delta, incr); + hrtimer_add_expires_ns(timer, incr * orun); + if (hrtimer_get_expires_tv64(timer) > now.tv64) + return orun; + /* + * This (and the hrtimer_add_expires() below) is the + * correction for exact: + */ + orun++; + } + hrtimer_add_expires(timer, interval); + + return orun; +} +EXPORT_SYMBOL_GPL(hrtimer_forward); + +/* + * enqueue_hrtimer - internal function to (re)start a timer + * + * The timer is inserted in expiry order. Insertion into the + * red black tree is O(log(n)). Must hold the base lock. + * The bit of @base is also set in cpu_base->active_bases. + * + * Returns 1 when the new timer is the leftmost timer in the tree. + */ +static int enqueue_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base) +{ + debug_activate(timer); + + timerqueue_add(&base->active, &timer->node); + base->cpu_base->active_bases |= 1 << base->index; + + /* + * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the + * state of a possibly running callback. + */ + timer->state |= HRTIMER_STATE_ENQUEUED; + + return (&timer->node == base->active.next); +} + +/* + * __remove_hrtimer - internal function to remove a timer + * + * Caller must hold the base lock. + * + * High resolution timer mode reprograms the clock event device when the + * timer is the one which expires next. The caller can disable this by setting + * reprogram to zero. This is useful, when the context does a reprogramming + * anyway (e.g.
timer interrupt) + * + * @newstate is stored in timer->state in every case, including when the + * timer was not enqueued. + */ +static void __remove_hrtimer(struct hrtimer *timer, + struct hrtimer_clock_base *base, + unsigned long newstate, int reprogram) +{ + struct timerqueue_node *next_timer; + if (!(timer->state & HRTIMER_STATE_ENQUEUED)) + goto out; + + next_timer = timerqueue_getnext(&base->active); + timerqueue_del(&base->active, &timer->node); + if (&timer->node == next_timer) { +#ifdef CONFIG_HIGH_RES_TIMERS + /* Reprogram the clock event device, if enabled */ + if (reprogram && hrtimer_hres_active()) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (base->cpu_base->expires_next.tv64 == expires.tv64) + hrtimer_force_reprogram(base->cpu_base, 1); + } +#endif + } + if (!timerqueue_getnext(&base->active)) + base->cpu_base->active_bases &= ~(1 << base->index); +out: + timer->state = newstate; +} + +/* + * remove hrtimer, called with base lock held + * + * Returns 1 when the timer was queued and has been removed, 0 otherwise. + */ +static inline int +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) +{ + if (hrtimer_is_queued(timer)) { + unsigned long state; + int reprogram; + + /* + * Remove the timer and force reprogramming when high + * resolution mode is active and the timer is on the current + * CPU. If we remove a timer on another CPU, reprogramming is + * skipped. The interrupt event on this CPU is fired and + * reprogramming happens in the interrupt handler. This is a + * rare case and less expensive than a smp call. + */ + debug_deactivate(timer); + timer_stats_hrtimer_clear_start_info(timer); + reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); + /* + * We must preserve the CALLBACK state flag here, + * otherwise we could move the timer base in + * switch_hrtimer_base.
+ */ + state = timer->state & HRTIMER_STATE_CALLBACK; + __remove_hrtimer(timer, base, state, reprogram); + return 1; + } + return 0; +} + +int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode, + int wakeup) +{ + struct hrtimer_clock_base *base, *new_base; + unsigned long flags; + int ret, leftmost; + + base = lock_hrtimer_base(timer, &flags); + + /* Remove an active timer from the queue; ret is 1 if it was queued: */ + ret = remove_hrtimer(timer, base); + + /* Switch the timer base, if necessary: */ + new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); + + if (mode & HRTIMER_MODE_REL) { + tim = ktime_add_safe(tim, new_base->get_time()); + /* + * CONFIG_TIME_LOW_RES is a temporary way for architectures + * to signal that they simply return xtime in + * do_gettimeoffset(). In this case we want to round up by + * resolution when starting a relative timer, to avoid short + * timeouts. This will go away with the GTOD framework. + */ +#ifdef CONFIG_TIME_LOW_RES + tim = ktime_add_safe(tim, base->resolution); +#endif + } + + hrtimer_set_expires_range_ns(timer, tim, delta_ns); + + timer_stats_hrtimer_set_start_info(timer); + + leftmost = enqueue_hrtimer(timer, new_base); + + /* + * Only allow reprogramming if the new base is on this CPU. + * (it might still be on another CPU if the timer was pending) + * + * XXX send_remote_softirq() ?
+ */ + if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases)) + hrtimer_enqueue_reprogram(timer, new_base, wakeup); + + unlock_hrtimer_base(timer, &flags); + + return ret; +} + +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or relative (HRTIMER_MODE_REL) + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); +} +EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); + +/** + * hrtimer_start - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or relative (HRTIMER_MODE_REL) + * + * Same as hrtimer_start_range_ns() with a slack of 0. + * + * Returns: + * 0 on success + * 1 when the timer was active + */ +int +hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) +{ + return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); +} +EXPORT_SYMBOL_GPL(hrtimer_start); + + +/** + * hrtimer_try_to_cancel - try to deactivate a timer + * @timer: hrtimer to stop + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + * -1 when the timer is currently executing the callback function and + * cannot be stopped + */ +int hrtimer_try_to_cancel(struct hrtimer *timer) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + int ret = -1; + + base = lock_hrtimer_base(timer, &flags); + + if (!hrtimer_callback_running(timer)) + ret = remove_hrtimer(timer, base); + + unlock_hrtimer_base(timer, &flags); + + return ret; + +} +EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); + +/** + * hrtimer_cancel - cancel a timer and wait for the handler to finish.
+ * @timer: the timer to be cancelled + * + * Retries hrtimer_try_to_cancel(), calling cpu_relax() in between, for as + * long as it reports that the callback is running. + * + * Returns: + * 0 when the timer was not active + * 1 when the timer was active + */ +int hrtimer_cancel(struct hrtimer *timer) +{ + for (;;) { + int ret = hrtimer_try_to_cancel(timer); + + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(hrtimer_cancel); + +/** + * hrtimer_get_remaining - get remaining time for the timer + * @timer: the timer to read + * + * The remaining time is read with the timer's base locked. + */ +ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +{ + unsigned long flags; + ktime_t rem; + + lock_hrtimer_base(timer, &flags); + rem = hrtimer_expires_remaining(timer); + unlock_hrtimer_base(timer, &flags); + + return rem; +} +EXPORT_SYMBOL_GPL(hrtimer_get_remaining); + +#ifdef CONFIG_NO_HZ +/** + * hrtimer_get_next_event - get the time until next expiry event + * + * Returns the delta to the next expiry event or KTIME_MAX if no timer + * is pending. A negative delta is reported as 0. The queues are only + * scanned when high resolution mode is not active; otherwise KTIME_MAX + * is returned. + */ +ktime_t hrtimer_get_next_event(void) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base = cpu_base->clock_base; + ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; + unsigned long flags; + int i; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + if (!hrtimer_hres_active()) { + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + struct hrtimer *timer; + struct timerqueue_node *next; + + next = timerqueue_getnext(&base->active); + if (!next) + continue; + + timer = container_of(next, struct hrtimer, node); + delta.tv64 = hrtimer_get_expires_tv64(timer); + delta = ktime_sub(delta, base->get_time()); + if (delta.tv64 < mindelta.tv64) + mindelta.tv64 = delta.tv64; + } + } + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + + if (mindelta.tv64 < 0) + mindelta.tv64 = 0; + return mindelta; +} +#endif + +static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + struct hrtimer_cpu_base *cpu_base; + int base; + + memset(timer, 0, sizeof(struct hrtimer)); + + cpu_base =
&__raw_get_cpu_var(hrtimer_bases); + + if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) + clock_id = CLOCK_MONOTONIC; + + base = hrtimer_clockid_to_base(clock_id); + timer->base = &cpu_base->clock_base[base]; + timerqueue_init(&timer->node); + +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif +} + +/** + * hrtimer_init - initialize a timer to the given clock + * @timer: the timer to be initialized + * @clock_id: the clock to be used + * @mode: timer mode abs/rel + * + * A CLOCK_REALTIME timer whose @mode is not HRTIMER_MODE_ABS is placed on + * the CLOCK_MONOTONIC base instead. + */ +void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, + enum hrtimer_mode mode) +{ + debug_init(timer, clock_id, mode); + __hrtimer_init(timer, clock_id, mode); +} +EXPORT_SYMBOL_GPL(hrtimer_init); + +/** + * hrtimer_get_res - get the timer resolution for a clock + * @which_clock: which clock to query + * @tp: pointer to timespec variable to store the resolution + * + * Store the resolution of the clock selected by @which_clock in the + * variable pointed to by @tp. Always returns 0. + */ +int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) +{ + struct hrtimer_cpu_base *cpu_base; + int base = hrtimer_clockid_to_base(which_clock); + + cpu_base = &__raw_get_cpu_var(hrtimer_bases); + *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); + + return 0; +} +EXPORT_SYMBOL_GPL(hrtimer_get_res); + +static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +{ + struct hrtimer_clock_base *base = timer->base; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + enum hrtimer_restart (*fn)(struct hrtimer *); + int restart; + + WARN_ON(!irqs_disabled()); + + debug_deactivate(timer); + __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + timer_stats_account_hrtimer(timer); + fn = timer->function; + + /* + * Because we run timers from hardirq context, there is no chance + * they get migrated to another cpu, therefore it's safe to unlock + * the timer base.
+ */ + raw_spin_unlock(&cpu_base->lock); + trace_hrtimer_expire_entry(timer, now); + restart = fn(timer); + trace_hrtimer_expire_exit(timer); + raw_spin_lock(&cpu_base->lock); + + /* + * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * we do not reprogram the event hardware. Happens either in + * hrtimer_start_range_ns() or in hrtimer_interrupt() + */ + if (restart != HRTIMER_NORESTART) { + BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + enqueue_hrtimer(timer, base); + } + + WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + + timer->state &= ~HRTIMER_STATE_CALLBACK; +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int i, retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + entry_time = now = ktime_get(); +retry: + expires_next.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU.
+ */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + struct hrtimer_clock_base *base; + struct timerqueue_node *node; + ktime_t basenow; + + if (!(cpu_base->active_bases & (1 << i))) + continue; + + base = cpu_base->clock_base + i; + basenow = ktime_add(now, base->offset); + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + + /* + * The immediate goal for using the softexpires is + * minimizing wakeups, not running timers at the + * earliest interrupt after their soft expiration. + * This allows us to avoid using a Priority Search + * Tree, which can answer a stabbing query for + * overlapping intervals and instead use the simple + * BST we already have. + * We don't add extra wakeups by delaying timers that + * are right-of a not yet expired timer, because that + * timer will have to trigger a wakeup anyway. + */ + + if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) { + ktime_t expires; + + expires = ktime_sub(hrtimer_get_expires(timer), + base->offset); + if (expires.tv64 < expires_next.tv64) + expires_next = expires; + break; + } + + __run_hrtimer(timer, &basenow); + } + } + + /* + * Store the new expiry value so the migration code can verify + * against it. + */ + cpu_base->expires_next = expires_next; + raw_spin_unlock(&cpu_base->lock); + + /* Reprogramming necessary ? */ + if (expires_next.tv64 == KTIME_MAX || + !tick_program_event(expires_next, 0)) { + cpu_base->hang_detected = 0; + return; + } + + /* + * The next timer was already expired due to: + * - tracing + * - long lasting callbacks + * - being scheduled away when running in a VM + * + * We need to prevent that we loop forever in the hrtimer + * interrupt routine. We give it 3 attempts to avoid + * overreacting on some spurious event.
+ */ + now = ktime_get(); + cpu_base->nr_retries++; + if (++retries < 3) + goto retry; + /* + * Give the system a chance to do something else than looping + * here. We stored the entry time, so we know exactly how long + * we spent here. We schedule the next event this amount of + * time away. + */ + cpu_base->nr_hangs++; + cpu_base->hang_detected = 1; + delta = ktime_sub(now, entry_time); + if (delta.tv64 > cpu_base->max_hang_time.tv64) + cpu_base->max_hang_time = delta; + /* + * Limit it to a sensible value as we enforce a longer + * delay: the next event is programmed at most 100ms away. + */ + if (delta.tv64 > 100 * NSEC_PER_MSEC) + expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + else + expires_next = ktime_add(now, delta); + tick_program_event(expires_next, 1); + printk_once(KERN_WARNING "hrtimer: interrupt took %llu ns\n", + ktime_to_ns(delta)); +} + +/* + * local version of hrtimer_peek_ahead_timers() called with interrupts + * disabled. Does nothing unless high resolution mode is active and the + * per-cpu tick device has an event device. + */ +static void __hrtimer_peek_ahead_timers(void) +{ + struct tick_device *td; + + if (!hrtimer_hres_active()) + return; + + td = &__get_cpu_var(tick_cpu_device); + if (td && td->evtdev) + hrtimer_interrupt(td->evtdev); +} + +/** + * hrtimer_peek_ahead_timers -- run soft-expired timers now + * + * hrtimer_peek_ahead_timers will peek at the timer queue of + * the current cpu and check if there are any timers for which + * the soft expires time has passed. If any such timers exist, + * they are run immediately and then removed from the timer queue.
+ * + */ +void hrtimer_peek_ahead_timers(void) +{ + unsigned long flags; + + local_irq_save(flags); + __hrtimer_peek_ahead_timers(); + local_irq_restore(flags); +} + +static void run_hrtimer_softirq(struct softirq_action *h) +{ + hrtimer_peek_ahead_timers(); +} + +#else /* CONFIG_HIGH_RES_TIMERS */ + +static inline void __hrtimer_peek_ahead_timers(void) { } + +#endif /* !CONFIG_HIGH_RES_TIMERS */ + +/* + * Called from timer softirq every jiffy, expire hrtimers: + * + * For HRT it's the fallback code to run the softirq in the timer + * softirq context in case the hrtimer initialization failed or has + * not been done yet. + */ +void hrtimer_run_pending(void) +{ + if (hrtimer_hres_active()) + return; + + /* + * This _is_ ugly: We have to check in the softirq context, + * whether we can switch to highres and / or nohz mode. The + * clocksource switch happens in the timer interrupt with + * xtime_lock held. Notification from there only sets the + * check bit in the tick_oneshot code, otherwise we might + * deadlock vs. xtime_lock.
+ */ + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + hrtimer_switch_to_hres(); +} + +/* + * Called from hardirq context every jiffy. Does nothing when high + * resolution mode is active. + */ +void hrtimer_run_queues(void) +{ + struct timerqueue_node *node; + struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_clock_base *base; + int index, gettime = 1; + + if (hrtimer_hres_active()) + return; + + for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { + base = &cpu_base->clock_base[index]; + if (!timerqueue_getnext(&base->active)) + continue; + + if (gettime) { + hrtimer_get_softirq_time(cpu_base); + gettime = 0; + } + + raw_spin_lock(&cpu_base->lock); + + while ((node = timerqueue_getnext(&base->active))) { + struct hrtimer *timer; + + timer = container_of(node, struct hrtimer, node); + if (base->softirq_time.tv64 <= + hrtimer_get_expires_tv64(timer)) + break; + + __run_hrtimer(timer, &base->softirq_time); + } + raw_spin_unlock(&cpu_base->lock); + } +} + +/* + * Sleep related functions: + */ +static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) +{ + struct hrtimer_sleeper *t = + container_of(timer, struct hrtimer_sleeper, timer); + struct task_struct *task = t->task; + + t->task = NULL; + if (task) + wake_up_process(task); + + return HRTIMER_NORESTART; +} + +void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) +{ + sl->timer.function = hrtimer_wakeup; + sl->task = task; +} +EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); + +static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) +{ + hrtimer_init_sleeper(t, current); + + do { + set_current_state(TASK_INTERRUPTIBLE); + hrtimer_start_expires(&t->timer, mode); + if (!hrtimer_active(&t->timer)) + t->task = NULL; + + if (likely(t->task)) + schedule(); + + hrtimer_cancel(&t->timer); + mode = HRTIMER_MODE_ABS; + + } while (t->task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + + return t->task == NULL; +} + +static int update_rmtp(struct
hrtimer *timer, struct timespec __user *rmtp) +{ + struct timespec rmt; + ktime_t rem; + + rem = hrtimer_expires_remaining(timer); + if (rem.tv64 <= 0) + return 0; + rmt = ktime_to_timespec(rem); + + if (copy_to_user(rmtp, &rmt, sizeof(*rmtp))) + return -EFAULT; + + return 1; +} + +long __sched hrtimer_nanosleep_restart(struct restart_block *restart) +{ + struct hrtimer_sleeper t; + struct timespec __user *rmtp; + int ret = 0; + + hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid, + HRTIMER_MODE_ABS); + hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); + + if (do_nanosleep(&t, HRTIMER_MODE_ABS)) + goto out; + + rmtp = restart->nanosleep.rmtp; + if (rmtp) { + ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + goto out; + } + + /* The other values in restart are already filled in */ + ret = -ERESTART_RESTARTBLOCK; +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +/* + * Sleep on @clockid for the time given in @rqtp, interpreted per @mode. + * + * Returns 0 when the sleep completed or no time remains, -ERESTARTNOHAND + * when an absolute sleep was interrupted, -EFAULT when the remaining time + * could not be copied to @rmtp, and otherwise -ERESTART_RESTARTBLOCK with + * the restart block set up to resume in hrtimer_nanosleep_restart(). + */ +long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, + const enum hrtimer_mode mode, const clockid_t clockid) +{ + struct restart_block *restart; + struct hrtimer_sleeper t; + int ret = 0; + unsigned long slack; + + slack = current->timer_slack_ns; + if (rt_task(current)) + slack = 0; + + hrtimer_init_on_stack(&t.timer, clockid, mode); + hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); + if (do_nanosleep(&t, mode)) + goto out; + + /* Absolute timers do not update the rmtp value and restart: */ + if (mode == HRTIMER_MODE_ABS) { + ret = -ERESTARTNOHAND; + goto out; + } + + if (rmtp) { + ret = update_rmtp(&t.timer, rmtp); + if (ret <= 0) + goto out; + } + + restart = &current_thread_info()->restart_block; + restart->fn = hrtimer_nanosleep_restart; + restart->nanosleep.clockid = t.timer.base->clockid; + restart->nanosleep.rmtp = rmtp; + restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); + + ret = -ERESTART_RESTARTBLOCK; +out: + destroy_hrtimer_on_stack(&t.timer); + return ret; +} + +SYSCALL_DEFINE2(nanosleep, struct
timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct timespec tu; + + if (copy_from_user(&tu, rqtp, sizeof(tu))) + return -EFAULT; + + if (!timespec_valid(&tu)) + return -EINVAL; + + return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); +} + +/* + * Functions related to boot-time initialization: + */ +static void __cpuinit init_hrtimers_cpu(int cpu) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + raw_spin_lock_init(&cpu_base->lock); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + cpu_base->clock_base[i].cpu_base = cpu_base; + timerqueue_init_head(&cpu_base->clock_base[i].active); + } + + hrtimer_init_hres(cpu_base); +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, + struct hrtimer_clock_base *new_base) +{ + struct hrtimer *timer; + struct timerqueue_node *node; + + while ((node = timerqueue_getnext(&old_base->active))) { + timer = container_of(node, struct hrtimer, node); + BUG_ON(hrtimer_callback_running(timer)); + debug_deactivate(timer); + + /* + * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * timer could be seen as !active and just vanish away + * under us on another CPU + */ + __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); + timer->base = new_base; + /* + * Enqueue the timers on the new cpu. This does not + * reprogram the event device in case the timer + * expires before the earliest on this CPU, but we run + * hrtimer_interrupt after we migrated everything to + * sort out already expired timers and reprogram the + * event device.
+ */ + enqueue_hrtimer(timer, new_base); + + /* Clear the migration state bit */ + timer->state &= ~HRTIMER_STATE_MIGRATE; + } +} + +static void migrate_hrtimers(int scpu) +{ + struct hrtimer_cpu_base *old_base, *new_base; + int i; + + BUG_ON(cpu_online(scpu)); + tick_cancel_sched_timer(scpu); + + local_irq_disable(); + old_base = &per_cpu(hrtimer_bases, scpu); + new_base = &__get_cpu_var(hrtimer_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. + */ + raw_spin_lock(&new_base->lock); + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + migrate_hrtimer_list(&old_base->clock_base[i], + &new_base->clock_base[i]); + } + + raw_spin_unlock(&old_base->lock); + raw_spin_unlock(&new_base->lock); + + /* Check, if we got expired work to do */ + __hrtimer_peek_ahead_timers(); + local_irq_enable(); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + int scpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + init_hrtimers_cpu(scpu); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + case CPU_DYING_FROZEN: + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + { + clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); + migrate_hrtimers(scpu); + break; + } +#endif + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata hrtimers_nb = { + .notifier_call = hrtimer_cpu_notify, +}; + +void __init hrtimers_init(void) +{ + hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&hrtimers_nb); +#ifdef CONFIG_HIGH_RES_TIMERS + open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); +#endif +} + +/** + * schedule_hrtimeout_range_clock - sleep until
timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (unsigned long) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME + * + * A zero @expires returns 0 immediately. A NULL @expires sleeps without + * a timeout and returns -EINTR. Otherwise returns 0 when the sleeper's + * task pointer was cleared (timer expired or never became active), + * else -EINTR. + */ +int __sched +schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode, int clock) +{ + struct hrtimer_sleeper t; + + /* + * Optimize when a zero timeout value is given. It does not + * matter whether this is an absolute or a relative time. + */ + if (expires && !expires->tv64) { + __set_current_state(TASK_RUNNING); + return 0; + } + + /* + * A NULL parameter means "infinite" + */ + if (!expires) { + schedule(); + __set_current_state(TASK_RUNNING); + return -EINTR; + } + + hrtimer_init_on_stack(&t.timer, clock, mode); + hrtimer_set_expires_range_ns(&t.timer, *expires, delta); + + hrtimer_init_sleeper(&t, current); + + hrtimer_start_expires(&t.timer, mode); + if (!hrtimer_active(&t.timer)) + t.task = NULL; + + if (likely(t.task)) + schedule(); + + hrtimer_cancel(&t.timer); + destroy_hrtimer_on_stack(&t.timer); + + __set_current_state(TASK_RUNNING); + + return !t.task ? 0 : -EINTR; +} + +/** + * schedule_hrtimeout_range - sleep until timeout + * @expires: timeout value (ktime_t) + * @delta: slack in expires timeout (unsigned long) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * The @delta argument gives the kernel the freedom to schedule the + * actual wakeup to a time that is both power and performance friendly. + * The kernel gives the normal best effort behavior for "@expires+@delta", + * but may decide to fire the timer earlier, but no earlier than @expires.
+ * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * The timeout is measured against CLOCK_MONOTONIC. + * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range_clock(expires, delta, mode, + CLOCK_MONOTONIC); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); + +/** + * schedule_hrtimeout - sleep until timeout + * @expires: timeout value (ktime_t) + * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * + * Make the current task sleep until the given expiry time has + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @expires time is guaranteed to + * pass before the routine returns. + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns.
+ * + * Returns 0 when the timer has expired otherwise -EINTR + */ +int __sched schedule_hrtimeout(ktime_t *expires, + const enum hrtimer_mode mode) +{ + return schedule_hrtimeout_range(expires, 0, mode); +} +EXPORT_SYMBOL_GPL(schedule_hrtimeout); diff --git a/kernel/hung_task.c b/kernel/hung_task.c new file mode 100644 index 00000000..e972276f --- /dev/null +++ b/kernel/hung_task.c @@ -0,0 +1,223 @@ +/* + * Detect Hung Task + * + * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The number of tasks checked: + */ +unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; + +/* + * Limit number of tasks checked in a batch. + * + * This value controls the preemptibility of khungtaskd since preemption + * is disabled during the critical section. It also controls the size of + * the RCU grace period. So it needs to be upper-bound. + */ +#define HUNG_TASK_BATCHING 1024 + +/* + * Zero means infinite timeout - no checking done: + */ +unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT; + +unsigned long __read_mostly sysctl_hung_task_warnings = 10; + +static int __read_mostly did_panic; + +static struct task_struct *watchdog_task; + +/* + * Should we panic (and reboot, if panic_timeout= is set) when a + * hung task is detected: + */ +unsigned int __read_mostly sysctl_hung_task_panic = + CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; + +static int __init hung_task_panic_setup(char *str) +{ + sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("hung_task_panic=", hung_task_panic_setup); + +static int +hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) +{ + did_panic = 1; + + return NOTIFY_DONE; +} + +static struct notifier_block panic_block = { + .notifier_call = hung_task_panic, +}; + +static void check_hung_task(struct 
task_struct *t, unsigned long timeout) +{ + unsigned long switch_count = t->nvcsw + t->nivcsw; + + /* + * Ensure the task is not frozen. + * Also, skip vfork and any other user process that freezer should skip. + */ + if (unlikely(t->flags & (PF_FROZEN | PF_FREEZER_SKIP))) + return; + + /* + * When a freshly created task is scheduled once, changes its state to + * TASK_UNINTERRUPTIBLE without having ever been switched out once, it + * mustn't be checked. + */ + if (unlikely(!switch_count)) + return; + + /* The task was context-switched since the previous scan: record the new count. */ + if (switch_count != t->last_switch_count) { + t->last_switch_count = switch_count; + return; + } + /* Out of warning budget: stay silent. */ + if (!sysctl_hung_task_warnings) + return; + sysctl_hung_task_warnings--; + + /* + * Ok, the task did not get scheduled for more than @timeout seconds, + * complain: + */ + printk(KERN_ERR "INFO: task %s:%d blocked for more than " + "%lu seconds.\n", t->comm, t->pid, timeout); + printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); + sched_show_task(t); + debug_show_held_locks(t); + + touch_nmi_watchdog(); + + if (sysctl_hung_task_panic) + panic("hung_task: blocked tasks"); +} + +/* + * To avoid extending the RCU grace period for an unbounded amount of time, + * periodically exit the critical section and enter a new one. + * + * For preemptible RCU it is sufficient to call rcu_read_unlock in order + * to exit the grace period. For classic RCU, a reschedule is required. + * + * Returns false if @g or @t is no longer alive according to pid_alive(), + * in which case the caller must stop walking the thread list. The check + * is done while the references taken here are still held, so neither + * task is touched after its reference has been dropped. + */ +static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) +{ + bool can_cont; + + get_task_struct(g); + get_task_struct(t); + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + can_cont = pid_alive(g) && pid_alive(t); + put_task_struct(t); + put_task_struct(g); + + return can_cont; +} + +/* + * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for + * a really long time (@timeout seconds). If that happens, print out + * a warning. 
+ */ +static void check_hung_uninterruptible_tasks(unsigned long timeout) +{ + int max_count = sysctl_hung_task_check_count; + int batch_count = HUNG_TASK_BATCHING; + struct task_struct *g, *t; + + /* + * If the system crashed already then all bets are off, + * do not report extra hung tasks: + */ + if (test_taint(TAINT_DIE) || did_panic) + return; + + rcu_read_lock(); + do_each_thread(g, t) { + if (!max_count--) + goto unlock; + if (!--batch_count) { + batch_count = HUNG_TASK_BATCHING; + /* Exit if t or g was unhashed during refresh. */ + if (!rcu_lock_break(g, t)) + goto unlock; + } + /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ + if (t->state == TASK_UNINTERRUPTIBLE) + check_hung_task(t, timeout); + } while_each_thread(g, t); + unlock: + rcu_read_unlock(); +} + +static unsigned long timeout_jiffies(unsigned long timeout) +{ + /* timeout of 0 will disable the watchdog */ + return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; +} + +/* + * Process updating of timeout sysctl + * + * After a successful write the watchdog thread is woken so that it + * picks up the new timeout. Returns the proc_doulongvec_minmax() result. + */ +int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); + + if (ret || !write) + goto out; + + /* watchdog_task stays NULL if the thread could not be created. */ + if (watchdog_task) + wake_up_process(watchdog_task); + + out: + return ret; +} + +/* + * kthread which checks for tasks stuck in D state + */ +static int watchdog(void *dummy) +{ + set_user_nice(current, 0); + + for ( ; ; ) { + unsigned long timeout = sysctl_hung_task_timeout_secs; + + while (schedule_timeout_interruptible(timeout_jiffies(timeout))) + timeout = sysctl_hung_task_timeout_secs; + + check_hung_uninterruptible_tasks(timeout); + } + + return 0; +} + +/* + * Register the panic notifier and start the khungtaskd thread. + * + * Returns 0 on success or the kthread_run() error code; on failure the + * notifier is unregistered again and watchdog_task is left NULL. + */ +static int __init hung_task_init(void) +{ + struct task_struct *task; + + atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + /* kthread_run() reports failure with an ERR_PTR(), never with NULL. */ + task = kthread_run(watchdog, NULL, "khungtaskd"); + if (IS_ERR(task)) { + atomic_notifier_chain_unregister(&panic_notifier_list, + &panic_block); + return PTR_ERR(task); + } + watchdog_task = task; + + return 0; +} + +module_init(hung_task_init); diff --git 
a/kernel/irq/Kconfig b/kernel/irq/Kconfig new file mode 100644 index 00000000..d1d051b3 --- /dev/null +++ b/kernel/irq/Kconfig @@ -0,0 +1,74 @@ +# Select this to activate the generic irq options below +config HAVE_GENERIC_HARDIRQS + bool + +if HAVE_GENERIC_HARDIRQS +menu "IRQ subsystem" +# +# Interrupt subsystem related configuration options +# +config GENERIC_HARDIRQS + def_bool y + +# Options selectable by the architecture code + +# Make sparse irq Kconfig switch below available +config HAVE_SPARSE_IRQ + bool + +# Enable the generic irq autoprobe mechanism +config GENERIC_IRQ_PROBE + bool + +# Use the generic /proc/interrupts implementation +config GENERIC_IRQ_SHOW + bool + +# Print level/edge extra information +config GENERIC_IRQ_SHOW_LEVEL + bool + +# Support for delayed migration from interrupt context +config GENERIC_PENDING_IRQ + bool + +# Alpha specific irq affinity mechanism +config AUTO_IRQ_AFFINITY + bool + +# Tasklet based software resend for pending interrupts on enable_irq() +config HARDIRQS_SW_RESEND + bool + +# Preflow handler support for fasteoi (sparc64) +config IRQ_PREFLOW_FASTEOI + bool + +# Edge style eoi based handler (cell) +config IRQ_EDGE_EOI_HANDLER + bool + +# Generic configurable interrupt chip implementation +config GENERIC_IRQ_CHIP + bool + +# Support forced irq threading +config IRQ_FORCED_THREADING + bool + +config SPARSE_IRQ + bool "Support sparse irq numbering" + depends on HAVE_SPARSE_IRQ + ---help--- + + Sparse irq numbering is useful for distro kernels that want + to define a high CONFIG_NR_CPUS value but still want to have + low kernel memory footprint on smaller machines. + + ( Sparse irqs can also be beneficial on NUMA boxes, as they spread + out the interrupt descriptors in a more NUMA-friendly way. ) + + If you don't know what to do here, say N. 
+ +endmenu +endif diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile new file mode 100644 index 00000000..73290056 --- /dev/null +++ b/kernel/irq/Makefile @@ -0,0 +1,7 @@ + +obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o +obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o +obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o +obj-$(CONFIG_PROC_FS) += proc.o +obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o +obj-$(CONFIG_PM_SLEEP) += pm.o diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c new file mode 100644 index 00000000..0119b9d4 --- /dev/null +++ b/kernel/irq/autoprobe.c @@ -0,0 +1,185 @@ +/* + * linux/kernel/irq/autoprobe.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the interrupt probing code and driver APIs. + */ + +#include +#include +#include +#include +#include + +#include "internals.h" + +/* + * Autodetection depends on the fact that any interrupt that + * comes in on to an unassigned handler will get stuck with + * "IRQS_WAITING" cleared and the interrupt disabled. + */ +static DEFINE_MUTEX(probing_active); + +/** + * probe_irq_on - begin an interrupt autodetect + * + * Commence probing for an interrupt. The interrupts are scanned + * and a mask of potential interrupt lines is returned. + * + */ +unsigned long probe_irq_on(void) +{ + struct irq_desc *desc; + unsigned long mask = 0; + int i; + + /* + * quiesce the kernel, or at least the asynchronous portion + */ + async_synchronize_full(); + mutex_lock(&probing_active); + /* + * something may have generated an irq long ago and we want to + * flush such a longstanding irq before considering it as spurious. 
+ */ + for_each_irq_desc_reverse(i, desc) { + raw_spin_lock_irq(&desc->lock); + if (!desc->action && irq_settings_can_probe(desc)) { + /* + * Some chips need to know about probing in + * progress: + */ + if (desc->irq_data.chip->irq_set_type) + desc->irq_data.chip->irq_set_type(&desc->irq_data, + IRQ_TYPE_PROBE); + irq_startup(desc, false); + } + raw_spin_unlock_irq(&desc->lock); + } + + /* Wait for longstanding interrupts to trigger. */ + msleep(20); + + /* + * enable any unassigned irqs + * (we must startup again here because if a longstanding irq + * happened in the previous stage, it may have masked itself) + */ + for_each_irq_desc_reverse(i, desc) { + raw_spin_lock_irq(&desc->lock); + if (!desc->action && irq_settings_can_probe(desc)) { + desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; + if (irq_startup(desc, false)) + desc->istate |= IRQS_PENDING; + } + raw_spin_unlock_irq(&desc->lock); + } + + /* + * Wait for spurious interrupts to trigger + */ + msleep(100); + + /* + * Now filter out any obviously spurious interrupts + */ + for_each_irq_desc(i, desc) { + raw_spin_lock_irq(&desc->lock); + + if (desc->istate & IRQS_AUTODETECT) { + /* It triggered already - consider it spurious. */ + if (!(desc->istate & IRQS_WAITING)) { + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); + } else + /* + * Only irqs 0-31 are reported in the mask. + * Use 1UL: an int shifted into bit 31 would + * be sign-extended into the upper bits of + * the unsigned long mask on 64-bit. + */ + if (i < 32) + mask |= 1UL << i; + } + raw_spin_unlock_irq(&desc->lock); + } + + return mask; +} +EXPORT_SYMBOL(probe_irq_on); + +/** + * probe_irq_mask - scan a bitmap of interrupt lines + * @val: mask of interrupts to consider + * + * Scan the interrupt lines and return a bitmap of active + * autodetect interrupts. The interrupt probe logic state + * is then returned to its previous value. + * + * Note: we need to scan all the irq's even though we will + * only return autodetect irq numbers - just so that we reset + * them all to a known state. 
+ */ +unsigned int probe_irq_mask(unsigned long val) +{ + unsigned int mask = 0; + struct irq_desc *desc; + int i; + + for_each_irq_desc(i, desc) { + raw_spin_lock_irq(&desc->lock); + if (desc->istate & IRQS_AUTODETECT) { + /* Only irqs 0-15 that are no longer waiting are reported. */ + if (i < 16 && !(desc->istate & IRQS_WAITING)) + mask |= 1 << i; + + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); + } + raw_spin_unlock_irq(&desc->lock); + } + /* Ends the probe: drop the probing_active mutex. */ + mutex_unlock(&probing_active); + + return mask & val; +} +EXPORT_SYMBOL(probe_irq_mask); + +/** + * probe_irq_off - end an interrupt autodetect + * @val: mask of potential interrupts (unused) + * + * Scans the unused interrupt lines and returns the line which + * appears to have triggered the interrupt. If no interrupt was + * found then zero is returned. If more than one interrupt is + * found then minus the first candidate is returned to indicate + * there is doubt. + * + * The interrupt probe logic state is returned to its previous + * value. + * + * BUGS: When used in a module (which arguably shouldn't happen) + * nothing prevents two IRQ probe callers from overlapping. The + * results of this are non-optimal. 
+ */ +int probe_irq_off(unsigned long val) +{ + int i, irq_found = 0, nr_of_irqs = 0; + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { + raw_spin_lock_irq(&desc->lock); + + if (desc->istate & IRQS_AUTODETECT) { + if (!(desc->istate & IRQS_WAITING)) { + /* Remember only the first line that triggered. */ + if (!nr_of_irqs) + irq_found = i; + nr_of_irqs++; + } + desc->istate &= ~IRQS_AUTODETECT; + irq_shutdown(desc); + } + raw_spin_unlock_irq(&desc->lock); + } + /* Ends the probe: drop the probing_active mutex. */ + mutex_unlock(&probing_active); + + /* More than one candidate: negate the first one to signal doubt. */ + if (nr_of_irqs > 1) + irq_found = -irq_found; + + return irq_found; +} +EXPORT_SYMBOL(probe_irq_off); + diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c new file mode 100644 index 00000000..990965ec --- /dev/null +++ b/kernel/irq/chip.c @@ -0,0 +1,700 @@ +/* + * linux/kernel/irq/chip.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the core interrupt handling code, for irq-chip + * based architectures. + * + * Detailed information is available in Documentation/DocBook/genericirq + */ + +#include +#include +#include +#include +#include + +#include "internals.h" + +/** + * irq_set_chip - set the irq chip for an irq + * @irq: irq number + * @chip: pointer to irq chip description structure + * + * A NULL @chip selects no_irq_chip. Returns -EINVAL if no descriptor + * could be obtained for @irq. + */ +int irq_set_chip(unsigned int irq, struct irq_chip *chip) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return -EINVAL; + + if (!chip) + chip = &no_irq_chip; + + desc->irq_data.chip = chip; + irq_put_desc_unlock(desc, flags); + /* + * For !CONFIG_SPARSE_IRQ make the irq show up in + * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is + * already marked, and this call is harmless. 
+ */ + irq_reserve_irq(irq); + return 0; +} +EXPORT_SYMBOL(irq_set_chip); + +/** + * irq_set_irq_type - set the irq trigger type for an irq + * @irq: irq number + * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h + * + * Only the IRQ_TYPE_SENSE_MASK bits of @type are used. Returns the + * result of __irq_set_trigger(), or -EINVAL if no descriptor could be + * obtained for @irq. + */ +int irq_set_irq_type(unsigned int irq, unsigned int type) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + int ret = 0; + + if (!desc) + return -EINVAL; + + type &= IRQ_TYPE_SENSE_MASK; + ret = __irq_set_trigger(desc, irq, type); + irq_put_desc_busunlock(desc, flags); + return ret; +} +EXPORT_SYMBOL(irq_set_irq_type); + +/** + * irq_set_handler_data - set irq handler data for an irq + * @irq: Interrupt number + * @data: Pointer to interrupt specific data + * + * Store @data in the irq_data.handler_data field of the irq. Returns 0 + * on success or -EINVAL if no descriptor could be obtained for @irq. + */ +int irq_set_handler_data(unsigned int irq, void *data) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return -EINVAL; + desc->irq_data.handler_data = data; + irq_put_desc_unlock(desc, flags); + return 0; +} +EXPORT_SYMBOL(irq_set_handler_data); + +/** + * irq_set_msi_desc - set MSI descriptor data for an irq + * @irq: Interrupt number + * @entry: Pointer to MSI descriptor data + * + * Set the MSI descriptor entry for an irq. A non-NULL @entry also gets + * its irq field set to @irq. Returns 0 on success or -EINVAL if no + * descriptor could be obtained for @irq. + */ +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return -EINVAL; + desc->irq_data.msi_desc = entry; + if (entry) + entry->irq = irq; + irq_put_desc_unlock(desc, flags); + return 0; +} + +/** + * irq_set_chip_data - set irq chip data for an irq + * @irq: Interrupt number + * @data: Pointer to chip specific data + * + * Set the hardware irq chip data for an irq. Returns -EINVAL if no + * descriptor could be obtained for @irq. + */ +int irq_set_chip_data(unsigned int irq, void *data) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return -EINVAL; + desc->irq_data.chip_data = data; + irq_put_desc_unlock(desc, 
flags); + return 0; +} +EXPORT_SYMBOL(irq_set_chip_data); + +struct irq_data *irq_get_irq_data(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return desc ? &desc->irq_data : NULL; +} +EXPORT_SYMBOL_GPL(irq_get_irq_data); + +static void irq_state_clr_disabled(struct irq_desc *desc) +{ + irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED); +} + +static void irq_state_set_disabled(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); +} + +static void irq_state_clr_masked(struct irq_desc *desc) +{ + irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED); +} + +static void irq_state_set_masked(struct irq_desc *desc) +{ + irqd_set(&desc->irq_data, IRQD_IRQ_MASKED); +} + +int irq_startup(struct irq_desc *desc, bool resend) +{ + int ret = 0; + + irq_state_clr_disabled(desc); + desc->depth = 0; + + if (desc->irq_data.chip->irq_startup) { + ret = desc->irq_data.chip->irq_startup(&desc->irq_data); + irq_state_clr_masked(desc); + } else { + irq_enable(desc); + } + if (resend) + check_irq_resend(desc, desc->irq_data.irq); + return ret; +} + +void irq_shutdown(struct irq_desc *desc) +{ + irq_state_set_disabled(desc); + desc->depth = 1; + if (desc->irq_data.chip->irq_shutdown) + desc->irq_data.chip->irq_shutdown(&desc->irq_data); + else if (desc->irq_data.chip->irq_disable) + desc->irq_data.chip->irq_disable(&desc->irq_data); + else + desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_state_set_masked(desc); +} + +void irq_enable(struct irq_desc *desc) +{ + irq_state_clr_disabled(desc); + if (desc->irq_data.chip->irq_enable) + desc->irq_data.chip->irq_enable(&desc->irq_data); + else + desc->irq_data.chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); +} + +void irq_disable(struct irq_desc *desc) +{ + irq_state_set_disabled(desc); + if (desc->irq_data.chip->irq_disable) { + desc->irq_data.chip->irq_disable(&desc->irq_data); + irq_state_set_masked(desc); + } +} + +static inline void mask_ack_irq(struct irq_desc *desc) +{ + if 
(desc->irq_data.chip->irq_mask_ack) + desc->irq_data.chip->irq_mask_ack(&desc->irq_data); + else { + desc->irq_data.chip->irq_mask(&desc->irq_data); + if (desc->irq_data.chip->irq_ack) + desc->irq_data.chip->irq_ack(&desc->irq_data); + } + irq_state_set_masked(desc); +} + +void mask_irq(struct irq_desc *desc) +{ + if (desc->irq_data.chip->irq_mask) { + desc->irq_data.chip->irq_mask(&desc->irq_data); + irq_state_set_masked(desc); + } +} + +void unmask_irq(struct irq_desc *desc) +{ + if (desc->irq_data.chip->irq_unmask) { + desc->irq_data.chip->irq_unmask(&desc->irq_data); + irq_state_clr_masked(desc); + } +} + +/* + * handle_nested_irq - Handle a nested irq from a irq thread + * @irq: the interrupt number + * + * Handle interrupts which are nested into a threaded interrupt + * handler. The handler function is called inside the calling + * threads context. + */ +void handle_nested_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + irqreturn_t action_ret; + + might_sleep(); + + raw_spin_lock_irq(&desc->lock); + + kstat_incr_irqs_this_cpu(irq, desc); + + action = desc->action; + if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) + goto out_unlock; + + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + raw_spin_unlock_irq(&desc->lock); + + action_ret = action->thread_fn(action->irq, action->dev_id); + if (!noirqdebug) + note_interrupt(irq, desc, action_ret); + + raw_spin_lock_irq(&desc->lock); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + +out_unlock: + raw_spin_unlock_irq(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_nested_irq); + +static bool irq_check_poll(struct irq_desc *desc) +{ + if (!(desc->istate & IRQS_POLL_INPROGRESS)) + return false; + return irq_wait_for_poll(desc); +} + +/** + * handle_simple_irq - Simple and software-decoded IRQs. 
+ * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Simple interrupts are either sent from a demultiplexing interrupt + * handler or come from hardware, where no interrupt hardware control + * is necessary. + * + * Note: The caller is expected to handle the ack, clear, mask and + * unmask issues if necessary. + */ +void +handle_simple_irq(unsigned int irq, struct irq_desc *desc) +{ + raw_spin_lock(&desc->lock); + + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) + if (!irq_check_poll(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + kstat_incr_irqs_this_cpu(irq, desc); + + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + goto out_unlock; + + handle_irq_event(desc); + +out_unlock: + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_simple_irq); + +/* + * Called unconditionally from handle_level_irq() and only for oneshot + * interrupts from handle_fasteoi_irq() + */ +static void cond_unmask_irq(struct irq_desc *desc) +{ + /* + * We need to unmask in the following cases: + * - Standard level irq (IRQF_ONESHOT is not set) + * - Oneshot irq which did not wake the thread (caused by a + * spurious interrupt or a primary handler handling it + * completely). + */ + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data) && !desc->threads_oneshot) + unmask_irq(desc); +} + +/** + * handle_level_irq - Level type irq handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Level type interrupts are active as long as the hardware line has + * the active level. This may require to mask the interrupt and unmask + * it after the associated handler has acknowledged the device, so the + * interrupt line is back to inactive. 
+ */ +void +handle_level_irq(unsigned int irq, struct irq_desc *desc) +{ + raw_spin_lock(&desc->lock); + mask_ack_irq(desc); + + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) + if (!irq_check_poll(desc)) + goto out_unlock; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + kstat_incr_irqs_this_cpu(irq, desc); + + /* + * If its disabled or no action available + * keep it masked and get out of here + */ + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) + goto out_unlock; + + handle_irq_event(desc); + + cond_unmask_irq(desc); + +out_unlock: + raw_spin_unlock(&desc->lock); +} +EXPORT_SYMBOL_GPL(handle_level_irq); + +#ifdef CONFIG_IRQ_PREFLOW_FASTEOI +static inline void preflow_handler(struct irq_desc *desc) +{ + if (desc->preflow_handler) + desc->preflow_handler(&desc->irq_data); +} +#else +static inline void preflow_handler(struct irq_desc *desc) { } +#endif + +/** + * handle_fasteoi_irq - irq handler for transparent controllers + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Only a single callback will be issued to the chip: an ->eoi() + * call when the interrupt has been serviced. This enables support + * for modern forms of interrupt handlers, which handle the flow + * details in hardware, transparently. 
+ */ +void +handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) +{ + raw_spin_lock(&desc->lock); + + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) + if (!irq_check_poll(desc)) + goto out; + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + kstat_incr_irqs_this_cpu(irq, desc); + + /* + * If it's disabled or no action available + * then mask it and get out of here: + */ + if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { + desc->istate |= IRQS_PENDING; + mask_irq(desc); + goto out; + } + + /* Oneshot irqs are masked while the event is handled. */ + if (desc->istate & IRQS_ONESHOT) + mask_irq(desc); + + preflow_handler(desc); + handle_irq_event(desc); + + if (desc->istate & IRQS_ONESHOT) + cond_unmask_irq(desc); + +out_eoi: + desc->irq_data.chip->irq_eoi(&desc->irq_data); +out_unlock: + raw_spin_unlock(&desc->lock); + return; +out: + /* + * Not-handled exit: the eoi is skipped when the chip sets + * IRQCHIP_EOI_IF_HANDLED, otherwise it is still issued. + */ + if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED)) + goto out_eoi; + goto out_unlock; +} + +/** + * handle_edge_irq - edge type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Interrupt occurs on the falling and/or rising edge of a hardware + * signal. The occurrence is latched into the irq controller hardware + * and must be acked in order to be reenabled. After the ack another + * interrupt can happen on the same source even before the first one + * is handled by the associated event handler. If this happens it + * might be necessary to disable (mask) the interrupt depending on the + * controller hardware. This requires to reenable the interrupt inside + * of the loop which handles the interrupts which have arrived while + * the handler was running. If all pending interrupts are handled, the + * loop is left. + */ +void +handle_edge_irq(unsigned int irq, struct irq_desc *desc) +{ + raw_spin_lock(&desc->lock); + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. 
Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + mask_ack_irq(desc); + goto out_unlock; + } + } + kstat_incr_irqs_this_cpu(irq, desc); + + /* Start handling the irq */ + desc->irq_data.chip->irq_ack(&desc->irq_data); + + /* + * Keep handling events for as long as IRQS_PENDING is set in + * istate and the irq has not been disabled. + */ + do { + if (unlikely(!desc->action)) { + mask_irq(desc); + goto out_unlock; + } + + /* + * When another irq arrived while we were handling + * one, we could have masked the irq. + * Re-enable it, if it was not disabled in the meantime. + */ + if (unlikely(desc->istate & IRQS_PENDING)) { + if (!irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); + } + + handle_irq_event(desc); + + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); + +out_unlock: + raw_spin_unlock(&desc->lock); +} + +#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER +/** + * handle_edge_eoi_irq - edge eoi type IRQ handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Similar to handle_edge_irq above, but using eoi and w/o the + * mask/unmask logic. + */ +void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + raw_spin_lock(&desc->lock); + + desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); + /* + * If we're currently running this IRQ, or its disabled, + * we shouldn't process the IRQ. 
Mark it pending, handle + * the necessary masking and go out + */ + if (unlikely(irqd_irq_disabled(&desc->irq_data) || + irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { + if (!irq_check_poll(desc)) { + desc->istate |= IRQS_PENDING; + goto out_eoi; + } + } + kstat_incr_irqs_this_cpu(irq, desc); + + do { + if (unlikely(!desc->action)) + goto out_eoi; + + handle_irq_event(desc); + + } while ((desc->istate & IRQS_PENDING) && + !irqd_irq_disabled(&desc->irq_data)); + +out_eoi: + chip->irq_eoi(&desc->irq_data); + raw_spin_unlock(&desc->lock); +} +#endif + +/** + * handle_percpu_irq - Per CPU local irq handler + * @irq: the interrupt number + * @desc: the interrupt description structure for this irq + * + * Per CPU interrupts on SMP machines without locking requirements + */ +void +handle_percpu_irq(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + + kstat_incr_irqs_this_cpu(irq, desc); + + if (chip->irq_ack) + chip->irq_ack(&desc->irq_data); + + handle_irq_event_percpu(desc, desc->action); + + if (chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); +} + +void +__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, + const char *name) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return; + + if (!handle) { + handle = handle_bad_irq; + } else { + if (WARN_ON(desc->irq_data.chip == &no_irq_chip)) + goto out; + } + + /* Uninstall? 
*/ + if (handle == handle_bad_irq) { + if (desc->irq_data.chip != &no_irq_chip) + mask_ack_irq(desc); + irq_state_set_disabled(desc); + desc->depth = 1; + } + desc->handle_irq = handle; + desc->name = name; + + if (handle != handle_bad_irq && is_chained) { + irq_settings_set_noprobe(desc); + irq_settings_set_norequest(desc); + irq_settings_set_nothread(desc); + irq_startup(desc, true); + } +out: + irq_put_desc_busunlock(desc, flags); +} +EXPORT_SYMBOL_GPL(__irq_set_handler); + +void +irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, + irq_flow_handler_t handle, const char *name) +{ + irq_set_chip(irq, chip); + __irq_set_handler(irq, handle, 0, name); +} + +void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return; + irq_settings_clr_and_set(desc, clr, set); + + irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU | + IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT); + if (irq_settings_has_no_balance_set(desc)) + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + if (irq_settings_is_per_cpu(desc)) + irqd_set(&desc->irq_data, IRQD_PER_CPU); + if (irq_settings_can_move_pcntxt(desc)) + irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT); + if (irq_settings_is_level(desc)) + irqd_set(&desc->irq_data, IRQD_LEVEL); + + irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc)); + + irq_put_desc_unlock(desc, flags); +} +EXPORT_SYMBOL_GPL(irq_modify_status); + +/** + * irq_cpu_online - Invoke all irq_cpu_online functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_online() + * for each. 
+ */ +void irq_cpu_online(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + if (chip && chip->irq_cpu_online && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !irqd_irq_disabled(&desc->irq_data))) + chip->irq_cpu_online(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} + +/** + * irq_cpu_offline - Invoke all irq_cpu_offline functions. + * + * Iterate through all irqs and invoke the chip.irq_cpu_offline() + * for each. + */ +void irq_cpu_offline(void) +{ + struct irq_desc *desc; + struct irq_chip *chip; + unsigned long flags; + unsigned int irq; + + for_each_active_irq(irq) { + desc = irq_to_desc(irq); + if (!desc) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + + chip = irq_data_get_irq_chip(&desc->irq_data); + if (chip && chip->irq_cpu_offline && + (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) || + !irqd_irq_disabled(&desc->irq_data))) + chip->irq_cpu_offline(&desc->irq_data); + + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h new file mode 100644 index 00000000..97a8bfad --- /dev/null +++ b/kernel/irq/debug.h @@ -0,0 +1,45 @@ +/* + * Debugging printout: + */ + +#include + +#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) +#define PS(f) if (desc->istate & f) printk("%14s set\n", #f) +/* FIXME */ +#define PD(f) do { } while (0) + +static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", + irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); + printk("->handle_irq(): %p, ", desc->handle_irq); + print_symbol("%s\n", (unsigned long)desc->handle_irq); + printk("->irq_data.chip(): %p, ", desc->irq_data.chip); + 
print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->action(): %p\n", desc->action); + if (desc->action) { + printk("->action->handler(): %p, ", desc->action->handler); + print_symbol("%s\n", (unsigned long)desc->action->handler); + } + + P(IRQ_LEVEL); + P(IRQ_PER_CPU); + P(IRQ_NOPROBE); + P(IRQ_NOREQUEST); + P(IRQ_NOTHREAD); + P(IRQ_NOAUTOEN); + + PS(IRQS_AUTODETECT); + PS(IRQS_REPLAY); + PS(IRQS_WAITING); + PS(IRQS_PENDING); + + PD(IRQS_INPROGRESS); + PD(IRQS_DISABLED); + PD(IRQS_MASKED); +} + +#undef P +#undef PS +#undef PD diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c new file mode 100644 index 00000000..1ef4ffcd --- /dev/null +++ b/kernel/irq/devres.c @@ -0,0 +1,94 @@ +#include +#include +#include +#include + +/* + * Device resource management aware IRQ request/free implementation. + */ +struct irq_devres { + unsigned int irq; + void *dev_id; +}; + +static void devm_irq_release(struct device *dev, void *res) +{ + struct irq_devres *this = res; + + free_irq(this->irq, this->dev_id); +} + +static int devm_irq_match(struct device *dev, void *res, void *data) +{ + struct irq_devres *this = res, *match = data; + + return this->irq == match->irq && this->dev_id == match->dev_id; +} + +/** + * devm_request_threaded_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @thread_fn: function to be called in a threaded interrupt context. NULL + * for devices which handle everything in @handler + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_irq(). IRQs requested with this function will be + * automatically freed on driver detach. 
+ * + * If an IRQ allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + */ +int devm_request_threaded_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, irq_handler_t thread_fn, + unsigned long irqflags, const char *devname, + void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname, + dev_id); + if (rc) { + devres_free(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_threaded_irq); + +/** + * devm_free_irq - free an interrupt + * @dev: device to free interrupt for + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as free_irq(). + * This function instead of free_irq() should be used to manually + * free IRQs allocated with devm_request_irq(). + */ +void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) +{ + struct irq_devres match_data = { irq, dev_id }; + + free_irq(irq, dev_id); + WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, + &match_data)); +} +EXPORT_SYMBOL(devm_free_irq); diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c new file mode 100644 index 00000000..b5fcd96c --- /dev/null +++ b/kernel/irq/dummychip.c @@ -0,0 +1,59 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the dummy interrupt chip implementation + */ +#include +#include + +#include "internals.h" + +/* + * What should we do if we get a hw irq event on an illegal vector? + * Each architecture has to answer this themself. 
+ */ +static void ack_bad(struct irq_data *data) +{ + struct irq_desc *desc = irq_data_to_desc(data); + + print_irq_desc(data->irq, desc); + ack_bad_irq(data->irq); +} + +/* + * NOP functions + */ +static void noop(struct irq_data *data) { } + +static unsigned int noop_ret(struct irq_data *data) +{ + return 0; +} + +/* + * Generic no controller implementation + */ +struct irq_chip no_irq_chip = { + .name = "none", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = ack_bad, +}; + +/* + * Generic dummy implementation which can be used for + * real dumb interrupt sources + */ +struct irq_chip dummy_irq_chip = { + .name = "dummy", + .irq_startup = noop_ret, + .irq_shutdown = noop, + .irq_enable = noop, + .irq_disable = noop, + .irq_ack = noop, + .irq_mask = noop, + .irq_unmask = noop, +}; diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c new file mode 100644 index 00000000..e38544dd --- /dev/null +++ b/kernel/irq/generic-chip.c @@ -0,0 +1,368 @@ +/* + * Library implementing the most common irq chip callback functions + * + * Copyright (C) 2011, Thomas Gleixner + */ +#include +#include +#include +#include +#include +#include + +#include "internals.h" + +static LIST_HEAD(gc_list); +static DEFINE_RAW_SPINLOCK(gc_lock); + +static inline struct irq_chip_regs *cur_regs(struct irq_data *d) +{ + return &container_of(d->chip, struct irq_chip_type, chip)->regs; +} + +/** + * irq_gc_noop - NOOP function + * @d: irq_data + */ +void irq_gc_noop(struct irq_data *d) +{ +} + +/** + * irq_gc_mask_disable_reg - Mask chip via disable register + * @d: irq_data + * + * Chip has separate enable/disable registers instead of a single mask + * register. 
+ */ +void irq_gc_mask_disable_reg(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); + gc->mask_cache &= ~mask; + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_set_bit - Mask chip via setting bit in mask register + * @d: irq_data + * + * Chip has a single mask register. Values of this register are cached + * and protected by gc->lock + */ +void irq_gc_mask_set_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + gc->mask_cache |= mask; + irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_clr_bit - Mask chip via clearing bit in mask register + * @d: irq_data + * + * Chip has a single mask register. Values of this register are cached + * and protected by gc->lock + */ +void irq_gc_mask_clr_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + gc->mask_cache &= ~mask; + irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask); + irq_gc_unlock(gc); +} + +/** + * irq_gc_unmask_enable_reg - Unmask chip via enable register + * @d: irq_data + * + * Chip has separate enable/disable registers instead of a single mask + * register.
+ */ +void irq_gc_unmask_enable_reg(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable); + gc->mask_cache |= mask; + irq_gc_unlock(gc); +} + +/** + * irq_gc_ack_set_bit - Ack pending interrupt via setting bit + * @d: irq_data + */ +void irq_gc_ack_set_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** + * irq_gc_ack_clr_bit - Ack pending interrupt via clearing bit + * @d: irq_data + */ +void irq_gc_ack_clr_bit(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = ~(1 << (d->irq - gc->irq_base)); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** + * irq_gc_mask_disable_reg_and_ack - Mask via disable register and ack pending interrupt + * @d: irq_data + * + * Combines irq_gc_mask_disable_reg() and irq_gc_ack_set_bit(): the irq + * bit is written to the disable register, cleared in gc->mask_cache and + * then written to the ack register, all under gc->lock. + */ +void irq_gc_mask_disable_reg_and_ack(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + /* Use the disable register and keep the cache in sync, as irq_gc_mask_disable_reg() does */ + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable); + gc->mask_cache &= ~mask; + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack); + irq_gc_unlock(gc); +} + +/** + * irq_gc_eoi - EOI interrupt + * @d: irq_data + */ +void irq_gc_eoi(struct irq_data *d) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + irq_gc_lock(gc); + irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi); + irq_gc_unlock(gc); +} + +/** + * irq_gc_set_wake - Set/clr wake bit for an interrupt + * @d: irq_data + * + * For chips where the wake from suspend functionality is not + * configured in a separate register and the wakeup active state is + * just stored in a bitmask.
+ */ +int irq_gc_set_wake(struct irq_data *d, unsigned int on) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + u32 mask = 1 << (d->irq - gc->irq_base); + + if (!(mask & gc->wake_enabled)) + return -EINVAL; + + irq_gc_lock(gc); + if (on) + gc->wake_active |= mask; + else + gc->wake_active &= ~mask; + irq_gc_unlock(gc); + return 0; +} + +/** + * irq_alloc_generic_chip - Allocate a generic chip and initialize it + * @name: Name of the irq chip + * @num_ct: Number of irq_chip_type instances associated with this + * @irq_base: Interrupt base nr for this chip + * @reg_base: Register base address (virtual) + * @handler: Default flow handler associated with this chip + * + * Returns an initialized irq_chip_generic structure. The chip defaults + * to the primary (index 0) irq_chip_type and @handler + */ +struct irq_chip_generic * +irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base, + void __iomem *reg_base, irq_flow_handler_t handler) +{ + struct irq_chip_generic *gc; + unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type); + + gc = kzalloc(sz, GFP_KERNEL); + if (gc) { + raw_spin_lock_init(&gc->lock); + gc->num_ct = num_ct; + gc->irq_base = irq_base; + gc->reg_base = reg_base; + gc->chip_types->chip.name = name; + gc->chip_types->handler = handler; + } + return gc; +} + +/* + * Separate lockdep class for interrupt chip which can nest irq_desc + * lock. + */ +static struct lock_class_key irq_nested_lock_class; + +/** + * irq_setup_generic_chip - Setup a range of interrupts with a generic chip + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @flags: Flags for initialization + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Set up max. 32 interrupts starting from gc->irq_base. Note, this + * initializes all interrupts to the primary irq_chip_type and its + * associated handler. 
+ */ +void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, + enum irq_gc_flags flags, unsigned int clr, + unsigned int set) +{ + struct irq_chip_type *ct = gc->chip_types; + unsigned int i; + + raw_spin_lock(&gc_lock); + list_add_tail(&gc->list, &gc_list); + raw_spin_unlock(&gc_lock); + + /* Init mask cache ? */ + if (flags & IRQ_GC_INIT_MASK_CACHE) + gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); + + for (i = gc->irq_base; msk; msk >>= 1, i++) { + if (!(msk & 0x01)) + continue; + + if (flags & IRQ_GC_INIT_NESTED_LOCK) + irq_set_lockdep_class(i, &irq_nested_lock_class); + + irq_set_chip_and_handler(i, &ct->chip, ct->handler); + irq_set_chip_data(i, gc); + irq_modify_status(i, clr, set); + } + gc->irq_cnt = i - gc->irq_base; +} + +/** + * irq_setup_alt_chip - Switch to alternative chip + * @d: irq_data for this interrupt + * @type Flow type to be initialized + * + * Only to be called from chip->irq_set_type() callbacks. + */ +int irq_setup_alt_chip(struct irq_data *d, unsigned int type) +{ + struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d); + struct irq_chip_type *ct = gc->chip_types; + unsigned int i; + + for (i = 0; i < gc->num_ct; i++, ct++) { + if (ct->type & type) { + d->chip = &ct->chip; + irq_data_to_desc(d)->handle_irq = ct->handler; + return 0; + } + } + return -EINVAL; +} + +/** + * irq_remove_generic_chip - Remove a chip + * @gc: Generic irq chip holding all data + * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base + * @clr: IRQ_* bits to clear + * @set: IRQ_* bits to set + * + * Remove up to 32 interrupts starting from gc->irq_base. + */ +void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk, + unsigned int clr, unsigned int set) +{ + unsigned int i = gc->irq_base; + + raw_spin_lock(&gc_lock); + list_del(&gc->list); + raw_spin_unlock(&gc_lock); + + for (; msk; msk >>= 1, i++) { + if (!(msk & 0x01)) + continue; + + /* Remove handler first. 
That will mask the irq line */ + irq_set_handler(i, NULL); + irq_set_chip(i, &no_irq_chip); + irq_set_chip_data(i, NULL); + irq_modify_status(i, clr, set); + } +} + +#ifdef CONFIG_PM +static int irq_gc_suspend(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_suspend) + ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base)); + } + return 0; +} + +static void irq_gc_resume(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_resume) + ct->chip.irq_resume(irq_get_irq_data(gc->irq_base)); + } +} +#else +#define irq_gc_suspend NULL +#define irq_gc_resume NULL +#endif + +static void irq_gc_shutdown(void) +{ + struct irq_chip_generic *gc; + + list_for_each_entry(gc, &gc_list, list) { + struct irq_chip_type *ct = gc->chip_types; + + if (ct->chip.irq_pm_shutdown) + ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base)); + } +} + +static struct syscore_ops irq_gc_syscore_ops = { + .suspend = irq_gc_suspend, + .resume = irq_gc_resume, + .shutdown = irq_gc_shutdown, +}; + +static int __init irq_gc_init_ops(void) +{ + register_syscore_ops(&irq_gc_syscore_ops); + return 0; +} +device_initcall(irq_gc_init_ops); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c new file mode 100644 index 00000000..470d08c8 --- /dev/null +++ b/kernel/irq/handle.c @@ -0,0 +1,181 @@ +/* + * linux/kernel/irq/handle.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the core interrupt handling code. 
+ * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ + +#include +#include +#include +#include +#include + +#include + +#include "internals.h" + +/** + * handle_bad_irq - handle spurious and unhandled irqs + * @irq: the interrupt number + * @desc: description of the interrupt + * + * Handles spurious and unhandled IRQ's. It also prints a debug message. + */ +void handle_bad_irq(unsigned int irq, struct irq_desc *desc) +{ + print_irq_desc(irq, desc); + kstat_incr_irqs_this_cpu(irq, desc); + ack_bad_irq(irq); +} + +/* + * Special, empty irq handler: + */ +irqreturn_t no_action(int cpl, void *dev_id) +{ + return IRQ_NONE; +} + +/* + * Warn once per action (guarded by IRQTF_WARNED) that its handler + * returned IRQ_WAKE_THREAD although no thread function is available. + */ +static void warn_no_thread(unsigned int irq, struct irqaction *action) +{ + if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags)) + return; + + /* Terminate the message so it is not merged with the next printk */ + printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD " + "but no thread function available.\n", irq, action->name); +} + +static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action) +{ + /* + * Wake up the handler thread for this action. In case the + * thread crashed and was killed we just pretend that we + * handled the interrupt. The hardirq handler has disabled the + * device interrupt, so no irq storm is lurking. If the + * RUNTHREAD bit is already set, nothing to do. + */ + if (test_bit(IRQTF_DIED, &action->thread_flags) || + test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + return; + + /* + * It's safe to OR the mask lockless here. We have only two + * places which write to threads_oneshot: This code and the + * irq thread. + * + * This code is the hard irq context and can never run on two + * cpus in parallel. If it ever does we have more serious + * problems than this bitmask. + * + * The irq threads of this irq which clear their "running" bit + * in threads_oneshot are serialized via desc->lock against + * each other and they are serialized against this code by + * IRQS_INPROGRESS.
+ * + * Hard irq handler: + * + * spin_lock(desc->lock); + * desc->state |= IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * set_bit(IRQTF_RUNTHREAD, &action->thread_flags); + * desc->threads_oneshot |= mask; + * spin_lock(desc->lock); + * desc->state &= ~IRQS_INPROGRESS; + * spin_unlock(desc->lock); + * + * irq thread: + * + * again: + * spin_lock(desc->lock); + * if (desc->state & IRQS_INPROGRESS) { + * spin_unlock(desc->lock); + * while(desc->state & IRQS_INPROGRESS) + * cpu_relax(); + * goto again; + * } + * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + * desc->threads_oneshot &= ~mask; + * spin_unlock(desc->lock); + * + * So either the thread waits for us to clear IRQS_INPROGRESS + * or we are waiting in the flow handler for desc->lock to be + * released before we reach this point. The thread also checks + * IRQTF_RUNTHREAD under desc->lock. If set it leaves + * threads_oneshot untouched and runs the thread another time. + */ + desc->threads_oneshot |= action->thread_mask; + wake_up_process(action->thread); +} + +irqreturn_t +handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action) +{ + irqreturn_t retval = IRQ_NONE; + unsigned int random = 0, irq = desc->irq_data.irq; + + do { + irqreturn_t res; + + trace_irq_handler_entry(irq, action); + res = action->handler(irq, action->dev_id); + trace_irq_handler_exit(irq, action, res); + + if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n", + irq, action->handler)) + local_irq_disable(); + + switch (res) { + case IRQ_WAKE_THREAD: + /* + * Catch drivers which return WAKE_THREAD but + * did not set up a thread function + */ + if (unlikely(!action->thread_fn)) { + warn_no_thread(irq, action); + break; + } + + irq_wake_thread(desc, action); + + /* Fall through to add to randomness */ + case IRQ_HANDLED: + random |= action->flags; + break; + + default: + break; + } + + retval |= res; + action = action->next; + } while (action); + + if (random & IRQF_SAMPLE_RANDOM) + 
add_interrupt_randomness(irq); + + if (!noirqdebug) + note_interrupt(irq, desc, retval); + return retval; +} + +irqreturn_t handle_irq_event(struct irq_desc *desc) +{ + struct irqaction *action = desc->action; + irqreturn_t ret; + + desc->istate &= ~IRQS_PENDING; + irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); + raw_spin_unlock(&desc->lock); + + ret = handle_irq_event_percpu(desc, action); + + raw_spin_lock(&desc->lock); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + return ret; +} diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h new file mode 100644 index 00000000..62efdc44 --- /dev/null +++ b/kernel/irq/internals.h @@ -0,0 +1,171 @@ +/* + * IRQ subsystem internal functions and variables: + * + * Do not ever include this file from anything else than + * kernel/irq/. Do not even think about using any information outside + * of this file for your non core code. + */ +#include + +#ifdef CONFIG_SPARSE_IRQ +# define IRQ_BITMAP_BITS (NR_IRQS + 8196) +#else +# define IRQ_BITMAP_BITS NR_IRQS +#endif + +#define istate core_internal_state__do_not_mess_with_it + +extern int noirqdebug; + +/* + * Bits used by threaded handlers: + * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run + * IRQTF_DIED - handler thread died + * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity + * IRQTF_FORCED_THREAD - irq action is force threaded + */ +enum { + IRQTF_RUNTHREAD, + IRQTF_DIED, + IRQTF_WARNED, + IRQTF_AFFINITY, + IRQTF_FORCED_THREAD, +}; + +/* + * Bit masks for desc->state + * + * IRQS_AUTODETECT - autodetection in progress + * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt + * detection + * IRQS_POLL_INPROGRESS - polling in progress + * IRQS_ONESHOT - irq is not unmasked in primary handler + * IRQS_REPLAY - irq is replayed + * IRQS_WAITING - irq is waiting + * IRQS_PENDING - irq is pending and replayed later + * IRQS_SUSPENDED - irq is 
suspended + */ +enum { + IRQS_AUTODETECT = 0x00000001, + IRQS_SPURIOUS_DISABLED = 0x00000002, + IRQS_POLL_INPROGRESS = 0x00000008, + IRQS_ONESHOT = 0x00000020, + IRQS_REPLAY = 0x00000040, + IRQS_WAITING = 0x00000080, + IRQS_PENDING = 0x00000200, + IRQS_SUSPENDED = 0x00000800, +}; + +#include "debug.h" +#include "settings.h" + +#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data) + +extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, + unsigned long flags); +extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); +extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); + +extern int irq_startup(struct irq_desc *desc, bool resend); +extern void irq_shutdown(struct irq_desc *desc); +extern void irq_enable(struct irq_desc *desc); +extern void irq_disable(struct irq_desc *desc); +extern void mask_irq(struct irq_desc *desc); +extern void unmask_irq(struct irq_desc *desc); + +extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr); + +irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action); +irqreturn_t handle_irq_event(struct irq_desc *desc); + +/* Resending of interrupts :*/ +void check_irq_resend(struct irq_desc *desc, unsigned int irq); +bool irq_wait_for_poll(struct irq_desc *desc); + +#ifdef CONFIG_PROC_FS +extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); +extern void unregister_irq_proc(unsigned int irq, struct irq_desc *desc); +extern void register_handler_proc(unsigned int irq, struct irqaction *action); +extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); +#else +static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { } +static inline void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) { } +static inline void register_handler_proc(unsigned int irq, + struct irqaction *action) { } +static inline void unregister_handler_proc(unsigned int irq, + 
struct irqaction *action) { } +#endif + +extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask); + +extern void irq_set_thread_affinity(struct irq_desc *desc); + +/* Inline functions for support of irq chips on slow busses */ +static inline void chip_bus_lock(struct irq_desc *desc) +{ + if (unlikely(desc->irq_data.chip->irq_bus_lock)) + desc->irq_data.chip->irq_bus_lock(&desc->irq_data); +} + +static inline void chip_bus_sync_unlock(struct irq_desc *desc) +{ + if (unlikely(desc->irq_data.chip->irq_bus_sync_unlock)) + desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); +} + +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); + +static inline struct irq_desc * +irq_get_desc_buslock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, true); +} + +static inline void +irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, true); +} + +static inline struct irq_desc * +irq_get_desc_lock(unsigned int irq, unsigned long *flags) +{ + return __irq_get_desc_lock(irq, flags, false); +} + +static inline void +irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) +{ + __irq_put_desc_unlock(desc, flags, false); +} + +/* + * Manipulation functions for irq_data.state + */ +static inline void irqd_set_move_pending(struct irq_data *d) +{ + d->state_use_accessors |= IRQD_SETAFFINITY_PENDING; +} + +static inline void irqd_clr_move_pending(struct irq_data *d) +{ + d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING; +} + +static inline void irqd_clear(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors &= ~mask; +} + +static inline void irqd_set(struct irq_data *d, unsigned int mask) +{ + d->state_use_accessors |= mask; +} + +static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) +{ + return d->state_use_accessors & 
mask; +} diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c new file mode 100644 index 00000000..4c60a50e --- /dev/null +++ b/kernel/irq/irqdesc.c @@ -0,0 +1,466 @@ +/* + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner, Russell King + * + * This file contains the interrupt descriptor management code + * + * Detailed information is available in Documentation/DocBook/genericirq + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "internals.h" + +/* + * lockdep: we want to handle all irq_desc locks as a single lock-class: + */ +static struct lock_class_key irq_desc_lock_class; + +#if defined(CONFIG_SMP) +static void __init init_irq_default_affinity(void) +{ + alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpumask_setall(irq_default_affinity); +} +#else +static void __init init_irq_default_affinity(void) +{ +} +#endif + +#ifdef CONFIG_SMP +static int alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) +{ + if (!zalloc_cpumask_var_node(&desc->irq_data.affinity, gfp, node)) + return -ENOMEM; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (!zalloc_cpumask_var_node(&desc->pending_mask, gfp, node)) { + free_cpumask_var(desc->irq_data.affinity); + return -ENOMEM; + } +#endif + return 0; +} + +static void desc_smp_init(struct irq_desc *desc, int node) +{ + desc->irq_data.node = node; + cpumask_copy(desc->irq_data.affinity, irq_default_affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif +} + +static inline int desc_node(struct irq_desc *desc) +{ + return desc->irq_data.node; +} + +#else +static inline int +alloc_masks(struct irq_desc *desc, gfp_t gfp, int node) { return 0; } +static inline void desc_smp_init(struct irq_desc *desc, int node) { } +static inline int desc_node(struct irq_desc *desc) { return 0; } +#endif + +static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) +{ + int cpu; + + 
desc->irq_data.irq = irq; + desc->irq_data.chip = &no_irq_chip; + desc->irq_data.chip_data = NULL; + desc->irq_data.handler_data = NULL; + desc->irq_data.msi_desc = NULL; + irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS); + irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED); + desc->handle_irq = handle_bad_irq; + desc->depth = 1; + desc->irq_count = 0; + desc->irqs_unhandled = 0; + desc->name = NULL; + for_each_possible_cpu(cpu) + *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; + desc_smp_init(desc, node); +} + +int nr_irqs = NR_IRQS; +EXPORT_SYMBOL_GPL(nr_irqs); + +static DEFINE_MUTEX(sparse_irq_lock); +static DECLARE_BITMAP(allocated_irqs, IRQ_BITMAP_BITS); + +#ifdef CONFIG_SPARSE_IRQ + +static RADIX_TREE(irq_desc_tree, GFP_KERNEL); + +static void irq_insert_desc(unsigned int irq, struct irq_desc *desc) +{ + radix_tree_insert(&irq_desc_tree, irq, desc); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return radix_tree_lookup(&irq_desc_tree, irq); +} + +static void delete_irq_desc(unsigned int irq) +{ + radix_tree_delete(&irq_desc_tree, irq); +} + +#ifdef CONFIG_SMP +static void free_masks(struct irq_desc *desc) +{ +#ifdef CONFIG_GENERIC_PENDING_IRQ + free_cpumask_var(desc->pending_mask); +#endif + free_cpumask_var(desc->irq_data.affinity); +} +#else +static inline void free_masks(struct irq_desc *desc) { } +#endif + +static struct irq_desc *alloc_desc(int irq, int node) +{ + struct irq_desc *desc; + gfp_t gfp = GFP_KERNEL; + + desc = kzalloc_node(sizeof(*desc), gfp, node); + if (!desc) + return NULL; + /* allocate based on nr_cpu_ids */ + desc->kstat_irqs = alloc_percpu(unsigned int); + if (!desc->kstat_irqs) + goto err_desc; + + if (alloc_masks(desc, gfp, node)) + goto err_kstat; + + raw_spin_lock_init(&desc->lock); + lockdep_set_class(&desc->lock, &irq_desc_lock_class); + + desc_set_defaults(irq, desc, node); + + return desc; + +err_kstat: + free_percpu(desc->kstat_irqs); +err_desc: + kfree(desc); + return NULL; +} + +static void free_desc(unsigned int 
irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + unregister_irq_proc(irq, desc); + + mutex_lock(&sparse_irq_lock); + delete_irq_desc(irq); + mutex_unlock(&sparse_irq_lock); + + free_masks(desc); + free_percpu(desc->kstat_irqs); + kfree(desc); +} + +static int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + struct irq_desc *desc; + int i; + + for (i = 0; i < cnt; i++) { + desc = alloc_desc(start + i, node); + if (!desc) + goto err; + mutex_lock(&sparse_irq_lock); + irq_insert_desc(start + i, desc); + mutex_unlock(&sparse_irq_lock); + } + return start; + +err: + for (i--; i >= 0; i--) + free_desc(start + i); + + mutex_lock(&sparse_irq_lock); + bitmap_clear(allocated_irqs, start, cnt); + mutex_unlock(&sparse_irq_lock); + return -ENOMEM; +} + +static int irq_expand_nr_irqs(unsigned int nr) +{ + if (nr > IRQ_BITMAP_BITS) + return -ENOMEM; + nr_irqs = nr; + return 0; +} + +int __init early_irq_init(void) +{ + int i, initcnt, node = first_online_node; + struct irq_desc *desc; + + init_irq_default_affinity(); + + /* Let arch update nr_irqs and return the nr of preallocated irqs */ + initcnt = arch_probe_nr_irqs(); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d %d\n", NR_IRQS, nr_irqs, initcnt); + + if (WARN_ON(nr_irqs > IRQ_BITMAP_BITS)) + nr_irqs = IRQ_BITMAP_BITS; + + if (WARN_ON(initcnt > IRQ_BITMAP_BITS)) + initcnt = IRQ_BITMAP_BITS; + + if (initcnt > nr_irqs) + nr_irqs = initcnt; + + for (i = 0; i < initcnt; i++) { + desc = alloc_desc(i, node); + set_bit(i, allocated_irqs); + irq_insert_desc(i, desc); + } + return arch_early_irq_init(); +} + +#else /* !CONFIG_SPARSE_IRQ */ + +struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { + [0 ... 
NR_IRQS-1] = { + .handle_irq = handle_bad_irq, + .depth = 1, + .lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock), + } +}; + +int __init early_irq_init(void) +{ + int count, i, node = first_online_node; + struct irq_desc *desc; + + init_irq_default_affinity(); + + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + + desc = irq_desc; + count = ARRAY_SIZE(irq_desc); + + for (i = 0; i < count; i++) { + desc[i].kstat_irqs = alloc_percpu(unsigned int); + alloc_masks(&desc[i], GFP_KERNEL, node); + raw_spin_lock_init(&desc[i].lock); + lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + desc_set_defaults(i, &desc[i], node); + } + return arch_early_irq_init(); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return (irq < NR_IRQS) ? irq_desc + irq : NULL; +} + +static void free_desc(unsigned int irq) +{ + dynamic_irq_cleanup(irq); +} + +static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) +{ + return start; +} + +static int irq_expand_nr_irqs(unsigned int nr) +{ + return -ENOMEM; +} + +#endif /* !CONFIG_SPARSE_IRQ */ + +/** + * generic_handle_irq - Invoke the handler for a particular irq + * @irq: The irq number to handle + * + */ +int generic_handle_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return -EINVAL; + generic_handle_irq_desc(irq, desc); + return 0; +} +EXPORT_SYMBOL_GPL(generic_handle_irq); + +/* Dynamic interrupt handling */ + +/** + * irq_free_descs - free irq descriptors + * @from: Start of descriptor range + * @cnt: Number of consecutive irqs to free + */ +void irq_free_descs(unsigned int from, unsigned int cnt) +{ + int i; + + if (from >= nr_irqs || (from + cnt) > nr_irqs) + return; + + for (i = 0; i < cnt; i++) + free_desc(from + i); + + mutex_lock(&sparse_irq_lock); + bitmap_clear(allocated_irqs, from, cnt); + mutex_unlock(&sparse_irq_lock); +} +EXPORT_SYMBOL_GPL(irq_free_descs); + +/** + * irq_alloc_descs - allocate and initialize a range of irq descriptors + * @irq: Allocate for 
specific irq number if irq >= 0 + * @from: Start the search from this irq number + * @cnt: Number of consecutive irqs to allocate. + * @node: Preferred node on which the irq descriptor should be allocated + * + * Returns the first irq number or error code + */ +int __ref +irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) +{ + int start, ret; + + if (!cnt) + return -EINVAL; + + if (irq >= 0) { + if (from > irq) + return -EINVAL; + from = irq; + } + + mutex_lock(&sparse_irq_lock); + + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, + from, cnt, 0); + ret = -EEXIST; + if (irq >=0 && start != irq) + goto err; + + if (start + cnt > nr_irqs) { + ret = irq_expand_nr_irqs(start + cnt); + if (ret) + goto err; + } + + bitmap_set(allocated_irqs, start, cnt); + mutex_unlock(&sparse_irq_lock); + return alloc_descs(start, cnt, node); + +err: + mutex_unlock(&sparse_irq_lock); + return ret; +} +EXPORT_SYMBOL_GPL(irq_alloc_descs); + +/** + * irq_reserve_irqs - mark irqs allocated + * @from: mark from irq number + * @cnt: number of irqs to mark + * + * Returns 0 on success or an appropriate error code + */ +int irq_reserve_irqs(unsigned int from, unsigned int cnt) +{ + unsigned int start; + int ret = 0; + + if (!cnt || (from + cnt) > nr_irqs) + return -EINVAL; + + mutex_lock(&sparse_irq_lock); + start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0); + if (start == from) + bitmap_set(allocated_irqs, start, cnt); + else + ret = -EEXIST; + mutex_unlock(&sparse_irq_lock); + return ret; +} + +/** + * irq_get_next_irq - get next allocated irq number + * @offset: where to start the search + * + * Returns next irq number after offset or nr_irqs if none is found. 
+ */ +unsigned int irq_get_next_irq(unsigned int offset) +{ + return find_next_bit(allocated_irqs, nr_irqs, offset); +} + +struct irq_desc * +__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + if (bus) + chip_bus_lock(desc); + raw_spin_lock_irqsave(&desc->lock, *flags); + } + return desc; +} + +void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus) +{ + raw_spin_unlock_irqrestore(&desc->lock, flags); + if (bus) + chip_bus_sync_unlock(desc); +} + +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + desc_set_defaults(irq, desc, desc_node(desc)); + raw_spin_unlock_irqrestore(&desc->lock, flags); +} + +unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) +{ + struct irq_desc *desc = irq_to_desc(irq); + + return desc && desc->kstat_irqs ? + *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; +} + +unsigned int kstat_irqs(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + int cpu; + int sum = 0; + + if (!desc || !desc->kstat_irqs) + return 0; + for_each_possible_cpu(cpu) + sum += *per_cpu_ptr(desc->kstat_irqs, cpu); + return sum; +} diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c new file mode 100644 index 00000000..df8136ff --- /dev/null +++ b/kernel/irq/manage.c @@ -0,0 +1,1437 @@ +/* + * linux/kernel/irq/manage.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006 Thomas Gleixner + * + * This file contains driver APIs to the irq subsystem. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internals.h" + +#ifdef CONFIG_IRQ_FORCED_THREADING +__read_mostly bool force_irqthreads; + +static int __init setup_forced_irqthreads(char *arg) +{ + force_irqthreads = true; + return 0; +} +early_param("threadirqs", setup_forced_irqthreads); +#endif + +/** + * synchronize_irq - wait for pending IRQ handlers (on other CPUs) + * @irq: interrupt number to wait for + * + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void synchronize_irq(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + bool inprogress; + + if (!desc) + return; + + do { + unsigned long flags; + + /* + * Wait until we're out of the critical section. This might + * give the wrong answer due to the lack of memory barriers. + */ + while (irqd_irq_inprogress(&desc->irq_data)) + cpu_relax(); + + /* Ok, that indicated we're done: double-check carefully. */ + raw_spin_lock_irqsave(&desc->lock, flags); + inprogress = irqd_irq_inprogress(&desc->irq_data); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + /* Oops, that failed? */ + } while (inprogress); + + /* + * We made sure that no hardirq handler is running. Now verify + * that no threaded handlers are active. 
+ */ + wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active)); +} +EXPORT_SYMBOL(synchronize_irq); + +#ifdef CONFIG_SMP +cpumask_var_t irq_default_affinity; + +/** + * irq_can_set_affinity - Check if the affinity of a given irq can be set + * @irq: Interrupt to check + * + */ +int irq_can_set_affinity(unsigned int irq) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc || !irqd_can_balance(&desc->irq_data) || + !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity) + return 0; + + return 1; +} + +/** + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affitnity changed + * + * We just set IRQTF_AFFINITY and delegate the affinity setting + * to the interrupt thread itself. We can not call + * set_cpus_allowed_ptr() here as we hold desc->lock and this + * code can be called from hard interrupt context. + */ +void irq_set_thread_affinity(struct irq_desc *desc) +{ + struct irqaction *action = desc->action; + + while (action) { + if (action->thread) + set_bit(IRQTF_AFFINITY, &action->thread_flags); + action = action->next; + } +} + +#ifdef CONFIG_GENERIC_PENDING_IRQ +static inline bool irq_can_move_pcntxt(struct irq_data *data) +{ + return irqd_can_move_in_process_context(data); +} +static inline bool irq_move_pending(struct irq_data *data) +{ + return irqd_is_setaffinity_pending(data); +} +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) +{ + cpumask_copy(desc->pending_mask, mask); +} +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc *desc) +{ + cpumask_copy(mask, desc->pending_mask); +} +#else +static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; } +static inline bool irq_move_pending(struct irq_data *data) { return false; } +static inline void +irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { } +static inline void +irq_get_pending(struct cpumask *mask, struct irq_desc 
*desc) { } +#endif + +int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask) +{ + struct irq_chip *chip = irq_data_get_irq_chip(data); + struct irq_desc *desc = irq_data_to_desc(data); + int ret = 0; + + if (!chip || !chip->irq_set_affinity) + return -EINVAL; + + if (irq_can_move_pcntxt(data)) { + ret = chip->irq_set_affinity(data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(data->affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + ret = 0; + } + } else { + irqd_set_move_pending(data); + irq_copy_pending(desc, mask); + } + + if (desc->affinity_notify) { + kref_get(&desc->affinity_notify->kref); + schedule_work(&desc->affinity_notify->work); + } + irqd_set(data, IRQD_AFFINITY_SET); + + return ret; +} + +/** + * irq_set_affinity - Set the irq affinity of a given irq + * @irq: Interrupt to set affinity + * @mask: cpumask + * + */ +int irq_set_affinity(unsigned int irq, const struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + if (!desc) + return -EINVAL; + + raw_spin_lock_irqsave(&desc->lock, flags); + ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask); + raw_spin_unlock_irqrestore(&desc->lock, flags); + return ret; +} + +int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + + if (!desc) + return -EINVAL; + desc->affinity_hint = m; + irq_put_desc_unlock(desc, flags); + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_hint); + +static void irq_affinity_notify(struct work_struct *work) +{ + struct irq_affinity_notify *notify = + container_of(work, struct irq_affinity_notify, work); + struct irq_desc *desc = irq_to_desc(notify->irq); + cpumask_var_t cpumask; + unsigned long flags; + + if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out; + + raw_spin_lock_irqsave(&desc->lock, flags); + if 
(irq_move_pending(&desc->irq_data)) + irq_get_pending(cpumask, desc); + else + cpumask_copy(cpumask, desc->irq_data.affinity); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + notify->notify(notify, cpumask); + + free_cpumask_var(cpumask); +out: + kref_put(&notify->kref, notify->release); +} + +/** + * irq_set_affinity_notifier - control notification of IRQ affinity changes + * @irq: Interrupt for which to enable/disable notification + * @notify: Context for notification, or %NULL to disable + * notification. Function pointers must be initialised; + * the other fields will be initialised by this function. + * + * Must be called in process context. Notification may only be enabled + * after the IRQ is allocated and must be disabled before the IRQ is + * freed using free_irq(). + */ +int +irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_affinity_notify *old_notify; + unsigned long flags; + + /* The release function is promised process context */ + might_sleep(); + + if (!desc) + return -EINVAL; + + /* Complete initialisation of *notify */ + if (notify) { + notify->irq = irq; + kref_init(&notify->kref); + INIT_WORK(&notify->work, irq_affinity_notify); + } + + raw_spin_lock_irqsave(&desc->lock, flags); + old_notify = desc->affinity_notify; + desc->affinity_notify = notify; + raw_spin_unlock_irqrestore(&desc->lock, flags); + + if (old_notify) + kref_put(&old_notify->kref, old_notify->release); + + return 0; +} +EXPORT_SYMBOL_GPL(irq_set_affinity_notifier); + +#ifndef CONFIG_AUTO_IRQ_AFFINITY +/* + * Generic version of the affinity autoselector.
+ */ +static int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + struct cpumask *set = irq_default_affinity; + int ret; + + /* Excludes PER_CPU and NO_BALANCE interrupts */ + if (!irq_can_set_affinity(irq)) + return 0; + + /* + * Preserve an userspace affinity setup, but make sure that + * one of the targets is online. + */ + if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) { + if (cpumask_intersects(desc->irq_data.affinity, + cpu_online_mask)) + set = desc->irq_data.affinity; + else + irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET); + } + + cpumask_and(mask, cpu_online_mask, set); + ret = chip->irq_set_affinity(&desc->irq_data, mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + } + return 0; +} +#else +static inline int +setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) +{ + return irq_select_affinity(irq); +} +#endif + +/* + * Called when affinity is set via /proc/irq + */ +int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + unsigned long flags; + int ret; + + raw_spin_lock_irqsave(&desc->lock, flags); + ret = setup_affinity(irq, desc, mask); + raw_spin_unlock_irqrestore(&desc->lock, flags); + return ret; +} + +#else +static inline int +setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) +{ + return 0; +} +#endif + +void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) +{ + if (suspend) { + if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND)) + return; + desc->istate |= IRQS_SUSPENDED; + } + + if (!desc->depth++) + irq_disable(desc); +} + +static int __disable_irq_nosync(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return -EINVAL; + 
__disable_irq(desc, irq, false); + irq_put_desc_busunlock(desc, flags); + return 0; +} + +/** + * disable_irq_nosync - disable an irq without waiting + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Disables and Enables are + * nested. + * Unlike disable_irq(), this function does not ensure existing + * instances of the IRQ handler have completed before returning. + * + * This function may be called from IRQ context. + */ +void disable_irq_nosync(unsigned int irq) +{ + __disable_irq_nosync(irq); +} +EXPORT_SYMBOL(disable_irq_nosync); + +/** + * disable_irq - disable an irq and wait for completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are + * nested. + * This function waits for any pending IRQ handlers for this interrupt + * to complete before returning. If you use this function while + * holding a resource the IRQ handler may need you will deadlock. + * + * This function may be called - with care - from IRQ context. + */ +void disable_irq(unsigned int irq) +{ + if (!__disable_irq_nosync(irq)) + synchronize_irq(irq); +} +EXPORT_SYMBOL(disable_irq); + +void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) +{ + if (resume) { + if (!(desc->istate & IRQS_SUSPENDED)) { + if (!desc->action) + return; + if (!(desc->action->flags & IRQF_FORCE_RESUME)) + return; + /* Pretend that it got disabled ! */ + desc->depth++; + } + desc->istate &= ~IRQS_SUSPENDED; + } + + switch (desc->depth) { + case 0: + err_out: + WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); + break; + case 1: { + if (desc->istate & IRQS_SUSPENDED) + goto err_out; + /* Prevent probing on this irq: */ + irq_settings_set_noprobe(desc); + irq_enable(desc); + check_irq_resend(desc, irq); + /* fall-through */ + } + default: + desc->depth--; + } +} + +/** + * enable_irq - enable handling of an irq + * @irq: Interrupt to enable + * + * Undoes the effect of one call to disable_irq(). 
If this + * matches the last disable, processing of interrupts on this + * IRQ line is re-enabled. + * + * This function may be called from IRQ context only when + * desc->irq_data.chip->bus_lock and desc->chip->bus_sync_unlock are NULL ! + */ +void enable_irq(unsigned int irq) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + + if (!desc) + return; + if (WARN(!desc->irq_data.chip, + KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) + goto out; + + __enable_irq(desc, irq, false); +out: + irq_put_desc_busunlock(desc, flags); +} +EXPORT_SYMBOL(enable_irq); + +static int set_irq_wake_real(unsigned int irq, unsigned int on) +{ + struct irq_desc *desc = irq_to_desc(irq); + int ret = -ENXIO; + + if (desc->irq_data.chip->irq_set_wake) + ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); + + return ret; +} + +/** + * irq_set_irq_wake - control irq power management wakeup + * @irq: interrupt to control + * @on: enable/disable power management wakeup + * + * Enable/disable power management wakeup mode, which is + * disabled by default. Enables and disables must match, + * just as they match for non-wakeup mode support. + * + * Wakeup mode lets this IRQ wake the system from sleep + * states like "suspend to RAM". + */ +int irq_set_irq_wake(unsigned int irq, unsigned int on) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); + int ret = 0; + + if (!desc) + return -EINVAL; + + /* wakeup-capable irqs can be shared between drivers that + * don't need to have the same sleep mode behaviors. 
+ */ + if (on) { + if (desc->wake_depth++ == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 0; + else + irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE); + } + } else { + if (desc->wake_depth == 0) { + WARN(1, "Unbalanced IRQ %d wake disable\n", irq); + } else if (--desc->wake_depth == 0) { + ret = set_irq_wake_real(irq, on); + if (ret) + desc->wake_depth = 1; + else + irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE); + } + } + irq_put_desc_busunlock(desc, flags); + return ret; +} +EXPORT_SYMBOL(irq_set_irq_wake); + +/* + * Internal function that tells the architecture code whether a + * particular irq has been exclusively allocated or is available + * for driver use. + */ +int can_request_irq(unsigned int irq, unsigned long irqflags) +{ + unsigned long flags; + struct irq_desc *desc = irq_get_desc_lock(irq, &flags); + int canrequest = 0; + + if (!desc) + return 0; + + if (irq_settings_can_request(desc)) { + if (!desc->action || + (irqflags & desc->action->flags & IRQF_SHARED)) + canrequest = 1; + } + irq_put_desc_unlock(desc, flags); + return canrequest; +} + +int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, + unsigned long flags) +{ + struct irq_chip *chip = desc->irq_data.chip; + int ret, unmask = 0; + + if (!chip || !chip->irq_set_type) { + /* + * IRQF_TRIGGER_* but the PIC does not support multiple + * flow-types? + */ + pr_debug("No set_type function for IRQ %d (%s)\n", irq, + chip ? (chip->name ?
: "unknown") : "unknown"); + return 0; + } + + flags &= IRQ_TYPE_SENSE_MASK; + + if (chip->flags & IRQCHIP_SET_TYPE_MASKED) { + if (!irqd_irq_masked(&desc->irq_data)) + mask_irq(desc); + if (!irqd_irq_disabled(&desc->irq_data)) + unmask = 1; + } + + /* caller masked out all except trigger mode flags */ + ret = chip->irq_set_type(&desc->irq_data, flags); + + switch (ret) { + case IRQ_SET_MASK_OK: + irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK); + irqd_set(&desc->irq_data, flags); + + case IRQ_SET_MASK_OK_NOCOPY: + flags = irqd_get_trigger_type(&desc->irq_data); + irq_settings_set_trigger_mask(desc, flags); + irqd_clear(&desc->irq_data, IRQD_LEVEL); + irq_settings_clr_level(desc); + if (flags & IRQ_TYPE_LEVEL_MASK) { + irq_settings_set_level(desc); + irqd_set(&desc->irq_data, IRQD_LEVEL); + } + + ret = 0; + break; + default: + pr_err("setting trigger mode %lu for irq %u failed (%pF)\n", + flags, irq, chip->irq_set_type); + } + if (unmask) + unmask_irq(desc); + return ret; +} + +/* + * Default primary interrupt handler for threaded interrupts. Is + * assigned as primary handler when request_threaded_irq is called + * with handler == NULL. Useful for oneshot interrupts. + */ +static irqreturn_t irq_default_primary_handler(int irq, void *dev_id) +{ + return IRQ_WAKE_THREAD; +} + +/* + * Primary handler for nested threaded interrupts. Should never be + * called. 
+ */ +static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id) +{ + WARN(1, "Primary handler called for nested irq %d\n", irq); + return IRQ_NONE; +} + +static int irq_wait_for_interrupt(struct irqaction *action) +{ + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (test_and_clear_bit(IRQTF_RUNTHREAD, + &action->thread_flags)) { + __set_current_state(TASK_RUNNING); + return 0; + } + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return -1; +} + +/* + * Oneshot interrupts keep the irq line masked until the threaded + * handler finished. unmask if the interrupt has not been disabled and + * is marked MASKED. + */ +static void irq_finalize_oneshot(struct irq_desc *desc, + struct irqaction *action, bool force) +{ + if (!(desc->istate & IRQS_ONESHOT)) + return; +again: + chip_bus_lock(desc); + raw_spin_lock_irq(&desc->lock); + + /* + * Implausible though it may be we need to protect us against + * the following scenario: + * + * The thread is faster done than the hard interrupt handler + * on the other CPU. If we unmask the irq line then the + * interrupt can come in again and masks the line, leaves due + * to IRQS_INPROGRESS and the irq line is masked forever. + * + * This also serializes the state of shared oneshot handlers + * versus "desc->threads_onehsot |= action->thread_mask;" in + * irq_wake_thread(). See the comment there which explains the + * serialization. + */ + if (unlikely(irqd_irq_inprogress(&desc->irq_data))) { + raw_spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(desc); + cpu_relax(); + goto again; + } + + /* + * Now check again, whether the thread should run. Otherwise + * we would clear the threads_oneshot bit of this thread which + * was just set. 
+ */ + if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags)) + goto out_unlock; + + desc->threads_oneshot &= ~action->thread_mask; + + if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) && + irqd_irq_masked(&desc->irq_data)) + unmask_irq(desc); + +out_unlock: + raw_spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(desc); +} + +#ifdef CONFIG_SMP +/* + * Check whether we need to chasnge the affinity of the interrupt thread. + */ +static void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) +{ + cpumask_var_t mask; + + if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) + return; + + /* + * In case we are out of memory we set IRQTF_AFFINITY again and + * try again next time + */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + set_bit(IRQTF_AFFINITY, &action->thread_flags); + return; + } + + raw_spin_lock_irq(&desc->lock); + cpumask_copy(mask, desc->irq_data.affinity); + raw_spin_unlock_irq(&desc->lock); + + set_cpus_allowed_ptr(current, mask); + free_cpumask_var(mask); +} +#else +static inline void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { } +#endif + +/* + * Interrupts which are not explicitely requested as threaded + * interrupts rely on the implicit bh/preempt disable of the hard irq + * context. So we need to disable bh here to avoid deadlocks and other + * side effects. + */ +static irqreturn_t +irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action) +{ + irqreturn_t ret; + + local_bh_disable(); + ret = action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); + local_bh_enable(); + return ret; +} + +/* + * Interrupts explicitely requested as threaded interupts want to be + * preemtible - many of them need to sleep and wait for slow busses to + * complete. 
+ */ +static irqreturn_t irq_thread_fn(struct irq_desc *desc, + struct irqaction *action) +{ + irqreturn_t ret; + + ret = action->thread_fn(action->irq, action->dev_id); + irq_finalize_oneshot(desc, action, false); + return ret; +} + +/* + * Interrupt handler thread + */ +static int irq_thread(void *data) +{ + static const struct sched_param param = { + .sched_priority = MAX_USER_RT_PRIO/2, + }; + struct irqaction *action = data; + struct irq_desc *desc = irq_to_desc(action->irq); + irqreturn_t (*handler_fn)(struct irq_desc *desc, + struct irqaction *action); + int wake; + + if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; + else + handler_fn = irq_thread_fn; + + sched_setscheduler(current, SCHED_FIFO, &param); + current->irqaction = action; + + while (!irq_wait_for_interrupt(action)) { + + irq_thread_check_affinity(desc, action); + + atomic_inc(&desc->threads_active); + + raw_spin_lock_irq(&desc->lock); + if (unlikely(irqd_irq_disabled(&desc->irq_data))) { + /* + * CHECKME: We might need a dedicated + * IRQ_THREAD_PENDING flag here, which + * retriggers the thread in check_irq_resend() + * but AFAICT IRQS_PENDING should be fine as it + * retriggers the interrupt itself --- tglx + */ + desc->istate |= IRQS_PENDING; + raw_spin_unlock_irq(&desc->lock); + } else { + irqreturn_t action_ret; + + raw_spin_unlock_irq(&desc->lock); + action_ret = handler_fn(desc, action); + if (!noirqdebug) + note_interrupt(action->irq, desc, action_ret); + } + + wake = atomic_dec_and_test(&desc->threads_active); + + if (wake && waitqueue_active(&desc->wait_for_threads)) + wake_up(&desc->wait_for_threads); + } + + /* Prevent a stale desc->threads_oneshot */ + irq_finalize_oneshot(desc, action, true); + + /* + * Clear irqaction. Otherwise exit_irq_thread() would make + * fuzz about an active irq thread going into nirvana.
+ */ + current->irqaction = NULL; + return 0; +} + +/* + * Called from do_exit() + */ +void exit_irq_thread(void) +{ + struct task_struct *tsk = current; + struct irq_desc *desc; + + if (!tsk->irqaction) + return; + + printk(KERN_ERR + "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n", + tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq); + + desc = irq_to_desc(tsk->irqaction->irq); + + /* + * Prevent a stale desc->threads_oneshot. Must be called + * before setting the IRQTF_DIED flag. + */ + irq_finalize_oneshot(desc, tsk->irqaction, true); + + /* + * Set the THREAD DIED flag to prevent further wakeups of the + * soon to be gone threaded handler. + */ + set_bit(IRQTF_DIED, &tsk->irqaction->thread_flags); +} + +static void irq_setup_forced_threading(struct irqaction *new) +{ + if (!force_irqthreads) + return; + if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT)) + return; + + new->flags |= IRQF_ONESHOT; + + if (!new->thread_fn) { + set_bit(IRQTF_FORCED_THREAD, &new->thread_flags); + new->thread_fn = new->handler; + new->handler = irq_default_primary_handler; + } +} + +/* + * Internal function to register an irqaction - typically used to + * allocate special interrupts that are part of the architecture. + */ +static int +__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) +{ + struct irqaction *old, **old_ptr; + const char *old_name = NULL; + unsigned long flags, thread_mask = 0; + int ret, nested, shared = 0; + cpumask_var_t mask; + + if (!desc) + return -EINVAL; + + if (desc->irq_data.chip == &no_irq_chip) + return -ENOSYS; + /* + * Some drivers like serial.c use request_irq() heavily, + * so we have to be careful not to interfere with a + * running system. + */ + if (new->flags & IRQF_SAMPLE_RANDOM) { + /* + * This function might sleep, we want to call it first, + * outside of the atomic block.
+ * Yes, this might clear the entropy pool if the wrong + * driver is attempted to be loaded, without actually + * installing a new handler, but is this really a problem, + * only the sysadmin is able to do this. + */ + rand_initialize_irq(irq); + } + + /* + * Check whether the interrupt nests into another interrupt + * thread. + */ + nested = irq_settings_is_nested_thread(desc); + if (nested) { + if (!new->thread_fn) + return -EINVAL; + /* + * Replace the primary handler which was provided from + * the driver for non nested interrupt handling by the + * dummy function which warns when called. + */ + new->handler = irq_nested_primary_handler; + } else { + if (irq_settings_can_thread(desc)) + irq_setup_forced_threading(new); + } + + /* + * Create a handler thread when a thread function is supplied + * and the interrupt does not nest into another interrupt + * thread. + */ + if (new->thread_fn && !nested) { + struct task_struct *t; + + t = kthread_create(irq_thread, new, "irq/%d-%s", irq, + new->name); + if (IS_ERR(t)) + return PTR_ERR(t); + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code + * references an already freed task_struct. + */ + get_task_struct(t); + new->thread = t; + } + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out_thread; + } + + /* + * The following block of code has to be executed atomically + */ + raw_spin_lock_irqsave(&desc->lock, flags); + old_ptr = &desc->action; + old = *old_ptr; + if (old) { + /* + * Can't share interrupts unless both agree to and are + * the same type (level, edge, polarity). So both flag + * fields must have IRQF_SHARED set and the bits which + * set the trigger type must match. Also all must + * agree on ONESHOT. 
+ */ + if (!((old->flags & new->flags) & IRQF_SHARED) || + ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + ((old->flags ^ new->flags) & IRQF_ONESHOT)) { + old_name = old->name; + goto mismatch; + } + + /* All handlers must agree on per-cpuness */ + if ((old->flags & IRQF_PERCPU) != + (new->flags & IRQF_PERCPU)) + goto mismatch; + + /* add new interrupt at end of irq queue */ + do { + /* + * Or all existing action->thread_mask bits, + * so we can find the next zero bit for this + * new action. + */ + thread_mask |= old->thread_mask; + old_ptr = &old->next; + old = *old_ptr; + } while (old); + shared = 1; + } + + /* + * Setup the thread mask for this irqaction for ONESHOT. For + * !ONESHOT irqs the thread mask is 0 so we can avoid a + * conditional in irq_wake_thread(). + */ + if (new->flags & IRQF_ONESHOT) { + /* + * Unlikely to have 32 resp 64 irqs sharing one line, + * but who knows. + */ + if (thread_mask == ~0UL) { + ret = -EBUSY; + goto out_mask; + } + /* + * The thread_mask for the action is or'ed to + * desc->thread_active to indicate that the + * IRQF_ONESHOT thread handler has been woken, but not + * yet finished. The bit is cleared when a thread + * completes. When all threads of a shared interrupt + * line have completed desc->threads_active becomes + * zero and the interrupt line is unmasked. See + * handle.c:irq_wake_thread() for further information. + * + * If no thread is woken by primary (hard irq context) + * interrupt handlers, then desc->threads_active is + * also checked for zero to unmask the irq line in the + * affected hard irq flow handlers + * (handle_[fasteoi|level]_irq). + * + * The new action gets the first zero bit of + * thread_mask assigned. See the loop above which or's + * all existing action->thread_mask bits. 
+ */ + new->thread_mask = 1UL << ffz(thread_mask); + } + + if (!shared) { + init_waitqueue_head(&desc->wait_for_threads); + + /* Setup the type (level, edge polarity) if configured: */ + if (new->flags & IRQF_TRIGGER_MASK) { + ret = __irq_set_trigger(desc, irq, + new->flags & IRQF_TRIGGER_MASK); + + if (ret) + goto out_mask; + } + + desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \ + IRQS_ONESHOT | IRQS_WAITING); + irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); + + if (new->flags & IRQF_PERCPU) { + irqd_set(&desc->irq_data, IRQD_PER_CPU); + irq_settings_set_per_cpu(desc); + } + + if (new->flags & IRQF_ONESHOT) + desc->istate |= IRQS_ONESHOT; + + if (irq_settings_can_autoenable(desc)) + irq_startup(desc, true); + else + /* Undo nested disables: */ + desc->depth = 1; + + /* Exclude IRQ from balancing if requested */ + if (new->flags & IRQF_NOBALANCING) { + irq_settings_set_no_balancing(desc); + irqd_set(&desc->irq_data, IRQD_NO_BALANCING); + } + + /* Set default affinity mask once everything is setup */ + setup_affinity(irq, desc, mask); + + } else if (new->flags & IRQF_TRIGGER_MASK) { + unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; + unsigned int omsk = irq_settings_get_trigger_mask(desc); + + if (nmsk != omsk) + /* hope the handler works with current trigger mode */ + pr_warning("IRQ %d uses trigger mode %u; requested %u\n", + irq, omsk, nmsk); + } + + new->irq = irq; + *old_ptr = new; + + /* Reset broken irq detection when installing new handler */ + desc->irq_count = 0; + desc->irqs_unhandled = 0; + + /* + * Check whether we disabled the irq via the spurious handler + * before. Reenable it and give it another chance. + */ + if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { + desc->istate &= ~IRQS_SPURIOUS_DISABLED; + __enable_irq(desc, irq, false); + } + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + /* + * Strictly no need to wake it up, but hung_task complains + * when no hard interrupt wakes the thread up.
+ */ + if (new->thread) + wake_up_process(new->thread); + + register_irq_proc(irq, desc); + new->dir = NULL; + register_handler_proc(irq, new); + free_cpumask_var(mask); + + return 0; + +mismatch: +#ifdef CONFIG_DEBUG_SHIRQ + if (!(new->flags & IRQF_PROBE_SHARED)) { + printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); + if (old_name) + printk(KERN_ERR "current handler: %s\n", old_name); + dump_stack(); + } +#endif + ret = -EBUSY; + +out_mask: + raw_spin_unlock_irqrestore(&desc->lock, flags); + free_cpumask_var(mask); + +out_thread: + if (new->thread) { + struct task_struct *t = new->thread; + + new->thread = NULL; + if (likely(!test_bit(IRQTF_DIED, &new->thread_flags))) + kthread_stop(t); + put_task_struct(t); + } + return ret; +} + +/** + * setup_irq - setup an interrupt + * @irq: Interrupt line to setup + * @act: irqaction for the interrupt + * + * Used to statically setup interrupts in the early boot process. + */ +int setup_irq(unsigned int irq, struct irqaction *act) +{ + int retval; + struct irq_desc *desc = irq_to_desc(irq); + + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, act); + chip_bus_sync_unlock(desc); + + return retval; +} +EXPORT_SYMBOL_GPL(setup_irq); + + /* + * Internal function to unregister an irqaction - used to free + * regular and special interrupts that are part of the architecture. 
+ */ +static struct irqaction *__free_irq(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action, **action_ptr; + unsigned long flags; + + WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq); + + if (!desc) + return NULL; + + raw_spin_lock_irqsave(&desc->lock, flags); + + /* + * There can be multiple actions per IRQ descriptor, find the right + * one based on the dev_id: + */ + action_ptr = &desc->action; + for (;;) { + action = *action_ptr; + + if (!action) { + WARN(1, "Trying to free already-free IRQ %d\n", irq); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + return NULL; + } + + if (action->dev_id == dev_id) + break; + action_ptr = &action->next; + } + + /* Found it - now remove it from the list of entries: */ + *action_ptr = action->next; + + /* Currently used only by UML, might disappear one day: */ +#ifdef CONFIG_IRQ_RELEASE_METHOD + if (desc->irq_data.chip->release) + desc->irq_data.chip->release(irq, dev_id); +#endif + + /* If this was the last handler, shut down the IRQ line: */ + if (!desc->action) + irq_shutdown(desc); + +#ifdef CONFIG_SMP + /* make sure affinity_hint is cleaned up */ + if (WARN_ON_ONCE(desc->affinity_hint)) + desc->affinity_hint = NULL; +#endif + + raw_spin_unlock_irqrestore(&desc->lock, flags); + + unregister_handler_proc(irq, action); + + /* Make sure it's not being used on another CPU: */ + synchronize_irq(irq); + +#ifdef CONFIG_DEBUG_SHIRQ + /* + * It's a shared IRQ -- the driver ought to be prepared for an IRQ + * event to happen even now it's being freed, so let's make sure that + * is so by doing an extra call to the handler .... + * + * ( We do this after actually deregistering it, to make sure that a + * 'real' IRQ doesn't run in * parallel with our fake. 
) + */ + if (action->flags & IRQF_SHARED) { + local_irq_save(flags); + action->handler(irq, dev_id); + local_irq_restore(flags); + } +#endif + + if (action->thread) { + if (!test_bit(IRQTF_DIED, &action->thread_flags)) + kthread_stop(action->thread); + put_task_struct(action->thread); + } + + return action; +} + +/** + * remove_irq - free an interrupt + * @irq: Interrupt line to free + * @act: irqaction for the interrupt + * + * Used to remove interrupts statically setup by the early boot process. + */ +void remove_irq(unsigned int irq, struct irqaction *act) +{ + __free_irq(irq, act->dev_id); +} +EXPORT_SYMBOL_GPL(remove_irq); + +/** + * free_irq - free an interrupt allocated with request_irq + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Remove an interrupt handler. The handler is removed and if the + * interrupt line is no longer in use by any driver it is disabled. + * On a shared IRQ the caller must ensure the interrupt is disabled + * on the card it drives before calling this function. The function + * does not return until any executing interrupts for this IRQ + * have completed. + * + * This function must not be called from interrupt context. + */ +void free_irq(unsigned int irq, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc) + return; + +#ifdef CONFIG_SMP + if (WARN_ON(desc->affinity_notify)) + desc->affinity_notify = NULL; +#endif + + chip_bus_lock(desc); + kfree(__free_irq(irq, dev_id)); + chip_bus_sync_unlock(desc); +} +EXPORT_SYMBOL(free_irq); + +/** + * request_threaded_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. 
+ * Primary handler for threaded interrupts + * If NULL and thread_fn != NULL the default + * primary handler is installed + * @thread_fn: Function called from the irq handler thread + * If NULL, no irq thread is created + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. From the point this + * call is made your handler function may be invoked. Since + * your handler function must clear any interrupt the board + * raises, you must take care both to initialise your hardware + * and to set up the interrupt handler in the right order. + * + * If you want to set up a threaded irq handler for your device + * then you need to supply @handler and @thread_fn. @handler is + * still called in hard interrupt context and has to check + * whether the interrupt originates from the device. If yes it + * needs to disable the interrupt on the device and return + * IRQ_WAKE_THREAD which will wake up the handler thread and run + * @thread_fn. This split handler design is necessary to support + * shared interrupts. + * + * Dev_id must be globally unique. Normally the address of the + * device data structure is used as the cookie. Since the handler + * receives this value it makes sense to use it. + * + * If your interrupt is shared you must pass a non-NULL dev_id + * as this is required when freeing the interrupt. 
+ * + * Flags: + * + * IRQF_SHARED Interrupt is shared + * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy + * IRQF_TRIGGER_* Specify active edge(s) or level + * + */ +int request_threaded_irq(unsigned int irq, irq_handler_t handler, + irq_handler_t thread_fn, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irqaction *action; + struct irq_desc *desc; + int retval; + + /* + * Sanity-check: shared interrupts must pass in a real dev-ID, + * otherwise we'll have trouble later trying to figure out + * which interrupt is which (messes up the interrupt freeing + * logic etc). + */ + if ((irqflags & IRQF_SHARED) && !dev_id) + return -EINVAL; + + desc = irq_to_desc(irq); + if (!desc) + return -EINVAL; + + if (!irq_settings_can_request(desc)) + return -EINVAL; + + if (!handler) { + if (!thread_fn) + return -EINVAL; + handler = irq_default_primary_handler; + } + + action = kzalloc(sizeof(struct irqaction), GFP_KERNEL); + if (!action) + return -ENOMEM; + + action->handler = handler; + action->thread_fn = thread_fn; + action->flags = irqflags; + action->name = devname; + action->dev_id = dev_id; + + chip_bus_lock(desc); + retval = __setup_irq(irq, desc, action); + chip_bus_sync_unlock(desc); + + if (retval) + kfree(action); + +#ifdef CONFIG_DEBUG_SHIRQ_FIXME + if (!retval && (irqflags & IRQF_SHARED)) { + /* + * It's a shared IRQ -- the driver ought to be prepared for it + * to happen immediately, so let's make sure.... + * We disable the irq to make sure that a 'real' IRQ doesn't + * run in parallel with our fake. + */ + unsigned long flags; + + disable_irq(irq); + local_irq_save(flags); + + handler(irq, dev_id); + + local_irq_restore(flags); + enable_irq(irq); + } +#endif + return retval; +} +EXPORT_SYMBOL(request_threaded_irq); + +/** + * request_any_context_irq - allocate an interrupt line + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs. + * Threaded handler for threaded interrupts. 
+ * @flags: Interrupt type flags + * @name: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * This call allocates interrupt resources and enables the + * interrupt line and IRQ handling. It selects either a + * hardirq or threaded handling method depending on the + * context. + * + * On failure, it returns a negative value. On success, + * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED. + */ +int request_any_context_irq(unsigned int irq, irq_handler_t handler, + unsigned long flags, const char *name, void *dev_id) +{ + struct irq_desc *desc = irq_to_desc(irq); + int ret; + + if (!desc) + return -EINVAL; + + if (irq_settings_is_nested_thread(desc)) { + ret = request_threaded_irq(irq, NULL, handler, + flags, name, dev_id); + return !ret ? IRQC_IS_NESTED : ret; + } + + ret = request_irq(irq, handler, flags, name, dev_id); + return !ret ? IRQC_IS_HARDIRQ : ret; +} +EXPORT_SYMBOL_GPL(request_any_context_irq); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c new file mode 100644 index 00000000..c3c89751 --- /dev/null +++ b/kernel/irq/migration.c @@ -0,0 +1,81 @@ + +#include +#include + +#include "internals.h" + +void irq_move_masked_irq(struct irq_data *idata) +{ + struct irq_desc *desc = irq_data_to_desc(idata); + struct irq_chip *chip = idata->chip; + + if (likely(!irqd_is_setaffinity_pending(&desc->irq_data))) + return; + + /* + * Paranoia: cpu-local interrupts shouldn't be calling in here anyway. + */ + if (!irqd_can_balance(&desc->irq_data)) { + WARN_ON(1); + return; + } + + irqd_clr_move_pending(&desc->irq_data); + + if (unlikely(cpumask_empty(desc->pending_mask))) + return; + + if (!chip->irq_set_affinity) + return; + + assert_raw_spin_locked(&desc->lock); + + /* + * If there was a valid mask to work with, please + * do the disable, re-program, enable sequence. 
+ * This is *not* particularly important for level triggered + * but in a edge trigger case, we might be setting rte + * when an active trigger is coming in. This could + * cause some ioapics to mal-function. + * Being paranoid i guess! + * + * For correct operation this depends on the caller + * masking the irqs. + */ + if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) + < nr_cpu_ids)) { + int ret = chip->irq_set_affinity(&desc->irq_data, + desc->pending_mask, false); + switch (ret) { + case IRQ_SET_MASK_OK: + cpumask_copy(desc->irq_data.affinity, desc->pending_mask); + case IRQ_SET_MASK_OK_NOCOPY: + irq_set_thread_affinity(desc); + } + } + + cpumask_clear(desc->pending_mask); +} + +void irq_move_irq(struct irq_data *idata) +{ + bool masked; + + if (likely(!irqd_is_setaffinity_pending(idata))) + return; + + if (unlikely(irqd_irq_disabled(idata))) + return; + + /* + * Be careful vs. already masked interrupts. If this is a + * threaded interrupt with ONESHOT set, we can end up with an + * interrupt storm. + */ + masked = irqd_irq_masked(idata); + if (!masked) + idata->chip->irq_mask(idata); + irq_move_masked_irq(idata); + if (!masked) + idata->chip->irq_unmask(idata); +} diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c new file mode 100644 index 00000000..bbf72985 --- /dev/null +++ b/kernel/irq/pm.c @@ -0,0 +1,131 @@ +/* + * linux/kernel/irq/pm.c + * + * Copyright (C) 2009 Rafael J. Wysocki , Novell Inc. + * + * This file contains power management functions related to interrupts. + */ + +#include +#include +#include +#include + +#include "internals.h" + +/** + * suspend_device_irqs - disable all currently enabled interrupt lines + * + * During system-wide suspend or hibernation device drivers need to be prevented + * from receiving interrupts and this function is provided for this purpose. + * It marks all interrupt lines in use, except for the timer ones, as disabled + * and sets the IRQS_SUSPENDED flag for each of them. 
+ */ +void suspend_device_irqs(void) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) { + unsigned long flags; + + raw_spin_lock_irqsave(&desc->lock, flags); + __disable_irq(desc, irq, true); + raw_spin_unlock_irqrestore(&desc->lock, flags); + } + + for_each_irq_desc(irq, desc) + if (desc->istate & IRQS_SUSPENDED) + synchronize_irq(irq); +} +EXPORT_SYMBOL_GPL(suspend_device_irqs); + +void resume_irqs(bool want_early) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) { + unsigned long flags; + bool is_early = desc->action && + desc->action->flags & IRQF_EARLY_RESUME; + + if (is_early != want_early) + continue; + + raw_spin_lock_irqsave(&desc->lock, flags); + __enable_irq(desc, irq, true); + raw_spin_unlock_irqrestore(&desc->lock, flags); + } +} +EXPORT_SYMBOL_GPL(resume_irqs); + +/** + * irq_pm_syscore_resume - enable interrupt lines early + * + * Resume callback installed in irq_pm_syscore_ops. It calls + * resume_irqs(true), so only interrupt lines whose first action has + * %IRQF_EARLY_RESUME set are re-enabled here; all other lines are + * skipped and left for resume_irqs(false). + */ +static void irq_pm_syscore_resume(void) +{ + resume_irqs(true); +} + +static struct syscore_ops irq_pm_syscore_ops = { + .resume = irq_pm_syscore_resume, +}; + +static int __init irq_pm_init_ops(void) +{ + register_syscore_ops(&irq_pm_syscore_ops); + return 0; +} + +device_initcall(irq_pm_init_ops); + +/** + * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs() + * + * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously + * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag + * set as well as those with %IRQF_FORCE_RESUME. 
+ */ +void resume_device_irqs(void) +{ + resume_irqs(false); +} +EXPORT_SYMBOL_GPL(resume_device_irqs); + +/** + * check_wakeup_irqs - check if any wake-up interrupts are pending + */ +int check_wakeup_irqs(void) +{ + struct irq_desc *desc; + int irq; + + for_each_irq_desc(irq, desc) { + if (irqd_is_wakeup_set(&desc->irq_data)) { + if (desc->istate & IRQS_PENDING) { + pr_info("Wakeup IRQ %d %s pending, suspend aborted\n", + irq, + desc->action && desc->action->name ? + desc->action->name : ""); + return -EBUSY; + } + continue; + } + /* + * Check the non wakeup interrupts whether they need + * to be masked before finally going into suspend + * state. That's for hardware which has no wakeup + * source configuration facility. The chip + * implementation indicates that with + * IRQCHIP_MASK_ON_SUSPEND. + */ + if (desc->istate & IRQS_SUSPENDED && + irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND) + mask_irq(desc); + } + + return 0; +} diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c new file mode 100644 index 00000000..4bd4faa6 --- /dev/null +++ b/kernel/irq/proc.c @@ -0,0 +1,486 @@ +/* + * linux/kernel/irq/proc.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains the /proc/irq/ handling code. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "internals.h" + +static struct proc_dir_entry *root_irq_dir; + +#ifdef CONFIG_SMP + +static int show_irq_affinity(int type, struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long)m->private); + const struct cpumask *mask = desc->irq_data.affinity; + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (irqd_is_setaffinity_pending(&desc->irq_data)) + mask = desc->pending_mask; +#endif + if (type) + seq_cpumask_list(m, mask); + else + seq_cpumask(m, mask); + seq_putc(m, '\n'); + return 0; +} + +static int irq_affinity_hint_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long)m->private); + unsigned long flags; + cpumask_var_t mask; + + if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + + raw_spin_lock_irqsave(&desc->lock, flags); + if (desc->affinity_hint) + cpumask_copy(mask, desc->affinity_hint); + raw_spin_unlock_irqrestore(&desc->lock, flags); + + seq_cpumask(m, mask); + seq_putc(m, '\n'); + free_cpumask_var(mask); + + return 0; +} + +#ifndef is_affinity_mask_valid +#define is_affinity_mask_valid(val) 1 +#endif + +int no_irq_affinity; +static int irq_affinity_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(0, m, v); +} + +static int irq_affinity_list_proc_show(struct seq_file *m, void *v) +{ + return show_irq_affinity(1, m, v); +} + + +static ssize_t write_irq_affinity(int type, struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data; + cpumask_var_t new_value; + int err; + + if (!irq_can_set_affinity(irq) || no_irq_affinity) + return -EIO; + + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + if (type) + err = cpumask_parselist_user(buffer, count, new_value); + else + err = cpumask_parse_user(buffer, count, new_value); + if (err) + goto free_cpumask; + + if (!is_affinity_mask_valid(new_value)) { + 
err = -EINVAL; + goto free_cpumask; + } + + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. + */ + if (!cpumask_intersects(new_value, cpu_online_mask)) { + /* Special case for empty set - allow the architecture + code to set default SMP affinity. */ + err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count; + } else { + irq_set_affinity(irq, new_value); + err = count; + } + +free_cpumask: + free_cpumask_var(new_value); + return err; +} + +static ssize_t irq_affinity_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + return write_irq_affinity(0, file, buffer, count, pos); +} + +static ssize_t irq_affinity_list_proc_write(struct file *file, + const char __user *buffer, size_t count, loff_t *pos) +{ + return write_irq_affinity(1, file, buffer, count, pos); +} + +static int irq_affinity_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_proc_show, PDE(inode)->data); +} + +static int irq_affinity_list_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data); +} + +static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data); +} + +static const struct file_operations irq_affinity_proc_fops = { + .open = irq_affinity_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = irq_affinity_proc_write, +}; + +static const struct file_operations irq_affinity_hint_proc_fops = { + .open = irq_affinity_hint_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static const struct file_operations irq_affinity_list_proc_fops = { + .open = irq_affinity_list_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + 
.write = irq_affinity_list_proc_write, +}; + +static int default_affinity_show(struct seq_file *m, void *v) +{ + seq_cpumask(m, irq_default_affinity); + seq_putc(m, '\n'); + return 0; +} + +static ssize_t default_affinity_write(struct file *file, + const char __user *buffer, size_t count, loff_t *ppos) +{ + cpumask_var_t new_value; + int err; + + if (!alloc_cpumask_var(&new_value, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(buffer, count, new_value); + if (err) + goto out; + + if (!is_affinity_mask_valid(new_value)) { + err = -EINVAL; + goto out; + } + + /* + * Do not allow disabling IRQs completely - it's a too easy + * way to make the system unusable accidentally :-) At least + * one online CPU still has to be targeted. + */ + if (!cpumask_intersects(new_value, cpu_online_mask)) { + err = -EINVAL; + goto out; + } + + cpumask_copy(irq_default_affinity, new_value); + err = count; + +out: + free_cpumask_var(new_value); + return err; +} + +static int default_affinity_open(struct inode *inode, struct file *file) +{ + return single_open(file, default_affinity_show, PDE(inode)->data); +} + +static const struct file_operations default_affinity_proc_fops = { + .open = default_affinity_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = default_affinity_write, +}; + +static int irq_node_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "%d\n", desc->irq_data.node); + return 0; +} + +static int irq_node_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_node_proc_show, PDE(inode)->data); +} + +static const struct file_operations irq_node_proc_fops = { + .open = irq_node_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +static int irq_spurious_proc_show(struct seq_file *m, void *v) +{ + struct irq_desc *desc = irq_to_desc((long) m->private); + + seq_printf(m, "count %u\n" 
"unhandled %u\n" "last_unhandled %u ms\n", + desc->irq_count, desc->irqs_unhandled, + jiffies_to_msecs(desc->last_unhandled)); + return 0; +} + +static int irq_spurious_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, irq_spurious_proc_show, PDE(inode)->data); +} + +static const struct file_operations irq_spurious_proc_fops = { + .open = irq_spurious_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#define MAX_NAMELEN 128 + +static int name_unique(unsigned int irq, struct irqaction *new_action) +{ + struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + unsigned long flags; + int ret = 1; + + raw_spin_lock_irqsave(&desc->lock, flags); + for (action = desc->action ; action; action = action->next) { + if ((action != new_action) && action->name && + !strcmp(new_action->name, action->name)) { + ret = 0; + break; + } + } + raw_spin_unlock_irqrestore(&desc->lock, flags); + return ret; +} + +void register_handler_proc(unsigned int irq, struct irqaction *action) +{ + char name [MAX_NAMELEN]; + struct irq_desc *desc = irq_to_desc(irq); + + if (!desc->dir || action->dir || !action->name || + !name_unique(irq, action)) + return; + + memset(name, 0, MAX_NAMELEN); + snprintf(name, MAX_NAMELEN, "%s", action->name); + + /* create /proc/irq/1234/handler/ */ + action->dir = proc_mkdir(name, desc->dir); +} + +#undef MAX_NAMELEN + +/* Room for the 10 decimal digits of a 32-bit unsigned irq number plus NUL. */ +#define MAX_NAMELEN 11 + +/* + * register_irq_proc - create the /proc/irq/<irq> directory and its entries. + * + * Does nothing when the /proc/irq root is missing, when the descriptor uses + * no_irq_chip, or when desc->dir is already set. The return values of the + * proc_create_data() calls are not checked. + */ +void register_irq_proc(unsigned int irq, struct irq_desc *desc) +{ + char name [MAX_NAMELEN]; + + if (!root_irq_dir || (desc->irq_data.chip == &no_irq_chip) || desc->dir) + return; + + memset(name, 0, MAX_NAMELEN); + /* irq is unsigned: format with %u and bound the write to the buffer */ + snprintf(name, MAX_NAMELEN, "%u", irq); + + /* create /proc/irq/1234 */ + desc->dir = proc_mkdir(name, root_irq_dir); + if (!desc->dir) + return; + +#ifdef CONFIG_SMP + /* create /proc/irq/<irq>/smp_affinity */ + proc_create_data("smp_affinity", 0600, desc->dir, + &irq_affinity_proc_fops, (void *)(long)irq); + + /* create /proc/irq/<irq>/affinity_hint */ 
+ proc_create_data("affinity_hint", 0400, desc->dir, + &irq_affinity_hint_proc_fops, (void *)(long)irq); + + /* create /proc/irq/<irq>/smp_affinity_list */ + proc_create_data("smp_affinity_list", 0600, desc->dir, + &irq_affinity_list_proc_fops, (void *)(long)irq); + + proc_create_data("node", 0444, desc->dir, + &irq_node_proc_fops, (void *)(long)irq); +#endif + + proc_create_data("spurious", 0444, desc->dir, + &irq_spurious_proc_fops, (void *)(long)irq); +} + +/* + * unregister_irq_proc - remove the entries created by register_irq_proc() + * and then the /proc/irq/<irq> directory itself. + */ +void unregister_irq_proc(unsigned int irq, struct irq_desc *desc) +{ + char name [MAX_NAMELEN]; + + if (!root_irq_dir || !desc->dir) + return; +#ifdef CONFIG_SMP + remove_proc_entry("smp_affinity", desc->dir); + remove_proc_entry("affinity_hint", desc->dir); + remove_proc_entry("smp_affinity_list", desc->dir); + remove_proc_entry("node", desc->dir); +#endif + remove_proc_entry("spurious", desc->dir); + + memset(name, 0, MAX_NAMELEN); + /* must produce the same name register_irq_proc() used */ + snprintf(name, MAX_NAMELEN, "%u", irq); + remove_proc_entry(name, root_irq_dir); +} + +#undef MAX_NAMELEN + +void unregister_handler_proc(unsigned int irq, struct irqaction *action) +{ + if (action->dir) { + struct irq_desc *desc = irq_to_desc(irq); + + remove_proc_entry(action->dir->name, desc->dir); + } +} + +static void register_default_affinity_proc(void) +{ +#ifdef CONFIG_SMP + proc_create("irq/default_smp_affinity", 0600, NULL, + &default_affinity_proc_fops); +#endif +} + +void init_irq_proc(void) +{ + unsigned int irq; + struct irq_desc *desc; + + /* create /proc/irq */ + root_irq_dir = proc_mkdir("irq", NULL); + if (!root_irq_dir) + return; + + register_default_affinity_proc(); + + /* + * Create entries for all existing IRQs. 
+ */ + for_each_irq_desc(irq, desc) { + if (!desc) + continue; + + register_irq_proc(irq, desc); + } +} + +#ifdef CONFIG_GENERIC_IRQ_SHOW + +int __weak arch_show_interrupts(struct seq_file *p, int prec) +{ + return 0; +} + +#ifndef ACTUAL_NR_IRQS +# define ACTUAL_NR_IRQS nr_irqs +#endif + +int show_interrupts(struct seq_file *p, void *v) +{ + static int prec; + + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > ACTUAL_NR_IRQS) + return 0; + + if (i == ACTUAL_NR_IRQS) + return arch_show_interrupts(p, prec); + + /* print header and calculate the width of the first column */ + if (i == 0) { + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + seq_printf(p, "%*s", prec + 8, ""); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d", j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + if (!desc) + return 0; + + raw_spin_lock_irqsave(&desc->lock, flags); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + + if (desc->irq_data.chip) { + if (desc->irq_data.chip->irq_print_chip) + desc->irq_data.chip->irq_print_chip(&desc->irq_data, p); + else if (desc->irq_data.chip->name) + seq_printf(p, " %8s", desc->irq_data.chip->name); + else + seq_printf(p, " %8s", "-"); + } else { + seq_printf(p, " %8s", "None"); + } +#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL + seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? 
"Level" : "Edge"); +#endif + if (desc->name) + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + raw_spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} +#endif diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c new file mode 100644 index 00000000..14dd5761 --- /dev/null +++ b/kernel/irq/resend.c @@ -0,0 +1,80 @@ +/* + * linux/kernel/irq/resend.c + * + * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar + * Copyright (C) 2005-2006, Thomas Gleixner + * + * This file contains the IRQ-resend code + * + * If the interrupt is waiting to be processed, we try to re-run it. + * We can't directly run it from here since the caller might be in an + * interrupt-protected region. Not all irq controller chips can + * retrigger interrupts at the hardware level, so in those cases + * we allow the resending of IRQs via a tasklet. + */ + +#include +#include +#include +#include + +#include "internals.h" + +#ifdef CONFIG_HARDIRQS_SW_RESEND + +/* Bitmap to handle software resend of interrupts: */ +static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); + +/* + * Run software resends of IRQ's + */ +static void resend_irqs(unsigned long arg) +{ + struct irq_desc *desc; + int irq; + + while (!bitmap_empty(irqs_resend, nr_irqs)) { + irq = find_first_bit(irqs_resend, nr_irqs); + clear_bit(irq, irqs_resend); + desc = irq_to_desc(irq); + local_irq_disable(); + desc->handle_irq(irq, desc); + local_irq_enable(); + } +} + +/* Tasklet to handle resend: */ +static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); + +#endif + +/* + * IRQ resend + * + * Is called with interrupts disabled and desc->lock held. + */ +void check_irq_resend(struct irq_desc *desc, unsigned int irq) +{ + /* + * We do not resend level type interrupts. Level type + * interrupts are resent by hardware when they are still + * active. 
+ */ + if (irq_settings_is_level(desc)) + return; + if (desc->istate & IRQS_REPLAY) + return; + if (desc->istate & IRQS_PENDING) { + desc->istate &= ~IRQS_PENDING; + desc->istate |= IRQS_REPLAY; + + if (!desc->irq_data.chip->irq_retrigger || + !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { +#ifdef CONFIG_HARDIRQS_SW_RESEND + /* Set it pending and activate the softirq: */ + set_bit(irq, irqs_resend); + tasklet_schedule(&resend_tasklet); +#endif + } + } +} diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h new file mode 100644 index 00000000..f1667833 --- /dev/null +++ b/kernel/irq/settings.h @@ -0,0 +1,142 @@ +/* + * Internal header to deal with irq_desc->status which will be renamed + * to irq_desc->settings. + */ +enum { + _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS, + _IRQ_PER_CPU = IRQ_PER_CPU, + _IRQ_LEVEL = IRQ_LEVEL, + _IRQ_NOPROBE = IRQ_NOPROBE, + _IRQ_NOREQUEST = IRQ_NOREQUEST, + _IRQ_NOTHREAD = IRQ_NOTHREAD, + _IRQ_NOAUTOEN = IRQ_NOAUTOEN, + _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, + _IRQ_NO_BALANCING = IRQ_NO_BALANCING, + _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, + _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, +}; + +#define IRQ_PER_CPU GOT_YOU_MORON +#define IRQ_NO_BALANCING GOT_YOU_MORON +#define IRQ_LEVEL GOT_YOU_MORON +#define IRQ_NOPROBE GOT_YOU_MORON +#define IRQ_NOREQUEST GOT_YOU_MORON +#define IRQ_NOTHREAD GOT_YOU_MORON +#define IRQ_NOAUTOEN GOT_YOU_MORON +#define IRQ_NESTED_THREAD GOT_YOU_MORON +#undef IRQF_MODIFY_MASK +#define IRQF_MODIFY_MASK GOT_YOU_MORON + +static inline void +irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set) +{ + desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK); + desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK); +} + +static inline bool irq_settings_is_per_cpu(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_PER_CPU; +} + +static inline void irq_settings_set_per_cpu(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_PER_CPU; +} + +static inline void 
irq_settings_set_no_balancing(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NO_BALANCING; +} + +static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_NO_BALANCING; +} + +static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc) +{ + return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK; +} + +static inline void +irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask) +{ + desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK; + desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK; +} + +static inline bool irq_settings_is_level(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_LEVEL; +} + +static inline void irq_settings_clr_level(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_LEVEL; +} + +static inline void irq_settings_set_level(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_LEVEL; +} + +static inline bool irq_settings_can_request(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOREQUEST); +} + +static inline void irq_settings_clr_norequest(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_NOREQUEST; +} + +static inline void irq_settings_set_norequest(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NOREQUEST; +} + +static inline bool irq_settings_can_thread(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOTHREAD); +} + +static inline void irq_settings_clr_nothread(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_NOTHREAD; +} + +static inline void irq_settings_set_nothread(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NOTHREAD; +} + +static inline bool irq_settings_can_probe(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOPROBE); +} + +static inline void irq_settings_clr_noprobe(struct irq_desc *desc) +{ + desc->status_use_accessors &= ~_IRQ_NOPROBE; +} + +static inline void 
irq_settings_set_noprobe(struct irq_desc *desc) +{ + desc->status_use_accessors |= _IRQ_NOPROBE; +} + +static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_MOVE_PCNTXT; +} + +static inline bool irq_settings_can_autoenable(struct irq_desc *desc) +{ + return !(desc->status_use_accessors & _IRQ_NOAUTOEN); +} + +static inline bool irq_settings_is_nested_thread(struct irq_desc *desc) +{ + return desc->status_use_accessors & _IRQ_NESTED_THREAD; +} diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c new file mode 100644 index 00000000..dc813a94 --- /dev/null +++ b/kernel/irq/spurious.c @@ -0,0 +1,364 @@ +/* + * linux/kernel/irq/spurious.c + * + * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar + * + * This file contains spurious interrupt handling. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "internals.h" + +static int irqfixup __read_mostly; + +#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10) +static void poll_spurious_irqs(unsigned long dummy); +static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0); +static int irq_poll_cpu; +static atomic_t irq_poll_active; + +/* + * We wait here for a poller to finish. + * + * If the poll runs on this CPU, then we yell loudly and return + * false. That will leave the interrupt line disabled in the worst + * case, but it should never happen. + * + * We wait until the poller is done and then recheck disabled and + * action (about to be disabled). Only if it's still active, we return + * true and let the handler run. 
+ */ +bool irq_wait_for_poll(struct irq_desc *desc) +{ + if (WARN_ONCE(irq_poll_cpu == smp_processor_id(), + "irq poll in progress on cpu %d for irq %d\n", + smp_processor_id(), desc->irq_data.irq)) + return false; + +#ifdef CONFIG_SMP + do { + raw_spin_unlock(&desc->lock); + while (irqd_irq_inprogress(&desc->irq_data)) + cpu_relax(); + raw_spin_lock(&desc->lock); + } while (irqd_irq_inprogress(&desc->irq_data)); + /* Might have been disabled in meantime */ + return !irqd_irq_disabled(&desc->irq_data) && desc->action; +#else + return false; +#endif +} + + +/* + * Recovery handler for misrouted interrupts. + */ +static int try_one_irq(int irq, struct irq_desc *desc, bool force) +{ + irqreturn_t ret = IRQ_NONE; + struct irqaction *action; + + raw_spin_lock(&desc->lock); + + /* PER_CPU and nested thread interrupts are never polled */ + if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc)) + goto out; + + /* + * Do not poll disabled interrupts unless the spurious + * disabled poller asks explicitely. + */ + if (irqd_irq_disabled(&desc->irq_data) && !force) + goto out; + + /* + * All handlers must agree on IRQF_SHARED, so we test just the + * first. Check for action->next as well. 
+ */ + action = desc->action; + if (!action || !(action->flags & IRQF_SHARED) || + (action->flags & __IRQF_TIMER) || + (action->handler(irq, action->dev_id) == IRQ_HANDLED) || + !action->next) + goto out; + + /* Already running on another processor */ + if (irqd_irq_inprogress(&desc->irq_data)) { + /* + * Already running: If it is shared get the other + * CPU to go looking for our mystery interrupt too + */ + desc->istate |= IRQS_PENDING; + goto out; + } + + /* Mark it poll in progress */ + desc->istate |= IRQS_POLL_INPROGRESS; + do { + if (handle_irq_event(desc) == IRQ_HANDLED) + ret = IRQ_HANDLED; + action = desc->action; + } while ((desc->istate & IRQS_PENDING) && action); + desc->istate &= ~IRQS_POLL_INPROGRESS; +out: + raw_spin_unlock(&desc->lock); + return ret == IRQ_HANDLED; +} + +static int misrouted_irq(int irq) +{ + struct irq_desc *desc; + int i, ok = 0; + + if (atomic_inc_return(&irq_poll_active) != 1) + goto out; + + irq_poll_cpu = smp_processor_id(); + + for_each_irq_desc(i, desc) { + if (!i) + continue; + + if (i == irq) /* Already tried */ + continue; + + if (try_one_irq(i, desc, false)) + ok = 1; + } +out: + atomic_dec(&irq_poll_active); + /* So the caller can adjust the irq error counts */ + return ok; +} + +static void poll_spurious_irqs(unsigned long dummy) +{ + struct irq_desc *desc; + int i; + + if (atomic_inc_return(&irq_poll_active) != 1) + goto out; + irq_poll_cpu = smp_processor_id(); + + for_each_irq_desc(i, desc) { + unsigned int state; + + if (!i) + continue; + + /* Racy but it doesn't matter */ + state = desc->istate; + barrier(); + if (!(state & IRQS_SPURIOUS_DISABLED)) + continue; + + local_irq_disable(); + try_one_irq(i, desc, true); + local_irq_enable(); + } +out: + atomic_dec(&irq_poll_active); + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); +} + +static inline int bad_action_ret(irqreturn_t action_ret) +{ + if (likely(action_ret <= (IRQ_HANDLED | IRQ_WAKE_THREAD))) + return 0; + return 1; +} + +/* + 
* If 99,900 of the previous 100,000 interrupts have not been handled + * then assume that the IRQ is stuck in some manner. Drop a diagnostic + * and try to turn the IRQ off. + * + * (The other 100-of-100,000 interrupts may have been a correctly + * functioning device sharing an IRQ with the failing one) + */ +static void +__report_bad_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) +{ + struct irqaction *action; + unsigned long flags; + + if (bad_action_ret(action_ret)) { + printk(KERN_ERR "irq event %d: bogus return value %x\n", + irq, action_ret); + } else { + printk(KERN_ERR "irq %d: nobody cared (try booting with " + "the \"irqpoll\" option)\n", irq); + } + dump_stack(); + printk(KERN_ERR "handlers:\n"); + + /* + * We need to take desc->lock here. note_interrupt() is called + * w/o desc->lock held, but IRQ_PROGRESS set. We might race + * with something else removing an action. It's ok to take + * desc->lock here. See synchronize_irq(). + */ + raw_spin_lock_irqsave(&desc->lock, flags); + action = desc->action; + while (action) { + printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); + if (action->thread_fn) + printk(KERN_CONT " threaded [<%p>] %pf", + action->thread_fn, action->thread_fn); + printk(KERN_CONT "\n"); + action = action->next; + } + raw_spin_unlock_irqrestore(&desc->lock, flags); +} + +static void +report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) +{ + static int count = 100; + + if (count > 0) { + count--; + __report_bad_irq(irq, desc, action_ret); + } +} + +static inline int +try_misrouted_irq(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) +{ + struct irqaction *action; + + if (!irqfixup) + return 0; + + /* We didn't actually handle the IRQ - see if it was misrouted? 
*/ + if (action_ret == IRQ_NONE) + return 1; + + /* + * But for 'irqfixup == 2' we also do it for handled interrupts if + * they are marked as IRQF_IRQPOLL (or for irq zero, which is the + * traditional PC timer interrupt.. Legacy) + */ + if (irqfixup < 2) + return 0; + + if (!irq) + return 1; + + /* + * Since we don't get the descriptor lock, "action" can + * change under us. We don't really care, but we don't + * want to follow a NULL pointer. So tell the compiler to + * just load it once by using a barrier. + */ + action = desc->action; + barrier(); + return action && (action->flags & IRQF_IRQPOLL); +} + +void note_interrupt(unsigned int irq, struct irq_desc *desc, + irqreturn_t action_ret) +{ + if (desc->istate & IRQS_POLL_INPROGRESS) + return; + + /* we get here again via the threaded handler */ + if (action_ret == IRQ_WAKE_THREAD) + return; + + if (bad_action_ret(action_ret)) { + report_bad_irq(irq, desc, action_ret); + return; + } + + if (unlikely(action_ret == IRQ_NONE)) { + /* + * If we are seeing only the odd spurious IRQ caused by + * bus asynchronicity then don't eventually trigger an error, + * otherwise the counter becomes a doomsday timer for otherwise + * working systems + */ + if (time_after(jiffies, desc->last_unhandled + HZ/10)) + desc->irqs_unhandled = 1; + else + desc->irqs_unhandled++; + desc->last_unhandled = jiffies; + } + + if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { + int ok = misrouted_irq(irq); + if (action_ret == IRQ_NONE) + desc->irqs_unhandled -= ok; + } + + desc->irq_count++; + if (likely(desc->irq_count < 100000)) + return; + + desc->irq_count = 0; + if (unlikely(desc->irqs_unhandled > 99900)) { + /* + * The interrupt is stuck + */ + __report_bad_irq(irq, desc, action_ret); + /* + * Now kill the IRQ + */ + printk(KERN_EMERG "Disabling IRQ #%d\n", irq); + desc->istate |= IRQS_SPURIOUS_DISABLED; + desc->depth++; + irq_disable(desc); + + mod_timer(&poll_spurious_irq_timer, + jiffies + POLL_SPURIOUS_IRQ_INTERVAL); + } + 
desc->irqs_unhandled = 0; +} + +int noirqdebug __read_mostly; + +int noirqdebug_setup(char *str) +{ + noirqdebug = 1; + printk(KERN_INFO "IRQ lockup detection disabled\n"); + + return 1; +} + +__setup("noirqdebug", noirqdebug_setup); +module_param(noirqdebug, bool, 0644); +MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); + +static int __init irqfixup_setup(char *str) +{ + irqfixup = 1; + printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); + printk(KERN_WARNING "This may impact system performance.\n"); + + return 1; +} + +__setup("irqfixup", irqfixup_setup); +module_param(irqfixup, int, 0644); + +static int __init irqpoll_setup(char *str) +{ + irqfixup = 2; + printk(KERN_WARNING "Misrouted IRQ fixup and polling support " + "enabled\n"); + printk(KERN_WARNING "This may significantly impact system " + "performance\n"); + return 1; +} + +__setup("irqpoll", irqpoll_setup); diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 00000000..c58fa7da --- /dev/null +++ b/kernel/irq_work.c @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * + * Provides a framework for enqueueing and running callbacks from hardirq + * context. The enqueueing is NMI-safe. + */ + +#include +#include +#include +#include + +/* + * An entry can be in one of four states: + * + * free NULL, 0 -> {claimed} : free to be used + * claimed NULL, 3 -> {pending} : claimed to be enqueued + * pending next, 3 -> {busy} : queued, pending callback + * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed + * + * We use the lower two bits of the next pointer to keep PENDING and BUSY + * flags. 
+ */ + +#define IRQ_WORK_PENDING 1UL +#define IRQ_WORK_BUSY 2UL +#define IRQ_WORK_FLAGS 3UL + +static inline bool irq_work_is_set(struct irq_work *entry, int flags) +{ + return (unsigned long)entry->next & flags; +} + +static inline struct irq_work *irq_work_next(struct irq_work *entry) +{ + unsigned long next = (unsigned long)entry->next; + next &= ~IRQ_WORK_FLAGS; + return (struct irq_work *)next; +} + +static inline struct irq_work *next_flags(struct irq_work *entry, int flags) +{ + unsigned long next = (unsigned long)entry; + next |= flags; + return (struct irq_work *)next; +} + +static DEFINE_PER_CPU(struct irq_work *, irq_work_list); + +/* + * Claim the entry so that no one else will poke at it. + */ +static bool irq_work_claim(struct irq_work *entry) +{ + struct irq_work *next, *nflags; + + do { + next = entry->next; + if ((unsigned long)next & IRQ_WORK_PENDING) + return false; + nflags = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(&entry->next, next, nflags) != next); + + return true; +} + + +void __weak arch_irq_work_raise(void) +{ + /* + * Lame architectures will get the timer tick callback + */ +} + +/* + * Queue the entry and raise the IPI if needed. + */ +static void __irq_work_queue(struct irq_work *entry) +{ + struct irq_work *next; + + preempt_disable(); + + do { + next = __this_cpu_read(irq_work_list); + /* Can assign non-atomic because we keep the flags set. */ + entry->next = next_flags(next, IRQ_WORK_FLAGS); + } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next); + + /* The list was empty, raise self-interrupt to start processing. */ + if (!irq_work_next(entry)) + arch_irq_work_raise(); + + preempt_enable(); +} + +/* + * Enqueue the irq_work @entry, returns true on success, failure when the + * @entry was already enqueued by someone else. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue(struct irq_work *entry) +{ + if (!irq_work_claim(entry)) { + /* + * Already enqueued, can't do! 
+ */ + return false; + } + + __irq_work_queue(entry); + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue); + +/* + * Run the irq_work entries on this cpu. Must be run from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + struct irq_work *list; + + if (this_cpu_read(irq_work_list) == NULL) + return; + + BUG_ON(!in_irq()); + BUG_ON(!irqs_disabled()); + + list = this_cpu_xchg(irq_work_list, NULL); + + while (list != NULL) { + struct irq_work *entry = list; + + list = irq_work_next(list); + + /* + * Clear the PENDING bit, after this point the @entry + * can be re-used. + */ + entry->next = next_flags(NULL, IRQ_WORK_BUSY); + entry->func(entry); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + (void)cmpxchg(&entry->next, + next_flags(NULL, IRQ_WORK_BUSY), + NULL); + } +} +EXPORT_SYMBOL_GPL(irq_work_run); + +/* + * Synchronize against the irq_work @entry, ensures the entry is not + * currently in use. 
+ */ +void irq_work_sync(struct irq_work *entry) +{ + WARN_ON_ONCE(irqs_disabled()); + + while (irq_work_is_set(entry, IRQ_WORK_BUSY)) + cpu_relax(); +} +EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/itimer.c b/kernel/itimer.c new file mode 100644 index 00000000..d8028831 --- /dev/null +++ b/kernel/itimer.c @@ -0,0 +1,298 @@ +/* + * linux/kernel/itimer.c + * + * Copyright (C) 1992 Darren Senn + */ + +/* These are all the functions necessary to implement itimers */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +/** + * itimer_get_remtime - get remaining time for the timer + * + * @timer: the timer to read + * + * Returns the delta between the expiry time and now: 1usec if the timer is + * still active but the delta is <= 0, and 0 if the timer is not active + */ +static struct timeval itimer_get_remtime(struct hrtimer *timer) +{ + ktime_t rem = hrtimer_get_remaining(timer); + + /* + * Racy but safe: if the itimer expires after the above + * hrtimer_get_remaining() call but before this condition + * then we return 0 - which is correct. 
+ */ + if (hrtimer_active(timer)) { + if (rem.tv64 <= 0) + rem.tv64 = NSEC_PER_USEC; + } else + rem.tv64 = 0; + + return ktime_to_timeval(rem); +} + +static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + struct itimerval *const value) +{ + cputime_t cval, cinterval; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero)) { + struct task_cputime cputime; + cputime_t t; + + thread_group_cputimer(tsk, &cputime); + if (clock_id == CPUCLOCK_PROF) + t = cputime_add(cputime.utime, cputime.stime); + else + /* CPUCLOCK_VIRT */ + t = cputime.utime; + + if (cputime_le(cval, t)) + /* about to fire */ + cval = cputime_one_jiffy; + else + cval = cputime_sub(cval, t); + } + + spin_unlock_irq(&tsk->sighand->siglock); + + cputime_to_timeval(cval, &value->it_value); + cputime_to_timeval(cinterval, &value->it_interval); +} + +int do_getitimer(int which, struct itimerval *value) +{ + struct task_struct *tsk = current; + + switch (which) { + case ITIMER_REAL: + spin_lock_irq(&tsk->sighand->siglock); + value->it_value = itimer_get_remtime(&tsk->signal->real_timer); + value->it_interval = + ktime_to_timeval(tsk->signal->it_real_incr); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + get_cpu_itimer(tsk, CPUCLOCK_VIRT, value); + break; + case ITIMER_PROF: + get_cpu_itimer(tsk, CPUCLOCK_PROF, value); + break; + default: + return(-EINVAL); + } + return 0; +} + +SYSCALL_DEFINE2(getitimer, int, which, struct itimerval __user *, value) +{ + int error = -EFAULT; + struct itimerval get_buffer; + + if (value) { + error = do_getitimer(which, &get_buffer); + if (!error && + copy_to_user(value, &get_buffer, sizeof(get_buffer))) + error = -EFAULT; + } + return error; +} + + +/* + * The timer is automagically restarted, when interval != 0 + */ +enum hrtimer_restart it_real_fn(struct hrtimer *timer) +{ + struct signal_struct 
*sig = + container_of(timer, struct signal_struct, real_timer); + + trace_itimer_expire(ITIMER_REAL, sig->leader_pid, 0); + kill_pid_info(SIGALRM, SEND_SIG_PRIV, sig->leader_pid); + + return HRTIMER_NORESTART; +} + +static inline u32 cputime_sub_ns(cputime_t ct, s64 real_ns) +{ + struct timespec ts; + s64 cpu_ns; + + cputime_to_timespec(ct, &ts); + cpu_ns = timespec_to_ns(&ts); + + return (cpu_ns <= real_ns) ? 0 : cpu_ns - real_ns; +} + +static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, + const struct itimerval *const value, + struct itimerval *const ovalue) +{ + cputime_t cval, nval, cinterval, ninterval; + s64 ns_ninterval, ns_nval; + u32 error, incr_error; + struct cpu_itimer *it = &tsk->signal->it[clock_id]; + + nval = timeval_to_cputime(&value->it_value); + ns_nval = timeval_to_ns(&value->it_value); + ninterval = timeval_to_cputime(&value->it_interval); + ns_ninterval = timeval_to_ns(&value->it_interval); + + error = cputime_sub_ns(nval, ns_nval); + incr_error = cputime_sub_ns(ninterval, ns_ninterval); + + spin_lock_irq(&tsk->sighand->siglock); + + cval = it->expires; + cinterval = it->incr; + if (!cputime_eq(cval, cputime_zero) || + !cputime_eq(nval, cputime_zero)) { + if (cputime_gt(nval, cputime_zero)) + nval = cputime_add(nval, cputime_one_jiffy); + set_process_cpu_timer(tsk, clock_id, &nval, &cval); + } + it->expires = nval; + it->incr = ninterval; + it->error = error; + it->incr_error = incr_error; + trace_itimer_state(clock_id == CPUCLOCK_VIRT ? 
+ ITIMER_VIRTUAL : ITIMER_PROF, value, nval); + + spin_unlock_irq(&tsk->sighand->siglock); + + if (ovalue) { + cputime_to_timeval(cval, &ovalue->it_value); + cputime_to_timeval(cinterval, &ovalue->it_interval); + } +} + +/* + * Returns true if the timeval is in canonical form + */ +#define timeval_valid(t) \ + (((t)->tv_sec >= 0) && (((unsigned long) (t)->tv_usec) < USEC_PER_SEC)) + +int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) +{ + struct task_struct *tsk = current; + struct hrtimer *timer; + ktime_t expires; + + /* + * Validate the timevals in value. + */ + if (!timeval_valid(&value->it_value) || + !timeval_valid(&value->it_interval)) + return -EINVAL; + + switch (which) { + case ITIMER_REAL: +again: + spin_lock_irq(&tsk->sighand->siglock); + timer = &tsk->signal->real_timer; + if (ovalue) { + ovalue->it_value = itimer_get_remtime(timer); + ovalue->it_interval + = ktime_to_timeval(tsk->signal->it_real_incr); + } + /* We are sharing ->siglock with it_real_fn() */ + if (hrtimer_try_to_cancel(timer) < 0) { + spin_unlock_irq(&tsk->sighand->siglock); + goto again; + } + expires = timeval_to_ktime(value->it_value); + if (expires.tv64 != 0) { + tsk->signal->it_real_incr = + timeval_to_ktime(value->it_interval); + hrtimer_start(timer, expires, HRTIMER_MODE_REL); + } else + tsk->signal->it_real_incr.tv64 = 0; + + trace_itimer_state(ITIMER_REAL, value, 0); + spin_unlock_irq(&tsk->sighand->siglock); + break; + case ITIMER_VIRTUAL: + set_cpu_itimer(tsk, CPUCLOCK_VIRT, value, ovalue); + break; + case ITIMER_PROF: + set_cpu_itimer(tsk, CPUCLOCK_PROF, value, ovalue); + break; + default: + return -EINVAL; + } + return 0; +} + +/** + * alarm_setitimer - set alarm in seconds + * + * @seconds: number of seconds until alarm + * 0 disables the alarm + * + * Returns the remaining time in seconds of a pending timer or 0 when + * the timer is not active. 
+ * + * On 32 bit machines the seconds value is limited to INT_MAX to avoid + * negative timeval settings which would cause immediate expiry. + */ +unsigned int alarm_setitimer(unsigned int seconds) +{ + struct itimerval it_new, it_old; + +#if BITS_PER_LONG < 64 + if (seconds > INT_MAX) + seconds = INT_MAX; +#endif + it_new.it_value.tv_sec = seconds; + it_new.it_value.tv_usec = 0; + it_new.it_interval.tv_sec = it_new.it_interval.tv_usec = 0; + + do_setitimer(ITIMER_REAL, &it_new, &it_old); + + /* + * We can't return 0 if we have an alarm pending ... And we'd + * better return too much than too little anyway + */ + if ((!it_old.it_value.tv_sec && it_old.it_value.tv_usec) || + it_old.it_value.tv_usec >= 500000) + it_old.it_value.tv_sec++; + + return it_old.it_value.tv_sec; +} + +SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value, + struct itimerval __user *, ovalue) +{ + struct itimerval set_buffer, get_buffer; + int error; + + if (value) { + if(copy_from_user(&set_buffer, value, sizeof(set_buffer))) + return -EFAULT; + } else + memset((char *) &set_buffer, 0, sizeof(set_buffer)); + + error = do_setitimer(which, &set_buffer, ovalue ? 
&get_buffer : NULL); + if (error || !ovalue) + return error; + + if (copy_to_user(ovalue, &get_buffer, sizeof(get_buffer))) + return -EFAULT; + return 0; +} diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 00000000..e6f1f24a --- /dev/null +++ b/kernel/jump_label.c @@ -0,0 +1,394 @@ +/* + * jump label support + * + * Copyright (C) 2009 Jason Baron + * Copyright (C) 2011 Peter Zijlstra + * + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_JUMP_LABEL + +/* mutex to protect coming/going of the jump_label table */ +static DEFINE_MUTEX(jump_label_mutex); + +void jump_label_lock(void) +{ + mutex_lock(&jump_label_mutex); +} + +void jump_label_unlock(void) +{ + mutex_unlock(&jump_label_mutex); +} + +bool jump_label_enabled(struct jump_label_key *key) +{ + return !!atomic_read(&key->enabled); +} + +static int jump_label_cmp(const void *a, const void *b) +{ + const struct jump_entry *jea = a; + const struct jump_entry *jeb = b; + + if (jea->key < jeb->key) + return -1; + + if (jea->key > jeb->key) + return 1; + + return 0; +} + +static void +jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop) +{ + unsigned long size; + + size = (((unsigned long)stop - (unsigned long)start) + / sizeof(struct jump_entry)); + sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); +} + +static void jump_label_update(struct jump_label_key *key, int enable); + +void jump_label_inc(struct jump_label_key *key) +{ + if (atomic_inc_not_zero(&key->enabled)) + return; + + jump_label_lock(); + if (atomic_read(&key->enabled) == 0) + jump_label_update(key, JUMP_LABEL_ENABLE); + atomic_inc(&key->enabled); + jump_label_unlock(); +} + +void jump_label_dec(struct jump_label_key *key) +{ + if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) + return; + + jump_label_update(key, JUMP_LABEL_DISABLE); + jump_label_unlock(); +} + +static int addr_conflict(struct jump_entry *entry, 
void *start, void *end) +{ + if (entry->code <= (unsigned long)end && + entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +static int __jump_label_text_reserved(struct jump_entry *iter_start, + struct jump_entry *iter_stop, void *start, void *end) +{ + struct jump_entry *iter; + + iter = iter_start; + while (iter < iter_stop) { + if (addr_conflict(iter, start, end)) + return 1; + iter++; + } + + return 0; +} + +static void __jump_label_update(struct jump_label_key *key, + struct jump_entry *entry, + struct jump_entry *stop, int enable) +{ + for (; (entry < stop) && + (entry->key == (jump_label_t)(unsigned long)key); + entry++) { + /* + * entry->code set to 0 invalidates module init text sections + * kernel_text_address() verifies we are not in core kernel + * init code, see jump_label_invalidate_module_init(). + */ + if (entry->code && kernel_text_address(entry->code)) + arch_jump_label_transform(entry, enable); + } +} + +/* + * Not all archs need this. 
+ */ +void __weak arch_jump_label_text_poke_early(jump_label_t addr) +{ +} + +static __init int jump_label_init(void) +{ + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_label_key *key = NULL; + struct jump_entry *iter; + + jump_label_lock(); + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { + arch_jump_label_text_poke_early(iter->code); + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + atomic_set(&key->enabled, 0); + key->entries = iter; +#ifdef CONFIG_MODULES + key->next = NULL; +#endif + } + jump_label_unlock(); + + return 0; +} +early_initcall(jump_label_init); + +#ifdef CONFIG_MODULES + +struct jump_label_mod { + struct jump_label_mod *next; + struct jump_entry *entries; + struct module *mod; +}; + +static int __jump_label_mod_text_reserved(void *start, void *end) +{ + struct module *mod; + + mod = __module_text_address((unsigned long)start); + if (!mod) + return 0; + + WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod); + + return __jump_label_text_reserved(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries, + start, end); +} + +static void __jump_label_mod_update(struct jump_label_key *key, int enable) +{ + struct jump_label_mod *mod = key->next; + + while (mod) { + struct module *m = mod->mod; + + __jump_label_update(key, mod->entries, + m->jump_entries + m->num_jump_entries, + enable); + mod = mod->next; + } +} + +/*** + * jump_label_apply_nops - run arch_jump_label_text_poke_early() on a module's jump labels + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_jump_label_text_poke_early(), which the + * arch specific jump label code may override. 
+ */ +void jump_label_apply_nops(struct module *mod) +{ + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + + /* if the module doesn't have jump label entries, just return */ + if (iter_start == iter_stop) + return; + + for (iter = iter_start; iter < iter_stop; iter++) + arch_jump_label_text_poke_early(iter->code); +} + +static int jump_label_add_module(struct module *mod) +{ + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm; + + /* if the module doesn't have jump label entries, just return */ + if (iter_start == iter_stop) + return 0; + + jump_label_sort_entries(iter_start, iter_stop); + + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) { + atomic_set(&key->enabled, 0); + key->entries = iter; + key->next = NULL; + continue; + } + + jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL); + if (!jlm) + return -ENOMEM; + + jlm->mod = mod; + jlm->entries = iter; + jlm->next = key->next; + key->next = jlm; + + if (jump_label_enabled(key)) + __jump_label_update(key, iter, iter_stop, + JUMP_LABEL_ENABLE); + } + + return 0; +} + +static void jump_label_del_module(struct module *mod) +{ + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + struct jump_label_key *key = NULL; + struct jump_label_mod *jlm, **prev; + + for (iter = iter_start; iter < iter_stop; iter++) { + if (iter->key == (jump_label_t)(unsigned long)key) + continue; + + key = (struct jump_label_key *)(unsigned long)iter->key; + + if (__module_address(iter->key) == mod) + continue; + 
+ prev = &key->next; + jlm = key->next; + + while (jlm && jlm->mod != mod) { + prev = &jlm->next; + jlm = jlm->next; + } + + if (jlm) { + *prev = jlm->next; + kfree(jlm); + } + } +} + +static void jump_label_invalidate_module_init(struct module *mod) +{ + struct jump_entry *iter_start = mod->jump_entries; + struct jump_entry *iter_stop = iter_start + mod->num_jump_entries; + struct jump_entry *iter; + + for (iter = iter_start; iter < iter_stop; iter++) { + if (within_module_init(iter->code, mod)) + iter->code = 0; + } +} + +static int +jump_label_module_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + jump_label_lock(); + ret = jump_label_add_module(mod); + if (ret) + jump_label_del_module(mod); + jump_label_unlock(); + break; + case MODULE_STATE_GOING: + jump_label_lock(); + jump_label_del_module(mod); + jump_label_unlock(); + break; + case MODULE_STATE_LIVE: + jump_label_lock(); + jump_label_invalidate_module_init(mod); + jump_label_unlock(); + break; + } + + return notifier_from_errno(ret); +} + +struct notifier_block jump_label_module_nb = { + .notifier_call = jump_label_module_notify, + .priority = 1, /* higher than tracepoints */ +}; + +static __init int jump_label_init_module(void) +{ + return register_module_notifier(&jump_label_module_nb); +} +early_initcall(jump_label_init_module); + +#endif /* CONFIG_MODULES */ + +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. + * Caller must hold jump_label_mutex. 
+ * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + int ret = __jump_label_text_reserved(__start___jump_table, + __stop___jump_table, start, end); + + if (ret) + return ret; + +#ifdef CONFIG_MODULES + ret = __jump_label_mod_text_reserved(start, end); +#endif + return ret; +} + +static void jump_label_update(struct jump_label_key *key, int enable) +{ + struct jump_entry *entry = key->entries, *stop = __stop___jump_table; + +#ifdef CONFIG_MODULES + struct module *mod = __module_address((jump_label_t)key); + + __jump_label_mod_update(key, enable); + + if (mod) + stop = mod->jump_entries + mod->num_jump_entries; +#endif + /* if there are no users, entry can be NULL */ + if (entry) + __jump_label_update(key, entry, stop, enable); +} + +#endif diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c new file mode 100644 index 00000000..079f1d39 --- /dev/null +++ b/kernel/kallsyms.c @@ -0,0 +1,588 @@ +/* + * kallsyms.c: in-kernel printing of symbolic oopses and stack traces. + * + * Rewritten and vastly simplified by Rusty Russell for in-kernel + * module loader: + * Copyright 2002 Rusty Russell IBM Corporation + * + * ChangeLog: + * + * (25/Aug/2004) Paulo Marques + * Changed the compression method from stem compression to "table lookup" + * compression (see scripts/kallsyms.c for a more complete description) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for cond_resched */ +#include +#include +#include + +#include + +#ifdef CONFIG_KALLSYMS_ALL +#define all_var 1 +#else +#define all_var 0 +#endif + +/* + * These will be re-linked against their real values + * during the second link stage. + */ +extern const unsigned long kallsyms_addresses[] __attribute__((weak)); +extern const u8 kallsyms_names[] __attribute__((weak)); + +/* + * Tell the compiler that the count isn't in the small data section if the arch + * has one (eg: FRV). 
+ */ +extern const unsigned long kallsyms_num_syms +__attribute__((weak, section(".rodata"))); + +extern const u8 kallsyms_token_table[] __attribute__((weak)); +extern const u16 kallsyms_token_index[] __attribute__((weak)); + +extern const unsigned long kallsyms_markers[] __attribute__((weak)); + +static inline int is_kernel_inittext(unsigned long addr) +{ + if (addr >= (unsigned long)_sinittext + && addr <= (unsigned long)_einittext) + return 1; + return 0; +} + +static inline int is_kernel_text(unsigned long addr) +{ + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || + arch_is_kernel_text(addr)) + return 1; + return in_gate_area_no_mm(addr); +} + +static inline int is_kernel(unsigned long addr) +{ + if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) + return 1; + return in_gate_area_no_mm(addr); +} + +static int is_ksym_addr(unsigned long addr) +{ + if (all_var) + return is_kernel(addr); + + return is_kernel_text(addr) || is_kernel_inittext(addr); +} + +/* + * Expand compressed symbol data into the resulting uncompressed string, + * given the offset to where the symbol is in the compressed stream. + */ +static unsigned int kallsyms_expand_symbol(unsigned int off, char *result) +{ + int len, skipped_first = 0; + const u8 *tptr, *data; + + /* Get the compressed symbol length from the first symbol byte. */ + data = &kallsyms_names[off]; + len = *data; + data++; + + /* + * Update the offset to return the offset for the next symbol on + * the compressed stream. + */ + off += len + 1; + + /* + * For every byte on the compressed symbol data, copy the table + * entry for that byte. + */ + while (len) { + tptr = &kallsyms_token_table[kallsyms_token_index[*data]]; + data++; + len--; + + while (*tptr) { + if (skipped_first) { + *result = *tptr; + result++; + } else + skipped_first = 1; + tptr++; + } + } + + *result = '\0'; + + /* Return the offset to the next symbol. */ + return off; +} + +/* + * Get symbol type information. 
This is encoded as a single char at the + * beginning of the symbol name. + */ +static char kallsyms_get_symbol_type(unsigned int off) +{ + /* + * Get just the first code, look it up in the token table, + * and return the first char from this token. + */ + return kallsyms_token_table[kallsyms_token_index[kallsyms_names[off + 1]]]; +} + + +/* + * Find the offset on the compressed stream given an index in the + * kallsyms array. + */ +static unsigned int get_symbol_offset(unsigned long pos) +{ + const u8 *name; + int i; + + /* + * Use the closest marker we have. We have markers every 256 positions, + * so that should be close enough. + */ + name = &kallsyms_names[kallsyms_markers[pos >> 8]]; + + /* + * Sequentially scan all the symbols up to the point we're searching + * for. Every symbol is stored in a [<len>][<len> bytes of data] format, + * so we just need to add the len to the current pointer for every + * symbol we wish to skip. + */ + for (i = 0; i < (pos & 0xFF); i++) + name = name + (*name) + 1; + + return name - kallsyms_names; +} + +/* Lookup the address for this symbol. Returns 0 if not found. 
*/ +unsigned long kallsyms_lookup_name(const char *name) +{ + char namebuf[KSYM_NAME_LEN]; + unsigned long i; + unsigned int off; + + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { + off = kallsyms_expand_symbol(off, namebuf); + + if (strcmp(namebuf, name) == 0) + return kallsyms_addresses[i]; + } + return module_kallsyms_lookup_name(name); +} +EXPORT_SYMBOL_GPL(kallsyms_lookup_name); + +int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, + unsigned long), + void *data) +{ + char namebuf[KSYM_NAME_LEN]; + unsigned long i; + unsigned int off; + int ret; + + for (i = 0, off = 0; i < kallsyms_num_syms; i++) { + off = kallsyms_expand_symbol(off, namebuf); + ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + if (ret != 0) + return ret; + } + return module_kallsyms_on_each_symbol(fn, data); +} +EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol); + +static unsigned long get_symbol_pos(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset) +{ + unsigned long symbol_start = 0, symbol_end = 0; + unsigned long i, low, high, mid; + + /* This kernel should never have been booted. */ + BUG_ON(!kallsyms_addresses); + + /* Do a binary search on the sorted kallsyms_addresses array. */ + low = 0; + high = kallsyms_num_syms; + + while (high - low > 1) { + mid = low + (high - low) / 2; + if (kallsyms_addresses[mid] <= addr) + low = mid; + else + high = mid; + } + + /* + * Search for the first aliased symbol. Aliased + * symbols are symbols with the same address. + */ + while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + --low; + + symbol_start = kallsyms_addresses[low]; + + /* Search for next non-aliased symbol. */ + for (i = low + 1; i < kallsyms_num_syms; i++) { + if (kallsyms_addresses[i] > symbol_start) { + symbol_end = kallsyms_addresses[i]; + break; + } + } + + /* If we found no next symbol, we use the end of the section. 
*/ + if (!symbol_end) { + if (is_kernel_inittext(addr)) + symbol_end = (unsigned long)_einittext; + else if (all_var) + symbol_end = (unsigned long)_end; + else + symbol_end = (unsigned long)_etext; + } + + if (symbolsize) + *symbolsize = symbol_end - symbol_start; + if (offset) + *offset = addr - symbol_start; + + return low; +} + +/* + * Lookup an address but don't bother to find any names. + */ +int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, + unsigned long *offset) +{ + char namebuf[KSYM_NAME_LEN]; + if (is_ksym_addr(addr)) + return !!get_symbol_pos(addr, symbolsize, offset); + + return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); +} + +/* + * Lookup an address + * - modname is set to NULL if it's in the kernel. + * - We guarantee that the returned name is valid until we reschedule even if + * it resides in a module. + * - We also guarantee that modname will be valid until rescheduled. + */ +const char *kallsyms_lookup(unsigned long addr, + unsigned long *symbolsize, + unsigned long *offset, + char **modname, char *namebuf) +{ + namebuf[KSYM_NAME_LEN - 1] = 0; + namebuf[0] = 0; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, symbolsize, offset); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), namebuf); + if (modname) + *modname = NULL; + return namebuf; + } + + /* See if it's in a module. */ + return module_address_lookup(addr, symbolsize, offset, modname, + namebuf); +} + +int lookup_symbol_name(unsigned long addr, char *symname) +{ + symname[0] = '\0'; + symname[KSYM_NAME_LEN - 1] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, NULL, NULL); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), symname); + return 0; + } + /* See if it's in a module. 
*/ + return lookup_module_symbol_name(addr, symname); +} + +int lookup_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char *name) +{ + name[0] = '\0'; + name[KSYM_NAME_LEN - 1] = '\0'; + + if (is_ksym_addr(addr)) { + unsigned long pos; + + pos = get_symbol_pos(addr, size, offset); + /* Grab name */ + kallsyms_expand_symbol(get_symbol_offset(pos), name); + modname[0] = '\0'; + return 0; + } + /* See if it's in a module. */ + return lookup_module_symbol_attrs(addr, size, offset, modname, name); +} + +/* Look up a kernel symbol and return it in a text buffer. */ +static int __sprint_symbol(char *buffer, unsigned long address, + int symbol_offset) +{ + char *modname; + const char *name; + unsigned long offset, size; + int len; + + address += symbol_offset; + name = kallsyms_lookup(address, &size, &offset, &modname, buffer); + if (!name) + return sprintf(buffer, "0x%lx", address); + + if (name != buffer) + strcpy(buffer, name); + len = strlen(buffer); + buffer += len; + offset -= symbol_offset; + + if (modname) + len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname); + else + len += sprintf(buffer, "+%#lx/%#lx", offset, size); + + return len; +} + +/** + * sprint_symbol - Look up a kernel symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function looks up a kernel symbol with @address and stores its name, + * offset, size and module name to @buffer if possible. If no symbol was found, + * just saves its @address as is. + * + * This function returns the number of bytes stored in @buffer. 
+ */ +int sprint_symbol(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, 0); +} + +EXPORT_SYMBOL_GPL(sprint_symbol); + +/** + * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer + * @buffer: buffer to be stored + * @address: address to lookup + * + * This function is for stack backtrace and does the same thing as + * sprint_symbol() but with modified/decreased @address. If there is a + * tail-call to the function marked "noreturn", gcc optimized out code after + * the call so that the stack-saved return address could point outside of the + * caller. This function ensures that kallsyms will find the original caller + * by decreasing @address. + * + * This function returns the number of bytes stored in @buffer. + */ +int sprint_backtrace(char *buffer, unsigned long address) +{ + return __sprint_symbol(buffer, address, -1); +} + +/* Look up a kernel symbol and print it to the kernel messages. */ +void __print_symbol(const char *fmt, unsigned long address) +{ + char buffer[KSYM_SYMBOL_LEN]; + + sprint_symbol(buffer, address); + + printk(fmt, buffer); +} +EXPORT_SYMBOL(__print_symbol); + +/* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ +struct kallsym_iter { + loff_t pos; + unsigned long value; + unsigned int nameoff; /* If iterating in core kernel symbols. */ + char type; + char name[KSYM_NAME_LEN]; + char module_name[MODULE_NAME_LEN]; + int exported; +}; + +static int get_ksymbol_mod(struct kallsym_iter *iter) +{ + if (module_get_kallsym(iter->pos - kallsyms_num_syms, &iter->value, + &iter->type, iter->name, iter->module_name, + &iter->exported) < 0) + return 0; + return 1; +} + +/* Returns space to next name. 
*/ +static unsigned long get_ksymbol_core(struct kallsym_iter *iter) +{ + unsigned off = iter->nameoff; + + iter->module_name[0] = '\0'; + iter->value = kallsyms_addresses[iter->pos]; + + iter->type = kallsyms_get_symbol_type(off); + + off = kallsyms_expand_symbol(off, iter->name); + + return off - iter->nameoff; +} + +static void reset_iter(struct kallsym_iter *iter, loff_t new_pos) +{ + iter->name[0] = '\0'; + iter->nameoff = get_symbol_offset(new_pos); + iter->pos = new_pos; +} + +/* Returns false if pos at or past end of file. */ +static int update_iter(struct kallsym_iter *iter, loff_t pos) +{ + /* Module symbols can be accessed randomly. */ + if (pos >= kallsyms_num_syms) { + iter->pos = pos; + return get_ksymbol_mod(iter); + } + + /* If we're not on the desired position, reset to new position. */ + if (pos != iter->pos) + reset_iter(iter, pos); + + iter->nameoff += get_ksymbol_core(iter); + iter->pos++; + + return 1; +} + +static void *s_next(struct seq_file *m, void *p, loff_t *pos) +{ + (*pos)++; + + if (!update_iter(m->private, *pos)) + return NULL; + return p; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + if (!update_iter(m->private, *pos)) + return NULL; + return m->private; +} + +static void s_stop(struct seq_file *m, void *p) +{ +} + +static int s_show(struct seq_file *m, void *p) +{ + struct kallsym_iter *iter = m->private; + + /* Some debugging symbols have no name. Ignore them. */ + if (!iter->name[0]) + return 0; + + if (iter->module_name[0]) { + char type; + + /* + * Label it "global" if it is exported, + * "local" if not exported. + */ + type = iter->exported ? 
toupper(iter->type) : + tolower(iter->type); + seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value, + type, iter->name, iter->module_name); + } else + seq_printf(m, "%pK %c %s\n", (void *)iter->value, + iter->type, iter->name); + return 0; +} + +static const struct seq_operations kallsyms_op = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show +}; + +static int kallsyms_open(struct inode *inode, struct file *file) +{ + /* + * We keep iterator in m->private, since normal case is to + * s_start from where we left off, so we avoid doing + * using get_symbol_offset for every symbol. + */ + struct kallsym_iter *iter; + int ret; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + reset_iter(iter, 0); + + ret = seq_open(file, &kallsyms_op); + if (ret == 0) + ((struct seq_file *)file->private_data)->private = iter; + else + kfree(iter); + return ret; +} + +#ifdef CONFIG_KGDB_KDB +const char *kdb_walk_kallsyms(loff_t *pos) +{ + static struct kallsym_iter kdb_walk_kallsyms_iter; + if (*pos == 0) { + memset(&kdb_walk_kallsyms_iter, 0, + sizeof(kdb_walk_kallsyms_iter)); + reset_iter(&kdb_walk_kallsyms_iter, 0); + } + while (1) { + if (!update_iter(&kdb_walk_kallsyms_iter, *pos)) + return NULL; + ++*pos; + /* Some debugging symbols have no name. Ignore them. 
*/ + if (kdb_walk_kallsyms_iter.name[0]) + return kdb_walk_kallsyms_iter.name; + } +} +#endif /* CONFIG_KGDB_KDB */ + +static const struct file_operations kallsyms_operations = { + .open = kallsyms_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init kallsyms_init(void) +{ + proc_create("kallsyms", 0444, NULL, &kallsyms_operations); + return 0; +} +device_initcall(kallsyms_init); diff --git a/kernel/kexec.c b/kernel/kexec.c new file mode 100644 index 00000000..8d814cbc --- /dev/null +++ b/kernel/kexec.c @@ -0,0 +1,1569 @@ +/* + * kexec.c - kexec system call + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* Per cpu memory for storing cpu states in case of system crash. */ +note_buf_t __percpu *crash_notes; + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* Location of the reserved area for the crash kernel */ +struct resource crashk_res = { + .name = "Crash kernel", + .start = 0, + .end = 0, + .flags = IORESOURCE_BUSY | IORESOURCE_MEM +}; + +int kexec_should_crash(struct task_struct *p) +{ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + +/* + * When kexec transitions to the new kernel there is a one-to-one + * mapping between physical and virtual addresses. On processors + * where you can disable the MMU this is trivial, and easy. 
 * For others it is still a simple predictable page table to set up.
 *
 * In that environment kexec copies the new kernel to its final
 * resting place.  This means I can only support memory whose
 * physical address can fit in an unsigned long.  In particular
 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
 * If the assembly stub has more restrictive requirements
 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DESTINATION_MEMORY_LIMIT can be
 * defined more restrictively in <asm/kexec.h>.
 *
 * The code for the transition from the current kernel to the
 * new kernel is placed in the control_code_buffer, whose size
 * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
 * page of memory is necessary, but some architectures require more.
 * Because this memory must be identity mapped in the transition from
 * virtual to physical addresses it must live in the range
 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
 * modifiable.
 *
 * The assembly stub in the control code buffer is passed a linked list
 * of descriptor pages detailing the source pages of the new kernel,
 * and the destination addresses of those source pages.  As this data
 * structure is not used in the context of the current OS, it must
 * be self-contained.
 *
 * The code has been made to work with highmem pages and will use a
 * destination page in its final resting place (if it happens
 * to allocate it).  The end product of this is that most of the
 * physical address space, and most of RAM can be used.
 *
 * Future directions include:
 *  - allocating a page table with the control code buffer identity
 *    mapped, to simplify machine_kexec and make kexec_on_panic more
 *    reliable.
 */

/*
 * KIMAGE_NO_DEST is an impossible destination address..., for
 * allocating pages whose destination address we do not care about.
+ */ +#define KIMAGE_NO_DEST (-1UL) + +static int kimage_is_destination_range(struct kimage *image, + unsigned long start, unsigned long end); +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long dest); + +static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments) +{ + size_t segment_bytes; + struct kimage *image; + unsigned long i; + int result; + + /* Allocate a controlling structure */ + result = -ENOMEM; + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) + goto out; + + image->head = 0; + image->entry = &image->head; + image->last_entry = &image->head; + image->control_page = ~0; /* By default this does not apply */ + image->start = entry; + image->type = KEXEC_TYPE_DEFAULT; + + /* Initialize the list of control pages */ + INIT_LIST_HEAD(&image->control_pages); + + /* Initialize the list of destination pages */ + INIT_LIST_HEAD(&image->dest_pages); + + /* Initialize the list of unusable pages */ + INIT_LIST_HEAD(&image->unuseable_pages); + + /* Read in the segments */ + image->nr_segments = nr_segments; + segment_bytes = nr_segments * sizeof(*segments); + result = copy_from_user(image->segment, segments, segment_bytes); + if (result) { + result = -EFAULT; + goto out; + } + + /* + * Verify we have good destination addresses. The caller is + * responsible for making certain we don't attempt to load + * the new image into invalid or reserved areas of RAM. This + * just verifies it is an address we can use. + * + * Since the kernel does everything in page size chunks ensure + * the destination addresses are page aligned. Too many + * special cases crop of when we don't do this. The most + * insidious is getting overlapping destination addresses + * simply because addresses are changed to page size + * granularity. 
+ */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) + goto out; + if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) + goto out; + } + + /* Verify our destination addresses do not overlap. + * If we alloed overlapping destination addresses + * through very weird things can happen with no + * easy explanation as one segment stops on another. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + unsigned long j; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + for (j = 0; j < i; j++) { + unsigned long pstart, pend; + pstart = image->segment[j].mem; + pend = pstart + image->segment[j].memsz; + /* Do the segments overlap ? */ + if ((mend > pstart) && (mstart < pend)) + goto out; + } + } + + /* Ensure our buffer sizes are strictly less than + * our memory sizes. This should always be the case, + * and it is easier to check up front than to be surprised + * later on. + */ + result = -EINVAL; + for (i = 0; i < nr_segments; i++) { + if (image->segment[i].bufsz > image->segment[i].memsz) + goto out; + } + + result = 0; +out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; + +} + +static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments) +{ + int result; + struct kimage *image; + + /* Allocate and initialize a controlling structure */ + image = NULL; + result = do_kimage_alloc(&image, entry, nr_segments, segments); + if (result) + goto out; + + *rimage = image; + + /* + * Find a location for the control code buffer, and add it + * the vector of segments so that it's pages will also be + * counted as destination pages. 
+ */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + printk(KERN_ERR "Could not allocate swap buffer\n"); + goto out; + } + + result = 0; + out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; +} + +static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, + unsigned long nr_segments, + struct kexec_segment __user *segments) +{ + int result; + struct kimage *image; + unsigned long i; + + image = NULL; + /* Verify we have a valid entry point */ + if ((entry < crashk_res.start) || (entry > crashk_res.end)) { + result = -EADDRNOTAVAIL; + goto out; + } + + /* Allocate and initialize a controlling structure */ + result = do_kimage_alloc(&image, entry, nr_segments, segments); + if (result) + goto out; + + /* Enable the special crash kernel control page + * allocation policy. + */ + image->control_page = crashk_res.start; + image->type = KEXEC_TYPE_CRASH; + + /* + * Verify we have good destination addresses. Normally + * the caller is responsible for making certain we don't + * attempt to load the new image into invalid or reserved + * areas of RAM. But crash kernels are preloaded into a + * reserved area of ram. We must ensure the addresses + * are in the reserved area otherwise preloading the + * kernel could corrupt things. 
+ */ + result = -EADDRNOTAVAIL; + for (i = 0; i < nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + /* Ensure we are within the crash kernel limits */ + if ((mstart < crashk_res.start) || (mend > crashk_res.end)) + goto out; + } + + /* + * Find a location for the control code buffer, and add + * the vector of segments so that it's pages will also be + * counted as destination pages. + */ + result = -ENOMEM; + image->control_code_page = kimage_alloc_control_pages(image, + get_order(KEXEC_CONTROL_PAGE_SIZE)); + if (!image->control_code_page) { + printk(KERN_ERR "Could not allocate control_code_buffer\n"); + goto out; + } + + result = 0; +out: + if (result == 0) + *rimage = image; + else + kfree(image); + + return result; +} + +static int kimage_is_destination_range(struct kimage *image, + unsigned long start, + unsigned long end) +{ + unsigned long i; + + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) + return 1; + } + + return 0; +} + +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *pages; + + pages = alloc_pages(gfp_mask, order); + if (pages) { + unsigned int count, i; + pages->mapping = NULL; + set_page_private(pages, order); + count = 1 << order; + for (i = 0; i < count; i++) + SetPageReserved(pages + i); + } + + return pages; +} + +static void kimage_free_pages(struct page *page) +{ + unsigned int order, count, i; + + order = page_private(page); + count = 1 << order; + for (i = 0; i < count; i++) + ClearPageReserved(page + i); + __free_pages(page, order); +} + +static void kimage_free_page_list(struct list_head *list) +{ + struct list_head *pos, *next; + + list_for_each_safe(pos, next, list) { + struct page *page; + + page = list_entry(pos, struct page, lru); + list_del(&page->lru); + 
kimage_free_pages(page); + } +} + +static struct page *kimage_alloc_normal_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * At worst this runs in O(N) of the image size. + */ + struct list_head extra_pages; + struct page *pages; + unsigned int count; + + count = 1 << order; + INIT_LIST_HEAD(&extra_pages); + + /* Loop while I can allocate a page and the page allocated + * is a destination page. + */ + do { + unsigned long pfn, epfn, addr, eaddr; + + pages = kimage_alloc_pages(GFP_KERNEL, order); + if (!pages) + break; + pfn = page_to_pfn(pages); + epfn = pfn + count; + addr = pfn << PAGE_SHIFT; + eaddr = epfn << PAGE_SHIFT; + if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || + kimage_is_destination_range(image, addr, eaddr)) { + list_add(&pages->lru, &extra_pages); + pages = NULL; + } + } while (!pages); + + if (pages) { + /* Remember the allocated page... */ + list_add(&pages->lru, &image->control_pages); + + /* Because the page is already in it's destination + * location we will never allocate another page at + * that address. Therefore kimage_alloc_pages + * will not return it (again) and we don't need + * to give it an entry in image->segment[]. + */ + } + /* Deal with the destination pages I have inadvertently allocated. + * + * Ideally I would convert multi-page allocations into single + * page allocations, and add everything to image->dest_pages. + * + * For now it is simpler to just free the pages. 
+ */ + kimage_free_page_list(&extra_pages); + + return pages; +} + +static struct page *kimage_alloc_crash_control_pages(struct kimage *image, + unsigned int order) +{ + /* Control pages are special, they are the intermediaries + * that are needed while we copy the rest of the pages + * to their final resting place. As such they must + * not conflict with either the destination addresses + * or memory the kernel is already using. + * + * Control pages are also the only pags we must allocate + * when loading a crash kernel. All of the other pages + * are specified by the segments and we just memcpy + * into them directly. + * + * The only case where we really need more than one of + * these are for architectures where we cannot disable + * the MMU and must instead generate an identity mapped + * page table for all of the memory. + * + * Given the low demand this implements a very simple + * allocator that finds the first hole of the appropriate + * size in the reserved memory region, and allocates all + * of the memory up to and including the hole. + */ + unsigned long hole_start, hole_end, size; + struct page *pages; + + pages = NULL; + size = (1 << order) << PAGE_SHIFT; + hole_start = (image->control_page + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + while (hole_end <= crashk_res.end) { + unsigned long i; + + if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) + break; + if (hole_end > crashk_res.end) + break; + /* See if I overlap any of the segments */ + for (i = 0; i < image->nr_segments; i++) { + unsigned long mstart, mend; + + mstart = image->segment[i].mem; + mend = mstart + image->segment[i].memsz - 1; + if ((hole_end >= mstart) && (hole_start <= mend)) { + /* Advance the hole to the end of the segment */ + hole_start = (mend + (size - 1)) & ~(size - 1); + hole_end = hole_start + size - 1; + break; + } + } + /* If I don't overlap any segments I have found my hole! 
*/ + if (i == image->nr_segments) { + pages = pfn_to_page(hole_start >> PAGE_SHIFT); + break; + } + } + if (pages) + image->control_page = hole_end; + + return pages; +} + + +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) +{ + struct page *pages = NULL; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + pages = kimage_alloc_normal_control_pages(image, order); + break; + case KEXEC_TYPE_CRASH: + pages = kimage_alloc_crash_control_pages(image, order); + break; + } + + return pages; +} + +static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) +{ + if (*image->entry != 0) + image->entry++; + + if (image->entry == image->last_entry) { + kimage_entry_t *ind_page; + struct page *page; + + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); + if (!page) + return -ENOMEM; + + ind_page = page_address(page); + *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; + image->entry = ind_page; + image->last_entry = ind_page + + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + } + *image->entry = entry; + image->entry++; + *image->entry = 0; + + return 0; +} + +static int kimage_set_destination(struct kimage *image, + unsigned long destination) +{ + int result; + + destination &= PAGE_MASK; + result = kimage_add_entry(image, destination | IND_DESTINATION); + if (result == 0) + image->destination = destination; + + return result; +} + + +static int kimage_add_page(struct kimage *image, unsigned long page) +{ + int result; + + page &= PAGE_MASK; + result = kimage_add_entry(image, page | IND_SOURCE); + if (result == 0) + image->destination += PAGE_SIZE; + + return result; +} + + +static void kimage_free_extra_pages(struct kimage *image) +{ + /* Walk through and free any extra destination pages I may have */ + kimage_free_page_list(&image->dest_pages); + + /* Walk through and free any unusable pages I have cached */ + kimage_free_page_list(&image->unuseable_pages); + +} +static void kimage_terminate(struct kimage 
*image) +{ + if (*image->entry != 0) + image->entry++; + + *image->entry = IND_DONE; +} + +#define for_each_kimage_entry(image, ptr, entry) \ + for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ + ptr = (entry & IND_INDIRECTION)? \ + phys_to_virt((entry & PAGE_MASK)): ptr +1) + +static void kimage_free_entry(kimage_entry_t entry) +{ + struct page *page; + + page = pfn_to_page(entry >> PAGE_SHIFT); + kimage_free_pages(page); +} + +static void kimage_free(struct kimage *image) +{ + kimage_entry_t *ptr, entry; + kimage_entry_t ind = 0; + + if (!image) + return; + + kimage_free_extra_pages(image); + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_INDIRECTION) { + /* Free the previous indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + /* Save this indirection page until we are + * done with it. + */ + ind = entry; + } + else if (entry & IND_SOURCE) + kimage_free_entry(entry); + } + /* Free the final indirection page */ + if (ind & IND_INDIRECTION) + kimage_free_entry(ind); + + /* Handle any machine specific cleanup */ + machine_kexec_cleanup(image); + + /* Free the kexec control pages... */ + kimage_free_page_list(&image->control_pages); + kfree(image); +} + +static kimage_entry_t *kimage_dst_used(struct kimage *image, + unsigned long page) +{ + kimage_entry_t *ptr, entry; + unsigned long destination = 0; + + for_each_kimage_entry(image, ptr, entry) { + if (entry & IND_DESTINATION) + destination = entry & PAGE_MASK; + else if (entry & IND_SOURCE) { + if (page == destination) + return ptr; + destination += PAGE_SIZE; + } + } + + return NULL; +} + +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long destination) +{ + /* + * Here we implement safeguards to ensure that a source page + * is not copied to its destination page before the data on + * the destination page is no longer useful. 
+ * + * To do this we maintain the invariant that a source page is + * either its own destination page, or it is not a + * destination page at all. + * + * That is slightly stronger than required, but the proof + * that no problems will not occur is trivial, and the + * implementation is simply to verify. + * + * When allocating all pages normally this algorithm will run + * in O(N) time, but in the worst case it will run in O(N^2) + * time. If the runtime is a problem the data structures can + * be fixed. + */ + struct page *page; + unsigned long addr; + + /* + * Walk through the list of destination pages, and see if I + * have a match. + */ + list_for_each_entry(page, &image->dest_pages, lru) { + addr = page_to_pfn(page) << PAGE_SHIFT; + if (addr == destination) { + list_del(&page->lru); + return page; + } + } + page = NULL; + while (1) { + kimage_entry_t *old; + + /* Allocate a page, if we run out of memory give up */ + page = kimage_alloc_pages(gfp_mask, 0); + if (!page) + return NULL; + /* If the page cannot be used file it away */ + if (page_to_pfn(page) > + (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + list_add(&page->lru, &image->unuseable_pages); + continue; + } + addr = page_to_pfn(page) << PAGE_SHIFT; + + /* If it is the destination page we want use it */ + if (addr == destination) + break; + + /* If the page is not a destination page use it */ + if (!kimage_is_destination_range(image, addr, + addr + PAGE_SIZE)) + break; + + /* + * I know that the page is someones destination page. + * See if there is already a source page for this + * destination page. And if so swap the source pages. 
+ */ + old = kimage_dst_used(image, addr); + if (old) { + /* If so move it */ + unsigned long old_addr; + struct page *old_page; + + old_addr = *old & PAGE_MASK; + old_page = pfn_to_page(old_addr >> PAGE_SHIFT); + copy_highpage(page, old_page); + *old = addr | (*old & ~PAGE_MASK); + + /* The old page I have found cannot be a + * destination page, so return it if it's + * gfp_flags honor the ones passed in. + */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } + addr = old_addr; + page = old_page; + break; + } + else { + /* Place the page on the destination list I + * will use it later. + */ + list_add(&page->lru, &image->dest_pages); + } + } + + return page; +} + +static int kimage_load_normal_segment(struct kimage *image, + struct kexec_segment *segment) +{ + unsigned long maddr; + unsigned long ubytes, mbytes; + int result; + unsigned char __user *buf; + + result = 0; + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + + result = kimage_set_destination(image, maddr); + if (result < 0) + goto out; + + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); + if (!page) { + result = -ENOMEM; + goto out; + } + result = kimage_add_page(image, page_to_pfn(page) + << PAGE_SHIFT); + if (result < 0) + goto out; + + ptr = kmap(page); + /* Start with a clear page */ + clear_page(ptr); + ptr += maddr & ~PAGE_MASK; + mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); + if (mchunk > mbytes) + mchunk = mbytes; + + uchunk = mchunk; + if (uchunk > ubytes) + uchunk = ubytes; + + result = copy_from_user(ptr, buf, uchunk); + kunmap(page); + if (result) { + result = -EFAULT; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_crash_segment(struct kimage *image, + struct kexec_segment *segment) +{ + /* For 
crash dumps kernels we simply copy the data from + * user space to it's destination. + * We do things a page at a time for the sake of kmap. + */ + unsigned long maddr; + unsigned long ubytes, mbytes; + int result; + unsigned char __user *buf; + + result = 0; + buf = segment->buf; + ubytes = segment->bufsz; + mbytes = segment->memsz; + maddr = segment->mem; + while (mbytes) { + struct page *page; + char *ptr; + size_t uchunk, mchunk; + + page = pfn_to_page(maddr >> PAGE_SHIFT); + if (!page) { + result = -ENOMEM; + goto out; + } + ptr = kmap(page); + ptr += maddr & ~PAGE_MASK; + mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); + if (mchunk > mbytes) + mchunk = mbytes; + + uchunk = mchunk; + if (uchunk > ubytes) { + uchunk = ubytes; + /* Zero the trailing part of the page */ + memset(ptr + uchunk, 0, mchunk - uchunk); + } + result = copy_from_user(ptr, buf, uchunk); + kexec_flush_icache_page(page); + kunmap(page); + if (result) { + result = -EFAULT; + goto out; + } + ubytes -= uchunk; + maddr += mchunk; + buf += mchunk; + mbytes -= mchunk; + } +out: + return result; +} + +static int kimage_load_segment(struct kimage *image, + struct kexec_segment *segment) +{ + int result = -ENOMEM; + + switch (image->type) { + case KEXEC_TYPE_DEFAULT: + result = kimage_load_normal_segment(image, segment); + break; + case KEXEC_TYPE_CRASH: + result = kimage_load_crash_segment(image, segment); + break; + } + + return result; +} + +/* + * Exec Kernel system call: for obvious reasons only root may call it. + * + * This call breaks up into three pieces. + * - A generic part which loads the new kernel from the current + * address space, and very carefully places the data in the + * allocated pages. + * + * - A generic part that interacts with the kernel and tells all of + * the devices to shut down. Preventing on-going dmas, and placing + * the devices in a consistent state so a later kernel can + * reinitialize them. 
+ * + * - A machine specific part that includes the syscall number + * and the copies the image to it's final destination. And + * jumps into the image at entry. + * + * kexec does not sync, or unmount filesystems so if you need + * that to happen you need to do that yourself. + */ +struct kimage *kexec_image; +struct kimage *kexec_crash_image; + +static DEFINE_MUTEX(kexec_mutex); + +SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, + struct kexec_segment __user *, segments, unsigned long, flags) +{ + struct kimage **dest_image, *image; + int result; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* + * Verify we have a legal set of flags + * This leaves us room for future extensions. + */ + if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) + return -EINVAL; + + /* Verify we are on the appropriate architecture */ + if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && + ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) + return -EINVAL; + + /* Put an artificial cap on the number + * of segments passed to kexec_load. + */ + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + image = NULL; + result = 0; + + /* Because we write directly to the reserved memory + * region when loading crash kernels we need a mutex here to + * prevent multiple crash kernels from attempting to load + * simultaneously, and to prevent a crash kernel from loading + * over the top of a in use crash kernel. + * + * KISS: always take the mutex. 
+ */ + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + + dest_image = &kexec_image; + if (flags & KEXEC_ON_CRASH) + dest_image = &kexec_crash_image; + if (nr_segments > 0) { + unsigned long i; + + /* Loading another kernel to reboot into */ + if ((flags & KEXEC_ON_CRASH) == 0) + result = kimage_normal_alloc(&image, entry, + nr_segments, segments); + /* Loading another kernel to switch to if this one crashes */ + else if (flags & KEXEC_ON_CRASH) { + /* Free any current crash dump kernel before + * we corrupt it. + */ + kimage_free(xchg(&kexec_crash_image, NULL)); + result = kimage_crash_alloc(&image, entry, + nr_segments, segments); + } + if (result) + goto out; + + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; + result = machine_kexec_prepare(image); + if (result) + goto out; + + for (i = 0; i < nr_segments; i++) { + result = kimage_load_segment(image, &image->segment[i]); + if (result) + goto out; + } + kimage_terminate(image); + } + /* Install the new kernel, and Uninstall the old */ + image = xchg(dest_image, image); + +out: + mutex_unlock(&kexec_mutex); + kimage_free(image); + + return result; +} + +#ifdef CONFIG_COMPAT +asmlinkage long compat_sys_kexec_load(unsigned long entry, + unsigned long nr_segments, + struct compat_kexec_segment __user *segments, + unsigned long flags) +{ + struct compat_kexec_segment in; + struct kexec_segment out, __user *ksegments; + unsigned long i, result; + + /* Don't allow clients that don't understand the native + * architecture to do anything. 
+ */ + if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) + return -EINVAL; + + if (nr_segments > KEXEC_SEGMENT_MAX) + return -EINVAL; + + ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); + for (i=0; i < nr_segments; i++) { + result = copy_from_user(&in, &segments[i], sizeof(in)); + if (result) + return -EFAULT; + + out.buf = compat_ptr(in.buf); + out.bufsz = in.bufsz; + out.mem = in.mem; + out.memsz = in.memsz; + + result = copy_to_user(&ksegments[i], &out, sizeof(out)); + if (result) + return -EFAULT; + } + + return sys_kexec_load(entry, nr_segments, ksegments, flags); +} +#endif + +void crash_kexec(struct pt_regs *regs) +{ + /* Take the kexec_mutex here to prevent sys_kexec_load + * running on one cpu from replacing the crash kernel + * we are using after a panic on a different cpu. + * + * If the crash kernel was not located in a fixed area + * of memory the xchg(&kexec_crash_image) would be + * sufficient. But since I reuse the memory... + */ + if (mutex_trylock(&kexec_mutex)) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + kmsg_dump(KMSG_DUMP_KEXEC); + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); + } + mutex_unlock(&kexec_mutex); + } +} + +size_t crash_get_memory_size(void) +{ + size_t size = 0; + mutex_lock(&kexec_mutex); + if (crashk_res.end != crashk_res.start) + size = crashk_res.end - crashk_res.start + 1; + mutex_unlock(&kexec_mutex); + return size; +} + +void __weak crash_free_reserved_phys_range(unsigned long begin, + unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long start, end; + + mutex_lock(&kexec_mutex); + + if 
(kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + start = crashk_res.start; + end = crashk_res.end; + + if (new_size >= end - start + 1) { + ret = -EINVAL; + if (new_size == end - start + 1) + ret = 0; + goto unlock; + } + + start = roundup(start, PAGE_SIZE); + end = roundup(start + new_size, PAGE_SIZE); + + crash_free_reserved_phys_range(end, crashk_res.end); + + if ((start == end) && (crashk_res.parent != NULL)) + release_resource(&crashk_res); + crashk_res.end = end - 1; + +unlock: + mutex_unlock(&kexec_mutex); + return ret; +} + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, &note, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, &note, sizeof(note)); +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. 
+ */ + buf = (u32*)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + +static int __init crash_notes_memory_init(void) +{ + /* Allocate memory for saving cpu registers. */ + crash_notes = alloc_percpu(note_buf_t); + if (!crash_notes) { + printk("Kexec: Memory allocation for saving cpu register" + " states failed\n"); + return -ENOMEM; + } + return 0; +} +module_init(crash_notes_memory_init) + + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warning("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("crashkernel: Memory " + "value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warning("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warning("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + 
pr_warning("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warning("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("Memory value expected " + "after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warning("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + + return 0; +} + +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. 
+ */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *p = cmdline, *ck_cmdline = NULL; + char *first_colon, *first_space; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, "crashkernel="); + while (p) { + ck_cmdline = p; + p = strstr(p+1, "crashkernel="); + } + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += 12; /* strlen("crashkernel=") */ + + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && (!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + else + return parse_crashkernel_simple(ck_cmdline, crash_size, + crash_base); + + return 0; +} + + + +void crash_save_vmcoreinfo(void) +{ + u32 *buf; + + if (!vmcoreinfo_size) + return; + + vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds()); + + buf = (u32 *)vmcoreinfo_note; + + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + + final_note(buf); +} + +void vmcoreinfo_append_str(const char *fmt, ...) 
+{ + va_list args; + char buf[0x50]; + int r; + + va_start(args, fmt); + r = vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (r + vmcoreinfo_size > vmcoreinfo_max_size) + r = vmcoreinfo_max_size - vmcoreinfo_size; + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) +{} + +unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) +{ + return __pa((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); + VMCOREINFO_SYMBOL(swapper_pg_dir); + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmlist); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _count); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + 
VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vm_struct, addr); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_kexec_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + + arch_crash_save_vmcoreinfo(); + + return 0; +} + +module_init(crash_save_vmcoreinfo_init) + +/* + * Move into place and start executing a preloaded standalone + * executable. If nothing was preloaded return an error. + */ +int kernel_kexec(void) +{ + int error = 0; + + if (!mutex_trylock(&kexec_mutex)) + return -EBUSY; + if (!kexec_image) { + error = -EINVAL; + goto Unlock; + } + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + mutex_lock(&pm_mutex); + pm_prepare_console(); + error = freeze_processes(); + if (error) { + error = -EBUSY; + goto Restore_console; + } + suspend_console(); + error = dpm_suspend_start(PMSG_FREEZE); + if (error) + goto Resume_console; + /* At this point, dpm_suspend_start() has been called, + * but *not* dpm_suspend_noirq(). We *must* call + * dpm_suspend_noirq() now. Otherwise, drivers for + * some devices (e.g. interrupt controllers) become + * desynchronized with the actual state of the + * hardware at resume time, and evil weirdness ensues. 
+ */ + error = dpm_suspend_noirq(PMSG_FREEZE); + if (error) + goto Resume_devices; + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; + local_irq_disable(); + error = syscore_suspend(); + if (error) + goto Enable_irqs; + } else +#endif + { + kernel_restart_prepare(NULL); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + } + + machine_kexec(kexec_image); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) { + syscore_resume(); + Enable_irqs: + local_irq_enable(); + Enable_cpus: + enable_nonboot_cpus(); + dpm_resume_noirq(PMSG_RESTORE); + Resume_devices: + dpm_resume_end(PMSG_RESTORE); + Resume_console: + resume_console(); + thaw_processes(); + Restore_console: + pm_restore_console(); + mutex_unlock(&pm_mutex); + } +#endif + + Unlock: + mutex_unlock(&kexec_mutex); + return error; +} diff --git a/kernel/kfifo.c b/kernel/kfifo.c new file mode 100644 index 00000000..01a0700e --- /dev/null +++ b/kernel/kfifo.c @@ -0,0 +1,608 @@ +/* + * A generic kernel FIFO implementation + * + * Copyright (C) 2009/2010 Stefani Seibold + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * internal helper to calculate the unused elements in a fifo + */ +static inline unsigned int kfifo_unused(struct __kfifo *fifo) +{ + return (fifo->mask + 1) - (fifo->in - fifo->out); +} + +int __kfifo_alloc(struct __kfifo *fifo, unsigned int size, + size_t esize, gfp_t gfp_mask) +{ + /* + * round down to the next power of 2, since our 'let the indices + * wrap' technique works only in this case. + */ + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + + if (size < 2) { + fifo->data = NULL; + fifo->mask = 0; + return -EINVAL; + } + + fifo->data = kmalloc(size * esize, gfp_mask); + + if (!fifo->data) { + fifo->mask = 0; + return -ENOMEM; + } + fifo->mask = size - 1; + + return 0; +} +EXPORT_SYMBOL(__kfifo_alloc); + +void __kfifo_free(struct __kfifo *fifo) +{ + kfree(fifo->data); + fifo->in = 0; + fifo->out = 0; + fifo->esize = 0; + fifo->data = NULL; + fifo->mask = 0; +} +EXPORT_SYMBOL(__kfifo_free); + +int __kfifo_init(struct __kfifo *fifo, void *buffer, + unsigned int size, size_t esize) +{ + size /= esize; + + if (!is_power_of_2(size)) + size = rounddown_pow_of_two(size); + + fifo->in = 0; + fifo->out = 0; + fifo->esize = esize; + fifo->data = buffer; + + if (size < 2) { + fifo->mask = 0; + return -EINVAL; + } + fifo->mask = size - 1; + + return 0; +} +EXPORT_SYMBOL(__kfifo_init); + +static void kfifo_copy_in(struct __kfifo *fifo, const void *src, + unsigned int len, unsigned int off) +{ + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(fifo->data + off, src, l); + memcpy(fifo->data, src + l, len - l); + /* + * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter + */ + smp_wmb(); +} 
+ +unsigned int __kfifo_in(struct __kfifo *fifo, + const void *buf, unsigned int len) +{ + unsigned int l; + + l = kfifo_unused(fifo); + if (len > l) + len = l; + + kfifo_copy_in(fifo, buf, len, fifo->in); + fifo->in += len; + return len; +} +EXPORT_SYMBOL(__kfifo_in); + +static void kfifo_copy_out(struct __kfifo *fifo, void *dst, + unsigned int len, unsigned int off) +{ + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + memcpy(dst, fifo->data + off, l); + memcpy(dst + l, fifo->data, len - l); + /* + * make sure that the data is copied before + * incrementing the fifo->out index counter + */ + smp_wmb(); +} + +unsigned int __kfifo_out_peek(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + unsigned int l; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + + kfifo_copy_out(fifo, buf, len, fifo->out); + return len; +} +EXPORT_SYMBOL(__kfifo_out_peek); + +unsigned int __kfifo_out(struct __kfifo *fifo, + void *buf, unsigned int len) +{ + len = __kfifo_out_peek(fifo, buf, len); + fifo->out += len; + return len; +} +EXPORT_SYMBOL(__kfifo_out); + +static unsigned long kfifo_copy_from_user(struct __kfifo *fifo, + const void __user *from, unsigned int len, unsigned int off, + unsigned int *copied) +{ + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + unsigned long ret; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_from_user(fifo->data + off, from, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_from_user(fifo->data, from + l, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } + /* + * make sure that the data in the fifo is up to date before + * incrementing the fifo->in index counter + */ + smp_wmb(); 
+ *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} + +int __kfifo_from_user(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; + + if (esize != 1) + len /= esize; + + l = kfifo_unused(fifo); + if (len > l) + len = l; + + ret = kfifo_copy_from_user(fifo, from, len, fifo->in, copied); + if (unlikely(ret)) { + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->in += len; + return err; +} +EXPORT_SYMBOL(__kfifo_from_user); + +static unsigned long kfifo_copy_to_user(struct __kfifo *fifo, void __user *to, + unsigned int len, unsigned int off, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + ret = copy_to_user(to, fifo->data + off, l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret + len - l, esize); + else { + ret = copy_to_user(to + l, fifo->data, len - l); + if (unlikely(ret)) + ret = DIV_ROUND_UP(ret, esize); + } + /* + * make sure that the data is copied before + * incrementing the fifo->out index counter + */ + smp_wmb(); + *copied = len - ret; + /* return the number of elements which are not copied */ + return ret; +} + +int __kfifo_to_user(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied) +{ + unsigned int l; + unsigned long ret; + unsigned int esize = fifo->esize; + int err; + + if (esize != 1) + len /= esize; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + ret = kfifo_copy_to_user(fifo, to, len, fifo->out, copied); + if (unlikely(ret)) { + len -= ret; + err = -EFAULT; + } else + err = 0; + fifo->out += len; + return err; +} +EXPORT_SYMBOL(__kfifo_to_user); + +static int setup_sgl_buf(struct scatterlist *sgl, void *buf, + int 
nents, unsigned int len) +{ + int n; + unsigned int l; + unsigned int off; + struct page *page; + + if (!nents) + return 0; + + if (!len) + return 0; + + n = 0; + page = virt_to_page(buf); + off = offset_in_page(buf); + l = 0; + + while (len >= l + PAGE_SIZE - off) { + struct page *npage; + + l += PAGE_SIZE; + buf += PAGE_SIZE; + npage = virt_to_page(buf); + if (page_to_phys(page) != page_to_phys(npage) - l) { + sg_set_page(sgl, page, l - off, off); + sgl = sg_next(sgl); + if (++n == nents || sgl == NULL) + return n; + page = npage; + len -= l - off; + l = off = 0; + } + } + sg_set_page(sgl, page, len, off); + return n + 1; +} + +static unsigned int setup_sgl(struct __kfifo *fifo, struct scatterlist *sgl, + int nents, unsigned int len, unsigned int off) +{ + unsigned int size = fifo->mask + 1; + unsigned int esize = fifo->esize; + unsigned int l; + unsigned int n; + + off &= fifo->mask; + if (esize != 1) { + off *= esize; + size *= esize; + len *= esize; + } + l = min(len, size - off); + + n = setup_sgl_buf(sgl, fifo->data + off, nents, l); + n += setup_sgl_buf(sgl + n, fifo->data, nents - n, len - l); + + return n; +} + +unsigned int __kfifo_dma_in_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) +{ + unsigned int l; + + l = kfifo_unused(fifo); + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->in); +} +EXPORT_SYMBOL(__kfifo_dma_in_prepare); + +unsigned int __kfifo_dma_out_prepare(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len) +{ + unsigned int l; + + l = fifo->in - fifo->out; + if (len > l) + len = l; + + return setup_sgl(fifo, sgl, nents, len, fifo->out); +} +EXPORT_SYMBOL(__kfifo_dma_out_prepare); + +unsigned int __kfifo_max_r(unsigned int len, size_t recsize) +{ + unsigned int max = (1 << (recsize << 3)) - 1; + + if (len > max) + return max; + return len; +} + +#define __KFIFO_PEEK(data, out, mask) \ + ((data)[(out) & (mask)]) +/* + * __kfifo_peek_n internal helper 
function for determinate the length of + * the next record in the fifo + */ +static unsigned int __kfifo_peek_n(struct __kfifo *fifo, size_t recsize) +{ + unsigned int l; + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; + + l = __KFIFO_PEEK(data, fifo->out, mask); + + if (--recsize) + l |= __KFIFO_PEEK(data, fifo->out + 1, mask) << 8; + + return l; +} + +#define __KFIFO_POKE(data, in, mask, val) \ + ( \ + (data)[(in) & (mask)] = (unsigned char)(val) \ + ) + +/* + * __kfifo_poke_n internal helper function for storeing the length of + * the record into the fifo + */ +static void __kfifo_poke_n(struct __kfifo *fifo, unsigned int n, size_t recsize) +{ + unsigned int mask = fifo->mask; + unsigned char *data = fifo->data; + + __KFIFO_POKE(data, fifo->in, mask, n); + + if (recsize > 1) + __KFIFO_POKE(data, fifo->in + 1, mask, n >> 8); +} + +unsigned int __kfifo_len_r(struct __kfifo *fifo, size_t recsize) +{ + return __kfifo_peek_n(fifo, recsize); +} +EXPORT_SYMBOL(__kfifo_len_r); + +unsigned int __kfifo_in_r(struct __kfifo *fifo, const void *buf, + unsigned int len, size_t recsize) +{ + if (len + recsize > kfifo_unused(fifo)) + return 0; + + __kfifo_poke_n(fifo, len, recsize); + + kfifo_copy_in(fifo, buf, len, fifo->in + recsize); + fifo->in += len + recsize; + return len; +} +EXPORT_SYMBOL(__kfifo_in_r); + +static unsigned int kfifo_out_copy_r(struct __kfifo *fifo, + void *buf, unsigned int len, size_t recsize, unsigned int *n) +{ + *n = __kfifo_peek_n(fifo, recsize); + + if (len > *n) + len = *n; + + kfifo_copy_out(fifo, buf, len, fifo->out + recsize); + return len; +} + +unsigned int __kfifo_out_peek_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) +{ + unsigned int n; + + if (fifo->in == fifo->out) + return 0; + + return kfifo_out_copy_r(fifo, buf, len, recsize, &n); +} +EXPORT_SYMBOL(__kfifo_out_peek_r); + +unsigned int __kfifo_out_r(struct __kfifo *fifo, void *buf, + unsigned int len, size_t recsize) +{ + unsigned int n; + 
+ if (fifo->in == fifo->out) + return 0; + + len = kfifo_out_copy_r(fifo, buf, len, recsize, &n); + fifo->out += n + recsize; + return len; +} +EXPORT_SYMBOL(__kfifo_out_r); + +void __kfifo_skip_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int n; + + n = __kfifo_peek_n(fifo, recsize); + fifo->out += n + recsize; +} +EXPORT_SYMBOL(__kfifo_skip_r); + +int __kfifo_from_user_r(struct __kfifo *fifo, const void __user *from, + unsigned long len, unsigned int *copied, size_t recsize) +{ + unsigned long ret; + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) { + *copied = 0; + return 0; + } + + __kfifo_poke_n(fifo, len, recsize); + + ret = kfifo_copy_from_user(fifo, from, len, fifo->in + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->in += len + recsize; + return 0; +} +EXPORT_SYMBOL(__kfifo_from_user_r); + +int __kfifo_to_user_r(struct __kfifo *fifo, void __user *to, + unsigned long len, unsigned int *copied, size_t recsize) +{ + unsigned long ret; + unsigned int n; + + if (fifo->in == fifo->out) { + *copied = 0; + return 0; + } + + n = __kfifo_peek_n(fifo, recsize); + if (len > n) + len = n; + + ret = kfifo_copy_to_user(fifo, to, len, fifo->out + recsize, copied); + if (unlikely(ret)) { + *copied = 0; + return -EFAULT; + } + fifo->out += n + recsize; + return 0; +} +EXPORT_SYMBOL(__kfifo_to_user_r); + +unsigned int __kfifo_dma_in_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) +{ + if (!nents) + BUG(); + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > kfifo_unused(fifo)) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->in + recsize); +} +EXPORT_SYMBOL(__kfifo_dma_in_prepare_r); + +void __kfifo_dma_in_finish_r(struct __kfifo *fifo, + unsigned int len, size_t recsize) +{ + len = __kfifo_max_r(len, recsize); + __kfifo_poke_n(fifo, len, recsize); + fifo->in += len + recsize; +} 
+EXPORT_SYMBOL(__kfifo_dma_in_finish_r); + +unsigned int __kfifo_dma_out_prepare_r(struct __kfifo *fifo, + struct scatterlist *sgl, int nents, unsigned int len, size_t recsize) +{ + if (!nents) + BUG(); + + len = __kfifo_max_r(len, recsize); + + if (len + recsize > fifo->in - fifo->out) + return 0; + + return setup_sgl(fifo, sgl, nents, len, fifo->out + recsize); +} +EXPORT_SYMBOL(__kfifo_dma_out_prepare_r); + +void __kfifo_dma_out_finish_r(struct __kfifo *fifo, size_t recsize) +{ + unsigned int len; + + len = __kfifo_peek_n(fifo, recsize); + fifo->out += len + recsize; +} +EXPORT_SYMBOL(__kfifo_dma_out_finish_r); diff --git a/kernel/kmod.c b/kernel/kmod.c new file mode 100644 index 00000000..fabfe541 --- /dev/null +++ b/kernel/kmod.c @@ -0,0 +1,531 @@ +/* + kmod, the new module loader (replaces kerneld) + Kirk Petersen + + Reorganized not to be a daemon by Adam Richter, with guidance + from Greg Zornetzer. + + Modified to avoid chroot and file sharing problems. + Mikael Pettersson + + Limit the concurrent number of kmod modprobes to catch loops from + "modprobe needs a service that is in a module". + Keith Owens December 1999 + + Unblock all signals when we exec a usermode process. + Shuu Yamaguchi December 2000 + + call_usermodehelper wait flag, and remove exec_usermodehelper. + Rusty Russell Jan 2003 +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +extern int max_threads; + +static struct workqueue_struct *khelper_wq; + +#define CAP_BSET (void *)1 +#define CAP_PI (void *)2 + +static kernel_cap_t usermodehelper_bset = CAP_FULL_SET; +static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET; +static DEFINE_SPINLOCK(umh_sysctl_lock); + +#ifdef CONFIG_MODULES + +/* + modprobe_path is set via /proc/sys. 
+*/ +char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe"; + +/** + * __request_module - try to load a kernel module + * @wait: wait (or not) for the operation to complete + * @fmt: printf style format string for the name of the module + * @...: arguments as specified in the format string + * + * Load a module using the user mode module loader. The function returns + * zero on success or a negative errno code on failure. Note that a + * successful module load does not mean the module did not then unload + * and exit on an error of its own. Callers must check that the service + * they requested is now available not blindly invoke it. + * + * If module auto-loading support is disabled then this function + * becomes a no-operation. + */ +int __request_module(bool wait, const char *fmt, ...) +{ + va_list args; + char module_name[MODULE_NAME_LEN]; + unsigned int max_modprobes; + int ret; + char *argv[] = { modprobe_path, "-q", "--", module_name, NULL }; + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL }; + static atomic_t kmod_concurrent = ATOMIC_INIT(0); +#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */ + static int kmod_loop_msg; + + va_start(args, fmt); + ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args); + va_end(args); + if (ret >= MODULE_NAME_LEN) + return -ENAMETOOLONG; + + ret = security_kernel_module_request(module_name); + if (ret) + return ret; + + /* If modprobe needs a service that is in a module, we get a recursive + * loop. Limit the number of running kmod threads to max_threads/2 or + * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method + * would be to run the parents of this process, counting how many times + * kmod was invoked. That would mean accessing the internals of the + * process tables to get the command line, proc_pid_cmdline is static + * and it is not worth changing the proc code just to handle this case. + * KAO. 
+ * + * "trace the ppid" is simple, but will fail if someone's + * parent exits. I think this is as good as it gets. --RR + */ + max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT); + atomic_inc(&kmod_concurrent); + if (atomic_read(&kmod_concurrent) > max_modprobes) { + /* We may be blaming an innocent here, but unlikely */ + if (kmod_loop_msg < 5) { + printk(KERN_ERR + "request_module: runaway loop modprobe %s\n", + module_name); + kmod_loop_msg++; + } + atomic_dec(&kmod_concurrent); + return -ENOMEM; + } + + trace_module_request(module_name, wait, _RET_IP_); + + ret = call_usermodehelper_fns(modprobe_path, argv, envp, + wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, + NULL, NULL, NULL); + + atomic_dec(&kmod_concurrent); + return ret; +} +EXPORT_SYMBOL(__request_module); +#endif /* CONFIG_MODULES */ + +/* + * This is the task which runs the usermode application + */ +static int ____call_usermodehelper(void *data) +{ + struct subprocess_info *sub_info = data; + struct cred *new; + int retval; + + spin_lock_irq(&current->sighand->siglock); + flush_signal_handlers(current, 1); + spin_unlock_irq(&current->sighand->siglock); + + /* We can run anywhere, unlike our parent keventd(). */ + set_cpus_allowed_ptr(current, cpu_all_mask); + + /* + * Our parent is keventd, which runs with elevated scheduling priority. + * Avoid propagating that into the userspace child. + */ + set_user_nice(current, 0); + + retval = -ENOMEM; + new = prepare_kernel_cred(current); + if (!new) + goto fail; + + spin_lock(&umh_sysctl_lock); + new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); + new->cap_inheritable = cap_intersect(usermodehelper_inheritable, + new->cap_inheritable); + spin_unlock(&umh_sysctl_lock); + + if (sub_info->init) { + retval = sub_info->init(sub_info, new); + if (retval) { + abort_creds(new); + goto fail; + } + } + + commit_creds(new); + + retval = kernel_execve(sub_info->path, + (const char *const *)sub_info->argv, + (const char *const *)sub_info->envp); + + /* Exec failed? 
*/ +fail: + sub_info->retval = retval; + do_exit(0); +} + +void call_usermodehelper_freeinfo(struct subprocess_info *info) +{ + if (info->cleanup) + (*info->cleanup)(info); + kfree(info); +} +EXPORT_SYMBOL(call_usermodehelper_freeinfo); + +/* Keventd can't block, but this (a child) can. */ +static int wait_for_helper(void *data) +{ + struct subprocess_info *sub_info = data; + pid_t pid; + + /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + spin_lock_irq(&current->sighand->siglock); + current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; + spin_unlock_irq(&current->sighand->siglock); + + pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); + if (pid < 0) { + sub_info->retval = pid; + } else { + int ret = -ECHILD; + /* + * Normally it is bogus to call wait4() from in-kernel because + * wait4() wants to write the exit code to a userspace address. + * But wait_for_helper() always runs as keventd, and put_user() + * to a kernel address works OK for kernel threads, due to their + * having an mm_segment_t which spans the entire address space. + * + * Thus the __user pointer cast is valid here. + */ + sys_wait4(pid, (int __user *)&ret, 0, NULL); + + /* + * If ret is 0, either ____call_usermodehelper failed and the + * real error code is already in sub_info->retval or + * sub_info->retval is 0 anyway, so don't mess with it then. + */ + if (ret) + sub_info->retval = ret; + } + + complete(sub_info->complete); + return 0; +} + +/* This is run by khelper thread */ +static void __call_usermodehelper(struct work_struct *work) +{ + struct subprocess_info *sub_info = + container_of(work, struct subprocess_info, work); + enum umh_wait wait = sub_info->wait; + pid_t pid; + + /* CLONE_VFORK: wait until the usermode helper has execve'd + * successfully. We need the data structures to stay around + * until that is done. 
*/ + if (wait == UMH_WAIT_PROC) + pid = kernel_thread(wait_for_helper, sub_info, + CLONE_FS | CLONE_FILES | SIGCHLD); + else + pid = kernel_thread(____call_usermodehelper, sub_info, + CLONE_VFORK | SIGCHLD); + + switch (wait) { + case UMH_NO_WAIT: + call_usermodehelper_freeinfo(sub_info); + break; + + case UMH_WAIT_PROC: + if (pid > 0) + break; + /* FALLTHROUGH */ + case UMH_WAIT_EXEC: + if (pid < 0) + sub_info->retval = pid; + complete(sub_info->complete); + } +} + +/* + * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY + * (used for preventing user land processes from being created after the user + * land has been frozen during a system-wide hibernation or suspend operation). + */ +static int usermodehelper_disabled; + +/* Number of helpers running */ +static atomic_t running_helpers = ATOMIC_INIT(0); + +/* + * Wait queue head used by usermodehelper_pm_callback() to wait for all running + * helpers to finish. + */ +static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq); + +/* + * Time to wait for running_helpers to become zero before the setting of + * usermodehelper_disabled in usermodehelper_pm_callback() fails + */ +#define RUNNING_HELPERS_TIMEOUT (5 * HZ) + +/** + * usermodehelper_disable - prevent new helpers from being started + */ +int usermodehelper_disable(void) +{ + long retval; + + usermodehelper_disabled = 1; + smp_mb(); + /* + * From now on call_usermodehelper_exec() won't start any new + * helpers, so it is sufficient if running_helpers turns out to + * be zero at one point (it may be increased later, but that + * doesn't matter). 
+ */ + retval = wait_event_timeout(running_helpers_waitq, + atomic_read(&running_helpers) == 0, + RUNNING_HELPERS_TIMEOUT); + if (retval) + return 0; + + usermodehelper_disabled = 0; + return -EAGAIN; +} + +/** + * usermodehelper_enable - allow new helpers to be started again + */ +void usermodehelper_enable(void) +{ + usermodehelper_disabled = 0; +} + +/** + * usermodehelper_is_disabled - check if new helpers are allowed to be started + */ +bool usermodehelper_is_disabled(void) +{ + return usermodehelper_disabled; +} +EXPORT_SYMBOL_GPL(usermodehelper_is_disabled); + +static void helper_lock(void) +{ + atomic_inc(&running_helpers); + smp_mb__after_atomic_inc(); +} + +static void helper_unlock(void) +{ + if (atomic_dec_and_test(&running_helpers)) + wake_up(&running_helpers_waitq); +} + +/** + * call_usermodehelper_setup - prepare to call a usermode helper + * @path: path to usermode executable + * @argv: arg vector for process + * @envp: environment for process + * @gfp_mask: gfp mask for memory allocation + * + * Returns either %NULL on allocation failure, or a subprocess_info + * structure. This should be passed to call_usermodehelper_exec to + * exec the process and free the structure. + */ +struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, + char **envp, gfp_t gfp_mask) +{ + struct subprocess_info *sub_info; + sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask); + if (!sub_info) + goto out; + + INIT_WORK(&sub_info->work, __call_usermodehelper); + sub_info->path = path; + sub_info->argv = argv; + sub_info->envp = envp; + out: + return sub_info; +} +EXPORT_SYMBOL(call_usermodehelper_setup); + +/** + * call_usermodehelper_setfns - set a cleanup/init function + * @info: a subprocess_info returned by call_usermodehelper_setup + * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data + * + * The init function is used to customize the helper process prior to + * exec. 
A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is called just before the subprocess_info is about to + * be freed. This can be used for freeing the argv and envp. The + * function must be runnable in either a process context or the + * context in which call_usermodehelper_exec is called. + */ +void call_usermodehelper_setfns(struct subprocess_info *info, + int (*init)(struct subprocess_info *info, struct cred *new), + void (*cleanup)(struct subprocess_info *info), + void *data) +{ + info->cleanup = cleanup; + info->init = init; + info->data = data; +} +EXPORT_SYMBOL(call_usermodehelper_setfns); + +/** + * call_usermodehelper_exec - start a usermode application + * @sub_info: information about the subprocess + * @wait: wait for the application to finish and return status. + * when -1 don't wait at all, but you get no useful error back when + * the program couldn't be exec'ed. This makes it safe to call + * from interrupt context. + * + * Runs a user-space application. The application is started + * asynchronously if wait is not set, and runs as a child of keventd. + * (ie. it runs with full root capabilities). 
+ */ +int call_usermodehelper_exec(struct subprocess_info *sub_info, + enum umh_wait wait) +{ + DECLARE_COMPLETION_ONSTACK(done); + int retval = 0; + + helper_lock(); + if (sub_info->path[0] == '\0') + goto out; + + if (!khelper_wq || usermodehelper_disabled) { + retval = -EBUSY; + goto out; + } + + sub_info->complete = &done; + sub_info->wait = wait; + + queue_work(khelper_wq, &sub_info->work); + if (wait == UMH_NO_WAIT) /* task has freed sub_info */ + goto unlock; + wait_for_completion(&done); + retval = sub_info->retval; + +out: + call_usermodehelper_freeinfo(sub_info); +unlock: + helper_unlock(); + return retval; +} +EXPORT_SYMBOL(call_usermodehelper_exec); + +static int proc_cap_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long cap_array[_KERNEL_CAPABILITY_U32S]; + kernel_cap_t new_cap; + int err, i; + + if (write && (!capable(CAP_SETPCAP) || + !capable(CAP_SYS_MODULE))) + return -EPERM; + + /* + * convert from the global kernel_cap_t to the ulong array to print to + * userspace if this is a read. + */ + spin_lock(&umh_sysctl_lock); + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) { + if (table->data == CAP_BSET) + cap_array[i] = usermodehelper_bset.cap[i]; + else if (table->data == CAP_PI) + cap_array[i] = usermodehelper_inheritable.cap[i]; + else + BUG(); + } + spin_unlock(&umh_sysctl_lock); + + t = *table; + t.data = &cap_array; + + /* + * actually read or write an array of ulongs from userspace. 
Remember + * these are least significant 32 bits first + */ + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + /* + * convert from the sysctl array of ulongs to the kernel_cap_t + * internal representation + */ + for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) + new_cap.cap[i] = cap_array[i]; + + /* + * Drop everything not in the new_cap (but don't add things) + */ + spin_lock(&umh_sysctl_lock); + if (write) { + if (table->data == CAP_BSET) + usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap); + if (table->data == CAP_PI) + usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap); + } + spin_unlock(&umh_sysctl_lock); + + return 0; +} + +struct ctl_table usermodehelper_table[] = { + { + .procname = "bset", + .data = CAP_BSET, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { + .procname = "inheritable", + .data = CAP_PI, + .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long), + .mode = 0600, + .proc_handler = proc_cap_handler, + }, + { } +}; + +void __init usermodehelper_init(void) +{ + khelper_wq = create_singlethread_workqueue("khelper"); + BUG_ON(!khelper_wq); +} diff --git a/kernel/kprobes.c b/kernel/kprobes.c new file mode 100644 index 00000000..f1dcde49 --- /dev/null +++ b/kernel/kprobes.c @@ -0,0 +1,2252 @@ +/* + * Kernel Probes (KProbes) + * kernel/kprobes.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2002, 2004 + * + * 2002-Oct Created by Vamsi Krishna S Kernel + * Probes initial implementation (includes suggestions from + * Rusty Russell). + * 2004-Aug Updated by Prasanna S Panchamukhi with + * hlists and exceptions notifier as suggested by Andi Kleen. + * 2004-July Suparna Bhattacharya added jumper probes + * interface to access function arguments. + * 2004-Sep Prasanna S Panchamukhi Changed Kprobes + * exceptions notifier to be first on the priority list. + * 2005-May Hien Nguyen , Jim Keniston + * and Prasanna S Panchamukhi + * added function-return probes. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define KPROBE_HASH_BITS 6 +#define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) + + +/* + * Some oddball architectures like 64bit powerpc have function descriptors + * so this must be overridable. 
+ */ +#ifndef kprobe_lookup_name +#define kprobe_lookup_name(name, addr) \ + addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) +#endif + +static int kprobes_initialized; +static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; +static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; + +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobes_all_disarmed; + +/* This protects kprobe_table and optimizing_list */ +static DEFINE_MUTEX(kprobe_mutex); +static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; +static struct { + spinlock_t lock ____cacheline_aligned_in_smp; +} kretprobe_table_locks[KPROBE_TABLE_SIZE]; + +static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) +{ + return &(kretprobe_table_locks[hash].lock); +} + +/* + * Normally, functions that we'd want to prohibit kprobes in, are marked + * __kprobes. But, there are cases where such functions already belong to + * a different section (__sched for preempt_schedule) + * + * For such cases, we now have a blacklist + */ +static struct kprobe_blackpoint kprobe_blacklist[] = { + {"preempt_schedule",}, + {"native_get_debugreg",}, + {"irq_entries_start",}, + {"common_interrupt",}, + {"mcount",}, /* mcount can be called from everywhere */ + {NULL} /* Terminator */ +}; + +#ifdef __ARCH_WANT_KPROBES_INSN_SLOT +/* + * kprobe->ainsn.insn points to the copy of the instruction to be + * single-stepped. 
x86_64, POWER4 and above have no-exec support and + * stepping on the instruction on a vmalloced/kmalloced/data page + * is a recipe for disaster + */ +struct kprobe_insn_page { + struct list_head list; + kprobe_opcode_t *insns; /* Page of instruction slots */ + int nused; + int ngarbage; + char slot_used[]; +}; + +#define KPROBE_INSN_PAGE_SIZE(slots) \ + (offsetof(struct kprobe_insn_page, slot_used) + \ + (sizeof(char) * (slots))) + +struct kprobe_insn_cache { + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; +}; + +static int slots_per_page(struct kprobe_insn_cache *c) +{ + return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); +} + +enum kprobe_slot_state { + SLOT_CLEAN = 0, + SLOT_DIRTY = 1, + SLOT_USED = 2, +}; + +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ +static struct kprobe_insn_cache kprobe_insn_slots = { + .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), + .insn_size = MAX_INSN_SIZE, + .nr_garbage = 0, +}; +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); + +/** + * __get_insn_slot() - Find a slot on an executable page for an instruction. + * We allocate an executable page if there's no room on existing ones. + */ +static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +{ + struct kprobe_insn_page *kip; + + retry: + list_for_each_entry(kip, &c->pages, list) { + if (kip->nused < slots_per_page(c)) { + int i; + for (i = 0; i < slots_per_page(c); i++) { + if (kip->slot_used[i] == SLOT_CLEAN) { + kip->slot_used[i] = SLOT_USED; + kip->nused++; + return kip->insns + (i * c->insn_size); + } + } + /* kip->nused is broken. Fix it. */ + kip->nused = slots_per_page(c); + WARN_ON(1); + } + } + + /* If there are any garbage slots, collect it and try again. */ + if (c->nr_garbage && collect_garbage_slots(c) == 0) + goto retry; + + /* All out of space. Need to allocate a new page. 
*/ + kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); + if (!kip) + return NULL; + + /* + * Use module_alloc so this page is within +/- 2GB of where the + * kernel image and loaded module images reside. This is required + * so x86_64 can correctly handle the %rip-relative fixups. + */ + kip->insns = module_alloc(PAGE_SIZE); + if (!kip->insns) { + kfree(kip); + return NULL; + } + INIT_LIST_HEAD(&kip->list); + memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); + kip->slot_used[0] = SLOT_USED; + kip->nused = 1; + kip->ngarbage = 0; + list_add(&kip->list, &c->pages); + return kip->insns; +} + + +kprobe_opcode_t __kprobes *get_insn_slot(void) +{ + kprobe_opcode_t *ret = NULL; + + mutex_lock(&kprobe_insn_mutex); + ret = __get_insn_slot(&kprobe_insn_slots); + mutex_unlock(&kprobe_insn_mutex); + + return ret; +} + +/* Return 1 if all garbages are collected, otherwise 0. */ +static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) +{ + kip->slot_used[idx] = SLOT_CLEAN; + kip->nused--; + if (kip->nused == 0) { + /* + * Page is no longer in use. Free it unless + * it's the last one. We keep the last one + * so as not to have to set it up again the + * next time somebody inserts a probe. 
+ */ + if (!list_is_singular(&kip->list)) { + list_del(&kip->list); + module_free(NULL, kip->insns); + kfree(kip); + } + return 1; + } + return 0; +} + +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) +{ + struct kprobe_insn_page *kip, *next; + + /* Ensure no-one is interrupted on the garbages */ + synchronize_sched(); + + list_for_each_entry_safe(kip, next, &c->pages, list) { + int i; + if (kip->ngarbage == 0) + continue; + kip->ngarbage = 0; /* we will collect all garbages */ + for (i = 0; i < slots_per_page(c); i++) { + if (kip->slot_used[i] == SLOT_DIRTY && + collect_one_slot(kip, i)) + break; + } + } + c->nr_garbage = 0; + return 0; +} + +static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) +{ + struct kprobe_insn_page *kip; + + list_for_each_entry(kip, &c->pages, list) { + long idx = ((long)slot - (long)kip->insns) / + (c->insn_size * sizeof(kprobe_opcode_t)); + if (idx >= 0 && idx < slots_per_page(c)) { + WARN_ON(kip->slot_used[idx] != SLOT_USED); + if (dirty) { + kip->slot_used[idx] = SLOT_DIRTY; + kip->ngarbage++; + if (++c->nr_garbage > slots_per_page(c)) + collect_garbage_slots(c); + } else + collect_one_slot(kip, idx); + return; + } + } + /* Could not free this slot. 
*/ + WARN_ON(1); +} + +void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_insn_mutex); + __free_insn_slot(&kprobe_insn_slots, slot, dirty); + mutex_unlock(&kprobe_insn_mutex); +} +#ifdef CONFIG_OPTPROBES +/* For optimized_kprobe buffer */ +static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ +static struct kprobe_insn_cache kprobe_optinsn_slots = { + .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), + /* .insn_size is initialized later */ + .nr_garbage = 0, +}; +/* Get a slot for optimized_kprobe buffer */ +kprobe_opcode_t __kprobes *get_optinsn_slot(void) +{ + kprobe_opcode_t *ret = NULL; + + mutex_lock(&kprobe_optinsn_mutex); + ret = __get_insn_slot(&kprobe_optinsn_slots); + mutex_unlock(&kprobe_optinsn_mutex); + + return ret; +} + +void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_optinsn_mutex); + __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); + mutex_unlock(&kprobe_optinsn_mutex); +} +#endif +#endif + +/* We have preemption disabled.. 
so it is safe to use __ versions */ +static inline void set_kprobe_instance(struct kprobe *kp) +{ + __this_cpu_write(kprobe_instance, kp); +} + +static inline void reset_kprobe_instance(void) +{ + __this_cpu_write(kprobe_instance, NULL); +} + +/* + * This routine is called either: + * - under the kprobe_mutex - during kprobe_[un]register() + * OR + * - with preemption disabled - from arch/xxx/kernel/kprobes.c + */ +struct kprobe __kprobes *get_kprobe(void *addr) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + + head = &kprobe_table[hash_ptr(addr, KPROBE_HASH_BITS)]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (p->addr == addr) + return p; + } + + return NULL; +} + +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); + +/* Return true if the kprobe is an aggregator */ +static inline int kprobe_aggrprobe(struct kprobe *p) +{ + return p->pre_handler == aggr_pre_handler; +} + +/* Return true(!0) if the kprobe is unused */ +static inline int kprobe_unused(struct kprobe *p) +{ + return kprobe_aggrprobe(p) && kprobe_disabled(p) && + list_empty(&p->list); +} + +/* + * Keep all fields in the kprobe consistent + */ +static inline void copy_kprobe(struct kprobe *ap, struct kprobe *p) +{ + memcpy(&p->opcode, &ap->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &ap->ainsn, sizeof(struct arch_specific_insn)); +} + +#ifdef CONFIG_OPTPROBES +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobes_allow_optimization; + +/* + * Call all pre_handler on the list, but ignores its return value. + * This must be called from arch-dep optimized caller. 
+ */ +void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &p->list, list) { + if (kp->pre_handler && likely(!kprobe_disabled(kp))) { + set_kprobe_instance(kp); + kp->pre_handler(kp, regs); + } + reset_kprobe_instance(); + } +} + +/* Free optimized instructions and optimized_kprobe */ +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_remove_optimized_kprobe(op); + arch_remove_kprobe(p); + kfree(op); +} + +/* Return true(!0) if the kprobe is ready for optimization. */ +static inline int kprobe_optready(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if (kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + return arch_prepared_optinsn(&op->optinsn); + } + + return 0; +} + +/* Return true(!0) if the kprobe is disarmed. Note: p must be on hash list */ +static inline int kprobe_disarmed(struct kprobe *p) +{ + struct optimized_kprobe *op; + + /* If kprobe is not aggr/opt probe, just return kprobe is disabled */ + if (!kprobe_aggrprobe(p)) + return kprobe_disabled(p); + + op = container_of(p, struct optimized_kprobe, kp); + + return kprobe_disabled(p) && list_empty(&op->list); +} + +/* Return true(!0) if the probe is queued on (un)optimizing lists */ +static int __kprobes kprobe_queued(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if (kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) + return 1; + } + return 0; +} + +/* + * Return an optimized kprobe whose optimizing code replaces + * instructions including addr (exclude breakpoint). + */ +static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +{ + int i; + struct kprobe *p = NULL; + struct optimized_kprobe *op; + + /* Don't check i == 0, since that is a breakpoint case. 
*/ + for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) + p = get_kprobe((void *)(addr - i)); + + if (p && kprobe_optready(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (arch_within_optimized_kprobe(op, addr)) + return p; + } + + return NULL; +} + +/* Optimization staging list, protected by kprobe_mutex */ +static LIST_HEAD(optimizing_list); +static LIST_HEAD(unoptimizing_list); + +static void kprobe_optimizer(struct work_struct *work); +static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); +static DECLARE_COMPLETION(optimizer_comp); +#define OPTIMIZE_DELAY 5 + +/* + * Optimize (replace a breakpoint with a jump) kprobes listed on + * optimizing_list. + */ +static __kprobes void do_optimize_kprobes(void) +{ + /* Optimization never be done when disarmed */ + if (kprobes_all_disarmed || !kprobes_allow_optimization || + list_empty(&optimizing_list)) + return; + + /* + * The optimization/unoptimization refers online_cpus via + * stop_machine() and cpu-hotplug modifies online_cpus. + * And same time, text_mutex will be held in cpu-hotplug and here. + * This combination can cause a deadlock (cpu-hotplug try to lock + * text_mutex but stop_machine can not be done because online_cpus + * has been changed) + * To avoid this deadlock, we need to call get_online_cpus() + * for preventing cpu-hotplug outside of text_mutex locking. + */ + get_online_cpus(); + mutex_lock(&text_mutex); + arch_optimize_kprobes(&optimizing_list); + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +/* + * Unoptimize (replace a jump with a breakpoint and remove the breakpoint + * if need) kprobes listed on unoptimizing_list. 
+ */ +static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +{ + struct optimized_kprobe *op, *tmp; + + /* Unoptimization must be done anytime */ + if (list_empty(&unoptimizing_list)) + return; + + /* Ditto to do_optimize_kprobes */ + get_online_cpus(); + mutex_lock(&text_mutex); + arch_unoptimize_kprobes(&unoptimizing_list, free_list); + /* Loop free_list for disarming */ + list_for_each_entry_safe(op, tmp, free_list, list) { + /* Disarm probes if marked disabled */ + if (kprobe_disabled(&op->kp)) + arch_disarm_kprobe(&op->kp); + if (kprobe_unused(&op->kp)) { + /* + * Remove unused probes from hash list. After waiting + * for synchronization, these probes are reclaimed. + * (reclaiming is done by do_free_cleaned_kprobes.) + */ + hlist_del_rcu(&op->kp.hlist); + } else + list_del_init(&op->list); + } + mutex_unlock(&text_mutex); + put_online_cpus(); +} + +/* Reclaim all kprobes on the free_list */ +static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +{ + struct optimized_kprobe *op, *tmp; + + list_for_each_entry_safe(op, tmp, free_list, list) { + BUG_ON(!kprobe_unused(&op->kp)); + list_del_init(&op->list); + free_aggr_kprobe(&op->kp); + } +} + +/* Start optimizer after OPTIMIZE_DELAY passed */ +static __kprobes void kick_kprobe_optimizer(void) +{ + if (!delayed_work_pending(&optimizing_work)) + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); +} + +/* Kprobe jump optimizer */ +static __kprobes void kprobe_optimizer(struct work_struct *work) +{ + LIST_HEAD(free_list); + + /* Lock modules while optimizing kprobes */ + mutex_lock(&module_mutex); + mutex_lock(&kprobe_mutex); + + /* + * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) + * kprobes before waiting for quiescence period. + */ + do_unoptimize_kprobes(&free_list); + + /* + * Step 2: Wait for quiescence period to ensure all running interrupts + * are done. 
Because optprobe may modify multiple instructions + * there is a chance that Nth instruction is interrupted. In that + * case, running interrupt can return to 2nd-Nth byte of jump + * instruction. This wait is for avoiding it. + */ + synchronize_sched(); + + /* Step 3: Optimize kprobes after quiescence period */ + do_optimize_kprobes(); + + /* Step 4: Free cleaned kprobes after quiescence period */ + do_free_cleaned_kprobes(&free_list); + + mutex_unlock(&kprobe_mutex); + mutex_unlock(&module_mutex); + + /* Step 5: Kick optimizer again if needed */ + if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list)) + kick_kprobe_optimizer(); + else + /* Wake up all waiters */ + complete_all(&optimizer_comp); +} + +/* Wait for completing optimization and unoptimization */ +static __kprobes void wait_for_kprobe_optimizer(void) +{ + if (delayed_work_pending(&optimizing_work)) + wait_for_completion(&optimizer_comp); +} + +/* Optimize kprobe if p is ready to be optimized */ +static __kprobes void optimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + /* Check if the kprobe is disabled or not ready for optimization. */ + if (!kprobe_optready(p) || !kprobes_allow_optimization || + (kprobe_disabled(p) || kprobes_all_disarmed)) + return; + + /* Both of break_handler and post_handler are not supported. */ + if (p->break_handler || p->post_handler) + return; + + op = container_of(p, struct optimized_kprobe, kp); + + /* Check there is no other kprobes at the optimized instructions */ + if (arch_check_optimized_kprobe(op) < 0) + return; + + /* Check if it is already optimized. */ + if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) + return; + op->kp.flags |= KPROBE_FLAG_OPTIMIZED; + + if (!list_empty(&op->list)) + /* This is under unoptimizing. 
Just dequeue the probe */ + list_del_init(&op->list); + else { + list_add(&op->list, &optimizing_list); + kick_kprobe_optimizer(); + } +} + +/* Short cut to direct unoptimizing */ +static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op) +{ + get_online_cpus(); + arch_unoptimize_kprobe(op); + put_online_cpus(); + if (kprobe_disabled(&op->kp)) + arch_disarm_kprobe(&op->kp); +} + +/* Unoptimize a kprobe if p is optimized */ +static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force) +{ + struct optimized_kprobe *op; + + if (!kprobe_aggrprobe(p) || kprobe_disarmed(p)) + return; /* This is not an optprobe nor optimized */ + + op = container_of(p, struct optimized_kprobe, kp); + if (!kprobe_optimized(p)) { + /* Unoptimized or unoptimizing case */ + if (force && !list_empty(&op->list)) { + /* + * Only if this is unoptimizing kprobe and forced, + * forcibly unoptimize it. (No need to unoptimize + * unoptimized kprobe again :) + */ + list_del_init(&op->list); + force_unoptimize_kprobe(op); + } + return; + } + + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + if (!list_empty(&op->list)) { + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + return; + } + /* Optimized kprobe case */ + if (force) + /* Forcibly update the code: this is a special case */ + force_unoptimize_kprobe(op); + else { + list_add(&op->list, &unoptimizing_list); + kick_kprobe_optimizer(); + } +} + +/* Cancel unoptimizing for reusing */ +static void reuse_unused_kprobe(struct kprobe *ap) +{ + struct optimized_kprobe *op; + + BUG_ON(!kprobe_unused(ap)); + /* + * Unused kprobe MUST be on the way of delayed unoptimizing (means + * there is still a relative jump) and disabled. 
+ */ + op = container_of(ap, struct optimized_kprobe, kp); + if (unlikely(list_empty(&op->list))) + printk(KERN_WARNING "Warning: found a stray unused " + "aggrprobe@%p\n", ap->addr); + /* Enable the probe again */ + ap->flags &= ~KPROBE_FLAG_DISABLED; + /* Optimize it again (remove from op->list) */ + BUG_ON(!kprobe_optready(ap)); + optimize_kprobe(ap); +} + +/* Remove optimized instructions */ +static void __kprobes kill_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) + /* Dequeue from the (un)optimization queue */ + list_del_init(&op->list); + + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + /* Don't touch the code, because it is already freed. */ + arch_remove_optimized_kprobe(op); +} + +/* Try to prepare optimized instructions */ +static __kprobes void prepare_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_prepare_optimized_kprobe(op); +} + +/* Allocate new optimized_kprobe and try to prepare optimized instructions */ +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); + if (!op) + return NULL; + + INIT_LIST_HEAD(&op->list); + op->kp.addr = p->addr; + arch_prepare_optimized_kprobe(op); + + return &op->kp; +} + +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); + +/* + * Prepare an optimized_kprobe and optimize it + * NOTE: p must be a normal registered kprobe + */ +static __kprobes void try_to_optimize_kprobe(struct kprobe *p) +{ + struct kprobe *ap; + struct optimized_kprobe *op; + + ap = alloc_aggr_kprobe(p); + if (!ap) + return; + + op = container_of(ap, struct optimized_kprobe, kp); + if (!arch_prepared_optinsn(&op->optinsn)) { + /* If failed to setup optimizing, fallback to kprobe */ + arch_remove_optimized_kprobe(op); + kfree(op); + 
return; + } + + init_aggr_kprobe(ap, p); + optimize_kprobe(ap); +} + +#ifdef CONFIG_SYSCTL +/* This should be called with kprobe_mutex locked */ +static void __kprobes optimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already allowed, just return */ + if (kprobes_allow_optimization) + return; + + kprobes_allow_optimization = true; + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (!kprobe_disabled(p)) + optimize_kprobe(p); + } + printk(KERN_INFO "Kprobes globally optimized\n"); +} + +/* This should be called with kprobe_mutex locked */ +static void __kprobes unoptimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already prohibited, just return */ + if (!kprobes_allow_optimization) + return; + + kprobes_allow_optimization = false; + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!kprobe_disabled(p)) + unoptimize_kprobe(p, false); + } + } + /* Wait for unoptimizing completion */ + wait_for_kprobe_optimizer(); + printk(KERN_INFO "Kprobes globally unoptimized\n"); +} + +int sysctl_kprobes_optimization; +int proc_kprobes_optimization_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + + mutex_lock(&kprobe_mutex); + sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + + if (sysctl_kprobes_optimization) + optimize_all_kprobes(); + else + unoptimize_all_kprobes(); + mutex_unlock(&kprobe_mutex); + + return ret; +} +#endif /* CONFIG_SYSCTL */ + +/* Put a breakpoint for a probe. 
Must be called with text_mutex locked */
+static void __kprobes __arm_kprobe(struct kprobe *p)
+{
+	struct kprobe *_p;
+
+	/* Check collision with other optimized kprobes */
+	_p = get_optimized_kprobe((unsigned long)p->addr);
+	if (unlikely(_p))
+		/* Fallback to unoptimized kprobe */
+		unoptimize_kprobe(_p, true);
+
+	arch_arm_kprobe(p);
+	optimize_kprobe(p);	/* Try to optimize (add kprobe to a list) */
+}
+
+/*
+ * Remove the breakpoint of a probe. Must be called with text_mutex locked.
+ *
+ * The breakpoint is only removed (arch_disarm_kprobe()) when the probe is
+ * not queued according to kprobe_queued() after the unoptimize attempt.
+ * @reopt: when true, a probe returned by get_optimized_kprobe() for this
+ *         address is handed to optimize_kprobe() afterwards.
+ */
+static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
+{
+	struct kprobe *_p;
+
+	unoptimize_kprobe(p, false);	/* Try to unoptimize */
+
+	if (!kprobe_queued(p)) {
+		arch_disarm_kprobe(p);
+		/* If another kprobe was blocked, optimize it. */
+		_p = get_optimized_kprobe((unsigned long)p->addr);
+		if (unlikely(_p) && reopt)
+			optimize_kprobe(_p);
+	}
+	/* TODO: reoptimize others after unoptimized this probe */
+}
+
+#else /* !CONFIG_OPTPROBES: optimization hooks become no-ops or plain arch calls */
+
+#define optimize_kprobe(p)			do {} while (0)
+#define unoptimize_kprobe(p, f)			do {} while (0)
+#define kill_optimized_kprobe(p)		do {} while (0)
+#define prepare_optimized_kprobe(p)		do {} while (0)
+#define try_to_optimize_kprobe(p)		do {} while (0)
+#define __arm_kprobe(p)				arch_arm_kprobe(p)
+#define __disarm_kprobe(p, o)			arch_disarm_kprobe(p)
+#define kprobe_disarmed(p)			kprobe_disabled(p)
+#define wait_for_kprobe_optimizer()		do {} while (0)
+
+/*
+ * There should be no unused kprobes can be reused without optimization.
+ * Always logs an error, then BUG()s if @ap really is unused.
+ */
+static void reuse_unused_kprobe(struct kprobe *ap)
+{
+	printk(KERN_ERR "Error: There should be no unused kprobe here.\n");
+	BUG_ON(kprobe_unused(ap));
+}
+/*
+ * Without optprobes an aggregator is a bare struct kprobe (see
+ * alloc_aggr_kprobe() below): call arch_remove_kprobe() on it, then
+ * kfree() it.
+ */
+static __kprobes void free_aggr_kprobe(struct kprobe *p)
+{
+	arch_remove_kprobe(p);
+	kfree(p);
+}
+/*
+ * Allocate a zeroed struct kprobe to act as aggregator; @p is not used in
+ * this variant.  Returns NULL if kzalloc() fails.
+ */
+static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+{
+	return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+}
+#endif /* CONFIG_OPTPROBES */
+
+/* Arm a kprobe with text_mutex */
+static void __kprobes arm_kprobe(struct kprobe *kp)
+{
+	/*
+	 * Here, since __arm_kprobe() doesn't use stop_machine(),
+	 * this doesn't cause deadlock on text_mutex. So, we don't
+	 * need get_online_cpus().
+	 */
+	mutex_lock(&text_mutex);
+	__arm_kprobe(kp);
+	mutex_unlock(&text_mutex);
+}
+
+/* Disarm a kprobe with text_mutex */
+static void __kprobes disarm_kprobe(struct kprobe *kp)
+{
+	/* Ditto */
+	mutex_lock(&text_mutex);
+	__disarm_kprobe(kp, true);	/* reopt: let a blocked probe be optimized */
+	mutex_unlock(&text_mutex);
+}
+
+/*
+ * Aggregate handlers for multiple kprobes support - these handlers
+ * take care of invoking the individual kprobe handlers on p->list
+ *
+ * aggr_pre_handler() stops at the first enabled child whose pre_handler
+ * returns nonzero and returns 1 without calling reset_kprobe_instance(),
+ * so the instance set by set_kprobe_instance() for that child is still in
+ * place on return.  Otherwise it returns 0.
+ */
+static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe *kp;
+
+	list_for_each_entry_rcu(kp, &p->list, list) {
+		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
+			set_kprobe_instance(kp);
+			if (kp->pre_handler(kp, regs))
+				return 1;
+		}
+		reset_kprobe_instance();
+	}
+	return 0;
+}
+/*
+ * Call the post_handler of every enabled child on p->list, bracketing each
+ * call with set_kprobe_instance()/reset_kprobe_instance().
+ */
+static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+					unsigned long flags)
+{
+	struct kprobe *kp;
+
+	list_for_each_entry_rcu(kp, &p->list, list) {
+		if (kp->post_handler && likely(!kprobe_disabled(kp))) {
+			set_kprobe_instance(kp);
+			kp->post_handler(kp, regs, flags);
+			reset_kprobe_instance();
+		}
+	}
+}
+/*
+ * Forward a fault to the child recorded in this CPU's kprobe_instance.
+ * Returns 1 only if that child has a fault_handler and it returned nonzero.
+ */
+static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+					int trapnr)
+{
+	struct kprobe *cur = __this_cpu_read(kprobe_instance);
+
+	/*
+	 * if we faulted "during" the execution of a user specified
+	 * probe handler, invoke just that probe's fault handler
+	 */
+	if (cur && cur->fault_handler) {
+		if (cur->fault_handler(cur, regs, trapnr))
+			return 1;
+	}
+	return 0;
+}
+/*
+ * Forward a break to the child recorded in this CPU's kprobe_instance and
+ * then always reset the instance.  Returns 1 only if that child has a
+ * break_handler and it returned nonzero.
+ */
+static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+{
+	struct kprobe *cur = __this_cpu_read(kprobe_instance);
+	int ret = 0;
+
+	if (cur && cur->break_handler) {
+		if (cur->break_handler(cur, regs))
+			ret = 1;
+	}
+	reset_kprobe_instance();
+	return ret;
+}
+
+/* Walks the list and increments nmissed count for 
multiprobe case */ +void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) +{ + struct kprobe *kp; + if (!kprobe_aggrprobe(p)) { + p->nmissed++; + } else { + list_for_each_entry_rcu(kp, &p->list, list) + kp->nmissed++; + } + return; +} + +void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, + struct hlist_head *head) +{ + struct kretprobe *rp = ri->rp; + + /* remove rp inst off the rprobe_inst_table */ + hlist_del(&ri->hlist); + INIT_HLIST_NODE(&ri->hlist); + if (likely(rp)) { + spin_lock(&rp->lock); + hlist_add_head(&ri->hlist, &rp->free_instances); + spin_unlock(&rp->lock); + } else + /* Unregistering */ + hlist_add_head(&ri->hlist, head); +} + +void __kprobes kretprobe_hash_lock(struct task_struct *tsk, + struct hlist_head **head, unsigned long *flags) +__acquires(hlist_lock) +{ + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); + spinlock_t *hlist_lock; + + *head = &kretprobe_inst_table[hash]; + hlist_lock = kretprobe_table_lock_ptr(hash); + spin_lock_irqsave(hlist_lock, *flags); +} + +static void __kprobes kretprobe_table_lock(unsigned long hash, + unsigned long *flags) +__acquires(hlist_lock) +{ + spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + spin_lock_irqsave(hlist_lock, *flags); +} + +void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, + unsigned long *flags) +__releases(hlist_lock) +{ + unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); + spinlock_t *hlist_lock; + + hlist_lock = kretprobe_table_lock_ptr(hash); + spin_unlock_irqrestore(hlist_lock, *flags); +} + +static void __kprobes kretprobe_table_unlock(unsigned long hash, + unsigned long *flags) +__releases(hlist_lock) +{ + spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); + spin_unlock_irqrestore(hlist_lock, *flags); +} + +/* + * This function is called from finish_task_switch when task tk becomes dead, + * so that we can recycle any function-return probe instances associated + * with this task. 
These left over instances represent probed functions
+ * that have been called but will never return.
+ *
+ * Every instance in tk's hash bucket whose ->task is tk is passed to
+ * recycle_rp_inst() together with a local list (empty_rp); whatever ends up
+ * on that local list is kfree()d after the bucket lock has been dropped.
+ */
+void __kprobes kprobe_flush_task(struct task_struct *tk)
+{
+	struct kretprobe_instance *ri;
+	struct hlist_head *head, empty_rp;
+	struct hlist_node *node, *tmp;
+	unsigned long hash, flags = 0;
+
+	if (unlikely(!kprobes_initialized))
+		/* Early boot.  kretprobe_table_locks not yet initialized. */
+		return;
+
+	INIT_HLIST_HEAD(&empty_rp);
+	hash = hash_ptr(tk, KPROBE_HASH_BITS);
+	head = &kretprobe_inst_table[hash];
+	kretprobe_table_lock(hash, &flags);
+	hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
+		if (ri->task == tk)
+			recycle_rp_inst(ri, &empty_rp);
+	}
+	kretprobe_table_unlock(hash, &flags);
+	hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
+		hlist_del(&ri->hlist);
+		kfree(ri);
+	}
+}
+/*
+ * Unlink and kfree() every instance on rp->free_instances.  rp->lock is
+ * not taken here.
+ */
+static inline void free_rp_inst(struct kretprobe *rp)
+{
+	struct kretprobe_instance *ri;
+	struct hlist_node *pos, *next;
+
+	hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
+		hlist_del(&ri->hlist);
+		kfree(ri);
+	}
+}
+/*
+ * Detach @rp from its instances: walk every kretprobe_inst_table bucket
+ * under its lock and clear ->rp on each instance that points at @rp, then
+ * free the instances still on rp's free list.
+ */
+static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
+{
+	unsigned long flags, hash;
+	struct kretprobe_instance *ri;
+	struct hlist_node *pos, *next;
+	struct hlist_head *head;
+
+	/* No race here */
+	for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
+		kretprobe_table_lock(hash, &flags);
+		head = &kretprobe_inst_table[hash];
+		hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
+			if (ri->rp == rp)
+				ri->rp = NULL;
+		}
+		kretprobe_table_unlock(hash, &flags);
+	}
+	free_rp_inst(rp);
+}
+
+/*
+* Add the new probe to ap->list. 
Fail if this is the
+* second jprobe at the address - two jprobes can't coexist
+*
+* A probe with a break_handler is appended at the tail of ap->list, any
+* other probe is inserted at the head.  Returns 0, or -EEXIST when p has a
+* break_handler and ap already has one.
+* NOTE(review): on the -EEXIST path ap has already been passed to
+* unoptimize_kprobe() above.
+*/
+static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
+{
+	BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
+
+	if (p->break_handler || p->post_handler)
+		unoptimize_kprobe(ap, true);	/* Fall back to normal kprobe */
+
+	if (p->break_handler) {
+		if (ap->break_handler)
+			return -EEXIST;
+		list_add_tail_rcu(&p->list, &ap->list);
+		ap->break_handler = aggr_break_handler;
+	} else
+		list_add_rcu(&p->list, &ap->list);
+	if (p->post_handler && !ap->post_handler)
+		ap->post_handler = aggr_post_handler;
+
+	if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
+		ap->flags &= ~KPROBE_FLAG_DISABLED;
+		if (!kprobes_all_disarmed)
+			/* Arm the breakpoint again. */
+			__arm_kprobe(ap);
+	}
+	return 0;
+}
+
+/*
+ * Fill in the required fields of the "manager kprobe". Replace the
+ * earlier kprobe in the hlist with the manager kprobe
+ *
+ * ap takes over p's address and flags (minus KPROBE_FLAG_OPTIMIZED), gets
+ * the aggr_* handlers, and p becomes the first entry on ap->list.
+ */
+static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+{
+	/* Copy p's insn slot to ap */
+	copy_kprobe(p, ap);
+	flush_insn_slot(ap);
+	ap->addr = p->addr;
+	ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED;
+	ap->pre_handler = aggr_pre_handler;
+	ap->fault_handler = aggr_fault_handler;
+	/* We don't care the kprobe which has gone. */
+	if (p->post_handler && !kprobe_gone(p))
+		ap->post_handler = aggr_post_handler;
+	if (p->break_handler && !kprobe_gone(p))
+		ap->break_handler = aggr_break_handler;
+
+	INIT_LIST_HEAD(&ap->list);
+	INIT_HLIST_NODE(&ap->hlist);
+
+	list_add_rcu(&p->list, &ap->list);
+	hlist_replace_rcu(&p->hlist, &ap->hlist);
+}
+
+/*
+ * This is the second or subsequent kprobe at the address - handle
+ * the intricacies
+ *
+ * @orig_p: the probe currently found at the address (plain or aggregator).
+ * @p:      the probe being added.
+ * Returns 0 on success, -ENOMEM if a new aggregator cannot be allocated,
+ * the arch_prepare_kprobe() error for a "gone" aggregator, or whatever
+ * add_new_kprobe() returns.
+ */
+static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
+					  struct kprobe *p)
+{
+	int ret = 0;
+	struct kprobe *ap = orig_p;
+
+	if (!kprobe_aggrprobe(orig_p)) {
+		/* If orig_p is not an aggr_kprobe, create new aggr_kprobe. */
+		ap = alloc_aggr_kprobe(orig_p);
+		if (!ap)
+			return -ENOMEM;
+		init_aggr_kprobe(ap, orig_p);
+	} else if (kprobe_unused(ap))
+		/* This probe is going to die. Rescue it */
+		reuse_unused_kprobe(ap);
+
+	if (kprobe_gone(ap)) {
+		/*
+		 * Attempting to insert new probe at the same location that
+		 * had a probe in the module vaddr area which already
+		 * freed. So, the instruction slot has already been
+		 * released. We need a new slot for the new probe.
+		 */
+		ret = arch_prepare_kprobe(ap);
+		if (ret)
+			/*
+			 * Even if fail to allocate new slot, don't need to
+			 * free aggr_probe. It will be used next time, or
+			 * freed by unregister_kprobe.
+			 */
+			return ret;
+
+		/* Prepare optimized instructions if possible. */
+		prepare_optimized_kprobe(ap);
+
+		/*
+		 * Clear gone flag to prevent allocating new slot again, and
+		 * set disabled flag because it is not armed yet.
+		 */
+		ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+			    | KPROBE_FLAG_DISABLED;
+	}
+
+	/* Copy ap's insn slot to p */
+	copy_kprobe(ap, p);
+	return add_new_kprobe(ap, p);
+}
+/*
+ * Returns -EINVAL if @addr lies in [__kprobes_text_start,
+ * __kprobes_text_end) or inside the range of a kprobe_blacklist entry
+ * whose start_addr has been filled in; 0 otherwise.
+ */
+static int __kprobes in_kprobes_functions(unsigned long addr)
+{
+	struct kprobe_blackpoint *kb;
+
+	if (addr >= (unsigned long)__kprobes_text_start &&
+	    addr < (unsigned long)__kprobes_text_end)
+		return -EINVAL;
+	/*
+	 * If there exists a kprobe_blacklist, verify and
+	 * fail any probe registration in the prohibited area
+	 */
+	for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
+		if (kb->start_addr) {
+			if (addr >= kb->start_addr &&
+			    addr < (kb->start_addr + kb->range))
+				return -EINVAL;
+		}
+	}
+	return 0;
+}
+
+/*
+ * If we have a symbol_name argument, look it up and add the offset field
+ * to it. This way, we can specify a relative address to a symbol. 
+ */
+static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
+{
+	kprobe_opcode_t *addr = p->addr;
+	if (p->symbol_name) {
+		if (addr)	/* symbol_name and addr must not both be set */
+			return NULL;
+		kprobe_lookup_name(p->symbol_name, addr);	/* presumably a macro that assigns addr -- confirm */
+	}
+
+	if (!addr)
+		return NULL;
+	return (kprobe_opcode_t *)(((char *)addr) + p->offset);
+}
+
+/*
+ * Check passed kprobe is valid and return kprobe in kprobe_table.
+ *
+ * Returns the probe found by get_kprobe(p->addr) when it is p itself or p
+ * is on its ->list; NULL otherwise.
+ */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+	struct kprobe *ap, *list_p;
+
+	ap = get_kprobe(p->addr);
+	if (unlikely(!ap))
+		return NULL;
+
+	if (p != ap) {
+		list_for_each_entry_rcu(list_p, &ap->list, list)
+			if (list_p == p)
+			/* kprobe p is a valid probe */
+				goto valid;
+		return NULL;
+	}
+valid:
+	return ap;
+}
+
+/*
+ * Return error if the kprobe is being re-registered.
+ * Takes kprobe_mutex around the lookup; -EINVAL if p is already known.
+ */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+	int ret = 0;
+
+	mutex_lock(&kprobe_mutex);
+	if (__get_valid_kprobe(p))
+		ret = -EINVAL;
+	mutex_unlock(&kprobe_mutex);
+
+	return ret;
+}
+/*
+ * Register a kprobe.
+ *
+ * Resolves the probe address with kprobe_addr(), rejects a probe that is
+ * already registered, and rejects addresses that are not kernel text, lie
+ * in the kprobes functions/blacklist, or are reserved by ftrace or jump
+ * labels.  A module being probed is pinned with try_module_get() for the
+ * duration of the call.  If a probe already exists at the address the new
+ * one is chained through register_aggr_kprobe(); otherwise it is prepared,
+ * hashed into kprobe_table, armed unless disabled or all probes are
+ * disarmed, and passed to try_to_optimize_kprobe().
+ *
+ * Returns 0 on success, -EINVAL for the rejections above, or the error
+ * from register_aggr_kprobe()/arch_prepare_kprobe().
+ */
+int __kprobes register_kprobe(struct kprobe *p)
+{
+	int ret = 0;
+	struct kprobe *old_p;
+	struct module *probed_mod;
+	kprobe_opcode_t *addr;
+
+	addr = kprobe_addr(p);
+	if (!addr)
+		return -EINVAL;
+	p->addr = addr;
+
+	ret = check_kprobe_rereg(p);
+	if (ret)
+		return ret;
+
+	jump_label_lock();
+	preempt_disable();
+	if (!kernel_text_address((unsigned long) p->addr) ||
+	    in_kprobes_functions((unsigned long) p->addr) ||
+	    ftrace_text_reserved(p->addr, p->addr) ||
+	    jump_label_text_reserved(p->addr, p->addr))
+		goto fail_with_jump_label;
+
+	/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+	p->flags &= KPROBE_FLAG_DISABLED;
+
+	/*
+	 * Check if are we probing a module.
+	 */
+	probed_mod = __module_text_address((unsigned long) p->addr);
+	if (probed_mod) {
+		/*
+		 * We must hold a refcount of the probed module while updating
+		 * its code to prohibit unexpected unloading.
+		 */
+		if (unlikely(!try_module_get(probed_mod)))
+			goto fail_with_jump_label;
+
+		/*
+		 * If the module freed .init.text, we couldn't insert
+		 * kprobes in there.
+		 */
+		if (within_module_init((unsigned long)p->addr, probed_mod) &&
+		    probed_mod->state != MODULE_STATE_COMING) {
+			module_put(probed_mod);
+			goto fail_with_jump_label;
+		}
+	}
+	preempt_enable();
+	jump_label_unlock();
+
+	p->nmissed = 0;
+	INIT_LIST_HEAD(&p->list);
+	mutex_lock(&kprobe_mutex);
+
+	jump_label_lock(); /* needed to call jump_label_text_reserved() */
+
+	get_online_cpus();	/* For avoiding text_mutex deadlock. */
+	mutex_lock(&text_mutex);
+
+	old_p = get_kprobe(p->addr);
+	if (old_p) {
+		/* Since this may unoptimize old_p, locking text_mutex. */
+		ret = register_aggr_kprobe(old_p, p);
+		goto out;
+	}
+
+	ret = arch_prepare_kprobe(p);
+	if (ret)
+		goto out;
+
+	INIT_HLIST_NODE(&p->hlist);
+	hlist_add_head_rcu(&p->hlist,
+		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
+
+	if (!kprobes_all_disarmed && !kprobe_disabled(p))
+		__arm_kprobe(p);
+
+	/* Try to optimize kprobe */
+	try_to_optimize_kprobe(p);
+
+out:
+	mutex_unlock(&text_mutex);
+	put_online_cpus();
+	jump_label_unlock();
+	mutex_unlock(&kprobe_mutex);
+
+	if (probed_mod)
+		module_put(probed_mod);	/* drop the reference taken above */
+
+	return ret;
+
+fail_with_jump_label:
+	preempt_enable();
+	jump_label_unlock();
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(register_kprobe);
+
+/* Check if all probes on the aggrprobe are disabled */
+static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+{
+	struct kprobe *kp;
+
+	list_for_each_entry_rcu(kp, &ap->list, list)
+		if (!kprobe_disabled(kp))
+			/*
+			 * There is an active probe on the list.
+			 * We can't disable this ap.
+			 */
+			return 0;
+
+	return 1;
+}
+
+/*
+ * Disable one kprobe: Make sure called under kprobe_mutex is locked.
+ *
+ * Returns the probe registered in kprobe_table for p (p itself or its
+ * aggregator), or NULL if p is not registered.  The table probe is
+ * disarmed and marked disabled only when p is that probe or every child
+ * on its list is now disabled.
+ */
+static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+{
+	struct kprobe *orig_p;
+
+	/* Get an original kprobe for return */
+	orig_p = __get_valid_kprobe(p);
+	if (unlikely(orig_p == NULL))
+		return NULL;
+
+	if (!kprobe_disabled(p)) {
+		/* Disable probe if it is a child probe */
+		if (p != orig_p)
+			p->flags |= KPROBE_FLAG_DISABLED;
+
+		/* Try to disarm and disable this/parent probe */
+		if (p == orig_p || aggr_kprobe_disabled(orig_p)) {
+			disarm_kprobe(orig_p);
+			orig_p->flags |= KPROBE_FLAG_DISABLED;
+		}
+	}
+
+	return orig_p;
+}
+
+/*
+ * Unregister a kprobe without a scheduler synchronization.
+ *
+ * Returns -EINVAL if p is not registered, 0 otherwise.  Either removes the
+ * table probe from its hash list (label "disarmed") or only unlinks p from
+ * its aggregator's list.
+ */
+static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+{
+	struct kprobe *ap, *list_p;
+
+	/* Disable kprobe. This will disarm it if needed. */
+	ap = __disable_kprobe(p);
+	if (ap == NULL)
+		return -EINVAL;
+
+	if (ap == p)
+		/*
+		 * This probe is an independent(and non-optimized) kprobe
+		 * (not an aggrprobe). Remove from the hash list.
+		 */
+		goto disarmed;
+
+	/* Following process expects this probe is an aggrprobe */
+	WARN_ON(!kprobe_aggrprobe(ap));
+
+	if (list_is_singular(&ap->list) && kprobe_disarmed(ap))
+		/*
+		 * !disarmed could be happen if the probe is under delayed
+		 * unoptimizing.
+		 */
+		goto disarmed;
+	else {
+		/* If disabling probe has special handlers, update aggrprobe */
+		if (p->break_handler && !kprobe_gone(p))
+			ap->break_handler = NULL;
+		if (p->post_handler && !kprobe_gone(p)) {
+			list_for_each_entry_rcu(list_p, &ap->list, list) {
+				if ((list_p != p) && (list_p->post_handler))
+					goto noclean;
+			}
+			ap->post_handler = NULL;
+		}
+noclean:
+		/*
+		 * Remove from the aggrprobe: this path will do nothing in
+		 * __unregister_kprobe_bottom().
+		 */
+		list_del_rcu(&p->list);
+		if (!kprobe_disabled(ap) && !kprobes_all_disarmed)
+			/*
+			 * Try to optimize this probe again, because post
+			 * handler may have been changed.
+			 */
+			optimize_kprobe(ap);
+	}
+	return 0;
+
+disarmed:
+	BUG_ON(!kprobe_disarmed(ap));
+	hlist_del_rcu(&ap->hlist);
+	return 0;
+}
+/*
+ * Second half of unregistration; the callers in this file run it after
+ * synchronize_sched().  Releases the arch resources of an independent
+ * probe, or frees the aggregator when p was its last child.
+ */
+static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
+{
+	struct kprobe *ap;
+
+	if (list_empty(&p->list))
+		/* This is an independent kprobe */
+		arch_remove_kprobe(p);
+	else if (list_is_singular(&p->list)) {
+		/* This is the last child of an aggrprobe */
+		ap = list_entry(p->list.next, struct kprobe, list);
+		list_del(&p->list);
+		free_aggr_kprobe(ap);
+	}
+	/* Otherwise, do nothing. */
+}
+/*
+ * Register @num kprobes from @kps in order.  On the first failure the
+ * probes registered so far are unregistered again and that error is
+ * returned.  Returns -EINVAL when num <= 0.
+ */
+int __kprobes register_kprobes(struct kprobe **kps, int num)
+{
+	int i, ret = 0;
+
+	if (num <= 0)
+		return -EINVAL;
+	for (i = 0; i < num; i++) {
+		ret = register_kprobe(kps[i]);
+		if (ret < 0) {
+			if (i > 0)
+				unregister_kprobes(kps, i);
+			break;
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_kprobes);
+
+void __kprobes unregister_kprobe(struct kprobe *p)
+{
+	unregister_kprobes(&p, 1);
+}
+EXPORT_SYMBOL_GPL(unregister_kprobe);
+/*
+ * Unregister @num kprobes.  Phase one runs __unregister_kprobe_top() on
+ * each probe under kprobe_mutex; a probe for which that fails gets its
+ * ->addr cleared so that phase two skips it.  After synchronize_sched(),
+ * phase two runs __unregister_kprobe_bottom() on the remaining probes.
+ */
+void __kprobes unregister_kprobes(struct kprobe **kps, int num)
+{
+	int i;
+
+	if (num <= 0)
+		return;
+	mutex_lock(&kprobe_mutex);
+	for (i = 0; i < num; i++)
+		if (__unregister_kprobe_top(kps[i]) < 0)
+			kps[i]->addr = NULL;
+	mutex_unlock(&kprobe_mutex);
+
+	synchronize_sched();
+	for (i = 0; i < num; i++)
+		if (kps[i]->addr)
+			__unregister_kprobe_bottom(kps[i]);
+}
+EXPORT_SYMBOL_GPL(unregister_kprobes);
+
+static struct notifier_block kprobe_exceptions_nb = {
+	.notifier_call = kprobe_exceptions_notify,
+	.priority = 0x7fffffff /* we need to be notified first */
+};
+/*
+ * Default conversion of a jprobe entry pointer to an address: a plain
+ * cast.  Declared __weak, so another definition can replace it.
+ */
+unsigned long __weak arch_deref_entry_point(void *entry)
+{
+	return (unsigned long)entry;
+}
+/*
+ * Register @num jprobes.  Each entry must resolve, via kallsyms, to offset
+ * 0 of a symbol; otherwise -EINVAL.  On the first failure the jprobes
+ * registered so far are unregistered and that error is returned.
+ */
+int __kprobes register_jprobes(struct jprobe **jps, int num)
+{
+	struct jprobe *jp;
+	int ret = 0, i;
+
+	if (num <= 0)
+		return -EINVAL;
+	for (i = 0; i < num; i++) {
+		unsigned long addr, offset;
+		jp = jps[i];
+		addr = arch_deref_entry_point(jp->entry);
+
+		/* Verify probepoint is a function entry point */
+		if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
+		    offset == 0) {
+			jp->kp.pre_handler = setjmp_pre_handler;
+			jp->kp.break_handler = longjmp_break_handler;
+			ret = register_kprobe(&jp->kp);
+		} else
+			ret = -EINVAL;
+
+		if (ret < 0) {
+			if (i > 0)
+				unregister_jprobes(jps, i);
+			break;
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_jprobes);
+
+int __kprobes register_jprobe(struct jprobe *jp)
+{
+	return register_jprobes(&jp, 1);
+}
+EXPORT_SYMBOL_GPL(register_jprobe);
+
+void __kprobes unregister_jprobe(struct jprobe *jp)
+{
+	unregister_jprobes(&jp, 1);
+}
+EXPORT_SYMBOL_GPL(unregister_jprobe);
+/*
+ * Same two-phase scheme as unregister_kprobes(), applied to the kprobe
+ * embedded in each jprobe.
+ */
+void __kprobes unregister_jprobes(struct jprobe **jps, int num)
+{
+	int i;
+
+	if (num <= 0)
+		return;
+	mutex_lock(&kprobe_mutex);
+	for (i = 0; i < num; i++)
+		if (__unregister_kprobe_top(&jps[i]->kp) < 0)
+			jps[i]->kp.addr = NULL;
+	mutex_unlock(&kprobe_mutex);
+
+	synchronize_sched();
+	for (i = 0; i < num; i++) {
+		if (jps[i]->kp.addr)
+			__unregister_kprobe_bottom(&jps[i]->kp);
+	}
+}
+EXPORT_SYMBOL_GPL(unregister_jprobes);
+
+#ifdef CONFIG_KRETPROBES
+/*
+ * This kprobe pre_handler is registered with every kretprobe. When probe
+ * hits it will set up the return probe. 
+ */
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+					   struct pt_regs *regs)
+{
+	struct kretprobe *rp = container_of(p, struct kretprobe, kp);
+	unsigned long hash, flags = 0;
+	struct kretprobe_instance *ri;
+
+	/*TODO: consider to only swap the RA after the last pre_handler fired */
+	hash = hash_ptr(current, KPROBE_HASH_BITS);
+	spin_lock_irqsave(&rp->lock, flags);
+	if (!hlist_empty(&rp->free_instances)) {
+		ri = hlist_entry(rp->free_instances.first,
+				struct kretprobe_instance, hlist);
+		hlist_del(&ri->hlist);
+		spin_unlock_irqrestore(&rp->lock, flags);
+
+		ri->rp = rp;
+		ri->task = current;
+
+		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
+			/* entry_handler returned nonzero: put the instance back */
+			spin_lock_irqsave(&rp->lock, flags);
+			hlist_add_head(&ri->hlist, &rp->free_instances);
+			spin_unlock_irqrestore(&rp->lock, flags);
+			return 0;
+		}
+
+		arch_prepare_kretprobe(ri, regs);
+
+		/* XXX(hch): why is there no hlist_move_head? */
+		INIT_HLIST_NODE(&ri->hlist);
+		kretprobe_table_lock(hash, &flags);
+		hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
+		kretprobe_table_unlock(hash, &flags);
+	} else {
+		rp->nmissed++;	/* no free instance left for this hit */
+		spin_unlock_irqrestore(&rp->lock, flags);
+	}
+	return 0;
+}
+/*
+ * Register a kretprobe.
+ *
+ * Rejects blacklisted addresses (when a kretprobe blacklist exists),
+ * installs pre_handler_kretprobe() as the only handler of the embedded
+ * kprobe, pre-allocates rp->maxactive instances (a default is chosen when
+ * maxactive <= 0) and then registers the embedded kprobe.
+ *
+ * Returns 0, -EINVAL, -ENOMEM, or the register_kprobe() error; on failure
+ * after allocation the instances are freed again.
+ */
+int __kprobes register_kretprobe(struct kretprobe *rp)
+{
+	int ret = 0;
+	struct kretprobe_instance *inst;
+	int i;
+	void *addr;
+
+	if (kretprobe_blacklist_size) {
+		addr = kprobe_addr(&rp->kp);
+		if (!addr)
+			return -EINVAL;
+
+		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
+			if (kretprobe_blacklist[i].addr == addr)
+				return -EINVAL;
+		}
+	}
+
+	rp->kp.pre_handler = pre_handler_kretprobe;
+	rp->kp.post_handler = NULL;
+	rp->kp.fault_handler = NULL;
+	rp->kp.break_handler = NULL;
+
+	/* Pre-allocate memory for max kretprobe instances */
+	if (rp->maxactive <= 0) {
+#ifdef CONFIG_PREEMPT
+		rp->maxactive = max_t(unsigned int, 10, 2*num_possible_cpus());
+#else
+		rp->maxactive = num_possible_cpus();
+#endif
+	}
+	spin_lock_init(&rp->lock);
+	INIT_HLIST_HEAD(&rp->free_instances);
+	for (i = 0; i < rp->maxactive; i++) {
+		inst = kmalloc(sizeof(struct kretprobe_instance) +
+			       rp->data_size, GFP_KERNEL);
+		if (inst == NULL) {
+			free_rp_inst(rp);
+			return -ENOMEM;
+		}
+		INIT_HLIST_NODE(&inst->hlist);
+		hlist_add_head(&inst->hlist, &rp->free_instances);
+	}
+
+	rp->nmissed = 0;
+	/* Establish function entry probe point */
+	ret = register_kprobe(&rp->kp);
+	if (ret != 0)
+		free_rp_inst(rp);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_kretprobe);
+/*
+ * Register @num kretprobes in order; on the first failure the ones already
+ * registered are unregistered and that error is returned.
+ */
+int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+{
+	int ret = 0, i;
+
+	if (num <= 0)
+		return -EINVAL;
+	for (i = 0; i < num; i++) {
+		ret = register_kretprobe(rps[i]);
+		if (ret < 0) {
+			if (i > 0)
+				unregister_kretprobes(rps, i);
+			break;
+		}
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
+void __kprobes unregister_kretprobe(struct kretprobe *rp)
+{
+	unregister_kretprobes(&rp, 1);
+}
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+/*
+ * Two-phase unregistration of the embedded kprobes (top half under
+ * kprobe_mutex, synchronize_sched(), bottom half), followed by
+ * cleanup_rp_inst() for every kretprobe whose top half succeeded.
+ */
+void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+{
+	int i;
+
+	if (num <= 0)
+		return;
+	mutex_lock(&kprobe_mutex);
+	for (i = 0; i < num; i++)
+		if (__unregister_kprobe_top(&rps[i]->kp) < 0)
+			rps[i]->kp.addr = NULL;
+	mutex_unlock(&kprobe_mutex);
+
+	synchronize_sched();
+	for (i = 0; i < num; i++) {
+		if (rps[i]->kp.addr) {
+			__unregister_kprobe_bottom(&rps[i]->kp);
+			cleanup_rp_inst(rps[i]);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
+
+#else /* !CONFIG_KRETPROBES: registration returns -ENOSYS, the rest are empty */
+int __kprobes register_kretprobe(struct kretprobe *rp)
+{
+	return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(register_kretprobe);
+
+int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+{
+	return -ENOSYS;
+}
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
+void __kprobes unregister_kretprobe(struct kretprobe *rp)
+{
+}
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
+
+void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+{
+}
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
+
+static int __kprobes pre_handler_kretprobe(struct kprobe *p,
+					   struct pt_regs *regs)
+{
+	return 0;	/* stub; report_probe() still compares handlers against it */
+}
+
+#endif /* CONFIG_KRETPROBES */
+
+/*
+ * Set the kprobe gone and remove its instruction buffer.
+ *
+ * For an aggregator every chained probe is marked GONE as well, the post
+ * and break handlers are cleared and kill_optimized_kprobe() is called.
+ */
+static void __kprobes kill_kprobe(struct kprobe *p)
+{
+	struct kprobe *kp;
+
+	p->flags |= KPROBE_FLAG_GONE;
+	if (kprobe_aggrprobe(p)) {
+		/*
+		 * If this is an aggr_kprobe, we have to list all the
+		 * chained probes and mark them GONE.
+		 */
+		list_for_each_entry_rcu(kp, &p->list, list)
+			kp->flags |= KPROBE_FLAG_GONE;
+		p->post_handler = NULL;
+		p->break_handler = NULL;
+		kill_optimized_kprobe(p);
+	}
+	/*
+	 * Here, we can remove insn_slot safely, because no thread calls
+	 * the original probed function (which will be freed soon) any more.
+	 */
+	arch_remove_kprobe(p);
+}
+
+/* Disable one kprobe.  Returns 0, or -EINVAL if kp is not registered. */
+int __kprobes disable_kprobe(struct kprobe *kp)
+{
+	int ret = 0;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* Disable this kprobe */
+	if (__disable_kprobe(kp) == NULL)
+		ret = -EINVAL;
+
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/*
+ * Enable one kprobe.
+ *
+ * Returns 0, or -EINVAL if kp is not registered or is marked gone.  A
+ * child probe has its own DISABLED flag cleared; the table probe is
+ * re-enabled and armed only if it was disabled and probes are not globally
+ * disarmed.
+ */
+int __kprobes enable_kprobe(struct kprobe *kp)
+{
+	int ret = 0;
+	struct kprobe *p;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* Check whether specified probe is valid. */
+	p = __get_valid_kprobe(kp);
+	if (unlikely(p == NULL)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (kprobe_gone(kp)) {
+		/* This kprobe has gone, we couldn't enable it. */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (p != kp)
+		kp->flags &= ~KPROBE_FLAG_DISABLED;
+
+	if (!kprobes_all_disarmed && kprobe_disabled(p)) {
+		p->flags &= ~KPROBE_FLAG_DISABLED;
+		arm_kprobe(p);
+	}
+out:
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+/*
+ * Print a probe's symbol name, address and offset at KERN_WARNING level.
+ */
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+	printk(KERN_WARNING "Dumping kprobe:\n");
+	printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+	       kp->symbol_name, kp->addr, kp->offset);
+}
+
+/*
+ * Module notifier call back, checking kprobes on the module.
+ *
+ * Only MODULE_STATE_GOING and MODULE_STATE_LIVE are handled; every probe
+ * whose address lies in the module's init area (and, for GOING, its core
+ * area) is passed to kill_kprobe().  Always returns NOTIFY_DONE.
+ */
+static int __kprobes kprobes_module_callback(struct notifier_block *nb,
+					     unsigned long val, void *data)
+{
+	struct module *mod = data;
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct kprobe *p;
+	unsigned int i;
+	int checkcore = (val == MODULE_STATE_GOING);
+
+	if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
+		return NOTIFY_DONE;
+
+	/*
+	 * When MODULE_STATE_GOING was notified, both of module .text and
+	 * .init.text sections would be freed. When MODULE_STATE_LIVE was
+	 * notified, only .init.text section would be freed. We need to
+	 * disable kprobes which have been inserted in the sections.
+	 */
+	mutex_lock(&kprobe_mutex);
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+		head = &kprobe_table[i];
+		hlist_for_each_entry_rcu(p, node, head, hlist)
+			if (within_module_init((unsigned long)p->addr, mod) ||
+			    (checkcore &&
+			     within_module_core((unsigned long)p->addr, mod))) {
+				/*
+				 * The vaddr this probe is installed will soon
+				 * be vfreed but not synced to disk. Hence,
+				 * disarming the breakpoint isn't needed.
+				 */
+				kill_kprobe(p);
+			}
+	}
+	mutex_unlock(&kprobe_mutex);
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block kprobe_module_nb = {
+	.notifier_call = kprobes_module_callback,
+	.priority = 0
+};
+/*
+ * Subsystem init: initialise the hash tables and bucket locks, resolve the
+ * kprobe and kretprobe blacklists, set the default optimization/armed
+ * state, then run arch init and register the die and module notifiers.
+ * kprobes_initialized is set only if all of that succeeded.
+ */
+static int __init init_kprobes(void)
+{
+	int i, err = 0;
+	unsigned long offset = 0, size = 0;
+	char *modname, namebuf[128];
+	const char *symbol_name;
+	void *addr;
+	struct kprobe_blackpoint *kb;
+
+	/* FIXME allocate the probe table, currently defined statically */
+	/* initialize all list heads */
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+		INIT_HLIST_HEAD(&kprobe_table[i]);
+		INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
+		spin_lock_init(&(kretprobe_table_locks[i].lock));
+	}
+
+	/*
+	 * Lookup and populate the kprobe_blacklist.
+	 *
+	 * Unlike the kretprobe blacklist, we'll need to determine
+	 * the range of addresses that belong to the said functions,
+	 * since a kprobe need not necessarily be at the beginning
+	 * of a function.
+	 */
+	for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
+		kprobe_lookup_name(kb->name, addr);
+		if (!addr)
+			continue;
+
+		kb->start_addr = (unsigned long)addr;
+		symbol_name = kallsyms_lookup(kb->start_addr,
+				&size, &offset, &modname, namebuf);
+		if (!symbol_name)
+			kb->range = 0;
+		else
+			kb->range = size;
+	}
+
+	if (kretprobe_blacklist_size) {
+		/* lookup the function address from its name */
+		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
+			kprobe_lookup_name(kretprobe_blacklist[i].name,
+					   kretprobe_blacklist[i].addr);
+			if (!kretprobe_blacklist[i].addr)
+				/* NOTE(review): this printk carries no KERN_<level> prefix */
+				printk("kretprobe: lookup failed: %s\n",
+					kretprobe_blacklist[i].name);
+		}
+	}
+
+#if defined(CONFIG_OPTPROBES)
+#if defined(__ARCH_WANT_KPROBES_INSN_SLOT)
+	/* Init kprobe_optinsn_slots */
+	kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE;
+#endif
+	/* By default, kprobes can be optimized */
+	kprobes_allow_optimization = true;
+#endif
+
+	/* By default, kprobes are armed */
+	kprobes_all_disarmed = false;
+
+	err = arch_init_kprobes();
+	if (!err)
+		err = register_die_notifier(&kprobe_exceptions_nb);
+	if (!err)
+		err = register_module_notifier(&kprobe_module_nb);
+
+	kprobes_initialized = (err == 0);
+
+	if (!err)
+		init_test_probes();
+	return err;
+}
+
+#ifdef CONFIG_DEBUG_FS
+static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
+		const char *sym, int offset, char *modname, struct kprobe *pp)
+{
+	char *kprobe_type;
+
+	/* Type letter: r = kretprobe, j = jprobe, k = anything else */
+	if (p->pre_handler == pre_handler_kretprobe)
+		kprobe_type = "r";
+	else if (p->pre_handler == setjmp_pre_handler)
+		kprobe_type = "j";
+	else
+		kprobe_type = "k";
+
+	if (sym)
+		seq_printf(pi, "%p %s %s+0x%x %s ",
+			p->addr, kprobe_type, sym, offset,
+			(modname ? modname : " "));
+	else
+		seq_printf(pi, "%p %s %p ",
+			p->addr, kprobe_type, p->addr);
+
+	if (!pp)	/* no aggregator given: p's own state decides [OPTIMIZED] */
+		pp = p;
+	seq_printf(pi, "%s%s%s\n",
+		(kprobe_gone(p) ? "[GONE]" : ""),
+		((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""),
+		(kprobe_optimized(pp) ? "[OPTIMIZED]" : ""));
+}
+/*
+ * seq_file iteration for the "list" file: the position is used as a
+ * kprobe_table bucket index and the returned cookie is the position
+ * pointer itself.
+ */
+static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
+{
+	return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
+}
+
+static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	(*pos)++;
+	if (*pos >= KPROBE_TABLE_SIZE)
+		return NULL;
+	return pos;
+}
+
+static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
+{
+	/* Nothing to do */
+}
+/*
+ * Emit one report_probe() line per probe in bucket *v; for an aggregator,
+ * one line per child with the aggregator passed as pp.  The bucket is
+ * walked with preemption disabled.
+ */
+static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct kprobe *p, *kp;
+	const char *sym = NULL;
+	unsigned int i = *(loff_t *) v;
+	unsigned long offset = 0;
+	char *modname, namebuf[128];
+
+	head = &kprobe_table[i];
+	preempt_disable();
+	hlist_for_each_entry_rcu(p, node, head, hlist) {
+		sym = kallsyms_lookup((unsigned long)p->addr, NULL,
+					&offset, &modname, namebuf);
+		if (kprobe_aggrprobe(p)) {
+			list_for_each_entry_rcu(kp, &p->list, list)
+				report_probe(pi, kp, sym, offset, modname, p);
+		} else
+			report_probe(pi, p, sym, offset, modname, NULL);
+	}
+	preempt_enable();
+	return 0;
+}
+
+static const struct seq_operations kprobes_seq_ops = {
+	.start = kprobe_seq_start,
+	.next  = kprobe_seq_next,
+	.stop  = kprobe_seq_stop,
+	.show  = show_kprobe_addr
+};
+
+static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &kprobes_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobes_operations = {
+	.open           = kprobes_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+/*
+ * Arm every non-disabled probe in kprobe_table (under kprobe_mutex and
+ * text_mutex) and clear kprobes_all_disarmed.  No-op if probes are already
+ * armed.
+ */
+static void __kprobes arm_all_kprobes(void)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct kprobe *p;
+	unsigned int i;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* If kprobes are armed, just return */
+	if (!kprobes_all_disarmed)
+		goto already_enabled;
+
+	/* Arming kprobes doesn't optimize kprobe itself */
+	mutex_lock(&text_mutex);
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+		head = &kprobe_table[i];
+		hlist_for_each_entry_rcu(p, node, head, hlist)
+			if (!kprobe_disabled(p))
+				__arm_kprobe(p);
+	}
+	mutex_unlock(&text_mutex);
+
+	kprobes_all_disarmed = false;
+	printk(KERN_INFO "Kprobes globally enabled\n");
+
+already_enabled:
+	mutex_unlock(&kprobe_mutex);
+	return;
+}
+/*
+ * Set kprobes_all_disarmed and disarm every non-disabled probe except
+ * those for which arch_trampoline_kprobe() is true, then wait in
+ * wait_for_kprobe_optimizer() after both mutexes have been released.
+ * No-op if probes are already disarmed.
+ */
+static void __kprobes disarm_all_kprobes(void)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct kprobe *p;
+	unsigned int i;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* If kprobes are already disarmed, just return */
+	if (kprobes_all_disarmed) {
+		mutex_unlock(&kprobe_mutex);
+		return;
+	}
+
+	kprobes_all_disarmed = true;
+	printk(KERN_INFO "Kprobes globally disabled\n");
+
+	mutex_lock(&text_mutex);
+	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+		head = &kprobe_table[i];
+		hlist_for_each_entry_rcu(p, node, head, hlist) {
+			if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
+				__disarm_kprobe(p, false);
+		}
+	}
+	mutex_unlock(&text_mutex);
+	mutex_unlock(&kprobe_mutex);
+
+	/* Wait for disarming all kprobes by optimizer */
+	wait_for_kprobe_optimizer();
+}
+
+/*
+ * XXX: The debugfs bool file interface doesn't allow for callbacks
+ * when the bool state is switched. 
We can reuse that facility when + * available + */ +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + /* Report '1' while kprobes are armed and '0' once they are all disarmed. */ + if (!kprobes_all_disarmed) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = 0x00; + /* Only the digit and the newline are exposed; the trailing NUL is not. */ + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +/* + * Write handler for the debugfs "enabled" file. Only the first byte written + * is examined: 'y', 'Y' or '1' arms all kprobes, 'n', 'N' or '0' disarms them + * and any other value is silently ignored. Returns @count, or -EFAULT when + * the user buffer cannot be copied. + */ +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size; + + buf_size = min(count, (sizeof(buf)-1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + /* + * Terminate the copy so that a zero-length write tests '\0' below + * instead of an uninitialised stack byte. + */ + buf[buf_size] = '\0'; + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + arm_all_kprobes(); + break; + case 'n': + case 'N': + case '0': + disarm_all_kprobes(); + break; + } + + return count; +} + +static const struct file_operations fops_kp = { + .read = read_enabled_file_bool, + .write = write_enabled_file_bool, + .llseek = default_llseek, +}; + +/* + * Create the debugfs directory "kprobes" holding the "list" file (mode 0444) + * and the "enabled" switch (mode 0600). Returns 0 on success; if any entry + * cannot be created, debugfs_remove() is called on the directory and -ENOMEM + * is returned. + */ +static int __kprobes debugfs_kprobe_init(void) +{ + struct dentry *dir, *file; + unsigned int value = 1; + + dir = debugfs_create_dir("kprobes", NULL); + if (!dir) + return -ENOMEM; + + file = debugfs_create_file("list", 0444, dir, NULL, + &debugfs_kprobes_operations); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + + file = debugfs_create_file("enabled", 0600, dir, + &value, &fops_kp); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + + return 0; +} + +late_initcall(debugfs_kprobe_init); +#endif /* CONFIG_DEBUG_FS */ + +module_init(init_kprobes); + +/* defined in arch/.../kernel/kprobes.c */ +EXPORT_SYMBOL_GPL(jprobe_return); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c new file mode 100644 index 00000000..3b053c04 --- /dev/null +++ b/kernel/ksysfs.c @@ -0,0 +1,221 @@ +/* + * kernel/ksysfs.c - sysfs attributes in /sys/kernel, which + * are not related to any other subsystem + * + * Copyright (C) 2004 Kay Sievers + * + * This file is released under the GPLv2 + * + */ + +#include +#include +#include 
+#include +#include +#include +#include +#include +#include + +#define KERNEL_ATTR_RO(_name) \ +static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define KERNEL_ATTR_RW(_name) \ +static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +#if defined(CONFIG_HOTPLUG) +/* current uevent sequence number */ +static ssize_t uevent_seqnum_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum); +} +KERNEL_ATTR_RO(uevent_seqnum); + +/* uevent helper program, used during early boot */ +static ssize_t uevent_helper_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%s\n", uevent_helper); +} +static ssize_t uevent_helper_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (count+1 > UEVENT_HELPER_PATH_LEN) + return -ENOENT; + memcpy(uevent_helper, buf, count); + uevent_helper[count] = '\0'; + if (count && uevent_helper[count-1] == '\n') + uevent_helper[count-1] = '\0'; + return count; +} +KERNEL_ATTR_RW(uevent_helper); +#endif + +#ifdef CONFIG_PROFILING +static ssize_t profiling_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", prof_on); +} +static ssize_t profiling_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int ret; + + if (prof_on) + return -EEXIST; + /* + * This eventually calls into get_option() which + * has a ton of callers and is not const. It is + * easiest to cast it away here. 
+ */ + profile_setup((char *)buf); + ret = profile_init(); + if (ret) + return ret; + ret = create_proc_profile(); + if (ret) + return ret; + return count; +} +KERNEL_ATTR_RW(profiling); +#endif + +#ifdef CONFIG_KEXEC +static ssize_t kexec_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", !!kexec_image); +} +KERNEL_ATTR_RO(kexec_loaded); + +static ssize_t kexec_crash_loaded_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", !!kexec_crash_image); +} +KERNEL_ATTR_RO(kexec_crash_loaded); + +static ssize_t kexec_crash_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%zu\n", crash_get_memory_size()); +} +static ssize_t kexec_crash_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long cnt; + int ret; + + if (strict_strtoul(buf, 0, &cnt)) + return -EINVAL; + + ret = crash_shrink_memory(cnt); + return ret < 0 ? ret : count; +} +KERNEL_ATTR_RW(kexec_crash_size); + +static ssize_t vmcoreinfo_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lx %x\n", + paddr_vmcoreinfo_note(), + (unsigned int)vmcoreinfo_max_size); +} +KERNEL_ATTR_RO(vmcoreinfo); + +#endif /* CONFIG_KEXEC */ + +/* whether file capabilities are enabled */ +static ssize_t fscaps_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", file_caps_enabled); +} +KERNEL_ATTR_RO(fscaps); + +/* + * Make /sys/kernel/notes give the raw contents of our kernel .notes section. 
+ */ +extern const void __start_notes __attribute__((weak)); +extern const void __stop_notes __attribute__((weak)); +#define notes_size (&__stop_notes - &__start_notes) + +static ssize_t notes_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t count) +{ + memcpy(buf, &__start_notes + off, count); + return count; +} + +static struct bin_attribute notes_attr = { + .attr = { + .name = "notes", + .mode = S_IRUGO, + }, + .read = ¬es_read, +}; + +struct kobject *kernel_kobj; +EXPORT_SYMBOL_GPL(kernel_kobj); + +static struct attribute * kernel_attrs[] = { + &fscaps_attr.attr, +#if defined(CONFIG_HOTPLUG) + &uevent_seqnum_attr.attr, + &uevent_helper_attr.attr, +#endif +#ifdef CONFIG_PROFILING + &profiling_attr.attr, +#endif +#ifdef CONFIG_KEXEC + &kexec_loaded_attr.attr, + &kexec_crash_loaded_attr.attr, + &kexec_crash_size_attr.attr, + &vmcoreinfo_attr.attr, +#endif + NULL +}; + +static struct attribute_group kernel_attr_group = { + .attrs = kernel_attrs, +}; + +static int __init ksysfs_init(void) +{ + int error; + + kernel_kobj = kobject_create_and_add("kernel", NULL); + if (!kernel_kobj) { + error = -ENOMEM; + goto exit; + } + error = sysfs_create_group(kernel_kobj, &kernel_attr_group); + if (error) + goto kset_exit; + + if (notes_size > 0) { + notes_attr.size = notes_size; + error = sysfs_create_bin_file(kernel_kobj, ¬es_attr); + if (error) + goto group_exit; + } + + return 0; + +group_exit: + sysfs_remove_group(kernel_kobj, &kernel_attr_group); +kset_exit: + kobject_put(kernel_kobj); +exit: + return error; +} + +core_initcall(ksysfs_init); diff --git a/kernel/kthread.c b/kernel/kthread.c new file mode 100644 index 00000000..4ba7cccb --- /dev/null +++ b/kernel/kthread.c @@ -0,0 +1,443 @@ +/* Kernel thread helper functions. + * Copyright (C) 2004 IBM Corporation, Rusty Russell. 
+ * + * Creation is done via kthreadd, so that we get a clean environment + * even if we're invoked from userspace (think modprobe, hotplug cpu, + * etc.). + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(kthread_create_lock); +static LIST_HEAD(kthread_create_list); +struct task_struct *kthreadd_task; + +struct kthread_create_info +{ + /* Information passed to kthread() from kthreadd. */ + int (*threadfn)(void *data); + void *data; + int node; + + /* Result passed back to kthread_create() from kthreadd. */ + struct task_struct *result; + struct completion done; + + struct list_head list; +}; + +struct kthread { + int should_stop; + void *data; + struct completion exited; +}; + +#define to_kthread(tsk) \ + container_of((tsk)->vfork_done, struct kthread, exited) + +/** + * kthread_should_stop - should this kthread return now? + * + * When someone calls kthread_stop() on your kthread, it will be woken + * and this will return true. You should then return, and your return + * value will be passed through to kthread_stop(). + */ +int kthread_should_stop(void) +{ + return to_kthread(current)->should_stop; +} +EXPORT_SYMBOL(kthread_should_stop); + +/** + * kthread_data - return data value specified on kthread creation + * @task: kthread task in question + * + * Return the data value specified when kthread @task was created. + * The caller is responsible for ensuring the validity of @task when + * calling this function. 
+ */ +void *kthread_data(struct task_struct *task) +{ + return to_kthread(task)->data; +} + +static int kthread(void *_create) +{ + /* Copy data: it's on kthread's stack */ + struct kthread_create_info *create = _create; + int (*threadfn)(void *data) = create->threadfn; + void *data = create->data; + struct kthread self; + int ret; + + self.should_stop = 0; + self.data = data; + init_completion(&self.exited); + current->vfork_done = &self.exited; + + /* OK, tell user we're spawned, wait for stop or wakeup */ + __set_current_state(TASK_UNINTERRUPTIBLE); + create->result = current; + complete(&create->done); + schedule(); + + ret = -EINTR; + if (!self.should_stop) + ret = threadfn(data); + + /* we can't just return, we must preserve "self" on stack */ + do_exit(ret); +} + +/* called from do_fork() to get node information for about to be created task */ +int tsk_fork_get_node(struct task_struct *tsk) +{ +#ifdef CONFIG_NUMA + if (tsk == kthreadd_task) + return tsk->pref_node_fork; +#endif + return numa_node_id(); +} + +static void create_kthread(struct kthread_create_info *create) +{ + int pid; + +#ifdef CONFIG_NUMA + current->pref_node_fork = create->node; +#endif + /* We want our own signal handler (we take no signals by default). */ + pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); + if (pid < 0) { + create->result = ERR_PTR(pid); + complete(&create->done); + } +} + +/** + * kthread_create_on_node - create a kthread. + * @threadfn: the function to run until signal_pending(current). + * @data: data ptr for @threadfn. + * @node: memory node number. + * @namefmt: printf-style name for the thread. + * + * Description: This helper function creates and names a kernel + * thread. The thread will be stopped: use wake_up_process() to start + * it. See also kthread_run(). + * + * If thread is going to be bound on a particular cpu, give its node + * in @node, to get NUMA affinity for kthread stack, or else give -1. 
+ * When woken, the thread will run @threadfn() with @data as its + * argument. @threadfn() can either call do_exit() directly if it is a + * standalone thread for which no one will call kthread_stop(), or + * return when 'kthread_should_stop()' is true (which means + * kthread_stop() has been called). The return value should be zero + * or a negative error number; it will be passed to kthread_stop(). + * + * Returns a task_struct or ERR_PTR(-ENOMEM). + */ +struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), + void *data, + int node, + const char namefmt[], + ...) +{ + struct kthread_create_info create; + + create.threadfn = threadfn; + create.data = data; + create.node = node; + init_completion(&create.done); + + spin_lock(&kthread_create_lock); + list_add_tail(&create.list, &kthread_create_list); + spin_unlock(&kthread_create_lock); + + wake_up_process(kthreadd_task); + wait_for_completion(&create.done); + + if (!IS_ERR(create.result)) { + static const struct sched_param param = { .sched_priority = 0 }; + va_list args; + + va_start(args, namefmt); + vsnprintf(create.result->comm, sizeof(create.result->comm), + namefmt, args); + va_end(args); + /* + * root may have changed our (kthreadd's) priority or CPU mask. + * The kernel thread should not inherit these properties. + */ + sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m); + set_cpus_allowed_ptr(create.result, cpu_all_mask); + } + return create.result; +} +EXPORT_SYMBOL(kthread_create_on_node); + +/** + * kthread_bind - bind a just-created kthread to a cpu. + * @p: thread created by kthread_create(). + * @cpu: cpu (might not be online, must be possible) for @k to run on. + * + * Description: This function is equivalent to set_cpus_allowed(), + * except that @cpu doesn't need to be online, and the thread must be + * stopped (i.e., just returned from kthread_create()). 
+ */ +void kthread_bind(struct task_struct *p, unsigned int cpu) +{ + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { + WARN_ON(1); + return; + } + + /* It's safe because the task is inactive. */ + do_set_cpus_allowed(p, cpumask_of(cpu)); + p->flags |= PF_THREAD_BOUND; +} +EXPORT_SYMBOL(kthread_bind); + +/** + * kthread_stop - stop a thread created by kthread_create(). + * @k: thread created by kthread_create(). + * + * Sets kthread_should_stop() for @k to return true, wakes it, and + * waits for it to exit. This can also be called after kthread_create() + * instead of calling wake_up_process(): the thread will exit without + * calling threadfn(). + * + * If threadfn() may call do_exit() itself, the caller must ensure + * task_struct can't go away. + * + * Returns the result of threadfn(), or %-EINTR if wake_up_process() + * was never called. + */ +int kthread_stop(struct task_struct *k) +{ + struct kthread *kthread; + int ret; + + trace_sched_kthread_stop(k); + get_task_struct(k); + + kthread = to_kthread(k); + barrier(); /* it might have exited */ + if (k->vfork_done != NULL) { + kthread->should_stop = 1; + wake_up_process(k); + wait_for_completion(&kthread->exited); + } + ret = k->exit_code; + + put_task_struct(k); + trace_sched_kthread_stop_ret(ret); + + return ret; +} +EXPORT_SYMBOL(kthread_stop); + +int kthreadd(void *unused) +{ + struct task_struct *tsk = current; + + /* Setup a clean context for our children to inherit. 
*/ + set_task_comm(tsk, "kthreadd"); + ignore_signals(tsk); + set_cpus_allowed_ptr(tsk, cpu_all_mask); + set_mems_allowed(node_states[N_HIGH_MEMORY]); + + current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (list_empty(&kthread_create_list)) /* no pending request: give up the CPU */ + schedule(); + __set_current_state(TASK_RUNNING); + + spin_lock(&kthread_create_lock); + while (!list_empty(&kthread_create_list)) { + struct kthread_create_info *create; + + create = list_entry(kthread_create_list.next, + struct kthread_create_info, list); + list_del_init(&create->list); /* take the oldest request off the list */ + spin_unlock(&kthread_create_lock); /* the lock is not held across create_kthread() */ + + create_kthread(create); + + spin_lock(&kthread_create_lock); + } + spin_unlock(&kthread_create_lock); + } + + return 0; +} +/* Initialise @worker: its lock (lockdep class @key, named @name), an empty work list and no attached task. */ +void __init_kthread_worker(struct kthread_worker *worker, + const char *name, + struct lock_class_key *key) +{ + spin_lock_init(&worker->lock); + lockdep_set_class_and_name(&worker->lock, key, name); + INIT_LIST_HEAD(&worker->work_list); + worker->task = NULL; +} +EXPORT_SYMBOL_GPL(__init_kthread_worker); + +/** + * kthread_worker_fn - kthread function to process kthread_worker + * @worker_ptr: pointer to initialized kthread_worker + * + * This function can be used as @threadfn to kthread_create() or + * kthread_run() with @worker_ptr argument pointing to an initialized + * kthread_worker. The started kthread will process work_list until + * it is stopped with kthread_stop(). A kthread can also call + * this function directly after extra initialization. + * + * Different kthreads can be used for the same kthread_worker as long + * as there's only one kthread attached to it at any given time. A + * kthread_worker without an attached kthread simply collects queued + * kthread_works. 
+ */ +int kthread_worker_fn(void *worker_ptr) +{ + struct kthread_worker *worker = worker_ptr; + struct kthread_work *work; + + WARN_ON(worker->task); /* warn if a task is already attached to this worker */ + worker->task = current; +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + spin_lock_irq(&worker->lock); + worker->task = NULL; /* detach from the worker before returning */ + spin_unlock_irq(&worker->lock); + return 0; + } + + work = NULL; + spin_lock_irq(&worker->lock); + if (!list_empty(&worker->work_list)) { + work = list_first_entry(&worker->work_list, + struct kthread_work, node); + list_del_init(&work->node); /* dequeue the first pending work */ + } + spin_unlock_irq(&worker->lock); + + if (work) { + __set_current_state(TASK_RUNNING); + work->func(work); + smp_wmb(); /* wmb worker-b0 paired with flush-b1 */ + work->done_seq = work->queue_seq; + smp_mb(); /* mb worker-b1 paired with flush-b0 */ + if (atomic_read(&work->flushing)) + wake_up_all(&work->done); + } else if (!freezing(current)) + schedule(); /* nothing to run and no freeze pending: sleep */ + + try_to_freeze(); + goto repeat; +} +EXPORT_SYMBOL_GPL(kthread_worker_fn); + +/** + * queue_kthread_work - queue a kthread_work + * @worker: target kthread_worker + * @work: kthread_work to queue + * + * Queue @work to work processor @worker for async execution. @worker + * must have been created with kthread_worker_create(). Returns %true + * if @work was successfully queued, %false if it was already pending. 
+ */ +bool queue_kthread_work(struct kthread_worker *worker, + struct kthread_work *work) +{ + bool ret = false; + unsigned long flags; + + spin_lock_irqsave(&worker->lock, flags); + if (list_empty(&work->node)) { + list_add_tail(&work->node, &worker->work_list); + work->queue_seq++; + if (likely(worker->task)) + wake_up_process(worker->task); + ret = true; + } + spin_unlock_irqrestore(&worker->lock, flags); + return ret; +} +EXPORT_SYMBOL_GPL(queue_kthread_work); + +/** + * flush_kthread_work - flush a kthread_work + * @work: work to flush + * + * If @work is queued or executing, wait for it to finish execution. + */ +void flush_kthread_work(struct kthread_work *work) +{ + int seq = work->queue_seq; + + atomic_inc(&work->flushing); + + /* + * mb flush-b0 paired with worker-b1, to make sure either + * worker sees the above increment or we see done_seq update. + */ + smp_mb__after_atomic_inc(); + + /* A - B <= 0 tests whether B is in front of A regardless of overflow */ + wait_event(work->done, seq - work->done_seq <= 0); + atomic_dec(&work->flushing); + + /* + * rmb flush-b1 paired with worker-b0, to make sure our caller + * sees every change made by work->func(). + */ + smp_mb__after_atomic_dec(); +} +EXPORT_SYMBOL_GPL(flush_kthread_work); + +struct kthread_flush_work { + struct kthread_work work; + struct completion done; +}; + +static void kthread_flush_work_fn(struct kthread_work *work) +{ + struct kthread_flush_work *fwork = + container_of(work, struct kthread_flush_work, work); + complete(&fwork->done); +} + +/** + * flush_kthread_worker - flush all current works on a kthread_worker + * @worker: worker to flush + * + * Wait until all currently executing or pending works on @worker are + * finished. 
+ */ +void flush_kthread_worker(struct kthread_worker *worker) +{ + struct kthread_flush_work fwork = { + KTHREAD_WORK_INIT(fwork.work, kthread_flush_work_fn), + COMPLETION_INITIALIZER_ONSTACK(fwork.done), + }; + + queue_kthread_work(worker, &fwork.work); + wait_for_completion(&fwork.done); +} +EXPORT_SYMBOL_GPL(flush_kthread_worker); diff --git a/kernel/latencytop.c b/kernel/latencytop.c new file mode 100644 index 00000000..376066e1 --- /dev/null +++ b/kernel/latencytop.c @@ -0,0 +1,291 @@ +/* + * latencytop.c: Latency display infrastructure + * + * (C) Copyright 2008 Intel Corporation + * Author: Arjan van de Ven + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ + +/* + * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is + * used by the "latencytop" userspace tool. The latency that is tracked is not + * the 'traditional' interrupt latency (which is primarily caused by something + * else consuming CPU), but instead, it is the latency an application encounters + * because the kernel sleeps on its behalf for various reasons. + * + * This code tracks 2 levels of statistics: + * 1) System level latency + * 2) Per process latency + * + * The latency is stored in fixed sized data structures in an accumulated form; + * if the "same" latency cause is hit twice, this will be tracked as one entry + * in the data structure. Both the count, total accumulated latency and maximum + * latency are tracked in this data structure. When the fixed size structure is + * full, no new causes are tracked until the buffer is flushed by writing to + * the /proc file; the userspace tool does this on a regular basis. + * + * A latency cause is identified by a stringified backtrace at the point that + * the scheduler gets invoked. 
The userland tool will use this string to + * identify the cause of the latency in human readable form. + * + * The information is exported via /proc/latency_stats and /proc/<pid>/latency. + * These files look like this: + * + * Latency Top version : v0.1 + * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl + * | | | | + * | | | +----> the stringified backtrace + * | | +---------> The maximum latency for this entry in microseconds + * | +--------------> The accumulated latency for this entry (microseconds) + * +-------------------> The number of times this entry is hit + * + * (note: the average latency is the accumulated latency divided by the number + * of times) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(latency_lock); + +#define MAXLR 128 +static struct latency_record latency_record[MAXLR]; + +int latencytop_enabled; + +void clear_all_latency_tracing(struct task_struct *p) +{ + unsigned long flags; + + if (!latencytop_enabled) + return; + + spin_lock_irqsave(&latency_lock, flags); + memset(&p->latency_record, 0, sizeof(p->latency_record)); + p->latency_record_count = 0; + spin_unlock_irqrestore(&latency_lock, flags); +} + +static void clear_global_latency_tracing(void) +{ + unsigned long flags; + + spin_lock_irqsave(&latency_lock, flags); + memset(&latency_record, 0, sizeof(latency_record)); + spin_unlock_irqrestore(&latency_lock, flags); +} + +static void __sched +account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +{ + int firstnonnull = MAXLR + 1; + int i; + + if (!latencytop_enabled) + return; + + /* skip kernel threads for now */ + if (!tsk->mm) + return; + + for (i = 0; i < MAXLR; i++) { + int q, same = 1; + + /* Nothing stored: */ + if (!latency_record[i].backtrace[0]) { + if (firstnonnull > i) + firstnonnull = i; + continue; + } + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long record = lat->backtrace[q]; 
+ + if (latency_record[i].backtrace[q] != record) { + same = 0; + break; + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) + break; + } + if (same) { + latency_record[i].count++; /* same backtrace: fold this hit into the existing entry */ + latency_record[i].time += lat->time; + if (lat->time > latency_record[i].max) + latency_record[i].max = lat->time; + return; + } + } + + i = firstnonnull; /* lowest-indexed empty slot seen, or MAXLR + 1 if there was none */ + if (i >= MAXLR - 1) /* NOTE(review): this also rejects i == MAXLR - 1, so the last slot is never allocated - confirm intent */ + return; + + /* Allocated a new one: */ + memcpy(&latency_record[i], lat, sizeof(struct latency_record)); +} + +/* + * Store a backtrace of @tsk into a latency record entry: at most + * LT_BACKTRACEDEPTH entries are written straight into @lat->backtrace. + */ +static inline void store_stacktrace(struct task_struct *tsk, + struct latency_record *lat) +{ + struct stack_trace trace; + + memset(&trace, 0, sizeof(trace)); + trace.max_entries = LT_BACKTRACEDEPTH; + trace.entries = &lat->backtrace[0]; + save_stack_trace_tsk(tsk, &trace); +} + +/** + * __account_scheduler_latency - record an occurred latency + * @tsk: the task struct of the task hitting the latency + * @usecs: the duration of the latency in microseconds + * @inter: 1 if the sleep was interruptible, 0 if uninterruptible + * + * This function is the main entry point for recording latency entries + * as called by the scheduler. + * + * This function has a few special cases to deal with normal 'non-latency' + * sleeps: specifically, interruptible sleep longer than 5 msec is skipped + * since this usually is caused by waiting for events via select() and co. + * + * Negative latencies (caused by time going backwards) are also explicitly + * skipped. + */ +void __sched +__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) +{ + unsigned long flags; + int i, q; + struct latency_record lat; + + /* Long interruptible waits are generally user requested... 
*/ + if (inter && usecs > 5000) + return; + + /* Negative sleeps are time going backwards */ + /* Zero-time sleeps are non-interesting */ + if (usecs <= 0) + return; + + memset(&lat, 0, sizeof(lat)); + lat.count = 1; + lat.time = usecs; + lat.max = usecs; + store_stacktrace(tsk, &lat); + + spin_lock_irqsave(&latency_lock, flags); + + account_global_scheduler_latency(tsk, &lat); + + for (i = 0; i < tsk->latency_record_count; i++) { + struct latency_record *mylat; + int same = 1; + + mylat = &tsk->latency_record[i]; + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long record = lat.backtrace[q]; + + if (mylat->backtrace[q] != record) { + same = 0; + break; + } + + /* 0 and ULONG_MAX entries mean end of backtrace: */ + if (record == 0 || record == ULONG_MAX) + break; + } + if (same) { + mylat->count++; + mylat->time += lat.time; + if (lat.time > mylat->max) + mylat->max = lat.time; + goto out_unlock; + } + } + + /* + * short term hack; if we're > 32 we stop; future we recycle: + */ + if (tsk->latency_record_count >= LT_SAVECOUNT) + goto out_unlock; + + /* Allocated a new one: */ + i = tsk->latency_record_count++; + memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); + +out_unlock: + spin_unlock_irqrestore(&latency_lock, flags); +} + +static int lstats_show(struct seq_file *m, void *v) +{ + int i; + + seq_puts(m, "Latency Top version : v0.1\n"); + + for (i = 0; i < MAXLR; i++) { + struct latency_record *lr = &latency_record[i]; + + if (lr->backtrace[0]) { + int q; + seq_printf(m, "%i %lu %lu", + lr->count, lr->time, lr->max); + for (q = 0; q < LT_BACKTRACEDEPTH; q++) { + unsigned long bt = lr->backtrace[q]; + if (!bt) + break; + if (bt == ULONG_MAX) + break; + seq_printf(m, " %ps", (void *)bt); + } + seq_printf(m, "\n"); + } + } + return 0; +} + +static ssize_t +lstats_write(struct file *file, const char __user *buf, size_t count, + loff_t *offs) +{ + clear_global_latency_tracing(); + + return count; +} + +static int lstats_open(struct inode 
*inode, struct file *filp) +{ + return single_open(filp, lstats_show, NULL); +} + +static const struct file_operations lstats_fops = { + .open = lstats_open, + .read = seq_read, + .write = lstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_lstats_procfs(void) +{ + proc_create("latency_stats", 0644, NULL, &lstats_fops); + return 0; +} +device_initcall(init_lstats_procfs); diff --git a/kernel/lockdep.c b/kernel/lockdep.c new file mode 100644 index 00000000..298c9276 --- /dev/null +++ b/kernel/lockdep.c @@ -0,0 +1,4005 @@ +/* + * kernel/lockdep.c + * + * Runtime locking correctness validator + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * this code maps all the lock dependencies as they occur in a live kernel + * and will warn about the following classes of locking bugs: + * + * - lock inversion scenarios + * - circular lock dependencies + * - hardirq/softirq safe/unsafe locking bugs + * + * Bugs are reported even if the current locking scenario does not cause + * any deadlock at this point. + * + * I.e. if anytime in the past two locks were taken in a different order, + * even if it happened for another task, even if those were different + * locks (but of the same class as this lock), this code will detect it. + * + * Thanks to Arjan van de Ven for coming up with the initial idea of + * mapping lock dependencies runtime. 
+ */ +#define DISABLE_BRANCH_PROFILING +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "lockdep_internals.h" + +#define CREATE_TRACE_POINTS +#include + +#ifdef CONFIG_PROVE_LOCKING +int prove_locking = 1; +module_param(prove_locking, int, 0644); +#else +#define prove_locking 0 +#endif + +#ifdef CONFIG_LOCK_STAT +int lock_stat = 1; +module_param(lock_stat, int, 0644); +#else +#define lock_stat 0 +#endif + +/* + * lockdep_lock: protects the lockdep graph, the hashes and the + * class/list/hash allocators. + * + * This is one of the rare exceptions where it's justified + * to use a raw spinlock - we really dont want the spinlock + * code to recurse back into the lockdep code... + */ +static arch_spinlock_t lockdep_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +static int graph_lock(void) +{ + arch_spin_lock(&lockdep_lock); + /* + * Make sure that if another CPU detected a bug while + * walking the graph we dont change it (while the other + * CPU is busy printing out stuff with the graph lock + * dropped already) + */ + if (!debug_locks) { + arch_spin_unlock(&lockdep_lock); + return 0; + } + /* prevent any recursions within lockdep from causing deadlocks */ + current->lockdep_recursion++; + return 1; +} + +static inline int graph_unlock(void) +{ + if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) + return DEBUG_LOCKS_WARN_ON(1); + + current->lockdep_recursion--; + arch_spin_unlock(&lockdep_lock); + return 0; +} + +/* + * Turn lock debugging off and return with 0 if it was off already, + * and also release the graph lock: + */ +static inline int debug_locks_off_graph_unlock(void) +{ + int ret = debug_locks_off(); + + arch_spin_unlock(&lockdep_lock); + + return ret; +} + +static int lockdep_initialized; + +unsigned long nr_list_entries; +static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; + +/* + * 
All data structures here are protected by the global debug_lock. + * + * Mutex key structs only get allocated, once during bootup, and never + * get freed - this significantly simplifies the debugging code. + */ +unsigned long nr_lock_classes; +static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; + +static inline struct lock_class *hlock_class(struct held_lock *hlock) +{ + if (!hlock->class_idx) { + DEBUG_LOCKS_WARN_ON(1); + return NULL; + } + return lock_classes + hlock->class_idx - 1; +} + +#ifdef CONFIG_LOCK_STAT +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], + cpu_lock_stats); + +static inline u64 lockstat_clock(void) +{ + return local_clock(); +} + +static int lock_point(unsigned long points[], unsigned long ip) +{ + int i; + + for (i = 0; i < LOCKSTAT_POINTS; i++) { + if (points[i] == 0) { + points[i] = ip; + break; + } + if (points[i] == ip) + break; + } + + return i; +} + +static void lock_time_inc(struct lock_time *lt, u64 time) +{ + if (time > lt->max) + lt->max = time; + + if (time < lt->min || !lt->nr) + lt->min = time; + + lt->total += time; + lt->nr++; +} + +static inline void lock_time_add(struct lock_time *src, struct lock_time *dst) +{ + if (!src->nr) + return; + + if (src->max > dst->max) + dst->max = src->max; + + if (src->min < dst->min || !dst->nr) + dst->min = src->min; + + dst->total += src->total; + dst->nr += src->nr; +} + +struct lock_class_stats lock_stats(struct lock_class *class) +{ + struct lock_class_stats stats; + int cpu, i; + + memset(&stats, 0, sizeof(struct lock_class_stats)); + for_each_possible_cpu(cpu) { + struct lock_class_stats *pcs = + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; + + for (i = 0; i < ARRAY_SIZE(stats.contention_point); i++) + stats.contention_point[i] += pcs->contention_point[i]; + + for (i = 0; i < ARRAY_SIZE(stats.contending_point); i++) + stats.contending_point[i] += pcs->contending_point[i]; + + lock_time_add(&pcs->read_waittime, &stats.read_waittime); + 
lock_time_add(&pcs->write_waittime, &stats.write_waittime); + + lock_time_add(&pcs->read_holdtime, &stats.read_holdtime); + lock_time_add(&pcs->write_holdtime, &stats.write_holdtime); + + for (i = 0; i < ARRAY_SIZE(stats.bounces); i++) + stats.bounces[i] += pcs->bounces[i]; + } + + return stats; +} + +void clear_lock_stats(struct lock_class *class) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct lock_class_stats *cpu_stats = + &per_cpu(cpu_lock_stats, cpu)[class - lock_classes]; + + memset(cpu_stats, 0, sizeof(struct lock_class_stats)); + } + memset(class->contention_point, 0, sizeof(class->contention_point)); + memset(class->contending_point, 0, sizeof(class->contending_point)); +} + +static struct lock_class_stats *get_lock_stats(struct lock_class *class) +{ + return &get_cpu_var(cpu_lock_stats)[class - lock_classes]; +} + +static void put_lock_stats(struct lock_class_stats *stats) +{ + put_cpu_var(cpu_lock_stats); +} + +static void lock_release_holdtime(struct held_lock *hlock) +{ + struct lock_class_stats *stats; + u64 holdtime; + + if (!lock_stat) + return; + + holdtime = lockstat_clock() - hlock->holdtime_stamp; + + stats = get_lock_stats(hlock_class(hlock)); + if (hlock->read) + lock_time_inc(&stats->read_holdtime, holdtime); + else + lock_time_inc(&stats->write_holdtime, holdtime); + put_lock_stats(stats); +} +#else +static inline void lock_release_holdtime(struct held_lock *hlock) +{ +} +#endif + +/* + * We keep a global list of all lock classes. The list only grows, + * never shrinks. The list is only accessed with the lockdep + * spinlock lock held. 
+ */ +LIST_HEAD(all_lock_classes); + +/* + * The lockdep classes are in a hash-table as well, for fast lookup: + */ +#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1) +#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS) +#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) +#define classhashentry(key) (classhash_table + __classhashfn((key))) + +static struct list_head classhash_table[CLASSHASH_SIZE]; + +/* + * We put the lock dependency chains into a hash-table as well, to cache + * their existence: + */ +#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1) +#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS) +#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) +#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) + +static struct list_head chainhash_table[CHAINHASH_SIZE]; + +/* + * The hash key of the lock dependency chains is a hash itself too: + * it's a hash of all locks taken up to that lock, including that lock. + * It's a 64-bit hash, because it's important for the keys to be + * unique. 
+ */ +#define iterate_chain_key(key1, key2) \ + (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \ + ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \ + (key2)) + +void lockdep_off(void) +{ + current->lockdep_recursion++; +} +EXPORT_SYMBOL(lockdep_off); + +void lockdep_on(void) +{ + current->lockdep_recursion--; +} +EXPORT_SYMBOL(lockdep_on); + +/* + * Debugging switches: + */ + +#define VERBOSE 0 +#define VERY_VERBOSE 0 + +#if VERBOSE +# define HARDIRQ_VERBOSE 1 +# define SOFTIRQ_VERBOSE 1 +# define RECLAIM_VERBOSE 1 +#else +# define HARDIRQ_VERBOSE 0 +# define SOFTIRQ_VERBOSE 0 +# define RECLAIM_VERBOSE 0 +#endif + +#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE +/* + * Quick filtering for interesting events: + */ +static int class_filter(struct lock_class *class) +{ +#if 0 + /* Example */ + if (class->name_version == 1 && + !strcmp(class->name, "lockname")) + return 1; + if (class->name_version == 1 && + !strcmp(class->name, "&struct->lockfield")) + return 1; +#endif + /* Filter everything else. 1 would be to allow everything else */ + return 0; +} +#endif + +static int verbose(struct lock_class *class) +{ +#if VERBOSE + return class_filter(class); +#endif + return 0; +} + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the graph_lock. + */ +unsigned long nr_stack_trace_entries; +static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; + +static int save_trace(struct stack_trace *trace) +{ + trace->nr_entries = 0; + trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; + trace->entries = stack_trace + nr_stack_trace_entries; + + trace->skip = 3; + + save_stack_trace(trace); + + /* + * Some daft arches put -1 at the end to indicate its a full trace. 
+ * + * this is buggy anyway, since it takes a whole extra entry so a + * complete trace that maxes out the entries provided will be reported + * as incomplete, friggin useless + */ + if (trace->nr_entries != 0 && + trace->entries[trace->nr_entries-1] == ULONG_MAX) + trace->nr_entries--; + + trace->max_entries = trace->nr_entries; + + nr_stack_trace_entries += trace->nr_entries; + + if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { + if (!debug_locks_off_graph_unlock()) + return 0; + + printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + + return 0; + } + + return 1; +} + +unsigned int nr_hardirq_chains; +unsigned int nr_softirq_chains; +unsigned int nr_process_chains; +unsigned int max_lockdep_depth; + +#ifdef CONFIG_DEBUG_LOCKDEP +/* + * We cannot printk in early bootup code. Not even early_printk() + * might work. So we mark any initialization errors and printk + * about it later on, in lockdep_info(). 
+ */ +static int lockdep_init_error; +static unsigned long lockdep_init_trace_data[20]; +static struct stack_trace lockdep_init_trace = { + .max_entries = ARRAY_SIZE(lockdep_init_trace_data), + .entries = lockdep_init_trace_data, +}; + +/* + * Various lockdep statistics: + */ +DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); +#endif + +/* + * Locking printouts: + */ + +#define __USAGE(__STATE) \ + [LOCK_USED_IN_##__STATE] = "IN-"__stringify(__STATE)"-W", \ + [LOCK_ENABLED_##__STATE] = __stringify(__STATE)"-ON-W", \ + [LOCK_USED_IN_##__STATE##_READ] = "IN-"__stringify(__STATE)"-R",\ + [LOCK_ENABLED_##__STATE##_READ] = __stringify(__STATE)"-ON-R", + +static const char *usage_str[] = +{ +#define LOCKDEP_STATE(__STATE) __USAGE(__STATE) +#include "lockdep_states.h" +#undef LOCKDEP_STATE + [LOCK_USED] = "INITIAL USE", +}; + +const char * __get_key_name(struct lockdep_subclass_key *key, char *str) +{ + return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); +} + +static inline unsigned long lock_flag(enum lock_usage_bit bit) +{ + return 1UL << bit; +} + +static char get_usage_char(struct lock_class *class, enum lock_usage_bit bit) +{ + char c = '.'; + + if (class->usage_mask & lock_flag(bit + 2)) + c = '+'; + if (class->usage_mask & lock_flag(bit)) { + c = '-'; + if (class->usage_mask & lock_flag(bit + 2)) + c = '?'; + } + + return c; +} + +void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS]) +{ + int i = 0; + +#define LOCKDEP_STATE(__STATE) \ + usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE); \ + usage[i++] = get_usage_char(class, LOCK_USED_IN_##__STATE##_READ); +#include "lockdep_states.h" +#undef LOCKDEP_STATE + + usage[i] = '\0'; +} + +static int __print_lock_name(struct lock_class *class) +{ + char str[KSYM_NAME_LEN]; + const char *name; + + name = class->name; + if (!name) + name = __get_key_name(class->key, str); + + return printk("%s", name); +} + +static void print_lock_name(struct lock_class *class) +{ + char 
str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS]; + const char *name; + + get_usage_chars(class, usage); + + name = class->name; + if (!name) { + name = __get_key_name(class->key, str); + printk(" (%s", name); + } else { + printk(" (%s", name); + if (class->name_version > 1) + printk("#%d", class->name_version); + if (class->subclass) + printk("/%d", class->subclass); + } + printk("){%s}", usage); +} + +static void print_lockdep_cache(struct lockdep_map *lock) +{ + const char *name; + char str[KSYM_NAME_LEN]; + + name = lock->name; + if (!name) + name = __get_key_name(lock->key->subkeys, str); + + printk("%s", name); +} + +static void print_lock(struct held_lock *hlock) +{ + print_lock_name(hlock_class(hlock)); + printk(", at: "); + print_ip_sym(hlock->acquire_ip); +} + +static void lockdep_print_held_locks(struct task_struct *curr) +{ + int i, depth = curr->lockdep_depth; + + if (!depth) { + printk("no locks held by %s/%d.\n", curr->comm, task_pid_nr(curr)); + return; + } + printk("%d lock%s held by %s/%d:\n", + depth, depth > 1 ? "s" : "", curr->comm, task_pid_nr(curr)); + + for (i = 0; i < depth; i++) { + printk(" #%d: ", i); + print_lock(curr->held_locks + i); + } +} + +static void print_kernel_version(void) +{ + printk("%s %.*s\n", init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); +} + +static int very_verbose(struct lock_class *class) +{ +#if VERY_VERBOSE + return class_filter(class); +#endif + return 0; +} + +/* + * Is this the address of a static object: + */ +static int static_obj(void *obj) +{ + unsigned long start = (unsigned long) &_stext, + end = (unsigned long) &_end, + addr = (unsigned long) obj; + + /* + * static variable? + */ + if ((addr >= start) && (addr < end)) + return 1; + + if (arch_is_kernel_data(addr)) + return 1; + + /* + * in-kernel percpu var? + */ + if (is_kernel_percpu_address(addr)) + return 1; + + /* + * module static or percpu var? 
+ */ + return is_module_address(addr) || is_module_percpu_address(addr); +} + +/* + * To make lock name printouts unique, we calculate a unique + * class->name_version generation counter: + */ +static int count_matching_names(struct lock_class *new_class) +{ + struct lock_class *class; + int count = 0; + + if (!new_class->name) + return 0; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + if (new_class->key - new_class->subclass == class->key) + return class->name_version; + if (class->name && !strcmp(class->name, new_class->name)) + count = max(count, class->name_version); + } + + return count + 1; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + +#ifdef CONFIG_DEBUG_LOCKDEP + /* + * If the architecture calls into lockdep before initializing + * the hashes then we'll warn about it later. (we cannot printk + * right now) + */ + if (unlikely(!lockdep_initialized)) { + lockdep_init(); + lockdep_init_error = 1; + save_stack_trace(&lockdep_init_trace); + } +#endif + + if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + debug_locks_off(); + printk(KERN_ERR + "BUG: looking up invalid subclass: %u\n", subclass); + printk(KERN_ERR + "turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + + /* + * Static locks do not have their class-keys yet - for them the key + * is the lock object itself: + */ + if (unlikely(!lock->key)) + lock->key = (void *)lock; + + /* + * NOTE: the class-key must be unique. 
For dynamic locks, a static + * lock_class_key variable is passed in through the mutex_init() + * (or spin_lock_init()) call - which acts as the key. For static + * locks we use the lock object itself as the key. + */ + BUILD_BUG_ON(sizeof(struct lock_class_key) > + sizeof(struct lockdep_map)); + + key = lock->key->subkeys + subclass; + + hash_head = classhashentry(key); + + /* + * We can walk the hash lockfree, because the hash only + * grows, and we are careful when adding entries to the end: + */ + list_for_each_entry(class, hash_head, hash_entry) { + if (class->key == key) { + WARN_ON_ONCE(class->name != lock->name); + return class; + } + } + + return NULL; +} + +/* + * Register a lock's class in the hash-table, if the class is not present + * yet. Otherwise we look it up. We cache the result in the lock object + * itself, so actual lookup of the hash should be once per lock object. + */ +static inline struct lock_class * +register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) +{ + struct lockdep_subclass_key *key; + struct list_head *hash_head; + struct lock_class *class; + unsigned long flags; + + class = look_up_lock_class(lock, subclass); + if (likely(class)) + return class; + + /* + * Debug-check: all keys must be persistent! 
+ */ + if (!static_obj(lock->key)) { + debug_locks_off(); + printk("INFO: trying to register non-static key.\n"); + printk("the code is fine but needs lockdep annotation.\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + + return NULL; + } + + key = lock->key->subkeys + subclass; + hash_head = classhashentry(key); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + /* + * We have to do the hash-walk again, to avoid races + * with another CPU: + */ + list_for_each_entry(class, hash_head, hash_entry) + if (class->key == key) + goto out_unlock_set; + /* + * Allocate a new key from the static array, and add it to + * the hash: + */ + if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { + if (!debug_locks_off_graph_unlock()) { + raw_local_irq_restore(flags); + return NULL; + } + raw_local_irq_restore(flags); + + printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + class = lock_classes + nr_lock_classes++; + debug_atomic_inc(nr_unused_locks); + class->key = key; + class->name = lock->name; + class->subclass = subclass; + INIT_LIST_HEAD(&class->lock_entry); + INIT_LIST_HEAD(&class->locks_before); + INIT_LIST_HEAD(&class->locks_after); + class->name_version = count_matching_names(class); + /* + * We use RCU's safe list-add method to make + * parallel walking of the hash-list safe: + */ + list_add_tail_rcu(&class->hash_entry, hash_head); + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&class->lock_entry, &all_lock_classes); + + if (verbose(class)) { + graph_unlock(); + raw_local_irq_restore(flags); + + printk("\nnew class %p: %s", class->key, class->name); + if (class->name_version > 1) + printk("#%d", class->name_version); + printk("\n"); + dump_stack(); + + raw_local_irq_save(flags); + if (!graph_lock()) { + raw_local_irq_restore(flags); + return NULL; + } + } +out_unlock_set: + 
graph_unlock(); + raw_local_irq_restore(flags); + + if (!subclass || force) + lock->class_cache[0] = class; + else if (subclass < NR_LOCKDEP_CACHING_CLASSES) + lock->class_cache[subclass] = class; + + if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) + return NULL; + + return class; +} + +#ifdef CONFIG_PROVE_LOCKING +/* + * Allocate a lockdep entry. (assumes the graph_lock held, returns + * with NULL on failure) + */ +static struct lock_list *alloc_list_entry(void) +{ + if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) { + if (!debug_locks_off_graph_unlock()) + return NULL; + + printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); + printk("turning off the locking correctness validator.\n"); + dump_stack(); + return NULL; + } + return list_entries + nr_list_entries++; +} + +/* + * Add a new dependency to the head of the list: + */ +static int add_lock_to_list(struct lock_class *class, struct lock_class *this, + struct list_head *head, unsigned long ip, + int distance, struct stack_trace *trace) +{ + struct lock_list *entry; + /* + * Lock not present yet - get a new dependency struct and + * add it to the list: + */ + entry = alloc_list_entry(); + if (!entry) + return 0; + + entry->class = this; + entry->distance = distance; + entry->trace = *trace; + /* + * Since we never remove from the dependency list, the list can + * be walked lockless by other CPUs, it's only allocation + * that must be protected by the spinlock. 
But this also means + * we must make new entries visible only once writes to the + * entry become visible - hence the RCU op: + */ + list_add_tail_rcu(&entry->entry, head); + + return 1; +} + +/* + * For good efficiency of modular, we use power of 2 + */ +#define MAX_CIRCULAR_QUEUE_SIZE 4096UL +#define CQ_MASK (MAX_CIRCULAR_QUEUE_SIZE-1) + +/* + * The circular_queue and helpers is used to implement the + * breadth-first search(BFS)algorithem, by which we can build + * the shortest path from the next lock to be acquired to the + * previous held lock if there is a circular between them. + */ +struct circular_queue { + unsigned long element[MAX_CIRCULAR_QUEUE_SIZE]; + unsigned int front, rear; +}; + +static struct circular_queue lock_cq; + +unsigned int max_bfs_queue_depth; + +static unsigned int lockdep_dependency_gen_id; + +static inline void __cq_init(struct circular_queue *cq) +{ + cq->front = cq->rear = 0; + lockdep_dependency_gen_id++; +} + +static inline int __cq_empty(struct circular_queue *cq) +{ + return (cq->front == cq->rear); +} + +static inline int __cq_full(struct circular_queue *cq) +{ + return ((cq->rear + 1) & CQ_MASK) == cq->front; +} + +static inline int __cq_enqueue(struct circular_queue *cq, unsigned long elem) +{ + if (__cq_full(cq)) + return -1; + + cq->element[cq->rear] = elem; + cq->rear = (cq->rear + 1) & CQ_MASK; + return 0; +} + +static inline int __cq_dequeue(struct circular_queue *cq, unsigned long *elem) +{ + if (__cq_empty(cq)) + return -1; + + *elem = cq->element[cq->front]; + cq->front = (cq->front + 1) & CQ_MASK; + return 0; +} + +static inline unsigned int __cq_get_elem_count(struct circular_queue *cq) +{ + return (cq->rear - cq->front) & CQ_MASK; +} + +static inline void mark_lock_accessed(struct lock_list *lock, + struct lock_list *parent) +{ + unsigned long nr; + + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + lock->parent = parent; + lock->class->dep_gen_id = lockdep_dependency_gen_id; +} + +static inline 
unsigned long lock_accessed(struct lock_list *lock) +{ + unsigned long nr; + + nr = lock - list_entries; + WARN_ON(nr >= nr_list_entries); + return lock->class->dep_gen_id == lockdep_dependency_gen_id; +} + +static inline struct lock_list *get_lock_parent(struct lock_list *child) +{ + return child->parent; +} + +static inline int get_lock_depth(struct lock_list *child) +{ + int depth = 0; + struct lock_list *parent; + + while ((parent = get_lock_parent(child))) { + child = parent; + depth++; + } + return depth; +} + +static int __bfs(struct lock_list *source_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry, + int forward) +{ + struct lock_list *entry; + struct list_head *head; + struct circular_queue *cq = &lock_cq; + int ret = 1; + + if (match(source_entry, data)) { + *target_entry = source_entry; + ret = 0; + goto exit; + } + + if (forward) + head = &source_entry->class->locks_after; + else + head = &source_entry->class->locks_before; + + if (list_empty(head)) + goto exit; + + __cq_init(cq); + __cq_enqueue(cq, (unsigned long)source_entry); + + while (!__cq_empty(cq)) { + struct lock_list *lock; + + __cq_dequeue(cq, (unsigned long *)&lock); + + if (!lock->class) { + ret = -2; + goto exit; + } + + if (forward) + head = &lock->class->locks_after; + else + head = &lock->class->locks_before; + + list_for_each_entry(entry, head, entry) { + if (!lock_accessed(entry)) { + unsigned int cq_depth; + mark_lock_accessed(entry, lock); + if (match(entry, data)) { + *target_entry = entry; + ret = 0; + goto exit; + } + + if (__cq_enqueue(cq, (unsigned long)entry)) { + ret = -1; + goto exit; + } + cq_depth = __cq_get_elem_count(cq); + if (max_bfs_queue_depth < cq_depth) + max_bfs_queue_depth = cq_depth; + } + } + } +exit: + return ret; +} + +static inline int __bfs_forwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) +{ + return 
__bfs(src_entry, data, match, target_entry, 1); + +} + +static inline int __bfs_backwards(struct lock_list *src_entry, + void *data, + int (*match)(struct lock_list *entry, void *data), + struct lock_list **target_entry) +{ + return __bfs(src_entry, data, match, target_entry, 0); + +} + +/* + * Recursive, forwards-direction lock-dependency checking, used for + * both noncyclic checking and for hardirq-unsafe/softirq-unsafe + * checking. + */ + +/* + * Print a dependency chain entry (this is only done when a deadlock + * has been detected): + */ +static noinline int +print_circular_bug_entry(struct lock_list *target, int depth) +{ + if (debug_locks_silent) + return 0; + printk("\n-> #%u", depth); + print_lock_name(target->class); + printk(":\n"); + print_stack_trace(&target->trace, 6); + + return 0; +} + +static void +print_circular_lock_scenario(struct held_lock *src, + struct held_lock *tgt, + struct lock_list *prt) +{ + struct lock_class *source = hlock_class(src); + struct lock_class *target = hlock_class(tgt); + struct lock_class *parent = prt->class; + + /* + * A direct locking problem where unsafe_class lock is taken + * directly by safe_class lock, then all we need to show + * is the deadlock scenario, as it is obvious that the + * unsafe lock is taken under the safe lock. + * + * But if there is a chain instead, where the safe lock takes + * an intermediate lock (middle_class) where this lock is + * not the same as the safe lock, then the lock chain is + * used to describe the problem. Otherwise we would need + * to show a different CPU case for each link in the chain + * from the safe_class lock to the unsafe_class lock. 
+ */ + if (parent != source) { + printk("Chain exists of:\n "); + __print_lock_name(source); + printk(" --> "); + __print_lock_name(parent); + printk(" --> "); + __print_lock_name(target); + printk("\n\n"); + } + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(");\n"); + printk(" lock("); + __print_lock_name(target); + printk(");\n"); + printk(" lock("); + __print_lock_name(source); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + +/* + * When a circular dependency is detected, print the + * header first: + */ +static noinline int +print_circular_bug_header(struct lock_list *entry, unsigned int depth, + struct held_lock *check_src, + struct held_lock *check_tgt) +{ + struct task_struct *curr = current; + + if (debug_locks_silent) + return 0; + + printk("\n=======================================================\n"); + printk( "[ INFO: possible circular locking dependency detected ]\n"); + print_kernel_version(); + printk( "-------------------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, task_pid_nr(curr)); + print_lock(check_src); + printk("\nbut task is already holding lock:\n"); + print_lock(check_tgt); + printk("\nwhich lock already depends on the new lock.\n\n"); + printk("\nthe existing dependency chain (in reverse order) is:\n"); + + print_circular_bug_entry(entry, depth); + + return 0; +} + +static inline int class_equal(struct lock_list *entry, void *data) +{ + return entry->class == data; +} + +static noinline int print_circular_bug(struct lock_list *this, + struct lock_list *target, + struct held_lock *check_src, + struct held_lock *check_tgt) +{ + struct task_struct *curr = current; + struct lock_list *parent; + struct lock_list *first_parent; + int depth; + + if (!debug_locks_off_graph_unlock() || 
debug_locks_silent) + return 0; + + if (!save_trace(&this->trace)) + return 0; + + depth = get_lock_depth(target); + + print_circular_bug_header(target, depth, check_src, check_tgt); + + parent = get_lock_parent(target); + first_parent = parent; + + while (parent) { + print_circular_bug_entry(parent, --depth); + parent = get_lock_parent(parent); + } + + printk("\nother info that might help us debug this:\n\n"); + print_circular_lock_scenario(check_src, check_tgt, + first_parent); + + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static noinline int print_bfs_bug(int ret) +{ + if (!debug_locks_off_graph_unlock()) + return 0; + + WARN(1, "lockdep bfs error:%d\n", ret); + + return 0; +} + +static int noop_count(struct lock_list *entry, void *data) +{ + (*(unsigned long *)data)++; + return 0; +} + +unsigned long __lockdep_count_forward_deps(struct lock_list *this) +{ + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); + + __bfs_forwards(this, (void *)&count, noop_count, &target_entry); + + return count; +} +unsigned long lockdep_count_forward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; + + local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + ret = __lockdep_count_forward_deps(&this); + arch_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + +unsigned long __lockdep_count_backward_deps(struct lock_list *this) +{ + unsigned long count = 0; + struct lock_list *uninitialized_var(target_entry); + + __bfs_backwards(this, (void *)&count, noop_count, &target_entry); + + return count; +} + +unsigned long lockdep_count_backward_deps(struct lock_class *class) +{ + unsigned long ret, flags; + struct lock_list this; + + this.parent = NULL; + this.class = class; + + local_irq_save(flags); + arch_spin_lock(&lockdep_lock); + ret = __lockdep_count_backward_deps(&this); + 
arch_spin_unlock(&lockdep_lock); + local_irq_restore(flags); + + return ret; +} + +/* + * Prove that the dependency graph starting at can not + * lead to . Print an error and return 0 if it does. + */ +static noinline int +check_noncircular(struct lock_list *root, struct lock_class *target, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_cyclic_checks); + + result = __bfs_forwards(root, target, class_equal, target_entry); + + return result; +} + +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) +/* + * Forwards and backwards subgraph searching, for the purposes of + * proving that two subgraphs can be connected by a new dependency + * without creating any illegal irq-safe -> irq-unsafe lock dependency. + */ + +static inline int usage_match(struct lock_list *entry, void *bit) +{ + return entry->class->usage_mask & (1 << (enum lock_usage_bit)bit); +} + + + +/* + * Find a node in the forwards-direction dependency sub-graph starting + * at @root->class that matches @bit. + * + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. + * + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. + */ +static int +find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_find_usage_forwards_checks); + + result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); + + return result; +} + +/* + * Find a node in the backwards-direction dependency sub-graph starting + * at @root->class that matches @bit. + * + * Return 0 if such a node exists in the subgraph, and put that node + * into *@target_entry. + * + * Return 1 otherwise and keep *@target_entry unchanged. + * Return <0 on error. 
+ */ +static int +find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit, + struct lock_list **target_entry) +{ + int result; + + debug_atomic_inc(nr_find_usage_backwards_checks); + + result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); + + return result; +} + +static void print_lock_class_header(struct lock_class *class, int depth) +{ + int bit; + + printk("%*s->", depth, ""); + print_lock_name(class); + printk(" ops: %lu", class->ops); + printk(" {\n"); + + for (bit = 0; bit < LOCK_USAGE_STATES; bit++) { + if (class->usage_mask & (1 << bit)) { + int len = depth; + + len += printk("%*s %s", depth, "", usage_str[bit]); + len += printk(" at:\n"); + print_stack_trace(class->usage_traces + bit, len); + } + } + printk("%*s }\n", depth, ""); + + printk("%*s ... key at: ",depth,""); + print_ip_sym((unsigned long)class->key); +} + +/* + * printk the shortest lock dependencies from @start to @end in reverse order: + */ +static void __used +print_shortest_lock_dependencies(struct lock_list *leaf, + struct lock_list *root) +{ + struct lock_list *entry = leaf; + int depth; + + /*compute depth from generated tree by BFS*/ + depth = get_lock_depth(leaf); + + do { + print_lock_class_header(entry->class, depth); + printk("%*s ... 
acquired at:\n", depth, ""); + print_stack_trace(&entry->trace, 2); + printk("\n"); + + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + + entry = get_lock_parent(entry); + depth--; + } while (entry && (depth >= 0)); + + return; +} + +static void +print_irq_lock_scenario(struct lock_list *safe_entry, + struct lock_list *unsafe_entry, + struct lock_class *prev_class, + struct lock_class *next_class) +{ + struct lock_class *safe_class = safe_entry->class; + struct lock_class *unsafe_class = unsafe_entry->class; + struct lock_class *middle_class = prev_class; + + if (middle_class == safe_class) + middle_class = next_class; + + /* + * A direct locking problem where unsafe_class lock is taken + * directly by safe_class lock, then all we need to show + * is the deadlock scenario, as it is obvious that the + * unsafe lock is taken under the safe lock. + * + * But if there is a chain instead, where the safe lock takes + * an intermediate lock (middle_class) where this lock is + * not the same as the safe lock, then the lock chain is + * used to describe the problem. Otherwise we would need + * to show a different CPU case for each link in the chain + * from the safe_class lock to the unsafe_class lock. 
+ */ + if (middle_class != unsafe_class) { + printk("Chain exists of:\n "); + __print_lock_name(safe_class); + printk(" --> "); + __print_lock_name(middle_class); + printk(" --> "); + __print_lock_name(unsafe_class); + printk("\n\n"); + } + + printk(" Possible interrupt unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(unsafe_class); + printk(");\n"); + printk(" local_irq_disable();\n"); + printk(" lock("); + __print_lock_name(safe_class); + printk(");\n"); + printk(" lock("); + __print_lock_name(middle_class); + printk(");\n"); + printk(" \n"); + printk(" lock("); + __print_lock_name(safe_class); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); +} + +static int +print_bad_irq_dependency(struct task_struct *curr, + struct lock_list *prev_root, + struct lock_list *next_root, + struct lock_list *backwards_entry, + struct lock_list *forwards_entry, + struct held_lock *prev, + struct held_lock *next, + enum lock_usage_bit bit1, + enum lock_usage_bit bit2, + const char *irqclass) +{ + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n======================================================\n"); + printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + irqclass, irqclass); + print_kernel_version(); + printk( "------------------------------------------------------\n"); + printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", + curr->comm, task_pid_nr(curr), + curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, + curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT, + curr->hardirqs_enabled, + curr->softirqs_enabled); + print_lock(next); + + printk("\nand this task is already holding:\n"); + print_lock(prev); + printk("which would create a new lock dependency:\n"); + print_lock_name(hlock_class(prev)); + printk(" ->"); + print_lock_name(hlock_class(next)); + printk("\n"); + + printk("\nbut this new dependency connects a 
%s-irq-safe lock:\n", + irqclass); + print_lock_name(backwards_entry->class); + printk("\n... which became %s-irq-safe at:\n", irqclass); + + print_stack_trace(backwards_entry->class->usage_traces + bit1, 1); + + printk("\nto a %s-irq-unsafe lock:\n", irqclass); + print_lock_name(forwards_entry->class); + printk("\n... which became %s-irq-unsafe at:\n", irqclass); + printk("..."); + + print_stack_trace(forwards_entry->class->usage_traces + bit2, 1); + + printk("\nother info that might help us debug this:\n\n"); + print_irq_lock_scenario(backwards_entry, forwards_entry, + hlock_class(prev), hlock_class(next)); + + lockdep_print_held_locks(curr); + + printk("\nthe dependencies between %s-irq-safe lock", irqclass); + printk(" and the holding lock:\n"); + if (!save_trace(&prev_root->trace)) + return 0; + print_shortest_lock_dependencies(backwards_entry, prev_root); + + printk("\nthe dependencies between the lock to be acquired"); + printk(" and %s-irq-unsafe lock:\n", irqclass); + if (!save_trace(&next_root->trace)) + return 0; + print_shortest_lock_dependencies(forwards_entry, next_root); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +static int +check_usage(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next, enum lock_usage_bit bit_backwards, + enum lock_usage_bit bit_forwards, const char *irqclass) +{ + int ret; + struct lock_list this, that; + struct lock_list *uninitialized_var(target_entry); + struct lock_list *uninitialized_var(target_entry1); + + this.parent = NULL; + + this.class = hlock_class(prev); + ret = find_usage_backwards(&this, bit_backwards, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; + + that.parent = NULL; + that.class = hlock_class(next); + ret = find_usage_forwards(&that, bit_forwards, &target_entry1); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; + + return print_bad_irq_dependency(curr, &this, &that, + target_entry, 
target_entry1, + prev, next, + bit_backwards, bit_forwards, irqclass); +} + +static const char *state_names[] = { +#define LOCKDEP_STATE(__STATE) \ + __stringify(__STATE), +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static const char *state_rnames[] = { +#define LOCKDEP_STATE(__STATE) \ + __stringify(__STATE)"-READ", +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline const char *state_name(enum lock_usage_bit bit) +{ + return (bit & 1) ? state_rnames[bit >> 2] : state_names[bit >> 2]; +} + +static int exclusive_bit(int new_bit) +{ + /* + * USED_IN + * USED_IN_READ + * ENABLED + * ENABLED_READ + * + * bit 0 - write/read + * bit 1 - used_in/enabled + * bit 2+ state + */ + + int state = new_bit & ~3; + int dir = new_bit & 2; + + /* + * keep state, bit flip the direction and strip read. + */ + return state | (dir ^ 2); +} + +static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next, enum lock_usage_bit bit) +{ + /* + * Prove that the new dependency does not connect a hardirq-safe + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, bit, + exclusive_bit(bit), state_name(bit))) + return 0; + + bit++; /* _READ */ + + /* + * Prove that the new dependency does not connect a hardirq-safe-read + * lock with a hardirq-unsafe lock - to achieve this we search + * the backwards-subgraph starting at , and the + * forwards-subgraph starting at : + */ + if (!check_usage(curr, prev, next, bit, + exclusive_bit(bit), state_name(bit))) + return 0; + + return 1; +} + +static int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ +#define LOCKDEP_STATE(__STATE) \ + if (!check_irq_usage(curr, prev, next, LOCK_USED_IN_##__STATE)) \ + return 0; +#include "lockdep_states.h" +#undef LOCKDEP_STATE + + return 1; +} + +static void 
inc_chains(void) +{ + if (current->hardirq_context) + nr_hardirq_chains++; + else { + if (current->softirq_context) + nr_softirq_chains++; + else + nr_process_chains++; + } +} + +#else + +static inline int +check_prev_add_irq(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + return 1; +} + +static inline void inc_chains(void) +{ + nr_process_chains++; +} + +#endif + +static void +print_deadlock_scenario(struct held_lock *nxt, + struct held_lock *prv) +{ + struct lock_class *next = hlock_class(nxt); + struct lock_class *prev = hlock_class(prv); + + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0\n"); + printk(" ----\n"); + printk(" lock("); + __print_lock_name(prev); + printk(");\n"); + printk(" lock("); + __print_lock_name(next); + printk(");\n"); + printk("\n *** DEADLOCK ***\n\n"); + printk(" May be due to missing lock nesting notation\n\n"); +} + +static int +print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, + struct held_lock *next) +{ + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n=============================================\n"); + printk( "[ INFO: possible recursive locking detected ]\n"); + print_kernel_version(); + printk( "---------------------------------------------\n"); + printk("%s/%d is trying to acquire lock:\n", + curr->comm, task_pid_nr(curr)); + print_lock(next); + printk("\nbut task is already holding lock:\n"); + print_lock(prev); + + printk("\nother info that might help us debug this:\n"); + print_deadlock_scenario(next, prev); + lockdep_print_held_locks(curr); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Check whether we are holding such a class already. + * + * (Note that this has to be done separately, because the graph cannot + * detect such classes of deadlocks.) 
+ *
+ * Returns: 0 if a deadlock was detected, 1 if no held lock shares the
+ * class of the new lock, 2 if the same-class acquisition is permitted
+ * (recursive read, or serialized by a held nest_lock)
+ */
+static int
+check_deadlock(struct task_struct *curr, struct held_lock *next,
+	       struct lockdep_map *next_instance, int read)
+{
+	struct held_lock *nest_holder = NULL;
+	struct held_lock *held;
+	int idx;
+
+	for (idx = 0; idx < curr->lockdep_depth; idx++) {
+		held = &curr->held_locks[idx];
+
+		/* Remember whether next's nest_lock is among the held locks. */
+		if (held->instance == next->nest_lock)
+			nest_holder = held;
+
+		if (hlock_class(held) == hlock_class(next)) {
+			/*
+			 * A lock of the same class is already held. That is
+			 * allowed for read-after-read recursion (i.e.
+			 * read_lock(lock)+read_lock(lock)), and also when a
+			 * held nest_lock serializes this lock's nesting
+			 * behaviour:
+			 */
+			if ((read == 2 && held->read) || nest_holder)
+				return 2;
+
+			return print_deadlock_bug(curr, held, next);
+		}
+	}
+
+	/* No held lock of the same class was found. */
+	return 1;
+}
+
+/*
+ * There was a chain-cache miss, and we are about to add a new dependency
+ * to a previous lock. We recursively validate the following rules:
+ *
+ *  - would the adding of the <prev> -> <next> dependency create a
+ *    circular dependency in the graph? [== circular deadlock]
+ *
+ *  - does the new prev->next dependency connect any hardirq-safe lock
+ *    (in the full backwards-subgraph starting at <prev>) with any
+ *    hardirq-unsafe lock (in the full forwards-subgraph starting at
+ *    <next>)? [== illegal lock inversion with hardirq contexts]
+ *
+ *  - does the new prev->next dependency connect any softirq-safe lock
+ *    (in the full backwards-subgraph starting at <prev>) with any
+ *    softirq-unsafe lock (in the full forwards-subgraph starting at
+ *    <next>)? [== illegal lock inversion with softirq contexts]
+ *
+ * any of these scenarios could lead to a deadlock.
+ *
+ * Then if all the validations pass, we add the forwards and backwards
+ * dependency.
+ *
+ * Returns 0 if a validation fails or the dependency cannot be recorded,
+ * 1 if all checks passed (nothing is stored when either lock is a
+ * recursive read), and 2 if the <prev> -> <next> dependency was already
+ * present. A circular dependency or a BFS error returns the result of
+ * print_circular_bug()/print_bfs_bug() (presumably 0 - TODO confirm), and
+ * the verbose path returns the result of graph_lock().
+ */
+static int
+check_prev_add(struct task_struct *curr, struct held_lock *prev,
+	       struct held_lock *next, int distance, int trylock_loop)
+{
+	struct lock_list *entry;
+	int ret;
+	struct lock_list this;
+	struct lock_list *uninitialized_var(target_entry);
+	/*
+	 * Static variable, serialized by the graph_lock().
+	 *
+	 * We use this static variable to save the stack trace in case
+	 * we call into this function multiple times due to encountering
+	 * trylocks in the held lock stack. It is only (re)captured when
+	 * trylock_loop is zero; later calls reuse the saved trace.
+	 */
+	static struct stack_trace trace;
+
+	/*
+	 * Prove that the new <prev> -> <next> dependency would not
+	 * create a circular dependency in the graph. (We do this by
+	 * forward-recursing into the graph starting at <next>, and
+	 * checking whether we can reach <prev>.)
+	 *
+	 * We are using global variables to control the recursion, to
+	 * keep the stackframe size of the recursive functions low:
+	 */
+	this.class = hlock_class(next);
+	this.parent = NULL;
+	ret = check_noncircular(&this, hlock_class(prev), &target_entry);
+	if (unlikely(!ret))
+		return print_circular_bug(&this, target_entry, next, prev);
+	else if (unlikely(ret < 0))
+		return print_bfs_bug(ret);
+
+	if (!check_prev_add_irq(curr, prev, next))
+		return 0;
+
+	/*
+	 * For recursive read-locks we do all the dependency checks,
+	 * but we don't store read-triggered dependencies (only
+	 * write-triggered dependencies). This ensures that only the
+	 * write-side dependencies matter, and that if for example a
+	 * write-lock never takes any other locks, then the reads are
+	 * equivalent to a NOP.
+	 */
+	if (next->read == 2 || prev->read == 2)
+		return 1;
+	/*
+	 * Is the <prev> -> <next> dependency already present?
+	 *
+	 * (this may occur even though this is a new chain: consider
+	 * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3
+	 * chains - the second one will be new, but L1 already has
+	 * L2 added to its dependency list, due to the first chain.)
+	 *
+	 * If it is, only its distance is refreshed (to 1, when the two
+	 * locks are directly adjacent) and 2 is returned.
+	 */
+	list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
+		if (entry->class == hlock_class(next)) {
+			if (distance == 1)
+				entry->distance = 1;
+			return 2;
+		}
+	}
+
+	if (!trylock_loop && !save_trace(&trace))
+		return 0;
+
+	/*
+	 * Ok, all validations passed, add the new lock
+	 * to the previous lock's dependency list (locks_after), and
+	 * then the previous lock to the new lock's locks_before list:
+	 */
+	ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
+			       &hlock_class(prev)->locks_after,
+			       next->acquire_ip, distance, &trace);
+
+	if (!ret)
+		return 0;
+
+	ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
+			       &hlock_class(next)->locks_before,
+			       next->acquire_ip, distance, &trace);
+	if (!ret)
+		return 0;
+
+	/*
+	 * Debugging printouts (the graph lock is dropped while printing
+	 * and re-taken before returning):
+	 */
+	if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+		graph_unlock();
+		printk("\n new dependency: ");
+		print_lock_name(hlock_class(prev));
+		printk(" => ");
+		print_lock_name(hlock_class(next));
+		printk("\n");
+		dump_stack();
+		return graph_lock();
+	}
+	return 1;
+}
+
+/*
+ * Add the dependency to all directly-previous locks that are 'relevant'.
+ * The ones that are relevant are (in increasing distance from curr):
+ * all consecutive trylock entries and the final non-trylock entry - or
+ * the end of this context's lock-chain - whichever comes first.
+ *
+ * Returns 1 on success, 0 if check_prev_add() fails or one of the
+ * debugging checks below trips.
+ */
+static int
+check_prevs_add(struct task_struct *curr, struct held_lock *next)
+{
+	int depth = curr->lockdep_depth;
+	int trylock_loop = 0;
+	struct held_lock *hlock;
+
+	/*
+	 * Debugging checks.
+	 *
+	 * Depth must not be zero for a non-head lock:
+	 */
+	if (!depth)
+		goto out_bug;
+	/*
+	 * held_locks[depth] and the entry below it must be in the same
+	 * irq context, otherwise this call is treated as a bug.
+	 * NOTE(review): held_locks[depth] is presumably the slot of the
+	 * lock being acquired - confirm against the caller.
+	 */
+	if (curr->held_locks[depth].irq_context !=
+			curr->held_locks[depth-1].irq_context)
+		goto out_bug;
+
+	for (;;) {
+		int distance = curr->lockdep_depth - depth + 1;
+		hlock = curr->held_locks + depth-1;
+		/*
+		 * Only non-recursive-read entries get new dependencies
+		 * added:
+		 */
+		if (hlock->read != 2) {
+			if (!check_prev_add(curr, hlock, next,
+						distance, trylock_loop))
+				return 0;
+			/*
+			 * Stop after the first non-trylock entry,
+			 * as non-trylock entries have added their
+			 * own direct dependencies already, so this
+			 * lock is connected to them indirectly:
+			 */
+			if (!hlock->trylock)
+				break;
+		}
+		depth--;
+		/*
+		 * End of lock-stack?
+		 */
+		if (!depth)
+			break;
+		/*
+		 * Stop the search if we cross into another context:
+		 */
+		if (curr->held_locks[depth].irq_context !=
+				curr->held_locks[depth-1].irq_context)
+			break;
+		trylock_loop = 1;
+	}
+	return 1;
+out_bug:
+	if (!debug_locks_off_graph_unlock())
+		return 0;
+
+	WARN_ON(1);
+
+	return 0;
+}
+/*
+ * Chain cache storage: lock_chains[] holds the chain entries handed out
+ * by lookup_chain_cache(), chain_hlocks[] holds the lock class indices
+ * of each chain, starting at chain->base.
+ */
+unsigned long nr_lock_chains;
+struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
+int nr_chain_hlocks;
+static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
+/*
+ * Return the lock class recorded at position i of the given chain.
+ */
+struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
+{
+	return lock_classes + chain_hlocks[chain->base + i];
+}
+
+/*
+ * Look up a dependency chain. If the key is not present yet then
+ * add it and return 1 - in this case the new dependency chain is
+ * validated. If the key is already hashed, return 0.
+ * (On return with 1 graph_lock is held.)
+ *
+ * 0 is also returned when irqs are not disabled, when graph_lock()
+ * fails, or when lock_chains[] is exhausted.
+ *
+ * NOTE(review): if chain_hlocks[] has no room for the new chain, the
+ * chain is still added to the hash, but chain->base and its list of
+ * class indices are not written.
+ */
+static inline int lookup_chain_cache(struct task_struct *curr,
+				     struct held_lock *hlock,
+				     u64 chain_key)
+{
+	struct lock_class *class = hlock_class(hlock);
+	struct list_head *hash_head = chainhashentry(chain_key);
+	struct lock_chain *chain;
+	struct held_lock *hlock_curr, *hlock_next;
+	int i, j;
+
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return 0;
+	/*
+	 * We can walk it lock-free, because entries only get added
+	 * to the hash:
+	 */
+	list_for_each_entry(chain, hash_head, entry) {
+		if (chain->chain_key == chain_key) {
+cache_hit:
+			debug_atomic_inc(chain_lookup_hits);
+			if (very_verbose(class))
+				printk("\nhash chain already cached, key: "
+					"%016Lx tail class: [%p] %s\n",
+					(unsigned long long)chain_key,
+					class->key, class->name);
+			return 0;
+		}
+	}
+	if (very_verbose(class))
+		printk("\nnew hash chain, key: %016Lx tail class: [%p] %s\n",
+			(unsigned long long)chain_key, class->key, class->name);
+	/*
+	 * Allocate a new chain entry from the static array, and add
+	 * it to the hash:
+	 */
+	if (!graph_lock())
+		return 0;
+	/*
+	 * We have to walk the chain again locked - to avoid duplicates
+	 * (on a hit the lock is dropped and the cache_hit path above is
+	 * taken):
+	 */
+	list_for_each_entry(chain, hash_head, entry) {
+		if (chain->chain_key == chain_key) {
+			graph_unlock();
+			goto cache_hit;
+		}
+	}
+	if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
+		if (!debug_locks_off_graph_unlock())
+			return 0;
+
+		printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
+		printk("turning off the locking correctness validator.\n");
+		dump_stack();
+		return 0;
+	}
+	chain = lock_chains + nr_lock_chains++;
+	chain->chain_key = chain_key;
+	chain->irq_context = hlock->irq_context;
+	/*
+	 * Find the first held_lock of current chain. (hlock_next is only
+	 * ever assigned hlock, so every entry is compared against the
+	 * irq_context of the lock being added.)
+	 */
+	hlock_next = hlock;
+	for (i = curr->lockdep_depth - 1; i >= 0; i--) {
+		hlock_curr = curr->held_locks + i;
+		if (hlock_curr->irq_context != hlock_next->irq_context)
+			break;
+		hlock_next = hlock;
+	}
+	i++;
+	chain->depth = curr->lockdep_depth + 1 - i;
+	if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+		chain->base = nr_chain_hlocks;
+		nr_chain_hlocks += chain->depth;
+		for (j = 0; j < chain->depth - 1; j++, i++) {
+			int lock_id = curr->held_locks[i].class_idx - 1;
+			chain_hlocks[chain->base + j] = lock_id;
+		}
+		chain_hlocks[chain->base + j] = class - lock_classes;
+	}
+	list_add_tail_rcu(&chain->entry, hash_head);
+	debug_atomic_inc(chain_lookup_misses);
+	inc_chains();
+
+	return 1;
+}
+
+static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
+		struct held_lock *hlock, int chain_head, u64 chain_key)
+{
+	/*
+	 * Trylock needs to maintain the stack of held locks, but it
+	 * does not add new dependencies, because trylock can be done
+	 * in any order.
+	 *
+	 * We look up the chain_key and do the O(N^2) check and update of
+	 * the dependencies only if this is a new dependency chain.
+	 * (If lookup_chain_cache() returns with 1 it acquires
+	 * graph_lock for us)
+	 *
+	 * Returns 0 if check_deadlock() or check_prevs_add() fails, or
+	 * if the chain was not validated here and debug_locks is clear;
+	 * 1 otherwise.
+	 */
+	if (!hlock->trylock && (hlock->check == 2) &&
+	    lookup_chain_cache(curr, hlock, chain_key)) {
+		/*
+		 * Check whether last held lock:
+		 *
+		 * - is irq-safe, if this lock is irq-unsafe
+		 * - is softirq-safe, if this lock is hardirq-unsafe
+		 *
+		 * And check whether the new lock's dependency graph
+		 * could lead back to the previous lock.
+		 *
+		 * any of these scenarios could lead to a deadlock. If
+		 * all validations pass, check_prevs_add() below records
+		 * the new dependencies.
+		 */
+		int ret = check_deadlock(curr, hlock, lock, hlock->read);
+
+		if (!ret)
+			return 0;
+		/*
+		 * Mark recursive read, as we jump over it when
+		 * building dependencies (just like we jump over
+		 * trylock entries):
+		 */
+		if (ret == 2)
+			hlock->read = 2;
+		/*
+		 * Add dependency only if this lock is not the head
+		 * of the chain, and if it's not a secondary read-lock:
+		 */
+		if (!chain_head && ret != 2)
+			if (!check_prevs_add(curr, hlock))
+				return 0;
+		graph_unlock();
+	} else
+		/* after lookup_chain_cache(): */
+		if (unlikely(!debug_locks))
+			return 0;
+
+	return 1;
+}
+#else
+static inline int validate_chain(struct task_struct *curr,
+		struct lockdep_map *lock, struct held_lock *hlock,
+		int chain_head, u64 chain_key)
+{
+	return 1;
+}
+#endif
+
+/*
+ * We are building curr_chain_key incrementally, so double-check
+ * it from scratch, to make sure that it's done correctly.
+ *
+ * The body is only compiled with CONFIG_DEBUG_LOCKDEP. The key is
+ * restarted from 0 whenever the irq context changes between two held
+ * locks; on a mismatch debug_locks_off() is called and a warning is
+ * emitted.
+ */
+static void check_chain_key(struct task_struct *curr)
+{
+#ifdef CONFIG_DEBUG_LOCKDEP
+	struct held_lock *hlock, *prev_hlock = NULL;
+	unsigned int i, id;
+	u64 chain_key = 0;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		hlock = curr->held_locks + i;
+		if (chain_key != hlock->prev_chain_key) {
+			debug_locks_off();
+			WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
+				curr->lockdep_depth, i,
+				(unsigned long long)chain_key,
+				(unsigned long long)hlock->prev_chain_key);
+			return;
+		}
+		id = hlock->class_idx - 1;
+		if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+			return;
+
+		if (prev_hlock && (prev_hlock->irq_context !=
+							hlock->irq_context))
+			chain_key = 0;
+		chain_key = iterate_chain_key(chain_key, id);
+		prev_hlock = hlock;
+	}
+	if (chain_key != curr->curr_chain_key) {
+		debug_locks_off();
+		WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
+			curr->lockdep_depth, i,
+			(unsigned long long)chain_key,
+			(unsigned long long)curr->curr_chain_key);
+	}
+#endif
+}
+/*
+ * Print a single-CPU scenario in which the class of the given lock is
+ * taken twice.
+ */
+static void
+print_usage_bug_scenario(struct held_lock *lock)
+{
+	struct lock_class *class = hlock_class(lock);
+
+	printk(" Possible unsafe locking scenario:\n\n");
+	printk(" CPU0\n");
+	printk(" ----\n");
+	printk(" lock(");
+	__print_lock_name(class);
+	printk(");\n");
+	printk(" \n");
+	printk(" lock(");
+	__print_lock_name(class);
+	printk(");\n");
+	printk("\n *** DEADLOCK ***\n\n");
+}
+/*
+ * Report an inconsistent lock usage state: prev_bit is the usage that
+ * was already registered for the lock's class, new_bit the usage now
+ * being added. Nothing is printed when debug_locks_off_graph_unlock()
+ * fails or debug_locks_silent is set. Always returns 0.
+ */
+static int
+print_usage_bug(struct task_struct *curr, struct held_lock *this,
+		enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
+{
+	if (!debug_locks_off_graph_unlock() || debug_locks_silent)
+		return 0;
+
+	printk("\n=================================\n");
+	printk( "[ INFO: inconsistent lock state ]\n");
+	print_kernel_version();
+	printk( "---------------------------------\n");
+
+	printk("inconsistent {%s} -> {%s} usage.\n",
+		usage_str[prev_bit], usage_str[new_bit]);
+
+	printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
+		curr->comm, task_pid_nr(curr),
+		trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
+		trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
+		trace_hardirqs_enabled(curr),
+		trace_softirqs_enabled(curr));
+	print_lock(this);
+
+	printk("{%s} state was registered at:\n", usage_str[prev_bit]);
+	print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
+
+	print_irqtrace_events(curr);
+	printk("\nother info that might help us debug this:\n");
+	print_usage_bug_scenario(this);
+
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+
+/*
+ * Print out an error if an invalid bit is set.
+ *
+ * Returns 1 if bad_bit is not set in the usage_mask of the lock's class,
+ * otherwise the result of print_usage_bug() (0).
+ */
+static inline int
+valid_state(struct task_struct *curr, struct held_lock *this,
+	    enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
+{
+	if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
+		return print_usage_bug(curr, this, bad_bit, new_bit);
+	return 1;
+}
+
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+		     enum lock_usage_bit new_bit);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
+
+/*
+ *
print irq inversion bug: + */ +static int +print_irq_inversion_bug(struct task_struct *curr, + struct lock_list *root, struct lock_list *other, + struct held_lock *this, int forwards, + const char *irqclass) +{ + struct lock_list *entry = other; + struct lock_list *middle = NULL; + int depth; + + if (!debug_locks_off_graph_unlock() || debug_locks_silent) + return 0; + + printk("\n=========================================================\n"); + printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); + print_kernel_version(); + printk( "---------------------------------------------------------\n"); + printk("%s/%d just changed the state of lock:\n", + curr->comm, task_pid_nr(curr)); + print_lock(this); + if (forwards) + printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass); + else + printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass); + print_lock_name(other->class); + printk("\n\nand interrupts could create inverse lock ordering between them.\n\n"); + + printk("\nother info that might help us debug this:\n"); + + /* Find a middle lock (if one exists) */ + depth = get_lock_depth(other); + do { + if (depth == 0 && (entry != root)) { + printk("lockdep:%s bad path found in chain graph\n", __func__); + break; + } + middle = entry; + entry = get_lock_parent(entry); + depth--; + } while (entry && entry != root && (depth >= 0)); + if (forwards) + print_irq_lock_scenario(root, other, + middle ? middle->class : root->class, other->class); + else + print_irq_lock_scenario(other, root, + middle ? 
middle->class : other->class, root->class); + + lockdep_print_held_locks(curr); + + printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n"); + if (!save_trace(&root->trace)) + return 0; + print_shortest_lock_dependencies(other, root); + + printk("\nstack backtrace:\n"); + dump_stack(); + + return 0; +} + +/* + * Prove that in the forwards-direction subgraph starting at + * there is no lock matching : + */ +static int +check_usage_forwards(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit bit, const char *irqclass) +{ + int ret; + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); + + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_forwards(&root, bit, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; + + return print_irq_inversion_bug(curr, &root, target_entry, + this, 1, irqclass); +} + +/* + * Prove that in the backwards-direction subgraph starting at + * there is no lock matching : + */ +static int +check_usage_backwards(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit bit, const char *irqclass) +{ + int ret; + struct lock_list root; + struct lock_list *uninitialized_var(target_entry); + + root.parent = NULL; + root.class = hlock_class(this); + ret = find_usage_backwards(&root, bit, &target_entry); + if (ret < 0) + return print_bfs_bug(ret); + if (ret == 1) + return ret; + + return print_irq_inversion_bug(curr, &root, target_entry, + this, 0, irqclass); +} + +void print_irqtrace_events(struct task_struct *curr) +{ + printk("irq event stamp: %u\n", curr->irq_events); + printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event); + print_ip_sym(curr->hardirq_enable_ip); + printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event); + print_ip_sym(curr->hardirq_disable_ip); + printk("softirqs last enabled at (%u): ", curr->softirq_enable_event); + print_ip_sym(curr->softirq_enable_ip); + 
printk("softirqs last disabled at (%u): ", curr->softirq_disable_event); + print_ip_sym(curr->softirq_disable_ip); +} + +static int HARDIRQ_verbose(struct lock_class *class) +{ +#if HARDIRQ_VERBOSE + return class_filter(class); +#endif + return 0; +} + +static int SOFTIRQ_verbose(struct lock_class *class) +{ +#if SOFTIRQ_VERBOSE + return class_filter(class); +#endif + return 0; +} + +static int RECLAIM_FS_verbose(struct lock_class *class) +{ +#if RECLAIM_VERBOSE + return class_filter(class); +#endif + return 0; +} + +#define STRICT_READ_CHECKS 1 + +static int (*state_verbose_f[])(struct lock_class *class) = { +#define LOCKDEP_STATE(__STATE) \ + __STATE##_verbose, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +static inline int state_verbose(enum lock_usage_bit bit, + struct lock_class *class) +{ + return state_verbose_f[bit >> 2](class); +} + +typedef int (*check_usage_f)(struct task_struct *, struct held_lock *, + enum lock_usage_bit bit, const char *name); + +static int +mark_lock_irq(struct task_struct *curr, struct held_lock *this, + enum lock_usage_bit new_bit) +{ + int excl_bit = exclusive_bit(new_bit); + int read = new_bit & 1; + int dir = new_bit & 2; + + /* + * mark USED_IN has to look forwards -- to ensure no dependency + * has ENABLED state, which would allow recursion deadlocks. + * + * mark ENABLED has to look backwards -- to ensure no dependee + * has USED_IN state, which, again, would allow recursion deadlocks. + */ + check_usage_f usage = dir ? + check_usage_backwards : check_usage_forwards; + + /* + * Validate that this particular lock does not have conflicting + * usage states. + */ + if (!valid_state(curr, this, new_bit, excl_bit)) + return 0; + + /* + * Validate that the lock dependencies don't have conflicting usage + * states. 
+ */ + if ((!read || !dir || STRICT_READ_CHECKS) && + !usage(curr, this, excl_bit, state_name(new_bit & ~1))) + return 0; + + /* + * Check for read in write conflicts + */ + if (!read) { + if (!valid_state(curr, this, new_bit, excl_bit + 1)) + return 0; + + if (STRICT_READ_CHECKS && + !usage(curr, this, excl_bit + 1, + state_name(new_bit + 1))) + return 0; + } + + if (state_verbose(new_bit, hlock_class(this))) + return 2; + + return 1; +} + +enum mark_type { +#define LOCKDEP_STATE(__STATE) __STATE, +#include "lockdep_states.h" +#undef LOCKDEP_STATE +}; + +/* + * Mark all held locks with a usage bit: + */ +static int +mark_held_locks(struct task_struct *curr, enum mark_type mark) +{ + enum lock_usage_bit usage_bit; + struct held_lock *hlock; + int i; + + for (i = 0; i < curr->lockdep_depth; i++) { + hlock = curr->held_locks + i; + + usage_bit = 2 + (mark << 2); /* ENABLED */ + if (hlock->read) + usage_bit += 1; /* READ */ + + BUG_ON(usage_bit >= LOCK_USAGE_STATES); + + if (!mark_lock(curr, hlock, usage_bit)) + return 0; + } + + return 1; +} + +/* + * Hardirqs will be enabled: + */ +void trace_hardirqs_on_caller(unsigned long ip) +{ + struct task_struct *curr = current; + + time_hardirqs_on(CALLER_ADDR0, ip); + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) + return; + + if (unlikely(curr->hardirqs_enabled)) { + /* + * Neither irq nor preemption are disabled here + * so this is racy by nature but losing one hit + * in a stat is not a big deal. 
+ */ + __debug_atomic_inc(redundant_hardirqs_on); + return; + } + /* we'll do an OFF -> ON transition: */ + curr->hardirqs_enabled = 1; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) + return; + /* + * We are going to turn hardirqs on, so set the + * usage bit for all held locks: + */ + if (!mark_held_locks(curr, HARDIRQ)) + return; + /* + * If we have softirqs enabled, then set the usage + * bit for all held locks. (disabled hardirqs prevented + * this bit from being set before) + */ + if (curr->softirqs_enabled) + if (!mark_held_locks(curr, SOFTIRQ)) + return; + + curr->hardirq_enable_ip = ip; + curr->hardirq_enable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_on_events); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +/* + * Hardirqs were disabled: + */ +void trace_hardirqs_off_caller(unsigned long ip) +{ + struct task_struct *curr = current; + + time_hardirqs_off(CALLER_ADDR0, ip); + + if (unlikely(!debug_locks || current->lockdep_recursion)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->hardirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->hardirqs_enabled = 0; + curr->hardirq_disable_ip = ip; + curr->hardirq_disable_event = ++curr->irq_events; + debug_atomic_inc(hardirqs_off_events); + } else + debug_atomic_inc(redundant_hardirqs_off); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +void trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +/* + * Softirqs will be enabled: + */ +void trace_softirqs_on(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + debug_atomic_inc(redundant_softirqs_on); + return; + } + + /* + 
* We'll do an OFF -> ON transition: + */ + curr->softirqs_enabled = 1; + curr->softirq_enable_ip = ip; + curr->softirq_enable_event = ++curr->irq_events; + debug_atomic_inc(softirqs_on_events); + /* + * We are going to turn softirqs on, so set the + * usage bit for all held locks, if hardirqs are + * enabled too: + */ + if (curr->hardirqs_enabled) + mark_held_locks(curr, SOFTIRQ); +} + +/* + * Softirqs were disabled: + */ +void trace_softirqs_off(unsigned long ip) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return; + + if (curr->softirqs_enabled) { + /* + * We have done an ON -> OFF transition: + */ + curr->softirqs_enabled = 0; + curr->softirq_disable_ip = ip; + curr->softirq_disable_event = ++curr->irq_events; + debug_atomic_inc(softirqs_off_events); + DEBUG_LOCKS_WARN_ON(!softirq_count()); + } else + debug_atomic_inc(redundant_softirqs_off); +} + +static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) +{ + struct task_struct *curr = current; + + if (unlikely(!debug_locks)) + return; + + /* no reclaim without waiting on it */ + if (!(gfp_mask & __GFP_WAIT)) + return; + + /* this guy won't enter reclaim */ + if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + return; + + /* We're only interested __GFP_FS allocations for now */ + if (!(gfp_mask & __GFP_FS)) + return; + + if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) + return; + + mark_held_locks(curr, RECLAIM_FS); +} + +static void check_flags(unsigned long flags); + +void lockdep_trace_alloc(gfp_t gfp_mask) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + check_flags(flags); + current->lockdep_recursion = 1; + __lockdep_trace_alloc(gfp_mask, flags); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} + +static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock) +{ + /* + * If non-trylock 
use in a hardirq or softirq context, then
+	 * mark the lock as used in these contexts:
+	 */
+	if (!hlock->trylock) {
+		if (hlock->read) {
+			if (curr->hardirq_context)
+				if (!mark_lock(curr, hlock,
+						LOCK_USED_IN_HARDIRQ_READ))
+					return 0;
+			if (curr->softirq_context)
+				if (!mark_lock(curr, hlock,
+						LOCK_USED_IN_SOFTIRQ_READ))
+					return 0;
+		} else {
+			if (curr->hardirq_context)
+				if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ))
+					return 0;
+			if (curr->softirq_context)
+				if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ))
+					return 0;
+		}
+	}
+	if (!hlock->hardirqs_off) {	/* hardirqs were not off at acquire time */
+		if (hlock->read) {
+			if (!mark_lock(curr, hlock,
+					LOCK_ENABLED_HARDIRQ_READ))
+				return 0;
+			if (curr->softirqs_enabled)
+				if (!mark_lock(curr, hlock,
+						LOCK_ENABLED_SOFTIRQ_READ))
+					return 0;
+		} else {
+			if (!mark_lock(curr, hlock,
+					LOCK_ENABLED_HARDIRQ))
+				return 0;
+			if (curr->softirqs_enabled)
+				if (!mark_lock(curr, hlock,
+						LOCK_ENABLED_SOFTIRQ))
+					return 0;
+		}
+	}
+
+	/*
+	 * We reuse the irq context infrastructure more broadly as a general
+	 * context checking code. This tests GFP_FS recursion (a lock taken
+	 * during reclaim for a GFP_FS allocation is held over a GFP_FS
+	 * allocation).
+	 */
+	if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
+		if (hlock->read) {
+			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS_READ))
+					return 0;
+		} else {
+			if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
+					return 0;
+		}
+	}
+
+	return 1;	/* every requested usage bit was set */
+}
+/* Record @hlock's irq context; return 1 if it differs from the previous held lock's. */
+static int separate_irq_context(struct task_struct *curr,
+		struct held_lock *hlock)
+{
+	unsigned int depth = curr->lockdep_depth;
+
+	/*
+	 * Keep track of points where we cross into an interrupt context:
+	 */
+	hlock->irq_context = 2*(curr->hardirq_context ?
1 : 0) +
+				curr->softirq_context;
+	if (depth) {
+		struct held_lock *prev_hlock;
+
+		prev_hlock = curr->held_locks + depth-1;
+		/*
+		 * If we cross into another context, reset the
+		 * hash key (this also prevents the checking and the
+		 * adding of the dependency to 'prev'):
+		 */
+		if (prev_hlock->irq_context != hlock->irq_context)
+			return 1;
+	}
+	return 0;
+}
+
+#else
+/* Stub variant for the #else branch: warns if it is ever called. */
+static inline
+int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+		enum lock_usage_bit new_bit)
+{
+	WARN_ON(1);
+	return 1;
+}
+/* Stub variant: marks nothing and always reports success. */
+static inline int mark_irqflags(struct task_struct *curr,
+		struct held_lock *hlock)
+{
+	return 1;
+}
+/* Stub variant: never reports a context change. */
+static inline int separate_irq_context(struct task_struct *curr,
+		struct held_lock *hlock)
+{
+	return 0;
+}
+/* Stub variant: no-op. */
+void lockdep_trace_alloc(gfp_t gfp_mask)
+{
+}
+
+#endif
+
+/*
+ * Mark a lock with a usage bit, and validate the state transition:
+ */
+static int mark_lock(struct task_struct *curr, struct held_lock *this,
+			     enum lock_usage_bit new_bit)
+{
+	unsigned int new_mask = 1 << new_bit, ret = 1;
+
+	/*
+	 * If already set then do not dirty the cacheline,
+	 * nor do any checks:
+	 */
+	if (likely(hlock_class(this)->usage_mask & new_mask))
+		return 1;
+
+	if (!graph_lock())
+		return 0;
+	/*
+	 * Make sure we didn't race:
+	 */
+	if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
+		graph_unlock();
+		return 1;
+	}
+
+	hlock_class(this)->usage_mask |= new_mask;
+
+	if (!save_trace(hlock_class(this)->usage_traces + new_bit))
+		return 0;
+
+	switch (new_bit) {
+#define LOCKDEP_STATE(__STATE)			\
+	case LOCK_USED_IN_##__STATE:		\
+	case LOCK_USED_IN_##__STATE##_READ:	\
+	case LOCK_ENABLED_##__STATE:		\
+	case LOCK_ENABLED_##__STATE##_READ:
+#include "lockdep_states.h"
+#undef LOCKDEP_STATE
+		ret = mark_lock_irq(curr, this, new_bit);
+		if (!ret)
+			return 0;
+		break;
+	case LOCK_USED:
+		debug_atomic_dec(nr_unused_locks);
+		break;
+	default:	/* not a known usage bit */
+		if (!debug_locks_off_graph_unlock())
+			return 0;
+		WARN_ON(1);
+		return 0;
+	}
+
+	graph_unlock();
+
+	/*
+	 *
We must printk outside of the graph_lock:
+	 */
+	if (ret == 2) {	/* ret only changes from 1 via mark_lock_irq() */
+		printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
+		print_lock(this);
+		print_irqtrace_events(curr);
+		dump_stack();
+	}
+
+	return ret;
+}
+
+/*
+ * Initialize a lock instance's lock-class mapping info:
+ */
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+		      struct lock_class_key *key, int subclass)
+{
+	int i;
+
+	for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++)
+		lock->class_cache[i] = NULL;
+
+#ifdef CONFIG_LOCK_STAT
+	lock->cpu = raw_smp_processor_id();
+#endif
+
+	if (DEBUG_LOCKS_WARN_ON(!name)) {
+		lock->name = "NULL";
+		return;
+	}
+
+	lock->name = name;
+
+	if (DEBUG_LOCKS_WARN_ON(!key))
+		return;
+	/*
+	 * Sanity check, the lock-class key must be persistent:
+	 */
+	if (!static_obj(key)) {
+		printk("BUG: key %p not in .data!\n", key);
+		DEBUG_LOCKS_WARN_ON(1);
+		return;
+	}
+	lock->key = key;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	if (subclass)	/* a non-zero subclass is registered right away */
+		register_lock_class(lock, subclass, 1);
+}
+EXPORT_SYMBOL_GPL(lockdep_init_map);
+/* Locks using this key get check forced to 1 in __lock_acquire(). */
+struct lock_class_key __lockdep_no_validate__;
+
+/*
+ * This gets called for every mutex_lock*()/spin_lock*() operation.
+ * We maintain the dependency maps and validate the locking attempt:
+ */
+static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+			  int trylock, int read, int check, int hardirqs_off,
+			  struct lockdep_map *nest_lock, unsigned long ip,
+			  int references)
+{
+	struct task_struct *curr = current;
+	struct lock_class *class = NULL;
+	struct held_lock *hlock;
+	unsigned int depth, id;
+	int chain_head = 0;
+	int class_idx;
+	u64 chain_key;
+
+	if (!prove_locking)
+		check = 1;
+
+	if (unlikely(!debug_locks))
+		return 0;
+
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return 0;
+
+	if (lock->key == &__lockdep_no_validate__)
+		check = 1;
+
+	if (subclass < NR_LOCKDEP_CACHING_CLASSES)
+		class = lock->class_cache[subclass];
+	/*
+	 * Not cached?
+	 */
+	if (unlikely(!class)) {
+		class = register_lock_class(lock, subclass, 0);
+		if (!class)
+			return 0;
+	}
+	atomic_inc((atomic_t *)&class->ops);
+	if (very_verbose(class)) {
+		printk("\nacquire class [%p] %s", class->key, class->name);
+		if (class->name_version > 1)
+			printk("#%d", class->name_version);
+		printk("\n");
+		dump_stack();
+	}
+
+	/*
+	 * Add the lock to the list of currently held locks.
+	 * (we don't increase the depth just yet, up until the
+	 * dependency checks are done)
+	 */
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
+		return 0;
+
+	class_idx = class - lock_classes + 1;	/* 1-based index into lock_classes[] */
+
+	if (depth) {
+		hlock = curr->held_locks + depth - 1;
+		if (hlock->class_idx == class_idx && nest_lock) {	/* same class on top: only count it */
+			if (hlock->references)
+				hlock->references++;
+			else
+				hlock->references = 2;	/* presumably includes the first acquire */
+
+			return 1;
+		}
+	}
+
+	hlock = curr->held_locks + depth;
+	if (DEBUG_LOCKS_WARN_ON(!class))
+		return 0;
+	hlock->class_idx = class_idx;
+	hlock->acquire_ip = ip;
+	hlock->instance = lock;
+	hlock->nest_lock = nest_lock;
+	hlock->trylock = trylock;
+	hlock->read = read;
+	hlock->check = check;
+	hlock->hardirqs_off = !!hardirqs_off;
+	hlock->references = references;
+#ifdef CONFIG_LOCK_STAT
+	hlock->waittime_stamp = 0;
+	hlock->holdtime_stamp = lockstat_clock();
+#endif
+
+	if (check == 2 && !mark_irqflags(curr, hlock))
+		return 0;
+
+	/* mark it as used: */
+	if (!mark_lock(curr, hlock, LOCK_USED))
+		return 0;
+
+	/*
+	 * Calculate the chain hash: it's the combined hash of all the
+	 * lock keys along the dependency chain. We save the hash value
+	 * at every step so that we can get the current hash easily
+	 * after unlock. The chain hash is then used to cache dependency
+	 * results.
+	 *
+	 * The 'key ID' is what is the most compact key value to drive
+	 * the hash, not class->key.
+	 */
+	id = class - lock_classes;
+	if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
+		return 0;
+
+	chain_key = curr->curr_chain_key;
+	if (!depth) {
+		if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
+			return 0;
+		chain_head = 1;
+	}
+
+	hlock->prev_chain_key = chain_key;
+	if (separate_irq_context(curr, hlock)) {
+		chain_key = 0;
+		chain_head = 1;
+	}
+	chain_key = iterate_chain_key(chain_key, id);
+
+	if (!validate_chain(curr, lock, hlock, chain_head, chain_key))
+		return 0;
+
+	curr->curr_chain_key = chain_key;
+	curr->lockdep_depth++;	/* the new entry only counts as held from here on */
+	check_chain_key(curr);
+#ifdef CONFIG_DEBUG_LOCKDEP
+	if (unlikely(!debug_locks))
+		return 0;
+#endif
+	if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
+		debug_locks_off();
+		printk("BUG: MAX_LOCK_DEPTH too low!\n");
+		printk("turning off the locking correctness validator.\n");
+		dump_stack();
+		return 0;
+	}
+
+	if (unlikely(curr->lockdep_depth > max_lockdep_depth))
+		max_lockdep_depth = curr->lockdep_depth;
+
+	return 1;
+}
+/* Report a "bad unlock balance" for @lock at @ip; always returns 0. */
+static int
+print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
+			   unsigned long ip)
+{
+	if (!debug_locks_off())
+		return 0;
+	if (debug_locks_silent)
+		return 0;
+
+	printk("\n=====================================\n");
+	printk( "[ BUG: bad unlock balance detected! ]\n");
+	printk( "-------------------------------------\n");
+	printk("%s/%d is trying to release lock (",
+		curr->comm, task_pid_nr(curr));
+	print_lockdep_cache(lock);
+	printk(") at:\n");
+	print_ip_sym(ip);
+	printk("but there are no more locks to release!\n");
+	printk("\nother info that might help us debug this:\n");
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+
+/*
+ * Common debugging checks for both nested and non-nested unlock:
+ */
+static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
+			unsigned long ip)
+{
+	if (unlikely(!debug_locks))
+		return 0;
+	if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
+		return 0;
+
+	if (curr->lockdep_depth <= 0)
+		return print_unlock_inbalance_bug(curr, lock, ip);
+
+	return 1;
+}
+/* Return 1 if @hlock is @lock itself or, when it has references, of @lock's subclass-0 class. */
+static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
+{
+	if (hlock->instance == lock)
+		return 1;
+
+	if (hlock->references) {
+		struct lock_class *class = lock->class_cache[0];
+
+		if (!class)
+			class = look_up_lock_class(lock, 0);
+
+		if (DEBUG_LOCKS_WARN_ON(!class))
+			return 0;
+
+		if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
+			return 0;
+
+		if (hlock->class_idx == class - lock_classes + 1)
+			return 1;
+	}
+
+	return 0;
+}
+/* Re-key the held @lock to (@name, @key, @subclass) and re-acquire it and all later entries. */
+static int
+__lock_set_class(struct lockdep_map *lock, const char *name,
+		 struct lock_class_key *key, unsigned int subclass,
+		 unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class *class;
+	unsigned int depth;
+	int i;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (match_held_lock(hlock, lock))
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+	lockdep_init_map(lock, name, key, 0);
+	class = register_lock_class(lock, subclass, 0);	/* NOTE(review): not NULL-checked here, unlike in __lock_acquire() - confirm */
+	hlock->class_idx = class - lock_classes + 1;
+
+	curr->lockdep_depth = i;	/* truncate the stack, then replay entries i..depth-1 */
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	for (; i < depth; i++) {
+		hlock = curr->held_locks + i;
+		if (!__lock_acquire(hlock->instance,
+			hlock_class(hlock)->subclass, hlock->trylock,
+				hlock->read, hlock->check, hlock->hardirqs_off,
+				hlock->nest_lock, hlock->acquire_ip,
+				hlock->references))
+			return 0;
+	}
+
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
+		return 0;
+	return 1;
+}
+
+/*
+ * Remove the lock from the list of currently held locks in a
+ * potentially non-nested (out of order) manner. This is a
+ * relatively rare operation, as all the unlock APIs default
+ * to nested mode (which uses lock_release()):
+ */
+static int
+lock_release_non_nested(struct task_struct *curr,
+			struct lockdep_map *lock, unsigned long ip)
+{
+	struct held_lock *hlock, *prev_hlock;
+	unsigned int depth;
+	int i;
+
+	/*
+	 * Check whether the lock exists in the current stack
+	 * of held locks:
+	 */
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return 0;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (match_held_lock(hlock, lock))
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	return print_unlock_inbalance_bug(curr, lock, ip);
+
+found_it:
+	if (hlock->instance == lock)
+		lock_release_holdtime(hlock);
+
+	if (hlock->references) {
+		hlock->references--;
+		if (hlock->references) {
+			/*
+			 * We had, and after removing one, still have
+			 * references, the current lock stack is still
+			 * valid. We're done!
+			 */
+			return 1;
+		}
+	}
+
+	/*
+	 * We have the right lock to unlock, 'hlock' points to it.
+	 * Now we remove it from the stack, and add back the other
+	 * entries (if any), recalculating the hash along the way:
+	 */
+
+	curr->lockdep_depth = i;
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	for (i++; i < depth; i++) {	/* i++ skips the released entry itself */
+		hlock = curr->held_locks + i;
+		if (!__lock_acquire(hlock->instance,
+			hlock_class(hlock)->subclass, hlock->trylock,
+				hlock->read, hlock->check, hlock->hardirqs_off,
+				hlock->nest_lock, hlock->acquire_ip,
+				hlock->references))
+			return 0;
+	}
+
+	if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
+		return 0;
+	return 1;
+}
+
+/*
+ * Remove the lock from the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e. the current top of the lock-stack is unlocked)
+ */
+static int lock_release_nested(struct task_struct *curr,
+			       struct lockdep_map *lock, unsigned long ip)
+{
+	struct held_lock *hlock;
+	unsigned int depth;
+
+	/*
+	 * Pop off the top of the lock stack:
+	 */
+	depth = curr->lockdep_depth - 1;
+	hlock = curr->held_locks + depth;
+
+	/*
+	 * Is the unlock non-nested:
+	 */
+	if (hlock->instance != lock || hlock->references)
+		return lock_release_non_nested(curr, lock, ip);
+	curr->lockdep_depth--;
+
+	if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
+		return 0;
+
+	curr->curr_chain_key = hlock->prev_chain_key;
+
+	lock_release_holdtime(hlock);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+	hlock->prev_chain_key = 0;
+	hlock->class_idx = 0;
+	hlock->acquire_ip = 0;
+	hlock->irq_context = 0;
+#endif
+	return 1;
+}
+
+/*
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()). This is done for unlocks that nest
+ * perfectly. (i.e.
the current top of the lock-stack is unlocked)
+ */
+static void	/* NOTE: handles both cases; @nested picks the nested or the non-nested helper */
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+{
+	struct task_struct *curr = current;
+
+	if (!check_unlock(curr, lock, ip))
+		return;
+
+	if (nested) {
+		if (!lock_release_nested(curr, lock, ip))
+			return;
+	} else {
+		if (!lock_release_non_nested(curr, lock, ip))
+			return;
+	}
+
+	check_chain_key(curr);
+}
+/* Return 1 if any entry of current's held-lock stack matches @lock, else 0. */
+static int __lock_is_held(struct lockdep_map *lock)
+{
+	struct task_struct *curr = current;
+	int i;
+
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		struct held_lock *hlock = curr->held_locks + i;
+
+		if (match_held_lock(hlock, lock))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Check whether we follow the irq-flags state precisely:
+ */
+static void check_flags(unsigned long flags)
+{
+#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
+    defined(CONFIG_TRACE_IRQFLAGS)
+	if (!debug_locks)
+		return;
+
+	if (irqs_disabled_flags(flags)) {
+		if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
+			printk("possible reason: unannotated irqs-off.\n");
+		}
+	} else {
+		if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
+			printk("possible reason: unannotated irqs-on.\n");
+		}
+	}
+
+	/*
+	 * We dont accurately track softirq state in e.g.
+	 * hardirq contexts (such as on 4KSTACKS), so only
+	 * check if not in hardirq contexts:
+	 */
+	if (!hardirq_count()) {
+		if (softirq_count())
+			DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
+		else
+			DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
+	}
+
+	if (!debug_locks)	/* one of the checks above switched lockdep off */
+		print_irqtrace_events(current);
+#endif
+}
+/* Exported wrapper: runs __lock_set_class() with irqs off and lockdep_recursion set. */
+void lock_set_class(struct lockdep_map *lock, const char *name,
+		    struct lock_class_key *key, unsigned int subclass,
+		    unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	current->lockdep_recursion = 1;	/* NOTE(review): set before check_flags() here, after it in lock_acquire()/lock_release() */
+	check_flags(flags);
+	if (__lock_set_class(lock, name, key, subclass, ip))
+		check_chain_key(current);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_set_class);
+
+/*
+ * We are not always called with irqs disabled - do that here,
+ * and also avoid lockdep recursion:
+ */
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+			  int trylock, int read, int check,
+			  struct lockdep_map *nest_lock, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+	__lock_acquire(lock, subclass, trylock, read, check,
+		       irqs_disabled_flags(flags), nest_lock, ip, 0);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquire);
+/* Exported wrapper: runs __lock_release() with irqs off and lockdep_recursion set. */
+void lock_release(struct lockdep_map *lock, int nested,
+			  unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	trace_lock_release(lock, ip);
+	__lock_release(lock, nested, ip);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_release);
+/* Return non-zero if current holds @lock; also 1 while lockdep_recursion is set. */
+int lock_is_held(struct lockdep_map
*lock)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	if (unlikely(current->lockdep_recursion))
+		return 1; /* avoid false negative lockdep_assert_held() */
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+
+	current->lockdep_recursion = 1;
+	ret = __lock_is_held(lock);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(lock_is_held);
+/* Store @gfp_mask in current->lockdep_reclaim_gfp, which mark_irqflags() tests for __GFP_FS. */
+void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
+{
+	current->lockdep_reclaim_gfp = gfp_mask;
+}
+/* Reset current->lockdep_reclaim_gfp to 0. */
+void lockdep_clear_current_reclaim_state(void)
+{
+	current->lockdep_reclaim_gfp = 0;
+}
+/* Lock-statistics hooks, built only with CONFIG_LOCK_STAT: */
+#ifdef CONFIG_LOCK_STAT
+static int
+print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
+			   unsigned long ip)
+{
+	if (!debug_locks_off())
+		return 0;
+	if (debug_locks_silent)
+		return 0;
+
+	printk("\n=================================\n");
+	printk( "[ BUG: bad contention detected! ]\n");
+	printk( "---------------------------------\n");
+	printk("%s/%d is trying to contend lock (",
+		curr->comm, task_pid_nr(curr));
+	print_lockdep_cache(lock);
+	printk(") at:\n");
+	print_ip_sym(ip);
+	printk("but there are no locks held!\n");
+	printk("\nother info that might help us debug this:\n");
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+
+	return 0;
+}
+/* Account contention on the held @lock: stamp the wait start and bump the per-class counters. */
+static void
+__lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class_stats *stats;
+	unsigned int depth;
+	int i, contention_point, contending_point;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (match_held_lock(hlock, lock))
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	print_lock_contention_bug(curr, lock, ip);
+	return;
+
+found_it:
+	if (hlock->instance != lock)	/* matched by class only: nothing to account */
+		return;
+
+	hlock->waittime_stamp = lockstat_clock();
+
+	contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
+	contending_point = lock_point(hlock_class(hlock)->contending_point,
+				      lock->ip);
+
+	stats = get_lock_stats(hlock_class(hlock));
+	if (contention_point < LOCKSTAT_POINTS)
+		stats->contention_point[contention_point]++;
+	if (contending_point < LOCKSTAT_POINTS)
+		stats->contending_point[contending_point]++;
+	if (lock->cpu != smp_processor_id())
+		stats->bounces[bounce_contended + !!hlock->read]++;
+	put_lock_stats(stats);
+}
+/* Account the end of a wait on the held @lock and remember the acquiring cpu and ip in it. */
+static void
+__lock_acquired(struct lockdep_map *lock, unsigned long ip)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock, *prev_hlock;
+	struct lock_class_stats *stats;
+	unsigned int depth;
+	u64 now, waittime = 0;
+	int i, cpu;
+
+	depth = curr->lockdep_depth;
+	if (DEBUG_LOCKS_WARN_ON(!depth))
+		return;
+
+	prev_hlock = NULL;
+	for (i = depth-1; i >= 0; i--) {
+		hlock = curr->held_locks + i;
+		/*
+		 * We must not cross into another context:
+		 */
+		if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
+			break;
+		if (match_held_lock(hlock, lock))
+			goto found_it;
+		prev_hlock = hlock;
+	}
+	print_lock_contention_bug(curr, lock, _RET_IP_);	/* NOTE(review): _RET_IP_ here, ip in __lock_contended() - confirm intended */
+	return;
+
+found_it:
+	if (hlock->instance != lock)
+		return;
+
+	cpu = smp_processor_id();
+	if (hlock->waittime_stamp) {	/* set by __lock_contended(): we did wait */
+		now = lockstat_clock();
+		waittime = now - hlock->waittime_stamp;
+		hlock->holdtime_stamp = now;
+	}
+
+	trace_lock_acquired(lock, ip);
+
+	stats = get_lock_stats(hlock_class(hlock));
+	if (waittime) {
+		if (hlock->read)
+			lock_time_inc(&stats->read_waittime, waittime);
+		else
+			lock_time_inc(&stats->write_waittime, waittime);
+	}
+	if (lock->cpu != cpu)
+		stats->bounces[bounce_acquired + !!hlock->read]++;
+	put_lock_stats(stats);
+
+	lock->cpu = cpu;
+	lock->ip = ip;
+}
+/* Exported hook: returns early unless lock_stat is set, else runs __lock_contended() with irqs off. */
+void lock_contended(struct lockdep_map *lock, unsigned long ip)
+{
+	unsigned long flags;
+
+	if
(unlikely(!lock_stat))
+		return;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	trace_lock_contended(lock, ip);
+	__lock_contended(lock, ip);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_contended);
+/* Exported hook: returns early unless lock_stat is set, else runs __lock_acquired() with irqs off. */
+void lock_acquired(struct lockdep_map *lock, unsigned long ip)
+{
+	unsigned long flags;
+
+	if (unlikely(!lock_stat))
+		return;
+
+	if (unlikely(current->lockdep_recursion))
+		return;
+
+	raw_local_irq_save(flags);
+	check_flags(flags);
+	current->lockdep_recursion = 1;
+	__lock_acquired(lock, ip);
+	current->lockdep_recursion = 0;
+	raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_acquired);
+#endif
+
+/*
+ * Used by the testsuite, sanitize the validator state
+ * after a simulated failure:
+ */
+
+void lockdep_reset(void)
+{
+	unsigned long flags;
+	int i;
+
+	raw_local_irq_save(flags);
+	current->curr_chain_key = 0;
+	current->lockdep_depth = 0;
+	current->lockdep_recursion = 0;
+	memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
+	nr_hardirq_chains = 0;
+	nr_softirq_chains = 0;
+	nr_process_chains = 0;
+	debug_locks = 1;	/* re-arm the validator */
+	for (i = 0; i < CHAINHASH_SIZE; i++)
+		INIT_LIST_HEAD(chainhash_table + i);
+	raw_local_irq_restore(flags);
+}
+/* Unlink @class from the dependency lists, the class hash and the class list; clear its key. */
+static void zap_class(struct lock_class *class)
+{
+	int i;
+
+	/*
+	 * Remove all dependencies this lock is
+	 * involved in:
+	 */
+	for (i = 0; i < nr_list_entries; i++) {
+		if (list_entries[i].class == class)
+			list_del_rcu(&list_entries[i].entry);
+	}
+	/*
+	 * Unhash the class and remove it from the all_lock_classes list:
+	 */
+	list_del_rcu(&class->hash_entry);
+	list_del_rcu(&class->lock_entry);
+
+	class->key = NULL;
+}
+/* Return non-zero if @addr lies in [@start, @start + @size). */
+static inline int within(const void *addr, void *start, unsigned long size)
+{
+	return addr >= start && addr < start + size;
+}
+/* Zap every hashed class whose key or name lies in [@start, @start + @size). */
+void lockdep_free_key_range(void *start, unsigned long size)
+{
+	struct lock_class *class, *next;
+
struct list_head *head;
+	unsigned long flags;
+	int i;
+	int locked;
+
+	raw_local_irq_save(flags);
+	locked = graph_lock();	/* may fail; the walk below still runs, only the unlock is skipped */
+
+	/*
+	 * Unhash all classes that were created by this module:
+	 */
+	for (i = 0; i < CLASSHASH_SIZE; i++) {
+		head = classhash_table + i;
+		if (list_empty(head))
+			continue;
+		list_for_each_entry_safe(class, next, head, hash_entry) {
+			if (within(class->key, start, size))
+				zap_class(class);
+			else if (within(class->name, start, size))
+				zap_class(class);
+		}
+	}
+
+	if (locked)
+		graph_unlock();
+	raw_local_irq_restore(flags);
+}
+/* Zap all classes of @lock, then check that none of its cached classes is still hashed. */
+void lockdep_reset_lock(struct lockdep_map *lock)
+{
+	struct lock_class *class, *next;
+	struct list_head *head;
+	unsigned long flags;
+	int i, j;
+	int locked;
+
+	raw_local_irq_save(flags);
+
+	/*
+	 * Remove all classes this lock might have:
+	 */
+	for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
+		/*
+		 * If the class exists we look it up and zap it:
+		 */
+		class = look_up_lock_class(lock, j);
+		if (class)
+			zap_class(class);	/* NOTE(review): graph_lock() is only taken further down - confirm */
+	}
+	/*
+	 * Debug check: in the end all mapped classes should
+	 * be gone.
+	 */
+	locked = graph_lock();
+	for (i = 0; i < CLASSHASH_SIZE; i++) {
+		head = classhash_table + i;
+		if (list_empty(head))
+			continue;
+		list_for_each_entry_safe(class, next, head, hash_entry) {
+			int match = 0;
+
+			for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
+				match |= class == lock->class_cache[j];
+
+			if (unlikely(match)) {
+				if (debug_locks_off_graph_unlock())
+					WARN_ON(1);
+				goto out_restore;	/* skips the graph_unlock() below */
+			}
+		}
+	}
+	if (locked)
+		graph_unlock();
+
+out_restore:
+	raw_local_irq_restore(flags);
+}
+/* Initialize the class and chain hash tables; later calls return immediately. */
+void lockdep_init(void)
+{
+	int i;
+
+	/*
+	 * Some architectures have their own start_kernel()
+	 * code which calls lockdep_init(), while we also
+	 * call lockdep_init() from the start_kernel() itself,
+	 * and we want to initialize the hashes only once:
+	 */
+	if (lockdep_initialized)
+		return;
+
+	for (i = 0; i < CLASSHASH_SIZE; i++)
+		INIT_LIST_HEAD(classhash_table + i);
+
+	for (i = 0; i < CHAINHASH_SIZE; i++)
+		INIT_LIST_HEAD(chainhash_table + i);
+
+	lockdep_initialized = 1;
+}
+/* Print the validator's compile-time limits and the size of its static tables. */
+void __init lockdep_info(void)
+{
+	printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
+
+	printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
+	printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
+	printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
+	printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
+	printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
+	printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
+	printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
+
+	printk(" memory used by lock dependency info: %lu kB\n",
+		(sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
+		sizeof(struct list_head) * CLASSHASH_SIZE +
+		sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
+		sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
+		sizeof(struct list_head) * CHAINHASH_SIZE
+#ifdef CONFIG_PROVE_LOCKING
+		+ sizeof(struct circular_queue)
+#endif
+		) / 1024
+		);
+
+	printk(" per task-struct memory footprint: %lu bytes\n",
+		sizeof(struct held_lock) * MAX_LOCK_DEPTH);
+
+#ifdef CONFIG_DEBUG_LOCKDEP
+	if (lockdep_init_error) {
+		printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
+		printk("Call stack leading to lockdep invocation was:\n");
+		print_stack_trace(&lockdep_init_trace, 0);
+	}
+#endif
+}
+/* Report that memory [@mem_from, @mem_to) is being freed while @hlock is still held. */
+static void
+print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
+		     const void *mem_to, struct held_lock *hlock)
+{
+	if (!debug_locks_off())
+		return;
+	if (debug_locks_silent)
+		return;
+
+	printk("\n=========================\n");
+	printk( "[ BUG: held lock freed! ]\n");
+	printk( "-------------------------\n");
+	printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
+		curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
+	print_lock(hlock);
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+}
+/* Return non-zero if the lock range and the memory range do not overlap. */
+static inline int not_in_range(const void* mem_from, unsigned long mem_len,
+				const void* lock_from, unsigned long lock_len)
+{
+	return lock_from + lock_len <= mem_from ||
+		mem_from + mem_len <= lock_from;
+}
+
+/*
+ * Called when kernel memory is freed (or unmapped), or if a lock
+ * is destroyed or reinitialized - this code checks whether there is
+ * any held lock in the memory range mem_from to mem_from + mem_len:
+ */
+void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
+{
+	struct task_struct *curr = current;
+	struct held_lock *hlock;
+	unsigned long flags;
+	int i;
+
+	if (unlikely(!debug_locks))
+		return;
+
+	local_irq_save(flags);
+	for (i = 0; i < curr->lockdep_depth; i++) {
+		hlock = curr->held_locks + i;
+
+		if (not_in_range(mem_from, mem_len, hlock->instance,
+					sizeof(*hlock->instance)))
+			continue;
+
+		print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock);
+		break;	/* only the first overlapping held lock is reported */
+	}
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
+/* Report that @curr is exiting while its held-lock stack is not empty. */
+static void print_held_locks_bug(struct task_struct *curr)
+{
+	if (!debug_locks_off())
+		return;
+	if (debug_locks_silent)
+		return;
+
+	printk("\n=====================================\n");
+	printk( "[ BUG: lock held at task exit time! ]\n");
+	printk( "-------------------------------------\n");
+	printk("%s/%d is exiting with locks still held!\n",
+		curr->comm, task_pid_nr(curr));
+	lockdep_print_held_locks(curr);
+
+	printk("\nstack backtrace:\n");
+	dump_stack();
+}
+/* Call print_held_locks_bug() if @task still has a non-zero lockdep_depth. */
+void debug_check_no_locks_held(struct task_struct *task)
+{
+	if (unlikely(task->lockdep_depth > 0))
+		print_held_locks_bug(task);
+}
+/* Print the held locks of every thread that is not TASK_RUNNING, and of current. */
+void debug_show_all_locks(void)
+{
+	struct task_struct *g, *p;
+	int count = 10;
+	int unlock = 1;
+
+	if (unlikely(!debug_locks)) {
+		printk("INFO: lockdep is turned off.\n");
+		return;
+	}
+	printk("\nShowing all locks held in the system:\n");
+
+	/*
+	 * Here we try to get the tasklist_lock as hard as possible,
+	 * if not successful after 2 seconds we ignore it (but keep
+	 * trying). This is to enable a debug printout even if a
+	 * tasklist_lock-holding task deadlocks or crashes.
+	 */
+retry:
+	if (!read_trylock(&tasklist_lock)) {
+		if (count == 10)
+			printk("hm, tasklist_lock locked, retrying... ");
+		if (count) {
+			count--;
+			printk(" #%d", 10-count);
+			mdelay(200);	/* 10 tries x 200ms = the 2 seconds above */
+			goto retry;
+		}
+		printk(" ignoring it.\n");
+		unlock = 0;
+	} else {
+		if (count != 10)
+			printk(KERN_CONT " locked it.\n");
+	}
+
+	do_each_thread(g, p) {
+		/*
+		 * It's not reliable to print a task's held locks
+		 * if it's not sleeping (or if it's not the current
+		 * task):
+		 */
+		if (p->state == TASK_RUNNING && p != current)
+			continue;
+		if (p->lockdep_depth)
+			lockdep_print_held_locks(p);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+	printk("\n");
+	printk("=============================================\n\n");
+
+	if (unlock)
+		read_unlock(&tasklist_lock);
+}
+EXPORT_SYMBOL_GPL(debug_show_all_locks);
+
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void debug_show_held_locks(struct task_struct *task)
+{
+	if (unlikely(!debug_locks)) {
+		printk("INFO: lockdep is turned off.\n");
+		return;
+	}
+	lockdep_print_held_locks(task);
+}
+EXPORT_SYMBOL_GPL(debug_show_held_locks);
+/* Report (once: it turns debug_locks off) that current leaves the kernel with locks held. */
+void lockdep_sys_exit(void)
+{
+	struct task_struct *curr = current;
+
+	if (unlikely(curr->lockdep_depth)) {
+		if (!debug_locks_off())
+			return;
+		printk("\n================================================\n");
+		printk(  "[ BUG: lock held when returning to user space! ]\n");
+		printk(  "------------------------------------------------\n");
+		printk("%s/%d is leaving the kernel with locks still held!\n",
+				curr->comm, curr->pid);
+		lockdep_print_held_locks(curr);
+	}
+}
+/* Report an rcu_dereference_check() at @file:@line that ran without protection. */
+void lockdep_rcu_dereference(const char *file, const int line)
+{
+	struct task_struct *curr = current;
+
+#ifndef CONFIG_PROVE_RCU_REPEATEDLY
+	if (!debug_locks_off())
+		return;
+#endif /* #ifndef CONFIG_PROVE_RCU_REPEATEDLY */
+	/* Note: the following can be executed concurrently, so be careful. */
+	printk("\n===================================================\n");
+	printk(  "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
+	printk(  "---------------------------------------------------\n");
+	printk("%s:%d invoked rcu_dereference_check() without protection!\n",
+			file, line);
+	printk("\nother info that might help us debug this:\n\n");
+	printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
+	lockdep_print_held_locks(curr);
+	printk("\nstack backtrace:\n");
+	dump_stack();
+}
+EXPORT_SYMBOL_GPL(lockdep_rcu_dereference);
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
new file mode 100644
index 00000000..4f560cfe
--- /dev/null
+++ b/kernel/lockdep_internals.h
@@ -0,0 +1,170 @@
+/*
+ * kernel/lockdep_internals.h
+ *
+ * Runtime locking correctness validator
+ *
+ * lockdep subsystem internal functions and variables.
+ */ + +/* + * Lock-class usage-state bits: + */ +enum lock_usage_bit { +#define LOCKDEP_STATE(__STATE) \ + LOCK_USED_IN_##__STATE, \ + LOCK_USED_IN_##__STATE##_READ, \ + LOCK_ENABLED_##__STATE, \ + LOCK_ENABLED_##__STATE##_READ, +#include "lockdep_states.h" +#undef LOCKDEP_STATE + LOCK_USED, + LOCK_USAGE_STATES +}; + +/* + * Usage-state bitmasks: + */ +#define __LOCKF(__STATE) LOCKF_##__STATE = (1 << LOCK_##__STATE), + +enum { +#define LOCKDEP_STATE(__STATE) \ + __LOCKF(USED_IN_##__STATE) \ + __LOCKF(USED_IN_##__STATE##_READ) \ + __LOCKF(ENABLED_##__STATE) \ + __LOCKF(ENABLED_##__STATE##_READ) +#include "lockdep_states.h" +#undef LOCKDEP_STATE + __LOCKF(USED) +}; + +#define LOCKF_ENABLED_IRQ (LOCKF_ENABLED_HARDIRQ | LOCKF_ENABLED_SOFTIRQ) +#define LOCKF_USED_IN_IRQ (LOCKF_USED_IN_HARDIRQ | LOCKF_USED_IN_SOFTIRQ) + +#define LOCKF_ENABLED_IRQ_READ \ + (LOCKF_ENABLED_HARDIRQ_READ | LOCKF_ENABLED_SOFTIRQ_READ) +#define LOCKF_USED_IN_IRQ_READ \ + (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) + +/* + * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies + * we track. + * + * We use the per-lock dependency maps in two ways: we grow it by adding + * every to-be-taken lock to all currently held lock's own dependency + * table (if it's not there yet), and we check it for lock order + * conflicts and deadlocks. + */ +#define MAX_LOCKDEP_ENTRIES 16384UL + +#define MAX_LOCKDEP_CHAINS_BITS 15 +#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + +#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) + +/* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the hash_lock. 
+ */ +#define MAX_STACK_TRACE_ENTRIES 262144UL + +extern struct list_head all_lock_classes; +extern struct lock_chain lock_chains[]; + +#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2) + +extern void get_usage_chars(struct lock_class *class, + char usage[LOCK_USAGE_CHARS]); + +extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); + +struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); + +extern unsigned long nr_lock_classes; +extern unsigned long nr_list_entries; +extern unsigned long nr_lock_chains; +extern int nr_chain_hlocks; +extern unsigned long nr_stack_trace_entries; + +extern unsigned int nr_hardirq_chains; +extern unsigned int nr_softirq_chains; +extern unsigned int nr_process_chains; +extern unsigned int max_lockdep_depth; +extern unsigned int max_recursion_depth; + +extern unsigned int max_bfs_queue_depth; + +#ifdef CONFIG_PROVE_LOCKING +extern unsigned long lockdep_count_forward_deps(struct lock_class *); +extern unsigned long lockdep_count_backward_deps(struct lock_class *); +#else +static inline unsigned long +lockdep_count_forward_deps(struct lock_class *class) +{ + return 0; +} +static inline unsigned long +lockdep_count_backward_deps(struct lock_class *class) +{ + return 0; +} +#endif + +#ifdef CONFIG_DEBUG_LOCKDEP + +#include +/* + * Various lockdep statistics. + * We want them per cpu as they are often accessed in fast path + * and we want to avoid too much cache bouncing. 
+ */ +struct lockdep_stats { + int chain_lookup_hits; + int chain_lookup_misses; + int hardirqs_on_events; + int hardirqs_off_events; + int redundant_hardirqs_on; + int redundant_hardirqs_off; + int softirqs_on_events; + int softirqs_off_events; + int redundant_softirqs_on; + int redundant_softirqs_off; + int nr_unused_locks; + int nr_cyclic_checks; + int nr_cyclic_check_recursions; + int nr_find_usage_forwards_checks; + int nr_find_usage_forwards_recursions; + int nr_find_usage_backwards_checks; + int nr_find_usage_backwards_recursions; +}; + +DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats); + +#define __debug_atomic_inc(ptr) \ + this_cpu_inc(lockdep_stats.ptr) + +#define debug_atomic_inc(ptr) do { \ + WARN_ON_ONCE(!irqs_disabled()); \ + __this_cpu_inc(lockdep_stats.ptr); \ +} while (0) + +#define debug_atomic_dec(ptr) do { \ + WARN_ON_ONCE(!irqs_disabled()); \ + __this_cpu_dec(lockdep_stats.ptr); \ +} while (0) + +#define debug_atomic_read(ptr) ({ \ + struct lockdep_stats *__cpu_lockdep_stats; \ + unsigned long long __total = 0; \ + int __cpu; \ + for_each_possible_cpu(__cpu) { \ + __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \ + __total += __cpu_lockdep_stats->ptr; \ + } \ + __total; \ +}) +#else +# define __debug_atomic_inc(ptr) do { } while (0) +# define debug_atomic_inc(ptr) do { } while (0) +# define debug_atomic_dec(ptr) do { } while (0) +# define debug_atomic_read(ptr) 0 +#endif diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c new file mode 100644 index 00000000..71edd2f6 --- /dev/null +++ b/kernel/lockdep_proc.c @@ -0,0 +1,680 @@ +/* + * kernel/lockdep_proc.c + * + * Runtime locking correctness validator + * + * Started by Ingo Molnar: + * + * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + * + * Code for /proc/lockdep and /proc/lockdep_stats: + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lockdep_internals.h" + +static void 
*l_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &all_lock_classes, pos); +} + +static void *l_start(struct seq_file *m, loff_t *pos) +{ + return seq_list_start_head(&all_lock_classes, *pos); +} + +static void l_stop(struct seq_file *m, void *v) +{ +} + +static void print_name(struct seq_file *m, struct lock_class *class) +{ + char str[128]; + const char *name = class->name; + + if (!name) { + name = __get_key_name(class->key, str); + seq_printf(m, "%s", name); + } else{ + seq_printf(m, "%s", name); + if (class->name_version > 1) + seq_printf(m, "#%d", class->name_version); + if (class->subclass) + seq_printf(m, "/%d", class->subclass); + } +} + +static int l_show(struct seq_file *m, void *v) +{ + struct lock_class *class = list_entry(v, struct lock_class, lock_entry); + struct lock_list *entry; + char usage[LOCK_USAGE_CHARS]; + + if (v == &all_lock_classes) { + seq_printf(m, "all lock classes:\n"); + return 0; + } + + seq_printf(m, "%p", class->key); +#ifdef CONFIG_DEBUG_LOCKDEP + seq_printf(m, " OPS:%8ld", class->ops); +#endif +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class)); + seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class)); +#endif + + get_usage_chars(class, usage); + seq_printf(m, " %s", usage); + + seq_printf(m, ": "); + print_name(m, class); + seq_puts(m, "\n"); + + list_for_each_entry(entry, &class->locks_after, entry) { + if (entry->distance == 1) { + seq_printf(m, " -> [%p] ", entry->class->key); + print_name(m, entry->class); + seq_puts(m, "\n"); + } + } + seq_puts(m, "\n"); + + return 0; +} + +static const struct seq_operations lockdep_ops = { + .start = l_start, + .next = l_next, + .stop = l_stop, + .show = l_show, +}; + +static int lockdep_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lockdep_ops); +} + +static const struct file_operations proc_lockdep_operations = { + .open = lockdep_open, + .read = seq_read, + .llseek = seq_lseek, + 
.release = seq_release, +}; + +#ifdef CONFIG_PROVE_LOCKING +static void *lc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) + return SEQ_START_TOKEN; + + if (*pos - 1 < nr_lock_chains) + return lock_chains + (*pos - 1); + + return NULL; +} + +static void *lc_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return lc_start(m, pos); +} + +static void lc_stop(struct seq_file *m, void *v) +{ +} + +static int lc_show(struct seq_file *m, void *v) +{ + struct lock_chain *chain = v; + struct lock_class *class; + int i; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, "all lock chains:\n"); + return 0; + } + + seq_printf(m, "irq_context: %d\n", chain->irq_context); + + for (i = 0; i < chain->depth; i++) { + class = lock_chain_get_class(chain, i); + if (!class->key) + continue; + + seq_printf(m, "[%p] ", class->key); + print_name(m, class); + seq_puts(m, "\n"); + } + seq_puts(m, "\n"); + + return 0; +} + +static const struct seq_operations lockdep_chains_ops = { + .start = lc_start, + .next = lc_next, + .stop = lc_stop, + .show = lc_show, +}; + +static int lockdep_chains_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &lockdep_chains_ops); +} + +static const struct file_operations proc_lockdep_chains_operations = { + .open = lockdep_chains_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROVE_LOCKING */ + +static void lockdep_stats_debug_show(struct seq_file *m) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + unsigned long long hi1 = debug_atomic_read(hardirqs_on_events), + hi2 = debug_atomic_read(hardirqs_off_events), + hr1 = debug_atomic_read(redundant_hardirqs_on), + hr2 = debug_atomic_read(redundant_hardirqs_off), + si1 = debug_atomic_read(softirqs_on_events), + si2 = debug_atomic_read(softirqs_off_events), + sr1 = debug_atomic_read(redundant_softirqs_on), + sr2 = debug_atomic_read(redundant_softirqs_off); + + seq_printf(m, " chain lookup misses: %11llu\n", + 
debug_atomic_read(chain_lookup_misses)); + seq_printf(m, " chain lookup hits: %11llu\n", + debug_atomic_read(chain_lookup_hits)); + seq_printf(m, " cyclic checks: %11llu\n", + debug_atomic_read(nr_cyclic_checks)); + seq_printf(m, " find-mask forwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_forwards_checks)); + seq_printf(m, " find-mask backwards checks: %11llu\n", + debug_atomic_read(nr_find_usage_backwards_checks)); + + seq_printf(m, " hardirq on events: %11llu\n", hi1); + seq_printf(m, " hardirq off events: %11llu\n", hi2); + seq_printf(m, " redundant hardirq ons: %11llu\n", hr1); + seq_printf(m, " redundant hardirq offs: %11llu\n", hr2); + seq_printf(m, " softirq on events: %11llu\n", si1); + seq_printf(m, " softirq off events: %11llu\n", si2); + seq_printf(m, " redundant softirq ons: %11llu\n", sr1); + seq_printf(m, " redundant softirq offs: %11llu\n", sr2); +#endif +} + +static int lockdep_stats_show(struct seq_file *m, void *v) +{ + struct lock_class *class; + unsigned long nr_unused = 0, nr_uncategorized = 0, + nr_irq_safe = 0, nr_irq_unsafe = 0, + nr_softirq_safe = 0, nr_softirq_unsafe = 0, + nr_hardirq_safe = 0, nr_hardirq_unsafe = 0, + nr_irq_read_safe = 0, nr_irq_read_unsafe = 0, + nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0, + nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0, + sum_forward_deps = 0; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + + if (class->usage_mask == 0) + nr_unused++; + if (class->usage_mask == LOCKF_USED) + nr_uncategorized++; + if (class->usage_mask & LOCKF_USED_IN_IRQ) + nr_irq_safe++; + if (class->usage_mask & LOCKF_ENABLED_IRQ) + nr_irq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ) + nr_softirq_safe++; + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ) + nr_softirq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ) + nr_hardirq_safe++; + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ) + nr_hardirq_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_IRQ_READ) + 
nr_irq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_IRQ_READ) + nr_irq_read_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) + nr_softirq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_SOFTIRQ_READ) + nr_softirq_read_unsafe++; + if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) + nr_hardirq_read_safe++; + if (class->usage_mask & LOCKF_ENABLED_HARDIRQ_READ) + nr_hardirq_read_unsafe++; + +#ifdef CONFIG_PROVE_LOCKING + sum_forward_deps += lockdep_count_forward_deps(class); +#endif + } +#ifdef CONFIG_DEBUG_LOCKDEP + DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused); +#endif + seq_printf(m, " lock-classes: %11lu [max: %lu]\n", + nr_lock_classes, MAX_LOCKDEP_KEYS); + seq_printf(m, " direct dependencies: %11lu [max: %lu]\n", + nr_list_entries, MAX_LOCKDEP_ENTRIES); + seq_printf(m, " indirect dependencies: %11lu\n", + sum_forward_deps); + + /* + * Total number of dependencies: + * + * All irq-safe locks may nest inside irq-unsafe locks, + * plus all the other known dependencies: + */ + seq_printf(m, " all direct dependencies: %11lu\n", + nr_irq_unsafe * nr_irq_safe + + nr_hardirq_unsafe * nr_hardirq_safe + + nr_list_entries); + +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " dependency chains: %11lu [max: %lu]\n", + nr_lock_chains, MAX_LOCKDEP_CHAINS); + seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", + nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS); +#endif + +#ifdef CONFIG_TRACE_IRQFLAGS + seq_printf(m, " in-hardirq chains: %11u\n", + nr_hardirq_chains); + seq_printf(m, " in-softirq chains: %11u\n", + nr_softirq_chains); +#endif + seq_printf(m, " in-process chains: %11u\n", + nr_process_chains); + seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n", + nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES); + seq_printf(m, " combined max dependencies: %11u\n", + (nr_hardirq_chains + 1) * + (nr_softirq_chains + 1) * + (nr_process_chains + 1) + ); + seq_printf(m, " hardirq-safe locks: %11lu\n", + nr_hardirq_safe); + 
seq_printf(m, " hardirq-unsafe locks: %11lu\n", + nr_hardirq_unsafe); + seq_printf(m, " softirq-safe locks: %11lu\n", + nr_softirq_safe); + seq_printf(m, " softirq-unsafe locks: %11lu\n", + nr_softirq_unsafe); + seq_printf(m, " irq-safe locks: %11lu\n", + nr_irq_safe); + seq_printf(m, " irq-unsafe locks: %11lu\n", + nr_irq_unsafe); + + seq_printf(m, " hardirq-read-safe locks: %11lu\n", + nr_hardirq_read_safe); + seq_printf(m, " hardirq-read-unsafe locks: %11lu\n", + nr_hardirq_read_unsafe); + seq_printf(m, " softirq-read-safe locks: %11lu\n", + nr_softirq_read_safe); + seq_printf(m, " softirq-read-unsafe locks: %11lu\n", + nr_softirq_read_unsafe); + seq_printf(m, " irq-read-safe locks: %11lu\n", + nr_irq_read_safe); + seq_printf(m, " irq-read-unsafe locks: %11lu\n", + nr_irq_read_unsafe); + + seq_printf(m, " uncategorized locks: %11lu\n", + nr_uncategorized); + seq_printf(m, " unused locks: %11lu\n", + nr_unused); + seq_printf(m, " max locking depth: %11u\n", + max_lockdep_depth); +#ifdef CONFIG_PROVE_LOCKING + seq_printf(m, " max bfs queue depth: %11u\n", + max_bfs_queue_depth); +#endif + lockdep_stats_debug_show(m); + seq_printf(m, " debug_locks: %11u\n", + debug_locks); + + return 0; +} + +static int lockdep_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, lockdep_stats_show, NULL); +} + +static const struct file_operations proc_lockdep_stats_operations = { + .open = lockdep_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_LOCK_STAT + +struct lock_stat_data { + struct lock_class *class; + struct lock_class_stats stats; +}; + +struct lock_stat_seq { + struct lock_stat_data *iter_end; + struct lock_stat_data stats[MAX_LOCKDEP_KEYS]; +}; + +/* + * sort on absolute number of contentions + */ +static int lock_stat_cmp(const void *l, const void *r) +{ + const struct lock_stat_data *dl = l, *dr = r; + unsigned long nl, nr; + + nl = dl->stats.read_waittime.nr + 
dl->stats.write_waittime.nr; + nr = dr->stats.read_waittime.nr + dr->stats.write_waittime.nr; + + return (nr > nl) - (nr < nl); /* no int truncation of the difference */ +} + +static void seq_line(struct seq_file *m, char c, int offset, int length) +{ + int i; + + for (i = 0; i < offset; i++) + seq_puts(m, " "); + for (i = 0; i < length; i++) + seq_printf(m, "%c", c); + seq_puts(m, "\n"); +} + +static void snprint_time(char *buf, size_t bufsiz, s64 nr) +{ + s64 div; + s32 rem; + + nr += 5; /* for display rounding */ + div = div_s64_rem(nr, 1000, &rem); + snprintf(buf, bufsiz, "%lld.%02d", (long long)div, (int)rem/10); +} + +static void seq_time(struct seq_file *m, s64 time) +{ + char num[15]; + + snprint_time(num, sizeof(num), time); + seq_printf(m, " %14s", num); +} + +static void seq_lock_time(struct seq_file *m, struct lock_time *lt) +{ + seq_printf(m, "%14lu", lt->nr); + seq_time(m, lt->min); + seq_time(m, lt->max); + seq_time(m, lt->total); +} + +static void seq_stats(struct seq_file *m, struct lock_stat_data *data) +{ + char name[39]; + struct lock_class *class; + struct lock_class_stats *stats; + int i, namelen; + + class = data->class; + stats = &data->stats; + + namelen = 38; + if (class->name_version > 1) + namelen -= 2; /* XXX truncates versions > 9 */ + if (class->subclass) + namelen -= 2; + + if (!class->name) { + char str[KSYM_NAME_LEN]; + const char *key_name; + + key_name = __get_key_name(class->key, str); + snprintf(name, namelen, "%s", key_name); + } else { + snprintf(name, namelen, "%s", class->name); + } + namelen = strlen(name); + if (class->name_version > 1) { + snprintf(name+namelen, 3, "#%d", class->name_version); + namelen += 2; + } + if (class->subclass) { + snprintf(name+namelen, 3, "/%d", class->subclass); + namelen += 2; + } + + if (stats->write_holdtime.nr) { + if (stats->read_holdtime.nr) + seq_printf(m, "%38s-W:", name); + else + seq_printf(m, "%40s:", name); + + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_write]); + seq_lock_time(m, &stats->write_waittime); + seq_printf(m, " 
%14lu ", stats->bounces[bounce_acquired_write]); + seq_lock_time(m, &stats->write_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_holdtime.nr) { + seq_printf(m, "%38s-R:", name); + seq_printf(m, "%14lu ", stats->bounces[bounce_contended_read]); + seq_lock_time(m, &stats->read_waittime); + seq_printf(m, " %14lu ", stats->bounces[bounce_acquired_read]); + seq_lock_time(m, &stats->read_holdtime); + seq_puts(m, "\n"); + } + + if (stats->read_waittime.nr + stats->write_waittime.nr == 0) + return; + + if (stats->read_holdtime.nr) + namelen += 2; + + for (i = 0; i < LOCKSTAT_POINTS; i++) { + char ip[32]; + + if (class->contention_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contention_point[i]); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contention_point[i], + ip, (void *)class->contention_point[i]); + } + for (i = 0; i < LOCKSTAT_POINTS; i++) { + char ip[32]; + + if (class->contending_point[i] == 0) + break; + + if (!i) + seq_line(m, '-', 40-namelen, namelen); + + snprintf(ip, sizeof(ip), "[<%p>]", + (void *)class->contending_point[i]); + seq_printf(m, "%40s %14lu %29s %pS\n", + name, stats->contending_point[i], + ip, (void *)class->contending_point[i]); + } + if (i) { + seq_puts(m, "\n"); + seq_line(m, '.', 0, 40 + 1 + 10 * (14 + 1)); + seq_puts(m, "\n"); + } +} + +static void seq_header(struct seq_file *m) +{ + seq_printf(m, "lock_stat version 0.3\n"); + + if (unlikely(!debug_locks)) + seq_printf(m, "*WARNING* lock debugging disabled!! 
- possibly due to a lockdep warning\n"); + + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "%40s %14s %14s %14s %14s %14s %14s %14s %14s " + "%14s %14s\n", + "class name", + "con-bounces", + "contentions", + "waittime-min", + "waittime-max", + "waittime-total", + "acq-bounces", + "acquisitions", + "holdtime-min", + "holdtime-max", + "holdtime-total"); + seq_line(m, '-', 0, 40 + 1 + 10 * (14 + 1)); + seq_printf(m, "\n"); +} + +static void *ls_start(struct seq_file *m, loff_t *pos) +{ + struct lock_stat_seq *data = m->private; + struct lock_stat_data *iter; + + if (*pos == 0) + return SEQ_START_TOKEN; + + iter = data->stats + (*pos - 1); + if (iter >= data->iter_end) + iter = NULL; + + return iter; +} + +static void *ls_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return ls_start(m, pos); +} + +static void ls_stop(struct seq_file *m, void *v) +{ +} + +static int ls_show(struct seq_file *m, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_header(m); + else + seq_stats(m, v); + + return 0; +} + +static const struct seq_operations lockstat_ops = { + .start = ls_start, + .next = ls_next, + .stop = ls_stop, + .show = ls_show, +}; + +static int lock_stat_open(struct inode *inode, struct file *file) +{ + int res; + struct lock_class *class; + struct lock_stat_seq *data = vmalloc(sizeof(struct lock_stat_seq)); + + if (!data) + return -ENOMEM; + + res = seq_open(file, &lockstat_ops); + if (!res) { + struct lock_stat_data *iter = data->stats; + struct seq_file *m = file->private_data; + + list_for_each_entry(class, &all_lock_classes, lock_entry) { + iter->class = class; + iter->stats = lock_stats(class); + iter++; + } + data->iter_end = iter; + + sort(data->stats, data->iter_end - data->stats, + sizeof(struct lock_stat_data), + lock_stat_cmp, NULL); + + m->private = data; + } else + vfree(data); + + return res; +} + +static ssize_t lock_stat_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct lock_class 
*class; + char c; + + if (count) { + if (get_user(c, buf)) + return -EFAULT; + + if (c != '0') + return count; + + list_for_each_entry(class, &all_lock_classes, lock_entry) + clear_lock_stats(class); + } + return count; +} + +static int lock_stat_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + + vfree(seq->private); + return seq_release(inode, file); +} + +static const struct file_operations proc_lock_stat_operations = { + .open = lock_stat_open, + .write = lock_stat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = lock_stat_release, +}; +#endif /* CONFIG_LOCK_STAT */ + +static int __init lockdep_proc_init(void) +{ + proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); +#ifdef CONFIG_PROVE_LOCKING + proc_create("lockdep_chains", S_IRUSR, NULL, + &proc_lockdep_chains_operations); +#endif + proc_create("lockdep_stats", S_IRUSR, NULL, + &proc_lockdep_stats_operations); + +#ifdef CONFIG_LOCK_STAT + proc_create("lock_stat", S_IRUSR | S_IWUSR, NULL, + &proc_lock_stat_operations); +#endif + + return 0; +} + +__initcall(lockdep_proc_init); + diff --git a/kernel/lockdep_states.h b/kernel/lockdep_states.h new file mode 100644 index 00000000..995b0cc2 --- /dev/null +++ b/kernel/lockdep_states.h @@ -0,0 +1,9 @@ +/* + * Lockdep states, + * + * please update XXX_LOCK_USAGE_STATES in include/linux/lockdep.h whenever + * you add one, or come up with a nice dynamic solution. + */ +LOCKDEP_STATE(HARDIRQ) +LOCKDEP_STATE(SOFTIRQ) +LOCKDEP_STATE(RECLAIM_FS) diff --git a/kernel/module.c b/kernel/module.c new file mode 100644 index 00000000..b9d0667e --- /dev/null +++ b/kernel/module.c @@ -0,0 +1,3469 @@ +/* + Copyright (C) 2002 Richard Henderson + Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. 
+ + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt , a...) +#endif + +#ifndef ARCH_SHF_SMALL +#define ARCH_SHF_SMALL 0 +#endif + +/* + * Modules' sections will be aligned on page boundaries + * to ensure complete separation of code and data, but + * only when CONFIG_DEBUG_SET_MODULE_RONX=y + */ +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +# define debug_align(X) ALIGN(X, PAGE_SIZE) +#else +# define debug_align(X) (X) +#endif + +/* + * Given BASE and SIZE this macro calculates the number of pages the + * memory regions occupies + */ +#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? 
\ + (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ + PFN_DOWN((unsigned long)(BASE)) + 1) \ + : (0UL)) + +/* If this is set, the section belongs in the init part of the module */ +#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) + +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. + * (delete uses stop_machine/add uses RCU list operations). */ +DEFINE_MUTEX(module_mutex); +EXPORT_SYMBOL_GPL(module_mutex); +static LIST_HEAD(modules); +#ifdef CONFIG_KGDB_KDB +struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */ +#endif /* CONFIG_KGDB_KDB */ + + +/* Block module loading/unloading? */ +int modules_disabled = 0; + +/* Waiting for a module to finish initializing? */ +static DECLARE_WAIT_QUEUE_HEAD(module_wq); + +static BLOCKING_NOTIFIER_HEAD(module_notify_list); + +/* Bounds of module allocation, for speeding __module_address. + * Protected by module_mutex. */ +static unsigned long module_addr_min = -1UL, module_addr_max = 0; + +int register_module_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_register(&module_notify_list, nb); +} +EXPORT_SYMBOL(register_module_notifier); + +int unregister_module_notifier(struct notifier_block * nb) +{ + return blocking_notifier_chain_unregister(&module_notify_list, nb); +} +EXPORT_SYMBOL(unregister_module_notifier); + +struct load_info { + Elf_Ehdr *hdr; + unsigned long len; + Elf_Shdr *sechdrs; + char *secstrings, *strtab; + unsigned long *strmap; + unsigned long symoffs, stroffs; + struct _ddebug *debug; + unsigned int num_debug; + struct { + unsigned int sym, str, mod, vers, info, pcpu; + } index; +}; + +/* We require a truly strong try_module_get(): returns 0 on success, -EBUSY + while the module is still initializing, -ENOENT if the reference fails. 
*/ +static inline int strong_try_module_get(struct module *mod) +{ + if (mod && mod->state == MODULE_STATE_COMING) + return -EBUSY; + if (try_module_get(mod)) + return 0; + else + return -ENOENT; +} + +static inline void add_taint_module(struct module *mod, unsigned flag) +{ + add_taint(flag); + mod->taints |= (1U << flag); +} + +/* + * A thread that wants to hold a reference to a module only while it + * is running can call this to safely exit. nfsd and lockd use this. + */ +void __module_put_and_exit(struct module *mod, long code) +{ + module_put(mod); + do_exit(code); +} +EXPORT_SYMBOL(__module_put_and_exit); + +/* Find a module section: 0 means not found. */ +static unsigned int find_sec(const struct load_info *info, const char *name) +{ + unsigned int i; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + /* Alloc bit cleared means "ignore it." */ + if ((shdr->sh_flags & SHF_ALLOC) + && strcmp(info->secstrings + shdr->sh_name, name) == 0) + return i; + } + return 0; +} + +/* Find a module section, or NULL. */ +static void *section_addr(const struct load_info *info, const char *name) +{ + /* Section 0 has sh_addr 0. */ + return (void *)info->sechdrs[find_sec(info, name)].sh_addr; +} + +/* Find a module section, or NULL. Fill in number of "objects" in section. */ +static void *section_objs(const struct load_info *info, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_sec(info, name); + + /* Section 0 has sh_addr 0 and sh_size 0. 
*/ + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; +} + +/* Provided by the linker */ +extern const struct kernel_symbol __start___ksymtab[]; +extern const struct kernel_symbol __stop___ksymtab[]; +extern const struct kernel_symbol __start___ksymtab_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_gpl[]; +extern const struct kernel_symbol __start___ksymtab_gpl_future[]; +extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; +extern const unsigned long __start___kcrctab[]; +extern const unsigned long __start___kcrctab_gpl[]; +extern const unsigned long __start___kcrctab_gpl_future[]; +#ifdef CONFIG_UNUSED_SYMBOLS +extern const struct kernel_symbol __start___ksymtab_unused[]; +extern const struct kernel_symbol __stop___ksymtab_unused[]; +extern const struct kernel_symbol __start___ksymtab_unused_gpl[]; +extern const struct kernel_symbol __stop___ksymtab_unused_gpl[]; +extern const unsigned long __start___kcrctab_unused[]; +extern const unsigned long __start___kcrctab_unused_gpl[]; +#endif + +#ifndef CONFIG_MODVERSIONS +#define symversion(base, idx) NULL +#else +#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) +#endif + +static bool each_symbol_in_section(const struct symsearch *arr, + unsigned int arrsize, + struct module *owner, + bool (*fn)(const struct symsearch *syms, + struct module *owner, + void *data), + void *data) +{ + unsigned int j; + + for (j = 0; j < arrsize; j++) { + if (fn(&arr[j], owner, data)) + return true; + } + + return false; +} + +/* Returns true as soon as fn returns true, otherwise false. 
*/ +bool each_symbol_section(bool (*fn)(const struct symsearch *arr, + struct module *owner, + void *data), + void *data) +{ + struct module *mod; + static const struct symsearch arr[] = { + { __start___ksymtab, __stop___ksymtab, __start___kcrctab, + NOT_GPL_ONLY, false }, + { __start___ksymtab_gpl, __stop___ksymtab_gpl, + __start___kcrctab_gpl, + GPL_ONLY, false }, + { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, + __start___kcrctab_gpl_future, + WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS + { __start___ksymtab_unused, __stop___ksymtab_unused, + __start___kcrctab_unused, + NOT_GPL_ONLY, true }, + { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, + __start___kcrctab_unused_gpl, + GPL_ONLY, true }, +#endif + }; + + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data)) + return true; + + list_for_each_entry_rcu(mod, &modules, list) { + struct symsearch arr[] = { + { mod->syms, mod->syms + mod->num_syms, mod->crcs, + NOT_GPL_ONLY, false }, + { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, + mod->gpl_crcs, + GPL_ONLY, false }, + { mod->gpl_future_syms, + mod->gpl_future_syms + mod->num_gpl_future_syms, + mod->gpl_future_crcs, + WILL_BE_GPL_ONLY, false }, +#ifdef CONFIG_UNUSED_SYMBOLS + { mod->unused_syms, + mod->unused_syms + mod->num_unused_syms, + mod->unused_crcs, + NOT_GPL_ONLY, true }, + { mod->unused_gpl_syms, + mod->unused_gpl_syms + mod->num_unused_gpl_syms, + mod->unused_gpl_crcs, + GPL_ONLY, true }, +#endif + }; + + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(each_symbol_section); + +struct find_symbol_arg { + /* Input */ + const char *name; + bool gplok; + bool warn; + + /* Output */ + struct module *owner; + const unsigned long *crc; + const struct kernel_symbol *sym; +}; + +static bool check_symbol(const struct symsearch *syms, + struct module *owner, + unsigned int symnum, void *data) +{ + struct find_symbol_arg *fsa = data; 
+ + if (!fsa->gplok) { + if (syms->licence == GPL_ONLY) + return false; + if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) { + printk(KERN_WARNING "Symbol %s is being used " + "by a non-GPL module, which will not " + "be allowed in the future\n", fsa->name); + printk(KERN_WARNING "Please see the file " + "Documentation/feature-removal-schedule.txt " + "in the kernel source tree for more details.\n"); + } + } + +#ifdef CONFIG_UNUSED_SYMBOLS + if (syms->unused && fsa->warn) { + printk(KERN_WARNING "Symbol %s is marked as UNUSED, " + "however this module is using it.\n", fsa->name); + printk(KERN_WARNING + "This symbol will go away in the future.\n"); + printk(KERN_WARNING + "Please evalute if this is the right api to use and if " + "it really is, submit a report the linux kernel " + "mailinglist together with submitting your code for " + "inclusion.\n"); + } +#endif + + fsa->owner = owner; + fsa->crc = symversion(syms->crcs, symnum); + fsa->sym = &syms->start[symnum]; + return true; +} + +static int cmp_name(const void *va, const void *vb) +{ + const char *a; + const struct kernel_symbol *b; + a = va; b = vb; + return strcmp(a, b->name); +} + +static bool find_symbol_in_section(const struct symsearch *syms, + struct module *owner, + void *data) +{ + struct find_symbol_arg *fsa = data; + struct kernel_symbol *sym; + + sym = bsearch(fsa->name, syms->start, syms->stop - syms->start, + sizeof(struct kernel_symbol), cmp_name); + + if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data)) + return true; + + return false; +} + +/* Find a symbol and return it, along with, (optional) crc and + * (optional) module which owns it. Needs preempt disabled or module_mutex. 
*/ +const struct kernel_symbol *find_symbol(const char *name, + struct module **owner, + const unsigned long **crc, + bool gplok, + bool warn) +{ + struct find_symbol_arg fsa; + + fsa.name = name; + fsa.gplok = gplok; + fsa.warn = warn; + + if (each_symbol_section(find_symbol_in_section, &fsa)) { + if (owner) + *owner = fsa.owner; + if (crc) + *crc = fsa.crc; + return fsa.sym; + } + + DEBUGP("Failed to find symbol %s\n", name); + return NULL; +} +EXPORT_SYMBOL_GPL(find_symbol); + +/* Search for module by name: must hold module_mutex. */ +struct module *find_module(const char *name) +{ + struct module *mod; + + list_for_each_entry(mod, &modules, list) { + if (strcmp(mod->name, name) == 0) + return mod; + } + return NULL; +} +EXPORT_SYMBOL_GPL(find_module); + +#ifdef CONFIG_SMP + +static inline void __percpu *mod_percpu(struct module *mod) +{ + return mod->percpu; +} + +static int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + mod->name, align, PAGE_SIZE); + align = PAGE_SIZE; + } + + mod->percpu = __alloc_reserved_percpu(size, align); + if (!mod->percpu) { + printk(KERN_WARNING + "%s: Could not allocate %lu bytes percpu data\n", + mod->name, size); + return -ENOMEM; + } + mod->percpu_size = size; + return 0; +} + +static void percpu_modfree(struct module *mod) +{ + free_percpu(mod->percpu); +} + +static unsigned int find_pcpusec(struct load_info *info) +{ + return find_sec(info, ".data..percpu"); +} + +static void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); +} + +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. 
+ * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + struct module *mod; + unsigned int cpu; + + preempt_disable(); + + list_for_each_entry_rcu(mod, &modules, list) { + if (!mod->percpu_size) + continue; + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(mod->percpu, cpu); + + if ((void *)addr >= start && + (void *)addr < start + mod->percpu_size) { + preempt_enable(); + return true; + } + } + } + + preempt_enable(); + return false; +} + +#else /* ... !CONFIG_SMP */ + +static inline void __percpu *mod_percpu(struct module *mod) +{ + return NULL; +} +static inline int percpu_modalloc(struct module *mod, + unsigned long size, unsigned long align) +{ + return -ENOMEM; +} +static inline void percpu_modfree(struct module *mod) +{ +} +static unsigned int find_pcpusec(struct load_info *info) +{ + return 0; +} +static inline void percpu_modcopy(struct module *mod, + const void *from, unsigned long size) +{ + /* pcpusec should be 0, and size of that section should be 0. 
*/ + BUG_ON(size != 0); +} +bool is_module_percpu_address(unsigned long addr) +{ + return false; +} + +#endif /* CONFIG_SMP */ + +#define MODINFO_ATTR(field) \ +static void setup_modinfo_##field(struct module *mod, const char *s) \ +{ \ + mod->field = kstrdup(s, GFP_KERNEL); \ +} \ +static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ + struct module *mod, char *buffer) \ +{ \ + return sprintf(buffer, "%s\n", mod->field); \ +} \ +static int modinfo_##field##_exists(struct module *mod) \ +{ \ + return mod->field != NULL; \ +} \ +static void free_modinfo_##field(struct module *mod) \ +{ \ + kfree(mod->field); \ + mod->field = NULL; \ +} \ +static struct module_attribute modinfo_##field = { \ + .attr = { .name = __stringify(field), .mode = 0444 }, \ + .show = show_modinfo_##field, \ + .setup = setup_modinfo_##field, \ + .test = modinfo_##field##_exists, \ + .free = free_modinfo_##field, \ +}; + +MODINFO_ATTR(version); +MODINFO_ATTR(srcversion); + +static char last_unloaded_module[MODULE_NAME_LEN+1]; + +#ifdef CONFIG_MODULE_UNLOAD + +EXPORT_TRACEPOINT_SYMBOL(module_get); + +/* Init the unload section of the module. */ +static int module_unload_init(struct module *mod) +{ + mod->refptr = alloc_percpu(struct module_ref); + if (!mod->refptr) + return -ENOMEM; + + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); + + /* Hold reference count during initialization. */ + __this_cpu_write(mod->refptr->incs, 1); + /* Backwards compatibility macros put refcount during init. */ + mod->waiter = current; + + return 0; +} + +/* Does a already use b? 
*/ +static int already_uses(struct module *a, struct module *b) +{ + struct module_use *use; + + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { + DEBUGP("%s uses %s!\n", a->name, b->name); + return 1; + } + } + DEBUGP("%s does not use %s!\n", a->name, b->name); + return 0; +} + +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). + */ +static int add_module_usage(struct module *a, struct module *b) +{ + struct module_use *use; + + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk(KERN_WARNING "%s: out of memory loading\n", a->name); + return -ENOMEM; + } + + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + return 0; +} + +/* Module a uses b: caller needs module_mutex() */ +int ref_module(struct module *a, struct module *b) +{ + int err; + + if (b == NULL || already_uses(a, b)) + return 0; + + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); + if (err) + return err; + + err = add_module_usage(a, b); + if (err) { + module_put(b); + return err; + } + return 0; +} +EXPORT_SYMBOL_GPL(ref_module); + +/* Clear the unload stuff of the module. 
*/ +static void module_unload_free(struct module *mod) +{ + struct module_use *use, *tmp; + + mutex_lock(&module_mutex); + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); + } + mutex_unlock(&module_mutex); + + free_percpu(mod->refptr); +} + +#ifdef CONFIG_MODULE_FORCE_UNLOAD +static inline int try_force_unload(unsigned int flags) +{ + int ret = (flags & O_TRUNC); + if (ret) + add_taint(TAINT_FORCED_RMMOD); + return ret; +} +#else +static inline int try_force_unload(unsigned int flags) +{ + return 0; +} +#endif /* CONFIG_MODULE_FORCE_UNLOAD */ + +struct stopref +{ + struct module *mod; + int flags; + int *forced; +}; + +/* Whole machine is stopped with interrupts off when this runs. */ +static int __try_stop_module(void *_sref) +{ + struct stopref *sref = _sref; + + /* If it's not unused, quit unless we're forcing. */ + if (module_refcount(sref->mod) != 0) { + if (!(*sref->forced = try_force_unload(sref->flags))) + return -EWOULDBLOCK; + } + + /* Mark it as dying. */ + sref->mod->state = MODULE_STATE_GOING; + return 0; +} + +static int try_stop_module(struct module *mod, int flags, int *forced) +{ + if (flags & O_NONBLOCK) { + struct stopref sref = { mod, flags, forced }; + + return stop_machine(__try_stop_module, &sref, NULL); + } else { + /* We don't need to stop the machine for this. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + return 0; + } +} + +unsigned int module_refcount(struct module *mod) +{ + unsigned int incs = 0, decs = 0; + int cpu; + + for_each_possible_cpu(cpu) + decs += per_cpu_ptr(mod->refptr, cpu)->decs; + /* + * ensure the incs are added up after the decs. + * module_put ensures incs are visible before decs with smp_wmb. 
+ * + * This 2-count scheme avoids the situation where the refcount + * for CPU0 is read, then CPU0 increments the module refcount, + * then CPU1 drops that refcount, then the refcount for CPU1 is + * read. We would record a decrement but not its corresponding + * increment so we would see a low count (disaster). + * + * Rare situation? But module_refcount can be preempted, and we + * might be tallying up 4096+ CPUs. So it is not impossible. + */ + smp_rmb(); + for_each_possible_cpu(cpu) + incs += per_cpu_ptr(mod->refptr, cpu)->incs; + return incs - decs; +} +EXPORT_SYMBOL(module_refcount); + +/* This exists whether we can unload or not */ +static void free_module(struct module *mod); + +static void wait_for_zero_refcount(struct module *mod) +{ + /* Since we might sleep for some time, release the mutex first */ + mutex_unlock(&module_mutex); + for (;;) { + DEBUGP("Looking at refcount...\n"); + set_current_state(TASK_UNINTERRUPTIBLE); + if (module_refcount(mod) == 0) + break; + schedule(); + } + current->state = TASK_RUNNING; + mutex_lock(&module_mutex); +} + +SYSCALL_DEFINE2(delete_module, const char __user *, name_user, + unsigned int, flags) +{ + struct module *mod; + char name[MODULE_NAME_LEN]; + int ret, forced = 0; + + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) + return -EFAULT; + name[MODULE_NAME_LEN-1] = '\0'; + + if (mutex_lock_interruptible(&module_mutex) != 0) + return -EINTR; + + mod = find_module(name); + if (!mod) { + ret = -ENOENT; + goto out; + } + + if (!list_empty(&mod->source_list)) { + /* Other modules depend on us: get rid of them first. */ + ret = -EWOULDBLOCK; + goto out; + } + + /* Doing init or already dying? 
*/ + if (mod->state != MODULE_STATE_LIVE) { + /* FIXME: if (force), slam module count and wake up + waiter --RR */ + DEBUGP("%s already dying\n", mod->name); + ret = -EBUSY; + goto out; + } + + /* If it has an init func, it must have an exit func to unload */ + if (mod->init && !mod->exit) { + forced = try_force_unload(flags); + if (!forced) { + /* This module can't be removed */ + ret = -EBUSY; + goto out; + } + } + + /* Set this up before setting mod->state */ + mod->waiter = current; + + /* Stop the machine so refcounts can't move and disable module. */ + ret = try_stop_module(mod, flags, &forced); + if (ret != 0) + goto out; + + /* Never wait if forced. */ + if (!forced && module_refcount(mod) != 0) + wait_for_zero_refcount(mod); + + mutex_unlock(&module_mutex); + /* Final destruction now no one is using it. */ + if (mod->exit != NULL) + mod->exit(); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + async_synchronize_full(); + + /* Store the name of the last unloaded module for diagnostic purposes */ + strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); + + free_module(mod); + return 0; +out: + mutex_unlock(&module_mutex); + return ret; +} + +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + struct module_use *use; + int printed_something = 0; + + seq_printf(m, " %u ", module_refcount(mod)); + + /* Always include a trailing , so userspace can differentiate + between this and the old multi-field proc format. 
*/ + list_for_each_entry(use, &mod->source_list, source_list) { + printed_something = 1; + seq_printf(m, "%s,", use->source->name); + } + + if (mod->init != NULL && mod->exit == NULL) { + printed_something = 1; + seq_printf(m, "[permanent],"); + } + + if (!printed_something) + seq_printf(m, "-"); +} + +void __symbol_put(const char *symbol) +{ + struct module *owner; + + preempt_disable(); + if (!find_symbol(symbol, &owner, NULL, true, false)) + BUG(); + module_put(owner); + preempt_enable(); +} +EXPORT_SYMBOL(__symbol_put); + +/* Note this assumes addr is a function, which it currently always is. */ +void symbol_put_addr(void *addr) +{ + struct module *modaddr; + unsigned long a = (unsigned long)dereference_function_descriptor(addr); + + if (core_kernel_text(a)) + return; + + /* module_text_address is safe here: we're supposed to have reference + * to module from symbol_get, so it can't go away. */ + modaddr = __module_text_address(a); + BUG_ON(!modaddr); + module_put(modaddr); +} +EXPORT_SYMBOL_GPL(symbol_put_addr); + +static ssize_t show_refcnt(struct module_attribute *mattr, + struct module *mod, char *buffer) +{ + return sprintf(buffer, "%u\n", module_refcount(mod)); +} + +static struct module_attribute refcnt = { + .attr = { .name = "refcnt", .mode = 0444 }, + .show = show_refcnt, +}; + +void module_put(struct module *module) +{ + if (module) { + preempt_disable(); + smp_wmb(); /* see comment in module_refcount */ + __this_cpu_inc(module->refptr->decs); + + trace_module_put(module, _RET_IP_); + /* Maybe they're waiting for us to drop reference? */ + if (unlikely(!module_is_live(module))) + wake_up_process(module->waiter); + preempt_enable(); + } +} +EXPORT_SYMBOL(module_put); + +#else /* !CONFIG_MODULE_UNLOAD */ +static inline void print_unload_info(struct seq_file *m, struct module *mod) +{ + /* We don't know the usage count, or what modules are using. 
*/ + seq_printf(m, " - -"); +} + +static inline void module_unload_free(struct module *mod) +{ +} + +int ref_module(struct module *a, struct module *b) +{ + return strong_try_module_get(b); +} +EXPORT_SYMBOL_GPL(ref_module); + +static inline int module_unload_init(struct module *mod) +{ + return 0; +} +#endif /* CONFIG_MODULE_UNLOAD */ + +static ssize_t show_initstate(struct module_attribute *mattr, + struct module *mod, char *buffer) +{ + const char *state = "unknown"; + + switch (mod->state) { + case MODULE_STATE_LIVE: + state = "live"; + break; + case MODULE_STATE_COMING: + state = "coming"; + break; + case MODULE_STATE_GOING: + state = "going"; + break; + } + return sprintf(buffer, "%s\n", state); +} + +static struct module_attribute initstate = { + .attr = { .name = "initstate", .mode = 0444 }, + .show = show_initstate, +}; + +static struct module_attribute *modinfo_attrs[] = { + &modinfo_version, + &modinfo_srcversion, + &initstate, +#ifdef CONFIG_MODULE_UNLOAD + &refcnt, +#endif + NULL, +}; + +static const char vermagic[] = VERMAGIC_STRING; + +static int try_to_force_load(struct module *mod, const char *reason) +{ +#ifdef CONFIG_MODULE_FORCE_LOAD + if (!test_taint(TAINT_FORCED_MODULE)) + printk(KERN_WARNING "%s: %s: kernel tainted.\n", + mod->name, reason); + add_taint_module(mod, TAINT_FORCED_MODULE); + return 0; +#else + return -ENOEXEC; +#endif +} + +#ifdef CONFIG_MODVERSIONS +/* If the arch applies (non-zero) relocations to kernel kcrctab, unapply it. 
*/ +static unsigned long maybe_relocated(unsigned long crc, + const struct module *crc_owner) +{ +#ifdef ARCH_RELOCATES_KCRCTAB + if (crc_owner == NULL) + return crc - (unsigned long)reloc_start; +#endif + return crc; +} + +static int check_version(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *symname, + struct module *mod, + const unsigned long *crc, + const struct module *crc_owner) +{ + unsigned int i, num_versions; + struct modversion_info *versions; + + /* Exporting module didn't supply crcs? OK, we're already tainted. */ + if (!crc) + return 1; + + /* No versions at all? modprobe --force does this. */ + if (versindex == 0) + return try_to_force_load(mod, symname) == 0; + + versions = (void *) sechdrs[versindex].sh_addr; + num_versions = sechdrs[versindex].sh_size + / sizeof(struct modversion_info); + + for (i = 0; i < num_versions; i++) { + if (strcmp(versions[i].name, symname) != 0) + continue; + + if (versions[i].crc == maybe_relocated(*crc, crc_owner)) + return 1; + DEBUGP("Found checksum %lX vs module %lX\n", + maybe_relocated(*crc, crc_owner), versions[i].crc); + goto bad_version; + } + + printk(KERN_WARNING "%s: no symbol version for %s\n", + mod->name, symname); + return 0; + +bad_version: + printk("%s: disagrees about version of symbol %s\n", + mod->name, symname); + return 0; +} + +static inline int check_modstruct_version(Elf_Shdr *sechdrs, + unsigned int versindex, + struct module *mod) +{ + const unsigned long *crc; + + /* Since this should be found in kernel (which can't be removed), + * no locking is necessary. */ + if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, + &crc, true, false)) + BUG(); + return check_version(sechdrs, versindex, "module_layout", mod, crc, + NULL); +} + +/* First part is kernel version, which we ignore if module has crcs. 
*/
+static inline int same_magic(const char *amagic, const char *bmagic,
+			     bool has_crcs)
+{
+	if (has_crcs) {
+		/* Skip everything up to the first space in both strings. */
+		amagic += strcspn(amagic, " ");
+		bmagic += strcspn(bmagic, " ");
+	}
+	return strcmp(amagic, bmagic) == 0;
+}
+#else
+/* Without CONFIG_MODVERSIONS every version check succeeds. */
+static inline int check_version(Elf_Shdr *sechdrs,
+				unsigned int versindex,
+				const char *symname,
+				struct module *mod,
+				const unsigned long *crc,
+				const struct module *crc_owner)
+{
+	return 1;
+}
+
+static inline int check_modstruct_version(Elf_Shdr *sechdrs,
+					  unsigned int versindex,
+					  struct module *mod)
+{
+	return 1;
+}
+
+static inline int same_magic(const char *amagic, const char *bmagic,
+			     bool has_crcs)
+{
+	return strcmp(amagic, bmagic) == 0;
+}
+#endif /* CONFIG_MODVERSIONS */
+
+/*
+ * Resolve a symbol for this module. I.e. if we find one, record usage.
+ *
+ * Returns the symbol, NULL when @name is not exported anywhere, or an
+ * ERR_PTR(): -EINVAL on a version mismatch, or the error from ref_module().
+ * Whenever the symbol was found, the owner's name is copied to @ownername.
+ */
+static const struct kernel_symbol *resolve_symbol(struct module *mod,
+						  const struct load_info *info,
+						  const char *name,
+						  char ownername[])
+{
+	struct module *owner;
+	const struct kernel_symbol *sym;
+	const unsigned long *crc;
+	int err;
+
+	mutex_lock(&module_mutex);
+	sym = find_symbol(name, &owner, &crc,
+			  !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
+	if (!sym)
+		goto unlock;
+
+	if (!check_version(info->sechdrs, info->index.vers, name, mod, crc,
+			   owner)) {
+		sym = ERR_PTR(-EINVAL);
+		goto getname;
+	}
+
+	err = ref_module(mod, owner);
+	if (err) {
+		sym = ERR_PTR(err);
+		goto getname;
+	}
+
+getname:
+	/* We must make copy under the lock if we failed to get ref. */
+	strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
+unlock:
+	mutex_unlock(&module_mutex);
+	return sym;
+}
+
+/*
+ * Retry resolve_symbol() for as long as it reports -EBUSY, waiting on
+ * module_wq for at most 30 seconds; a warning is printed if the wait times
+ * out or is interrupted.  Returns the last resolve_symbol() result.
+ */
+static const struct kernel_symbol *
+resolve_symbol_wait(struct module *mod,
+		    const struct load_info *info,
+		    const char *name)
+{
+	const struct kernel_symbol *ksym;
+	char owner[MODULE_NAME_LEN];
+
+	if (wait_event_interruptible_timeout(module_wq,
+			!IS_ERR(ksym = resolve_symbol(mod, info, name, owner))
+			|| PTR_ERR(ksym) != -EBUSY,
+					     30 * HZ) <= 0) {
+		printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
+		       mod->name, owner);
+	}
+	return ksym;
+}
+
+/*
+ * /sys/module/foo/sections stuff
+ * J. Corbet
+ */
+#ifdef CONFIG_SYSFS
+
+#ifdef CONFIG_KALLSYMS
+/* A section is skipped when it is not SHF_ALLOC or has zero size. */
+static inline bool sect_empty(const Elf_Shdr *sect)
+{
+	return !(sect->sh_flags & SHF_ALLOC) || sect->sh_size == 0;
+}
+
+/* One sysfs file under "sections": section name and load address. */
+struct module_sect_attr
+{
+	struct module_attribute mattr;
+	char *name;
+	unsigned long address;
+};
+
+/* The "sections" attribute group plus the entries backing it. */
+struct module_sect_attrs
+{
+	struct attribute_group grp;
+	unsigned int nsections;
+	struct module_sect_attr attrs[0];
+};
+
+/* show() for a section file: prints the section address via %pK. */
+static ssize_t module_sect_show(struct module_attribute *mattr,
+				struct module *mod, char *buf)
+{
+	struct module_sect_attr *sattr =
+		container_of(mattr, struct module_sect_attr, mattr);
+	return sprintf(buf, "0x%pK\n", (void *)sattr->address);
+}
+
+/* Free the duplicated section names, then the container itself. */
+static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
+{
+	unsigned int section;
+
+	for (section = 0; section < sect_attrs->nsections; section++)
+		kfree(sect_attrs->attrs[section].name);
+	kfree(sect_attrs);
+}
+
+/*
+ * Create the "sections" group for @mod with one file per non-empty section.
+ * Failures are silent: mod->sect_attrs is simply left unset.
+ */
+static void add_sect_attrs(struct module *mod, const struct load_info *info)
+{
+	unsigned int nloaded = 0, i, size[2];
+	struct module_sect_attrs *sect_attrs;
+	struct module_sect_attr *sattr;
+	struct attribute **gattr;
+
+	/* Count loaded sections and allocate structures */
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		if (!sect_empty(&info->sechdrs[i]))
+			nloaded++;
+	/* size[0]: header plus entries; size[1]: NULL-terminated pointer array */
+	size[0] = ALIGN(sizeof(*sect_attrs)
+			+ nloaded * sizeof(sect_attrs->attrs[0]),
+			sizeof(sect_attrs->grp.attrs[0]));
+	size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
+	sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
+	if (sect_attrs == NULL)
+		return;
+
+	/* Setup section attributes. */
+	sect_attrs->grp.name = "sections";
+	sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
+
+	sect_attrs->nsections = 0;
+	sattr = &sect_attrs->attrs[0];
+	gattr = &sect_attrs->grp.attrs[0];
+	for (i = 0; i < info->hdr->e_shnum; i++) {
+		Elf_Shdr *sec = &info->sechdrs[i];
+		if (sect_empty(sec))
+			continue;
+		sattr->address = sec->sh_addr;
+		sattr->name = kstrdup(info->secstrings + sec->sh_name,
+					GFP_KERNEL);
+		if (sattr->name == NULL)
+			goto out;
+		/* Counted only once the name exists, so cleanup frees it. */
+		sect_attrs->nsections++;
+		sysfs_attr_init(&sattr->mattr.attr);
+		sattr->mattr.show = module_sect_show;
+		sattr->mattr.store = NULL;
+		sattr->mattr.attr.name = sattr->name;
+		sattr->mattr.attr.mode = S_IRUGO;
+		*(gattr++) = &(sattr++)->mattr.attr;
+	}
+	*gattr = NULL;
+
+	if (sysfs_create_group(&mod->mkobj.kobj, &sect_attrs->grp))
+		goto out;
+
+	mod->sect_attrs = sect_attrs;
+	return;
+  out:
+	free_sect_attrs(sect_attrs);
+}
+
+/* Remove and free the "sections" group, if one was created. */
+static void remove_sect_attrs(struct module *mod)
+{
+	if (mod->sect_attrs) {
+		sysfs_remove_group(&mod->mkobj.kobj,
+				   &mod->sect_attrs->grp);
+		/* We are positive that no one is using any sect attrs
+		 * at this point.  Deallocate immediately. */
+		free_sect_attrs(mod->sect_attrs);
+		mod->sect_attrs = NULL;
+	}
+}
+
+/*
+ * /sys/module/foo/notes/.section.name gives contents of SHT_NOTE sections.
+ */
+
+struct module_notes_attrs {
+	struct kobject *dir;
+	unsigned int notes;
+	struct bin_attribute attrs[0];
+};
+
+/* read() for a notes file: copies straight out of the loaded section. */
+static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
+				 struct bin_attribute *bin_attr,
+				 char *buf, loff_t pos, size_t count)
+{
+	/*
+	 * The caller checked the pos and count against our size.
+	 */
+	memcpy(buf, bin_attr->private + pos, count);
+	return count;
+}
+
+/*
+ * Remove the first @i bin files (when the "notes" directory exists), drop
+ * the directory and free @notes_attrs.
+ */
+static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
+			     unsigned int i)
+{
+	if (notes_attrs->dir) {
+		while (i-- > 0)
+			sysfs_remove_bin_file(notes_attrs->dir,
+					      &notes_attrs->attrs[i]);
+		kobject_put(notes_attrs->dir);
+	}
+	kfree(notes_attrs);
+}
+
+/*
+ * Create /sys/module/<mod>/notes with one binary file per non-empty
+ * SHT_NOTE section.  Failures are silent: mod->notes_attrs stays unset.
+ */
+static void add_notes_attrs(struct module *mod, const struct load_info *info)
+{
+	unsigned int notes, loaded, i;
+	struct module_notes_attrs *notes_attrs;
+	struct bin_attribute *nattr;
+
+	/* failed to create section attributes, so can't create notes */
+	if (!mod->sect_attrs)
+		return;
+
+	/* Count notes sections and allocate structures.  */
+	notes = 0;
+	for (i = 0; i < info->hdr->e_shnum; i++)
+		if (!sect_empty(&info->sechdrs[i]) &&
+		    (info->sechdrs[i].sh_type == SHT_NOTE))
+			++notes;
+
+	if (notes == 0)
+		return;
+
+	notes_attrs = kzalloc(sizeof(*notes_attrs)
+			      + notes * sizeof(notes_attrs->attrs[0]),
+			      GFP_KERNEL);
+	if (notes_attrs == NULL)
+		return;
+
+	notes_attrs->notes = notes;
+	nattr = &notes_attrs->attrs[0];
+	/* 'loaded' indexes mod->sect_attrs->attrs[], which skips empty sections */
+	for (loaded = i = 0; i < info->hdr->e_shnum; ++i) {
+		if (sect_empty(&info->sechdrs[i]))
+			continue;
+		if (info->sechdrs[i].sh_type == SHT_NOTE) {
+			sysfs_bin_attr_init(nattr);
+			nattr->attr.name = mod->sect_attrs->attrs[loaded].name;
+			nattr->attr.mode = S_IRUGO;
+			nattr->size = info->sechdrs[i].sh_size;
+			nattr->private = (void *) info->sechdrs[i].sh_addr;
+			nattr->read = module_notes_read;
+			++nattr;
+		}
+		++loaded;
+	}
+
+	notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
+	if (!notes_attrs->dir)
+		goto out;
+
+	for (i = 0; i < notes; ++i)
+		if (sysfs_create_bin_file(notes_attrs->dir,
+					  &notes_attrs->attrs[i]))
+			goto out;
+
+	mod->notes_attrs = notes_attrs;
+	return;
+
+  out:
+	free_notes_attrs(notes_attrs, i);
+}
+
+static void remove_notes_attrs(struct module *mod)
+{
+	if (mod->notes_attrs)
+		free_notes_attrs(mod->notes_attrs, mod->notes_attrs->notes);
+}
+
+#else
+
+static inline void add_sect_attrs(struct module *mod,
+				  const struct load_info *info)
+{
+}
+
+static inline void remove_sect_attrs(struct module *mod)
+{
+}
+
+static inline void add_notes_attrs(struct module *mod,
+				   const struct load_info *info)
+{
+}
+
+static inline void remove_notes_attrs(struct module *mod)
+{
+}
+#endif /* CONFIG_KALLSYMS */
+
+/*
+ * For every module @mod uses, link @mod into that module's "holders"
+ * directory.  The result of sysfs_create_link() is deliberately ignored.
+ */
+static void add_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+	struct module_use *use;
+	int nowarn;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(use, &mod->target_list, target_list) {
+		nowarn = sysfs_create_link(use->target->holders_dir,
+					   &mod->mkobj.kobj, mod->name);
+	}
+	mutex_unlock(&module_mutex);
+#endif
+}
+
+/* Remove the links created by add_usage_links(). */
+static void del_usage_links(struct module *mod)
+{
+#ifdef CONFIG_MODULE_UNLOAD
+	struct module_use *use;
+
+	mutex_lock(&module_mutex);
+	list_for_each_entry(use, &mod->target_list, target_list)
+		sysfs_remove_link(use->target->holders_dir, mod->name);
+	mutex_unlock(&module_mutex);
+#endif
+}
+
+/*
+ * Copy each applicable entry of modinfo_attrs[] into a per-module array and
+ * create its sysfs file.  An entry applies when it has no ->test hook or the
+ * hook returns non-zero.
+ *
+ * Returns 0 on success.  On failure the files created so far are removed,
+ * the array is freed, mod->modinfo_attrs is reset to NULL and the error is
+ * returned, so nothing allocated here outlives a failed call.
+ */
+static int module_add_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	struct module_attribute *temp_attr;
+	int error = 0;
+	int i;
+
+	/* One spare zeroed slot terminates the list for the remove path. */
+	mod->modinfo_attrs = kzalloc((sizeof(struct module_attribute) *
+					(ARRAY_SIZE(modinfo_attrs) + 1)),
+					GFP_KERNEL);
+	if (!mod->modinfo_attrs)
+		return -ENOMEM;
+
+	temp_attr = mod->modinfo_attrs;
+	for (i = 0; (attr = modinfo_attrs[i]); i++) {
+		if (attr->test && !attr->test(mod))
+			continue;
+
+		memcpy(temp_attr, attr, sizeof(*temp_attr));
+		sysfs_attr_init(&temp_attr->attr);
+		error = sysfs_create_file(&mod->mkobj.kobj, &temp_attr->attr);
+		if (error)
+			goto err_undo;
+		++temp_attr;
+	}
+	return 0;
+
+err_undo:
+	/* temp_attr points at the entry that failed; unwind the earlier ones. */
+	while (temp_attr > mod->modinfo_attrs) {
+		--temp_attr;
+		sysfs_remove_file(&mod->mkobj.kobj, &temp_attr->attr);
+	}
+	kfree(mod->modinfo_attrs);
+	mod->modinfo_attrs = NULL;
+	return error;
+}
+
+/*
+ * Remove every file created by module_add_modinfo_attrs(), call each
+ * entry's ->free hook, and free the array.
+ */
+static void module_remove_modinfo_attrs(struct module *mod)
+{
+	struct module_attribute *attr;
+	int i;
+
+	for (i = 0; (attr = &mod->modinfo_attrs[i]); i++) {
+		/* pick a field to test for end of list */
+		if (!attr->attr.name)
+			break;
+		sysfs_remove_file(&mod->mkobj.kobj,&attr->attr);
+		if (attr->free)
+			attr->free(mod);
+	}
+	kfree(mod->modinfo_attrs);
+}
+
+/*
+ * Register @mod's kobject in module_kset.  Fails with -EINVAL when module
+ * sysfs is not initialised or an object of the same name already exists;
+ * otherwise returns the result of kobject_init_and_add().
+ */
+static int mod_sysfs_init(struct module *mod)
+{
+	int err;
+	struct kobject *kobj;
+
+	if (!module_sysfs_initialized) {
+		printk(KERN_ERR "%s: module sysfs not initialized\n",
+		       mod->name);
+		err = -EINVAL;
+		goto out;
+	}
+
+	kobj = kset_find_obj(module_kset, mod->name);
+	if (kobj) {
+		printk(KERN_ERR "%s: module is already loaded\n", mod->name);
+		kobject_put(kobj);
+		err = -EINVAL;
+		goto out;
+	}
+
+	mod->mkobj.mod = mod;
+
+	memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
+	mod->mkobj.kobj.kset = module_kset;
+	err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
+				   "%s", mod->name);
+	if (err)
+		kobject_put(&mod->mkobj.kobj);
+
+	/* delay uevent until full sysfs population */
+out:
+	return err;
+}
+
+/*
+ * Build the complete sysfs presence of @mod: kobject, "holders" directory,
+ * parameters, modinfo attributes, usage links, sections and notes, then
+ * emit KOBJ_ADD.  Returns 0 or a negative errno; the mandatory steps are
+ * unwound in reverse order on failure.
+ */
+static int mod_sysfs_setup(struct module *mod,
+			   const struct load_info *info,
+			   struct kernel_param *kparam,
+			   unsigned int num_params)
+{
+	int err;
+
+	err = mod_sysfs_init(mod);
+	if (err)
+		goto out;
+
+	mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
+	if (!mod->holders_dir) {
+		err = -ENOMEM;
+		goto out_unreg;
+	}
+
+	err = module_param_sysfs_setup(mod, kparam, num_params);
+	if (err)
+		goto out_unreg_holders;
+
+	err = module_add_modinfo_attrs(mod);
+	if (err)
+		goto out_unreg_param;
+
+	/* These three report nothing back; their failure is not fatal here. */
+	add_usage_links(mod);
+	add_sect_attrs(mod, info);
+	add_notes_attrs(mod, info);
+
+	kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+	return 0;
+
+out_unreg_param:
+	module_param_sysfs_remove(mod);
+out_unreg_holders:
+	kobject_put(mod->holders_dir);
+out_unreg:
+	kobject_put(&mod->mkobj.kobj);
+out:
+	return err;
+}
+
+/* Drop the notes and sections entries, then the module kobject itself. */
+static void mod_sysfs_fini(struct module *mod)
+{
+	remove_notes_attrs(mod);
+	remove_sect_attrs(mod);
+	kobject_put(&mod->mkobj.kobj);
+}
+
+#else /* !CONFIG_SYSFS */
+
+static int mod_sysfs_setup(struct module *mod,
+			   const struct load_info *info,
+			   struct kernel_param *kparam,
+			   unsigned int num_params)
+{
+	return 0;
+}
+
+static void mod_sysfs_fini(struct module *mod)
+{
+}
+
+static void module_remove_modinfo_attrs(struct module *mod) +{ +} + +static void del_usage_links(struct module *mod) +{ +} + +#endif /* CONFIG_SYSFS */ + +static void mod_sysfs_teardown(struct module *mod) +{ + del_usage_links(mod); + module_remove_modinfo_attrs(mod); + module_param_sysfs_remove(mod); + kobject_put(mod->mkobj.drivers_dir); + kobject_put(mod->holders_dir); + mod_sysfs_fini(mod); +} + +/* + * unlink the module with the whole machine is stopped with interrupts off + * - this defends against kallsyms not taking locks + */ +static int __unlink_module(void *_mod) +{ + struct module *mod = _mod; + list_del(&mod->list); + module_bug_cleanup(mod); + return 0; +} + +#ifdef CONFIG_DEBUG_SET_MODULE_RONX +/* + * LKM RO/NX protection: protect module's text/ro-data + * from modification and any data from execution. + */ +void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +{ + unsigned long begin_pfn = PFN_DOWN((unsigned long)start); + unsigned long end_pfn = PFN_DOWN((unsigned long)end); + + if (end_pfn > begin_pfn) + set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +} + +static void set_section_ro_nx(void *base, + unsigned long text_size, + unsigned long ro_size, + unsigned long total_size) +{ + /* begin and end PFNs of the current subsection */ + unsigned long begin_pfn; + unsigned long end_pfn; + + /* + * Set RO for module text and RO-data: + * - Always protect first page. + * - Do not protect last partial page. + */ + if (ro_size > 0) + set_page_attributes(base, base + ro_size, set_memory_ro); + + /* + * Set NX permissions for module data: + * - Do not protect first partial page. + * - Always protect last page. 
+ */ + if (total_size > text_size) { + begin_pfn = PFN_UP((unsigned long)base + text_size); + end_pfn = PFN_UP((unsigned long)base + total_size); + if (end_pfn > begin_pfn) + set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); + } +} + +static void unset_module_core_ro_nx(struct module *mod) +{ + set_page_attributes(mod->module_core + mod->core_text_size, + mod->module_core + mod->core_size, + set_memory_x); + set_page_attributes(mod->module_core, + mod->module_core + mod->core_ro_size, + set_memory_rw); +} + +static void unset_module_init_ro_nx(struct module *mod) +{ + set_page_attributes(mod->module_init + mod->init_text_size, + mod->module_init + mod->init_size, + set_memory_x); + set_page_attributes(mod->module_init, + mod->module_init + mod->init_ro_size, + set_memory_rw); +} + +/* Iterate through all modules and set each module's text as RW */ +void set_all_modules_text_rw(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_rw); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_rw); + } + } + mutex_unlock(&module_mutex); +} + +/* Iterate through all modules and set each module's text as RO */ +void set_all_modules_text_ro(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry_rcu(mod, &modules, list) { + if ((mod->module_core) && (mod->core_text_size)) { + set_page_attributes(mod->module_core, + mod->module_core + mod->core_text_size, + set_memory_ro); + } + if ((mod->module_init) && (mod->init_text_size)) { + set_page_attributes(mod->module_init, + mod->module_init + mod->init_text_size, + set_memory_ro); + } + } + mutex_unlock(&module_mutex); +} +#else +static inline void set_section_ro_nx(void *base, unsigned long 
text_size, unsigned long ro_size, unsigned long total_size) { } +static void unset_module_core_ro_nx(struct module *mod) { } +static void unset_module_init_ro_nx(struct module *mod) { } +#endif + +/* Free a module, remove from lists, etc. */ +static void free_module(struct module *mod) +{ + trace_module_free(mod); + + /* Delete from various lists */ + mutex_lock(&module_mutex); + stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); + mod_sysfs_teardown(mod); + + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); + + /* Arch-specific cleanup. */ + module_arch_cleanup(mod); + + /* Module unload stuff */ + module_unload_free(mod); + + /* Free any allocated parameters. */ + destroy_params(mod->kp, mod->num_kp); + + /* This may be NULL, but that's OK */ + unset_module_init_ro_nx(mod); + module_free(mod, mod->module_init); + kfree(mod->args); + percpu_modfree(mod); + + /* Free lock-classes: */ + lockdep_free_key_range(mod->module_core, mod->core_size); + + /* Finally, free the core (containing the module structure) */ + unset_module_core_ro_nx(mod); + module_free(mod, mod->module_core); + +#ifdef CONFIG_MPU + update_protections(current->mm); +#endif +} + +void *__symbol_get(const char *symbol) +{ + struct module *owner; + const struct kernel_symbol *sym; + + preempt_disable(); + sym = find_symbol(symbol, &owner, NULL, true, true); + if (sym && strong_try_module_get(owner)) + sym = NULL; + preempt_enable(); + + return sym ? (void *)sym->value : NULL; +} +EXPORT_SYMBOL_GPL(__symbol_get); + +/* + * Ensure that an exported symbol [global namespace] does not already exist + * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. 
+ */ +static int verify_export_symbols(struct module *mod) +{ + unsigned int i; + struct module *owner; + const struct kernel_symbol *s; + struct { + const struct kernel_symbol *sym; + unsigned int num; + } arr[] = { + { mod->syms, mod->num_syms }, + { mod->gpl_syms, mod->num_gpl_syms }, + { mod->gpl_future_syms, mod->num_gpl_future_syms }, +#ifdef CONFIG_UNUSED_SYMBOLS + { mod->unused_syms, mod->num_unused_syms }, + { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, +#endif + }; + + for (i = 0; i < ARRAY_SIZE(arr); i++) { + for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) { + if (find_symbol(s->name, &owner, NULL, true, false)) { + printk(KERN_ERR + "%s: exports duplicate symbol %s" + " (owned by %s)\n", + mod->name, s->name, module_name(owner)); + return -ENOEXEC; + } + } + } + return 0; +} + +/* Change all symbols so that st_value encodes the pointer directly. */ +static int simplify_symbols(struct module *mod, const struct load_info *info) +{ + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + Elf_Sym *sym = (void *)symsec->sh_addr; + unsigned long secbase; + unsigned int i; + int ret = 0; + const struct kernel_symbol *ksym; + + for (i = 1; i < symsec->sh_size / sizeof(Elf_Sym); i++) { + const char *name = info->strtab + sym[i].st_name; + + switch (sym[i].st_shndx) { + case SHN_COMMON: + /* We compiled with -fno-common. These are not + supposed to happen. */ + DEBUGP("Common symbol: %s\n", name); + printk("%s: please compile with -fno-common\n", + mod->name); + ret = -ENOEXEC; + break; + + case SHN_ABS: + /* Don't need to do anything */ + DEBUGP("Absolute symbol: 0x%08lx\n", + (long)sym[i].st_value); + break; + + case SHN_UNDEF: + ksym = resolve_symbol_wait(mod, info, name); + /* Ok if resolved. */ + if (ksym && !IS_ERR(ksym)) { + sym[i].st_value = ksym->value; + break; + } + + /* Ok if weak. 
*/ + if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + break; + + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, name, PTR_ERR(ksym)); + ret = PTR_ERR(ksym) ?: -ENOENT; + break; + + default: + /* Divert to percpu allocation if a percpu var. */ + if (sym[i].st_shndx == info->index.pcpu) + secbase = (unsigned long)mod_percpu(mod); + else + secbase = info->sechdrs[sym[i].st_shndx].sh_addr; + sym[i].st_value += secbase; + break; + } + } + + return ret; +} + +static int apply_relocations(struct module *mod, const struct load_info *info) +{ + unsigned int i; + int err = 0; + + /* Now do relocations. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + unsigned int infosec = info->sechdrs[i].sh_info; + + /* Not a valid relocation section? */ + if (infosec >= info->hdr->e_shnum) + continue; + + /* Don't bother with non-allocated sections */ + if (!(info->sechdrs[infosec].sh_flags & SHF_ALLOC)) + continue; + + if (info->sechdrs[i].sh_type == SHT_REL) + err = apply_relocate(info->sechdrs, info->strtab, + info->index.sym, i, mod); + else if (info->sechdrs[i].sh_type == SHT_RELA) + err = apply_relocate_add(info->sechdrs, info->strtab, + info->index.sym, i, mod); + if (err < 0) + break; + } + return err; +} + +/* Additional bytes needed by arch in front of individual sections */ +unsigned int __weak arch_mod_section_prepend(struct module *mod, + unsigned int section) +{ + /* default implementation just returns zero */ + return 0; +} + +/* Update size with this section: return offset. */ +static long get_offset(struct module *mod, unsigned int *size, + Elf_Shdr *sechdr, unsigned int section) +{ + long ret; + + *size += arch_mod_section_prepend(mod, section); + ret = ALIGN(*size, sechdr->sh_addralign ?: 1); + *size = ret + sechdr->sh_size; + return ret; +} + +/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld + might -- code, read-only data, read-write data, small data. 
Tally + sizes, and place the offsets into sh_entsize fields: high bit means it + belongs in init. */ +static void layout_sections(struct module *mod, struct load_info *info) +{ + static unsigned long const masks[][2] = { + /* NOTE: all executable code must be the first section + * in this array; otherwise modify the text_size + * finder in the two loops below */ + { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, + { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, + { SHF_WRITE | SHF_ALLOC, ARCH_SHF_SMALL }, + { ARCH_SHF_SMALL | SHF_ALLOC, 0 } + }; + unsigned int m, i; + + for (i = 0; i < info->hdr->e_shnum; i++) + info->sechdrs[i].sh_entsize = ~0UL; + + DEBUGP("Core section allocation order:\n"); + for (m = 0; m < ARRAY_SIZE(masks); ++m) { + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; + + if ((s->sh_flags & masks[m][0]) != masks[m][0] + || (s->sh_flags & masks[m][1]) + || s->sh_entsize != ~0UL + || strstarts(sname, ".init")) + continue; + s->sh_entsize = get_offset(mod, &mod->core_size, s, i); + DEBUGP("\t%s\n", name); + } + switch (m) { + case 0: /* executable */ + mod->core_size = debug_align(mod->core_size); + mod->core_text_size = mod->core_size; + break; + case 1: /* RO: text and ro-data */ + mod->core_size = debug_align(mod->core_size); + mod->core_ro_size = mod->core_size; + break; + case 3: /* whole core */ + mod->core_size = debug_align(mod->core_size); + break; + } + } + + DEBUGP("Init section allocation order:\n"); + for (m = 0; m < ARRAY_SIZE(masks); ++m) { + for (i = 0; i < info->hdr->e_shnum; ++i) { + Elf_Shdr *s = &info->sechdrs[i]; + const char *sname = info->secstrings + s->sh_name; + + if ((s->sh_flags & masks[m][0]) != masks[m][0] + || (s->sh_flags & masks[m][1]) + || s->sh_entsize != ~0UL + || !strstarts(sname, ".init")) + continue; + s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) + | INIT_OFFSET_MASK); + DEBUGP("\t%s\n", sname); + } + switch (m) { + case 0: 
/* executable */ + mod->init_size = debug_align(mod->init_size); + mod->init_text_size = mod->init_size; + break; + case 1: /* RO: text and ro-data */ + mod->init_size = debug_align(mod->init_size); + mod->init_ro_size = mod->init_size; + break; + case 3: /* whole init */ + mod->init_size = debug_align(mod->init_size); + break; + } + } +} + +static void set_license(struct module *mod, const char *license) +{ + if (!license) + license = "unspecified"; + + if (!license_is_gpl_compatible(license)) { + if (!test_taint(TAINT_PROPRIETARY_MODULE)) + printk(KERN_WARNING "%s: module license '%s' taints " + "kernel.\n", mod->name, license); + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + } +} + +/* Parse tag=value strings from .modinfo section */ +static char *next_string(char *string, unsigned long *secsize) +{ + /* Skip non-zero chars */ + while (string[0]) { + string++; + if ((*secsize)-- <= 1) + return NULL; + } + + /* Skip any zero padding. */ + while (!string[0]) { + string++; + if ((*secsize)-- <= 1) + return NULL; + } + return string; +} + +static char *get_modinfo(struct load_info *info, const char *tag) +{ + char *p; + unsigned int taglen = strlen(tag); + Elf_Shdr *infosec = &info->sechdrs[info->index.info]; + unsigned long size = infosec->sh_size; + + for (p = (char *)infosec->sh_addr; p; p = next_string(p, &size)) { + if (strncmp(p, tag, taglen) == 0 && p[taglen] == '=') + return p + taglen + 1; + } + return NULL; +} + +static void setup_modinfo(struct module *mod, struct load_info *info) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->setup) + attr->setup(mod, get_modinfo(info, attr->attr.name)); + } +} + +static void free_modinfo(struct module *mod) +{ + struct module_attribute *attr; + int i; + + for (i = 0; (attr = modinfo_attrs[i]); i++) { + if (attr->free) + attr->free(mod); + } +} + +#ifdef CONFIG_KALLSYMS + +/* lookup symbol in given range of kernel_symbols */ +static const struct 
kernel_symbol *lookup_symbol(const char *name, + const struct kernel_symbol *start, + const struct kernel_symbol *stop) +{ + return bsearch(name, start, stop - start, + sizeof(struct kernel_symbol), cmp_name); +} + +static int is_exported(const char *name, unsigned long value, + const struct module *mod) +{ + const struct kernel_symbol *ks; + if (!mod) + ks = lookup_symbol(name, __start___ksymtab, __stop___ksymtab); + else + ks = lookup_symbol(name, mod->syms, mod->syms + mod->num_syms); + return ks != NULL && ks->value == value; +} + +/* As per nm */ +static char elf_type(const Elf_Sym *sym, const struct load_info *info) +{ + const Elf_Shdr *sechdrs = info->sechdrs; + + if (ELF_ST_BIND(sym->st_info) == STB_WEAK) { + if (ELF_ST_TYPE(sym->st_info) == STT_OBJECT) + return 'v'; + else + return 'w'; + } + if (sym->st_shndx == SHN_UNDEF) + return 'U'; + if (sym->st_shndx == SHN_ABS) + return 'a'; + if (sym->st_shndx >= SHN_LORESERVE) + return '?'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_EXECINSTR) + return 't'; + if (sechdrs[sym->st_shndx].sh_flags & SHF_ALLOC + && sechdrs[sym->st_shndx].sh_type != SHT_NOBITS) { + if (!(sechdrs[sym->st_shndx].sh_flags & SHF_WRITE)) + return 'r'; + else if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 'g'; + else + return 'd'; + } + if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) { + if (sechdrs[sym->st_shndx].sh_flags & ARCH_SHF_SMALL) + return 's'; + else + return 'b'; + } + if (strstarts(info->secstrings + sechdrs[sym->st_shndx].sh_name, + ".debug")) { + return 'n'; + } + return '?'; +} + +static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, + unsigned int shnum) +{ + const Elf_Shdr *sec; + + if (src->st_shndx == SHN_UNDEF + || src->st_shndx >= shnum + || !src->st_name) + return false; + + sec = sechdrs + src->st_shndx; + if (!(sec->sh_flags & SHF_ALLOC) +#ifndef CONFIG_KALLSYMS_ALL + || !(sec->sh_flags & SHF_EXECINSTR) +#endif + || (sec->sh_entsize & INIT_OFFSET_MASK)) + return false; + + return 
true; +} + +static void layout_symtab(struct module *mod, struct load_info *info) +{ + Elf_Shdr *symsect = info->sechdrs + info->index.sym; + Elf_Shdr *strsect = info->sechdrs + info->index.str; + const Elf_Sym *src; + unsigned int i, nsrc, ndst; + + /* Put symbol section at end of init part of module. */ + symsect->sh_flags |= SHF_ALLOC; + symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + info->index.sym) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + symsect->sh_name); + + src = (void *)info->hdr + symsect->sh_offset; + nsrc = symsect->sh_size / sizeof(*src); + for (ndst = i = 1; i < nsrc; ++i, ++src) + if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { + unsigned int j = src->st_name; + + while (!__test_and_set_bit(j, info->strmap) + && info->strtab[j]) + ++j; + ++ndst; + } + + /* Append room for core symbols at end of core part. */ + info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); + mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); + + /* Put string table section at end of init part of module. */ + strsect->sh_flags |= SHF_ALLOC; + strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + info->index.str) | INIT_OFFSET_MASK; + DEBUGP("\t%s\n", info->secstrings + strsect->sh_name); + + /* Append room for core symbols' strings at end of core part. */ + info->stroffs = mod->core_size; + __set_bit(0, info->strmap); + mod->core_size += bitmap_weight(info->strmap, strsect->sh_size); +} + +static void add_kallsyms(struct module *mod, const struct load_info *info) +{ + unsigned int i, ndst; + const Elf_Sym *src; + Elf_Sym *dst; + char *s; + Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; + + mod->symtab = (void *)symsec->sh_addr; + mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Make sure we get permanent strtab: don't use info->strtab. */ + mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + + /* Set types up while we still have access to sections. 
*/ + for (i = 0; i < mod->num_symtab; i++) + mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); + + mod->core_symtab = dst = mod->module_core + info->symoffs; + src = mod->symtab; + *dst = *src; + for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { + if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) + continue; + dst[ndst] = *src; + dst[ndst].st_name = bitmap_weight(info->strmap, + dst[ndst].st_name); + ++ndst; + } + mod->core_num_syms = ndst; + + mod->core_strtab = s = mod->module_core + info->stroffs; + for (*s = 0, i = 1; i < info->sechdrs[info->index.str].sh_size; ++i) + if (test_bit(i, info->strmap)) + *++s = mod->strtab[i]; +} +#else +static inline void layout_symtab(struct module *mod, struct load_info *info) +{ +} + +static void add_kallsyms(struct module *mod, const struct load_info *info) +{ +} +#endif /* CONFIG_KALLSYMS */ + +static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) +{ + if (!debug) + return; +#ifdef CONFIG_DYNAMIC_DEBUG + if (ddebug_add_module(debug, num, debug->modname)) + printk(KERN_ERR "dynamic debug error adding module: %s\n", + debug->modname); +#endif +} + +static void dynamic_debug_remove(struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(debug->modname); +} + +static void *module_alloc_update_bounds(unsigned long size) +{ + void *ret = module_alloc(size); + + if (ret) { + mutex_lock(&module_mutex); + /* Update module bounds. 
*/ + if ((unsigned long)ret < module_addr_min) + module_addr_min = (unsigned long)ret; + if ((unsigned long)ret + size > module_addr_max) + module_addr_max = (unsigned long)ret + size; + mutex_unlock(&module_mutex); + } + return ret; +} + +#ifdef CONFIG_DEBUG_KMEMLEAK +static void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ + unsigned int i; + + /* only scan the sections containing data */ + kmemleak_scan_area(mod, sizeof(struct module), GFP_KERNEL); + + for (i = 1; i < info->hdr->e_shnum; i++) { + const char *name = info->secstrings + info->sechdrs[i].sh_name; + if (!(info->sechdrs[i].sh_flags & SHF_ALLOC)) + continue; + if (!strstarts(name, ".data") && !strstarts(name, ".bss")) + continue; + + kmemleak_scan_area((void *)info->sechdrs[i].sh_addr, + info->sechdrs[i].sh_size, GFP_KERNEL); + } +} +#else +static inline void kmemleak_load_module(const struct module *mod, + const struct load_info *info) +{ +} +#endif + +/* Sets info->hdr and info->len. */ +static int copy_and_check(struct load_info *info, + const void __user *umod, unsigned long len, + const char __user *uargs) +{ + int err; + Elf_Ehdr *hdr; + + if (len < sizeof(*hdr)) + return -ENOEXEC; + + /* Suck in entire file: we'll want most of it. 
*/ + if ((hdr = vmalloc(len)) == NULL) + return -ENOMEM; + + if (copy_from_user(hdr, umod, len) != 0) { + err = -EFAULT; + goto free_hdr; + } + + /* Sanity checks against insmoding binaries or wrong arch, + weird elf version */ + if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0 + || hdr->e_type != ET_REL + || !elf_check_arch(hdr) + || hdr->e_shentsize != sizeof(Elf_Shdr)) { + err = -ENOEXEC; + goto free_hdr; + } + + if (len < hdr->e_shoff + hdr->e_shnum * sizeof(Elf_Shdr)) { + err = -ENOEXEC; + goto free_hdr; + } + + info->hdr = hdr; + info->len = len; + return 0; + +free_hdr: + vfree(hdr); + return err; +} + +static void free_copy(struct load_info *info) +{ + vfree(info->hdr); +} + +static int rewrite_section_headers(struct load_info *info) +{ + unsigned int i; + + /* This should always be true, but let's be sure. */ + info->sechdrs[0].sh_addr = 0; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (shdr->sh_type != SHT_NOBITS + && info->len < shdr->sh_offset + shdr->sh_size) { + printk(KERN_ERR "Module len %lu truncated\n", + info->len); + return -ENOEXEC; + } + + /* Mark all sections sh_addr with their address in the + temporary image. */ + shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; + +#ifndef CONFIG_MODULE_UNLOAD + /* Don't load .exit sections */ + if (strstarts(info->secstrings+shdr->sh_name, ".exit")) + shdr->sh_flags &= ~(unsigned long)SHF_ALLOC; +#endif + } + + /* Track but don't keep modinfo and version sections. */ + info->index.vers = find_sec(info, "__versions"); + info->index.info = find_sec(info, ".modinfo"); + info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC; + info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC; + return 0; +} + +/* + * Set up our basic convenience variables (pointers to section headers, + * search for module section index etc), and do some basic section + * verification. 
+ * + * Return the temporary module pointer (we'll replace it with the final + * one when we move the module sections around). + */ +static struct module *setup_load_info(struct load_info *info) +{ + unsigned int i; + int err; + struct module *mod; + + /* Set up the convenience variables */ + info->sechdrs = (void *)info->hdr + info->hdr->e_shoff; + info->secstrings = (void *)info->hdr + + info->sechdrs[info->hdr->e_shstrndx].sh_offset; + + err = rewrite_section_headers(info); + if (err) + return ERR_PTR(err); + + /* Find internal symbols and strings. */ + for (i = 1; i < info->hdr->e_shnum; i++) { + if (info->sechdrs[i].sh_type == SHT_SYMTAB) { + info->index.sym = i; + info->index.str = info->sechdrs[i].sh_link; + info->strtab = (char *)info->hdr + + info->sechdrs[info->index.str].sh_offset; + break; + } + } + + info->index.mod = find_sec(info, ".gnu.linkonce.this_module"); + if (!info->index.mod) { + printk(KERN_WARNING "No module found in object\n"); + return ERR_PTR(-ENOEXEC); + } + /* This is temporary: point mod into copy of data. */ + mod = (void *)info->sechdrs[info->index.mod].sh_addr; + + if (info->index.sym == 0) { + printk(KERN_WARNING "%s: module has no symbols (stripped?)\n", + mod->name); + return ERR_PTR(-ENOEXEC); + } + + info->index.pcpu = find_pcpusec(info); + + /* Check module struct version now, before we try to use module. */ + if (!check_modstruct_version(info->sechdrs, info->index.vers, mod)) + return ERR_PTR(-ENOEXEC); + + return mod; +} + +static int check_modinfo(struct module *mod, struct load_info *info) +{ + const char *modmagic = get_modinfo(info, "vermagic"); + int err; + + /* This is allowed: modprobe --force will invalidate it. 
*/ + if (!modmagic) { + err = try_to_force_load(mod, "bad vermagic"); + if (err) + return err; + } else if (!same_magic(modmagic, vermagic, info->index.vers)) { + printk(KERN_ERR "%s: version magic '%s' should be '%s'\n", + mod->name, modmagic, vermagic); + return -ENOEXEC; + } + + if (get_modinfo(info, "staging")) { + add_taint_module(mod, TAINT_CRAP); + printk(KERN_WARNING "%s: module is from the staging directory," + " the quality is unknown, you have been warned.\n", + mod->name); + } + + /* Set up license info based on the info section */ + set_license(mod, get_modinfo(info, "license")); + + return 0; +} + +static void find_module_sections(struct module *mod, struct load_info *info) +{ + mod->kp = section_objs(info, "__param", + sizeof(*mod->kp), &mod->num_kp); + mod->syms = section_objs(info, "__ksymtab", + sizeof(*mod->syms), &mod->num_syms); + mod->crcs = section_addr(info, "__kcrctab"); + mod->gpl_syms = section_objs(info, "__ksymtab_gpl", + sizeof(*mod->gpl_syms), + &mod->num_gpl_syms); + mod->gpl_crcs = section_addr(info, "__kcrctab_gpl"); + mod->gpl_future_syms = section_objs(info, + "__ksymtab_gpl_future", + sizeof(*mod->gpl_future_syms), + &mod->num_gpl_future_syms); + mod->gpl_future_crcs = section_addr(info, "__kcrctab_gpl_future"); + +#ifdef CONFIG_UNUSED_SYMBOLS + mod->unused_syms = section_objs(info, "__ksymtab_unused", + sizeof(*mod->unused_syms), + &mod->num_unused_syms); + mod->unused_crcs = section_addr(info, "__kcrctab_unused"); + mod->unused_gpl_syms = section_objs(info, "__ksymtab_unused_gpl", + sizeof(*mod->unused_gpl_syms), + &mod->num_unused_gpl_syms); + mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); +#endif +#ifdef CONFIG_CONSTRUCTORS + mod->ctors = section_objs(info, ".ctors", + sizeof(*mod->ctors), &mod->num_ctors); +#endif + +#ifdef CONFIG_TRACEPOINTS + mod->tracepoints_ptrs = section_objs(info, "__tracepoints_ptrs", + sizeof(*mod->tracepoints_ptrs), + &mod->num_tracepoints); +#endif +#ifdef HAVE_JUMP_LABEL + 
mod->jump_entries = section_objs(info, "__jump_table", + sizeof(*mod->jump_entries), + &mod->num_jump_entries); +#endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(info, "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_events, sizeof(*mod->trace_events) * + mod->num_trace_events, GFP_KERNEL); +#endif +#ifdef CONFIG_TRACING + mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt", + sizeof(*mod->trace_bprintk_fmt_start), + &mod->num_trace_bprintk_fmt); + /* + * This section contains pointers to allocated objects in the trace + * code and not scanning it leads to false positives. + */ + kmemleak_scan_area(mod->trace_bprintk_fmt_start, + sizeof(*mod->trace_bprintk_fmt_start) * + mod->num_trace_bprintk_fmt, GFP_KERNEL); +#endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(info, "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif + + mod->extable = section_objs(info, "__ex_table", + sizeof(*mod->extable), &mod->num_exentries); + + if (section_addr(info, "__obsparm")) + printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", + mod->name); + + info->debug = section_objs(info, "__verbose", + sizeof(*info->debug), &info->num_debug); +} + +static int move_module(struct module *mod, struct load_info *info) +{ + int i; + void *ptr; + + /* Do the allocs. */ + ptr = module_alloc_update_bounds(mod->core_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. Just mark it as not being a + * leak. 
+ */ + kmemleak_not_leak(ptr); + if (!ptr) + return -ENOMEM; + + memset(ptr, 0, mod->core_size); + mod->module_core = ptr; + + ptr = module_alloc_update_bounds(mod->init_size); + /* + * The pointer to this block is stored in the module structure + * which is inside the block. This block doesn't need to be + * scanned as it contains data and code that will be freed + * after the module is initialized. + */ + kmemleak_ignore(ptr); + if (!ptr && mod->init_size) { + module_free(mod, mod->module_core); + return -ENOMEM; + } + memset(ptr, 0, mod->init_size); + mod->module_init = ptr; + + /* Transfer each section which specifies SHF_ALLOC */ + DEBUGP("final section addresses:\n"); + for (i = 0; i < info->hdr->e_shnum; i++) { + void *dest; + Elf_Shdr *shdr = &info->sechdrs[i]; + + if (!(shdr->sh_flags & SHF_ALLOC)) + continue; + + if (shdr->sh_entsize & INIT_OFFSET_MASK) + dest = mod->module_init + + (shdr->sh_entsize & ~INIT_OFFSET_MASK); + else + dest = mod->module_core + shdr->sh_entsize; + + if (shdr->sh_type != SHT_NOBITS) + memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); + /* Update sh_addr to point to copy in image. */ + shdr->sh_addr = (unsigned long)dest; + DEBUGP("\t0x%lx %s\n", + shdr->sh_addr, info->secstrings + shdr->sh_name); + } + + return 0; +} + +static int check_module_license_and_versions(struct module *mod) +{ + /* + * ndiswrapper is under GPL by itself, but loads proprietary modules. + * Don't use add_taint_module(), as it would prevent ndiswrapper from + * using GPL-only symbols it needs. 
+ */ + if (strcmp(mod->name, "ndiswrapper") == 0) + add_taint(TAINT_PROPRIETARY_MODULE); + + /* driverloader was caught wrongly pretending to be under GPL */ + if (strcmp(mod->name, "driverloader") == 0) + add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + +#ifdef CONFIG_MODVERSIONS + if ((mod->num_syms && !mod->crcs) + || (mod->num_gpl_syms && !mod->gpl_crcs) + || (mod->num_gpl_future_syms && !mod->gpl_future_crcs) +#ifdef CONFIG_UNUSED_SYMBOLS + || (mod->num_unused_syms && !mod->unused_crcs) + || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs) +#endif + ) { + return try_to_force_load(mod, + "no versions for exported symbols"); + } +#endif + return 0; +} + +static void flush_module_icache(const struct module *mod) +{ + mm_segment_t old_fs; + + /* flush the icache in correct context */ + old_fs = get_fs(); + set_fs(KERNEL_DS); + + /* + * Flush the instruction cache, since we've played with text. + * Do it before processing of module parameters, so the module + * can provide parameter accessor functions of its own. + */ + if (mod->module_init) + flush_icache_range((unsigned long)mod->module_init, + (unsigned long)mod->module_init + + mod->init_size); + flush_icache_range((unsigned long)mod->module_core, + (unsigned long)mod->module_core + mod->core_size); + + set_fs(old_fs); +} + +static struct module *layout_and_allocate(struct load_info *info) +{ + /* Module within temporary copy. */ + struct module *mod; + Elf_Shdr *pcpusec; + int err; + + mod = setup_load_info(info); + if (IS_ERR(mod)) + return mod; + + err = check_modinfo(mod, info); + if (err) + return ERR_PTR(err); + + /* Allow arches to frob section contents and sizes. */ + err = module_frob_arch_sections(info->hdr, info->sechdrs, + info->secstrings, mod); + if (err < 0) + goto out; + + pcpusec = &info->sechdrs[info->index.pcpu]; + if (pcpusec->sh_size) { + /* We have a special allocation for this section. 
*/ + err = percpu_modalloc(mod, + pcpusec->sh_size, pcpusec->sh_addralign); + if (err) + goto out; + pcpusec->sh_flags &= ~(unsigned long)SHF_ALLOC; + } + + /* Determine total sizes, and put offsets in sh_entsize. For now + this is done generically; there doesn't appear to be any + special cases for the architectures. */ + layout_sections(mod, info); + + info->strmap = kzalloc(BITS_TO_LONGS(info->sechdrs[info->index.str].sh_size) + * sizeof(long), GFP_KERNEL); + if (!info->strmap) { + err = -ENOMEM; + goto free_percpu; + } + layout_symtab(mod, info); + + /* Allocate and move to the final place */ + err = move_module(mod, info); + if (err) + goto free_strmap; + + /* Module has been copied to its final place now: return it. */ + mod = (void *)info->sechdrs[info->index.mod].sh_addr; + kmemleak_load_module(mod, info); + return mod; + +free_strmap: + kfree(info->strmap); +free_percpu: + percpu_modfree(mod); +out: + return ERR_PTR(err); +} + +/* mod is no longer valid after this! */ +static void module_deallocate(struct module *mod, struct load_info *info) +{ + kfree(info->strmap); + percpu_modfree(mod); + module_free(mod, mod->module_init); + module_free(mod, mod->module_core); +} + +static int post_relocation(struct module *mod, const struct load_info *info) +{ + /* Sort exception table now relocations are done. */ + sort_extable(mod->extable, mod->extable + mod->num_exentries); + + /* Copy relocated percpu area over. */ + percpu_modcopy(mod, (void *)info->sechdrs[info->index.pcpu].sh_addr, + info->sechdrs[info->index.pcpu].sh_size); + + /* Setup kallsyms-specific fields. */ + add_kallsyms(mod, info); + + /* Arch-specific module finalizing. */ + return module_finalize(info->hdr, info->sechdrs, mod); +} + +/* Allocate and load the module: note that size of section 0 is always + zero, and we rely on this for optional sections. 
*/ +static struct module *load_module(void __user *umod, + unsigned long len, + const char __user *uargs) +{ + struct load_info info = { NULL, }; + struct module *mod; + long err; + + DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", + umod, len, uargs); + + /* Copy in the blobs from userspace, check they are vaguely sane. */ + err = copy_and_check(&info, umod, len, uargs); + if (err) + return ERR_PTR(err); + + /* Figure out module layout, and allocate all the memory. */ + mod = layout_and_allocate(&info); + if (IS_ERR(mod)) { + err = PTR_ERR(mod); + goto free_copy; + } + + /* Now module is in final location, initialize linked lists, etc. */ + err = module_unload_init(mod); + if (err) + goto free_module; + + /* Now we've got everything in the final locations, we can + * find optional sections. */ + find_module_sections(mod, &info); + + err = check_module_license_and_versions(mod); + if (err) + goto free_unload; + + /* Set up MODINFO_ATTR fields */ + setup_modinfo(mod, &info); + + /* Fix up syms, so that st_value is a pointer to location. */ + err = simplify_symbols(mod, &info); + if (err < 0) + goto free_modinfo; + + err = apply_relocations(mod, &info); + if (err < 0) + goto free_modinfo; + + err = post_relocation(mod, &info); + if (err < 0) + goto free_modinfo; + + flush_module_icache(mod); + + /* Now copy in args */ + mod->args = strndup_user(uargs, ~0UL >> 1); + if (IS_ERR(mod->args)) { + err = PTR_ERR(mod->args); + goto free_arch_cleanup; + } + + /* Mark state as coming so strong_try_module_get() ignores us. */ + mod->state = MODULE_STATE_COMING; + + /* Now sew it into the lists so we can get lockdep and oops + * info during argument parsing. No one should access us, since + * strong_try_module_get() will fail. + * lockdep/oops can run asynchronous, so use the RCU list insertion + * function to insert in a way safe to concurrent readers. + * The mutex protects against concurrent writers. 
+ */ + mutex_lock(&module_mutex); + if (find_module(mod->name)) { + err = -EEXIST; + goto unlock; + } + + /* This has to be done once we're sure module name is unique. */ + if (!mod->taints || mod->taints == (1U<list, &modules); + mutex_unlock(&module_mutex); + + /* Module is ready to execute: parsing args may do that. */ + err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); + if (err < 0) + goto unlink; + + /* Link in to syfs. */ + err = mod_sysfs_setup(mod, &info, mod->kp, mod->num_kp); + if (err < 0) + goto unlink; + + /* Get rid of temporary copy and strmap. */ + kfree(info.strmap); + free_copy(&info); + + /* Done! */ + trace_module_load(mod); + return mod; + + unlink: + mutex_lock(&module_mutex); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + module_bug_cleanup(mod); + + ddebug: + if (!mod->taints || mod->taints == (1U<args); + free_arch_cleanup: + module_arch_cleanup(mod); + free_modinfo: + free_modinfo(mod); + free_unload: + module_unload_free(mod); + free_module: + module_deallocate(mod, &info); + free_copy: + free_copy(&info); + return ERR_PTR(err); +} + +/* Call module constructors. 
*/ +static void do_mod_ctors(struct module *mod) +{ +#ifdef CONFIG_CONSTRUCTORS + unsigned long i; + + for (i = 0; i < mod->num_ctors; i++) + mod->ctors[i](); +#endif +} + +/* This is where the real work happens */ +SYSCALL_DEFINE3(init_module, void __user *, umod, + unsigned long, len, const char __user *, uargs) +{ + struct module *mod; + int ret = 0; + + /* Must have permission */ + if (!capable(CAP_SYS_MODULE) || modules_disabled) + return -EPERM; + + /* Do all the hard work */ + mod = load_module(umod, len, uargs); + if (IS_ERR(mod)) + return PTR_ERR(mod); + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + + /* Set RO and NX regions for core */ + set_section_ro_nx(mod->module_core, + mod->core_text_size, + mod->core_ro_size, + mod->core_size); + + /* Set RO and NX regions for init */ + set_section_ro_nx(mod->module_init, + mod->init_text_size, + mod->init_ro_size, + mod->init_size); + + do_mod_ctors(mod); + /* Start the module */ + if (mod->init != NULL) + ret = do_one_initcall(mod->init); + if (ret < 0) { + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up(&module_wq); + return ret; + } + if (ret > 0) { + printk(KERN_WARNING +"%s: '%s'->init suspiciously returned %d, it should follow 0/-E convention\n" +"%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } + + /* Now it's a first class citizen! Wake up anyone waiting for it. */ + mod->state = MODULE_STATE_LIVE; + wake_up(&module_wq); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_LIVE, mod); + + /* We need to finish all async code before the module init sequence is done */ + async_synchronize_full(); + + mutex_lock(&module_mutex); + /* Drop initial reference. 
*/ + module_put(mod); + trim_init_extable(mod); +#ifdef CONFIG_KALLSYMS + mod->num_symtab = mod->core_num_syms; + mod->symtab = mod->core_symtab; + mod->strtab = mod->core_strtab; +#endif + unset_module_init_ro_nx(mod); + module_free(mod, mod->module_init); + mod->module_init = NULL; + mod->init_size = 0; + mod->init_ro_size = 0; + mod->init_text_size = 0; + mutex_unlock(&module_mutex); + + return 0; +} + +static inline int within(unsigned long addr, void *start, unsigned long size) +{ + return ((void *)addr >= start && (void *)addr < start + size); +} + +#ifdef CONFIG_KALLSYMS +/* + * This ignores the intensely annoying "mapping symbols" found + * in ARM ELF files: $a, $t and $d. + */ +static inline int is_arm_mapping_symbol(const char *str) +{ + return str[0] == '$' && strchr("atd", str[1]) + && (str[2] == '\0' || str[2] == '.'); +} + +static const char *get_ksymbol(struct module *mod, + unsigned long addr, + unsigned long *size, + unsigned long *offset) +{ + unsigned int i, best = 0; + unsigned long nextval; + + /* At worse, next value is at end of module */ + if (within_module_init(addr, mod)) + nextval = (unsigned long)mod->module_init+mod->init_text_size; + else + nextval = (unsigned long)mod->module_core+mod->core_text_size; + + /* Scan for closest preceding symbol, and next symbol. (ELF + starts real symbols at 1). */ + for (i = 1; i < mod->num_symtab; i++) { + if (mod->symtab[i].st_shndx == SHN_UNDEF) + continue; + + /* We ignore unnamed symbols: they're uninformative + * and inserted at a whim. 
*/ + if (mod->symtab[i].st_value <= addr + && mod->symtab[i].st_value > mod->symtab[best].st_value + && *(mod->strtab + mod->symtab[i].st_name) != '\0' + && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + best = i; + if (mod->symtab[i].st_value > addr + && mod->symtab[i].st_value < nextval + && *(mod->strtab + mod->symtab[i].st_name) != '\0' + && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + nextval = mod->symtab[i].st_value; + } + + if (!best) + return NULL; + + if (size) + *size = nextval - mod->symtab[best].st_value; + if (offset) + *offset = addr - mod->symtab[best].st_value; + return mod->strtab + mod->symtab[best].st_name; +} + +/* For kallsyms to ask for address resolution. NULL means not found. Careful + * not to lock to avoid deadlock on oopses, simply disable preemption. */ +const char *module_address_lookup(unsigned long addr, + unsigned long *size, + unsigned long *offset, + char **modname, + char *namebuf) +{ + struct module *mod; + const char *ret = NULL; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { + if (modname) + *modname = mod->name; + ret = get_ksymbol(mod, addr, size, offset); + break; + } + } + /* Make a copy in here where it's safe */ + if (ret) { + strncpy(namebuf, ret, KSYM_NAME_LEN - 1); + ret = namebuf; + } + preempt_enable(); + return ret; +} + +int lookup_module_symbol_name(unsigned long addr, char *symname) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { + const char *sym; + + sym = get_ksymbol(mod, addr, NULL, NULL); + if (!sym) + goto out; + strlcpy(symname, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, + unsigned long *offset, char *modname, char 
*name) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (within_module_init(addr, mod) || + within_module_core(addr, mod)) { + const char *sym; + + sym = get_ksymbol(mod, addr, size, offset); + if (!sym) + goto out; + if (modname) + strlcpy(modname, mod->name, MODULE_NAME_LEN); + if (name) + strlcpy(name, sym, KSYM_NAME_LEN); + preempt_enable(); + return 0; + } + } +out: + preempt_enable(); + return -ERANGE; +} + +int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, + char *name, char *module_name, int *exported) +{ + struct module *mod; + + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) { + if (symnum < mod->num_symtab) { + *value = mod->symtab[symnum].st_value; + *type = mod->symtab[symnum].st_info; + strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, + KSYM_NAME_LEN); + strlcpy(module_name, mod->name, MODULE_NAME_LEN); + *exported = is_exported(name, *value, mod); + preempt_enable(); + return 0; + } + symnum -= mod->num_symtab; + } + preempt_enable(); + return -ERANGE; +} + +static unsigned long mod_find_symname(struct module *mod, const char *name) +{ + unsigned int i; + + for (i = 0; i < mod->num_symtab; i++) + if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && + mod->symtab[i].st_info != 'U') + return mod->symtab[i].st_value; + return 0; +} + +/* Look for this name: can be of form module:name. */ +unsigned long module_kallsyms_lookup_name(const char *name) +{ + struct module *mod; + char *colon; + unsigned long ret = 0; + + /* Don't lock: we're in enough trouble already. 
*/ + preempt_disable(); + if ((colon = strchr(name, ':')) != NULL) { + *colon = '\0'; + if ((mod = find_module(name)) != NULL) + ret = mod_find_symname(mod, colon+1); + *colon = ':'; + } else { + list_for_each_entry_rcu(mod, &modules, list) + if ((ret = mod_find_symname(mod, name)) != 0) + break; + } + preempt_enable(); + return ret; +} + +int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, + struct module *, unsigned long), + void *data) +{ + struct module *mod; + unsigned int i; + int ret; + + list_for_each_entry(mod, &modules, list) { + for (i = 0; i < mod->num_symtab; i++) { + ret = fn(data, mod->strtab + mod->symtab[i].st_name, + mod, mod->symtab[i].st_value); + if (ret != 0) + return ret; + } + } + return 0; +} +#endif /* CONFIG_KALLSYMS */ + +static char *module_flags(struct module *mod, char *buf) +{ + int bx = 0; + + if (mod->taints || + mod->state == MODULE_STATE_GOING || + mod->state == MODULE_STATE_COMING) { + buf[bx++] = '('; + if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) + buf[bx++] = 'P'; + if (mod->taints & (1 << TAINT_FORCED_MODULE)) + buf[bx++] = 'F'; + if (mod->taints & (1 << TAINT_CRAP)) + buf[bx++] = 'C'; + /* + * TAINT_FORCED_RMMOD: could be added. + * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't + * apply to modules. + */ + + /* Show a - for module-is-being-unloaded */ + if (mod->state == MODULE_STATE_GOING) + buf[bx++] = '-'; + /* Show a + for module-is-being-loaded */ + if (mod->state == MODULE_STATE_COMING) + buf[bx++] = '+'; + buf[bx++] = ')'; + } + buf[bx] = '\0'; + + return buf; +} + +#ifdef CONFIG_PROC_FS +/* Called by the /proc file system to return a list of modules. 
*/ +static void *m_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&module_mutex); /* released again in m_stop() */ + return seq_list_start(&modules, *pos); +} +/* seq_file ->next: step to the following entry of the modules list. */ +static void *m_next(struct seq_file *m, void *p, loff_t *pos) +{ + return seq_list_next(p, &modules, pos); +} +/* seq_file ->stop: drop the module_mutex taken in m_start(). */ +static void m_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&module_mutex); +} +/* + * seq_file ->show: print one module as name, total size (init + core), + * unload info, state, core address and, if tainted, its flags. + */ +static int m_show(struct seq_file *m, void *p) +{ + struct module *mod = list_entry(p, struct module, list); + char buf[8]; /* scratch space for module_flags() */ + + seq_printf(m, "%s %u", + mod->name, mod->init_size + mod->core_size); + print_unload_info(m, mod); + + /* Informative for users. */ + seq_printf(m, " %s", + mod->state == MODULE_STATE_GOING ? "Unloading": + mod->state == MODULE_STATE_COMING ? "Loading": + "Live"); + /* Used by oprofile and other similar tools. */ + seq_printf(m, " 0x%pK", mod->module_core); + + /* Taints info */ + if (mod->taints) + seq_printf(m, " %s", module_flags(mod, buf)); + + seq_printf(m, "\n"); + return 0; +} + +/* Format: modulename size refcount deps state address [taints] + + Where refcount is a number or -, and deps is a comma-separated list + of depends or -. State is Live, Loading or Unloading; the taint + flags are only printed for tainted modules. +*/ +static const struct seq_operations modules_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = m_show +}; +/* open() handler for the proc entry: attach the modules_op iterator. */ +static int modules_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &modules_op); +} + +static const struct file_operations proc_modules_operations = { + .open = modules_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +/* Create the "modules" proc entry; the result of proc_create() is not checked. */ +static int __init proc_modules_init(void) +{ + proc_create("modules", 0, NULL, &proc_modules_operations); + return 0; +} +module_init(proc_modules_init); +#endif + +/* Given an address, look for it in the module exception tables. 
*/ +const struct exception_table_entry *search_module_extables(unsigned long addr) +{ + const struct exception_table_entry *e = NULL; + struct module *mod; + + preempt_disable(); /* no lock taken: the RCU list walk runs with preemption off */ + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->num_exentries == 0) /* this module has no exception table */ + continue; + /* search_extable() is handed the first and the last table entry */ + e = search_extable(mod->extable, + mod->extable + mod->num_exentries - 1, + addr); + if (e) + break; + } + preempt_enable(); + + /* Now, if we found one, we are running inside it now, hence + we cannot unload the module, hence no refcnt needed. */ + return e; +} + +/* + * is_module_address - is this address inside a module? + * @addr: the address to check. + * + * See is_module_text_address() if you simply want to see if the address + * is code (not data). + * + * Returns true when __module_address() finds a module for @addr; the lookup + * is done with preemption disabled. + */ +bool is_module_address(unsigned long addr) +{ + bool ret; + + preempt_disable(); + ret = __module_address(addr) != NULL; + preempt_enable(); + + return ret; +} + +/* + * __module_address - get the module which contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + * + * Returns the module whose core or init region contains @addr, or NULL. + */ +struct module *__module_address(unsigned long addr) +{ + struct module *mod; + /* quick reject via module_addr_min/max (presumably the bounds of all module mappings - confirm) */ + if (addr < module_addr_min || addr > module_addr_max) + return NULL; + /* otherwise check every module's core and init regions */ + list_for_each_entry_rcu(mod, &modules, list) + if (within_module_core(addr, mod) + || within_module_init(addr, mod)) + return mod; + return NULL; +} +EXPORT_SYMBOL_GPL(__module_address); + +/* + * is_module_text_address - is this address inside module code? + * @addr: the address to check. + * + * See is_module_address() if you simply want to see if the address is + * anywhere in a module. See kernel_text_address() for testing if an + * address corresponds to kernel or module code. 
+ */ +bool is_module_text_address(unsigned long addr) +{ + bool ret; + + preempt_disable(); /* __module_text_address() requires preemption off or module_mutex */ + ret = __module_text_address(addr) != NULL; + preempt_enable(); + + return ret; +} + +/* + * __module_text_address - get the module whose code contains an address. + * @addr: the address. + * + * Must be called with preempt disabled or module mutex held so that + * module doesn't get freed during this. + * + * Returns NULL when @addr belongs to no module or lies outside the first + * init_text_size / core_text_size bytes of the module's init / core region. + */ +struct module *__module_text_address(unsigned long addr) +{ + struct module *mod = __module_address(addr); + if (mod) { + /* Make sure it's within the text section. */ + if (!within(addr, mod->module_init, mod->init_text_size) + && !within(addr, mod->module_core, mod->core_text_size)) + mod = NULL; + } + return mod; +} +EXPORT_SYMBOL_GPL(__module_text_address); + +/* + * Print the name and taint/state flags of every module on the list, plus + * the last unloaded module if one is recorded. + * Don't grab lock, we're oopsing. + */ +void print_modules(void) +{ + struct module *mod; + char buf[8]; /* scratch space for module_flags() */ + + printk(KERN_DEFAULT "Modules linked in:"); + /* Most callers should already have preempt disabled, but make sure */ + preempt_disable(); + list_for_each_entry_rcu(mod, &modules, list) + printk(" %s%s", mod->name, module_flags(mod, buf)); + preempt_enable(); + if (last_unloaded_module[0]) + printk(" [last unloaded: %s]", last_unloaded_module); + printk("\n"); +} + +#ifdef CONFIG_MODVERSIONS +/* Generate the signature for all relevant module structures here. + * If these change, we don't want to try to parse the module. + * The body is intentionally empty: only the prototype matters. */ +void module_layout(struct module *mod, + struct modversion_info *ver, + struct kernel_param *kp, + struct kernel_symbol *ks, + struct tracepoint * const *tp) +{ +} +EXPORT_SYMBOL(module_layout); +#endif + +#ifdef CONFIG_TRACEPOINTS +void module_update_tracepoints(void) +{ + struct module *mod; + + mutex_lock(&module_mutex); + list_for_each_entry(mod, &modules, list) + if (!mod->taints) /* tainted modules are skipped */ + tracepoint_update_probe_range(mod->tracepoints_ptrs, + mod->tracepoints_ptrs + mod->num_tracepoints); + mutex_unlock(&module_mutex); +} + +/* + * Returns 0 if current not found. 
+ * Returns 1 if current found. + */ +int module_get_iter_tracepoints(struct tracepoint_iter *iter) +{ + struct module *iter_mod; + int found = 0; + + mutex_lock(&module_mutex); + list_for_each_entry(iter_mod, &modules, list) { + if (!iter_mod->taints) { /* tainted modules are skipped */ + /* + * Sorted module list: modules that compare below the + * iterator's current module are skipped; on reaching a + * later module the tracepoint cursor is reset to NULL + * (presumably so that its range is scanned from the start). + */ + if (iter_mod < iter->module) + continue; + else if (iter_mod > iter->module) + iter->tracepoint = NULL; + found = tracepoint_get_iter_range(&iter->tracepoint, + iter_mod->tracepoints_ptrs, + iter_mod->tracepoints_ptrs + + iter_mod->num_tracepoints); + if (found) { + iter->module = iter_mod; /* remember which module the cursor is in */ + break; + } + } + } + mutex_unlock(&module_mutex); + return found; +} +#endif diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c new file mode 100644 index 00000000..73da83af --- /dev/null +++ b/kernel/mutex-debug.c @@ -0,0 +1,110 @@ +/* + * kernel/mutex-debug.c + * + * Debugging code for mutexes + * + * Started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * lock debugging, locking tree, deadlock detection started by: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mutex-debug.h" + +/* + * Must be called with lock->wait_lock held. 
+ */ +void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +{ + memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); /* fill with the init pattern */ + waiter->magic = waiter; /* self-pointer checked in debug_mutex_wake_waiter() */ + INIT_LIST_HEAD(&waiter->list); +} +/* + * Sanity checks before a waiter is woken: wait_lock is held, the wait + * list is not empty, the waiter's magic is intact and it is still queued. + */ +void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) +{ + SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list)); + DEBUG_LOCKS_WARN_ON(waiter->magic != waiter); + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); +} +/* Warn if the waiter is still queued, then overwrite it with the MUTEX_DEBUG_FREE pattern. */ +void debug_mutex_free_waiter(struct mutex_waiter *waiter) +{ + DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list)); + memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); +} +/* Record, with wait_lock held, that ti's task is blocked on this waiter. */ +void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti) +{ + SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock)); + + /* Mark the current thread as blocked on the lock: */ + ti->task->blocked_on = waiter; +} +/* + * Debug version of mutex_remove_waiter(): checks that the waiter is queued + * and belongs to ti's task, clears blocked_on and unlinks the waiter. + */ +void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti) +{ + DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); + DEBUG_LOCKS_WARN_ON(waiter->task != ti->task); + DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter); + ti->task->blocked_on = NULL; + + list_del_init(&waiter->list); + waiter->task = NULL; +} +/* + * Check magic, ownership and wait-list initialisation on unlock, then clear + * the owner. When debug_locks is off the function returns before the checks + * and before clearing the owner. + */ +void debug_mutex_unlock(struct mutex *lock) +{ + if (unlikely(!debug_locks)) + return; + + DEBUG_LOCKS_WARN_ON(lock->magic != lock); + DEBUG_LOCKS_WARN_ON(lock->owner != current); + DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); + mutex_clear_owner(lock); +} +/* Debug part of mutex initialisation: lockdep map setup (if configured) and the magic self-pointer. */ +void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + lock->magic = lock; +} + +/*** + * mutex_destroy - mark a mutex unusable + * @lock: the mutex to be destroyed + * + 
* This function marks the mutex uninitialized, and any subsequent + * use of the mutex is forbidden. The mutex must not be locked when + * this function is called. + */ +void mutex_destroy(struct mutex *lock) +{ + DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock)); + lock->magic = NULL; /* later magic checks (e.g. in spin_lock_mutex()) will now warn */ +} + +EXPORT_SYMBOL_GPL(mutex_destroy); diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h new file mode 100644 index 00000000..0799fd3e --- /dev/null +++ b/kernel/mutex-debug.h @@ -0,0 +1,55 @@ +/* + * Mutexes: blocking mutual exclusion locks + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * This file contains mutex debugging related internal declarations, + * prototypes and inline functions, for the CONFIG_DEBUG_MUTEXES case. + * More details are in kernel/mutex-debug.c. + */ + +/* + * This must be called with lock->wait_lock held. + */ +extern void debug_mutex_lock_common(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_wake_waiter(struct mutex *lock, + struct mutex_waiter *waiter); +extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); +extern void debug_mutex_add_waiter(struct mutex *lock, + struct mutex_waiter *waiter, + struct thread_info *ti); +extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, + struct thread_info *ti); +extern void debug_mutex_unlock(struct mutex *lock); +extern void debug_mutex_init(struct mutex *lock, const char *name, + struct lock_class_key *key); +/* Owner tracking: record current as, or clear, the task holding the mutex. */ +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current; +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +/* + * Debug wait_lock helpers: the raw arch spinlock is taken with interrupts + * saved and disabled, warning when called from interrupt context or on a + * mutex whose magic does not point back at itself. + */ +#define spin_lock_mutex(lock, flags) \ + do { \ + struct mutex *l = container_of(lock, struct mutex, wait_lock); \ + \ + DEBUG_LOCKS_WARN_ON(in_interrupt()); \ + local_irq_save(flags); \ + arch_spin_lock(&(lock)->rlock.raw_lock);\ + DEBUG_LOCKS_WARN_ON(l->magic != l); \ + } while (0) + +#define 
spin_unlock_mutex(lock, flags) \ + do { \ + arch_spin_unlock(&(lock)->rlock.raw_lock); \ + local_irq_restore(flags); \ + preempt_check_resched(); \ + } while (0) diff --git a/kernel/mutex.c b/kernel/mutex.c new file mode 100644 index 00000000..d607ed5d --- /dev/null +++ b/kernel/mutex.c @@ -0,0 +1,500 @@ +/* + * kernel/mutex.c + * + * Mutexes: blocking mutual exclusion locks + * + * Started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * Many thanks to Arjan van de Ven, Thomas Gleixner, Steven Rostedt and + * David Howells for suggestions and improvements. + * + * - Adaptive spinning for mutexes by Peter Zijlstra. (Ported to mainline + * from the -rt tree, where it was originally implemented for rtmutexes + * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale + * and Sven Dietrich.) + * + * Also see Documentation/mutex-design.txt. + */ +#include +#include +#include +#include +#include +#include + +/* + * In the DEBUG case we are using the "NULL fastpath" for mutexes, + * which forces all calls into the slowpath: + */ +#ifdef CONFIG_DEBUG_MUTEXES +# include "mutex-debug.h" +# include +#else +# include "mutex.h" +# include +#endif +/* + * Initialise a mutex: count 1 (the unlocked value the lock fastpath starts + * from), an initialised wait_lock, an empty wait list and no owner, + * followed by the debug/lockdep setup in debug_mutex_init(). + */ +void +__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) +{ + atomic_set(&lock->count, 1); + spin_lock_init(&lock->wait_lock); + INIT_LIST_HEAD(&lock->wait_list); + mutex_clear_owner(lock); + + debug_mutex_init(lock, name, key); +} + +EXPORT_SYMBOL(__mutex_init); + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * We split the mutex lock/unlock logic into separate fastpath and + * slowpath functions, to reduce the register pressure on the fastpath. + * We also put the fastpath first in the kernel image, to make sure the + * branch is predicted by the CPU as default-untaken. 
+ */ +static __used noinline void __sched +__mutex_lock_slowpath(atomic_t *lock_count); + +/** + * mutex_lock - acquire the mutex + * @lock: the mutex to be acquired + * + * Lock the mutex exclusively for this task. If the mutex is not + * available right now, it will sleep until it can get it. + * + * The mutex must later on be released by the same task that + * acquired it. Recursive locking is not allowed. The task + * may not exit without first unlocking the mutex. Also, kernel + * memory where the mutex resides must not be freed with + * the mutex still locked. The mutex must first be initialized + * (or statically defined) before it can be locked. memset()-ing + * the mutex to 0 is not allowed. + * + * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging + * checks that will enforce the restrictions and will also do + * deadlock debugging. ) + * + * This function is similar to (but not equivalent to) down(). + */ +void __sched mutex_lock(struct mutex *lock) +{ + might_sleep(); + /* + * The locking fastpath is the 1->0 transition from + * 'unlocked' into 'locked' state. + */ + __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); + mutex_set_owner(lock); /* record current as the owner */ +} + +EXPORT_SYMBOL(mutex_lock); +#endif +/* Forward declaration: the slowpath is defined after its user mutex_unlock(). */ +static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); + +/** + * mutex_unlock - release the mutex + * @lock: the mutex to be released + * + * Unlock a mutex that has been locked by this task previously. + * + * This function must not be used in interrupt context. Unlocking + * of a not locked mutex is not allowed. + * + * This function is similar to (but not equivalent to) up(). 
+ */ +void __sched mutex_unlock(struct mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ +#ifndef CONFIG_DEBUG_MUTEXES + /* + * When debugging is enabled we must not clear the owner before time, + * the slow path will always be taken, and that clears the owner field + * after verifying that it was indeed current. + */ + mutex_clear_owner(lock); +#endif + __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); +} + +EXPORT_SYMBOL(mutex_unlock); + +/* + * Lock a mutex (possibly interruptible), slowpath: + * + * @state is the task state to sleep in, @subclass and @nest_lock are + * passed to lockdep and @ip is the caller address used for lockdep + * reporting. Returns 0 once the lock is held, or -EINTR when + * signal_pending_state(@state, current) becomes true while waiting. + */ +static inline int __sched +__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, + struct lockdep_map *nest_lock, unsigned long ip) +{ + struct task_struct *task = current; + struct mutex_waiter waiter; + unsigned long flags; + + preempt_disable(); + mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + /* + * Optimistic spinning. + * + * We try to spin for acquisition when we find that there are no + * pending waiters and the lock owner is currently running on a + * (different) CPU. + * + * The rationale is that if the lock owner is running, it is likely to + * release the lock soon. + * + * Since this needs the lock owner, and this mutex implementation + * doesn't track the owner atomically in the lock field, we need to + * track it non-atomically. + * + * We can't do this for DEBUG_MUTEXES because that relies on wait_lock + * to serialize everything. + */ + + for (;;) { + struct task_struct *owner; + + /* + * If there's an owner, wait for it to either + * release the lock or go to sleep. 
+ */ + owner = ACCESS_ONCE(lock->owner); + if (owner && !mutex_spin_on_owner(lock, owner)) + break; + /* 1 -> 0: the lock was free and is now ours without queueing */ + if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + lock_acquired(&lock->dep_map, ip); + mutex_set_owner(lock); + preempt_enable(); + return 0; + } + + /* + * When there's no owner, we might have preempted between the + * owner acquiring the lock and setting the owner field. If + * we're an RT task that will live-lock because we won't let + * the owner complete. + */ + if (!owner && (need_resched() || rt_task(task))) + break; + + /* + * The cpu_relax() call is a compiler barrier which forces + * everything in this loop to be re-loaded. We don't need + * memory barriers as we'll eventually observe the right + * values at the cost of a few extra spins. + */ + arch_mutex_cpu_relax(); + } +#endif + spin_lock_mutex(&lock->wait_lock, flags); + + debug_mutex_lock_common(lock, &waiter); + debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); + + /* add waiting tasks to the end of the waitqueue (FIFO): */ + list_add_tail(&waiter.list, &lock->wait_list); + waiter.task = task; + + if (atomic_xchg(&lock->count, -1) == 1) + goto done; + + lock_contended(&lock->dep_map, ip); + + for (;;) { + /* + * Let's try to take the lock again - this is needed even if + * we get here for the first time (shortly after failing to + * acquire the lock), to make sure that we get a wakeup once + * it's unlocked. Later on, if we sleep, this is the + * operation that gives us the lock. We xchg it to -1, so + * that when we release the lock, we properly wake up the + * other waiters: + */ + if (atomic_xchg(&lock->count, -1) == 1) + break; + + /* + * got a signal? (This code gets eliminated in the + * TASK_UNINTERRUPTIBLE case.) 
+ */ + if (unlikely(signal_pending_state(state, task))) { + mutex_remove_waiter(lock, &waiter, + task_thread_info(task)); + mutex_release(&lock->dep_map, 1, ip); + spin_unlock_mutex(&lock->wait_lock, flags); + + debug_mutex_free_waiter(&waiter); + preempt_enable(); + return -EINTR; + } + __set_task_state(task, state); + + /* didn't get the lock, go to sleep: */ + spin_unlock_mutex(&lock->wait_lock, flags); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + spin_lock_mutex(&lock->wait_lock, flags); + } + +done: + lock_acquired(&lock->dep_map, ip); + /* got the lock - rejoice! */ + mutex_remove_waiter(lock, &waiter, current_thread_info()); + mutex_set_owner(lock); + + /* set it to 0 if there are no waiters left: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); + + spin_unlock_mutex(&lock->wait_lock, flags); + + debug_mutex_free_waiter(&waiter); + preempt_enable(); + + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC /* lockdep variants: each calls __mutex_lock_common() directly */ +void __sched +mutex_lock_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_); +} + +EXPORT_SYMBOL_GPL(mutex_lock_nested); +/* Uninterruptible lock that passes a nest lock map instead of a subclass. */ +void __sched +_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + might_sleep(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_); +} + +EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); +/* As mutex_lock_nested(), sleeping in TASK_KILLABLE; returns the __mutex_lock_common() result. */ +int __sched +mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_); +} +EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); +/* As mutex_lock_nested(), sleeping in TASK_INTERRUPTIBLE; returns the __mutex_lock_common() result. */ +int __sched +mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) +{ + might_sleep(); + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, + subclass, NULL, _RET_IP_); +} + +EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); +#endif + +/* + * Release the lock, slowpath: under wait_lock, run the lockdep/debug + * release hooks, set count back to 1 where the fastpath left it locked, + * and wake the first waiter. The waiter is not unlinked here; it removes + * itself in __mutex_lock_common() once it owns the lock. + */ +static inline void 
+__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; + + spin_lock_mutex(&lock->wait_lock, flags); + mutex_release(&lock->dep_map, nested, _RET_IP_); + debug_mutex_unlock(lock); + + /* + * some architectures leave the lock unlocked in the fastpath failure + * case, others need to leave it locked. In the latter case we have to + * unlock it here + */ + if (__mutex_slowpath_needs_to_unlock()) + atomic_set(&lock->count, 1); + + if (!list_empty(&lock->wait_list)) { + /* get the first entry from the wait-list: */ + struct mutex_waiter *waiter = + list_entry(lock->wait_list.next, + struct mutex_waiter, list); + + debug_mutex_wake_waiter(lock, waiter); + + wake_up_process(waiter->task); + } + + spin_unlock_mutex(&lock->wait_lock, flags); +} + +/* + * Release the lock, slowpath: out-of-line entry point that mutex_unlock() + * hands to __mutex_fastpath_unlock(). + */ +static __used noinline void +__mutex_unlock_slowpath(atomic_t *lock_count) +{ + __mutex_unlock_common_slowpath(lock_count, 1); +} + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * Here come the less common (and hence less performance-critical) APIs: + * mutex_lock_interruptible() and mutex_trylock(). + */ +static noinline int __sched +__mutex_lock_killable_slowpath(atomic_t *lock_count); + +static noinline int __sched +__mutex_lock_interruptible_slowpath(atomic_t *lock_count); + +/** + * mutex_lock_interruptible - acquire the mutex, interruptible + * @lock: the mutex to be acquired + * + * Lock the mutex like mutex_lock(), and return 0 if the mutex has + * been acquired or sleep until the mutex becomes available. If a + * signal arrives while waiting for the lock then this function + * returns -EINTR. + * + * This function is similar to (but not equivalent to) down_interruptible(). 
+ */ +int __sched mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + might_sleep(); + ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_interruptible_slowpath); + if (!ret) /* 0 means the lock was acquired */ + mutex_set_owner(lock); + + return ret; +} + +EXPORT_SYMBOL(mutex_lock_interruptible); +/* + * mutex_lock_killable - like mutex_lock_interruptible(), except that the + * slowpath sleeps in TASK_KILLABLE instead of TASK_INTERRUPTIBLE. + */ +int __sched mutex_lock_killable(struct mutex *lock) +{ + int ret; + + might_sleep(); + ret = __mutex_fastpath_lock_retval + (&lock->count, __mutex_lock_killable_slowpath); + if (!ret) + mutex_set_owner(lock); + + return ret; +} +EXPORT_SYMBOL(mutex_lock_killable); +/* + * Out-of-line slowpaths: each recovers the mutex from its count field and + * calls __mutex_lock_common() with the matching task state. + */ +static __used noinline void __sched +__mutex_lock_slowpath(atomic_t *lock_count) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_); +} + +static noinline int __sched +__mutex_lock_killable_slowpath(atomic_t *lock_count) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + + return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_); +} + +static noinline int __sched +__mutex_lock_interruptible_slowpath(atomic_t *lock_count) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + + return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_); +} +#endif + +/* + * Spinlock based trylock, we take the spinlock and check whether we + * can get the lock: + * + * Returns 1 when the previous count was 1 (the lock was free and is now + * held by the caller), 0 otherwise. + */ +static inline int __mutex_trylock_slowpath(atomic_t *lock_count) +{ + struct mutex *lock = container_of(lock_count, struct mutex, count); + unsigned long flags; + int prev; + + spin_lock_mutex(&lock->wait_lock, flags); + + prev = atomic_xchg(&lock->count, -1); + if (likely(prev == 1)) { + mutex_set_owner(lock); + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + } + + /* Set it back to 0 if there are no waiters: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); + + spin_unlock_mutex(&lock->wait_lock, flags); + + return prev == 1; +} + +/** + * mutex_trylock - try to acquire the mutex, 
without waiting + * @lock: the mutex to be acquired + * + * Try to acquire the mutex atomically. Returns 1 if the mutex + * has been acquired successfully, and 0 on contention. + * + * NOTE: this function follows the spin_trylock() convention, so + * it is negated from the down_trylock() return values! Be careful + * about this when converting semaphore users to mutexes. + * + * This function must not be used in interrupt context. The + * mutex must be released by the same task that acquired it. + */ +int __sched mutex_trylock(struct mutex *lock) +{ + int ret; + + ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); + if (ret) /* acquired: record current as the owner */ + mutex_set_owner(lock); + + return ret; +} +EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * Returns 1 with @lock held if the decrement brought @cnt to 0; + * returns 0, without holding @lock, otherwise. + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0, i.e. unless cnt is currently 1 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/mutex.h b/kernel/mutex.h new file mode 100644 index 00000000..4115fbf8 --- /dev/null +++ b/kernel/mutex.h @@ -0,0 +1,48 @@ +/* + * Mutexes: blocking mutual exclusion locks + * + * started by Ingo Molnar: + * + * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar + * + * This file contains mutex debugging related internal prototypes, for the + * !CONFIG_DEBUG_MUTEXES case. 
Most of them are NOPs: + */ + +#define spin_lock_mutex(lock, flags) \ + do { spin_lock(lock); (void)(flags); } while (0) +#define spin_unlock_mutex(lock, flags) \ + do { spin_unlock(lock); (void)(flags); } while (0) +#define mutex_remove_waiter(lock, waiter, ti) \ + __list_del((waiter)->list.prev, (waiter)->list.next) + +#ifdef CONFIG_SMP +static inline void mutex_set_owner(struct mutex *lock) +{ + lock->owner = current; +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ + lock->owner = NULL; +} +#else +static inline void mutex_set_owner(struct mutex *lock) +{ +} + +static inline void mutex_clear_owner(struct mutex *lock) +{ +} +#endif + +#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) +#define debug_mutex_free_waiter(waiter) do { } while (0) +#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +#define debug_mutex_unlock(lock) do { } while (0) +#define debug_mutex_init(lock, name, key) do { } while (0) + +static inline void +debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter) +{ +} diff --git a/kernel/notifier.c b/kernel/notifier.c new file mode 100644 index 00000000..2488ba7e --- /dev/null +++ b/kernel/notifier.c @@ -0,0 +1,586 @@ +#include +#include +#include +#include +#include +#include +#include + +/* + * Notifier list for kernel code which wants to be called + * at shutdown. This is used to stop any idling DMA operations + * and the like. + */ +BLOCKING_NOTIFIER_HEAD(reboot_notifier_list); + +/* + * Notifier chain core routines. The exported routines below + * are layered on top of these, with appropriate locking added. 
+ */
+
+/*
+ * Link @n into the chain headed at @nl.  Entries stay ordered by
+ * descending priority; @n is placed after existing entries of equal
+ * priority.  Always returns 0.
+ */
+static int notifier_chain_register(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	struct notifier_block **link;
+
+	/* Stop at the first entry whose priority is strictly lower. */
+	for (link = nl; *link != NULL; link = &(*link)->next) {
+		if ((*link)->priority < n->priority)
+			break;
+	}
+
+	n->next = *link;
+	rcu_assign_pointer(*link, n);
+	return 0;
+}
+
+/*
+ * Same as notifier_chain_register(), except that nothing is inserted if
+ * @n is met while walking to its insertion point.  Always returns 0.
+ */
+static int notifier_chain_cond_register(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	struct notifier_block **link;
+
+	for (link = nl; *link != NULL; link = &(*link)->next) {
+		/* Already on the chain: leave it untouched. */
+		if (*link == n)
+			return 0;
+		if ((*link)->priority < n->priority)
+			break;
+	}
+
+	n->next = *link;
+	rcu_assign_pointer(*link, n);
+	return 0;
+}
+
+/*
+ * Unlink @n from the chain headed at @nl.
+ * Returns 0 if @n was found and removed, -ENOENT if it is not on the chain.
+ */
+static int notifier_chain_unregister(struct notifier_block **nl,
+		struct notifier_block *n)
+{
+	struct notifier_block **link;
+
+	for (link = nl; *link != NULL; link = &(*link)->next) {
+		if (*link != n)
+			continue;
+
+		/* Make the predecessor's link skip over @n. */
+		rcu_assign_pointer(*link, n->next);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/**
+ * notifier_call_chain - Informs the registered notifiers about an event.
+ *	@nl:		Pointer to head of the blocking notifier chain
+ *	@val:		Value passed unmodified to notifier function
+ *	@v:		Pointer passed unmodified to notifier function
+ *	@nr_to_call:	Number of notifier functions to be called. Don't care
+ *			value of this parameter is -1.
+ *	@nr_calls:	Records the number of notifications sent. Don't care
+ *			value of this field is NULL.
+ *	@returns:	notifier_call_chain returns the value returned by the
+ *			last notifier function called. 
+ */
+static int __kprobes notifier_call_chain(struct notifier_block **nl,
+					unsigned long val, void *v,
+					int nr_to_call,	int *nr_calls)
+{
+	int ret = NOTIFY_DONE;	/* returned as-is when no notifier gets called */
+	struct notifier_block *nb, *next_nb;
+
+	nb = rcu_dereference_raw(*nl);
+
+	while (nb && nr_to_call) {
+		next_nb = rcu_dereference_raw(nb->next);	/* read before the callback runs */
+
+#ifdef CONFIG_DEBUG_NOTIFIERS
+		if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) {
+			WARN(1, "Invalid notifier called!");
+			nb = next_nb;	/* skipped entry: nr_to_call and *nr_calls are not touched */
+			continue;
+		}
+#endif
+		ret = nb->notifier_call(nb, val, v);
+
+		if (nr_calls)
+			(*nr_calls)++;	/* counts the stopping notifier too */
+
+		if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
+			break;	/* this notifier asked to stop the walk */
+		nb = next_nb;
+		nr_to_call--;
+	}
+	return ret;
+}
+
+/*
+ *	Atomic notifier chain routines.  Registration and unregistration
+ *	use a spinlock, and call_chain is synchronized by RCU (no locks).
+ */
+
+/**
+ *	atomic_notifier_chain_register - Add notifier to an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@n: New entry in notifier chain
+ *
+ *	Adds a notifier to an atomic notifier chain.
+ *	The chain is modified with @nh->lock taken via spin_lock_irqsave().
+ *
+ *	Currently always returns zero.
+ */
+int atomic_notifier_chain_register(struct atomic_notifier_head *nh,
+		struct notifier_block *n)
+{
+	unsigned long flags;
+	int ret;
+
+	spin_lock_irqsave(&nh->lock, flags);
+	ret = notifier_chain_register(&nh->head, n);
+	spin_unlock_irqrestore(&nh->lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(atomic_notifier_chain_register);
+
+/**
+ *	atomic_notifier_chain_unregister - Remove notifier from an atomic notifier chain
+ *	@nh: Pointer to head of the atomic notifier chain
+ *	@n: Entry to remove from notifier chain
+ *
+ *	Removes a notifier from an atomic notifier chain.
+ *
+ *	Returns zero on success or %-ENOENT on failure. 
+ */ +int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, + struct notifier_block *n) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&nh->lock, flags); + ret = notifier_chain_unregister(&nh->head, n); + spin_unlock_irqrestore(&nh->lock, flags); + synchronize_rcu(); + return ret; +} +EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); + +/** + * __atomic_notifier_call_chain - Call functions in an atomic notifier chain + * @nh: Pointer to head of the atomic notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See the comment for notifier_call_chain. + * @nr_calls: See the comment for notifier_call_chain. + * + * Calls each function in a notifier chain in turn. The functions + * run in an atomic context, so they must not block. + * This routine uses RCU to synchronize with changes to the chain. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ +int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + int ret; + + rcu_read_lock(); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain); + +int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh, + unsigned long val, void *v) +{ + return __atomic_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(atomic_notifier_call_chain); + +/* + * Blocking notifier chain routines. All access to the chain is + * synchronized by an rwsem. 
+ */ + +/** + * blocking_notifier_chain_register - Add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a blocking notifier chain. + * Must be called in process context. + * + * Currently always returns zero. + */ +int blocking_notifier_chain_register(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + down_write(&nh->rwsem); + ret = notifier_chain_register(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_register); + +/** + * blocking_notifier_chain_cond_register - Cond add notifier to a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a blocking notifier chain, only if not already + * present in the chain. + * Must be called in process context. + * + * Currently always returns zero. + */ +int blocking_notifier_chain_cond_register(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + down_write(&nh->rwsem); + ret = notifier_chain_cond_register(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_cond_register); + +/** + * blocking_notifier_chain_unregister - Remove notifier from a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from a blocking notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. 
+ */ +int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call down_write(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + down_write(&nh->rwsem); + ret = notifier_chain_unregister(&nh->head, n); + up_write(&nh->rwsem); + return ret; +} +EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); + +/** + * __blocking_notifier_call_chain - Call functions in a blocking notifier chain + * @nh: Pointer to head of the blocking notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain. + * + * Calls each function in a notifier chain in turn. The functions + * run in a process context, so they are allowed to block. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. 
+ */ +int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + int ret = NOTIFY_DONE; + + /* + * We check the head outside the lock, but if this access is + * racy then it does not matter what the result of the test + * is, we re-check the list after having taken the lock anyway: + */ + if (rcu_dereference_raw(nh->head)) { + down_read(&nh->rwsem); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, + nr_calls); + up_read(&nh->rwsem); + } + return ret; +} +EXPORT_SYMBOL_GPL(__blocking_notifier_call_chain); + +int blocking_notifier_call_chain(struct blocking_notifier_head *nh, + unsigned long val, void *v) +{ + return __blocking_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(blocking_notifier_call_chain); + +/* + * Raw notifier chain routines. There is no protection; + * the caller must provide it. Use at your own risk! + */ + +/** + * raw_notifier_chain_register - Add notifier to a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to a raw notifier chain. + * All locking must be provided by the caller. + * + * Currently always returns zero. + */ +int raw_notifier_chain_register(struct raw_notifier_head *nh, + struct notifier_block *n) +{ + return notifier_chain_register(&nh->head, n); +} +EXPORT_SYMBOL_GPL(raw_notifier_chain_register); + +/** + * raw_notifier_chain_unregister - Remove notifier from a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from a raw notifier chain. + * All locking must be provided by the caller. + * + * Returns zero on success or %-ENOENT on failure. 
+ */ +int raw_notifier_chain_unregister(struct raw_notifier_head *nh, + struct notifier_block *n) +{ + return notifier_chain_unregister(&nh->head, n); +} +EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); + +/** + * __raw_notifier_call_chain - Call functions in a raw notifier chain + * @nh: Pointer to head of the raw notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain + * + * Calls each function in a notifier chain in turn. The functions + * run in an undefined context. + * All locking must be provided by the caller. + * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ +int __raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + return notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); +} +EXPORT_SYMBOL_GPL(__raw_notifier_call_chain); + +int raw_notifier_call_chain(struct raw_notifier_head *nh, + unsigned long val, void *v) +{ + return __raw_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(raw_notifier_call_chain); + +/* + * SRCU notifier chain routines. Registration and unregistration + * use a mutex, and call_chain is synchronized by SRCU (no locks). + */ + +/** + * srcu_notifier_chain_register - Add notifier to an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: New entry in notifier chain + * + * Adds a notifier to an SRCU notifier chain. + * Must be called in process context. + * + * Currently always returns zero. 
+ */ +int srcu_notifier_chain_register(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_register(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_register(&nh->head, n); + mutex_unlock(&nh->mutex); + return ret; +} +EXPORT_SYMBOL_GPL(srcu_notifier_chain_register); + +/** + * srcu_notifier_chain_unregister - Remove notifier from an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @n: Entry to remove from notifier chain + * + * Removes a notifier from an SRCU notifier chain. + * Must be called from process context. + * + * Returns zero on success or %-ENOENT on failure. + */ +int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, + struct notifier_block *n) +{ + int ret; + + /* + * This code gets used during boot-up, when task switching is + * not yet working and interrupts must remain disabled. At + * such times we must not call mutex_lock(). + */ + if (unlikely(system_state == SYSTEM_BOOTING)) + return notifier_chain_unregister(&nh->head, n); + + mutex_lock(&nh->mutex); + ret = notifier_chain_unregister(&nh->head, n); + mutex_unlock(&nh->mutex); + synchronize_srcu(&nh->srcu); + return ret; +} +EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); + +/** + * __srcu_notifier_call_chain - Call functions in an SRCU notifier chain + * @nh: Pointer to head of the SRCU notifier chain + * @val: Value passed unmodified to notifier function + * @v: Pointer passed unmodified to notifier function + * @nr_to_call: See comment for notifier_call_chain. + * @nr_calls: See comment for notifier_call_chain + * + * Calls each function in a notifier chain in turn. The functions + * run in a process context, so they are allowed to block. 
+ * + * If the return value of the notifier can be and'ed + * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() + * will return immediately, with the return value of + * the notifier function which halted execution. + * Otherwise the return value is the return value + * of the last notifier function called. + */ +int __srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v, + int nr_to_call, int *nr_calls) +{ + int ret; + int idx; + + idx = srcu_read_lock(&nh->srcu); + ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); + srcu_read_unlock(&nh->srcu, idx); + return ret; +} +EXPORT_SYMBOL_GPL(__srcu_notifier_call_chain); + +int srcu_notifier_call_chain(struct srcu_notifier_head *nh, + unsigned long val, void *v) +{ + return __srcu_notifier_call_chain(nh, val, v, -1, NULL); +} +EXPORT_SYMBOL_GPL(srcu_notifier_call_chain); + +/** + * srcu_init_notifier_head - Initialize an SRCU notifier head + * @nh: Pointer to head of the srcu notifier chain + * + * Unlike other sorts of notifier heads, SRCU notifier heads require + * dynamic initialization. Be sure to call this routine before + * calling any of the other SRCU notifier routines for this head. + * + * If an SRCU notifier head is deallocated, it must first be cleaned + * up by calling srcu_cleanup_notifier_head(). Otherwise the head's + * per-cpu data (used by the SRCU mechanism) will leak. + */ +void srcu_init_notifier_head(struct srcu_notifier_head *nh) +{ + mutex_init(&nh->mutex); + if (init_srcu_struct(&nh->srcu) < 0) + BUG(); + nh->head = NULL; +} +EXPORT_SYMBOL_GPL(srcu_init_notifier_head); + +/** + * register_reboot_notifier - Register function to be called at reboot time + * @nb: Info about notifier function to be called + * + * Registers a function with the list of functions + * to be called at reboot time. + * + * Currently always returns zero, as blocking_notifier_chain_register() + * always returns zero. 
+ */ +int register_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(register_reboot_notifier); + +/** + * unregister_reboot_notifier - Unregister previously registered reboot notifier + * @nb: Hook to be unregistered + * + * Unregisters a previously registered reboot + * notifier function. + * + * Returns zero on success, or %-ENOENT on failure. + */ +int unregister_reboot_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&reboot_notifier_list, nb); +} +EXPORT_SYMBOL(unregister_reboot_notifier); + +static ATOMIC_NOTIFIER_HEAD(die_chain); + +int notrace __kprobes notify_die(enum die_val val, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + return atomic_notifier_call_chain(&die_chain, val, &args); +} + +int register_die_notifier(struct notifier_block *nb) +{ + vmalloc_sync_all(); + return atomic_notifier_chain_register(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(register_die_notifier); + +int unregister_die_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&die_chain, nb); +} +EXPORT_SYMBOL_GPL(unregister_die_notifier); diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c new file mode 100644 index 00000000..d6a00f3d --- /dev/null +++ b/kernel/nsproxy.c @@ -0,0 +1,280 @@ +/* + * Copyright (C) 2006 IBM Corporation + * + * Author: Serge Hallyn + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + * + * Jun 2006 - namespaces support + * OpenVZ, SWsoft Inc. 
+ * Pavel Emelianov + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *nsproxy_cachep; + +struct nsproxy init_nsproxy = { + .count = ATOMIC_INIT(1), + .uts_ns = &init_uts_ns, +#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) + .ipc_ns = &init_ipc_ns, +#endif + .mnt_ns = NULL, + .pid_ns = &init_pid_ns, +#ifdef CONFIG_NET + .net_ns = &init_net, +#endif +}; + +static inline struct nsproxy *create_nsproxy(void) +{ + struct nsproxy *nsproxy; + + nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL); + if (nsproxy) + atomic_set(&nsproxy->count, 1); + return nsproxy; +} + +/* + * Create new nsproxy and all of its the associated namespaces. + * Return the newly created nsproxy. Do not attach this to the task, + * leave it to the caller to do proper locking and attach it to task. + */ +static struct nsproxy *create_new_namespaces(unsigned long flags, + struct task_struct *tsk, struct fs_struct *new_fs) +{ + struct nsproxy *new_nsp; + int err; + + new_nsp = create_nsproxy(); + if (!new_nsp) + return ERR_PTR(-ENOMEM); + + new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs); + if (IS_ERR(new_nsp->mnt_ns)) { + err = PTR_ERR(new_nsp->mnt_ns); + goto out_ns; + } + + new_nsp->uts_ns = copy_utsname(flags, tsk); + if (IS_ERR(new_nsp->uts_ns)) { + err = PTR_ERR(new_nsp->uts_ns); + goto out_uts; + } + + new_nsp->ipc_ns = copy_ipcs(flags, tsk); + if (IS_ERR(new_nsp->ipc_ns)) { + err = PTR_ERR(new_nsp->ipc_ns); + goto out_ipc; + } + + new_nsp->pid_ns = copy_pid_ns(flags, task_active_pid_ns(tsk)); + if (IS_ERR(new_nsp->pid_ns)) { + err = PTR_ERR(new_nsp->pid_ns); + goto out_pid; + } + + new_nsp->net_ns = copy_net_ns(flags, tsk->nsproxy->net_ns); + if (IS_ERR(new_nsp->net_ns)) { + err = PTR_ERR(new_nsp->net_ns); + goto out_net; + } + + return new_nsp; + +out_net: + if (new_nsp->pid_ns) + put_pid_ns(new_nsp->pid_ns); +out_pid: + if (new_nsp->ipc_ns) + 
put_ipc_ns(new_nsp->ipc_ns); +out_ipc: + if (new_nsp->uts_ns) + put_uts_ns(new_nsp->uts_ns); +out_uts: + if (new_nsp->mnt_ns) + put_mnt_ns(new_nsp->mnt_ns); +out_ns: + kmem_cache_free(nsproxy_cachep, new_nsp); + return ERR_PTR(err); +} + +/* + * called from clone. This now handles copy for nsproxy and all + * namespaces therein. + */ +int copy_namespaces(unsigned long flags, struct task_struct *tsk) +{ + struct nsproxy *old_ns = tsk->nsproxy; + struct nsproxy *new_ns; + int err = 0; + + if (!old_ns) + return 0; + + get_nsproxy(old_ns); + + if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWPID | CLONE_NEWNET))) + return 0; + + if (!capable(CAP_SYS_ADMIN)) { + err = -EPERM; + goto out; + } + + /* + * CLONE_NEWIPC must detach from the undolist: after switching + * to a new ipc namespace, the semaphore arrays from the old + * namespace are unreachable. In clone parlance, CLONE_SYSVSEM + * means share undolist with parent, so we must forbid using + * it along with CLONE_NEWIPC. + */ + if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { + err = -EINVAL; + goto out; + } + + new_ns = create_new_namespaces(flags, tsk, tsk->fs); + if (IS_ERR(new_ns)) { + err = PTR_ERR(new_ns); + goto out; + } + + tsk->nsproxy = new_ns; + +out: + put_nsproxy(old_ns); + return err; +} + +void free_nsproxy(struct nsproxy *ns) +{ + if (ns->mnt_ns) + put_mnt_ns(ns->mnt_ns); + if (ns->uts_ns) + put_uts_ns(ns->uts_ns); + if (ns->ipc_ns) + put_ipc_ns(ns->ipc_ns); + if (ns->pid_ns) + put_pid_ns(ns->pid_ns); + put_net(ns->net_ns); + kmem_cache_free(nsproxy_cachep, ns); +} + +/* + * Called from unshare. Unshare all the namespaces part of nsproxy. + * On success, returns the new nsproxy. 
+ */
+int unshare_nsproxy_namespaces(unsigned long unshare_flags,
+		struct nsproxy **new_nsp, struct fs_struct *new_fs)
+{
+	const unsigned long ns_flags = CLONE_NEWNS | CLONE_NEWUTS |
+				       CLONE_NEWIPC | CLONE_NEWNET;
+	struct nsproxy *created;
+
+	/* No namespace flag requested: nothing to do, *new_nsp untouched. */
+	if ((unshare_flags & ns_flags) == 0)
+		return 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	created = create_new_namespaces(unshare_flags, current,
+					new_fs ? new_fs : current->fs);
+
+	/* The result is handed back even when it is an ERR_PTR() value. */
+	*new_nsp = created;
+
+	return IS_ERR(created) ? PTR_ERR(created) : 0;
+}
+
+/*
+ * Make @new the nsproxy of task @p and drop the task's reference on the
+ * nsproxy it used before.  May sleep.
+ */
+void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
+{
+	struct nsproxy *old;
+
+	might_sleep();
+
+	old = p->nsproxy;
+
+	rcu_assign_pointer(p->nsproxy, new);
+
+	/* Nothing to release unless we just dropped the last reference. */
+	if (!old || !atomic_dec_and_test(&old->count))
+		return;
+
+	/*
+	 * wait for others to get what they want from this nsproxy.
+	 *
+	 * cannot release this nsproxy via the call_rcu() since
+	 * put_mnt_ns() will want to sleep
+	 */
+	synchronize_rcu();
+	free_nsproxy(old);
+}
+
+/* Detach task @p from its nsproxy by switching it to NULL. */
+void exit_task_namespaces(struct task_struct *p)
+{
+	switch_task_namespaces(p, NULL);
+}
+
+/*
+ * setns: install the namespace referred to by @fd into the calling task.
+ * A non-zero @nstype must match the type of that namespace, otherwise
+ * -EINVAL is returned.  Requires CAP_SYS_ADMIN.
+ */
+SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+{
+	struct task_struct *me = current;
+	const struct proc_ns_operations *ns_ops;
+	struct nsproxy *fresh;
+	struct proc_inode *pi;
+	struct file *nsfile;
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	nsfile = proc_ns_fget(fd);
+	if (IS_ERR(nsfile))
+		return PTR_ERR(nsfile);
+
+	pi = PROC_I(nsfile->f_dentry->d_inode);
+	ns_ops = pi->ns_ops;
+	if (nstype && (ns_ops->type != nstype)) {
+		rc = -EINVAL;
+		goto out;
+	}
+
+	/* Flags of 0: build a new nsproxy without any CLONE_NEW* request. */
+	fresh = create_new_namespaces(0, me, me->fs);
+	if (IS_ERR(fresh)) {
+		rc = PTR_ERR(fresh);
+		goto out;
+	}
+
+	rc = ns_ops->install(fresh, pi->ns);
+	if (rc)
+		free_nsproxy(fresh);
+	else
+		switch_task_namespaces(me, fresh);
+out:
+	fput(nsfile);
+	return rc;
+}
+
+/* Create the slab cache that nsproxy objects are allocated from. */
+static int __init nsproxy_cache_init(void)
+{
+	nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
+	return 0;
+}
+
+module_init(nsproxy_cache_init);
diff --git a/kernel/padata.c 
b/kernel/padata.c new file mode 100644 index 00000000..b91941df --- /dev/null +++ b/kernel/padata.c @@ -0,0 +1,1135 @@ +/* + * padata.c - generic interface to process data streams in parallel + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_SEQ_NR (INT_MAX - NR_CPUS) +#define MAX_OBJ_NUM 1000 + +static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) +{ + int cpu, target_cpu; + + target_cpu = cpumask_first(pd->cpumask.pcpu); + for (cpu = 0; cpu < cpu_index; cpu++) + target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu); + + return target_cpu; +} + +static int padata_cpu_hash(struct padata_priv *padata) +{ + int cpu_index; + struct parallel_data *pd; + + pd = padata->pd; + + /* + * Hash the sequence numbers to the cpus by taking + * seq_nr mod. number of cpus in use. 
+ */ + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu); + + return padata_index_to_cpu(pd, cpu_index); +} + +static void padata_parallel_worker(struct work_struct *parallel_work) +{ + struct padata_parallel_queue *pqueue; + struct parallel_data *pd; + struct padata_instance *pinst; + LIST_HEAD(local_list); + + local_bh_disable(); + pqueue = container_of(parallel_work, + struct padata_parallel_queue, work); + pd = pqueue->pd; + pinst = pd->pinst; + + spin_lock(&pqueue->parallel.lock); + list_replace_init(&pqueue->parallel.list, &local_list); + spin_unlock(&pqueue->parallel.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->parallel(padata); + } + + local_bh_enable(); +} + +/** + * padata_do_parallel - padata parallelization function + * + * @pinst: padata instance + * @padata: object to be parallelized + * @cb_cpu: cpu the serialization callback function will run on, + * must be in the serial cpumask of padata(i.e. cpumask.cbcpu). + * + * The parallelization callback function will run with BHs off. + * Note: Every object which is parallelized by padata_do_parallel + * must be seen by padata_do_serial. 
+ */
+int padata_do_parallel(struct padata_instance *pinst,
+		       struct padata_priv *padata, int cb_cpu)
+{
+	int target_cpu, err;
+	struct padata_parallel_queue *queue;
+	struct parallel_data *pd;
+
+	rcu_read_lock_bh();
+
+	/*
+	 * The read-side section is rcu_read_lock_bh(), so the pointer has
+	 * to be fetched with the matching _bh accessor; plain
+	 * rcu_dereference() checks for rcu_read_lock() instead.
+	 */
+	pd = rcu_dereference_bh(pinst->pd);
+
+	/*
+	 * Return values:
+	 *   0       - @padata was queued for parallel processing
+	 *   -EINVAL - instance not initialized or invalid, or @cb_cpu is
+	 *             not in the serial cpumask
+	 *   -EBUSY  - instance is being reset, or MAX_OBJ_NUM objects are
+	 *             already in flight
+	 */
+	err = -EINVAL;
+	if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
+		goto out;
+
+	if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
+		goto out;
+
+	err =  -EBUSY;
+	if ((pinst->flags & PADATA_RESET))
+		goto out;
+
+	if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
+		goto out;
+
+	err = 0;
+	atomic_inc(&pd->refcnt);
+	padata->pd = pd;
+	padata->cb_cpu = cb_cpu;
+
+	/* Wrap the sequence counter so the next number handed out is 0. */
+	if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr))
+		atomic_set(&pd->seq_nr, -1);
+
+	padata->seq_nr = atomic_inc_return(&pd->seq_nr);
+
+	/* The sequence number selects the cpu that does the parallel work. */
+	target_cpu = padata_cpu_hash(padata);
+	queue = per_cpu_ptr(pd->pqueue, target_cpu);
+
+	spin_lock(&queue->parallel.lock);
+	list_add_tail(&padata->list, &queue->parallel.list);
+	spin_unlock(&queue->parallel.lock);
+
+	queue_work_on(target_cpu, pinst->wq, &queue->work);
+
+out:
+	rcu_read_unlock_bh();
+
+	return err;
+}
+EXPORT_SYMBOL(padata_do_parallel);
+
+/*
+ * padata_get_next - Get the next object that needs serialization.
+ *
+ * Return values are:
+ *
+ * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ *
+ * NULL, if all percpu reorder queues are empty.
+ *
+ * -EINPROGRESS, if the next object that needs serialization will
+ *  be parallel processed by another cpu and is not yet present in
+ *  the cpu's reorder queue.
+ *
+ * -ENODATA, if this cpu has to do the parallel processing for
+ *  the next object. 
+ */ +static struct padata_priv *padata_get_next(struct parallel_data *pd) +{ + int cpu, num_cpus; + int next_nr, next_index; + struct padata_parallel_queue *queue, *next_queue; + struct padata_priv *padata; + struct padata_list *reorder; + + num_cpus = cpumask_weight(pd->cpumask.pcpu); + + /* + * Calculate the percpu reorder queue and the sequence + * number of the next object. + */ + next_nr = pd->processed; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + + if (unlikely(next_nr > pd->max_seq_nr)) { + next_nr = next_nr - pd->max_seq_nr - 1; + next_index = next_nr % num_cpus; + cpu = padata_index_to_cpu(pd, next_index); + next_queue = per_cpu_ptr(pd->pqueue, cpu); + pd->processed = 0; + } + + padata = NULL; + + reorder = &next_queue->reorder; + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + BUG_ON(next_nr != padata->seq_nr); + + spin_lock(&reorder->lock); + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + spin_unlock(&reorder->lock); + + pd->processed++; + + goto out; + } + + queue = per_cpu_ptr(pd->pqueue, smp_processor_id()); + if (queue->cpu_index == next_queue->cpu_index) { + padata = ERR_PTR(-ENODATA); + goto out; + } + + padata = ERR_PTR(-EINPROGRESS); +out: + return padata; +} + +static void padata_reorder(struct parallel_data *pd) +{ + struct padata_priv *padata; + struct padata_serial_queue *squeue; + struct padata_instance *pinst = pd->pinst; + + /* + * We need to ensure that only one cpu can work on dequeueing of + * the reorder queue the time. Calculating in which percpu reorder + * queue the next object will arrive takes some time. A spinlock + * would be highly contended. Also it is not clear in which order + * the objects arrive to the reorder queues. So a cpu could wait to + * get the lock just to notice that there is nothing to do at the + * moment. 
Therefore we use a trylock and let the holder of the lock + * care for all the objects enqueued during the holdtime of the lock. + */ + if (!spin_trylock_bh(&pd->lock)) + return; + + while (1) { + padata = padata_get_next(pd); + + /* + * All reorder queues are empty, or the next object that needs + * serialization is parallel processed by another cpu and is + * still on it's way to the cpu's reorder queue, nothing to + * do for now. + */ + if (!padata || PTR_ERR(padata) == -EINPROGRESS) + break; + + /* + * This cpu has to do the parallel processing of the next + * object. It's waiting in the cpu's parallelization queue, + * so exit immediately. + */ + if (PTR_ERR(padata) == -ENODATA) { + del_timer(&pd->timer); + spin_unlock_bh(&pd->lock); + return; + } + + squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu); + + spin_lock(&squeue->serial.lock); + list_add_tail(&padata->list, &squeue->serial.list); + spin_unlock(&squeue->serial.lock); + + queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work); + } + + spin_unlock_bh(&pd->lock); + + /* + * The next object that needs serialization might have arrived to + * the reorder queues in the meantime, we will be called again + * from the timer function if no one else cares for it. 
+ */ + if (atomic_read(&pd->reorder_objects) + && !(pinst->flags & PADATA_RESET)) + mod_timer(&pd->timer, jiffies + HZ); /* objects still pending and no reset in progress: re-arm the reorder timer */ + else + del_timer(&pd->timer); + + return; +} + +static void padata_reorder_timer(unsigned long arg) +{ + struct parallel_data *pd = (struct parallel_data *)arg; /* arg carries the parallel_data pointer */ + + padata_reorder(pd); +} + +static void padata_serial_worker(struct work_struct *serial_work) +{ + struct padata_serial_queue *squeue; + struct parallel_data *pd; + LIST_HEAD(local_list); + + local_bh_disable(); + squeue = container_of(serial_work, struct padata_serial_queue, work); + pd = squeue->pd; + + spin_lock(&squeue->serial.lock); + list_replace_init(&squeue->serial.list, &local_list); /* move all queued objects to a private list */ + spin_unlock(&squeue->serial.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->serial(padata); /* serialization callback, invoked with BHs disabled and the queue lock dropped */ + atomic_dec(&pd->refcnt); + } + local_bh_enable(); +} + +/** + * padata_do_serial - padata serialization function + * + * @padata: object to be serialized. + * + * padata_do_serial must be called for every parallelized object. + * The serialization callback function will run with BHs off. 
+ */ +void padata_do_serial(struct padata_priv *padata) +{ + int cpu; + struct padata_parallel_queue *pqueue; + struct parallel_data *pd; + + pd = padata->pd; + + cpu = get_cpu(); + pqueue = per_cpu_ptr(pd->pqueue, cpu); + + spin_lock(&pqueue->reorder.lock); + atomic_inc(&pd->reorder_objects); + list_add_tail(&padata->list, &pqueue->reorder.list); + spin_unlock(&pqueue->reorder.lock); + + put_cpu(); + + padata_reorder(pd); +} +EXPORT_SYMBOL(padata_do_serial); + +static int padata_setup_cpumasks(struct parallel_data *pd, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL)) + return -ENOMEM; + + cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask); + if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pd->cpumask.pcpu); /* undo the pcpu allocation above; cbcpu was never allocated */ + return -ENOMEM; + } + + cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask); + return 0; +} + +static void __padata_list_init(struct padata_list *pd_list) +{ + INIT_LIST_HEAD(&pd_list->list); + spin_lock_init(&pd_list->lock); +} + +/* Initialize all percpu queues used by serial workers */ +static void padata_init_squeues(struct parallel_data *pd) +{ + int cpu; + struct padata_serial_queue *squeue; + + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + squeue->pd = pd; + __padata_list_init(&squeue->serial); + INIT_WORK(&squeue->work, padata_serial_worker); + } +} + +/* Initialize all percpu queues used by parallel workers */ +static void padata_init_pqueues(struct parallel_data *pd) +{ + int cpu_index, num_cpus, cpu; + struct padata_parallel_queue *pqueue; + + cpu_index = 0; + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + pqueue->pd = pd; + pqueue->cpu_index = cpu_index; /* dense index of this cpu within the parallel cpumask */ + cpu_index++; + + __padata_list_init(&pqueue->reorder); + __padata_list_init(&pqueue->parallel); + INIT_WORK(&pqueue->work, padata_parallel_worker); + atomic_set(&pqueue->num_obj, 0); + } + + num_cpus = 
cpumask_weight(pd->cpumask.pcpu); + pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0; /* chosen so that max_seq_nr + 1 is a multiple of num_cpus; 0 when the mask is empty */ +} + +/* Allocate and initialize the internal cpumask dependent resources. */ +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct parallel_data *pd; + + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->pqueue = alloc_percpu(struct padata_parallel_queue); + if (!pd->pqueue) + goto err_free_pd; + + pd->squeue = alloc_percpu(struct padata_serial_queue); + if (!pd->squeue) + goto err_free_pqueue; + if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0) + goto err_free_squeue; + + padata_init_pqueues(pd); + padata_init_squeues(pd); + setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd); + atomic_set(&pd->seq_nr, -1); + atomic_set(&pd->reorder_objects, 0); + atomic_set(&pd->refcnt, 0); + pd->pinst = pinst; + spin_lock_init(&pd->lock); + + return pd; + +err_free_squeue: + free_percpu(pd->squeue); +err_free_pqueue: + free_percpu(pd->pqueue); +err_free_pd: + kfree(pd); +err: + return NULL; /* callers treat NULL as an allocation failure */ +} + +static void padata_free_pd(struct parallel_data *pd) +{ + free_cpumask_var(pd->cpumask.pcpu); + free_cpumask_var(pd->cpumask.cbcpu); + free_percpu(pd->pqueue); + free_percpu(pd->squeue); + kfree(pd); +} + +/* Flush all objects out of the padata queues. 
*/ +static void padata_flush_queues(struct parallel_data *pd) +{ + int cpu; + struct padata_parallel_queue *pqueue; + struct padata_serial_queue *squeue; + + for_each_cpu(cpu, pd->cpumask.pcpu) { + pqueue = per_cpu_ptr(pd->pqueue, cpu); + flush_work(&pqueue->work); + } + + del_timer_sync(&pd->timer); + + if (atomic_read(&pd->reorder_objects)) + padata_reorder(pd); + + for_each_cpu(cpu, pd->cpumask.cbcpu) { + squeue = per_cpu_ptr(pd->squeue, cpu); + flush_work(&squeue->work); + } + + BUG_ON(atomic_read(&pd->refcnt) != 0); +} + +static void __padata_start(struct padata_instance *pinst) +{ + pinst->flags |= PADATA_INIT; +} + +static void __padata_stop(struct padata_instance *pinst) +{ + if (!(pinst->flags & PADATA_INIT)) + return; + + pinst->flags &= ~PADATA_INIT; + + synchronize_rcu(); + + get_online_cpus(); + padata_flush_queues(pinst->pd); + put_online_cpus(); +} + +/* Replace the internal control structure with a new one. */ +static void padata_replace(struct padata_instance *pinst, + struct parallel_data *pd_new) +{ + struct parallel_data *pd_old = pinst->pd; + int notification_mask = 0; + + pinst->flags |= PADATA_RESET; /* cleared again at the end of this function */ + + rcu_assign_pointer(pinst->pd, pd_new); + + synchronize_rcu(); + + if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu)) + notification_mask |= PADATA_CPU_PARALLEL; + if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu)) + notification_mask |= PADATA_CPU_SERIAL; + + padata_flush_queues(pd_old); + padata_free_pd(pd_old); + + if (notification_mask) + blocking_notifier_call_chain(&pinst->cpumask_change_notifier, + notification_mask, + &pd_new->cpumask); + + pinst->flags &= ~PADATA_RESET; +} + +/** + * padata_register_cpumask_notifier - Registers a notifier that will be called + * if either pcpu or cbcpu or both cpumasks change. + * + * @pinst: A pointer to padata instance + * @nblock: A pointer to notifier block. 
+ */ +int padata_register_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_register(&pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_register_cpumask_notifier); + +/** + * padata_unregister_cpumask_notifier - Unregisters cpumask notifier + * registered earlier using padata_register_cpumask_notifier + * + * @pinst: A pointer to data instance. + * @nblock: A pointer to notifier block. + */ +int padata_unregister_cpumask_notifier(struct padata_instance *pinst, + struct notifier_block *nblock) +{ + return blocking_notifier_chain_unregister( + &pinst->cpumask_change_notifier, + nblock); +} +EXPORT_SYMBOL(padata_unregister_cpumask_notifier); + + +/* If cpumask contains no active cpu, we mark the instance as invalid. */ +static bool padata_validate_cpumask(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + if (!cpumask_intersects(cpumask, cpu_active_mask)) { + pinst->flags |= PADATA_INVALID; + return false; + } + + pinst->flags &= ~PADATA_INVALID; + return true; +} + +static int __padata_set_cpumasks(struct padata_instance *pinst, + cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int valid; + struct parallel_data *pd; + + valid = padata_validate_cpumask(pinst, pcpumask); + if (!valid) { + __padata_stop(pinst); + goto out_replace; + } + + valid = padata_validate_cpumask(pinst, cbcpumask); + if (!valid) + __padata_stop(pinst); + +out_replace: + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + return -ENOMEM; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + + padata_replace(pinst, pd); + + if (valid) /* only when both masks contain an active cpu */ + __padata_start(pinst); + + return 0; +} + +/** + * padata_set_cpumasks - Set both parallel and serial cpumasks. The first + * one is used by parallel workers and the second one + * by the workers doing serialization. 
+ * + * @pinst: padata instance + * @pcpumask: the cpumask to use for parallel workers + * @cbcpumask: the cpumask to use for serial workers + */ +int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask, + cpumask_var_t cbcpumask) +{ + int err; + + mutex_lock(&pinst->lock); + get_online_cpus(); + + err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask); + + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; + +} +EXPORT_SYMBOL(padata_set_cpumasks); + +/** + * padata_set_cpumask: Sets specified by @cpumask_type cpumask to the value + * equivalent to @cpumask. + * + * @pinst: padata instance + * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding + * to serial and parallel cpumasks respectively. + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type, + cpumask_var_t cpumask) +{ + struct cpumask *serial_mask, *parallel_mask; + int err = -EINVAL; /* returned for an unknown @cpumask_type */ + + mutex_lock(&pinst->lock); + get_online_cpus(); + + switch (cpumask_type) { + case PADATA_CPU_PARALLEL: + serial_mask = pinst->cpumask.cbcpu; /* keep the current serial mask */ + parallel_mask = cpumask; + break; + case PADATA_CPU_SERIAL: + parallel_mask = pinst->cpumask.pcpu; /* keep the current parallel mask */ + serial_mask = cpumask; + break; + default: + goto out; + } + + err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask); + +out: + put_online_cpus(); + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_set_cpumask); + +static int __padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_active_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + + if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) && + padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_start(pinst); + } + + return 0; +} + + /** + * padata_add_cpu - add a cpu to one or both(parallel and serial) + * padata 
cpumasks. + * + * @pinst: padata instance + * @cpu: cpu to add + * @mask: bitmask of flags specifying to which cpumask @cpu should be added. + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask + */ + +int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask) +{ + int err; + + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + + mutex_lock(&pinst->lock); + + get_online_cpus(); + if (mask & PADATA_CPU_SERIAL) + cpumask_set_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_set_cpu(cpu, pinst->cpumask.pcpu); + + err = __padata_add_cpu(pinst, cpu); + put_online_cpus(); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_add_cpu); + +static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd = NULL; + + if (cpumask_test_cpu(cpu, cpu_online_mask)) { + + if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) || + !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu)) + __padata_stop(pinst); /* a mask no longer contains an active cpu */ + + pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu, + pinst->cpumask.cbcpu); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + + /** + * padata_remove_cpu - remove a cpu from the one or both(serial and parallel) + * padata cpumasks. 
+ * + * @pinst: padata instance + * @cpu: cpu to remove + * @mask: bitmask specifying from which cpumask @cpu should be removed + * The @mask may be any combination of the following flags: + * PADATA_CPU_SERIAL - serial cpumask + * PADATA_CPU_PARALLEL - parallel cpumask + */ +int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask) +{ + int err; + + if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL))) + return -EINVAL; + + mutex_lock(&pinst->lock); + + get_online_cpus(); + if (mask & PADATA_CPU_SERIAL) + cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu); + if (mask & PADATA_CPU_PARALLEL) + cpumask_clear_cpu(cpu, pinst->cpumask.pcpu); + + err = __padata_remove_cpu(pinst, cpu); + put_online_cpus(); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_remove_cpu); + +/** + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +int padata_start(struct padata_instance *pinst) +{ + int err = 0; + + mutex_lock(&pinst->lock); + + if (pinst->flags & PADATA_INVALID) + err =-EINVAL; + + __padata_start(pinst); /* NOTE(review): PADATA_INIT is set even when -EINVAL is returned - confirm this is intended */ + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_start); + +/** + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + mutex_lock(&pinst->lock); + __padata_stop(pinst); + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +#ifdef CONFIG_HOTPLUG_CPU + +static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu) +{ + return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) || + cpumask_test_cpu(cpu, pinst->cpumask.cbcpu); /* true when @cpu is in either user-configured mask */ +} + + +static int padata_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int err; + struct padata_instance *pinst; + int cpu = (unsigned long)hcpu; + + pinst = container_of(nfb, struct padata_instance, cpu_notifier); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (!pinst_has_cpu(pinst, cpu)) + 
break; + mutex_lock(&pinst->lock); + err = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return notifier_from_errno(err); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + if (!pinst_has_cpu(pinst, cpu)) + break; + mutex_lock(&pinst->lock); + err = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return notifier_from_errno(err); + break; + + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!pinst_has_cpu(pinst, cpu)) + break; + mutex_lock(&pinst->lock); + __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + break; /* do not fall through into the CPU_DOWN_FAILED re-add below */ + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + if (!pinst_has_cpu(pinst, cpu)) + break; + mutex_lock(&pinst->lock); + __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + } + + return NOTIFY_OK; +} +#endif + +static void __padata_free(struct padata_instance *pinst) +{ +#ifdef CONFIG_HOTPLUG_CPU + unregister_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + padata_stop(pinst); + padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); + kfree(pinst); +} + +#define kobj2pinst(_kobj) \ + container_of(_kobj, struct padata_instance, kobj) +#define attr2pentry(_attr) \ + container_of(_attr, struct padata_sysfs_entry, attr) + +static void padata_sysfs_release(struct kobject *kobj) +{ + struct padata_instance *pinst = kobj2pinst(kobj); + __padata_free(pinst); +} + +struct padata_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct padata_instance *, struct attribute *, char *); + ssize_t (*store)(struct padata_instance *, struct attribute *, + const char *, size_t); +}; + +static ssize_t show_cpumask(struct padata_instance *pinst, + struct attribute *attr, char *buf) +{ + struct cpumask *cpumask; + ssize_t len; + + mutex_lock(&pinst->lock); + if (!strcmp(attr->name, "serial_cpumask")) + cpumask = pinst->cpumask.cbcpu; + else + cpumask = pinst->cpumask.pcpu; + + len = bitmap_scnprintf(buf, 
PAGE_SIZE, cpumask_bits(cpumask), + nr_cpu_ids); + if (PAGE_SIZE - len < 2) /* no room left for the trailing newline and NUL */ + len = -EINVAL; + else + len += sprintf(buf + len, "\n"); + + mutex_unlock(&pinst->lock); + return len; +} + +static ssize_t store_cpumask(struct padata_instance *pinst, + struct attribute *attr, + const char *buf, size_t count) +{ + cpumask_var_t new_cpumask; + ssize_t ret; + int mask_type; + + if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL)) + return -ENOMEM; + + ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask), + nr_cpumask_bits); + if (ret < 0) + goto out; + + mask_type = !strcmp(attr->name, "serial_cpumask") ? + PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL; + ret = padata_set_cpumask(pinst, mask_type, new_cpumask); + if (!ret) + ret = count; /* success: report the whole input as consumed */ + +out: + free_cpumask_var(new_cpumask); + return ret; +} + +#define PADATA_ATTR_RW(_name, _show_name, _store_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0644, _show_name, _store_name) +#define PADATA_ATTR_RO(_name, _show_name) \ + static struct padata_sysfs_entry _name##_attr = \ + __ATTR(_name, 0400, _show_name, NULL) + +PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask); +PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask); + +/* + * Padata sysfs provides the following objects: + * serial_cpumask [RW] - cpumask for serial workers + * parallel_cpumask [RW] - cpumask for parallel workers + */ +static struct attribute *padata_default_attrs[] = { + &serial_cpumask_attr.attr, + &parallel_cpumask_attr.attr, + NULL, +}; + +static ssize_t padata_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct padata_instance *pinst; + struct padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->show) + ret = pentry->show(pinst, attr, buf); + + return ret; +} + +static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t count) +{ + struct padata_instance *pinst; + struct 
padata_sysfs_entry *pentry; + ssize_t ret = -EIO; + + pinst = kobj2pinst(kobj); + pentry = attr2pentry(attr); + if (pentry->store) + ret = pentry->store(pinst, attr, buf, count); /* -EIO is returned for attributes without a store handler */ + + return ret; +} + +static const struct sysfs_ops padata_sysfs_ops = { + .show = padata_sysfs_show, + .store = padata_sysfs_store, +}; + +static struct kobj_type padata_attr_type = { + .sysfs_ops = &padata_sysfs_ops, + .default_attrs = padata_default_attrs, + .release = padata_sysfs_release, +}; + +/** + * padata_alloc_possible - Allocate and initialize padata instance. + * Use the cpu_possible_mask for serial and + * parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq) +{ + return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask); +} +EXPORT_SYMBOL(padata_alloc_possible); + +/** + * padata_alloc - allocate and initialize a padata instance and specify + * cpumasks for serial and parallel workers. + * + * @wq: workqueue to use for the allocated padata instance + * @pcpumask: cpumask that will be used for padata parallelization + * @cbcpumask: cpumask that will be used for padata serialization + */ +struct padata_instance *padata_alloc(struct workqueue_struct *wq, + const struct cpumask *pcpumask, + const struct cpumask *cbcpumask) +{ + struct padata_instance *pinst; + struct parallel_data *pd = NULL; + + pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); + if (!pinst) + goto err; + + get_online_cpus(); + if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL)) + goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) { + free_cpumask_var(pinst->cpumask.pcpu); + goto err_free_inst; + } + if (!padata_validate_cpumask(pinst, pcpumask) || + !padata_validate_cpumask(pinst, cbcpumask)) + goto err_free_masks; + + pd = padata_alloc_pd(pinst, pcpumask, cbcpumask); + if (!pd) + goto err_free_masks; + + rcu_assign_pointer(pinst->pd, pd); + + pinst->wq 
= wq; + + cpumask_copy(pinst->cpumask.pcpu, pcpumask); + cpumask_copy(pinst->cpumask.cbcpu, cbcpumask); + + pinst->flags = 0; + +#ifdef CONFIG_HOTPLUG_CPU + pinst->cpu_notifier.notifier_call = padata_cpu_callback; + pinst->cpu_notifier.priority = 0; + register_hotcpu_notifier(&pinst->cpu_notifier); +#endif + + put_online_cpus(); + + BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); + kobject_init(&pinst->kobj, &padata_attr_type); + mutex_init(&pinst->lock); /* NOTE(review): initialized after the hotplug notifier, whose callback takes this mutex, is registered - confirm the ordering is safe */ + + return pinst; + +err_free_masks: + free_cpumask_var(pinst->cpumask.pcpu); + free_cpumask_var(pinst->cpumask.cbcpu); +err_free_inst: + kfree(pinst); + put_online_cpus(); +err: + return NULL; +} +EXPORT_SYMBOL(padata_alloc); + +/** + * padata_free - free a padata instance + * + * @pinst: padata instance to free + */ +void padata_free(struct padata_instance *pinst) +{ + kobject_put(&pinst->kobj); /* the kobject release hook, padata_sysfs_release(), performs the actual teardown */ +} +EXPORT_SYMBOL(padata_free); diff --git a/kernel/panic.c b/kernel/panic.c new file mode 100644 index 00000000..564c7bc6 --- /dev/null +++ b/kernel/panic.c @@ -0,0 +1,464 @@ +/* + * linux/kernel/panic.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +/* + * This function is used throughout the kernel (including mm and fs) + * to indicate a major problem. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PANIC_TIMER_STEP 100 /* step, in ms, of the mdelay() wait loops in panic() */ +#define PANIC_BLINK_SPD 18 + +/* Machine specific panic information string */ +char *mach_panic_string; + +int panic_on_oops; +static unsigned long tainted_mask; +static int pause_on_oops; +static int pause_on_oops_flag; +static DEFINE_SPINLOCK(pause_on_oops_lock); + +#ifndef CONFIG_PANIC_TIMEOUT +#define CONFIG_PANIC_TIMEOUT 0 +#endif +int panic_timeout = CONFIG_PANIC_TIMEOUT; +EXPORT_SYMBOL_GPL(panic_timeout); + +ATOMIC_NOTIFIER_HEAD(panic_notifier_list); + +EXPORT_SYMBOL(panic_notifier_list); + +static long no_blink(int state) +{ + return 0; /* fallback used by panic() when no panic_blink handler is set: nothing waited */ +} + +/* Returns how long it waited in ms */ +long (*panic_blink)(int state); +EXPORT_SYMBOL(panic_blink); + +/** + * panic - halt the system + * @fmt: The text string to print + * + * Display a message, then perform cleanups. + * + * This function never returns. + */ +NORET_TYPE void panic(const char * fmt, ...) +{ + static char buf[1024]; /* formatted message; also passed to the panic notifier chain */ + va_list args; + long i, i_next = 0; + int state = 0; + + /* + * It's possible to come here directly from a panic-assertion and + * not have preempt disabled. Some functions called from here want + * preempt to be disabled. No point enabling it later though... + */ + preempt_disable(); + + console_verbose(); + bust_spinlocks(1); + va_start(args, fmt); + vsnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); +#ifdef CONFIG_DEBUG_BUGVERBOSE + dump_stack(); +#endif + + /* + * If we have crashed and we have a crash kernel loaded let it handle + * everything else. + * Do we want to call this before we try to display a message? + */ + crash_kexec(NULL); + + kmsg_dump(KMSG_DUMP_PANIC); + + /* + * Note smp_send_stop is the usual smp shutdown function, which + * unfortunately means it may not be hardened to work in a panic + * situation. 
+ */ + smp_send_stop(); + + atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + + bust_spinlocks(0); + + if (!panic_blink) + panic_blink = no_blink; + + if (panic_timeout > 0) { + /* + * Delay timeout seconds before rebooting the machine. + * We can't use the "normal" timers since we just panicked. + */ + printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); + + for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) { + touch_nmi_watchdog(); + if (i >= i_next) { + i += panic_blink(state ^= 1); /* toggle the blink state; the ms the handler waited count as elapsed time */ + i_next = i + 3600 / PANIC_BLINK_SPD; + } + mdelay(PANIC_TIMER_STEP); + } + /* + * This will not be a clean reboot, with everything + * shutting down. But if there is a chance of + * rebooting the system it will be rebooted. + */ + emergency_restart(); + } +#ifdef __sparc__ + { + extern int stop_a_enabled; + /* Make sure the user can actually press Stop-A (L1-A) */ + stop_a_enabled = 1; + printk(KERN_EMERG "Press Stop-A (L1-A) to return to the boot prom\n"); + } +#endif +#if defined(CONFIG_S390) + { + unsigned long caller; + + caller = (unsigned long)__builtin_return_address(0); + disabled_wait(caller); + } +#endif + local_irq_enable(); + for (i = 0; ; i += PANIC_TIMER_STEP) { /* never terminates: keep blinking forever */ + touch_softlockup_watchdog(); + if (i >= i_next) { + i += panic_blink(state ^= 1); + i_next = i + 3600 / PANIC_BLINK_SPD; + } + mdelay(PANIC_TIMER_STEP); + } +} + +EXPORT_SYMBOL(panic); + + +struct tnt { + u8 bit; /* bit number tested in tainted_mask */ + char true; /* character printed when the bit is set */ + char false; /* character printed when the bit is clear */ +}; + +static const struct tnt tnts[] = { + { TAINT_PROPRIETARY_MODULE, 'P', 'G' }, + { TAINT_FORCED_MODULE, 'F', ' ' }, + { TAINT_UNSAFE_SMP, 'S', ' ' }, + { TAINT_FORCED_RMMOD, 'R', ' ' }, + { TAINT_MACHINE_CHECK, 'M', ' ' }, + { TAINT_BAD_PAGE, 'B', ' ' }, + { TAINT_USER, 'U', ' ' }, + { TAINT_DIE, 'D', ' ' }, + { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, + { TAINT_WARN, 'W', ' ' }, + { TAINT_CRAP, 'C', ' ' }, + { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, +}; + +/** + * print_tainted - return a string to represent the kernel taint state. 
+ * + * 'P' - Proprietary module has been loaded. + * 'F' - Module has been forcibly loaded. + * 'S' - SMP with CPUs not designed for SMP. + * 'R' - User forced a module unload. + * 'M' - System experienced a machine check exception. + * 'B' - System has hit bad_page. + * 'U' - Userspace-defined naughtiness. + * 'D' - Kernel has oopsed before. + * 'A' - ACPI table overridden. + * 'W' - Taint on warning. + * 'C' - modules from drivers/staging are loaded. + * 'I' - Working around severe firmware bug. + * + * The string is overwritten by the next call to print_tainted(). + */ +const char *print_tainted(void) +{ + static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1]; /* prefix + one character per tnts[] entry + NUL */ + + if (tainted_mask) { + char *s; + int i; + + s = buf + sprintf(buf, "Tainted: "); + for (i = 0; i < ARRAY_SIZE(tnts); i++) { + const struct tnt *t = &tnts[i]; + *s++ = test_bit(t->bit, &tainted_mask) ? + t->true : t->false; + } + *s = 0; /* NUL-terminate the flag string */ + } else + snprintf(buf, sizeof(buf), "Not tainted"); + + return buf; +} + +int test_taint(unsigned flag) +{ + return test_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(test_taint); + +unsigned long get_taint(void) +{ + return tainted_mask; +} + +void add_taint(unsigned flag) +{ + /* + * Can't trust the integrity of the kernel anymore. + * We don't call directly debug_locks_off() because the issue + * is not necessarily serious enough to set oops_in_progress to 1 + * Also we want to keep up lockdep for staging development and + * post-warning case. + */ + switch (flag) { + case TAINT_CRAP: + case TAINT_WARN: + case TAINT_FIRMWARE_WORKAROUND: + break; /* these taints leave lock debugging enabled */ + + default: + if (__debug_locks_off()) + printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); + } + + set_bit(flag, &tainted_mask); +} +EXPORT_SYMBOL(add_taint); + +static void spin_msec(int msecs) +{ + int i; + + for (i = 0; i < msecs; i++) { /* busy-wait in 1 ms steps, touching the NMI watchdog each step */ + touch_nmi_watchdog(); + mdelay(1); + } +} + +/* + * It just happens that oops_enter() and oops_exit() are identically + * implemented... 
+ */ +static void do_oops_enter_exit(void) +{ + unsigned long flags; + static int spin_counter; /* shared by all CPUs; only accessed with pause_on_oops_lock held */ + + if (!pause_on_oops) + return; + + spin_lock_irqsave(&pause_on_oops_lock, flags); + if (pause_on_oops_flag == 0) { + /* This CPU may now print the oops message */ + pause_on_oops_flag = 1; + } else { + /* We need to stall this CPU */ + if (!spin_counter) { + /* This CPU gets to do the counting */ + spin_counter = pause_on_oops; /* one spin_msec(MSEC_PER_SEC) round per count */ + do { + spin_unlock(&pause_on_oops_lock); /* lock is dropped while spinning; the saved irq flags are restored only at the end */ + spin_msec(MSEC_PER_SEC); + spin_lock(&pause_on_oops_lock); + } while (--spin_counter); + pause_on_oops_flag = 0; + } else { + /* This CPU waits for a different one */ + while (spin_counter) { + spin_unlock(&pause_on_oops_lock); + spin_msec(1); + spin_lock(&pause_on_oops_lock); + } + } + } + spin_unlock_irqrestore(&pause_on_oops_lock, flags); +} + +/* + * Return true if the calling CPU is allowed to print oops-related info. + * This is a bit racy.. + */ +int oops_may_print(void) +{ + return pause_on_oops_flag == 0; /* read without taking pause_on_oops_lock */ +} + +/* + * Called when the architecture enters its oops handler, before it prints + * anything. If this is the first CPU to oops, and it's oopsing the first + * time then let it proceed. + * + * This is all enabled by the pause_on_oops kernel boot option. We do all + * this to ensure that oopses don't scroll off the screen. It has the + * side-effect of preventing later-oopsing CPUs from mucking up the display, + * too. + * + * It turns out that the CPU which is allowed to print ends up pausing for + * the right duration, whereas all the other CPUs pause for twice as long: + * once in oops_enter(), once in oops_exit(). 
+ */ +void oops_enter(void) +{ + tracing_off(); + /* can't trust the integrity of the kernel anymore: */ + debug_locks_off(); + do_oops_enter_exit(); +} + +/* + * 64-bit random ID for oopses: + */ +static u64 oops_id; + +static int init_oops_id(void) +{ + if (!oops_id) + get_random_bytes(&oops_id, sizeof(oops_id)); /* id still zero: pick a random one */ + else + oops_id++; /* later calls just bump the id */ + + return 0; +} +late_initcall(init_oops_id); + +void print_oops_end_marker(void) +{ + init_oops_id(); + + if (mach_panic_string) + printk(KERN_WARNING "Board Information: %s\n", + mach_panic_string); + + printk(KERN_WARNING "---[ end trace %016llx ]---\n", + (unsigned long long)oops_id); +} + +/* + * Called when the architecture exits its oops handler, after printing + * everything. + */ +void oops_exit(void) +{ + do_oops_enter_exit(); + print_oops_end_marker(); + kmsg_dump(KMSG_DUMP_OOPS); +} + +#ifdef WANT_WARN_ON_SLOWPATH +struct slowpath_args { + const char *fmt; + va_list args; +}; + +static void warn_slowpath_common(const char *file, int line, void *caller, + unsigned taint, struct slowpath_args *args) +{ + const char *board; + + printk(KERN_WARNING "------------[ cut here ]------------\n"); + printk(KERN_WARNING "WARNING: at %s:%d %pS()\n", file, line, caller); + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (board) + printk(KERN_WARNING "Hardware name: %s\n", board); + + if (args) /* NULL when called from warn_slowpath_null(): no extra message */ + vprintk(args->fmt, args->args); + + print_modules(); + dump_stack(); + print_oops_end_marker(); + add_taint(taint); +} + +void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) +{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), + TAINT_WARN, &args); + va_end(args.args); +} +EXPORT_SYMBOL(warn_slowpath_fmt); + +void warn_slowpath_fmt_taint(const char *file, int line, + unsigned taint, const char *fmt, ...) 
+{ + struct slowpath_args args; + + args.fmt = fmt; + va_start(args.args, fmt); + warn_slowpath_common(file, line, __builtin_return_address(0), + taint, &args); /* same as warn_slowpath_fmt() but with a caller-chosen taint flag */ + va_end(args.args); +} +EXPORT_SYMBOL(warn_slowpath_fmt_taint); + +void warn_slowpath_null(const char *file, int line) +{ + warn_slowpath_common(file, line, __builtin_return_address(0), + TAINT_WARN, NULL); +} +EXPORT_SYMBOL(warn_slowpath_null); +#endif + +#ifdef CONFIG_CC_STACKPROTECTOR + +/* + * Called when gcc's -fstack-protector feature is used, and + * gcc detects corruption of the on-stack canary value + */ +void __stack_chk_fail(void) +{ + panic("stack-protector: Kernel stack is corrupted in: %p\n", + __builtin_return_address(0)); +} +EXPORT_SYMBOL(__stack_chk_fail); + +#endif + +core_param(panic, panic_timeout, int, 0644); +core_param(pause_on_oops, pause_on_oops, int, 0644); + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) /* "oops=panic" enables panic_on_oops; other values are accepted and ignored */ + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); diff --git a/kernel/params.c b/kernel/params.c new file mode 100644 index 00000000..ed72e133 --- /dev/null +++ b/kernel/params.c @@ -0,0 +1,924 @@ +/* Helpers for initial module or kernel cmdline parsing + Copyright (C) 2001 Rusty Russell. + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(fmt, a...) +#endif + +/* Protects all parameters, and incidentally kmalloced_param list. */ +static DEFINE_MUTEX(param_lock); + +/* This just allows us to keep track of which parameters are kmalloced. */ +struct kmalloced_param { + struct list_head list; + char val[]; +}; +static LIST_HEAD(kmalloced_params); + +static void *kmalloc_parameter(unsigned int size) +{ + struct kmalloced_param *p; + + p = kmalloc(sizeof(*p) + size, GFP_KERNEL); + if (!p) + return NULL; + + list_add(&p->list, &kmalloced_params); + return p->val; +} + +/* Does nothing if parameter wasn't kmalloced above. */ +static void maybe_kfree_parameter(void *param) +{ + struct kmalloced_param *p; + + list_for_each_entry(p, &kmalloced_params, list) { + if (p->val == param) { + list_del(&p->list); + kfree(p); + break; + } + } +} + +static inline char dash2underscore(char c) +{ + if (c == '-') + return '_'; + return c; +} + +static inline int parameq(const char *input, const char *paramname) +{ + unsigned int i; + for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) + if (input[i] == '\0') + return 1; + return 0; +} + +static int parse_one(char *param, + char *val, + const struct kernel_param *params, + unsigned num_params, + int (*handle_unknown)(char *param, char *val)) +{ + unsigned int i; + int err; + + /* Find parameter */ + for (i = 0; i < num_params; i++) { + if (parameq(param, params[i].name)) { + /* No one handled NULL, so do it here. */ + if (!val && params[i].ops->set != param_set_bool) + return -EINVAL; + DEBUGP("They are equal! 
Calling %p\n", + params[i].ops->set); + mutex_lock(¶m_lock); + err = params[i].ops->set(val, ¶ms[i]); + mutex_unlock(¶m_lock); + return err; + } + } + + if (handle_unknown) { + DEBUGP("Unknown argument: calling %p\n", handle_unknown); + return handle_unknown(param, val); + } + + DEBUGP("Unknown argument `%s'\n", param); + return -ENOENT; +} + +/* You can use " around spaces, but can't escape ". */ +/* Hyphens and underscores equivalent in parameter names. */ +static char *next_arg(char *args, char **param, char **val) +{ + unsigned int i, equals = 0; + int in_quote = 0, quoted = 0; + char *next; + + if (*args == '"') { + args++; + in_quote = 1; + quoted = 1; + } + + for (i = 0; args[i]; i++) { + if (isspace(args[i]) && !in_quote) + break; + if (equals == 0) { + if (args[i] == '=') + equals = i; + } + if (args[i] == '"') + in_quote = !in_quote; + } + + *param = args; + if (!equals) + *val = NULL; + else { + args[equals] = '\0'; + *val = args + equals + 1; + + /* Don't include quotes in value. */ + if (**val == '"') { + (*val)++; + if (args[i-1] == '"') + args[i-1] = '\0'; + } + if (quoted && args[i-1] == '"') + args[i-1] = '\0'; + } + + if (args[i]) { + args[i] = '\0'; + next = args + i + 1; + } else + next = args + i; + + /* Chew up trailing spaces. */ + return skip_spaces(next); +} + +/* Args looks like "foo=bar,bar2 baz=fuz wiz". 
*/
+/*
+ * parse_args - hand every token of a command line to its parameter.
+ * @name:    prefix used in the error messages printed here
+ * @args:    the command line; next_arg() splits it in place
+ * @params:  table of known parameters
+ * @num:     number of entries in @params
+ * @unknown: optional fallback, forwarded to parse_one() for names that
+ *           are not in @params
+ *
+ * Parsing stops at the first token that is rejected and that error is
+ * returned; 0 means every token was accepted.
+ */
+int parse_args(const char *name,
+	       char *args,
+	       const struct kernel_param *params,
+	       unsigned num,
+	       int (*unknown)(char *param, char *val))
+{
+	char *param, *val;
+
+	DEBUGP("Parsing ARGS: %s\n", args);
+
+	/* Chew leading spaces */
+	args = skip_spaces(args);
+
+	while (*args) {
+		int ret;
+		int irq_was_disabled;
+
+		args = next_arg(args, &param, &val);
+
+		/* A handler must not turn interrupts on behind our back. */
+		irq_was_disabled = irqs_disabled();
+		ret = parse_one(param, val, params, num, unknown);
+		if (irq_was_disabled && !irqs_disabled()) {
+			printk(KERN_WARNING "parse_args(): option '%s' enabled "
+					"irq's!\n", param);
+		}
+
+		switch (ret) {
+		case -ENOENT:
+			printk(KERN_ERR "%s: Unknown parameter `%s'\n",
+			       name, param);
+			return ret;
+		case -ENOSPC:
+			printk(KERN_ERR
+			       "%s: `%s' too large for parameter `%s'\n",
+			       name, val ?: "", param);
+			return ret;
+		case 0:
+			break;
+		default:
+			printk(KERN_ERR
+			       "%s: `%s' invalid for parameter `%s'\n",
+			       name, val ?: "", param);
+			return ret;
+		}
+	}
+
+	/* All parsed OK. */
+	return 0;
+}
+
+/* Lazy bastard, eh?
*/
+/*
+ * STANDARD_PARAM_DEF - stamp out the handlers for one integer type.
+ * @name:     suffix of the generated symbols
+ * @type:     C type stored behind kp->arg
+ * @format:   printf format used by the getter
+ * @tmptype:  type the string is first parsed into
+ * @strtolfn: parser, called as strtolfn(val, 0, &l)
+ *
+ * Generates and exports param_set_<name>(), param_get_<name>() and
+ * param_ops_<name>.  The setter returns the parser's error unchanged,
+ * or -EINVAL when the parsed value does not fit in @type.
+ */
+#define STANDARD_PARAM_DEF(name, type, format, tmptype, strtolfn)	\
+	int param_set_##name(const char *val, const struct kernel_param *kp) \
+	{								\
+		tmptype l;						\
+		int ret;						\
+									\
+		ret = strtolfn(val, 0, &l);				\
+		/* After any parser failure 'l' must not be read. */	\
+		if (ret < 0)						\
+			return ret;					\
+		/* Reject values that change when narrowed to 'type'. */ \
+		if ((type)l != l)					\
+			return -EINVAL;					\
+		*((type *)kp->arg) = l;					\
+		return 0;						\
+	}								\
+	int param_get_##name(char *buffer, const struct kernel_param *kp) \
+	{								\
+		return sprintf(buffer, format, *((type *)kp->arg));	\
+	}								\
+	struct kernel_param_ops param_ops_##name = {			\
+		.set = param_set_##name,				\
+		.get = param_get_##name,				\
+	};								\
+	EXPORT_SYMBOL(param_set_##name);				\
+	EXPORT_SYMBOL(param_get_##name);				\
+	EXPORT_SYMBOL(param_ops_##name)
+
+
+/* A byte is parsed as a number, so it is shown as one too (was "%c"). */
+STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol);
+STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol);
+STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul);
+STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol);
+STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul);
+
+/*
+ * param_set_charp - store a string parameter.
+ *
+ * Returns 0 on success, -ENOSPC if @val is longer than 1024 characters,
+ * or -ENOMEM if the copy cannot be allocated.  On failure the previous
+ * value is left in place.
+ */
+int param_set_charp(const char *val, const struct kernel_param *kp)
+{
+	char **slot = (char **)kp->arg;
+	size_t len = strlen(val);
+	char *copy;
+
+	if (len > 1024) {
+		printk(KERN_ERR "%s: string parameter too long\n",
+		       kp->name);
+		return -ENOSPC;
+	}
+
+	/* This is a hack.  We can't kmalloc in early boot, and we
+	 * don't need to; this mangled commandline is preserved. */
+	if (!slab_is_available()) {
+		maybe_kfree_parameter(*slot);
+		*(const char **)kp->arg = val;
+		return 0;
+	}
+
+	/* Allocate before releasing, so -ENOMEM keeps the old string. */
+	copy = kmalloc_parameter(len + 1);
+	if (!copy)
+		return -ENOMEM;
+	strcpy(copy, val);
+
+	maybe_kfree_parameter(*slot);
+	*slot = copy;
+	return 0;
+}
+EXPORT_SYMBOL(param_set_charp);
+
+/* Print the stored string into @buffer; returns sprintf()'s result. */
+int param_get_charp(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%s", *((char **)kp->arg));
+}
+EXPORT_SYMBOL(param_get_charp);
+
+/* @arg points at the char * slot; release the string it holds. */
+static void param_free_charp(void *arg)
+{
+	maybe_kfree_parameter(*((char **)arg));
+}
+
+struct kernel_param_ops param_ops_charp = {
+	.set = param_set_charp,
+	.get = param_get_charp,
+	.free = param_free_charp,
+};
+EXPORT_SYMBOL(param_ops_charp);
+
+/*
+ * param_set_bool - parse a boolean parameter.
+ *
+ * Actually could be a bool or an int, for historical reasons:
+ * KPARAM_ISBOOL in kp->flags selects which type kp->arg points at.
+ * Returns 0, or the error reported by strtobool().
+ */
+int param_set_bool(const char *val, const struct kernel_param *kp)
+{
+	bool v;
+	int ret;
+
+	/* No equals means "set"... */
+	if (!val)
+		val = "1";
+
+	/* One of =[yYnN01] */
+	ret = strtobool(val, &v);
+	if (ret)
+		return ret;
+
+	if (kp->flags & KPARAM_ISBOOL)
+		*(bool *)kp->arg = v;
+	else
+		*(int *)kp->arg = v;
+	return 0;
+}
+EXPORT_SYMBOL(param_set_bool);
+
+/* Write 'Y' or 'N' into @buffer; returns sprintf()'s result. */
+int param_get_bool(char *buffer, const struct kernel_param *kp)
+{
+	bool val;
+
+	if (kp->flags & KPARAM_ISBOOL)
+		val = *(bool *)kp->arg;
+	else
+		val = *(int *)kp->arg;
+
+	/* Y and N chosen as being relatively non-coder friendly */
+	return sprintf(buffer, "%c", val ? 'Y' : 'N');
+}
+EXPORT_SYMBOL(param_get_bool);
+
+struct kernel_param_ops param_ops_bool = {
+	.set = param_set_bool,
+	.get = param_get_bool,
+};
+EXPORT_SYMBOL(param_ops_bool);
+
+/* This one must be bool.
*/
+/*
+ * param_set_invbool - parse a boolean and store its negation.
+ *
+ * The value is parsed by param_set_bool() into a temporary; kp->arg is
+ * only written when that succeeds.  Returns param_set_bool()'s result.
+ */
+int param_set_invbool(const char *val, const struct kernel_param *kp)
+{
+	int ret;
+	bool boolval;
+	struct kernel_param dummy;
+
+	dummy.arg = &boolval;
+	dummy.flags = KPARAM_ISBOOL;
+	ret = param_set_bool(val, &dummy);
+	if (ret == 0)
+		*(bool *)kp->arg = !boolval;
+	return ret;
+}
+EXPORT_SYMBOL(param_set_invbool);
+
+/* Write 'N' when the stored bool is true, 'Y' otherwise. */
+int param_get_invbool(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%c", (*(bool *)kp->arg) ? 'N' : 'Y');
+}
+EXPORT_SYMBOL(param_get_invbool);
+
+struct kernel_param_ops param_ops_invbool = {
+	.set = param_set_invbool,
+	.get = param_get_invbool,
+};
+EXPORT_SYMBOL(param_ops_invbool);
+
+/*
+ * param_array - parse a comma-separated list into an array.
+ * @name:     parameter name, used in error messages
+ * @val:      the list; each ',' that ends an element is overwritten
+ *            with '\0' and not restored
+ * @min:      fewest elements accepted
+ * @max:      most elements accepted
+ * @elem:     first array element
+ * @elemsize: size of one element in bytes
+ * @set:      handler called once per element
+ * @flags:    copied into the kernel_param handed to @set
+ * @num:      receives the number of elements stored
+ *
+ * We break the rule and mangle the string.  Must be called with
+ * param_lock held (checked with BUG_ON).  Returns 0, -EINVAL when the
+ * element count is out of range, or the first error from @set.
+ */
+static int param_array(const char *name,
+		       const char *val,
+		       unsigned int min, unsigned int max,
+		       void *elem, int elemsize,
+		       int (*set)(const char *, const struct kernel_param *kp),
+		       u16 flags,
+		       unsigned int *num)
+{
+	int ret;
+	struct kernel_param kp;
+	char save;
+
+	/* Get the name right for errors. */
+	kp.name = name;
+	kp.arg = elem;
+	kp.flags = flags;
+
+	*num = 0;
+	/* We expect a comma-separated list of values. */
+	do {
+		int len;
+
+		if (*num == max) {
+			printk(KERN_ERR "%s: can only take %i arguments\n",
+			       name, max);
+			return -EINVAL;
+		}
+		len = strcspn(val, ",");
+
+		/* nul-terminate and parse */
+		save = val[len];
+		((char *)val)[len] = '\0';
+		BUG_ON(!mutex_is_locked(&param_lock));
+		ret = set(val, &kp);
+
+		if (ret != 0)
+			return ret;
+
+		/* Step to the next element and the text after the comma. */
+		kp.arg += elemsize;
+		val += len+1;
+		(*num)++;
+	} while (save == ',');
+
+	if (*num < min) {
+		printk(KERN_ERR "%s: needs at least %i arguments\n",
+		       name, min);
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/* ->set for array parameters: at least one element is required. */
+static int param_array_set(const char *val, const struct kernel_param *kp)
+{
+	const struct kparam_array *arr = kp->arr;
+	unsigned int temp_num;
+
+	/* arr->num is optional; count into a scratch variable without it. */
+	return param_array(kp->name, val, 1, arr->max, arr->elem,
+			   arr->elemsize, arr->ops->set, kp->flags,
+			   arr->num ?: &temp_num);
+}
+
+/*
+ * ->get for array parameters: print the elements separated by ','.
+ * Prints *arr->num elements, or arr->max when no count is kept.
+ * Returns the number of characters written, or the first negative
+ * value returned by the element getter.
+ */
+static int param_array_get(char *buffer, const struct kernel_param *kp)
+{
+	int i, off, ret;
+	const struct kparam_array *arr = kp->arr;
+	struct kernel_param p;
+
+	p = *kp;
+	for (i = off = 0; i < (arr->num ? *arr->num : arr->max); i++) {
+		if (i)
+			buffer[off++] = ',';
+		p.arg = arr->elem + arr->elemsize * i;
+		BUG_ON(!mutex_is_locked(&param_lock));
+		ret = arr->ops->get(buffer + off, &p);
+		if (ret < 0)
+			return ret;
+		off += ret;
+	}
+	buffer[off] = '\0';
+	return off;
+}
+
+/* ->free for array parameters: free each element if the type needs it. */
+static void param_array_free(void *arg)
+{
+	unsigned int i;
+	const struct kparam_array *arr = arg;
+
+	if (arr->ops->free)
+		for (i = 0; i < (arr->num ? *arr->num : arr->max); i++)
+			arr->ops->free(arr->elem + arr->elemsize * i);
+}
+
+struct kernel_param_ops param_array_ops = {
+	.set = param_array_set,
+	.get = param_array_get,
+	.free = param_array_free,
+};
+EXPORT_SYMBOL(param_array_ops);
+
+/*
+ * param_set_copystring - copy @val into the caller-provided buffer.
+ *
+ * Returns -ENOSPC when @val plus its terminator exceeds kps->maxlen.
+ */
+int param_set_copystring(const char *val, const struct kernel_param *kp)
+{
+	const struct kparam_string *kps = kp->str;
+
+	if (strlen(val)+1 > kps->maxlen) {
+		printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
+		       kp->name, kps->maxlen-1);
+		return -ENOSPC;
+	}
+	strcpy(kps->string, val);
+	return 0;
+}
+EXPORT_SYMBOL(param_set_copystring);
+
+/* Copy the stored string into @buffer; returns strlcpy()'s result. */
+int param_get_string(char *buffer, const struct kernel_param *kp)
+{
+	const struct kparam_string *kps = kp->str;
+
+	return strlcpy(buffer, kps->string, kps->maxlen);
+}
+EXPORT_SYMBOL(param_get_string);
+
+struct kernel_param_ops param_ops_string = {
+	.set = param_set_copystring,
+	.get = param_get_string,
+};
+EXPORT_SYMBOL(param_ops_string);
+
+/* sysfs output in /sys/module/XYZ/parameters/ */
+#define to_module_attr(n) container_of(n, struct module_attribute, attr)
+#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
+
+extern struct kernel_param __start___param[], __stop___param[];
+
+/* One sysfs attribute together with the parameter it exposes. */
+struct param_attribute
+{
+	struct module_attribute mattr;
+	const struct kernel_param *param;
+};
+
+/* The "parameters" attribute group of one module kobject. */
+struct module_param_attrs
+{
+	unsigned int num;		/* entries used in attrs[] */
+	struct attribute_group grp;
+	struct param_attribute attrs[0];
+};
+
+#ifdef CONFIG_SYSFS
+#define to_param_attr(n) container_of(n, struct param_attribute, mattr)
+
+/*
+ * sysfs ->show: print the parameter with its ->get handler under
+ * param_lock and append a newline to non-empty output.
+ * Returns -EPERM when the parameter has no ->get handler.
+ */
+static ssize_t param_attr_show(struct module_attribute *mattr,
+			       struct module *mod, char *buf)
+{
+	int count;
+	struct param_attribute *attribute = to_param_attr(mattr);
+
+	if (!attribute->param->ops->get)
+		return -EPERM;
+
+	mutex_lock(&param_lock);
+	count = attribute->param->ops->get(buf, attribute->param);
+	mutex_unlock(&param_lock);
+	if (count > 0) {
+		strcat(buf, "\n");
+		++count;
+	}
+	return count;
+}
+
+/* sysfs always hands a nul-terminated string
in buf.  We rely on that. */
+/*
+ * sysfs ->store: pass @buf to the parameter's ->set handler under
+ * param_lock.  Returns @len on success, the handler's error otherwise,
+ * or -EPERM when the parameter has no ->set handler.
+ */
+static ssize_t param_attr_store(struct module_attribute *mattr,
+				struct module *owner,
+				const char *buf, size_t len)
+{
+	int err;
+	struct param_attribute *attribute = to_param_attr(mattr);
+
+	if (!attribute->param->ops->set)
+		return -EPERM;
+
+	mutex_lock(&param_lock);
+	err = attribute->param->ops->set(buf, attribute->param);
+	mutex_unlock(&param_lock);
+	if (!err)
+		return len;
+	return err;
+}
+#endif
+
+#ifdef CONFIG_MODULES
+#define __modinit
+#else
+#define __modinit __init
+#endif
+
+#ifdef CONFIG_SYSFS
+/* Take param_lock on behalf of code outside this file. */
+void __kernel_param_lock(void)
+{
+	mutex_lock(&param_lock);
+}
+EXPORT_SYMBOL(__kernel_param_lock);
+
+/* Release param_lock taken with __kernel_param_lock(). */
+void __kernel_param_unlock(void)
+{
+	mutex_unlock(&param_lock);
+}
+EXPORT_SYMBOL(__kernel_param_unlock);
+
+/*
+ * add_sysfs_param - add a parameter to sysfs
+ * @mk: struct module_kobject
+ * @kp: the actual parameter definition to add to sysfs
+ * @name: name of parameter
+ *
+ * Grows mk->mp (allocating it if it is still NULL) by one attribute
+ * describing @kp.  The sysfs group itself is created by the caller.
+ * Returns 0 or -ENOMEM.  On failure everything that was allocated for
+ * mk->mp, including the attribute pointer array, is freed and mk->mp
+ * is set to NULL.
+ */
+static __modinit int add_sysfs_param(struct module_kobject *mk,
+				     const struct kernel_param *kp,
+				     const char *name)
+{
+	struct module_param_attrs *new;
+	struct attribute **attrs, **new_attrs;
+	int err, num;
+
+	/* We don't bother calling this with invisible parameters. */
+	BUG_ON(!kp->perm);
+
+	if (!mk->mp) {
+		num = 0;
+		attrs = NULL;
+	} else {
+		num = mk->mp->num;
+		attrs = mk->mp->grp.attrs;
+	}
+
+	/* Enlarge. */
+	new = krealloc(mk->mp,
+		       sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
+		       GFP_KERNEL);
+	if (!new) {
+		/* The old block is untouched: drop it and its pointer array. */
+		kfree(attrs);
+		kfree(mk->mp);
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	/* One slot for the new attribute, one for the NULL terminator. */
+	new_attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2),
+			     GFP_KERNEL);
+	if (!new_attrs) {
+		/* The old pointer array is still allocated: free it too. */
+		kfree(attrs);
+		err = -ENOMEM;
+		goto fail_free_new;
+	}
+	attrs = new_attrs;
+
+	/* Sysfs wants everything zeroed (num and grp are set again below). */
+	memset(new, 0, sizeof(*new));
+	memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
+	memset(&attrs[num], 0, sizeof(attrs[num]));
+	new->grp.name = "parameters";
+	new->grp.attrs = attrs;
+
+	/* Tack new one on the end. */
+	sysfs_attr_init(&new->attrs[num].mattr.attr);
+	new->attrs[num].param = kp;
+	new->attrs[num].mattr.show = param_attr_show;
+	new->attrs[num].mattr.store = param_attr_store;
+	new->attrs[num].mattr.attr.name = (char *)name;
+	new->attrs[num].mattr.attr.mode = kp->perm;
+	new->num = num+1;
+
+	/* Fix up all the pointers, since krealloc can move us */
+	for (num = 0; num < new->num; num++)
+		new->grp.attrs[num] = &new->attrs[num].mattr.attr;
+	new->grp.attrs[num] = NULL;
+
+	mk->mp = new;
+	return 0;
+
+fail_free_new:
+	kfree(new);
+fail:
+	mk->mp = NULL;
+	return err;
+}
+
+#ifdef CONFIG_MODULES
+/* Free mk->mp and its attribute pointer array; safe when mk->mp is NULL. */
+static void free_module_param_attrs(struct module_kobject *mk)
+{
+	if (mk->mp)
+		kfree(mk->mp->grp.attrs);
+	kfree(mk->mp);
+	mk->mp = NULL;
+}
+
+/*
+ * module_param_sysfs_setup - setup sysfs support for one module
+ * @mod: module
+ * @kparam: module parameters (array)
+ * @num_params: number of module parameters
+ *
+ * Adds sysfs entries for module parameters under
+ * /sys/module/[mod->name]/parameters/
+ *
+ * Parameters with perm == 0 are skipped; when none is left no group is
+ * created and 0 is returned.
+ */
+int module_param_sysfs_setup(struct module *mod,
+			     const struct kernel_param *kparam,
+			     unsigned int num_params)
+{
+	int i, err;
+	bool params = false;
+
+	for (i = 0; i < num_params; i++) {
+		if (kparam[i].perm == 0)
+			continue;
+		err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
+		if (err)
+			return err;
+		params = true;
+	}
+
+	if (!params)
+		return 0;
+
+	/* Create the param group. */
+	err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
+	if (err)
+		free_module_param_attrs(&mod->mkobj);
+	return err;
+}
+
+/*
+ * module_param_sysfs_remove - remove sysfs support for one module
+ * @mod: module
+ *
+ * Remove sysfs entries for module parameters and the corresponding
+ * kobject.
+ */ +void module_param_sysfs_remove(struct module *mod) +{ + if (mod->mkobj.mp) { + sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp); + /* We are positive that no one is using any param + * attrs at this point. Deallocate immediately. */ + free_module_param_attrs(&mod->mkobj); + } +} +#endif + +void destroy_params(const struct kernel_param *params, unsigned num) +{ + unsigned int i; + + for (i = 0; i < num; i++) + if (params[i].ops->free) + params[i].ops->free(params[i].arg); +} + +static struct module_kobject * __init locate_module_kobject(const char *name) +{ + struct module_kobject *mk; + struct kobject *kobj; + int err; + + kobj = kset_find_obj(module_kset, name); + if (kobj) { + mk = to_module_kobject(kobj); + } else { + mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); + BUG_ON(!mk); + + mk->mod = THIS_MODULE; + mk->kobj.kset = module_kset; + err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, + "%s", name); + if (err) { + kobject_put(&mk->kobj); + printk(KERN_ERR + "Module '%s' failed add to sysfs, error number %d\n", + name, err); + printk(KERN_ERR + "The system will be unstable now.\n"); + return NULL; + } + + /* So that we hold reference in both cases. */ + kobject_get(&mk->kobj); + } + + return mk; +} + +static void __init kernel_add_sysfs_param(const char *name, + struct kernel_param *kparam, + unsigned int name_skip) +{ + struct module_kobject *mk; + int err; + + mk = locate_module_kobject(name); + if (!mk) + return; + + /* We need to remove old parameters before adding more. */ + if (mk->mp) + sysfs_remove_group(&mk->kobj, &mk->mp->grp); + + /* These should not fail at boot. 
*/ + err = add_sysfs_param(mk, kparam, kparam->name + name_skip); + BUG_ON(err); + err = sysfs_create_group(&mk->kobj, &mk->mp->grp); + BUG_ON(err); + kobject_uevent(&mk->kobj, KOBJ_ADD); + kobject_put(&mk->kobj); +} + +/* + * param_sysfs_builtin - add contents in /sys/parameters for built-in modules + * + * Add module_parameters to sysfs for "modules" built into the kernel. + * + * The "module" name (KBUILD_MODNAME) is stored before a dot, the + * "parameter" name is stored behind a dot in kernel_param->name. So, + * extract the "module" name for all built-in kernel_param-eters, + * and for all who have the same, call kernel_add_sysfs_param. + */ +static void __init param_sysfs_builtin(void) +{ + struct kernel_param *kp; + unsigned int name_len; + char modname[MODULE_NAME_LEN]; + + for (kp = __start___param; kp < __stop___param; kp++) { + char *dot; + + if (kp->perm == 0) + continue; + + dot = strchr(kp->name, '.'); + if (!dot) { + /* This happens for core_param() */ + strcpy(modname, "kernel"); + name_len = 0; + } else { + name_len = dot - kp->name + 1; + strlcpy(modname, kp->name, name_len); + } + kernel_add_sysfs_param(modname, kp, name_len); + } +} + +ssize_t __modver_version_show(struct module_attribute *mattr, + struct module *mod, char *buf) +{ + struct module_version_attribute *vattr = + container_of(mattr, struct module_version_attribute, mattr); + + return sprintf(buf, "%s\n", vattr->version); +} + +extern const struct module_version_attribute *__start___modver[]; +extern const struct module_version_attribute *__stop___modver[]; + +static void __init version_sysfs_builtin(void) +{ + const struct module_version_attribute **p; + struct module_kobject *mk; + int err; + + for (p = __start___modver; p < __stop___modver; p++) { + const struct module_version_attribute *vattr = *p; + + mk = locate_module_kobject(vattr->module_name); + if (mk) { + err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); + kobject_uevent(&mk->kobj, KOBJ_ADD); + 
kobject_put(&mk->kobj); + } + } +} + +/* module-related sysfs stuff */ + +static ssize_t module_attr_show(struct kobject *kobj, + struct attribute *attr, + char *buf) +{ + struct module_attribute *attribute; + struct module_kobject *mk; + int ret; + + attribute = to_module_attr(attr); + mk = to_module_kobject(kobj); + + if (!attribute->show) + return -EIO; + + ret = attribute->show(attribute, mk->mod, buf); + + return ret; +} + +static ssize_t module_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct module_attribute *attribute; + struct module_kobject *mk; + int ret; + + attribute = to_module_attr(attr); + mk = to_module_kobject(kobj); + + if (!attribute->store) + return -EIO; + + ret = attribute->store(attribute, mk->mod, buf, len); + + return ret; +} + +static const struct sysfs_ops module_sysfs_ops = { + .show = module_attr_show, + .store = module_attr_store, +}; + +static int uevent_filter(struct kset *kset, struct kobject *kobj) +{ + struct kobj_type *ktype = get_ktype(kobj); + + if (ktype == &module_ktype) + return 1; + return 0; +} + +static const struct kset_uevent_ops module_uevent_ops = { + .filter = uevent_filter, +}; + +struct kset *module_kset; +int module_sysfs_initialized; + +struct kobj_type module_ktype = { + .sysfs_ops = &module_sysfs_ops, +}; + +/* + * param_sysfs_init - wrapper for built-in params support + */ +static int __init param_sysfs_init(void) +{ + module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); + if (!module_kset) { + printk(KERN_WARNING "%s (%d): error creating kset\n", + __FILE__, __LINE__); + return -ENOMEM; + } + module_sysfs_initialized = 1; + + version_sysfs_builtin(); + param_sysfs_builtin(); + + return 0; +} +subsys_initcall(param_sysfs_init); + +#endif /* CONFIG_SYSFS */ diff --git a/kernel/pid.c b/kernel/pid.c new file mode 100644 index 00000000..57a8346a --- /dev/null +++ b/kernel/pid.c @@ -0,0 +1,570 @@ +/* + * Generic pidhash and scalable, 
time-bounded PID allocator + * + * (C) 2002-2003 William Irwin, IBM + * (C) 2004 William Irwin, Oracle + * (C) 2002-2004 Ingo Molnar, Red Hat + * + * pid-structures are backing objects for tasks sharing a given ID to chain + * against. There is very little to them aside from hashing them and + * parking tasks using given ID's on a list. + * + * The hash is always changed with the tasklist_lock write-acquired, + * and the hash is only accessed with the tasklist_lock at least + * read-acquired, so there's no additional SMP locking needed here. + * + * We have a list of bitmap pages, which bitmaps represent the PID space. + * Allocating and freeing PIDs is completely lockless. The worst-case + * allocation scenario when all but one out of 1 million PIDs possible are + * allocated already: the scanning of 32 list entries and at most PAGE_SIZE + * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). + * + * Pid namespaces: + * (C) 2007 Pavel Emelyanov , OpenVZ, SWsoft Inc. 
+ * (C) 2007 Sukadev Bhattiprolu , IBM + * Many thanks to Oleg Nesterov for comments and help + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define pid_hashfn(nr, ns) \ + hash_long((unsigned long)nr + (unsigned long)ns, pidhash_shift) +static struct hlist_head *pid_hash; +static unsigned int pidhash_shift = 4; +struct pid init_struct_pid = INIT_STRUCT_PID; + +int pid_max = PID_MAX_DEFAULT; + +#define RESERVED_PIDS 300 + +int pid_max_min = RESERVED_PIDS + 1; +int pid_max_max = PID_MAX_LIMIT; + +#define BITS_PER_PAGE (PAGE_SIZE*8) +#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1) + +static inline int mk_pid(struct pid_namespace *pid_ns, + struct pidmap *map, int off) +{ + return (map - pid_ns->pidmap)*BITS_PER_PAGE + off; +} + +#define find_next_offset(map, off) \ + find_next_zero_bit((map)->page, BITS_PER_PAGE, off) + +/* + * PID-map pages start out as NULL, they get allocated upon + * first use and are never deallocated. This way a low pid_max + * value does not cause lots of bitmaps to be allocated, but + * the scheme scales to up to 4 million PIDs, runtime. + */ +struct pid_namespace init_pid_ns = { + .kref = { + .refcount = ATOMIC_INIT(2), + }, + .pidmap = { + [ 0 ... PIDMAP_ENTRIES-1] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } + }, + .last_pid = 0, + .level = 0, + .child_reaper = &init_task, +}; +EXPORT_SYMBOL_GPL(init_pid_ns); + +int is_container_init(struct task_struct *tsk) +{ + int ret = 0; + struct pid *pid; + + rcu_read_lock(); + pid = task_pid(tsk); + if (pid != NULL && pid->numbers[pid->level].nr == 1) + ret = 1; + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL(is_container_init); + +/* + * Note: disable interrupts while the pidmap_lock is held as an + * interrupt might come in and do read_lock(&tasklist_lock). 
+ *
+ * If we don't disable interrupts there is a nasty deadlock between
+ * detach_pid()->free_pid() and another cpu that does
+ * spin_lock(&pidmap_lock) followed by an interrupt routine that does
+ * read_lock(&tasklist_lock);
+ *
+ * After we clean up the tasklist_lock and know there are no
+ * irq handlers that take it we can leave the interrupts enabled.
+ * For now it is easier to be safe than to prove it can't happen.
+ */
+
+static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+
+/*
+ * Give a pid number back to its namespace: clear its bit in the
+ * bitmap page that covers it and count one more free slot there.
+ */
+static void free_pidmap(struct upid *upid)
+{
+	int nr = upid->nr;
+	int bit = nr & BITS_PER_PAGE_MASK;
+	struct pidmap *map = &upid->ns->pidmap[nr / BITS_PER_PAGE];
+
+	clear_bit(bit, map->page);
+	atomic_inc(&map->nr_free);
+}
+
+/*
+ * Walking pids upwards from 'base' (with wrap-around), do we reach 'a'
+ * before we reach 'b'?  Returns non-zero if so.
+ */
+static int pid_before(int base, int a, int b)
+{
+	/*
+	 * Distance of each pid from 'base', taken modulo the unsigned
+	 * range so that wrapped values sort after unwrapped ones:
+	 *
+	 *   (a - base + MAXUINT) % MAXUINT < (b - base + MAXUINT) % MAXUINT
+	 */
+	unsigned int dist_a = a - base;
+	unsigned int dist_b = b - base;
+
+	return dist_a < dist_b;
+}
+
+/*
+ * We might be racing with someone else trying to set pid_ns->last_pid.
+ * We want the winner to have the "later" value, because if the
+ * "earlier" value prevails, then a pid may get reused immediately.
+ *
+ * Since pids rollover, it is not sufficient to just pick the bigger
+ * value.  We have to consider where we started counting from.
+ *
+ * 'base' is the value of pid_ns->last_pid that we observed when
+ * we started looking for a pid.
+ *
+ * 'pid' is the pid that we eventually found.
+ */ +static void set_last_pid(struct pid_namespace *pid_ns, int base, int pid) +{ + int prev; + int last_write = base; + do { + prev = last_write; + last_write = cmpxchg(&pid_ns->last_pid, prev, pid); + } while ((prev != last_write) && (pid_before(base, last_write, pid))); +} + +static int alloc_pidmap(struct pid_namespace *pid_ns) +{ + int i, offset, max_scan, pid, last = pid_ns->last_pid; + struct pidmap *map; + + pid = last + 1; + if (pid >= pid_max) + pid = RESERVED_PIDS; + offset = pid & BITS_PER_PAGE_MASK; + map = &pid_ns->pidmap[pid/BITS_PER_PAGE]; + /* + * If last_pid points into the middle of the map->page we + * want to scan this bitmap block twice, the second time + * we start with offset == 0 (or RESERVED_PIDS). + */ + max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset; + for (i = 0; i <= max_scan; ++i) { + if (unlikely(!map->page)) { + void *page = kzalloc(PAGE_SIZE, GFP_KERNEL); + /* + * Free the page if someone raced with us + * installing it: + */ + spin_lock_irq(&pidmap_lock); + if (!map->page) { + map->page = page; + page = NULL; + } + spin_unlock_irq(&pidmap_lock); + kfree(page); + if (unlikely(!map->page)) + break; + } + if (likely(atomic_read(&map->nr_free))) { + do { + if (!test_and_set_bit(offset, map->page)) { + atomic_dec(&map->nr_free); + set_last_pid(pid_ns, last, pid); + return pid; + } + offset = find_next_offset(map, offset); + pid = mk_pid(pid_ns, map, offset); + } while (offset < BITS_PER_PAGE && pid < pid_max); + } + if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) { + ++map; + offset = 0; + } else { + map = &pid_ns->pidmap[0]; + offset = RESERVED_PIDS; + if (unlikely(last == offset)) + break; + } + pid = mk_pid(pid_ns, map, offset); + } + return -1; +} + +int next_pidmap(struct pid_namespace *pid_ns, unsigned int last) +{ + int offset; + struct pidmap *map, *end; + + if (last >= PID_MAX_LIMIT) + return -1; + + offset = (last + 1) & BITS_PER_PAGE_MASK; + map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE]; + end = 
&pid_ns->pidmap[PIDMAP_ENTRIES]; + for (; map < end; map++, offset = 0) { + if (unlikely(!map->page)) + continue; + offset = find_next_bit((map)->page, BITS_PER_PAGE, offset); + if (offset < BITS_PER_PAGE) + return mk_pid(pid_ns, map, offset); + } + return -1; +} + +void put_pid(struct pid *pid) +{ + struct pid_namespace *ns; + + if (!pid) + return; + + ns = pid->numbers[pid->level].ns; + if ((atomic_read(&pid->count) == 1) || + atomic_dec_and_test(&pid->count)) { + kmem_cache_free(ns->pid_cachep, pid); + put_pid_ns(ns); + } +} +EXPORT_SYMBOL_GPL(put_pid); + +static void delayed_put_pid(struct rcu_head *rhp) +{ + struct pid *pid = container_of(rhp, struct pid, rcu); + put_pid(pid); +} + +void free_pid(struct pid *pid) +{ + /* We can be called with write_lock_irq(&tasklist_lock) held */ + int i; + unsigned long flags; + + spin_lock_irqsave(&pidmap_lock, flags); + for (i = 0; i <= pid->level; i++) + hlist_del_rcu(&pid->numbers[i].pid_chain); + spin_unlock_irqrestore(&pidmap_lock, flags); + + for (i = 0; i <= pid->level; i++) + free_pidmap(pid->numbers + i); + + call_rcu(&pid->rcu, delayed_put_pid); +} + +struct pid *alloc_pid(struct pid_namespace *ns) +{ + struct pid *pid; + enum pid_type type; + int i, nr; + struct pid_namespace *tmp; + struct upid *upid; + + pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); + if (!pid) + goto out; + + tmp = ns; + for (i = ns->level; i >= 0; i--) { + nr = alloc_pidmap(tmp); + if (nr < 0) + goto out_free; + + pid->numbers[i].nr = nr; + pid->numbers[i].ns = tmp; + tmp = tmp->parent; + } + + get_pid_ns(ns); + pid->level = ns->level; + atomic_set(&pid->count, 1); + for (type = 0; type < PIDTYPE_MAX; ++type) + INIT_HLIST_HEAD(&pid->tasks[type]); + + upid = pid->numbers + ns->level; + spin_lock_irq(&pidmap_lock); + for ( ; upid >= pid->numbers; --upid) + hlist_add_head_rcu(&upid->pid_chain, + &pid_hash[pid_hashfn(upid->nr, upid->ns)]); + spin_unlock_irq(&pidmap_lock); + +out: + return pid; + +out_free: + while (++i <= ns->level) + 
free_pidmap(pid->numbers + i); + + kmem_cache_free(ns->pid_cachep, pid); + pid = NULL; + goto out; +} + +struct pid *find_pid_ns(int nr, struct pid_namespace *ns) +{ + struct hlist_node *elem; + struct upid *pnr; + + hlist_for_each_entry_rcu(pnr, elem, + &pid_hash[pid_hashfn(nr, ns)], pid_chain) + if (pnr->nr == nr && pnr->ns == ns) + return container_of(pnr, struct pid, + numbers[ns->level]); + + return NULL; +} +EXPORT_SYMBOL_GPL(find_pid_ns); + +struct pid *find_vpid(int nr) +{ + return find_pid_ns(nr, current->nsproxy->pid_ns); +} +EXPORT_SYMBOL_GPL(find_vpid); + +/* + * attach_pid() must be called with the tasklist_lock write-held. + */ +void attach_pid(struct task_struct *task, enum pid_type type, + struct pid *pid) +{ + struct pid_link *link; + + link = &task->pids[type]; + link->pid = pid; + hlist_add_head_rcu(&link->node, &pid->tasks[type]); +} + +static void __change_pid(struct task_struct *task, enum pid_type type, + struct pid *new) +{ + struct pid_link *link; + struct pid *pid; + int tmp; + + link = &task->pids[type]; + pid = link->pid; + + hlist_del_rcu(&link->node); + link->pid = new; + + for (tmp = PIDTYPE_MAX; --tmp >= 0; ) + if (!hlist_empty(&pid->tasks[tmp])) + return; + + free_pid(pid); +} + +void detach_pid(struct task_struct *task, enum pid_type type) +{ + __change_pid(task, type, NULL); +} + +void change_pid(struct task_struct *task, enum pid_type type, + struct pid *pid) +{ + __change_pid(task, type, pid); + attach_pid(task, type, pid); +} + +/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ +void transfer_pid(struct task_struct *old, struct task_struct *new, + enum pid_type type) +{ + new->pids[type].pid = old->pids[type].pid; + hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node); +} + +struct task_struct *pid_task(struct pid *pid, enum pid_type type) +{ + struct task_struct *result = NULL; + if (pid) { + struct hlist_node *first; + first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), + 
rcu_read_lock_held() || + lockdep_tasklist_lock_is_held()); + if (first) + result = hlist_entry(first, struct task_struct, pids[(type)].node); + } + return result; +} +EXPORT_SYMBOL(pid_task); + +/* + * Must be called under rcu_read_lock(). + */ +struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) +{ + rcu_lockdep_assert(rcu_read_lock_held()); + return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); +} + +struct task_struct *find_task_by_vpid(pid_t vnr) +{ + return find_task_by_pid_ns(vnr, current->nsproxy->pid_ns); +} + +struct pid *get_task_pid(struct task_struct *task, enum pid_type type) +{ + struct pid *pid; + rcu_read_lock(); + if (type != PIDTYPE_PID) + task = task->group_leader; + pid = get_pid(task->pids[type].pid); + rcu_read_unlock(); + return pid; +} +EXPORT_SYMBOL_GPL(get_task_pid); + +struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) +{ + struct task_struct *result; + rcu_read_lock(); + result = pid_task(pid, type); + if (result) + get_task_struct(result); + rcu_read_unlock(); + return result; +} +EXPORT_SYMBOL_GPL(get_pid_task); + +struct pid *find_get_pid(pid_t nr) +{ + struct pid *pid; + + rcu_read_lock(); + pid = get_pid(find_vpid(nr)); + rcu_read_unlock(); + + return pid; +} +EXPORT_SYMBOL_GPL(find_get_pid); + +pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) +{ + struct upid *upid; + pid_t nr = 0; + + if (pid && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; + } + return nr; +} + +pid_t pid_vnr(struct pid *pid) +{ + return pid_nr_ns(pid, current->nsproxy->pid_ns); +} +EXPORT_SYMBOL_GPL(pid_vnr); + +pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + struct pid_namespace *ns) +{ + pid_t nr = 0; + + rcu_read_lock(); + if (!ns) + ns = current->nsproxy->pid_ns; + if (likely(pid_alive(task))) { + if (type != PIDTYPE_PID) + task = task->group_leader; + nr = pid_nr_ns(task->pids[type].pid, ns); + } + rcu_read_unlock(); + + return 
nr; +} +EXPORT_SYMBOL(__task_pid_nr_ns); + +pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return pid_nr_ns(task_tgid(tsk), ns); +} +EXPORT_SYMBOL(task_tgid_nr_ns); + +struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) +{ + return ns_of_pid(task_pid(tsk)); +} +EXPORT_SYMBOL_GPL(task_active_pid_ns); + +/* + * Used by proc to find the first pid that is greater than or equal to nr. + * + * If there is a pid at nr this function is exactly the same as find_pid_ns. + */ +struct pid *find_ge_pid(int nr, struct pid_namespace *ns) +{ + struct pid *pid; + + do { + pid = find_pid_ns(nr, ns); + if (pid) + break; + nr = next_pidmap(ns, nr); + } while (nr > 0); + + return pid; +} + +/* + * The pid hash table is scaled according to the amount of memory in the + * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or + * more. + */ +void __init pidhash_init(void) +{ + int i, pidhash_size; + + pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18, + HASH_EARLY | HASH_SMALL, + &pidhash_shift, NULL, 4096); + pidhash_size = 1 << pidhash_shift; + + for (i = 0; i < pidhash_size; i++) + INIT_HLIST_HEAD(&pid_hash[i]); +} + +void __init pidmap_init(void) +{ + /* bump default and minimum pid_max based on number of cpus */ + pid_max = min(pid_max_max, max_t(int, pid_max, + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); + pid_max_min = max_t(int, pid_max_min, + PIDS_PER_CPU_MIN * num_possible_cpus()); + pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); + + init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + /* Reserve PID 0. 
We never call free_pidmap(0) */ + set_bit(0, init_pid_ns.pidmap[0].page); + atomic_dec(&init_pid_ns.pidmap[0].nr_free); + + init_pid_ns.pid_cachep = KMEM_CACHE(pid, + SLAB_HWCACHE_ALIGN | SLAB_PANIC); +} diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c new file mode 100644 index 00000000..e9c9adc8 --- /dev/null +++ b/kernel/pid_namespace.c @@ -0,0 +1,200 @@ +/* + * Pid namespaces + * + * Authors: + * (C) 2007 Pavel Emelyanov , OpenVZ, SWsoft Inc. + * (C) 2007 Sukadev Bhattiprolu , IBM + * Many thanks to Oleg Nesterov for comments and help + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#define BITS_PER_PAGE (PAGE_SIZE*8) + +struct pid_cache { + int nr_ids; + char name[16]; + struct kmem_cache *cachep; + struct list_head list; +}; + +static LIST_HEAD(pid_caches_lh); +static DEFINE_MUTEX(pid_caches_mutex); +static struct kmem_cache *pid_ns_cachep; + +/* + * creates the kmem cache to allocate pids from. + * @nr_ids: the number of numerical ids this pid will have to carry + */ + +static struct kmem_cache *create_pid_cachep(int nr_ids) +{ + struct pid_cache *pcache; + struct kmem_cache *cachep; + + mutex_lock(&pid_caches_mutex); + list_for_each_entry(pcache, &pid_caches_lh, list) + if (pcache->nr_ids == nr_ids) + goto out; + + pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL); + if (pcache == NULL) + goto err_alloc; + + snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids); + cachep = kmem_cache_create(pcache->name, + sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid), + 0, SLAB_HWCACHE_ALIGN, NULL); + if (cachep == NULL) + goto err_cachep; + + pcache->nr_ids = nr_ids; + pcache->cachep = cachep; + list_add(&pcache->list, &pid_caches_lh); +out: + mutex_unlock(&pid_caches_mutex); + return pcache->cachep; + +err_cachep: + kfree(pcache); +err_alloc: + mutex_unlock(&pid_caches_mutex); + return NULL; +} + +static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) +{ + struct 
pid_namespace *ns; + unsigned int level = parent_pid_ns->level + 1; + int i, err = -ENOMEM; + + ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); + if (ns == NULL) + goto out; + + ns->pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!ns->pidmap[0].page) + goto out_free; + + ns->pid_cachep = create_pid_cachep(level + 1); + if (ns->pid_cachep == NULL) + goto out_free_map; + + kref_init(&ns->kref); + ns->level = level; + ns->parent = get_pid_ns(parent_pid_ns); + + set_bit(0, ns->pidmap[0].page); + atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); + + for (i = 1; i < PIDMAP_ENTRIES; i++) + atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); + + err = pid_ns_prepare_proc(ns); + if (err) + goto out_put_parent_pid_ns; + + return ns; + +out_put_parent_pid_ns: + put_pid_ns(parent_pid_ns); +out_free_map: + kfree(ns->pidmap[0].page); +out_free: + kmem_cache_free(pid_ns_cachep, ns); +out: + return ERR_PTR(err); +} + +static void destroy_pid_namespace(struct pid_namespace *ns) +{ + int i; + + for (i = 0; i < PIDMAP_ENTRIES; i++) + kfree(ns->pidmap[i].page); + kmem_cache_free(pid_ns_cachep, ns); +} + +struct pid_namespace *copy_pid_ns(unsigned long flags, struct pid_namespace *old_ns) +{ + if (!(flags & CLONE_NEWPID)) + return get_pid_ns(old_ns); + if (flags & (CLONE_THREAD|CLONE_PARENT)) + return ERR_PTR(-EINVAL); + return create_pid_namespace(old_ns); +} + +void free_pid_ns(struct kref *kref) +{ + struct pid_namespace *ns, *parent; + + ns = container_of(kref, struct pid_namespace, kref); + + parent = ns->parent; + destroy_pid_namespace(ns); + + if (parent != NULL) + put_pid_ns(parent); +} + +void zap_pid_ns_processes(struct pid_namespace *pid_ns) +{ + int nr; + int rc; + struct task_struct *task; + + /* + * The last thread in the cgroup-init thread group is terminating. + * Find remaining pid_ts in the namespace, signal and wait for them + * to exit. 
+ * + * Note: This signals each thread in the namespace - even those that + * belong to the same thread group. To avoid this, we would have + * to walk the entire tasklist looking for processes in this + * namespace, but that could be unnecessarily expensive if the + * pid namespace has just a few processes. Or we need to + * maintain a tasklist for each pid namespace. + * + */ + read_lock(&tasklist_lock); + nr = next_pidmap(pid_ns, 1); + while (nr > 0) { + rcu_read_lock(); + + /* + * Any nested-container's init processes won't ignore the + * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). + */ + task = pid_task(find_vpid(nr), PIDTYPE_PID); + if (task) + send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); + + rcu_read_unlock(); + + nr = next_pidmap(pid_ns, nr); + } + read_unlock(&tasklist_lock); + + do { + clear_thread_flag(TIF_SIGPENDING); + rc = sys_wait4(-1, NULL, __WALL, NULL); + } while (rc != -ECHILD); + + acct_exit_ns(pid_ns); + return; +} + +static __init int pid_namespaces_init(void) +{ + pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); + return 0; +} + +__initcall(pid_namespaces_init); diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c new file mode 100644 index 00000000..37f05d0f --- /dev/null +++ b/kernel/pm_qos_params.c @@ -0,0 +1,481 @@ +/* + * This module exposes the interface to kernel space for specifying + * QoS dependencies. It provides infrastructure for registration of: + * + * Dependents on a QoS value : register requests + * Watchers of QoS value : get notified when target QoS value changes + * + * This QoS design is best effort based. Dependents register their QoS needs. + * Watchers register to keep track of the current QoS needs of the system. + * + * There are 3 basic classes of QoS parameter: latency, timeout, throughput + * each has defined units: + * latency: usec + * timeout: usec <-- currently not used. 
+ * throughput: kbs (kilo byte / sec) + * + * There are lists of pm_qos_objects each one wrapping requests, notifiers + * + * User mode requests on a QOS parameter register themselves to the + * subsystem by opening the device node /dev/... and writing their request to + * the node. As long as the process holds a file handle open to the node the + * client continues to be accounted for. Upon file release the usermode + * request is removed and a new qos target is computed. This way, when the + * application closes the file pointer or exits, the pm_qos_object gets an + * opportunity to clean up the request that the application holds. + * + * Mark Gross + */ + +/*#define DEBUG*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * locking rule: all changes to requests or notifiers lists + * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock + * held, taken with _irqsave. One lock to rule them all + */ +enum pm_qos_type { + PM_QOS_MAX, /* return the largest value */ + PM_QOS_MIN /* return the smallest value */ +}; + +/* + * Note: The lockless read path depends on the CPU accessing + * target_value atomically. 
Atomic access is only guaranteed on all CPU + * types linux supports for 32 bit quantities + */ +struct pm_qos_object { + struct plist_head requests; + struct blocking_notifier_head *notifiers; + struct miscdevice pm_qos_power_miscdev; + char *name; + s32 target_value; /* Do not change to 64 bit */ + s32 default_value; + enum pm_qos_type type; +}; + +static DEFINE_SPINLOCK(pm_qos_lock); + +static struct pm_qos_object null_pm_qos; +static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); +static struct pm_qos_object cpu_dma_pm_qos = { + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), + .notifiers = &cpu_dma_lat_notifier, + .name = "cpu_dma_latency", + .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, + .type = PM_QOS_MIN, +}; + +static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); +static struct pm_qos_object network_lat_pm_qos = { + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), + .notifiers = &network_lat_notifier, + .name = "network_latency", + .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, + .type = PM_QOS_MIN +}; + + +static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); +static struct pm_qos_object network_throughput_pm_qos = { + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), + .notifiers = &network_throughput_notifier, + .name = "network_throughput", + .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, + .type = PM_QOS_MAX, +}; + + +static struct pm_qos_object *pm_qos_array[] = { + &null_pm_qos, + &cpu_dma_pm_qos, + &network_lat_pm_qos, + &network_throughput_pm_qos +}; + +static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos); +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos); +static int pm_qos_power_open(struct inode *inode, struct file *filp); 
+static int pm_qos_power_release(struct inode *inode, struct file *filp); + +static const struct file_operations pm_qos_power_fops = { + .write = pm_qos_power_write, + .read = pm_qos_power_read, + .open = pm_qos_power_open, + .release = pm_qos_power_release, + .llseek = noop_llseek, +}; + +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_object *o) +{ + if (plist_head_empty(&o->requests)) + return o->default_value; + + switch (o->type) { + case PM_QOS_MIN: + return plist_first(&o->requests)->prio; + + case PM_QOS_MAX: + return plist_last(&o->requests)->prio; + + default: + /* runtime check for not using enum */ + BUG(); + } +} + +static inline s32 pm_qos_read_value(struct pm_qos_object *o) +{ + return o->target_value; +} + +static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) +{ + o->target_value = value; +} + +static void update_target(struct pm_qos_object *o, struct plist_node *node, + int del, int value) +{ + unsigned long flags; + int prev_value, curr_value; + + spin_lock_irqsave(&pm_qos_lock, flags); + prev_value = pm_qos_get_value(o); + /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ + if (value != PM_QOS_DEFAULT_VALUE) { + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &o->requests); + plist_node_init(node, value); + plist_add(node, &o->requests); + } else if (del) { + plist_del(node, &o->requests); + } else { + plist_add(node, &o->requests); + } + curr_value = pm_qos_get_value(o); + pm_qos_set_value(o, curr_value); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + if (prev_value != curr_value) + blocking_notifier_call_chain(o->notifiers, + (unsigned long)curr_value, + NULL); +} + +static int register_pm_qos_misc(struct pm_qos_object *qos) +{ + qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR; + qos->pm_qos_power_miscdev.name = qos->name; + qos->pm_qos_power_miscdev.fops = 
&pm_qos_power_fops; + + return misc_register(&qos->pm_qos_power_miscdev); +} + +static int find_pm_qos_object_by_minor(int minor) +{ + int pm_qos_class; + + for (pm_qos_class = 0; + pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) { + if (minor == + pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor) + return pm_qos_class; + } + return -1; +} + +/** + * pm_qos_request - returns current system wide qos expectation + * @pm_qos_class: identification of which qos value is requested + * + * This function returns the current target value. + */ +int pm_qos_request(int pm_qos_class) +{ + return pm_qos_read_value(pm_qos_array[pm_qos_class]); +} +EXPORT_SYMBOL_GPL(pm_qos_request); + +int pm_qos_request_active(struct pm_qos_request_list *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); + +/** + * pm_qos_add_request - inserts new qos request into the list + * @dep: pointer to a preallocated handle + * @pm_qos_class: identifies which list of qos request to use + * @value: defines the qos request + * + * This function inserts a new entry in the pm_qos_class list of requested qos + * performance characteristics. It recomputes the aggregate QoS expectations + * for the pm_qos_class of parameters and initializes the pm_qos_request_list + * handle. Caller needs to save this handle for later use in updates and + * removal. 
+ */ + +void pm_qos_add_request(struct pm_qos_request_list *dep, + int pm_qos_class, s32 value) +{ + struct pm_qos_object *o = pm_qos_array[pm_qos_class]; + int new_value; + + if (pm_qos_request_active(dep)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; + } + if (value == PM_QOS_DEFAULT_VALUE) + new_value = o->default_value; + else + new_value = value; + plist_node_init(&dep->list, new_value); + dep->pm_qos_class = pm_qos_class; + update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); +} +EXPORT_SYMBOL_GPL(pm_qos_add_request); + +/** + * pm_qos_update_request - modifies an existing qos request + * @pm_qos_req : handle to list element holding a pm_qos request to use + * @value: defines the qos request + * + * Updates an existing qos request for the pm_qos_class of parameters along + * with updating the target pm_qos_class value. + * + * Attempts are made to make this code callable on hot code paths. + */ +void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, + s32 new_value) +{ + s32 temp; + struct pm_qos_object *o; + + if (!pm_qos_req) /*guard against callers passing in null */ + return; + + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; + } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = o->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->list.prio) + update_target(o, &pm_qos_req->list, 0, temp); +} +EXPORT_SYMBOL_GPL(pm_qos_update_request); + +/** + * pm_qos_remove_request - modifies an existing qos request + * @pm_qos_req: handle to request list element + * + * Will remove pm qos request from the list of requests and + * recompute the current target value for the pm_qos_class. Call this + * on slow code paths. 
+ */ +void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) +{ + struct pm_qos_object *o; + + if (pm_qos_req == NULL) + return; + /* silent return to keep pcm code cleaner */ + + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; + } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); + memset(pm_qos_req, 0, sizeof(*pm_qos_req)); +} +EXPORT_SYMBOL_GPL(pm_qos_remove_request); + +/** + * pm_qos_add_notifier - sets notification entry for changes to target value + * @pm_qos_class: identifies which qos target changes should be notified. + * @notifier: notifier block managed by caller. + * + * will register the notifier into a notification chain that gets called + * upon changes to the pm_qos_class target value. + */ +int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) +{ + int retval; + + retval = blocking_notifier_chain_register( + pm_qos_array[pm_qos_class]->notifiers, notifier); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_qos_add_notifier); + +/** + * pm_qos_remove_notifier - deletes notification entry from chain. + * @pm_qos_class: identifies which qos target changes are notified. + * @notifier: notifier block to be removed. + * + * will remove the notifier from the notification chain that gets called + * upon changes to the pm_qos_class target value. 
+ */ +int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) +{ + int retval; + + retval = blocking_notifier_chain_unregister( + pm_qos_array[pm_qos_class]->notifiers, notifier); + + return retval; +} +EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); + +static int pm_qos_power_open(struct inode *inode, struct file *filp) +{ + long pm_qos_class; + + pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); + if (pm_qos_class >= 0) { + struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; + + if (filp->private_data) + return 0; + } + return -EPERM; +} + +static int pm_qos_power_release(struct inode *inode, struct file *filp) +{ + struct pm_qos_request_list *req; + + req = filp->private_data; + pm_qos_remove_request(req); + kfree(req); + + return 0; +} + + +static ssize_t pm_qos_power_read(struct file *filp, char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + unsigned long flags; + struct pm_qos_object *o; + struct pm_qos_request_list *pm_qos_req = filp->private_data; + + if (!pm_qos_req) + return -EINVAL; + if (!pm_qos_request_active(pm_qos_req)) + return -EINVAL; + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(o); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); +} + +static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, + size_t count, loff_t *f_pos) +{ + s32 value; + struct pm_qos_request_list *pm_qos_req; + + if (count == sizeof(s32)) { + if (copy_from_user(&value, buf, sizeof(s32))) + return -EFAULT; + } else if (count <= 11) { /* ASCII perhaps? 
*/ + char ascii_value[11]; + unsigned long int ulval; + int ret; + + if (copy_from_user(ascii_value, buf, count)) + return -EFAULT; + + if (count > 10) { + if (ascii_value[10] == '\n') + ascii_value[10] = '\0'; + else + return -EINVAL; + } else { + ascii_value[count] = '\0'; + } + ret = strict_strtoul(ascii_value, 16, &ulval); + if (ret) { + pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret); + return -EINVAL; + } + value = (s32)lower_32_bits(ulval); + } else { + return -EINVAL; + } + + pm_qos_req = filp->private_data; + pm_qos_update_request(pm_qos_req, value); + + return count; +} + + +static int __init pm_qos_power_init(void) +{ + int ret = 0; + + ret = register_pm_qos_misc(&cpu_dma_pm_qos); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: cpu_dma_latency setup failed\n"); + return ret; + } + ret = register_pm_qos_misc(&network_lat_pm_qos); + if (ret < 0) { + printk(KERN_ERR "pm_qos_param: network_latency setup failed\n"); + return ret; + } + ret = register_pm_qos_misc(&network_throughput_pm_qos); + if (ret < 0) + printk(KERN_ERR + "pm_qos_param: network_throughput setup failed\n"); + + return ret; +} + +late_initcall(pm_qos_power_init); diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c new file mode 100644 index 00000000..640ded8f --- /dev/null +++ b/kernel/posix-cpu-timers.c @@ -0,0 +1,1632 @@ +/* + * Implement CPU time clocks for the POSIX clock interface. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * Called after updating RLIMIT_CPU to run cpu timer and update + * tsk->signal->cputime_expires expiration cache if necessary. Needs + * siglock protection since other code may update expiration cache as + * well. 
+ */ +void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new) +{ + cputime_t cputime = secs_to_cputime(rlim_new); + + spin_lock_irq(&task->sighand->siglock); + set_process_cpu_timer(task, CPUCLOCK_PROF, &cputime, NULL); + spin_unlock_irq(&task->sighand->siglock); +} + +static int check_clock(const clockid_t which_clock) +{ + int error = 0; + struct task_struct *p; + const pid_t pid = CPUCLOCK_PID(which_clock); + + if (CPUCLOCK_WHICH(which_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + if (pid == 0) + return 0; + + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ? + same_thread_group(p, current) : has_group_leader_pid(p))) { + error = -EINVAL; + } + rcu_read_unlock(); + + return error; +} + +static inline union cpu_time_count +timespec_to_sample(const clockid_t which_clock, const struct timespec *tp) +{ + union cpu_time_count ret; + ret.sched = 0; /* high half always zero when .cpu used */ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec; + } else { + ret.cpu = timespec_to_cputime(tp); + } + return ret; +} + +static void sample_to_timespec(const clockid_t which_clock, + union cpu_time_count cpu, + struct timespec *tp) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) + *tp = ns_to_timespec(cpu.sched); + else + cputime_to_timespec(cpu.cpu, tp); +} + +static inline int cpu_time_before(const clockid_t which_clock, + union cpu_time_count now, + union cpu_time_count then) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + return now.sched < then.sched; + } else { + return cputime_lt(now.cpu, then.cpu); + } +} +static inline void cpu_time_add(const clockid_t which_clock, + union cpu_time_count *acc, + union cpu_time_count val) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + acc->sched += val.sched; + } else { + acc->cpu = cputime_add(acc->cpu, val.cpu); + } +} +static inline union cpu_time_count 
cpu_time_sub(const clockid_t which_clock, + union cpu_time_count a, + union cpu_time_count b) +{ + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + a.sched -= b.sched; + } else { + a.cpu = cputime_sub(a.cpu, b.cpu); + } + return a; +} + +/* + * Divide and limit the result to res >= 1 + * + * This is necessary to prevent signal delivery starvation, when the result of + * the division would be rounded down to 0. + */ +static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div) +{ + cputime_t res = cputime_div(time, div); + + return max_t(cputime_t, res, 1); +} + +/* + * Update expiry time from increment, and increase overrun count, + * given the current clock sample. + */ +static void bump_cpu_timer(struct k_itimer *timer, + union cpu_time_count now) +{ + int i; + + if (timer->it.cpu.incr.sched == 0) + return; + + if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { + unsigned long long delta, incr; + + if (now.sched < timer->it.cpu.expires.sched) + return; + incr = timer->it.cpu.incr.sched; + delta = now.sched + incr - timer->it.cpu.expires.sched; + /* Don't use (incr*2 < delta), incr*2 might overflow. */ + for (i = 0; incr < delta - incr; i++) + incr = incr << 1; + for (; i >= 0; incr >>= 1, i--) { + if (delta < incr) + continue; + timer->it.cpu.expires.sched += incr; + timer->it_overrun += 1 << i; + delta -= incr; + } + } else { + cputime_t delta, incr; + + if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) + return; + incr = timer->it.cpu.incr.cpu; + delta = cputime_sub(cputime_add(now.cpu, incr), + timer->it.cpu.expires.cpu); + /* Don't use (incr*2 < delta), incr*2 might overflow. 
*/ + for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) + incr = cputime_add(incr, incr); + for (; i >= 0; incr = cputime_halve(incr), i--) { + if (cputime_lt(delta, incr)) + continue; + timer->it.cpu.expires.cpu = + cputime_add(timer->it.cpu.expires.cpu, incr); + timer->it_overrun += 1 << i; + delta = cputime_sub(delta, incr); + } + } +} + +static inline cputime_t prof_ticks(struct task_struct *p) +{ + return cputime_add(p->utime, p->stime); +} +static inline cputime_t virt_ticks(struct task_struct *p) +{ + return p->utime; +} + +static int +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +{ + int error = check_clock(which_clock); + if (!error) { + tp->tv_sec = 0; + tp->tv_nsec = ((NSEC_PER_SEC + HZ - 1) / HZ); + if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { + /* + * If sched_clock is using a cycle counter, we + * don't have any idea of its true resolution + * exported, but it is much more than 1s/HZ. + */ + tp->tv_nsec = 1; + } + } + return error; +} + +static int +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +{ + /* + * You can never reset a CPU clock, but we check for other errors + * in the call before failing with EPERM. + */ + int error = check_clock(which_clock); + if (error == 0) { + error = -EPERM; + } + return error; +} + + +/* + * Sample a per-thread clock for the given task. 
+ */ +static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p, + union cpu_time_count *cpu) +{ + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = prof_ticks(p); + break; + case CPUCLOCK_VIRT: + cpu->cpu = virt_ticks(p); + break; + case CPUCLOCK_SCHED: + cpu->sched = task_sched_runtime(p); + break; + } + return 0; +} + +void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) +{ + struct signal_struct *sig = tsk->signal; + struct task_struct *t; + + times->utime = sig->utime; + times->stime = sig->stime; + times->sum_exec_runtime = sig->sum_sched_runtime; + + rcu_read_lock(); + /* make sure we can trust tsk->thread_group list */ + if (!likely(pid_alive(tsk))) + goto out; + + t = tsk; + do { + times->utime = cputime_add(times->utime, t->utime); + times->stime = cputime_add(times->stime, t->stime); + times->sum_exec_runtime += task_sched_runtime(t); + } while_each_thread(tsk, t); +out: + rcu_read_unlock(); +} + +static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) +{ + if (cputime_gt(b->utime, a->utime)) + a->utime = b->utime; + + if (cputime_gt(b->stime, a->stime)) + a->stime = b->stime; + + if (b->sum_exec_runtime > a->sum_exec_runtime) + a->sum_exec_runtime = b->sum_exec_runtime; +} + +void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) +{ + struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct task_cputime sum; + unsigned long flags; + + if (!cputimer->running) { + /* + * The POSIX timer interface allows for absolute time expiry + * values through the TIMER_ABSTIME flag, therefore we have + * to synchronize the timer to the clock every time we start + * it. 
+ */ + thread_group_cputime(tsk, &sum); + spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 1; + update_gt_cputime(&cputimer->cputime, &sum); + } else + spin_lock_irqsave(&cputimer->lock, flags); + *times = cputimer->cputime; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + +/* + * Sample a process (thread group) clock for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_clock_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + thread_group_cputime(p, &cputime); + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + thread_group_cputime(p, &cputime); + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + thread_group_cputime(p, &cputime); + cpu->sched = cputime.sum_exec_runtime; + break; + } + return 0; +} + + +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +{ + const pid_t pid = CPUCLOCK_PID(which_clock); + int error = -EINVAL; + union cpu_time_count rtn; + + if (pid == 0) { + /* + * Special case constant value for our own clocks. + * We don't have to do any lookup to find ourselves. + */ + if (CPUCLOCK_PERTHREAD(which_clock)) { + /* + * Sampling just ourselves we can do with no locking. + */ + error = cpu_clock_sample(which_clock, + current, &rtn); + } else { + read_lock(&tasklist_lock); + error = cpu_clock_sample_group(which_clock, + current, &rtn); + read_unlock(&tasklist_lock); + } + } else { + /* + * Find the given PID, and validate that the caller + * should be able to see it. 
+ */ + struct task_struct *p; + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p) { + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (same_thread_group(p, current)) { + error = cpu_clock_sample(which_clock, + p, &rtn); + } + } else { + read_lock(&tasklist_lock); + if (thread_group_leader(p) && p->sighand) { + error = + cpu_clock_sample_group(which_clock, + p, &rtn); + } + read_unlock(&tasklist_lock); + } + } + rcu_read_unlock(); + } + + if (error) + return error; + sample_to_timespec(which_clock, rtn, tp); + return 0; +} + + +/* + * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. + * This is called from sys_timer_create() and do_cpu_nanosleep() with the + * new timer already all-zeros initialized. + */ +static int posix_cpu_timer_create(struct k_itimer *new_timer) +{ + int ret = 0; + const pid_t pid = CPUCLOCK_PID(new_timer->it_clock); + struct task_struct *p; + + if (CPUCLOCK_WHICH(new_timer->it_clock) >= CPUCLOCK_MAX) + return -EINVAL; + + INIT_LIST_HEAD(&new_timer->it.cpu.entry); + + rcu_read_lock(); + if (CPUCLOCK_PERTHREAD(new_timer->it_clock)) { + if (pid == 0) { + p = current; + } else { + p = find_task_by_vpid(pid); + if (p && !same_thread_group(p, current)) + p = NULL; + } + } else { + if (pid == 0) { + p = current->group_leader; + } else { + p = find_task_by_vpid(pid); + if (p && !has_group_leader_pid(p)) + p = NULL; + } + } + new_timer->it.cpu.task = p; + if (p) { + get_task_struct(p); + } else { + ret = -EINVAL; + } + rcu_read_unlock(); + + return ret; +} + +/* + * Clean up a CPU-clock timer that is about to be destroyed. + * This is called from timer deletion with the timer already locked. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) 
+ */ +static int posix_cpu_timer_del(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + int ret = 0; + + if (likely(p != NULL)) { + read_lock(&tasklist_lock); + if (unlikely(p->sighand == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + BUG_ON(!list_empty(&timer->it.cpu.entry)); + } else { + spin_lock(&p->sighand->siglock); + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); + spin_unlock(&p->sighand->siglock); + } + read_unlock(&tasklist_lock); + + if (!ret) + put_task_struct(p); + } + + return ret; +} + +/* + * Clean out CPU timers still ticking when a thread exited. The task + * pointer is cleared, and the expiry time is replaced with the residual + * time for later timer_gettime calls to return. + * This must be called with the siglock held. + */ +static void cleanup_timers(struct list_head *head, + cputime_t utime, cputime_t stime, + unsigned long long sum_exec_runtime) +{ + struct cpu_timer_list *timer, *next; + cputime_t ptime = cputime_add(utime, stime); + + list_for_each_entry_safe(timer, next, head, entry) { + list_del_init(&timer->entry); + if (cputime_lt(timer->expires.cpu, ptime)) { + timer->expires.cpu = cputime_zero; + } else { + timer->expires.cpu = cputime_sub(timer->expires.cpu, + ptime); + } + } + + ++head; + list_for_each_entry_safe(timer, next, head, entry) { + list_del_init(&timer->entry); + if (cputime_lt(timer->expires.cpu, utime)) { + timer->expires.cpu = cputime_zero; + } else { + timer->expires.cpu = cputime_sub(timer->expires.cpu, + utime); + } + } + + ++head; + list_for_each_entry_safe(timer, next, head, entry) { + list_del_init(&timer->entry); + if (timer->expires.sched < sum_exec_runtime) { + timer->expires.sched = 0; + } else { + timer->expires.sched -= sum_exec_runtime; + } + } +} + +/* + * These are both called with the siglock held, when the current thread + * is being reaped. 
When the final (leader) thread in the group is reaped, + * posix_cpu_timers_exit_group will be called after posix_cpu_timers_exit. + */ +void posix_cpu_timers_exit(struct task_struct *tsk) +{ + cleanup_timers(tsk->cpu_timers, + tsk->utime, tsk->stime, tsk->se.sum_exec_runtime); + +} +void posix_cpu_timers_exit_group(struct task_struct *tsk) +{ + struct signal_struct *const sig = tsk->signal; + + cleanup_timers(tsk->signal->cpu_timers, + cputime_add(tsk->utime, sig->utime), + cputime_add(tsk->stime, sig->stime), + tsk->se.sum_exec_runtime + sig->sum_sched_runtime); +} + +static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) +{ + /* + * That's all for this thread or process. + * We leave our residual in expires to be reported. + */ + put_task_struct(timer->it.cpu.task); + timer->it.cpu.task = NULL; + timer->it.cpu.expires = cpu_time_sub(timer->it_clock, + timer->it.cpu.expires, + now); +} + +static inline int expires_gt(cputime_t expires, cputime_t new_exp) +{ + return cputime_eq(expires, cputime_zero) || + cputime_gt(expires, new_exp); +} + +/* + * Insert the timer on the appropriate list before any timers that + * expire later. This must be called with the tasklist_lock held + * for reading, interrupts disabled and p->sighand->siglock taken. 
+ */ +static void arm_timer(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + struct list_head *head, *listpos; + struct task_cputime *cputime_expires; + struct cpu_timer_list *const nt = &timer->it.cpu; + struct cpu_timer_list *next; + + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + head = p->cpu_timers; + cputime_expires = &p->cputime_expires; + } else { + head = p->signal->cpu_timers; + cputime_expires = &p->signal->cputime_expires; + } + head += CPUCLOCK_WHICH(timer->it_clock); /* step to the list for this clock type */ + + listpos = head; + list_for_each_entry(next, head, entry) { + if (cpu_time_before(timer->it_clock, nt->expires, next->expires)) + break; + listpos = &next->entry; + } + list_add(&nt->entry, listpos); /* goes right after the last timer expiring no later than ours */ + + if (listpos == head) { /* nothing was skipped: we are now first on the list */ + union cpu_time_count *exp = &nt->expires; + + /* + * We are the new earliest-expiring POSIX 1.b timer, hence + * need to update expiration cache. Take into account that + * for process timers we share expiration cache with itimers + * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME. + * A cached value of zero means "nothing armed", so it is + * always replaced; otherwise it is only ever lowered. + */ + + switch (CPUCLOCK_WHICH(timer->it_clock)) { + case CPUCLOCK_PROF: + if (expires_gt(cputime_expires->prof_exp, exp->cpu)) + cputime_expires->prof_exp = exp->cpu; + break; + case CPUCLOCK_VIRT: + if (expires_gt(cputime_expires->virt_exp, exp->cpu)) + cputime_expires->virt_exp = exp->cpu; + break; + case CPUCLOCK_SCHED: + if (cputime_expires->sched_exp == 0 || + cputime_expires->sched_exp > exp->sched) + cputime_expires->sched_exp = exp->sched; + break; + } + } +} + +/* + * The timer is locked, fire it and arrange for its reload. + */ +static void cpu_timer_fire(struct k_itimer *timer) +{ + if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) { + /* + * The user doesn't want any signal. + */ + timer->it.cpu.expires.sched = 0; + } else if (unlikely(timer->sigq == NULL)) { + /* + * This is a special case for clock_nanosleep, + * not a normal timer from sys_timer_create.
+ */ + wake_up_process(timer->it_process); + timer->it.cpu.expires.sched = 0; + } else if (timer->it.cpu.incr.sched == 0) { + /* + * One-shot timer. Clear it as soon as it's fired. + */ + posix_timer_event(timer, 0); + timer->it.cpu.expires.sched = 0; + } else if (posix_timer_event(timer, ++timer->it_requeue_pending)) { + /* + * The signal did not get queued because the signal + * was ignored, so we won't get any callback to + * reload the timer. But we need to keep it + * ticking in case the signal is deliverable next time. + */ + posix_cpu_timer_schedule(timer); + } +} + +/* + * Sample a process (thread group) timer for the given group_leader task. + * Must be called with tasklist_lock held for reading. + */ +static int cpu_timer_sample_group(const clockid_t which_clock, + struct task_struct *p, + union cpu_time_count *cpu) +{ + struct task_cputime cputime; + + thread_group_cputimer(p, &cputime); + switch (CPUCLOCK_WHICH(which_clock)) { + default: + return -EINVAL; + case CPUCLOCK_PROF: + cpu->cpu = cputime_add(cputime.utime, cputime.stime); + break; + case CPUCLOCK_VIRT: + cpu->cpu = cputime.utime; + break; + case CPUCLOCK_SCHED: + cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); + break; + } + return 0; +} + +/* + * Guts of sys_timer_settime for CPU timers. + * This is called with the timer locked and interrupts disabled. + * If we return TIMER_RETRY, it's necessary to release the timer's lock + * and try again. (This happens when the timer is in the middle of firing.) + */ +static int posix_cpu_timer_set(struct k_itimer *timer, int flags, + struct itimerspec *new, struct itimerspec *old) +{ + struct task_struct *p = timer->it.cpu.task; + union cpu_time_count old_expires, new_expires, old_incr, val; + int ret; + + if (unlikely(p == NULL)) { + /* + * Timer refers to a dead task's clock. 
+ */ + return -ESRCH; + } + + new_expires = timespec_to_sample(timer->it_clock, &new->it_value); + + read_lock(&tasklist_lock); + /* + * We need the tasklist_lock to protect against reaping that + * clears p->sighand. If p has just been reaped, we can no + * longer get any information about it at all. + */ + if (unlikely(p->sighand == NULL)) { + read_unlock(&tasklist_lock); + put_task_struct(p); + timer->it.cpu.task = NULL; + return -ESRCH; + } + + /* + * Disarm any old timer after extracting its expiry time. + */ + BUG_ON(!irqs_disabled()); + + ret = 0; + old_incr = timer->it.cpu.incr; + spin_lock(&p->sighand->siglock); + old_expires = timer->it.cpu.expires; + if (unlikely(timer->it.cpu.firing)) { + timer->it.cpu.firing = -1; + ret = TIMER_RETRY; + } else + list_del_init(&timer->it.cpu.entry); + + /* + * We need to sample the current value to convert the new + * value from to relative and absolute, and to convert the + * old value from absolute to relative. To set a process + * timer, we need a sample to balance the thread expiry + * times (in arm_timer). With an absolute time, we must + * check if it's already passed. In short, we need a sample. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &val); + } else { + cpu_timer_sample_group(timer->it_clock, p, &val); + } + + if (old) { + if (old_expires.sched == 0) { + old->it_value.tv_sec = 0; + old->it_value.tv_nsec = 0; + } else { + /* + * Update the timer in case it has + * overrun already. If it has, + * we'll report it as having overrun + * and with the next reloaded timer + * already ticking, though we are + * swallowing that pending + * notification here to install the + * new setting. 
+ */ + bump_cpu_timer(timer, val); + if (cpu_time_before(timer->it_clock, val, + timer->it.cpu.expires)) { + old_expires = cpu_time_sub( + timer->it_clock, + timer->it.cpu.expires, val); + sample_to_timespec(timer->it_clock, + old_expires, + &old->it_value); + } else { + old->it_value.tv_nsec = 1; + old->it_value.tv_sec = 0; + } + } + } + + if (unlikely(ret)) { + /* + * We are colliding with the timer actually firing. + * Punt after filling in the timer's old value, and + * disable this firing since we are already reporting + * it as an overrun (thanks to bump_cpu_timer above). + */ + spin_unlock(&p->sighand->siglock); + read_unlock(&tasklist_lock); + goto out; + } + + if (new_expires.sched != 0 && !(flags & TIMER_ABSTIME)) { + cpu_time_add(timer->it_clock, &new_expires, val); + } + + /* + * Install the new expiry time (or zero). + * For a timer with no notification action, we don't actually + * arm the timer (we'll just fake it for timer_gettime). + */ + timer->it.cpu.expires = new_expires; + if (new_expires.sched != 0 && + cpu_time_before(timer->it_clock, val, new_expires)) { + arm_timer(timer); + } + + spin_unlock(&p->sighand->siglock); + read_unlock(&tasklist_lock); + + /* + * Install the new reload setting, and + * set up the signal and overrun bookkeeping. + */ + timer->it.cpu.incr = timespec_to_sample(timer->it_clock, + &new->it_interval); + + /* + * This acts as a modification timestamp for the timer, + * so any automatic reload attempt will punt on seeing + * that we have reset the timer manually. + */ + timer->it_requeue_pending = (timer->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timer->it_overrun_last = 0; + timer->it_overrun = -1; + + if (new_expires.sched != 0 && + !cpu_time_before(timer->it_clock, val, new_expires)) { + /* + * The designated time already passed, so we notify + * immediately, even if the thread never runs to + * accumulate more time on this clock. 
+ */ + cpu_timer_fire(timer); + } + + ret = 0; + out: + if (old) { + sample_to_timespec(timer->it_clock, + old_incr, &old->it_interval); + } + return ret; +} + +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +{ + union cpu_time_count now; + struct task_struct *p = timer->it.cpu.task; + int clear_dead; + + /* + * Easy part: convert the reload time. + */ + sample_to_timespec(timer->it_clock, + timer->it.cpu.incr, &itp->it_interval); + + if (timer->it.cpu.expires.sched == 0) { /* Timer not armed at all. */ + itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; + return; + } + + if (unlikely(p == NULL)) { + /* + * This task already died and the timer will never fire. + * In this case, expires is actually the dead value. + */ + dead: + sample_to_timespec(timer->it_clock, timer->it.cpu.expires, + &itp->it_value); + return; + } + + /* + * Sample the clock to take the difference with the expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + clear_dead = p->exit_state; + } else { + read_lock(&tasklist_lock); + if (unlikely(p->sighand == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + * Call the timer disarmed, nothing else to do. + */ + put_task_struct(p); + timer->it.cpu.task = NULL; + timer->it.cpu.expires.sched = 0; + read_unlock(&tasklist_lock); + goto dead; + } else { + cpu_timer_sample_group(timer->it_clock, p, &now); + clear_dead = (unlikely(p->exit_state) && + thread_group_empty(p)); + } + read_unlock(&tasklist_lock); + } + + if (unlikely(clear_dead)) { + /* + * We've noticed that the thread is dead, but + * not yet reaped. Take this opportunity to + * drop our task ref. 
+ */ + clear_dead_task(timer, now); + goto dead; + } + + if (cpu_time_before(timer->it_clock, now, timer->it.cpu.expires)) { + sample_to_timespec(timer->it_clock, + cpu_time_sub(timer->it_clock, + timer->it.cpu.expires, now), + &itp->it_value); + } else { + /* + * The timer should have expired already, but the firing + * hasn't taken place yet. Say it's just about to expire. + */ + itp->it_value.tv_nsec = 1; + itp->it_value.tv_sec = 0; + } +} + +/* + * Check for any per-thread CPU timers that have fired and move them off + * the tsk->cpu_timers[N] list onto the firing list. Here we update the + * tsk->it_*_expires values to reflect the remaining thread CPU timers. + */ +static void check_thread_timers(struct task_struct *tsk, + struct list_head *firing) +{ + int maxfire; + struct list_head *timers = tsk->cpu_timers; + struct signal_struct *const sig = tsk->signal; + unsigned long soft; + + maxfire = 20; + tsk->cputime_expires.prof_exp = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list, + entry); + if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { + tsk->cputime_expires.prof_exp = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + maxfire = 20; + tsk->cputime_expires.virt_exp = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list, + entry); + if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { + tsk->cputime_expires.virt_exp = t->expires.cpu; + break; + } + t->firing = 1; + list_move_tail(&t->entry, firing); + } + + ++timers; + maxfire = 20; + tsk->cputime_expires.sched_exp = 0; + while (!list_empty(timers)) { + struct cpu_timer_list *t = list_first_entry(timers, + struct cpu_timer_list, + entry); + if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { + tsk->cputime_expires.sched_exp = t->expires.sched; + break; + } + t->firing = 1; 
+ list_move_tail(&t->entry, firing); + } + + /* + * Check for the special case thread timers. + */ + soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); + if (soft != RLIM_INFINITY) { + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); + + if (hard != RLIM_INFINITY && + tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ + if (soft < hard) { + soft += USEC_PER_SEC; + sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; + } + printk(KERN_INFO + "RT Watchdog Timeout: %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } + } +} + +static void stop_process_timers(struct signal_struct *sig) +{ + struct thread_group_cputimer *cputimer = &sig->cputimer; + unsigned long flags; + + spin_lock_irqsave(&cputimer->lock, flags); + cputimer->running = 0; + spin_unlock_irqrestore(&cputimer->lock, flags); +} + +static u32 onecputick; + +static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, + cputime_t *expires, cputime_t cur_time, int signo) +{ + if (cputime_eq(it->expires, cputime_zero)) + return; + + if (cputime_ge(cur_time, it->expires)) { + if (!cputime_eq(it->incr, cputime_zero)) { + it->expires = cputime_add(it->expires, it->incr); + it->error += it->incr_error; + if (it->error >= onecputick) { + it->expires = cputime_sub(it->expires, + cputime_one_jiffy); + it->error -= onecputick; + } + } else { + it->expires = cputime_zero; + } + + trace_itimer_expire(signo == SIGPROF ? 
+ ITIMER_PROF : ITIMER_VIRTUAL, + tsk->signal->leader_pid, cur_time); + __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); + } + + if (!cputime_eq(it->expires, cputime_zero) && + (cputime_eq(*expires, cputime_zero) || + cputime_lt(it->expires, *expires))) { + *expires = it->expires; + } +} + +/** + * task_cputime_zero - Check a task_cputime struct for all zero fields. + * + * @cputime: The struct to compare. + * + * Checks @cputime to see if all fields are zero. Returns true if all fields + * are zero, false if any field is nonzero. + */ +static inline int task_cputime_zero(const struct task_cputime *cputime) +{ + if (cputime_eq(cputime->utime, cputime_zero) && + cputime_eq(cputime->stime, cputime_zero) && + cputime->sum_exec_runtime == 0) + return 1; + return 0; +} + +/* + * Check for any per-thread CPU timers that have fired and move them + * off the tsk->*_timers list onto the firing list. Per-thread timers + * have already been taken off. + */ +static void check_process_timers(struct task_struct *tsk, + struct list_head *firing) +{ + int maxfire; + struct signal_struct *const sig = tsk->signal; + cputime_t utime, ptime, virt_expires, prof_expires; + unsigned long long sum_sched_runtime, sched_expires; + struct list_head *timers = sig->cpu_timers; + struct task_cputime cputime; + unsigned long soft; + + /* + * Collect the current process totals. 
+ */ + thread_group_cputimer(tsk, &cputime); + utime = cputime.utime; + ptime = cputime_add(utime, cputime.stime); + sum_sched_runtime = cputime.sum_exec_runtime; + maxfire = 20; + prof_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *tl = list_first_entry(timers, + struct cpu_timer_list, + entry); + if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { + prof_expires = tl->expires.cpu; + break; + } + tl->firing = 1; + list_move_tail(&tl->entry, firing); + } + + ++timers; + maxfire = 20; + virt_expires = cputime_zero; + while (!list_empty(timers)) { + struct cpu_timer_list *tl = list_first_entry(timers, + struct cpu_timer_list, + entry); + if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { + virt_expires = tl->expires.cpu; + break; + } + tl->firing = 1; + list_move_tail(&tl->entry, firing); + } + + ++timers; + maxfire = 20; + sched_expires = 0; + while (!list_empty(timers)) { + struct cpu_timer_list *tl = list_first_entry(timers, + struct cpu_timer_list, + entry); + if (!--maxfire || sum_sched_runtime < tl->expires.sched) { + sched_expires = tl->expires.sched; + break; + } + tl->firing = 1; + list_move_tail(&tl->entry, firing); + } + + /* + * Check for the special case process timers. + */ + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_PROF], &prof_expires, ptime, + SIGPROF); + check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, + SIGVTALRM); + soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (soft != RLIM_INFINITY) { + unsigned long psecs = cputime_to_secs(ptime); + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); + cputime_t x; + if (psecs >= hard) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. + */ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } + if (psecs >= soft) { + /* + * At the soft limit, send a SIGXCPU every second. 
+ */ + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + if (soft < hard) { + soft++; + sig->rlim[RLIMIT_CPU].rlim_cur = soft; + } + } + x = secs_to_cputime(soft); + if (cputime_eq(prof_expires, cputime_zero) || + cputime_lt(x, prof_expires)) { + prof_expires = x; + } + } + + sig->cputime_expires.prof_exp = prof_expires; + sig->cputime_expires.virt_exp = virt_expires; + sig->cputime_expires.sched_exp = sched_expires; + if (task_cputime_zero(&sig->cputime_expires)) + stop_process_timers(sig); +} + +/* + * This is called from the signal code (via do_schedule_next_timer) + * when the last timer signal was delivered and we have to reload the timer. + * + * It samples the timer's clock, advances the expiry time with + * bump_cpu_timer() and re-arms the timer with arm_timer(). If the target + * task is gone (NULL, reaped, or dead but not yet reaped) the timer is + * not re-armed. In every case the overrun count is latched into + * it_overrun_last and it_requeue_pending is incremented before returning. + */ +void posix_cpu_timer_schedule(struct k_itimer *timer) +{ + struct task_struct *p = timer->it.cpu.task; + union cpu_time_count now; + + if (unlikely(p == NULL)) + /* + * The task was cleaned up already, no future firings. + */ + goto out; + + /* + * Fetch the current sample and update the timer's expiry time. + */ + if (CPUCLOCK_PERTHREAD(timer->it_clock)) { + cpu_clock_sample(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + if (unlikely(p->exit_state)) { + clear_dead_task(timer, now); + goto out; + } + read_lock(&tasklist_lock); /* arm_timer needs it. */ + spin_lock(&p->sighand->siglock); + } else { + read_lock(&tasklist_lock); + if (unlikely(p->sighand == NULL)) { + /* + * The process has been reaped. + * We can't even collect a sample any more. + */ + put_task_struct(p); + timer->it.cpu.task = p = NULL; + timer->it.cpu.expires.sched = 0; + goto out_unlock; + } else if (unlikely(p->exit_state) && thread_group_empty(p)) { + /* + * We've noticed that the thread is dead, but + * not yet reaped. Take this opportunity to + * drop our task ref. clear_dead_task() stores + * expires - now as the residual, and 'now' has + * not been written yet on this path, so sample + * the group clock first. + */ + cpu_timer_sample_group(timer->it_clock, p, &now); + clear_dead_task(timer, now); + goto out_unlock; + } + spin_lock(&p->sighand->siglock); + cpu_timer_sample_group(timer->it_clock, p, &now); + bump_cpu_timer(timer, now); + /* Leave the tasklist_lock locked for the call below.
*/ + } + + /* + * Now re-arm for the new expiry time. + */ + BUG_ON(!irqs_disabled()); + arm_timer(timer); + spin_unlock(&p->sighand->siglock); + +out_unlock: + read_unlock(&tasklist_lock); + +out: + timer->it_overrun_last = timer->it_overrun; + timer->it_overrun = -1; + ++timer->it_requeue_pending; +} + +/** + * task_cputime_expired - Compare two task_cputime entities. + * + * @sample: The task_cputime structure to be checked for expiration. + * @expires: Expiration times, against which @sample will be checked. + * + * Checks @sample against @expires to see if any field of @sample has expired. + * Returns true if any field of the former is greater than the corresponding + * field of the latter if the latter field is set. Otherwise returns false. + */ +static inline int task_cputime_expired(const struct task_cputime *sample, + const struct task_cputime *expires) +{ + if (!cputime_eq(expires->utime, cputime_zero) && + cputime_ge(sample->utime, expires->utime)) + return 1; + if (!cputime_eq(expires->stime, cputime_zero) && + cputime_ge(cputime_add(sample->utime, sample->stime), + expires->stime)) + return 1; + if (expires->sum_exec_runtime != 0 && + sample->sum_exec_runtime >= expires->sum_exec_runtime) + return 1; + return 0; +} + +/** + * fastpath_timer_check - POSIX CPU timers fast path. + * + * @tsk: The task (thread) being checked. + * + * Check the task and thread group timers. If both are zero (there are no + * timers set) return false. Otherwise snapshot the task and thread group + * timers and compare them with the corresponding expiration times. Return + * true if a timer has expired, else return false.
+ */ +static inline int fastpath_timer_check(struct task_struct *tsk) +{ + struct signal_struct *sig; + + if (!task_cputime_zero(&tsk->cputime_expires)) { + struct task_cputime task_sample = { + .utime = tsk->utime, + .stime = tsk->stime, + .sum_exec_runtime = tsk->se.sum_exec_runtime + }; + + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } + + sig = tsk->signal; + if (sig->cputimer.running) { + struct task_cputime group_sample; + + spin_lock(&sig->cputimer.lock); + group_sample = sig->cputimer.cputime; + spin_unlock(&sig->cputimer.lock); + + if (task_cputime_expired(&group_sample, &sig->cputime_expires)) + return 1; + } + + return 0; +} + +/* + * This is called from the timer interrupt handler. The irq handler has + * already updated our counts. We need to check if any timers fire now. + * Interrupts are disabled. + */ +void run_posix_cpu_timers(struct task_struct *tsk) +{ + LIST_HEAD(firing); + struct k_itimer *timer, *next; + unsigned long flags; + + BUG_ON(!irqs_disabled()); + + /* + * The fast path checks that there are no expired thread or thread + * group timers. If that's so, just return. + */ + if (!fastpath_timer_check(tsk)) + return; + + if (!lock_task_sighand(tsk, &flags)) + return; + /* + * Here we take off tsk->signal->cpu_timers[N] and + * tsk->cpu_timers[N] all the timers that are firing, and + * put them on the firing list. + */ + check_thread_timers(tsk, &firing); + /* + * If there are any active process wide timers (POSIX 1.b, itimers, + * RLIMIT_CPU) cputimer must be running. + */ + if (tsk->signal->cputimer.running) + check_process_timers(tsk, &firing); + + /* + * We must release these locks before taking any timer's lock. + * There is a potential race with timer deletion here, as the + * siglock now protects our private firing list. 
We have set + * the firing flag in each timer, so that a deletion attempt + * that gets the timer lock before we do will give it up and + * spin until we've taken care of that timer below. + */ + unlock_task_sighand(tsk, &flags); + + /* + * Now that all the timers on our list have the firing flag, + * no one will touch their list entries but us. We'll take + * each timer's lock before clearing its firing flag, so no + * timer call will interfere. + */ + list_for_each_entry_safe(timer, next, &firing, it.cpu.entry) { + int cpu_firing; + + spin_lock(&timer->it_lock); + list_del_init(&timer->it.cpu.entry); + cpu_firing = timer->it.cpu.firing; + timer->it.cpu.firing = 0; + /* + * The firing flag is -1 if we collided with a reset + * of the timer, which already reported this + * almost-firing as an overrun. So don't generate an event. + */ + if (likely(cpu_firing >= 0)) + cpu_timer_fire(timer); + spin_unlock(&timer->it_lock); + } +} + +/* + * Set one of the process-wide special case CPU timers or RLIMIT_CPU. + * The tsk->sighand->siglock must be held by the caller. + */ +void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, + cputime_t *newval, cputime_t *oldval) +{ + union cpu_time_count now; + + BUG_ON(clock_idx == CPUCLOCK_SCHED); + cpu_timer_sample_group(clock_idx, tsk, &now); + + if (oldval) { + /* + * We are setting itimer. The *oldval is absolute and we update + * it to be relative, *newval argument is relative and we update + * it to be absolute. + */ + if (!cputime_eq(*oldval, cputime_zero)) { + if (cputime_le(*oldval, now.cpu)) { + /* Just about to fire. */ + *oldval = cputime_one_jiffy; + } else { + *oldval = cputime_sub(*oldval, now.cpu); + } + } + + if (cputime_eq(*newval, cputime_zero)) + return; + *newval = cputime_add(*newval, now.cpu); + } + + /* + * Update expiration cache if we are the earliest timer, or eventually + * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire. 
+ */ + switch (clock_idx) { + case CPUCLOCK_PROF: + if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval)) + tsk->signal->cputime_expires.prof_exp = *newval; + break; + case CPUCLOCK_VIRT: + if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval)) + tsk->signal->cputime_expires.virt_exp = *newval; + break; + } +} + +static int do_cpu_nanosleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct itimerspec *it) +{ + struct k_itimer timer; + int error; + + /* + * Set up a temporary timer and then wait for it to go off. + */ + memset(&timer, 0, sizeof timer); + spin_lock_init(&timer.it_lock); + timer.it_clock = which_clock; + timer.it_overrun = -1; + error = posix_cpu_timer_create(&timer); + timer.it_process = current; + if (!error) { + static struct itimerspec zero_it; + + memset(it, 0, sizeof *it); + it->it_value = *rqtp; + + spin_lock_irq(&timer.it_lock); + error = posix_cpu_timer_set(&timer, flags, it, NULL); + if (error) { + spin_unlock_irq(&timer.it_lock); + return error; + } + + while (!signal_pending(current)) { + if (timer.it.cpu.expires.sched == 0) { + /* + * Our timer fired and was reset. + */ + spin_unlock_irq(&timer.it_lock); + return 0; + } + + /* + * Block until cpu_timer_fire (or a signal) wakes us. + */ + __set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&timer.it_lock); + schedule(); + spin_lock_irq(&timer.it_lock); + } + + /* + * We were interrupted by a signal. + */ + sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); + posix_cpu_timer_set(&timer, 0, &zero_it, it); + spin_unlock_irq(&timer.it_lock); + + if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) { + /* + * It actually did fire already. 
+ */ + return 0; + } + + error = -ERESTART_RESTARTBLOCK; + } + + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block); + +static int posix_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, struct timespec __user *rmtp) +{ + struct restart_block *restart_block = + ¤t_thread_info()->restart_block; + struct itimerspec it; + int error; + + /* + * Diagnose required errors first. + */ + if (CPUCLOCK_PERTHREAD(which_clock) && + (CPUCLOCK_PID(which_clock) == 0 || + CPUCLOCK_PID(which_clock) == current->pid)) + return -EINVAL; + + error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + + if (flags & TIMER_ABSTIME) + return -ERESTARTNOHAND; + /* + * Report back to the user the time still remaining. + */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->fn = posix_cpu_nsleep_restart; + restart_block->nanosleep.clockid = which_clock; + restart_block->nanosleep.rmtp = rmtp; + restart_block->nanosleep.expires = timespec_to_ns(rqtp); + } + return error; +} + +static long posix_cpu_nsleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->nanosleep.clockid; + struct timespec t; + struct itimerspec it; + int error; + + t = ns_to_timespec(restart_block->nanosleep.expires); + + error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); + + if (error == -ERESTART_RESTARTBLOCK) { + struct timespec __user *rmtp = restart_block->nanosleep.rmtp; + /* + * Report back to the user the time still remaining. 
+ */ + if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + return -EFAULT; + + restart_block->nanosleep.expires = timespec_to_ns(&t); + } + return error; + +} + +#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) + +static int process_cpu_clock_getres(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_getres(PROCESS_CLOCK, tp); +} +static int process_cpu_clock_get(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_get(PROCESS_CLOCK, tp); +} +static int process_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = PROCESS_CLOCK; + return posix_cpu_timer_create(timer); +} +static int process_cpu_nsleep(const clockid_t which_clock, int flags, + struct timespec *rqtp, + struct timespec __user *rmtp) +{ + return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); +} +static long process_cpu_nsleep_restart(struct restart_block *restart_block) +{ + return -EINVAL; +} +static int thread_cpu_clock_getres(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_getres(THREAD_CLOCK, tp); +} +static int thread_cpu_clock_get(const clockid_t which_clock, + struct timespec *tp) +{ + return posix_cpu_clock_get(THREAD_CLOCK, tp); +} +static int thread_cpu_timer_create(struct k_itimer *timer) +{ + timer->it_clock = THREAD_CLOCK; + return posix_cpu_timer_create(timer); +} + +struct k_clock clock_posix_cpu = { + .clock_getres = posix_cpu_clock_getres, + .clock_set = posix_cpu_clock_set, + .clock_get = posix_cpu_clock_get, + .timer_create = posix_cpu_timer_create, + .nsleep = posix_cpu_nsleep, + .nsleep_restart = posix_cpu_nsleep_restart, + .timer_set = posix_cpu_timer_set, + .timer_del = posix_cpu_timer_del, + .timer_get = posix_cpu_timer_get, +}; + +static __init int init_posix_cpu_timers(void) +{ + struct k_clock process = { + .clock_getres = process_cpu_clock_getres, + .clock_get = process_cpu_clock_get, + 
.timer_create = process_cpu_timer_create, + .nsleep = process_cpu_nsleep, + .nsleep_restart = process_cpu_nsleep_restart, + }; + struct k_clock thread = { + .clock_getres = thread_cpu_clock_getres, + .clock_get = thread_cpu_clock_get, + .timer_create = thread_cpu_timer_create, + }; + struct timespec ts; + + posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process); + posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread); + + cputime_to_timespec(cputime_one_jiffy, &ts); + onecputick = ts.tv_nsec; + WARN_ON(ts.tv_sec != 0); + + return 0; +} +__initcall(init_posix_cpu_timers); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c new file mode 100644 index 00000000..45561825 --- /dev/null +++ b/kernel/posix-timers.c @@ -0,0 +1,1069 @@ +/* + * linux/kernel/posix-timers.c + * + * + * 2002-10-15 Posix Clocks & timers + * by George Anzinger george@mvista.com + * + * Copyright (C) 2002 2003 by MontaVista Software. + * + * 2004-06-01 Fix CLOCK_REALTIME clock/timer TIMER_ABSTIME bug. + * Copyright (C) 2004 Boris Hu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * MontaVista Software | 1237 East Arques Avenue | Sunnyvale | CA 94085 | USA + */ + +/* These are all the functions necessary to implement + * POSIX clocks & timers + */ +#include <linux/mm.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/mutex.h> + +#include <asm/uaccess.h> +#include <linux/list.h> +#include <linux/init.h> +#include <linux/compiler.h> +#include <linux/idr.h> +#include <linux/posix-clock.h> +#include <linux/posix-timers.h> +#include <linux/syscalls.h> +#include <linux/wait.h> +#include <linux/workqueue.h> +#include <linux/module.h> + +/* + * Management arrays for POSIX timers. Timers are kept in slab memory + * Timer ids are allocated by an external routine that keeps track of the + * id and the timer. The external interface is: + * + * void *idr_find(struct idr *idp, int id); to find timer_id + * int idr_get_new(struct idr *idp, void *ptr); to get a new id and + * related it to + * void idr_remove(struct idr *idp, int id); to release + * void idr_init(struct idr *idp); to initialize + * which we supply. + * The idr_get_new *may* call slab for more memory so it must not be + * called under a spin lock. Likewise idr_remove may release memory + * (but it may be ok to do this under a lock...). + * idr_find is just a memory look up and is quite fast. A -1 return + * indicates that the requested id does not exist. + */ + +/* + * Let's keep our timers in a slab cache :-) + */ +static struct kmem_cache *posix_timers_cache; +static struct idr posix_timers_id; +static DEFINE_SPINLOCK(idr_lock); + +/* + * we assume that the new SIGEV_THREAD_ID shares no bits with the other + * SIGEV values. Here we put out an error if this assumption fails. + */ +#if SIGEV_THREAD_ID != (SIGEV_THREAD_ID & \ + ~(SIGEV_SIGNAL | SIGEV_NONE | SIGEV_THREAD)) +#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!" +#endif + +/* + * parisc wants ENOTSUP instead of EOPNOTSUPP + */ +#ifndef ENOTSUP +# define ENANOSLEEP_NOTSUP EOPNOTSUPP +#else +# define ENANOSLEEP_NOTSUP ENOTSUP +#endif + +/* + * The timer ID is turned into a timer address by idr_find(). + * Verifying a valid ID consists of: + * + * a) checking that idr_find() returns other than -1.
+ * b) checking that the timer id matches the one in the timer itself. + * c) that the timer owner is in the callers thread group. + */ + +/* + * CLOCKs: The POSIX standard calls for a couple of clocks and allows us + * to implement others. This structure defines the various + * clocks. + * + * RESOLUTION: Clock resolution is used to round up timer and interval + * times, NOT to report clock times, which are reported with as + * much resolution as the system can muster. In some cases this + * resolution may depend on the underlying clock hardware and + * may not be quantifiable until run time, and only then if the + * necessary code is written. The standard says we should say + * something about this issue in the documentation... + * + * FUNCTIONS: The CLOCKs structure defines possible functions to + * handle various clock functions. + * + * The standard POSIX timer management code assumes the + * following: 1.) The k_itimer struct (sched.h) is used for + * the timer. 2.) The list, it_lock, it_clock, it_id and + * it_pid fields are not modified by timer code. + * + * Permissions: It is assumed that the clock_settime() function defined + * for each clock will take care of permission checks. Some + * clocks may be settable by any user (i.e. local process + * clocks), others not. Currently the only settable clock we + * have is CLOCK_REALTIME and its high res counterpart, both of + * which we beg off on and pass to do_sys_settimeofday(). + */ + +static struct k_clock posix_clocks[MAX_CLOCKS]; + +/* + * These ones are defined below. 
+ */ +static int common_nsleep(const clockid_t, int flags, struct timespec *t, + struct timespec __user *rmtp); +static int common_timer_create(struct k_itimer *new_timer); +static void common_timer_get(struct k_itimer *, struct itimerspec *); +static int common_timer_set(struct k_itimer *, int, + struct itimerspec *, struct itimerspec *); +static int common_timer_del(struct k_itimer *timer); + +static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); + +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags); + +#define lock_timer(tid, flags) \ +({ struct k_itimer *__timr; \ + __cond_lock(&__timr->it_lock, __timr = __lock_timer(tid, flags)); \ + __timr; \ +}) + +static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) +{ + spin_unlock_irqrestore(&timr->it_lock, flags); +} + +/* Get clock_realtime: fill *tp from ktime_get_real_ts(); always returns 0 */ +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_real_ts(tp); + return 0; +} + +/* Set clock_realtime: hand *tp to do_sys_settimeofday() and return its result */ +static int posix_clock_realtime_set(const clockid_t which_clock, + const struct timespec *tp) +{ + return do_sys_settimeofday(tp, NULL); +} + +static int posix_clock_realtime_adj(const clockid_t which_clock, + struct timex *t) +{ + return do_adjtimex(t); +} + +/* + * Get monotonic time for posix timers: fill *tp from ktime_get_ts(); + * always returns 0 + */ +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +{ + ktime_get_ts(tp); + return 0; +} + +/* + * Get monotonic-raw time for posix timers: fill *tp from getrawmonotonic(); + * always returns 0 + */ +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +{ + getrawmonotonic(tp); + return 0; +} + + +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +{ + *tp = current_kernel_time(); + return 0; +} + +static int posix_get_monotonic_coarse(clockid_t which_clock, + struct timespec *tp) +{ + *tp = get_monotonic_coarse(); + return 0; +} + +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +{ + *tp = 
ktime_to_timespec(KTIME_LOW_RES); + return 0; +} + +static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +{ + get_monotonic_boottime(tp); + return 0; +} + + +/* + * Initialize everything, well, just everything in Posix clocks/timers ;) + * + * Registers the built-in clocks through posix_timers_register_clock(), + * then creates the k_itimer slab cache and initializes the timer id IDR. + * Only CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_BOOTTIME are given nsleep + * and timer hooks here; the raw and coarse clocks only provide + * clock_getres and clock_get. + */ +static __init int init_posix_timers(void) +{ + struct k_clock clock_realtime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_clock_realtime_get, + .clock_set = posix_clock_realtime_set, + .clock_adj = posix_clock_realtime_adj, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + struct k_clock clock_monotonic = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_ktime_get_ts, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + struct k_clock clock_monotonic_raw = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_monotonic_raw, + }; + struct k_clock clock_realtime_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_realtime_coarse, + }; + struct k_clock clock_monotonic_coarse = { + .clock_getres = posix_get_coarse_res, + .clock_get = posix_get_monotonic_coarse, + }; + struct k_clock clock_boottime = { + .clock_getres = hrtimer_get_res, + .clock_get = posix_get_boottime, + .nsleep = common_nsleep, + .nsleep_restart = hrtimer_nanosleep_restart, + .timer_create = common_timer_create, + .timer_set = common_timer_set, + .timer_get = common_timer_get, + .timer_del = common_timer_del, + }; + + posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime); + posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic); + posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw); + 
posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse); + posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse); + posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime); + + posix_timers_cache = kmem_cache_create("posix_timers_cache", + sizeof (struct k_itimer), 0, SLAB_PANIC, + NULL); + idr_init(&posix_timers_id); + return 0; +} + +__initcall(init_posix_timers); + +static void schedule_next_timer(struct k_itimer *timr) +{ + struct hrtimer *timer = &timr->it.real.timer; + + if (timr->it.real.interval.tv64 == 0) + return; + + timr->it_overrun += (unsigned int) hrtimer_forward(timer, + timer->base->get_time(), + timr->it.real.interval); + + timr->it_overrun_last = timr->it_overrun; + timr->it_overrun = -1; + ++timr->it_requeue_pending; + hrtimer_restart(timer); +} + +/* + * This function is exported for use by the signal delivery code. It is + * called just prior to the info block being released and passes that + * block to us. Its function is to update the overrun entry AND to + * restart the timer. It should only be called if the timer is to be + * restarted (i.e. we have flagged this in the sys_private entry of the + * info block). + * + * To protect against the timer going away while the interrupt is queued, + * we require that it_requeue_pending still matches the si_sys_private + * value stored in the info block. + */ +void do_schedule_next_timer(struct siginfo *info) +{ + struct k_itimer *timr; + unsigned long flags; + + timr = lock_timer(info->si_tid, &flags); + + if (timr && timr->it_requeue_pending == info->si_sys_private) { + if (timr->it_clock < 0) + posix_cpu_timer_schedule(timr); + else + schedule_next_timer(timr); + + info->si_overrun += timr->it_overrun_last; + } + + if (timr) + unlock_timer(timr, flags); +} + +int posix_timer_event(struct k_itimer *timr, int si_private) +{ + struct task_struct *task; + int shared, ret = -1; + /* + * FIXME: if ->sigq is queued we can race with + * dequeue_signal()->do_schedule_next_timer(). 
+ * + * If dequeue_signal() sees the "right" value of + * si_sys_private it calls do_schedule_next_timer(). + * We re-queue ->sigq and drop ->it_lock(). + * do_schedule_next_timer() locks the timer + * and re-schedules it while ->sigq is pending. + * Not really bad, but not that we want. + */ + timr->sigq->info.si_sys_private = si_private; + + rcu_read_lock(); + task = pid_task(timr->it_pid, PIDTYPE_PID); + if (task) { + shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID); + ret = send_sigqueue(timr->sigq, task, shared); + } + rcu_read_unlock(); + /* If we failed to send the signal the timer stops. */ + return ret > 0; +} +EXPORT_SYMBOL_GPL(posix_timer_event); + +/* + * This function gets called when a POSIX.1b interval timer expires. It + * is used as a callback from the kernel internal timer. The + * run_timer_list code ALWAYS calls with interrupts on. + + * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers. + */ +static enum hrtimer_restart posix_timer_fn(struct hrtimer *timer) +{ + struct k_itimer *timr; + unsigned long flags; + int si_private = 0; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + timr = container_of(timer, struct k_itimer, it.real.timer); + spin_lock_irqsave(&timr->it_lock, flags); + + if (timr->it.real.interval.tv64 != 0) + si_private = ++timr->it_requeue_pending; + + if (posix_timer_event(timr, si_private)) { + /* + * The signal was not sent because of SIG_IGN: + * we will not get a call back to restart the timer AND + * it should be restarted. + */ + if (timr->it.real.interval.tv64 != 0) { + ktime_t now = hrtimer_cb_get_time(timer); + + /* + * FIXME: What we really want, is to stop this + * timer completely and restart it in case the + * SIG_IGN is removed. This is a non-trivial + * change which involves sighand locking + * (sigh !), which we don't want to do late in + * the release cycle. 
+ * + * For now we just let timers with an interval + * less than a jiffy expire every jiffy to + * avoid softirq starvation in case of SIG_IGN + * and a very small interval, which would put + * the timer right back on the softirq pending + * list. By moving now ahead of time we trick + * hrtimer_forward() to expire the timer + * later, while we still maintain the overrun + * accuracy, but have some inconsistency in + * the timer_gettime() case. This is at least + * better than a starved softirq. A more + * complex fix which also solves another related + * inconsistency is already in the pipeline. + */ +#ifdef CONFIG_HIGH_RES_TIMERS + { + ktime_t kj = ktime_set(0, NSEC_PER_SEC / HZ); + + if (timr->it.real.interval.tv64 < kj.tv64) + now = ktime_add(now, kj); + } +#endif + timr->it_overrun += (unsigned int) + hrtimer_forward(timer, now, + timr->it.real.interval); + ret = HRTIMER_RESTART; + ++timr->it_requeue_pending; + } + } + + unlock_timer(timr, flags); + return ret; +} + +static struct pid *good_sigevent(sigevent_t * event) +{ + struct task_struct *rtn = current->group_leader; + + if ((event->sigev_notify & SIGEV_THREAD_ID ) && + (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || + !same_thread_group(rtn, current) || + (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + return NULL; + + if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && + ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) + return NULL; + + return task_pid(rtn); +} + +void posix_timers_register_clock(const clockid_t clock_id, + struct k_clock *new_clock) +{ + if ((unsigned) clock_id >= MAX_CLOCKS) { + printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n", + clock_id); + return; + } + + if (!new_clock->clock_get) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n", + clock_id); + return; + } + if (!new_clock->clock_getres) { + printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n", + clock_id); + return; + } + + 
posix_clocks[clock_id] = *new_clock; +} +EXPORT_SYMBOL_GPL(posix_timers_register_clock); + +/* + * Allocate a zeroed k_itimer from the slab cache together with its + * sigqueue entry. Returns NULL if either allocation fails. + */ +static struct k_itimer * alloc_posix_timer(void) +{ + struct k_itimer *tmr; + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); + if (!tmr) + return tmr; + if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { + kmem_cache_free(posix_timers_cache, tmr); + return NULL; + } + memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); + return tmr; +} + +/* RCU callback: hand the k_itimer back to the slab cache. */ +static void k_itimer_rcu_free(struct rcu_head *head) +{ + struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu); + + kmem_cache_free(posix_timers_cache, tmr); +} + +#define IT_ID_SET 1 +#define IT_ID_NOT_SET 0 +/* + * Tear down a timer: remove its id from the IDR when it_id_set is set, + * drop the pid reference and the sigqueue entry, and free the k_itimer + * through call_rcu(). + */ +static void release_posix_timer(struct k_itimer *tmr, int it_id_set) +{ + if (it_id_set) { + unsigned long flags; + spin_lock_irqsave(&idr_lock, flags); + idr_remove(&posix_timers_id, tmr->it_id); + spin_unlock_irqrestore(&idr_lock, flags); + } + put_pid(tmr->it_pid); + sigqueue_free(tmr->sigq); + call_rcu(&tmr->it.rcu, k_itimer_rcu_free); +} + +/* + * Map a clock id to its k_clock. Negative ids resolve to + * clock_posix_dynamic or clock_posix_cpu; other ids index posix_clocks[]. + * Returns NULL for an id that is out of range or whose slot has no + * clock_getres() registered. + */ +static struct k_clock *clockid_to_kclock(const clockid_t id) +{ + if (id < 0) + return (id & CLOCKFD_MASK) == CLOCKFD ? + &clock_posix_dynamic : &clock_posix_cpu; + + if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres) + return NULL; + return &posix_clocks[id]; +} + +static int common_timer_create(struct k_itimer *new_timer) +{ + hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0); + return 0; +} + +/* + * Create a POSIX.1b interval timer. + * + * Returns 0 on success, after storing the new id in *created_timer_id and + * linking the timer onto current->signal->posix_timers. Errors: + * -EINVAL unknown clock or rejected sigevent + * -EOPNOTSUPP the clock has no timer_create hook + * -EAGAIN the timer or a timer id could not be allocated + * -EFAULT a user copy failed + * or whatever the clock's timer_create hook returns. 
+ */ + +SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, + struct sigevent __user *, timer_event_spec, + timer_t __user *, created_timer_id) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct k_itimer *new_timer; + int error, new_timer_id; + sigevent_t event; + int it_id_set = IT_ID_NOT_SET; + + if (!kc) + return -EINVAL; + if (!kc->timer_create) + return -EOPNOTSUPP; + + new_timer = alloc_posix_timer(); + if (unlikely(!new_timer)) + return -EAGAIN; + + spin_lock_init(&new_timer->it_lock); + retry: + if (unlikely(!idr_pre_get(&posix_timers_id, GFP_KERNEL))) { + error = -EAGAIN; + goto out; + } + spin_lock_irq(&idr_lock); + error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id); + spin_unlock_irq(&idr_lock); + if (error) { + if (error == -EAGAIN) + goto retry; + /* + * Weird looking, but we return EAGAIN if the IDR is + * full (proper POSIX return value for this) + */ + error = -EAGAIN; + goto out; + } + + it_id_set = IT_ID_SET; + new_timer->it_id = (timer_t) new_timer_id; + new_timer->it_clock = which_clock; + new_timer->it_overrun = -1; + + if (timer_event_spec) { + if (copy_from_user(&event, timer_event_spec, sizeof (event))) { + error = -EFAULT; + goto out; + } + rcu_read_lock(); + new_timer->it_pid = get_pid(good_sigevent(&event)); + rcu_read_unlock(); + if (!new_timer->it_pid) { + error = -EINVAL; + goto out; + } + } else { + /* + * Zero the whole union first: only sival_int is set below and + * sigev_value is copied into the queued siginfo, so stale + * stack bytes must not leak through it. + */ + memset(&event.sigev_value, 0, sizeof(event.sigev_value)); + event.sigev_notify = SIGEV_SIGNAL; + event.sigev_signo = SIGALRM; + event.sigev_value.sival_int = new_timer->it_id; + new_timer->it_pid = get_pid(task_tgid(current)); + } + + new_timer->it_sigev_notify = event.sigev_notify; + new_timer->sigq->info.si_signo = event.sigev_signo; + new_timer->sigq->info.si_value = event.sigev_value; + new_timer->sigq->info.si_tid = new_timer->it_id; + new_timer->sigq->info.si_code = SI_TIMER; + + if (copy_to_user(created_timer_id, + &new_timer_id, sizeof (new_timer_id))) { + error = -EFAULT; + goto out; + } + + error = kc->timer_create(new_timer); + if (error) + goto out; + 
+ spin_lock_irq(&current->sighand->siglock); + new_timer->it_signal = current->signal; + list_add(&new_timer->list, &current->signal->posix_timers); + spin_unlock_irq(&current->sighand->siglock); + + return 0; + /* + * In the case of the timer belonging to another task, after + * the task is unlocked, the timer is owned by the other task + * and may cease to exist at any time. Don't use or modify + * new_timer after the unlock call. + */ +out: + release_posix_timer(new_timer, it_id_set); + return error; +} + +/* + * Locking issues: We need to protect the result of the id look up until + * we get the timer locked down so it is not deleted under us. The + * removal is done under the idr spinlock so we use that here to bridge + * the find to the timer lock. To avoid a dead lock, the timer id MUST + * be released without holding the timer lock. + * + * Note that the lookup below runs under rcu_read_lock() rather than + * idr_lock; the k_itimer itself is freed through call_rcu(). A timer is + * only returned (locked) when its it_signal matches current->signal. + */ +static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) +{ + struct k_itimer *timr; + + rcu_read_lock(); + timr = idr_find(&posix_timers_id, (int)timer_id); + if (timr) { + spin_lock_irqsave(&timr->it_lock, *flags); + if (timr->it_signal == current->signal) { + rcu_read_unlock(); + return timr; + } + spin_unlock_irqrestore(&timr->it_lock, *flags); + } + rcu_read_unlock(); + + return NULL; +} + +/* + * Get the time remaining on a POSIX.1b interval timer. This function + * is ALWAYS called with spin_lock_irq on the timer, thus it must not + * mess with irq. + * + * We have a couple of messes to clean up here. First there is the case + * of a timer that has a requeue pending. These timers should appear to + * be in the timer list with an expiry as if we were to requeue them + * now. + * + * The second issue is the SIGEV_NONE timer which may be active but is + * not really ever put in the timer list (to save system resources). + * This timer may be expired, and if so, we will do it here. Otherwise + * it is the same as a requeue pending timer WRT to what we should + * report. 
+ */ +static void +common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +{ + ktime_t now, remaining, iv; + struct hrtimer *timer = &timr->it.real.timer; + + memset(cur_setting, 0, sizeof(struct itimerspec)); + + iv = timr->it.real.interval; + + /* interval timer ? A one-shot timer that is inactive and not SIGEV_NONE reports all zeroes */ + if (iv.tv64) + cur_setting->it_interval = ktime_to_timespec(iv); + else if (!hrtimer_active(timer) && + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + return; + + now = timer->base->get_time(); + + /* + * When a requeue is pending or this is a SIGEV_NONE + * timer, move the expiry time forward by intervals, so + * expiry is > now. + */ + if (iv.tv64 && (timr->it_requeue_pending & REQUEUE_PENDING || + (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) + timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); + + remaining = ktime_sub(hrtimer_get_expires(timer), now); + /* Return 0 only when the timer is expired and not pending */ + if (remaining.tv64 <= 0) { + /* + * A single shot SIGEV_NONE timer must return 0 when + * it is expired; every other timer reports 1 nsec instead. + */ + if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) + cur_setting->it_value.tv_nsec = 1; + } else + cur_setting->it_value = ktime_to_timespec(remaining); +} + +/* + * Get the time remaining on a POSIX.1b interval timer. + * + * Returns -EINVAL if timer_id does not resolve to a timer (or its clock + * has no timer_get hook) and -EFAULT if the result cannot be copied to + * user space. + */ +SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, + struct itimerspec __user *, setting) +{ + struct itimerspec cur_setting; + struct k_itimer *timr; + struct k_clock *kc; + unsigned long flags; + int ret = 0; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_get)) + ret = -EINVAL; + else + kc->timer_get(timr, &cur_setting); + + unlock_timer(timr, flags); + + if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) + return -EFAULT; + + return ret; +} + +/* + * Get the number of overruns of a POSIX.1b interval timer. 
This is to + * be the overrun of the timer last delivered. At the same time we are + * accumulating overruns on the next timer. The overrun is frozen when + * the signal is delivered, either at the notify time (if the info block + * is not queued) or at the actual delivery time (as we are informed by + * the call back to do_schedule_next_timer(). So all we need to do is + * to pick up the frozen overrun. + */ +SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) +{ + struct k_itimer *timr; + int overrun; + unsigned long flags; + + timr = lock_timer(timer_id, &flags); + if (!timr) + return -EINVAL; + + overrun = timr->it_overrun_last; + unlock_timer(timr, flags); + + return overrun; +} + +/* Set a POSIX.1b interval timer. */ +/* + * timr->it_lock is taken. Returns TIMER_RETRY if hrtimer_try_to_cancel() + * fails (< 0), 0 otherwise. When old_setting is non-NULL it is filled + * through common_timer_get() before the timer is touched. + */ +static int +common_timer_set(struct k_itimer *timr, int flags, + struct itimerspec *new_setting, struct itimerspec *old_setting) +{ + struct hrtimer *timer = &timr->it.real.timer; + enum hrtimer_mode mode; + + if (old_setting) + common_timer_get(timr, old_setting); + + /* disable the timer */ + timr->it.real.interval.tv64 = 0; + /* + * Careful here. On SMP we could be in the "fire" routine, which will + * be spinning as we hold the lock. But this is ONLY an SMP issue. + */ + if (hrtimer_try_to_cancel(timer) < 0) + return TIMER_RETRY; + + timr->it_requeue_pending = (timr->it_requeue_pending + 2) & + ~REQUEUE_PENDING; + timr->it_overrun_last = 0; + + /* switch off the timer when it_value is zero */ + if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) + return 0; + + mode = flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL; + hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); + timr->it.real.timer.function = posix_timer_fn; + + hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); + + /* Convert interval */ + timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); + + /* SIGEV_NONE timers are not queued ! 
See common_timer_get */ + if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { + /* Set up the correct expiry time for relative timers */ + if (mode == HRTIMER_MODE_REL) { + hrtimer_add_expires(timer, timer->base->get_time()); + } + return 0; + } + + hrtimer_start_expires(timer, mode); + return 0; +} + +/* + * Set a POSIX.1b interval timer. + * + * Returns -EINVAL for a NULL or invalid new_setting or an unknown timer, + * -EFAULT if a user copy fails, otherwise the result of the clock's + * timer_set() hook, which is retried for as long as it returns TIMER_RETRY. + */ +SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, + const struct itimerspec __user *, new_setting, + struct itimerspec __user *, old_setting) +{ + struct k_itimer *timr; + struct itimerspec new_spec, old_spec; + int error = 0; + unsigned long flag; + struct itimerspec *rtn = old_setting ? &old_spec : NULL; + struct k_clock *kc; + + if (!new_setting) + return -EINVAL; + + if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) + return -EFAULT; + + if (!timespec_valid(&new_spec.it_interval) || + !timespec_valid(&new_spec.it_value)) + return -EINVAL; +retry: + timr = lock_timer(timer_id, &flag); + if (!timr) + return -EINVAL; + + kc = clockid_to_kclock(timr->it_clock); + if (WARN_ON_ONCE(!kc || !kc->timer_set)) + error = -EINVAL; + else + error = kc->timer_set(timr, flags, &new_spec, rtn); + + unlock_timer(timr, flag); + if (error == TIMER_RETRY) { + rtn = NULL; // We already got the old time... + goto retry; + } + + if (old_setting && !error && + copy_to_user(old_setting, &old_spec, sizeof (old_spec))) + error = -EFAULT; + + return error; +} + +static int common_timer_del(struct k_itimer *timer) +{ + timer->it.real.interval.tv64 = 0; + + if (hrtimer_try_to_cancel(&timer->it.real.timer) < 0) + return TIMER_RETRY; + return 0; +} + +static inline int timer_delete_hook(struct k_itimer *timer) +{ + struct k_clock *kc = clockid_to_kclock(timer->it_clock); + + if (WARN_ON_ONCE(!kc || !kc->timer_del)) + return -EINVAL; + return kc->timer_del(timer); +} + +/* Delete a POSIX.1b interval timer. 
*/ +SYSCALL_DEFINE1(timer_delete, timer_t, timer_id) +{ + struct k_itimer *timer; + unsigned long flags; + +retry_delete: + timer = lock_timer(timer_id, &flags); + if (!timer) + return -EINVAL; + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } + + spin_lock(¤t->sighand->siglock); + list_del(&timer->list); + spin_unlock(¤t->sighand->siglock); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + timer->it_signal = NULL; + + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); + return 0; +} + +/* + * return timer owned by the process, used by exit_itimers + */ +static void itimer_delete(struct k_itimer *timer) +{ + unsigned long flags; + +retry_delete: + spin_lock_irqsave(&timer->it_lock, flags); + + if (timer_delete_hook(timer) == TIMER_RETRY) { + unlock_timer(timer, flags); + goto retry_delete; + } + list_del(&timer->list); + /* + * This keeps any tasks waiting on the spin lock from thinking + * they got something (see the lock code above). + */ + timer->it_signal = NULL; + + unlock_timer(timer, flags); + release_posix_timer(timer, IT_ID_SET); +} + +/* + * This is called by do_exit or de_thread, only when there are no more + * references to the shared signal_struct. 
+ */ +void exit_itimers(struct signal_struct *sig) +{ + struct k_itimer *tmr; + + while (!list_empty(&sig->posix_timers)) { + tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); + itimer_delete(tmr); + } +} + +SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, + const struct timespec __user *, tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec new_tp; + + if (!kc || !kc->clock_set) + return -EINVAL; + + if (copy_from_user(&new_tp, tp, sizeof (*tp))) + return -EFAULT; + + return kc->clock_set(which_clock, &new_tp); +} + +SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, + struct timespec __user *,tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec kernel_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_get(which_clock, &kernel_tp); + + if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) + error = -EFAULT; + + return error; +} + +SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock, + struct timex __user *, utx) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timex ktx; + int err; + + if (!kc) + return -EINVAL; + if (!kc->clock_adj) + return -EOPNOTSUPP; + + if (copy_from_user(&ktx, utx, sizeof(ktx))) + return -EFAULT; + + err = kc->clock_adj(which_clock, &ktx); + + if (!err && copy_to_user(utx, &ktx, sizeof(ktx))) + return -EFAULT; + + return err; +} + +SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, + struct timespec __user *, tp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec rtn_tp; + int error; + + if (!kc) + return -EINVAL; + + error = kc->clock_getres(which_clock, &rtn_tp); + + if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) + error = -EFAULT; + + return error; +} + +/* + * nanosleep for monotonic and realtime clocks: maps TIMER_ABSTIME in + * flags to the hrtimer mode and sleeps via hrtimer_nanosleep(). + */ +static int common_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsave, struct timespec __user *rmtp) +{ + return 
hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? + HRTIMER_MODE_ABS : HRTIMER_MODE_REL, + which_clock); +} + +SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, + const struct timespec __user *, rqtp, + struct timespec __user *, rmtp) +{ + struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec t; + + if (!kc) + return -EINVAL; + if (!kc->nsleep) + return -ENANOSLEEP_NOTSUP; + + if (copy_from_user(&t, rqtp, sizeof (struct timespec))) + return -EFAULT; + + if (!timespec_valid(&t)) + return -EINVAL; + + return kc->nsleep(which_clock, flags, &t, rmtp); +} + +/* + * This will restart clock_nanosleep by calling the nsleep_restart hook of + * the clock recorded in the restart block. This is required only by + * compat_clock_nanosleep_restart for now. + */ +long clock_nanosleep_restart(struct restart_block *restart_block) +{ + clockid_t which_clock = restart_block->nanosleep.clockid; + struct k_clock *kc = clockid_to_kclock(which_clock); + + if (WARN_ON_ONCE(!kc || !kc->nsleep_restart)) + return -EINVAL; + + return kc->nsleep_restart(restart_block); +} diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig new file mode 100644 index 00000000..f6f69736 --- /dev/null +++ b/kernel/power/Kconfig @@ -0,0 +1,342 @@ +config SUSPEND + bool "Suspend to RAM and standby" + depends on ARCH_SUSPEND_POSSIBLE + default y + ---help--- + Allow the system to enter sleep states in which main memory is + powered and thus its contents are preserved, such as the + suspend-to-RAM state (e.g. the ACPI S3 state). + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. 
+ +config SUSPEND_DEVICE_TIME_DEBUG + bool "Warn when device suspend/resume takes too much time" + depends on SUSPEND && PM_DEBUG + default n + ---help--- + This option will enable a timing function to check device + suspend time consumption. If the device takes more time than + the threshold (default 0.5 ms), it will print the device and + bus name on the console. You can change the threshold + on-the-fly by modifying /sys/power/time_threshold; the time unit + is microseconds. + + This option is only for debugging purposes. If in doubt, say N. + +config SUSPEND_FREEZER + bool "Enable freezer for suspend to RAM/standby" \ + if ARCH_WANTS_FREEZER_CONTROL || BROKEN + depends on SUSPEND + default y + help + This allows you to turn off the freezer for suspend. If this is + done, no tasks are frozen for suspend to RAM/standby. + + Turning OFF this setting is NOT recommended! If in doubt, say Y. + +config HAS_WAKELOCK + bool + +config HAS_EARLYSUSPEND + bool + +config CPUFREQ_GOV_ON_EARLYSUPSEND + bool "Use conservative cpu frequency governor when device enters early suspend" + depends on HAS_EARLYSUSPEND && CPU_FREQ + default n + help + The original cpu frequency governor is restored when the device is resumed. + +config CPUHOTPLUG_EARLYSUSPEND + bool "hotplug cpu when device enters early suspend" + depends on HAS_EARLYSUSPEND && SMP + default n + help + The original number of online cpus is restored when the device is resumed. + +config WAKELOCK + bool "Wake lock" + depends on PM && RTC_CLASS + default n + select HAS_WAKELOCK + ---help--- + Enable wakelocks. When user space requests a sleep state the + sleep request will be delayed until no wake locks are held. + +config WAKELOCK_STAT + bool "Wake lock stats" + depends on WAKELOCK + default y + ---help--- + Report wake lock stats in /proc/wakelocks + +config USER_WAKELOCK + bool "Userspace wake locks" + depends on WAKELOCK + default y + ---help--- + User-space wake lock api. 
Write "lockname" or "lockname timeout" + to /sys/power/wake_lock to lock and, if needed, create a wake lock. + Write "lockname" to /sys/power/wake_unlock to unlock a user wake + lock. + +config EARLYSUSPEND + bool "Early suspend" + depends on WAKELOCK + default y + select HAS_EARLYSUSPEND + ---help--- + Call early suspend handlers when the user requested sleep state + changes. + +choice + prompt "User-space screen access" + default FB_EARLYSUSPEND if !FRAMEBUFFER_CONSOLE + default CONSOLE_EARLYSUSPEND + depends on HAS_EARLYSUSPEND + + config NO_USER_SPACE_SCREEN_ACCESS_CONTROL + bool "None" + + config CONSOLE_EARLYSUSPEND + bool "Console switch on early-suspend" + depends on HAS_EARLYSUSPEND && VT + ---help--- + Register early suspend handler to perform a console switch + when user-space should stop drawing to the screen and a switch + back when it should resume. + + config FB_EARLYSUSPEND + bool "Sysfs interface" + depends on HAS_EARLYSUSPEND + ---help--- + Register early suspend handler that notifies and waits for + user-space through sysfs when user-space should stop drawing + to the screen and notifies user-space when it should resume. +endchoice + +config HIBERNATE_CALLBACKS + bool + +config HIBERNATION + bool "Hibernation (aka 'suspend to disk')" + depends on SWAP && ARCH_HIBERNATION_POSSIBLE + select HIBERNATE_CALLBACKS + select LZO_COMPRESS + select LZO_DECOMPRESS + ---help--- + Enable the suspend to disk (STD) functionality, which is usually + called "hibernation" in user interfaces. STD checkpoints the + system and powers it off; and restores that checkpoint on reboot. + + You can suspend your machine with 'echo disk > /sys/power/state' + after placing resume=/dev/swappartition on the kernel command line + in your bootloader's configuration file. + + Alternatively, you can use the additional userland tools available + from <http://suspend.sf.net>. + + In principle it does not require ACPI or APM, although for example + ACPI will be used for the final steps when it is available. 
One + of the reasons to use software suspend is that the firmware hooks + for suspend states like suspend-to-RAM (STR) often don't work very + well with Linux. + + It creates an image which is saved in your active swap. Upon the next + boot, pass the 'resume=/dev/swappartition' argument to the kernel to + have it detect the saved image, restore memory state from it, and + continue to run as before. If you do not want the previous state to + be reloaded, then use the 'noresume' kernel command line argument. + Note, however, that fsck will be run on your filesystems and you will + need to run mkswap against the swap partition used for the suspend. + + It also works with swap files to a limited extent (for details see + <file:Documentation/power/swsusp-and-swap-files.txt>). + + Right now you may boot without resuming and resume later but in the + meantime you cannot use the swap partition(s)/file(s) involved in + suspending. Also in this case you must not use the filesystems + that were mounted before the suspend. In particular, you MUST NOT + MOUNT any journaled filesystems mounted before the suspend or they + will get corrupted in a nasty way. + + For more information take a look at <file:Documentation/power/swsusp.txt>. + +config PM_STD_PARTITION + string "Default resume partition" + depends on HIBERNATION + default "" + ---help--- + The default resume partition is the partition that the suspend- + to-disk implementation will look for a suspended disk image. + + The partition specified here will be different for almost every user. + It should be a valid swap partition (at least for now) that is turned + on before suspending. + + The partition specified can be overridden by specifying: + + resume=/dev/<other device> + + which will set the resume partition to the device specified. + + Note there is currently not a way to specify which device to save the + suspended image to. It will simply pick the first available swap + device. 
+ +config PM_SLEEP + def_bool y + depends on SUSPEND || HIBERNATE_CALLBACKS + +config PM_SLEEP_SMP + def_bool y + depends on SMP + depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE + depends on PM_SLEEP + select HOTPLUG + select HOTPLUG_CPU + +config PM_RUNTIME + bool "Run-time PM core functionality" + depends on !IA64_HP_SIM + ---help--- + Enable functionality allowing I/O devices to be put into energy-saving + (low power) states at run time (or autosuspended) after a specified + period of inactivity and woken up in response to a hardware-generated + wake-up event or a driver's request. + + Hardware support is generally required for this functionality to work + and the bus type drivers of the buses the devices are on are + responsible for the actual handling of the autosuspend requests and + wake-up events. + +config PM + def_bool y + depends on PM_SLEEP || PM_RUNTIME + +config PM_DEBUG + bool "Power Management Debug Support" + depends on PM + ---help--- + This option enables various debugging support in the Power Management + code. This is helpful when debugging and reporting PM bugs, like + suspend support. + +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + +config PM_TEST_SUSPEND + bool "Test suspend/resume and wakealarm during bootup" + depends on SUSPEND && PM_DEBUG && RTC_CLASS=y + ---help--- + This option will let you suspend your machine during bootup, and + make it wake up a few seconds later using an RTC wakeup alarm. + Enable this with a kernel parameter like "test_suspend=mem". + + You probably want to have your system's RTC driver statically + linked, ensuring that it's available when this test runs. 
+ +config CAN_PM_TRACE + def_bool y + depends on PM_DEBUG && PM_SLEEP + +config PM_TRACE + bool + help + This enables code to save the last PM event point across + reboot. The architecture needs to support this, x86 for + example does by saving things in the RTC, see below. + + The architecture specific code must provide the extern + functions from as well as the + header with a TRACE_RESUME() macro. + + The way the information is presented is architecture- + dependent, x86 will print the information during a + late_initcall. + +config PM_TRACE_RTC + bool "Suspend/resume event tracing" + depends on CAN_PM_TRACE + depends on X86 + select PM_TRACE + ---help--- + This enables some cheesy code to save the last PM event point in the + RTC across reboots, so that you can debug a machine that just hangs + during suspend (or more commonly, during resume). + + To use this debugging feature you should attempt to suspend the + machine, reboot it and then run + + dmesg -s 1000000 | grep 'hash matches' + + CAUTION: this option will cause your machine's real-time clock to be + set to an invalid time after a resume. + +config APM_EMULATION + tristate "Advanced Power Management Emulation" + depends on PM && SYS_SUPPORTS_APM_EMULATION + help + APM is a BIOS specification for saving power using several different + techniques. This is mostly useful for battery powered laptops with + APM compliant BIOSes. If you say Y here, the system time will be + reset after a RESUME operation, the /proc/apm device will provide + battery status information, and user-space programs will receive + notification of APM "events" (e.g. battery status change). + + In order to use APM, you will need supporting software. For location + and more information, read and the + Battery Powered Linux mini-HOWTO, available from + . + + This driver does not spin down disk drives (see the hdparm(8) + manpage ("man 8 hdparm") for that), and it doesn't turn off + VESA-compliant "green" monitors. 
+ + Generally, if you don't have a battery in your machine, there isn't + much point in using this driver and you should say N. If you get + random kernel OOPSes or reboots that don't seem to be related to + anything, try disabling/enabling this option (or disabling/enabling + APM in your BIOS). + +config ARCH_HAS_OPP + bool + +config PM_OPP + bool "Operating Performance Point (OPP) Layer library" + depends on ARCH_HAS_OPP + ---help--- + SOCs have a standard set of tuples consisting of frequency and + voltage pairs that the device will support per voltage domain. This + is called Operating Performance Point or OPP. The actual definitions + of OPP varies over silicon within the same family of devices. + + OPP layer organizes the data internally using device pointers + representing individual voltage domains and provides SOC + implementations a ready to use framework to manage OPPs. + For more information, read + +config PM_RUNTIME_CLK + def_bool y + depends on PM_RUNTIME && HAVE_CLK + +config SUSPEND_TIME + bool "Log time spent in suspend" + ---help--- + Prints the time spent in suspend in the kernel log, and + keeps statistics on the time spent in suspend in + /sys/kernel/debug/suspend_time diff --git a/kernel/power/Makefile b/kernel/power/Makefile new file mode 100644 index 00000000..8de92715 --- /dev/null +++ b/kernel/power/Makefile @@ -0,0 +1,23 @@ + +ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + +obj-$(CONFIG_PM) += main.o +obj-$(CONFIG_PM_SLEEP) += console.o +obj-$(CONFIG_FREEZER) += process.o +obj-$(CONFIG_SUSPEND) += suspend.o +obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o +obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ + block_io.o +obj-$(CONFIG_WAKELOCK) += wakelock.o +obj-$(CONFIG_USER_WAKELOCK) += userwakelock.o +obj-$(CONFIG_EARLYSUSPEND) += earlysuspend.o +obj-$(CONFIG_CONSOLE_EARLYSUSPEND) += consoleearlysuspend.o +obj-$(CONFIG_FB_EARLYSUSPEND) += fbearlysuspend.o +obj-$(CONFIG_CPUFREQ_GOV_ON_EARLYSUPSEND) += cpufreq_earlysuspend.o 
+obj-$(CONFIG_CPUHOTPLUG_EARLYSUSPEND)	+= cpuhotplug_earlysuspend.o
+obj-$(CONFIG_SUSPEND_TIME)	+= suspend_time.o
+
+obj-$(CONFIG_MAGIC_SYSRQ)	+= poweroff.o
+
+obj-$(CONFIG_CPU_FREQ)		+= cpufreq_governor_chg.o
+
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 00000000..d09dd10c
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
+/*
+ * This file provides functions for block I/O operations on swap/file.
+ *
+ * Copyright (C) 1998,2001-2005 Pavel Machek
+ * Copyright (C) 2006 Rafael J. Wysocki
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include
+#include
+#include
+#include
+
+#include "power.h"
+
+/**
+ *	submit - submit BIO request.
+ *	@rw:	READ or WRITE.
+ *	@bdev, @sector:	block device and first sector of the page on it.
+ *	@page:	page we're reading or writing.
+ *	@bio_chain:	list of pending bios (for async reading); NULL to wait here.
+ *
+ *	Straight from the textbook - allocate and initialize the bio.
+ *	If we're reading, make sure the page is marked as dirty.
+ *	Then submit it and, if @bio_chain == NULL, wait.  Returns 0 or -EFAULT.
+ */
+static int submit(int rw, struct block_device *bdev, sector_t sector,
+		struct page *page, struct bio **bio_chain)
+{
+	const int bio_rw = rw | REQ_SYNC;
+	struct bio *bio;
+
+	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
+	bio->bi_sector = sector;
+	bio->bi_bdev = bdev;
+	bio->bi_end_io = end_swap_bio_read;
+
+	if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+		printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
+			(unsigned long long)sector);
+		bio_put(bio);
+		return -EFAULT;
+	}
+
+	lock_page(page);
+	bio_get(bio);	/* extra reference, dropped by the bio_put() below or in hib_wait_on_bio_chain() */
+
+	if (bio_chain == NULL) {
+		submit_bio(bio_rw, bio);
+		wait_on_page_locked(page);	/* synchronous case: block until the page is unlocked */
+		if (rw == READ)
+			bio_set_pages_dirty(bio);
+		bio_put(bio);
+	} else {
+		if (rw == READ)
+			get_page(page);	/* These pages are freed later */
+		bio->bi_private = *bio_chain;	/* push the bio onto the caller's chain */
+		*bio_chain = bio;
+		submit_bio(bio_rw, bio);
+	}
+	return 0;
+}
+
+int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+	return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),	/* page offset -> 512-byte sectors */
+			virt_to_page(addr), bio_chain);
+}
+
+int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
+{
+	return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),	/* page offset -> 512-byte sectors */
+			virt_to_page(addr), bio_chain);
+}
+
+int hib_wait_on_bio_chain(struct bio **bio_chain)
+{
+	struct bio *bio;
+	struct bio *next_bio;
+	int ret = 0;
+
+	if (bio_chain == NULL)
+		return 0;
+
+	bio = *bio_chain;
+	if (bio == NULL)
+		return 0;
+	while (bio) {
+		struct page *page;
+
+		next_bio = bio->bi_private;	/* chain link stored by submit() */
+		page = bio->bi_io_vec[0].bv_page;
+		wait_on_page_locked(page);
+		if (!PageUptodate(page) || PageError(page))
+			ret = -EIO;	/* keep draining the chain, report the failure at the end */
+		put_page(page);
+		bio_put(bio);
+		bio = next_bio;
+	}
+	*bio_chain = NULL;
+	return ret;
+}
diff --git a/kernel/power/console.c b/kernel/power/console.c
new file mode 100644
index 00000000..218e5af9
--- /dev/null
+++ b/kernel/power/console.c
@@ -0,0 +1,35 @@
+/*
+ * kernel/power/console.c - Functions for saving/restoring console.
+ *
+ * Originally from swsusp.
+ */
+
+#include
+#include
+#include
+#include
+#include "power.h"
+
+#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
+#define SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)	/* console switched to by pm_prepare_console() */
+
+static int orig_fgconsole, orig_kmsg;	/* saved by pm_prepare_console(), used by pm_restore_console() */
+
+int pm_prepare_console(void)
+{
+	orig_fgconsole = vt_move_to_console(SUSPEND_CONSOLE, 1);
+	if (orig_fgconsole < 0)
+		return 1;	/* console switch failed; pm_restore_console() will then do nothing */
+
+	orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE);
+	return 0;
+}
+
+void pm_restore_console(void)
+{
+	if (orig_fgconsole >= 0) {	/* only undo a switch that actually happened */
+		vt_move_to_console(orig_fgconsole, 0);
+		vt_kmsg_redirect(orig_kmsg);
+	}
+}
+#endif
diff --git a/kernel/power/consoleearlysuspend.c b/kernel/power/consoleearlysuspend.c
new file mode 100644
index 00000000..a3edcb26
--- /dev/null
+++ b/kernel/power/consoleearlysuspend.c
@@ -0,0 +1,78 @@
+/* kernel/power/consoleearlysuspend.c
+ *
+ * Copyright (C) 2005-2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define EARLY_SUSPEND_CONSOLE	(MAX_NR_CONSOLES-1)	/* console made active during early suspend */
+
+static int orig_fgconsole;	/* fg_console captured at early suspend, restored at late resume */
+static void console_early_suspend(struct early_suspend *h)
+{
+	acquire_console_sem();
+	orig_fgconsole = fg_console;
+	if (vc_allocate(EARLY_SUSPEND_CONSOLE))
+		goto err;
+	if (set_console(EARLY_SUSPEND_CONSOLE))
+		goto err;
+	release_console_sem();
+
+	if (vt_waitactive(EARLY_SUSPEND_CONSOLE + 1))	/* NOTE(review): presumably takes a 1-based VT number - confirm */
+		pr_warning("console_early_suspend: Can't switch VCs.\n");
+	return;
+err:
+	pr_warning("console_early_suspend: Can't set console\n");
+	release_console_sem();	/* error paths jump here with the console semaphore still held */
+}
+
+static void console_late_resume(struct early_suspend *h)
+{
+	int ret;
+	acquire_console_sem();
+	ret = set_console(orig_fgconsole);
+	release_console_sem();
+	if (ret) {
+		pr_warning("console_late_resume: Can't set console.\n");
+		return;
+	}
+
+	if (vt_waitactive(orig_fgconsole + 1))
+		pr_warning("console_late_resume: Can't switch VCs.\n");
+}
+
+static struct early_suspend console_early_suspend_desc = {
+	.level = EARLY_SUSPEND_LEVEL_STOP_DRAWING,
+	.suspend = console_early_suspend,
+	.resume = console_late_resume,
+};
+
+static int __init console_early_suspend_init(void)
+{
+	register_early_suspend(&console_early_suspend_desc);
+	return 0;
+}
+
+static void  __exit console_early_suspend_exit(void)
+{
+	unregister_early_suspend(&console_early_suspend_desc);
+}
+
+module_init(console_early_suspend_init);
+module_exit(console_early_suspend_exit);
+
diff --git a/kernel/power/cpufreq_earlysuspend.c b/kernel/power/cpufreq_earlysuspend.c
new file mode 100644
index 00000000..65faf3b9
--- /dev/null
+++ b/kernel/power/cpufreq_earlysuspend.c
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2012 Freescale Semiconductor, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include
+#include
+#include
+
+extern void cpufreq_save_default_governor(void);
+extern void cpufreq_restore_default_governor(void);
+extern void cpufreq_set_conservative_governor_param(int up_th, int down_th);
+extern void cpufreq_set_performance_governor(void);
+extern void cpufreq_set_conservative_governor(void);
+
+#define SET_GOVERNOR_TO_PERFORMANCE 1
+
+static void cpufreq_early_suspend(struct early_suspend *p)
+{
+	cpufreq_save_default_governor();
+#ifdef SET_GOVERNOR_TO_PERFORMANCE//[
+	cpufreq_set_performance_governor();
+#else //][SET_GOVERNOR_TO_PERFORMANCE
+	cpufreq_set_conservative_governor();
+	cpufreq_set_conservative_governor_param(
+		SET_CONSERVATIVE_GOVERNOR_UP_THRESHOLD,
+		SET_CONSERVATIVE_GOVERNOR_DOWN_THRESHOLD);
+#endif //]SET_GOVERNOR_TO_PERFORMANCE
+}
+
+static void cpufreq_late_resume(struct early_suspend *p)
+{
+	cpufreq_restore_default_governor();
+}
+
+struct early_suspend cpufreq_earlysuspend = {
+	.level = EARLY_SUSPEND_LEVEL_POST_DISABLE_FB,
+	.suspend = cpufreq_early_suspend,
+	.resume = cpufreq_late_resume,
+};
+
+static int __init cpufreq_on_earlysuspend_init(void)
+{
+	register_early_suspend(&cpufreq_earlysuspend);
+	return 0;
+}
+
+static void __exit cpufreq_on_earlysuspend_exit(void)
+{
+	unregister_early_suspend(&cpufreq_earlysuspend);
+}
+
+module_init(cpufreq_on_earlysuspend_init);
+module_exit(cpufreq_on_earlysuspend_exit);
diff --git a/kernel/power/cpufreq_governor_chg.c b/kernel/power/cpufreq_governor_chg.c
new file mode 100644
index 00000000..719b5f99
--- /dev/null
+++ b/kernel/power/cpufreq_governor_chg.c
@@ -0,0 +1,162 @@
+
+#include
+#include
+#include <linux/err.h>
+#include <linux/string.h>
+#if 0
+extern void cpufreq_save_default_governor(void);
+extern void cpufreq_restore_default_governor(void);
+extern void cpufreq_set_conservative_governor(void);
+extern void cpufreq_set_performance_governor(void);
+extern void cpufreq_set_conservative_governor_param(int up_th, int down_th);
+#endif
+
+#define GOV_CHG_DBG 1
+
+#define SET_CONSERVATIVE_GOVERNOR_UP_THRESHOLD		95
+#define SET_CONSERVATIVE_GOVERNOR_DOWN_THRESHOLD	50
+
+/* Governor name captured by cpufreq_save_default_governor(); "" until then. */
+static char cpufreq_gov_default[32];
+
+static char *sz_cpufreq_gov_performance = "performance";
+static char *sz_cpufreq_gov_conservative = "conservative";
+
+static char *cpufreq_sysfs_place_holder = "/sys/devices/system/cpu/cpu%i/cpufreq/scaling_governor";
+static char *cpufreq_gov_conservative_param = "/sys/devices/system/cpu/cpufreq/conservative/%s";
+
+/*
+ * cpufreq_sysfs_write - write a string to a sysfs attribute from the kernel.
+ * @path:	absolute path of the attribute file.
+ * @value:	NUL-terminated text to write.
+ *
+ * Opens @path for writing, calls the file's ->write() method directly and
+ * closes it again.  filp_open() reports failure with an ERR_PTR() value,
+ * never NULL, so its result is tested with IS_ERR().
+ *
+ * NOTE(review): @value is a kernel buffer handed to a method that takes a
+ * __user pointer; this presumably relies on the caller running with
+ * KERNEL_DS - confirm.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int cpufreq_sysfs_write(const char *path, const char *value)
+{
+	struct file *filp;
+	loff_t offset = 0;
+	ssize_t written;
+	int ret = 0;
+
+	filp = filp_open(path, O_WRONLY, 0);
+	if (IS_ERR(filp)) {
+		pr_err("%s. Can't open %s\n", __func__, path);
+		return PTR_ERR(filp);
+	}
+
+	if (filp->f_op != NULL && filp->f_op->write != NULL) {
+		written = filp->f_op->write(filp, value, strlen(value),
+					    &offset);
+		if (written < 0) {
+			pr_err("%s. Can't write %s\n", __func__, path);
+			ret = written;
+		}
+	} else {
+		pr_err("f_op might be null\n");
+		ret = -EINVAL;
+	}
+
+	filp_close(filp, NULL);
+	return ret;
+}
+
+/* Write @governor to the scaling_governor attribute of every online CPU. */
+static void cpufreq_set_governor(char *governor)
+{
+	char buf[128];
+	int i;
+
+	if (governor == NULL)
+		return;
+
+	for_each_online_cpu(i) {
+		snprintf(buf, sizeof(buf), cpufreq_sysfs_place_holder, i);
+		if (cpufreq_sysfs_write(buf, governor))
+			continue;
+#ifdef GOV_CHG_DBG//[
+		printk("%s():set policy \"%s\"\n",__FUNCTION__,governor);
+#endif //]GOV_CHG_DBG
+	}
+}
+
+/*
+ * Remember the governor of CPU 0's policy in cpufreq_gov_default so that
+ * cpufreq_restore_default_governor() can reinstate it later.
+ */
+void cpufreq_save_default_governor(void)
+{
+	int ret;
+	struct cpufreq_policy current_policy;
+
+	ret = cpufreq_get_policy(&current_policy, 0);
+	if (ret < 0 || current_policy.governor == NULL) {
+		/* current_policy is not usable here; keep the previous name. */
+		pr_err("%s: cpufreq_get_policy got error\n", __func__);
+		return;
+	}
+	/* Copy as a string, never reading past the name's terminator. */
+	strlcpy(cpufreq_gov_default, current_policy.governor->name,
+		sizeof(cpufreq_gov_default));
+#ifdef GOV_CHG_DBG//[
+	printk("%s():save policy \"%s\"\n",__FUNCTION__,cpufreq_gov_default);
+#endif //]GOV_CHG_DBG
+}
+
+/* Reinstate the governor saved by cpufreq_save_default_governor(). */
+void cpufreq_restore_default_governor(void)
+{
+	/* Nothing has been saved (or the save failed): leave governors alone. */
+	if (cpufreq_gov_default[0] == '\0')
+		return;
+
+	cpufreq_set_governor(cpufreq_gov_default);
+#ifdef GOV_CHG_DBG//[
+	printk("%s():restore policy \"%s\"\n",__FUNCTION__,cpufreq_gov_default);
+#endif //]GOV_CHG_DBG
+}
+
+/*
+ * Write the conservative governor's up_threshold and down_threshold files.
+ * @up_th must be strictly greater than @down_th, otherwise nothing is written.
+ */
+void cpufreq_set_conservative_governor_param(int up_th, int down_th)
+{
+	char buf[128], parm[16];
+
+	if (up_th <= down_th) {
+		printk(KERN_ERR "%s: up_th(%d) must be greater than down_th(%d)\n",
+			__func__, up_th, down_th);
+		return;
+	}
+
+	snprintf(parm, sizeof(parm), "%d", up_th);
+	snprintf(buf, sizeof(buf), cpufreq_gov_conservative_param, "up_threshold");
+	cpufreq_sysfs_write(buf, parm);
+
+	snprintf(parm, sizeof(parm), "%d", down_th);
+	snprintf(buf, sizeof(buf), cpufreq_gov_conservative_param, "down_threshold");
+	cpufreq_sysfs_write(buf, parm);
+}
+
+/* Switch every online CPU to the "performance" governor. */
+void cpufreq_set_performance_governor(void)
+{
+	cpufreq_set_governor(sz_cpufreq_gov_performance);
+}
+
+/* Switch every online CPU to the "conservative" governor. */
+void cpufreq_set_conservative_governor(void)
+{
+	cpufreq_set_governor(sz_cpufreq_gov_conservative);
+}
+
+
diff --git a/kernel/power/cpuhotplug_earlysuspend.c b/kernel/power/cpuhotplug_earlysuspend.c
new file mode 100644
index 00000000..4b12c7e9
--- /dev/null
+++ b/kernel/power/cpuhotplug_earlysuspend.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2011-2012 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include
+#include
+#include <linux/errno.h>
+
+enum {
+	DEBUG_USER_STATE = 1U << 0,
+	DEBUG_SUSPEND = 1U << 2,
+	DEBUG_VERBOSE = 1U << 3,
+};
+static int debug_mask = DEBUG_USER_STATE;
+module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
+
+/* Per-cpu flag: set to 1 before a cpu is offlined here, 0 before it is onlined. */
+static DEFINE_PER_CPU(int, tag);
+static struct work_struct cpu_up_work;
+static struct workqueue_struct *cpu_op_workqueue;
+
+/*
+ * Bring @cpu online (@status true) or take it offline (@status false),
+ * updating its tag first, and log any error the hotplug call returns.
+ */
+static void earlysuspend_cpu_op(int cpu, bool status)
+{
+	int ret;
+
+	/* tag whether the cpu is being onlined or offlined */
+	if (status) {
+		per_cpu(tag, cpu) = 0;
+		ret = cpu_up(cpu);
+	} else {
+		per_cpu(tag, cpu) = 1;
+		ret = cpu_down(cpu);
+	}
+	if (ret)
+		pr_err("%s: CPU%d %s failed: %d\n", __func__, cpu,
+			status ? "up" : "down", ret);
+}
+
+/* Early-suspend handler: offline every online cpu except the first one. */
+static void cpuhotplug_early_suspend(struct early_suspend *p)
+{
+	int first_cpu, cpu;
+	/* skip the first cpu, cpu0 always online */
+	first_cpu = cpumask_first(cpu_online_mask);
+	for_each_possible_cpu(cpu) {
+		if (cpu == first_cpu)
+			continue;
+		if (cpu_online(cpu))
+			earlysuspend_cpu_op(cpu, false);
+	}
+}
+
+/* Work item: bring back the offline cpus whose tag is set. */
+static void cpu_up_work_func(struct work_struct *work)
+{
+	int first_cpu, c;
+	/* skip the first cpu, cpu0 always online */
+	first_cpu = cpumask_first(cpu_online_mask);
+	for_each_possible_cpu(c) {
+		if (c == first_cpu)
+			continue;
+		if (debug_mask & DEBUG_SUSPEND)
+			pr_info(" %s: CPU%d tag %d\n", __func__, c,
+				per_cpu(tag, c));
+		if (!cpu_online(c) && per_cpu(tag, c))
+			earlysuspend_cpu_op(c, true);
+	}
+}
+
+/* Late-resume handler: defer the cpu_up() calls to cpu_op_workqueue. */
+static void cpuhotplug_late_resume(struct early_suspend *p)
+{
+	if (debug_mask & DEBUG_SUSPEND)
+		pr_info(" %s: bootup secondary cpus\n", __func__);
+	queue_work(cpu_op_workqueue, &cpu_up_work);
+}
+
+struct early_suspend cpuhotplug_earlysuspend = {
+	.level = EARLY_SUSPEND_LEVEL_DISABLE_FB,
+	.suspend = cpuhotplug_early_suspend,
+	.resume = cpuhotplug_late_resume,
+};
+
+static int __init cpuhotplug_earlysuspend_init(void)
+{
+	cpu_op_workqueue = create_workqueue("cpu hotplug earlysuspend wq");
+	/* the resume handler queues onto this workqueue unconditionally */
+	if (cpu_op_workqueue == NULL)
+		return -ENOMEM;
+	INIT_WORK(&cpu_up_work, cpu_up_work_func);
+
+	register_early_suspend(&cpuhotplug_earlysuspend);
+	return 0;
+}
+
+static void __exit cpuhotplug_earlysuspend_exit(void)
+{
+	/* unhook the handlers first so the resume handler cannot queue new work */
+	unregister_early_suspend(&cpuhotplug_earlysuspend);
+	if (NULL != cpu_op_workqueue)
+		destroy_workqueue(cpu_op_workqueue);
+}
+
+module_init(cpuhotplug_earlysuspend_init);
+module_exit(cpuhotplug_earlysuspend_exit);
+MODULE_AUTHOR("Freescale Semiconductor, Inc.");
diff --git a/kernel/power/earlysuspend.c b/kernel/power/earlysuspend.c
new file mode 100644
index 00000000..b15f02eb
--- /dev/null
+++ b/kernel/power/earlysuspend.c
@@ -0,0 +1,187 @@
+/* kernel/power/earlysuspend.c
+ *
+ * Copyright (C) 2005-2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include /* sys_sync */
+#include
+#include
+
+#include "power.h"
+
+enum {
+	DEBUG_USER_STATE = 1U << 0,
+	DEBUG_SUSPEND = 1U << 2,
+	DEBUG_VERBOSE = 1U << 3,
+};
+static int debug_mask = DEBUG_USER_STATE;
+module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
+
+static DEFINE_MUTEX(early_suspend_lock);	/* held while the handler list is walked or changed */
+static LIST_HEAD(early_suspend_handlers);	/* kept sorted by ascending ->level */
+static void early_suspend(struct work_struct *work);
+static void late_resume(struct work_struct *work);
+static DECLARE_WORK(early_suspend_work, early_suspend);
+static DECLARE_WORK(late_resume_work, late_resume);
+static DEFINE_SPINLOCK(state_lock);	/* taken around every update of 'state' */
+enum {
+	SUSPEND_REQUESTED = 0x1,	/* set/cleared by request_suspend_state() */
+	SUSPENDED = 0x2,	/* set by early_suspend(), cleared by late_resume() */
+	SUSPEND_REQUESTED_AND_SUSPENDED = SUSPEND_REQUESTED | SUSPENDED,
+};
+static int state;
+
+void register_early_suspend(struct early_suspend *handler)
+{
+	struct list_head *pos;
+
+	mutex_lock(&early_suspend_lock);
+	list_for_each(pos, &early_suspend_handlers) {
+		struct early_suspend *e;
+		e = list_entry(pos, struct early_suspend, link);
+		if (e->level > handler->level)
+			break;	/* insert before the first entry with a higher level */
+	}
+	list_add_tail(&handler->link, pos);
+	if ((state & SUSPENDED) && handler->suspend)
+		handler->suspend(handler);	/* already suspended: suspend the newcomer right away */
+	mutex_unlock(&early_suspend_lock);
+}
+EXPORT_SYMBOL(register_early_suspend);
+
+void unregister_early_suspend(struct early_suspend *handler)
+{
+	mutex_lock(&early_suspend_lock);
+	list_del(&handler->link);
+	mutex_unlock(&early_suspend_lock);
+}
+EXPORT_SYMBOL(unregister_early_suspend);
+
+static void early_suspend(struct work_struct *work)
+{
+	struct early_suspend *pos;
+	unsigned long irqflags;
+	int abort = 0;
+
+	mutex_lock(&early_suspend_lock);
+	spin_lock_irqsave(&state_lock, irqflags);
+	if (state == SUSPEND_REQUESTED)
+		state |= SUSPENDED;
+	else
+		abort = 1;	/* request withdrawn, or handlers already suspended */
+	spin_unlock_irqrestore(&state_lock, irqflags);
+
+	if (abort) {
+		if (debug_mask & DEBUG_SUSPEND)
+			pr_info("early_suspend: abort, state %d\n", state);
+		mutex_unlock(&early_suspend_lock);
+		goto abort;
+	}
+
+	if (debug_mask & DEBUG_SUSPEND)
+		pr_info("early_suspend: call handlers\n");
+	list_for_each_entry(pos, &early_suspend_handlers, link) {	/* ascending level order */
+		if (pos->suspend != NULL) {
+			if (debug_mask & DEBUG_VERBOSE)
+				pr_info("early_suspend: calling %pf\n", pos->suspend);
+			pos->suspend(pos);
+		}
+	}
+	mutex_unlock(&early_suspend_lock);
+
+	if (debug_mask & DEBUG_SUSPEND)
+		pr_info("early_suspend: sync\n");
+
+	sys_sync();
+abort:
+	spin_lock_irqsave(&state_lock, irqflags);
+	if (state == SUSPEND_REQUESTED_AND_SUSPENDED)
+		wake_unlock(&main_wake_lock);	/* only when still requested and suspended */
+	spin_unlock_irqrestore(&state_lock, irqflags);
+}
+
+static void late_resume(struct work_struct *work)
+{
+	struct early_suspend *pos;
+	unsigned long irqflags;
+	int abort = 0;
+
+	mutex_lock(&early_suspend_lock);
+	spin_lock_irqsave(&state_lock, irqflags);
+	if (state == SUSPENDED)
+		state &= ~SUSPENDED;
+	else
+		abort = 1;	/* not suspended, or a new suspend was requested meanwhile */
+	spin_unlock_irqrestore(&state_lock, irqflags);
+
+	if (abort) {
+		if (debug_mask & DEBUG_SUSPEND)
+			pr_info("late_resume: abort, state %d\n", state);
+		goto abort;
+	}
+	if (debug_mask & DEBUG_SUSPEND)
+		pr_info("late_resume: call handlers\n");
+	list_for_each_entry_reverse(pos, &early_suspend_handlers, link) {	/* reverse of suspend order */
+		if (pos->resume != NULL) {
+			if (debug_mask & DEBUG_VERBOSE)
+				pr_info("late_resume: calling %pf\n", pos->resume);
+
+			pos->resume(pos);
+		}
+	}
+	if (debug_mask & DEBUG_SUSPEND)
+		pr_info("late_resume: done\n");
+abort:
+	mutex_unlock(&early_suspend_lock);
+}
+
+void request_suspend_state(suspend_state_t new_state)
+{
+	unsigned long irqflags;
+	int old_sleep;
+
+	spin_lock_irqsave(&state_lock, irqflags);
+	old_sleep = state & SUSPEND_REQUESTED;
+	if (debug_mask & DEBUG_USER_STATE) {
+		struct timespec ts;
+		struct rtc_time tm;
+		getnstimeofday(&ts);
+		rtc_time_to_tm(ts.tv_sec, &tm);
+		pr_info("request_suspend_state: %s (%d->%d) at %lld "
+			"(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n",
+			new_state != PM_SUSPEND_ON ? "sleep" : "wakeup",
+			requested_suspend_state, new_state,
+			ktime_to_ns(ktime_get()),
+			tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+			tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
+	}
+	if (!old_sleep && new_state != PM_SUSPEND_ON) {	/* awake -> sleep requested */
+		state |= SUSPEND_REQUESTED;
+		queue_work(suspend_work_queue, &early_suspend_work);
+	} else if (old_sleep && new_state == PM_SUSPEND_ON) {	/* sleep requested -> wake up */
+		state &= ~SUSPEND_REQUESTED;
+		wake_lock(&main_wake_lock);
+		queue_work(suspend_work_queue, &late_resume_work);
+	}
+	requested_suspend_state = new_state;
+	spin_unlock_irqrestore(&state_lock, irqflags);
+}
+
+suspend_state_t get_suspend_state(void)
+{
+	return requested_suspend_state;
+}
diff --git a/kernel/power/fbearlysuspend.c b/kernel/power/fbearlysuspend.c
new file mode 100644
index 00000000..15137650
--- /dev/null
+++ b/kernel/power/fbearlysuspend.c
@@ -0,0 +1,153 @@
+/* kernel/power/fbearlysuspend.c
+ *
+ * Copyright (C) 2005-2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include
+#include
+#include
+
+#include "power.h"
+
+static wait_queue_head_t fb_state_wq;	/* woken on every fb_state change */
+static DEFINE_SPINLOCK(fb_state_lock);	/* taken around writes to fb_state */
+static enum {
+	FB_STATE_STOPPED_DRAWING,	/* set by wait_for_fb_wake_show() after a stop request */
+	FB_STATE_REQUEST_STOP_DRAWING,	/* set by stop_drawing_early_suspend() */
+	FB_STATE_DRAWING_OK,	/* initial state; set again on late resume */
+} fb_state;
+
+/* tell userspace to stop drawing, wait for it to stop */
+static void stop_drawing_early_suspend(struct early_suspend *h)
+{
+	int ret;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&fb_state_lock, irq_flags);
+	fb_state = FB_STATE_REQUEST_STOP_DRAWING;
+	spin_unlock_irqrestore(&fb_state_lock, irq_flags);
+
+	wake_up_all(&fb_state_wq);
+	ret = wait_event_timeout(fb_state_wq,
+				 fb_state == FB_STATE_STOPPED_DRAWING,
+				 HZ);	/* give up after HZ jiffies */
+	if (unlikely(fb_state != FB_STATE_STOPPED_DRAWING))
+		pr_warning("stop_drawing_early_suspend: timeout waiting for "
+			   "userspace to stop drawing\n");
+}
+
+/* tell userspace to start drawing */
+static void start_drawing_late_resume(struct early_suspend *h)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&fb_state_lock, irq_flags);
+	fb_state = FB_STATE_DRAWING_OK;
+	spin_unlock_irqrestore(&fb_state_lock, irq_flags);
+	wake_up(&fb_state_wq);
+}
+
+static struct early_suspend stop_drawing_early_suspend_desc = {
+	.level = EARLY_SUSPEND_LEVEL_STOP_DRAWING,
+	.suspend = stop_drawing_early_suspend,
+	.resume = start_drawing_late_resume,
+};
+
+static ssize_t wait_for_fb_sleep_show(struct kobject *kobj,
+				      struct kobj_attribute *attr, char *buf)
+{
+	char *s = buf;
+	int ret;
+
+	ret = wait_event_interruptible(fb_state_wq,
+				       fb_state != FB_STATE_DRAWING_OK);	/* block the reader until drawing must stop */
+	if (ret && fb_state == FB_STATE_DRAWING_OK)
+		return ret;	/* interrupted while drawing is still allowed */
+	else
+		s += sprintf(buf, "sleeping");
+	return s - buf;
+}
+
+static ssize_t wait_for_fb_wake_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	char *s = buf;
+	int ret;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&fb_state_lock, irq_flags);
+	if (fb_state == FB_STATE_REQUEST_STOP_DRAWING) {
+		fb_state = FB_STATE_STOPPED_DRAWING;	/* reading this file acknowledges the stop request */
+		wake_up(&fb_state_wq);
+	}
+	spin_unlock_irqrestore(&fb_state_lock, irq_flags);
+
+	ret = wait_event_interruptible(fb_state_wq,
+				       fb_state == FB_STATE_DRAWING_OK);	/* block the reader until drawing may resume */
+	if (ret && fb_state != FB_STATE_DRAWING_OK)
+		return ret;	/* interrupted before drawing was re-enabled */
+	else
+		s += sprintf(buf, "awake");
+
+	return s - buf;
+}
+
+#define power_ro_attr(_name) \
+static struct kobj_attribute _name##_attr = {	\
+	.attr	= {				\
+		.name = __stringify(_name),	\
+		.mode = 0444,			\
+	},					\
+	.show	= _name##_show,			\
+	.store	= NULL,		\
+}
+
+power_ro_attr(wait_for_fb_sleep);
+power_ro_attr(wait_for_fb_wake);
+
+static struct attribute *g[] = {
+	&wait_for_fb_sleep_attr.attr,
+	&wait_for_fb_wake_attr.attr,
+	NULL,
+};
+
+static struct attribute_group attr_group = {
+	.attrs = g,
+};
+
+static int __init android_power_init(void)
+{
+	int ret;
+
+	init_waitqueue_head(&fb_state_wq);
+	fb_state = FB_STATE_DRAWING_OK;
+
+	ret = sysfs_create_group(power_kobj, &attr_group);
+	if (ret) {
+		pr_err("android_power_init: sysfs_create_group failed\n");
+		return ret;
+	}
+
+	register_early_suspend(&stop_drawing_early_suspend_desc);
+	return 0;
+}
+
+static void  __exit android_power_exit(void)
+{
+	unregister_early_suspend(&stop_drawing_early_suspend_desc);
+	sysfs_remove_group(power_kobj, &attr_group);
+}
+
+module_init(android_power_init);
+module_exit(android_power_exit);
+
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
new file mode 100644
index 00000000..8884c276
--- /dev/null
+++ b/kernel/power/hibernate.c
@@ -0,0 +1,1067 @@
+/*
+ * kernel/power/hibernate.c - Hibernation (a.k.a suspend-to-disk) support.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2004 Pavel Machek
+ * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "power.h"
+
+
+static int nocompress = 0;
+static int noresume = 0;
+static char resume_file[256] = CONFIG_PM_STD_PARTITION;
+dev_t swsusp_resume_device;
+sector_t swsusp_resume_block;
+int in_suspend __nosavedata = 0;
+
+enum {
+	HIBERNATION_INVALID,
+	HIBERNATION_PLATFORM,
+	HIBERNATION_TEST,
+	HIBERNATION_TESTPROC,
+	HIBERNATION_SHUTDOWN,
+	HIBERNATION_REBOOT,
+	/* keep last */
+	__HIBERNATION_AFTER_LAST
+};
+#define HIBERNATION_MAX (__HIBERNATION_AFTER_LAST-1)
+#define HIBERNATION_FIRST (HIBERNATION_INVALID + 1)
+
+static int hibernation_mode = HIBERNATION_SHUTDOWN;
+
+static const struct platform_hibernation_ops *hibernation_ops;	/* NULL when no platform ops are set */
+
+/**
+ * hibernation_set_ops - Set the global hibernate operations.
+ * @ops: Hibernation operations to use in subsequent hibernation transitions.
+ */
+void hibernation_set_ops(const struct platform_hibernation_ops *ops)
+{
+	/* a non-NULL @ops must provide every callback listed here */
+	if (ops && !(ops->begin && ops->end &&  ops->pre_snapshot
+	    && ops->prepare && ops->finish && ops->enter && ops->pre_restore
+	    && ops->restore_cleanup && ops->leave)) {
+		WARN_ON(1);
+		return;
+	}
+	mutex_lock(&pm_mutex);
+	hibernation_ops = ops;
+	if (ops)
+		hibernation_mode = HIBERNATION_PLATFORM;
+	else if (hibernation_mode == HIBERNATION_PLATFORM)
+		hibernation_mode = HIBERNATION_SHUTDOWN;	/* platform mode is gone, fall back */
+
+	mutex_unlock(&pm_mutex);
+}
+
+static bool entering_platform_hibernation;
+
+bool system_entering_hibernation(void)
+{
+	return entering_platform_hibernation;
+}
+EXPORT_SYMBOL(system_entering_hibernation);
+
+#ifdef CONFIG_PM_DEBUG
+static void hibernation_debug_sleep(void)
+{
+	printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
+	mdelay(5000);
+}
+
+static int hibernation_testmode(int mode)
+{
+	if (hibernation_mode == mode) {
+		hibernation_debug_sleep();
+		return 1;
+	}
+	return 0;
+}
+
+static int hibernation_test(int level)
+{
+	if (pm_test_level == level) {
+		hibernation_debug_sleep();
+		return 1;
+	}
+	return 0;
+}
+#else /* !CONFIG_PM_DEBUG */
+static int hibernation_testmode(int mode) { return 0; }
+static int hibernation_test(int level) { return 0; }
+#endif /* !CONFIG_PM_DEBUG */
+
+/**
+ * platform_begin - Call platform to start hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
+ */
+static int platform_begin(int platform_mode)
+{
+	return (platform_mode && hibernation_ops) ?
+		hibernation_ops->begin() : 0;
+}
+
+/**
+ * platform_end - Call platform to finish transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ */
+static void platform_end(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->end();
+}
+
+/**
+ * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for creating a hibernate image,
+ * if so configured, and return an error code if that fails.
+ */
+
+static int platform_pre_snapshot(int platform_mode)
+{
+	return (platform_mode && hibernation_ops) ?
+		hibernation_ops->pre_snapshot() : 0;
+}
+
+/**
+ * platform_leave - Call platform to prepare a transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the machine for switching to the
+ * normal mode of operation.
+ *
+ * This routine is called on one CPU with interrupts disabled.
+ */
+static void platform_leave(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->leave();
+}
+
+/**
+ * platform_finish - Call platform to switch the system to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the machine to the normal mode of
+ * operation.
+ *
+ * This routine must be called after platform_prepare().
+ */
+static void platform_finish(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->finish();
+}
+
+/**
+ * platform_pre_restore - Prepare for hibernate image restoration.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for resume from a hibernation
+ * image.
+ *
+ * If the restore fails after this function has been called,
+ * platform_restore_cleanup() must be called.
+ */
+static int platform_pre_restore(int platform_mode)
+{
+	return (platform_mode && hibernation_ops) ?
+		hibernation_ops->pre_restore() : 0;
+}
+
+/**
+ * platform_restore_cleanup - Switch to the working state after failing restore.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the system to the normal mode of operation
+ * after a failing restore.
+ *
+ * If platform_pre_restore() has been called before the failing restore, this
+ * function must be called too, regardless of the result of
+ * platform_pre_restore().
+ */
+static void platform_restore_cleanup(int platform_mode)
+{
+	if (platform_mode && hibernation_ops)
+		hibernation_ops->restore_cleanup();
+}
+
+/**
+ * platform_recover - Recover from a failure to suspend devices.
+ * @platform_mode: Whether or not to use the platform driver.
+ */
+static void platform_recover(int platform_mode)
+{
+	/* ->recover is not among the callbacks hibernation_set_ops() insists on */
+	if (platform_mode && hibernation_ops && hibernation_ops->recover)
+		hibernation_ops->recover();
+}
+
+/**
+ * swsusp_show_speed - Print time elapsed between two events during hibernation.
+ * @start: Starting event.
+ * @stop: Final event.
+ * @nr_pages: Number of memory pages processed between @start and @stop.
+ * @msg: Additional diagnostic message to print.
+ */ +void swsusp_show_speed(struct timeval *start, struct timeval *stop, + unsigned nr_pages, char *msg) +{ + s64 elapsed_centisecs64; + int centisecs; + int k; + int kps; + + elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start); + do_div(elapsed_centisecs64, NSEC_PER_SEC / 100); + centisecs = elapsed_centisecs64; + if (centisecs == 0) + centisecs = 1; /* avoid div-by-zero */ + k = nr_pages * (PAGE_SIZE / 1024); + kps = (k * 100) / centisecs; + printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", + msg, k, + centisecs / 100, centisecs % 100, + kps / 1000, (kps % 1000) / 10); +} + +/** + * create_image - Create a hibernation image. + * @platform_mode: Whether or not to use the platform driver. + * + * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image + * and execute the drivers' .thaw_noirq() callbacks. + * + * Control reappears in this routine after the subsequent restore. + */ +static int create_image(int platform_mode) +{ + int error; + + error = dpm_suspend_noirq(PMSG_FREEZE); + if (error) { + printk(KERN_ERR "PM: Some devices failed to power down, " + "aborting hibernation\n"); + return error; + } + + error = platform_pre_snapshot(platform_mode); + if (error || hibernation_test(TEST_PLATFORM)) + goto Platform_finish; + + error = disable_nonboot_cpus(); + if (error || hibernation_test(TEST_CPUS) + || hibernation_testmode(HIBERNATION_TEST)) + goto Enable_cpus; + + local_irq_disable(); + + error = syscore_suspend(); + if (error) { + printk(KERN_ERR "PM: Some system devices failed to power down, " + "aborting hibernation\n"); + goto Enable_irqs; + } + + if (hibernation_test(TEST_CORE) || pm_wakeup_pending()) + goto Power_up; + + in_suspend = 1; + save_processor_state(); + error = swsusp_arch_suspend(); + if (error) + printk(KERN_ERR "PM: Error %d creating hibernation image\n", + error); + /* Restore control flow magically appears here */ + restore_processor_state(); + if (!in_suspend) { + 
events_check_enabled = false; + platform_leave(platform_mode); + } + + Power_up: + syscore_resume(); + + Enable_irqs: + local_irq_enable(); + + Enable_cpus: + enable_nonboot_cpus(); + + Platform_finish: + platform_finish(platform_mode); + + dpm_resume_noirq(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + + return error; +} + +/** + * hibernation_snapshot - Quiesce devices and create a hibernation image. + * @platform_mode: If set, use platform driver to prepare for the transition. + * + * This routine must be called with pm_mutex held. + */ +int hibernation_snapshot(int platform_mode) +{ + pm_message_t msg = PMSG_RECOVER; + int error; + + error = platform_begin(platform_mode); + if (error) + goto Close; + + error = dpm_prepare(PMSG_FREEZE); + if (error) + goto Complete_devices; + + /* Preallocate image memory before shutting down devices. */ + error = hibernate_preallocate_memory(); + if (error) + goto Complete_devices; + + suspend_console(); + pm_restrict_gfp_mask(); + error = dpm_suspend(PMSG_FREEZE); + if (error) + goto Recover_platform; + + if (hibernation_test(TEST_DEVICES)) + goto Recover_platform; + + error = create_image(platform_mode); + /* + * Control returns here (1) after the image has been created or the + * image creation has failed and (2) after a successful restore. + */ + + Resume_devices: + /* We may need to release the preallocated image pages here. */ + if (error || !in_suspend) + swsusp_free(); + + msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE; + dpm_resume(msg); + + if (error || !in_suspend) + pm_restore_gfp_mask(); + + resume_console(); + + Complete_devices: + dpm_complete(msg); + + Close: + platform_end(platform_mode); + return error; + + Recover_platform: + platform_recover(platform_mode); + goto Resume_devices; +} + +/** + * resume_target_kernel - Restore system state from a hibernation image. + * @platform_mode: Whether or not to use the platform driver. 
+ *
+ * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
+ * highmem that have not been restored yet from the image and run the low-level
+ * code that will restore the remaining contents of memory and switch to the
+ * just restored target kernel.
+ *
+ * Every return from this function is a failure: it hands back a non-zero
+ * error code after undoing, in reverse order, the steps taken so far.
+ */
+static int resume_target_kernel(bool platform_mode)
+{
+	int error;
+
+	error = dpm_suspend_noirq(PMSG_QUIESCE);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to power down, "
+			"aborting resume\n");
+		/* Nothing else has been done yet, so there is nothing to unwind. */
+		return error;
+	}
+
+	error = platform_pre_restore(platform_mode);
+	if (error)
+		goto Cleanup;
+
+	error = disable_nonboot_cpus();
+	if (error)
+		goto Enable_cpus;
+
+	local_irq_disable();
+
+	error = syscore_suspend();
+	if (error)
+		goto Enable_irqs;
+
+	save_processor_state();
+	error = restore_highmem();
+	if (!error) {
+		error = swsusp_arch_resume();
+		/*
+		 * The code below is only ever reached in case of a failure.
+		 * Otherwise, execution continues at the place where
+		 * swsusp_arch_suspend() was called.
+		 */
+		BUG_ON(!error);
+		/*
+		 * This call to restore_highmem() reverts the changes made by
+		 * the previous one.
+		 */
+		restore_highmem();
+	}
+	/*
+	 * The only reason why swsusp_arch_resume() can fail is memory being
+	 * very tight, so we have to free it as soon as we can to avoid
+	 * subsequent failures.
+	 */
+	swsusp_free();
+	restore_processor_state();
+	touch_softlockup_watchdog();
+
+	syscore_resume();
+
+ Enable_irqs:
+	local_irq_enable();
+
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Cleanup:
+	/* Also reached when platform_pre_restore() itself reported an error. */
+	platform_restore_cleanup(platform_mode);
+
+	dpm_resume_noirq(PMSG_RECOVER);
+
+	return error;
+}
+
+/**
+ * hibernation_restore - Quiesce devices and restore from a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
+ *
+ * This routine must be called with pm_mutex held.  If it is successful, control
+ * reappears in the restored target kernel in hibernation_snapshot().
+ */ +int hibernation_restore(int platform_mode) +{ + int error; + + pm_prepare_console(); + suspend_console(); + pm_restrict_gfp_mask(); + error = dpm_suspend_start(PMSG_QUIESCE); + if (!error) { + error = resume_target_kernel(platform_mode); + dpm_resume_end(PMSG_RECOVER); + } + pm_restore_gfp_mask(); + resume_console(); + pm_restore_console(); + return error; +} + +/** + * hibernation_platform_enter - Power off the system using the platform driver. + */ +int hibernation_platform_enter(void) +{ + int error; + + if (!hibernation_ops) + return -ENOSYS; + + /* + * We have cancelled the power transition by running + * hibernation_ops->finish() before saving the image, so we should let + * the firmware know that we're going to enter the sleep state after all + */ + error = hibernation_ops->begin(); + if (error) + goto Close; + + entering_platform_hibernation = true; + suspend_console(); + error = dpm_suspend_start(PMSG_HIBERNATE); + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } + + error = dpm_suspend_noirq(PMSG_HIBERNATE); + if (error) + goto Resume_devices; + + error = hibernation_ops->prepare(); + if (error) + goto Platform_finish; + + error = disable_nonboot_cpus(); + if (error) + goto Platform_finish; + + local_irq_disable(); + syscore_suspend(); + if (pm_wakeup_pending()) { + error = -EAGAIN; + goto Power_up; + } + + hibernation_ops->enter(); + /* We should never get here */ + while (1); + + Power_up: + syscore_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + + Platform_finish: + hibernation_ops->finish(); + + dpm_resume_noirq(PMSG_RESTORE); + + Resume_devices: + entering_platform_hibernation = false; + dpm_resume_end(PMSG_RESTORE); + resume_console(); + + Close: + hibernation_ops->end(); + + return error; +} + +/** + * power_down - Shut the machine down for hibernation. 
+ * + * Use the platform driver, if configured, to put the system into the sleep + * state corresponding to hibernation, or try to power it off or reboot, + * depending on the value of hibernation_mode. + */ +static void power_down(void) +{ + switch (hibernation_mode) { + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + break; + case HIBERNATION_REBOOT: + kernel_restart(NULL); + break; + case HIBERNATION_PLATFORM: + hibernation_platform_enter(); + case HIBERNATION_SHUTDOWN: + kernel_power_off(); + break; + } + kernel_halt(); + /* + * Valid image is on the disk, if we continue we risk serious data + * corruption after resume. + */ + printk(KERN_CRIT "PM: Please power down manually\n"); + while(1); +} + +static int prepare_processes(void) +{ + int error = 0; + + if (freeze_processes()) { + error = -EBUSY; + thaw_processes(); + } + return error; +} + +/** + * hibernate - Carry out system hibernation, including saving the image. + */ +int hibernate(void) +{ + int error; + + mutex_lock(&pm_mutex); + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + pm_prepare_console(); + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + goto Exit; + + error = usermodehelper_disable(); + if (error) + goto Exit; + + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Enable_umh; + + printk(KERN_INFO "PM: Syncing filesystems ... 
"); + sys_sync(); + printk("done.\n"); + + error = prepare_processes(); + if (error) + goto Free_bitmaps; + + if (hibernation_test(TEST_FREEZER)) + goto Thaw; + + if (hibernation_testmode(HIBERNATION_TESTPROC)) + goto Thaw; + + error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); + if (error) + goto Thaw; + + if (in_suspend) { + unsigned int flags = 0; + + if (hibernation_mode == HIBERNATION_PLATFORM) + flags |= SF_PLATFORM_MODE; + if (nocompress) + flags |= SF_NOCOMPRESS_MODE; + pr_debug("PM: writing image.\n"); + error = swsusp_write(flags); + swsusp_free(); + if (!error) + power_down(); + in_suspend = 0; + pm_restore_gfp_mask(); + } else { + pr_debug("PM: Image restored successfully.\n"); + } + + Thaw: + thaw_processes(); + Free_bitmaps: + free_basic_memory_bitmaps(); + Enable_umh: + usermodehelper_enable(); + Exit: + pm_notifier_call_chain(PM_POST_HIBERNATION); + pm_restore_console(); + atomic_inc(&snapshot_device_available); + Unlock: + mutex_unlock(&pm_mutex); + return error; +} + + +/** + * software_resume - Resume from a saved hibernation image. + * + * This routine is called as a late initcall, when all devices have been + * discovered and initialized already. + * + * The image reading code is called to see if there is a hibernation image + * available for reading. If that is the case, devices are quiesced and the + * contents of memory is restored from the saved image. + * + * If this is successful, control reappears in the restored target kernel in + * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +static int software_resume(void) +{ + int error; + unsigned int flags; + + /* + * If the user said "noresume".. bail out early. + */ + if (noresume) + return 0; + + /* + * name_to_dev_t() below takes a sysfs buffer mutex when sysfs + * is configured into the kernel. 
Since the regular hibernate + * trigger path is via sysfs which takes a buffer mutex before + * calling hibernate functions (which take pm_mutex) this can + * cause lockdep to complain about a possible ABBA deadlock + * which cannot happen since we're in the boot code here and + * sysfs can't be invoked yet. Therefore, we use a subclass + * here to avoid lockdep complaining. + */ + mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); + + if (swsusp_resume_device) + goto Check_image; + + if (!strlen(resume_file)) { + error = -ENOENT; + goto Unlock; + } + + pr_debug("PM: Checking hibernation image partition %s\n", resume_file); + + /* Check if the device is there */ + swsusp_resume_device = name_to_dev_t(resume_file); + if (!swsusp_resume_device) { + /* + * Some device discovery might still be in progress; we need + * to wait for this to finish. + */ + wait_for_device_probe(); + /* + * We can't depend on SCSI devices being available after loading + * one of their modules until scsi_complete_async_scans() is + * called and the resume device usually is a SCSI one. 
+ */ + scsi_complete_async_scans(); + + swsusp_resume_device = name_to_dev_t(resume_file); + if (!swsusp_resume_device) { + error = -ENODEV; + goto Unlock; + } + } + + Check_image: + pr_debug("PM: Hibernation image partition %d:%d present\n", + MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); + + pr_debug("PM: Looking for hibernation image.\n"); + error = swsusp_check(); + if (error) + goto Unlock; + + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + swsusp_close(FMODE_READ); + goto Unlock; + } + + pm_prepare_console(); + error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (error) + goto close_finish; + + error = usermodehelper_disable(); + if (error) + goto close_finish; + + error = create_basic_memory_bitmaps(); + if (error) + goto close_finish; + + pr_debug("PM: Preparing processes for restore.\n"); + error = prepare_processes(); + if (error) { + swsusp_close(FMODE_READ); + goto Done; + } + + pr_debug("PM: Loading hibernation image.\n"); + + error = swsusp_read(&flags); + swsusp_close(FMODE_READ); + if (!error) + hibernation_restore(flags & SF_PLATFORM_MODE); + + printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); + swsusp_free(); + thaw_processes(); + Done: + free_basic_memory_bitmaps(); + usermodehelper_enable(); + Finish: + pm_notifier_call_chain(PM_POST_RESTORE); + pm_restore_console(); + atomic_inc(&snapshot_device_available); + /* For success case, the suspend path will release the lock */ + Unlock: + mutex_unlock(&pm_mutex); + pr_debug("PM: Hibernation image not present or could not be loaded.\n"); + return error; +close_finish: + swsusp_close(FMODE_READ); + goto Finish; +} + +late_initcall(software_resume); + + +static const char * const hibernation_modes[] = { + [HIBERNATION_PLATFORM] = "platform", + [HIBERNATION_SHUTDOWN] = "shutdown", + [HIBERNATION_REBOOT] = "reboot", + [HIBERNATION_TEST] = "test", + 
[HIBERNATION_TESTPROC] = "testproc", +}; + +/* + * /sys/power/disk - Control hibernation mode. + * + * Hibernation can be handled in several ways. There are a few different ways + * to put the system into the sleep state: using the platform driver (e.g. ACPI + * or other hibernation_ops), powering it off or rebooting it (for testing + * mostly), or using one of the two available test modes. + * + * The sysfs file /sys/power/disk provides an interface for selecting the + * hibernation mode to use. Reading from this file causes the available modes + * to be printed. There are 5 modes that can be supported: + * + * 'platform' + * 'shutdown' + * 'reboot' + * 'test' + * 'testproc' + * + * If a platform hibernation driver is in use, 'platform' will be supported + * and will be used by default. Otherwise, 'shutdown' will be used by default. + * The selected option (i.e. the one corresponding to the current value of + * hibernation_mode) is enclosed by a square bracket. + * + * To select a given hibernation mode it is necessary to write the mode's + * string representation (as returned by reading from /sys/power/disk) back + * into /sys/power/disk. 
+ */ + +static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + int i; + char *start = buf; + + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (!hibernation_modes[i]) + continue; + switch (i) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + break; + /* not a valid mode, continue with loop */ + continue; + } + if (i == hibernation_mode) + buf += sprintf(buf, "[%s] ", hibernation_modes[i]); + else + buf += sprintf(buf, "%s ", hibernation_modes[i]); + } + buf += sprintf(buf, "\n"); + return buf-start; +} + +static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = 0; + int i; + int len; + char *p; + int mode = HIBERNATION_INVALID; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + mutex_lock(&pm_mutex); + for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) { + if (len == strlen(hibernation_modes[i]) + && !strncmp(buf, hibernation_modes[i], len)) { + mode = i; + break; + } + } + if (mode != HIBERNATION_INVALID) { + switch (mode) { + case HIBERNATION_SHUTDOWN: + case HIBERNATION_REBOOT: + case HIBERNATION_TEST: + case HIBERNATION_TESTPROC: + hibernation_mode = mode; + break; + case HIBERNATION_PLATFORM: + if (hibernation_ops) + hibernation_mode = mode; + else + error = -EINVAL; + } + } else + error = -EINVAL; + + if (!error) + pr_debug("PM: Hibernation mode set to '%s'\n", + hibernation_modes[mode]); + mutex_unlock(&pm_mutex); + return error ? 
error : n; +} + +power_attr(disk); + +static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), + MINOR(swsusp_resume_device)); +} + +static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int maj, min; + dev_t res; + int ret = -EINVAL; + + if (sscanf(buf, "%u:%u", &maj, &min) != 2) + goto out; + + res = MKDEV(maj,min); + if (maj != MAJOR(res) || min != MINOR(res)) + goto out; + + mutex_lock(&pm_mutex); + swsusp_resume_device = res; + mutex_unlock(&pm_mutex); + printk(KERN_INFO "PM: Starting manual resume from disk\n"); + noresume = 0; + software_resume(); + ret = n; + out: + return ret; +} + +power_attr(resume); + +static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%lu\n", image_size); +} + +static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + image_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(image_size); + +static ssize_t reserved_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", reserved_size); +} + +static ssize_t reserved_size_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long size; + + if (sscanf(buf, "%lu", &size) == 1) { + reserved_size = size; + return n; + } + + return -EINVAL; +} + +power_attr(reserved_size); + +static struct attribute * g[] = { + &disk_attr.attr, + &resume_attr.attr, + &image_size_attr.attr, + &reserved_size_attr.attr, + NULL, +}; + + +static struct attribute_group attr_group = { + .attrs = g, +}; + + +static int __init pm_disk_init(void) +{ + return sysfs_create_group(power_kobj, &attr_group); +} + +core_initcall(pm_disk_init); + + +static int 
__init resume_setup(char *str) +{ + if (noresume) + return 1; + + strncpy( resume_file, str, 255 ); + return 1; +} + +static int __init resume_offset_setup(char *str) +{ + unsigned long long offset; + + if (noresume) + return 1; + + if (sscanf(str, "%llu", &offset) == 1) + swsusp_resume_block = offset; + + return 1; +} + +static int __init hibernate_setup(char *str) +{ + if (!strncmp(str, "noresume", 8)) + noresume = 1; + else if (!strncmp(str, "nocompress", 10)) + nocompress = 1; + return 1; +} + +static int __init noresume_setup(char *str) +{ + noresume = 1; + return 1; +} + +__setup("noresume", noresume_setup); +__setup("resume_offset=", resume_offset_setup); +__setup("resume=", resume_setup); +__setup("hibernate=", hibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c new file mode 100644 index 00000000..88de0d86 --- /dev/null +++ b/kernel/power/main.c @@ -0,0 +1,453 @@ +/* + * kernel/power/main.c - PM subsystem core functionality. + * + * Copyright (c) 2003 Patrick Mochel + * Copyright (c) 2003 Open Source Development Lab + * + * This file is released under the GPLv2 + * + */ + +#include +#include +#include +#include + +#include "power.h" + +DEFINE_MUTEX(pm_mutex); +EXPORT_SYMBOL(pm_mutex); + +#include "../../drivers/misc/ntx-misc.h" +#include "../../arch/arm/mach-mx6/ntx_hwconfig.h" +extern volatile NTX_HWCONFIG *gptHWCFG; + +#ifdef CONFIG_PM_SLEEP + +/* Routines for PM-transition notifications */ + +static BLOCKING_NOTIFIER_HEAD(pm_chain_head); + +int register_pm_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&pm_chain_head, nb); +} +EXPORT_SYMBOL_GPL(register_pm_notifier); + +int unregister_pm_notifier(struct notifier_block *nb) +{ + return blocking_notifier_chain_unregister(&pm_chain_head, nb); +} +EXPORT_SYMBOL_GPL(unregister_pm_notifier); + +int pm_notifier_call_chain(unsigned long val) +{ + return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) + == NOTIFY_BAD) ? 
-EINVAL : 0; +} + +/* If set, devices may be suspended and resumed asynchronously. */ +int pm_async_enabled = 1; + +static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", pm_async_enabled); +} + +static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_async_enabled = val; + return n; +} + +power_attr(pm_async); + +#ifdef CONFIG_PM_DEBUG +int pm_test_level = TEST_NONE; + +static const char * const pm_tests[__TEST_AFTER_LAST] = { + [TEST_NONE] = "none", + [TEST_CORE] = "core", + [TEST_CPUS] = "processors", + [TEST_PLATFORM] = "platform", + [TEST_DEVICES] = "devices", + [TEST_FREEZER] = "freezer", +}; + +static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + int level; + + for (level = TEST_FIRST; level <= TEST_MAX; level++) + if (pm_tests[level]) { + if (level == pm_test_level) + s += sprintf(s, "[%s] ", pm_tests[level]); + else + s += sprintf(s, "%s ", pm_tests[level]); + } + + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; + + return (s - buf); +} + +static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + const char * const *s; + int level; + char *p; + int len; + int error = -EINVAL; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + mutex_lock(&pm_mutex); + + level = TEST_FIRST; + for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { + pm_test_level = level; + error = 0; + break; + } + + mutex_unlock(&pm_mutex); + + return error ? error : n; +} + +power_attr(pm_test); +#endif /* CONFIG_PM_DEBUG */ + +#endif /* CONFIG_PM_SLEEP */ + +struct kobject *power_kobj; + +/** + * state - control system power state. 
+ * + * show() returns what states are supported, which is hard-coded to + * 'standby' (Power-On Suspend), 'mem' (Suspend-to-RAM), and + * 'disk' (Suspend-to-Disk). + * + * store() accepts one of those strings, translates it into the + * proper enumerated value, and initiates a suspend transition. + */ +static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; +#ifdef CONFIG_SUSPEND + int i; + + for (i = 0; i < PM_SUSPEND_MAX; i++) { + if (pm_states[i] && valid_state(i)) + s += sprintf(s,"%s ", pm_states[i]); + } +#endif +#ifdef CONFIG_HIBERNATION + s += sprintf(s, "%s\n", "disk"); +#else + if (s != buf) + /* convert the last space to a newline */ + *(s-1) = '\n'; +#endif + return (s - buf); +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ +#ifdef CONFIG_SUSPEND +#ifdef CONFIG_EARLYSUSPEND + suspend_state_t state = PM_SUSPEND_ON; +#else + suspend_state_t state = PM_SUSPEND_STANDBY; +#endif + const char * const *s; +#endif + char *p; + int len; + int error = -EINVAL; + + p = memchr(buf, '\n', n); + len = p ? p - buf : n; + + /* First, check if we are requested to hibernate */ + if (len == 4 && !strncmp(buf, "disk", len)) { + error = hibernate(); + goto Exit; + } + +#ifdef CONFIG_SUSPEND + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + break; + } + if (state < PM_SUSPEND_MAX && *s) +#ifdef CONFIG_EARLYSUSPEND + if (state == PM_SUSPEND_ON || valid_state(state)) { + error = 0; + request_suspend_state(state); + } +#else + error = enter_state(state); +#endif +#endif + + Exit: + return error ? 
error : n; +} + +power_attr(state); + +extern int gSleep_Mode_Suspend; +extern int ntx_get_homepad_enabled_status(void); +static ssize_t state_extended_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + char *s = buf; + s += sprintf(s, "%d\n", gSleep_Mode_Suspend); + return (s - buf); +} + +static ssize_t state_extended_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + if ('1' == *buf) { + gSleep_Mode_Suspend = 1; + if(36==gptHWCFG->m_val.bPCB || 40==gptHWCFG->m_val.bPCB || 49==gptHWCFG->m_val.bPCB) { + // E60Q3X/E60Q5X/E60QDX + msp430_homepad_enable(0); + } + } + else { + gSleep_Mode_Suspend = 0; +// printk ("[%s-%d] %s() %d\n",__FILE__,__LINE__,__func__,gSleep_Mode_Suspend); + if(36==gptHWCFG->m_val.bPCB || 40==gptHWCFG->m_val.bPCB || 49==gptHWCFG->m_val.bPCB) { + // E60Q3X/E60Q5X/E60QDX + if(0!=ntx_get_homepad_enabled_status()){ + msp430_homepad_enable(2); + } + } + } + + return n; +} + +//power_attr(state_extended); +static struct kobj_attribute state_extended_attr = { + .attr = { + .name = "state-extended", + .mode = 0644, + }, + .show = state_extended_show, + .store = state_extended_store, +}; + +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. 
Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occurred since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned int val; + + return pm_get_wakeup_count(&val) ? 
sprintf(buf, "%u\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned int val; + + if (sscanf(buf, "%u", &val) == 1) { + if (pm_save_wakeup_count(val)) + return n; + } + return -EINVAL; +} + +power_attr(wakeup_count); +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_PM_TRACE +int pm_trace_enabled; + +static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", pm_trace_enabled); +} + +static ssize_t +pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int val; + + if (sscanf(buf, "%d", &val) == 1) { + pm_trace_enabled = !!val; + return n; + } + return -EINVAL; +} + +power_attr(pm_trace); + +static ssize_t pm_trace_dev_match_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return show_trace_dev_match(buf, PAGE_SIZE); +} + +static ssize_t +pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + return -EINVAL; +} + +power_attr(pm_trace_dev_match); + +#endif /* CONFIG_PM_TRACE */ + +#ifdef CONFIG_SUSPEND_DEVICE_TIME_DEBUG +/* + * threshold of device suspend time consumption in microsecond(0.5ms), the + * driver suspend/resume time longer than this threshold will be + * print to console, 0 to disable */ +int device_suspend_time_threshold; + +static ssize_t +device_suspend_time_threshold_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + if (device_suspend_time_threshold == 0) + return sprintf(buf, "off\n"); + else + return sprintf(buf, "%d usecs\n", + device_suspend_time_threshold); +} + +static ssize_t +device_suspend_time_threshold_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int val; + if (sscanf(buf, "%d", &val) > 0) { + device_suspend_time_threshold = val; + return n; + } + return -EINVAL; +} 
+power_attr(device_suspend_time_threshold); +#endif + +#ifdef CONFIG_USER_WAKELOCK +power_attr(wake_lock); +power_attr(wake_unlock); +#endif + +static struct attribute * g[] = { + &state_attr.attr, +#ifdef CONFIG_PM_TRACE + &pm_trace_attr.attr, + &pm_trace_dev_match_attr.attr, +#endif +#ifdef CONFIG_SUSPEND_DEVICE_TIME_DEBUG + &device_suspend_time_threshold_attr.attr, +#endif +#ifdef CONFIG_PM_SLEEP + &pm_async_attr.attr, + &wakeup_count_attr.attr, +#ifdef CONFIG_PM_DEBUG + &pm_test_attr.attr, +#endif +#ifdef CONFIG_USER_WAKELOCK + &wake_lock_attr.attr, + &wake_unlock_attr.attr, +#endif +#endif + &state_extended_attr.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = g, +}; + +#ifdef CONFIG_PM_RUNTIME +struct workqueue_struct *pm_wq; +EXPORT_SYMBOL_GPL(pm_wq); + +static int __init pm_start_workqueue(void) +{ + pm_wq = alloc_workqueue("pm", WQ_FREEZABLE, 0); + + return pm_wq ? 0 : -ENOMEM; +} +#else +static inline int pm_start_workqueue(void) { return 0; } +#endif + +static int __init pm_init(void) +{ + int error = pm_start_workqueue(); + if (error) + return error; + hibernate_image_size_init(); + hibernate_reserved_size_init(); + power_kobj = kobject_create_and_add("power", NULL); + if (!power_kobj) + return -ENOMEM; + return sysfs_create_group(power_kobj, &attr_group); +} + +core_initcall(pm_init); diff --git a/kernel/power/power.h b/kernel/power/power.h new file mode 100644 index 00000000..b6b90064 --- /dev/null +++ b/kernel/power/power.h @@ -0,0 +1,271 @@ +#include +#include +#include +#include + +struct swsusp_info { + struct new_utsname uts; + u32 version_code; + unsigned long num_physpages; + int cpus; + unsigned long image_pages; + unsigned long pages; + unsigned long size; +} __attribute__((aligned(PAGE_SIZE))); + +#ifdef CONFIG_HIBERNATION +/* kernel/power/snapshot.c */ +extern void __init hibernate_reserved_size_init(void); +extern void __init hibernate_image_size_init(void); + +#ifdef CONFIG_ARCH_HIBERNATION_HEADER +/* Maximum 
size of architecture specific data in a hibernation header */ +#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4) + +extern int arch_hibernation_header_save(void *addr, unsigned int max_size); +extern int arch_hibernation_header_restore(void *addr); + +static inline int init_header_complete(struct swsusp_info *info) +{ + return arch_hibernation_header_save(info, MAX_ARCH_HEADER_SIZE); +} + +static inline char *check_image_kernel(struct swsusp_info *info) +{ + return arch_hibernation_header_restore(info) ? + "architecture specific data" : NULL; +} +#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ + +/* + * Keep some memory free so that I/O operations can succeed without paging + * [Might this be more than 4 MB?] + */ +#define PAGES_FOR_IO ((4096 * 1024) >> PAGE_SHIFT) + +/* + * Keep 1 MB of memory free so that device drivers can allocate some pages in + * their .suspend() routines without breaking the suspend to disk. + */ +#define SPARE_PAGES ((1024 * 1024) >> PAGE_SHIFT) + +/* kernel/power/hibernate.c */ +extern int hibernation_snapshot(int platform_mode); +extern int hibernation_restore(int platform_mode); +extern int hibernation_platform_enter(void); + +#else /* !CONFIG_HIBERNATION */ + +static inline void hibernate_reserved_size_init(void) {} +static inline void hibernate_image_size_init(void) {} +#endif /* !CONFIG_HIBERNATION */ + +extern int pfn_is_nosave(unsigned long); + +#define power_attr(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = 0644, \ + }, \ + .show = _name##_show, \ + .store = _name##_store, \ +} + +/* Preferred image size in bytes (default 500 MB) */ +extern unsigned long image_size; +/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ +extern unsigned long reserved_size; +extern int in_suspend; +extern dev_t swsusp_resume_device; +extern sector_t swsusp_resume_block; + +extern asmlinkage int swsusp_arch_suspend(void); +extern asmlinkage int 
swsusp_arch_resume(void); + +extern int create_basic_memory_bitmaps(void); +extern void free_basic_memory_bitmaps(void); +extern int hibernate_preallocate_memory(void); + +/** + * Auxiliary structure used for reading the snapshot image data and + * metadata from and writing them to the list of page backup entries + * (PBEs) which is the main data structure of swsusp. + * + * Using struct snapshot_handle we can transfer the image, including its + * metadata, as a continuous sequence of bytes with the help of + * snapshot_read_next() and snapshot_write_next(). + * + * The code that writes the image to a storage or transfers it to + * the user land is required to use snapshot_read_next() for this + * purpose and it should not make any assumptions regarding the internal + * structure of the image. Similarly, the code that reads the image from + * a storage or transfers it from the user land is required to use + * snapshot_write_next(). + * + * This may allow us to change the internal structure of the image + * in the future with considerably less effort. + */ + +struct snapshot_handle { + unsigned int cur; /* number of the block of PAGE_SIZE bytes the + * next operation will refer to (ie. 
current) + */ + void *buffer; /* address of the block to read from + * or write to + */ + int sync_read; /* Set to one to notify the caller of + * snapshot_write_next() that it may + * need to call wait_on_bio_chain() + */ +}; + +/* This macro returns the address from/to which the caller of + * snapshot_read_next()/snapshot_write_next() is allowed to + * read/write data after the function returns + */ +#define data_of(handle) ((handle).buffer) + +extern unsigned int snapshot_additional_pages(struct zone *zone); +extern unsigned long snapshot_get_image_size(void); +extern int snapshot_read_next(struct snapshot_handle *handle); +extern int snapshot_write_next(struct snapshot_handle *handle); +extern void snapshot_write_finalize(struct snapshot_handle *handle); +extern int snapshot_image_loaded(struct snapshot_handle *handle); + +/* If unset, the snapshot device cannot be opened. */ +extern atomic_t snapshot_device_available; + +extern sector_t alloc_swapdev_block(int swap); +extern void free_all_swap_pages(int swap); +extern int swsusp_swap_in_use(void); + +/* + * Flags that can be passed from the hibernating kernel to the "boot" kernel in + * the image header. 
+ */ +#define SF_PLATFORM_MODE 1 +#define SF_NOCOMPRESS_MODE 2 + +/* kernel/power/hibernate.c */ +extern int swsusp_check(void); +extern void swsusp_free(void); +extern int swsusp_read(unsigned int *flags_p); +extern int swsusp_write(unsigned int flags); +extern void swsusp_close(fmode_t); + +/* kernel/power/block_io.c */ +extern struct block_device *hib_resume_bdev; + +extern int hib_bio_read_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_bio_write_page(pgoff_t page_off, void *addr, + struct bio **bio_chain); +extern int hib_wait_on_bio_chain(struct bio **bio_chain); + +struct timeval; +/* kernel/power/swsusp.c */ +extern void swsusp_show_speed(struct timeval *, struct timeval *, + unsigned int, char *); + +#ifdef CONFIG_SUSPEND +/* kernel/power/suspend.c */ +extern const char *const pm_states[]; + +extern bool valid_state(suspend_state_t state); +extern int suspend_devices_and_enter(suspend_state_t state); +extern int enter_state(suspend_state_t state); +#else /* !CONFIG_SUSPEND */ +static inline int suspend_devices_and_enter(suspend_state_t state) +{ + return -ENOSYS; +} +static inline int enter_state(suspend_state_t state) { return -ENOSYS; } +static inline bool valid_state(suspend_state_t state) { return false; } +#endif /* !CONFIG_SUSPEND */ + +#ifdef CONFIG_PM_TEST_SUSPEND +/* kernel/power/suspend_test.c */ +extern void suspend_test_start(void); +extern void suspend_test_finish(const char *label); +#else /* !CONFIG_PM_TEST_SUSPEND */ +static inline void suspend_test_start(void) {} +static inline void suspend_test_finish(const char *label) {} +#endif /* !CONFIG_PM_TEST_SUSPEND */ + +#ifdef CONFIG_PM_SLEEP +/* kernel/power/main.c */ +extern int pm_notifier_call_chain(unsigned long val); +#endif + +#ifdef CONFIG_HIGHMEM +int restore_highmem(void); +#else +static inline unsigned int count_highmem_pages(void) { return 0; } +static inline int restore_highmem(void) { return 0; } +#endif + +/* + * Suspend test levels + */ +enum { + /* 
keep first */ + TEST_NONE, + TEST_CORE, + TEST_CPUS, + TEST_PLATFORM, + TEST_DEVICES, + TEST_FREEZER, + /* keep last */ + __TEST_AFTER_LAST +}; + +#define TEST_FIRST TEST_NONE +#define TEST_MAX (__TEST_AFTER_LAST - 1) + +extern int pm_test_level; + +#ifdef CONFIG_SUSPEND_FREEZER +static inline int suspend_freeze_processes(void) +{ + return freeze_processes(); +} + +static inline void suspend_thaw_processes(void) +{ + thaw_processes(); +} +#else +static inline int suspend_freeze_processes(void) +{ + return 0; +} + +static inline void suspend_thaw_processes(void) +{ +} +#endif + +#ifdef CONFIG_WAKELOCK +/* kernel/power/wakelock.c */ +extern struct workqueue_struct *suspend_work_queue; +extern struct wake_lock main_wake_lock; +extern suspend_state_t requested_suspend_state; +#endif + +#ifdef CONFIG_USER_WAKELOCK +ssize_t wake_lock_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf); +ssize_t wake_lock_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n); +ssize_t wake_unlock_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf); +ssize_t wake_unlock_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n); +#endif + +#ifdef CONFIG_EARLYSUSPEND +/* kernel/power/earlysuspend.c */ +void request_suspend_state(suspend_state_t state); +suspend_state_t get_suspend_state(void); +#endif diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c new file mode 100644 index 00000000..d5235937 --- /dev/null +++ b/kernel/power/poweroff.c @@ -0,0 +1,46 @@ +/* + * poweroff.c - sysrq handler to gracefully power down machine. + * + * This file is released under the GPL v2 + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * When the user hits Sys-Rq o to power down the machine this is the + * callback we use. 
+ */ + +static void do_poweroff(struct work_struct *dummy) +{ + kernel_power_off(); +} + +static DECLARE_WORK(poweroff_work, do_poweroff); + +static void handle_poweroff(int key) +{ + /* run sysrq poweroff on boot cpu */ + schedule_work_on(cpumask_first(cpu_online_mask), &poweroff_work); +} + +static struct sysrq_key_op sysrq_poweroff_op = { + .handler = handle_poweroff, + .help_msg = "powerOff", + .action_msg = "Power Off", + .enable_mask = SYSRQ_ENABLE_BOOT, +}; + +static int pm_sysrq_init(void) +{ + register_sysrq_key('o', &sysrq_poweroff_op); + return 0; +} + +subsys_initcall(pm_sysrq_init); diff --git a/kernel/power/process.c b/kernel/power/process.c new file mode 100644 index 00000000..31338cde --- /dev/null +++ b/kernel/power/process.c @@ -0,0 +1,206 @@ +/* + * kernel/power/process.c - Functions for starting/stopping processes on + * suspend transitions. + * + * Originally from swsusp. + */ + + +#undef DEBUG + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Timeout for stopping processes + */ +#define TIMEOUT (20 * HZ) + +static inline int freezable(struct task_struct * p) +{ + if ((p == current) || + (p->flags & PF_NOFREEZE) || + (p->exit_state != 0)) + return 0; + return 1; +} + +static int try_to_freeze_tasks(bool sig_only) +{ + struct task_struct *g, *p; + unsigned long end_time; + unsigned int todo; + bool wq_busy = false; + struct timeval start, end; + u64 elapsed_csecs64; + unsigned int elapsed_csecs; + bool wakeup = false; + + do_gettimeofday(&start); + + end_time = jiffies + TIMEOUT; + + if (!sig_only) + freeze_workqueues_begin(); + + while (true) { + todo = 0; + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (frozen(p) || !freezable(p)) + continue; + + if (!freeze_task(p, sig_only)) + continue; + + /* + * Now that we've done set_freeze_flag, don't + * perturb a task in TASK_STOPPED or TASK_TRACED. + * It is "frozen enough". 
If the task does wake + * up, it will immediately call try_to_freeze. + * + * Because freeze_task() goes through p's + * scheduler lock after setting TIF_FREEZE, it's + * guaranteed that either we see TASK_RUNNING or + * try_to_stop() after schedule() in ptrace/signal + * stop sees TIF_FREEZE. + */ + if (!task_is_stopped_or_traced(p) && + !freezer_should_skip(p)) + todo++; + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + + if (!sig_only) { + wq_busy = freeze_workqueues_busy(); + todo += wq_busy; + } + + if (todo && has_wake_lock(WAKE_LOCK_SUSPEND)) { + wakeup = 1; + break; + } + if (!todo || time_after(jiffies, end_time)) + break; + + if (pm_wakeup_pending()) { + wakeup = true; + break; + } + + /* + * We need to retry, but first give the freezing tasks some + * time to enter the refrigerator. + */ + msleep(10); + } + + do_gettimeofday(&end); + elapsed_csecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); + do_div(elapsed_csecs64, NSEC_PER_SEC / 100); + elapsed_csecs = elapsed_csecs64; + + if (todo) { + /* This does not unfreeze processes that are already frozen + * (we have slightly ugly calling convention in that respect, + * and caller must call thaw_processes() if something fails), + * but it cleans up leftover PF_FREEZE requests. + */ + if(wakeup) { + printk("\n"); + printk(KERN_ERR "Freezing of %s aborted\n", + sig_only ? 
"user space " : "tasks "); + } + else { + printk("\n"); + printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " + "(%d tasks refusing to freeze, wq_busy=%d):\n", + elapsed_csecs / 100, elapsed_csecs % 100, + todo - wq_busy, wq_busy); + } + thaw_workqueues(); + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + task_lock(p); + if (freezing(p) && !freezer_should_skip(p) && + elapsed_csecs > 100) + sched_show_task(p); + cancel_freezing(p); + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + } else { + printk("(elapsed %d.%02d seconds) ", elapsed_csecs / 100, + elapsed_csecs % 100); + } + + return todo ? -EBUSY : 0; +} + +/** + * freeze_processes - tell processes to enter the refrigerator + */ +int freeze_processes(void) +{ + int error; + + printk("Freezing user space processes ... "); + error = try_to_freeze_tasks(true); + if (error) + goto Exit; + printk("done.\n"); + + printk("Freezing remaining freezable tasks ... "); + error = try_to_freeze_tasks(false); + if (error) + goto Exit; + printk("done."); + + oom_killer_disable(); + Exit: + BUG_ON(in_atomic()); + printk("\n"); + + return error; +} + +static void thaw_tasks(bool nosig_only) +{ + struct task_struct *g, *p; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (!freezable(p)) + continue; + + if (nosig_only && should_send_signal(p)) + continue; + + if (cgroup_freezing_or_frozen(p)) + continue; + + thaw_process(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); +} + +void thaw_processes(void) +{ + oom_killer_enable(); + + printk("Restarting tasks ... "); + thaw_workqueues(); + thaw_tasks(true); + thaw_tasks(false); + schedule(); + printk("done.\n"); +} + diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c new file mode 100644 index 00000000..06efa54f --- /dev/null +++ b/kernel/power/snapshot.c @@ -0,0 +1,2325 @@ +/* + * linux/kernel/power/snapshot.c + * + * This file provides system snapshot/restore functionality for swsusp. 
+ * + * Copyright (C) 1998-2005 Pavel Machek + * Copyright (C) 2006 Rafael J. Wysocki + * + * This file is released under the GPLv2. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "power.h" + +static int swsusp_page_is_free(struct page *); +static void swsusp_set_page_forbidden(struct page *); +static void swsusp_unset_page_forbidden(struct page *); + +/* + * Number of bytes to reserve for memory allocations made by device drivers + * from their ->freeze() and ->freeze_noirq() callbacks so that they don't + * cause image creation to fail (tunable via /sys/power/reserved_size). + */ +unsigned long reserved_size; + +void __init hibernate_reserved_size_init(void) +{ + reserved_size = SPARE_PAGES * PAGE_SIZE; +} + +/* + * Preferred image size in bytes (tunable via /sys/power/image_size). + * When it is set to N, swsusp will do its best to ensure the image + * size will not exceed N bytes, but if that is impossible, it will + * try to create the smallest image possible. + */ +unsigned long image_size; + +void __init hibernate_image_size_init(void) +{ + image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE; +} + +/* List of PBEs needed for restoring the pages that were allocated before + * the suspend and included in the suspend image, but have also been + * allocated by the "resume" kernel, so their contents cannot be written + * directly to their "original" page frames. + */ +struct pbe *restore_pblist; + +/* Pointer to an auxiliary buffer (1 page) */ +static void *buffer; + +/** + * @safe_needed - on resume, for storing the PBE list and the image, + * we can only use memory pages that do not conflict with the pages + * used before suspend. The unsafe pages have PageNosaveFree set + * and we count them using unsafe_pages. 
+ * + * Each allocated image page is marked as PageNosave and PageNosaveFree + * so that swsusp_free() can release it. + */ + +#define PG_ANY 0 +#define PG_SAFE 1 +#define PG_UNSAFE_CLEAR 1 +#define PG_UNSAFE_KEEP 0 + +static unsigned int allocated_unsafe_pages; + +static void *get_image_page(gfp_t gfp_mask, int safe_needed) +{ + void *res; + + res = (void *)get_zeroed_page(gfp_mask); + if (safe_needed) + while (res && swsusp_page_is_free(virt_to_page(res))) { + /* The page is unsafe, mark it for swsusp_free() */ + swsusp_set_page_forbidden(virt_to_page(res)); + allocated_unsafe_pages++; + res = (void *)get_zeroed_page(gfp_mask); + } + if (res) { + swsusp_set_page_forbidden(virt_to_page(res)); + swsusp_set_page_free(virt_to_page(res)); + } + return res; +} + +unsigned long get_safe_page(gfp_t gfp_mask) +{ + return (unsigned long)get_image_page(gfp_mask, PG_SAFE); +} + +static struct page *alloc_image_page(gfp_t gfp_mask) +{ + struct page *page; + + page = alloc_page(gfp_mask); + if (page) { + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); + } + return page; +} + +/** + * free_image_page - free page represented by @addr, allocated with + * get_image_page (page flags set by it must be cleared) + */ + +static inline void free_image_page(void *addr, int clear_nosave_free) +{ + struct page *page; + + BUG_ON(!virt_addr_valid(addr)); + + page = virt_to_page(addr); + + swsusp_unset_page_forbidden(page); + if (clear_nosave_free) + swsusp_unset_page_free(page); + + __free_page(page); +} + +/* struct linked_page is used to build chains of pages */ + +#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *)) + +struct linked_page { + struct linked_page *next; + char data[LINKED_PAGE_DATA_SIZE]; +} __attribute__((packed)); + +static inline void +free_list_of_pages(struct linked_page *list, int clear_page_nosave) +{ + while (list) { + struct linked_page *lp = list->next; + + free_image_page(list, clear_page_nosave); + list = lp; + } +} + +/** + * struct 
chain_allocator is used for allocating small objects out of + * a linked list of pages called 'the chain'. + * + * The chain grows each time when there is no room for a new object in + * the current page. The allocated objects cannot be freed individually. + * It is only possible to free them all at once, by freeing the entire + * chain. + * + * NOTE: The chain allocator may be inefficient if the allocated objects + * are not much smaller than PAGE_SIZE. + */ + +struct chain_allocator { + struct linked_page *chain; /* the chain */ + unsigned int used_space; /* total size of objects allocated out + * of the current page + */ + gfp_t gfp_mask; /* mask for allocating pages */ + int safe_needed; /* if set, only "safe" pages are allocated */ +}; + +static void +chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed) +{ + ca->chain = NULL; + ca->used_space = LINKED_PAGE_DATA_SIZE; + ca->gfp_mask = gfp_mask; + ca->safe_needed = safe_needed; +} + +static void *chain_alloc(struct chain_allocator *ca, unsigned int size) +{ + void *ret; + + if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) { + struct linked_page *lp; + + lp = get_image_page(ca->gfp_mask, ca->safe_needed); + if (!lp) + return NULL; + + lp->next = ca->chain; + ca->chain = lp; + ca->used_space = 0; + } + ret = ca->chain->data + ca->used_space; + ca->used_space += size; + return ret; +} + +/** + * Data types related to memory bitmaps. + * + * Memory bitmap is a structure consisting of many linked lists of + * objects. The main list's elements are of type struct zone_bitmap + * and each of them corresponds to one zone. For each zone bitmap + * object there is a list of objects of type struct bm_block that + * represent each block of the bitmap in which information is stored. 
+ * + * struct memory_bitmap contains a pointer to the main list of zone + * bitmap objects, a struct bm_position used for browsing the bitmap, + * and a pointer to the list of pages used for allocating all of the + * zone bitmap objects and bitmap block objects. + * + * NOTE: It has to be possible to lay out the bitmap in memory + * using only allocations of order 0. Additionally, the bitmap is + * designed to work with arbitrary number of zones (this is over the + * top for now, but let's avoid making unnecessary assumptions ;-). + * + * struct zone_bitmap contains a pointer to a list of bitmap block + * objects and a pointer to the bitmap block object that has been + * most recently used for setting bits. Additionally, it contains the + * pfns that correspond to the start and end of the represented zone. + * + * struct bm_block contains a pointer to the memory page in which + * information is stored (in the form of a block of bitmap) + * It also contains the pfns that correspond to the start and end of + * the represented memory area. 
+ */ + +#define BM_END_OF_MAP (~0UL) + +#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) + +struct bm_block { + struct list_head hook; /* hook into a list of bitmap blocks */ + unsigned long start_pfn; /* pfn represented by the first bit */ + unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ + unsigned long *data; /* bitmap representing pages */ +}; + +static inline unsigned long bm_block_bits(struct bm_block *bb) +{ + return bb->end_pfn - bb->start_pfn; +} + +/* struct bm_position is used for browsing memory bitmaps */ + +struct bm_position { + struct bm_block *block; + int bit; +}; + +struct memory_bitmap { + struct list_head blocks; /* list of bitmap blocks */ + struct linked_page *p_list; /* list of pages used to store zone + * bitmap objects and bitmap block + * objects + */ + struct bm_position cur; /* most recently used bit position */ +}; + +/* Functions that operate on memory bitmaps */ + +static void memory_bm_position_reset(struct memory_bitmap *bm) +{ + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); + bm->cur.bit = 0; +} + +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); + +/** + * create_bm_block_list - create a list of block bitmap objects + * @pages - number of pages to track + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory + */ +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) +{ + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); + + while (nr_blocks-- > 0) { + struct bm_block *bb; + + bb = chain_alloc(ca, sizeof(struct bm_block)); + if (!bb) + return -ENOMEM; + list_add(&bb->hook, list); + } + + return 0; +} + +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + +/** + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty + */ +static void free_mem_extents(struct 
list_head *list) +{ + struct mem_extent *ext, *aux; + + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} + +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) +{ + struct zone *zone; + + INIT_LIST_HEAD(list); + + for_each_populated_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + zone_start = zone->zone_start_pfn; + zone_end = zone->zone_start_pfn + zone->spanned_pages; + + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } + } + + return 0; +} + +/** + * memory_bm_create - allocate memory for a memory bitmap + */ +static int +memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) +{ + struct chain_allocator ca; + struct list_head mem_extents; + struct mem_extent *ext; + int error; + + chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); + + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; + + 
list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; + + bb = list_entry(bm->blocks.prev, struct bm_block, hook); + + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; + + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } + + bb->start_pfn = pfn; + if (pages >= BM_BITS_PER_BLOCK) { + pfn += BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; + } else { + /* This is executed only once in the loop */ + pfn += pages; + } + bb->end_pfn = pfn; + } + } + + bm->p_list = ca.chain; + memory_bm_position_reset(bm); + Exit: + free_mem_extents(&mem_extents); + return error; + + Error: + bm->p_list = ca.chain; + memory_bm_free(bm, PG_UNSAFE_CLEAR); + goto Exit; +} + +/** + * memory_bm_free - free memory occupied by the memory bitmap @bm + */ +static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) +{ + struct bm_block *bb; + + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); + + free_list_of_pages(bm->p_list, clear_nosave_free); + + INIT_LIST_HEAD(&bm->blocks); +} + +/** + * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds + * to given pfn. The cur.block and cur.bit members of @bm are updated + * when the bit is found. + */ +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + void **addr, unsigned int *bit_nr) +{ + struct bm_block *bb; + + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. 
+ */ + bb = bm->cur.block; + if (pfn < bb->start_pfn) + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; + + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; + + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; + pfn -= bb->start_pfn; + bm->cur.bit = pfn + 1; + *bit_nr = pfn; + *addr = bb->data; + return 0; +} + +static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + set_bit(bit, addr); +} + +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + if (!error) + set_bit(bit, addr); + return error; +} + +static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + clear_bit(bit, addr); +} + +static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); + return test_bit(bit, addr); +} + +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); +} + +/** + * memory_bm_next_pfn - find the pfn that corresponds to the next set bit + * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is + * returned. + * + * It is required to run memory_bm_position_reset() before the first call to + * this function. 
+ */ + +static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) +{ + struct bm_block *bb; + int bit; + + bb = bm->cur.block; + do { + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + + memory_bm_position_reset(bm); + return BM_END_OF_MAP; + + Return_pfn: + bm->cur.bit = bit + 1; + return bb->start_pfn + bit; +} + +/** + * This structure represents a range of page frames the contents of which + * should not be saved during the suspend. + */ + +struct nosave_region { + struct list_head list; + unsigned long start_pfn; + unsigned long end_pfn; +}; + +static LIST_HEAD(nosave_regions); + +/** + * register_nosave_region - register a range of page frames the contents + * of which should not be saved during the suspend (to be used in the early + * initialization code) + */ + +void __init +__register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, + int use_kmalloc) +{ + struct nosave_region *region; + + if (start_pfn >= end_pfn) + return; + + if (!list_empty(&nosave_regions)) { + /* Try to extend the previous region (they should be sorted) */ + region = list_entry(nosave_regions.prev, + struct nosave_region, list); + if (region->end_pfn == start_pfn) { + region->end_pfn = end_pfn; + goto Report; + } + } + if (use_kmalloc) { + /* during init, this shouldn't fail */ + region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL); + BUG_ON(!region); + } else + /* This allocation cannot fail */ + region = alloc_bootmem(sizeof(struct nosave_region)); + region->start_pfn = start_pfn; + region->end_pfn = end_pfn; + list_add_tail(&region->list, &nosave_regions); + Report: + printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", + start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); +} + +/* + * Set bits in this map correspond to the page 
frames the contents of which + * should not be saved during the suspend. + */ +static struct memory_bitmap *forbidden_pages_map; + +/* Set bits in this map correspond to free page frames. */ +static struct memory_bitmap *free_pages_map; + +/* + * Each page frame allocated for creating the image is marked by setting the + * corresponding bits in forbidden_pages_map and free_pages_map simultaneously + */ + +void swsusp_set_page_free(struct page *page) +{ + if (free_pages_map) + memory_bm_set_bit(free_pages_map, page_to_pfn(page)); +} + +static int swsusp_page_is_free(struct page *page) +{ + return free_pages_map ? + memory_bm_test_bit(free_pages_map, page_to_pfn(page)) : 0; +} + +void swsusp_unset_page_free(struct page *page) +{ + if (free_pages_map) + memory_bm_clear_bit(free_pages_map, page_to_pfn(page)); +} + +static void swsusp_set_page_forbidden(struct page *page) +{ + if (forbidden_pages_map) + memory_bm_set_bit(forbidden_pages_map, page_to_pfn(page)); +} + +int swsusp_page_is_forbidden(struct page *page) +{ + return forbidden_pages_map ? + memory_bm_test_bit(forbidden_pages_map, page_to_pfn(page)) : 0; +} + +static void swsusp_unset_page_forbidden(struct page *page) +{ + if (forbidden_pages_map) + memory_bm_clear_bit(forbidden_pages_map, page_to_pfn(page)); +} + +/** + * mark_nosave_pages - set bits corresponding to the page frames the + * contents of which should not be saved in a given bitmap. 
+ */ + +static void mark_nosave_pages(struct memory_bitmap *bm) +{ + struct nosave_region *region; + + if (list_empty(&nosave_regions)) + return; + + list_for_each_entry(region, &nosave_regions, list) { + unsigned long pfn; + + pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", + region->start_pfn << PAGE_SHIFT, + region->end_pfn << PAGE_SHIFT); + + for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) + if (pfn_valid(pfn)) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } + } +} + +/** + * create_basic_memory_bitmaps - create bitmaps needed for marking page + * frames that should not be saved and free page frames. The pointers + * forbidden_pages_map and free_pages_map are only modified if everything + * goes well, because we don't want the bits to be used before both bitmaps + * are set up. + */ + +int create_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + int error = 0; + + BUG_ON(forbidden_pages_map || free_pages_map); + + bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm1) + return -ENOMEM; + + error = memory_bm_create(bm1, GFP_KERNEL, PG_ANY); + if (error) + goto Free_first_object; + + bm2 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); + if (!bm2) + goto Free_first_bitmap; + + error = memory_bm_create(bm2, GFP_KERNEL, PG_ANY); + if (error) + goto Free_second_object; + + forbidden_pages_map = bm1; + free_pages_map = bm2; + mark_nosave_pages(forbidden_pages_map); + + pr_debug("PM: Basic memory bitmaps created\n"); + + return 0; + + Free_second_object: + kfree(bm2); + Free_first_bitmap: + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + Free_first_object: + kfree(bm1); + return -ENOMEM; +} + +/** + * free_basic_memory_bitmaps - free memory bitmaps allocated by + * create_basic_memory_bitmaps(). 
The auxiliary pointers are necessary + * so that the bitmaps themselves are not referred to while they are being + * freed. + */ + +void free_basic_memory_bitmaps(void) +{ + struct memory_bitmap *bm1, *bm2; + + BUG_ON(!(forbidden_pages_map && free_pages_map)); + + bm1 = forbidden_pages_map; + bm2 = free_pages_map; + forbidden_pages_map = NULL; + free_pages_map = NULL; + memory_bm_free(bm1, PG_UNSAFE_CLEAR); + kfree(bm1); + memory_bm_free(bm2, PG_UNSAFE_CLEAR); + kfree(bm2); + + pr_debug("PM: Basic memory bitmaps freed\n"); +} + +/** + * snapshot_additional_pages - estimate the number of additional pages + * be needed for setting up the suspend image data structures for given + * zone (usually the returned value is greater than the exact number) + */ + +unsigned int snapshot_additional_pages(struct zone *zone) +{ + unsigned int res; + + res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); + res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE); + return 2 * res; +} + +#ifdef CONFIG_HIGHMEM +/** + * count_free_highmem_pages - compute the total number of free highmem + * pages, system-wide. + */ + +static unsigned int count_free_highmem_pages(void) +{ + struct zone *zone; + unsigned int cnt = 0; + + for_each_populated_zone(zone) + if (is_highmem(zone)) + cnt += zone_page_state(zone, NR_FREE_PAGES); + + return cnt; +} + +/** + * saveable_highmem_page - Determine whether a highmem page should be + * included in the suspend image. + * + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't a part of a free chunk of pages. 
+ */ +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) +{ + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; + + BUG_ON(!PageHighMem(page)); + + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page) || + PageReserved(page)) + return NULL; + + return page; +} + +/** + * count_highmem_pages - compute the total number of saveable highmem + * pages. + */ + +static unsigned int count_highmem_pages(void) +{ + struct zone *zone; + unsigned int n = 0; + + for_each_populated_zone(zone) { + unsigned long pfn, max_zone_pfn; + + if (!is_highmem(zone)) + continue; + + mark_free_pages(zone); + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (saveable_highmem_page(zone, pfn)) + n++; + } + return n; +} +#else +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * saveable_page - Determine whether a non-highmem page should be included + * in the suspend image. + * + * We should save the page if it isn't Nosave, and is not in the range + * of pages statically defined as 'unsaveable', and it isn't a part of + * a free chunk of pages. + */ +static struct page *saveable_page(struct zone *zone, unsigned long pfn) +{ + struct page *page; + + if (!pfn_valid(pfn)) + return NULL; + + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; + + BUG_ON(PageHighMem(page)); + + if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) + return NULL; + + if (PageReserved(page) + && (!kernel_page_present(page) || pfn_is_nosave(pfn))) + return NULL; + + return page; +} + +/** + * count_data_pages - compute the total number of saveable non-highmem + * pages. 
+ */ + +static unsigned int count_data_pages(void) +{ + struct zone *zone; + unsigned long pfn, max_zone_pfn; + unsigned int n = 0; + + for_each_populated_zone(zone) { + if (is_highmem(zone)) + continue; + + mark_free_pages(zone); + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (saveable_page(zone, pfn)) + n++; + } + return n; +} + +/* This is needed, because copy_page and memcpy are not usable for copying + * task structs. + */ +static inline void do_copy_page(long *dst, long *src) +{ + int n; + + for (n = PAGE_SIZE / sizeof(long); n; n--) + *dst++ = *src++; +} + + +/** + * safe_copy_page - check if the page we are going to copy is marked as + * present in the kernel page tables (this always is the case if + * CONFIG_DEBUG_PAGEALLOC is not set and in that case + * kernel_page_present() always returns 'true'). + */ +static void safe_copy_page(void *dst, struct page *s_page) +{ + if (kernel_page_present(s_page)) { + do_copy_page(dst, page_address(s_page)); + } else { + kernel_map_pages(s_page, 1, 1); + do_copy_page(dst, page_address(s_page)); + kernel_map_pages(s_page, 1, 0); + } +} + + +#ifdef CONFIG_HIGHMEM +static inline struct page * +page_is_saveable(struct zone *zone, unsigned long pfn) +{ + return is_highmem(zone) ? 
+ saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); +} + +static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + struct page *s_page, *d_page; + void *src, *dst; + + s_page = pfn_to_page(src_pfn); + d_page = pfn_to_page(dst_pfn); + if (PageHighMem(s_page)) { + src = kmap_atomic(s_page, KM_USER0); + dst = kmap_atomic(d_page, KM_USER1); + do_copy_page(dst, src); + kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); + } else { + if (PageHighMem(d_page)) { + /* Page pointed to by src may contain some kernel + * data modified by kmap_atomic() + */ + safe_copy_page(buffer, s_page); + dst = kmap_atomic(d_page, KM_USER0); + copy_page(dst, buffer); + kunmap_atomic(dst, KM_USER0); + } else { + safe_copy_page(page_address(d_page), s_page); + } + } +} +#else +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) + +static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +{ + safe_copy_page(page_address(pfn_to_page(dst_pfn)), + pfn_to_page(src_pfn)); +} +#endif /* CONFIG_HIGHMEM */ + +static void +copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) +{ + struct zone *zone; + unsigned long pfn; + + for_each_populated_zone(zone) { + unsigned long max_zone_pfn; + + mark_free_pages(zone); + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (page_is_saveable(zone, pfn)) + memory_bm_set_bit(orig_bm, pfn); + } + memory_bm_position_reset(orig_bm); + memory_bm_position_reset(copy_bm); + for(;;) { + pfn = memory_bm_next_pfn(orig_bm); + if (unlikely(pfn == BM_END_OF_MAP)) + break; + copy_data_page(memory_bm_next_pfn(copy_bm), pfn); + } +} + +/* Total number of image pages */ +static unsigned int nr_copy_pages; +/* Number of pages needed for saving the original pfns of the image pages */ +static unsigned int nr_meta_pages; +/* + * Numbers of normal and highmem page frames allocated for hibernation image + * before 
suspending devices. + */ +unsigned int alloc_normal, alloc_highmem; +/* + * Memory bitmap used for marking saveable pages (during hibernation) or + * hibernation image pages (during restore) + */ +static struct memory_bitmap orig_bm; +/* + * Memory bitmap used during hibernation for marking allocated page frames that + * will contain copies of saveable pages. During restore it is initially used + * for marking hibernation image pages, but then the set bits from it are + * duplicated in @orig_bm and it is released. On highmem systems it is next + * used for marking "safe" highmem pages, but it has to be reinitialized for + * this purpose. + */ +static struct memory_bitmap copy_bm; + +/** + * swsusp_free - free pages allocated for the suspend. + * + * Suspend pages are alocated before the atomic copy is made, so we + * need to release them after the resume. + */ + +void swsusp_free(void) +{ + struct zone *zone; + unsigned long pfn, max_zone_pfn; + + for_each_populated_zone(zone) { + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) { + struct page *page = pfn_to_page(pfn); + + if (swsusp_page_is_forbidden(page) && + swsusp_page_is_free(page)) { + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } + } + } + nr_copy_pages = 0; + nr_meta_pages = 0; + restore_pblist = NULL; + buffer = NULL; + alloc_normal = 0; + alloc_highmem = 0; +} + +/* Helper functions used for the shrinking of memory. */ + +#define GFP_IMAGE (GFP_KERNEL | __GFP_NOWARN) + +/** + * preallocate_image_pages - Allocate a number of pages for hibernation image + * @nr_pages: Number of page frames to allocate. + * @mask: GFP flags to use for the allocation. 
+ * + * Return value: Number of page frames actually allocated + */ +static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask) +{ + unsigned long nr_alloc = 0; + + while (nr_pages > 0) { + struct page *page; + + page = alloc_image_page(mask); + if (!page) + break; + memory_bm_set_bit(©_bm, page_to_pfn(page)); + if (PageHighMem(page)) + alloc_highmem++; + else + alloc_normal++; + nr_pages--; + nr_alloc++; + } + + return nr_alloc; +} + +static unsigned long preallocate_image_memory(unsigned long nr_pages, + unsigned long avail_normal) +{ + unsigned long alloc; + + if (avail_normal <= alloc_normal) + return 0; + + alloc = avail_normal - alloc_normal; + if (nr_pages < alloc) + alloc = nr_pages; + + return preallocate_image_pages(alloc, GFP_IMAGE); +} + +#ifdef CONFIG_HIGHMEM +static unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return preallocate_image_pages(nr_pages, GFP_IMAGE | __GFP_HIGHMEM); +} + +/** + * __fraction - Compute (an approximation of) x * (multiplier / base) + */ +static unsigned long __fraction(u64 x, u64 multiplier, u64 base) +{ + x *= multiplier; + do_div(x, base); + return (unsigned long)x; +} + +static unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + unsigned long alloc = __fraction(nr_pages, highmem, total); + + return preallocate_image_pages(alloc, GFP_IMAGE | __GFP_HIGHMEM); +} +#else /* CONFIG_HIGHMEM */ +static inline unsigned long preallocate_image_highmem(unsigned long nr_pages) +{ + return 0; +} + +static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages, + unsigned long highmem, + unsigned long total) +{ + return 0; +} +#endif /* CONFIG_HIGHMEM */ + +/** + * free_unnecessary_pages - Release preallocated pages not needed for the image + */ +static void free_unnecessary_pages(void) +{ + unsigned long save, to_free_normal, to_free_highmem; + + save = count_data_pages(); + if (alloc_normal >= save) { + 
to_free_normal = alloc_normal - save; + save = 0; + } else { + to_free_normal = 0; + save -= alloc_normal; + } + save += count_highmem_pages(); + if (alloc_highmem >= save) { + to_free_highmem = alloc_highmem - save; + } else { + to_free_highmem = 0; + save -= alloc_highmem; + if (to_free_normal > save) + to_free_normal -= save; + else + to_free_normal = 0; + } + + memory_bm_position_reset(©_bm); + + while (to_free_normal > 0 || to_free_highmem > 0) { + unsigned long pfn = memory_bm_next_pfn(©_bm); + struct page *page = pfn_to_page(pfn); + + if (PageHighMem(page)) { + if (!to_free_highmem) + continue; + to_free_highmem--; + alloc_highmem--; + } else { + if (!to_free_normal) + continue; + to_free_normal--; + alloc_normal--; + } + memory_bm_clear_bit(©_bm, pfn); + swsusp_unset_page_forbidden(page); + swsusp_unset_page_free(page); + __free_page(page); + } +} + +/** + * minimum_image_size - Estimate the minimum acceptable size of an image + * @saveable: Number of saveable pages in the system. + * + * We want to avoid attempting to free too much memory too hard, so estimate the + * minimum acceptable size of a hibernation image to use as the lower limit for + * preallocating memory. + * + * We assume that the minimum image size should be proportional to + * + * [number of saveable pages] - [number of pages that can be freed in theory] + * + * where the second term is the sum of (1) reclaimable slab pages, (2) active + * and (3) inactive anonymouns pages, (4) active and (5) inactive file pages, + * minus mapped file pages. + */ +static unsigned long minimum_image_size(unsigned long saveable) +{ + unsigned long size; + + size = global_page_state(NR_SLAB_RECLAIMABLE) + + global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE) + - global_page_state(NR_FILE_MAPPED); + + return saveable <= size ? 
0 : saveable - size; +} + +/** + * hibernate_preallocate_memory - Preallocate memory for hibernation image + * + * To create a hibernation image it is necessary to make a copy of every page + * frame in use. We also need a number of page frames to be free during + * hibernation for allocations made while saving the image and for device + * drivers, in case they need to allocate memory from their hibernation + * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough + * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through + * /sys/power/reserved_size, respectively). To make this happen, we compute the + * total number of available page frames and allocate at least + * + * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE) + * + * of them, which corresponds to the maximum size of a hibernation image. + * + * If image_size is set below the number following from the above formula, + * the preallocation of memory is continued until the total number of saveable + * pages in the system is below the requested image size or the minimum + * acceptable image size returned by minimum_image_size(), whichever is greater. + */ +int hibernate_preallocate_memory(void) +{ + struct zone *zone; + unsigned long saveable, size, max_size, count, highmem, pages = 0; + unsigned long alloc, save_highmem, pages_highmem, avail_normal; + struct timeval start, stop; + int error; + + printk(KERN_INFO "PM: Preallocating image memory... "); + do_gettimeofday(&start); + + error = memory_bm_create(&orig_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; + + error = memory_bm_create(©_bm, GFP_IMAGE, PG_ANY); + if (error) + goto err_out; + + alloc_normal = 0; + alloc_highmem = 0; + + /* Count the number of saveable data pages. 
*/ + save_highmem = count_highmem_pages(); + saveable = count_data_pages(); + + /* + * Compute the total number of page frames we can use (count) and the + * number of pages needed for image metadata (size). + */ + count = saveable; + saveable += save_highmem; + highmem = save_highmem; + size = 0; + for_each_populated_zone(zone) { + size += snapshot_additional_pages(zone); + if (is_highmem(zone)) + highmem += zone_page_state(zone, NR_FREE_PAGES); + else + count += zone_page_state(zone, NR_FREE_PAGES); + } + avail_normal = count; + count += highmem; + count -= totalreserve_pages; + + /* Compute the maximum number of saveable pages to leave in memory. */ + max_size = (count - (size + PAGES_FOR_IO)) / 2 + - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); + /* Compute the desired number of image pages specified by image_size. */ + size = DIV_ROUND_UP(image_size, PAGE_SIZE); + if (size > max_size) + size = max_size; + /* + * If the desired number of image pages is at least as large as the + * current number of saveable pages in memory, allocate page frames for + * the image and we're done. + */ + if (size >= saveable) { + pages = preallocate_image_highmem(save_highmem); + pages += preallocate_image_memory(saveable - pages, avail_normal); + goto out; + } + + /* Estimate the minimum size of the image. */ + pages = minimum_image_size(saveable); + /* + * To avoid excessive pressure on the normal zone, leave room in it to + * accommodate an image of the minimum size (unless it's already too + * small, in which case don't preallocate pages from it at all). + */ + if (avail_normal > pages) + avail_normal -= pages; + else + avail_normal = 0; + if (size < pages) + size = min_t(unsigned long, pages, max_size); + + /* + * Let the memory management subsystem know that we're going to need a + * large number of page frames to allocate and make it free some memory. + * NOTE: If this is not done, performance will be hurt badly in some + * test cases. 
+ */ + shrink_all_memory(saveable - size); + + /* + * The number of saveable pages in memory was too high, so apply some + * pressure to decrease it. First, make room for the largest possible + * image and fail if that doesn't work. Next, try to decrease the size + * of the image as much as indicated by 'size' using allocations from + * highmem and non-highmem zones separately. + */ + pages_highmem = preallocate_image_highmem(highmem / 2); + alloc = (count - max_size) - pages_highmem; + pages = preallocate_image_memory(alloc, avail_normal); + if (pages < alloc) { + /* We have exhausted non-highmem pages, try highmem. */ + alloc -= pages; + pages += pages_highmem; + pages_highmem = preallocate_image_highmem(alloc); + if (pages_highmem < alloc) + goto err_out; + pages += pages_highmem; + /* + * size is the desired number of saveable pages to leave in + * memory, so try to preallocate (all memory - size) pages. + */ + alloc = (count - pages) - size; + pages += preallocate_image_highmem(alloc); + } else { + /* + * There are approximately max_size saveable pages at this point + * and we want to reduce this number down to size. + */ + alloc = max_size - size; + size = preallocate_highmem_fraction(alloc, highmem, count); + pages_highmem += size; + alloc -= size; + size = preallocate_image_memory(alloc, avail_normal); + pages_highmem += preallocate_image_highmem(alloc - size); + pages += pages_highmem + size; + } + + /* + * We only need as many page frames for the image as there are saveable + * pages in memory, but we have allocated more. Release the excessive + * ones now. 
+ */ + free_unnecessary_pages(); + + out: + do_gettimeofday(&stop); + printk(KERN_CONT "done (allocated %lu pages)\n", pages); + swsusp_show_speed(&start, &stop, pages, "Allocated"); + + return 0; + + err_out: + printk(KERN_CONT "\n"); + swsusp_free(); + return -ENOMEM; +} + +#ifdef CONFIG_HIGHMEM +/** + * count_pages_for_highmem - compute the number of non-highmem pages + * that will be necessary for creating copies of highmem pages. + */ + +static unsigned int count_pages_for_highmem(unsigned int nr_highmem) +{ + unsigned int free_highmem = count_free_highmem_pages() + alloc_highmem; + + if (free_highmem >= nr_highmem) + nr_highmem = 0; + else + nr_highmem -= free_highmem; + + return nr_highmem; +} +#else +static unsigned int +count_pages_for_highmem(unsigned int nr_highmem) { return 0; } +#endif /* CONFIG_HIGHMEM */ + +/** + * enough_free_mem - Make sure we have enough free memory for the + * snapshot image. + */ + +static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) +{ + struct zone *zone; + unsigned int free = alloc_normal; + + for_each_populated_zone(zone) + if (!is_highmem(zone)) + free += zone_page_state(zone, NR_FREE_PAGES); + + nr_pages += count_pages_for_highmem(nr_highmem); + pr_debug("PM: Normal pages needed: %u + %u, available pages: %u\n", + nr_pages, PAGES_FOR_IO, free); + + return free > nr_pages + PAGES_FOR_IO; +} + +#ifdef CONFIG_HIGHMEM +/** + * get_highmem_buffer - if there are some highmem pages in the suspend + * image, we may need the buffer to copy them and/or load their data. + */ + +static inline int get_highmem_buffer(int safe_needed) +{ + buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + return buffer ? 0 : -ENOMEM; +} + +/** + * alloc_highmem_image_pages - allocate some highmem pages for the image. + * Try to allocate as many pages as needed, but if the number of free + * highmem pages is lesser than that, allocate them all. 
+ */ + +static inline unsigned int +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) +{ + unsigned int to_alloc = count_free_highmem_pages(); + + if (to_alloc > nr_highmem) + to_alloc = nr_highmem; + + nr_highmem -= to_alloc; + while (to_alloc-- > 0) { + struct page *page; + + page = alloc_image_page(__GFP_HIGHMEM); + memory_bm_set_bit(bm, page_to_pfn(page)); + } + return nr_highmem; +} +#else +static inline int get_highmem_buffer(int safe_needed) { return 0; } + +static inline unsigned int +alloc_highmem_pages(struct memory_bitmap *bm, unsigned int n) { return 0; } +#endif /* CONFIG_HIGHMEM */ + +/** + * swsusp_alloc - allocate memory for the suspend image + * + * We first try to allocate as many highmem pages as there are + * saveable highmem pages in the system. If that fails, we allocate + * non-highmem pages for the copies of the remaining highmem ones. + * + * In this approach it is likely that the copies of highmem pages will + * also be located in the high memory, because of the way in which + * copy_data_pages() works. 
+ */ + +static int +swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm, + unsigned int nr_pages, unsigned int nr_highmem) +{ + if (nr_highmem > 0) { + if (get_highmem_buffer(PG_ANY)) + goto err_out; + if (nr_highmem > alloc_highmem) { + nr_highmem -= alloc_highmem; + nr_pages += alloc_highmem_pages(copy_bm, nr_highmem); + } + } + if (nr_pages > alloc_normal) { + nr_pages -= alloc_normal; + while (nr_pages-- > 0) { + struct page *page; + + page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + if (!page) + goto err_out; + memory_bm_set_bit(copy_bm, page_to_pfn(page)); + } + } + + return 0; + + err_out: + swsusp_free(); + return -ENOMEM; +} + +asmlinkage int swsusp_save(void) +{ + unsigned int nr_pages, nr_highmem; + + printk(KERN_INFO "PM: Creating hibernation image:\n"); + + drain_local_pages(NULL); + nr_pages = count_data_pages(); + nr_highmem = count_highmem_pages(); + printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); + + if (!enough_free_mem(nr_pages, nr_highmem)) { + printk(KERN_ERR "PM: Not enough free memory\n"); + return -ENOMEM; + } + + if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { + printk(KERN_ERR "PM: Memory allocation failed\n"); + return -ENOMEM; + } + + /* During allocating of suspend pagedir, new cold pages may appear. + * Kill them. + */ + drain_local_pages(NULL); + copy_data_pages(©_bm, &orig_bm); + + /* + * End of critical section. From now on, we can write to memory, + * but we should not touch disk. This specially means we must _not_ + * touch swap space! Except we must write out our image of course. 
+ */ + + nr_pages += nr_highmem; + nr_copy_pages = nr_pages; + nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); + + printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n", + nr_pages); + + return 0; +} + +#ifndef CONFIG_ARCH_HIBERNATION_HEADER +static int init_header_complete(struct swsusp_info *info) +{ + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); + info->version_code = LINUX_VERSION_CODE; + return 0; +} + +static char *check_image_kernel(struct swsusp_info *info) +{ + if (info->version_code != LINUX_VERSION_CODE) + return "kernel version"; + if (strcmp(info->uts.sysname,init_utsname()->sysname)) + return "system type"; + if (strcmp(info->uts.release,init_utsname()->release)) + return "kernel release"; + if (strcmp(info->uts.version,init_utsname()->version)) + return "version"; + if (strcmp(info->uts.machine,init_utsname()->machine)) + return "machine"; + return NULL; +} +#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ + +unsigned long snapshot_get_image_size(void) +{ + return nr_copy_pages + nr_meta_pages + 1; +} + +static int init_header(struct swsusp_info *info) +{ + memset(info, 0, sizeof(struct swsusp_info)); + info->num_physpages = num_physpages; + info->image_pages = nr_copy_pages; + info->pages = snapshot_get_image_size(); + info->size = info->pages; + info->size <<= PAGE_SHIFT; + return init_header_complete(info); +} + +/** + * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm + * are stored in the array @buf[] (1 page at a time) + */ + +static inline void +pack_pfns(unsigned long *buf, struct memory_bitmap *bm) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { + buf[j] = memory_bm_next_pfn(bm); + if (unlikely(buf[j] == BM_END_OF_MAP)) + break; + } +} + +/** + * snapshot_read_next - used for reading the system memory snapshot. + * + * On the first call to it @handle should point to a zeroed + * snapshot_handle structure. 
The structure gets updated and a pointer + * to it should be passed to this function every next time. + * + * On success the function returns a positive number. Then, the caller + * is allowed to read up to the returned number of bytes from the memory + * location computed by the data_of() macro. + * + * The function returns 0 to indicate the end of data stream condition, + * and a negative number is returned on error. In such cases the + * structure pointed to by @handle is not updated and should not be used + * any more. + */ + +int snapshot_read_next(struct snapshot_handle *handle) +{ + if (handle->cur > nr_meta_pages + nr_copy_pages) + return 0; + + if (!buffer) { + /* This makes the buffer be freed by swsusp_free() */ + buffer = get_image_page(GFP_ATOMIC, PG_ANY); + if (!buffer) + return -ENOMEM; + } + if (!handle->cur) { + int error; + + error = init_header((struct swsusp_info *)buffer); + if (error) + return error; + handle->buffer = buffer; + memory_bm_position_reset(&orig_bm); + memory_bm_position_reset(©_bm); + } else if (handle->cur <= nr_meta_pages) { + clear_page(buffer); + pack_pfns(buffer, &orig_bm); + } else { + struct page *page; + + page = pfn_to_page(memory_bm_next_pfn(©_bm)); + if (PageHighMem(page)) { + /* Highmem pages are copied to the buffer, + * because we can't return with a kmapped + * highmem page (we may not be called again). 
+ */ + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + copy_page(buffer, kaddr); + kunmap_atomic(kaddr, KM_USER0); + handle->buffer = buffer; + } else { + handle->buffer = page_address(page); + } + } + handle->cur++; + return PAGE_SIZE; +} + +/** + * mark_unsafe_pages - mark the pages that cannot be used for storing + * the image during resume, because they conflict with the pages that + * had been used before suspend + */ + +static int mark_unsafe_pages(struct memory_bitmap *bm) +{ + struct zone *zone; + unsigned long pfn, max_zone_pfn; + + /* Clear page flags */ + for_each_populated_zone(zone) { + max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) + if (pfn_valid(pfn)) + swsusp_unset_page_free(pfn_to_page(pfn)); + } + + /* Mark pages that correspond to the "original" pfns as "unsafe" */ + memory_bm_position_reset(bm); + do { + pfn = memory_bm_next_pfn(bm); + if (likely(pfn != BM_END_OF_MAP)) { + if (likely(pfn_valid(pfn))) + swsusp_set_page_free(pfn_to_page(pfn)); + else + return -EFAULT; + } + } while (pfn != BM_END_OF_MAP); + + allocated_unsafe_pages = 0; + + return 0; +} + +static void +duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src) +{ + unsigned long pfn; + + memory_bm_position_reset(src); + pfn = memory_bm_next_pfn(src); + while (pfn != BM_END_OF_MAP) { + memory_bm_set_bit(dst, pfn); + pfn = memory_bm_next_pfn(src); + } +} + +static int check_header(struct swsusp_info *info) +{ + char *reason; + + reason = check_image_kernel(info); + if (!reason && info->num_physpages != num_physpages) + reason = "memory size"; + if (reason) { + printk(KERN_ERR "PM: Image mismatch: %s\n", reason); + return -EPERM; + } + return 0; +} + +/** + * load header - check the image header and copy data from it + */ + +static int +load_header(struct swsusp_info *info) +{ + int error; + + restore_pblist = NULL; + error = check_header(info); + if (!error) { + nr_copy_pages = 
info->image_pages; + nr_meta_pages = info->pages - info->image_pages - 1; + } + return error; +} + +/** + * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set + * the corresponding bit in the memory bitmap @bm + */ +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +{ + int j; + + for (j = 0; j < PAGE_SIZE / sizeof(long); j++) { + if (unlikely(buf[j] == BM_END_OF_MAP)) + break; + + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); + else + return -EFAULT; + } + + return 0; +} + +/* List of "safe" pages that may be used to store data loaded from the suspend + * image + */ +static struct linked_page *safe_pages_list; + +#ifdef CONFIG_HIGHMEM +/* struct highmem_pbe is used for creating the list of highmem pages that + * should be restored atomically during the resume from disk, because the page + * frames they have occupied before the suspend are in use. + */ +struct highmem_pbe { + struct page *copy_page; /* data is here now */ + struct page *orig_page; /* data was here before the suspend */ + struct highmem_pbe *next; +}; + +/* List of highmem PBEs needed for restoring the highmem pages that were + * allocated before the suspend and included in the suspend image, but have + * also been allocated by the "resume" kernel, so their contents cannot be + * written directly to their "original" page frames. + */ +static struct highmem_pbe *highmem_pblist; + +/** + * count_highmem_image_pages - compute the number of highmem pages in the + * suspend image. The bits in the memory bitmap @bm that correspond to the + * image pages are assumed to be set. 
+ */ + +static unsigned int count_highmem_image_pages(struct memory_bitmap *bm) +{ + unsigned long pfn; + unsigned int cnt = 0; + + memory_bm_position_reset(bm); + pfn = memory_bm_next_pfn(bm); + while (pfn != BM_END_OF_MAP) { + if (PageHighMem(pfn_to_page(pfn))) + cnt++; + + pfn = memory_bm_next_pfn(bm); + } + return cnt; +} + +/** + * prepare_highmem_image - try to allocate as many highmem pages as + * there are highmem image pages (@nr_highmem_p points to the variable + * containing the number of highmem image pages). The pages that are + * "safe" (ie. will not be overwritten when the suspend image is + * restored) have the corresponding bits set in @bm (it must be + * unitialized). + * + * NOTE: This function should not be called if there are no highmem + * image pages. + */ + +static unsigned int safe_highmem_pages; + +static struct memory_bitmap *safe_highmem_bm; + +static int +prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) +{ + unsigned int to_alloc; + + if (memory_bm_create(bm, GFP_ATOMIC, PG_SAFE)) + return -ENOMEM; + + if (get_highmem_buffer(PG_SAFE)) + return -ENOMEM; + + to_alloc = count_free_highmem_pages(); + if (to_alloc > *nr_highmem_p) + to_alloc = *nr_highmem_p; + else + *nr_highmem_p = to_alloc; + + safe_highmem_pages = 0; + while (to_alloc-- > 0) { + struct page *page; + + page = alloc_page(__GFP_HIGHMEM); + if (!swsusp_page_is_free(page)) { + /* The page is "safe", set its bit the bitmap */ + memory_bm_set_bit(bm, page_to_pfn(page)); + safe_highmem_pages++; + } + /* Mark the page as allocated */ + swsusp_set_page_forbidden(page); + swsusp_set_page_free(page); + } + memory_bm_position_reset(bm); + safe_highmem_bm = bm; + return 0; +} + +/** + * get_highmem_page_buffer - for given highmem image page find the buffer + * that suspend_write_next() should set for its caller to write to. 
+ *
+ *	If the page is to be saved to its "original" page frame or a copy of
+ *	the page is to be made in the highmem, @buffer is returned.  Otherwise,
+ *	the copy of the page is to be made in normal memory, so the address of
+ *	the copy is returned.
+ *
+ *	If @buffer is returned, the caller of suspend_write_next() will write
+ *	the page's contents to @buffer, so they will have to be copied to the
+ *	right location on the next call to suspend_write_next() and it is done
+ *	with the help of copy_last_highmem_page().  For this purpose, if
+ *	@buffer is returned, @last_highmem page is set to the page to which
+ *	the data will have to be copied from @buffer.
+ */
+
+static struct page *last_highmem_page;
+
+static void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+	struct highmem_pbe *pbe;
+	void *kaddr;
+
+	if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page)) {
+		/* We have allocated the "original" page frame and we can
+		 * use it directly to store the loaded page.
+		 */
+		last_highmem_page = page;
+		return buffer;	/* data goes through the bounce buffer first */
+	}
+	/* The "original" page frame has not been allocated and we have to
+	 * use a "safe" page frame to store the loaded page.
+	 */
+	pbe = chain_alloc(ca, sizeof(struct highmem_pbe));
+	if (!pbe) {
+		swsusp_free();
+		return ERR_PTR(-ENOMEM);
+	}
+	pbe->orig_page = page;
+	if (safe_highmem_pages > 0) {
+		struct page *tmp;
+
+		/* Copy of the page will be stored in high memory */
+		kaddr = buffer;
+		tmp = pfn_to_page(memory_bm_next_pfn(safe_highmem_bm));
+		safe_highmem_pages--;
+		last_highmem_page = tmp;
+		pbe->copy_page = tmp;
+	} else {
+		/* Copy of the page will be stored in normal memory */
+		kaddr = safe_pages_list;
+		safe_pages_list = safe_pages_list->next;	/* NOTE(review): assumes the list is not empty */
+		pbe->copy_page = virt_to_page(kaddr);
+	}
+	pbe->next = highmem_pblist;	/* push onto the list walked by restore_highmem() */
+	highmem_pblist = pbe;
+	return kaddr;
+}
+
+/**
+ *	copy_last_highmem_page - copy the contents of a highmem image from
+ *	@buffer, where the caller of snapshot_write_next() has placed them,
+ *	to the right location represented by @last_highmem_page.
+ */
+
+static void copy_last_highmem_page(void)
+{
+	if (last_highmem_page) {
+		void *dst;
+
+		dst = kmap_atomic(last_highmem_page, KM_USER0);
+		copy_page(dst, buffer);
+		kunmap_atomic(dst, KM_USER0);
+		last_highmem_page = NULL;	/* nothing is pending any more */
+	}
+}
+
+static inline int last_highmem_page_copied(void)
+{
+	return !last_highmem_page;
+}
+
+static inline void free_highmem_data(void)
+{
+	if (safe_highmem_bm)
+		memory_bm_free(safe_highmem_bm, PG_UNSAFE_CLEAR);
+
+	if (buffer)
+		free_image_page(buffer, PG_UNSAFE_CLEAR);
+}
+#else
+static inline int get_safe_write_buffer(void) { return 0; }
+
+static unsigned int
+count_highmem_image_pages(struct memory_bitmap *bm) { return 0; }
+
+static inline int
+prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p)
+{
+	return 0;
+}
+
+static inline void *
+get_highmem_page_buffer(struct page *page, struct chain_allocator *ca)
+{
+	return ERR_PTR(-EINVAL);	/* no highmem pages can exist without CONFIG_HIGHMEM */
+}
+
+static inline void copy_last_highmem_page(void) {}
+static inline int last_highmem_page_copied(void) { return 1; }
+static inline void free_highmem_data(void) {}
+#endif /* CONFIG_HIGHMEM */
+
+/**
+ *	prepare_image - use the memory bitmap @bm to mark the pages that will
+ *	be overwritten in the process of restoring the system memory state
+ *	from the suspend image ("unsafe" pages) and allocate memory for the
+ *	image.
+ *
+ *	The idea is to allocate a new memory bitmap first and then allocate
+ *	as many pages as needed for the image data, but not to assign these
+ *	pages to specific tasks initially.  Instead, we just mark them as
+ *	allocated and create a list of "safe" pages that will be used
+ *	later.  On systems with high memory a list of "safe" highmem pages is
+ *	also created.
+ */
+
+#define PBES_PER_LINKED_PAGE	(LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
+
+static int
+prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
+{
+	unsigned int nr_pages, nr_highmem;
+	struct linked_page *sp_list, *lp;
+	int error;
+
+	/* If there is no highmem, the buffer will not be necessary */
+	free_image_page(buffer, PG_UNSAFE_CLEAR);
+	buffer = NULL;
+
+	nr_highmem = count_highmem_image_pages(bm);
+	error = mark_unsafe_pages(bm);
+	if (error)
+		goto Free;
+
+	error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
+	if (error)
+		goto Free;
+
+	duplicate_memory_bitmap(new_bm, bm);
+	memory_bm_free(bm, PG_UNSAFE_KEEP);	/* @bm is reused below for the safe highmem pages */
+	if (nr_highmem > 0) {
+		error = prepare_highmem_image(bm, &nr_highmem);
+		if (error)
+			goto Free;
+	}
+	/* Reserve some safe pages for potential later use.
+	 *
+	 * NOTE: This way we make sure there will be enough safe pages for the
+	 * chain_alloc() in get_buffer().  It is a bit wasteful, but
+	 * nr_copy_pages cannot be greater than 50% of the memory anyway.
+	 */
+	sp_list = NULL;
+	/* nr_copy_pages cannot be less than allocated_unsafe_pages */
+	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+	nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
+	while (nr_pages > 0) {
+		lp = get_image_page(GFP_ATOMIC, PG_SAFE);
+		if (!lp) {
+			error = -ENOMEM;
+			goto Free;
+		}
+		lp->next = sp_list;
+		sp_list = lp;
+		nr_pages--;
+	}
+	/* Preallocate memory for the image */
+	safe_pages_list = NULL;
+	nr_pages = nr_copy_pages - nr_highmem - allocated_unsafe_pages;
+	while (nr_pages > 0) {
+		lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
+		if (!lp) {
+			error = -ENOMEM;
+			goto Free;
+		}
+		if (!swsusp_page_is_free(virt_to_page(lp))) {
+			/* The page is "safe", add it to the list */
+			lp->next = safe_pages_list;
+			safe_pages_list = lp;
+		}
+		/* Mark the page as allocated */
+		swsusp_set_page_forbidden(virt_to_page(lp));
+		swsusp_set_page_free(virt_to_page(lp));
+		nr_pages--;
+	}
+	/* Free the reserved safe pages so that chain_alloc() can use them */
+	while (sp_list) {
+		lp = sp_list->next;
+		free_image_page(sp_list, PG_UNSAFE_CLEAR);
+		sp_list = lp;
+	}
+	return 0;
+
+ Free:
+	swsusp_free();	/* every failure path releases whatever was allocated so far */
+	return error;
+}
+
+/**
+ *	get_buffer - compute the address that snapshot_write_next() should
+ *	set for its caller to write to.
+ */
+
+static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
+{
+	struct pbe *pbe;
+	struct page *page;
+	unsigned long pfn = memory_bm_next_pfn(bm);
+
+	if (pfn == BM_END_OF_MAP)
+		return ERR_PTR(-EFAULT);	/* more data pages arrived than @bm has bits set */
+
+	page = pfn_to_page(pfn);
+	if (PageHighMem(page))
+		return get_highmem_page_buffer(page, ca);
+
+	if (swsusp_page_is_forbidden(page) && swsusp_page_is_free(page))
+		/* We have allocated the "original" page frame and we can
+		 * use it directly to store the loaded page.
+		 */
+		return page_address(page);
+
+	/* The "original" page frame has not been allocated and we have to
+	 * use a "safe" page frame to store the loaded page.
+	 */
+	pbe = chain_alloc(ca, sizeof(struct pbe));
+	if (!pbe) {
+		swsusp_free();
+		return ERR_PTR(-ENOMEM);
+	}
+	pbe->orig_address = page_address(page);
+	pbe->address = safe_pages_list;
+	safe_pages_list = safe_pages_list->next;	/* NOTE(review): assumes the list is not empty */
+	pbe->next = restore_pblist;	/* push onto restore_pblist */
+	restore_pblist = pbe;
+	return pbe->address;
+}
+
+/**
+ *	snapshot_write_next - used for writing the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to write up to the returned number of bytes to the memory
+ *	location computed by the data_of() macro.
+ *
+ *	The function returns 0 to indicate the "end of file" condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_write_next(struct snapshot_handle *handle)
+{
+	static struct chain_allocator ca;
+	int error = 0;
+
+	/* Check if we have already loaded the entire image */
+	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
+		return 0;
+
+	handle->sync_read = 1;
+
+	if (!handle->cur) {	/* first call: hand out the buffer for the header page */
+		if (!buffer)
+			/* This makes the buffer be freed by swsusp_free() */
+			buffer = get_image_page(GFP_ATOMIC, PG_ANY);
+
+		if (!buffer)
+			return -ENOMEM;
+
+		handle->buffer = buffer;
+	} else if (handle->cur == 1) {	/* the header has just been written to buffer */
+		error = load_header(buffer);
+		if (error)
+			return error;
+
+		error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
+		if (error)
+			return error;
+
+	} else if (handle->cur <= nr_meta_pages + 1) {	/* a page of original PFNs */
+		error = unpack_orig_pfns(buffer, &copy_bm);
+		if (error)
+			return error;
+
+		if (handle->cur == nr_meta_pages + 1) {	/* that was the last metadata page */
+			error = prepare_image(&orig_bm, &copy_bm);
+			if (error)
+				return error;
+
+			chain_init(&ca, GFP_ATOMIC, PG_SAFE);
+			memory_bm_position_reset(&orig_bm);
+			restore_pblist = NULL;
+			handle->buffer = get_buffer(&orig_bm, &ca);
+			handle->sync_read = 0;
+			if (IS_ERR(handle->buffer))
+				return PTR_ERR(handle->buffer);
+		}
+	} else {	/* image data pages */
+		copy_last_highmem_page();
+		handle->buffer = get_buffer(&orig_bm, &ca);
+		if (IS_ERR(handle->buffer))
+			return PTR_ERR(handle->buffer);
+		if (handle->buffer != buffer)
+			handle->sync_read = 0;
+	}
+	handle->cur++;
+	return PAGE_SIZE;
+}
+
+/**
+ *	snapshot_write_finalize - must be called after the last call to
+ *	snapshot_write_next() in case the last page in the image happens
+ *	to be a highmem page and its contents should be stored in the
+ *	highmem.  Additionally, it releases the memory that will not be
+ *	used any more.
+ */
+
+void snapshot_write_finalize(struct snapshot_handle *handle)
+{
+	copy_last_highmem_page();
+	/* Free only if we have loaded the image entirely */
+	if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
+		memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
+		free_highmem_data();
+	}
+}
+
+int snapshot_image_loaded(struct snapshot_handle *handle)
+{
+	return !(!nr_copy_pages || !last_highmem_page_copied() ||
+			handle->cur <= nr_meta_pages + nr_copy_pages);	/* all three must hold */
+}
+
+#ifdef CONFIG_HIGHMEM
+/* Assumes that @buf is ready and points to a "safe" page */
+static inline void
+swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
+{
+	void *kaddr1, *kaddr2;
+
+	kaddr1 = kmap_atomic(p1, KM_USER0);
+	kaddr2 = kmap_atomic(p2, KM_USER1);
+	copy_page(buf, kaddr1);	/* buf <- p1 */
+	copy_page(kaddr1, kaddr2);	/* p1 <- p2 */
+	copy_page(kaddr2, buf);	/* p2 <- old p1 */
+	kunmap_atomic(kaddr2, KM_USER1);
+	kunmap_atomic(kaddr1, KM_USER0);
+}
+
+/**
+ *	restore_highmem - for each highmem page that was allocated before
+ *	the suspend and included in the suspend image, and also has been
+ *	allocated by the "resume" kernel swap its current (ie. "before
+ *	resume") contents with the previous (ie. "before suspend") one.
+ *
+ *	If the resume eventually fails, we can call this function once
+ *	again and restore the "before resume" highmem state.
+ */
+
+int restore_highmem(void)
+{
+	struct highmem_pbe *pbe = highmem_pblist;
+	void *buf;
+
+	if (!pbe)
+		return 0;	/* nothing to swap, so no scratch page is needed */
+
+	buf = get_image_page(GFP_ATOMIC, PG_SAFE);
+	if (!buf)
+		return -ENOMEM;
+
+	while (pbe) {
+		swap_two_pages_data(pbe->copy_page, pbe->orig_page, buf);
+		pbe = pbe->next;
+	}
+	free_image_page(buf, PG_UNSAFE_CLEAR);
+	return 0;
+}
+#endif /* CONFIG_HIGHMEM */
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
new file mode 100644
index 00000000..3b06b54c
--- /dev/null
+++ b/kernel/power/suspend.c
@@ -0,0 +1,335 @@
+/*
+ * kernel/power/suspend.c - Suspend to RAM and standby functionality.
+ *
+ * Copyright (c) 2003 Patrick Mochel
+ * Copyright (c) 2003 Open Source Development Lab
+ * Copyright (c) 2009 Rafael J. Wysocki , Novell Inc.
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include	/* NOTE(review): the <...> header names of these #include lines were lost in extraction; restore them from the original tree */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "power.h"
+extern void cpufreq_save_default_governor(void);
+extern void cpufreq_restore_default_governor(void);
+extern void cpufreq_set_conservative_governor(void);
+extern void cpufreq_set_performance_governor(void);
+extern void cpufreq_set_conservative_governor_param(int up_th, int down_th);
+
+const char *const pm_states[PM_SUSPEND_MAX] = {
+#ifdef CONFIG_EARLYSUSPEND
+	[PM_SUSPEND_ON]		= "on",
+#endif
+	[PM_SUSPEND_STANDBY]	= "standby",
+	[PM_SUSPEND_MEM]	= "mem",
+};
+
+static const struct platform_suspend_ops *suspend_ops;	/* NULL until suspend_set_ops() is called */
+
+/**
+ * suspend_set_ops - Set the global suspend method table.
+ * @ops: Pointer to ops structure.
+ */
+void suspend_set_ops(const struct platform_suspend_ops *ops)
+{
+	mutex_lock(&pm_mutex);
+	suspend_ops = ops;
+	mutex_unlock(&pm_mutex);
+}
+
+bool valid_state(suspend_state_t state)
+{
+	/*
+	 * All states need lowlevel support and need to be valid to the lowlevel
+	 * implementation, no valid callback implies that none are valid.
+	 */
+	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
+}
+
+/**
+ * suspend_valid_only_mem - generic memory-only valid callback
+ *
+ * Platform drivers that implement mem suspend only and only need
+ * to check for that in their .valid callback can use this instead
+ * of rolling their own .valid callback.
+ */
+int suspend_valid_only_mem(suspend_state_t state)
+{
+	return state == PM_SUSPEND_MEM;
+}
+
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+	if (pm_test_level == level) {
+		printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+		mdelay(5000);
+		return 1;	/* tells the caller to stop the suspend sequence here */
+	}
+#endif /* CONFIG_PM_DEBUG */
+	return 0;
+}
+
+/**
+ * suspend_prepare - Do prep work before entering low-power state.
+ *
+ * This is common code that is called for each state that we're entering.
+ * Run suspend notifiers, allocate a console and stop all processes.
+ */
+static int suspend_prepare(void)
+{
+	int error;
+
+	if (!suspend_ops || !suspend_ops->enter)
+		return -EPERM;
+
+	pm_prepare_console();
+
+	error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
+	if (error)
+		goto Finish;
+
+	error = usermodehelper_disable();
+	if (error)
+		goto Finish;
+
+	error = suspend_freeze_processes();
+	if (!error)
+		return 0;	/* success: suspend_finish() undoes all of this later */
+
+	suspend_thaw_processes();	/* freezing failed: unwind in reverse order */
+	usermodehelper_enable();
+ Finish:
+	pm_notifier_call_chain(PM_POST_SUSPEND);
+	pm_restore_console();
+	return error;
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_disable_irqs(void)
+{
+	local_irq_disable();
+}
+
+/* default implementation */
+void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
+{
+	local_irq_enable();
+}
+
+/**
+ * suspend_enter - enter the desired system sleep state.
+ * @state:		state to enter
+ *
+ * This function should be called after devices have been suspended.
+ */
+static int suspend_enter(suspend_state_t state)
+{
+	int error;
+
+	if (suspend_ops->prepare) {
+		error = suspend_ops->prepare();
+		if (error)
+			goto Platform_finish;
+	}
+
+	error = dpm_suspend_noirq(PMSG_SUSPEND);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to power down\n");
+		goto Platform_finish;
+	}
+
+	if (suspend_ops->prepare_late) {
+		error = suspend_ops->prepare_late();
+		if (error)
+			goto Platform_wake;
+	}
+
+	if (suspend_test(TEST_PLATFORM))
+		goto Platform_wake;
+
+	error = disable_nonboot_cpus();
+	if (error || suspend_test(TEST_CPUS))
+		goto Enable_cpus;
+
+	arch_suspend_disable_irqs();
+	BUG_ON(!irqs_disabled());
+
+	error = syscore_suspend();
+	if (!error) {
+		if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
+			error = suspend_ops->enter(state);	/* the platform sleeps here */
+			events_check_enabled = false;
+		}
+		syscore_resume();
+	}
+
+	arch_suspend_enable_irqs();
+	BUG_ON(irqs_disabled());
+
+ Enable_cpus:
+	enable_nonboot_cpus();	/* from here on each step undoes one taken above */
+
+ Platform_wake:
+	if (suspend_ops->wake)
+		suspend_ops->wake();
+
+	dpm_resume_noirq(PMSG_RESUME);
+
+ Platform_finish:
+	if (suspend_ops->finish)
+		suspend_ops->finish();
+
+	return error;
+}
+
+/**
+ * suspend_devices_and_enter - suspend devices and enter the desired system
+ *			       sleep state.
+ * @state:		  state to enter
+ */
+int suspend_devices_and_enter(suspend_state_t state)
+{
+	int error;
+
+	if (!suspend_ops)
+		return -ENOSYS;
+
+	trace_machine_suspend(state);
+	if (suspend_ops->begin) {
+		error = suspend_ops->begin(state);
+		if (error)
+			goto Close;
+	}
+	suspend_console();
+	suspend_test_start();
+	error = dpm_suspend_start(PMSG_SUSPEND);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to suspend\n");
+		goto Recover_platform;
+	}
+	suspend_test_finish("suspend devices");
+	if (suspend_test(TEST_DEVICES))
+		goto Recover_platform;
+
+	error = suspend_enter(state);
+
+ Resume_devices:
+	suspend_test_start();
+	dpm_resume_end(PMSG_RESUME);
+	suspend_test_finish("resume devices");
+	resume_console();
+ Close:
+	if (suspend_ops->end)
+		suspend_ops->end();
+	trace_machine_suspend(PWR_EVENT_EXIT);
+	return error;
+
+ Recover_platform:
+	if (suspend_ops->recover)
+		suspend_ops->recover();
+	goto Resume_devices;	/* then take the normal resume path */
+}
+
+/**
+ * suspend_finish - Do final work before exiting suspend sequence.
+ *
+ * Call platform code to clean up, restart processes, and free the
+ * console that we've allocated. This is not called for suspend-to-disk.
+ */
+static void suspend_finish(void)
+{
+	suspend_thaw_processes();	/* same calls as the failure path of suspend_prepare() */
+	usermodehelper_enable();
+	pm_notifier_call_chain(PM_POST_SUSPEND);
+	pm_restore_console();
+}
+
+/**
+ * enter_state - Do common work of entering low-power state.
+ * @state:		pm_state structure for state we're entering.
+ *
+ * Make sure we're the only ones trying to enter a sleep state. Fail
+ * if someone has beat us to it, since we don't want anything weird to
+ * happen when we wake up.
+ * Then, do the setup for suspend, enter the state, and cleaup (after
+ * we've woken up).
+ */
+int enter_state(suspend_state_t state)
+{
+	int error;
+
+	if (!valid_state(state))
+		return -ENODEV;
+
+	if (!mutex_trylock(&pm_mutex))
+		return -EBUSY;
+
+	/* Switch governors only as pm_mutex owner; every exit below restores them */
+#ifndef CONFIG_CPUFREQ_GOV_ON_EARLYSUPSEND//[
+	cpufreq_save_default_governor();
+	cpufreq_set_performance_governor();
+#endif //] CONFIG_CPUFREQ_GOV_ON_EARLYSUPSEND
+
+	printk(KERN_INFO "PM: Syncing filesystems ... ");
+	sys_sync();
+	printk("done.\n");
+
+	pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
+
+	error = suspend_prepare();
+	if (error)
+		goto Unlock;
+
+	if (suspend_test(TEST_FREEZER))
+		goto Finish;
+
+	pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+	pm_restrict_gfp_mask();
+	error = suspend_devices_and_enter(state);
+	pm_restore_gfp_mask();
+
+ Finish:
+	pr_debug("PM: Finishing wakeup.\n");
+	suspend_finish();
+ Unlock:
+	/* Restore before unlocking so save/restore pairs of two callers cannot interleave */
+#ifndef CONFIG_CPUFREQ_GOV_ON_EARLYSUPSEND//[
+	cpufreq_restore_default_governor();
+#endif //] CONFIG_CPUFREQ_GOV_ON_EARLYSUPSEND
+
+	mutex_unlock(&pm_mutex);
+
+	return error;
+}
+
+/**
+ * pm_suspend - Externally visible function for suspending system.
+ * @state:		Enumerated value of state to enter.
+ *
+ * Determine whether or not value is within range, get state
+ * structure, and enter (above).
+ */
+int pm_suspend(suspend_state_t state)
+{
+	if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX)
+		return enter_state(state);
+	return -EINVAL;
+}
+EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
new file mode 100644
index 00000000..25596e45
--- /dev/null
+++ b/kernel/power/suspend_test.c
@@ -0,0 +1,188 @@
+/*
+ * kernel/power/suspend_test.c - Suspend to RAM and standby test facility.
+ *
+ * Copyright (c) 2009 Pavel Machek
+ *
+ * This file is released under the GPLv2.
+ */
+
+#include
+#include
+
+#include "power.h"
+
+/*
+ * We test the system suspend code by setting an RTC wakealarm a short
+ * time in the future, then suspending.  Suspending the devices won't
+ * normally take long ... some systems only need a few milliseconds.
+ *
+ * The time it takes is system-specific though, so when we test this
+ * during system bootup we allow a LOT of time.
+ */
+#define TEST_SUSPEND_SECONDS	10
+
+static unsigned long suspend_test_start_time;
+
+void suspend_test_start(void)
+{
+	/* FIXME Use better timebase than "jiffies", ideally a clocksource.
+	 * What we want is a hardware counter that will work correctly even
+	 * during the irqs-are-off stages of the suspend/resume cycle...
+	 */
+	suspend_test_start_time = jiffies;
+}
+
+void suspend_test_finish(const char *label)
+{
+	long nj = jiffies - suspend_test_start_time;
+	unsigned msec;
+
+	msec = jiffies_to_msecs(abs(nj));
+	pr_info("PM: %s took %u.%03u seconds\n", label,
+			msec / 1000, msec % 1000);	/* %u: msec is unsigned */
+
+	/* Warning on suspend means the RTC alarm period needs to be
+	 * larger -- the system was sooo slooowwww to suspend that the
+	 * alarm (should have) fired before the system went to sleep!
+	 *
+	 * Warning on either suspend or resume also means the system
+	 * has some performance issues.  The stack dump of a WARN_ON
+	 * is more likely to get the right attention than a printk...
+	 */
+	WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
+	     "Component: %s, time: %u\n", label, msec);
+}
+
+/*
+ * To test system suspend, we need a hands-off mechanism to resume the
+ * system.  RTCs wake alarms are a common self-contained mechanism.
+ */
+
+static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
+{
+	static char err_readtime[] __initdata =
+		KERN_ERR "PM: can't read %s time, err %d\n";
+	static char err_wakealarm [] __initdata =
+		KERN_ERR "PM: can't set %s wakealarm, err %d\n";
+	static char err_suspend[] __initdata =
+		KERN_ERR "PM: suspend test failed, error %d\n";
+	static char info_test[] __initdata =
+		KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
+
+	unsigned long		now;
+	struct rtc_wkalrm	alm;
+	int			status;
+
+	/* this may fail if the RTC hasn't been initialized */
+	status = rtc_read_time(rtc, &alm.time);
+	if (status < 0) {
+		printk(err_readtime, dev_name(&rtc->dev), status);
+		return;
+	}
+	rtc_tm_to_time(&alm.time, &now);
+
+	memset(&alm, 0, sizeof alm);	/* alm.time was only scratch space for "now" */
+	rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
+	alm.enabled = true;
+
+	status = rtc_set_alarm(rtc, &alm);
+	if (status < 0) {
+		printk(err_wakealarm, dev_name(&rtc->dev), status);
+		return;
+	}
+
+	if (state == PM_SUSPEND_MEM) {
+		printk(info_test, pm_states[state]);
+		status = pm_suspend(state);
+		if (status == -ENODEV)
+			state = PM_SUSPEND_STANDBY;	/* "mem" is not supported: fall back */
+	}
+	if (state == PM_SUSPEND_STANDBY) {
+		printk(info_test, pm_states[state]);
+		status = pm_suspend(state);
+	}
+	if (status < 0)
+		printk(err_suspend, status);
+
+	/* Some platforms can't detect that the alarm triggered the
+	 * wakeup, or (accordingly) disable it afterwards.
+	 * It's supposed to give oneshot behavior; cope.
+	 */
+	alm.enabled = false;
+	rtc_set_alarm(rtc, &alm);
+}
+
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+	struct rtc_device *candidate = to_rtc_device(dev);
+
+	if (!candidate->ops->set_alarm)
+		return 0;
+	if (!device_may_wakeup(candidate->dev.parent))
+		return 0;
+
+	*(const char **)name_ptr = dev_name(dev);
+	return 1;	/* non-zero ends the class_find_device() walk */
+}
+
+/*
+ * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
+ * at startup time.  They're normally disabled, for faster boot and because
+ * we can't know which states really work on this particular system.
+ */
+static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
+
+static char warn_bad_state[] __initdata =
+	KERN_WARNING "PM: can't test '%s' suspend state\n";
+
+static int __init setup_test_suspend(char *value)
+{
+	unsigned i;
+
+	/* "=mem" ==> "mem" */
+	value++;
+	for (i = 0; i < PM_SUSPEND_MAX; i++) {
+		if (!pm_states[i])
+			continue;	/* a slot may be unnamed in this configuration */
+		if (strcmp(pm_states[i], value) != 0)
+			continue;
+		test_state = (__force suspend_state_t) i;
+		return 0;
+	}
+	printk(warn_bad_state, value);
+	return 0;
+}
+__setup("test_suspend", setup_test_suspend);
+
+static int __init test_suspend(void)
+{
+	static char warn_no_rtc[] __initdata =
+		KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
+
+	char *pony = NULL;	/* receives the device name from has_wakealarm() */
+	struct rtc_device *rtc = NULL;
+
+	/* PM is initialized by now; is that state testable? */
+	if (test_state == PM_SUSPEND_ON)
+		goto done;
+	if (!valid_state(test_state)) {
+		printk(warn_bad_state, pm_states[test_state]);
+		goto done;
+	}
+
+	/* RTCs have initialized by now too ... can we use one? */
+	class_find_device(rtc_class, NULL, &pony, has_wakealarm);
+	if (pony)
+		rtc = rtc_class_open(pony);
+	if (!rtc) {
+		printk(warn_no_rtc);
+		goto done;
+	}
+
+	/* go for it */
+	test_wakealarm(rtc, test_state);
+	rtc_class_close(rtc);
+done:
+	return 0;
+}
+late_initcall(test_suspend);
diff --git a/kernel/power/suspend_time.c b/kernel/power/suspend_time.c
new file mode 100644
index 00000000..d2a65da9
--- /dev/null
+++ b/kernel/power/suspend_time.c
@@ -0,0 +1,111 @@
+/*
+ * debugfs file to track time spent in suspend
+ *
+ * Copyright (c) 2011, Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static struct timespec suspend_time_before;
+static unsigned int time_in_suspend_bins[32];	/* slot N counts suspends with fls(seconds) == N */
+
+#ifdef CONFIG_DEBUG_FS
+static int suspend_time_debug_show(struct seq_file *s, void *data)
+{
+	int bin;
+	seq_printf(s, "time (secs) count\n");
+	seq_printf(s, "------------------\n");
+	for (bin = 0; bin < 32; bin++) {
+		if (time_in_suspend_bins[bin] == 0)
+			continue;
+		seq_printf(s, "%4d - %4d %4u\n",
+			bin ? 1 << (bin - 1) : 0, 1 << bin,
+				time_in_suspend_bins[bin]);
+	}
+	return 0;
+}
+
+static int suspend_time_debug_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, suspend_time_debug_show, NULL);
+}
+
+static const struct file_operations suspend_time_debug_fops = {
+	.open		= suspend_time_debug_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init suspend_time_debug_init(void)
+{
+	struct dentry *d;
+
+	d = debugfs_create_file("suspend_time", 0755, NULL, NULL,
+		&suspend_time_debug_fops);
+	if (!d) {
+		pr_err("Failed to create suspend_time debug file\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+late_initcall(suspend_time_debug_init);
+#endif
+
+static int suspend_time_syscore_suspend(void)
+{
+	read_persistent_clock(&suspend_time_before);
+
+	return 0;
+}
+
+static void suspend_time_syscore_resume(void)
+{
+	struct timespec after;
+	int bin;	/* fls() yields 0..32 but only slots 0..31 exist */
+
+	read_persistent_clock(&after);
+	after = timespec_sub(after, suspend_time_before);
+
+	bin = fls(after.tv_sec);
+	time_in_suspend_bins[bin < 32 ? bin : 31]++;	/* clamp, e.g. if the clock went backwards */
+	pr_info("Suspended for %lu.%03lu seconds\n", after.tv_sec,
+		after.tv_nsec / NSEC_PER_MSEC);
+}
+
+static struct syscore_ops suspend_time_syscore_ops = {
+	.suspend = suspend_time_syscore_suspend,
+	.resume = suspend_time_syscore_resume,
+};
+
+static int suspend_time_syscore_init(void)
+{
+	register_syscore_ops(&suspend_time_syscore_ops);
+
+	return 0;
+}
+
+static void suspend_time_syscore_exit(void)
+{
+	unregister_syscore_ops(&suspend_time_syscore_ops);
+}
+module_init(suspend_time_syscore_init);
+module_exit(suspend_time_syscore_exit);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
new file mode 100644
index 00000000..7c97c3a0
--- /dev/null
+++ b/kernel/power/swap.c
@@ -0,0 +1,989 @@
+/*
+ * linux/kernel/power/swap.c
+ *
+ * This file provides functions for reading the suspend image from
+ * and writing it to a swap partition.
+ *
+ * Copyright (C) 1998,2001-2005 Pavel Machek
+ * Copyright (C) 2006 Rafael J. Wysocki
+ * Copyright (C) 2010 Bojan Smojver
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "power.h"
+
+#define HIBERNATE_SIG	"S1SUSPEND"
+
+/*
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to a swap partition.  It consists of many swap_map_page
+ *	structures that contain each an array of MAP_PAGE_ENTRIES swap entries.
+ *	These structures are stored on the swap and linked together with the
+ *	help of the .next_swap member.
+ *
+ *	The swap map is created during suspend.  The swap map pages are
+ *	allocated and populated one at a time, so we only need one memory
+ *	page to set up the entire structure.
+ *
+ *	During resume we also only need to use one swap_map_page structure
+ *	at a time.
+ */
+
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(sector_t) - 1)
+
+struct swap_map_page {
+	sector_t entries[MAP_PAGE_ENTRIES];
+	sector_t next_swap;	/* sector of the next swap_map_page */
+};
+
+/**
+ *	The swap_map_handle structure is used for handling swap in
+ *	a file-alike way
+ */
+
+struct swap_map_handle {
+	struct swap_map_page *cur;	/* map page being filled in memory */
+	sector_t cur_swap;	/* where @cur will be written */
+	sector_t first_sector;	/* first map page; stored in the swsusp header */
+	unsigned int k;	/* next free index in cur->entries */
+};
+
+struct swsusp_header {
+	char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
+	sector_t image;
+	unsigned int flags;	/* Flags to pass to the "boot" kernel */
+	char	orig_sig[10];
+	char	sig[10];
+} __attribute__((packed));
+
+static struct swsusp_header *swsusp_header;
+
+/**
+ *	The following functions are used for tracing the allocated
+ *	swap pages, so that they can be freed in case of an error.
+ */
+
+struct swsusp_extent {
+	struct rb_node node;
+	unsigned long start;	/* first swap offset of the extent (inclusive) */
+	unsigned long end;	/* last swap offset of the extent (inclusive) */
+};
+
+static struct rb_root swsusp_extents = RB_ROOT;
+
+static int swsusp_extents_insert(unsigned long swap_offset)
+{
+	struct rb_node **new = &(swsusp_extents.rb_node);
+	struct rb_node *parent = NULL;
+	struct swsusp_extent *ext;
+
+	/* Figure out where to put the new node */
+	while (*new) {
+		ext = container_of(*new, struct swsusp_extent, node);
+		parent = *new;
+		if (swap_offset < ext->start) {
+			/* Try to merge */
+			if (swap_offset == ext->start - 1) {
+				ext->start--;
+				return 0;
+			}
+			new = &((*new)->rb_left);
+		} else if (swap_offset > ext->end) {
+			/* Try to merge */
+			if (swap_offset == ext->end + 1) {
+				ext->end++;
+				return 0;
+			}
+			new = &((*new)->rb_right);
+		} else {
+			/* It already is in the tree */
+			return -EINVAL;
+		}
+	}
+	/* Add the new node and rebalance the tree. */
+	ext = kzalloc(sizeof(struct swsusp_extent), GFP_KERNEL);
+	if (!ext)
+		return -ENOMEM;
+
+	ext->start = swap_offset;
+	ext->end = swap_offset;
+	rb_link_node(&ext->node, parent, new);
+	rb_insert_color(&ext->node, &swsusp_extents);
+	return 0;
+}
+
+/**
+ *	alloc_swapdev_block - allocate a swap page and register that it has
+ *	been allocated, so that it can be freed in case of an error.
+ */
+
+sector_t alloc_swapdev_block(int swap)
+{
+	unsigned long offset;
+
+	offset = swp_offset(get_swap_page_of_type(swap));
+	if (offset) {
+		if (swsusp_extents_insert(offset))
+			swap_free(swp_entry(swap, offset));	/* could not track it: give it back */
+		else
+			return swapdev_block(swap, offset);
+	}
+	return 0;	/* 0 means "no block"; write_page() turns it into -ENOSPC */
+}
+
+/**
+ *	free_all_swap_pages - free swap pages allocated for saving image data.
+ *	It also frees the extents used to register which swap entries had been
+ *	allocated.
+ */
+
+void free_all_swap_pages(int swap)
+{
+	struct rb_node *node;
+
+	while ((node = swsusp_extents.rb_node)) {
+		struct swsusp_extent *ext;
+		unsigned long offset;
+
+		ext = container_of(node, struct swsusp_extent, node);
+		rb_erase(node, &swsusp_extents);
+		for (offset = ext->start; offset <= ext->end; offset++)
+			swap_free(swp_entry(swap, offset));
+
+		kfree(ext);
+	}
+}
+
+int swsusp_swap_in_use(void)
+{
+	return (swsusp_extents.rb_node != NULL);
+}
+
+/*
+ * General things
+ */
+
+static unsigned short root_swap = 0xffff;
+struct block_device *hib_resume_bdev;
+
+/*
+ * Saving part
+ */
+
+static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
+{
+	int error;
+
+	hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);	/* NOTE(review): read result is not checked */
+	if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
+	    !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
+		memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
+		memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
+		swsusp_header->image = handle->first_sector;
+		swsusp_header->flags = flags;
+		error = hib_bio_write_page(swsusp_resume_block,
+					swsusp_header, NULL);
+	} else {
+		printk(KERN_ERR "PM: Swap header not found!\n");
+		error = -ENODEV;
+	}
+	return error;
+}
+
+/**
+ *	swsusp_swap_check - check if the resume device is a swap device
+ *	and get its index (if so)
+ *
+ *	This is called before saving image
+ */
+static int swsusp_swap_check(void)
+{
+	int res;
+
+	res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
+			&hib_resume_bdev);
+	if (res < 0)
+		return res;
+
+	root_swap = res;
+	res = blkdev_get(hib_resume_bdev, FMODE_WRITE, NULL);
+	if (res)
+		return res;
+
+	res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
+	if (res < 0)
+		blkdev_put(hib_resume_bdev, FMODE_WRITE);
+
+	return res;
+}
+
+/**
+ *	write_page - Write one page to given swap location.
+ *	@buf:		Address we're writing.
+ *	@offset:	Offset of the swap page we're writing to.
+ *	@bio_chain:	Link the next write BIO here
+ */
+
+static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
+{
+	void *src;
+
+	if (!offset)
+		return -ENOSPC;
+
+	if (bio_chain) {
+		src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+		if (src) {
+			copy_page(src, buf);	/* asynchronous write works on a private copy */
+		} else {
+			WARN_ON_ONCE(1);
+			bio_chain = NULL;	/* Go synchronous */
+			src = buf;
+		}
+	} else {
+		src = buf;
+	}
+	return hib_bio_write_page(offset, src, bio_chain);
+}
+
+static void release_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+	int ret;
+
+	ret = swsusp_swap_check();
+	if (ret) {
+		if (ret != -ENOSPC)
+			printk(KERN_ERR "PM: Cannot find swap device, try "
+					"swapon -a.\n");
+		return ret;
+	}
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+	if (!handle->cur) {
+		ret = -ENOMEM;
+		goto err_close;
+	}
+	handle->cur_swap = alloc_swapdev_block(root_swap);
+	if (!handle->cur_swap) {
+		ret = -ENOSPC;
+		goto err_rel;
+	}
+	handle->k = 0;
+	handle->first_sector = handle->cur_swap;
+	return 0;
+err_rel:
+	release_swap_writer(handle);
+err_close:
+	swsusp_close(FMODE_WRITE);
+	return ret;
+}
+
+static int swap_write_page(struct swap_map_handle *handle, void *buf,
+				struct bio **bio_chain)
+{
+	int error = 0;
+	sector_t offset;
+
+	if (!handle->cur)
+		return -EINVAL;
+	offset = alloc_swapdev_block(root_swap);
+	error = write_page(buf, offset, bio_chain);
+	if (error)
+		return error;
+	handle->cur->entries[handle->k++] = offset;
+	if (handle->k >= MAP_PAGE_ENTRIES) {	/* map page is full: chain a new one */
+		error = hib_wait_on_bio_chain(bio_chain);
+		if (error)
+			goto out;
+		offset = alloc_swapdev_block(root_swap);
+		if (!offset)
+			return -ENOSPC;
+		handle->cur->next_swap = offset;
+		error = write_page(handle->cur, handle->cur_swap, NULL);
+		if (error)
+			goto out;
+		clear_page(handle->cur);
+		handle->cur_swap = offset;
+		handle->k = 0;
+	}
+ out:
+	return error;
+}
+
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur && handle->cur_swap)
+		return write_page(handle->cur, handle->cur_swap, NULL);
+	else
+		return -EINVAL;
+}
+
+static int swap_writer_finish(struct swap_map_handle *handle,
+		unsigned int flags, int error)
+{
+	if (!error)
+		error = flush_swap_writer(handle);	/* a failed flush must not be marked as a valid image */
+	if (!error) {
+		printk(KERN_INFO "PM: S");
+		error = mark_swapfiles(handle, flags);
+		printk("|\n");
+	}
+	if (error)
+		free_all_swap_pages(root_swap);
+	release_swap_writer(handle);
+	swsusp_close(FMODE_WRITE);
+
+	return error;
+}
+
+/* We need to remember how much compressed data we need to read. */
+#define LZO_HEADER	sizeof(size_t)
+
+/* Number of pages/bytes we'll compress at one time. */
+#define LZO_UNC_PAGES	32
+#define LZO_UNC_SIZE	(LZO_UNC_PAGES * PAGE_SIZE)
+
+/* Number of pages/bytes we need for compressed data (worst case).
+ */ +#define LZO_CMP_PAGES DIV_ROUND_UP(lzo1x_worst_compress(LZO_UNC_SIZE) + \ + LZO_HEADER, PAGE_SIZE) +#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) + +/** + * save_image - save the suspend image data + */ + +static int save_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret; + int nr_pages; + int err2; + struct bio *bio; + struct timeval start; + struct timeval stop; + + printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", + nr_to_write); + m = nr_to_write / 100; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + while (1) { + ret = snapshot_read_next(snapshot); + if (ret <= 0) + break; + ret = swap_write_page(handle, data_of(*snapshot), &bio); + if (ret) + break; + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + err2 = hib_wait_on_bio_chain(&bio); + do_gettimeofday(&stop); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_CONT "\b\b\b\bdone\n"); + else + printk(KERN_CONT "\n"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + return ret; +} + + +/** + * save_image_lzo - Save the suspend image data compressed with LZO. + * @handle: Swap map handle to use for saving the image. + * @snapshot: Image to read data from. + * @nr_to_write: Number of pages to save. 
+ */ +static int save_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_write) +{ + unsigned int m; + int ret = 0; + int nr_pages; + int err2; + struct bio *bio; + struct timeval start; + struct timeval stop; + size_t off, unc_len, cmp_len; + unsigned char *unc, *cmp, *wrk, *page; + + page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + return -ENOMEM; + } + + wrk = vmalloc(LZO1X_1_MEM_COMPRESS); + if (!wrk) { + printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); + free_page((unsigned long)page); + return -ENOMEM; + } + + unc = vmalloc(LZO_UNC_SIZE); + if (!unc) { + printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); + vfree(wrk); + free_page((unsigned long)page); + return -ENOMEM; + } + + cmp = vmalloc(LZO_CMP_SIZE); + if (!cmp) { + printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + vfree(unc); + vfree(wrk); + free_page((unsigned long)page); + return -ENOMEM; + } + + printk(KERN_INFO + "PM: Compressing and saving image data (%u pages) ... 
", + nr_to_write); + m = nr_to_write / 100; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + for (;;) { + for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { + ret = snapshot_read_next(snapshot); + if (ret < 0) + goto out_finish; + + if (!ret) + break; + + memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); + + if (!(nr_pages % m)) + printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + + if (!off) + break; + + unc_len = off; + ret = lzo1x_1_compress(unc, unc_len, + cmp + LZO_HEADER, &cmp_len, wrk); + if (ret < 0) { + printk(KERN_ERR "PM: LZO compression failed\n"); + break; + } + + if (unlikely(!cmp_len || + cmp_len > lzo1x_worst_compress(unc_len))) { + printk(KERN_ERR "PM: Invalid LZO compressed length\n"); + ret = -1; + break; + } + + *(size_t *)cmp = cmp_len; + + /* + * Given we are writing one page at a time to disk, we copy + * that much from the buffer, although the last bit will likely + * be smaller than full page. This is OK - we saved the length + * of the compressed data, so any garbage at the end will be + * discarded when we read it. + */ + for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) { + memcpy(page, cmp + off, PAGE_SIZE); + + ret = swap_write_page(handle, page, &bio); + if (ret) + goto out_finish; + } + } + +out_finish: + err2 = hib_wait_on_bio_chain(&bio); + do_gettimeofday(&stop); + if (!ret) + ret = err2; + if (!ret) + printk(KERN_CONT "\b\b\b\bdone\n"); + else + printk(KERN_CONT "\n"); + swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); + + vfree(cmp); + vfree(unc); + vfree(wrk); + free_page((unsigned long)page); + + return ret; +} + +/** + * enough_swap - Make sure we have enough swap to save the image. + * + * Returns TRUE or FALSE after checking the total amount of swap + * space available from the resume partition. 
+ */ + +static int enough_swap(unsigned int nr_pages, unsigned int flags) +{ + unsigned int free_swap = count_swap_pages(root_swap, 1); + unsigned int required; + + pr_debug("PM: Free swap pages: %u\n", free_swap); + + required = PAGES_FOR_IO + ((flags & SF_NOCOMPRESS_MODE) ? + nr_pages : (nr_pages * LZO_CMP_PAGES) / LZO_UNC_PAGES + 1); + return free_swap > required; +} + +/** + * swsusp_write - Write entire image and metadata. + * @flags: flags to pass to the "boot" kernel in the image header + * + * It is important _NOT_ to umount filesystems at this point. We want + * them synced (in case something goes wrong) but we DO not want to mark + * filesystem clean: it is not. (And it does not matter, if we resume + * correctly, we'll mark system clean, anyway.) + */ + +int swsusp_write(unsigned int flags) +{ + struct swap_map_handle handle; + struct snapshot_handle snapshot; + struct swsusp_info *header; + unsigned long pages; + int error; + + pages = snapshot_get_image_size(); + error = get_swap_writer(&handle); + if (error) { + printk(KERN_ERR "PM: Cannot get swap writer\n"); + return error; + } + if (!enough_swap(pages, flags)) { + printk(KERN_ERR "PM: Not enough free swap\n"); + error = -ENOSPC; + goto out_finish; + } + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_read_next(&snapshot); + if (error < PAGE_SIZE) { + if (error >= 0) + error = -EFAULT; + + goto out_finish; + } + header = (struct swsusp_info *)data_of(snapshot); + error = swap_write_page(&handle, header, NULL); + if (!error) { + error = (flags & SF_NOCOMPRESS_MODE) ? 
+ save_image(&handle, &snapshot, pages - 1) : + save_image_lzo(&handle, &snapshot, pages - 1); + } +out_finish: + error = swap_writer_finish(&handle, flags, error); + return error; +} + +/** + * The following functions allow us to read data using a swap map + * in a file-alike way + */ + +static void release_swap_reader(struct swap_map_handle *handle) +{ + if (handle->cur) + free_page((unsigned long)handle->cur); + handle->cur = NULL; +} + +static int get_swap_reader(struct swap_map_handle *handle, + unsigned int *flags_p) +{ + int error; + + *flags_p = swsusp_header->flags; + + if (!swsusp_header->image) /* how can this happen? */ + return -EINVAL; + + handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); + if (!handle->cur) + return -ENOMEM; + + error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); + if (error) { + release_swap_reader(handle); + return error; + } + handle->k = 0; + return 0; +} + +static int swap_read_page(struct swap_map_handle *handle, void *buf, + struct bio **bio_chain) +{ + sector_t offset; + int error; + + if (!handle->cur) + return -EINVAL; + offset = handle->cur->entries[handle->k]; + if (!offset) + return -EFAULT; + error = hib_bio_read_page(offset, buf, bio_chain); + if (error) + return error; + if (++handle->k >= MAP_PAGE_ENTRIES) { + error = hib_wait_on_bio_chain(bio_chain); + handle->k = 0; + offset = handle->cur->next_swap; + if (!offset) + release_swap_reader(handle); + else if (!error) + error = hib_bio_read_page(offset, handle->cur, NULL); + } + return error; +} + +static int swap_reader_finish(struct swap_map_handle *handle) +{ + release_swap_reader(handle); + + return 0; +} + +/** + * load_image - load the image using the swap map handle + * @handle and the snapshot handle @snapshot + * (assume there are @nr_to_read pages to load) + */ + +static int load_image(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int error = 0; 
+ struct timeval start; + struct timeval stop; + struct bio *bio; + int err2; + unsigned nr_pages; + + printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", + nr_to_read); + m = nr_to_read / 100; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + for ( ; ; ) { + error = snapshot_write_next(snapshot); + if (error <= 0) + break; + error = swap_read_page(handle, data_of(*snapshot), &bio); + if (error) + break; + if (snapshot->sync_read) + error = hib_wait_on_bio_chain(&bio); + if (error) + break; + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + } + err2 = hib_wait_on_bio_chain(&bio); + do_gettimeofday(&stop); + if (!error) + error = err2; + if (!error) { + printk("\b\b\b\bdone\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + error = -ENODATA; + } else + printk("\n"); + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + return error; +} + +/** + * load_image_lzo - Load compressed image data and decompress them with LZO. + * @handle: Swap map handle to use for loading data. + * @snapshot: Image to copy uncompressed data into. + * @nr_to_read: Number of pages to load. 
+ */ +static int load_image_lzo(struct swap_map_handle *handle, + struct snapshot_handle *snapshot, + unsigned int nr_to_read) +{ + unsigned int m; + int error = 0; + struct bio *bio; + struct timeval start; + struct timeval stop; + unsigned nr_pages; + size_t i, off, unc_len, cmp_len; + unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; + + for (i = 0; i < LZO_CMP_PAGES; i++) { + page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + if (!page[i]) { + printk(KERN_ERR "PM: Failed to allocate LZO page\n"); + + while (i) + free_page((unsigned long)page[--i]); + + return -ENOMEM; + } + } + + unc = vmalloc(LZO_UNC_SIZE); + if (!unc) { + printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); + + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); + + return -ENOMEM; + } + + cmp = vmalloc(LZO_CMP_SIZE); + if (!cmp) { + printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); + + vfree(unc); + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); + + return -ENOMEM; + } + + printk(KERN_INFO + "PM: Loading and decompressing image data (%u pages) ... 
", + nr_to_read); + m = nr_to_read / 100; + if (!m) + m = 1; + nr_pages = 0; + bio = NULL; + do_gettimeofday(&start); + + error = snapshot_write_next(snapshot); + if (error <= 0) + goto out_finish; + + for (;;) { + error = swap_read_page(handle, page[0], NULL); /* sync */ + if (error) + break; + + cmp_len = *(size_t *)page[0]; + if (unlikely(!cmp_len || + cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { + printk(KERN_ERR "PM: Invalid LZO compressed length\n"); + error = -1; + break; + } + + for (off = PAGE_SIZE, i = 1; + off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { + error = swap_read_page(handle, page[i], &bio); + if (error) + goto out_finish; + } + + error = hib_wait_on_bio_chain(&bio); /* need all data now */ + if (error) + goto out_finish; + + for (off = 0, i = 0; + off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { + memcpy(cmp + off, page[i], PAGE_SIZE); + } + + unc_len = LZO_UNC_SIZE; + error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, + unc, &unc_len); + if (error < 0) { + printk(KERN_ERR "PM: LZO decompression failed\n"); + break; + } + + if (unlikely(!unc_len || + unc_len > LZO_UNC_SIZE || + unc_len & (PAGE_SIZE - 1))) { + printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); + error = -1; + break; + } + + for (off = 0; off < unc_len; off += PAGE_SIZE) { + memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); + + if (!(nr_pages % m)) + printk("\b\b\b\b%3d%%", nr_pages / m); + nr_pages++; + + error = snapshot_write_next(snapshot); + if (error <= 0) + goto out_finish; + } + } + +out_finish: + do_gettimeofday(&stop); + if (!error) { + printk("\b\b\b\bdone\n"); + snapshot_write_finalize(snapshot); + if (!snapshot_image_loaded(snapshot)) + error = -ENODATA; + } else + printk("\n"); + swsusp_show_speed(&start, &stop, nr_to_read, "Read"); + + vfree(cmp); + vfree(unc); + for (i = 0; i < LZO_CMP_PAGES; i++) + free_page((unsigned long)page[i]); + + return error; +} + +/** + * swsusp_read - read the hibernation image. 
+ * @flags_p: flags passed by the "frozen" kernel in the image header should + * be written into this memory location + */ + +int swsusp_read(unsigned int *flags_p) +{ + int error; + struct swap_map_handle handle; + struct snapshot_handle snapshot; + struct swsusp_info *header; + + memset(&snapshot, 0, sizeof(struct snapshot_handle)); + error = snapshot_write_next(&snapshot); + if (error < PAGE_SIZE) + return error < 0 ? error : -EFAULT; + header = (struct swsusp_info *)data_of(snapshot); + error = get_swap_reader(&handle, flags_p); + if (error) + goto end; + if (!error) + error = swap_read_page(&handle, header, NULL); + if (!error) { + error = (*flags_p & SF_NOCOMPRESS_MODE) ? + load_image(&handle, &snapshot, header->pages - 1) : + load_image_lzo(&handle, &snapshot, header->pages - 1); + } + swap_reader_finish(&handle); +end: + if (!error) + pr_debug("PM: Image successfully loaded\n"); + else + pr_debug("PM: Error %d resuming\n", error); + return error; +} + +/** + * swsusp_check - Check for swsusp signature in the resume device + */ + +int swsusp_check(void) +{ + int error; + + hib_resume_bdev = blkdev_get_by_dev(swsusp_resume_device, + FMODE_READ, NULL); + if (!IS_ERR(hib_resume_bdev)) { + set_blocksize(hib_resume_bdev, PAGE_SIZE); + clear_page(swsusp_header); + error = hib_bio_read_page(swsusp_resume_block, + swsusp_header, NULL); + if (error) + goto put; + + if (!memcmp(HIBERNATE_SIG, swsusp_header->sig, 10)) { + memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); + /* Reset swap signature now */ + error = hib_bio_write_page(swsusp_resume_block, + swsusp_header, NULL); + } else { + error = -EINVAL; + } + +put: + if (error) + blkdev_put(hib_resume_bdev, FMODE_READ); + else + pr_debug("PM: Image signature found, resuming\n"); + } else { + error = PTR_ERR(hib_resume_bdev); + } + + if (error) + pr_debug("PM: Image not found (code %d)\n", error); + + return error; +} + +/** + * swsusp_close - close swap device. 
+ */ + +void swsusp_close(fmode_t mode) +{ + if (IS_ERR(hib_resume_bdev)) { + pr_debug("PM: Image device not initialised\n"); + return; + } + + blkdev_put(hib_resume_bdev, mode); +} + +static int swsusp_header_init(void) +{ + swsusp_header = (struct swsusp_header*) __get_free_page(GFP_KERNEL); + if (!swsusp_header) + panic("Could not allocate memory for swsusp_header\n"); + return 0; +} + +core_initcall(swsusp_header_init); diff --git a/kernel/power/user.c b/kernel/power/user.c new file mode 100644 index 00000000..42ddbc6f --- /dev/null +++ b/kernel/power/user.c @@ -0,0 +1,486 @@ +/* + * linux/kernel/power/user.c + * + * This file provides the user space interface for software suspend/resume. + * + * Copyright (C) 2006 Rafael J. Wysocki + * + * This file is released under the GPLv2. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "power.h" + +/* + * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and + * will be removed in the future. They are only preserved here for + * compatibility with existing userland utilities. + */ +#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) +#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) + +#define PMOPS_PREPARE 1 +#define PMOPS_ENTER 2 +#define PMOPS_FINISH 3 + +/* + * NOTE: The following ioctl definitions are wrong and have been replaced with + * correct ones. They are only preserved here for compatibility with existing + * userland utilities and will be removed in the future. 
+ */ +#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) +#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) +#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) +#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) + + +#define SNAPSHOT_MINOR 231 + +static struct snapshot_data { + struct snapshot_handle handle; + int swap; + int mode; + char frozen; + char ready; + char platform_support; +} snapshot_state; + +atomic_t snapshot_device_available = ATOMIC_INIT(1); + +static int snapshot_open(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + int error; + + mutex_lock(&pm_mutex); + + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { + error = -EBUSY; + goto Unlock; + } + + if ((filp->f_flags & O_ACCMODE) == O_RDWR) { + atomic_inc(&snapshot_device_available); + error = -ENOSYS; + goto Unlock; + } + if(create_basic_memory_bitmaps()) { + atomic_inc(&snapshot_device_available); + error = -ENOMEM; + goto Unlock; + } + nonseekable_open(inode, filp); + data = &snapshot_state; + filp->private_data = data; + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { + /* Hibernating. The image device should be accessible. */ + data->swap = swsusp_resume_device ? + swap_type_of(swsusp_resume_device, 0, NULL) : -1; + data->mode = O_RDONLY; + error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); + if (error) + pm_notifier_call_chain(PM_POST_HIBERNATION); + } else { + /* + * Resuming. We may need to wait for the image device to + * appear. 
+ */ + wait_for_device_probe(); + scsi_complete_async_scans(); + + data->swap = -1; + data->mode = O_WRONLY; + error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (error) + pm_notifier_call_chain(PM_POST_RESTORE); + } + if (error) { + free_basic_memory_bitmaps(); + atomic_inc(&snapshot_device_available); + } + data->frozen = 0; + data->ready = 0; + data->platform_support = 0; + + Unlock: + mutex_unlock(&pm_mutex); + + return error; +} + +static int snapshot_release(struct inode *inode, struct file *filp) +{ + struct snapshot_data *data; + + mutex_lock(&pm_mutex); + + swsusp_free(); + free_basic_memory_bitmaps(); + data = filp->private_data; + free_all_swap_pages(data->swap); + if (data->frozen) { + pm_restore_gfp_mask(); + thaw_processes(); + } + pm_notifier_call_chain(data->mode == O_RDONLY ? + PM_POST_HIBERNATION : PM_POST_RESTORE); + atomic_inc(&snapshot_device_available); + + mutex_unlock(&pm_mutex); + + return 0; +} + +static ssize_t snapshot_read(struct file *filp, char __user *buf, + size_t count, loff_t *offp) +{ + struct snapshot_data *data; + ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; + + mutex_lock(&pm_mutex); + + data = filp->private_data; + if (!data->ready) { + res = -ENODATA; + goto Unlock; + } + if (!pg_offp) { /* on page boundary? 
*/ + res = snapshot_read_next(&data->handle); + if (res <= 0) + goto Unlock; + } else { + res = PAGE_SIZE - pg_offp; + } + + res = simple_read_from_buffer(buf, count, &pg_offp, + data_of(data->handle), res); + if (res > 0) + *offp += res; + + Unlock: + mutex_unlock(&pm_mutex); + + return res; +} + +static ssize_t snapshot_write(struct file *filp, const char __user *buf, + size_t count, loff_t *offp) +{ + struct snapshot_data *data; + ssize_t res; + loff_t pg_offp = *offp & ~PAGE_MASK; + + mutex_lock(&pm_mutex); + + data = filp->private_data; + + if (!pg_offp) { + res = snapshot_write_next(&data->handle); + if (res <= 0) + goto unlock; + } else { + res = PAGE_SIZE - pg_offp; + } + + res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp, + buf, count); + if (res > 0) + *offp += res; +unlock: + mutex_unlock(&pm_mutex); + + return res; +} + +static void snapshot_deprecated_ioctl(unsigned int cmd) +{ + if (printk_ratelimit()) + printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " + "be removed soon, update your suspend-to-disk " + "utilities\n", + __builtin_return_address(0), cmd); +} + +static long snapshot_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + int error = 0; + struct snapshot_data *data; + loff_t size; + sector_t offset; + + if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) + return -ENOTTY; + if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR) + return -ENOTTY; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!mutex_trylock(&pm_mutex)) + return -EBUSY; + + data = filp->private_data; + + switch (cmd) { + + case SNAPSHOT_FREEZE: + if (data->frozen) + break; + + printk("Syncing filesystems ... 
"); + sys_sync(); + printk("done.\n"); + + error = usermodehelper_disable(); + if (error) + break; + + error = freeze_processes(); + if (error) { + thaw_processes(); + usermodehelper_enable(); + } + if (!error) + data->frozen = 1; + break; + + case SNAPSHOT_UNFREEZE: + if (!data->frozen || data->ready) + break; + pm_restore_gfp_mask(); + thaw_processes(); + usermodehelper_enable(); + data->frozen = 0; + break; + + case SNAPSHOT_ATOMIC_SNAPSHOT: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_CREATE_IMAGE: + if (data->mode != O_RDONLY || !data->frozen || data->ready) { + error = -EPERM; + break; + } + pm_restore_gfp_mask(); + error = hibernation_snapshot(data->platform_support); + if (!error) + error = put_user(in_suspend, (int __user *)arg); + if (!error) + data->ready = 1; + break; + + case SNAPSHOT_ATOMIC_RESTORE: + snapshot_write_finalize(&data->handle); + if (data->mode != O_WRONLY || !data->frozen || + !snapshot_image_loaded(&data->handle)) { + error = -EPERM; + break; + } + error = hibernation_restore(data->platform_support); + break; + + case SNAPSHOT_FREE: + swsusp_free(); + memset(&data->handle, 0, sizeof(struct snapshot_handle)); + data->ready = 0; + break; + + case SNAPSHOT_SET_IMAGE_SIZE: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_PREF_IMAGE_SIZE: + image_size = arg; + break; + + case SNAPSHOT_GET_IMAGE_SIZE: + if (!data->ready) { + error = -ENODATA; + break; + } + size = snapshot_get_image_size(); + size <<= PAGE_SHIFT; + error = put_user(size, (loff_t __user *)arg); + break; + + case SNAPSHOT_AVAIL_SWAP: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_AVAIL_SWAP_SIZE: + size = count_swap_pages(data->swap, 1); + size <<= PAGE_SHIFT; + error = put_user(size, (loff_t __user *)arg); + break; + + case SNAPSHOT_GET_SWAP_PAGE: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_ALLOC_SWAP_PAGE: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + offset = alloc_swapdev_block(data->swap); + if (offset) { + offset 
<<= PAGE_SHIFT; + error = put_user(offset, (loff_t __user *)arg); + } else { + error = -ENOSPC; + } + break; + + case SNAPSHOT_FREE_SWAP_PAGES: + if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { + error = -ENODEV; + break; + } + free_all_swap_pages(data->swap); + break; + + case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ + snapshot_deprecated_ioctl(cmd); + if (!swsusp_swap_in_use()) { + /* + * User space encodes device types as two-byte values, + * so we need to recode them + */ + if (old_decode_dev(arg)) { + data->swap = swap_type_of(old_decode_dev(arg), + 0, NULL); + if (data->swap < 0) + error = -ENODEV; + } else { + data->swap = -1; + error = -EINVAL; + } + } else { + error = -EPERM; + } + break; + + case SNAPSHOT_S2RAM: + if (!data->frozen) { + error = -EPERM; + break; + } + /* + * Tasks are frozen and the notifiers have been called with + * PM_HIBERNATION_PREPARE + */ + error = suspend_devices_and_enter(PM_SUSPEND_MEM); + data->ready = 0; + break; + + case SNAPSHOT_PLATFORM_SUPPORT: + data->platform_support = !!arg; + break; + + case SNAPSHOT_POWER_OFF: + if (data->platform_support) + error = hibernation_platform_enter(); + break; + + case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ + snapshot_deprecated_ioctl(cmd); + error = -EINVAL; + + switch (arg) { + + case PMOPS_PREPARE: + data->platform_support = 1; + error = 0; + break; + + case PMOPS_ENTER: + if (data->platform_support) + error = hibernation_platform_enter(); + break; + + case PMOPS_FINISH: + if (data->platform_support) + error = 0; + break; + + default: + printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); + + } + break; + + case SNAPSHOT_SET_SWAP_AREA: + if (swsusp_swap_in_use()) { + error = -EPERM; + } else { + struct resume_swap_area swap_area; + dev_t swdev; + + error = copy_from_user(&swap_area, (void __user *)arg, + sizeof(struct resume_swap_area)); + if (error) { + error = -EFAULT; + break; + } + + /* + * User space encodes device types as two-byte values, + * 
so we need to recode them + */ + swdev = new_decode_dev(swap_area.dev); + if (swdev) { + offset = swap_area.offset; + data->swap = swap_type_of(swdev, offset, NULL); + if (data->swap < 0) + error = -ENODEV; + } else { + data->swap = -1; + error = -EINVAL; + } + } + break; + + default: + error = -ENOTTY; + + } + + mutex_unlock(&pm_mutex); + + return error; +} + +static const struct file_operations snapshot_fops = { + .open = snapshot_open, + .release = snapshot_release, + .read = snapshot_read, + .write = snapshot_write, + .llseek = no_llseek, + .unlocked_ioctl = snapshot_ioctl, +}; + +static struct miscdevice snapshot_device = { + .minor = SNAPSHOT_MINOR, + .name = "snapshot", + .fops = &snapshot_fops, +}; + +static int __init snapshot_device_init(void) +{ + return misc_register(&snapshot_device); +}; + +device_initcall(snapshot_device_init); diff --git a/kernel/power/userwakelock.c b/kernel/power/userwakelock.c new file mode 100644 index 00000000..a28a8db4 --- /dev/null +++ b/kernel/power/userwakelock.c @@ -0,0 +1,219 @@ +/* kernel/power/userwakelock.c + * + * Copyright (C) 2005-2008 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+#include "power.h"
+
+/* Bits of debug_mask; each bit enables one class of log message below. */
+enum {
+	DEBUG_FAILURE	= BIT(0),	/* allocation failures */
+	DEBUG_ERROR	= BIT(1),	/* bad arguments, lock not found */
+	DEBUG_NEW	= BIT(2),	/* creation of a new user wake lock */
+	DEBUG_ACCESS	= BIT(3),	/* every lock/unlock request */
+	DEBUG_LOOKUP	= BIT(4),	/* every rbtree comparison in a lookup */
+};
+static int debug_mask = DEBUG_FAILURE;
+module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
+
+/* Held by every handler below while it walks or modifies user_wake_locks. */
+static DEFINE_MUTEX(tree_lock);
+
+/* A wake lock created on request through the store handlers, keyed by name. */
+struct user_wake_lock {
+	struct rb_node		node;
+	struct wake_lock	wake_lock;
+	char			name[];	/* NUL-terminated, allocated with the struct */
+};
+struct rb_root user_wake_locks;
+
+/*
+ * Parse "name [timeout]" from @buf and look the name up in user_wake_locks.
+ *
+ * @buf:        lock name, terminated by whitespace or NUL, optionally
+ *              followed by a timeout in nanoseconds
+ * @allocate:   when non-zero, a missing lock is created and inserted
+ * @timeoutptr: when non-NULL, receives the timeout rounded up to at least
+ *              one jiffy, or 0 if no timeout was given; when NULL, any
+ *              text after the name is rejected
+ *
+ * Returns the lock, ERR_PTR(-EINVAL) for an empty name, trailing garbage or
+ * a lock that does not exist (with @allocate == 0), or ERR_PTR(-ENOMEM).
+ * Both callers in this file hold tree_lock around the call.
+ */
+static struct user_wake_lock *lookup_wake_lock_name(
+	const char *buf, int allocate, long *timeoutptr)
+{
+	struct rb_node **p = &user_wake_locks.rb_node;
+	struct rb_node *parent = NULL;
+	struct user_wake_lock *l;
+	int diff;
+	u64 timeout;
+	int name_len;
+	const char *arg;
+
+	/* Find length of lock name and start of optional timeout string */
+	arg = buf;
+	while (*arg && !isspace(*arg))
+		arg++;
+	name_len = arg - buf;
+	if (!name_len)
+		goto bad_arg;
+	while (isspace(*arg))
+		arg++;
+
+	/* Process timeout string */
+	if (timeoutptr && *arg) {
+		timeout = simple_strtoull(arg, (char **)&arg, 0);
+		while (isspace(*arg))
+			arg++;
+		if (*arg)
+			goto bad_arg;
+		/* convert timeout from nanoseconds to jiffies > 0 */
+		timeout += (NSEC_PER_SEC / HZ) - 1;
+		do_div(timeout, (NSEC_PER_SEC / HZ));
+		/* timeout is unsigned, so "no jiffies" can only mean zero */
+		if (!timeout)
+			timeout = 1;
+		*timeoutptr = timeout;
+	} else if (*arg)
+		goto bad_arg;
+	else if (timeoutptr)
+		*timeoutptr = 0;
+
+	/* Lookup wake lock in rbtree */
+	while (*p) {
+		parent = *p;
+		l = rb_entry(parent, struct user_wake_lock, node);
+		diff = strncmp(buf, l->name, name_len);
+		/* buf is only a proper prefix of l->name: sort it first */
+		if (!diff && l->name[name_len])
+			diff = -1;
+		/*
+		 * Per-node trace belongs to DEBUG_LOOKUP; it used to be
+		 * gated on DEBUG_ERROR, which left DEBUG_LOOKUP unused and
+		 * flooded the log whenever error reporting was enabled.
+		 */
+		if (debug_mask & DEBUG_LOOKUP)
+			pr_info("lookup_wake_lock_name: compare %.*s %s %d\n",
+				name_len, buf, l->name, diff);
+
+		if (diff < 0)
+			p = &(*p)->rb_left;
+		else if (diff > 0)
+			p = &(*p)->rb_right;
+		else
+			return l;
+	}
+
+	/* Allocate and add new wakelock to rbtree */
+	if (!allocate) {
+		if (debug_mask & DEBUG_ERROR)
+			pr_info("lookup_wake_lock_name: %.*s not found\n",
+				name_len, buf);
+		return ERR_PTR(-EINVAL);
+	}
+	/* kzalloc zero-fills, so the byte after the copied name is the NUL */
+	l = kzalloc(sizeof(*l) + name_len + 1, GFP_KERNEL);
+	if (l == NULL) {
+		if (debug_mask & DEBUG_FAILURE)
+			pr_err("lookup_wake_lock_name: failed to allocate "
+				"memory for %.*s\n", name_len, buf);
+		return ERR_PTR(-ENOMEM);
+	}
+	memcpy(l->name, buf, name_len);
+	if (debug_mask & DEBUG_NEW)
+		pr_info("lookup_wake_lock_name: new wake lock %s\n", l->name);
+	wake_lock_init(&l->wake_lock, WAKE_LOCK_SUSPEND, l->name);
+	/* p and parent still describe the empty slot where the search ended */
+	rb_link_node(&l->node, parent, p);
+	rb_insert_color(&l->node, &user_wake_locks);
+	return l;
+
+bad_arg:
+	if (debug_mask & DEBUG_ERROR)
+		pr_info("lookup_wake_lock_name: wake lock, %.*s, bad arg, %s\n",
+			name_len, buf, arg);
+	return ERR_PTR(-EINVAL);
+}
+
+/*
+ * Show handler: write the names of all user wake locks that are currently
+ * active into @buf, each followed by a space, then a newline.  Output is
+ * bounded to PAGE_SIZE.  Returns the number of bytes written.
+ */
+ssize_t wake_lock_show(
+	struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	char *s = buf;
+	char *end = buf + PAGE_SIZE;
+	struct rb_node *n;
+	struct user_wake_lock *l;
+
+	mutex_lock(&tree_lock);
+
+	for (n = rb_first(&user_wake_locks); n != NULL; n = rb_next(n)) {
+		l = rb_entry(n, struct user_wake_lock, node);
+		if (wake_lock_active(&l->wake_lock))
+			s += scnprintf(s, end - s, "%s ", l->name);
+	}
+	s += scnprintf(s, end - s, "\n");
+
+	mutex_unlock(&tree_lock);
+	return (s - buf);
+}
+
+/*
+ * Store handler: take the wake lock named in @buf, creating it if needed.
+ * An optional timeout (nanoseconds) after the name makes the lock
+ * auto-expire.  Returns @n on success or a negative errno.
+ */
+ssize_t wake_lock_store(
+	struct kobject *kobj, struct kobj_attribute *attr,
+	const char *buf, size_t n)
+{
+	long timeout;
+	struct user_wake_lock *l;
+	ssize_t ret = n;	/* signed, so an errno is not parked in a size_t */
+
+	mutex_lock(&tree_lock);
+	l = lookup_wake_lock_name(buf, 1, &timeout);
+	if (IS_ERR(l)) {
+		ret = PTR_ERR(l);
+		goto bad_name;
+	}
+
+	if (debug_mask & DEBUG_ACCESS)
+		pr_info("wake_lock_store: %s, timeout %ld\n", l->name, timeout);
+
+	if (timeout)
+		wake_lock_timeout(&l->wake_lock, timeout);
+	else
+		wake_lock(&l->wake_lock);
+bad_name:
+	mutex_unlock(&tree_lock);
+	return ret;
+}
+
+
+/*
+ * Show handler: like wake_lock_show(), but lists the user wake locks that
+ * are currently NOT active.
+ */
+ssize_t wake_unlock_show(
+	struct kobject *kobj, struct kobj_attribute *attr, char *buf)
+{
+	char *s = buf;
+	char *end = buf + PAGE_SIZE;
+	struct rb_node *n;
+	struct user_wake_lock *l;
+
+	mutex_lock(&tree_lock);
+
+	for (n = rb_first(&user_wake_locks); n != NULL; n = rb_next(n)) {
+		l = rb_entry(n, struct user_wake_lock, node);
+		if (!wake_lock_active(&l->wake_lock))
+			s += scnprintf(s, end - s, "%s ", l->name);
+	}
+	s += scnprintf(s, end - s, "\n");
+
+	mutex_unlock(&tree_lock);
+	return (s - buf);
+}
+
+/*
+ * Store handler: release the existing wake lock named in @buf.  The lock is
+ * never created here and no argument may follow the name.  Returns @n on
+ * success or a negative errno.
+ */
+ssize_t wake_unlock_store(
+	struct kobject *kobj, struct kobj_attribute *attr,
+	const char *buf, size_t n)
+{
+	struct user_wake_lock *l;
+	ssize_t ret = n;	/* signed, so an errno is not parked in a size_t */
+
+	mutex_lock(&tree_lock);
+	l = lookup_wake_lock_name(buf, 0, NULL);
+	if (IS_ERR(l)) {
+		ret = PTR_ERR(l);
+		goto not_found;
+	}
+
+	if (debug_mask & DEBUG_ACCESS)
+		pr_info("wake_unlock_store: %s\n", l->name);
+
+	wake_unlock(&l->wake_lock);
+not_found:
+	mutex_unlock(&tree_lock);
+	return ret;
+}
+
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 00000000..81e1b7c6
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,634 @@
+/* kernel/power/wakelock.c
+ *
+ * Copyright (C) 2005-2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include  /* sys_sync */
+#include
+#ifdef CONFIG_WAKELOCK_STAT
+#include
+#endif
+#include "power.h"
+
+/* Bits of debug_mask; each bit enables one class of log message below. */
+enum {
+	DEBUG_EXIT_SUSPEND = 1U << 0,
+	DEBUG_WAKEUP = 1U << 1,
+	DEBUG_SUSPEND = 1U << 2,
+	DEBUG_EXPIRE = 1U << 3,
+	DEBUG_WAKE_LOCK = 1U << 4,
+};
+static int debug_mask = DEBUG_EXIT_SUSPEND | DEBUG_WAKEUP;
+module_param_named(debug_mask, debug_mask, int, S_IRUGO | S_IWUSR | S_IWGRP);
+
+/* Layout of wake_lock.flags: low bits hold the type, the rest are state. */
+#define WAKE_LOCK_TYPE_MASK              (0x0f)
+#define WAKE_LOCK_INITIALIZED            (1U << 8)
+#define WAKE_LOCK_ACTIVE                 (1U << 9)
+#define WAKE_LOCK_AUTO_EXPIRE            (1U << 10)
+#define WAKE_LOCK_PREVENTING_SUSPEND     (1U << 11)
+
+/* list_lock guards inactive_locks, active_wake_locks[] and lock state. */
+static DEFINE_SPINLOCK(list_lock);
+static LIST_HEAD(inactive_locks);
+static struct list_head active_wake_locks[WAKE_LOCK_TYPE_COUNT];
+/* Incremented each time a WAKE_LOCK_SUSPEND lock is taken. */
+static int current_event_num;
+struct workqueue_struct *suspend_work_queue;
+struct wake_lock main_wake_lock;
+suspend_state_t requested_suspend_state = PM_SUSPEND_MEM;
+static struct wake_lock unknown_wakeup;
+static struct wake_lock suspend_backoff_lock;
+
+/* Number of consecutive short suspends before backing off. */
+#define SUSPEND_BACKOFF_THRESHOLD	10
+/* Back-off duration; passed to msecs_to_jiffies(). */
+#define SUSPEND_BACKOFF_INTERVAL	10000
+
+static unsigned suspend_short_count;
+
+#ifdef CONFIG_WAKELOCK_STAT
+/* Accumulates the statistics of locks that have been destroyed. */
+static struct wake_lock deleted_wake_locks;
+static ktime_t last_sleep_time_update;
+static int wait_for_wakeup;
+
+/*
+ * If @lock is an auto-expire lock whose deadline has passed, store the time
+ * at which it expired in @expire_time and return 1.  Otherwise return 0 and
+ * leave @expire_time untouched.
+ */
+int get_expired_time(struct wake_lock *lock, ktime_t *expire_time)
+{
+	struct timespec ts;
+	struct timespec kt;
+	struct timespec tomono;
+	struct timespec delta;
+	struct timespec sleep;
+	long timeout;
+
+	if (!(lock->flags & WAKE_LOCK_AUTO_EXPIRE))
+		return 0;
+	get_xtime_and_monotonic_and_sleep_offset(&kt, &tomono, &sleep);
+	timeout = lock->expires - jiffies;
+	if (timeout > 0)
+		return 0;
+	/* -timeout is how many jiffies ago the lock expired */
+	jiffies_to_timespec(-timeout, &delta);
+	set_normalized_timespec(&ts, kt.tv_sec + tomono.tv_sec - delta.tv_sec,
+				kt.tv_nsec + tomono.tv_nsec - delta.tv_nsec);
+	*expire_time = timespec_to_ktime(ts);
+	return 1;
+}
+
+
+/*
+ * Emit one row of the statistics table for @lock.  For a lock that is still
+ * active the time since it was last taken is folded into the printed
+ * figures without modifying the stored statistics.
+ */
+static int print_lock_stat(struct seq_file *m, struct wake_lock *lock)
+{
+	int lock_count = lock->stat.count;
+	int expire_count = lock->stat.expire_count;
+	ktime_t active_time = ktime_set(0, 0);
+	ktime_t total_time = lock->stat.total_time;
+	ktime_t max_time = lock->stat.max_time;
+
+	ktime_t prevent_suspend_time = lock->stat.prevent_suspend_time;
+	if (lock->flags & WAKE_LOCK_ACTIVE) {
+		ktime_t now, add_time;
+		int expired = get_expired_time(lock, &now);
+		if (!expired)
+			now = ktime_get();
+		add_time = ktime_sub(now, lock->stat.last_time);
+		lock_count++;
+		if (!expired)
+			active_time = add_time;
+		else
+			expire_count++;
+		total_time = ktime_add(total_time, add_time);
+		if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND)
+			prevent_suspend_time = ktime_add(prevent_suspend_time,
+					ktime_sub(now, last_sleep_time_update));
+		if (add_time.tv64 > max_time.tv64)
+			max_time = add_time;
+	}
+
+	return seq_printf(m,
+		     "\"%s\"\t%d\t%d\t%d\t%lld\t%lld\t%lld\t%lld\t%lld\n",
+		     lock->name, lock_count, expire_count,
+		     lock->stat.wakeup_count, ktime_to_ns(active_time),
+		     ktime_to_ns(total_time),
+		     ktime_to_ns(prevent_suspend_time), ktime_to_ns(max_time),
+		     ktime_to_ns(lock->stat.last_time));
+}
+
+/*
+ * seq_file show callback: print the header row, then every inactive lock
+ * and every active lock of every type.  Always returns 0.
+ */
+static int wakelock_stats_show(struct seq_file *m, void *unused)
+{
+	unsigned long irqflags;
+	struct wake_lock *lock;
+	int type;
+
+	spin_lock_irqsave(&list_lock, irqflags);
+
+	seq_puts(m, "name\tcount\texpire_count\twake_count\tactive_since"
+			"\ttotal_time\tsleep_time\tmax_time\tlast_change\n");
+	list_for_each_entry(lock, &inactive_locks, link)
+		print_lock_stat(m, lock);
+	for (type = 0; type < WAKE_LOCK_TYPE_COUNT; type++) {
+		list_for_each_entry(lock, &active_wake_locks[type], link)
+			print_lock_stat(m, lock);
+	}
+	spin_unlock_irqrestore(&list_lock, irqflags);
+	return 0;
+}
+
+/*
+ * Fold the current hold period of an active @lock into its statistics.
+ * @expired forces the period to be counted as an expiry; it is also
+ * counted as one when get_expired_time() reports the deadline has passed.
+ * Does nothing for an inactive lock.
+ */
+static void wake_unlock_stat_locked(struct wake_lock *lock, int expired)
+{
+	ktime_t duration;
+	ktime_t now;
+	if (!(lock->flags & WAKE_LOCK_ACTIVE))
+		return;
+	if (get_expired_time(lock, &now))
+		expired = 1;
+	else
+		now = ktime_get();
+	lock->stat.count++;
+	if (expired)
+		lock->stat.expire_count++;
+	duration = ktime_sub(now, lock->stat.last_time);
+	lock->stat.total_time = ktime_add(lock->stat.total_time, duration);
+	if (ktime_to_ns(duration) > ktime_to_ns(lock->stat.max_time))
+		lock->stat.max_time = duration;
+	lock->stat.last_time = ktime_get();
+	if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
+		duration = ktime_sub(now, last_sleep_time_update);
+		lock->stat.prevent_suspend_time = ktime_add(
+			lock->stat.prevent_suspend_time, duration);
+		lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
+	}
+}
+
+/*
+ * Charge the time since last_sleep_time_update to every active suspend lock
+ * flagged WAKE_LOCK_PREVENTING_SUSPEND, then refresh that flag: cleared when
+ * @done is set or the lock has expired, set otherwise.
+ */
+static void update_sleep_wait_stats_locked(int done)
+{
+	struct wake_lock *lock;
+	ktime_t now, etime, elapsed, add;
+	int expired;
+
+	now = ktime_get();
+	elapsed = ktime_sub(now, last_sleep_time_update);
+	list_for_each_entry(lock, &active_wake_locks[WAKE_LOCK_SUSPEND], link) {
+		expired = get_expired_time(lock, &etime);
+		if (lock->flags & WAKE_LOCK_PREVENTING_SUSPEND) {
+			if (expired)
+				add = ktime_sub(etime, last_sleep_time_update);
+			else
+				add = elapsed;
+			lock->stat.prevent_suspend_time = ktime_add(
+				lock->stat.prevent_suspend_time, add);
+		}
+		if (done || expired)
+			lock->flags &= ~WAKE_LOCK_PREVENTING_SUSPEND;
+		else
+			lock->flags |= WAKE_LOCK_PREVENTING_SUSPEND;
+	}
+	last_sleep_time_update = now;
+}
+#endif
+
+
+/* Move an expired auto-expire @lock from its active list to inactive_locks. */
+static void expire_wake_lock(struct wake_lock *lock)
+{
+#ifdef CONFIG_WAKELOCK_STAT
+	wake_unlock_stat_locked(lock, 1);
+#endif
+	lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
+	list_del(&lock->link);
+	list_add(&lock->link, &inactive_locks);
+	if (debug_mask & (DEBUG_WAKE_LOCK | DEBUG_EXPIRE))
+		pr_info("expired wake lock %s\n", lock->name);
+}
+
+/* Caller must acquire the list_lock spinlock */
+static void print_active_locks(int type)
+{
+	struct wake_lock *lock;
+	bool print_expired = true;
+
+	BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
+	list_for_each_entry(lock, &active_wake_locks[type], link) {
+		if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
+			long timeout = lock->expires - jiffies;
+			if (timeout > 0)
+				pr_info("active wake lock %s, time left %ld\n",
+					lock->name, timeout);
+			else if (print_expired)
+				pr_info("wake lock %s, expired\n", lock->name);
+		} else {
+			pr_info("active wake lock %s\n", lock->name);
+			/*
+			 * Once a lock without timeout has been seen, expired
+			 * locks are only reported when DEBUG_EXPIRE is set.
+			 */
+			if (!(debug_mask & DEBUG_EXPIRE))
+				print_expired = false;
+		}
+	}
+}
+
+/*
+ * Scan the active locks of @type, retiring auto-expire locks whose deadline
+ * has passed.  Returns -1 as soon as a lock without timeout is found,
+ * otherwise the largest remaining timeout in jiffies, or 0 when no lock
+ * is left.
+ */
+static long has_wake_lock_locked(int type)
+{
+	struct wake_lock *lock, *n;
+	long max_timeout = 0;
+
+	BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
+	list_for_each_entry_safe(lock, n, &active_wake_locks[type], link) {
+		if (lock->flags & WAKE_LOCK_AUTO_EXPIRE) {
+			long timeout = lock->expires - jiffies;
+			if (timeout <= 0)
+				expire_wake_lock(lock);
+			else if (timeout > max_timeout)
+				max_timeout = timeout;
+		} else
+			return -1;
+	}
+	return max_timeout;
+}
+
+/* Locked wrapper around has_wake_lock_locked(); same return value. */
+long has_wake_lock(int type)
+{
+	long ret;
+	unsigned long irqflags;
+	spin_lock_irqsave(&list_lock, irqflags);
+	ret = has_wake_lock_locked(type);
+	if (ret && (debug_mask & DEBUG_WAKEUP) && type == WAKE_LOCK_SUSPEND)
+		print_active_locks(type);
+	spin_unlock_irqrestore(&list_lock, irqflags);
+	return ret;
+}
+
+/* Hold suspend_backoff_lock for SUSPEND_BACKOFF_INTERVAL milliseconds. */
+static void suspend_backoff(void)
+{
+	pr_info("suspend: too many immediate wakeups, back off\n");
+	wake_lock_timeout(&suspend_backoff_lock,
+			  msecs_to_jiffies(SUSPEND_BACKOFF_INTERVAL));
+}
+
+/*
+ * Work item: enter requested_suspend_state unless a suspend wake lock is
+ * held.  After resume, count suspends that lasted at most about a second
+ * and back off after SUSPEND_BACKOFF_THRESHOLD of them in a row; if no wake
+ * lock was taken during the suspend, hold unknown_wakeup for HZ / 2.
+ */
+static void suspend(struct work_struct *work)
+{
+	int ret;
+	int entry_event_num;
+	struct timespec ts_entry, ts_exit;
+
+	if (has_wake_lock(WAKE_LOCK_SUSPEND)) {
+		if (debug_mask & DEBUG_SUSPEND)
+			pr_info("suspend: abort suspend\n");
+		return;
+	}
+
+	entry_event_num = current_event_num;
+	sys_sync();
+	if (debug_mask & DEBUG_SUSPEND)
+		pr_info("suspend: enter suspend\n");
+	getnstimeofday(&ts_entry);
+	ret = pm_suspend(requested_suspend_state);
+	getnstimeofday(&ts_exit);
+
+	if (debug_mask & DEBUG_EXIT_SUSPEND) {
+		struct rtc_time tm;
+		rtc_time_to_tm(ts_exit.tv_sec, &tm);
+		pr_info("suspend: exit suspend, ret = %d "
+			"(%d-%02d-%02d %02d:%02d:%02d.%09lu UTC)\n", ret,
+			tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+			tm.tm_hour, tm.tm_min, tm.tm_sec, ts_exit.tv_nsec);
+	}
+
+	if (ts_exit.tv_sec - ts_entry.tv_sec <= 1) {
+		++suspend_short_count;
+
+		if (suspend_short_count == SUSPEND_BACKOFF_THRESHOLD) {
+			suspend_backoff();
+			suspend_short_count = 0;
+		}
+	} else {
+		suspend_short_count = 0;
+	}
+
+	if (current_event_num == entry_event_num) {
+		if (debug_mask & DEBUG_SUSPEND)
+			pr_info("suspend: pm_suspend returned with no event\n");
+		wake_lock_timeout(&unknown_wakeup, HZ / 2);
+	}
+}
+static DECLARE_WORK(suspend_work, suspend);
+
+/*
+ * Timer callback: retire expired suspend locks and queue the suspend work
+ * when none remain.
+ */
+static void expire_wake_locks(unsigned long data)
+{
+	long has_lock;
+	unsigned long irqflags;
+	if (debug_mask & DEBUG_EXPIRE)
+		pr_info("expire_wake_locks: start\n");
+	spin_lock_irqsave(&list_lock, irqflags);
+	if (debug_mask & DEBUG_SUSPEND)
+		print_active_locks(WAKE_LOCK_SUSPEND);
+	has_lock = has_wake_lock_locked(WAKE_LOCK_SUSPEND);
+	if (debug_mask & DEBUG_EXPIRE)
+		pr_info("expire_wake_locks: done, has_lock %ld\n", has_lock);
+	if (has_lock == 0)
+		queue_work(suspend_work_queue, &suspend_work);
+	spin_unlock_irqrestore(&list_lock, irqflags);
+}
+static DEFINE_TIMER(expire_timer, expire_wake_locks, 0, 0);
+
+/* suspend_noirq hook: return -EAGAIN while a suspend wake lock is held. */
+static int power_suspend_late(struct device *dev)
+{
+	int ret = has_wake_lock(WAKE_LOCK_SUSPEND) ? -EAGAIN : 0;
+#ifdef CONFIG_WAKELOCK_STAT
+	wait_for_wakeup = !ret;
+#endif
+	if (debug_mask & DEBUG_SUSPEND)
+		pr_info("power_suspend_late return %d\n", ret);
+	return ret;
+}
+
+static struct dev_pm_ops power_driver_pm_ops = {
+	.suspend_noirq = power_suspend_late,
+};
+
+static struct platform_driver power_driver = {
+	.driver.name = "power",
+	.driver.pm = &power_driver_pm_ops,
+};
+static struct platform_device power_device = {
+	.name = "power",
+};
+
+/*
+ * Initialise @lock as an inactive lock of @type and add it to
+ * inactive_locks.  @name may be NULL only if lock->name is already set.
+ */
+void wake_lock_init(struct wake_lock *lock, int type, const char *name)
+{
+	unsigned long irqflags = 0;
+
+	if (name)
+		lock->name = name;
+	BUG_ON(!lock->name);
+
+	if (debug_mask & DEBUG_WAKE_LOCK)
+		pr_info("wake_lock_init name=%s\n", lock->name);
+#ifdef CONFIG_WAKELOCK_STAT
+	lock->stat.count = 0;
+	lock->stat.expire_count = 0;
+	lock->stat.wakeup_count = 0;
+	lock->stat.total_time = ktime_set(0, 0);
+	lock->stat.prevent_suspend_time = ktime_set(0, 0);
+	lock->stat.max_time = ktime_set(0, 0);
+	lock->stat.last_time = ktime_set(0, 0);
+#endif
+	lock->flags = (type & WAKE_LOCK_TYPE_MASK) | WAKE_LOCK_INITIALIZED;
+
+	INIT_LIST_HEAD(&lock->link);
+	spin_lock_irqsave(&list_lock, irqflags);
+	list_add(&lock->link, &inactive_locks);
+	spin_unlock_irqrestore(&list_lock, irqflags);
+}
+EXPORT_SYMBOL(wake_lock_init);
+
+/*
+ * Unlink @lock from whichever list it is on and mark it uninitialised.
+ * With CONFIG_WAKELOCK_STAT its statistics are folded into
+ * deleted_wake_locks: counters and durations are summed, max_time keeps
+ * the largest value seen.
+ */
+void wake_lock_destroy(struct wake_lock *lock)
+{
+	unsigned long irqflags;
+	if (debug_mask & DEBUG_WAKE_LOCK)
+		pr_info("wake_lock_destroy name=%s\n", lock->name);
+	spin_lock_irqsave(&list_lock, irqflags);
+	lock->flags &= ~WAKE_LOCK_INITIALIZED;
+#ifdef CONFIG_WAKELOCK_STAT
+	if (lock->stat.count) {
+		deleted_wake_locks.stat.count += lock->stat.count;
+		deleted_wake_locks.stat.expire_count += lock->stat.expire_count;
+		deleted_wake_locks.stat.total_time =
+			ktime_add(deleted_wake_locks.stat.total_time,
+				  lock->stat.total_time);
+		deleted_wake_locks.stat.prevent_suspend_time =
+			ktime_add(deleted_wake_locks.stat.prevent_suspend_time,
+				  lock->stat.prevent_suspend_time);
+		/*
+		 * max_time is a maximum, not a total: adding the values (as
+		 * was done before) reports a "longest hold" no lock ever had.
+		 */
+		if (ktime_to_ns(lock->stat.max_time) >
+		    ktime_to_ns(deleted_wake_locks.stat.max_time))
+			deleted_wake_locks.stat.max_time = lock->stat.max_time;
+	}
+#endif
+	list_del(&lock->link);
+	spin_unlock_irqrestore(&list_lock, irqflags);
+}
+EXPORT_SYMBOL(wake_lock_destroy);
+
+/*
+ * Common body of wake_lock() and wake_lock_timeout(): activate @lock, with
+ * an expiry @timeout jiffies from now when @has_timeout is set, and for
+ * suspend locks re-arm or stop expire_timer accordingly.
+ */
+static void wake_lock_internal(
+	struct wake_lock *lock, long timeout, int has_timeout)
+{
+	int type;
+	unsigned long irqflags;
+	long expire_in;
+
+	spin_lock_irqsave(&list_lock, irqflags);
+	type = lock->flags & WAKE_LOCK_TYPE_MASK;
+	BUG_ON(type >= WAKE_LOCK_TYPE_COUNT);
+	BUG_ON(!(lock->flags & WAKE_LOCK_INITIALIZED));
+#ifdef CONFIG_WAKELOCK_STAT
+	if (type == WAKE_LOCK_SUSPEND && wait_for_wakeup) {
+		if (debug_mask & DEBUG_WAKEUP)
+			pr_info("wakeup wake lock: %s\n", lock->name);
+		wait_for_wakeup = 0;
+		lock->stat.wakeup_count++;
+	}
+	/* an already expired lock is accounted before being re-taken */
+	if ((lock->flags & WAKE_LOCK_AUTO_EXPIRE) &&
+	    (long)(lock->expires - jiffies) <= 0) {
+		wake_unlock_stat_locked(lock, 0);
+		lock->stat.last_time = ktime_get();
+	}
+#endif
+	if (!(lock->flags & WAKE_LOCK_ACTIVE)) {
+		lock->flags |= WAKE_LOCK_ACTIVE;
+#ifdef CONFIG_WAKELOCK_STAT
+		lock->stat.last_time = ktime_get();
+#endif
+	}
+	list_del(&lock->link);
+	if (has_timeout) {
+		if (debug_mask & DEBUG_WAKE_LOCK)
+			pr_info("wake_lock: %s, type %d, timeout %ld.%03lu\n",
+				lock->name, type, timeout / HZ,
+				(timeout % HZ) * MSEC_PER_SEC / HZ);
+		lock->expires = jiffies + timeout;
+		lock->flags |= WAKE_LOCK_AUTO_EXPIRE;
+		/* timed locks go to the tail, untimed ones to the head */
+		list_add_tail(&lock->link, &active_wake_locks[type]);
+	} else {
+		if (debug_mask & DEBUG_WAKE_LOCK)
+			pr_info("wake_lock: %s, type %d\n", lock->name, type);
+		lock->expires = LONG_MAX;
+		lock->flags &= ~WAKE_LOCK_AUTO_EXPIRE;
+		list_add(&lock->link, &active_wake_locks[type]);
+	}
+	if (type == WAKE_LOCK_SUSPEND) {
+		current_event_num++;
+#ifdef CONFIG_WAKELOCK_STAT
+		if (lock == &main_wake_lock)
+			update_sleep_wait_stats_locked(1);
+		else if (!wake_lock_active(&main_wake_lock))
+			update_sleep_wait_stats_locked(0);
+#endif
+		if (has_timeout)
+			expire_in = has_wake_lock_locked(type);
+		else
+			expire_in = -1;
+		if (expire_in > 0) {
+			if (debug_mask & DEBUG_EXPIRE)
+				pr_info("wake_lock: %s, start expire timer, "
+					"%ld\n", lock->name, expire_in);
+			mod_timer(&expire_timer, jiffies + expire_in);
+		} else {
+			if (del_timer(&expire_timer))
+				if (debug_mask & DEBUG_EXPIRE)
+					pr_info("wake_lock: %s, stop expire timer\n",
+						lock->name);
+			if (expire_in == 0)
+				queue_work(suspend_work_queue, &suspend_work);
+		}
+	}
+	spin_unlock_irqrestore(&list_lock, irqflags);
+}
+
+/* Take @lock until wake_unlock() is called. */
+void wake_lock(struct wake_lock *lock)
+{
+	wake_lock_internal(lock, 0, 0);
+}
+EXPORT_SYMBOL(wake_lock);
+
+/* Take @lock; it expires by itself @timeout jiffies from now. */
+void wake_lock_timeout(struct wake_lock *lock, long timeout)
+{
+	wake_lock_internal(lock, timeout, 1);
+}
+EXPORT_SYMBOL(wake_lock_timeout);
+
+/*
+ * Release @lock and move it to inactive_locks.  For a suspend lock the
+ * expire timer is re-armed for the longest remaining timeout, or the
+ * suspend work is queued when no suspend lock is left.
+ */
+void wake_unlock(struct wake_lock *lock)
+{
+	int type;
+	unsigned long irqflags;
+	spin_lock_irqsave(&list_lock, irqflags);
+	type = lock->flags & WAKE_LOCK_TYPE_MASK;
+#ifdef CONFIG_WAKELOCK_STAT
+	wake_unlock_stat_locked(lock, 0);
+#endif
+	if (debug_mask & DEBUG_WAKE_LOCK)
+		pr_info("wake_unlock: %s\n", lock->name);
+	lock->flags &= ~(WAKE_LOCK_ACTIVE | WAKE_LOCK_AUTO_EXPIRE);
+	list_del(&lock->link);
+	list_add(&lock->link, &inactive_locks);
+	if (type == WAKE_LOCK_SUSPEND) {
+		long has_lock = has_wake_lock_locked(type);
+		if (has_lock > 0) {
+			if (debug_mask & DEBUG_EXPIRE)
+				pr_info("wake_unlock: %s, start expire timer, "
+					"%ld\n", lock->name, has_lock);
+			mod_timer(&expire_timer, jiffies + has_lock);
+		} else {
+			if (del_timer(&expire_timer))
+				if (debug_mask & DEBUG_EXPIRE)
+					pr_info("wake_unlock: %s, stop expire "
+						"timer\n", lock->name);
+			if (has_lock == 0)
+				queue_work(suspend_work_queue, &suspend_work);
+		}
+		if (lock == &main_wake_lock) {
+			if (debug_mask & DEBUG_SUSPEND)
+				print_active_locks(WAKE_LOCK_SUSPEND);
+#ifdef CONFIG_WAKELOCK_STAT
+			update_sleep_wait_stats_locked(0);
+#endif
+		}
+	}
+	spin_unlock_irqrestore(&list_lock, irqflags);
+}
+EXPORT_SYMBOL(wake_unlock);
+
+/* Return 1 if @lock has WAKE_LOCK_ACTIVE set, 0 otherwise. */
+int wake_lock_active(struct wake_lock *lock)
+{
+	return !!(lock->flags & WAKE_LOCK_ACTIVE);
+}
+EXPORT_SYMBOL(wake_lock_active);
+
+/*
+ * wakelock_stats_show() only exists with CONFIG_WAKELOCK_STAT, and the fops
+ * are only registered under that option, so both must be guarded as well.
+ */
+#ifdef CONFIG_WAKELOCK_STAT
+static int wakelock_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, wakelock_stats_show, NULL);
+}
+
+static const struct file_operations wakelock_stats_fops = {
+	.owner = THIS_MODULE,
+	.open = wakelock_stats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+#endif
+
+/*
+ * Set up the built-in wake locks (taking "main"), register the "power"
+ * platform device and driver, and create the suspend workqueue.  On failure
+ * everything done so far is undone in reverse order.
+ */
+static int __init wakelocks_init(void)
+{
+	int ret;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(active_wake_locks); i++)
+		INIT_LIST_HEAD(&active_wake_locks[i]);
+
+#ifdef CONFIG_WAKELOCK_STAT
+	wake_lock_init(&deleted_wake_locks, WAKE_LOCK_SUSPEND,
+			"deleted_wake_locks");
+#endif
+	wake_lock_init(&main_wake_lock, WAKE_LOCK_SUSPEND, "main");
+	wake_lock(&main_wake_lock);
+	wake_lock_init(&unknown_wakeup, WAKE_LOCK_SUSPEND, "unknown_wakeups");
+	wake_lock_init(&suspend_backoff_lock, WAKE_LOCK_SUSPEND,
+		       "suspend_backoff");
+
+	ret = platform_device_register(&power_device);
+	if (ret) {
+		pr_err("wakelocks_init: platform_device_register failed\n");
+		goto err_platform_device_register;
+	}
+	ret = platform_driver_register(&power_driver);
+	if (ret) {
+		pr_err("wakelocks_init: platform_driver_register failed\n");
+		goto err_platform_driver_register;
+	}
+
+	suspend_work_queue = create_singlethread_workqueue("suspend");
+	if (suspend_work_queue == NULL) {
+		ret = -ENOMEM;
+		goto err_suspend_work_queue;
+	}
+
+#ifdef CONFIG_WAKELOCK_STAT
+	proc_create("wakelocks", S_IRUGO, NULL, &wakelock_stats_fops);
+#endif
+
+	return 0;
+
+err_suspend_work_queue:
+	platform_driver_unregister(&power_driver);
+err_platform_driver_register:
+	platform_device_unregister(&power_device);
+err_platform_device_register:
+	wake_lock_destroy(&suspend_backoff_lock);
+	wake_lock_destroy(&unknown_wakeup);
+	wake_lock_destroy(&main_wake_lock);
+#ifdef CONFIG_WAKELOCK_STAT
+	wake_lock_destroy(&deleted_wake_locks);
+#endif
+	return ret;
+}
+
+/* Tear down everything wakelocks_init() created, in reverse order. */
+static void  __exit wakelocks_exit(void)
+{
+#ifdef CONFIG_WAKELOCK_STAT
+	remove_proc_entry("wakelocks", NULL);
+#endif
+	destroy_workqueue(suspend_work_queue);
+	platform_driver_unregister(&power_driver);
+	platform_device_unregister(&power_device);
+	wake_lock_destroy(&suspend_backoff_lock);
+	wake_lock_destroy(&unknown_wakeup);
+	wake_lock_destroy(&main_wake_lock);
+#ifdef CONFIG_WAKELOCK_STAT
+	wake_lock_destroy(&deleted_wake_locks);
+#endif
+}
+
+core_initcall(wakelocks_init);
+module_exit(wakelocks_exit);
diff --git a/kernel/printk.c b/kernel/printk.c
new file mode 100644
index 00000000..24146142
--- /dev/null
+++ b/kernel/printk.c
@@ -0,0 +1,1794 @@
+/*
+ *  linux/kernel/printk.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * Modified to make sys_syslog() more flexible: added commands to
+ * return the last 4k of kernel messages, regardless of whether
+ * they've been read or not.  Added option to suppress kernel printk's
+ * to the console.  Added hook for sending the console messages
+ * elsewhere, in preparation for a serial line console (someday).
+ * Ted Ts'o, 2/11/93.
+ * Modified for sysctl support, 1/8/97, Chris Horn.
+ * Fixed SMP synchronization, 08/08/99, Manfred Spraul
+ *     manfred@colorfullife.com
+ * Rewrote bits to get rid of console_lock
+ *	01Mar01 Andrew Morton
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include			/* For in_interrupt() */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+/*
+ * Architectures can override it:
+ */
+void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
+{
+}
+
+#define __LOG_BUF_LEN	(1 << CONFIG_LOG_BUF_SHIFT)
+
+#ifdef        CONFIG_DEBUG_LL
+extern void printascii(char *);
+#endif
+
+/* printk's without a loglevel use this.. */
+#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
+
+/* We show everything that is MORE important than this..
*/
+#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
+#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
+
+/* Wait queue that SYSLOG_ACTION_READ sleeps on until log_start != log_end. */
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
+
+/* The four loglevel settings, in the order named by the comments below. */
+int console_printk[4] = {
+	DEFAULT_CONSOLE_LOGLEVEL,	/* console_loglevel */
+	DEFAULT_MESSAGE_LOGLEVEL,	/* default_message_loglevel */
+	MINIMUM_CONSOLE_LOGLEVEL,	/* minimum_console_loglevel */
+	DEFAULT_CONSOLE_LOGLEVEL,	/* default_console_loglevel */
+};
+
+/*
+ * Low level drivers may need this to know whether they can schedule in
+ * their unblank() callback or not. So let's export it.
+ */
+int oops_in_progress;
+EXPORT_SYMBOL(oops_in_progress);
+
+/*
+ * console_sem protects the console_drivers list, and also
+ * provides serialisation for access to the entire console
+ * driver system.
+ */
+static DEFINE_SEMAPHORE(console_sem);
+struct console *console_drivers;
+EXPORT_SYMBOL_GPL(console_drivers);
+
+/*
+ * This is used for debugging the mess that is the VT code by
+ * keeping track of whether we have the console semaphore held. It's
+ * definitely not the perfect debug tool (we don't know if _WE_
+ * hold it or are racing), but it helps tracking those weird code
+ * paths in the console code where we end up in places I want
+ * locked without the console semaphore held.
+ */
+static int console_locked, console_suspended;
+
+/*
+ * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
+ * It is also used in interesting ways to provide interlocking in
+ * console_unlock();.
+ */
+static DEFINE_SPINLOCK(logbuf_lock);
+
+/* log_buf_len is kept a power of two, so masking wraps an index into it. */
+#define LOG_BUF_MASK (log_buf_len-1)
+#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
+
+/*
+ * The indices into log_buf are not constrained to log_buf_len - they
+ * must be masked before subscripting
+ */
+static unsigned log_start;	/* Index into log_buf: next char to be read by syslog() */
+static unsigned con_start;	/* Index into log_buf: next char to be sent to consoles */
+static unsigned log_end;	/* Index into log_buf: most-recently-written-char + 1 */
+
+/*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
+ *	Array of consoles built from command line options (console=)
+ */
+struct console_cmdline
+{
+	char	name[8];			/* Name of the driver	    */
+	int	index;				/* Minor dev. to use	    */
+	char	*options;			/* Options for the driver   */
+#ifdef CONFIG_A11Y_BRAILLE_CONSOLE
+	char	*brl_options;			/* Options for braille driver */
+#endif
+};
+
+#define MAX_CMDLINECONSOLES 8
+
+static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
+static int selected_console = -1;
+static int preferred_console = -1;
+int console_set_on_cmdline;
+EXPORT_SYMBOL(console_set_on_cmdline);
+
+/* Flag: console code may call schedule() */
+static int console_may_schedule;
+
+#ifdef CONFIG_PRINTK
+
+/* Static buffer used until (and unless) setup_log_buf() installs a larger one. */
+static char __log_buf[__LOG_BUF_LEN];
+static char *log_buf = __log_buf;
+static int log_buf_len = __LOG_BUF_LEN;
+static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
+/* Loglevel saved by SYSLOG_ACTION_CONSOLE_OFF; -1 means nothing is saved. */
+static int saved_console_loglevel = -1;
+
+#ifdef CONFIG_KEXEC
+/*
+ * This appends the listed symbols to /proc/vmcoreinfo
+ *
+ * /proc/vmcoreinfo is used by various utilities, like crash and makedumpfile to
+ * obtain access to symbols that are otherwise very difficult to locate.  These
+ * symbols are specifically used so that utilities can access and extract the
+ * dmesg log from a vmcore file after a crash.
+ */
+void log_buf_kexec_setup(void)
+{
+	VMCOREINFO_SYMBOL(log_buf);
+	VMCOREINFO_SYMBOL(log_end);
+	VMCOREINFO_SYMBOL(log_buf_len);
+	VMCOREINFO_SYMBOL(logged_chars);
+}
+#endif
+
+/* requested log_buf_len from kernel cmdline */
+static unsigned long __initdata new_log_buf_len;
+
+/*
+ * save requested log_buf_len since it's too early to process it
+ *
+ * The size is rounded up to a power of two and only recorded when it is
+ * larger than the current buffer.
+ */
+static int __init log_buf_len_setup(char *str)
+{
+	unsigned size = memparse(str, &str);
+
+	if (size)
+		size = roundup_pow_of_two(size);
+	if (size > log_buf_len)
+		new_log_buf_len = size;
+
+	return 0;
+}
+early_param("log_buf_len", log_buf_len_setup);
+
+/*
+ * Replace the static log buffer with one of the size requested through
+ * "log_buf_len=", copying over everything not yet consumed.
+ *
+ * @early: non-zero to allocate with memblock_alloc(), zero to allocate
+ *         with alloc_bootmem_nopanic()
+ *
+ * Does nothing when no larger size was requested or allocation fails.
+ */
+void __init setup_log_buf(int early)
+{
+	unsigned long flags;
+	unsigned start, dest_idx, offset;
+	char *new_log_buf;
+	int free;
+
+	if (!new_log_buf_len)
+		return;
+
+	if (early) {
+		unsigned long mem;
+
+		mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
+		if (mem == MEMBLOCK_ERROR)
+			return;
+		new_log_buf = __va(mem);
+	} else {
+		new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
+	}
+
+	if (unlikely(!new_log_buf)) {
+		/* new_log_buf_len is unsigned long, so %lu rather than %ld */
+		pr_err("log_buf_len: %lu bytes not available\n",
+			new_log_buf_len);
+		return;
+	}
+
+	spin_lock_irqsave(&logbuf_lock, flags);
+	log_buf_len = new_log_buf_len;
+	log_buf = new_log_buf;
+	new_log_buf_len = 0;
+	free = __LOG_BUF_LEN - log_end;
+
+	/* copy from the oldest unconsumed char, re-basing it to index 0 */
+	offset = start = min(con_start, log_start);
+	dest_idx = 0;
+	while (start != log_end) {
+		unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
+
+		log_buf[dest_idx] = __log_buf[log_idx_mask];
+		start++;
+		dest_idx++;
+	}
+	log_start -= offset;
+	con_start -= offset;
+	log_end -= offset;
+	spin_unlock_irqrestore(&logbuf_lock, flags);
+
+	pr_info("log_buf_len: %d\n", log_buf_len);
+	pr_info("early log buf free: %d(%d%%)\n",
+		free, (free * 100) / __LOG_BUF_LEN);
+}
+
+#ifdef CONFIG_BOOT_PRINTK_DELAY
+
+static int boot_delay; /* msecs delay after each printk during bootup */
+static unsigned long long loops_per_msec;	/* based on boot_delay */
+
+/*
+ * Parse "boot_delay=<msecs>".  Values outside 0..10000 disable the delay.
+ */
+static int __init boot_delay_setup(char *str)
+{
+	unsigned long lpj;
+
+	lpj = preset_lpj ? preset_lpj : 1000000;	/* some guess */
+	loops_per_msec = (unsigned long long)lpj / 1000 * HZ;
+
+	get_option(&str, &boot_delay);
+	/*
+	 * A negative value would be converted to a huge unsigned loop count
+	 * in boot_delay_msec(), so reject it together with too-large values.
+	 */
+	if (boot_delay < 0 || boot_delay > 10 * 1000)
+		boot_delay = 0;
+
+	pr_debug("boot_delay: %d, preset_lpj: %ld, lpj: %lu, "
+		"HZ: %d, loops_per_msec: %llu\n",
+		boot_delay, preset_lpj, lpj, HZ, loops_per_msec);
+	return 1;
+}
+__setup("boot_delay=", boot_delay_setup);
+
+/*
+ * Busy-wait for boot_delay milliseconds.  Only active while
+ * system_state == SYSTEM_BOOTING and boot_delay is non-zero.
+ */
+static void boot_delay_msec(void)
+{
+	unsigned long long k;
+	unsigned long timeout;
+
+	if (boot_delay == 0 || system_state != SYSTEM_BOOTING)
+		return;
+
+	k = (unsigned long long)loops_per_msec * boot_delay;
+
+	timeout = jiffies + msecs_to_jiffies(boot_delay);
+	while (k) {
+		k--;
+		cpu_relax();
+		/*
+		 * use (volatile) jiffies to prevent
+		 * compiler reduction; loop termination via jiffies
+		 * is secondary and may or may not happen.
+		 */
+		if (time_after(jiffies, timeout))
+			break;
+		touch_nmi_watchdog();
+	}
+}
+#else
+static inline void boot_delay_msec(void)
+{
+}
+#endif
+
+/*
+ * Return the number of unread characters in the log buffer.
+ */
+static int log_buf_get_len(void)
+{
+	return logged_chars;
+}
+
+/*
+ * Clears the ring-buffer
+ */
+void log_buf_clear(void)
+{
+	logged_chars = 0;
+}
+
+/*
+ * Copy a range of characters from the log buffer.
+ *
+ * @idx is an offset into the last logged_chars characters.  At most @len
+ * characters are copied to @dest.  Returns the number copied, or -1 when
+ * @idx is out of range.  logbuf_lock is taken unless oops_in_progress is
+ * set.
+ */
+int log_buf_copy(char *dest, int idx, int len)
+{
+	int ret, max;
+	bool took_lock = false;
+
+	if (!oops_in_progress) {
+		spin_lock_irq(&logbuf_lock);
+		took_lock = true;
+	}
+
+	max = log_buf_get_len();
+	if (idx < 0 || idx >= max) {
+		ret = -1;
+	} else {
+		if (len > max - idx)
+			len = max - idx;
+		ret = len;
+		/* translate the offset into an absolute log_buf index */
+		idx += (log_end - max);
+		while (len-- > 0)
+			dest[len] = LOG_BUF(idx + len);
+	}
+
+	if (took_lock)
+		spin_unlock_irq(&logbuf_lock);
+
+	return ret;
+}
+
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
+/* Return non-zero when syslog action @type requires a capability check. */
+static int syslog_action_restricted(int type)
+{
+	if (dmesg_restrict)
+		return 1;
+	/* Unless restricted, we allow "read all" and "get buffer size" for everybody */
+	return type != SYSLOG_ACTION_READ_ALL && type != SYSLOG_ACTION_SIZE_BUFFER;
+}
+
+/*
+ * Return 0 when the caller may perform syslog action @type, -EPERM
+ * otherwise.  @from_file is true for requests coming through /proc/kmsg.
+ */
+static int check_syslog_permissions(int type, bool from_file)
+{
+	/*
+	 * If this is from /proc/kmsg and we've already opened it, then we've
+	 * already done the capabilities checks at open time.
+	 */
+	if (from_file && type != SYSLOG_ACTION_OPEN)
+		return 0;
+
+	if (syslog_action_restricted(type)) {
+		if (capable(CAP_SYSLOG))
+			return 0;
+		/* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
+		if (capable(CAP_SYS_ADMIN)) {
+			printk_once(KERN_WARNING "%s (%d): "
+				 "Attempt to access syslog with CAP_SYS_ADMIN "
+				 "but no CAP_SYSLOG (deprecated).\n",
+				 current->comm, task_pid_nr(current));
+			return 0;
+		}
+		return -EPERM;
+	}
+	return 0;
+}
+
+/*
+ * Perform syslog action @type.
+ *
+ * @buf/@len: user buffer for the read actions; for
+ *            SYSLOG_ACTION_CONSOLE_LEVEL @len carries the new loglevel
+ * @from_file: passed through to check_syslog_permissions()
+ *
+ * Returns a negative errno on failure; on success the number of bytes
+ * copied for the read actions, the requested size for the SIZE_* actions,
+ * and 0 for the others.
+ */
+int do_syslog(int type, char __user *buf, int len, bool from_file)
+{
+	unsigned i, j, limit, count;
+	int do_clear = 0;
+	char c;
+	int error;
+
+	error = check_syslog_permissions(type, from_file);
+	if (error)
+		goto out;
+
+	error = security_syslog(type);
+	if (error)
+		return error;
+
+	switch (type) {
+	case SYSLOG_ACTION_CLOSE:	/* Close log */
+		break;
+	case SYSLOG_ACTION_OPEN:	/* Open log */
+		break;
+	case SYSLOG_ACTION_READ:	/* Read from log */
+		error = -EINVAL;
+		if (!buf || len < 0)
+			goto out;
+		error = 0;
+		if (!len)
+			goto out;
+		if (!access_ok(VERIFY_WRITE, buf, len)) {
+			error = -EFAULT;
+			goto out;
+		}
+		error = wait_event_interruptible(log_wait,
+							(log_start - log_end));
+		if (error)
+			goto out;
+		i = 0;
+		spin_lock_irq(&logbuf_lock);
+		while (!error && (log_start != log_end) && i < len) {
+			c = LOG_BUF(log_start);
+			log_start++;
+			/* the lock is dropped around each copy to user space */
+			spin_unlock_irq(&logbuf_lock);
+			error = __put_user(c,buf);
+			buf++;
+			i++;
+			cond_resched();
+			spin_lock_irq(&logbuf_lock);
+		}
+		spin_unlock_irq(&logbuf_lock);
+		if (!error)
+			error = i;
+		break;
+	/* Read/clear last kernel messages */
+	case SYSLOG_ACTION_READ_CLEAR:
+		do_clear = 1;
+		/* FALL THRU */
+	/* Read last kernel messages */
+	case SYSLOG_ACTION_READ_ALL:
+		error = -EINVAL;
+		if (!buf || len < 0)
+			goto out;
+		error = 0;
+		if (!len)
+			goto out;
+		if (!access_ok(VERIFY_WRITE, buf, len)) {
+			error = -EFAULT;
+			goto out;
+		}
+		count = len;
+		if (count > log_buf_len)
+			count = log_buf_len;
+		spin_lock_irq(&logbuf_lock);
+		if (count > logged_chars)
+			count = logged_chars;
+		if (do_clear)
+			logged_chars = 0;
+		limit = log_end;
+		/*
+		 * __put_user() could sleep, and while we sleep
+		 * printk() could overwrite the messages
+		 * we try to copy to user space. Therefore
+		 * the messages are copied in reverse. 
+		 */
+		for (i = 0; i < count && !error; i++) {
+			j = limit-1-i;
+			/* stop once index j has been overwritten by new output */
+			if (j + log_buf_len < log_end)
+				break;
+			c = LOG_BUF(j);
+			spin_unlock_irq(&logbuf_lock);
+			error = __put_user(c,&buf[count-1-i]);
+			cond_resched();
+			spin_lock_irq(&logbuf_lock);
+		}
+		spin_unlock_irq(&logbuf_lock);
+		if (error)
+			break;
+		error = i;
+		if (i != count) {
+			int offset = count-error;
+			/* buffer overflow during copy, correct user buffer. */
+			for (i = 0; i < error; i++) {
+				if (__get_user(c,&buf[i+offset]) ||
+				    __put_user(c,&buf[i])) {
+					error = -EFAULT;
+					break;
+				}
+				cond_resched();
+			}
+		}
+		break;
+	/* Clear ring buffer */
+	case SYSLOG_ACTION_CLEAR:
+		logged_chars = 0;
+		break;
+	/* Disable logging to console */
+	case SYSLOG_ACTION_CONSOLE_OFF:
+		if (saved_console_loglevel == -1)
+			saved_console_loglevel = console_loglevel;
+		console_loglevel = minimum_console_loglevel;
+		break;
+	/* Enable logging to console */
+	case SYSLOG_ACTION_CONSOLE_ON:
+		if (saved_console_loglevel != -1) {
+			console_loglevel = saved_console_loglevel;
+			saved_console_loglevel = -1;
+		}
+		break;
+	/* Set level of messages printed to console */
+	case SYSLOG_ACTION_CONSOLE_LEVEL:
+		error = -EINVAL;
+		if (len < 1 || len > 8)
+			goto out;
+		if (len < minimum_console_loglevel)
+			len = minimum_console_loglevel;
+		console_loglevel = len;
+		/* Implicitly re-enable logging to console */
+		saved_console_loglevel = -1;
+		error = 0;
+		break;
+	/* Number of chars in the log buffer */
+	case SYSLOG_ACTION_SIZE_UNREAD:
+		error = log_end - log_start;
+		break;
+	/* Size of the log buffer */
+	case SYSLOG_ACTION_SIZE_BUFFER:
+		error = log_buf_len;
+		break;
+	default:
+		error = -EINVAL;
+		break;
+	}
+out:
+	return error;
+}
+
+/* syslog(2) entry point: do_syslog() with SYSLOG_FROM_CALL as the origin. */
+SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
+{
+	return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
+}
+
+#ifdef	CONFIG_KGDB_KDB
+/* kdb dmesg command needs access to the syslog buffer.  do_syslog()
+ * uses locks so it cannot be used during debugging.  Just tell kdb
+ * where the start and end of the physical and logical logs are.  This
+ * is equivalent to do_syslog(3).
+ */
+void kdb_syslog_data(char *syslog_data[4])
+{
+	syslog_data[0] = log_buf;
+	syslog_data[1] = log_buf + log_buf_len;
+	syslog_data[2] = log_buf + log_end -
+		(logged_chars < log_buf_len ? logged_chars : log_buf_len);
+	syslog_data[3] = log_buf + log_end;
+}
+#endif	/* CONFIG_KGDB_KDB */
+
+/*
+ * Call the console drivers on a range of log_buf
+ *
+ * A console is skipped unless it is enabled and has a write method, and,
+ * when the current CPU is not online, has CON_ANYTIME set.
+ */
+static void __call_console_drivers(unsigned start, unsigned end)
+{
+	struct console *con;
+
+	for_each_console(con) {
+		if (exclusive_console && con != exclusive_console)
+			continue;
+		if ((con->flags & CON_ENABLED) && con->write &&
+				(cpu_online(smp_processor_id()) ||
+				(con->flags & CON_ANYTIME)))
+			con->write(con, &LOG_BUF(start), end - start);
+	}
+}
+
+/* When set, messages reach the console whatever console_loglevel says. */
+static int __read_mostly ignore_loglevel;
+
+static int __init ignore_loglevel_setup(char *str)
+{
+	ignore_loglevel = 1;
+	printk(KERN_INFO "debug: ignoring loglevel setting.\n");
+
+	return 0;
+}
+
+early_param("ignore_loglevel", ignore_loglevel_setup);
+
+/*
+ * Write out chars from start to end - 1 inclusive
+ *
+ * Nothing is written unless @msg_log_level is below console_loglevel or
+ * ignore_loglevel is set.  A range that wraps around the end of log_buf is
+ * emitted as two separate writes.
+ */
+static void _call_console_drivers(unsigned start,
+				unsigned end, int msg_log_level)
+{
+	if ((msg_log_level < console_loglevel || ignore_loglevel) &&
+			console_drivers && start != end) {
+		if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
+			/* wrapped write */
+			__call_console_drivers(start & LOG_BUF_MASK,
+						log_buf_len);
+			__call_console_drivers(0, end & LOG_BUF_MASK);
+		} else {
+			__call_console_drivers(start, end);
+		}
+	}
+}
+
+/*
+ * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
+ * lower 3 bit are the log level, the rest are the log facility.
In case + * userspace passes usual userspace syslog messages to /dev/kmsg or + * /dev/ttyprintk, the log prefix might contain the facility. Printk needs + * to extract the correct log level for in-kernel processing, and not mangle + * the original value. + * + * If a prefix is found, the length of the prefix is returned. If 'level' is + * passed, it will be filled in with the log level without a possible facility + * value. If 'special' is passed, the special printk prefix chars are accepted + * and returned. If no valid header is found, 0 is returned and the passed + * variables are not touched. + */ +static size_t log_prefix(const char *p, unsigned int *level, char *special) +{ + unsigned int lev = 0; + char sp = '\0'; + size_t len; + + if (p[0] != '<' || !p[1]) + return 0; + if (p[2] == '>') { + /* usual single digit level number or special char */ + switch (p[1]) { + case '0' ... '7': + lev = p[1] - '0'; + break; + case 'c': /* KERN_CONT */ + case 'd': /* KERN_DEFAULT */ + sp = p[1]; + break; + default: + return 0; + } + len = 3; + } else { + /* multi digit including the level and facility number; must start with a digit */ + char *endp = NULL; + + if (p[1] < '0' || p[1] > '9') + return 0; + + lev = (simple_strtoul(&p[1], &endp, 10) & 7); + if (endp == NULL || endp[0] != '>') + return 0; + len = (endp + 1) - p; + } + + /* do not accept special char if not asked for */ + if (sp && !special) + return 0; + + if (special) { + *special = sp; + /* return special char, do not touch level */ + if (sp) + return len; + } + + if (level) + *level = lev; + return len; +} + +/* + * Call the console drivers, asking them to write out + * log_buf[start] to log_buf[end - 1]. + * The console_lock must be held. 
+ */ +static void call_console_drivers(unsigned start, unsigned end) +{ + unsigned cur_index, start_print; + static int msg_level = -1; + + BUG_ON(((int)(start - end)) > 0); + + cur_index = start; + start_print = start; + while (cur_index != end) { + if (msg_level < 0 && ((end - cur_index) > 2)) { + /* strip log prefix */ + cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL); + start_print = cur_index; + } + while (cur_index != end) { + char c = LOG_BUF(cur_index); + + cur_index++; + if (c == '\n') { + if (msg_level < 0) { + /* + * printk() has already given us loglevel tags in + * the buffer. This code is here in case the + * log buffer has wrapped right round and scribbled + * on those tags + */ + msg_level = default_message_loglevel; + } + _call_console_drivers(start_print, cur_index, msg_level); + msg_level = -1; + start_print = cur_index; + break; + } + } + } + _call_console_drivers(start_print, end, msg_level); +} + +static void emit_log_char(char c) +{ + LOG_BUF(log_end) = c; + log_end++; + if (log_end - log_start > log_buf_len) + log_start = log_end - log_buf_len; + if (log_end - con_start > log_buf_len) + con_start = log_end - log_buf_len; + if (logged_chars < log_buf_len) + logged_chars++; +} + +/* + * Zap console related locks when oopsing. Only zap at most once + * every 30 seconds, to leave time for slow consoles to print a + * full oops. 
+ */ +static void zap_locks(void) +{ + static unsigned long oops_timestamp; + + if (time_after_eq(jiffies, oops_timestamp) && + !time_after(jiffies, oops_timestamp + 30 * HZ)) + return; + + oops_timestamp = jiffies; + + /* If a crash is occurring, make sure we can't deadlock */ + spin_lock_init(&logbuf_lock); + /* And make sure that we print immediately */ + sema_init(&console_sem, 1); +} + +#if defined(CONFIG_PRINTK_TIME) +static int printk_time = 1; +#else +static int printk_time = 0; +#endif +module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); + +/* Check if we have any console registered that can be called early in boot. */ +static int have_callable_console(void) +{ + struct console *con; + + for_each_console(con) + if (con->flags & CON_ANYTIME) + return 1; + + return 0; +} + +/** + * printk - print a kernel message + * @fmt: format string + * + * This is printk(). It can be called from any context. We want it to work. + * + * We try to grab the console_lock. If we succeed, it's easy - we log the output and + * call the console drivers. If we fail to get the semaphore we place the output + * into the log buffer and return. The current holder of the console_sem will + * notice the new output in console_unlock(); and will send it to the + * consoles before releasing the lock. + * + * One effect of this deferred printing is that code which calls printk() and + * then changes console_loglevel may break. This is because console_loglevel + * is inspected when the actual printing occurs. + * + * See also: + * printf(3) + * + * See the vsnprintf() documentation for format string extensions over C99. + */ + +asmlinkage int printk(const char *fmt, ...) 
+{ + va_list args; + int r; + +#ifdef CONFIG_KGDB_KDB + if (unlikely(kdb_trap_printk)) { + va_start(args, fmt); + r = vkdb_printf(fmt, args); + va_end(args); + return r; + } +#endif + va_start(args, fmt); + r = vprintk(fmt, args); + va_end(args); + + return r; +} + +/* cpu currently holding logbuf_lock */ +static volatile unsigned int printk_cpu = UINT_MAX; + +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have + * been allocated. So unless they're explicitly marked as + * being able to cope (CON_ANYTIME) don't call them until + * this CPU is officially up. + */ +static inline int can_use_console(unsigned int cpu) +{ + return cpu_online(cpu) || have_callable_console(); +} + +/* + * Try to get console ownership to actually show the kernel + * messages from a 'printk'. Return true (and with the + * console_lock held, and 'console_locked' set) if it + * is successful, false otherwise. + * + * This gets called with the 'logbuf_lock' spinlock held and + * interrupts disabled. It should return with 'logbuf_lock' + * released but interrupts still disabled. + */ +static int console_trylock_for_printk(unsigned int cpu) + __releases(&logbuf_lock) +{ + int retval = 0; + + if (console_trylock()) { + retval = 1; + + /* + * If we can't use the console, we need to release + * the console semaphore by hand to avoid flushing + * the buffer. We need to hold the console semaphore + * in order to do this test safely. 
+ */ + if (!can_use_console(cpu)) { + console_locked = 0; + up(&console_sem); + retval = 0; + } + } + printk_cpu = UINT_MAX; + spin_unlock(&logbuf_lock); + return retval; +} +static const char recursion_bug_msg [] = + KERN_CRIT "BUG: recent printk recursion!\n"; +static int recursion_bug; +static int new_text_line = 1; +static char printk_buf[1024]; + +int printk_delay_msec __read_mostly; + +static inline void printk_delay(void) +{ + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; + + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } +} + +asmlinkage int vprintk(const char *fmt, va_list args) +{ + int printed_len = 0; + int current_log_level = default_message_loglevel; + unsigned long flags; + int this_cpu; + char *p; + size_t plen; + char special; + + boot_delay_msec(); + printk_delay(); + + preempt_disable(); + /* This stops the holder of console_sem just where we want him */ + raw_local_irq_save(flags); + this_cpu = smp_processor_id(); + + /* + * Ouch, printk recursed into itself! + */ + if (unlikely(printk_cpu == this_cpu)) { + /* + * If a crash is occurring during printk() on this CPU, + * then try to get the crash message out but make sure + * we can't deadlock. 
Otherwise just return to avoid the + * recursion and return - but flag the recursion so that + * it can be printed at the next appropriate moment: + */ + if (!oops_in_progress) { + recursion_bug = 1; + goto out_restore_irqs; + } + zap_locks(); + } + + lockdep_off(); + spin_lock(&logbuf_lock); + printk_cpu = this_cpu; + + if (recursion_bug) { + recursion_bug = 0; + strcpy(printk_buf, recursion_bug_msg); + printed_len = strlen(recursion_bug_msg); + } + /* Emit the output into the temporary buffer */ + printed_len += vscnprintf(printk_buf + printed_len, + sizeof(printk_buf) - printed_len, fmt, args); + +#ifdef CONFIG_DEBUG_LL + printascii(printk_buf); +#endif + + p = printk_buf; + + /* Read log level and handle special printk prefix */ + plen = log_prefix(p, &current_log_level, &special); + if (plen) { + p += plen; + + switch (special) { + case 'c': /* Strip KERN_CONT, continue line */ + plen = 0; + break; + case 'd': /* Strip KERN_DEFAULT, start new line */ + plen = 0; + default: + if (!new_text_line) { + emit_log_char('\n'); + new_text_line = 1; + } + } + } + + /* + * Copy the output into log_buf. 
If the caller didn't provide + * the appropriate log prefix, we insert them here + */ + for (; *p; p++) { + if (new_text_line) { + new_text_line = 0; + + if (plen) { + /* Copy original log prefix */ + int i; + + for (i = 0; i < plen; i++) + emit_log_char(printk_buf[i]); + printed_len += plen; + } else { + /* Add log prefix */ + emit_log_char('<'); + emit_log_char(current_log_level + '0'); + emit_log_char('>'); + printed_len += 3; + } + + if (printk_time) { + /* Add the current time stamp */ + char tbuf[50], *tp; + unsigned tlen; + unsigned long long t; + unsigned long nanosec_rem; + + t = cpu_clock(printk_cpu); + nanosec_rem = do_div(t, 1000000000); + tlen = sprintf(tbuf, "[%5lu.%06lu] ", + (unsigned long) t, + nanosec_rem / 1000); + + for (tp = tbuf; tp < tbuf + tlen; tp++) + emit_log_char(*tp); + printed_len += tlen; + } + + if (!*p) + break; + } + + emit_log_char(*p); + if (*p == '\n') + new_text_line = 1; + } + + /* + * Try to acquire and then immediately release the + * console semaphore. The release will do all the + * actual magic (print out buffers, wake up klogd, + * etc). + * + * The console_trylock_for_printk() function + * will release 'logbuf_lock' regardless of whether it + * actually gets the semaphore or not. + */ + if (console_trylock_for_printk(this_cpu)) + console_unlock(); + + lockdep_on(); +out_restore_irqs: + raw_local_irq_restore(flags); + + preempt_enable(); + return printed_len; +} +EXPORT_SYMBOL(printk); +EXPORT_SYMBOL(vprintk); + +#else + +static void call_console_drivers(unsigned start, unsigned end) +{ +} + +#endif + +static int __add_preferred_console(char *name, int idx, char *options, + char *brl_options) +{ + struct console_cmdline *c; + int i; + + /* + * See if this tty is not yet registered, and + * if we have a slot free. 
+ */ + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + if (!brl_options) + selected_console = i; + return 0; + } + if (i == MAX_CMDLINECONSOLES) + return -E2BIG; + if (!brl_options) + selected_console = i; + c = &console_cmdline[i]; + strlcpy(c->name, name, sizeof(c->name)); + c->options = options; +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + c->brl_options = brl_options; +#endif + c->index = idx; + return 0; +} +/* + * Set up a list of consoles. Called from init/main.c + */ +static int __init console_setup(char *str) +{ + char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ + char *s, *options, *brl_options = NULL; + int idx; + +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (!memcmp(str, "brl,", 4)) { + brl_options = ""; + str += 4; + } else if (!memcmp(str, "brl=", 4)) { + brl_options = str + 4; + str = strchr(brl_options, ','); + if (!str) { + printk(KERN_ERR "need port name after brl=\n"); + return 1; + } + *(str++) = 0; + } +#endif + + /* + * Decode str into name, index, options. + */ + if (str[0] >= '0' && str[0] <= '9') { + strcpy(buf, "ttyS"); + strncpy(buf + 4, str, sizeof(buf) - 5); + } else { + strncpy(buf, str, sizeof(buf) - 1); + } + buf[sizeof(buf) - 1] = 0; + if ((options = strchr(str, ',')) != NULL) + *(options++) = 0; +#ifdef __sparc__ + if (!strcmp(str, "ttya")) + strcpy(buf, "ttyS0"); + if (!strcmp(str, "ttyb")) + strcpy(buf, "ttyS1"); +#endif + for (s = buf; *s; s++) + if ((*s >= '0' && *s <= '9') || *s == ',') + break; + idx = simple_strtoul(s, NULL, 10); + *s = 0; + + __add_preferred_console(buf, idx, options, brl_options); + console_set_on_cmdline = 1; + return 1; +} +__setup("console=", console_setup); + +/** + * add_preferred_console - add a device to the list of preferred consoles. 
+ * @name: device name + * @idx: device index + * @options: options for this console + * + * The last preferred console added will be used for kernel messages + * and stdin/out/err for init. Normally this is used by console_setup + * above to handle user-supplied console arguments; however it can also + * be used by arch-specific code either to override the user or more + * commonly to provide a default console (ie from PROM variables) when + * the user has not supplied one. + */ +int add_preferred_console(char *name, int idx, char *options) +{ + return __add_preferred_console(name, idx, options, NULL); +} + +int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, char *options) +{ + struct console_cmdline *c; + int i; + + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; i++) + if (strcmp(console_cmdline[i].name, name) == 0 && + console_cmdline[i].index == idx) { + c = &console_cmdline[i]; + strlcpy(c->name, name_new, sizeof(c->name)); + c->name[sizeof(c->name) - 1] = 0; + c->options = options; + c->index = idx_new; + return i; + } + /* not found */ + return -1; +} + +int console_suspend_enabled = 1; +EXPORT_SYMBOL(console_suspend_enabled); + +static int __init console_suspend_disable(char *str) +{ + console_suspend_enabled = 0; + return 1; +} +__setup("no_console_suspend", console_suspend_disable); + +/** + * suspend_console - suspend the console subsystem + * + * This disables printk() while we go into suspend states + */ +void suspend_console(void) +{ + if (!console_suspend_enabled) + return; + printk("Suspending console(s) (use no_console_suspend to debug)\n"); + console_lock(); + console_suspended = 1; + up(&console_sem); +} + +void resume_console(void) +{ + if (!console_suspend_enabled) + return; + down(&console_sem); + console_suspended = 0; + console_unlock(); +} + +/** + * console_cpu_notify - print deferred console messages after CPU hotplug + * @self: notifier struct + * @action: CPU hotplug event + * @hcpu: unused 
+ * + * If printk() is called from a CPU that is not online yet, the messages + * will be spooled but will not show up on the console. This function is + * called when a new CPU comes online (or fails to come up), and ensures + * that any such output gets printed. + */ +static int __cpuinit console_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case CPU_ONLINE: + case CPU_DEAD: + case CPU_DOWN_FAILED: + case CPU_UP_CANCELED: + console_lock(); + console_unlock(); + } + return NOTIFY_OK; +} + +/** + * console_lock - lock the console system for exclusive use. + * + * Acquires a lock which guarantees that the caller has + * exclusive access to the console system and the console_drivers list. + * + * Can sleep, returns nothing. + */ +void console_lock(void) +{ + BUG_ON(in_interrupt()); + down(&console_sem); + if (console_suspended) + return; + console_locked = 1; + console_may_schedule = 1; +} +EXPORT_SYMBOL(console_lock); + +/** + * console_trylock - try to lock the console system for exclusive use. + * + * Tries to acquire a lock which guarantees that the caller has + * exclusive access to the console system and the console_drivers list. + * + * returns 1 on success, and 0 on failure to acquire the lock or if the console is suspended. 
+ */ +int console_trylock(void) +{ + if (down_trylock(&console_sem)) + return 0; + if (console_suspended) { + up(&console_sem); + return 0; + } + console_locked = 1; + console_may_schedule = 0; + return 1; +} +EXPORT_SYMBOL(console_trylock); + +int is_console_locked(void) +{ + return console_locked; +} + +static DEFINE_PER_CPU(int, printk_pending); + +void printk_tick(void) +{ + if (__this_cpu_read(printk_pending)) { + __this_cpu_write(printk_pending, 0); + wake_up_interruptible(&log_wait); + } +} + +int printk_needs_cpu(int cpu) +{ + if (cpu_is_offline(cpu)) + printk_tick(); + return __this_cpu_read(printk_pending); +} + +void wake_up_klogd(void) +{ + if (waitqueue_active(&log_wait)) + this_cpu_write(printk_pending, 1); +} + +/** + * console_unlock - unlock the console system + * + * Releases the console_lock which the caller holds on the console system + * and the console driver list. + * + * While the console_lock was held, console output may have been buffered + * by printk(). If this is the case, console_unlock(); emits + * the output prior to releasing the lock. + * + * If there is output waiting for klogd, we wake it up. + * + * console_unlock(); may be called from any context. 
+ */ +void console_unlock(void) +{ + unsigned long flags; + unsigned _con_start, _log_end; + unsigned wake_klogd = 0; + + if (console_suspended) { + up(&console_sem); + return; + } + + console_may_schedule = 0; + + for ( ; ; ) { + spin_lock_irqsave(&logbuf_lock, flags); + wake_klogd |= log_start - log_end; + if (con_start == log_end) + break; /* Nothing to print */ + _con_start = con_start; + _log_end = log_end; + con_start = log_end; /* Flush */ + spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ + call_console_drivers(_con_start, _log_end); + start_critical_timings(); + local_irq_restore(flags); + } + console_locked = 0; + + /* Release the exclusive_console once it is used */ + if (unlikely(exclusive_console)) + exclusive_console = NULL; + + up(&console_sem); + spin_unlock_irqrestore(&logbuf_lock, flags); + if (wake_klogd) + wake_up_klogd(); +} +EXPORT_SYMBOL(console_unlock); + +/** + * console_conditional_schedule - yield the CPU if required + * + * If the console code is currently allowed to sleep, and + * if this CPU should yield the CPU to another task, do + * so here. + * + * Must be called within console_lock();. + */ +void __sched console_conditional_schedule(void) +{ + if (console_may_schedule) + cond_resched(); +} +EXPORT_SYMBOL(console_conditional_schedule); + +void console_unblank(void) +{ + struct console *c; + + /* + * console_unblank can no longer be called in interrupt context unless + * oops_in_progress is set to 1.. 
+ */ + if (oops_in_progress) { + if (down_trylock(&console_sem) != 0) + return; + } else + console_lock(); + + console_locked = 1; + console_may_schedule = 0; + for_each_console(c) + if ((c->flags & CON_ENABLED) && c->unblank) + c->unblank(); + console_unlock(); +} + +/* + * Return the console tty driver structure and its associated index + */ +struct tty_driver *console_device(int *index) +{ + struct console *c; + struct tty_driver *driver = NULL; + + console_lock(); + for_each_console(c) { + if (!c->device) + continue; + driver = c->device(c, index); + if (driver) + break; + } + console_unlock(); + return driver; +} + +/* + * Prevent further output on the passed console device so that (for example) + * serial drivers can disable console output before suspending a port, and can + * re-enable output afterwards. + */ +void console_stop(struct console *console) +{ + console_lock(); + console->flags &= ~CON_ENABLED; + console_unlock(); +} +EXPORT_SYMBOL(console_stop); + +void console_start(struct console *console) +{ + console_lock(); + console->flags |= CON_ENABLED; + console_unlock(); +} +EXPORT_SYMBOL(console_start); + +static int __read_mostly keep_bootcon; + +static int __init keep_bootcon_setup(char *str) +{ + keep_bootcon = 1; + printk(KERN_INFO "debug: skip boot console de-registration.\n"); + + return 0; +} + +early_param("keep_bootcon", keep_bootcon_setup); + +/* + * The console driver calls this routine during kernel initialization + * to register the console printing procedure with printk() and to + * print any messages that were printed by the kernel before the + * console driver was initialized. + * + * This can happen pretty early during the boot process (because of + * early_printk) - sometimes before setup_arch() completes - be careful + * of what kernel features are used - they may not be initialised yet. 
+ * + * There are two types of consoles - bootconsoles (early_printk) and + * "real" consoles (everything which is not a bootconsole) which are + * handled differently. + * - Any number of bootconsoles can be registered at any time. + * - As soon as a "real" console is registered, all bootconsoles + * will be unregistered automatically. + * - Once a "real" console is registered, any attempt to register a + * bootconsoles will be rejected + */ +void register_console(struct console *newcon) +{ + int i; + unsigned long flags; + struct console *bcon = NULL; + + /* + * before we register a new CON_BOOT console, make sure we don't + * already have a valid console + */ + if (console_drivers && newcon->flags & CON_BOOT) { + /* find the last or real console */ + for_each_console(bcon) { + if (!(bcon->flags & CON_BOOT)) { + printk(KERN_INFO "Too late to register bootconsole %s%d\n", + newcon->name, newcon->index); + return; + } + } + } + + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + + if (preferred_console < 0 || bcon || !console_drivers) + preferred_console = selected_console; + + if (newcon->early_setup) + newcon->early_setup(); + + /* + * See if we want to use this console driver. If we + * didn't select a console we take the first one + * that registers here. + */ + if (preferred_console < 0) { + if (newcon->index < 0) + newcon->index = 0; + if (newcon->setup == NULL || + newcon->setup(newcon, NULL) == 0) { + newcon->flags |= CON_ENABLED; + if (newcon->device) { + newcon->flags |= CON_CONSDEV; + preferred_console = 0; + } + } + } + + /* + * See if this console matches one we selected on + * the command line. 
+ */ + for (i = 0; i < MAX_CMDLINECONSOLES && console_cmdline[i].name[0]; + i++) { + if (strcmp(console_cmdline[i].name, newcon->name) != 0) + continue; + if (newcon->index >= 0 && + newcon->index != console_cmdline[i].index) + continue; + if (newcon->index < 0) + newcon->index = console_cmdline[i].index; +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (console_cmdline[i].brl_options) { + newcon->flags |= CON_BRL; + braille_register_console(newcon, + console_cmdline[i].index, + console_cmdline[i].options, + console_cmdline[i].brl_options); + return; + } +#endif + if (newcon->setup && + newcon->setup(newcon, console_cmdline[i].options) != 0) + break; + newcon->flags |= CON_ENABLED; + newcon->index = console_cmdline[i].index; + if (i == selected_console) { + newcon->flags |= CON_CONSDEV; + preferred_console = selected_console; + } + break; + } + + if (!(newcon->flags & CON_ENABLED)) + return; + + /* + * If we have a bootconsole, and are switching to a real console, + * don't print everything out again, since when the boot console, and + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + newcon->flags &= ~CON_PRINTBUFFER; + + /* + * Put this console in the list - keep the + * preferred driver at the head of the list. + */ + console_lock(); + if ((newcon->flags & CON_CONSDEV) || console_drivers == NULL) { + newcon->next = console_drivers; + console_drivers = newcon; + if (newcon->next) + newcon->next->flags &= ~CON_CONSDEV; + } else { + newcon->next = console_drivers->next; + console_drivers->next = newcon; + } + if (newcon->flags & CON_PRINTBUFFER) { + /* + * console_unlock(); will print out the buffered messages + * for us. + */ + spin_lock_irqsave(&logbuf_lock, flags); + con_start = log_start; + spin_unlock_irqrestore(&logbuf_lock, flags); + /* + * We're about to replay the log buffer. 
Only do this to the + * just-registered console to avoid excessive message spam to + * the already-registered consoles. + */ + exclusive_console = newcon; + } + console_unlock(); + console_sysfs_notify(); + + /* + * By unregistering the bootconsoles after we enable the real console + * we get the "console xxx enabled" message on all the consoles - + * boot consoles, real consoles, etc - this is to ensure that end + * users know there might be something in the kernel's log buffer that + * went to the bootconsole (that they do not see on the real console) + */ + if (bcon && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && + !keep_bootcon) { + /* we need to iterate through twice, to make sure we print + * everything out, before we unregister the console(s) + */ + printk(KERN_INFO "console [%s%d] enabled, bootconsole disabled\n", + newcon->name, newcon->index); + for_each_console(bcon) + if (bcon->flags & CON_BOOT) + unregister_console(bcon); + } else { + printk(KERN_INFO "%sconsole [%s%d] enabled\n", + (newcon->flags & CON_BOOT) ? "boot" : "" , + newcon->name, newcon->index); + } +} +EXPORT_SYMBOL(register_console); + +int unregister_console(struct console *console) +{ + struct console *a, *b; + int res = 1; + +#ifdef CONFIG_A11Y_BRAILLE_CONSOLE + if (console->flags & CON_BRL) + return braille_unregister_console(console); +#endif + + console_lock(); + if (console_drivers == console) { + console_drivers=console->next; + res = 0; + } else if (console_drivers) { + for (a=console_drivers->next, b=console_drivers ; + a; b=a, a=b->next) { + if (a == console) { + b->next = a->next; + res = 0; + break; + } + } + } + + /* + * If this isn't the last console and it has CON_CONSDEV set, we + * need to set it on the next preferred console. 
+ */ + if (console_drivers != NULL && console->flags & CON_CONSDEV) + console_drivers->flags |= CON_CONSDEV; + + console_unlock(); + console_sysfs_notify(); + return res; +} +EXPORT_SYMBOL(unregister_console); + +static int __init printk_late_init(void) +{ + struct console *con; + + for_each_console(con) { + if (!keep_bootcon && con->flags & CON_BOOT) { + printk(KERN_INFO "turn off boot console %s%d\n", + con->name, con->index); + unregister_console(con); + } + } + hotcpu_notifier(console_cpu_notify, 0); + return 0; +} +late_initcall(printk_late_init); + +#if defined CONFIG_PRINTK + +/* + * printk rate limiting, lifted from the networking subsystem. + * + * This enforces a rate limit: not more than 10 kernel messages + * every 5s to make a denial-of-service attack impossible. + */ +DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10); + +int __printk_ratelimit(const char *func) +{ + return ___ratelimit(&printk_ratelimit_state, func); +} +EXPORT_SYMBOL(__printk_ratelimit); + +/** + * printk_timed_ratelimit - caller-controlled printk ratelimiting + * @caller_jiffies: pointer to caller's state + * @interval_msecs: minimum interval between prints + * + * printk_timed_ratelimit() returns true if more than @interval_msecs + * milliseconds have elapsed since the last time printk_timed_ratelimit() + * returned true. + */ +bool printk_timed_ratelimit(unsigned long *caller_jiffies, + unsigned int interval_msecs) +{ + if (*caller_jiffies == 0 + || !time_in_range(jiffies, *caller_jiffies, + *caller_jiffies + + msecs_to_jiffies(interval_msecs))) { + *caller_jiffies = jiffies; + return true; + } + return false; +} +EXPORT_SYMBOL(printk_timed_ratelimit); + +static DEFINE_SPINLOCK(dump_list_lock); +static LIST_HEAD(dump_list); + +/** + * kmsg_dump_register - register a kernel log dumper. + * @dumper: pointer to the kmsg_dumper structure + * + * Adds a kernel log dumper to the system. 
The dump callback in the + * structure will be called when the kernel oopses or panics and must be + * set. Returns zero on success and %-EINVAL or %-EBUSY otherwise. + */ +int kmsg_dump_register(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EBUSY; + + /* The dump callback needs to be set */ + if (!dumper->dump) + return -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + /* Don't allow registering multiple times */ + if (!dumper->registered) { + dumper->registered = 1; + list_add_tail_rcu(&dumper->list, &dump_list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_register); + +/** + * kmsg_dump_unregister - unregister a kmsg dumper. + * @dumper: pointer to the kmsg_dumper structure + * + * Removes a dump device from the system. Returns zero on success and + * %-EINVAL otherwise. + */ +int kmsg_dump_unregister(struct kmsg_dumper *dumper) +{ + unsigned long flags; + int err = -EINVAL; + + spin_lock_irqsave(&dump_list_lock, flags); + if (dumper->registered) { + dumper->registered = 0; + list_del_rcu(&dumper->list); + err = 0; + } + spin_unlock_irqrestore(&dump_list_lock, flags); + synchronize_rcu(); + + return err; +} +EXPORT_SYMBOL_GPL(kmsg_dump_unregister); + +/** + * kmsg_dump - dump kernel log to kernel message dumpers. + * @reason: the reason (oops, panic etc) for dumping + * + * Iterate through each of the dump devices and call the oops/panic + * callbacks with the log buffer. + */ +void kmsg_dump(enum kmsg_dump_reason reason) +{ + unsigned long end; + unsigned chars; + struct kmsg_dumper *dumper; + const char *s1, *s2; + unsigned long l1, l2; + unsigned long flags; + + /* Theoretically, the log could move on after we do this, but + there's not a lot we can do about that. The new messages + will overwrite the start of what we dump. 
*/ + spin_lock_irqsave(&logbuf_lock, flags); + end = log_end & LOG_BUF_MASK; + chars = logged_chars; + spin_unlock_irqrestore(&logbuf_lock, flags); + + if (chars > end) { + s1 = log_buf + log_buf_len - chars + end; + l1 = chars - end; + + s2 = log_buf; + l2 = end; + } else { + s1 = ""; + l1 = 0; + + s2 = log_buf + end - chars; + l2 = chars; + } + + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) + dumper->dump(dumper, reason, s1, l1, s2, l2); + rcu_read_unlock(); +} +#endif diff --git a/kernel/profile.c b/kernel/profile.c new file mode 100644 index 00000000..961b389f --- /dev/null +++ b/kernel/profile.c @@ -0,0 +1,631 @@ +/* + * linux/kernel/profile.c + * Simple profiling. Manages a direct-mapped profile hit count buffer, + * with configurable resolution, support for restricting the cpus on + * which profiling is done, and switching between cpu time and + * schedule() calls via kernel command line parameters passed at boot. + * + * Scheduler profiling support, Arjan van de Ven and Ingo Molnar, + * Red Hat, July 2004 + * Consolidation of architecture support code for profiling, + * William Irwin, Oracle, July 2004 + * Amortized hit count accounting via per-cpu open-addressed hashtables + * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 + */ + +#include <linux/module.h> +#include <linux/profile.h> +#include <linux/bootmem.h> +#include <linux/notifier.h> +#include <linux/mm.h> +#include <linux/cpumask.h> +#include <linux/cpu.h> +#include <linux/highmem.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <asm/sections.h> +#include <asm/irq_regs.h> +#include <asm/ptrace.h> + +struct profile_hit { + u32 pc, hits; +}; +#define PROFILE_GRPSHIFT 3 +#define PROFILE_GRPSZ (1 << PROFILE_GRPSHIFT) +#define NR_PROFILE_HIT (PAGE_SIZE/sizeof(struct profile_hit)) +#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ) + +/* Oprofile timer tick hook */ +static int (*timer_hook)(struct pt_regs *) __read_mostly; + +static atomic_t *prof_buffer; +static unsigned long prof_len, prof_shift; + +int prof_on __read_mostly; +EXPORT_SYMBOL_GPL(prof_on); + +static cpumask_var_t prof_cpu_mask; +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct 
profile_hit *[2], cpu_profile_hits); +static DEFINE_PER_CPU(int, cpu_profile_flip); +static DEFINE_MUTEX(profile_flip_mutex); +#endif /* CONFIG_SMP */ + +int profile_setup(char *str) +{ + static char schedstr[] = "schedule"; + static char sleepstr[] = "sleep"; + static char kvmstr[] = "kvm"; + int par; + + if (!strncmp(str, sleepstr, strlen(sleepstr))) { +#ifdef CONFIG_SCHEDSTATS + prof_on = SLEEP_PROFILING; + if (str[strlen(sleepstr)] == ',') + str += strlen(sleepstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel sleep profiling enabled (shift: %ld)\n", + prof_shift); +#else + printk(KERN_WARNING + "kernel sleep profiling requires CONFIG_SCHEDSTATS\n"); +#endif /* CONFIG_SCHEDSTATS */ + } else if (!strncmp(str, schedstr, strlen(schedstr))) { + prof_on = SCHED_PROFILING; + if (str[strlen(schedstr)] == ',') + str += strlen(schedstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel schedule profiling enabled (shift: %ld)\n", + prof_shift); + } else if (!strncmp(str, kvmstr, strlen(kvmstr))) { + prof_on = KVM_PROFILING; + if (str[strlen(kvmstr)] == ',') + str += strlen(kvmstr) + 1; + if (get_option(&str, &par)) + prof_shift = par; + printk(KERN_INFO + "kernel KVM profiling enabled (shift: %ld)\n", + prof_shift); + } else if (get_option(&str, &par)) { + prof_shift = par; + prof_on = CPU_PROFILING; + printk(KERN_INFO "kernel profiling enabled (shift: %ld)\n", + prof_shift); + } + return 1; +} +__setup("profile=", profile_setup); + + +int __ref profile_init(void) +{ + int buffer_bytes; + if (!prof_on) + return 0; + + /* only text is profiled */ + prof_len = (_etext - _stext) >> prof_shift; + buffer_bytes = prof_len*sizeof(atomic_t); + + if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + cpumask_copy(prof_cpu_mask, cpu_possible_mask); + + prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL|__GFP_NOWARN); + if (prof_buffer) + return 0; + + prof_buffer = 
alloc_pages_exact(buffer_bytes, + GFP_KERNEL|__GFP_ZERO|__GFP_NOWARN); + if (prof_buffer) + return 0; + + prof_buffer = vzalloc(buffer_bytes); + if (prof_buffer) + return 0; + + free_cpumask_var(prof_cpu_mask); + return -ENOMEM; +} + +/* Profile event notifications */ + +static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); +static ATOMIC_NOTIFIER_HEAD(task_free_notifier); +static BLOCKING_NOTIFIER_HEAD(munmap_notifier); + +void profile_task_exit(struct task_struct *task) +{ + blocking_notifier_call_chain(&task_exit_notifier, 0, task); +} + +int profile_handoff_task(struct task_struct *task) +{ + int ret; + ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); + return (ret == NOTIFY_OK) ? 1 : 0; +} + +void profile_munmap(unsigned long addr) +{ + blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); +} + +int task_handoff_register(struct notifier_block *n) +{ + return atomic_notifier_chain_register(&task_free_notifier, n); +} +EXPORT_SYMBOL_GPL(task_handoff_register); + +int task_handoff_unregister(struct notifier_block *n) +{ + return atomic_notifier_chain_unregister(&task_free_notifier, n); +} +EXPORT_SYMBOL_GPL(task_handoff_unregister); + +int profile_event_register(enum profile_type type, struct notifier_block *n) +{ + int err = -EINVAL; + + switch (type) { + case PROFILE_TASK_EXIT: + err = blocking_notifier_chain_register( + &task_exit_notifier, n); + break; + case PROFILE_MUNMAP: + err = blocking_notifier_chain_register( + &munmap_notifier, n); + break; + } + + return err; +} +EXPORT_SYMBOL_GPL(profile_event_register); + +int profile_event_unregister(enum profile_type type, struct notifier_block *n) +{ + int err = -EINVAL; + + switch (type) { + case PROFILE_TASK_EXIT: + err = blocking_notifier_chain_unregister( + &task_exit_notifier, n); + break; + case PROFILE_MUNMAP: + err = blocking_notifier_chain_unregister( + &munmap_notifier, n); + break; + } + + return err; +} +EXPORT_SYMBOL_GPL(profile_event_unregister); + +int 
register_timer_hook(int (*hook)(struct pt_regs *)) +{ + if (timer_hook) + return -EBUSY; + timer_hook = hook; + return 0; +} +EXPORT_SYMBOL_GPL(register_timer_hook); + +void unregister_timer_hook(int (*hook)(struct pt_regs *)) +{ + WARN_ON(hook != timer_hook); + timer_hook = NULL; + /* make sure all CPUs see the NULL hook */ + synchronize_sched(); /* Allow ongoing interrupts to complete. */ +} +EXPORT_SYMBOL_GPL(unregister_timer_hook); + + +#ifdef CONFIG_SMP +/* + * Each cpu has a pair of open-addressed hashtables for pending + * profile hits. read_profile() IPI's all cpus to request them + * to flip buffers and flushes their contents to prof_buffer itself. + * Flip requests are serialized by the profile_flip_mutex. The sole + * use of having a second hashtable is for avoiding cacheline + * contention that would otherwise happen during flushes of pending + * profile hits required for the accuracy of reported profile hits + * and so resurrect the interrupt livelock issue. + * + * The open-addressed hashtables are indexed by profile buffer slot + * and hold the number of pending hits to that profile buffer slot on + * a cpu in an entry. When the hashtable overflows, all pending hits + * are accounted to their corresponding profile buffer slots with + * atomic_add() and the hashtable emptied. As numerous pending hits + * may be accounted to a profile buffer slot in a hashtable entry, + * this amortizes a number of atomic profile buffer increments likely + * to be far larger than the number of entries in the hashtable, + * particularly given that the number of distinct profile buffer + * positions to which hits are accounted during short intervals (e.g. + * several seconds) is usually very small. Exclusion from buffer + * flipping is provided by interrupt disablement (note that for + * SCHED_PROFILING or SLEEP_PROFILING profile_hit() may be called from + * process context). 
+ * The hash function is meant to be lightweight as opposed to strong, + * and was vaguely inspired by ppc64 firmware-supported inverted + * pagetable hash functions, but uses a full hashtable full of finite + * collision chains, not just pairs of them. + * + * -- wli + */ +static void __profile_flip_buffers(void *unused) +{ + int cpu = smp_processor_id(); + + per_cpu(cpu_profile_flip, cpu) = !per_cpu(cpu_profile_flip, cpu); +} + +static void profile_flip_buffers(void) +{ + int i, j, cpu; + + mutex_lock(&profile_flip_mutex); + j = per_cpu(cpu_profile_flip, get_cpu()); + put_cpu(); + on_each_cpu(__profile_flip_buffers, NULL, 1); + for_each_online_cpu(cpu) { + struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; + for (i = 0; i < NR_PROFILE_HIT; ++i) { + if (!hits[i].hits) { + if (hits[i].pc) + hits[i].pc = 0; + continue; + } + atomic_add(hits[i].hits, &prof_buffer[hits[i].pc]); + hits[i].hits = hits[i].pc = 0; + } + } + mutex_unlock(&profile_flip_mutex); +} + +static void profile_discard_flip_buffers(void) +{ + int i, cpu; + + mutex_lock(&profile_flip_mutex); + i = per_cpu(cpu_profile_flip, get_cpu()); + put_cpu(); + on_each_cpu(__profile_flip_buffers, NULL, 1); + for_each_online_cpu(cpu) { + struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; + memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); + } + mutex_unlock(&profile_flip_mutex); +} + +static void do_profile_hits(int type, void *__pc, unsigned int nr_hits) +{ + unsigned long primary, secondary, flags, pc = (unsigned long)__pc; + int i, j, cpu; + struct profile_hit *hits; + + pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1); + i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; + secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT; + cpu = get_cpu(); + hits = per_cpu(cpu_profile_hits, cpu)[per_cpu(cpu_profile_flip, cpu)]; + if (!hits) { + put_cpu(); + return; + } + /* + * We buffer the global profiler buffer into a per-CPU + * queue and 
/*
 * CPU hotplug notifier for the per-cpu pending-hit tables.
 *
 * @info:   notifier block (unused here)
 * @action: hotplug event being reported
 * @__cpu:  number of the affected CPU, passed as a pointer-sized integer
 *
 * On CPU_UP_PREPARE the two tables of the CPU are allocated if they do not
 * exist yet; on CPU_ONLINE the CPU is added to prof_cpu_mask; on
 * CPU_UP_CANCELED/CPU_DEAD it is removed from the mask and both tables are
 * freed.  Returns NOTIFY_OK, or notifier_from_errno(-ENOMEM) when a table
 * cannot be allocated.
 */
static int __cpuinit profile_cpu_callback(struct notifier_block *info,
					unsigned long action, void *__cpu)
{
	int node, cpu = (unsigned long)__cpu;
	struct page *page;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		node = cpu_to_mem(cpu);
		/* A CPU coming up always starts on table 0. */
		per_cpu(cpu_profile_flip, cpu) = 0;
		if (!per_cpu(cpu_profile_hits, cpu)[1]) {
			page = alloc_pages_exact_node(node,
					GFP_KERNEL | __GFP_ZERO,
					0);
			if (!page)
				return notifier_from_errno(-ENOMEM);
			per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
		}
		if (!per_cpu(cpu_profile_hits, cpu)[0]) {
			page = alloc_pages_exact_node(node,
					GFP_KERNEL | __GFP_ZERO,
					0);
			if (!page)
				goto out_free;
			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
		}
		break;
out_free:
		/*
		 * Only reached by the goto above (the preceding break skips
		 * it): table 0 could not be allocated, so release table 1
		 * again and report the failure.
		 */
		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
		__free_page(page);
		return notifier_from_errno(-ENOMEM);
	case CPU_ONLINE:
	case CPU_ONLINE_FROZEN:
		if (prof_cpu_mask != NULL)
			cpumask_set_cpu(cpu, prof_cpu_mask);
		break;
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		if (prof_cpu_mask != NULL)
			cpumask_clear_cpu(cpu, prof_cpu_mask);
		/* Clear each pointer before freeing the page behind it. */
		if (per_cpu(cpu_profile_hits, cpu)[0]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]);
			per_cpu(cpu_profile_hits, cpu)[0] = NULL;
			__free_page(page);
		}
		if (per_cpu(cpu_profile_hits, cpu)[1]) {
			page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
			per_cpu(cpu_profile_hits, cpu)[1] = NULL;
			__free_page(page);
		}
		break;
	}
	return NOTIFY_OK;
}
free_cpumask_var(new_value); + return err; +} + +static const struct file_operations prof_cpu_mask_proc_fops = { + .open = prof_cpu_mask_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = prof_cpu_mask_proc_write, +}; + +void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) +{ + /* create /proc/irq/prof_cpu_mask */ + proc_create("prof_cpu_mask", 0600, root_irq_dir, &prof_cpu_mask_proc_fops); +} + +/* + * This function accesses profiling information. The returned data is + * binary: the sampling step and the actual contents of the profile + * buffer. Use of the program readprofile is recommended in order to + * get meaningful info out of these data. + */ +static ssize_t +read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + unsigned long p = *ppos; + ssize_t read; + char *pnt; + unsigned int sample_step = 1 << prof_shift; + + profile_flip_buffers(); + if (p >= (prof_len+1)*sizeof(unsigned int)) + return 0; + if (count > (prof_len+1)*sizeof(unsigned int) - p) + count = (prof_len+1)*sizeof(unsigned int) - p; + read = 0; + + while (p < sizeof(unsigned int) && count > 0) { + if (put_user(*((char *)(&sample_step)+p), buf)) + return -EFAULT; + buf++; p++; count--; read++; + } + pnt = (char *)prof_buffer + p - sizeof(atomic_t); + if (copy_to_user(buf, (void *)pnt, count)) + return -EFAULT; + read += count; + *ppos += read; + return read; +} + +/* + * Writing to /proc/profile resets the counters + * + * Writing a 'profiling multiplier' value into it also re-sets the profiling + * interrupt frequency, on architectures that support this. 
+ */ +static ssize_t write_profile(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ +#ifdef CONFIG_SMP + extern int setup_profiling_timer(unsigned int multiplier); + + if (count == sizeof(int)) { + unsigned int multiplier; + + if (copy_from_user(&multiplier, buf, sizeof(int))) + return -EFAULT; + + if (setup_profiling_timer(multiplier)) + return -EINVAL; + } +#endif + profile_discard_flip_buffers(); + memset(prof_buffer, 0, prof_len * sizeof(atomic_t)); + return count; +} + +static const struct file_operations proc_profile_operations = { + .read = read_profile, + .write = write_profile, + .llseek = default_llseek, +}; + +#ifdef CONFIG_SMP +static void profile_nop(void *unused) +{ +} + +static int create_hash_tables(void) +{ + int cpu; + + for_each_online_cpu(cpu) { + int node = cpu_to_mem(cpu); + struct page *page; + + page = alloc_pages_exact_node(node, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + 0); + if (!page) + goto out_cleanup; + per_cpu(cpu_profile_hits, cpu)[1] + = (struct profile_hit *)page_address(page); + page = alloc_pages_exact_node(node, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + 0); + if (!page) + goto out_cleanup; + per_cpu(cpu_profile_hits, cpu)[0] + = (struct profile_hit *)page_address(page); + } + return 0; +out_cleanup: + prof_on = 0; + smp_mb(); + on_each_cpu(profile_nop, NULL, 1); + for_each_online_cpu(cpu) { + struct page *page; + + if (per_cpu(cpu_profile_hits, cpu)[0]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[0]); + per_cpu(cpu_profile_hits, cpu)[0] = NULL; + __free_page(page); + } + if (per_cpu(cpu_profile_hits, cpu)[1]) { + page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); + per_cpu(cpu_profile_hits, cpu)[1] = NULL; + __free_page(page); + } + } + return -1; +} +#else +#define create_hash_tables() ({ 0; }) +#endif + +int __ref create_proc_profile(void) /* false positive from hotcpu_notifier */ +{ + struct proc_dir_entry *entry; + + if (!prof_on) + return 0; + if (create_hash_tables()) + 
return -ENOMEM; + entry = proc_create("profile", S_IWUSR | S_IRUGO, + NULL, &proc_profile_operations); + if (!entry) + return 0; + entry->size = (1+prof_len) * sizeof(atomic_t); + hotcpu_notifier(profile_cpu_callback, 0); + return 0; +} +module_init(create_proc_profile); +#endif /* CONFIG_PROC_FS */ diff --git a/kernel/ptrace.c b/kernel/ptrace.c new file mode 100644 index 00000000..2df11579 --- /dev/null +++ b/kernel/ptrace.c @@ -0,0 +1,942 @@ +/* + * linux/kernel/ptrace.c + * + * (C) Copyright 1999 Linus Torvalds + * + * Common interfaces for "ptrace()" which we do not want + * to continually duplicate across every architecture. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * ptrace a task: make the debugger its new parent and + * move it to the ptrace list. + * + * Must be called with the tasklist lock write-held. + */ +void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) +{ + BUG_ON(!list_empty(&child->ptrace_entry)); + list_add(&child->ptrace_entry, &new_parent->ptraced); + child->parent = new_parent; +} + +/** + * __ptrace_unlink - unlink ptracee and restore its execution state + * @child: ptracee to be unlinked + * + * Remove @child from the ptrace list, move it back to the original parent, + * and restore the execution state so that it conforms to the group stop + * state. + * + * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer + * exiting. For PTRACE_DETACH, unless the ptracee has been killed between + * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED. + * If the ptracer is exiting, the ptracee can be in any state. + * + * After detach, the ptracee should be in a state which conforms to the + * group stop. 
If the group is stopped or in the process of stopping, the + * ptracee should be put into TASK_STOPPED; otherwise, it should be woken + * up from TASK_TRACED. + * + * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED, + * it goes through TRACED -> RUNNING -> STOPPED transition which is similar + * to but in the opposite direction of what happens while attaching to a + * stopped task. However, in this direction, the intermediate RUNNING + * state is not hidden even from the current ptracer and if it immediately + * re-attaches and performs a WNOHANG wait(2), it may fail. + * + * CONTEXT: + * write_lock_irq(tasklist_lock) + */ +void __ptrace_unlink(struct task_struct *child) +{ + BUG_ON(!child->ptrace); + + child->ptrace = 0; + child->parent = child->real_parent; + list_del_init(&child->ptrace_entry); + + spin_lock(&child->sighand->siglock); + + /* + * Reinstate GROUP_STOP_PENDING if group stop is in effect and + * @child isn't dead. + */ + if (!(child->flags & PF_EXITING) && + (child->signal->flags & SIGNAL_STOP_STOPPED || + child->signal->group_stop_count)) + child->group_stop |= GROUP_STOP_PENDING; + + /* + * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick + * @child in the butt. Note that @resume should be used iff @child + * is in TASK_TRACED; otherwise, we might unduly disrupt + * TASK_KILLABLE sleeps. + */ + if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child)) + signal_wake_up(child, task_is_traced(child)); + + spin_unlock(&child->sighand->siglock); +} + +/* + * Check that we have indeed attached to the thing.. + */ +int ptrace_check_attach(struct task_struct *child, int kill) +{ + int ret = -ESRCH; + + /* + * We take the read lock around doing both checks to close a + * possible race where someone else was tracing our child and + * detached between these two checks. 
After this locked check, + * we are sure that this is our traced child and that can only + * be changed by us so it's not changing right after this. + */ + read_lock(&tasklist_lock); + if ((child->ptrace & PT_PTRACED) && child->parent == current) { + /* + * child->sighand can't be NULL, release_task() + * does ptrace_unlink() before __exit_signal(). + */ + spin_lock_irq(&child->sighand->siglock); + WARN_ON_ONCE(task_is_stopped(child)); + if (task_is_traced(child) || kill) + ret = 0; + spin_unlock_irq(&child->sighand->siglock); + } + read_unlock(&tasklist_lock); + + if (!ret && !kill) + ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; + + /* All systems go.. */ + return ret; +} + +int __ptrace_may_access(struct task_struct *task, unsigned int mode) +{ + const struct cred *cred = current_cred(), *tcred; + + /* May we inspect the given task? + * This check is used both for attaching with ptrace + * and for allowing access to sensitive information in /proc. + * + * ptrace_attach denies several cases that /proc allows + * because setting up the necessary parent/child relationship + * or halting the specified task is impossible. 
/*
 * Attach current as the ptracer of @task.
 *
 * Returns 0 on success; -EPERM if @task is a kernel thread, belongs to
 * current's own thread group, has an exit state or is already traced;
 * -ERESTARTNOINTR if waiting for cred_guard_mutex was interrupted; or the
 * error returned by __ptrace_may_access().
 */
static int ptrace_attach(struct task_struct *task)
{
	bool wait_trap = false;
	int retval;

	audit_ptrace(task);

	retval = -EPERM;
	if (unlikely(task->flags & PF_KTHREAD))
		goto out;
	if (same_thread_group(task, current))
		goto out;

	/*
	 * Protect exec's credential calculations against our interference;
	 * SUID, SGID and LSM creds get determined differently
	 * under ptrace.
	 */
	retval = -ERESTARTNOINTR;
	if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
		goto out;

	task_lock(task);
	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
	task_unlock(task);
	if (retval)
		goto unlock_creds;

	write_lock_irq(&tasklist_lock);
	retval = -EPERM;
	if (unlikely(task->exit_state))
		goto unlock_tasklist;
	if (task->ptrace)
		goto unlock_tasklist;

	task->ptrace = PT_PTRACED;
	if (task_ns_capable(task, CAP_SYS_PTRACE))
		task->ptrace |= PT_PTRACE_CAP;

	__ptrace_link(task, current);
	send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);

	spin_lock(&task->sighand->siglock);

	/*
	 * If the task is already STOPPED, set GROUP_STOP_PENDING and
	 * TRAPPING, and kick it so that it transits to TRACED.  TRAPPING
	 * will be cleared if the child completes the transition or any
	 * event which clears the group stop states happens.  We'll wait
	 * for the transition to complete before returning from this
	 * function.
	 *
	 * This hides STOPPED -> RUNNING -> TRACED transition from the
	 * attaching thread but a different thread in the same group can
	 * still observe the transient RUNNING state.  IOW, if another
	 * thread's WNOHANG wait(2) on the stopped tracee races against
	 * ATTACH, the wait(2) may fail due to the transient RUNNING.
	 *
	 * The following task_is_stopped() test is safe as both transitions
	 * in and out of STOPPED are protected by siglock.
	 */
	if (task_is_stopped(task)) {
		task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
		signal_wake_up(task, 1);
		wait_trap = true;
	}

	spin_unlock(&task->sighand->siglock);

	retval = 0;
unlock_tasklist:
	write_unlock_irq(&tasklist_lock);
unlock_creds:
	mutex_unlock(&task->signal->cred_guard_mutex);
out:
	/*
	 * wait_trap is only set on the success path above; the wait is done
	 * here, after tasklist_lock and cred_guard_mutex have been dropped.
	 */
	if (wait_trap)
		wait_event(current->signal->wait_chldexit,
			   !(task->group_stop & GROUP_STOP_TRAPPING));
	return retval;
}
+ * Should be used by all ptrace implementations for PTRACE_TRACEME. + */ +static int ptrace_traceme(void) +{ + int ret = -EPERM; + + write_lock_irq(&tasklist_lock); + /* Are we already being traced? */ + if (!current->ptrace) { + ret = security_ptrace_traceme(current->parent); + /* + * Check PF_EXITING to ensure ->real_parent has not passed + * exit_ptrace(). Otherwise we don't report the error but + * pretend ->real_parent untraces us right after return. + */ + if (!ret && !(current->real_parent->flags & PF_EXITING)) { + current->ptrace = PT_PTRACED; + __ptrace_link(current, current->real_parent); + } + } + write_unlock_irq(&tasklist_lock); + + return ret; +} + +/* + * Called with irqs disabled, returns true if childs should reap themselves. + */ +static int ignoring_children(struct sighand_struct *sigh) +{ + int ret; + spin_lock(&sigh->siglock); + ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) || + (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT); + spin_unlock(&sigh->siglock); + return ret; +} + +/* + * Called with tasklist_lock held for writing. + * Unlink a traced task, and clean it up if it was a traced zombie. + * Return true if it needs to be reaped with release_task(). + * (We can't call release_task() here because we already hold tasklist_lock.) + * + * If it's a zombie, our attachedness prevented normal parent notification + * or self-reaping. Do notification now if it would have happened earlier. + * If it should reap itself, return true. + * + * If it's our own child, there is no notification to do. But if our normal + * children self-reap, then this child was prevented by ptrace and we must + * reap it now, in that case we must also wake up sub-threads sleeping in + * do_wait(). 
+ */ +static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p) +{ + __ptrace_unlink(p); + + if (p->exit_state == EXIT_ZOMBIE) { + if (!task_detached(p) && thread_group_empty(p)) { + if (!same_thread_group(p->real_parent, tracer)) + do_notify_parent(p, p->exit_signal); + else if (ignoring_children(tracer->sighand)) { + __wake_up_parent(p, tracer); + p->exit_signal = -1; + } + } + if (task_detached(p)) { + /* Mark it as in the process of being reaped. */ + p->exit_state = EXIT_DEAD; + return true; + } + } + + return false; +} + +static int ptrace_detach(struct task_struct *child, unsigned int data) +{ + bool dead = false; + + if (!valid_signal(data)) + return -EIO; + + /* Architecture-specific hardware disable .. */ + ptrace_disable(child); + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + + write_lock_irq(&tasklist_lock); + /* + * This child can be already killed. Make sure de_thread() or + * our sub-thread doing do_wait() didn't do release_task() yet. + */ + if (child->ptrace) { + child->exit_code = data; + dead = __ptrace_detach(current, child); + } + write_unlock_irq(&tasklist_lock); + + if (unlikely(dead)) + release_task(child); + + return 0; +} + +/* + * Detach all tasks we were using ptrace on. Called with tasklist held + * for writing, and returns with it held too. But note it can release + * and reacquire the lock. 
+ */ +void exit_ptrace(struct task_struct *tracer) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct task_struct *p, *n; + LIST_HEAD(ptrace_dead); + + if (likely(list_empty(&tracer->ptraced))) + return; + + list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) { + if (__ptrace_detach(tracer, p)) + list_add(&p->ptrace_entry, &ptrace_dead); + } + + write_unlock_irq(&tasklist_lock); + BUG_ON(!list_empty(&tracer->ptraced)); + + list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } + + write_lock_irq(&tasklist_lock); +} + +int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) +{ + int copied = 0; + + while (len > 0) { + char buf[128]; + int this_len, retval; + + this_len = (len > sizeof(buf)) ? sizeof(buf) : len; + retval = access_process_vm(tsk, src, buf, this_len, 0); + if (!retval) { + if (copied) + break; + return -EIO; + } + if (copy_to_user(dst, buf, retval)) + return -EFAULT; + copied += retval; + src += retval; + dst += retval; + len -= retval; + } + return copied; +} + +int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len) +{ + int copied = 0; + + while (len > 0) { + char buf[128]; + int this_len, retval; + + this_len = (len > sizeof(buf)) ? 
sizeof(buf) : len; + if (copy_from_user(buf, src, this_len)) + return -EFAULT; + retval = access_process_vm(tsk, dst, buf, this_len, 1); + if (!retval) { + if (copied) + break; + return -EIO; + } + copied += retval; + src += retval; + dst += retval; + len -= retval; + } + return copied; +} + +static int ptrace_setoptions(struct task_struct *child, unsigned long data) +{ + child->ptrace &= ~PT_TRACE_MASK; + + if (data & PTRACE_O_TRACESYSGOOD) + child->ptrace |= PT_TRACESYSGOOD; + + if (data & PTRACE_O_TRACEFORK) + child->ptrace |= PT_TRACE_FORK; + + if (data & PTRACE_O_TRACEVFORK) + child->ptrace |= PT_TRACE_VFORK; + + if (data & PTRACE_O_TRACECLONE) + child->ptrace |= PT_TRACE_CLONE; + + if (data & PTRACE_O_TRACEEXEC) + child->ptrace |= PT_TRACE_EXEC; + + if (data & PTRACE_O_TRACEVFORKDONE) + child->ptrace |= PT_TRACE_VFORK_DONE; + + if (data & PTRACE_O_TRACEEXIT) + child->ptrace |= PT_TRACE_EXIT; + + return (data & ~PTRACE_O_MASK) ? -EINVAL : 0; +} + +static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) +{ + unsigned long flags; + int error = -ESRCH; + + if (lock_task_sighand(child, &flags)) { + error = -EINVAL; + if (likely(child->last_siginfo != NULL)) { + *info = *child->last_siginfo; + error = 0; + } + unlock_task_sighand(child, &flags); + } + return error; +} + +static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) +{ + unsigned long flags; + int error = -ESRCH; + + if (lock_task_sighand(child, &flags)) { + error = -EINVAL; + if (likely(child->last_siginfo != NULL)) { + *child->last_siginfo = *info; + error = 0; + } + unlock_task_sighand(child, &flags); + } + return error; +} + + +#ifdef PTRACE_SINGLESTEP +#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) +#else +#define is_singlestep(request) 0 +#endif + +#ifdef PTRACE_SINGLEBLOCK +#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) +#else +#define is_singleblock(request) 0 +#endif + +#ifdef PTRACE_SYSEMU +#define 
is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP) +#else +#define is_sysemu_singlestep(request) 0 +#endif + +static int ptrace_resume(struct task_struct *child, long request, + unsigned long data) +{ + if (!valid_signal(data)) + return -EIO; + + if (request == PTRACE_SYSCALL) + set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + else + clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + +#ifdef TIF_SYSCALL_EMU + if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) + set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + else + clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#endif + + if (is_singleblock(request)) { + if (unlikely(!arch_has_block_step())) + return -EIO; + user_enable_block_step(child); + } else if (is_singlestep(request) || is_sysemu_singlestep(request)) { + if (unlikely(!arch_has_single_step())) + return -EIO; + user_enable_single_step(child); + } else { + user_disable_single_step(child); + } + + child->exit_code = data; + wake_up_state(child, __TASK_TRACED); + + return 0; +} + +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + +static const struct user_regset * +find_regset(const struct user_regset_view *view, unsigned int type) +{ + const struct user_regset *regset; + int n; + + for (n = 0; n < view->n; ++n) { + regset = view->regsets + n; + if (regset->core_note_type == type) + return regset; + } + + return NULL; +} + +static int ptrace_regset(struct task_struct *task, int req, unsigned int type, + struct iovec *kiov) +{ + const struct user_regset_view *view = task_user_regset_view(task); + const struct user_regset *regset = find_regset(view, type); + int regset_no; + + if (!regset || (kiov->iov_len % regset->size) != 0) + return -EINVAL; + + regset_no = regset - view->regsets; + kiov->iov_len = min(kiov->iov_len, + (__kernel_size_t) (regset->n * regset->size)); + + if (req == PTRACE_GETREGSET) + return copy_regset_to_user(task, view, regset_no, 0, + kiov->iov_len, kiov->iov_base); + else + return copy_regset_from_user(task, view, 
regset_no, 0, + kiov->iov_len, kiov->iov_base); +} + +#endif + +int ptrace_request(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret = -EIO; + siginfo_t siginfo; + void __user *datavp = (void __user *) data; + unsigned long __user *datalp = datavp; + + switch (request) { + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + return generic_ptrace_peekdata(child, addr, data); + case PTRACE_POKETEXT: + case PTRACE_POKEDATA: + return generic_ptrace_pokedata(child, addr, data); + +#ifdef PTRACE_OLDSETOPTIONS + case PTRACE_OLDSETOPTIONS: +#endif + case PTRACE_SETOPTIONS: + ret = ptrace_setoptions(child, data); + break; + case PTRACE_GETEVENTMSG: + ret = put_user(child->ptrace_message, datalp); + break; + + case PTRACE_GETSIGINFO: + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) + ret = copy_siginfo_to_user(datavp, &siginfo); + break; + + case PTRACE_SETSIGINFO: + if (copy_from_user(&siginfo, datavp, sizeof siginfo)) + ret = -EFAULT; + else + ret = ptrace_setsiginfo(child, &siginfo); + break; + + case PTRACE_DETACH: /* detach a process that was attached. 
*/ + ret = ptrace_detach(child, data); + break; + +#ifdef CONFIG_BINFMT_ELF_FDPIC + case PTRACE_GETFDPIC: { + struct mm_struct *mm = get_task_mm(child); + unsigned long tmp = 0; + + ret = -ESRCH; + if (!mm) + break; + + switch (addr) { + case PTRACE_GETFDPIC_EXEC: + tmp = mm->context.exec_fdpic_loadmap; + break; + case PTRACE_GETFDPIC_INTERP: + tmp = mm->context.interp_fdpic_loadmap; + break; + default: + break; + } + mmput(mm); + + ret = put_user(tmp, datalp); + break; + } +#endif + +#ifdef PTRACE_SINGLESTEP + case PTRACE_SINGLESTEP: +#endif +#ifdef PTRACE_SINGLEBLOCK + case PTRACE_SINGLEBLOCK: +#endif +#ifdef PTRACE_SYSEMU + case PTRACE_SYSEMU: + case PTRACE_SYSEMU_SINGLESTEP: +#endif + case PTRACE_SYSCALL: + case PTRACE_CONT: + return ptrace_resume(child, request, data); + + case PTRACE_KILL: + if (child->exit_state) /* already dead */ + return 0; + return ptrace_resume(child, request, SIGKILL); + +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + case PTRACE_GETREGSET: + case PTRACE_SETREGSET: + { + struct iovec kiov; + struct iovec __user *uiov = datavp; + + if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) + return -EFAULT; + + if (__get_user(kiov.iov_base, &uiov->iov_base) || + __get_user(kiov.iov_len, &uiov->iov_len)) + return -EFAULT; + + ret = ptrace_regset(child, request, addr, &kiov); + if (!ret) + ret = __put_user(kiov.iov_len, &uiov->iov_len); + break; + } +#endif + default: + break; + } + + return ret; +} + +static struct task_struct *ptrace_get_task_struct(pid_t pid) +{ + struct task_struct *child; + + rcu_read_lock(); + child = find_task_by_vpid(pid); + if (child) + get_task_struct(child); + rcu_read_unlock(); + + if (!child) + return ERR_PTR(-ESRCH); + return child; +} + +#ifndef arch_ptrace_attach +#define arch_ptrace_attach(child) do { } while (0) +#endif + +SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, + unsigned long, data) +{ + struct task_struct *child; + long ret; + + if (request == PTRACE_TRACEME) { + ret = ptrace_traceme(); + 
if (!ret) + arch_ptrace_attach(current); + goto out; + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } + + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + /* + * Some architectures need to do book-keeping after + * a ptrace attach. + */ + if (!ret) + arch_ptrace_attach(child); + goto out_put_task_struct; + } + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (ret < 0) + goto out_put_task_struct; + + ret = arch_ptrace(child, request, addr, data); + + out_put_task_struct: + put_task_struct(child); + out: + return ret; +} + +int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, + unsigned long data) +{ + unsigned long tmp; + int copied; + + copied = access_process_vm(tsk, addr, &tmp, sizeof(tmp), 0); + if (copied != sizeof(tmp)) + return -EIO; + return put_user(tmp, (unsigned long __user *)data); +} + +int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, + unsigned long data) +{ + int copied; + + copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); + return (copied == sizeof(data)) ? 0 : -EIO; +} + +#if defined CONFIG_COMPAT +#include + +int compat_ptrace_request(struct task_struct *child, compat_long_t request, + compat_ulong_t addr, compat_ulong_t data) +{ + compat_ulong_t __user *datap = compat_ptr(data); + compat_ulong_t word; + siginfo_t siginfo; + int ret; + + switch (request) { + case PTRACE_PEEKTEXT: + case PTRACE_PEEKDATA: + ret = access_process_vm(child, addr, &word, sizeof(word), 0); + if (ret != sizeof(word)) + ret = -EIO; + else + ret = put_user(word, datap); + break; + + case PTRACE_POKETEXT: + case PTRACE_POKEDATA: + ret = access_process_vm(child, addr, &data, sizeof(data), 1); + ret = (ret != sizeof(data) ? 
-EIO : 0); + break; + + case PTRACE_GETEVENTMSG: + ret = put_user((compat_ulong_t) child->ptrace_message, datap); + break; + + case PTRACE_GETSIGINFO: + ret = ptrace_getsiginfo(child, &siginfo); + if (!ret) + ret = copy_siginfo_to_user32( + (struct compat_siginfo __user *) datap, + &siginfo); + break; + + case PTRACE_SETSIGINFO: + memset(&siginfo, 0, sizeof siginfo); + if (copy_siginfo_from_user32( + &siginfo, (struct compat_siginfo __user *) datap)) + ret = -EFAULT; + else + ret = ptrace_setsiginfo(child, &siginfo); + break; +#ifdef CONFIG_HAVE_ARCH_TRACEHOOK + case PTRACE_GETREGSET: + case PTRACE_SETREGSET: + { + struct iovec kiov; + struct compat_iovec __user *uiov = + (struct compat_iovec __user *) datap; + compat_uptr_t ptr; + compat_size_t len; + + if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov))) + return -EFAULT; + + if (__get_user(ptr, &uiov->iov_base) || + __get_user(len, &uiov->iov_len)) + return -EFAULT; + + kiov.iov_base = compat_ptr(ptr); + kiov.iov_len = len; + + ret = ptrace_regset(child, request, addr, &kiov); + if (!ret) + ret = __put_user(kiov.iov_len, &uiov->iov_len); + break; + } +#endif + + default: + ret = ptrace_request(child, request, addr, data); + } + + return ret; +} + +asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, + compat_long_t addr, compat_long_t data) +{ + struct task_struct *child; + long ret; + + if (request == PTRACE_TRACEME) { + ret = ptrace_traceme(); + goto out; + } + + child = ptrace_get_task_struct(pid); + if (IS_ERR(child)) { + ret = PTR_ERR(child); + goto out; + } + + if (request == PTRACE_ATTACH) { + ret = ptrace_attach(child); + /* + * Some architectures need to do book-keeping after + * a ptrace attach. 
+ */ + if (!ret) + arch_ptrace_attach(child); + goto out_put_task_struct; + } + + ret = ptrace_check_attach(child, request == PTRACE_KILL); + if (!ret) + ret = compat_arch_ptrace(child, request, addr, data); + + out_put_task_struct: + put_task_struct(child); + out: + return ret; +} +#endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +int ptrace_get_breakpoints(struct task_struct *tsk) +{ + if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) + return 0; + + return -1; +} + +void ptrace_put_breakpoints(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) + flush_ptrace_hw_breakpoint(tsk); +} +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ diff --git a/kernel/range.c b/kernel/range.c new file mode 100644 index 00000000..37fa9b99 --- /dev/null +++ b/kernel/range.c @@ -0,0 +1,159 @@ +/* + * Range add and subtract + */ +#include +#include +#include + +#include + +int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) +{ + if (start >= end) + return nr_range; + + /* Out of slots: */ + if (nr_range >= az) + return nr_range; + + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; +} + +int add_range_with_merge(struct range *range, int az, int nr_range, + u64 start, u64 end) +{ + int i; + + if (start >= end) + return nr_range; + + /* Try to merge it with old one: */ + for (i = 0; i < nr_range; i++) { + u64 final_start, final_end; + u64 common_start, common_end; + + if (!range[i].end) + continue; + + common_start = max(range[i].start, start); + common_end = min(range[i].end, end); + if (common_start > common_end) + continue; + + final_start = min(range[i].start, start); + final_end = max(range[i].end, end); + + range[i].start = final_start; + range[i].end = final_end; + return nr_range; + } + + /* Need to add it: */ + return add_range(range, az, nr_range, start, end); +} + +void subtract_range(struct range *range, int az, u64 start, u64 end) +{ + int i, j; + + if (start >= end) + 
return; + + for (j = 0; j < az; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && + range[j].start < end) { + range[j].start = end; + continue; + } + + + if (start > range[j].start && end >= range[j].end && + range[j].end > start) { + range[j].end = start; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* Find the new spare: */ + for (i = 0; i < az; i++) { + if (range[i].end == 0) + break; + } + if (i < az) { + range[i].end = range[j].end; + range[i].start = end; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start; + continue; + } + } +} + +static int cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + s64 start1, start2; + + start1 = r1->start; + start2 = r2->start; + + return start1 - start2; +} + +int clean_sort_range(struct range *range, int az) +{ + int i, j, k = az - 1, nr_range = az; + + for (i = 0; i < k; i++) { + if (range[i].end) + continue; + for (j = k; j > i; j--) { + if (range[j].end) { + k = j; + break; + } + } + if (j == i) + break; + range[i].start = range[k].start; + range[i].end = range[k].end; + range[k].start = 0; + range[k].end = 0; + k--; + } + /* count it */ + for (i = 0; i < az; i++) { + if (!range[i].end) { + nr_range = i; + break; + } + } + + /* sort them */ + sort(range, nr_range, sizeof(struct range), cmp_range, NULL); + + return nr_range; +} + +void sort_range(struct range *range, int nr_range) +{ + /* sort them */ + sort(range, nr_range, sizeof(struct range), cmp_range, NULL); +} diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c new file mode 100644 index 00000000..7784bd21 --- /dev/null +++ b/kernel/rcupdate.c @@ -0,0 +1,294 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify 
+ * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2001 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * Papers: + * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf + * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) + * + * For detailed explanation of Read-Copy Update mechanism see - + * http://lse.sourceforge.net/locking/rcupdate.html + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key rcu_lock_key; +struct lockdep_map rcu_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); +EXPORT_SYMBOL_GPL(rcu_lock_map); + +static struct lock_class_key rcu_bh_lock_key; +struct lockdep_map rcu_bh_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +EXPORT_SYMBOL_GPL(rcu_bh_lock_map); + +static struct lock_class_key rcu_sched_lock_key; +struct lockdep_map rcu_sched_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +EXPORT_SYMBOL_GPL(rcu_sched_lock_map); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int debug_lockdep_rcu_enabled(void) +{ + return 
rcu_scheduler_active && debug_locks && + current->lockdep_recursion == 0; +} +EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); + +/** + * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? + * + * Check for bottom half being disabled, which covers both the + * CONFIG_PROVE_RCU and not cases. Note that if someone uses + * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) + * will show the situation. This is useful for debug checks in functions + * that require that they be called within an RCU read-side critical + * section. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + */ +int rcu_read_lock_bh_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + return in_softirq() || irqs_disabled(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * Awaken the corresponding synchronize_rcu() instance now that a + * grace period has elapsed. + */ +void wakeme_after_rcu(struct rcu_head *head) +{ + struct rcu_synchronize *rcu; + + rcu = container_of(head, struct rcu_synchronize, head); + complete(&rcu->completion); +} + +#ifdef CONFIG_PROVE_RCU +/* + * wrapper function to avoid #include problems. + */ +int rcu_my_thread_group_empty(void) +{ + return thread_group_empty(current); +} +EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); +#endif /* #ifdef CONFIG_PROVE_RCU */ + +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static inline void debug_init_rcu_head(struct rcu_head *head) +{ + debug_object_init(head, &rcuhead_debug_descr); +} + +static inline void debug_rcu_head_free(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. 
+ * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. + */ +#ifndef CONFIG_PREEMPT + WARN_ON_ONCE(1); + return 0; +#endif + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON_ONCE(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_init(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + * Activation is performed internally by call_rcu(). + */ +static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. We just make sure that it is + * tracked in the object tracker. + */ + debug_object_init(head, &rcuhead_debug_descr); + debug_object_activate(head, &rcuhead_debug_descr); + return 0; + + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. 
+ */ +#ifndef CONFIG_PREEMPT + WARN_ON_ONCE(1); + return 0; +#endif + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON_ONCE(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_activate(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) +{ + struct rcu_head *head = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + /* + * Ensure that queued callbacks are all executed. + * If we detect that we are nested in a RCU read-side critical + * section, we should simply fail, otherwise we would deadlock. + * In !PREEMPT configurations, there is no way to tell if we are + * in a RCU read-side critical section or not, so we never + * attempt any fixup and just print a warning. + */ +#ifndef CONFIG_PREEMPT + WARN_ON_ONCE(1); + return 0; +#endif + if (rcu_preempt_depth() != 0 || preempt_count() != 0 || + irqs_disabled()) { + WARN_ON_ONCE(1); + return 0; + } + rcu_barrier(); + rcu_barrier_sched(); + rcu_barrier_bh(); + debug_object_free(head, &rcuhead_debug_descr); + return 1; + default: + return 0; + } +} + +/** + * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be initialized + * + * This function informs debugobjects of a new rcu_head structure that + * has been allocated as an auto variable on the stack. This function + * is not required for rcu_head structures that are statically defined or + * that are dynamically allocated on the heap. This function has no + * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. 
+ */ +void init_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_init_on_stack(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); + +/** + * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects + * @head: pointer to rcu_head structure to be destroyed + * + * This function informs debugobjects that an on-stack rcu_head structure + * is about to go out of scope. As with init_rcu_head_on_stack(), this + * function is not required for rcu_head structures that are statically + * defined or that are dynamically allocated on the heap. Also as with + * init_rcu_head_on_stack(), this function has no effect for + * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. + */ +void destroy_rcu_head_on_stack(struct rcu_head *head) +{ + debug_object_free(head, &rcuhead_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); + +struct debug_obj_descr rcuhead_debug_descr = { + .name = "rcu_head", + .fixup_init = rcuhead_fixup_init, + .fixup_activate = rcuhead_fixup_activate, + .fixup_free = rcuhead_fixup_free, +}; +EXPORT_SYMBOL_GPL(rcuhead_debug_descr); +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c new file mode 100644 index 00000000..7bbac7d0 --- /dev/null +++ b/kernel/rcutiny.c @@ -0,0 +1,324 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Paul E. McKenney + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ +static struct task_struct *rcu_kthread_task; +static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); +static unsigned long have_rcu_kthread_work; + +/* Forward declarations for rcutiny_plugin.h. */ +struct rcu_ctrlblk; +static void invoke_rcu_kthread(void); +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); +static int rcu_kthread(void *arg); +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp); + +#include "rcutiny_plugin.h" + +#ifdef CONFIG_NO_HZ + +static long rcu_dynticks_nesting = 1; + +/* + * Enter dynticks-idle mode, which is an extended quiescent state + * if we have fully entered that mode (i.e., if the new value of + * dynticks_nesting is zero). + */ +void rcu_enter_nohz(void) +{ + if (--rcu_dynticks_nesting == 0) + rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ +} + +/* + * Exit dynticks-idle mode, so that we are no longer in an extended + * quiescent state. + */ +void rcu_exit_nohz(void) +{ + rcu_dynticks_nesting++; +} + +#endif /* #ifdef CONFIG_NO_HZ */ + +/* + * Helper function for rcu_sched_qs() and rcu_bh_qs(). + * Also irqs are disabled to avoid confusion due to interrupt handlers + * invoking call_rcu(). 
+ */ +static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) +{ + if (rcp->rcucblist != NULL && + rcp->donetail != rcp->curtail) { + rcp->donetail = rcp->curtail; + return 1; + } + + return 0; +} + +/* + * Wake up rcu_kthread() to process callbacks now eligible for invocation + * or to boost readers. + */ +static void invoke_rcu_kthread(void) +{ + have_rcu_kthread_work = 1; + wake_up(&rcu_kthread_wq); +} + +/* + * Record an rcu quiescent state. And an rcu_bh quiescent state while we + * are at it, given that any rcu quiescent state is also an rcu_bh + * quiescent state. Use "+" instead of "||" to defeat short circuiting. + */ +void rcu_sched_qs(int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + if (rcu_qsctr_help(&rcu_sched_ctrlblk) + + rcu_qsctr_help(&rcu_bh_ctrlblk)) + invoke_rcu_kthread(); + local_irq_restore(flags); +} + +/* + * Record an rcu_bh quiescent state. + */ +void rcu_bh_qs(int cpu) +{ + unsigned long flags; + + local_irq_save(flags); + if (rcu_qsctr_help(&rcu_bh_ctrlblk)) + invoke_rcu_kthread(); + local_irq_restore(flags); +} + +/* + * Check to see if the scheduling-clock interrupt came from an extended + * quiescent state, and, if so, tell RCU about it. + */ +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && + !in_softirq() && + hardirq_count() <= (1 << HARDIRQ_SHIFT))) + rcu_sched_qs(cpu); + else if (!in_softirq()) + rcu_bh_qs(cpu); + rcu_preempt_check_callbacks(); +} + +/* + * Invoke the RCU callbacks on the specified rcu_ctrlblk structure + * whose grace period has elapsed. + */ +static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) +{ + struct rcu_head *next, *list; + unsigned long flags; + RCU_TRACE(int cb_count = 0); + + /* If no RCU callbacks ready to invoke, just return. */ + if (&rcp->rcucblist == rcp->donetail) + return; + + /* Move the ready-to-invoke callbacks to a local list.
*/ + local_irq_save(flags); + list = rcp->rcucblist; + rcp->rcucblist = *rcp->donetail; + *rcp->donetail = NULL; + if (rcp->curtail == rcp->donetail) + rcp->curtail = &rcp->rcucblist; + rcu_preempt_remove_callbacks(rcp); + rcp->donetail = &rcp->rcucblist; + local_irq_restore(flags); + + /* Invoke the callbacks on the local list. */ + while (list) { + next = list->next; + prefetch(next); + debug_rcu_head_unqueue(list); + local_bh_disable(); + __rcu_reclaim(list); + local_bh_enable(); + list = next; + RCU_TRACE(cb_count++); + } + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); +} + +/* + * This kthread invokes RCU callbacks whose grace periods have + * elapsed. It is awakened as needed, and takes the place of the + * RCU_SOFTIRQ that was used previously for this purpose. + * This is a kthread, but it is never stopped, at least not until + * the system goes down. + */ +static int rcu_kthread(void *arg) +{ + unsigned long work; + unsigned long morework; + unsigned long flags; + + for (;;) { + wait_event_interruptible(rcu_kthread_wq, + have_rcu_kthread_work != 0); + morework = rcu_boost(); + local_irq_save(flags); + work = have_rcu_kthread_work; + have_rcu_kthread_work = morework; + local_irq_restore(flags); + if (work) { + rcu_process_callbacks(&rcu_sched_ctrlblk); + rcu_process_callbacks(&rcu_bh_ctrlblk); + rcu_preempt_process_callbacks(); + } + schedule_timeout_interruptible(1); /* Leave CPU for others. */ + } + + return 0; /* Not reached, but needed to shut gcc up. */ +} + +/* + * Wait for a grace period to elapse. But it is illegal to invoke + * synchronize_sched() from within an RCU read-side critical section. + * Therefore, any legal call to synchronize_sched() is a quiescent + * state, and so on a UP system, synchronize_sched() need do nothing. + * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the + * benefits of doing might_sleep() to reduce latency.) + * + * Cool, huh? (Due to Josh Triplett.) 
+ * + * But we want to make this a static inline later. The cond_resched() + * currently makes this problematic. + */ +void synchronize_sched(void) +{ + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/* + * Helper function for call_rcu() and call_rcu_bh(). + */ +static void __call_rcu(struct rcu_head *head, + void (*func)(struct rcu_head *rcu), + struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcp->curtail = head; + rcp->curtail = &head->next; + RCU_TRACE(rcp->qlen++); + local_irq_restore(flags); +} + +/* + * Post an RCU callback to be invoked after the end of an RCU-sched grace + * period. But since we have but one CPU, that would be after any + * quiescent state. + */ +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + +/* + * Post an RCU bottom-half callback to be invoked after any subsequent + * quiescent state. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_ctrlblk); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +void rcu_barrier_bh(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +void rcu_barrier_sched(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it. 
*/ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +/* + * Spawn the kthread that invokes RCU callbacks. + */ +static int __init rcu_spawn_kthreads(void) +{ + struct sched_param sp; + + rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread"); + sp.sched_priority = RCU_BOOST_PRIO; + sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp); + return 0; +} +early_initcall(rcu_spawn_kthreads); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h new file mode 100644 index 00000000..f259c676 --- /dev/null +++ b/kernel/rcutiny_plugin.h @@ -0,0 +1,1007 @@ +/* + * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition + * Internal non-public definitions that provide either classic + * or preemptible semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (c) 2010 Linaro + * + * Author: Paul E. McKenney + */ + +#include +#include +#include + +#ifdef CONFIG_RCU_TRACE +#define RCU_TRACE(stmt) stmt +#else /* #ifdef CONFIG_RCU_TRACE */ +#define RCU_TRACE(stmt) +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* Global control variables for rcupdate callback mechanism. */ +struct rcu_ctrlblk { + struct rcu_head *rcucblist; /* List of pending callbacks (CBs). 
*/ + struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ + struct rcu_head **curtail; /* ->next pointer of last CB. */ + RCU_TRACE(long qlen); /* Number of pending CBs. */ +}; + +/* Definition for rcupdate control block. */ +static struct rcu_ctrlblk rcu_sched_ctrlblk = { + .donetail = &rcu_sched_ctrlblk.rcucblist, + .curtail = &rcu_sched_ctrlblk.rcucblist, +}; + +static struct rcu_ctrlblk rcu_bh_ctrlblk = { + .donetail = &rcu_bh_ctrlblk.rcucblist, + .curtail = &rcu_bh_ctrlblk.rcucblist, +}; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_TINY_PREEMPT_RCU + +#include + +/* Global control variables for preemptible RCU. */ +struct rcu_preempt_ctrlblk { + struct rcu_ctrlblk rcb; /* curtail: ->next ptr of last CB for GP. */ + struct rcu_head **nexttail; + /* Tasks blocked in a preemptible RCU */ + /* read-side critical section while an */ + /* preemptible-RCU grace period is in */ + /* progress must wait for a later grace */ + /* period. This pointer points to the */ + /* ->next pointer of the last task that */ + /* must wait for a later grace period, or */ + /* to &->rcb.rcucblist if there is no */ + /* such task. */ + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. */ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is no such task. */ + struct list_head *exp_tasks; + /* Pointer to first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there cannot be any such task. 
*/ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority-boosted, or NULL if no priority */ + /* boosting is needed. If there is no */ + /* current or expedited grace period, there */ + /* can be no such task. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + u8 gpnum; /* Current grace period. */ + u8 gpcpu; /* Last grace period blocked by the CPU. */ + u8 completed; /* Last grace period completed. */ + /* If all three are equal, RCU is idle. */ +#ifdef CONFIG_RCU_BOOST + unsigned long boost_time; /* When to start boosting (jiffies) */ +#endif /* #ifdef CONFIG_RCU_BOOST */ +#ifdef CONFIG_RCU_TRACE + unsigned long n_grace_periods; +#ifdef CONFIG_RCU_BOOST + unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ + unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ + unsigned long n_normal_boosts; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions. */ +#endif /* #ifdef CONFIG_RCU_BOOST */ +#endif /* #ifdef CONFIG_RCU_TRACE */ +}; + +static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = { + .rcb.donetail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, + .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), +}; + +static int rcu_preempted_readers_exp(void); +static void rcu_report_exp_done(void); + +/* + * Return true if the CPU has not yet responded to the current grace period. 
+ */ +static int rcu_cpu_blocking_cur_gp(void) +{ + return rcu_preempt_ctrlblk.gpcpu != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Check for a running RCU reader. Because there is only one CPU, + * there can be but one running RCU reader at a time. ;-) + */ +static int rcu_preempt_running_reader(void) +{ + return current->rcu_read_lock_nesting; +} + +/* + * Check for preempted RCU readers blocking any grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_any(void) +{ + return !list_empty(&rcu_preempt_ctrlblk.blkd_tasks); +} + +/* + * Check for preempted RCU readers blocking the current grace period. + * If the caller needs a reliable answer, it must disable hard irqs. + */ +static int rcu_preempt_blocked_readers_cgp(void) +{ + return rcu_preempt_ctrlblk.gp_tasks != NULL; +} + +/* + * Return true if another preemptible-RCU grace period is needed. + */ +static int rcu_preempt_needs_another_gp(void) +{ + return *rcu_preempt_ctrlblk.rcb.curtail != NULL; +} + +/* + * Return true if a preemptible-RCU grace period is in progress. + * The caller must disable hardirqs. + */ +static int rcu_preempt_gp_in_progress(void) +{ + return rcu_preempt_ctrlblk.completed != rcu_preempt_ctrlblk.gpnum; +} + +/* + * Advance a ->blkd_tasks-list pointer to the next entry, instead + * returning NULL if at the end of the list. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rcu_preempt_ctrlblk.blkd_tasks) + np = NULL; + return np; +} + +#ifdef CONFIG_RCU_TRACE + +#ifdef CONFIG_RCU_BOOST +static void rcu_initiate_boost_trace(void); +#endif /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Dump additional statistics for TINY_PREEMPT_RCU.
+ */ +static void show_tiny_preempt_stats(struct seq_file *m) +{ + seq_printf(m, "rcu_preempt: qlen=%ld gp=%lu g%u/p%u/c%u tasks=%c%c%c\n", + rcu_preempt_ctrlblk.rcb.qlen, + rcu_preempt_ctrlblk.n_grace_periods, + rcu_preempt_ctrlblk.gpnum, + rcu_preempt_ctrlblk.gpcpu, + rcu_preempt_ctrlblk.completed, + "T."[list_empty(&rcu_preempt_ctrlblk.blkd_tasks)], + "N."[!rcu_preempt_ctrlblk.gp_tasks], + "E."[!rcu_preempt_ctrlblk.exp_tasks]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n", + " ", + "B."[!rcu_preempt_ctrlblk.boost_tasks], + rcu_preempt_ctrlblk.n_tasks_boosted, + rcu_preempt_ctrlblk.n_exp_boosts, + rcu_preempt_ctrlblk.n_normal_boosts, + (int)(jiffies & 0xffff), + (int)(rcu_preempt_ctrlblk.boost_time & 0xffff)); + seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n", + " balk", + rcu_preempt_ctrlblk.n_balk_blkd_tasks, + rcu_preempt_ctrlblk.n_balk_exp_gp_tasks, + rcu_preempt_ctrlblk.n_balk_boost_tasks, + rcu_preempt_ctrlblk.n_balk_notyet, + rcu_preempt_ctrlblk.n_balk_nos); +#endif /* #ifdef CONFIG_RCU_BOOST */ +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +/* + * Carry out RCU priority boosting on the task indicated by ->boost_tasks, + * and advance ->boost_tasks to the next task in the ->blkd_tasks list. + */ +static int rcu_boost(void) +{ + unsigned long flags; + struct rt_mutex mtx; + struct task_struct *t; + struct list_head *tb; + + if (rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) + return 0; /* Nothing to boost. */ + + raw_local_irq_save(flags); + + /* + * Recheck with irqs disabled: all tasks in need of boosting + * might exit their RCU read-side critical sections on their own + * if we are preempted just before disabling irqs. 
+ */ + if (rcu_preempt_ctrlblk.boost_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) { + raw_local_irq_restore(flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + */ + if (rcu_preempt_ctrlblk.exp_tasks != NULL) { + tb = rcu_preempt_ctrlblk.exp_tasks; + RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++); + } else { + tb = rcu_preempt_ctrlblk.boost_tasks; + RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++); + } + RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++); + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + */ + t = container_of(tb, struct task_struct, rcu_node_entry); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED; + raw_local_irq_restore(flags); + rt_mutex_lock(&mtx); + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rcu_preempt_ctrlblk.boost_tasks != NULL || + rcu_preempt_ctrlblk.exp_tasks != NULL; +} + +/* + * Check to see if it is now time to start boosting RCU readers blocking + * the current grace period, and, if so, tell the rcu_kthread_task to + * start boosting them. If there is an expedited boost in progress, + * we wait for it to complete. + * + * If there are no blocked readers blocking the current grace period, + * return 0 to let the caller know, otherwise return 1. Note that this + * return value is independent of whether or not boosting was done. 
+ */ +static int rcu_initiate_boost(void) +{ + if (!rcu_preempt_blocked_readers_cgp() && + rcu_preempt_ctrlblk.exp_tasks == NULL) { + RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++); + return 0; + } + if (rcu_preempt_ctrlblk.exp_tasks != NULL || + (rcu_preempt_ctrlblk.gp_tasks != NULL && + rcu_preempt_ctrlblk.boost_tasks == NULL && + ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) { + if (rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_preempt_ctrlblk.boost_tasks = + rcu_preempt_ctrlblk.gp_tasks; + invoke_rcu_kthread(); + } else + RCU_TRACE(rcu_initiate_boost_trace()); + return 1; +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) + +/* + * Do priority-boost accounting for the start of a new grace period. + */ +static void rcu_preempt_boost_start_gp(void) +{ + rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +/* + * If there is no RCU priority boosting, we don't boost. + */ +static int rcu_boost(void) +{ + return 0; +} + +/* + * If there is no RCU priority boosting, we don't initiate boosting, + * but we do indicate whether there are blocked readers blocking the + * current grace period. + */ +static int rcu_initiate_boost(void) +{ + return rcu_preempt_blocked_readers_cgp(); +} + +/* + * If there is no RCU priority boosting, nothing to do at grace-period start. + */ +static void rcu_preempt_boost_start_gp(void) +{ +} + +#endif /* else #ifdef CONFIG_RCU_BOOST */ + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. 
+ * + * Because this is a single-CPU implementation, the only way a grace + * period can end is if the CPU is in a quiescent state. The reason is + * that a blocked preemptible-RCU reader can exit its critical section + * only if the CPU is running it at the time. Therefore, when the + * last task blocking the current grace period exits its RCU read-side + * critical section, neither the CPU nor blocked tasks will be stopping + * the current grace period. (In contrast, SMP implementations + * might have CPUs running in RCU read-side critical sections that + * block later grace periods -- but this is not possible given only + * one CPU.) + */ +static void rcu_preempt_cpu_qs(void) +{ + /* Record both CPU and task as having responded to current GP. */ + rcu_preempt_ctrlblk.gpcpu = rcu_preempt_ctrlblk.gpnum; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; + + /* If there is no GP then there is nothing more to do. */ + if (!rcu_preempt_gp_in_progress()) + return; + /* + * Check up on boosting. If there are readers blocking the + * current grace period, leave. + */ + if (rcu_initiate_boost()) + return; + + /* Advance callbacks. */ + rcu_preempt_ctrlblk.completed = rcu_preempt_ctrlblk.gpnum; + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.rcb.curtail; + rcu_preempt_ctrlblk.rcb.curtail = rcu_preempt_ctrlblk.nexttail; + + /* If there are no blocked readers, next GP is done instantly. */ + if (!rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.rcb.donetail = rcu_preempt_ctrlblk.nexttail; + + /* If there are done callbacks, cause them to be invoked. */ + if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) + invoke_rcu_kthread(); +} + +/* + * Start a new RCU grace period if warranted. Hard irqs must be disabled. + */ +static void rcu_preempt_start_gp(void) +{ + if (!rcu_preempt_gp_in_progress() && rcu_preempt_needs_another_gp()) { + + /* Official start of GP. 
*/ + rcu_preempt_ctrlblk.gpnum++; + RCU_TRACE(rcu_preempt_ctrlblk.n_grace_periods++); + + /* Any blocked RCU readers block new GP. */ + if (rcu_preempt_blocked_readers_any()) + rcu_preempt_ctrlblk.gp_tasks = + rcu_preempt_ctrlblk.blkd_tasks.next; + + /* Set up for RCU priority boosting. */ + rcu_preempt_boost_start_gp(); + + /* If there is no running reader, CPU is done with GP. */ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + } +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * If the task started after the current grace period began, as recorded + * by ->gpcpu, we enqueue at the beginning of the list. Otherwise + * before the element referenced by ->gp_tasks (or at the tail if + * ->gp_tasks is NULL) and point ->gp_tasks at the newly added element. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the ->gp_tasks pointer becomes + * NULL. + * + * Caller must disable preemption. + */ +void rcu_preempt_note_context_switch(void) +{ + struct task_struct *t = current; + unsigned long flags; + + local_irq_save(flags); /* must exclude scheduler_tick(). */ + if (rcu_preempt_running_reader() && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. 
+ * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. + */ + list_add(&t->rcu_node_entry, &rcu_preempt_ctrlblk.blkd_tasks); + if (rcu_cpu_blocking_cur_gp()) + rcu_preempt_ctrlblk.gp_tasks = &t->rcu_node_entry; + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that current grace period continues to be blocked. + */ + rcu_preempt_cpu_qs(); + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutiny.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. + */ +static void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + unsigned long flags; + struct list_head *np; + int special; + + /* + * NMI handlers cannot block and cannot safely manipulate state. + * They therefore cannot possibly be special, so just leave. + */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. 
+ */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) + rcu_preempt_cpu_qs(); + + /* Hardware IRQ handlers cannot block. */ + if (in_irq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the ->blkd_tasks list and adjust + * any pointers that might have been referencing it. + */ + empty = !rcu_preempt_blocked_readers_cgp(); + empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL; + np = rcu_next_node_entry(t); + list_del_init(&t->rcu_node_entry); + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks) + rcu_preempt_ctrlblk.gp_tasks = np; + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks) + rcu_preempt_ctrlblk.exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks) + rcu_preempt_ctrlblk.boost_tasks = np; +#endif /* #ifdef CONFIG_RCU_BOOST */ + + /* + * If this was the last task on the current list, and if + * we aren't waiting on the CPU, report the quiescent state + * and start a new grace period if needed. + */ + if (!empty && !rcu_preempt_blocked_readers_cgp()) { + rcu_preempt_cpu_qs(); + rcu_preempt_start_gp(); + } + + /* + * If this was the last task on the expedited lists, + * then we need to wake up the waiting task. + */ + if (!empty_exp && rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_report_exp_done(); + } +#ifdef CONFIG_RCU_BOOST + /* Unboost self if was boosted. */ + if (special & RCU_READ_UNLOCK_BOOSTED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED; + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ + local_irq_restore(flags); +} + +/* + * Tiny-preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting.
If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. + */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutiny.c */ + --t->rcu_read_lock_nesting; + barrier(); /* decrement before load of ->rcu_read_unlock_special */ + if (t->rcu_read_lock_nesting == 0 && + unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(t->rcu_read_lock_nesting < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the rcu_preempt_ctrlblk structure, which is + * checked elsewhere. This is called from the scheduling-clock interrupt. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(void) +{ + struct task_struct *t = current; + + if (rcu_preempt_gp_in_progress() && + (!rcu_preempt_running_reader() || + !rcu_cpu_blocking_cur_gp())) + rcu_preempt_cpu_qs(); + if (&rcu_preempt_ctrlblk.rcb.rcucblist != + rcu_preempt_ctrlblk.rcb.donetail) + invoke_rcu_kthread(); + if (rcu_preempt_gp_in_progress() && + rcu_cpu_blocking_cur_gp() && + rcu_preempt_running_reader()) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * TINY_PREEMPT_RCU has an extra callback-list tail pointer to + * update, so this is invoked from rcu_process_callbacks() to + * handle that case. Of course, it is invoked for all flavors of + * RCU, but RCU callbacks can appear only on one of the lists, and + * neither ->nexttail nor ->donetail can possibly be NULL, so there + * is no need for an explicit check. 
+ */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ + if (rcu_preempt_ctrlblk.nexttail == rcp->donetail) + rcu_preempt_ctrlblk.nexttail = &rcp->rcucblist; +} + +/* + * Process callbacks for preemptible RCU. + */ +static void rcu_preempt_process_callbacks(void) +{ + rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); +} + +/* + * Queue a preemptible-RCU callback for invocation after a grace period. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + unsigned long flags; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + local_irq_save(flags); + *rcu_preempt_ctrlblk.nexttail = head; + rcu_preempt_ctrlblk.nexttail = &head->next; + RCU_TRACE(rcu_preempt_ctrlblk.rcb.qlen++); + rcu_preempt_start_gp(); /* checks to see if GP needed. */ + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(call_rcu); + +void rcu_barrier(void) +{ + struct rcu_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock() and rcu_read_unlock(), + * and may be nested. + */ +void synchronize_rcu(void) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (!rcu_scheduler_active) + return; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + if (!rcu_preempt_blocked_readers_any()) + return; + + /* Once we get past the fastpath checks, same code as rcu_barrier().
*/ + rcu_barrier(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static unsigned long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. + */ +static int rcu_preempted_readers_exp(void) +{ + return rcu_preempt_ctrlblk.exp_tasks != NULL; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. + */ +static void rcu_report_exp_done(void) +{ + wake_up(&sync_rcu_preempt_exp_wq); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to rely on the fact that there is but one CPU, and that it is + * illegal for a task to invoke synchronize_rcu_expedited() while in a + * preemptible-RCU read-side critical section. Therefore, any such + * critical sections must correspond to blocked tasks, which must therefore + * be on the ->blkd_tasks list. So just record the current head of the + * list in the ->exp_tasks pointer, and wait for all tasks including and + * after the task pointed to by ->exp_tasks to drain. + */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_preempt_ctrlblk *rpcp = &rcu_preempt_ctrlblk; + unsigned long snap; + + barrier(); /* ensure prior action seen before grace period. */ + + WARN_ON_ONCE(rcu_preempt_running_reader()); + + /* + * Acquire lock so that there is only one preemptible RCU grace + * period in flight. Of course, if someone does the expedited + * grace period for us while we are acquiring the lock, just leave.
+ */ + snap = sync_rcu_preempt_exp_count + 1; + mutex_lock(&sync_rcu_preempt_exp_mutex); + if (ULONG_CMP_LT(snap, sync_rcu_preempt_exp_count)) + goto unlock_mb_ret; /* Others did our work for us. */ + + local_irq_save(flags); + + /* + * All RCU readers have to already be on blkd_tasks because + * we cannot legally be executing in an RCU read-side critical + * section. + */ + + /* Snapshot current head of ->blkd_tasks list. */ + rpcp->exp_tasks = rpcp->blkd_tasks.next; + if (rpcp->exp_tasks == &rpcp->blkd_tasks) + rpcp->exp_tasks = NULL; + + /* Wait for tail of ->blkd_tasks list to drain. */ + if (!rcu_preempted_readers_exp()) + local_irq_restore(flags); + else { + rcu_initiate_boost(); + local_irq_restore(flags); + wait_event(sync_rcu_preempt_exp_wq, + !rcu_preempted_readers_exp()); + } + + /* Clean up and exit. */ + barrier(); /* ensure expedited GP seen before counter increment. */ + sync_rcu_preempt_exp_count++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); + barrier(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Does preemptible RCU need the CPU to stay out of dynticks mode? + */ +int rcu_preempt_needs_cpu(void) +{ + if (!rcu_preempt_running_reader()) + rcu_preempt_cpu_qs(); + return rcu_preempt_ctrlblk.rcb.rcucblist != NULL; +} + +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + __rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */ + +#ifdef CONFIG_RCU_TRACE + +/* + * Because preemptible RCU does not exist, it is not necessary to + * dump out its statistics.
+ */ +static void show_tiny_preempt_stats(struct seq_file *m) +{ +} + +#endif /* #ifdef CONFIG_RCU_TRACE */ + +/* + * Because preemptible RCU does not exist, it is never necessary to + * boost preempted RCU readers. + */ +static int rcu_boost(void) +{ + return 0; +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(void) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to remove. + */ +static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to process. + */ +static void rcu_preempt_process_callbacks(void) +{ +} + +#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#include + +/* + * During boot, we forgive RCU lockdep issues. After this function is + * invoked, we start taking RCU lockdep issues seriously. + */ +void __init rcu_scheduler_starting(void) +{ + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +#ifdef CONFIG_RCU_BOOST +#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO +#else /* #ifdef CONFIG_RCU_BOOST */ +#define RCU_BOOST_PRIO 1 +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +#ifdef CONFIG_RCU_TRACE + +#ifdef CONFIG_RCU_BOOST + +static void rcu_initiate_boost_trace(void) +{ + if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) + rcu_preempt_ctrlblk.n_balk_blkd_tasks++; + else if (rcu_preempt_ctrlblk.gp_tasks == NULL && + rcu_preempt_ctrlblk.exp_tasks == NULL) + rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++; + else if (rcu_preempt_ctrlblk.boost_tasks != NULL) + rcu_preempt_ctrlblk.n_balk_boost_tasks++; + else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) + rcu_preempt_ctrlblk.n_balk_notyet++; + else + rcu_preempt_ctrlblk.n_balk_nos++; +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, 
int n) +{ + unsigned long flags; + + raw_local_irq_save(flags); + rcp->qlen -= n; + raw_local_irq_restore(flags); +} + +/* + * Dump statistics for TINY_RCU, such as they are. + */ +static int show_tiny_stats(struct seq_file *m, void *unused) +{ + show_tiny_preempt_stats(m); + seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); + seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); + return 0; +} + +static int show_tiny_stats_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_tiny_stats, NULL); +} + +static const struct file_operations show_tiny_stats_fops = { + .owner = THIS_MODULE, + .open = show_tiny_stats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; + +static int __init rcutiny_trace_init(void) +{ + struct dentry *retval; + + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto free_out; + retval = debugfs_create_file("rcudata", 0444, rcudir, + NULL, &show_tiny_stats_fops); + if (!retval) + goto free_out; + return 0; +free_out: + debugfs_remove_recursive(rcudir); + return 1; +} + +static void __exit rcutiny_trace_cleanup(void) +{ + debugfs_remove_recursive(rcudir); +} + +module_init(rcutiny_trace_init); +module_exit(rcutiny_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); +MODULE_LICENSE("GPL"); + +#endif /* #ifdef CONFIG_RCU_TRACE */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c new file mode 100644 index 00000000..2e138db0 --- /dev/null +++ b/kernel/rcutorture.c @@ -0,0 +1,1633 @@ +/* + * Read-Copy Update module-based torture test facility + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005, 2006 + * + * Authors: Paul E. McKenney + * Josh Triplett + * + * See also: Documentation/RCU/torture.txt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney and " + "Josh Triplett "); + +static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ +static int nfakewriters = 4; /* # fake writer threads */ +static int stat_interval; /* Interval between stats, in seconds. */ + /* Defaults to "only at end of test". */ +static int verbose; /* Print more debug info. */ +static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ +static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ +static int stutter = 5; /* Start/stop testing interval (in sec) */ +static int irqreader = 1; /* RCU readers from irq (timers). */ +static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ +static int fqs_holdoff = 0; /* Hold time within burst (us). */ +static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ +static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ +static int test_boost_duration = 4; /* Duration of each boost test, seconds. 
*/ +static char *torture_type = "rcu"; /* What RCU implementation to torture. */ + +module_param(nreaders, int, 0444); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +module_param(nfakewriters, int, 0444); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); +module_param(stat_interval, int, 0444); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); +module_param(test_no_idle_hz, bool, 0444); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +module_param(shuffle_interval, int, 0444); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(test_boost, int, 0444); +MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +module_param(test_boost_duration, int, 0444); +MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); +module_param(torture_type, charp, 0444); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); + +#define TORTURE_FLAG "-torture:" +#define PRINTK_STRING(s) \ + do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_PRINTK_STRING(s) \ + do { if (verbose) printk(KERN_ALERT "%s" 
TORTURE_FLAG s "\n", torture_type); } while (0) +#define VERBOSE_PRINTK_ERRSTRING(s) \ + do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) + +static char printk_buf[4096]; + +static int nrealreaders; +static struct task_struct *writer_task; +static struct task_struct **fakewriter_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *stats_task; +static struct task_struct *shuffler_task; +static struct task_struct *stutter_task; +static struct task_struct *fqs_task; +static struct task_struct *boost_tasks[NR_CPUS]; + +#define RCU_TORTURE_PIPE_LEN 10 + +struct rcu_torture { + struct rcu_head rtort_rcu; + int rtort_pipe_count; + struct list_head rtort_free; + int rtort_mbtest; +}; + +static LIST_HEAD(rcu_torture_freelist); +static struct rcu_torture __rcu *rcu_torture_current; +static unsigned long rcu_torture_current_version; +static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; +static DEFINE_SPINLOCK(rcu_torture_lock); +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = + { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = + { 0 }; +static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static atomic_t n_rcu_torture_alloc; +static atomic_t n_rcu_torture_alloc_fail; +static atomic_t n_rcu_torture_free; +static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_error; +static long n_rcu_torture_boost_ktrerror; +static long n_rcu_torture_boost_rterror; +static long n_rcu_torture_boost_failure; +static long n_rcu_torture_boosts; +static long n_rcu_torture_timers; +static struct list_head rcu_torture_removed; +static cpumask_var_t shuffle_tmp_mask; + +static int stutter_pause_test; + +#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) +#define RCUTORTURE_RUNNABLE_INIT 1 +#else +#define RCUTORTURE_RUNNABLE_INIT 0 +#endif +int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; + +#if 
defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) +#define rcu_can_boost() 1 +#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ +#define rcu_can_boost() 0 +#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ + +static unsigned long boost_starttime; /* jiffies of next boost test start. */ +DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ + /* and boost task create/destroy. */ + +/* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ + +#define FULLSTOP_DONTSTOP 0 /* Normal operation. */ +#define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ +#define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ +static int fullstop = FULLSTOP_RMMOD; +/* + * Protect fullstop transitions and spawning of kthreads. + */ +static DEFINE_MUTEX(fullstop_mutex); + +/* + * Detect and respond to a system shutdown. + * + * Notifier callback installed through rcutorture_shutdown_nb. Under + * fullstop_mutex it moves fullstop from FULLSTOP_DONTSTOP to + * FULLSTOP_SHUTDOWN; in any other state it only prints a warning. + * The arguments are ignored and the return value is always NOTIFY_DONE. + */ +static int +rcutorture_shutdown_notify(struct notifier_block *unused1, + unsigned long unused2, void *unused3) +{ + mutex_lock(&fullstop_mutex); + if (fullstop == FULLSTOP_DONTSTOP) + fullstop = FULLSTOP_SHUTDOWN; + else + printk(KERN_WARNING /* but going down anyway, so... */ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + return NOTIFY_DONE; +} + +/* + * Absorb kthreads into a kernel function that won't return, so that + * they won't ever access module text or data again. + * + * Does nothing unless fullstop is FULLSTOP_SHUTDOWN, in which case it + * logs title and sleeps uninterruptibly for MAX_SCHEDULE_TIMEOUT. + */ +static void rcutorture_shutdown_absorb(char *title) +{ + if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { + printk(KERN_NOTICE + "rcutorture thread %s parking due to system shutdown\n", + title); + schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); + } +} + +/* + * Allocate an element from the rcu_tortures pool.
+ */ +static struct rcu_torture * +rcu_torture_alloc(void) +{ + struct list_head *p; + + spin_lock_bh(&rcu_torture_lock); + if (list_empty(&rcu_torture_freelist)) { + atomic_inc(&n_rcu_torture_alloc_fail); /* pool exhausted: count it and return NULL */ + spin_unlock_bh(&rcu_torture_lock); + return NULL; + } + atomic_inc(&n_rcu_torture_alloc); + p = rcu_torture_freelist.next; + list_del_init(p); + spin_unlock_bh(&rcu_torture_lock); + return container_of(p, struct rcu_torture, rtort_free); +} + +/* + * Free an element to the rcu_tortures pool. + * The element is appended at the tail of rcu_torture_freelist while + * holding rcu_torture_lock, and n_rcu_torture_free is incremented. + */ +static void +rcu_torture_free(struct rcu_torture *p) +{ + atomic_inc(&n_rcu_torture_free); + spin_lock_bh(&rcu_torture_lock); + list_add_tail(&p->rtort_free, &rcu_torture_freelist); + spin_unlock_bh(&rcu_torture_lock); +} +/* State for rcu_random(): current generator value and calls left before the next reseed. */ +struct rcu_random_state { + unsigned long rrs_state; + long rrs_count; +}; + +#define RCU_RANDOM_MULT 39916801 /* prime */ +#define RCU_RANDOM_ADD 479001701 /* prime */ +#define RCU_RANDOM_REFRESH 10000 + +#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } + +/* + * Crude but fast random-number generator. Uses a linear congruential + * generator, with occasional help from local_clock(). + * + * The clock value is added to the state on the first call and again + * each time rrs_count runs out, after which rrs_count is reset to + * RCU_RANDOM_REFRESH. The returned value is the updated state passed + * through swahw32(). + */ +static unsigned long +rcu_random(struct rcu_random_state *rrsp) +{ + if (--rrsp->rrs_count < 0) { + rrsp->rrs_state += (unsigned long)local_clock(); + rrsp->rrs_count = RCU_RANDOM_REFRESH; + } + rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; + return swahw32(rrsp->rrs_state); +} +/* + * Block for as long as the test is stuttering (stutter_pause_test set) + * or is not runnable. Polls every jiffy while runnable and about once + * per second otherwise, and parks the caller through + * rcutorture_shutdown_absorb() if a system shutdown is under way. + */ +static void +rcu_stutter_wait(char *title) +{ + while (stutter_pause_test || !rcutorture_runnable) { + if (rcutorture_runnable) + schedule_timeout_interruptible(1); + else + schedule_timeout_interruptible(round_jiffies_relative(HZ)); + rcutorture_shutdown_absorb(title); + } +} + +/* + * Operations vector for selecting different types of tests.
+ */ +/* + * ->init and ->cleanup are optional; rcu_torture_init() and + * rcu_torture_cleanup() call them only when non-NULL. ->readlock + * returns an index that is handed back to ->readunlock. ->cb_barrier + * and ->stats may be NULL and are checked before use; when ->fqs is + * NULL rcu_torture_init() forces fqs_duration to zero. ->irq_capable + * lets reader kthreads also run a reader from a timer handler, + * ->can_boost is consulted when test_boost is 1, and ->name is compared + * against torture_type to select the vector. + */ +struct rcu_torture_ops { + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*read_delay)(struct rcu_random_state *rrsp); + void (*readunlock)(int idx); + int (*completed)(void); + void (*deferred_free)(struct rcu_torture *p); + void (*sync)(void); + void (*cb_barrier)(void); + void (*fqs)(void); + int (*stats)(char *page); + int irq_capable; + int can_boost; + char *name; +}; + +static struct rcu_torture_ops *cur_ops; + +/* + * Definitions for rcu torture testing. + */ + +static int rcu_torture_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} +/* + * Reader-side delay. With low probability (scaled by nrealreaders) + * busy-waits for longdelay_ms milliseconds and/or shortdelay_us + * microseconds; under CONFIG_PREEMPT it also occasionally calls + * preempt_schedule() when preempt_count() is zero. + */ +static void rcu_read_delay(struct rcu_random_state *rrsp) +{ + const unsigned long shortdelay_us = 200; + const unsigned long longdelay_ms = 50; + + /* We want a short delay sometimes to make a reader delay the grace + * period, and we want a long delay occasionally to trigger + * force_quiescent_state. */ + + if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) + mdelay(longdelay_ms); + if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) + udelay(shortdelay_us); +#ifdef CONFIG_PREEMPT + if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) + preempt_schedule(); /* No QS if preempt_disable() in effect */ +#endif +} + +static void rcu_torture_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static int rcu_torture_completed(void) +{ + return rcu_batches_completed(); +} +/* + * Callback for the call_rcu()-style deferred-free hooks. Advances the + * element one pipeline stage, counting the stage it was in; once it + * has reached RCU_TORTURE_PIPE_LEN stages it goes back to the freelist, + * otherwise it is re-queued through ->deferred_free. Does nothing once + * fullstop has left FULLSTOP_DONTSTOP. + */ +static void +rcu_torture_cb(struct rcu_head *p) +{ + int i; + struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); + + if (fullstop != FULLSTOP_DONTSTOP) { + /* Test is ending, just drop callbacks on the floor. */ + /* The next initialization will pick up the pieces.
*/ + return; + } + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + rcu_torture_free(rp); + } else + cur_ops->deferred_free(rp); +} +/* Stand-in ->completed hook that always reports 0. */ +static int rcu_no_completed(void) +{ + return 0; +} +/* Queue p so that rcu_torture_cb() runs on it via call_rcu(). */ +static void rcu_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu(&p->rtort_rcu, rcu_torture_cb); +} + +static struct rcu_torture_ops rcu_ops = { + .init = NULL, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_torture_deferred_free, + .sync = synchronize_rcu, + .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .can_boost = rcu_can_boost(), + .name = "rcu" +}; +/* + * Synchronous deferred-free hook. Calls ->sync(), adds p to + * rcu_torture_removed, then advances every element on that list by one + * pipeline stage, moving those that reach RCU_TORTURE_PIPE_LEN back to + * the freelist. + */ +static void rcu_sync_torture_deferred_free(struct rcu_torture *p) +{ + int i; + struct rcu_torture *rp; + struct rcu_torture *rp1; + + cur_ops->sync(); + list_add(&p->rtort_free, &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } +} +/* ->init hook for the synchronous variants: start with an empty rcu_torture_removed list. */ +static void rcu_sync_torture_init(void) +{ + INIT_LIST_HEAD(&rcu_torture_removed); +} + +static struct rcu_torture_ops rcu_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, + .readunlock = rcu_torture_read_unlock, + .completed = rcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu, + .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .can_boost =
rcu_can_boost(), + .name = "rcu_sync" +}; + +static struct rcu_torture_ops rcu_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_rcu_expedited, + .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .can_boost = rcu_can_boost(), + .name = "rcu_expedited" +}; + +/* + * Definitions for rcu_bh torture testing. + */ + +static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) +{ + rcu_read_lock_bh(); + return 0; +} + +static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) +{ + rcu_read_unlock_bh(); +} + +static int rcu_bh_torture_completed(void) +{ + return rcu_batches_completed_bh(); +} + +static void rcu_bh_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); +} +/* On-stack pairing of a callback head with the completion it signals. */ +struct rcu_bh_torture_synchronize { + struct rcu_head head; + struct completion completion; +}; +/* Callback posted by rcu_bh_torture_synchronize(): completes the waiter. */ +static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head) +{ + struct rcu_bh_torture_synchronize *rcu; + + rcu = container_of(head, struct rcu_bh_torture_synchronize, head); + complete(&rcu->completion); +} +/* + * ->sync hook for the rcu_bh variants: post an on-stack callback with + * call_rcu_bh() and sleep until that callback has signalled completion. + */ +static void rcu_bh_torture_synchronize(void) +{ + struct rcu_bh_torture_synchronize rcu; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} + +static struct rcu_torture_ops rcu_bh_ops = { + .init = NULL, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version.
*/ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_bh_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh" +}; + +static struct rcu_torture_ops rcu_bh_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = rcu_bh_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = rcu_bh_torture_read_unlock, + .completed = rcu_bh_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = rcu_bh_torture_synchronize, + .cb_barrier = NULL, + .fqs = rcu_bh_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "rcu_bh_sync" +}; + +/* + * Definitions for srcu torture testing. + */ + +static struct srcu_struct srcu_ctl; +/* ->init hook: set up srcu_ctl, then the list used by the synchronous deferred-free path. */ +static void srcu_torture_init(void) +{ + init_srcu_struct(&srcu_ctl); + rcu_sync_torture_init(); +} +/* ->cleanup hook: call synchronize_srcu() and then tear down srcu_ctl. */ +static void srcu_torture_cleanup(void) +{ + synchronize_srcu(&srcu_ctl); + cleanup_srcu_struct(&srcu_ctl); +} +/* Returns the index from srcu_read_lock(); the caller hands it back to ->readunlock. */ +static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) +{ + return srcu_read_lock(&srcu_ctl); +} + +static void srcu_read_delay(struct rcu_random_state *rrsp) +{ + long delay; + const long uspertick = 1000000 / HZ; + const long longdelay = 10; + + /* We want there to be long-running readers, but not all the time.
*/ + + delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); + if (!delay) + schedule_timeout_interruptible(longdelay); /* rare case: sleep for longdelay jiffies */ + else + rcu_read_delay(rrsp); +} + +static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) +{ + srcu_read_unlock(&srcu_ctl, idx); +} + +static int srcu_torture_completed(void) +{ + return srcu_batches_completed(&srcu_ctl); +} + +static void srcu_torture_synchronize(void) +{ + synchronize_srcu(&srcu_ctl); +} +/* + * ->stats hook: append one line to page giving, for every possible CPU, + * the two per-CPU srcu_ctl counters (the one not selected by idx first). + * Returns the number of characters written. + * NOTE(review): uses unbounded sprintf(); the visible caller passes an + * offset into the 4096-byte printk_buf - confirm this cannot overflow + * on systems with many possible CPUs. + */ +static int srcu_torture_stats(char *page) +{ + int cnt = 0; + int cpu; + int idx = srcu_ctl.completed & 0x1; + + cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", + torture_type, TORTURE_FLAG, idx); + for_each_possible_cpu(cpu) { + cnt += sprintf(&page[cnt], " %d(%d,%d)", cpu, + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], + per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); + } + cnt += sprintf(&page[cnt], "\n"); + return cnt; +} + +static struct rcu_torture_ops srcu_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu" +}; + +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + +static struct rcu_torture_ops srcu_expedited_ops = { + .init = srcu_torture_init, + .cleanup = srcu_torture_cleanup, + .readlock = srcu_torture_read_lock, + .read_delay = srcu_read_delay, + .readunlock = srcu_torture_read_unlock, + .completed = srcu_torture_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = srcu_torture_synchronize_expedited, + .cb_barrier = NULL, + .stats = srcu_torture_stats, + .name = "srcu_expedited" +}; + +/* + * Definitions for sched torture testing.
+ */ +/* Read side for the sched variants: a region with preemption disabled. Always returns 0. */ +static int sched_torture_read_lock(void) +{ + preempt_disable(); + return 0; +} + +static void sched_torture_read_unlock(int idx) +{ + preempt_enable(); +} + +static void rcu_sched_torture_deferred_free(struct rcu_torture *p) +{ + call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); +} + +static void sched_torture_synchronize(void) +{ + synchronize_sched(); +} + +static struct rcu_torture_ops sched_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sched_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "sched" +}; +/* + * NOTE(review): unlike sched_ops and sched_expedited_ops, this vector + * leaves ->irq_capable unset (zero) - confirm that is intentional. + */ +static struct rcu_torture_ops sched_sync_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = sched_torture_synchronize, + .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, + .stats = NULL, + .name = "sched_sync" +}; + +static struct rcu_torture_ops sched_expedited_ops = { + .init = rcu_sync_torture_init, + .cleanup = NULL, + .readlock = sched_torture_read_lock, + .read_delay = rcu_read_delay, /* just reuse rcu's version. */ + .readunlock = sched_torture_read_unlock, + .completed = rcu_no_completed, + .deferred_free = rcu_sync_torture_deferred_free, + .sync = synchronize_sched_expedited, + .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, + .stats = NULL, + .irq_capable = 1, + .name = "sched_expedited" +}; + +/* + * RCU torture priority-boost testing.
Runs one real-time thread per + * CPU for moderate bursts, repeatedly registering RCU callbacks and + * spinning waiting for them to be invoked. If a given callback takes + * too long to be invoked, we assume that priority inversion has occurred. + */ +/* On-stack callback head plus a flag that is nonzero while the callback is queued. */ +struct rcu_boost_inflight { + struct rcu_head rcu; + int inflight; +}; +/* Callback posted by rcu_torture_boost(): clears ->inflight so another one may be posted. */ +static void rcu_torture_boost_cb(struct rcu_head *head) +{ + struct rcu_boost_inflight *rbip = + container_of(head, struct rcu_boost_inflight, rcu); + + smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ + rbip->inflight = 0; +} +/* + * Boost-test kthread body; arg is unused. Each cycle waits for + * boost_starttime, then for test_boost_duration seconds keeps one + * callback in flight, counting a failure when reposting took longer + * than test_boost_duration * HZ - HZ / 2 jiffies, and finally schedules + * the next interval. Returns 0 once the kthread has been asked to stop + * and the last posted callback has run. + */ +static int rcu_torture_boost(void *arg) +{ + unsigned long call_rcu_time; + unsigned long endtime; + unsigned long oldstarttime; + struct rcu_boost_inflight rbi = { .inflight = 0 }; + struct sched_param sp; + + VERBOSE_PRINTK_STRING("rcu_torture_boost started"); + + /* Set real-time priority. */ + sp.sched_priority = 1; + if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { + VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); + n_rcu_torture_boost_rterror++; + } + + init_rcu_head_on_stack(&rbi.rcu); + /* Each pass through the following loop does one boost-test cycle. */ + do { + /* Wait for the next test interval. */ + oldstarttime = boost_starttime; + while (jiffies - oldstarttime > ULONG_MAX / 2) { + schedule_timeout_uninterruptible(1); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* Do one boost-test interval. */ + endtime = oldstarttime + test_boost_duration * HZ; + call_rcu_time = jiffies; + while (jiffies - endtime > ULONG_MAX / 2) { + /* If we don't have a callback in flight, post one. */ + if (!rbi.inflight) { + smp_mb(); /* RCU core before ->inflight = 1.
*/ + rbi.inflight = 1; + call_rcu(&rbi.rcu, rcu_torture_boost_cb); + if (jiffies - call_rcu_time > + test_boost_duration * HZ - HZ / 2) { + VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); + n_rcu_torture_boost_failure++; + } + call_rcu_time = jiffies; + } + cond_resched(); + rcu_stutter_wait("rcu_torture_boost"); + if (kthread_should_stop() || + fullstop != FULLSTOP_DONTSTOP) + goto checkwait; + } + + /* + * Set the start time of the next test interval. + * Yes, this is vulnerable to long delays, but such + * delays simply cause a false negative for the next + * interval. Besides, we are running at RT priority, + * so delays should be relatively rare. + */ + while (oldstarttime == boost_starttime) { + if (mutex_trylock(&boost_mutex)) { + boost_starttime = jiffies + + test_boost_interval * HZ; + n_rcu_torture_boosts++; + mutex_unlock(&boost_mutex); + break; + } + schedule_timeout_uninterruptible(1); + } + + /* Go do the stutter. */ +checkwait: rcu_stutter_wait("rcu_torture_boost"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + /* + * Clean up and exit. The on-stack rcu_head may still be queued + * here, so wait for ->inflight to clear before destroying it; + * destroying it first would hand debug-objects a still-active + * object. + */ + VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); + rcutorture_shutdown_absorb("rcu_torture_boost"); + while (!kthread_should_stop() || rbi.inflight) + schedule_timeout_uninterruptible(1); + smp_mb(); /* order accesses to ->inflight before stack-frame death. */ + destroy_rcu_head_on_stack(&rbi.rcu); + return 0; +} + +/* + * RCU torture force-quiescent-state kthread. Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions.
+ */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + /* Sleep until the next burst, but give up at once when asked to stop. */ + while (jiffies - fqs_resume_time > LONG_MAX && + !kthread_should_stop()) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + /* + * The burst budget shrinks by fqs_holdoff per pass, so with a + * zero holdoff only the stop check ends this loop. + */ + while (fqs_burst_remaining > 0 && + !kthread_should_stop()) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + int i; + long oldbatch = rcu_batches_completed(); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1); + rp = rcu_torture_alloc(); + if (rp == NULL) + continue; + rp->rtort_pipe_count = 0; + udelay(rcu_random(&rand) & 0x3ff); + old_rp = rcu_dereference_check(rcu_torture_current, + current == writer_task); + rp->rtort_mbtest = 1; + rcu_assign_pointer(rcu_torture_current, rp); + smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ + if (old_rp) { + i = old_rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + old_rp->rtort_pipe_count++; + cur_ops->deferred_free(old_rp); + } + rcutorture_record_progress(++rcu_torture_current_version); + oldbatch = cur_ops->completed(); +
rcu_stutter_wait("rcu_torture_writer"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); + rcutorture_shutdown_absorb("rcu_torture_writer"); + while (!kthread_should_stop()) /* idle until kthread_stop() is called */ + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture fake writer kthread. Repeatedly calls sync, with a random + * delay between calls. + * + * Each pass sleeps for 1 to 10 jiffies, busy-waits for up to 1023 + * microseconds and then calls ->sync(). Once the main loop ends the + * thread idles until kthread_stop() is called, then returns 0. + */ +static int +rcu_torture_fakewriter(void *arg) +{ + DEFINE_RCU_RANDOM(rand); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); + set_user_nice(current, 19); + + do { + schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); + udelay(rcu_random(&rand) & 0x3ff); + cur_ops->sync(); + rcu_stutter_wait("rcu_torture_fakewriter"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + + VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fakewriter"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * RCU torture reader from timer handler. Dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken.
+ */ +static void rcu_torture_timer(unsigned long unused) +{ + int idx; + int completed; + static DEFINE_RCU_RANDOM(rand); + static DEFINE_SPINLOCK(rand_lock); + struct rcu_torture *p; + int pipe_count; + + idx = cur_ops->readlock(); + completed = cur_ops->completed(); /* snapshot, differenced after the delay below */ + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); + if (p == NULL) { + /* Leave because rcu_torture_writer is not yet underway */ + cur_ops->readunlock(idx); + return; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + spin_lock(&rand_lock); /* rand and n_rcu_torture_timers are shared by every invocation */ + cur_ops->read_delay(&rand); + n_rcu_torture_timers++; + spin_unlock(&rand_lock); + preempt_disable(); /* keep the __this_cpu_inc() updates below on one CPU */ + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = cur_ops->completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); +} + +/* + * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, + * incrementing the corresponding element of the pipeline array. The + * counter in the element should never be greater than 1, otherwise, the + * RCU implementation is broken.
+ */ +static int +rcu_torture_reader(void *arg) +{ + int completed; + int idx; + DEFINE_RCU_RANDOM(rand); + struct rcu_torture *p; + int pipe_count; + struct timer_list t; + + VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); + set_user_nice(current, 19); + if (irqreader && cur_ops->irq_capable) + setup_timer_on_stack(&t, rcu_torture_timer, 0); /* extra reader run from a timer handler */ + + do { + if (irqreader && cur_ops->irq_capable) { + if (!timer_pending(&t)) + mod_timer(&t, jiffies + 1); /* re-arm for the next jiffy */ + } + idx = cur_ops->readlock(); + completed = cur_ops->completed(); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); + if (p == NULL) { + /* Wait for rcu_torture_writer to get underway */ + cur_ops->readunlock(idx); + schedule_timeout_interruptible(HZ); + continue; + } + if (p->rtort_mbtest == 0) + atomic_inc(&n_rcu_torture_mberror); + cur_ops->read_delay(&rand); + preempt_disable(); /* keep the __this_cpu_inc() updates below on one CPU */ + pipe_count = p->rtort_pipe_count; + if (pipe_count > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + pipe_count = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_count[pipe_count]); + completed = cur_ops->completed() - completed; + if (completed > RCU_TORTURE_PIPE_LEN) { + /* Should not happen, but... */ + completed = RCU_TORTURE_PIPE_LEN; + } + __this_cpu_inc(rcu_torture_batch[completed]); + preempt_enable(); + cur_ops->readunlock(idx); + schedule(); + rcu_stutter_wait("rcu_torture_reader"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); + rcutorture_shutdown_absorb("rcu_torture_reader"); + if (irqreader && cur_ops->irq_capable) + del_timer_sync(&t); /* the timer lives on this stack frame */ + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* + * Create an RCU-torture statistics message in the specified buffer.
+ */ +static int +rcu_torture_printk(char *page) +{ + int cnt = 0; + int cpu; + int i; + long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; + /* Sum the per-CPU reader histograms over all possible CPUs. */ + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; + batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; + } + } + for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { /* highest non-empty stage; the scan starts below the last bucket */ + if (pipesummary[i] != 0) + break; + } + cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); + cnt += sprintf(&page[cnt], + "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " + "rtmbe: %d rtbke: %ld rtbre: %ld " + "rtbf: %ld rtb: %ld nt: %ld", + rcu_torture_current, + rcu_torture_current_version, + list_empty(&rcu_torture_freelist), + atomic_read(&n_rcu_torture_alloc), + atomic_read(&n_rcu_torture_alloc_fail), + atomic_read(&n_rcu_torture_free), + atomic_read(&n_rcu_torture_mberror), + n_rcu_torture_boost_ktrerror, + n_rcu_torture_boost_rterror, + n_rcu_torture_boost_failure, + n_rcu_torture_boosts, + n_rcu_torture_timers); + if (atomic_read(&n_rcu_torture_mberror) != 0 || + n_rcu_torture_boost_ktrerror != 0 || + n_rcu_torture_boost_rterror != 0 || + n_rcu_torture_boost_failure != 0) + cnt += sprintf(&page[cnt], " !!!"); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + if (i > 1) { /* some reader saw an element more than one stage old: record a test error */ + cnt += sprintf(&page[cnt], "!!!
"); + atomic_inc(&n_rcu_torture_error); + WARN_ON_ONCE(1); + } + cnt += sprintf(&page[cnt], "Reader Pipe: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + cnt += sprintf(&page[cnt], "Reader Batch: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); + cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); + cnt += sprintf(&page[cnt], "Free-Block Circulation: "); + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + cnt += sprintf(&page[cnt], " %d", + atomic_read(&rcu_torture_wcount[i])); + } + cnt += sprintf(&page[cnt], "\n"); + if (cur_ops->stats) /* optional flavor-specific statistics */ + cnt += cur_ops->stats(&page[cnt]); + return cnt; +} + +/* + * Print torture statistics. Caller must ensure that there is only + * one call to this function at a given time!!! This is normally + * accomplished by relying on the module system to only have one copy + * of the module loaded, and then by giving the rcu_torture_stats + * kthread full control (or the init/cleanup functions when rcu_torture_stats + * thread is not running). + * + * The message is formatted into the shared printk_buf, which is why + * concurrent calls are not allowed. + */ +static void +rcu_torture_stats_print(void) +{ + int cnt; + + cnt = rcu_torture_printk(printk_buf); /* NOTE(review): cnt is assigned but never read */ + printk(KERN_ALERT "%s", printk_buf); +} + +/* + * Periodically prints torture statistics, if periodic statistics printing + * was specified via the stat_interval module parameter. + * + * No need to worry about fullstop here, since this one doesn't reference + * volatile state or register callbacks.
+ */ +static int +rcu_torture_stats(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + rcu_torture_stats_print(); + rcutorture_shutdown_absorb("rcu_torture_stats"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); + return 0; +} + +static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ + +/* Shuffle tasks such that we allow @rcu_idle_cpu to become idle. A special case + * is when @rcu_idle_cpu = -1, when we allow the tasks to run on all CPUs. + * + * The resulting mask is applied to the calling task, every reader and + * fake-writer task, the writer task and the stats task. Afterwards + * @rcu_idle_cpu steps down by one, restarting from num_online_cpus() - 1 + * after the -1 case. Nothing is changed when only one CPU is online. + */ +static void rcu_torture_shuffle_tasks(void) +{ + int i; + + cpumask_setall(shuffle_tmp_mask); + get_online_cpus(); + + /* No point in shuffling if there is only one online CPU (ex: UP) */ + if (num_online_cpus() == 1) { + put_online_cpus(); + return; + } + + if (rcu_idle_cpu != -1) + cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); + + set_cpus_allowed_ptr(current, shuffle_tmp_mask); + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + if (reader_tasks[i]) + set_cpus_allowed_ptr(reader_tasks[i], + shuffle_tmp_mask); + } + + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) + if (fakewriter_tasks[i]) + set_cpus_allowed_ptr(fakewriter_tasks[i], + shuffle_tmp_mask); + } + + if (writer_task) + set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); + + if (stats_task) + set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); + + if (rcu_idle_cpu == -1) + rcu_idle_cpu = num_online_cpus() - 1; + else + rcu_idle_cpu--; + + put_online_cpus(); +} + +/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the + * system to become idle at a time and cut off its timer ticks. This is meant + * to test the support for such tickless idle CPU in RCU.
+ */ +static int +rcu_torture_shuffle(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); + do { + schedule_timeout_interruptible(shuffle_interval * HZ); + rcu_torture_shuffle_tasks(); + rcutorture_shutdown_absorb("rcu_torture_shuffle"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); + return 0; +} + +/* Cause the rcutorture test to "stutter", starting and stopping all + * threads periodically. + * + * Alternates a sleep of stutter seconds with stutter_pause_test clear + * and, unless a stop was requested, a sleep of stutter seconds with it + * set. The other kthreads observe the flag through rcu_stutter_wait(). + */ +static int +rcu_torture_stutter(void *arg) +{ + VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); + do { + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 1; + if (!kthread_should_stop()) + schedule_timeout_interruptible(stutter * HZ); + stutter_pause_test = 0; + rcutorture_shutdown_absorb("rcu_torture_stutter"); + } while (!kthread_should_stop()); + VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); + return 0; +} +/* + * Log the current test parameters on one KERN_ALERT line labelled with + * tag. The cur_ops parameter shadows the file-scope cur_ops; only its + * ->can_boost field is printed. + */ +static inline void +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) +{ + printk(KERN_ALERT "%s" TORTURE_FLAG + "--- %s: nreaders=%d nfakewriters=%d " + "stat_interval=%d verbose=%d test_no_idle_hz=%d " + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " + "test_boost=%d/%d test_boost_interval=%d " + "test_boost_duration=%d\n", + torture_type, tag, nrealreaders, nfakewriters, + stat_interval, verbose, test_no_idle_hz, shuffle_interval, + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, + test_boost, cur_ops->can_boost, + test_boost_interval, test_boost_duration); +} + +static struct notifier_block rcutorture_shutdown_nb = { + .notifier_call = rcutorture_shutdown_notify, +}; +/* + * Stop the boost kthread recorded for cpu, if there is one. The + * boost_tasks[] slot is cleared under boost_mutex; the kthread itself + * is stopped after the mutex has been released. + */ +static void rcutorture_booster_cleanup(int cpu) +{ + struct task_struct *t; + + if (boost_tasks[cpu] == NULL) + return; + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); + t = boost_tasks[cpu]; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex);
+ + /* This must be outside of the mutex, otherwise deadlock! */ + kthread_stop(t); +} + +static int rcutorture_booster_init(int cpu) +{ + int retval; + + if (boost_tasks[cpu] != NULL) + return 0; /* Already created, nothing more to do. */ + + /* Don't allow time recalculation while creating a new task. */ + mutex_lock(&boost_mutex); + VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); + boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, + "rcu_torture_boost"); + if (IS_ERR(boost_tasks[cpu])) { + retval = PTR_ERR(boost_tasks[cpu]); + VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); + n_rcu_torture_boost_ktrerror++; + boost_tasks[cpu] = NULL; + mutex_unlock(&boost_mutex); + return retval; + } + kthread_bind(boost_tasks[cpu], cpu); + wake_up_process(boost_tasks[cpu]); + mutex_unlock(&boost_mutex); + return 0; +} + +static int rcutorture_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + + switch (action) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + (void)rcutorture_booster_init(cpu); + break; + case CPU_DOWN_PREPARE: + rcutorture_booster_cleanup(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block rcutorture_cpu_nb = { + .notifier_call = rcutorture_cpu_notify, +}; + +static void +rcu_torture_cleanup(void) +{ + int i; + + mutex_lock(&fullstop_mutex); + rcutorture_record_test_transition(); + if (fullstop == FULLSTOP_SHUTDOWN) { + printk(KERN_WARNING /* but going down anyway, so... 
*/ + "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); + mutex_unlock(&fullstop_mutex); + schedule_timeout_uninterruptible(10); + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + return; + } + fullstop = FULLSTOP_RMMOD; + mutex_unlock(&fullstop_mutex); + unregister_reboot_notifier(&rcutorture_shutdown_nb); + if (stutter_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); + kthread_stop(stutter_task); + } + stutter_task = NULL; + if (shuffler_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); + kthread_stop(shuffler_task); + free_cpumask_var(shuffle_tmp_mask); + } + shuffler_task = NULL; + + if (writer_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); + kthread_stop(writer_task); + } + writer_task = NULL; + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) { + if (reader_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_reader task"); + kthread_stop(reader_tasks[i]); + } + reader_tasks[i] = NULL; + } + kfree(reader_tasks); + reader_tasks = NULL; + } + rcu_torture_current = NULL; + + if (fakewriter_tasks) { + for (i = 0; i < nfakewriters; i++) { + if (fakewriter_tasks[i]) { + VERBOSE_PRINTK_STRING( + "Stopping rcu_torture_fakewriter task"); + kthread_stop(fakewriter_tasks[i]); + } + fakewriter_tasks[i] = NULL; + } + kfree(fakewriter_tasks); + fakewriter_tasks = NULL; + } + + if (stats_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); + kthread_stop(stats_task); + } + stats_task = NULL; + + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + unregister_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) + rcutorture_booster_cleanup(i); + } + + /* Wait for all RCU callbacks to fire. 
*/ + + if (cur_ops->cb_barrier != NULL) + cur_ops->cb_barrier(); + + rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ + + if (cur_ops->cleanup) + cur_ops->cleanup(); + if (atomic_read(&n_rcu_torture_error)) + rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); + else + rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); +} + +static int __init +rcu_torture_init(void) +{ + int i; + int cpu; + int firsterr = 0; + static struct rcu_torture_ops *torture_ops[] = + { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, + &rcu_bh_ops, &rcu_bh_sync_ops, + &srcu_ops, &srcu_expedited_ops, + &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; + + mutex_lock(&fullstop_mutex); + + /* Process args and tell the world that the torturer is on the job. */ + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { + cur_ops = torture_ops[i]; + if (strcmp(torture_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(torture_ops)) { /* No match: list the valid types and fail. */ + printk(KERN_ALERT "rcu-torture: invalid torture type: \"%s\"\n", + torture_type); + printk(KERN_ALERT "rcu-torture types:"); + for (i = 0; i < ARRAY_SIZE(torture_ops); i++) + printk(KERN_CONT " %s", torture_ops[i]->name); /* Continue the types line. */ + printk(KERN_CONT "\n"); /* Terminate the types line. */ + mutex_unlock(&fullstop_mutex); + return -EINVAL; + } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " + "fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } + if (cur_ops->init) + cur_ops->init(); /* no "goto unwind" prior to this point!!! */ + + if (nreaders >= 0) + nrealreaders = nreaders; + else + nrealreaders = 2 * num_online_cpus(); /* Negative nreaders: default to two readers per online CPU. */ + rcu_torture_print_module_parms(cur_ops, "Start of test"); + fullstop = FULLSTOP_DONTSTOP; + + /* Set up the freelist. 
*/ + + INIT_LIST_HEAD(&rcu_torture_freelist); + for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { + rcu_tortures[i].rtort_mbtest = 0; + list_add_tail(&rcu_tortures[i].rtort_free, + &rcu_torture_freelist); + } + + /* Initialize the statistics so that each run gets its own numbers. */ + + rcu_torture_current = NULL; + rcu_torture_current_version = 0; + atomic_set(&n_rcu_torture_alloc, 0); + atomic_set(&n_rcu_torture_alloc_fail, 0); + atomic_set(&n_rcu_torture_free, 0); + atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_error, 0); + n_rcu_torture_boost_ktrerror = 0; + n_rcu_torture_boost_rterror = 0; + n_rcu_torture_boost_failure = 0; + n_rcu_torture_boosts = 0; + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) + atomic_set(&rcu_torture_wcount[i], 0); + for_each_possible_cpu(cpu) { + for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { + per_cpu(rcu_torture_count, cpu)[i] = 0; + per_cpu(rcu_torture_batch, cpu)[i] = 0; + } + } + + /* Start up the kthreads. Any failure from here on unwinds through rcu_torture_cleanup(). */ + + VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); + writer_task = kthread_run(rcu_torture_writer, NULL, + "rcu_torture_writer"); + if (IS_ERR(writer_task)) { + firsterr = PTR_ERR(writer_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); + writer_task = NULL; + goto unwind; + } + fakewriter_tasks = kcalloc(nfakewriters, sizeof(fakewriter_tasks[0]), + GFP_KERNEL); /* Overflow-checked; zeroed so cleanup skips unstarted slots. */ + if (fakewriter_tasks == NULL) { + VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nfakewriters; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); + fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, + "rcu_torture_fakewriter"); + if (IS_ERR(fakewriter_tasks[i])) { + firsterr = PTR_ERR(fakewriter_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); + fakewriter_tasks[i] = NULL; + goto unwind; + } + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); /* Overflow-checked; zeroed so cleanup skips unstarted slots. */ + if (reader_tasks == NULL) { +
VERBOSE_PRINTK_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); + reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, + "rcu_torture_reader"); + if (IS_ERR(reader_tasks[i])) { + firsterr = PTR_ERR(reader_tasks[i]); + VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); + reader_tasks[i] = NULL; + goto unwind; + } + } + if (stat_interval > 0) { + VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); + stats_task = kthread_run(rcu_torture_stats, NULL, + "rcu_torture_stats"); + if (IS_ERR(stats_task)) { + firsterr = PTR_ERR(stats_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); + stats_task = NULL; + goto unwind; + } + } + if (test_no_idle_hz) { + rcu_idle_cpu = num_online_cpus() - 1; + + if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { + firsterr = -ENOMEM; + VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); + goto unwind; + } + + /* Create the shuffler thread */ + shuffler_task = kthread_run(rcu_torture_shuffle, NULL, + "rcu_torture_shuffle"); + if (IS_ERR(shuffler_task)) { + free_cpumask_var(shuffle_tmp_mask); /* Cleanup frees the mask only when shuffler_task is non-NULL. */ + firsterr = PTR_ERR(shuffler_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); + shuffler_task = NULL; + goto unwind; + } + } + if (stutter < 0) + stutter = 0; + if (stutter) { + /* Create the stutter thread */ + stutter_task = kthread_run(rcu_torture_stutter, NULL, + "rcu_torture_stutter"); + if (IS_ERR(stutter_task)) { + firsterr = PTR_ERR(stutter_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); + stutter_task = NULL; + goto unwind; + } + } + if (fqs_duration < 0) + fqs_duration = 0; + if (fqs_duration) { + /* Create the fqs (force-quiescent-state) thread */ + fqs_task = kthread_run(rcu_torture_fqs, NULL, + "rcu_torture_fqs"); + if (IS_ERR(fqs_task)) { + firsterr = PTR_ERR(fqs_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + fqs_task = NULL; + goto unwind; + } + } + if (test_boost_interval < 
1) + test_boost_interval = 1; + if (test_boost_duration < 2) + test_boost_duration = 2; + if ((test_boost == 1 && cur_ops->can_boost) || + test_boost == 2) { + int retval; + + boost_starttime = jiffies + test_boost_interval * HZ; + register_cpu_notifier(&rcutorture_cpu_nb); + for_each_possible_cpu(i) { + if (cpu_is_offline(i)) + continue; /* Heuristic: CPU can go offline. */ + retval = rcutorture_booster_init(i); + if (retval < 0) { + firsterr = retval; + goto unwind; + } + } + } + register_reboot_notifier(&rcutorture_shutdown_nb); + rcutorture_record_test_transition(); + mutex_unlock(&fullstop_mutex); + return 0; + +unwind: + mutex_unlock(&fullstop_mutex); + rcu_torture_cleanup(); + return firsterr; +} + +module_init(rcu_torture_init); +module_exit(rcu_torture_cleanup); diff --git a/kernel/rcutree.c b/kernel/rcutree.c new file mode 100644 index 00000000..3585b42e --- /dev/null +++ b/kernel/rcutree.c @@ -0,0 +1,2098 @@ +/* + * Read-Copy Update mechanism for mutual exclusion + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Authors: Dipankar Sarma + * Manfred Spraul + * Paul E. McKenney Hierarchical version + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 
+ * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcutree.h" + +/* Data structures. */ + +static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; + +#define RCU_STATE_INITIALIZER(structname) { \ + .level = { &structname.node[0] }, \ + .levelcnt = { \ + NUM_RCU_LVL_0, /* root of hierarchy. */ \ + NUM_RCU_LVL_1, \ + NUM_RCU_LVL_2, \ + NUM_RCU_LVL_3, \ + NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ + }, \ + .signaled = RCU_GP_IDLE, \ + .gpnum = -300, \ + .completed = -300, \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ + .n_force_qs = 0, \ + .n_force_qs_ngp = 0, \ + .name = #structname, \ +} + +struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); +DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); + +struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); +DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); + +static struct rcu_state *rcu_state; + +/* + * The rcu_scheduler_active variable transitions from zero to one just + * before the first task is spawned. So when this variable is zero, RCU + * can assume that there is but one task, allowing RCU to (for example) + * optimized synchronize_sched() to a simple barrier(). When this variable + * is one, RCU must actually do all the hard work required to detect real + * grace periods. This variable is also used to suppress boot-time false + * positives from lockdep-RCU error checking. 
+ */ +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); + +/* + * The rcu_scheduler_fully_active variable transitions from zero to one + * during the early_initcall() processing, which is after the scheduler + * is capable of creating new tasks. So RCU processing (for example, + * creating tasks for RCU priority boosting) must be delayed until after + * rcu_scheduler_fully_active transitions from zero to one. We also + * currently delay invocation of any RCU callbacks until after this point. + * + * It might later prove better for people registering RCU callbacks during + * early boot to take responsibility for these callbacks, but one step at + * a time. + */ +static int rcu_scheduler_fully_active __read_mostly; + +#ifdef CONFIG_RCU_BOOST + +/* + * Control variables for per-CPU and per-rcu_node kthreads. These + * handle all flavors of RCU. + */ +static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu); +DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DEFINE_PER_CPU(char, rcu_cpu_has_work); + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); +static void invoke_rcu_core(void); +static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); + +#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */ + +/* + * Track the rcutorture test sequence number and the update version + * number within a given test. The rcutorture_testseq is incremented + * on every rcutorture module load and unload, so has an odd value + * when a test is running. The rcutorture_vernum is set to zero + * when rcutorture starts and is incremented on each rcutorture update. + * These variables enable correlating rcutorture output with the + * RCU tracing information. 
+ */ +unsigned long rcutorture_testseq; +unsigned long rcutorture_vernum; + +/* + * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s + * permit this function to be invoked without holding the root rcu_node + * structure's ->lock, but of course results can be subject to change. + */ +static int rcu_gp_in_progress(struct rcu_state *rsp) +{ + return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); +} + +/* + * Note a quiescent state. Because we do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period, this just sets a flag. + */ +void rcu_sched_qs(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); + + rdp->passed_quiesc_completed = rdp->gpnum - 1; + barrier(); + rdp->passed_quiesc = 1; +} + +void rcu_bh_qs(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + + rdp->passed_quiesc_completed = rdp->gpnum - 1; + barrier(); + rdp->passed_quiesc = 1; +} + +/* + * Note a context switch. This is a quiescent state for RCU-sched, + * and requires special handling for preemptible RCU. + */ +void rcu_note_context_switch(int cpu) +{ + rcu_sched_qs(cpu); + rcu_preempt_note_context_switch(cpu); +} +EXPORT_SYMBOL_GPL(rcu_note_context_switch); + +#ifdef CONFIG_NO_HZ +DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { + .dynticks_nesting = 1, + .dynticks = ATOMIC_INIT(1), +}; +#endif /* #ifdef CONFIG_NO_HZ */ + +static int blimit = 10; /* Maximum callbacks per softirq. */ +static int qhimark = 10000; /* If this many pending, ignore blimit. */ +static int qlowmark = 100; /* Once only this many pending, use blimit. 
*/ + +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); + +int rcu_cpu_stall_suppress __read_mostly; +module_param(rcu_cpu_stall_suppress, int, 0644); + +static void force_quiescent_state(struct rcu_state *rsp, int relaxed); +static int rcu_pending(int cpu); + +/* + * Return the number of RCU-sched batches processed thus far for debug & stats. + */ +long rcu_batches_completed_sched(void) +{ + return rcu_sched_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); + +/* + * Return the number of RCU BH batches processed thus far for debug & stats. + */ +long rcu_batches_completed_bh(void) +{ + return rcu_bh_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); + +/* + * Force a quiescent state for RCU BH (passes relaxed == 0). + */ +void rcu_bh_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_bh_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); + +/* + * Record the number of times rcutorture tests have been initiated and + * terminated. This information allows the debugfs tracing stats to be + * correlated to the rcutorture messages, even when the rcutorture module + * is being repeatedly loaded and unloaded. In other words, we cannot + * store this state in rcutorture itself. Also resets rcutorture_vernum. + */ +void rcutorture_record_test_transition(void) +{ + rcutorture_testseq++; + rcutorture_vernum = 0; +} +EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); + +/* + * Record the number of writer passes through the current rcutorture test. + * This is also used to correlate debugfs tracing stats with the rcutorture + * messages. Each call just increments rcutorture_vernum; vernum is unused. + */ +void rcutorture_record_progress(unsigned long vernum) +{ + rcutorture_vernum++; +} +EXPORT_SYMBOL_GPL(rcutorture_record_progress); + +/* + * Force a quiescent state for RCU-sched. 
+ */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + +/* + * Does the CPU have callbacks ready to be invoked? + */ +static int +cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) +{ + return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]; +} + +/* + * Does the current CPU require an as-yet-unscheduled grace period? + */ +static int +cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) +{ + return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp); +} + +/* + * Return the root node of the specified rcu_state structure. + */ +static struct rcu_node *rcu_get_root(struct rcu_state *rsp) +{ + return &rsp->node[0]; +} + +#ifdef CONFIG_SMP + +/* + * If the specified CPU is offline, tell the caller that it is in + * a quiescent state. Otherwise, unless RCU is preemptible, whack it with a reschedule IPI. + * Grace periods can end up waiting on an offline CPU when that + * CPU is in the process of coming online -- it will be added to the + * rcu_node bitmasks before it actually makes it online. The same thing + * can presumably happen while a CPU is in the process of going offline. Because this + * race is quite rare, we check for it after detecting that the grace + * period has been delayed rather than checking each and every CPU + * each and every time we start a new grace period. Returns 1 if offline, else 0. + */ +static int rcu_implicit_offline_qs(struct rcu_data *rdp) +{ + /* + * If the CPU is offline, it is in a quiescent state. We can + * trust its state not to change because interrupts are disabled. + */ + if (cpu_is_offline(rdp->cpu)) { + rdp->offline_fqs++; + return 1; + } + + /* If preemptible RCU, no point in sending reschedule IPI. */ + if (rdp->preemptible) + return 0; + + /* The CPU is online, so send it a reschedule IPI. 
*/ + if (rdp->cpu != smp_processor_id()) + smp_send_reschedule(rdp->cpu); + else + set_need_resched(); + rdp->resched_ipi++; + return 0; +} + +#endif /* #ifdef CONFIG_SMP */ + +#ifdef CONFIG_NO_HZ + +/** + * rcu_enter_nohz - inform RCU that current CPU is entering nohz + * + * Enter nohz mode, in other words, -leave- the mode in which RCU + * read-side critical sections can occur. (Though RCU read-side + * critical sections can occur in irq handlers in nohz mode, a possibility + * handled by rcu_irq_enter() and rcu_irq_exit()). + */ +void rcu_enter_nohz(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + if (--rdtp->dynticks_nesting) { + local_irq_restore(flags); + return; + } + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); + local_irq_restore(flags); + + /* If the interrupt queued a callback, get out of dyntick mode. */ + if (in_irq() && + (__get_cpu_var(rcu_sched_data).nxtlist || + __get_cpu_var(rcu_bh_data).nxtlist || + rcu_preempt_needs_cpu(smp_processor_id()))) + set_need_resched(); +} + +/* + * rcu_exit_nohz - inform RCU that current CPU is leaving nohz + * + * Exit nohz mode, in other words, -enter- the mode in which RCU + * read-side critical sections normally occur. + */ +void rcu_exit_nohz(void) +{ + unsigned long flags; + struct rcu_dynticks *rdtp; + + local_irq_save(flags); + rdtp = &__get_cpu_var(rcu_dynticks); + if (rdtp->dynticks_nesting++) { + local_irq_restore(flags); + return; + } + smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. 
*/ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + local_irq_restore(flags); +} + +/** + * rcu_nmi_enter - inform RCU of entry to NMI context + * + * If the CPU was idle with dynamic ticks active, and there is no + * irq handler running, this increments rdtp->dynticks (making it odd) to let the + * RCU grace-period handling know that the CPU is active. + */ +void rcu_nmi_enter(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks_nmi_nesting == 0 && + (atomic_read(&rdtp->dynticks) & 0x1)) + return; /* Outermost NMI and ->dynticks already odd: nothing to record. */ + rdtp->dynticks_nmi_nesting++; + smp_mb__before_atomic_inc(); /* Force delay from prior write. */ + atomic_inc(&rdtp->dynticks); + /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ + smp_mb__after_atomic_inc(); /* See above. */ + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); +} + +/** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If the CPU was idle with dynamic ticks active, and there is no + * irq handler running, this increments rdtp->dynticks (making it even) to let the + * RCU grace-period handling know that the CPU is no longer active. + */ +void rcu_nmi_exit(void) +{ + struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks); + + if (rdtp->dynticks_nmi_nesting == 0 || + --rdtp->dynticks_nmi_nesting != 0) + return; /* rcu_nmi_enter() recorded nothing, or this is not the outermost exit. */ + /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ + smp_mb__before_atomic_inc(); /* See above. */ + atomic_inc(&rdtp->dynticks); + smp_mb__after_atomic_inc(); /* Force delay to next write. */ + WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); +} + +/** + * rcu_irq_enter - inform RCU of entry to hard irq context + * + * If the CPU was idle with dynamic ticks active, this updates the + * rdtp->dynticks to let the RCU handling know that the CPU is active. 
+ */ +void rcu_irq_enter(void) +{ + rcu_exit_nohz(); +} + +/** + * rcu_irq_exit - inform RCU of exit from hard irq context + * + * If the CPU was idle with dynamic ticks active, update rdtp->dynticks + * to let the RCU handling know that the CPU is going back to idle + * with no ticks. + */ +void rcu_irq_exit(void) +{ + rcu_enter_nohz(); +} + +#ifdef CONFIG_SMP + +/* + * Snapshot the specified CPU's dynticks counter so that we can later + * credit them with an implicit quiescent state. Return 1 if this CPU + * is in dynticks idle mode, which is an extended quiescent state. + */ +static int dyntick_save_progress_counter(struct rcu_data *rdp) +{ + rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + return (rdp->dynticks_snap & 0x1) == 0; /* Even counter value means dyntick-idle. */ +} + +/* + * Return true if the specified CPU has passed through a quiescent + * state by virtue of being in or having passed through a dynticks + * idle state since the last call to dyntick_save_progress_counter() + * for this same CPU. + */ +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +{ + unsigned long curr; + unsigned long snap; + + curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); + snap = (unsigned long)rdp->dynticks_snap; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq/NMI handlers, then we can safely pretend that the CPU + * already acknowledged the request to pass through a quiescent + * state. Either way, that CPU cannot possibly be in an RCU + * read-side critical section that started before the beginning + * of the current RCU grace period. + */ + if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { + rdp->dynticks_fqs++; + return 1; + } + + /* Go check for the CPU being offline. 
*/ + return rcu_implicit_offline_qs(rdp); +} + +#endif /* #ifdef CONFIG_SMP */ + +#else /* #ifdef CONFIG_NO_HZ */ + +#ifdef CONFIG_SMP + +static int dyntick_save_progress_counter(struct rcu_data *rdp) +{ + return 0; +} + +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +{ + return rcu_implicit_offline_qs(rdp); +} + +#endif /* #ifdef CONFIG_SMP */ + +#endif /* #else #ifdef CONFIG_NO_HZ */ + +int rcu_cpu_stall_suppress __read_mostly; + +static void record_gp_stall_check_time(struct rcu_state *rsp) +{ + rsp->gp_start = jiffies; + rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_state *rsp) +{ + int cpu; + long delta; + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Only let one CPU complain about others per time interval. */ + + raw_spin_lock_irqsave(&rnp->lock, flags); + delta = jiffies - rsp->jiffies_stall; + if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + /* + * OK, time to rat on our buddy... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. 
+ */ + printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {", + rsp->name); + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); + rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (rnp->qsmask == 0) + continue; + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + printk(" %d", rnp->grplo + cpu); + } + printk("} (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rsp->gp_start)); + trigger_all_cpu_backtrace(); + + /* If so configured, complain about tasks blocking the grace period. */ + + rcu_print_detail_task_stall(rsp); + + force_quiescent_state(rsp, 0); /* Kick them all. */ +} + +static void print_cpu_stall(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(rsp); + + /* + * OK, time to rat on ourselves... + * See Documentation/RCU/stallwarn.txt for info on how to debug + * RCU CPU stall warnings. + */ + printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", + rsp->name, smp_processor_id(), jiffies - rsp->gp_start); + trigger_all_cpu_backtrace(); + + raw_spin_lock_irqsave(&rnp->lock, flags); + if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) + rsp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long j; + unsigned long js; + struct rcu_node *rnp; + + if (rcu_cpu_stall_suppress) + return; + j = ACCESS_ONCE(jiffies); + js = ACCESS_ONCE(rsp->jiffies_stall); + rnp = rdp->mynode; + if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rsp); + + } else if (rcu_gp_in_progress(rsp) && + ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { + + /* They had a few time units to dump stack, so complain. 
*/ + print_other_cpu_stall(rsp); + } +} + +static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) +{ + rcu_cpu_stall_suppress = 1; + return NOTIFY_DONE; +} + +/** + * rcu_cpu_stall_reset - prevent further stall warnings in current grace period + * + * Set the stall-warning timeout way off into the future, thus preventing + * any RCU CPU stall-warning messages from appearing in the current set of + * RCU grace periods. + * + * The caller must disable hard irqs. + */ +void rcu_cpu_stall_reset(void) +{ + rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2; + rcu_preempt_stall_reset(); +} + +static struct notifier_block rcu_panic_block = { + .notifier_call = rcu_panic, +}; + +static void __init check_cpu_stall_init(void) +{ + atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); +} + +/* + * Update CPU-local rcu_data state to record the newly noticed grace period. + * This is used both when we started the grace period and when we notice + * that someone else started the grace period. The caller must hold the + * ->lock of the leaf rcu_node structure corresponding to the current CPU, + * and must have irqs disabled. + */ +static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + if (rdp->gpnum != rnp->gpnum) { + /* + * If the current grace period is waiting for this CPU, + * set up to detect a quiescent state, otherwise don't + * go looking for one. + */ + rdp->gpnum = rnp->gpnum; + if (rnp->qsmask & rdp->grpmask) { + rdp->qs_pending = 1; + rdp->passed_quiesc = 0; + } else + rdp->qs_pending = 0; + } +} + +static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. 
*/ + local_irq_restore(flags); + return; + } + __note_new_gpnum(rsp, rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Did someone else start a new RCU grace period since we last + * checked? Update local state appropriately if so. Must be called + * on the CPU corresponding to rdp. Returns 1 if rdp->gpnum differed from rsp->gpnum, else 0. + */ +static int +check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + int ret = 0; + + local_irq_save(flags); + if (rdp->gpnum != rsp->gpnum) { + note_new_gpnum(rsp, rdp); + ret = 1; + } + local_irq_restore(flags); + return ret; +} + +/* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. In addition, the corresponding leaf rcu_node structure's + * ->lock must be held by the caller, with irqs disabled. + */ +static void +__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Did another grace period end? */ + if (rdp->completed != rnp->completed) { + + /* Advance callbacks. No harm if list empty. */ + rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Remember that we saw this grace-period completion. */ + rdp->completed = rnp->completed; + + /* + * If we were in an extended quiescent state, we may have + * missed some grace periods that other CPUs handled on + * our behalf. Catch up with this state to avoid noting + * spurious new grace periods. If another grace period + * has started, then rnp->gpnum will have advanced, so + * we will detect this later on. + */ + if (ULONG_CMP_LT(rdp->gpnum, rdp->completed)) + rdp->gpnum = rdp->completed; + + /* + * If RCU does not need a quiescent state from this CPU, + * then make sure that this CPU doesn't go looking for one. 
+ */ + if ((rnp->qsmask & rdp->grpmask) == 0) + rdp->qs_pending = 0; + } +} + +/* + * Advance this CPU's callbacks, but only if the current grace period + * has ended. This may be called only from the CPU to whom the rdp + * belongs. + */ +static void +rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_node *rnp; + + local_irq_save(flags); + rnp = rdp->mynode; + if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + local_irq_restore(flags); + return; + } + __rcu_process_gp_end(rsp, rnp, rdp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Do per-CPU grace-period initialization for running CPU. The caller + * must hold the lock of the leaf rcu_node structure corresponding to + * this CPU. + */ +static void +rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ + /* Prior grace period ended, so advance callbacks for current CPU. */ + __rcu_process_gp_end(rsp, rnp, rdp); + + /* + * Because this CPU just now started the new grace period, we know + * that all of its callbacks will be covered by this upcoming grace + * period, even the ones that were registered arbitrarily recently. + * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. + * + * Other CPUs cannot be sure exactly when the grace period started. + * Therefore, their recently registered callbacks must pass through + * an additional RCU_NEXT_READY stage, so that they will be handled + * by the next RCU grace period. + */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + /* Set state so that this CPU will detect the next quiescent state. */ + __note_new_gpnum(rsp, rnp, rdp); +} + +/* + * Start a new RCU grace period if warranted, re-initializing the hierarchy + * in preparation for detecting the next grace period. 
The caller must hold + * the root node's ->lock, which is released before return. Hard irqs must + * be disabled. + */ +static void +rcu_start_gp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_node *rnp = rcu_get_root(rsp); + + if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { + if (cpu_needs_another_gp(rsp, rdp)) + rsp->fqs_need_gp = 1; /* force_quiescent_state() starts the GP later. */ + if (rnp->completed == rsp->completed) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * Propagate new ->completed value to rcu_node structures + * so that other CPUs don't have to wait until the start + * of the next grace period to process their callbacks. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->completed = rsp->completed; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + local_irq_restore(flags); + return; + } + + /* Advance to a new grace period and initialize state. */ + rsp->gpnum++; + WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); + rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + record_gp_stall_check_time(rsp); + + /* Special-case the common single-level case. */ + if (NUM_RCU_NODES == 1) { + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ + rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ + + + /* Exclude any concurrent CPU-hotplug operations. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled.
*/ + + /* + * Set the quiescent-state-needed bits in all the rcu_node + * structures for all currently online CPUs in breadth-first + * order, starting from the root rcu_node structure. This + * operation relies on the layout of the hierarchy within the + * rsp->node[] array. Note that other CPUs will access only + * the leaves of the hierarchy, which still indicate that no + * grace period is in progress, at least until the corresponding + * leaf node has been initialized. In addition, we have excluded + * CPU-hotplug operations. + * + * Note that the grace period cannot complete until we finish + * the initialization process, as there will be at least one + * qsmask bit set in the root node until that time, namely the + * one corresponding to this CPU, due to the fact that we have + * irqs disabled. + */ + rcu_for_each_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rcu_preempt_check_blocked_tasks(rnp); + rnp->qsmask = rnp->qsmaskinit; + rnp->gpnum = rsp->gpnum; + rnp->completed = rsp->completed; + if (rnp == rdp->mynode) /* this CPU's own rcu_node. */ + rcu_start_gp_per_cpu(rsp, rnp, rdp); + rcu_preempt_boost_start_gp(rnp); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + + rnp = rcu_get_root(rsp); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +/* + * Report a full set of quiescent states to the specified rcu_state + * data structure. This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed. Note that the caller must hold rnp->lock, as + * required by rcu_start_gp(), which will release it.
+ */ +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) + __releases(rcu_get_root(rsp)->lock) +{ + unsigned long gp_duration; + + WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); + + /* + * Ensure that all grace-period and pre-grace-period activity + * is seen before the assignment to rsp->completed. + */ + smp_mb(); /* See above block comment. */ + gp_duration = jiffies - rsp->gp_start; + if (gp_duration > rsp->gp_max) + rsp->gp_max = gp_duration; + rsp->completed = rsp->gpnum; + rsp->signaled = RCU_GP_IDLE; + rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ +} + +/* + * Similar to rcu_report_qs_rdp(), for which it is a helper function. + * Allows quiescent states for a group of CPUs to be reported at one go + * to the specified rcu_node structure, though all the CPUs in the group + * must be represented by the same rcu_node structure (which need not be + * a leaf rcu_node structure, though it often will be). That structure's + * lock must be held upon entry, and it is released before return. + */ +static void +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + struct rcu_node *rnp_c; + + /* Walk up the rcu_node hierarchy. */ + for (;;) { + if (!(rnp->qsmask & mask)) { + + /* Our bit has already been cleared, so done. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + rnp->qsmask &= ~mask; + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + + /* Other bits still set at this level, so done. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + mask = rnp->grpmask; + if (rnp->parent == NULL) { + + /* No more levels. Exit loop holding root lock.
*/ + + break; + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rnp_c = rnp; + rnp = rnp->parent; + raw_spin_lock_irqsave(&rnp->lock, flags); + WARN_ON_ONCE(rnp_c->qsmask); + } + + /* + * Get here if we are the last CPU to pass through a quiescent + * state for this grace period. Invoke rcu_report_qs_rsp() + * to clean up and start the next grace period if one is needed. + */ + rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ +} + +/* + * Record a quiescent state for the specified CPU to that CPU's rcu_data + * structure. This must be either called from the specified CPU, or + * called when the specified CPU is known to be offline (and when it is + * also known that no other CPU is concurrently trying to help the offline + * CPU). The lastcomp argument is used to make sure we are still in the + * grace period of interest. We don't want to end the current grace period + * based on quiescent states detected in an earlier grace period! + */ +static void +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp; + + rnp = rdp->mynode; + raw_spin_lock_irqsave(&rnp->lock, flags); + if (lastcomp != rnp->completed) { + + /* + * Someone beat us to it for this grace period, so leave. + * The race with GP start is resolved by the fact that we + * hold the leaf rcu_node lock, so that the per-CPU bits + * cannot yet be initialized -- so we would simply find our + * CPU's bit already cleared in rcu_report_qs_rnp() if this + * race occurred. + */ + rdp->passed_quiesc = 0; /* try again later! */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + mask = rdp->grpmask; + if ((rnp->qsmask & mask) == 0) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else { + rdp->qs_pending = 0; + + /* + * This GP can't end until cpu checks in, so all of our + * callbacks can be processed during the next GP.
+ */ + rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + + rcu_report_qs_rnp(mask, rsp, rnp, flags); /* releases rnp->lock */ + } +} + +/* + * Check to see if there is a new grace period of which this CPU + * is not yet aware, and if so, set up local rcu_data state for it. + * Otherwise, see if this CPU has just passed through its first + * quiescent state for this grace period, and record that fact if so. + */ +static void +rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) +{ + /* If there is now a new grace period, record and return. */ + if (check_for_new_grace_period(rsp, rdp)) + return; + + /* + * Does this CPU still need to do its part for current grace period? + * If no, return and let the other CPUs do their part as well. + */ + if (!rdp->qs_pending) + return; + + /* + * Was there a quiescent state since the beginning of the grace + * period? If no, then exit and wait for the next call. + */ + if (!rdp->passed_quiesc) + return; + + /* + * Tell RCU we are done (but rcu_report_qs_rdp() will be the + * judge of that). + */ + rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Move a dying CPU's RCU callbacks to online CPU's callback list. + * Synchronization is not required because this function executes + * in stop_machine() context. + */ +static void rcu_send_cbs_to_online(struct rcu_state *rsp) +{ + int i; + /* current DYING CPU is cleared in the cpu_online_mask */ + int receive_cpu = cpumask_any(cpu_online_mask); + struct rcu_data *rdp = this_cpu_ptr(rsp->rda); + struct rcu_data *receive_rdp = per_cpu_ptr(rsp->rda, receive_cpu); + + if (rdp->nxtlist == NULL) + return; /* irqs disabled, so comparison is stable.
*/ + + *receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; + receive_rdp->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + receive_rdp->qlen += rdp->qlen; + receive_rdp->n_cbs_adopted += rdp->qlen; + rdp->n_cbs_orphaned += rdp->qlen; + + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen = 0; +} + +/* + * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy + * and move all callbacks from the outgoing CPU to the current one. + * There can only be one CPU hotplug operation at a time, so no other + * CPU can be attempting to update rcu_cpu_kthread_task. + * + * NOTE(review): no callback movement is visible in this function's + * body; callbacks appear to be moved by rcu_send_cbs_to_online() at + * CPU_DYING time instead -- confirm and update the text above. + */ +static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + int need_report = 0; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp; + + rcu_stop_cpu_kthread(cpu); + + /* Exclude any attempts to start a new grace period. */ + raw_spin_lock_irqsave(&rsp->onofflock, flags); + + /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ + rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ + mask = rdp->grpmask; /* rnp->grplo is constant. */ + do { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit &= ~mask; + if (rnp->qsmaskinit != 0) { + if (rnp != rdp->mynode) + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + break; + } + if (rnp == rdp->mynode) + need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); + else + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + mask = rnp->grpmask; + rnp = rnp->parent; + } while (rnp != NULL); + + /* + * We still hold the leaf rcu_node structure lock here, and + * irqs are still disabled. The reason for this subterfuge is + * because invoking rcu_report_unblock_qs_rnp() with ->onofflock + * held leads to deadlock. + */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled.
*/ + rnp = rdp->mynode; + if (need_report & RCU_OFL_TASKS_NORM_GP) + rcu_report_unblock_qs_rnp(rnp, flags); + else + raw_spin_unlock_irqrestore(&rnp->lock, flags); + if (need_report & RCU_OFL_TASKS_EXP_GP) + rcu_report_exp_rnp(rsp, rnp); + rcu_node_kthread_setaffinity(rnp, -1); +} + +/* + * Remove the specified CPU from the RCU hierarchy and move any pending + * callbacks that it might have to the current CPU. This code assumes + * that at least one CPU in the system will remain running at all times. + * Any attempt to offline -all- CPUs is likely to strand RCU callbacks. + */ +static void rcu_offline_cpu(int cpu) +{ + __rcu_offline_cpu(cpu, &rcu_sched_state); + __rcu_offline_cpu(cpu, &rcu_bh_state); + rcu_preempt_offline_cpu(cpu); +} + +#else /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_send_cbs_to_online(struct rcu_state *rsp) +{ +} + +static void rcu_offline_cpu(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Invoke any RCU callbacks that have made it to the end of their grace + * period. Throttle as specified by rdp->blimit. + */ +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + struct rcu_head *next, *list, **tail; + int count; + + /* If no callbacks are ready, just return.*/ + if (!cpu_has_callbacks_ready_to_invoke(rdp)) + return; + + /* + * Extract the list of ready callbacks, disabling to prevent + * races with call_rcu() from interrupt handlers. + */ + local_irq_save(flags); + list = rdp->nxtlist; + rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; + *rdp->nxttail[RCU_DONE_TAIL] = NULL; + tail = rdp->nxttail[RCU_DONE_TAIL]; + for (count = RCU_NEXT_SIZE - 1; count >= 0; count--) + if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL]) + rdp->nxttail[count] = &rdp->nxtlist; + local_irq_restore(flags); + + /* Invoke callbacks.
*/ + count = 0; + while (list) { + next = list->next; + prefetch(next); + debug_rcu_head_unqueue(list); + __rcu_reclaim(list); + list = next; + if (++count >= rdp->blimit) + break; + } + + local_irq_save(flags); + + /* Update count, and requeue any remaining callbacks. */ + rdp->qlen -= count; + rdp->n_cbs_invoked += count; + if (list != NULL) { + *tail = rdp->nxtlist; + rdp->nxtlist = list; + for (count = 0; count < RCU_NEXT_SIZE; count++) + if (&rdp->nxtlist == rdp->nxttail[count]) + rdp->nxttail[count] = tail; + else + break; + } + + /* Reinstate batch limit if we have worked down the excess. */ + if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) + rdp->blimit = blimit; + + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + + local_irq_restore(flags); + + /* Re-raise the RCU softirq if there are callbacks remaining. */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_core(); +} + +/* + * Check to see if this CPU is in a non-context-switch quiescent state + * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). + * Also schedule the RCU softirq handler. + * + * This function must be called with hardirqs disabled. It is normally + * invoked from the scheduling-clock interrupt. If rcu_pending returns + * false, there is no point in invoking rcu_check_callbacks(). + */ +void rcu_check_callbacks(int cpu, int user) +{ + if (user || + (idle_cpu(cpu) && rcu_scheduler_active && + !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { + + /* + * Get here if this CPU took its interrupt from user + * mode or from the idle loop, and if this is not a + * nested interrupt. In this case, the CPU is in + * a quiescent state, so note it.
+ * + * No memory barrier is required here because both + * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local + * variables that other CPUs neither access nor modify, + * at least not while the corresponding CPU is online. + */ + + rcu_sched_qs(cpu); + rcu_bh_qs(cpu); + + } else if (!in_softirq()) { + + /* + * Get here if this CPU did not take its interrupt from + * softirq, in other words, if it is not interrupting + * a rcu_bh read-side critical section. This is an _bh + * critical section, so note it. + */ + + rcu_bh_qs(cpu); + } + rcu_preempt_check_callbacks(cpu); + if (rcu_pending(cpu)) + invoke_rcu_core(); +} + +#ifdef CONFIG_SMP + +/* + * Scan the leaf rcu_node structures, processing dyntick state for any that + * have not yet encountered a quiescent state, using the function specified. + * Also initiate boosting for any threads blocked on the root rcu_node. + * + * The caller must have suppressed start of new grace periods. + */ +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) +{ + unsigned long bit; + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp; + + rcu_for_each_leaf_node(rsp, rnp) { + mask = 0; + raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_gp_in_progress(rsp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rnp->qsmask == 0) { + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + continue; + } + cpu = rnp->grplo; + bit = 1; + for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + if ((rnp->qsmask & bit) != 0 && + f(per_cpu_ptr(rsp->rda, cpu))) + mask |= bit; + } + if (mask != 0) { + + /* rcu_report_qs_rnp() releases rnp->lock. */ + rcu_report_qs_rnp(mask, rsp, rnp, flags); + continue; + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } + rnp = rcu_get_root(rsp); + if (rnp->qsmask == 0) { + raw_spin_lock_irqsave(&rnp->lock, flags); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock.
*/ + } +} + +/* + * Force quiescent states on reluctant CPUs, and also detect which + * CPUs are in dyntick-idle mode. + */ +static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +{ + unsigned long flags; + struct rcu_node *rnp = rcu_get_root(rsp); + + if (!rcu_gp_in_progress(rsp)) + return; /* No grace period in progress, nothing to force. */ + if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { + rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ + return; /* Someone else is already on the job. */ + } + if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) + goto unlock_fqs_ret; /* no emergency and done recently. */ + rsp->n_force_qs++; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; + if(!rcu_gp_in_progress(rsp)) { + rsp->n_force_qs_ngp++; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + goto unlock_fqs_ret; /* no GP in progress, time updated. */ + } + rsp->fqs_active = 1; + switch (rsp->signaled) { + case RCU_GP_IDLE: + case RCU_GP_INIT: + + break; /* grace period idle or initializing, ignore. */ + + case RCU_SAVE_DYNTICK: + if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) + break; /* So gcc recognizes the dead code. */ + + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + + /* Record dyntick-idle state. */ + force_qs_rnp(rsp, dyntick_save_progress_counter); + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + if (rcu_gp_in_progress(rsp)) + rsp->signaled = RCU_FORCE_QS; + break; + + case RCU_FORCE_QS: + + /* Check dyntick-idle state, send IPI to laggards. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + + /* Leave state in case more forcing is required.
*/ + + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + break; + } + rsp->fqs_active = 0; + if (rsp->fqs_need_gp) { + raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ + rsp->fqs_need_gp = 0; + rcu_start_gp(rsp, flags); /* releases rnp->lock */ + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ +unlock_fqs_ret: + raw_spin_unlock_irqrestore(&rsp->fqslock, flags); +} + +#else /* #ifdef CONFIG_SMP */ + +static void force_quiescent_state(struct rcu_state *rsp, int relaxed) +{ + set_need_resched(); +} + +#endif /* #else #ifdef CONFIG_SMP */ + +/* + * This does the RCU processing work from softirq context for the + * specified rcu_state and rcu_data structures. This may be called + * only from the CPU to whom the rdp belongs. + */ +static void +__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +{ + unsigned long flags; + + WARN_ON_ONCE(rdp->beenonline == 0); + + /* + * If an RCU GP has gone long enough, go check for dyntick + * idle CPUs and, if needed, send resched IPIs. + */ + if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) + force_quiescent_state(rsp, 1); + + /* + * Advance callbacks in response to end of earlier grace + * period that some other CPU ended. + */ + rcu_process_gp_end(rsp, rdp); + + /* Update RCU state based on any recent quiescent states. */ + rcu_check_quiescent_state(rsp, rdp); + + /* Does this CPU require a not-yet-started grace period? */ + if (cpu_needs_another_gp(rsp, rdp)) { + raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + rcu_start_gp(rsp, flags); /* releases above lock */ + } + + /* If there are callbacks ready, invoke them. */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) + invoke_rcu_callbacks(rsp, rdp); +} + +/* + * Do softirq processing for the current CPU.
+ */ +static void rcu_process_callbacks(struct softirq_action *unused) +{ + __rcu_process_callbacks(&rcu_sched_state, + &__get_cpu_var(rcu_sched_data)); + __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_preempt_process_callbacks(); + + /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ + rcu_needs_cpu_flush(); +} +static atomic_t rcu_barrier_cpu_count; /* also read by __call_rcu() below. */ + +/* + * Wake up the current CPU's kthread. This replaces raise_softirq() + * in earlier versions of RCU. Note that because we are running on + * the current CPU with interrupts disabled, the rcu_cpu_kthread_task + * cannot disappear out from under us. + * + * The kthread path is taken only when rsp->boost is set; otherwise + * callbacks are invoked directly via rcu_do_batch(). Nothing is done + * until rcu_scheduler_fully_active is set. + */ +static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) +{ + if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) + return; + if (likely(!rsp->boost)) { + rcu_do_batch(rsp, rdp); + return; + } + invoke_rcu_callbacks_kthread(); +} + +static void invoke_rcu_core(void) +{ + raise_softirq(RCU_SOFTIRQ); +} +/* + * Common code for call_rcu_sched() and call_rcu_bh(): record func in + * head and append head to this CPU's callback list for rsp. If the + * caller had irqs enabled, also start or force a grace period when + * the list has grown by more than qhimark since the last check or + * when rsp->jiffies_force_qs has passed. + */ +static void +__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), + struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + debug_rcu_head_queue(head); + head->func = func; + head->next = NULL; + + smp_mb(); /* Ensure RCU update seen before callback registry. */ + + /* + * Opportunistically note grace-period endings and beginnings. + * Note that we might see a beginning right after we see an + * end, but never vice versa, since this CPU has to pass through + * a quiescent state betweentimes. + */ + local_irq_save(flags); + rdp = this_cpu_ptr(rsp->rda); + + /* Add the callback to our list. */ + *rdp->nxttail[RCU_NEXT_TAIL] = head; + rdp->nxttail[RCU_NEXT_TAIL] = &head->next; + rdp->qlen++; + + /* If interrupts were disabled, don't dive into RCU core.
*/ + if (irqs_disabled_flags(flags)) { + local_irq_restore(flags); + return; + } + + /* Workaround for a reboot issue: a nonzero rcu_barrier_cpu_count + means that an _rcu_barrier() is in progress, in which case call + tick_nohz_restart_sched_tick(). If we enqueue an rcu + callback, we need the CPU tick to stay alive until we take care + of those by completing the appropriate grace period. */ + if (atomic_read(&rcu_barrier_cpu_count) != 0) + tick_nohz_restart_sched_tick(); + + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + + /* Are we ignoring a completed grace period? */ + rcu_process_gp_end(rsp, rdp); + check_for_new_grace_period(rsp, rdp); + + /* Start a new grace period if one not already started. */ + if (!rcu_gp_in_progress(rsp)) { + unsigned long nestflag; + struct rcu_node *rnp_root = rcu_get_root(rsp); + + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); + rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock */ + } else { + /* Give the grace period a kick. */ + rdp->blimit = LONG_MAX; + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; + } + } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) + force_quiescent_state(rsp, 1); + local_irq_restore(flags); +} + +/* + * Queue an RCU-sched callback for invocation after a grace period.
+ */ +void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_sched_state); +} +EXPORT_SYMBOL_GPL(call_rcu_sched); + +/* + * Queue an RCU-bh callback for invocation after a quicker grace period. + */ +void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_bh_state); +} +EXPORT_SYMBOL_GPL(call_rcu_bh); + +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed. These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns. However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API. In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_sched(&rcu.head, wakeme_after_rcu); + /* Wait for it.
*/ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed. RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ + struct rcu_synchronize rcu; + + if (rcu_blocking_is_gp()) + return; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Will wake me after RCU finished. */ + call_rcu_bh(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, for the specified type of RCU, returning 1 if so. + * The checks are in order of increasing expense: checks that can be + * carried out against CPU-local state are performed first. However, + * we must check for CPU stalls first, else we might not get a chance. + */ +static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) +{ + struct rcu_node *rnp = rdp->mynode; + + rdp->n_rcu_pending++; + + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rsp, rdp); + + /* Is the RCU core waiting for a quiescent state from this CPU? */ + if (rdp->qs_pending && !rdp->passed_quiesc) { + + /* + * If force_quiescent_state() coming soon and this CPU + * needs a quiescent state, and this is either RCU-sched + * or RCU-bh, force a local reschedule.
+ */ + rdp->n_rp_qs_pending++; + if (!rdp->preemptible && + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, + jiffies)) + set_need_resched(); + } else if (rdp->qs_pending && rdp->passed_quiesc) { + rdp->n_rp_report_qs++; + return 1; + } + + /* Does this CPU have callbacks ready to invoke? */ + if (cpu_has_callbacks_ready_to_invoke(rdp)) { + rdp->n_rp_cb_ready++; + return 1; + } + + /* Has RCU gone idle with this CPU needing another grace period? */ + if (cpu_needs_another_gp(rsp, rdp)) { + rdp->n_rp_cpu_needs_gp++; + return 1; + } + + /* Has another RCU grace period completed? */ + if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ + rdp->n_rp_gp_completed++; + return 1; + } + + /* Has a new RCU grace period started? */ + if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ + rdp->n_rp_gp_started++; + return 1; + } + + /* Has an RCU GP gone long enough to send resched IPIs &c? */ + if (rcu_gp_in_progress(rsp) && + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { + rdp->n_rp_need_fqs++; + return 1; + } + + /* nothing to do */ + rdp->n_rp_need_nothing++; + return 0; +} + +/* + * Check to see if there is any immediate RCU-related work to be done + * by the current CPU, returning 1 if so. This function is part of the + * RCU implementation; it is -not- an exported member of the RCU API. + */ +static int rcu_pending(int cpu) +{ + return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) || + __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) || + rcu_preempt_pending(cpu); +} + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. + */ +static int rcu_needs_cpu_quick_check(int cpu) +{ + /* RCU callbacks either ready or pending?
*/ + return per_cpu(rcu_sched_data, cpu).nxtlist || + per_cpu(rcu_bh_data, cpu).nxtlist || + rcu_preempt_needs_cpu(cpu); +} + +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; +/* + * RCU callback queued by rcu_barrier_func(): decrement + * rcu_barrier_cpu_count and complete rcu_barrier_completion when the + * count reaches zero. + */ +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *type) +{ + int cpu = smp_processor_id(); + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + + atomic_inc(&rcu_barrier_cpu_count); + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); +} + +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. + */ +static void _rcu_barrier(struct rcu_state *rsp, + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) +{ + BUG_ON(in_interrupt()); + /* Take mutex to serialize concurrent rcu_barrier() requests. */ + mutex_lock(&rcu_barrier_mutex); + init_completion(&rcu_barrier_completion); + /* + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. Note that on_each_cpu() disables irqs, which prevents + * any CPUs from coming online or going offline until each online + * CPU has queued its RCU-barrier callback.
+ */ + atomic_set(&rcu_barrier_cpu_count, 1); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); + wait_for_completion(&rcu_barrier_completion); + mutex_unlock(&rcu_barrier_mutex); +} + +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(&rcu_bh_state, call_rcu_bh); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(&rcu_sched_state, call_rcu_sched); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + +/* + * Do boot-time initialization of a CPU's per-CPU RCU data. + */ +static void __init +rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + int i; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Set up local state, ensuring consistent view of global state. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rdp->qlen = 0; +#ifdef CONFIG_NO_HZ + rdp->dynticks = &per_cpu(rcu_dynticks, cpu); +#endif /* #ifdef CONFIG_NO_HZ */ + rdp->cpu = cpu; + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Initialize a CPU's per-CPU RCU data. Note that only one online or + * offline event can be happening at a given time. Note also that we + * can accept some slop in the rsp->completed access due to the fact + * that this CPU cannot possibly have any RCU callbacks in flight yet.
+ */ +static void __cpuinit +rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rcu_get_root(rsp); + + /* Set up local state, ensuring consistent view of global state. */ + raw_spin_lock_irqsave(&rnp->lock, flags); + rdp->passed_quiesc = 0; /* We could be racing with new GP, */ + rdp->qs_pending = 1; /* so set up to respond to current GP. */ + rdp->beenonline = 1; /* We have now been online. */ + rdp->preemptible = preemptible; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->blimit = blimit; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + + /* + * A new grace period might start here. If so, we won't be part + * of it, but that is OK, as we are currently in a quiescent state. + */ + + /* Exclude any attempts to start a new GP on large systems. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ + + /* Add CPU to rcu_node bitmasks. */ + rnp = rdp->mynode; + mask = rdp->grpmask; + do { + /* Exclude any attempts to start a new GP on small systems. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->qsmaskinit |= mask; + mask = rnp->grpmask; + if (rnp == rdp->mynode) { + rdp->gpnum = rnp->completed; /* if GP in progress... */ + rdp->completed = rnp->completed; + rdp->passed_quiesc_completed = rnp->completed - 1; + } + raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ + rnp = rnp->parent; + } while (rnp != NULL && !(rnp->qsmaskinit & mask)); + + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +static void __cpuinit rcu_prepare_cpu(int cpu) +{ + rcu_init_percpu_data(cpu, &rcu_sched_state, 0); + rcu_init_percpu_data(cpu, &rcu_bh_state, 0); + rcu_preempt_init_percpu_data(cpu); +} + +/* + * Handle CPU online/offline notification events. 
+ */ +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + rcu_prepare_cpu(cpu); + rcu_prepare_kthreads(cpu); + break; + case CPU_ONLINE: + case CPU_DOWN_FAILED: + rcu_node_kthread_setaffinity(rnp, -1); + rcu_cpu_kthread_setrt(cpu, 1); + break; + case CPU_DOWN_PREPARE: + rcu_node_kthread_setaffinity(rnp, cpu); + rcu_cpu_kthread_setrt(cpu, 0); + break; + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * The whole machine is "stopped" except this CPU, so we can + * touch any data without introducing corruption. We send the + * dying CPU's callbacks to an arbitrarily chosen online CPU. + */ + rcu_send_cbs_to_online(&rcu_bh_state); + rcu_send_cbs_to_online(&rcu_sched_state); + rcu_preempt_send_cbs_to_online(); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + rcu_offline_cpu(cpu); + break; + default: + break; + } + return NOTIFY_OK; +} + +/* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. This function also enables RCU lockdep checking. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 
+ */ +#ifdef CONFIG_RCU_FANOUT_EXACT +static void __init rcu_init_levelspread(struct rcu_state *rsp) +{ + int i; + + rsp->levelspread[NUM_RCU_LVLS - 1] = RCU_FANOUT_LEAF; /* leaves are the last level, not level 0 */ + for (i = NUM_RCU_LVLS - 2; i >= 0; i--) + rsp->levelspread[i] = CONFIG_RCU_FANOUT; /* root and interior levels */ +} +#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ +static void __init rcu_init_levelspread(struct rcu_state *rsp) +{ + int ccur; + int cprv; + int i; + + cprv = NR_CPUS; + for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + ccur = rsp->levelcnt[i]; + rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } +} +#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ + +/* + * Helper function for rcu_init() that initializes one rcu_state structure. + */ +static void __init rcu_init_one(struct rcu_state *rsp, + struct rcu_data __percpu *rda) +{ + static char *buf[] = { "rcu_node_level_0", + "rcu_node_level_1", + "rcu_node_level_2", + "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ + int cpustride = 1; + int i; + int j; + struct rcu_node *rnp; + + BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + + /* Initialize the level-tracking arrays. */ + + for (i = 1; i < NUM_RCU_LVLS; i++) + rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; + rcu_init_levelspread(rsp); + + /* Initialize the elements themselves, starting from the leaves.
*/ + + for (i = NUM_RCU_LVLS - 1; i >= 0; i--) { + cpustride *= rsp->levelspread[i]; + rnp = rsp->level[i]; + for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { + raw_spin_lock_init(&rnp->lock); + lockdep_set_class_and_name(&rnp->lock, + &rcu_node_class[i], buf[i]); + rnp->gpnum = 0; + rnp->qsmask = 0; + rnp->qsmaskinit = 0; + rnp->grplo = j * cpustride; + rnp->grphi = (j + 1) * cpustride - 1; + if (rnp->grphi >= NR_CPUS) + rnp->grphi = NR_CPUS - 1; + if (i == 0) { + rnp->grpnum = 0; + rnp->grpmask = 0; + rnp->parent = NULL; + } else { + rnp->grpnum = j % rsp->levelspread[i - 1]; + rnp->grpmask = 1UL << rnp->grpnum; + rnp->parent = rsp->level[i - 1] + + j / rsp->levelspread[i - 1]; + } + rnp->level = i; + INIT_LIST_HEAD(&rnp->blkd_tasks); + } + } + + rsp->rda = rda; + rnp = rsp->level[NUM_RCU_LVLS - 1]; + for_each_possible_cpu(i) { + while (i > rnp->grphi) + rnp++; + per_cpu_ptr(rsp->rda, i)->mynode = rnp; + rcu_boot_init_percpu_data(i, rsp); + } +} + +void __init rcu_init(void) +{ + int cpu; + + rcu_bootup_announce(); + rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state, &rcu_bh_data); + __rcu_init_preempt(); + open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + + /* + * We don't need protection against CPU-hotplug here because + * this is called early in boot, before either interrupts + * or the scheduler are operational. + */ + cpu_notifier(rcu_cpu_notify, 0); + for_each_online_cpu(cpu) + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); + check_cpu_stall_init(); +} + +#include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h new file mode 100644 index 00000000..01b2ccda --- /dev/null +++ b/kernel/rcutree.h @@ -0,0 +1,470 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Ingo Molnar + * Paul E. McKenney + */ + +#include +#include +#include +#include +#include + +/* + * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. 
+ */ +#define MAX_RCU_LVLS 4 +#if CONFIG_RCU_FANOUT > 16 +#define RCU_FANOUT_LEAF 16 +#else /* #if CONFIG_RCU_FANOUT > 16 */ +#define RCU_FANOUT_LEAF (CONFIG_RCU_FANOUT) +#endif /* #else #if CONFIG_RCU_FANOUT > 16 */ +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 +# define NUM_RCU_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (NR_CPUS) +# define NUM_RCU_LVL_2 0 +# define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_2 +# define NUM_RCU_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_2 (NR_CPUS) +# define NUM_RCU_LVL_3 0 +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_3 +# define NUM_RCU_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_3 (NR_CPUS) +# define NUM_RCU_LVL_4 0 +#elif NR_CPUS <= RCU_FANOUT_4 +# define NUM_RCU_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_LVL_4 (NR_CPUS) +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ + +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) + +/* + * Dynticks per-CPU state. + */ +struct rcu_dynticks { + int dynticks_nesting; /* Track irq/process nesting level. */ + int dynticks_nmi_nesting; /* Track NMI nesting level. */ + atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ +}; + +/* RCU's kthread states for tracing. 
*/ +#define RCU_KTHREAD_STOPPED 0 +#define RCU_KTHREAD_RUNNING 1 +#define RCU_KTHREAD_WAITING 2 +#define RCU_KTHREAD_OFFCPU 3 +#define RCU_KTHREAD_YIELDING 4 +#define RCU_KTHREAD_MAX 4 + +/* + * Definition for node within the RCU grace-period-detection hierarchy. + */ +struct rcu_node { + raw_spinlock_t lock; /* Root rcu_node's lock protects some */ + /* rcu_state fields as well as following. */ + unsigned long gpnum; /* Current grace period for this node. */ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ + unsigned long completed; /* Last GP completed for this node. */ + /* This will either be equal to or one */ + /* behind the root rcu_node's gpnum. */ + unsigned long qsmask; /* CPUs or groups that need to switch in */ + /* order for current grace period to proceed.*/ + /* In leaf rcu_node, each bit corresponds to */ + /* an rcu_data structure, otherwise, each */ + /* bit corresponds to a child rcu_node */ + /* structure. */ + unsigned long expmask; /* Groups that have ->blkd_tasks */ + /* elements that need to drain to allow the */ + /* current expedited grace period to */ + /* complete (only for TREE_PREEMPT_RCU). */ + atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */ + /* Since this has meaning only for leaf */ + /* rcu_node structures, 32 bits suffices. */ + unsigned long qsmaskinit; + /* Per-GP initial value for qsmask & expmask. */ + unsigned long grpmask; /* Mask to apply to parent qsmask. */ + /* Only one bit will be set in this mask. */ + int grplo; /* lowest-numbered CPU or group here. */ + int grphi; /* highest-numbered CPU or group here. */ + u8 grpnum; /* CPU/group number for next level up. */ + u8 level; /* root is at level 0. */ + struct rcu_node *parent; + struct list_head blkd_tasks; + /* Tasks blocked in RCU read-side critical */ + /* section. Tasks are placed at the head */ + /* of this list and age towards the tail. 
*/ + struct list_head *gp_tasks; + /* Pointer to the first task blocking the */ + /* current grace period, or NULL if there */ + /* is no such task. */ + struct list_head *exp_tasks; + /* Pointer to the first task blocking the */ + /* current expedited grace period, or NULL */ + /* if there is no such task. If there */ + /* is no current expedited grace period, */ + /* then there cannot be any such task. */ +#ifdef CONFIG_RCU_BOOST + struct list_head *boost_tasks; + /* Pointer to first task that needs to be */ + /* priority boosted, or NULL if no priority */ + /* boosting is needed for this rcu_node */ + /* structure. If there are no tasks */ + /* queued on this rcu_node structure that */ + /* are blocking the current grace period, */ + /* there can be no such task. */ + unsigned long boost_time; + /* When to start boosting (jiffies). */ + struct task_struct *boost_kthread_task; + /* kthread that takes care of priority */ + /* boosting for this rcu_node structure. */ + unsigned int boost_kthread_status; + /* State of boost_kthread_task for tracing. */ + unsigned long n_tasks_boosted; + /* Total number of tasks boosted. */ + unsigned long n_exp_boosts; + /* Number of tasks boosted for expedited GP. */ + unsigned long n_normal_boosts; + /* Number of tasks boosted for normal GP. */ + unsigned long n_balk_blkd_tasks; + /* Refused to boost: no blocked tasks. */ + unsigned long n_balk_exp_gp_tasks; + /* Refused to boost: nothing blocking GP. */ + unsigned long n_balk_boost_tasks; + /* Refused to boost: already boosting. */ + unsigned long n_balk_notblocked; + /* Refused to boost: RCU read-side critical section still running. */ + unsigned long n_balk_notyet; + /* Refused to boost: not yet time. */ + unsigned long n_balk_nos; + /* Refused to boost: not sure why, though. */ + /* This can happen due to race conditions.
*/ +#endif /* #ifdef CONFIG_RCU_BOOST */ + struct task_struct *node_kthread_task; + /* kthread that takes care of this rcu_node */ + /* structure, for example, awakening the */ + /* per-CPU kthreads as needed. */ + unsigned int node_kthread_status; + /* State of node_kthread_task for tracing. */ +} ____cacheline_internodealigned_in_smp; + +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[NUM_RCU_LVLS - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \ + (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++) + +/* Index values for nxttail array in struct rcu_data. */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_NEXT_SIZE 4 + +/* Per-CPU data for read-copy update. */ +struct rcu_data { + /* 1) quiescent-state and grace-period handling : */ + unsigned long completed; /* Track rsp->completed gp number */ + /* in order to detect GP end. */ + unsigned long gpnum; /* Highest gp number that this CPU */ + /* is aware of having started. 
*/ + unsigned long passed_quiesc_completed; + /* Value of completed at time of qs. */ + bool passed_quiesc; /* User-mode/idle loop etc. */ + bool qs_pending; /* Core waits for quiesc state. */ + bool beenonline; /* CPU online at least once. */ + bool preemptible; /* Preemptible RCU? */ + struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ + unsigned long grpmask; /* Mask to apply to leaf qsmask. */ + + /* 2) batch handling */ + /* + * If nxtlist is not NULL, it is partitioned as follows. + * Any of the partitions might be empty, in which case the + * pointer to that partition will be equal to the pointer for + * the following partition. When the list is empty, all of + * the nxttail elements point to the ->nxtlist pointer itself, + * which in that case is NULL. + * + * [nxtlist, *nxttail[RCU_DONE_TAIL]): + * Entries that batch # <= ->completed + * The grace period for these entries has completed, and + * the other grace-period-completed entries may be moved + * here temporarily in rcu_process_callbacks(). + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * Note that the value of *nxttail[RCU_NEXT_TAIL] will + * always be NULL, as this is the end of the list. + */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail[RCU_NEXT_SIZE]; + long qlen; /* # of queued callbacks */ + long qlen_last_fqs_check; + /* qlen at last check for QS forcing */ + unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ + unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ + unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ + unsigned long n_force_qs_snap; + /* did other CPU force QS recently? 
*/ + long blimit; /* Upper limit on a processed batch */ + +#ifdef CONFIG_NO_HZ + /* 3) dynticks interface. */ + struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ + int dynticks_snap; /* Per-GP tracking for dynticks. */ +#endif /* #ifdef CONFIG_NO_HZ */ + + /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ +#ifdef CONFIG_NO_HZ + unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ +#endif /* #ifdef CONFIG_NO_HZ */ + unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long resched_ipi; /* Sent a resched IPI. */ + + /* 5) __rcu_pending() statistics. */ + unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ + unsigned long n_rp_qs_pending; + unsigned long n_rp_report_qs; + unsigned long n_rp_cb_ready; + unsigned long n_rp_cpu_needs_gp; + unsigned long n_rp_gp_completed; + unsigned long n_rp_gp_started; + unsigned long n_rp_need_fqs; + unsigned long n_rp_need_nothing; + + int cpu; +}; + +/* Values for signaled field in struct rcu_state. */ +#define RCU_GP_IDLE 0 /* No grace period in progress. */ +#define RCU_GP_INIT 1 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 3 /* Need to force quiescent state. 
*/ +#ifdef CONFIG_NO_HZ +#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +#else /* #ifdef CONFIG_NO_HZ */ +#define RCU_SIGNAL_INIT RCU_FORCE_QS +#endif /* #else #ifdef CONFIG_NO_HZ */ + +#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +#define RCU_SECONDS_TILL_STALL_CHECK (CONFIG_RCU_CPU_STALL_TIMEOUT * HZ + \ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (3 * RCU_SECONDS_TILL_STALL_CHECK + 30) + /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ + +#define rcu_wait(cond) \ +do { \ + for (;;) { \ + set_current_state(TASK_INTERRUPTIBLE); \ + if (cond) \ + break; \ + schedule(); \ + } \ + __set_current_state(TASK_RUNNING); \ +} while (0) + +/* + * RCU global state, including node hierarchy. This hierarchy is + * represented in "heap" form in a dense array. The root (first level) + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), + * and the third level in ->node[m+1] and following (->node[m+1] referenced + * by ->level[2]). The number of levels is determined by the number of + * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" + * consisting of a single rcu_node. + */ +struct rcu_state { + struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ + struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + struct rcu_data __percpu *rda; /* pointer to percpu rcu_data. */ + + /* The following fields are guarded by the root rcu_node's lock. */ + + u8 signaled ____cacheline_internodealigned_in_smp; + /* Force QS state.
*/ + u8 fqs_active; /* force_quiescent_state() */ + /* is running. */ + u8 fqs_need_gp; /* A CPU was prevented from */ + /* starting a new grace */ + /* period because */ + /* force_quiescent_state() */ + /* was running. */ + u8 boost; /* Subject to priority boost. */ + unsigned long gpnum; /* Current gp number. */ + unsigned long completed; /* # of last completed gp. */ + + /* End of fields guarded by root rcu_node's lock. */ + + raw_spinlock_t onofflock; /* exclude on/offline and */ + /* starting new GP. */ + raw_spinlock_t fqslock; /* Only one task forcing */ + /* quiescent states. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ + /* force_quiescent_state(). */ + unsigned long n_force_qs; /* Number of calls to */ + /* force_quiescent_state(). */ + unsigned long n_force_qs_lh; /* ~Number of calls leaving */ + /* due to lock unavailable. */ + unsigned long n_force_qs_ngp; /* Number of calls leaving */ + /* due to no GP active. */ + unsigned long gp_start; /* Time at which GP started, */ + /* but in jiffies. */ + unsigned long jiffies_stall; /* Time at which to check */ + /* for CPU stalls. */ + unsigned long gp_max; /* Maximum GP duration in */ + /* jiffies. */ + char *name; /* Name of structure. */ +}; + +/* Return values for rcu_preempt_offline_tasks(). */ + +#define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ + /* GP were moved to root. */ +#define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ + /* GP were moved to root. 
*/ + +/* + * RCU implementation internal declarations: + */ +extern struct rcu_state rcu_sched_state; +DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); + +extern struct rcu_state rcu_bh_state; +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + +#ifdef CONFIG_TREE_PREEMPT_RCU +extern struct rcu_state rcu_preempt_state; +DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +#ifndef RCU_TREE_NONCORE + +/* Forward declarations for rcutree_plugin.h */ +static void rcu_bootup_announce(void); +long rcu_batches_completed(void); +static void rcu_preempt_note_context_switch(int cpu); +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, + unsigned long flags); +static void rcu_stop_cpu_kthread(int cpu); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp); +static void rcu_print_task_stall(struct rcu_node *rnp); +static void rcu_preempt_stall_reset(void); +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); +#ifdef CONFIG_HOTPLUG_CPU +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp); +static void rcu_preempt_offline_cpu(int cpu); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_preempt_check_callbacks(int cpu); +static void rcu_preempt_process_callbacks(void); +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); +#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); +#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ +static int rcu_preempt_pending(int cpu); +static int rcu_preempt_needs_cpu(int cpu); +static void __cpuinit rcu_preempt_init_percpu_data(int cpu); +static void rcu_preempt_send_cbs_to_online(void); +static void __init __rcu_init_preempt(void); 
+static void rcu_needs_cpu_flush(void); +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); +static void invoke_rcu_callbacks_kthread(void); +#ifdef CONFIG_RCU_BOOST +static void rcu_preempt_do_callbacks(void); +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm); +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index); +static void invoke_rcu_node_kthread(struct rcu_node *rnp); +static void rcu_yield(void (*f)(unsigned long), unsigned long arg); +#endif /* #ifdef CONFIG_RCU_BOOST */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt); +static void __cpuinit rcu_prepare_kthreads(int cpu); + +#endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h new file mode 100644 index 00000000..8aafbb80 --- /dev/null +++ b/kernel/rcutree_plugin.h @@ -0,0 +1,2010 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * Internal non-public definitions that provide either classic + * or preemptible semantics. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright Red Hat, 2009 + * Copyright IBM Corporation, 2009 + * + * Author: Ingo Molnar + * Paul E. 
McKenney + */ + +#include +#include + +/* + * Check the RCU kernel configuration parameters and print informative + * messages about anything out of the ordinary. If you like #ifdef, you + * will love this function. + */ +static void __init rcu_bootup_announce_oddness(void) +{ +#ifdef CONFIG_RCU_TRACE + printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n"); +#endif +#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) + printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + CONFIG_RCU_FANOUT); +#endif +#ifdef CONFIG_RCU_FANOUT_EXACT + printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n"); +#endif +#ifdef CONFIG_RCU_FAST_NO_HZ + printk(KERN_INFO + "\tRCU dyntick-idle grace-period acceleration is enabled.\n"); +#endif +#ifdef CONFIG_PROVE_RCU + printk(KERN_INFO "\tRCU lockdep checking is enabled.\n"); +#endif +#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE + printk(KERN_INFO "\tRCU torture testing starts during boot.\n"); +#endif +#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) + printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n"); +#endif +#if NUM_RCU_LVL_4 != 0 + printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n"); +#endif +} + +#ifdef CONFIG_TREE_PREEMPT_RCU + +struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); +DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +static struct rcu_state *rcu_state = &rcu_preempt_state; + +static void rcu_read_unlock_special(struct task_struct *t); +static int rcu_preempted_readers_exp(struct rcu_node *rnp); + +/* + * Tell them what RCU they are running. + */ +static void __init rcu_bootup_announce(void) +{ + printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); +} + +/* + * Return the number of RCU-preempt batches processed thus far + * for debug and statistics. 
+ */ +long rcu_batches_completed_preempt(void) +{ + return rcu_preempt_state.completed; +} +EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); + +/* + * Return the number of RCU batches processed thus far for debug & stats. + */ +long rcu_batches_completed(void) +{ + return rcu_batches_completed_preempt(); +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Force a quiescent state for preemptible RCU. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_preempt_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* + * Record a preemptible-RCU quiescent state for the specified CPU. Note + * that this just means that the task currently running on the CPU is + * not in a quiescent state. There might be any number of tasks blocked + * while in an RCU read-side critical section. + * + * Unlike the other rcu_*_qs() functions, callers to this function + * must disable irqs in order to protect the assignment to + * ->rcu_read_unlock_special. + */ +static void rcu_preempt_qs(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); + + rdp->passed_quiesc_completed = rdp->gpnum - 1; + barrier(); + rdp->passed_quiesc = 1; + current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; +} + +/* + * We have entered the scheduler, and the current task might soon be + * context-switched away from. If this task is in an RCU read-side + * critical section, we will no longer be able to rely on the CPU to + * record that fact, so we enqueue the task on the blkd_tasks list. + * The task will dequeue itself when it exits the outermost enclosing + * RCU read-side critical section. Therefore, the current grace period + * cannot be permitted to complete until the blkd_tasks list entries + * predating the current grace period drain, in other words, until + * rnp->gp_tasks becomes NULL. + * + * Caller must disable preemption. 
+ */ +static void rcu_preempt_note_context_switch(int cpu) +{ + struct task_struct *t = current; + unsigned long flags; + struct rcu_data *rdp; + struct rcu_node *rnp; + + if (t->rcu_read_lock_nesting > 0 && + (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { + + /* Possibly blocking in an RCU read-side critical section. */ + rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); + rnp = rdp->mynode; + raw_spin_lock_irqsave(&rnp->lock, flags); + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; + t->rcu_blocked_node = rnp; + + /* + * If this CPU has already checked in, then this task + * will hold up the next grace period rather than the + * current grace period. Queue the task accordingly. + * If the task is queued for the current grace period + * (i.e., this CPU has not yet passed through a quiescent + * state for the current grace period), then as long + * as that task remains queued, the current grace period + * cannot end. Note that there is some uncertainty as + * to exactly when the current grace period started. + * We take a conservative approach, which can result + * in unnecessarily waiting on tasks that started very + * slightly after the current grace period began. C'est + * la vie!!! + * + * But first, note that the current CPU must still be + * on line! 
+ */ + WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); + if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { + list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); + rnp->gp_tasks = &t->rcu_node_entry; +#ifdef CONFIG_RCU_BOOST + if (rnp->boost_tasks != NULL) + rnp->boost_tasks = rnp->gp_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + } else { + list_add(&t->rcu_node_entry, &rnp->blkd_tasks); + if (rnp->qsmask & rdp->grpmask) + rnp->gp_tasks = &t->rcu_node_entry; + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } else if (t->rcu_read_lock_nesting < 0 && + t->rcu_read_unlock_special) { + + /* + * Complete exit from RCU read-side critical section on + * behalf of preempted instance of __rcu_read_unlock(). + */ + rcu_read_unlock_special(t); + } + + /* + * Either we were not in an RCU read-side critical section to + * begin with, or we have now recorded that critical section + * globally. Either way, we can now note a quiescent state + * for this CPU. Again, if we were in an RCU read-side critical + * section, and if that critical section was blocking the current + * grace period, then the fact that the task has been enqueued + * means that we continue to block the current grace period. + */ + local_irq_save(flags); + rcu_preempt_qs(cpu); + local_irq_restore(flags); +} + +/* + * Tree-preemptible RCU implementation for rcu_read_lock(). + * Just increment ->rcu_read_lock_nesting, shared state will be updated + * if we block. + */ +void __rcu_read_lock(void) +{ + current->rcu_read_lock_nesting++; + barrier(); /* needed if we ever invoke rcu_read_lock in rcutree.c */ +} +EXPORT_SYMBOL_GPL(__rcu_read_lock); + +/* + * Check for preempted RCU readers blocking the current grace period + * for the specified rcu_node structure. If the caller needs a reliable + * answer, it must hold the rcu_node's ->lock. 
+ */ +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) +{ + return rnp->gp_tasks != NULL; +} + +/* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. The caller must hold the specified rnp->lock with + * irqs disabled. The early-exit path releases the lock and restores the + * irq state saved in flags. The parent path releases rnp->lock, takes + * the parent lock and hands it, with flags, to rcu_report_qs_rnp(). + * The root path hands flags to rcu_report_qs_rsp() with rnp->lock still + * held. Those two helpers are not shown here; presumably they release + * the lock they are given and restore irqs from flags -- confirm. + */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + struct rcu_node *rnp_p; + + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! */ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Either there is only one rcu_node in the tree, + * or tasks were kicked up to root rcu_node due to + * CPUs going offline. + */ + rcu_report_qs_rsp(&rcu_preempt_state, flags); + return; + } + + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); +} + +/* + * Return the ->blkd_tasks list entry that follows the ->rcu_node_entry + * of task t, or NULL if t is the last entry on the ->blkd_tasks list of + * the specified rcu_node structure. + */ +static struct list_head *rcu_next_node_entry(struct task_struct *t, + struct rcu_node *rnp) +{ + struct list_head *np; + + np = t->rcu_node_entry.next; + if (np == &rnp->blkd_tasks) + np = NULL; + return np; +} + +/* + * Handle special cases during rcu_read_unlock(), such as needing to + * notify RCU core processing or task having blocked during the RCU + * read-side critical section. 
+ */ +static noinline void rcu_read_unlock_special(struct task_struct *t) +{ + int empty; + int empty_exp; + unsigned long flags; + struct list_head *np; + struct rcu_node *rnp; + int special; + + /* NMI handlers cannot block and cannot safely manipulate state. */ + if (in_nmi()) + return; + + local_irq_save(flags); + + /* + * If RCU core is waiting for this CPU to exit critical section, + * let it know that we have done so. + */ + special = t->rcu_read_unlock_special; + if (special & RCU_READ_UNLOCK_NEED_QS) { + rcu_preempt_qs(smp_processor_id()); + } + + /* Hardware IRQ and softirq handlers cannot block. */ + if (in_irq() || in_serving_softirq()) { + local_irq_restore(flags); + return; + } + + /* Clean up if blocked during RCU read-side critical section. */ + if (special & RCU_READ_UNLOCK_BLOCKED) { + t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; + + /* + * Remove this task from the list it blocked on. The + * task can be moved to another rcu_node (see + * rcu_preempt_offline_tasks()) while we acquire the lock, + * but at most one time. So at most two passes through loop. + * Rechecking ->rcu_blocked_node under the lock detects + * such a move. + */ + for (;;) { + rnp = t->rcu_blocked_node; + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + if (rnp == t->rcu_blocked_node) + break; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + empty = !rcu_preempt_blocked_readers_cgp(rnp); + empty_exp = !rcu_preempted_readers_exp(rnp); + smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ + np = rcu_next_node_entry(t, rnp); + list_del_init(&t->rcu_node_entry); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp->gp_tasks = np; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp->exp_tasks = np; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp->boost_tasks = np; + /* Snapshot and clear ->rcu_boosted with rcu_node lock held. 
*/ + if (t->rcu_boosted) { + special |= RCU_READ_UNLOCK_BOOSTED; + t->rcu_boosted = 0; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ + t->rcu_blocked_node = NULL; + + /* + * If no task was blocking the current grace period even before + * we removed ourselves (empty), there is nothing to report, so + * just drop the lock. Otherwise, if this was the last task on + * the current list, and if we aren't waiting on any CPUs, report + * the quiescent state. + * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. + */ + if (empty) + raw_spin_unlock_irqrestore(&rnp->lock, flags); + else + rcu_report_unblock_qs_rnp(rnp, flags); + +#ifdef CONFIG_RCU_BOOST + /* Unboost if we were boosted: release the rt_mutex set up by rcu_boost(). */ + if (special & RCU_READ_UNLOCK_BOOSTED) { + rt_mutex_unlock(t->rcu_boost_mutex); + t->rcu_boost_mutex = NULL; + } +#endif /* #ifdef CONFIG_RCU_BOOST */ + + /* + * If this was the last task on the expedited lists, + * then we need to report up the rcu_node hierarchy. + * empty_exp is the snapshot taken before this task was removed. + */ + if (!empty_exp && !rcu_preempted_readers_exp(rnp)) + rcu_report_exp_rnp(&rcu_preempt_state, rnp); + } else { + local_irq_restore(flags); + } +} + +/* + * Tree-preemptible RCU implementation for rcu_read_unlock(). + * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost + * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then + * invoke rcu_read_unlock_special() to clean up after a context switch + * in an RCU read-side critical section and other special cases. 
+ */ +void __rcu_read_unlock(void) +{ + struct task_struct *t = current; + + barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */ + if (t->rcu_read_lock_nesting != 1) + --t->rcu_read_lock_nesting; + else { + /* + * Outermost unlock: park the count at INT_MIN so that + * rcu_preempt_note_context_switch() sees a negative count + * and can tell that an outermost unlock is in progress. + */ + t->rcu_read_lock_nesting = INT_MIN; + barrier(); /* assign before ->rcu_read_unlock_special load */ + if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) + rcu_read_unlock_special(t); + barrier(); /* ->rcu_read_unlock_special load before assign */ + t->rcu_read_lock_nesting = 0; + } +#ifdef CONFIG_PROVE_LOCKING + { + int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); + + /* The count must be non-negative or else down near INT_MIN. */ + WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); + } +#endif /* #ifdef CONFIG_PROVE_LOCKING */ +} +EXPORT_SYMBOL_GPL(__rcu_read_unlock); + +#ifdef CONFIG_RCU_CPU_STALL_VERBOSE + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. Acquires and + * releases rnp->lock itself. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct task_struct *t; + + /* + * ->gp_tasks is reliable only under rnp->lock, so test it after + * acquiring the lock. Testing it beforehand would allow the last + * blocked reader to leave in the meantime, and list_entry() would + * then be applied to a NULL ->gp_tasks. + */ + raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + /* + * list_for_each_entry_continue() begins with the entry after the + * cursor, so start one entry before ->gp_tasks in order to include + * the first blocking task in the dump. + */ + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + sched_show_task(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period: first those queued on the root rcu_node, then those + * queued on each leaf rcu_node. + */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + rcu_print_detail_task_stall_rnp(rnp); + rcu_for_each_leaf_node(rsp, rnp) + rcu_print_detail_task_stall_rnp(rnp); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ + +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ + +/* + * Scan the current list of tasks blocked within RCU read-side critical + * sections, printing out the tid of each. 
+ */ +static void rcu_print_task_stall(struct rcu_node *rnp) +{ + struct task_struct *t; + + if (!rcu_preempt_blocked_readers_cgp(rnp)) + return; + + /* + * list_for_each_entry_continue() begins with the entry after the + * cursor, so start one entry before ->gp_tasks to make sure the + * first blocking task is printed too. + * NOTE(review): no lock is taken here; presumably the caller holds + * rnp->lock -- confirm. + */ + t = list_entry(rnp->gp_tasks->prev, + struct task_struct, rcu_node_entry); + list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) + printk(" P%d", t->pid); +} + +/* + * Suppress preemptible RCU's CPU stall warnings by pushing the + * time of the next stall-warning message comfortably far into the + * future. + */ +static void rcu_preempt_stall_reset(void) +{ + rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2; +} + +/* + * Check that the list of blocked tasks for the newly completed grace + * period is in fact empty. It is a serious bug to complete a grace + * period that still has RCU readers blocked! This function must be + * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock + * must be held by the caller. + * + * Also, if there are blocked tasks on the list, they automatically + * block the newly created grace period, so set up ->gp_tasks accordingly. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + if (!list_empty(&rnp->blkd_tasks)) + rnp->gp_tasks = rnp->blkd_tasks.next; + WARN_ON_ONCE(rnp->qsmask); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Handle tasklist migration for case in which all CPUs covered by the + * specified rcu_node have gone offline. Move them up to the root + * rcu_node. The reason for not just moving them to the immediate + * parent is to remove the need for rcu_read_unlock_special() to + * make more than two attempts to acquire the target rcu_node's lock. + * + * Returns a bitmask describing what the tasks queued on the specified + * rcu_node structure were blocking before the move: + * RCU_OFL_TASKS_NORM_GP is set if any task was blocking the current + * normal grace period, and RCU_OFL_TASKS_EXP_GP is set if any task was + * blocking the current expedited grace period. Returns zero if neither. + * + * The caller must hold rnp->lock with irqs disabled. 
+ */ +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + struct list_head *lp; + struct list_head *lp_root; + int retval = 0; + struct rcu_node *rnp_root = rcu_get_root(rsp); + struct task_struct *t; + + if (rnp == rnp_root) { + WARN_ONCE(1, "Last CPU thought to be offlined?"); + return 0; /* Shouldn't happen: at least one CPU online. */ + } + + /* If we are on an internal node (rnp is not rdp->mynode), complain bitterly. */ + WARN_ON_ONCE(rnp != rdp->mynode); + + /* + * Move tasks up to root rcu_node. Don't try to get fancy for + * this corner-case operation -- just put this node's tasks + * at the head of the root node's list, and update the root node's + * ->gp_tasks and ->exp_tasks pointers (and ->boost_tasks if + * CONFIG_RCU_BOOST) to those of this node's, if non-NULL. + * This might result in waiting for more tasks than + * absolutely necessary, but this is a good performance/complexity + * tradeoff. + * + * The root lock is taken and dropped once per task moved. + * retval records, before anything is moved, whether this node + * had tasks blocking the normal and/or expedited grace period. + */ + if (rcu_preempt_blocked_readers_cgp(rnp)) + retval |= RCU_OFL_TASKS_NORM_GP; + if (rcu_preempted_readers_exp(rnp)) + retval |= RCU_OFL_TASKS_EXP_GP; + lp = &rnp->blkd_tasks; + lp_root = &rnp_root->blkd_tasks; + while (!list_empty(lp)) { + t = list_entry(lp->next, typeof(*t), rcu_node_entry); + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + list_del(&t->rcu_node_entry); + t->rcu_blocked_node = rnp_root; + list_add(&t->rcu_node_entry, lp_root); + if (&t->rcu_node_entry == rnp->gp_tasks) + rnp_root->gp_tasks = rnp->gp_tasks; + if (&t->rcu_node_entry == rnp->exp_tasks) + rnp_root->exp_tasks = rnp->exp_tasks; +#ifdef CONFIG_RCU_BOOST + if (&t->rcu_node_entry == rnp->boost_tasks) + rnp_root->boost_tasks = rnp->boost_tasks; +#endif /* #ifdef CONFIG_RCU_BOOST */ + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ + } + +#ifdef CONFIG_RCU_BOOST + /* In case root is being boosted and leaf is not. 
*/ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ + if (rnp_root->boost_tasks != NULL && + rnp_root->boost_tasks != rnp_root->gp_tasks) + rnp_root->boost_tasks = rnp_root->gp_tasks; + raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ +#endif /* #ifdef CONFIG_RCU_BOOST */ + + rnp->gp_tasks = NULL; + rnp->exp_tasks = NULL; + return retval; +} + +/* + * Do CPU-offline processing for preemptible RCU by handing the CPU + * and rcu_preempt_state to __rcu_offline_cpu(). + */ +static void rcu_preempt_offline_cpu(int cpu) +{ + __rcu_offline_cpu(cpu, &rcu_preempt_state); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Check for a quiescent state from the current CPU. When a task blocks, + * the task is recorded in the corresponding CPU's rcu_node structure, + * which is checked elsewhere. + * + * If the current task is not in an RCU read-side critical section + * (nesting count of zero), report a quiescent state right away. + * Otherwise, if the nesting count is positive and ->qs_pending is set + * for this CPU, set RCU_READ_UNLOCK_NEED_QS in the task so that + * rcu_read_unlock_special() reports the quiescent state later. + * + * Caller must disable hard irqs. + */ +static void rcu_preempt_check_callbacks(int cpu) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) { + rcu_preempt_qs(cpu); + return; + } + if (t->rcu_read_lock_nesting > 0 && + per_cpu(rcu_preempt_data, cpu).qs_pending) + t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; +} + +/* + * Process callbacks for preemptible RCU on the current CPU. + */ +static void rcu_preempt_process_callbacks(void) +{ + __rcu_process_callbacks(&rcu_preempt_state, + &__get_cpu_var(rcu_preempt_data)); +} + +#ifdef CONFIG_RCU_BOOST + +static void rcu_preempt_do_callbacks(void) +{ + rcu_do_batch(&rcu_preempt_state, &__get_cpu_var(rcu_preempt_data)); +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +/* + * Queue a preemptible-RCU callback for invocation after a grace period. + * Thin wrapper around __call_rcu() using rcu_preempt_state. + */ +void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) +{ + __call_rcu(head, func, &rcu_preempt_state); +} +EXPORT_SYMBOL_GPL(call_rcu); + +/** + * synchronize_rcu - wait until a grace period has elapsed. + * + * Control will return to the caller some time after a full grace + * period has elapsed, in other words after all currently executing RCU + * read-side critical sections have completed. 
Note, however, that + * upon return from synchronize_rcu(), the caller might well be executing + * concurrently with new RCU read-side critical sections that began while + * synchronize_rcu() was waiting. RCU read-side critical sections are + * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. + */ +void synchronize_rcu(void) +{ + struct rcu_synchronize rcu; + + if (!rcu_scheduler_active) + return; + + init_rcu_head_on_stack(&rcu.head); + init_completion(&rcu.completion); + /* Queue wakeme_after_rcu(), which will wake me after RCU finished. */ + call_rcu(&rcu.head, wakeme_after_rcu); + /* Wait for it. */ + wait_for_completion(&rcu.completion); + destroy_rcu_head_on_stack(&rcu.head); +} +EXPORT_SYMBOL_GPL(synchronize_rcu); + +static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); +static long sync_rcu_preempt_exp_count; +static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); + +/* + * Return non-zero if there are any tasks in RCU read-side critical + * sections blocking the current preemptible-RCU expedited grace period. + * If there is no preemptible-RCU expedited grace period currently in + * progress, returns zero unconditionally. The answer is simply whether + * ->exp_tasks is non-NULL. + */ +static int rcu_preempted_readers_exp(struct rcu_node *rnp) +{ + return rnp->exp_tasks != NULL; +} + +/* + * Return non-zero if there is no RCU expedited grace period in progress + * for the specified rcu_node structure, in other words, if all CPUs and + * tasks covered by the specified rcu_node structure have done their bit + * for the current expedited grace period. Works only for preemptible + * RCU -- other RCU implementations use other means. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) +{ + return !rcu_preempted_readers_exp(rnp) && + ACCESS_ONCE(rnp->expmask) == 0; +} + +/* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU + * grace period. 
This event is reported either to the rcu_node structure on + * which the task was queued or to one of that rcu_node structure's ancestors, + * recursively up the tree. (Calm down, calm down, we do the recursion + * iteratively!) + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + unsigned long mask; + + raw_spin_lock_irqsave(&rnp->lock, flags); + for (;;) { + if (!sync_rcu_preempt_exp_done(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + break; + } + if (rnp->parent == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + wake_up(&sync_rcu_preempt_exp_wq); + break; + } + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + rnp = rnp->parent; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + rnp->expmask &= ~mask; + } +} + +/* + * Snapshot the tasks blocking the newly started preemptible-RCU expedited + * grace period for the specified rcu_node structure. If ->blkd_tasks is + * empty there are no such tasks, so report that up the rcu_node hierarchy + * via rcu_report_exp_rnp(). Otherwise point ->exp_tasks at the first + * entry of ->blkd_tasks and call rcu_initiate_boost(), which releases + * rnp->lock. + * + * Caller must hold sync_rcu_preempt_exp_mutex and rsp->onofflock. + */ +static void +sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + int must_wait = 0; + + raw_spin_lock_irqsave(&rnp->lock, flags); + if (list_empty(&rnp->blkd_tasks)) + raw_spin_unlock_irqrestore(&rnp->lock, flags); + else { + rnp->exp_tasks = rnp->blkd_tasks.next; + rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + must_wait = 1; + } + if (!must_wait) + rcu_report_exp_rnp(rsp, rnp); +} + +/* + * Wait for an rcu-preempt grace period, but expedite it. The basic idea + * is to invoke synchronize_sched_expedited() to push all the tasks to + * the ->blkd_tasks lists and wait for this list to drain. 
+ */ +void synchronize_rcu_expedited(void) +{ + unsigned long flags; + struct rcu_node *rnp; + struct rcu_state *rsp = &rcu_preempt_state; + long snap; + int trycount = 0; + + smp_mb(); /* Caller's modifications seen first by other CPUs. */ + snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; + smp_mb(); /* Above access cannot bleed into critical section. */ + + /* + * Acquire lock, falling back to synchronize_rcu() if too many + * lock-acquisition failures. Of course, if someone does the + * expedited grace period for us, just leave. + * Each of the first ten failures backs off via udelay() for a + * time that grows with trycount and the number of online CPUs. + */ + while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_rcu(); + return; + } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto mb_ret; /* Others did our work for us. */ + } + if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0) + goto unlock_mb_ret; /* Others did our work for us. */ + + /* Force all RCU readers onto ->blkd_tasks lists. */ + synchronize_sched_expedited(); + + raw_spin_lock_irqsave(&rsp->onofflock, flags); + + /* Initialize ->expmask for all non-leaf rcu_node structures. */ + rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + rnp->expmask = rnp->qsmaskinit; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + } + + /* + * Snapshot current state of ->blkd_tasks lists. When there is + * more than one rcu_node, also snapshot the root, presumably + * because rcu_for_each_leaf_node() does not visit it then and + * tasks can be moved there by rcu_preempt_offline_tasks(). + */ + rcu_for_each_leaf_node(rsp, rnp) + sync_rcu_preempt_exp_init(rsp, rnp); + if (NUM_RCU_NODES > 1) + sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); + + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); + + /* Wait for snapshotted ->blkd_tasks lists to drain. */ + rnp = rcu_get_root(rsp); + wait_event(sync_rcu_preempt_exp_wq, + sync_rcu_preempt_exp_done(rnp)); + + /* Clean up and exit. */ + smp_mb(); /* ensure expedited GP seen before counter increment. 
*/ + ACCESS_ONCE(sync_rcu_preempt_exp_count)++; +unlock_mb_ret: + mutex_unlock(&sync_rcu_preempt_exp_mutex); +mb_ret: + smp_mb(); /* ensure subsequent action seen after grace period. */ +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +/* + * Check to see if there is any immediate preemptible-RCU-related work + * to be done. Returns whatever __rcu_pending() reports for the + * rcu_preempt_data of the specified CPU. + */ +static int rcu_preempt_pending(int cpu) +{ + return __rcu_pending(&rcu_preempt_state, + &per_cpu(rcu_preempt_data, cpu)); +} + +/* + * Does preemptible RCU need the CPU to stay out of dynticks mode? + * Returns 1 if ->nxtlist in the rcu_preempt_data of the specified CPU + * is non-NULL, 0 otherwise. + */ +static int rcu_preempt_needs_cpu(int cpu) +{ + return !!per_cpu(rcu_preempt_data, cpu).nxtlist; +} + +/** + * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. + */ +void rcu_barrier(void) +{ + _rcu_barrier(&rcu_preempt_state, call_rcu); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * Initialize preemptible RCU's per-CPU data. + */ +static void __cpuinit rcu_preempt_init_percpu_data(int cpu) +{ + rcu_init_percpu_data(cpu, &rcu_preempt_state, 1); +} + +/* + * Move preemptible RCU's callbacks from dying CPU to other online CPU. + */ +static void rcu_preempt_send_cbs_to_online(void) +{ + rcu_send_cbs_to_online(&rcu_preempt_state); +} + +/* + * Initialize preemptible RCU's state structures. + */ +static void __init __rcu_init_preempt(void) +{ + rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); +} + +/* + * Check for a task exiting while in a preemptible-RCU read-side + * critical section, clean up if so. No need to issue warnings, + * as debug_check_no_locks_held() already does this if lockdep + * is enabled. The nesting count is forced to 1 so that the + * __rcu_read_unlock() call below is treated as the outermost one. + */ +void exit_rcu(void) +{ + struct task_struct *t = current; + + if (t->rcu_read_lock_nesting == 0) + return; + t->rcu_read_lock_nesting = 1; + __rcu_read_unlock(); +} + +#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + +static struct rcu_state *rcu_state = &rcu_sched_state; + +/* + * Tell them what RCU they are running. 
+ */ +static void __init rcu_bootup_announce(void) +{ + printk(KERN_INFO "Hierarchical RCU implementation.\n"); + rcu_bootup_announce_oddness(); +} + +/* + * Return the number of RCU batches processed thus far for debug & stats. + * Because there is no preemptible RCU, this is the rcu-sched count. + */ +long rcu_batches_completed(void) +{ + return rcu_batches_completed_sched(); +} +EXPORT_SYMBOL_GPL(rcu_batches_completed); + +/* + * Force a quiescent state for RCU, which, because there is no preemptible + * RCU, becomes the same as rcu-sched. + */ +void rcu_force_quiescent_state(void) +{ + rcu_sched_force_quiescent_state(); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* + * Because preemptible RCU does not exist, we never have to check for + * CPUs being in quiescent states. + */ +static void rcu_preempt_note_context_switch(int cpu) +{ +} + +/* + * Because preemptible RCU does not exist, there are never any preempted + * RCU readers. Always returns 0. + */ +static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) +{ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptible RCU does not exist, no quieting of tasks: + * just release rnp->lock, restoring the irq state saved in flags. + */ +static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ +} + +/* + * Because preemptible RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ +static void rcu_print_task_stall(struct rcu_node *rnp) +{ +} + +/* + * Because preemptible RCU does not exist, there is no need to suppress + * its CPU stall warnings. + */ +static void rcu_preempt_stall_reset(void) +{ +} + +/* + * Because there is no preemptible RCU, there can be no readers blocked, + * so there is no need to check for blocked tasks. 
So check only for + * bogus qsmask values. + */ +static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) +{ + WARN_ON_ONCE(rnp->qsmask); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptible RCU does not exist, it never needs to migrate + * tasks that were blocked within RCU read-side critical sections, and + * such non-existent tasks cannot possibly have been blocking the current + * grace period. Always returns 0. + */ +static int rcu_preempt_offline_tasks(struct rcu_state *rsp, + struct rcu_node *rnp, + struct rcu_data *rdp) +{ + return 0; +} + +/* + * Because preemptible RCU does not exist, it never needs CPU-offline + * processing. + */ +static void rcu_preempt_offline_cpu(int cpu) +{ +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to check. + */ +static void rcu_preempt_check_callbacks(int cpu) +{ +} + +/* + * Because preemptible RCU does not exist, it never has any callbacks + * to process. + */ +static void rcu_preempt_process_callbacks(void) +{ +} + +/* + * Wait for an rcu-preempt grace period, but make it happen quickly. + * Because preemptible RCU does not exist, this maps directly to + * synchronize_sched_expedited(). + */ +void synchronize_rcu_expedited(void) +{ + synchronize_sched_expedited(); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Because preemptible RCU does not exist, there is never any need to + * report on tasks preempted in RCU read-side critical sections during + * expedited RCU grace periods. + */ +static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) +{ + return; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +/* + * Because preemptible RCU does not exist, it never has any work to do. + * Always returns 0. + */ +static int rcu_preempt_pending(int cpu) +{ + return 0; +} + +/* + * Because preemptible RCU does not exist, it never needs any CPU. 
+ */ +static int rcu_preempt_needs_cpu(int cpu) +{ + return 0; +} + +/* + * Because preemptible RCU does not exist, rcu_barrier() is just + * another name for rcu_barrier_sched(). + */ +void rcu_barrier(void) +{ + rcu_barrier_sched(); +} +EXPORT_SYMBOL_GPL(rcu_barrier); + +/* + * Because preemptible RCU does not exist, there is no per-CPU + * data to initialize. + */ +static void __cpuinit rcu_preempt_init_percpu_data(int cpu) +{ +} + +/* + * Because there is no preemptible RCU, there are no callbacks to move. + */ +static void rcu_preempt_send_cbs_to_online(void) +{ +} + +/* + * Because preemptible RCU does not exist, it need not be initialized. + */ +static void __init __rcu_init_preempt(void) +{ +} + +#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ + +#ifdef CONFIG_RCU_BOOST + +#include "rtmutex_common.h" + +#ifdef CONFIG_RCU_TRACE + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) /* called from rcu_initiate_boost() when boosting is not started: increments exactly one ->n_balk_* counter, chosen by the first matching condition below */ +{ + if (list_empty(&rnp->blkd_tasks)) + rnp->n_balk_blkd_tasks++; + else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) + rnp->n_balk_exp_gp_tasks++; + else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) + rnp->n_balk_boost_tasks++; + else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) + rnp->n_balk_notblocked++; + else if (rnp->gp_tasks != NULL && + ULONG_CMP_LT(jiffies, rnp->boost_time)) + rnp->n_balk_notyet++; + else + rnp->n_balk_nos++; +} + +#else /* #ifdef CONFIG_RCU_TRACE */ + +static void rcu_initiate_boost_trace(struct rcu_node *rnp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_TRACE */ + +/* + * Carry out RCU priority boosting on the task indicated by ->exp_tasks + * or ->boost_tasks, advancing the pointer to the next task in the + * ->blkd_tasks list. + * + * Note that irqs must be enabled: boosting the task can block. + * Returns 1 if there are more tasks needing to be boosted. 
+ */ +static int rcu_boost(struct rcu_node *rnp) +{ + unsigned long flags; + struct rt_mutex mtx; + struct task_struct *t; + struct list_head *tb; + + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) + return 0; /* Nothing left to boost (lockless fast-path check). */ + + raw_spin_lock_irqsave(&rnp->lock, flags); + + /* + * Recheck under the lock: the check above was made without + * holding rnp->lock, and all tasks in need of boosting + * might exit their RCU read-side critical sections on their own. + */ + if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return 0; + } + + /* + * Preferentially boost tasks blocking expedited grace periods. + * This cannot starve the normal grace periods because a second + * expedited grace period must boost all blocked tasks, including + * those blocking the pre-existing normal grace period. + * The ->n_exp_boosts, ->n_normal_boosts and ->n_tasks_boosted + * counters record which kind of boost was chosen. + */ + if (rnp->exp_tasks != NULL) { + tb = rnp->exp_tasks; + rnp->n_exp_boosts++; + } else { + tb = rnp->boost_tasks; + rnp->n_normal_boosts++; + } + rnp->n_tasks_boosted++; + + /* + * We boost task t by manufacturing an rt_mutex that appears to + * be held by task t. We leave a pointer to that rt_mutex where + * task t can find it, and task t will release the mutex when it + * exits its outermost RCU read-side critical section. Then + * simply acquiring this artificial rt_mutex will boost task + * t's priority. (Thanks to tglx for suggesting this approach!) + * + * Note that task t must acquire rnp->lock to remove itself from + * the ->blkd_tasks list, which it will do from exit() if from + * nowhere else. We therefore are guaranteed that task t will + * stay around at least until we drop rnp->lock. Note that + * rnp->lock also resolves races between our priority boosting + * and task t's exiting its outermost RCU read-side critical + * section. 
+ */ + t = container_of(tb, struct task_struct, rcu_node_entry); + rt_mutex_init_proxy_locked(&mtx, t); + t->rcu_boost_mutex = &mtx; + t->rcu_boosted = 1; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ + rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ + + return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; +} + +/* + * Timer handler to initiate waking up of boost kthreads that + * have yielded the CPU due to excessive numbers of tasks to + * boost. We wake up the per-rcu_node kthread, which in turn + * will wake up the booster kthread. The timer argument is the + * rcu_node pointer cast to unsigned long. + */ +static void rcu_boost_kthread_timer(unsigned long arg) +{ + invoke_rcu_node_kthread((struct rcu_node *)arg); +} + +/* + * Priority-boosting kthread. One per leaf rcu_node and one for the + * root rcu_node. + * + * Each pass waits until ->boost_tasks or ->exp_tasks is non-NULL and + * then makes one rcu_boost() call. After more than ten consecutive + * passes in which rcu_boost() reported more tasks left to boost, the + * kthread yields via rcu_yield() and restarts its count. The loop + * never exits. + */ +static int rcu_boost_kthread(void *arg) +{ + struct rcu_node *rnp = (struct rcu_node *)arg; + int spincnt = 0; + int more2boost; + + for (;;) { + rnp->boost_kthread_status = RCU_KTHREAD_WAITING; + rcu_wait(rnp->boost_tasks || rnp->exp_tasks); + rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; + more2boost = rcu_boost(rnp); + if (more2boost) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); + spincnt = 0; + } + } + /* NOTREACHED */ + return 0; +} + +/* + * Check to see if it is time to start boosting RCU readers that are + * blocking the current grace period, and, if so, tell the per-rcu_node + * kthread to start boosting them. If there is an expedited grace + * period in progress, it is always time to boost. + * + * The caller must hold rnp->lock, which this function releases, + * but irqs remain disabled. The ->boost_kthread_task is immortal, + * so we don't need to worry about it going away. 
+ */ +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + struct task_struct *t; + + if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { + rnp->n_balk_exp_gp_tasks++; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + if (rnp->exp_tasks != NULL || + (rnp->gp_tasks != NULL && + rnp->boost_tasks == NULL && + rnp->qsmask == 0 && + ULONG_CMP_GE(jiffies, rnp->boost_time))) { + if (rnp->exp_tasks == NULL) + rnp->boost_tasks = rnp->gp_tasks; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + t = rnp->boost_kthread_task; + if (t != NULL) + wake_up_process(t); + } else { + rcu_initiate_boost_trace(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Wake up the per-CPU kthread to invoke RCU callbacks. + * With irqs disabled, set rcu_cpu_has_work for this CPU and then + * wake its rcu_cpu_kthread_task, if that pointer is non-NULL. + */ +static void invoke_rcu_callbacks_kthread(void) +{ + unsigned long flags; + + local_irq_save(flags); + __this_cpu_write(rcu_cpu_has_work, 1); + if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { + local_irq_restore(flags); + return; + } + wake_up_process(__this_cpu_read(rcu_cpu_kthread_task)); + local_irq_restore(flags); +} + +/* + * Set the affinity of the boost kthread. The CPU-hotplug locks are + * held, so no one should be messing with the existence of the boost + * kthread. Does nothing if the rcu_node has no boost kthread. + */ +static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, + cpumask_var_t cm) +{ + struct task_struct *t; + + t = rnp->boost_kthread_task; + if (t != NULL) + set_cpus_allowed_ptr(rnp->boost_kthread_task, cm); +} + +#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) + +/* + * Do priority-boost accounting for the start of a new grace period. + * Sets ->boost_time to RCU_BOOST_DELAY_JIFFIES from now; + * rcu_initiate_boost() does not start boosting readers of a normal + * grace period until jiffies has reached ->boost_time. + */ +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ + rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; +} + +/* + * Create an RCU-boost kthread for the specified node if one does not + * already exist. We only create this kthread for preemptible RCU. + * Returns zero if all is well, a negated errno otherwise. 
+ */ +static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, + struct rcu_node *rnp, + int rnp_index) +{ + unsigned long flags; + struct sched_param sp; + struct task_struct *t; + + if (&rcu_preempt_state != rsp) + return 0; + rsp->boost = 1; + if (rnp->boost_kthread_task != NULL) + return 0; + t = kthread_create(rcu_boost_kthread, (void *)rnp, + "rcub%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->boost_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Stop the RCU per-CPU kthread when its CPU goes offline. + */ +static void rcu_stop_cpu_kthread(int cpu) +{ + struct task_struct *t; + + /* Stop the kthread of this CPU, clearing the per-CPU pointer before stopping it. */ + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t != NULL) { + per_cpu(rcu_cpu_kthread_task, cpu) = NULL; + kthread_stop(t); + } +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_kthread_do_work(void) /* runs rcu_do_batch() for rcu_sched and rcu_bh on this CPU, then rcu_preempt_do_callbacks() */ +{ + rcu_do_batch(&rcu_sched_state, &__get_cpu_var(rcu_sched_data)); + rcu_do_batch(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); + rcu_preempt_do_callbacks(); +} + +/* + * Wake up the specified per-rcu_node-structure kthread. + * Because the per-rcu_node kthreads are immortal, we don't need + * to do anything to keep them alive. Does nothing if + * ->node_kthread_task is NULL. + */ +static void invoke_rcu_node_kthread(struct rcu_node *rnp) +{ + struct task_struct *t; + + t = rnp->node_kthread_task; + if (t != NULL) + wake_up_process(t); +} + +/* + * Set the specified CPU's kthread to run RT or not, as specified by + * the to_rt argument. The CPU-hotplug locks are held, so the task + * is not going away. 
+ */ +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ + int policy; + struct sched_param sp; + struct task_struct *t; + + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t == NULL) + return; + if (to_rt) { + policy = SCHED_FIFO; + sp.sched_priority = RCU_KTHREAD_PRIO; + } else { + policy = SCHED_NORMAL; + sp.sched_priority = 0; + } + sched_setscheduler_nocheck(t, policy, &sp); +} + +/* + * Timer handler to initiate the waking up of per-CPU kthreads that + * have yielded the CPU due to excess numbers of RCU callbacks. + * We wake up the per-rcu_node kthread, which in turn will wake up + * the booster kthread. + */ +static void rcu_cpu_kthread_timer(unsigned long arg) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg); + struct rcu_node *rnp = rdp->mynode; + + atomic_or(rdp->grpmask, &rnp->wakemask); + invoke_rcu_node_kthread(rnp); +} + +/* + * Drop to non-real-time priority and yield, but only after posting a + * timer that will cause us to regain our real-time priority if we + * remain preempted. Either way, we restore our real-time priority + * before returning. + */ +static void rcu_yield(void (*f)(unsigned long), unsigned long arg) +{ + struct sched_param sp; + struct timer_list yield_timer; + + setup_timer_on_stack(&yield_timer, f, arg); + mod_timer(&yield_timer, jiffies + 2); + sp.sched_priority = 0; + sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); + set_user_nice(current, 19); + schedule(); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); + del_timer(&yield_timer); +} + +/* + * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU. + * This can happen while the corresponding CPU is either coming online + * or going offline. We cannot wait until the CPU is fully online + * before starting the kthread, because the various notifier functions + * can wait for RCU grace periods. So we park rcu_cpu_kthread() until + * the corresponding CPU is online. 
+ * + * Return 1 if the kthread needs to stop, 0 otherwise. + * + * Caller must disable bh. This function can momentarily enable it. + */ +static int rcu_cpu_kthread_should_stop(int cpu) +{ + while (cpu_is_offline(cpu) || + !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) || + smp_processor_id() != cpu) { + if (kthread_should_stop()) + return 1; + per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; + per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id(); + local_bh_enable(); + schedule_timeout_uninterruptible(1); + if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu))) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + local_bh_disable(); + } + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + return 0; +} + +/* + * Per-CPU kernel thread that invokes RCU callbacks. This replaces the + * earlier RCU softirq. + */ +static int rcu_cpu_kthread(void *arg) +{ + int cpu = (int)(long)arg; + unsigned long flags; + int spincnt = 0; + unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu); + char work; + char *workp = &per_cpu(rcu_cpu_has_work, cpu); + + for (;;) { + *statusp = RCU_KTHREAD_WAITING; + rcu_wait(*workp != 0 || kthread_should_stop()); + local_bh_disable(); + if (rcu_cpu_kthread_should_stop(cpu)) { + local_bh_enable(); + break; + } + *statusp = RCU_KTHREAD_RUNNING; + per_cpu(rcu_cpu_kthread_loops, cpu)++; + local_irq_save(flags); + work = *workp; + *workp = 0; + local_irq_restore(flags); + if (work) + rcu_kthread_do_work(); + local_bh_enable(); + if (*workp != 0) + spincnt++; + else + spincnt = 0; + if (spincnt > 10) { + *statusp = RCU_KTHREAD_YIELDING; + rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); + spincnt = 0; + } + } + *statusp = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Spawn a per-CPU kthread, setting up affinity and priority. + * Because the CPU hotplug lock is held, no other CPU will be attempting + * to manipulate rcu_cpu_kthread_task.
There might be another CPU + * attempting to access it during boot, but the locking in kthread_bind() + * will enforce sufficient ordering. + * + * Please note that we cannot simply refuse to wake up the per-CPU + * kthread because kthreads are created in TASK_UNINTERRUPTIBLE state, + * which can result in softlockup complaints if the task ends up being + * idle for more than a couple of minutes. + * + * However, please note also that we cannot bind the per-CPU kthread to its + * CPU until that CPU is fully online. We also cannot wait until the + * CPU is fully online before we create its per-CPU kthread, as this would + * deadlock the system when CPU notifiers tried waiting for grace + * periods. So we bind the per-CPU kthread to its CPU only if the CPU + * is online. If its CPU is not yet fully online, then the code in + * rcu_cpu_kthread() will wait until it is fully online, and then do + * the binding. + */ +static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) +{ + struct sched_param sp; + struct task_struct *t; + + if (!rcu_scheduler_fully_active || + per_cpu(rcu_cpu_kthread_task, cpu) != NULL) + return 0; + t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); + if (IS_ERR(t)) + return PTR_ERR(t); + if (cpu_online(cpu)) + kthread_bind(t, cpu); + per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; + WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + per_cpu(rcu_cpu_kthread_task, cpu) = t; + wake_up_process(t); /* Get to TASK_INTERRUPTIBLE quickly. */ + return 0; +} + +/* + * Per-rcu_node kthread, which is in charge of waking up the per-CPU + * kthreads when needed. We ignore requests to wake up kthreads + * for offline CPUs, which is OK because force_quiescent_state() + * takes care of this case. 
+ */ +static int rcu_node_kthread(void *arg) +{ + int cpu; + unsigned long flags; + unsigned long mask; + struct rcu_node *rnp = (struct rcu_node *)arg; + struct sched_param sp; + struct task_struct *t; + + for (;;) { + rnp->node_kthread_status = RCU_KTHREAD_WAITING; + rcu_wait(atomic_read(&rnp->wakemask) != 0); + rnp->node_kthread_status = RCU_KTHREAD_RUNNING; + raw_spin_lock_irqsave(&rnp->lock, flags); + mask = atomic_xchg(&rnp->wakemask, 0); + rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) { + if ((mask & 0x1) == 0) + continue; + preempt_disable(); + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (!cpu_online(cpu) || t == NULL) { + preempt_enable(); + continue; + } + per_cpu(rcu_cpu_has_work, cpu) = 1; + sp.sched_priority = RCU_KTHREAD_PRIO; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + preempt_enable(); + } + } + /* NOTREACHED */ + rnp->node_kthread_status = RCU_KTHREAD_STOPPED; + return 0; +} + +/* + * Set the per-rcu_node kthread's affinity to cover all CPUs that are + * served by the rcu_node in question. The CPU hotplug lock is still + * held, so the value of rnp->qsmaskinit will be stable. + * + * We don't include outgoingcpu in the affinity set, use -1 if there is + * no outgoing CPU. If there are no CPUs left in the affinity set, + * this function allows the kthread to execute on any CPU. 
+ */ +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ + cpumask_var_t cm; + int cpu; + unsigned long mask = rnp->qsmaskinit; + + if (rnp->node_kthread_task == NULL) + return; + if (!alloc_cpumask_var(&cm, GFP_KERNEL)) + return; + cpumask_clear(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) + if ((mask & 0x1) && cpu != outgoingcpu) + cpumask_set_cpu(cpu, cm); + if (cpumask_weight(cm) == 0) { + cpumask_setall(cm); + for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) + cpumask_clear_cpu(cpu, cm); + WARN_ON_ONCE(cpumask_weight(cm) == 0); + } + set_cpus_allowed_ptr(rnp->node_kthread_task, cm); + rcu_boost_kthread_setaffinity(rnp, cm); + free_cpumask_var(cm); +} + +/* + * Spawn a per-rcu_node kthread, setting priority and affinity. + * Called during boot before online/offline can happen, or, if + * during runtime, with the main CPU-hotplug locks held. So only + * one of these can be executing at a time. + */ +static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, + struct rcu_node *rnp) +{ + unsigned long flags; + int rnp_index = rnp - &rsp->node[0]; + struct sched_param sp; + struct task_struct *t; + + if (!rcu_scheduler_fully_active || + rnp->qsmaskinit == 0) + return 0; + if (rnp->node_kthread_task == NULL) { + t = kthread_create(rcu_node_kthread, (void *)rnp, + "rcun%d", rnp_index); + if (IS_ERR(t)) + return PTR_ERR(t); + raw_spin_lock_irqsave(&rnp->lock, flags); + rnp->node_kthread_task = t; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + sp.sched_priority = 99; + sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); + wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ + } + return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); +} + +/* + * Spawn all kthreads -- called as soon as the scheduler is running. 
+ */ +static int __init rcu_spawn_kthreads(void) +{ + int cpu; + struct rcu_node *rnp; + + rcu_scheduler_fully_active = 1; + for_each_possible_cpu(cpu) { + per_cpu(rcu_cpu_has_work, cpu) = 0; + if (cpu_online(cpu)) + (void)rcu_spawn_one_cpu_kthread(cpu); + } + rnp = rcu_get_root(rcu_state); + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + if (NUM_RCU_NODES > 1) { + rcu_for_each_leaf_node(rcu_state, rnp) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } + return 0; +} +early_initcall(rcu_spawn_kthreads); + +static void __cpuinit rcu_prepare_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + + /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ + if (rcu_scheduler_fully_active) { + (void)rcu_spawn_one_cpu_kthread(cpu); + if (rnp->node_kthread_task == NULL) + (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + } +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) +{ + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +static void invoke_rcu_callbacks_kthread(void) +{ + WARN_ON_ONCE(1); +} + +static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) +{ +} + +#ifdef CONFIG_HOTPLUG_CPU + +static void rcu_stop_cpu_kthread(int cpu) +{ +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + +static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) +{ +} + +static void rcu_cpu_kthread_setrt(int cpu, int to_rt) +{ +} + +static int __init rcu_scheduler_really_started(void) +{ + rcu_scheduler_fully_active = 1; + return 0; +} +early_initcall(rcu_scheduler_really_started); + +static void __cpuinit rcu_prepare_kthreads(int cpu) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +#ifndef CONFIG_SMP + +void synchronize_sched_expedited(void) +{ + cond_resched(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#else /* #ifndef CONFIG_SMP */ + +static atomic_t sync_sched_expedited_started = 
ATOMIC_INIT(0); +static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0); + +static int synchronize_sched_expedited_cpu_stop(void *data) +{ + /* + * There must be a full memory barrier on each affected CPU + * between the time that try_stop_cpus() is called and the + * time that it returns. + * + * In the current initial implementation of cpu_stop, the + * above condition is already met when the control reaches + * this point and the following smp_mb() is not strictly + * necessary. Do smp_mb() anyway for documentation and + * robustness against future implementation changes. + */ + smp_mb(); /* See above comment block. */ + return 0; +} + +/* + * Wait for an rcu-sched grace period to elapse, but use "big hammer" + * approach to force grace period to end quickly. This consumes + * significant time on all CPUs, and is thus not recommended for + * any sort of common-case code. + * + * Note that it is illegal to call this function while holding any + * lock that is acquired by a CPU-hotplug notifier. Failing to + * observe this restriction will result in deadlock. + * + * This implementation can be thought of as an application of ticket + * locking to RCU, with sync_sched_expedited_started and + * sync_sched_expedited_done taking on the roles of the halves + * of the ticket-lock word. Each task atomically increments + * sync_sched_expedited_started upon entry, snapshotting the old value, + * then attempts to stop all the CPUs. If this succeeds, then each + * CPU will have executed a context switch, resulting in an RCU-sched + * grace period. We are then done, so we use atomic_cmpxchg() to + * update sync_sched_expedited_done to match our snapshot -- but + * only if someone else has not already advanced past our snapshot. + * + * On the other hand, if try_stop_cpus() fails, we check the value + * of sync_sched_expedited_done. If it has advanced past our + * initial snapshot, then someone else must have forced a grace period + * some time after we took our snapshot. 
In this case, our work is + * done for us, and we can simply return. Otherwise, we try again, + * but keep our initial snapshot for purposes of checking for someone + * doing our work for us. + * + * If we fail too many times in a row, we fall back to synchronize_sched(). + */ +void synchronize_sched_expedited(void) +{ + int firstsnap, s, snap, trycount = 0; + + /* Note that atomic_inc_return() implies full memory barrier. */ + firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started); + get_online_cpus(); + + /* + * Each pass through the following loop attempts to force a + * context switch on each CPU. + */ + while (try_stop_cpus(cpu_online_mask, + synchronize_sched_expedited_cpu_stop, + NULL) == -EAGAIN) { + put_online_cpus(); + + /* No joy, try again later. Or just synchronize_sched(). */ + if (trycount++ < 10) + udelay(trycount * num_online_cpus()); + else { + synchronize_sched(); + return; + } + + /* Check to see if someone else did our work for us. */ + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) { + smp_mb(); /* ensure test happens before caller kfree */ + return; + } + + /* + * Refetching sync_sched_expedited_started allows later + * callers to piggyback on our grace period. We subtract + * 1 to get the same token that the last incrementer got. + * We retry after they started, so our grace period works + * for them, and they started after our first try, so their + * grace period works for us. + */ + get_online_cpus(); + snap = atomic_read(&sync_sched_expedited_started) - 1; + smp_mb(); /* ensure read is before try_stop_cpus(). */ + } + + /* + * Everyone up to our most recent fetch is covered by our grace + * period. Update the counter, but only if our work is still + * relevant -- which it won't be if someone who started later + * than we did beat us to the punch. 
+ */ + do { + s = atomic_read(&sync_sched_expedited_done); + if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) { + smp_mb(); /* ensure test happens before caller kfree */ + break; + } + } while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s); + + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(synchronize_sched_expedited); + +#endif /* #else #ifndef CONFIG_SMP */ + +#if !defined(CONFIG_RCU_FAST_NO_HZ) + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + * + * Because we have preemptible RCU, just check whether this CPU needs + * any flavor of RCU. Do not chew up lots of CPU cycles with preemption + * disabled in a most-likely vain attempt to cause RCU not to need this CPU. + */ +int rcu_needs_cpu(int cpu) +{ + return rcu_needs_cpu_quick_check(cpu); +} + +/* + * Check to see if we need to continue a callback-flush operation to + * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle + * entry is not configured, so we never do need to. + */ +static void rcu_needs_cpu_flush(void) +{ +} + +#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +#define RCU_NEEDS_CPU_FLUSHES 5 +static DEFINE_PER_CPU(int, rcu_dyntick_drain); +static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + * + * Because we are not supporting preemptible RCU, attempt to accelerate + * any current grace periods so that RCU no longer needs this CPU, but + * only if all other CPUs are already in dynticks-idle mode.
This will + * allow the CPU cores to be powered down immediately, as opposed to after + * waiting many milliseconds for grace periods to elapse. + * + * Because it is not legal to invoke rcu_process_callbacks() with irqs + * disabled, we do one pass of force_quiescent_state(), then do a + * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked + * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. + */ +int rcu_needs_cpu(int cpu) +{ + int c = 0; + int snap; + int thatcpu; + + /* Check for being in the holdoff period. */ + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + return rcu_needs_cpu_quick_check(cpu); + + /* Don't bother unless we are the last non-dyntick-idle CPU. */ + for_each_online_cpu(thatcpu) { + if (thatcpu == cpu) + continue; + snap = atomic_add_return(0, &per_cpu(rcu_dynticks, + thatcpu).dynticks); + smp_mb(); /* Order sampling of snap with end of grace period. */ + if ((snap & 0x1) != 0) { + per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + return rcu_needs_cpu_quick_check(cpu); + } + } + + /* Check and update the rcu_dyntick_drain sequencing. */ + if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* First time through, initialize the counter. */ + per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; + } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* We have hit the limit, so time to give up. */ + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + return rcu_needs_cpu_quick_check(cpu); + } + + /* Do one step pushing remaining RCU callbacks through. */ + if (per_cpu(rcu_sched_data, cpu).nxtlist) { + rcu_sched_qs(cpu); + force_quiescent_state(&rcu_sched_state, 0); + c = c || per_cpu(rcu_sched_data, cpu).nxtlist; + } + if (per_cpu(rcu_bh_data, cpu).nxtlist) { + rcu_bh_qs(cpu); + force_quiescent_state(&rcu_bh_state, 0); + c = c || per_cpu(rcu_bh_data, cpu).nxtlist; + } + + /* If RCU callbacks are still pending, RCU still needs this CPU. 
*/ + if (c) + invoke_rcu_core(); + return c; +} + +/* + * Check to see if we need to continue a callback-flush operation to + * allow the last CPU to enter dyntick-idle mode. + */ +static void rcu_needs_cpu_flush(void) +{ + int cpu = smp_processor_id(); + unsigned long flags; + + if (per_cpu(rcu_dyntick_drain, cpu) <= 0) + return; + local_irq_save(flags); + (void)rcu_needs_cpu(cpu); + local_irq_restore(flags); +} + +#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c new file mode 100644 index 00000000..4e144876 --- /dev/null +++ b/kernel/rcutree_trace.c @@ -0,0 +1,515 @@ +/* + * Read-Copy Update tracing for hierarchical implementation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ * + * Copyright IBM Corporation, 2008 + * + * Papers: http://www.rdrop.com/users/paulmck/RCU + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RCU_TREE_NONCORE +#include "rcutree.h" + +#ifdef CONFIG_RCU_BOOST + +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu); +DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); +DECLARE_PER_CPU(char, rcu_cpu_has_work); + +static char convert_kthread_status(unsigned int kthread_status) +{ + if (kthread_status > RCU_KTHREAD_MAX) + return '?'; + return "SRWOY"[kthread_status]; +} + +#endif /* #ifdef CONFIG_RCU_BOOST */ + +static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) +{ + if (!rdp->beenonline) + return; + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' 
: ' ', + rdp->completed, rdp->gpnum, + rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->qs_pending); +#ifdef CONFIG_NO_HZ + seq_printf(m, " dt=%d/%d/%d df=%lu", + atomic_read(&rdp->dynticks->dynticks), + rdp->dynticks->dynticks_nesting, + rdp->dynticks->dynticks_nmi_nesting, + rdp->dynticks_fqs); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, " ql=%ld qs=%c%c%c%c", + rdp->qlen, + ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, " kt=%d/%c/%d ktl=%x", + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu)), + per_cpu(rcu_cpu_kthread_cpu, rdp->cpu), + per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_printf(m, " b=%ld", rdp->blimit); + seq_printf(m, " ci=%lu co=%lu ca=%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); +} + +#define PRINT_RCU_DATA(name, func, m) \ + do { \ + int _p_r_d_i; \ + \ + for_each_possible_cpu(_p_r_d_i) \ + func(m, &per_cpu(name, _p_r_d_i)); \ + } while (0) + +static int show_rcudata(struct seq_file *m, void *unused) +{ +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data, m); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched:\n"); + PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data, m); + seq_puts(m, "rcu_bh:\n"); + PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data, m); + return 0; +} + +static int rcudata_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcudata, NULL); +} + +static const struct file_operations rcudata_fops = { + .owner = THIS_MODULE, + .open = 
rcudata_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) +{ + if (!rdp->beenonline) + return; + seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", + rdp->completed, rdp->gpnum, + rdp->passed_quiesc, rdp->passed_quiesc_completed, + rdp->qs_pending); +#ifdef CONFIG_NO_HZ + seq_printf(m, ",%d,%d,%d,%lu", + atomic_read(&rdp->dynticks->dynticks), + rdp->dynticks->dynticks_nesting, + rdp->dynticks->dynticks_nmi_nesting, + rdp->dynticks_fqs); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); + seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, + ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != + rdp->nxttail[RCU_NEXT_TAIL]], + ".R"[rdp->nxttail[RCU_WAIT_TAIL] != + rdp->nxttail[RCU_NEXT_READY_TAIL]], + ".W"[rdp->nxttail[RCU_DONE_TAIL] != + rdp->nxttail[RCU_WAIT_TAIL]], + ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); +#ifdef CONFIG_RCU_BOOST + seq_printf(m, ",%d,\"%c\"", + per_cpu(rcu_cpu_has_work, rdp->cpu), + convert_kthread_status(per_cpu(rcu_cpu_kthread_status, + rdp->cpu))); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_printf(m, ",%ld", rdp->blimit); + seq_printf(m, ",%lu,%lu,%lu\n", + rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted); +} + +static int show_rcudata_csv(struct seq_file *m, void *unused) +{ + seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"qp\","); +#ifdef CONFIG_NO_HZ + seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); +#endif /* #ifdef CONFIG_NO_HZ */ + seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); +#ifdef CONFIG_RCU_BOOST + seq_puts(m, ",\"kt\",\"ktl\""); +#endif /* #ifdef CONFIG_RCU_BOOST */ + seq_puts(m, ",\"b\",\"ci\",\"co\",\"ca\"\n"); +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "\"rcu_preempt:\"\n"); + PRINT_RCU_DATA(rcu_preempt_data, print_one_rcu_data_csv, m); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+ seq_puts(m, "\"rcu_sched:\"\n"); + PRINT_RCU_DATA(rcu_sched_data, print_one_rcu_data_csv, m); + seq_puts(m, "\"rcu_bh:\"\n"); + PRINT_RCU_DATA(rcu_bh_data, print_one_rcu_data_csv, m); + return 0; +} + +static int rcudata_csv_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcudata_csv, NULL); +} + +static const struct file_operations rcudata_csv_fops = { + .owner = THIS_MODULE, + .open = rcudata_csv_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#ifdef CONFIG_RCU_BOOST + +static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) +{ + seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu " + "j=%04x bt=%04x\n", + rnp->grplo, rnp->grphi, + "T."[list_empty(&rnp->blkd_tasks)], + "N."[!rnp->gp_tasks], + "E."[!rnp->exp_tasks], + "B."[!rnp->boost_tasks], + convert_kthread_status(rnp->boost_kthread_status), + rnp->n_tasks_boosted, rnp->n_exp_boosts, + rnp->n_normal_boosts, + (int)(jiffies & 0xffff), + (int)(rnp->boost_time & 0xffff)); + seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", + " balk", + rnp->n_balk_blkd_tasks, + rnp->n_balk_exp_gp_tasks, + rnp->n_balk_boost_tasks, + rnp->n_balk_notblocked, + rnp->n_balk_notyet, + rnp->n_balk_nos); +} + +static int show_rcu_node_boost(struct seq_file *m, void *unused) +{ + struct rcu_node *rnp; + + rcu_for_each_leaf_node(&rcu_preempt_state, rnp) + print_one_rcu_node_boost(m, rnp); + return 0; +} + +static int rcu_node_boost_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_node_boost, NULL); +} + +static const struct file_operations rcu_node_boost_fops = { + .owner = THIS_MODULE, + .open = rcu_node_boost_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Create the rcuboost debugfs entry. Standard error return. 
+ */ +static int rcu_boost_trace_create_file(struct dentry *rcudir) +{ + return !debugfs_create_file("rcuboost", 0444, rcudir, NULL, + &rcu_node_boost_fops); +} + +#else /* #ifdef CONFIG_RCU_BOOST */ + +static int rcu_boost_trace_create_file(struct dentry *rcudir) +{ + return 0; /* There cannot be an error if we didn't create it! */ +} + +#endif /* #else #ifdef CONFIG_RCU_BOOST */ + +static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) +{ + unsigned long gpnum; + int level = 0; + struct rcu_node *rnp; + + gpnum = rsp->gpnum; + seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " + "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", + rsp->completed, gpnum, rsp->signaled, + (long)(rsp->jiffies_force_qs - jiffies), + (int)(jiffies & 0xffff), + rsp->n_force_qs, rsp->n_force_qs_ngp, + rsp->n_force_qs - rsp->n_force_qs_ngp, + rsp->n_force_qs_lh); + for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) { + if (rnp->level != level) { + seq_puts(m, "\n"); + level = rnp->level; + } + seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", + rnp->qsmask, rnp->qsmaskinit, + ".G"[rnp->gp_tasks != NULL], + ".E"[rnp->exp_tasks != NULL], + ".T"[!list_empty(&rnp->blkd_tasks)], + rnp->grplo, rnp->grphi, rnp->grpnum); + } + seq_puts(m, "\n"); +} + +static int show_rcuhier(struct seq_file *m, void *unused) +{ +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + print_one_rcu_state(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched:\n"); + print_one_rcu_state(m, &rcu_sched_state); + seq_puts(m, "rcu_bh:\n"); + print_one_rcu_state(m, &rcu_bh_state); + return 0; +} + +static int rcuhier_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcuhier, NULL); +} + +static const struct file_operations rcuhier_fops = { + .owner = THIS_MODULE, + .open = rcuhier_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void show_one_rcugp(struct seq_file *m, struct 
rcu_state *rsp) +{ + unsigned long flags; + unsigned long completed; + unsigned long gpnum; + unsigned long gpage; + unsigned long gpmax; + struct rcu_node *rnp = &rsp->node[0]; + + raw_spin_lock_irqsave(&rnp->lock, flags); + completed = rsp->completed; + gpnum = rsp->gpnum; + if (rsp->completed == rsp->gpnum) + gpage = 0; + else + gpage = jiffies - rsp->gp_start; + gpmax = rsp->gp_max; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n", + rsp->name, completed, gpnum, gpage, gpmax); +} + +static int show_rcugp(struct seq_file *m, void *unused) +{ +#ifdef CONFIG_TREE_PREEMPT_RCU + show_one_rcugp(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + show_one_rcugp(m, &rcu_sched_state); + show_one_rcugp(m, &rcu_bh_state); + return 0; +} + +static int rcugp_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcugp, NULL); +} + +static const struct file_operations rcugp_fops = { + .owner = THIS_MODULE, + .open = rcugp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ + seq_printf(m, "%3d%cnp=%ld " + "qsp=%ld rpq=%ld cbr=%ld cng=%ld " + "gpc=%ld gps=%ld nf=%ld nn=%ld\n", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' 
: ' ', + rdp->n_rcu_pending, + rdp->n_rp_qs_pending, + rdp->n_rp_report_qs, + rdp->n_rp_cb_ready, + rdp->n_rp_cpu_needs_gp, + rdp->n_rp_gp_completed, + rdp->n_rp_gp_started, + rdp->n_rp_need_fqs, + rdp->n_rp_need_nothing); +} + +static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } +} + +static int show_rcu_pending(struct seq_file *m, void *unused) +{ +#ifdef CONFIG_TREE_PREEMPT_RCU + seq_puts(m, "rcu_preempt:\n"); + print_rcu_pendings(m, &rcu_preempt_state); +#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ + seq_puts(m, "rcu_sched:\n"); + print_rcu_pendings(m, &rcu_sched_state); + seq_puts(m, "rcu_bh:\n"); + print_rcu_pendings(m, &rcu_bh_state); + return 0; +} + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_pending, NULL); +} + +static const struct file_operations rcu_pending_fops = { + .owner = THIS_MODULE, + .open = rcu_pending_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int show_rcutorture(struct seq_file *m, void *unused) +{ + seq_printf(m, "rcutorture test sequence: %lu %s\n", + rcutorture_testseq >> 1, + (rcutorture_testseq & 0x1) ? 
"(test in progress)" : ""); + seq_printf(m, "rcutorture update version number: %lu\n", + rcutorture_vernum); + return 0; +} + +static int rcutorture_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcutorture, NULL); +} + +static const struct file_operations rcutorture_fops = { + .owner = THIS_MODULE, + .open = rcutorture_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; + +static int __init rcutree_trace_init(void) +{ + struct dentry *retval; + + rcudir = debugfs_create_dir("rcu", NULL); + if (!rcudir) + goto free_out; + + retval = debugfs_create_file("rcudata", 0444, rcudir, + NULL, &rcudata_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcudata.csv", 0444, rcudir, + NULL, &rcudata_csv_fops); + if (!retval) + goto free_out; + + if (rcu_boost_trace_create_file(rcudir)) + goto free_out; + + retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcuhier", 0444, rcudir, + NULL, &rcuhier_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcu_pending", 0444, rcudir, + NULL, &rcu_pending_fops); + if (!retval) + goto free_out; + + retval = debugfs_create_file("rcutorture", 0444, rcudir, + NULL, &rcutorture_fops); + if (!retval) + goto free_out; + return 0; +free_out: + debugfs_remove_recursive(rcudir); + return 1; +} + +static void __exit rcutree_trace_cleanup(void) +{ + debugfs_remove_recursive(rcudir); +} + + +module_init(rcutree_trace_init); +module_exit(rcutree_trace_cleanup); + +MODULE_AUTHOR("Paul E. McKenney"); +MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); +MODULE_LICENSE("GPL"); diff --git a/kernel/relay.c b/kernel/relay.c new file mode 100644 index 00000000..2c242fb2 --- /dev/null +++ b/kernel/relay.c @@ -0,0 +1,1365 @@ +/* + * Public API and common code for kernel->userspace relay file support. 
+ * + * See Documentation/filesystems/relay.txt for an overview. + * + * Copyright (C) 2002-2005 - Tom Zanussi (zanussi@us.ibm.com), IBM Corp + * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) + * + * Moved to kernel/relay.c by Paul Mundt, 2006. + * November 2006 - CPU hotplug support by Mathieu Desnoyers + * (mathieu.desnoyers@polymtl.ca) + * + * This file is released under the GPL. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* list of open channels, for cpu hotplug */ +static DEFINE_MUTEX(relay_channels_mutex); +static LIST_HEAD(relay_channels); + +/* + * close() vm_op implementation for relay file mapping. + */ +static void relay_file_mmap_close(struct vm_area_struct *vma) +{ + struct rchan_buf *buf = vma->vm_private_data; + buf->chan->cb->buf_unmapped(buf, vma->vm_file); +} + +/* + * fault() vm_op implementation for relay file mapping. + */ +static int relay_buf_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *page; + struct rchan_buf *buf = vma->vm_private_data; + pgoff_t pgoff = vmf->pgoff; + + if (!buf) + return VM_FAULT_OOM; + + page = vmalloc_to_page(buf->start + (pgoff << PAGE_SHIFT)); + if (!page) + return VM_FAULT_SIGBUS; + get_page(page); + vmf->page = page; + + return 0; +} + +/* + * vm_ops for relay file mappings. 
+ */ +static const struct vm_operations_struct relay_file_mmap_ops = { + .fault = relay_buf_fault, + .close = relay_file_mmap_close, +}; + +/* + * allocate an array of pointers of struct page + */ +static struct page **relay_alloc_page_array(unsigned int n_pages) +{ + const size_t pa_size = n_pages * sizeof(struct page *); + if (pa_size > PAGE_SIZE) + return vzalloc(pa_size); + return kzalloc(pa_size, GFP_KERNEL); +} + +/* + * free an array of pointers of struct page + */ +static void relay_free_page_array(struct page **array) +{ + if (is_vmalloc_addr(array)) + vfree(array); + else + kfree(array); +} + +/** + * relay_mmap_buf: - mmap channel buffer to process address space + * @buf: relay channel buffer + * @vma: vm_area_struct describing memory to be mapped + * + * Returns 0 if ok, negative on error + * + * Caller should already have grabbed mmap_sem. + */ +static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) +{ + unsigned long length = vma->vm_end - vma->vm_start; + struct file *filp = vma->vm_file; + + if (!buf) + return -EBADF; + + if (length != (unsigned long)buf->chan->alloc_size) + return -EINVAL; + + vma->vm_ops = &relay_file_mmap_ops; + vma->vm_flags |= VM_DONTEXPAND; + vma->vm_private_data = buf; + buf->chan->cb->buf_mapped(buf, filp); + + return 0; +} + +/** + * relay_alloc_buf - allocate a channel buffer + * @buf: the buffer struct + * @size: total size of the buffer + * + * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The + * passed in size will get page aligned, if it isn't already. 
+ */ +static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) +{ + void *mem; + unsigned int i, j, n_pages; + + *size = PAGE_ALIGN(*size); + n_pages = *size >> PAGE_SHIFT; + + buf->page_array = relay_alloc_page_array(n_pages); + if (!buf->page_array) + return NULL; + + for (i = 0; i < n_pages; i++) { + buf->page_array[i] = alloc_page(GFP_KERNEL); + if (unlikely(!buf->page_array[i])) + goto depopulate; + set_page_private(buf->page_array[i], (unsigned long)buf); + } + mem = vmap(buf->page_array, n_pages, VM_MAP, PAGE_KERNEL); + if (!mem) + goto depopulate; + + memset(mem, 0, *size); + buf->page_count = n_pages; + return mem; + +depopulate: + for (j = 0; j < i; j++) + __free_page(buf->page_array[j]); + relay_free_page_array(buf->page_array); + return NULL; +} + +/** + * relay_create_buf - allocate and initialize a channel buffer + * @chan: the relay channel + * + * Returns channel buffer if successful, %NULL otherwise. + */ +static struct rchan_buf *relay_create_buf(struct rchan *chan) +{ + struct rchan_buf *buf; + + if (chan->n_subbufs > UINT_MAX / sizeof(size_t *)) + return NULL; + + buf = kzalloc(sizeof(struct rchan_buf), GFP_KERNEL); + if (!buf) + return NULL; + buf->padding = kmalloc(chan->n_subbufs * sizeof(size_t *), GFP_KERNEL); + if (!buf->padding) + goto free_buf; + + buf->start = relay_alloc_buf(buf, &chan->alloc_size); + if (!buf->start) + goto free_buf; + + buf->chan = chan; + kref_get(&buf->chan->kref); + return buf; + +free_buf: + kfree(buf->padding); + kfree(buf); + return NULL; +} + +/** + * relay_destroy_channel - free the channel struct + * @kref: target kernel reference that contains the relay channel + * + * Should only be called from kref_put(). 
+ */
+static void relay_destroy_channel(struct kref *kref)
+{
+	struct rchan *chan = container_of(kref, struct rchan, kref);
+	kfree(chan);
+}
+
+/**
+ *	relay_destroy_buf - destroy an rchan_buf struct and associated buffer
+ *	@buf: the buffer struct
+ *
+ *	If the data area was allocated (@buf->start is set) it is unmapped,
+ *	every backing page is freed and the page-pointer array is released.
+ *	The channel's slot for @buf->cpu is then cleared, the padding array
+ *	and the struct itself are freed, and one reference on the owning
+ *	channel is dropped.
+ */
+static void relay_destroy_buf(struct rchan_buf *buf)
+{
+	struct rchan *chan = buf->chan;
+	unsigned int i;
+
+	if (likely(buf->start)) {
+		vunmap(buf->start);
+		for (i = 0; i < buf->page_count; i++)
+			__free_page(buf->page_array[i]);
+		relay_free_page_array(buf->page_array);
+	}
+	chan->buf[buf->cpu] = NULL;
+	kfree(buf->padding);
+	kfree(buf);
+	/* chan was read into a local above because buf is freed by now */
+	kref_put(&chan->kref, relay_destroy_channel);
+}
+
+/**
+ *	relay_remove_buf - remove a channel buffer
+ *	@kref: target kernel reference that contains the relay buffer
+ *
+ *	Asks the client's remove_buf_file() callback to remove the buffer's
+ *	file, then frees the rchan_buf struct and the channel buffer via
+ *	relay_destroy_buf().  Should only be called from kref_put().
+ */
+static void relay_remove_buf(struct kref *kref)
+{
+	struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
+	buf->chan->cb->remove_buf_file(buf->dentry);
+	relay_destroy_buf(buf);
+}
+
+/**
+ *	relay_buf_empty - boolean, is the channel buffer empty?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is empty, 0 otherwise.  "Empty" means the
+ *	produced and consumed sub-buffer counts are equal.
+ */
+static int relay_buf_empty(struct rchan_buf *buf)
+{
+	return (buf->subbufs_produced - buf->subbufs_consumed) ? 0 : 1;
+}
+
+/**
+ *	relay_buf_full - boolean, is the channel buffer full?
+ *	@buf: channel buffer
+ *
+ *	Returns 1 if the buffer is full, 0 otherwise.  "Full" means the
+ *	number of produced-but-unconsumed sub-buffers has reached the
+ *	channel's n_subbufs.
+ */
+int relay_buf_full(struct rchan_buf *buf)
+{
+	size_t ready = buf->subbufs_produced - buf->subbufs_consumed;
+	return (ready >= buf->chan->n_subbufs) ? 1 : 0;
+}
+EXPORT_SYMBOL_GPL(relay_buf_full);
+
+/*
+ * High-level relay kernel API and associated functions.
+ */
+
+/*
+ * rchan_callback implementations defining default channel behavior.  Used
+ * in place of corresponding NULL values in client callback struct.
+ */
+
+/*
+ * subbuf_start() default callback.  Refuses the switch (returns 0) when
+ * relay_buf_full() reports the buffer full, otherwise allows it (returns 1).
+ * The sub-buffer pointers and previous padding are ignored.
+ */
+static int subbuf_start_default_callback (struct rchan_buf *buf,
+					  void *subbuf,
+					  void *prev_subbuf,
+					  size_t prev_padding)
+{
+	if (relay_buf_full(buf))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * buf_mapped() default callback.  Does nothing.
+ */
+static void buf_mapped_default_callback(struct rchan_buf *buf,
+					struct file *filp)
+{
+}
+
+/*
+ * buf_unmapped() default callback.  Does nothing.
+ */
+static void buf_unmapped_default_callback(struct rchan_buf *buf,
+					  struct file *filp)
+{
+}
+
+/*
+ * create_buf_file() default callback.  Creates no file and always
+ * returns NULL; *is_global is left untouched.
+ */
+static struct dentry *create_buf_file_default_callback(const char *filename,
+						       struct dentry *parent,
+						       int mode,
+						       struct rchan_buf *buf,
+						       int *is_global)
+{
+	return NULL;
+}
+
+/*
+ * remove_buf_file() default callback.  Removes nothing and always
+ * returns -EINVAL.
+ */
+static int remove_buf_file_default_callback(struct dentry *dentry)
+{
+	return -EINVAL;
+}
+
+/* relay channel default callbacks */
+static struct rchan_callbacks default_channel_callbacks = {
+	.subbuf_start = subbuf_start_default_callback,
+	.buf_mapped = buf_mapped_default_callback,
+	.buf_unmapped = buf_unmapped_default_callback,
+	.create_buf_file = create_buf_file_default_callback,
+	.remove_buf_file = remove_buf_file_default_callback,
+};
+
+/**
+ *	wakeup_readers - wake up readers waiting on a channel
+ *	@data: contains the channel buffer
+ *
+ *	This is the timer function used to defer reader waking.
+ */
+static void wakeup_readers(unsigned long data)
+{
+	/* @data is the rchan_buf pointer, passed through the timer as a long */
+	struct rchan_buf *buf = (struct rchan_buf *)data;
+	wake_up_interruptible(&buf->read_wait);
+}
+
+/**
+ *	__relay_reset - reset a channel buffer
+ *	@buf: the channel buffer
+ *	@init: 1 if this is a first-time initialization
+ *
+ *	See relay_reset() for description of effect.
+ */ +static void __relay_reset(struct rchan_buf *buf, unsigned int init) +{ + size_t i; + + if (init) { + init_waitqueue_head(&buf->read_wait); + kref_init(&buf->kref); + setup_timer(&buf->timer, wakeup_readers, (unsigned long)buf); + } else + del_timer_sync(&buf->timer); + + buf->subbufs_produced = 0; + buf->subbufs_consumed = 0; + buf->bytes_consumed = 0; + buf->finalized = 0; + buf->data = buf->start; + buf->offset = 0; + + for (i = 0; i < buf->chan->n_subbufs; i++) + buf->padding[i] = 0; + + buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); +} + +/** + * relay_reset - reset the channel + * @chan: the channel + * + * This has the effect of erasing all data from all channel buffers + * and restarting the channel in its initial state. The buffers + * are not freed, so any mappings are still in effect. + * + * NOTE. Care should be taken that the channel isn't actually + * being used by anything when this call is made. + */ +void relay_reset(struct rchan *chan) +{ + unsigned int i; + + if (!chan) + return; + + if (chan->is_global && chan->buf[0]) { + __relay_reset(chan->buf[0], 0); + return; + } + + mutex_lock(&relay_channels_mutex); + for_each_possible_cpu(i) + if (chan->buf[i]) + __relay_reset(chan->buf[i], 0); + mutex_unlock(&relay_channels_mutex); +} +EXPORT_SYMBOL_GPL(relay_reset); + +static inline void relay_set_buf_dentry(struct rchan_buf *buf, + struct dentry *dentry) +{ + buf->dentry = dentry; + buf->dentry->d_inode->i_size = buf->early_bytes; +} + +static struct dentry *relay_create_buf_file(struct rchan *chan, + struct rchan_buf *buf, + unsigned int cpu) +{ + struct dentry *dentry; + char *tmpname; + + tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmpname) + return NULL; + snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); + + /* Create file in fs */ + dentry = chan->cb->create_buf_file(tmpname, chan->parent, + S_IRUSR, buf, + &chan->is_global); + + kfree(tmpname); + + return dentry; +} + +/* + * relay_open_buf - create a new relay 
channel buffer + * + * used by relay_open() and CPU hotplug. + */ +static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) +{ + struct rchan_buf *buf = NULL; + struct dentry *dentry; + + if (chan->is_global) + return chan->buf[0]; + + buf = relay_create_buf(chan); + if (!buf) + return NULL; + + if (chan->has_base_filename) { + dentry = relay_create_buf_file(chan, buf, cpu); + if (!dentry) + goto free_buf; + relay_set_buf_dentry(buf, dentry); + } + + buf->cpu = cpu; + __relay_reset(buf, 1); + + if(chan->is_global) { + chan->buf[0] = buf; + buf->cpu = 0; + } + + return buf; + +free_buf: + relay_destroy_buf(buf); + return NULL; +} + +/** + * relay_close_buf - close a channel buffer + * @buf: channel buffer + * + * Marks the buffer finalized and restores the default callbacks. + * The channel buffer and channel buffer data structure are then freed + * automatically when the last reference is given up. + */ +static void relay_close_buf(struct rchan_buf *buf) +{ + buf->finalized = 1; + del_timer_sync(&buf->timer); + kref_put(&buf->kref, relay_remove_buf); +} + +static void setup_callbacks(struct rchan *chan, + struct rchan_callbacks *cb) +{ + if (!cb) { + chan->cb = &default_channel_callbacks; + return; + } + + if (!cb->subbuf_start) + cb->subbuf_start = subbuf_start_default_callback; + if (!cb->buf_mapped) + cb->buf_mapped = buf_mapped_default_callback; + if (!cb->buf_unmapped) + cb->buf_unmapped = buf_unmapped_default_callback; + if (!cb->create_buf_file) + cb->create_buf_file = create_buf_file_default_callback; + if (!cb->remove_buf_file) + cb->remove_buf_file = remove_buf_file_default_callback; + chan->cb = cb; +} + +/** + * relay_hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Returns the success/failure of the operation. 
(%NOTIFY_OK, %NOTIFY_BAD) + */ +static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + struct rchan *chan; + + switch(action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + mutex_lock(&relay_channels_mutex); + list_for_each_entry(chan, &relay_channels, list) { + if (chan->buf[hotcpu]) + continue; + chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); + if(!chan->buf[hotcpu]) { + printk(KERN_ERR + "relay_hotcpu_callback: cpu %d buffer " + "creation failed\n", hotcpu); + mutex_unlock(&relay_channels_mutex); + return notifier_from_errno(-ENOMEM); + } + } + mutex_unlock(&relay_channels_mutex); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + /* No need to flush the cpu : will be flushed upon + * final relay_flush() call. */ + break; + } + return NOTIFY_OK; +} + +/** + * relay_open - create a new relay channel + * @base_filename: base name of files to create, %NULL for buffering only + * @parent: dentry of parent directory, %NULL for root directory or buffer + * @subbuf_size: size of sub-buffers + * @n_subbufs: number of sub-buffers + * @cb: client callback functions + * @private_data: user-defined data + * + * Returns channel pointer if successful, %NULL otherwise. + * + * Creates a channel buffer for each cpu using the sizes and + * attributes specified. The created channel buffer files + * will be named base_filename0...base_filenameN-1. File + * permissions will be %S_IRUSR. 
+ */ +struct rchan *relay_open(const char *base_filename, + struct dentry *parent, + size_t subbuf_size, + size_t n_subbufs, + struct rchan_callbacks *cb, + void *private_data) +{ + unsigned int i; + struct rchan *chan; + + if (!(subbuf_size && n_subbufs)) + return NULL; + if (subbuf_size > UINT_MAX / n_subbufs) + return NULL; + + chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); + if (!chan) + return NULL; + + chan->version = RELAYFS_CHANNEL_VERSION; + chan->n_subbufs = n_subbufs; + chan->subbuf_size = subbuf_size; + chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); + chan->parent = parent; + chan->private_data = private_data; + if (base_filename) { + chan->has_base_filename = 1; + strlcpy(chan->base_filename, base_filename, NAME_MAX); + } + setup_callbacks(chan, cb); + kref_init(&chan->kref); + + mutex_lock(&relay_channels_mutex); + for_each_online_cpu(i) { + chan->buf[i] = relay_open_buf(chan, i); + if (!chan->buf[i]) + goto free_bufs; + } + list_add(&chan->list, &relay_channels); + mutex_unlock(&relay_channels_mutex); + + return chan; + +free_bufs: + for_each_possible_cpu(i) { + if (chan->buf[i]) + relay_close_buf(chan->buf[i]); + } + + kref_put(&chan->kref, relay_destroy_channel); + mutex_unlock(&relay_channels_mutex); + return NULL; +} +EXPORT_SYMBOL_GPL(relay_open); + +struct rchan_percpu_buf_dispatcher { + struct rchan_buf *buf; + struct dentry *dentry; +}; + +/* Called in atomic context. */ +static void __relay_set_buf_dentry(void *info) +{ + struct rchan_percpu_buf_dispatcher *p = info; + + relay_set_buf_dentry(p->buf, p->dentry); +} + +/** + * relay_late_setup_files - triggers file creation + * @chan: channel to operate on + * @base_filename: base name of files to create + * @parent: dentry of parent directory, %NULL for root directory + * + * Returns 0 if successful, non-zero otherwise. + * + * Use to setup files for a previously buffer-only channel. + * Useful to do early tracing in kernel, before VFS is up, for example. 
+ */
+int relay_late_setup_files(struct rchan *chan,
+			   const char *base_filename,
+			   struct dentry *parent)
+{
+	int err = 0;
+	unsigned int i, curr_cpu;
+	unsigned long flags;
+	struct dentry *dentry;
+	struct rchan_percpu_buf_dispatcher disp;
+
+	if (!chan || !base_filename)
+		return -EINVAL;
+
+	mutex_lock(&relay_channels_mutex);
+	/* Is chan already set up? */
+	if (unlikely(chan->has_base_filename)) {
+		mutex_unlock(&relay_channels_mutex);
+		return -EEXIST;
+	}
+	/*
+	 * Install the name only after the check above and with the mutex
+	 * held: a channel that already has files must keep its name, and
+	 * the hotplug path reads base_filename under this same mutex.
+	 */
+	strlcpy(chan->base_filename, base_filename, NAME_MAX);
+	chan->has_base_filename = 1;
+	chan->parent = parent;
+	curr_cpu = get_cpu();
+	/*
+	 * The CPU hotplug notifier ran before us and created buffers with
+	 * no files associated. So it's safe to call relay_create_buf_file()
+	 * on all currently online CPUs.
+	 */
+	for_each_online_cpu(i) {
+		if (unlikely(!chan->buf[i])) {
+			WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
+			err = -EINVAL;
+			break;
+		}
+
+		dentry = relay_create_buf_file(chan, chan->buf[i], i);
+		if (unlikely(!dentry)) {
+			err = -EINVAL;
+			break;
+		}
+
+		/*
+		 * The dentry is attached on the CPU that owns the buffer:
+		 * locally with interrupts off, or remotely through a
+		 * synchronous cross-CPU call.
+		 */
+		if (curr_cpu == i) {
+			local_irq_save(flags);
+			relay_set_buf_dentry(chan->buf[i], dentry);
+			local_irq_restore(flags);
+		} else {
+			disp.buf = chan->buf[i];
+			disp.dentry = dentry;
+			smp_mb();
+			/* relay_channels_mutex must be held, so wait. */
+			err = smp_call_function_single(i,
+						       __relay_set_buf_dentry,
+						       &disp, 1);
+		}
+		if (unlikely(err))
+			break;
+	}
+	put_cpu();
+	mutex_unlock(&relay_channels_mutex);
+
+	return err;
+}
+
+/**
+ *	relay_switch_subbuf - switch to a new sub-buffer
+ *	@buf: channel buffer
+ *	@length: size of current event
+ *
+ *	Returns either the length passed in or 0 if full.
+ *
+ *	Performs sub-buffer-switch tasks such as invoking callbacks,
+ *	updating padding counts, waking up readers, etc.
+ */ +size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) +{ + void *old, *new; + size_t old_subbuf, new_subbuf; + + if (unlikely(length > buf->chan->subbuf_size)) + goto toobig; + + if (buf->offset != buf->chan->subbuf_size + 1) { + buf->prev_padding = buf->chan->subbuf_size - buf->offset; + old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; + buf->padding[old_subbuf] = buf->prev_padding; + buf->subbufs_produced++; + if (buf->dentry) + buf->dentry->d_inode->i_size += + buf->chan->subbuf_size - + buf->padding[old_subbuf]; + else + buf->early_bytes += buf->chan->subbuf_size - + buf->padding[old_subbuf]; + smp_mb(); + if (waitqueue_active(&buf->read_wait)) + /* + * Calling wake_up_interruptible() from here + * will deadlock if we happen to be logging + * from the scheduler (trying to re-grab + * rq->lock), so defer it. + */ + mod_timer(&buf->timer, jiffies + 1); + } + + old = buf->data; + new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; + new = buf->start + new_subbuf * buf->chan->subbuf_size; + buf->offset = 0; + if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { + buf->offset = buf->chan->subbuf_size + 1; + return 0; + } + buf->data = new; + buf->padding[new_subbuf] = 0; + + if (unlikely(length + buf->offset > buf->chan->subbuf_size)) + goto toobig; + + return length; + +toobig: + buf->chan->last_toobig = length; + return 0; +} +EXPORT_SYMBOL_GPL(relay_switch_subbuf); + +/** + * relay_subbufs_consumed - update the buffer's sub-buffers-consumed count + * @chan: the channel + * @cpu: the cpu associated with the channel buffer to update + * @subbufs_consumed: number of sub-buffers to add to current buf's count + * + * Adds to the channel buffer's consumed sub-buffer count. + * subbufs_consumed should be the number of sub-buffers newly consumed, + * not the total consumed. + * + * NOTE. Kernel clients don't need to call this function if the channel + * mode is 'overwrite'. 
+ */
+void relay_subbufs_consumed(struct rchan *chan,
+			    unsigned int cpu,
+			    size_t subbufs_consumed)
+{
+	struct rchan_buf *buf;
+
+	if (!chan)
+		return;
+
+	/* silently ignore a bad cpu, a missing buffer or an oversized count */
+	if (cpu >= NR_CPUS || !chan->buf[cpu] ||
+					subbufs_consumed > chan->n_subbufs)
+		return;
+
+	buf = chan->buf[cpu];
+	/* clamp so the consumed count never runs ahead of the produced count */
+	if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
+		buf->subbufs_consumed = buf->subbufs_produced;
+	else
+		buf->subbufs_consumed += subbufs_consumed;
+}
+EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
+
+/**
+ *	relay_close - close the channel
+ *	@chan: the channel
+ *
+ *	Closes all channel buffers, removes the channel from the list of
+ *	open channels and drops the channel reference.  A warning is
+ *	logged if any item was too big for a sub-buffer.  A NULL @chan
+ *	is ignored.
+ */
+void relay_close(struct rchan *chan)
+{
+	unsigned int i;
+
+	if (!chan)
+		return;
+
+	mutex_lock(&relay_channels_mutex);
+	/* a global channel has a single buffer, kept in slot 0 */
+	if (chan->is_global && chan->buf[0])
+		relay_close_buf(chan->buf[0]);
+	else
+		for_each_possible_cpu(i)
+			if (chan->buf[i])
+				relay_close_buf(chan->buf[i]);
+
+	if (chan->last_toobig)
+		printk(KERN_WARNING "relay: one or more items not logged "
+		       "[item size (%Zd) > sub-buffer size (%Zd)]\n",
+		       chan->last_toobig, chan->subbuf_size);
+
+	list_del(&chan->list);
+	kref_put(&chan->kref, relay_destroy_channel);
+	mutex_unlock(&relay_channels_mutex);
+}
+EXPORT_SYMBOL_GPL(relay_close);
+
+/**
+ *	relay_flush - flush the channel
+ *	@chan: the channel
+ *
+ *	Flushes all channel buffers, i.e. forces buffer switch.  A NULL
+ *	@chan is ignored.
+ */
+void relay_flush(struct rchan *chan)
+{
+	unsigned int i;
+
+	if (!chan)
+		return;
+
+	/* a global channel has a single buffer, kept in slot 0 */
+	if (chan->is_global && chan->buf[0]) {
+		relay_switch_subbuf(chan->buf[0], 0);
+		return;
+	}
+
+	mutex_lock(&relay_channels_mutex);
+	for_each_possible_cpu(i)
+		if (chan->buf[i])
+			relay_switch_subbuf(chan->buf[i], 0);
+	mutex_unlock(&relay_channels_mutex);
+}
+EXPORT_SYMBOL_GPL(relay_flush);
+
+/**
+ *	relay_file_open - open file op for relay files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Increments the channel buffer refcount.
+ */
+static int relay_file_open(struct inode *inode, struct file *filp)
+{
+	/* the channel buffer is taken from the inode's private pointer */
+	struct rchan_buf *buf = inode->i_private;
+	kref_get(&buf->kref);
+	filp->private_data = buf;
+
+	return nonseekable_open(inode, filp);
+}
+
+/**
+ *	relay_file_mmap - mmap file op for relay files
+ *	@filp: the file
+ *	@vma: the vma describing what to map
+ *
+ *	Calls upon relay_mmap_buf() to map the file into user space.
+ */
+static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct rchan_buf *buf = filp->private_data;
+	return relay_mmap_buf(buf, vma);
+}
+
+/**
+ *	relay_file_poll - poll file op for relay files
+ *	@filp: the file
+ *	@wait: poll table
+ *
+ *	Poll implementation.  Returns %POLLERR once the buffer has been
+ *	finalized.  Otherwise, for files opened for reading, registers on
+ *	the buffer's read wait queue and reports %POLLIN | %POLLRDNORM
+ *	when the buffer is not empty.
+ */
+static unsigned int relay_file_poll(struct file *filp, poll_table *wait)
+{
+	unsigned int mask = 0;
+	struct rchan_buf *buf = filp->private_data;
+
+	if (buf->finalized)
+		return POLLERR;
+
+	if (filp->f_mode & FMODE_READ) {
+		poll_wait(filp, &buf->read_wait, wait);
+		if (!relay_buf_empty(buf))
+			mask |= POLLIN | POLLRDNORM;
+	}
+
+	return mask;
+}
+
+/**
+ *	relay_file_release - release file op for relay files
+ *	@inode: the inode
+ *	@filp: the file
+ *
+ *	Decrements the channel buffer refcount, as the filesystem is
+ *	no longer using it.
+ */ +static int relay_file_release(struct inode *inode, struct file *filp) +{ + struct rchan_buf *buf = filp->private_data; + kref_put(&buf->kref, relay_remove_buf); + + return 0; +} + +/* + * relay_file_read_consume - update the consumed count for the buffer + */ +static void relay_file_read_consume(struct rchan_buf *buf, + size_t read_pos, + size_t bytes_consumed) +{ + size_t subbuf_size = buf->chan->subbuf_size; + size_t n_subbufs = buf->chan->n_subbufs; + size_t read_subbuf; + + if (buf->subbufs_produced == buf->subbufs_consumed && + buf->offset == buf->bytes_consumed) + return; + + if (buf->bytes_consumed + bytes_consumed > subbuf_size) { + relay_subbufs_consumed(buf->chan, buf->cpu, 1); + buf->bytes_consumed = 0; + } + + buf->bytes_consumed += bytes_consumed; + if (!read_pos) + read_subbuf = buf->subbufs_consumed % n_subbufs; + else + read_subbuf = read_pos / buf->chan->subbuf_size; + if (buf->bytes_consumed + buf->padding[read_subbuf] == subbuf_size) { + if ((read_subbuf == buf->subbufs_produced % n_subbufs) && + (buf->offset == subbuf_size)) + return; + relay_subbufs_consumed(buf->chan, buf->cpu, 1); + buf->bytes_consumed = 0; + } +} + +/* + * relay_file_read_avail - boolean, are there unconsumed bytes available? 
+ */ +static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) +{ + size_t subbuf_size = buf->chan->subbuf_size; + size_t n_subbufs = buf->chan->n_subbufs; + size_t produced = buf->subbufs_produced; + size_t consumed = buf->subbufs_consumed; + + relay_file_read_consume(buf, read_pos, 0); + + consumed = buf->subbufs_consumed; + + if (unlikely(buf->offset > subbuf_size)) { + if (produced == consumed) + return 0; + return 1; + } + + if (unlikely(produced - consumed >= n_subbufs)) { + consumed = produced - n_subbufs + 1; + buf->subbufs_consumed = consumed; + buf->bytes_consumed = 0; + } + + produced = (produced % n_subbufs) * subbuf_size + buf->offset; + consumed = (consumed % n_subbufs) * subbuf_size + buf->bytes_consumed; + + if (consumed > produced) + produced += n_subbufs * subbuf_size; + + if (consumed == produced) { + if (buf->offset == subbuf_size && + buf->subbufs_produced > buf->subbufs_consumed) + return 1; + return 0; + } + + return 1; +} + +/** + * relay_file_read_subbuf_avail - return bytes available in sub-buffer + * @read_pos: file read position + * @buf: relay channel buffer + */ +static size_t relay_file_read_subbuf_avail(size_t read_pos, + struct rchan_buf *buf) +{ + size_t padding, avail = 0; + size_t read_subbuf, read_offset, write_subbuf, write_offset; + size_t subbuf_size = buf->chan->subbuf_size; + + write_subbuf = (buf->data - buf->start) / subbuf_size; + write_offset = buf->offset > subbuf_size ? 
subbuf_size : buf->offset; + read_subbuf = read_pos / subbuf_size; + read_offset = read_pos % subbuf_size; + padding = buf->padding[read_subbuf]; + + if (read_subbuf == write_subbuf) { + if (read_offset + padding < write_offset) + avail = write_offset - (read_offset + padding); + } else + avail = (subbuf_size - padding) - read_offset; + + return avail; +} + +/** + * relay_file_read_start_pos - find the first available byte to read + * @read_pos: file read position + * @buf: relay channel buffer + * + * If the @read_pos is in the middle of padding, return the + * position of the first actually available byte, otherwise + * return the original value. + */ +static size_t relay_file_read_start_pos(size_t read_pos, + struct rchan_buf *buf) +{ + size_t read_subbuf, padding, padding_start, padding_end; + size_t subbuf_size = buf->chan->subbuf_size; + size_t n_subbufs = buf->chan->n_subbufs; + size_t consumed = buf->subbufs_consumed % n_subbufs; + + if (!read_pos) + read_pos = consumed * subbuf_size + buf->bytes_consumed; + read_subbuf = read_pos / subbuf_size; + padding = buf->padding[read_subbuf]; + padding_start = (read_subbuf + 1) * subbuf_size - padding; + padding_end = (read_subbuf + 1) * subbuf_size; + if (read_pos >= padding_start && read_pos < padding_end) { + read_subbuf = (read_subbuf + 1) % n_subbufs; + read_pos = read_subbuf * subbuf_size; + } + + return read_pos; +} + +/** + * relay_file_read_end_pos - return the new read position + * @read_pos: file read position + * @buf: relay channel buffer + * @count: number of bytes to be read + */ +static size_t relay_file_read_end_pos(struct rchan_buf *buf, + size_t read_pos, + size_t count) +{ + size_t read_subbuf, padding, end_pos; + size_t subbuf_size = buf->chan->subbuf_size; + size_t n_subbufs = buf->chan->n_subbufs; + + read_subbuf = read_pos / subbuf_size; + padding = buf->padding[read_subbuf]; + if (read_pos % subbuf_size + count + padding == subbuf_size) + end_pos = (read_subbuf + 1) * subbuf_size; + else + 
end_pos = read_pos + count; + if (end_pos >= subbuf_size * n_subbufs) + end_pos = 0; + + return end_pos; +} + +/* + * subbuf_read_actor - read up to one subbuf's worth of data + */ +static int subbuf_read_actor(size_t read_start, + struct rchan_buf *buf, + size_t avail, + read_descriptor_t *desc, + read_actor_t actor) +{ + void *from; + int ret = 0; + + from = buf->start + read_start; + ret = avail; + if (copy_to_user(desc->arg.buf, from, avail)) { + desc->error = -EFAULT; + ret = 0; + } + desc->arg.data += ret; + desc->written += ret; + desc->count -= ret; + + return ret; +} + +typedef int (*subbuf_actor_t) (size_t read_start, + struct rchan_buf *buf, + size_t avail, + read_descriptor_t *desc, + read_actor_t actor); + +/* + * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries + */ +static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, + subbuf_actor_t subbuf_actor, + read_actor_t actor, + read_descriptor_t *desc) +{ + struct rchan_buf *buf = filp->private_data; + size_t read_start, avail; + int ret; + + if (!desc->count) + return 0; + + mutex_lock(&filp->f_path.dentry->d_inode->i_mutex); + do { + if (!relay_file_read_avail(buf, *ppos)) + break; + + read_start = relay_file_read_start_pos(*ppos, buf); + avail = relay_file_read_subbuf_avail(read_start, buf); + if (!avail) + break; + + avail = min(desc->count, avail); + ret = subbuf_actor(read_start, buf, avail, desc, actor); + if (desc->error < 0) + break; + + if (ret) { + relay_file_read_consume(buf, read_start, ret); + *ppos = relay_file_read_end_pos(buf, read_start, ret); + } + } while (desc->count && ret); + mutex_unlock(&filp->f_path.dentry->d_inode->i_mutex); + + return desc->written; +} + +static ssize_t relay_file_read(struct file *filp, + char __user *buffer, + size_t count, + loff_t *ppos) +{ + read_descriptor_t desc; + desc.written = 0; + desc.count = count; + desc.arg.buf = buffer; + desc.error = 0; + return relay_file_read_subbufs(filp, ppos, subbuf_read_actor, + 
NULL, &desc); +} + +static void relay_consume_bytes(struct rchan_buf *rbuf, int bytes_consumed) +{ + rbuf->bytes_consumed += bytes_consumed; + + if (rbuf->bytes_consumed >= rbuf->chan->subbuf_size) { + relay_subbufs_consumed(rbuf->chan, rbuf->cpu, 1); + rbuf->bytes_consumed %= rbuf->chan->subbuf_size; + } +} + +static void relay_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct rchan_buf *rbuf; + + rbuf = (struct rchan_buf *)page_private(buf->page); + relay_consume_bytes(rbuf, buf->private); +} + +static const struct pipe_buf_operations relay_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = relay_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) +{ +} + +/* + * subbuf_splice_actor - splice up to one subbuf's worth of data + */ +static ssize_t subbuf_splice_actor(struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags, + int *nonpad_ret) +{ + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; + struct rchan_buf *rbuf = in->private_data; + unsigned int subbuf_size = rbuf->chan->subbuf_size; + uint64_t pos = (uint64_t) *ppos; + uint32_t alloc_size = (uint32_t) rbuf->chan->alloc_size; + size_t read_start = (size_t) do_div(pos, alloc_size); + size_t read_subbuf = read_start / subbuf_size; + size_t padding = rbuf->padding[read_subbuf]; + size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages, + .nr_pages = 0, + .partial = partial, + .flags = flags, + .ops = &relay_pipe_buf_ops, + .spd_release = relay_page_release, + }; + ssize_t ret; + + if (rbuf->subbufs_produced == rbuf->subbufs_consumed) + return 0; + if 
(splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + /* + * Adjust read len, if longer than what is available + */ + if (len > (subbuf_size - read_start % subbuf_size)) + len = subbuf_size - read_start % subbuf_size; + + subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; + pidx = (read_start / PAGE_SIZE) % subbuf_pages; + poff = read_start & ~PAGE_MASK; + nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers); + + for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { + unsigned int this_len, this_end, private; + unsigned int cur_pos = read_start + total_len; + + if (!len) + break; + + this_len = min_t(unsigned long, len, PAGE_SIZE - poff); + private = this_len; + + spd.pages[spd.nr_pages] = rbuf->page_array[pidx]; + spd.partial[spd.nr_pages].offset = poff; + + this_end = cur_pos + this_len; + if (this_end >= nonpad_end) { + this_len = nonpad_end - cur_pos; + private = this_len + padding; + } + spd.partial[spd.nr_pages].len = this_len; + spd.partial[spd.nr_pages].private = private; + + len -= this_len; + total_len += this_len; + poff = 0; + pidx = (pidx + 1) % subbuf_pages; + + if (this_end >= nonpad_end) { + spd.nr_pages++; + break; + } + } + + ret = 0; + if (!spd.nr_pages) + goto out; + + ret = *nonpad_ret = splice_to_pipe(pipe, &spd); + if (ret < 0 || ret < total_len) + goto out; + + if (read_start + ret == nonpad_end) + ret += padding; + +out: + splice_shrink_spd(pipe, &spd); + return ret; +} + +static ssize_t relay_file_splice_read(struct file *in, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + ssize_t spliced; + int ret; + int nonpad_ret = 0; + + ret = 0; + spliced = 0; + + while (len && !spliced) { + ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); + if (ret < 0) + break; + else if (!ret) { + if (flags & SPLICE_F_NONBLOCK) + ret = -EAGAIN; + break; + } + + *ppos += ret; + if (ret > len) + len = 0; + else + len -= ret; + spliced += nonpad_ret; + nonpad_ret = 0; + } + + if (spliced) + 
return spliced; + + return ret; +} + +const struct file_operations relay_file_operations = { + .open = relay_file_open, + .poll = relay_file_poll, + .mmap = relay_file_mmap, + .read = relay_file_read, + .llseek = no_llseek, + .release = relay_file_release, + .splice_read = relay_file_splice_read, +}; +EXPORT_SYMBOL_GPL(relay_file_operations); + +static __init int relay_init(void) +{ + + hotcpu_notifier(relay_hotcpu_callback, 0); + return 0; +} + +early_initcall(relay_init); diff --git a/kernel/res_counter.c b/kernel/res_counter.c new file mode 100644 index 00000000..34683efa --- /dev/null +++ b/kernel/res_counter.c @@ -0,0 +1,191 @@ +/* + * resource cgroups + * + * Copyright 2007 OpenVZ SWsoft Inc + * + * Author: Pavel Emelianov + * + */ + +#include +#include +#include +#include +#include +#include + +void res_counter_init(struct res_counter *counter, struct res_counter *parent) +{ + spin_lock_init(&counter->lock); + counter->limit = RESOURCE_MAX; + counter->soft_limit = RESOURCE_MAX; + counter->parent = parent; +} + +int res_counter_charge_locked(struct res_counter *counter, unsigned long val) +{ + if (counter->usage + val > counter->limit) { + counter->failcnt++; + return -ENOMEM; + } + + counter->usage += val; + if (counter->usage > counter->max_usage) + counter->max_usage = counter->usage; + return 0; +} + +int res_counter_charge(struct res_counter *counter, unsigned long val, + struct res_counter **limit_fail_at) +{ + int ret; + unsigned long flags; + struct res_counter *c, *u; + + *limit_fail_at = NULL; + local_irq_save(flags); + for (c = counter; c != NULL; c = c->parent) { + spin_lock(&c->lock); + ret = res_counter_charge_locked(c, val); + spin_unlock(&c->lock); + if (ret < 0) { + *limit_fail_at = c; + goto undo; + } + } + ret = 0; + goto done; +undo: + for (u = counter; u != c; u = u->parent) { + spin_lock(&u->lock); + res_counter_uncharge_locked(u, val); + spin_unlock(&u->lock); + } +done: + local_irq_restore(flags); + return ret; +} + +void 
res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
+{
+	/* Uncharging more than is charged is a caller bug: warn and clamp. */
+	if (WARN_ON(counter->usage < val))
+		val = counter->usage;
+
+	counter->usage -= val;
+}
+
+/*
+ * Uncharge @val from @counter and from every ancestor reachable through
+ * ->parent. Interrupts stay disabled for the whole walk; each counter's
+ * lock is held only around its own update.
+ */
+void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+{
+	unsigned long flags;
+	struct res_counter *c;
+
+	local_irq_save(flags);
+	for (c = counter; c != NULL; c = c->parent) {
+		spin_lock(&c->lock);
+		res_counter_uncharge_locked(c, val);
+		spin_unlock(&c->lock);
+	}
+	local_irq_restore(flags);
+}
+
+
+/*
+ * Map a RES_* member id to the address of the matching field of @counter.
+ * An unknown id is a programming error and ends in BUG().
+ */
+static inline unsigned long long *
+res_counter_member(struct res_counter *counter, int member)
+{
+	switch (member) {
+	case RES_USAGE:
+		return &counter->usage;
+	case RES_MAX_USAGE:
+		return &counter->max_usage;
+	case RES_LIMIT:
+		return &counter->limit;
+	case RES_FAILCNT:
+		return &counter->failcnt;
+	case RES_SOFT_LIMIT:
+		return &counter->soft_limit;
+	}
+
+	BUG();
+	return NULL;
+}
+
+/*
+ * Format one member of @counter as text and copy it to user space.
+ *
+ * @read_strategy, when given, formats the value into the local buffer and
+ * returns the number of bytes it produced; otherwise the value is printed
+ * as an unsigned decimal followed by a newline. The text is handed to
+ * simple_read_from_buffer(), whose result is returned.
+ *
+ * NOTE(review): the scratch buffer is 64 bytes and the length written by
+ * @read_strategy is not checked here - callers' strategies must stay
+ * within that size.
+ */
+ssize_t res_counter_read(struct res_counter *counter, int member,
+		const char __user *userbuf, size_t nbytes, loff_t *pos,
+		int (*read_strategy)(unsigned long long val, char *st_buf))
+{
+	unsigned long long *val;
+	char buf[64], *s;
+
+	s = buf;
+	val = res_counter_member(counter, member);
+	if (read_strategy)
+		s += read_strategy(*val, s);
+	else
+		s += sprintf(s, "%llu\n", *val);
+	return simple_read_from_buffer((void __user *)userbuf, nbytes,
+			pos, buf, s - buf);
+}
+
+/*
+ * Return the current value of one member of @counter. When BITS_PER_LONG
+ * is 32 the read is done under the counter lock; otherwise the member is
+ * read directly.
+ */
+#if BITS_PER_LONG == 32
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+	unsigned long flags;
+	u64 ret;
+
+	spin_lock_irqsave(&counter->lock, flags);
+	ret = *res_counter_member(counter, member);
+	spin_unlock_irqrestore(&counter->lock, flags);
+
+	return ret;
+}
+#else
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+	return *res_counter_member(counter, member);
+}
+#endif
+
+/*
+ * Parse a limit written by the user into *@res.
+ *
+ * "-1" selects RESOURCE_MAX (unlimited); any other string starting with
+ * '-' is rejected. Everything else goes through memparse() and is rounded
+ * up to a page boundary. Returns 0 on success, -EINVAL on malformed input.
+ */
+int res_counter_memparse_write_strategy(const char *buf,
+					unsigned long long *res)
+{
+	char *end;
+
+	/* return RESOURCE_MAX(unlimited) if "-1" is specified */
+	if (*buf == '-') {
+		*res = simple_strtoull(buf + 1, &end, 10);
+		if (*res != 1 || *end != '\0')
+			return -EINVAL;
+		*res = RESOURCE_MAX;
+		return 0;
+	}
+
+	/* FIXME - make memparse() take const char* args */
+	*res = memparse((char *)buf, &end);
+	if (*end != '\0')
+		return -EINVAL;
+
+	/*
+	 * PAGE_ALIGN() rounds up, so a value within a page of the top of the
+	 * range wraps around to a tiny number. Treat that as "unlimited"
+	 * instead of silently installing a near-zero limit.
+	 */
+	if (PAGE_ALIGN(*res) >= *res)
+		*res = PAGE_ALIGN(*res);
+	else
+		*res = RESOURCE_MAX;
+	return 0;
+}
+
+/*
+ * Parse @buf and store the result into one member of @counter.
+ *
+ * With a @write_strategy the parsing is delegated to it (any non-zero
+ * return is reported as -EINVAL); without one @buf must be a plain
+ * decimal number with no trailing characters. The store itself is done
+ * under the counter lock with interrupts disabled.
+ */
+int res_counter_write(struct res_counter *counter, int member,
+		      const char *buf, write_strategy_fn write_strategy)
+{
+	char *end;
+	unsigned long flags;
+	unsigned long long tmp, *val;
+
+	if (write_strategy) {
+		if (write_strategy(buf, &tmp))
+			return -EINVAL;
+	} else {
+		tmp = simple_strtoull(buf, &end, 10);
+		if (*end != '\0')
+			return -EINVAL;
+	}
+	spin_lock_irqsave(&counter->lock, flags);
+	val = res_counter_member(counter, member);
+	*val = tmp;
+	spin_unlock_irqrestore(&counter->lock, flags);
+	return 0;
+}
diff --git a/kernel/resource.c b/kernel/resource.c
new file mode 100644
index 00000000..3ff40178
--- /dev/null
+++ b/kernel/resource.c
@@ -0,0 +1,1132 @@
+/*
+ * linux/kernel/resource.c
+ *
+ * Copyright (C) 1999 Linus Torvalds
+ * Copyright (C) 1999 Martin Mares
+ *
+ * Arbitrary resource management.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct resource ioport_resource = { + .name = "PCI IO", + .start = 0, + .end = IO_SPACE_LIMIT, + .flags = IORESOURCE_IO, +}; +EXPORT_SYMBOL(ioport_resource); + +struct resource iomem_resource = { + .name = "PCI mem", + .start = 0, + .end = -1, + .flags = IORESOURCE_MEM, +}; +EXPORT_SYMBOL(iomem_resource); + +/* constraints to be met while allocating resources */ +struct resource_constraint { + resource_size_t min, max, align; + resource_size_t (*alignf)(void *, const struct resource *, + resource_size_t, resource_size_t); + void *alignf_data; +}; + +static DEFINE_RWLOCK(resource_lock); + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct resource *p = v; + (*pos)++; + if (p->child) + return p->child; + while (!p->sibling && p->parent) + p = p->parent; + return p->sibling; +} + +#ifdef CONFIG_PROC_FS + +enum { MAX_IORES_LEVEL = 5 }; + +static void *r_start(struct seq_file *m, loff_t *pos) + __acquires(resource_lock) +{ + struct resource *p = m->private; + loff_t l = 0; + read_lock(&resource_lock); + for (p = p->child; p && l < *pos; p = r_next(m, p, &l)) + ; + return p; +} + +static void r_stop(struct seq_file *m, void *v) + __releases(resource_lock) +{ + read_unlock(&resource_lock); +} + +static int r_show(struct seq_file *m, void *v) +{ + struct resource *root = m->private; + struct resource *r = v, *p; + int width = root->end < 0x10000 ? 4 : 8; + int depth; + + for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) + if (p->parent == root) + break; + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", + depth * 2, "", + width, (unsigned long long) r->start, + width, (unsigned long long) r->end, + r->name ? 
r->name : ""); + return 0; +} + +static const struct seq_operations resource_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = r_show, +}; + +static int ioports_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &resource_op); + if (!res) { + struct seq_file *m = file->private_data; + m->private = &ioport_resource; + } + return res; +} + +static int iomem_open(struct inode *inode, struct file *file) +{ + int res = seq_open(file, &resource_op); + if (!res) { + struct seq_file *m = file->private_data; + m->private = &iomem_resource; + } + return res; +} + +static const struct file_operations proc_ioports_operations = { + .open = ioports_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations proc_iomem_operations = { + .open = iomem_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init ioresources_init(void) +{ + proc_create("ioports", 0, NULL, &proc_ioports_operations); + proc_create("iomem", 0, NULL, &proc_iomem_operations); + return 0; +} +__initcall(ioresources_init); + +#endif /* CONFIG_PROC_FS */ + +/* Return the conflict entry if you can't request it */ +static struct resource * __request_resource(struct resource *root, struct resource *new) +{ + resource_size_t start = new->start; + resource_size_t end = new->end; + struct resource *tmp, **p; + + if (end < start) + return root; + if (start < root->start) + return root; + if (end > root->end) + return root; + p = &root->child; + for (;;) { + tmp = *p; + if (!tmp || tmp->start > end) { + new->sibling = tmp; + *p = new; + new->parent = root; + return NULL; + } + p = &tmp->sibling; + if (tmp->end < start) + continue; + return tmp; + } +} + +static int __release_resource(struct resource *old) +{ + struct resource *tmp, **p; + + p = &old->parent->child; + for (;;) { + tmp = *p; + if (!tmp) + break; + if (tmp == old) { + *p = tmp->sibling; + old->parent = NULL; + 
return 0; + } + p = &tmp->sibling; + } + return -EINVAL; +} + +static void __release_child_resources(struct resource *r) +{ + struct resource *tmp, *p; + resource_size_t size; + + p = r->child; + r->child = NULL; + while (p) { + tmp = p; + p = p->sibling; + + tmp->parent = NULL; + tmp->sibling = NULL; + __release_child_resources(tmp); + + printk(KERN_DEBUG "release child resource %pR\n", tmp); + /* need to restore size, and keep flags */ + size = resource_size(tmp); + tmp->start = 0; + tmp->end = size - 1; + } +} + +void release_child_resources(struct resource *r) +{ + write_lock(&resource_lock); + __release_child_resources(r); + write_unlock(&resource_lock); +} + +/** + * request_resource_conflict - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, conflict resource on error. + */ +struct resource *request_resource_conflict(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + write_lock(&resource_lock); + conflict = __request_resource(root, new); + write_unlock(&resource_lock); + return conflict; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + conflict = request_resource_conflict(root, new); + return conflict ? 
-EBUSY : 0; +} + +EXPORT_SYMBOL(request_resource); + +/** + * release_resource - release a previously reserved resource + * @old: resource pointer + */ +int release_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old); + write_unlock(&resource_lock); + return retval; +} + +EXPORT_SYMBOL(release_resource); + +#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) +/* + * Finds the lowest memory reosurce exists within [res->start.res->end) + * the caller must specify res->start, res->end, res->flags and "name". + * If found, returns 0, res is overwritten, if not found, returns -1. + */ +static int find_next_system_ram(struct resource *res, char *name) +{ + resource_size_t start, end; + struct resource *p; + + BUG_ON(!res); + + start = res->start; + end = res->end; + BUG_ON(start >= end); + + read_lock(&resource_lock); + for (p = iomem_resource.child; p ; p = p->sibling) { + /* system ram is just marked as IORESOURCE_MEM */ + if (p->flags != res->flags) + continue; + if (name && strcmp(p->name, name)) + continue; + if (p->start > end) { + p = NULL; + break; + } + if ((p->end >= start) && (p->start < end)) + break; + } + read_unlock(&resource_lock); + if (!p) + return -1; + /* copy data */ + if (res->start < p->start) + res->start = p->start; + if (res->end > p->end) + res->end = p->end; + return 0; +} + +/* + * This function calls callback against all memory range of "System RAM" + * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. + * Now, this function is only for "System RAM". 
+ */
+int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
+		void *arg, int (*func)(unsigned long, unsigned long, void *))
+{
+	struct resource res;
+	unsigned long pfn, end_pfn;
+	u64 orig_end;
+	int ret = -1;	/* returned unchanged when @func is never called */
+
+	res.start = (u64) start_pfn << PAGE_SHIFT;
+	res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
+	res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
+	orig_end = res.end;
+	while ((res.start < res.end) &&
+		(find_next_system_ram(&res, "System RAM") >= 0)) {
+		/* Only whole pages inside the range that was found are reported. */
+		pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		end_pfn = (res.end + 1) >> PAGE_SHIFT;
+		if (end_pfn > pfn)
+			ret = (*func)(pfn, end_pfn - pfn, arg);
+		/*
+		 * NOTE(review): ret still holds the initial -1 until a
+		 * callback has run, so a first range that contains no whole
+		 * page ends the walk here.
+		 */
+		if (ret)
+			break;
+		/* Resume the search right behind the range just handled. */
+		res.start = res.end + 1;
+		res.end = orig_end;
+	}
+	return ret;
+}
+
+#endif
+
+/* walk_system_ram_range() callback: any hit means "this is RAM". */
+static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
+{
+	return 1;
+}
+/*
+ * This generic page_is_ram() returns true if specified address is
+ * registered as "System RAM" in iomem_resource list.
+ */
+int __weak page_is_ram(unsigned long pfn)
+{
+	return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
+}
+
+/* Weak default: no architecture-specific reservations to carve out. */
+void __weak arch_remove_reservations(struct resource *avail)
+{
+}
+
+/* Default alignment callback: take the start of the available window. */
+static resource_size_t simple_align_resource(void *data,
+					     const struct resource *avail,
+					     resource_size_t size,
+					     resource_size_t align)
+{
+	return avail->start;
+}
+
+/* Shrink @res so that it lies within [@min, @max]. */
+static void resource_clip(struct resource *res, resource_size_t min,
+			  resource_size_t max)
+{
+	if (res->start < min)
+		res->start = min;
+	if (res->end > max)
+		res->end = max;
+}
+
+/* True when [res2->start, res2->end] lies entirely inside @res1. */
+static bool resource_contains(struct resource *res1, struct resource *res2)
+{
+	return res1->start <= res2->start && res1->end >= res2->end;
+}
+
+/*
+ * Find empty slot in the resource tree with the given range and
+ * alignment constraints
+ *
+ * Walks the gaps between the children of @root in address order. @old,
+ * when not NULL, is a child whose own range counts as free space (used
+ * when reallocating it). On success the chosen range is stored in
+ * @new->start / @new->end and 0 is returned; -EBUSY means no gap fits.
+ */
+static int __find_resource(struct resource *root, struct resource *old,
+			 struct resource *new,
+			 resource_size_t size,
+			 struct resource_constraint *constraint)
+{
+	struct resource *this = root->child;
+	struct resource tmp = *new, avail, alloc;
+
+	tmp.flags = new->flags;
+	tmp.start = root->start;
+	/*
+	 * Skip past an allocated resource that starts at 0, since the assignment
+	 * of this->start - 1 to tmp->end below would cause an underflow.
+	 */
+	if (this && this->start == root->start) {
+		tmp.start = (this == old) ? old->start : this->end + 1;
+		this = this->sibling;
+	}
+	for(;;) {
+		if (this)
+			tmp.end = (this == old) ? this->end : this->start - 1;
+		else
+			tmp.end = root->end;
+
+		/*
+		 * Adjacent siblings leave an empty gap (end just below start).
+		 * Skip it before clipping can reshape it into something that
+		 * looks usable.
+		 */
+		if (tmp.end < tmp.start)
+			goto next;
+
+		resource_clip(&tmp, constraint->min, constraint->max);
+		arch_remove_reservations(&tmp);
+
+		/* Check for overflow after ALIGN() */
+		avail = *new;
+		avail.start = ALIGN(tmp.start, constraint->align);
+		avail.end = tmp.end;
+		if (avail.start >= tmp.start) {
+			alloc.start = constraint->alignf(constraint->alignf_data, &avail,
+					size, constraint->align);
+			alloc.end = alloc.start + size - 1;
+			if (resource_contains(&avail, &alloc)) {
+				new->start = alloc.start;
+				new->end = alloc.end;
+				return 0;
+			}
+		}
+
+next:
+		/*
+		 * Stop once the last child has been passed. Also stop when
+		 * this child reaches the end of the root: this->end + 1 could
+		 * wrap to 0 there and reopen the whole range as "free".
+		 */
+		if (!this || this->end == root->end)
+			break;
+
+		if (this != old)
+			tmp.start = this->end + 1;
+		this = this->sibling;
+	}
+	return -EBUSY;
+}
+
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ */
+static int find_resource(struct resource *root, struct resource *new,
+			resource_size_t size,
+			struct resource_constraint *constraint)
+{
+	return __find_resource(root, NULL, new, size, constraint);
+}
+
+/**
+ * reallocate_resource - allocate a slot in the resource tree given range & alignment.
+ *	The resource will be relocated if the new size cannot be reallocated in the
+ *	current location.
+ *
+ * @root: root resource descriptor
+ * @old: resource descriptor desired by caller
+ * @newsize: new size of the resource descriptor
+ * @constraint: the size and alignment constraints to be met.
+ */
+int reallocate_resource(struct resource *root, struct resource *old,
+			resource_size_t newsize,
+			struct resource_constraint *constraint)
+{
+	struct resource candidate = *old;
+	struct resource *conflict;
+	int err;
+
+	write_lock(&resource_lock);
+
+	err = __find_resource(root, old, &candidate, newsize, constraint);
+	if (err)
+		goto out;
+
+	/*
+	 * If the new range covers the old one, or a childless resource only
+	 * shrinks inside its current range, the position in the tree stays
+	 * the same and only the bounds move.
+	 */
+	if (resource_contains(&candidate, old) ||
+	    (!old->child && resource_contains(old, &candidate))) {
+		old->start = candidate.start;
+		old->end = candidate.end;
+		goto out;
+	}
+
+	/* Anything else would move the resource; not possible with children. */
+	if (old->child) {
+		err = -EBUSY;
+		goto out;
+	}
+
+	/* Unlink, take over the new range and link in again at the new spot. */
+	__release_resource(old);
+	*old = candidate;
+	conflict = __request_resource(root, old);
+	BUG_ON(conflict);
+out:
+	write_unlock(&resource_lock);
+	return err;
+}
+
+
+/**
+ * allocate_resource - find a free range in the resource tree and claim it
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ * @size: requested resource region size
+ * @min: lowest address the allocated range may start at
+ * @max: highest address the allocated range may end at
+ * @align: alignment requested, in bytes
+ * @alignf: alignment function, optional; a default that returns the start
+ *	of the available window is used when NULL
+ * @alignf_data: arbitrary data to pass to the @alignf function
+ *
+ * A resource that already has a parent is handed to reallocate_resource()
+ * with the new size and constraints instead. Returns 0 on success or a
+ * negative error code.
+ */
+int allocate_resource(struct resource *root, struct resource *new,
+		      resource_size_t size, resource_size_t min,
+		      resource_size_t max, resource_size_t align,
+		      resource_size_t (*alignf)(void *,
+						const struct resource *,
+						resource_size_t,
+						resource_size_t),
+		      void *alignf_data)
+{
+	struct resource_constraint constraint = {
+		.min		= min,
+		.max		= max,
+		.align		= align,
+		.alignf		= alignf ? alignf : simple_align_resource,
+		.alignf_data	= alignf_data,
+	};
+	int err;
+
+	/* Already part of a tree: try reallocating with the new constraints. */
+	if (new->parent)
+		return reallocate_resource(root, new, size, &constraint);
+
+	write_lock(&resource_lock);
+	err = find_resource(root, new, size, &constraint);
+	if (err >= 0 && __request_resource(root, new))
+		err = -EBUSY;
+	write_unlock(&resource_lock);
+	return err;
+}
+
+EXPORT_SYMBOL(allocate_resource);
+
+/*
+ * Insert a resource into the resource tree. If successful, return NULL,
+ * otherwise return the conflicting resource (compare to __request_resource())
+ */
+static struct resource * __insert_resource(struct resource *parent, struct resource *new)
+{
+	struct resource *head, *tail, *cur;
+
+	/*
+	 * Descend for as long as the conflicting entry contains @new without
+	 * having exactly the same range.
+	 */
+	for (;;) {
+		head = __request_resource(parent, new);
+		if (!head)
+			return NULL;
+		if (head == parent)
+			return head;
+		if (WARN_ON(head == new))	/* duplicated insertion */
+			return head;
+
+		if (head->start > new->start || head->end < new->end)
+			break;
+		if (head->start == new->start && head->end == new->end)
+			break;
+		parent = head;
+	}
+
+	/* Find the last sibling, starting at the conflict, that @new covers. */
+	tail = head;
+	for (;;) {
+		/* Partial overlap? Bad, and unfixable */
+		if (tail->start < new->start || tail->end > new->end)
+			return tail;
+		if (!tail->sibling || tail->sibling->start > new->end)
+			break;
+		tail = tail->sibling;
+	}
+
+	/* Splice @new in and move the covered run head..tail underneath it. */
+	new->parent = parent;
+	new->sibling = tail->sibling;
+	new->child = head;
+
+	tail->sibling = NULL;
+	for (cur = head; cur; cur = cur->sibling)
+		cur->parent = new;
+
+	if (parent->child == head) {
+		parent->child = new;
+		return NULL;
+	}
+
+	/* Otherwise repoint the sibling that used to precede the run. */
+	cur = parent->child;
+	while (cur->sibling != head)
+		cur = cur->sibling;
+	cur->sibling = new;
+	return NULL;
+}
+
+/**
+ * insert_resource_conflict - Inserts resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, conflict resource if the resource can't be inserted.
+ *
+ * This function is equivalent to request_resource_conflict when no conflict
+ * happens.
If a conflict happens, and the conflicting resources
+ * entirely fit within the range of the new resource, then the new
+ * resource is inserted and the conflicting resources become children of
+ * the new resource.
+ */
+struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
+{
+	struct resource *conflict;
+
+	write_lock(&resource_lock);
+	conflict = __insert_resource(parent, new);
+	write_unlock(&resource_lock);
+	return conflict;
+}
+
+/**
+ * insert_resource - Inserts a resource in the resource tree
+ * @parent: parent of the new resource
+ * @new: new resource to insert
+ *
+ * Returns 0 on success, -EBUSY if the resource can't be inserted.
+ */
+int insert_resource(struct resource *parent, struct resource *new)
+{
+	struct resource *conflict;
+
+	conflict = insert_resource_conflict(parent, new);
+	return conflict ? -EBUSY : 0;
+}
+
+/**
+ * insert_resource_expand_to_fit - Insert a resource into the resource tree
+ * @root: root resource descriptor
+ * @new: new resource to insert
+ *
+ * Insert a resource into the resource tree, possibly expanding it in order
+ * to make it encompass any conflicting resources.
+ *
+ * Does nothing if @new already has a parent. Gives up silently when the
+ * conflict reported is @root itself.
+ */
+void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
+{
+	if (new->parent)
+		return;
+
+	write_lock(&resource_lock);
+	for (;;) {
+		struct resource *conflict;
+
+		conflict = __insert_resource(root, new);
+		if (!conflict)
+			break;
+		if (conflict == root)
+			break;
+
+		/* Ok, expand resource to cover the conflict, then try again .. */
+		if (conflict->start < new->start)
+			new->start = conflict->start;
+		if (conflict->end > new->end)
+			new->end = conflict->end;
+
+		printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
+	}
+	write_unlock(&resource_lock);
+}
+
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
+ * Given an existing resource, change its start and size to match the
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
+ *
+ * A resource that is not linked into a tree (no parent) is only checked
+ * against its own children.
+ */
+int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
+{
+	struct resource *tmp, *parent;
+	resource_size_t end = start + size - 1;
+	int result = -EBUSY;
+
+	write_lock(&resource_lock);
+
+	/* Read the parent under the lock so it matches the tree being checked. */
+	parent = res->parent;
+	if (!parent)
+		goto check_children;
+
+	/* The new range must stay inside the parent ... */
+	if ((start < parent->start) || (end > parent->end))
+		goto out;
+
+	/* ... must not run into the next sibling ... */
+	if (res->sibling && (res->sibling->start <= end))
+		goto out;
+
+	/* ... nor into the previous one. */
+	tmp = parent->child;
+	if (tmp != res) {
+		while (tmp->sibling != res)
+			tmp = tmp->sibling;
+		if (start <= tmp->end)
+			goto out;
+	}
+
+check_children:
+	/* Every existing child has to remain covered. */
+	for (tmp = res->child; tmp; tmp = tmp->sibling) {
+		if ((tmp->start < start) || (tmp->end > end))
+			goto out;
+	}
+
+	res->start = start;
+	res->end = end;
+	result = 0;
+
+ out:
+	write_unlock(&resource_lock);
+	return result;
+}
+
+/*
+ * Reserve [@start, @end] under @root as a busy resource called @name. On
+ * a conflict the request is split around the conflicting entry and the
+ * remaining pieces are reserved recursively; pieces fully covered by a
+ * conflict are dropped. Allocation failure is silently ignored.
+ * Called with resource_lock held for writing (see the wrapper below).
+ */
+static void __init __reserve_region_with_split(struct resource *root,
+		resource_size_t start, resource_size_t end,
+		const char *name)
+{
+	struct resource *parent = root;
+	struct resource *conflict;
+	struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+
+	if (!res)
+		return;
+
+	res->name = name;
+	res->start = start;
+	res->end = end;
+	res->flags = IORESOURCE_BUSY;
+
+	conflict = __request_resource(parent, res);
+	if (!conflict)
+		return;
+
+	/* failed, split and try again */
+	kfree(res);
+
+	/* conflict covered whole area */
+	if (conflict->start <= start && conflict->end >= end)
+		return;
+
+	if (conflict->start > start)
+		__reserve_region_with_split(root, start, conflict->start-1, name);
+	if (conflict->end < end)
+		__reserve_region_with_split(root, conflict->end+1, end, name);
+}
+
+/* Locked wrapper around __reserve_region_with_split(). */
+void __init reserve_region_with_split(struct resource *root,
+		resource_size_t start, resource_size_t end,
+		const char *name)
+{
+	write_lock(&resource_lock);
+	__reserve_region_with_split(root, start, end, name);
+	write_unlock(&resource_lock);
+}
+
+EXPORT_SYMBOL(adjust_resource);
+
+/**
+ * resource_alignment - calculate resource's alignment
+ * @res: resource pointer
+ *
+ * Returns alignment on success, 0 (invalid alignment) on failure.
+ */
+resource_size_t resource_alignment(struct resource *res)
+{
+	switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
+	case IORESOURCE_SIZEALIGN:
+		return resource_size(res);
+	case IORESOURCE_STARTALIGN:
+		return res->start;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * This is compatibility stuff for IO resources.
+ *
+ * Note how this, unlike the above, knows about
+ * the IO flag meanings (busy etc).
+ *
+ * request_region creates a new busy region.
+ *
+ * check_region returns non-zero if the area is already busy.
+ *
+ * release_region releases a matching busy region.
+ */
+
+/* Tasks sleeping until a busy IORESOURCE_MUXED region is released. */
+static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
+ * @flags: IO resource flags
+ *
+ * Returns the newly allocated resource, or NULL if the allocation failed
+ * or the range conflicts with a busy region. If both the conflicting
+ * region and @flags carry IORESOURCE_MUXED, the caller sleeps
+ * uninterruptibly until a muxed region is released and then retries.
+ */
+struct resource * __request_region(struct resource *parent,
+				   resource_size_t start, resource_size_t n,
+				   const char *name, int flags)
+{
+	DECLARE_WAITQUEUE(wait, current);
+	struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+
+	if (!res)
+		return NULL;
+
+	res->name = name;
+	res->start = start;
+	res->end = start + n - 1;
+	res->flags = IORESOURCE_BUSY;
+	res->flags |= flags;
+
+	write_lock(&resource_lock);
+
+	for (;;) {
+		struct resource *conflict;
+
+		conflict = __request_resource(parent, res);
+		if (!conflict)
+			break;
+		/*
+		 * Descend only into non-busy containers. A busy conflict must
+		 * not become the new parent: if we sleep on it below, its
+		 * holder may release and free it, and the retry would then
+		 * walk a stale pointer.
+		 */
+		if (conflict != parent && !(conflict->flags & IORESOURCE_BUSY)) {
+			parent = conflict;
+			continue;
+		}
+		if (conflict->flags & flags & IORESOURCE_MUXED) {
+			add_wait_queue(&muxed_resource_wait, &wait);
+			/*
+			 * Change state before dropping the lock: a release
+			 * can only happen after the unlock, so its wake_up()
+			 * cannot slip in before we are marked as sleeping.
+			 */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			write_unlock(&resource_lock);
+			schedule();
+			remove_wait_queue(&muxed_resource_wait, &wait);
+			write_lock(&resource_lock);
+			continue;
+		}
+		/* Uhhuh, that didn't work out.. */
+		kfree(res);
+		res = NULL;
+		break;
+	}
+	write_unlock(&resource_lock);
+	return res;
+}
+EXPORT_SYMBOL(__request_region);
+
+/**
+ * __check_region - check if a resource region is busy or free
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * Returns 0 if the region is free at the moment it is checked,
+ * returns %-EBUSY if the region is busy.
+ *
+ * NOTE:
+ * This function is deprecated because its use is racy.
+ * Even if it returns 0, a subsequent call to request_region()
+ * may fail because another driver etc. just allocated the region.
+ * Do NOT use it. It will be removed from the kernel.
+ */
+int __check_region(struct resource *parent, resource_size_t start,
+			resource_size_t n)
+{
+	/* Probe by actually claiming the range, then give it straight back. */
+	struct resource *probe = __request_region(parent, start, n,
+						  "check-region", 0);
+
+	if (!probe)
+		return -EBUSY;
+
+	release_resource(probe);
+	kfree(probe);
+	return 0;
+}
+EXPORT_SYMBOL(__check_region);
+
+/**
+ * __release_region - release a previously reserved resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * The described resource region must match a currently busy region.
+ * A warning is printed when no such region is found.
+ */
+void __release_region(struct resource *parent, resource_size_t start,
+			resource_size_t n)
+{
+	resource_size_t end = start + n - 1;
+	struct resource **link = &parent->child;
+	struct resource *res;
+
+	write_lock(&resource_lock);
+
+	while ((res = *link) != NULL) {
+		/* Does not cover the range: look at the next sibling. */
+		if (res->start > start || res->end < end) {
+			link = &res->sibling;
+			continue;
+		}
+		/* A covering entry that is not busy is a container: descend. */
+		if (!(res->flags & IORESOURCE_BUSY)) {
+			link = &res->child;
+			continue;
+		}
+		/* A busy entry must match exactly, otherwise give up. */
+		if (res->start != start || res->end != end)
+			break;
+
+		*link = res->sibling;
+		write_unlock(&resource_lock);
+		if (res->flags & IORESOURCE_MUXED)
+			wake_up(&muxed_resource_wait);
+		kfree(res);
+		return;
+	}
+
+	write_unlock(&resource_lock);
+
+	printk(KERN_WARNING "Trying to free nonexistent resource "
+		"<%016llx-%016llx>\n", (unsigned long long)start,
+		(unsigned long long)end);
+}
+EXPORT_SYMBOL(__release_region);
+
+/*
+ * Managed region resource
+ */
+struct region_devres {
+	struct resource *parent;
+	resource_size_t start;
+	resource_size_t n;
+};
+
+/* devres release callback: give the recorded region back. */
+static void devm_region_release(struct device *dev, void *res)
+{
+	struct region_devres *rec = res;
+
+	__release_region(rec->parent, rec->start, rec->n);
+}
+
+/* devres match callback: non-zero when parent, start and size all agree. */
+static int devm_region_match(struct device *dev, void *res, void *match_data)
+{
+	struct region_devres *rec = res;
+	struct region_devres *wanted = match_data;
+
+	return rec->parent == wanted->parent &&
+		rec->start == wanted->start && rec->n == wanted->n;
+}
+
+/*
+ * Like __request_region() (with no extra flags), but the region is also
+ * recorded as a managed resource of @dev. Returns the new resource, or
+ * NULL if either the bookkeeping allocation or the request itself fails.
+ */
+struct resource * __devm_request_region(struct device *dev,
+				struct resource *parent, resource_size_t start,
+				resource_size_t n, const char *name)
+{
+	struct region_devres *dr;
+	struct resource *res;
+
+	dr = devres_alloc(devm_region_release, sizeof(struct region_devres),
+			  GFP_KERNEL);
+	if (!dr)
+		return NULL;
+
+	dr->parent = parent;
+	dr->start = start;
+	dr->n = n;
+
+	res = __request_region(parent, start, n, name, 0);
+	if (!res) {
+		devres_free(dr);
+		return NULL;
+	}
+
+	devres_add(dev, dr);
+	return res;
+}
+EXPORT_SYMBOL(__devm_request_region);
+
+/*
+ * Release a region obtained through __devm_request_region() and drop its
+ * managed-resource record; warns if devres_destroy() reports a failure.
+ */
+void __devm_release_region(struct device *dev, struct resource *parent,
+			   resource_size_t start, resource_size_t n)
+{
+	struct region_devres match_data = {
+		.parent	= parent,
+		.start	= start,
+		.n	= n,
+	};
+
+	__release_region(parent, start, n);
+	WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match,
+			       &match_data));
+}
+EXPORT_SYMBOL(__devm_release_region);
+
+/*
+ * Called from init/main.c to reserve IO ports.
+ */
+#define MAXRESERVE 4
+static int __init reserve_setup(char *str)
+{
+	static int reserved;
+	static struct resource reserve[MAXRESERVE];
+	unsigned int io_start, io_num;
+
+	/*
+	 * Consume "start,count" pairs for as long as get_option() returns 2
+	 * for the start value and non-zero for the count.
+	 */
+	while (get_option (&str, &io_start) == 2 &&
+	       get_option (&str, &io_num) != 0) {
+		struct resource *res;
+		struct resource *root;
+
+		/* Pairs beyond the table size are parsed but ignored. */
+		if (reserved >= MAXRESERVE)
+			continue;
+
+		res = &reserve[reserved];
+		res->name = "reserved";
+		res->start = io_start;
+		res->end = io_start + io_num - 1;
+		res->flags = IORESOURCE_BUSY;
+		res->child = NULL;
+
+		/* Addresses from 0x10000 up go to the memory tree. */
+		if (res->start >= 0x10000)
+			root = &iomem_resource;
+		else
+			root = &ioport_resource;
+
+		/* A slot is used up only when the request succeeded. */
+		if (request_resource(root, res) == 0)
+			reserved++;
+	}
+	return 1;
+}
+
+__setup("reserve=", reserve_setup);
+
+/*
+ * Check if the requested addr and size spans more than any slot in the
+ * iomem resource tree.
+ */
+int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
+{
+	struct resource *p = &iomem_resource;
+	int err = 0;
+	/* r_next() increments this position counter, so it must start defined. */
+	loff_t l = 0;
+
+	read_lock(&resource_lock);
+	for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+		/*
+		 * We can probably skip the resources without
+		 * IORESOURCE_IO attribute?
+		 */
+		if (p->start >= addr + size)
+			continue;
+		if (p->end < addr)
+			continue;
+		/* The mapping lies within this resource at page granularity. */
+		if (PFN_DOWN(p->start) <= PFN_DOWN(addr) &&
+		    PFN_DOWN(p->end) >= PFN_DOWN(addr + size - 1))
+			continue;
+		/*
+		 * if a resource is "BUSY", it's not a hardware resource
+		 * but a driver mapping of such a resource; we don't want
+		 * to warn for those; some drivers legitimately map only
+		 * partial hardware resources. (example: vesafb)
+		 */
+		if (p->flags & IORESOURCE_BUSY)
+			continue;
+
+		printk(KERN_WARNING "resource map sanity check conflict: "
+		       "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
+		       (unsigned long long)addr,
+		       (unsigned long long)(addr + size - 1),
+		       (unsigned long long)p->start,
+		       (unsigned long long)p->end,
+		       p->name);
+		err = -1;
+		break;
+	}
+	read_unlock(&resource_lock);
+
+	return err;
+}
+
+#ifdef CONFIG_STRICT_DEVMEM
+static int strict_iomem_checks = 1;
+#else
+static int strict_iomem_checks;
+#endif
+
+/*
+ * check if an address is reserved in the iomem resource tree
+ * returns 1 if reserved, 0 if not reserved.
+ *
+ * Always returns 0 while strict_iomem_checks is off. The check covers the
+ * page containing @addr.
+ */
+int iomem_is_exclusive(u64 addr)
+{
+	struct resource *p = &iomem_resource;
+	int err = 0;
+	/* r_next() increments this position counter, so it must start defined. */
+	loff_t l = 0;
+	int size = PAGE_SIZE;
+
+	if (!strict_iomem_checks)
+		return 0;
+
+	addr = addr & PAGE_MASK;
+
+	read_lock(&resource_lock);
+	for (p = p->child; p ; p = r_next(NULL, p, &l)) {
+		/*
+		 * We can probably skip the resources without
+		 * IORESOURCE_IO attribute?
+		 */
+		if (p->start >= addr + size)
+			break;
+		if (p->end < addr)
+			continue;
+		if (p->flags & IORESOURCE_BUSY &&
+		     p->flags & IORESOURCE_EXCLUSIVE) {
+			err = 1;
+			break;
+		}
+	}
+	read_unlock(&resource_lock);
+
+	return err;
+}
+
+/*
+ * "iomem=" boot parameter: "relaxed" turns the strict checks off,
+ * "strict" turns them on. "strict" is tested last, so it wins if both
+ * words appear.
+ */
+static int __init strict_iomem(char *str)
+{
+	if (strstr(str, "relaxed"))
+		strict_iomem_checks = 0;
+	if (strstr(str, "strict"))
+		strict_iomem_checks = 1;
+	return 1;
+}
+
+__setup("iomem=", strict_iomem);
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 00000000..3c7cbc2c
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,238 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2006 Timesys Corp., Thomas Gleixner
+ *
+ * This code is based on the rt.c implementation in the preempt-rt tree.
+ * Portions of said code are
+ *
+ * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ * Copyright (C) 2006 Esben Nielsen
+ * Copyright (C) 2006 Kihon Technologies Inc.,
+ * Steven Rostedt
+ *
+ * See rt.c in preempt-rt for proper credits and further information
+ */
+/*
+ * NOTE(review): the header names of the following #include lines are
+ * missing in this copy of the patch; restore them from the original tree.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "rtmutex_common.h"
+
+# define TRACE_WARN_ON(x) WARN_ON(x)
+# define TRACE_BUG_ON(x) BUG_ON(x)
+
+/*
+ * Switch tracing off once, make the console verbose and drop the current
+ * task's pi_lock if it is held.
+ */
+# define TRACE_OFF() \
+do { \
+	if (rt_trace_on) { \
+		rt_trace_on = 0; \
+		console_verbose(); \
+		if (raw_spin_is_locked(&current->pi_lock)) \
+			raw_spin_unlock(&current->pi_lock); \
+	} \
+} while (0)
+
+/* Same as TRACE_OFF() but leaves pi_lock alone. */
+# define TRACE_OFF_NOLOCK() \
+do { \
+	if (rt_trace_on) { \
+		rt_trace_on = 0; \
+		console_verbose(); \
+	} \
+} while (0)
+
+# define TRACE_BUG_LOCKED() \
+do { \
+	TRACE_OFF(); \
+	BUG(); \
+} while (0)
+
+# define TRACE_WARN_ON_LOCKED(c) \
+do { \
+	if (unlikely(c)) { \
+		TRACE_OFF(); \
+		WARN_ON(1); \
+	} \
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c) \
+do { \
+	if (unlikely(c)) \
+		TRACE_BUG_LOCKED(); \
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
+#endif
+
+/*
+ * deadlock detection flag. We turn it off when we detect
+ * the first problem because we don't want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+static int rt_trace_on = 1;
+
+/* Print a one-line description of task @p: comm, pid, pointer, priority. */
+static void printk_task(struct task_struct *p)
+{
+	if (p)
+		printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio);
+	else
+		/*
+		 * NOTE(review): this literal is empty in this copy of the
+		 * patch; angle-bracketed text (e.g. the #include targets) was
+		 * lost elsewhere in the file, so the original string may have
+		 * been dropped too. Confirm against the tree.
+		 */
+		printk("");
+}
+
+/*
+ * Print @lock by name, or by file:line when it has no name. With
+ * @print_owner set and an owner present, the owner is printed as well.
+ */
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+	if (lock->name)
+		printk(" [%p] {%s}\n",
+			lock, lock->name);
+	else
+		printk(" [%p] {%s:%d}\n",
+			lock, lock->file, lock->line);
+
+	if (print_owner && rt_mutex_owner(lock)) {
+		printk(".. ->owner: %p\n", lock->owner);
+		printk(".. held by: ");
+		printk_task(rt_mutex_owner(lock));
+		printk("\n");
+	}
+}
+
+/* Warn if @task still has PI waiters or is still blocked on a lock. */
+void rt_mutex_debug_task_free(struct task_struct *task)
+{
+	WARN_ON(!plist_head_empty(&task->pi_waiters));
+	WARN_ON(task->pi_blocked_on);
+}
+
+/*
+ * We fill out the fields in the waiter to store the information about
+ * the deadlock. We print when we return. act_waiter can be NULL in
+ * case of a remove waiter operation.
+ */ +void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, + struct rt_mutex *lock) +{ + struct task_struct *task; + + if (!rt_trace_on || detect || !act_waiter) + return; + + task = rt_mutex_owner(act_waiter->lock); + if (task && task != current) { + act_waiter->deadlock_task_pid = get_pid(task_pid(task)); + act_waiter->deadlock_lock = lock; + } +} + +/* + * Print the deadlock report recorded in @waiter by debug_rt_mutex_deadlock(). + * + * Does nothing unless waiter->deadlock_lock is set, tracing is still on and + * the recorded pid still resolves to a task. Tracing is switched off before + * printing, and local interrupts are disabled once the report is out. + */ +void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) +{ + struct task_struct *task; + + if (!waiter->deadlock_lock || !rt_trace_on) + return; + + rcu_read_lock(); + task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); + if (!task) { + rcu_read_unlock(); + return; + } + + TRACE_OFF_NOLOCK(); + + printk("\n============================================\n"); + printk( "[ BUG: circular locking deadlock detected! ]\n"); + printk( "--------------------------------------------\n"); + printk("%s/%d is deadlocking current task %s/%d\n\n", + task->comm, task_pid_nr(task), + current->comm, task_pid_nr(current)); + + printk("\n1) %s/%d is trying to acquire this lock:\n", + current->comm, task_pid_nr(current)); + printk_lock(waiter->lock, 1); + + printk("\n2) %s/%d is blocked on this lock:\n", + task->comm, task_pid_nr(task)); + printk_lock(waiter->deadlock_lock, 1); + + debug_show_held_locks(current); + debug_show_held_locks(task); + + printk("\n%s/%d's [blocked] stackdump:\n\n", + task->comm, task_pid_nr(task)); + show_stack(task, NULL); + printk("\n%s/%d's [current] stackdump:\n\n", + current->comm, task_pid_nr(current)); + dump_stack(); + debug_show_all_locks(); + rcu_read_unlock(); + + /* adjacent literals are concatenated: keep the separating space */ + printk("[ turning off deadlock detection. " + "Please report this trace. ]\n\n"); + local_irq_disable(); +}
+ +void debug_rt_mutex_lock(struct rt_mutex *lock) +{ +} + +void debug_rt_mutex_unlock(struct rt_mutex *lock) +{ + TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); +} + +void +debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner) +{ +} + +void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) +{ + TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); +} + +void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + memset(waiter, 0x11, sizeof(*waiter)); + plist_node_init(&waiter->list_entry, MAX_PRIO); + plist_node_init(&waiter->pi_list_entry, MAX_PRIO); + waiter->deadlock_task_pid = NULL; +} + +void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) +{ + put_pid(waiter->deadlock_task_pid); + TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); + TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + memset(waiter, 0x22, sizeof(*waiter)); +} + +void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) +{ + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lock->name = name; +} + +void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) +{ +} + +void rt_mutex_deadlock_account_unlock(struct task_struct *task) +{ +} + diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 00000000..14193d59 --- /dev/null +++ b/kernel/rtmutex-debug.h @@ -0,0 +1,33 @@ +/* + * RT-Mutexes: blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + * This file contains macros used solely by rtmutex.c. Debug version. 
+ */ + +extern void +rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); +extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); +extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); +extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +extern void debug_rt_mutex_lock(struct rt_mutex *lock); +extern void debug_rt_mutex_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, + struct task_struct *powner); +extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); +extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, + struct rt_mutex *lock); +extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); +# define debug_rt_mutex_reset_waiter(w) \ + do { (w)->deadlock_lock = NULL; } while (0) + +static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, + int detect) +{ + return (waiter != NULL); +} diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 00000000..5c9ccd38 --- /dev/null +++ b/kernel/rtmutex-tester.c @@ -0,0 +1,417 @@ +/* + * RT-Mutex-tester: scriptable tester for rt mutexes + * + * started by Thomas Gleixner: + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "rtmutex.h" + +#define MAX_RT_TEST_THREADS 8 +#define MAX_RT_TEST_MUTEXES 8 + +static spinlock_t rttest_lock; +static atomic_t rttest_event; + +struct test_thread_data { + int opcode; + int opdata; + int mutexes[MAX_RT_TEST_MUTEXES]; + int event; + struct sys_device sysdev; +}; + +static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; +static struct task_struct *threads[MAX_RT_TEST_THREADS]; +static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; + +enum test_opcodes { + RTTEST_NOP = 0, + 
RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ + RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ + RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ + RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ + RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ + RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ + RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ + /* 9, 10 - reserved for BKL commemoration */ + RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */ + RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ + RTTEST_RESET = 99, /* 99 Reset all pending operations */ +}; + +/* + * Execute the command stored in td->opcode / td->opdata. + * + * @lockwakeup is non-zero when called from schedule_rt_mutex_test(); in that + * case only NOP, LOCKCONT, RESET and RESETEVENT are handled and every other + * opcode returns -EINVAL. + * + * Returns 0 on success, -EINVAL for an unknown opcode, an out-of-range lock + * index or an unlock of a lock that is not in state 4, and -EINTR when + * rt_mutex_lock_interruptible() returned non-zero. + */ +static int handle_op(struct test_thread_data *td, int lockwakeup) +{ + int i, id, ret = -EINVAL; + + switch(td->opcode) { + + case RTTEST_NOP: + return 0; + + case RTTEST_LOCKCONT: + /* + * opdata comes straight from the sysfs command: range check it + * like the other lock opcodes before indexing td->mutexes[]. + */ + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + return 0; + + case RTTEST_RESET: + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { + if (td->mutexes[i] == 4) { + rt_mutex_unlock(&mutexes[i]); + td->mutexes[i] = 0; + } + } + return 0; + + case RTTEST_RESETEVENT: + atomic_set(&rttest_event, 0); + return 0; + + default: + if (lockwakeup) + return ret; + } + + switch(td->opcode) { + + case RTTEST_LOCK: + case RTTEST_LOCKNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_lock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 4; + return 0; + + case RTTEST_LOCKINT: + case RTTEST_LOCKINTNOWAIT: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES) + return ret; + + td->mutexes[id] = 1; + td->event = atomic_add_return(1, &rttest_event); + ret = rt_mutex_lock_interruptible(&mutexes[id], 0); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 
ret ? 0 : 4; + return ret ? -EINTR : 0; + + case RTTEST_UNLOCK: + id = td->opdata; + if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) + return ret; + + td->event = atomic_add_return(1, &rttest_event); + rt_mutex_unlock(&mutexes[id]); + td->event = atomic_add_return(1, &rttest_event); + td->mutexes[id] = 0; + return 0; + + default: + break; + } + return ret; +} + +/* + * Schedule replacement for rtsem_down(). Only called for threads with + * PF_MUTEX_TESTER set. + * + * This allows us to have finegrained control over the event flow. + * + */ +void schedule_rt_mutex_test(struct rt_mutex *mutex) +{ + int tid, op, dat; + struct test_thread_data *td; + + /* We have to lookup the task */ + for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { + if (threads[tid] == current) + break; + } + + BUG_ON(tid == MAX_RT_TEST_THREADS); + + td = &thread_data[tid]; + + op = td->opcode; + dat = td->opdata; + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + break; + + if (td->mutexes[dat] != 1) + break; + + td->mutexes[dat] = 2; + td->event = atomic_add_return(1, &rttest_event); + break; + + default: + break; + } + + schedule(); + + + switch (op) { + case RTTEST_LOCK: + case RTTEST_LOCKINT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 3; + td->event = atomic_add_return(1, &rttest_event); + break; + + case RTTEST_LOCKNOWAIT: + case RTTEST_LOCKINTNOWAIT: + if (mutex != &mutexes[dat]) + return; + + if (td->mutexes[dat] != 2) + return; + + td->mutexes[dat] = 1; + td->event = atomic_add_return(1, &rttest_event); + return; + + default: + return; + } + + td->opcode = 0; + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + int ret; + + set_current_state(TASK_RUNNING); + ret = handle_op(td, 1); + set_current_state(TASK_INTERRUPTIBLE); + if (td->opcode == RTTEST_LOCKCONT) + break; + td->opcode = ret; 
+ } + + /* Wait for the next command to be executed */ + schedule(); + } + + /* Restore previous command and data */ + td->opcode = op; + td->opdata = dat; +} + +static int test_func(void *data) +{ + struct test_thread_data *td = data; + int ret; + + current->flags |= PF_MUTEX_TESTER; + set_freezable(); + allow_signal(SIGHUP); + + for(;;) { + + set_current_state(TASK_INTERRUPTIBLE); + + if (td->opcode > 0) { + set_current_state(TASK_RUNNING); + ret = handle_op(td, 0); + set_current_state(TASK_INTERRUPTIBLE); + td->opcode = ret; + } + + /* Wait for the next command to be executed */ + schedule(); + try_to_freeze(); + + if (signal_pending(current)) + flush_signals(current); + + if(kthread_should_stop()) + break; + } + return 0; +} + +/** + * sysfs_test_command - interface for test commands + * @dev: thread reference + * @buf: command for actual step + * @count: length of buffer + * + * command syntax: + * + * opcode:data + */ +static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + struct sched_param schedpar; + struct test_thread_data *td; + char cmdbuf[32]; + int op, dat, tid, ret; + + td = container_of(dev, struct test_thread_data, sysdev); + tid = td->sysdev.id; + + /* strings from sysfs write are not 0 terminated! 
*/ + /* Reject empty writes up front so buf[count-1] below never reads before buf. */ + if (count < 1 || count >= sizeof(cmdbuf)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + if (count < 1) + return -EINVAL; + + memcpy(cmdbuf, buf, count); + cmdbuf[count] = 0; + + if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) + return -EINVAL; + + switch (op) { + case RTTEST_SCHEDOT: + schedpar.sched_priority = 0; + ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); + if (ret) + return ret; + /* NOTE(review): this renices the task writing the command (current), not threads[tid], and ignores dat; confirm intent */ + set_user_nice(current, 0); + break; + + case RTTEST_SCHEDRT: + schedpar.sched_priority = dat; + ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); + if (ret) + return ret; + break; + + case RTTEST_SIGNAL: + send_sig(SIGHUP, threads[tid], 0); + break; + + default: + if (td->opcode > 0) + return -EBUSY; + td->opdata = dat; + td->opcode = op; + wake_up_process(threads[tid]); + } + + return count; +} + +/** + * sysfs_test_status - sysfs interface for rt tester + * @dev: thread to query + * @buf: char buffer to be filled with thread status info + */ +static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr, + char *buf) +{ + struct test_thread_data *td; + struct task_struct *tsk; + char *curr = buf; + int i; + + td = container_of(dev, struct test_thread_data, sysdev); + tsk = threads[td->sysdev.id]; + + spin_lock(&rttest_lock); + + curr += sprintf(curr, + "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:", + td->opcode, td->event, tsk->state, + (MAX_RT_PRIO - 1) - tsk->prio, + (MAX_RT_PRIO - 1) - tsk->normal_prio, + tsk->pi_blocked_on); + + for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) + curr += sprintf(curr, "%d", td->mutexes[i]); + + spin_unlock(&rttest_lock); + + curr += sprintf(curr, ", T: %p, R: %p\n", tsk, + mutexes[td->sysdev.id].owner); + + return curr - buf; +} + +static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); +static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); + +static struct sysdev_class rttest_sysclass = { + .name = "rttest", +}; + +static int 
init_test_thread(int id) +{ + thread_data[id].sysdev.cls = &rttest_sysclass; + thread_data[id].sysdev.id = id; + + threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); + if (IS_ERR(threads[id])) + return PTR_ERR(threads[id]); + + return sysdev_register(&thread_data[id].sysdev); +} + +static int init_rttest(void) +{ + int ret, i; + + spin_lock_init(&rttest_lock); + + for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) + rt_mutex_init(&mutexes[i]); + + ret = sysdev_class_register(&rttest_sysclass); + if (ret) + return ret; + + for (i = 0; i < MAX_RT_TEST_THREADS; i++) { + ret = init_test_thread(i); + if (ret) + break; + ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); + if (ret) + break; + ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); + if (ret) + break; + } + + printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); + + return ret; +} + +device_initcall(init_rttest); diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 00000000..255e1662 --- /dev/null +++ b/kernel/rtmutex.c @@ -0,0 +1,1046 @@ +/* + * RT-Mutexes: simple blocking mutual exclusion locks with PI support + * + * started by Ingo Molnar and Thomas Gleixner. + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner + * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt + * Copyright (C) 2006 Esben Nielsen + * + * See Documentation/rt-mutex-design.txt for details. + */ +#include +#include +#include +#include + +#include "rtmutex_common.h" + +/* + * lock->owner state tracking: + * + * lock->owner holds the task_struct pointer of the owner. Bit 0 + * is used to keep track of the "lock has waiters" state. 
+ * + * owner bit0 + * NULL 0 lock is free (fast acquire possible) + * NULL 1 lock is free and has waiters and the top waiter + * is going to take the lock* + * taskpointer 0 lock is held (fast release possible) + * taskpointer 1 lock is held and has waiters** + * + * The fast atomic compare exchange based acquire and release is only + * possible when bit 0 of lock->owner is 0. + * + * (*) It also can be a transitional state when grabbing the lock + * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock, + * we need to set the bit0 before looking at the lock, and the owner may be + * NULL in this small time, hence this can be a transitional state. + * + * (**) There is a small time when bit 0 is set but there are no + * waiters. This can happen when grabbing the lock in the slow path. + * To prevent a cmpxchg of the owner releasing the lock, we need to + * set this bit before looking at the lock. + */ + +static void +rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner) +{ + unsigned long val = (unsigned long)owner; + + if (rt_mutex_has_waiters(lock)) + val |= RT_MUTEX_HAS_WAITERS; + + lock->owner = (struct task_struct *)val; +} + +static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); +} + +static void fixup_rt_mutex_waiters(struct rt_mutex *lock) +{ + if (!rt_mutex_has_waiters(lock)) + clear_rt_mutex_waiters(lock); +} + +/* + * We can speed up the acquire/release, if the architecture + * supports cmpxchg and if there's no debugging state to be set up + */ +#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) +# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + unsigned long owner, *p = (unsigned long *) &lock->owner; + + do { + owner = *p; + } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); +} +#else +# 
define rt_mutex_cmpxchg(l,c,n) (0) +static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) +{ + lock->owner = (struct task_struct *) + ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); +} +#endif + +/* + * Calculate task priority from the waiter list priority + * + * Return task->normal_prio when the waiter list is empty or when + * the waiter is not allowed to do priority boosting + */ +int rt_mutex_getprio(struct task_struct *task) +{ + if (likely(!task_has_pi_waiters(task))) + return task->normal_prio; + + return min(task_top_pi_waiter(task)->pi_list_entry.prio, + task->normal_prio); +} + +/* + * Adjust the priority of a task, after its pi_waiters got modified. + * + * This can be both boosting and unboosting. task->pi_lock must be held. + */ +static void __rt_mutex_adjust_prio(struct task_struct *task) +{ + int prio = rt_mutex_getprio(task); + + if (task->prio != prio) + rt_mutex_setprio(task, prio); +} + +/* + * Adjust task priority (undo boosting). Called from the exit path of + * rt_mutex_slowunlock() and rt_mutex_slowlock(). + * + * (Note: We do this outside of the protection of lock->wait_lock to + * allow the lock to be taken while or before we readjust the priority + * of task. We do not use the spin_xx_mutex() variants here as we are + * outside of the debug path.) + */ +static void rt_mutex_adjust_prio(struct task_struct *task) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); +} + +/* + * Max number of times we'll walk the boosting chain: + */ +int max_lock_depth = 1024; + +/* + * Adjust the priority chain. Also used for deadlock detection. + * Decreases task's usage by one - may thus free the task. + * Returns 0 or -EDEADLK. 
+ */ +static int rt_mutex_adjust_prio_chain(struct task_struct *task, + int deadlock_detect, + struct rt_mutex *orig_lock, + struct rt_mutex_waiter *orig_waiter, + struct task_struct *top_task) +{ + struct rt_mutex *lock; + struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; + int detect_deadlock, ret = 0, depth = 0; + unsigned long flags; + + detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, + deadlock_detect); + + /* + * The (de)boosting is a step by step approach with a lot of + * pitfalls. We want this to be preemptible and we want hold a + * maximum of two locks per step. So we have to check + * carefully whether things change under us. + */ + again: + if (++depth > max_lock_depth) { + static int prev_max; + + /* + * Print this only once. If the admin changes the limit, + * print a new message when reaching the limit again. + */ + if (prev_max != max_lock_depth) { + prev_max = max_lock_depth; + printk(KERN_WARNING "Maximum lock depth %d reached " + "task: %s (%d)\n", max_lock_depth, + top_task->comm, task_pid_nr(top_task)); + } + put_task_struct(task); + + return deadlock_detect ? -EDEADLK : 0; + } + retry: + /* + * Task can not go away as we did a get_task() before ! + */ + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + /* + * Check whether the end of the boosting chain has been + * reached or the state of the chain has changed while we + * dropped the locks. + */ + if (!waiter) + goto out_unlock_pi; + + /* + * Check the orig_waiter state. After we dropped the locks, + * the previous owner of the lock might have released the lock. + */ + if (orig_waiter && !rt_mutex_owner(orig_lock)) + goto out_unlock_pi; + + /* + * Drop out, when the task has no waiters. Note, + * top_waiter can be NULL, when we are in the deboosting + * mode! 
+ */ + if (top_waiter && (!task_has_pi_waiters(task) || + top_waiter != task_top_pi_waiter(task))) + goto out_unlock_pi; + + /* + * When deadlock detection is off then we check, if further + * priority adjustment is necessary. + */ + if (!detect_deadlock && waiter->list_entry.prio == task->prio) + goto out_unlock_pi; + + lock = waiter->lock; + if (!raw_spin_trylock(&lock->wait_lock)) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + cpu_relax(); + goto retry; + } + + /* Deadlock detection */ + if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { + debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); + raw_spin_unlock(&lock->wait_lock); + ret = deadlock_detect ? -EDEADLK : 0; + goto out_unlock_pi; + } + + top_waiter = rt_mutex_top_waiter(lock); + + /* Requeue the waiter */ + plist_del(&waiter->list_entry, &lock->wait_list); + waiter->list_entry.prio = task->prio; + plist_add(&waiter->list_entry, &lock->wait_list); + + /* Release the task */ + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + if (!rt_mutex_owner(lock)) { + /* + * If the requeue above changed the top waiter, then we need + * to wake the new top waiter up to try to get the lock. 
+ */ + + if (top_waiter != rt_mutex_top_waiter(lock)) + wake_up_process(rt_mutex_top_waiter(lock)->task); + raw_spin_unlock(&lock->wait_lock); + goto out_put_task; + } + put_task_struct(task); + + /* Grab the next task */ + task = rt_mutex_owner(lock); + get_task_struct(task); + raw_spin_lock_irqsave(&task->pi_lock, flags); + + if (waiter == rt_mutex_top_waiter(lock)) { + /* Boost the owner */ + plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + + } else if (top_waiter == waiter) { + /* Deboost the owner */ + plist_del(&waiter->pi_list_entry, &task->pi_waiters); + waiter = rt_mutex_top_waiter(lock); + waiter->pi_list_entry.prio = waiter->list_entry.prio; + plist_add(&waiter->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + } + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + top_waiter = rt_mutex_top_waiter(lock); + raw_spin_unlock(&lock->wait_lock); + + if (!detect_deadlock && waiter != top_waiter) + goto out_put_task; + + goto again; + + out_unlock_pi: + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + out_put_task: + put_task_struct(task); + + return ret; +} + +/* + * Try to take an rt-mutex + * + * Must be called with lock->wait_lock held. + * + * @lock: the lock to be acquired. + * @task: the task which wants to acquire the lock + * @waiter: the waiter that is queued to the lock's wait list. (could be NULL) + */ +static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) +{ + /* + * We have to be careful here if the atomic speedups are + * enabled, such that, when + * - no other waiter is on the lock + * - the lock has been released since we did the cmpxchg + * the lock can be released or taken while we are doing the + * checks and marking the lock with RT_MUTEX_HAS_WAITERS. 
+ * + * The atomic acquire/release aware variant of + * mark_rt_mutex_waiters uses a cmpxchg loop. After setting + * the WAITERS bit, the atomic release / acquire can not + * happen anymore and lock->wait_lock protects us from the + * non-atomic case. + * + * Note, that this might set lock->owner = + * RT_MUTEX_HAS_WAITERS in the case the lock is not contended + * any more. This is fixed up when we take the ownership. + * This is the transitional state explained at the top of this file. + */ + mark_rt_mutex_waiters(lock); + + if (rt_mutex_owner(lock)) + return 0; + + /* + * It will get the lock because of one of these conditions: + * 1) there is no waiter + * 2) higher priority than waiters + * 3) it is top waiter + */ + if (rt_mutex_has_waiters(lock)) { + if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) { + if (!waiter || waiter != rt_mutex_top_waiter(lock)) + return 0; + } + } + + if (waiter || rt_mutex_has_waiters(lock)) { + unsigned long flags; + struct rt_mutex_waiter *top; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + /* remove the queued waiter. */ + if (waiter) { + plist_del(&waiter->list_entry, &lock->wait_list); + task->pi_blocked_on = NULL; + } + + /* + * We have to enqueue the top waiter(if it exists) into + * task->pi_waiters list. + */ + if (rt_mutex_has_waiters(lock)) { + top = rt_mutex_top_waiter(lock); + top->pi_list_entry.prio = top->list_entry.prio; + plist_add(&top->pi_list_entry, &task->pi_waiters); + } + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + } + + /* We got the lock. */ + debug_rt_mutex_lock(lock); + + rt_mutex_set_owner(lock, task); + + rt_mutex_deadlock_account_lock(lock, task); + + return 1; +} + +/* + * Task blocks on lock. + * + * Prepare waiter and propagate pi chain + * + * This must be called with lock->wait_lock held. 
+ */ +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock) +{ /* returns 0, or the result of rt_mutex_adjust_prio_chain() when a chain walk is done */ + struct task_struct *owner = rt_mutex_owner(lock); + struct rt_mutex_waiter *top_waiter = waiter; + unsigned long flags; + int chain_walk = 0, res; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); /* bring task->prio up to date before it is copied into the waiter nodes */ + waiter->task = task; + waiter->lock = lock; + plist_node_init(&waiter->list_entry, task->prio); + plist_node_init(&waiter->pi_list_entry, task->prio); + + /* Get the top priority waiter on the lock (sampled before the new waiter is queued) */ + if (rt_mutex_has_waiters(lock)) + top_waiter = rt_mutex_top_waiter(lock); + plist_add(&waiter->list_entry, &lock->wait_list); + + task->pi_blocked_on = waiter; + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + if (!owner) + return 0; /* no owner task whose pi_waiters would need updating */ + + if (waiter == rt_mutex_top_waiter(lock)) { /* new waiter is now the top waiter: swap it into owner->pi_waiters */ + raw_spin_lock_irqsave(&owner->pi_lock, flags); + plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); + plist_add(&waiter->pi_list_entry, &owner->pi_waiters); + + __rt_mutex_adjust_prio(owner); + if (owner->pi_blocked_on) + chain_walk = 1; /* owner is itself blocked: walk the chain from it */ + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + } + else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) + chain_walk = 1; /* walk the chain anyway when deadlock detection asks for it */ + + if (!chain_walk) + return 0; + + /* + * The owner can't disappear while holding a lock, + * so the owner struct is protected by wait_lock. + * Gets dropped in rt_mutex_adjust_prio_chain()! + */ + get_task_struct(owner); + + raw_spin_unlock(&lock->wait_lock); /* the chain walk runs without wait_lock held */ + + res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, + task); + + raw_spin_lock(&lock->wait_lock); /* re-take wait_lock before returning to the caller */ + + return res; +} + +/* + * Wake up the next waiter on the lock. + * + * Remove the top waiter from the current tasks waiter list and wake it up. + * + * Called with lock->wait_lock held. 
+ */ +static void wakeup_next_waiter(struct rt_mutex *lock) +{ + struct rt_mutex_waiter *waiter; + unsigned long flags; + + raw_spin_lock_irqsave(¤t->pi_lock, flags); + + waiter = rt_mutex_top_waiter(lock); + + /* + * Remove it from current->pi_waiters. We do not adjust a + * possible priority boost right now. We execute wakeup in the + * boosted mode and go back to normal after releasing + * lock->wait_lock. + */ + plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); + + rt_mutex_set_owner(lock, NULL); + + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + + wake_up_process(waiter->task); +} + +/* + * Remove a waiter from a lock and give up + * + * Must be called with lock->wait_lock held and + * have just failed to try_to_take_rt_mutex(). + */ +static void remove_waiter(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter) +{ + int first = (waiter == rt_mutex_top_waiter(lock)); + struct task_struct *owner = rt_mutex_owner(lock); + unsigned long flags; + int chain_walk = 0; + + raw_spin_lock_irqsave(¤t->pi_lock, flags); + plist_del(&waiter->list_entry, &lock->wait_list); + current->pi_blocked_on = NULL; + raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + + if (!owner) + return; + + if (first) { + + raw_spin_lock_irqsave(&owner->pi_lock, flags); + + plist_del(&waiter->pi_list_entry, &owner->pi_waiters); + + if (rt_mutex_has_waiters(lock)) { + struct rt_mutex_waiter *next; + + next = rt_mutex_top_waiter(lock); + plist_add(&next->pi_list_entry, &owner->pi_waiters); + } + __rt_mutex_adjust_prio(owner); + + if (owner->pi_blocked_on) + chain_walk = 1; + + raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + } + + WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); + + if (!chain_walk) + return; + + /* gets dropped in rt_mutex_adjust_prio_chain()! 
*/ + get_task_struct(owner); + + raw_spin_unlock(&lock->wait_lock); + + rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current); + + raw_spin_lock(&lock->wait_lock); +} + +/* + * Recheck the pi chain, in case we got a priority setting + * + * Called from sched_setscheduler + */ +void rt_mutex_adjust_pi(struct task_struct *task) +{ + struct rt_mutex_waiter *waiter; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + + waiter = task->pi_blocked_on; + if (!waiter || waiter->list_entry.prio == task->prio) { + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + return; + } + + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + /* gets dropped in rt_mutex_adjust_prio_chain()! */ + get_task_struct(task); + rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); +} + +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * + * lock->wait_lock must be held by the caller. + */ +static int __sched +__rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter) +{ + int ret = 0; + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock, current, waiter)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? 
*/ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + raw_spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + schedule_rt_mutex(lock); + + raw_spin_lock(&lock->wait_lock); + set_current_state(state); + } + + return ret; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + int detect_deadlock) +{ + struct rt_mutex_waiter waiter; + int ret = 0; + + debug_rt_mutex_init_waiter(&waiter); + + raw_spin_lock(&lock->wait_lock); + + /* Try to acquire the lock again: */ + if (try_to_take_rt_mutex(lock, current, NULL)) { + raw_spin_unlock(&lock->wait_lock); + return 0; + } + + set_current_state(state); + + /* Setup the timer, when timeout != NULL */ + if (unlikely(timeout)) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock); + + if (likely(!ret)) + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); + + set_current_state(TASK_RUNNING); + + if (unlikely(ret)) + remove_waiter(lock, &waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + raw_spin_unlock(&lock->wait_lock); + + /* Remove pending timer: */ + if (unlikely(timeout)) + hrtimer_cancel(&timeout->timer); + + debug_rt_mutex_free_waiter(&waiter); + + return ret; +} + +/* + * Slow path try-lock function: + */ +static inline int +rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + int ret = 0; + + raw_spin_lock(&lock->wait_lock); + + if (likely(rt_mutex_owner(lock) != current)) { + + ret = try_to_take_rt_mutex(lock, current, NULL); + /* + * try_to_take_rt_mutex() sets the lock waiters + * bit unconditionally. Clean this up. 
+		 */
+		fixup_rt_mutex_waiters(lock);
+	}
+
+	raw_spin_unlock(&lock->wait_lock);
+
+	return ret;
+}
+
+/*
+ * Slow path to release a rt-mutex; current is treated as the owner:
+ */
+static void __sched
+rt_mutex_slowunlock(struct rt_mutex *lock)
+{
+	raw_spin_lock(&lock->wait_lock);
+
+	debug_rt_mutex_unlock(lock);
+
+	rt_mutex_deadlock_account_unlock(current);
+
+	if (!rt_mutex_has_waiters(lock)) {	/* nobody queued: just clear the owner */
+		lock->owner = NULL;
+		raw_spin_unlock(&lock->wait_lock);
+		return;
+	}
+
+	wakeup_next_waiter(lock);
+
+	raw_spin_unlock(&lock->wait_lock);
+
+	/* Undo pi boosting if necessary (done after wait_lock is released): */
+	rt_mutex_adjust_prio(current);
+}
+
+/*
+ * debug aware fast / slowpath lock,trylock,unlock
+ * Each helper tries rt_mutex_cmpxchg() on the owner first and calls @slowfn otherwise.
+ * The atomic acquire/release ops are compiled away, when either the
+ * architecture does not support cmpxchg or when debugging is enabled.
+ */
+static inline int
+rt_mutex_fastlock(struct rt_mutex *lock, int state,
+		  int detect_deadlock,
+		  int (*slowfn)(struct rt_mutex *lock, int state,
+				struct hrtimer_sleeper *timeout,
+				int detect_deadlock))
+{
+	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {	/* fast path is skipped when detect_deadlock is set */
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 0;
+	} else
+		return slowfn(lock, state, NULL, detect_deadlock);	/* NULL: no timeout */
+}
+
+static inline int
+rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
+			struct hrtimer_sleeper *timeout, int detect_deadlock,
+			int (*slowfn)(struct rt_mutex *lock, int state,
+				      struct hrtimer_sleeper *timeout,
+				      int detect_deadlock))
+{
+	if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 0;
+	} else
+		return slowfn(lock, state, timeout, detect_deadlock);	/* same as rt_mutex_fastlock() but forwards @timeout */
+}
+
+static inline int
+rt_mutex_fasttrylock(struct rt_mutex *lock,
+		     int (*slowfn)(struct rt_mutex *lock))
+{
+	if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
+		rt_mutex_deadlock_account_lock(lock, current);
+		return 1;	/* trylock convention: 1 means acquired */
+	}
+	return slowfn(lock);
+}
+
+static inline void
+rt_mutex_fastunlock(struct rt_mutex *lock,
+		    void
(*slowfn)(struct rt_mutex *lock))
+{
+	if (likely(rt_mutex_cmpxchg(lock, current, NULL)))	/* owner current -> NULL in one step */
+		rt_mutex_deadlock_account_unlock(current);
+	else
+		slowfn(lock);
+}
+
+/**
+ * rt_mutex_lock - lock a rt_mutex (TASK_UNINTERRUPTIBLE, no deadlock detection)
+ *
+ * @lock: the rt_mutex to be locked
+ */
+void __sched rt_mutex_lock(struct rt_mutex *lock)
+{
+	might_sleep();
+
+	rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);	/* return value is not used here */
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock);
+
+/**
+ * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
+ *
+ * @lock: 		the rt_mutex to be locked
+ * @detect_deadlock:	deadlock detection on/off
+ *
+ * Returns:
+ *  0 		on success
+ * -EINTR 	when interrupted by a signal
+ * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
+ */
+int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
+						 int detect_deadlock)
+{
+	might_sleep();
+
+	return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
+				 detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+/**
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ *			the timeout structure is provided
+ *			by the caller
+ *
+ * @lock: 		the rt_mutex to be locked
+ * @timeout:		timeout structure or NULL (no timeout)
+ * @detect_deadlock:	deadlock detection on/off
+ *
+ * Returns:
+ *  0 		on success
+ * -EINTR 	when interrupted by a signal
+ * -ETIMEDOUT	when the timeout expired
+ * -EDEADLK	when the lock would deadlock (when deadlock detection is on)
+ */
+int
+rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
+		    int detect_deadlock)	/* NOTE(review): no __sched here, unlike the sibling lock functions - confirm intended */
+{
+	might_sleep();
+
+	return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+				       detect_deadlock, rt_mutex_slowlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+
+/**
+ * rt_mutex_trylock - try to lock a rt_mutex
+ *
+ * @lock:	the rt_mutex to be locked
+ *
+ * Returns 1 on success and 0 on contention
+ */
+int __sched rt_mutex_trylock(struct rt_mutex *lock)
+{
+	return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_trylock);
+
+/**
+ * rt_mutex_unlock - unlock a rt_mutex
+ *
+ * @lock: the rt_mutex to be unlocked
+ */
+void __sched rt_mutex_unlock(struct rt_mutex *lock)
+{
+	rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
+}
+EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+/**
+ * rt_mutex_destroy - mark a mutex unusable
+ * @lock: the mutex to be destroyed
+ *
+ * This function marks the mutex uninitialized, and any subsequent
+ * use of the mutex is forbidden. The mutex must not be locked when
+ * this function is called.
+ */
+void rt_mutex_destroy(struct rt_mutex *lock)
+{
+	WARN_ON(rt_mutex_is_locked(lock));	/* only warns; the function still continues */
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	lock->magic = NULL;	/* the "uninitialized" mark exists only in debug builds */
+#endif
+}
+
+EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+
+/**
+ * __rt_mutex_init - initialize the rt lock
+ * @lock: the rt lock to be initialized
+ * @name: passed on to debug_rt_mutex_init()
+ *
+ * Initialize the rt lock to unlocked state.
+ *
+ * Initializing of a locked rt lock is not allowed
+ */
+void __rt_mutex_init(struct rt_mutex *lock, const char *name)
+{
+	lock->owner = NULL;
+	raw_spin_lock_init(&lock->wait_lock);
+	plist_head_init(&lock->wait_list);
+
+	debug_rt_mutex_init(lock, name);
+}
+EXPORT_SYMBOL_GPL(__rt_mutex_init);
+
+/**
+ * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
+ *				proxy owner
+ *
+ * @lock: 	the rt_mutex to be locked
+ * @proxy_owner: the task to set as owner
+ *
+ * No locking. Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+				struct task_struct *proxy_owner)
+{
+	__rt_mutex_init(lock, NULL);	/* re-initializes @lock; NULL is the name argument */
+	debug_rt_mutex_proxy_lock(lock, proxy_owner);
+	rt_mutex_set_owner(lock, proxy_owner);
+	rt_mutex_deadlock_account_lock(lock, proxy_owner);
+}
+
+/**
+ * rt_mutex_proxy_unlock - release a lock on behalf of owner
+ *
+ * @lock: 	the rt_mutex to be locked
+ *
+ * No locking.
Caller has to do serializing itself
+ * Special API call for PI-futex support
+ */
+void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+			   struct task_struct *proxy_owner)
+{
+	debug_rt_mutex_proxy_unlock(lock);
+	rt_mutex_set_owner(lock, NULL);	/* wait_lock is not taken here */
+	rt_mutex_deadlock_account_unlock(proxy_owner);
+}
+
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:		the rt_mutex to take
+ * @waiter:		the pre-initialized rt_mutex_waiter
+ * @task:		the task to prepare
+ * @detect_deadlock:	perform deadlock detection (1) or not (0)
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error (@waiter has been removed from @lock again)
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+			      struct rt_mutex_waiter *waiter,
+			      struct task_struct *task, int detect_deadlock)
+{
+	int ret;
+
+	raw_spin_lock(&lock->wait_lock);
+
+	if (try_to_take_rt_mutex(lock, task, NULL)) {	/* taken on behalf of @task, not current */
+		raw_spin_unlock(&lock->wait_lock);
+		return 1;
+	}
+
+	ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+
+	if (ret && !rt_mutex_owner(lock)) {
+		/*
+		 * Reset the return value. We might have
+		 * returned with -EDEADLK and the owner
+		 * released the lock while we were walking the
+		 * pi chain.  Let the waiter sort it out.
+		 */
+		ret = 0;
+	}
+
+	if (unlikely(ret))
+		remove_waiter(lock, waiter);
+
+	raw_spin_unlock(&lock->wait_lock);
+
+	debug_rt_mutex_print_deadlock(waiter);	/* called after wait_lock is dropped */
+
+	return ret;
+}
+
+/**
+ * rt_mutex_next_owner - return the next owner of the lock
+ *
+ * @lock: the rt lock query
+ *
+ * Returns the next owner of the lock or NULL
+ *
+ * Caller has to serialize against other accessors to the lock
+ * itself.
+ *
+ * Special API call for PI-futex support
+ */
+struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
+{
+	if (!rt_mutex_has_waiters(lock))
+		return NULL;
+
+	return rt_mutex_top_waiter(lock)->task;	/* task of the first entry on lock->wait_list */
+}
+
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock:		the rt_mutex we were woken on
+ * @to:			the timeout, null if none. hrtimer should already have
+ * 			been started.
+ * @waiter:		the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:	perform deadlock detection (1) or not (0)
+ *
+ * Complete the lock acquisition started on our behalf by another thread.
+ *
+ * Returns:
+ *  0 - success
+ * <0 - error: -EINTR or -ETIMEDOUT, as returned by __rt_mutex_slowlock()
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+			       struct hrtimer_sleeper *to,
+			       struct rt_mutex_waiter *waiter,
+			       int detect_deadlock)	/* detect_deadlock is not referenced in the body */
+{
+	int ret;
+
+	raw_spin_lock(&lock->wait_lock);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+
+	set_current_state(TASK_RUNNING);
+
+	if (unlikely(ret))	/* gave up: take @waiter off the lock */
+		remove_waiter(lock, waiter);
+
+	/*
+	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+	 * have to fix that up.
+	 */
+	fixup_rt_mutex_waiters(lock);
+
+	raw_spin_unlock(&lock->wait_lock);
+
+	return ret;
+}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 00000000..a1a1dd06
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,26 @@
+/*
+ * RT-Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner
+ *
+ * This file contains macros used solely by rtmutex.c.
+ * Non-debug version.
+ */
+
+#define rt_mutex_deadlock_check(l)			(0)	/* constant 0 in the non-debug build */
+#define rt_mutex_deadlock_account_lock(m, t)		do { } while (0)
+#define rt_mutex_deadlock_account_unlock(l)		do { } while (0)
+#define debug_rt_mutex_init_waiter(w)			do { } while (0)
+#define debug_rt_mutex_free_waiter(w)			do { } while (0)
+#define debug_rt_mutex_lock(l)				do { } while (0)
+#define debug_rt_mutex_proxy_lock(l,p)			do { } while (0)
+#define debug_rt_mutex_proxy_unlock(l)			do { } while (0)
+#define debug_rt_mutex_unlock(l)			do { } while (0)
+#define debug_rt_mutex_init(m, n)			do { } while (0)
+#define debug_rt_mutex_deadlock(d, a ,l)		do { } while (0)
+#define debug_rt_mutex_print_deadlock(w)		do { } while (0)
+#define debug_rt_mutex_detect_deadlock(w,d)		(d)	/* evaluates to the caller's @d unchanged */
+#define debug_rt_mutex_reset_waiter(w)			do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 00000000..53a66c85
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,126 @@
+/*
+ * RT Mutexes: blocking mutual exclusion locks with PI support
+ *
+ * started by Ingo Molnar and Thomas Gleixner:
+ *
+ *  Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2006, Timesys Corp., Thomas Gleixner
+ *
+ * This file contains the private data structure and API definitions.
+ */
+
+#ifndef __KERNEL_RTMUTEX_COMMON_H
+#define __KERNEL_RTMUTEX_COMMON_H
+
+#include	/* NOTE(review): header name is missing here, apparently lost in extraction - restore from the original patch */
+
+/*
+ * The rtmutex in kernel tester is independent of rtmutex debugging. We
+ * call schedule_rt_mutex_test() instead of schedule() for the tasks which
+ * belong to the tester. That way we can delay the wakeup path of those
+ * threads to provoke lock stealing and testing of complex boosting scenarios.
+ */
+#ifdef CONFIG_RT_MUTEX_TESTER
+
+extern void schedule_rt_mutex_test(struct rt_mutex *lock);
+
+#define schedule_rt_mutex(_lock)				\
+  do {								\
+	if (!(current->flags & PF_MUTEX_TESTER))		\
+		schedule();					\
+	else							\
+		schedule_rt_mutex_test(_lock);			\
+  } while (0)
+
+#else
+# define schedule_rt_mutex(_lock)			schedule()	/* @_lock is unused without the tester */
+#endif
+
+/*
+ * This is the control structure for tasks blocked on a rt_mutex,
+ * which is allocated on the kernel stack of the blocked task.
+ *
+ * @list_entry:		pi node to enqueue into the mutex waiters list
+ * @pi_list_entry:	pi node to enqueue into the mutex owner waiters list
+ * @task:		task reference to the blocked task
+ */
+struct rt_mutex_waiter {
+	struct plist_node	list_entry;
+	struct plist_node	pi_list_entry;
+	struct task_struct	*task;
+	struct rt_mutex		*lock;	/* rt_mutex_top_waiter() BUG()s if this differs from the lock it was queued on */
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+	unsigned long		ip;
+	struct pid		*deadlock_task_pid;
+	struct rt_mutex		*deadlock_lock;
+#endif
+};
+
+/*
+ * Various helpers to access the waiters-plist (lock->wait_list, task->pi_waiters):
+ */
+static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+	return !plist_head_empty(&lock->wait_list);
+}
+
+static inline struct rt_mutex_waiter *
+rt_mutex_top_waiter(struct rt_mutex *lock)
+{
+	struct rt_mutex_waiter *w;
+
+	w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
+			       list_entry);	/* no emptiness check here; see rt_mutex_has_waiters() */
+	BUG_ON(w->lock != lock);
+
+	return w;
+}
+
+static inline int task_has_pi_waiters(struct task_struct *p)
+{
+	return !plist_head_empty(&p->pi_waiters);
+}
+
+static inline struct rt_mutex_waiter *
+task_top_pi_waiter(struct task_struct *p)
+{
+	return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
+				  pi_list_entry);	/* no emptiness check here; see task_has_pi_waiters() */
+}
+
+/*
+ * lock->owner state tracking: the low bit of the owner pointer is used as a flag
+ */
+#define RT_MUTEX_HAS_WAITERS	1UL
+#define RT_MUTEX_OWNER_MASKALL	1UL
+
+static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
+{
+	return (struct task_struct *)
+		((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);	/* strip the flag bit to get the task pointer */
+}
+
+/*
+ * PI-futex support (proxy locking functions, etc.):
+ */
+extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
+extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
+				       struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+				  struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+				     struct rt_mutex_waiter *waiter,
+				     struct task_struct *task,
+				     int detect_deadlock);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+				      struct hrtimer_sleeper *to,
+				      struct rt_mutex_waiter *waiter,
+				      int detect_deadlock);
+
+#ifdef CONFIG_DEBUG_RT_MUTEXES
+# include "rtmutex-debug.h"
+#else
+# include "rtmutex.h"	/* the non-debug stub macros */
+#endif
+
+#endif
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
new file mode 100644
index 00000000..cae050b0
--- /dev/null
+++ b/kernel/rwsem.c
@@ -0,0 +1,148 @@
+/* kernel/rwsem.c: R/W semaphores, public implementation
+ *
+ * Written by David Howells (dhowells@redhat.com).
+ * Derived from asm-i386/semaphore.h
+ */
+
+#include	/* NOTE(review): the header names of all includes below are missing, apparently lost in extraction */
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+/*
+ * lock for reading (may sleep; the acquisition is reported to lockdep first)
+ */
+void __sched down_read(struct rw_semaphore *sem)
+{
+	might_sleep();
+	rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
+
+	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+}
+
+EXPORT_SYMBOL(down_read);
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int down_read_trylock(struct rw_semaphore *sem)
+{
+	int ret = __down_read_trylock(sem);
+
+	if (ret == 1)	/* lockdep is only told about successful attempts */
+		rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+	return ret;
+}
+
+EXPORT_SYMBOL(down_read_trylock);
+
+/*
+ * lock for writing (may sleep; the acquisition is reported to lockdep first)
+ */
+void __sched down_write(struct rw_semaphore *sem)
+{
+	might_sleep();
+	rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
+
+	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+}
+
+EXPORT_SYMBOL(down_write);
+
+/*
+ * trylock for writing -- returns 1 if successful, 0 if contention
+ */
+int down_write_trylock(struct rw_semaphore *sem)
+{
+	int ret =
__down_write_trylock(sem);
+
+	if (ret == 1)	/* lockdep is only told about successful attempts */
+		rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+	return ret;
+}
+
+EXPORT_SYMBOL(down_write_trylock);
+
+/*
+ * release a read lock (the lockdep release is recorded before the actual unlock)
+ */
+void up_read(struct rw_semaphore *sem)
+{
+	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+	__up_read(sem);
+}
+
+EXPORT_SYMBOL(up_read);
+
+/*
+ * release a write lock (the lockdep release is recorded before the actual unlock)
+ */
+void up_write(struct rw_semaphore *sem)
+{
+	rwsem_release(&sem->dep_map, 1, _RET_IP_);
+
+	__up_write(sem);
+}
+
+EXPORT_SYMBOL(up_write);
+
+/*
+ * downgrade write lock to read lock
+ */
+void downgrade_write(struct rw_semaphore *sem)
+{
+	/*
+	 * lockdep: a downgraded write will live on as a write
+	 * dependency, so no lockdep call is made here.
+	 */
+	__downgrade_write(sem);
+}
+
+EXPORT_SYMBOL(downgrade_write);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+
+void down_read_nested(struct rw_semaphore *sem, int subclass)	/* down_read() with a caller-chosen lockdep subclass */
+{
+	might_sleep();
+	rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
+
+	LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+}
+
+EXPORT_SYMBOL(down_read_nested);
+
+void down_read_non_owner(struct rw_semaphore *sem)
+{
+	might_sleep();
+
+	__down_read(sem);	/* unlike down_read(), no rwsem_acquire_read() call is made */
+}
+
+EXPORT_SYMBOL(down_read_non_owner);
+
+void down_write_nested(struct rw_semaphore *sem, int subclass)	/* down_write() with a caller-chosen lockdep subclass */
+{
+	might_sleep();
+	rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
+
+	LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+}
+
+EXPORT_SYMBOL(down_write_nested);
+
+void up_read_non_owner(struct rw_semaphore *sem)
+{
+	__up_read(sem);	/* unlike up_read(), no rwsem_release() call is made */
+}
+
+EXPORT_SYMBOL(up_read_non_owner);
+
+#endif
+
+
+
diff --git a/kernel/sched.c b/kernel/sched.c
new file mode 100644
index 00000000..2da88acd
--- /dev/null
+++ b/kernel/sched.c
@@ -0,0 +1,9414 @@
+/*
+ *  kernel/sched.c
+ *
+ *  Kernel scheduler and related syscalls
+ *
+ *  Copyright (C) 1991-2002  Linus Torvalds
+ *
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *		make semaphores SMP safe
+ *  1998-11-19	Implemented schedule_timeout() and related stuff
+ *		by Andrea Arcangeli
+ *  2002-01-04	New
ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Cleanups and useful suggestions + * by Davide Libenzi, preemptible kernel bits by Robert Love. + * 2003-09-03 Interactivity tuning by Con Kolivas. + * 2004-04-02 Scheduler domains code by Nick Piggin + * 2007-04-15 Work begun on replacing all interactivity tuning with a + * fair scheduling design by Con Kolivas. + * 2007-05-05 Load balancing (smp-nice) and other improvements + * by Peter Williams + * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith + * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri + * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, + * Thomas Gleixner, Mike Kravetz + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "sched_cpupri.h" +#include "workqueue_sched.h" +#include "sched_autogroup.h" + +#define CREATE_TRACE_POINTS +#include + +/* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) + +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. 
+ */
+#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+
+/*
+ * Helpers for converting nanosecond timing to jiffy resolution
+ */
+#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))	/* TIME is cast to unsigned long before dividing */
+
+#define NICE_0_LOAD		SCHED_LOAD_SCALE
+#define NICE_0_SHIFT		SCHED_LOAD_SHIFT
+
+/*
+ * These are the 'tuning knobs' of the scheduler:
+ *
+ * default timeslice is 100 msecs (used only for SCHED_RR tasks).
+ * Timeslices get refilled after they expire.
+ */
+#define DEF_TIMESLICE		(100 * HZ / 1000)	/* 100 ms expressed in jiffies */
+
+/*
+ * single value that denotes runtime == period, ie unlimited time.
+ */
+#define RUNTIME_INF	((u64)~0ULL)
+
+static inline int rt_policy(int policy)	/* 1 for SCHED_FIFO and SCHED_RR, 0 for anything else */
+{
+	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+		return 1;
+	return 0;
+}
+
+static inline int task_has_rt_policy(struct task_struct *p)
+{
+	return rt_policy(p->policy);
+}
+
+/*
+ * This is the priority-queue data structure of the RT scheduling class:
+ */
+struct rt_prio_array {
+	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
+	struct list_head queue[MAX_RT_PRIO];	/* one list per RT priority level */
+};
+
+struct rt_bandwidth {
+	/* nests inside the rq lock: */
+	raw_spinlock_t		rt_runtime_lock;
+	ktime_t			rt_period;
+	u64			rt_runtime;	/* RUNTIME_INF makes start_rt_bandwidth() a no-op */
+	struct hrtimer		rt_period_timer;	/* handler: sched_rt_period_timer() */
+};
+
+static struct rt_bandwidth def_rt_bandwidth;
+
+static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
+
+static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)	/* hrtimer callback installed by init_rt_bandwidth() */
+{
+	struct rt_bandwidth *rt_b =
+		container_of(timer, struct rt_bandwidth, rt_period_timer);
+	ktime_t now;
+	int overrun;
+	int idle = 0;
+
+	for (;;) {
+		now = hrtimer_cb_get_time(timer);
+		overrun = hrtimer_forward(timer, now, rt_b->rt_period);
+
+		if (!overrun)	/* timer was not advanced: stop looping */
+			break;
+
+		idle = do_sched_rt_period_timer(rt_b, overrun);	/* only the result of the last iteration is kept */
+	}
+
+	return idle ?
HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+static
+void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)	/* sets up the fields and the timer; does not start the timer */
+{
+	rt_b->rt_period = ns_to_ktime(period);	/* @period is in nanoseconds */
+	rt_b->rt_runtime = runtime;
+
+	raw_spin_lock_init(&rt_b->rt_runtime_lock);
+
+	hrtimer_init(&rt_b->rt_period_timer,
+			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rt_b->rt_period_timer.function = sched_rt_period_timer;
+}
+
+static inline int rt_bandwidth_enabled(void)
+{
+	return sysctl_sched_rt_runtime >= 0;	/* a negative sysctl value disables RT bandwidth control */
+}
+
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)	/* arms rt_period_timer unless it is already active */
+{
+	ktime_t now;
+
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+		return;
+
+	if (hrtimer_active(&rt_b->rt_period_timer))	/* first check, made without rt_runtime_lock */
+		return;
+
+	raw_spin_lock(&rt_b->rt_runtime_lock);
+	for (;;) {
+		unsigned long delta;
+		ktime_t soft, hard;
+
+		if (hrtimer_active(&rt_b->rt_period_timer))	/* re-checked under the lock; also ends the loop once armed */
+			break;
+
+		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
+		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
+
+		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+		delta = ktime_to_ns(ktime_sub(hard, soft));	/* hard - soft, passed as the range below */
+		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+				HRTIMER_MODE_ABS_PINNED, 0);
+	}
+	raw_spin_unlock(&rt_b->rt_runtime_lock);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+	hrtimer_cancel(&rt_b->rt_period_timer);
+}
+#endif
+
+/*
+ * sched_domains_mutex serializes calls to init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+static DEFINE_MUTEX(sched_domains_mutex);
+
+#ifdef CONFIG_CGROUP_SCHED
+
+#include	/* NOTE(review): header name is missing here, apparently lost in extraction */
+
+struct cfs_rq;
+
+static LIST_HEAD(task_groups);
+
+/* task group related information */
+struct task_group {
+	struct cgroup_subsys_state css;	/* task_group() gets back to the group from this via container_of() */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/* schedulable entities of this group on each cpu */
+	struct sched_entity **se;
+	/* runqueue "owned" by this group on each cpu */
+	struct cfs_rq **cfs_rq;
+	unsigned long shares;
+
+	atomic_t load_weight;
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	struct sched_rt_entity **rt_se;	/* indexed by cpu in set_task_rq() */
+	struct rt_rq **rt_rq;	/* indexed by cpu in set_task_rq() */
+
+	struct rt_bandwidth rt_bandwidth;
+#endif
+
+	struct rcu_head rcu;
+	struct list_head list;
+
+	struct task_group *parent;
+	struct list_head siblings;
+	struct list_head children;
+
+#ifdef CONFIG_SCHED_AUTOGROUP
+	struct autogroup *autogroup;
+#endif
+};
+
+/* task_group_lock serializes the addition/removal of task groups */
+static DEFINE_SPINLOCK(task_group_lock);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+# define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD
+
+/*
+ * A weight of 0 or 1 can cause arithmetic problems.
+ * The weight of a cfs_rq is the sum of the weights of the entities
+ * queued on it, so the weight of an entity must not be too large,
+ * and neither must the shares value of a task group.
+ * (The default weight is 1024 - so there's no practical
+ *  limitation from this.)
+ */
+#define MIN_SHARES	(1UL <<  1)
+#define MAX_SHARES	(1UL << 18)
+
+static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
+#endif
+
+/* Default task group.
+ *	Every task in system belong to this group at bootup.
+ */
+struct task_group root_task_group;
+
+#endif	/* CONFIG_CGROUP_SCHED */
+
+/* CFS-related fields in a runqueue */
+struct cfs_rq {
+	struct load_weight load;
+	unsigned long nr_running;
+
+	u64 exec_clock;
+	u64 min_vruntime;
+#ifndef CONFIG_64BIT
+	u64 min_vruntime_copy;	/* only present on non-64-bit builds */
+#endif
+
+	struct rb_root tasks_timeline;
+	struct rb_node *rb_leftmost;
+
+	struct list_head tasks;
+	struct list_head *balance_iterator;
+
+	/*
+	 * 'curr' points to the currently running entity on this cfs_rq.
+	 * It is set to NULL otherwise (i.e. when none are currently running).
+	 */
+	struct sched_entity *curr, *next, *last, *skip;
+
+#ifdef	CONFIG_SCHED_DEBUG
+	unsigned int nr_spread_over;
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
+
+	/*
+	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
+	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
+	 * (like users, containers etc.)
+	 *
+	 * leaf_cfs_rq_list ties together the list of leaf cfs_rq's on a cpu. This
+	 * list is used during load balance.
+	 */
+	int on_list;
+	struct list_head leaf_cfs_rq_list;
+	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	/*
+	 * the part of load.weight contributed by tasks
+	 */
+	unsigned long task_weight;
+
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
+
+	/*
+	 * Maintaining per-cpu shares distribution for group scheduling
+	 *
+	 * load_stamp is the last time we updated the load average
+	 * load_last is the last time we updated the load average and saw load
+	 * load_unacc_exec_time is currently unaccounted execution time
+	 */
+	u64 load_avg;
+	u64 load_period;
+	u64 load_stamp, load_last, load_unacc_exec_time;
+
+	unsigned long load_contribution;
+#endif
+#endif
+};
+
+/* Real-Time classes' related fields in a runqueue: */
+struct rt_rq {
+	struct rt_prio_array active;
+	unsigned long rt_nr_running;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+	struct {
+		int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+		int next; /* next highest */
+#endif
+	} highest_prio;
+#endif
+#ifdef CONFIG_SMP
+	unsigned long rt_nr_migratory;
+	unsigned long rt_nr_total;
+	int overloaded;
+	struct plist_head pushable_tasks;
+#endif
+	int rt_throttled;
+	u64 rt_time;
+	u64 rt_runtime;
+	/* Nests inside the rq lock: */
+	raw_spinlock_t rt_runtime_lock;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	unsigned long rt_nr_boosted;
+
+	struct rq *rq;
+	struct list_head leaf_rt_rq_list;
+	struct task_group *tg;
+#endif
+};
+
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+	atomic_t refcount;
+	struct rcu_head rcu;
+	cpumask_var_t span;
+	cpumask_var_t online;
+
+	/*
+	 * The "RT overload" flag: it gets set if a CPU has more than
+	 * one runnable RT task.
+	 */
+	cpumask_var_t rto_mask;
+	atomic_t rto_count;
+	struct cpupri cpupri;
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ *
+ * Locking rule: in those places that want to lock multiple runqueues
+ * (such as the load balancing or the thread migration code), lock
+ * acquire operations must be ordered by ascending &runqueue.
+ */
+struct rq {
+	/* runqueue lock: */
+	raw_spinlock_t lock;
+
+	/*
+	 * nr_running and cpu_load should be in the same cacheline because
+	 * remote CPUs use both these fields when doing load calculation.
+	 */
+	unsigned long nr_running;
+	#define CPU_LOAD_IDX_MAX 5
+	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+	unsigned long last_load_update_tick;
+#ifdef CONFIG_NO_HZ
+	u64 nohz_stamp;
+	unsigned char nohz_balance_kick;
+#endif
+	int skip_clock_update;	/* a value > 0 makes update_rq_clock() return without updating */
+
+	/* capture load from *all* tasks on this cpu: */
+	struct load_weight load;
+	unsigned long nr_load_updates;
+	u64 nr_switches;
+
+	struct cfs_rq cfs;
+	struct rt_rq rt;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/* list of leaf cfs_rq on this cpu: */
+	struct list_head leaf_cfs_rq_list;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+	struct list_head leaf_rt_rq_list;
+#endif
+
+	/*
+	 * This is part of a global counter where only the total sum
+	 * over all CPUs matters. A task can increase this counter on
+	 * one CPU and if it got migrated afterwards it may decrease
+	 * it on another CPU.
Always updated under the runqueue lock:
+	 */
+	unsigned long nr_uninterruptible;
+
+	struct task_struct *curr, *idle, *stop;	/* curr is what task_current() compares against */
+	unsigned long next_balance;
+	struct mm_struct *prev_mm;
+
+	u64 clock;	/* advanced by update_rq_clock() from sched_clock_cpu() */
+	u64 clock_task;
+
+	atomic_t nr_iowait;
+
+#ifdef CONFIG_SMP
+	struct root_domain *rd;
+	struct sched_domain *sd;	/* head of the chain walked by for_each_domain() */
+
+	unsigned long cpu_power;
+
+	unsigned char idle_at_tick;
+	/* For active balancing */
+	int post_schedule;
+	int active_balance;
+	int push_cpu;
+	struct cpu_stop_work active_balance_work;
+	/* cpu of this runqueue (what cpu_of() returns on SMP): */
+	int cpu;
+	int online;
+
+	unsigned long avg_load_per_task;
+
+	u64 rt_avg;
+	u64 age_stamp;
+	u64 idle_stamp;
+	u64 avg_idle;
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif
+
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
+#ifdef CONFIG_SCHED_HRTICK
+#ifdef CONFIG_SMP
+	int hrtick_csd_pending;
+	struct call_single_data hrtick_csd;
+#endif
+	struct hrtimer hrtick_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+	/* latency stats */
+	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ?
*/
+
+	/* sys_sched_yield() stats */
+	unsigned int yld_count;
+
+	/* schedule() stats */
+	unsigned int sched_switch;
+	unsigned int sched_count;
+	unsigned int sched_goidle;
+
+	/* try_to_wake_up() stats */
+	unsigned int ttwu_count;
+	unsigned int ttwu_local;
+#endif
+
+#ifdef CONFIG_SMP
+	struct task_struct *wake_list;
+#endif
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);	/* one struct rq per CPU; accessed via cpu_rq()/this_rq() */
+
+
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
+
+static inline int cpu_of(struct rq *rq)	/* rq->cpu on SMP, constant 0 otherwise */
+{
+#ifdef CONFIG_SMP
+	return rq->cpu;
+#else
+	return 0;
+#endif
+}
+
+#define rcu_dereference_check_sched_domain(p) \
+	rcu_dereference_check((p), \
+			      rcu_read_lock_held() || \
+			      lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections. for_each_domain() walks from rq->sd up via ->parent.
+ */
+#define for_each_domain(cpu, __sd) \
+	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+
+#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
+#define this_rq()		(&__get_cpu_var(runqueues))
+#define task_rq(p)		cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+#define raw_rq()		(&__raw_get_cpu_var(runqueues))
+
+#ifdef CONFIG_CGROUP_SCHED
+
+/*
+ * Return the group to which this tasks belongs.
+ *
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
+ */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	struct task_group *tg;
+	struct cgroup_subsys_state *css;
+
+	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
+			lockdep_is_held(&p->pi_lock) ||
+			lockdep_is_held(&task_rq(p)->lock));	/* either lock satisfies the check */
+	tg = container_of(css, struct task_group, css);
+
+	return autogroup_task_group(p, tg);	/* the autogroup code has the final say on the returned group */
+}
+
+/* Change a task's cfs_rq/rt_rq and parent entity if it moves across CPUs/groups */
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
+	p->se.parent = task_group(p)->se[cpu];
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
+	p->rt.parent = task_group(p)->rt_se[cpu];
+#endif
+}
+
+#else /* CONFIG_CGROUP_SCHED */
+
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }	/* no-op without cgroup scheduling */
+static inline struct task_group *task_group(struct task_struct *p)
+{
+	return NULL;	/* there are no task groups without cgroup scheduling */
+}
+
+#endif /* CONFIG_CGROUP_SCHED */
+
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+static void update_rq_clock(struct rq *rq)	/* brings rq->clock up to sched_clock_cpu() of the rq's cpu */
+{
+	s64 delta;
+
+	if (rq->skip_clock_update > 0)
+		return;
+
+	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+	rq->clock += delta;
+	update_rq_clock_task(rq, delta);	/* the same delta is handed on for the task clock */
+}
+
+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# define const_debug __read_mostly
+#else
+# define const_debug static const
+#endif
+
+/**
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
+ * @cpu: the processor in question.
+ *
+ * This interface allows printk to be called with the runqueue lock
+ * held and know whether or not it is OK to wake up the klogd.
+ */ +int runqueue_is_locked(int cpu) +{ + return raw_spin_is_locked(&cpu_rq(cpu)->lock); +} + +/* + * Debugging: various feature bits + */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + +enum { +#include "sched_features.h" +}; + +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + +const_debug unsigned int sysctl_sched_features = +#include "sched_features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +static __read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; sched_feat_names[i]; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + 
&sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) + +/* + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ +const_debug unsigned int sysctl_sched_nr_migrate = 32; + +/* + * period over which we average the RT time consumption, measured + * in ms. + * + * default: 1s + */ +const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC; + +/* + * period over which we measure -rt task cpu usage in us. + * default: 1s + */ +unsigned int sysctl_sched_rt_period = 1000000; + +static __read_mostly int scheduler_running; + +/* + * part of the period that we allow rt tasks to run in us. + * default: 0.95s + */ +int sysctl_sched_rt_runtime = 950000; + +static inline u64 global_rt_period(void) +{ + return (u64)sysctl_sched_rt_period * NSEC_PER_USEC; +} + +static inline u64 global_rt_runtime(void) +{ + if (sysctl_sched_rt_runtime < 0) + return RUNTIME_INF; + + return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; +} + +#ifndef prepare_arch_switch +# define prepare_arch_switch(next) do { } while (0) +#endif +#ifndef finish_arch_switch +# define finish_arch_switch(prev) do { } while (0) +#endif + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->on_cpu; +#else + return task_current(rq, p); +#endif +} + +#ifndef __ARCH_WANT_UNLOCKED_CTXSW +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. 
+ */ + next->on_cpu = 1; +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = current; +#endif + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + + raw_spin_unlock_irq(&rq->lock); +} + +#else /* __ARCH_WANT_UNLOCKED_CTXSW */ +static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * We can optimise this out completely for !SMP, because the + * SMP rebalancing from interrupt is the only thing that cares + * here. + */ + next->on_cpu = 1; +#endif +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + raw_spin_unlock_irq(&rq->lock); +#else + raw_spin_unlock(&rq->lock); +#endif +} + +static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + */ + smp_wmb(); + prev->on_cpu = 0; +#endif +#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW + local_irq_enable(); +#endif +} +#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ + +/* + * __task_rq_lock - lock the rq @p resides on. 
+ */ +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + for (;;) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + raw_spin_unlock(&rq->lock); + } +} + +/* + * task_rq_lock - lock p->pi_lock and lock the rq @p resides on. + */ +static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) +{ + struct rq *rq; + + for (;;) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + return rq; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } +} + +static void __task_rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); +} + +static inline void +task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) + __releases(p->pi_lock) +{ + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); +} + +/* + * this_rq_lock - lock this runqueue and disable interrupts. + */ +static struct rq *this_rq_lock(void) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * It's all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. 
+ */ + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + if (!cpu_active(cpu_of(rq))) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + raw_spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +#ifdef CONFIG_SMP +/* + * called from hardirq (IPI) context + */ +static void __hrtick_start(void *arg) +{ + struct rq *rq = arg; + + raw_spin_lock(&rq->lock); + hrtimer_restart(&rq->hrtick_timer); + rq->hrtick_csd_pending = 0; + raw_spin_unlock(&rq->lock); +} + +/* + * Called to set the hrtick timer state. 
+ * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = ktime_add_ns(timer->base->get_time(), delay); + + hrtimer_set_expires(timer, time); + + if (rq == this_rq()) { + hrtimer_restart(timer); + } else if (!rq->hrtick_csd_pending) { + __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0); + rq->hrtick_csd_pending = 1; + } +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hrtick_clear(cpu_rq(cpu)); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static __init void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} +#else +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay) +{ + __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) +{ +#ifdef CONFIG_SMP + rq->hrtick_csd_pending = 0; + + rq->hrtick_csd.flags = 0; + rq->hrtick_csd.func = __hrtick_start; + rq->hrtick_csd.info = rq; +#endif + + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; +} +#else /* CONFIG_SCHED_HRTICK */ +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +static inline void init_hrtick(void) +{ +} +#endif /* CONFIG_SCHED_HRTICK */ + +/* + * resched_task - mark a task 'to be rescheduled now'. 
+ * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +static void resched_task(struct task_struct *p) +{ + int cpu; + + assert_raw_spin_locked(&task_rq(p)->lock); + + if (test_tsk_need_resched(p)) + return; + + set_tsk_need_resched(p); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); +} + +static void resched_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + if (!raw_spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +#ifdef CONFIG_NO_HZ +/* + * In the semi-idle case, use the nearest busy cpu for migrating timers + * from an idle cpu. This is good for power-savings. + * + * We don't do a similar optimization for a completely idle system, as + * selecting an idle cpu will add more delays to the timers than intended + * (as that cpu's timer base may not be up to date wrt jiffies etc). + */ +int get_nohz_timer_target(void) +{ + int cpu = smp_processor_id(); + int i; + struct sched_domain *sd; + + rcu_read_lock(); + for_each_domain(cpu, sd) { + for_each_cpu(i, sched_domain_span(sd)) { + if (!idle_cpu(i)) { + cpu = i; + goto unlock; + } + } + } +unlock: + rcu_read_unlock(); + return cpu; +} +/* + * When add_timer_on() enqueues a timer into the timer wheel of an + * idle CPU then this timer might expire before the next timer event + * which is scheduled to wake up that CPU. In case of a completely + * idle system the next event might even be infinite time into the + * future. 
wake_up_idle_cpu() ensures that the CPU is woken up and + * leaves the inner idle loop so the newly added timer is taken into + * account when the CPU goes back to idle and evaluates the timer + * wheel for the next timer event. + */ +void wake_up_idle_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + if (cpu == smp_processor_id()) + return; + + /* + * This is safe, as this function is called with the timer + * wheel base lock of (cpu) held. When the CPU is on the way + * to idle and has not yet set rq->curr to idle then it will + * be serialized on the timer wheel base lock and take the new + * timer into account automatically. + */ + if (rq->curr != rq->idle) + return; + + /* + * We can set TIF_RESCHED on the idle task of the other CPU + * lockless. The worst case is that the other CPU runs the + * idle task through an additional NOOP schedule() + */ + set_tsk_need_resched(rq->idle); + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(rq->idle)) + smp_send_reschedule(cpu); +} + +#endif /* CONFIG_NO_HZ */ + +static u64 sched_avg_period(void) +{ + return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; +} + +static void sched_avg_update(struct rq *rq) +{ + s64 period = sched_avg_period(); + + while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. 
+ */ + asm("" : "+rm" (rq->age_stamp)); + rq->age_stamp += period; + rq->rt_avg /= 2; + } +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ + rq->rt_avg += rt_delta; + sched_avg_update(rq); +} + +#else /* !CONFIG_SMP */ +static void resched_task(struct task_struct *p) +{ + assert_raw_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} + +static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) +{ +} + +static void sched_avg_update(struct rq *rq) +{ +} +#endif /* CONFIG_SMP */ + +#if BITS_PER_LONG == 32 +# define WMULT_CONST (~0UL) +#else +# define WMULT_CONST (1UL << 32) +#endif + +#define WMULT_SHIFT 32 + +/* + * Shift right and round: + */ +#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +/* + * delta *= weight / lw + */ +static unsigned long +calc_delta_mine(unsigned long delta_exec, unsigned long weight, + struct load_weight *lw) +{ + u64 tmp; + + /* + * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched + * entities since MIN_SHARES = 2. Treat weight as 1 if less than + * 2^SCHED_LOAD_RESOLUTION. 
+ */ + if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION))) + tmp = (u64)delta_exec * scale_load_down(weight); + else + tmp = (u64)delta_exec; + + if (!lw->inv_weight) { + unsigned long w = scale_load_down(lw->weight); + + if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST)) + lw->inv_weight = 1; + else if (unlikely(!w)) + lw->inv_weight = WMULT_CONST; + else + lw->inv_weight = WMULT_CONST / w; + } + + /* + * Check whether we'd overflow the 64-bit multiplication: + */ + if (unlikely(tmp > WMULT_CONST)) + tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT); + + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); +} + +static inline void update_load_add(struct load_weight *lw, unsigned long inc) +{ + lw->weight += inc; + lw->inv_weight = 0; +} + +static inline void update_load_sub(struct load_weight *lw, unsigned long dec) +{ + lw->weight -= dec; + lw->inv_weight = 0; +} + +static inline void update_load_set(struct load_weight *lw, unsigned long w) +{ + lw->weight = w; + lw->inv_weight = 0; +} + +/* + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. + */ + +#define WEIGHT_IDLEPRIO 3 +#define WMULT_IDLEPRIO 1431655765 + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. 
(to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +static const int prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +static const u32 prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; + +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... 
kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +#ifdef CONFIG_CGROUP_CPUACCT +static void cpuacct_charge(struct task_struct *tsk, u64 cputime); +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val); +#else +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} +static inline void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) {} +#endif + +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + +#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) +typedef int (*tg_visitor)(struct task_group *, void *); + +/* + * Iterate the full tree, calling @down when first entering a node and @up when + * leaving it for the final time. + */ +static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) +{ + struct task_group *parent, *child; + int ret; + + rcu_read_lock(); + parent = &root_task_group; +down: + ret = (*down)(parent, data); + if (ret) + goto out_unlock; + list_for_each_entry_rcu(child, &parent->children, siblings) { + parent = child; + goto down; + +up: + continue; + } + ret = (*up)(parent, data); + if (ret) + goto out_unlock; + + child = parent; + parent = parent->parent; + if (parent) + goto up; +out_unlock: + rcu_read_unlock(); + + return ret; +} + +static int tg_nop(struct task_group *tg, void *data) +{ + return 0; +} +#endif + +#ifdef CONFIG_SMP +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ + return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. 
+ */ +static unsigned long source_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long total = weighted_cpuload(cpu); + + if (type == 0 || !sched_feat(LB_BIAS)) + return total; + + return max(rq->cpu_load[type-1], total); +} + +static unsigned long power_of(int cpu) +{ + return cpu_rq(cpu)->cpu_power; +} + +static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); + +static unsigned long cpu_avg_load_per_task(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long nr_running = ACCESS_ONCE(rq->nr_running); + + if (nr_running) + rq->avg_load_per_task = rq->load.weight / nr_running; + else + rq->avg_load_per_task = 0; + + return rq->avg_load_per_task; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parent's load. 
+ */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->se[cpu]->load.weight; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_h_load(long cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + +#endif + +#ifdef CONFIG_PREEMPT + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); + +/* + * fair double_lock_balance: Safely acquires both rq->locks in a fair + * way at the expense of forcing extra atomic operations in all + * invocations. This assures that the double_lock is acquired using the + * same underlying policy as the spinlock_t on this architecture, which + * reduces latency compared to the unfair variant below. However, it + * also adds more overhead and therefore may reduce throughput. + */ +static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + raw_spin_unlock(&this_rq->lock); + double_rq_lock(this_rq, busiest); + + return 1; +} + +#else +/* + * Unfair double_lock_balance: Optimizes throughput at the expense of + * latency by eliminating extra atomic operations when the locks are + * already in proper order on entry. This favors lower cpu-ids and will + * grant the double lock to lower cpus over higher ids under contention, + * regardless of entry order into the function. 
+ */ +static int _double_lock_balance(struct rq *this_rq, struct rq *busiest) + __releases(this_rq->lock) + __acquires(busiest->lock) + __acquires(this_rq->lock) +{ + int ret = 0; + + if (unlikely(!raw_spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + raw_spin_unlock(&this_rq->lock); + raw_spin_lock(&busiest->lock); + raw_spin_lock_nested(&this_rq->lock, + SINGLE_DEPTH_NESTING); + ret = 1; + } else + raw_spin_lock_nested(&busiest->lock, + SINGLE_DEPTH_NESTING); + } + return ret; +} + +#endif /* CONFIG_PREEMPT */ + +/* + * double_lock_balance - lock the busiest runqueue, this_rq is locked already. + */ +static int double_lock_balance(struct rq *this_rq, struct rq *busiest) +{ + if (unlikely(!irqs_disabled())) { + /* printk() doesn't work well under rq->lock */ + raw_spin_unlock(&this_rq->lock); + BUG_ON(1); + } + + return _double_lock_balance(this_rq, busiest); +} + +static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) + __releases(busiest->lock) +{ + raw_spin_unlock(&busiest->lock); + lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); +} + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } + } +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. 
+ */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +#else /* CONFIG_SMP */ + +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. + */ +static void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + BUG_ON(rq1 != rq2); + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + BUG_ON(rq1 != rq2); + raw_spin_unlock(&rq1->lock); + __release(rq2->lock); +} + +#endif + +static void calc_load_account_idle(struct rq *this_rq); +static void update_sysctl(void); +static int get_update_sysctl_factor(void); +static void update_cpu_load(struct rq *this_rq); + +static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + set_task_rq(p, cpu); +#ifdef CONFIG_SMP + /* + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be + * successfully executed on another CPU. We must ensure that updates of + * per-task data have been completed by this moment. 
+ */ + smp_wmb(); + task_thread_info(p)->cpu = cpu; +#endif +} + +static const struct sched_class rt_sched_class; + +#define sched_class_highest (&stop_sched_class) +#define for_each_class(class) \ + for (class = sched_class_highest; class; class = class->next) + +#include "sched_stats.h" + +static void inc_nr_running(struct rq *rq) +{ + rq->nr_running++; +} + +static void dec_nr_running(struct rq *rq) +{ + rq->nr_running--; +} + +static void set_load_weight(struct task_struct *p) +{ + int prio = p->static_prio - MAX_RT_PRIO; + struct load_weight *load = &p->se.load; + + /* + * SCHED_IDLE tasks get minimal weight: + */ + if (p->policy == SCHED_IDLE) { + load->weight = scale_load(WEIGHT_IDLEPRIO); + load->inv_weight = WMULT_IDLEPRIO; + return; + } + + load->weight = scale_load(prio_to_weight[prio]); + load->inv_weight = prio_to_wmult[prio]; +} + +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) +{ + update_rq_clock(rq); + sched_info_queued(p); + p->sched_class->enqueue_task(rq, p, flags); +} + +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) +{ + update_rq_clock(rq); + sched_info_dequeued(p); + p->sched_class->dequeue_task(rq, p, flags); +} + +/* + * activate_task - move a task to the runqueue. + */ +static void activate_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible--; + + enqueue_task(rq, p, flags); + inc_nr_running(rq); +} + +/* + * deactivate_task - remove a task from the runqueue. + */ +static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) +{ + if (task_contributes_to_load(p)) + rq->nr_uninterruptible++; + + dequeue_task(rq, p, flags); + dec_nr_running(rq); +} + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +/* + * There are no locks covering percpu hardirq/softirq time. + * They are only modified in account_system_vtime, on corresponding CPU + * with interrupts disabled. So, writes are safe. 
+ * They are read and saved off onto struct rq in update_rq_clock(). + * This may result in other CPU reading this CPU's irq time and can + * race with irq/account_system_vtime on this CPU. We would either get old + * or new value with a side effect of accounting a slice of irq time to wrong + * task when irq is in progress while we read rq->clock. That is a worthy + * compromise in place of having locks on each irq in account_system_time. + */ +static DEFINE_PER_CPU(u64, cpu_hardirq_time); +static DEFINE_PER_CPU(u64, cpu_softirq_time); + +static DEFINE_PER_CPU(u64, irq_start_time); +static int sched_clock_irqtime; + +void enable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 1; +} + +void disable_sched_clock_irqtime(void) +{ + sched_clock_irqtime = 0; +} + +#ifndef CONFIG_64BIT +static DEFINE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ + +/* + * Called before incrementing preempt_count on {soft,}irq_enter + * and before decrementing preempt_count on {soft,}irq_exit. 
+ */ +void account_system_vtime(struct task_struct *curr) +{ + unsigned long flags; + s64 delta; + int cpu; + + if (!sched_clock_irqtime) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); + __this_cpu_add(irq_start_time, delta); + + irq_time_write_begin(); + /* + * We do not account for softirq time from ksoftirqd here. + * We want to continue accounting softirq time to the ksoftirqd thread + * in that case, so as not to confuse the scheduler with a special task + * that does not consume any time, but still wants to run. + */ + if (hardirq_count()) + __this_cpu_add(cpu_hardirq_time, delta); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + __this_cpu_add(cpu_softirq_time, delta); + + irq_time_write_end(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(account_system_vtime); + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ + s64 irq_delta; + + irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; + + /* + * Since irq_time is only updated on {soft,}irq_exit, we might run into + * this case when a previous update_rq_clock() happened inside a + * {soft,}irq region. + * + * When this happens, we stop ->clock_task and only update the + * prev_irq_time stamp to account for the part that fit, so that a next + * update will consume the rest. This ensures ->clock_task is + * monotonic. + * + * It does however cause some slight misattribution of {soft,}irq + * time, a more accurate solution would be to update the irq_time using + * the current rq->clock timestamp, except that would require using + * atomic ops. 
+ */ + if (irq_delta > delta) + irq_delta = delta; + + rq->prev_irq_time += irq_delta; + delta -= irq_delta; + rq->clock_task += delta; + + if (irq_delta && sched_feat(NONIRQ_POWER)) + sched_rt_avg_update(rq, irq_delta); +} + +static int irqtime_account_hi_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_hardirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +static int irqtime_account_si_update(void) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + unsigned long flags; + u64 latest_ns; + int ret = 0; + + local_irq_save(flags); + latest_ns = this_cpu_read(cpu_softirq_time); + if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) + ret = 1; + local_irq_restore(flags); + return ret; +} + +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#define sched_clock_irqtime (0) + +static void update_rq_clock_task(struct rq *rq, s64 delta) +{ + rq->clock_task += delta; +} + +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#include "sched_idletask.c" +#include "sched_fair.c" +#include "sched_rt.c" +#include "sched_autogroup.c" +#include "sched_stoptask.c" +#ifdef CONFIG_SCHED_DEBUG +# include "sched_debug.c" +#endif + +void sched_set_stop_task(int cpu, struct task_struct *stop) +{ + struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; + struct task_struct *old_stop = cpu_rq(cpu)->stop; + + if (stop) { + /* + * Make it appear like a SCHED_FIFO task, it's something + * userspace knows about and won't get confused about. + * + * Also, it will make PI more or less work without too + * much confusion -- but then, stop work should not + * rely on PI working anyway. 
+ */ + sched_setscheduler_nocheck(stop, SCHED_FIFO, &param); + + stop->sched_class = &stop_sched_class; + } + + cpu_rq(cpu)->stop = stop; + + if (old_stop) { + /* + * Reset it back to a normal scheduling class so that + * it can die in pieces. + */ + old_stop->sched_class = &rt_sched_class; + } +} + +/* + * __normal_prio - return the priority that is based on the static prio + */ +static inline int __normal_prio(struct task_struct *p) +{ + return p->static_prio; +} + +/* + * Calculate the expected normal priority: i.e. priority + * without taking RT-inheritance into account. Might be + * boosted by interactivity modifiers. Changes upon fork, + * setprio syscalls, and whenever the interactivity + * estimator recalculates. + */ +static inline int normal_prio(struct task_struct *p) +{ + int prio; + + if (task_has_rt_policy(p)) + prio = MAX_RT_PRIO-1 - p->rt_priority; + else + prio = __normal_prio(p); + return prio; +} + +/* + * Calculate the current priority, i.e. the priority + * taken into account by the scheduler. This value might + * be boosted by RT tasks, or might be boosted by + * interactivity modifiers. Will be RT if the task got + * RT-boosted. If not then it returns p->normal_prio. + */ +static int effective_prio(struct task_struct *p) +{ + p->normal_prio = normal_prio(p); + /* + * If we are RT tasks or we were boosted to RT priority, + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ + if (!rt_prio(p->prio)) + return p->normal_prio; + return p->prio; +} + +/** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. 
+ */
+inline int task_curr(const struct task_struct *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+/*
+ * check_class_changed - run the class hooks after @p's class or prio changed
+ * @rq: runqueue handed to the hooks
+ * @p: the task that was changed
+ * @prev_class: scheduling class @p had before the change
+ * @oldprio: value of p->prio before the change
+ *
+ * If the class changed, the old class's ->switched_from() (when set) and
+ * the new class's ->switched_to() are called. Otherwise, if only the
+ * priority changed, the class's ->prio_changed() is called.
+ */
+static inline void check_class_changed(struct rq *rq, struct task_struct *p,
+				       const struct sched_class *prev_class,
+				       int oldprio)
+{
+	if (prev_class != p->sched_class) {
+		if (prev_class->switched_from)
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
+}
+/*
+ * check_preempt_curr - decide whether @p should preempt rq->curr
+ *
+ * Same class: the decision is delegated to that class's
+ * ->check_preempt_curr(). Different classes: for_each_class() is walked
+ * and rq->curr is rescheduled only if @p's class is reached before
+ * rq->curr's class (presumably the walk goes from the highest-priority
+ * class downwards -- confirm against the for_each_class() definition).
+ */
+static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+{
+	const struct sched_class *class;
+
+	if (p->sched_class == rq->curr->sched_class) {
+		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
+	} else {
+		for_each_class(class) {
+			if (class == rq->curr->sched_class)
+				break;
+			if (class == p->sched_class) {
+				resched_task(rq->curr);
+				break;
+			}
+		}
+	}
+
+	/*
+	 * A queue event has occurred, and we're going to schedule. In
+	 * this case, we can save a useless back to back clock update.
+	 */
+	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
+		rq->skip_clock_update = 1;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Is this task likely cache-hot:
+ *
+ * Returns 1 if @p is to be treated as cache hot, 0 otherwise. The checks
+ * are made in this order: tasks outside fair_sched_class and SCHED_IDLE
+ * tasks are never hot; buddy candidates are hot (see below); a
+ * sysctl_sched_migration_cost of -1 means hot, 0 means not hot; otherwise
+ * @p is hot while less than sysctl_sched_migration_cost has passed
+ * between p->se.exec_start and @now. @sd is not used by this function.
+ */
+static int
+task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
+{
+	s64 delta;
+
+	if (p->sched_class != &fair_sched_class)
+		return 0;
+
+	if (unlikely(p->policy == SCHED_IDLE))
+		return 0;
+
+	/*
+	 * Buddy candidates are cache hot:
+	 */
+	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
+			(&p->se == cfs_rq_of(&p->se)->next ||
+			 &p->se == cfs_rq_of(&p->se)->last))
+		return 1;
+
+	if (sysctl_sched_migration_cost == -1)
+		return 1;
+	if (sysctl_sched_migration_cost == 0)
+		return 0;
+
+	delta = now - p->se.exec_start;
+
+	return delta < (s64)sysctl_sched_migration_cost;
+}
+/*
+ * set_task_cpu - record that @p now belongs to @new_cpu
+ *
+ * Emits the sched_migrate_task tracepoint and, when the CPU really
+ * changes, bumps p->se.nr_migrations and the PERF_COUNT_SW_CPU_MIGRATIONS
+ * software event, before handing over to __set_task_cpu().
+ */
+void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
+{
+#ifdef CONFIG_SCHED_DEBUG
+	/*
+	 * We should never call set_task_cpu() on a blocked task,
+	 * ttwu() will sort out the placement.
+	 */
+	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
+			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold either p->pi_lock or rq->lock, when changing
+	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+	 *
+	 * sched_move_task() holds both and thus holding either pins the cgroup,
+	 * see set_task_rq().
+	 *
+	 * Furthermore, all task_rq users should acquire both locks, see
+	 * task_rq_lock().
+	 */
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
+#endif
+
+	trace_sched_migrate_task(p, new_cpu);
+
+	if (task_cpu(p) != new_cpu) {
+		p->se.nr_migrations++;
+		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+	}
+
+	__set_task_cpu(p, new_cpu);
+}
+
+struct migration_arg {
+	struct task_struct *task;
+	int dest_cpu;
+};
+
+static int migration_cpu_stop(void *data);
+
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change. If it changes, i.e. @p might have woken up,
+ * then return zero. When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count). If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+	unsigned long flags;
+	int running, on_rq;
+	unsigned long ncsw;
+	struct rq *rq;
+
+	for (;;) {
+		/*
+		 * We do the initial early heuristics without holding
+		 * any task-queue locks at all. We'll only try to get
+		 * the runqueue lock when things look like they will
+		 * work out!
+		 */
+		rq = task_rq(p);
+
+		/*
+		 * If the task is actively running on another CPU
+		 * still, just relax and busy-wait without holding
+		 * any locks.
+		 *
+		 * NOTE! Since we don't hold any locks, it's not
+		 * even sure that "rq" stays as the right runqueue!
+		 * But we don't care, since "task_running()" will
+		 * return false if the runqueue has changed and p
+		 * is actually now running somewhere else!
+		 */
+		while (task_running(rq, p)) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
+			cpu_relax();
+		}
+
+		/*
+		 * Ok, time to look more closely! We need the rq
+		 * lock now, to be *sure*. If we're wrong, we'll
+		 * just go back and repeat.
+		 */
+		rq = task_rq_lock(p, &flags);
+		trace_sched_wait_task(p);
+		running = task_running(rq, p);
+		on_rq = p->on_rq;
+		ncsw = 0;
+		if (!match_state || p->state == match_state)
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB, so a valid count is never 0 */
+		task_rq_unlock(rq, p, &flags);
+
+		/*
+		 * If it changed from the expected state, bail out now.
+		 * (ncsw is still 0 here, which is what gets returned.)
+		 */
+		if (unlikely(!ncsw))
+			break;
+
+		/*
+		 * Was it really running after all now that we
+		 * checked with the proper locks actually held?
+		 *
+		 * Oops. Go back and try again..
+		 */
+		if (unlikely(running)) {
+			cpu_relax();
+			continue;
+		}
+
+		/*
+		 * It's not enough that it's not actively running,
+		 * it must be off the runqueue _entirely_, and not
+		 * preempted!
+		 *
+		 * So if it was still runnable (but just not actively
+		 * running right now), it's preempted, and we should
+		 * yield - it could be a while.
+		 */
+		if (unlikely(on_rq)) {
+			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+			/* sleep for NSEC_PER_SEC/HZ ns (one tick) and retry */
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+			continue;
+		}
+
+		/*
+		 * Ahh, all good. It wasn't running, and it wasn't
+		 * runnable, which means that it will never become
+		 * running in the future either. We're all done!
+		 */
+		break;
+	}
+
+	return ncsw;
+}
+
+/**
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ *
+ * Nothing is sent when @p is on the local CPU or is not the task
+ * currently executing on its CPU.
+ */
+void kick_process(struct task_struct *p)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if ((cpu != smp_processor_id()) && task_curr(p))
+		smp_send_reschedule(cpu);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_SMP
+/*
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
+ *
+ * select_fallback_rq - pick another CPU for @p, starting from @cpu
+ *
+ * Tried in order: an allowed and active CPU in @cpu's node, any allowed
+ * and active CPU, and finally whatever cpuset_cpus_allowed_fallback()
+ * returns. Returns the chosen CPU number.
+ */
+static int select_fallback_rq(int cpu, struct task_struct *p)
+{
+	int dest_cpu;
+	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+
+	/* Look for allowed, online CPU in same node. */
+	for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
+		if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
+			return dest_cpu;
+
+	/* Any allowed, online CPU? */
+	dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
+	if (dest_cpu < nr_cpu_ids)
+		return dest_cpu;
+
+	/* No more Mr. Nice Guy. */
+	dest_cpu = cpuset_cpus_allowed_fallback(p);
+	/*
+	 * Don't tell them about moving exiting tasks or
+	 * kernel threads (both mm NULL), since they never
+	 * leave kernel.
+	 */
+	if (p->mm && printk_ratelimit()) {
+		printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
+				task_pid_nr(p), p->comm, cpu);
+	}
+
+	return dest_cpu;
+}
+
+/*
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
+ *
+ * Asks @p's scheduling class for a CPU and falls back to
+ * select_fallback_rq() when the answer is not in ->cpus_allowed or is
+ * not online. Returns the CPU to use.
+ */
+static inline
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
+{
+	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
+
+	/*
+	 * In order not to call set_task_cpu() on a blocking task we need
+	 * to rely on ttwu() to place the task on a valid ->cpus_allowed
+	 * cpu.
+	 *
+	 * Since this is common to all placement strategies, this lives here.
+	 *
+	 * [ this allows ->select_task() to simply return task_cpu(p) and
+	 *   not worry about this generic constraint ]
+	 */
+	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
+		     !cpu_online(cpu)))
+		cpu = select_fallback_rq(task_cpu(p), p);
+
+	return cpu;
+}
+/*
+ * Move the running average *@avg one eighth of the way towards @sample:
+ * avg += (sample - avg) >> 3, with the difference taken as signed.
+ */
+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+#endif
+/*
+ * ttwu_stat - bump the schedstat wakeup counters for a wakeup of @p
+ * @p: the task that was woken
+ * @cpu: the CPU @p was woken on
+ * @wake_flags: WF_* flags of the wakeup (WF_MIGRATED and WF_SYNC are
+ *              each counted separately)
+ *
+ * The body is compiled only with CONFIG_SCHEDSTATS. On SMP the wakeup is
+ * counted as local or remote depending on whether @cpu is the CPU
+ * running this code.
+ */
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+#ifdef CONFIG_SCHEDSTATS
+	struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+	int this_cpu = smp_processor_id();
+
+	if (cpu == this_cpu) {
+		schedstat_inc(rq, ttwu_local);
+		schedstat_inc(p, se.statistics.nr_wakeups_local);
+	} else {
+		struct sched_domain *sd;
+
+		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+		rcu_read_lock();
+		for_each_domain(this_cpu, sd) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	if (wake_flags & WF_MIGRATED)
+		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
+#endif /* CONFIG_SMP */
+
+	schedstat_inc(rq, ttwu_count);
+	schedstat_inc(p, se.statistics.nr_wakeups);
+
+	if (wake_flags & WF_SYNC)
+		schedstat_inc(p, se.statistics.nr_wakeups_sync);
+
+#endif /* CONFIG_SCHEDSTATS */
+}
+/*
+ * Put @p on @rq with activate_task() and mark it ->on_rq. Workqueue
+ * workers (PF_WQ_WORKER) additionally get wq_worker_waking_up() called.
+ */
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
+	activate_task(rq, p, en_flags);
+	p->on_rq = 1;
+
+	/* if a worker is waking up, notify workqueue */
+	if (p->flags & PF_WQ_WORKER)
+		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ *
+ * On SMP this also calls the class's ->task_woken() hook when set, and
+ * if rq->idle_stamp is set the elapsed idle period is folded into
+ * rq->avg_idle (a period longer than 2*sysctl_sched_migration_cost sets
+ * avg_idle to that limit instead) and the stamp is cleared.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+	trace_sched_wakeup(p, true);
+	check_preempt_curr(rq, p, wake_flags);
+
+	p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+	if (p->sched_class->task_woken)
+		p->sched_class->task_woken(rq, p);
+
+	if (unlikely(rq->idle_stamp)) {
+		u64 delta = rq->clock - rq->idle_stamp;
+		u64 max = 2*sysctl_sched_migration_cost;
+
+		if (delta > max)
+			rq->avg_idle = max;
+		else
+			update_avg(&rq->avg_idle, delta);
+		rq->idle_stamp = 0;
+	}
+#endif
+}
+/*
+ * Full wakeup of @p on @rq: on SMP drop its contribution to
+ * rq->nr_uninterruptible if it had one, then enqueue it and mark it
+ * runnable.
+ */
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+	if (p->sched_contributes_to_load)
+		rq->nr_uninterruptible--;
+#endif
+
+	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+	ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+	/* Returns 1 if the wakeup was done here, 0 if @p was not ->on_rq. */
+	rq = __task_rq_lock(p);
+	if (p->on_rq) {
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_do_pending(struct task_struct *list)
+{
+	struct rq *rq = this_rq();
+	/*
+	 * Activate every task on @list (chained through ->wake_entry) on
+	 * this CPU's runqueue, holding rq->lock for the whole walk. The
+	 * next pointer is read before the task is activated.
+	 */
+	raw_spin_lock(&rq->lock);
+
+	while (list) {
+		struct task_struct *p = list;
+		list = list->wake_entry;
+		ttwu_do_activate(rq, p, 0);
+	}
+
+	raw_spin_unlock(&rq->lock);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Take over this runqueue's whole wake_list in one xchg() and activate
+ * the tasks found on it, if any.
+ */
+static void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+/*
+ * scheduler_ipi - process the wakeups queued for this CPU
+ *
+ * Detaches rq->wake_list with xchg() and, when it is not empty,
+ * activates the queued tasks between irq_enter() and irq_exit().
+ */
+void scheduler_ipi(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	/*
+	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+	 * traditionally all their work was done from the interrupt return
+	 * path. Now that we actually do some work, we need to make sure
+	 * we do call them.
+	 *
+	 * Some archs already do call them, luckily irq_enter/exit nest
+	 * properly.
+	 *
+	 * Arguably we should visit all archs and update all handlers,
+	 * however a fair share of IPIs are still resched only so this would
+	 * somewhat pessimize the simple resched case.
+	 */
+	irq_enter();
+	sched_ttwu_do_pending(list);
+	irq_exit();
+}
+/*
+ * Push @p onto the wake_list of @cpu's runqueue with a cmpxchg() retry
+ * loop. The reschedule IPI is sent only when the list was empty before
+ * the push (the old head was NULL).
+ */
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *next = rq->wake_list;
+
+	for (;;) {
+		struct task_struct *old = next;
+
+		p->wake_entry = next;
+		next = cmpxchg(&rq->wake_list, old, p);
+		if (next == old)
+			break;
+	}
+
+	if (!next)
+		smp_send_reschedule(cpu);
+}
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+	/*
+	 * If @p is still ->on_cpu once its rq lock is held, enqueue and
+	 * wake it right here and return 1; otherwise do nothing and
+	 * return 0.
+	 */
+	rq = __task_rq_lock(p);
+	if (p->on_cpu) {
+		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
+/*
+ * Hand @p to @cpu: on SMP with the TTWU_QUEUE feature and a remote @cpu
+ * the task is queued on that CPU's wake_list, otherwise it is activated
+ * directly under the target rq->lock.
+ */
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+		sched_clock_cpu(cpu); /* sync clocks x-cpu */
+		ttwu_queue_remote(p, cpu);
+		return;
+	}
+#endif
+
+	raw_spin_lock(&rq->lock);
+	ttwu_do_activate(rq, p, 0);
+	raw_spin_unlock(&rq->lock);
+}
+
+/**
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Returns %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
+ */
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+{
+	unsigned long flags;
+	int cpu, success = 0;
+
+	smp_wmb();
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	if (!(p->state & state))
+		goto out;
+
+	success = 1; /* we're going to change ->state */
+	cpu = task_cpu(p);
+	/*
+	 * Still queued on a runqueue: ttwu_remote() only has to mark the
+	 * task runnable again under the rq lock.
+	 */
+	if (p->on_rq && ttwu_remote(p, wake_flags))
+		goto stat;
+
+#ifdef CONFIG_SMP
+	/*
+	 * If the owning (remote) cpu is still in the middle of schedule() with
+	 * this task as prev, wait until it's done referencing the task.
+	 */
+	while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+		/*
+		 * In case the architecture enables interrupts in
+		 * context_switch(), we cannot busy wait, since that
+		 * would lead to deadlocks when an interrupt hits and
+		 * tries to wake up @prev. So bail and do a complete
+		 * remote wakeup.
+		 */
+		if (ttwu_activate_remote(p, wake_flags))
+			goto stat;
+#else
+		cpu_relax();
+#endif
+	}
+	/*
+	 * Pairs with the smp_wmb() in finish_lock_switch().
+	 */
+	smp_rmb();
+
+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->state = TASK_WAKING;
+
+	if (p->sched_class->task_waking)
+		p->sched_class->task_waking(p);
+
+	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+	if (task_cpu(p) != cpu) {
+		wake_flags |= WF_MIGRATED;
+		set_task_cpu(p, cpu);
+	}
+#endif /* CONFIG_SMP */
+	/* Enqueue @p on the chosen CPU (directly or via its wake_list). */
+	ttwu_queue(p, cpu);
+stat:
+	ttwu_stat(p, cpu, wake_flags);
+out:
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that this_rq() is locked, @p is bound to this_rq() and not
+ * the current task.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+
+	BUG_ON(rq != this_rq());
+	BUG_ON(p == current);
+	lockdep_assert_held(&rq->lock);
+	/*
+	 * p->pi_lock is wanted as well. If it cannot be taken right away,
+	 * rq->lock is dropped and both are taken again with pi_lock first
+	 * (the order try_to_wake_up() uses), so rq->lock may have been
+	 * released for a moment when this block is left.
+	 */
+	if (!raw_spin_trylock(&p->pi_lock)) {
+		raw_spin_unlock(&rq->lock);
+		raw_spin_lock(&p->pi_lock);
+		raw_spin_lock(&rq->lock);
+	}
+
+	if (!(p->state & TASK_NORMAL))
+		goto out;
+
+	if (!p->on_rq)
+		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+	ttwu_do_wakeup(rq, p, 0);
+	ttwu_stat(p, smp_processor_id(), 0);
+out:
+	raw_spin_unlock(&p->pi_lock);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes. Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+	return try_to_wake_up(p, TASK_ALL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+/*
+ * wake_up_state - wake up @p only if its state matches the @state mask
+ *
+ * Same as wake_up_process() but with a caller-supplied state mask
+ * instead of TASK_ALL. Returns what try_to_wake_up() returns.
+ */
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+	return try_to_wake_up(p, state, 0);
+}
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ * + * __sched_fork() is basic setup used by init_idle() too: + */ +static void __sched_fork(struct task_struct *p) +{ + p->on_rq = 0; + + p->se.on_rq = 0; + p->se.exec_start = 0; + p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; + INIT_LIST_HEAD(&p->se.group_node); + +#ifdef CONFIG_SCHEDSTATS + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); +#endif + + INIT_LIST_HEAD(&p->rt.run_list); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif +} + +/* + * fork()/clone()-time setup: + */ +void sched_fork(struct task_struct *p) +{ + unsigned long flags; + int cpu = get_cpu(); + + __sched_fork(p); + /* + * We mark the process as running here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_RUNNING; + + /* + * Revert to default priority/policy on fork if requested. + */ + if (unlikely(p->sched_reset_on_fork)) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + p->policy = SCHED_NORMAL; + p->normal_prio = p->static_prio; + } + + if (PRIO_TO_NICE(p->static_prio) < 0) { + p->static_prio = NICE_TO_PRIO(0); + p->normal_prio = p->static_prio; + set_load_weight(p); + } + + /* + * We don't need the reset flag anymore after the fork. It has + * fulfilled its duty: + */ + p->sched_reset_on_fork = 0; + } + + /* + * Make sure we do not leak PI boosting priority to the child. + */ + p->prio = current->normal_prio; + + if (!rt_prio(p->prio)) + p->sched_class = &fair_sched_class; + + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); + + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. 
+ */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (likely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); +#endif +#if defined(CONFIG_SMP) + p->on_cpu = 0; +#endif +#ifdef CONFIG_PREEMPT + /* Want to start with kernel preemption disabled. */ + task_thread_info(p)->preempt_count = 1; +#endif +#ifdef CONFIG_SMP + plist_node_init(&p->pushable_tasks, MAX_PRIO); +#endif + + put_cpu(); +} + +/* + * wake_up_new_task - wake up a newly created task for the first time. + * + * This function will do some initial scheduler statistics housekeeping + * that must be done for every newly created context, then puts the task + * on the runqueue and wakes it. + */ +void wake_up_new_task(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + + raw_spin_lock_irqsave(&p->pi_lock, flags); +#ifdef CONFIG_SMP + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + */ + set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); +#endif + + rq = __task_rq_lock(p); + activate_task(rq, p, 0); + p->on_rq = 1; + trace_sched_wakeup_new(p, true); + check_preempt_curr(rq, p, WF_FORK); +#ifdef CONFIG_SMP + if (p->sched_class->task_woken) + p->sched_class->task_woken(rq, p); +#endif + task_rq_unlock(rq, p, &flags); +} + +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * 
This is safe to call from within a preemption notifier. + */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +/** + * prepare_task_switch - prepare to switch tasks + * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out + * @next: the task we are going to switch to. + * + * This is called with the rq lock held and interrupts off. It must + * be paired with a subsequent finish_task_switch after the context + * switch. + * + * prepare_task_switch sets up locking and calls architecture specific + * hooks. + */ +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) +{ + sched_info_switch(prev, next); + perf_event_task_sched_out(prev, next); + fire_sched_out_preempt_notifiers(prev, next); + prepare_lock_switch(rq, next); + prepare_arch_switch(next); + trace_sched_switch(prev, next); +} + +/** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch + * @prev: the thread we just switched away from. 
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock. (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ */
+static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+	__releases(rq->lock)
+{
+	struct mm_struct *mm = rq->prev_mm;
+	long prev_state;
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+	 * schedule one last time. The schedule call will never return, and
+	 * the scheduled task must drop that reference.
+	 * The test for TASK_DEAD must occur while the runqueue locks are
+	 * still held, otherwise prev could be scheduled on another cpu, die
+	 * there before we look at prev->state, and then the reference would
+	 * be dropped twice.
+	 * Manfred Spraul
+	 */
+	prev_state = prev->state;
+	finish_arch_switch(prev);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+	perf_event_task_sched_in(current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+	local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+	finish_lock_switch(rq, prev);
+
+	fire_sched_in_preempt_notifiers(current);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_state == TASK_DEAD)) {
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);
+		put_task_struct(prev);
+	}
+}
+
+#ifdef CONFIG_SMP
+
+/* assumes rq->lock is held */
+static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+{
+	if (prev->sched_class->pre_schedule)
+		prev->sched_class->pre_schedule(rq, prev);
+}
+
+/* rq->lock is NOT held, but preemption is disabled */
+static inline void post_schedule(struct rq *rq)
+{
+	if (rq->post_schedule) {
+		unsigned long flags;
+
+		raw_spin_lock_irqsave(&rq->lock, flags);
+		if (rq->curr->sched_class->post_schedule)
+			rq->curr->sched_class->post_schedule(rq);
+		raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+		rq->post_schedule = 0;
+	}
+}
+
+#else
+
+static inline void pre_schedule(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void post_schedule(struct rq *rq)
+{
+}
+
+#endif
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ *
+ * Finishes the task switch, runs post_schedule() and, if
+ * current->set_child_tid is set, stores the task's pid (as seen through
+ * task_pid_vnr()) at that user address. The put_user() result is not
+ * checked.
+ */
+asmlinkage void schedule_tail(struct task_struct *prev)
+	__releases(rq->lock)
+{
+	struct rq *rq = this_rq();
+
+	finish_task_switch(rq, prev);
+
+	/*
+	 * FIXME: do we need to worry about rq being invalidated by the
+	 * task_switch?
+	 */
+	post_schedule(rq);
+
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+	/* In this case, finish_task_switch does not reenable preemption */
+	preempt_enable();
+#endif
+	if (current->set_child_tid)
+		put_user(task_pid_vnr(current), current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new
+ * thread's register state.
+ */
+static inline void
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	struct mm_struct *mm, *oldmm;
+
+	prepare_task_switch(rq, prev, next);
+
+	mm = next->mm;
+	oldmm = prev->active_mm;
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+	/*
+	 * A next task without an mm of its own keeps running on prev's
+	 * active_mm and takes a reference on it (mm_count).
+	 */
+	if (!mm) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next);
+	} else
+		switch_mm(oldmm, mm, next);
+	/*
+	 * If prev had no mm of its own, the mm it was using is handed to
+	 * finish_task_switch() through rq->prev_mm, which mmdrop()s it.
+	 */
+	if (!prev->mm) {
+		prev->active_mm = NULL;
+		rq->prev_mm = oldmm;
+	}
+	/*
+	 * Since the runqueue lock will be released by the next
+	 * task (which is an invalid locking op but in the case
+	 * of the scheduler it's an obvious special-case), so we
+	 * do an early lockdep release here:
+	 */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#endif
+
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
+
+	barrier();
+	/*
+	 * this_rq must be evaluated again because prev may have moved
+	 * CPUs since it called schedule(), thus the 'rq' on its stack
+	 * frame will be invalid.
+	 */
+	finish_task_switch(this_rq(), prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, current number of uninterruptible-sleeping threads, total
+ * number of context switches performed since bootup.
+ *
+ * nr_running() sums over the online CPUs only; the other counters in
+ * this group sum over all possible CPUs.
+ */
+unsigned long nr_running(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_online_cpu(i)
+		sum += cpu_rq(i)->nr_running;
+
+	return sum;
+}
+
+unsigned long nr_uninterruptible(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += cpu_rq(i)->nr_uninterruptible;
+
+	/*
+	 * Since we read the counters lockless, it might be slightly
+	 * inaccurate. Do not allow it to go below zero though:
+	 */
+	if (unlikely((long)sum < 0))
+		sum = 0;
+
+	return sum;
+}
+
+unsigned long long nr_context_switches(void)
+{
+	int i;
+	unsigned long long sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += cpu_rq(i)->nr_switches;
+
+	return sum;
+}
+/* Sum of the per-runqueue nr_iowait counters over all possible CPUs. */
+unsigned long nr_iowait(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+	return sum;
+}
+/* The nr_iowait counter of @cpu's runqueue. */
+unsigned long nr_iowait_cpu(int cpu)
+{
+	struct rq *this = cpu_rq(cpu);
+	return atomic_read(&this->nr_iowait);
+}
+/* cpu_load[0] of the runqueue of the CPU this is called on. */
+unsigned long this_cpu_load(void)
+{
+	struct rq *this = this_rq();
+	return this->cpu_load[0];
+}
+
+
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+/*
+ * Return by how much this runqueue's active count (nr_running plus
+ * nr_uninterruptible) changed since it was last recorded, and record the
+ * new count in ->calc_load_active. Returns 0 when nothing changed.
+ */
+static long calc_load_fold_active(struct rq *this_rq)
+{
+	long nr_active, delta = 0;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+	}
+
+	return delta;
+}
+/*
+ * One step of the decaying average in fixed point:
+ * load * exp + active * (FIXED_1 - exp), shifted right by FSHIFT. Half
+ * an LSB (1UL << (FSHIFT - 1)) is added before the shift, so the result
+ * is rounded to nearest rather than truncated.
+ */
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	load += 1UL << (FSHIFT - 1);
+	return load >> FSHIFT;
+}
+
+#ifdef CONFIG_NO_HZ
+/*
+ * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ *
+ * When making the ILB scale, we should try to pull this in as well.
+ */
+static atomic_long_t calc_load_tasks_idle;
+/*
+ * Add this runqueue's pending active-count delta, if there is one, to
+ * calc_load_tasks_idle.
+ */
+static void calc_load_account_idle(struct rq *this_rq)
+{
+	long delta;
+
+	delta = calc_load_fold_active(this_rq);
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks_idle);
+}
+/*
+ * Fetch and reset calc_load_tasks_idle. Returns the value that was
+ * pending, or 0 when the counter read as zero.
+ */
+static long calc_load_fold_idle(void)
+{
+	long delta = 0;
+
+	/*
+	 * It's got a race, we don't care...
+	 */
+	if (atomic_long_read(&calc_load_tasks_idle))
+		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+
+	return delta;
+}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ *
+ * Every multiplication is rounded to nearest by adding
+ * 1UL << (@frac_bits - 1) before shifting back, so @frac_bits has to be
+ * at least 1 whenever @n is non-zero. For @n == 0 the result is
+ * 1UL << @frac_bits, i.e. 1.0 in the fixed-point format.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+	unsigned long result = 1UL << frac_bits;
+
+	if (n) for (;;) {
+		if (n & 1) {
+			result *= x;
+			result += 1UL << (frac_bits - 1);
+			result >>= frac_bits;
+		}
+		n >>= 1;
+		if (!n)
+			break;
+		x *= x;
+		x += 1UL << (frac_bits - 1);
+		x >>= frac_bits;
+	}
+
+	return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... 
+ e^n-1) [1] + * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) + * = a0 * e^n + a * (1 - e^n) + * + * [1] application of the geometric series: + * + * n 1 - x^(n+1) + * S_n := \Sum x^i = ------------- + * i=0 1 - x + */ +static unsigned long +calc_load_n(unsigned long load, unsigned long exp, + unsigned long active, unsigned int n) +{ + + return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); +} + +/* + * NO_HZ can leave us missing all per-cpu ticks calling + * calc_load_account_active(), but since an idle CPU folds its delta into + * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold + * in the pending idle delta if our idle period crossed a load cycle boundary. + * + * Once we've updated the global active value, we need to apply the exponential + * weights adjusted to the number of cycles missed. + */ +static void calc_global_nohz(void) +{ + long delta, active, n; + + /* + * If we crossed a calc_load_update boundary, make sure to fold + * any pending idle changes, the respective CPUs might have + * missed the tick driven calc_load_account_active() update + * due to NO_HZ. + */ + delta = calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + /* + * It could be the one fold was all it took, we're done! + */ + if (time_before(jiffies, calc_load_update + 10)) + return; + + /* + * Catch-up, fold however many we are behind still + */ + delta = jiffies - calc_load_update - 10; + n = 1 + (delta / LOAD_FREQ); + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ?
active * FIXED_1 : 0; + + avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); + avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); + avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); + + calc_load_update += n * LOAD_FREQ; +} +#else +static void calc_load_account_idle(struct rq *this_rq) +{ +} + +static inline long calc_load_fold_idle(void) +{ + return 0; +} + +static void calc_global_nohz(void) +{ +} +#endif + +/** + * get_avenrun - get the load average array + * @loads: pointer to dest load array + * @offset: offset to add + * @shift: shift count to shift the result left + * + * These values are estimates at best, so no need for locking. + */ +void get_avenrun(unsigned long *loads, unsigned long offset, int shift) +{ + loads[0] = (avenrun[0] + offset) << shift; + loads[1] = (avenrun[1] + offset) << shift; + loads[2] = (avenrun[2] + offset) << shift; +} + +/* + * calc_global_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(unsigned long ticks) +{ + long active; + + if (time_before(jiffies, calc_load_update + 10)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; + + /* + * Account one period with whatever state we found before + * folding in the nohz state and ageing the entire idle period. + * + * This avoids losing a sample when we go idle between + * calc_load_account_active() (10 ticks ago) and now and thus + * under-accounting. + */ + calc_global_nohz(); +} + +/* + * Called from update_cpu_load_active() to periodically update this CPU's + * active count.
+ */ +static void calc_load_account_active(struct rq *this_rq) +{ + long delta; + + if (time_before(jiffies, this_rq->calc_load_update)) + return; + + delta = calc_load_fold_active(this_rq); + delta += calc_load_fold_idle(); + if (delta) + atomic_long_add(delta, &calc_load_tasks); + + this_rq->calc_load_update += LOAD_FREQ; +} + +/* + * The exact cpuload at various idx values, calculated at every tick would be + * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * + * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called + * on nth tick when cpu may be busy, then we have: + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * + * decay_load_missed() below does efficient calculation of + * load = ((2^idx - 1) / 2^idx)^(n-1) * load + * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * The calculation is approximated on a 128 point scale. + * degrade_zero_ticks is the number of ticks after which load at any + * particular idx is approximated to be zero. + * degrade_factor is a precomputed table, a row for each load idx. + * Each column corresponds to degradation factor for a power of two ticks, + * based on 128 point scale. + * Example: + * row 2, col 3 (=12) says that the degradation at load idx 2 after + * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). + * + * With this power of 2 load factors, we can degrade the load n times + * by looking at 1 bits in n and doing as many mult/shift instead of + * n mult/shifts needed by the exact degradation.
+ */ +#define DEGRADE_SHIFT 7 +static const unsigned char + degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const unsigned char + degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + {0, 0, 0, 0, 0, 0, 0, 0}, + {64, 32, 8, 0, 0, 0, 0, 0}, + {96, 72, 40, 12, 1, 0, 0}, + {112, 98, 75, 43, 15, 1, 0}, + {120, 112, 98, 76, 45, 16, 2} }; + +/* + * Update cpu_load for any missed ticks, due to tickless idle. The backlog + * would be when CPU is idle and so we just decay the old load without + * adding any new load. + */ +static unsigned long +decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) +{ + int j = 0; + + if (!missed_updates) + return load; + + if (missed_updates >= degrade_zero_ticks[idx]) + return 0; + + if (idx == 1) + return load >> missed_updates; + + while (missed_updates) { + if (missed_updates % 2) + load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; + + missed_updates >>= 1; + j++; + } + return load; +} + +/* + * Update rq->cpu_load[] statistics. This function is usually called every + * scheduler tick (TICK_NSEC). With tickless idle this will not be called + * every tick. We fix it up based on jiffies. 
+ */ +static void update_cpu_load(struct rq *this_rq) +{ + unsigned long this_load = this_rq->load.weight; + unsigned long curr_jiffies = jiffies; + unsigned long pending_updates; + int i, scale; + + this_rq->nr_load_updates++; + + /* Avoid repeated calls on same jiffy, when moving in and out of idle */ + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + this_rq->last_load_update_tick = curr_jiffies; + + /* Update our load: */ + this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ + for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { + unsigned long old_load, new_load; + + /* scale is effectively 1 << i now, and >> i divides by scale */ + + old_load = this_rq->cpu_load[i]; + old_load = decay_load_missed(old_load, pending_updates - 1, i); + new_load = this_load; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale - 1; + + this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; + } + + sched_avg_update(this_rq); +} + +static void update_cpu_load_active(struct rq *this_rq) +{ + update_cpu_load(this_rq); + + calc_load_account_active(this_rq); +} + +#ifdef CONFIG_SMP + +/* + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. 
+ */ +void sched_exec(void) +{ + struct task_struct *p = current; + unsigned long flags; + int dest_cpu; + + raw_spin_lock_irqsave(&p->pi_lock, flags); + dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); + if (dest_cpu == smp_processor_id()) + goto unlock; + + if (likely(cpu_active(dest_cpu))) { + struct migration_arg arg = { p, dest_cpu }; + + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + return; + } +unlock: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} + +#endif + +DEFINE_PER_CPU(struct kernel_stat, kstat); + +EXPORT_PER_CPU_SYMBOL(kstat); + +/* + * Return any ns on the sched_clock that have not yet been accounted in + * @p in case that task is currently running. + * + * Called with task_rq_lock() held on @rq. + */ +static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) +{ + u64 ns = 0; + + if (task_current(rq, p)) { + update_rq_clock(rq); + ns = rq->clock_task - p->se.exec_start; + if ((s64)ns < 0) + ns = 0; + } + + return ns; +} + +unsigned long long task_delta_exec(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + ns = do_task_delta_exec(p, rq); + task_rq_unlock(rq, p, &flags); + + return ns; +} + +/* + * Return accounted runtime for the task. + * In case the task is currently running, return the runtime plus its + * pending runtime that has not been accounted yet. + */ +unsigned long long task_sched_runtime(struct task_struct *p) +{ + unsigned long flags; + struct rq *rq; + u64 ns = 0; + + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq); + task_rq_unlock(rq, p, &flags); + + return ns; +} + +/* + * Account user cpu time to a process.
+ * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in user space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_user_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t tmp; + + /* Add user time to process. */ + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + account_group_user_time(p, cputime); + + /* Add user time to cpustat. */ + tmp = cputime_to_cputime64(cputime); + if (TASK_NICE(p) > 0) + cpustat->nice = cputime64_add(cpustat->nice, tmp); + else + cpustat->user = cputime64_add(cpustat->user, tmp); + + cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime); + /* Account for user time used */ + acct_update_integrals(p); +} + +/* + * Account guest cpu time to a process. + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in virtual machine since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +static void account_guest_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled) +{ + cputime64_t tmp; + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + tmp = cputime_to_cputime64(cputime); + + /* Add guest time to process. */ + p->utime = cputime_add(p->utime, cputime); + p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); + account_group_user_time(p, cputime); + p->gtime = cputime_add(p->gtime, cputime); + + /* Add guest time to cpustat. 
*/ + if (TASK_NICE(p) > 0) { + cpustat->nice = cputime64_add(cpustat->nice, tmp); + cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); + } else { + cpustat->user = cputime64_add(cpustat->user, tmp); + cpustat->guest = cputime64_add(cpustat->guest, tmp); + } +} + +/* + * Account system cpu time to a process and desired cpustat field + * @p: the process that the cpu time gets accounted to + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + * @target_cputime64: pointer to cpustat field that has to be updated + */ +static inline +void __account_system_time(struct task_struct *p, cputime_t cputime, + cputime_t cputime_scaled, cputime64_t *target_cputime64) +{ + cputime64_t tmp = cputime_to_cputime64(cputime); + + /* Add system time to process. */ + p->stime = cputime_add(p->stime, cputime); + p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); + account_group_system_time(p, cputime); + + /* Add system time to cpustat. */ + *target_cputime64 = cputime64_add(*target_cputime64, tmp); + cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime); + + /* Account for system time used */ + acct_update_integrals(p); +} + +/* + * Account system cpu time to a process. 
+ * @p: the process that the cpu time gets accounted to + * @hardirq_offset: the offset to subtract from hardirq_count() + * @cputime: the cpu time spent in kernel space since the last update + * @cputime_scaled: cputime scaled by cpu frequency + */ +void account_system_time(struct task_struct *p, int hardirq_offset, + cputime_t cputime, cputime_t cputime_scaled) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t *target_cputime64; + + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime, cputime_scaled); + return; + } + + if (hardirq_count() - hardirq_offset) + target_cputime64 = &cpustat->irq; + else if (in_serving_softirq()) + target_cputime64 = &cpustat->softirq; + else + target_cputime64 = &cpustat->system; + + __account_system_time(p, cputime, cputime_scaled, target_cputime64); +} + +/* + * Account for involuntary wait time. + * @cputime: the cpu time spent in involuntary wait + */ +void account_steal_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + + cpustat->steal = cputime64_add(cpustat->steal, cputime64); +} + +/* + * Account for idle time. 
+ * @cputime: the cpu time spent in idle wait + */ +void account_idle_time(cputime_t cputime) +{ + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + cputime64_t cputime64 = cputime_to_cputime64(cputime); + struct rq *rq = this_rq(); + + if (atomic_read(&rq->nr_iowait) > 0) + cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); + else + cpustat->idle = cputime64_add(cpustat->idle, cputime64); +} + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +/* + * Account a tick to a process and cpustat + * @p: the process that the cpu time gets accounted to + * @user_tick: is the tick from userspace + * @rq: the pointer to rq + * + * Tick demultiplexing follows the order + * - pending hardirq update + * - pending softirq update (then ksoftirqd's own ticks) + * - user_time + * - idle_time + * - system time + * - check for guest_time + * - else account as system_time + * + * Check for hardirq is done both for system and user time as there is + * no timer going off while we are on hardirq and hence we may never get an + * opportunity to update it solely in system time. + * p->stime and friends are only updated on system time and not on irq or + * softirq as those do not count in task exec_runtime any more. + */ +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); + struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; + + if (irqtime_account_hi_update()) { + cpustat->irq = cputime64_add(cpustat->irq, tmp); + } else if (irqtime_account_si_update()) { + cpustat->softirq = cputime64_add(cpustat->softirq, tmp); + } else if (this_cpu_ksoftirqd() == p) { + /* + * ksoftirqd time does not get accounted in cpu_softirq_time. + * So, we have to handle it separately here. + * Also, p->stime needs to be updated for ksoftirqd.
+ */ + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->softirq); + } else if (user_tick) { + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else if (p == rq->idle) { + account_idle_time(cputime_one_jiffy); + } else if (p->flags & PF_VCPU) { /* System time or guest time */ + account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); + } else { + __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, + &cpustat->system); + } +} + +static void irqtime_account_idle_ticks(int ticks) +{ + int i; + struct rq *rq = this_rq(); + + for (i = 0; i < ticks; i++) + irqtime_account_process_tick(current, 0, rq); +} +#else /* CONFIG_IRQ_TIME_ACCOUNTING */ +static void irqtime_account_idle_ticks(int ticks) {} +static void irqtime_account_process_tick(struct task_struct *p, int user_tick, + struct rq *rq) {} +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +/* + * Account a single tick of cpu time. + * @p: the process that the cpu time gets accounted to + * @user_tick: indicates if the tick is a user or a system tick + */ +void account_process_tick(struct task_struct *p, int user_tick) +{ + cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + struct rq *rq = this_rq(); + + if (sched_clock_irqtime) { + irqtime_account_process_tick(p, user_tick, rq); + return; + } + + if (user_tick) + account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) + account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, + one_jiffy_scaled); + else + account_idle_time(cputime_one_jiffy); +} + +/* + * Account multiple ticks of steal time. + * No task is charged: the time is added to this CPU's steal counter. + * @ticks: number of stolen ticks + */ +void account_steal_ticks(unsigned long ticks) +{ + account_steal_time(jiffies_to_cputime(ticks)); +} + +/* + * Account multiple ticks of idle time.
+ * @ticks: number of idle ticks + */ +void account_idle_ticks(unsigned long ticks) +{ + + if (sched_clock_irqtime) { + irqtime_account_idle_ticks(ticks); + return; + } + + account_idle_time(jiffies_to_cputime(ticks)); +} + +#endif + +/* + * Use precise platform statistics if available: + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + *ut = p->utime; + *st = p->stime; +} + +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct task_cputime cputime; + + thread_group_cputime(p, &cputime); + + *ut = cputime.utime; + *st = cputime.stime; +} +#else + +#ifndef nsecs_to_cputime +# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) +#endif + +void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); + + /* + * Use CFS's precise accounting: + */ + rtime = nsecs_to_cputime(p->se.sum_exec_runtime); + + if (total) { + u64 temp = rtime; + + temp *= utime; + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + /* + * Compare with previous values, to keep monotonicity: + */ + p->prev_utime = max(p->prev_utime, utime); + p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); + + *ut = p->prev_utime; + *st = p->prev_stime; +} + +/* + * Must be called with siglock held.
+ */ +void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st) +{ + struct signal_struct *sig = p->signal; + struct task_cputime cputime; + cputime_t rtime, utime, total; + + thread_group_cputime(p, &cputime); + + total = cputime_add(cputime.utime, cputime.stime); + rtime = nsecs_to_cputime(cputime.sum_exec_runtime); + + if (total) { + u64 temp = rtime; + + temp *= cputime.utime; + do_div(temp, total); + utime = (cputime_t)temp; + } else + utime = rtime; + + sig->prev_utime = max(sig->prev_utime, utime); + sig->prev_stime = max(sig->prev_stime, + cputime_sub(rtime, sig->prev_utime)); + + *ut = sig->prev_utime; + *st = sig->prev_stime; +} +#endif + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + */ +void scheduler_tick(void) +{ + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + struct task_struct *curr = rq->curr; + + sched_clock_tick(); + + raw_spin_lock(&rq->lock); + update_rq_clock(rq); + update_cpu_load_active(rq); + curr->sched_class->task_tick(rq, curr, 0); + raw_spin_unlock(&rq->lock); + + perf_event_task_tick(); + +#ifdef CONFIG_SMP + rq->idle_at_tick = idle_cpu(cpu); + trigger_load_balance(rq, cpu); +#endif +} + +notrace unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} + +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +void __kprobes add_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) + return; +#endif + preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Spinlock count overflowing soon? 
+ */ + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +} +EXPORT_SYMBOL(add_preempt_count); + +void __kprobes sub_preempt_count(int val) +{ +#ifdef CONFIG_DEBUG_PREEMPT + /* + * Underflow? + */ + if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) + return; + /* + * Is the spinlock portion underflowing? + */ + if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && + !(preempt_count() & PREEMPT_MASK))) + return; +#endif + + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + preempt_count() -= val; +} +EXPORT_SYMBOL(sub_preempt_count); + +#endif + +/* + * Print scheduling while atomic bug: + */ +static noinline void __schedule_bug(struct task_struct *prev) +{ + struct pt_regs *regs = get_irq_regs(); + + printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", + prev->comm, prev->pid, preempt_count()); + + debug_show_held_locks(prev); + print_modules(); + if (irqs_disabled()) + print_irqtrace_events(prev); + + if (regs) + show_regs(regs); + else + dump_stack(); +} + +/* + * Various schedule()-time debugging checks and statistics: + */ +static inline void schedule_debug(struct task_struct *prev) +{ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. + * Otherwise, whine if we are scheduling when we should not be. 
+ */ + if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) + __schedule_bug(prev); + + profile_hit(SCHED_PROFILING, __builtin_return_address(0)); + + schedstat_inc(this_rq(), sched_count); +} + +static void put_prev_task(struct rq *rq, struct task_struct *prev) +{ + if (prev->on_rq || rq->skip_clock_update < 0) + update_rq_clock(rq); + prev->sched_class->put_prev_task(rq, prev); +} + +/* + * Pick up the highest-prio task: + */ +static inline struct task_struct * +pick_next_task(struct rq *rq) +{ + const struct sched_class *class; + struct task_struct *p; + + /* + * Optimization: we know that if all tasks are in + * the fair class we can call that function directly: + */ + if (likely(rq->nr_running == rq->cfs.nr_running)) { + p = fair_sched_class.pick_next_task(rq); + if (likely(p)) + return p; + } + + for_each_class(class) { + p = class->pick_next_task(rq); + if (p) + return p; + } + + BUG(); /* the idle class will always have a runnable task */ +} + +/* + * __schedule() is the main scheduler function. + */ +static void __sched __schedule(void) +{ + struct task_struct *prev, *next; + unsigned long *switch_count; + struct rq *rq; + int cpu; + +need_resched: + preempt_disable(); + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + rcu_note_context_switch(cpu); + prev = rq->curr; + + schedule_debug(prev); + + if (sched_feat(HRTICK)) + hrtick_clear(rq); + + raw_spin_lock_irq(&rq->lock); + + switch_count = &prev->nivcsw; + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + if (unlikely(signal_pending_state(prev->state, prev))) { + prev->state = TASK_RUNNING; + } else { + deactivate_task(rq, prev, DEQUEUE_SLEEP); + prev->on_rq = 0; + + /* + * If a worker went to sleep, notify and ask workqueue + * whether it wants to wake up a task to maintain + * concurrency. 
+ */ + if (prev->flags & PF_WQ_WORKER) { + struct task_struct *to_wakeup; + + to_wakeup = wq_worker_sleeping(prev, cpu); + if (to_wakeup) + try_to_wake_up_local(to_wakeup); + } + } + switch_count = &prev->nvcsw; + } + + pre_schedule(rq, prev); + + if (unlikely(!rq->nr_running)) + idle_balance(cpu, rq); + + put_prev_task(rq, prev); + next = pick_next_task(rq); + clear_tsk_need_resched(prev); + rq->skip_clock_update = 0; + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + + context_switch(rq, prev, next); /* unlocks the rq */ + /* + * The context switch have flipped the stack from under us + * and restored the local variables which were saved when + * this task called schedule() in the past. prev == current + * is still correct, but it can be moved to another cpu/rq. + */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); + } else + raw_spin_unlock_irq(&rq->lock); + + post_schedule(rq); + + preempt_enable_no_resched(); + if (need_resched()) + goto need_resched; +} + +static inline void sched_submit_work(struct task_struct *tsk) +{ + if (!tsk->state) + return; + /* + * If we are going to sleep and we have plugged IO queued, + * make sure to submit it to avoid deadlocks. + */ + if (blk_needs_flush_plug(tsk)) + blk_schedule_flush_plug(tsk); +} + +asmlinkage void __sched schedule(void) +{ + struct task_struct *tsk = current; + + sched_submit_work(tsk); + __schedule(); +} +EXPORT_SYMBOL(schedule); + +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + bool ret = false; + + rcu_read_lock(); + if (lock->owner != owner) + goto fail; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. 
+ */ + barrier(); + + ret = owner->on_cpu; +fail: + rcu_read_unlock(); + + return ret; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + if (!sched_feat(OWNER_SPIN)) + return 0; + + while (owner_running(lock, owner)) { + if (need_resched()) + return 0; + + arch_mutex_cpu_relax(); + } + + /* + * If the owner changed to another task there is likely + * heavy contention, stop spinning. + */ + if (lock->owner) + return 0; + + return 1; +} +#endif + +#ifdef CONFIG_PREEMPT +/* + * this is the entry point to schedule() from in-kernel preemption + * off of preempt_enable. Kernel preemptions off return from interrupt + * occur there and call schedule directly. + */ +asmlinkage void __sched notrace preempt_schedule(void) +{ + struct thread_info *ti = current_thread_info(); + + /* + * If there is a non-zero preempt_count or interrupts are disabled, + * we do not want to preempt the current task. Just return.. + */ + if (likely(ti->preempt_count || irqs_disabled())) + return; + + do { + add_preempt_count_notrace(PREEMPT_ACTIVE); + __schedule(); + sub_preempt_count_notrace(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (need_resched()); +} +EXPORT_SYMBOL(preempt_schedule); + +/* + * this is the entry point to schedule() from kernel preemption + * off of irq context. + * Note, that this is called and return with irqs disabled. This will + * protect us against recursive calling from irq. 
+ */ +asmlinkage void __sched preempt_schedule_irq(void) +{ + struct thread_info *ti = current_thread_info(); + + /* Catch callers which need to be fixed */ + BUG_ON(ti->preempt_count || !irqs_disabled()); + + do { + add_preempt_count(PREEMPT_ACTIVE); + local_irq_enable(); + __schedule(); + local_irq_disable(); + sub_preempt_count(PREEMPT_ACTIVE); + + /* + * Check again in case we missed a preemption opportunity + * between schedule and now. + */ + barrier(); + } while (need_resched()); +} + +#endif /* CONFIG_PREEMPT */ + +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags, + void *key) +{ + return try_to_wake_up(curr->private, mode, wake_flags); +} +EXPORT_SYMBOL(default_wake_function); + +/* + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. + * + * There are circumstances in which we can try to wake a task which has already + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. + */ +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, int wake_flags, void *key) +{ + wait_queue_t *curr, *next; + + list_for_each_entry_safe(curr, next, &q->task_list, task_list) { + unsigned flags = curr->flags; + + if (curr->func(curr, mode, wake_flags, key) && + (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + break; + } +} + +/** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */ +void __wake_up(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(__wake_up); + +/* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +void __wake_up_locked(wait_queue_head_t *q, unsigned int mode) +{ + __wake_up_common(q, mode, 1, 0, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_locked); + +void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) +{ + __wake_up_common(q, mode, 1, 0, key); +} +EXPORT_SYMBOL_GPL(__wake_up_locked_key); + +/** + * __wake_up_sync_key - wake up threads blocked on a waitqueue. + * @q: the waitqueue + * @mode: which threads + * @nr_exclusive: how many wake-one or wake-many threads to wake up + * @key: opaque value to be passed to wakeup targets + * + * The sync wakeup differs in that the waker knows that it will schedule + * away soon, so while the target thread will be woken up, it will not + * be migrated to another CPU - ie. the two threads are 'synchronized' + * with each other. This can prevent needless bouncing between CPUs. + * + * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up.
+ */ +void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, + int nr_exclusive, void *key) +{ + unsigned long flags; + int wake_flags = WF_SYNC; + + if (unlikely(!q)) + return; + + if (unlikely(!nr_exclusive)) + wake_flags = 0; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, wake_flags, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL_GPL(__wake_up_sync_key); + +/* + * __wake_up_sync - see __wake_up_sync_key() + */ +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) +{ + __wake_up_sync_key(q, mode, nr_exclusive, NULL); +} +EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ + +/** + * complete: - signals a single thread waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up a single thread waiting on this completion. Threads will be + * awakened in the same order in which they were queued. + * + * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ +void complete(struct completion *x) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done++; + __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete); + +/** + * complete_all: - signals all threads waiting on this completion + * @x: holds the state of this particular completion + * + * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. 
+ */
+void complete_all(struct completion *x)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	x->done += UINT_MAX/2;
+	__wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);	/* nr_exclusive == 0 */
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+}
+EXPORT_SYMBOL(complete_all);
+
+static inline long __sched
+do_wait_for_common(struct completion *x, long timeout, int state)
+{
+	if (!x->done) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		__add_wait_queue_tail_exclusive(&x->wait, &wait);
+		do {
+			if (signal_pending_state(state, current)) {
+				timeout = -ERESTARTSYS;
+				break;
+			}
+			__set_current_state(state);
+			spin_unlock_irq(&x->wait.lock);	/* sleep without x->wait.lock */
+			timeout = schedule_timeout(timeout);
+			spin_lock_irq(&x->wait.lock);
+		} while (!x->done && timeout);
+		__remove_wait_queue(&x->wait, &wait);
+		if (!x->done)
+			return timeout;	/* 0 (timed out) or -ERESTARTSYS */
+	}
+	x->done--;	/* consume one completion */
+	return timeout ?: 1;	/* completed: never report 0 */
+}
+
+static long __sched
+wait_for_common(struct completion *x, long timeout, int state)
+{
+	might_sleep();
+
+	spin_lock_irq(&x->wait.lock);	/* do_wait_for_common() runs with this lock held */
+	timeout = do_wait_for_common(x, timeout, state);
+	spin_unlock_irq(&x->wait.lock);
+	return timeout;
+}
+
+/**
+ * wait_for_completion: - waits for completion of a task
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It is NOT
+ * interruptible and there is no timeout: the wait is done with
+ * MAX_SCHEDULE_TIMEOUT in TASK_UNINTERRUPTIBLE state.
+ *
+ * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
+ * and interrupt capability. Also see complete().
+ */
+void __sched wait_for_completion(struct completion *x)
+{
+	wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion);
+
+/**
+ * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. The timeout is in jiffies.
It is not
+ * interruptible.
+ */
+unsigned long __sched
+wait_for_completion_timeout(struct completion *x, unsigned long timeout)
+{
+	return wait_for_common(x, timeout, TASK_UNINTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_timeout);
+
+/**
+ * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
+ * @x: holds the state of this particular completion
+ *
+ * This waits for completion of a specific task to be signaled. It is
+ * interruptible.
+ *
+ * Returns -ERESTARTSYS if the underlying wait returned -ERESTARTSYS,
+ * 0 otherwise.
+ */
+int __sched wait_for_completion_interruptible(struct completion *x)
+{
+	long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
+	if (t == -ERESTARTSYS)
+		return t;
+	return 0;
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible);
+
+/**
+ * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
+ * @x: holds the state of this particular completion
+ * @timeout: timeout value in jiffies
+ *
+ * This waits for either a completion of a specific task to be signaled or for a
+ * specified timeout to expire. It is interruptible. The timeout is in jiffies.
+ *
+ * The value of the underlying wait_for_common() call is returned unchanged.
+ */
+long __sched
+wait_for_completion_interruptible_timeout(struct completion *x,
+					  unsigned long timeout)
+{
+	return wait_for_common(x, timeout, TASK_INTERRUPTIBLE);
+}
+EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
+
+/**
+ * wait_for_completion_killable: - waits for completion of a task (killable)
+ * @x: holds the state of this particular completion
+ *
+ * This waits to be signaled for completion of a specific task. It can be
+ * interrupted by a kill signal.
+ */ +int __sched wait_for_completion_killable(struct completion *x) +{ + long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); + if (t == -ERESTARTSYS) + return t; + return 0; +} +EXPORT_SYMBOL(wait_for_completion_killable); + +/** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + */ +long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** + * try_wait_for_completion - try to decrement a completion without blocking + * @x: completion structure + * + * Returns: 0 if a decrement cannot be done without blocking + * 1 if a decrement succeeded. + * + * If a completion is being used as a counting completion, + * attempt to decrement the counter without blocking. This + * enables us to avoid waiting if the resource the completion + * is protecting is not available. + */ +bool try_wait_for_completion(struct completion *x) +{ + unsigned long flags; + int ret = 1; + + spin_lock_irqsave(&x->wait.lock, flags); + if (!x->done) + ret = 0; + else + x->done--; + spin_unlock_irqrestore(&x->wait.lock, flags); + return ret; +} +EXPORT_SYMBOL(try_wait_for_completion); + +/** + * completion_done - Test to see if a completion has any waiters + * @x: completion structure + * + * Returns: 0 if there are waiters (wait_for_completion() in progress) + * 1 if there are no waiters. 
+ *
+ */
+bool completion_done(struct completion *x)
+{
+	unsigned long flags;
+	int ret = 1;
+
+	spin_lock_irqsave(&x->wait.lock, flags);
+	if (!x->done)
+		ret = 0;	/* x->done is zero */
+	spin_unlock_irqrestore(&x->wait.lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(completion_done);
+
+static long __sched
+sleep_on_common(wait_queue_head_t *q, int state, long timeout)
+{
+	unsigned long flags;
+	wait_queue_t wait;
+
+	init_waitqueue_entry(&wait, current);
+
+	__set_current_state(state);
+
+	spin_lock_irqsave(&q->lock, flags);
+	__add_wait_queue(q, &wait);
+	spin_unlock(&q->lock);	/* only the lock is dropped; flags are restored by the final unlock */
+	timeout = schedule_timeout(timeout);
+	spin_lock_irq(&q->lock);
+	__remove_wait_queue(q, &wait);
+	spin_unlock_irqrestore(&q->lock, flags);
+
+	return timeout;	/* value returned by schedule_timeout() */
+}
+
+void __sched interruptible_sleep_on(wait_queue_head_t *q)
+{
+	sleep_on_common(q, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(interruptible_sleep_on);
+
+long __sched
+interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	return sleep_on_common(q, TASK_INTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(interruptible_sleep_on_timeout);
+
+void __sched sleep_on(wait_queue_head_t *q)
+{
+	sleep_on_common(q, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
+}
+EXPORT_SYMBOL(sleep_on);
+
+long __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
+{
+	return sleep_on_common(q, TASK_UNINTERRUPTIBLE, timeout);
+}
+EXPORT_SYMBOL(sleep_on_timeout);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance logic.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	int oldprio, on_rq, running;
+	struct rq *rq;
+	const struct sched_class *prev_class;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = __task_rq_lock(p);
+
+	trace_sched_pi_setprio(p, prio);
+	oldprio = p->prio;
+	prev_class = p->sched_class;
+	on_rq = p->on_rq;
+	running = task_current(rq, p);
+	if (on_rq)
+		dequeue_task(rq, p, 0);
+	if (running)
+		p->sched_class->put_prev_task(rq, p);
+
+	if (rt_prio(prio))
+		p->sched_class = &rt_sched_class;
+	else
+		p->sched_class = &fair_sched_class;
+
+	p->prio = prio;
+
+	if (running)
+		p->sched_class->set_curr_task(rq);
+	if (on_rq)
+		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);	/* at the head when prio value grew */
+
+	check_class_changed(rq, p, prev_class, oldprio);
+	__task_rq_unlock(rq);
+}
+
+#endif
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+	int old_prio, delta, on_rq;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+		return;	/* unchanged or out of the [-20, 19] range: nothing to do */
+	/*
+	 * We have to be careful, if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = task_rq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via sched_setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it won't have any effect on scheduling while the task is
+	 * SCHED_FIFO/SCHED_RR; only ->static_prio is updated here:
+	 */
+	if (task_has_rt_policy(p)) {
+		p->static_prio = NICE_TO_PRIO(nice);
+		goto out_unlock;
+	}
+	on_rq = p->on_rq;
+	if (on_rq)
+		dequeue_task(rq, p, 0);
+
+	p->static_prio = NICE_TO_PRIO(nice);
+	set_load_weight(p);
+	old_prio = p->prio;
+	p->prio = effective_prio(p);
+	delta = p->prio - old_prio;
+
+	if (on_rq) {
+		enqueue_task(rq, p, 0);
+		/*
+		 * If the task increased its priority (delta < 0) or is
+		 * running and lowered its priority (delta > 0), then
+		 * reschedule its CPU:
+		 */
+		if (delta < 0 || (delta > 0 && task_running(rq, p)))
+			resched_task(rq->curr);
+	}
+out_unlock:
+	task_rq_unlock(rq, p, &flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ *
+ * Returns non-zero when (20 - @nice) is within the task's RLIMIT_NICE
+ * or the caller has CAP_SYS_NICE.
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+	/* convert nice value [19,-20] to rlimit style value [1,40] */
+	int nice_rlim = 20 - nice;
+
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+		capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ *
+ * @increment is clamped to [-40, 40] and the resulting nice value to
+ * [-20, 19]. A negative increment additionally requires can_nice().
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+	long nice, retval;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+	if (increment < -40)
+		increment = -40;
+	if (increment > 40)
+		increment = 40;
+
+	nice = TASK_NICE(current) + increment;
+	if (nice < -20)
+		nice = -20;
+	if (nice > 19)
+		nice = 19;
+
+	if (increment < 0 && !can_nice(current, nice))
+		return -EPERM;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * This is the priority value as seen by users in /proc.
+ * RT tasks are offset by -200. Normal tasks are centered
+ * around 0, value goes from -16 to +15.
+ *
+ * The value is computed as p->prio - MAX_RT_PRIO.
+ * NOTE(review): the offsets and range quoted above could not be checked
+ * against that formula from this file alone - confirm before relying on them.
+ */
+int task_prio(const struct task_struct *p)
+{
+	return p->prio - MAX_RT_PRIO;
+}
+
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ */
+int task_nice(const struct task_struct *p)
+{
+	return TASK_NICE(p);
+}
+EXPORT_SYMBOL(task_nice);
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ *
+ * True when the task currently running on @cpu is that runqueue's idle task.
+ */
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ */
+struct task_struct *idle_task(int cpu)
+{
+	return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * A @pid of 0 refers to the current task.
+ */
+static struct task_struct *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock.
*/ +static void +__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) +{ + p->policy = policy; + p->rt_priority = prio; + p->normal_prio = normal_prio(p); + /* we are holding p->pi_lock already */ + p->prio = rt_mutex_getprio(p); + if (rt_prio(p->prio)) + p->sched_class = &rt_sched_class; + else + p->sched_class = &fair_sched_class; + set_load_weight(p); +} + +/* + * check the target process has a UID that matches the current process's + */ +static bool check_same_owner(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred; + bool match; + + rcu_read_lock(); + pcred = __task_cred(p); + if (cred->user->user_ns == pcred->user->user_ns) + match = (cred->euid == pcred->euid || + cred->euid == pcred->uid); + else + match = false; + rcu_read_unlock(); + return match; +} + +static int __sched_setscheduler(struct task_struct *p, int policy, + const struct sched_param *param, bool user) +{ + int retval, oldprio, oldpolicy = -1, on_rq, running; + unsigned long flags; + const struct sched_class *prev_class; + struct rq *rq; + int reset_on_fork; + + /* may grab non-irq protected spin_locks */ + BUG_ON(in_interrupt()); +recheck: + /* double check policy once rq lock held */ + if (policy < 0) { + reset_on_fork = p->sched_reset_on_fork; + policy = oldpolicy = p->policy; + } else { + reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); + policy &= ~SCHED_RESET_ON_FORK; + + if (policy != SCHED_FIFO && policy != SCHED_RR && + policy != SCHED_NORMAL && policy != SCHED_BATCH && + policy != SCHED_IDLE) + return -EINVAL; + } + + /* + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * SCHED_BATCH and SCHED_IDLE is 0. 
+ */ + if (param->sched_priority < 0 || + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) + return -EINVAL; + if (rt_policy(policy) != (param->sched_priority != 0)) + return -EINVAL; + + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (user && !capable(CAP_SYS_NICE)) { + if (rt_policy(policy)) { + unsigned long rlim_rtprio = + task_rlimit(p, RLIMIT_RTPRIO); + + /* can't set/change the rt policy */ + if (policy != p->policy && !rlim_rtprio) + return -EPERM; + + /* can't increase priority */ + if (param->sched_priority > p->rt_priority && + param->sched_priority > rlim_rtprio) + return -EPERM; + } + + /* + * Treat SCHED_IDLE as nice 20. Only allow a switch to + * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. + */ + if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { + if (!can_nice(p, TASK_NICE(p))) + return -EPERM; + } + + /* can't change other user's priorities */ + if (!check_same_owner(p)) + return -EPERM; + + /* Normal users shall not reset the sched_reset_on_fork flag */ + if (p->sched_reset_on_fork && !reset_on_fork) + return -EPERM; + } + + if (user) { + retval = security_task_setscheduler(p); + if (retval) + return retval; + } + + /* + * make sure no PI-waiters arrive (or leave) while we are + * changing the priority of the task: + * + * To be able to change p->policy safely, the appropriate + * runqueue lock must be held. 
+ */ + rq = task_rq_lock(p, &flags); + + /* + * Changing the policy of the stop threads its a very bad idea + */ + if (p == rq->stop) { + task_rq_unlock(rq, p, &flags); + return -EINVAL; + } + + /* + * If not changing anything there's no need to proceed further: + */ + if (unlikely(policy == p->policy && (!rt_policy(policy) || + param->sched_priority == p->rt_priority))) { + + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return 0; + } + +#ifdef CONFIG_RT_GROUP_SCHED + if (user) { + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0 && + !task_group_is_autogroup(task_group(p))) { + task_rq_unlock(rq, p, &flags); + return -EPERM; + } + } +#endif + + /* recheck policy now with rq lock held */ + if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { + policy = oldpolicy = -1; + task_rq_unlock(rq, p, &flags); + goto recheck; + } + on_rq = p->on_rq; + running = task_current(rq, p); + if (on_rq) + deactivate_task(rq, p, 0); + if (running) + p->sched_class->put_prev_task(rq, p); + + p->sched_reset_on_fork = reset_on_fork; + + oldprio = p->prio; + prev_class = p->sched_class; + __setscheduler(rq, p, policy, param->sched_priority); + + if (running) + p->sched_class->set_curr_task(rq); + if (on_rq) + activate_task(rq, p, 0); + + check_class_changed(rq, p, prev_class, oldprio); + task_rq_unlock(rq, p, &flags); + + rt_mutex_adjust_pi(p); + + return 0; +} + +/** + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. + * @p: the task in question. + * @policy: new policy. + * @param: structure containing the new RT priority. + * + * NOTE that the task may be already dead. 
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission. For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Implemented by calling __sched_setscheduler() with user == false.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+			       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false);
+}
+
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param lparam;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+
+	rcu_read_lock();	/* held across the pid lookup and the policy change */
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Returns -EINVAL for a negative @policy, otherwise the result of
+ * do_sched_setscheduler().
+ */
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+		struct sched_param __user *, param)
+{
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+	return do_sched_setscheduler(pid, -1, param);	/* policy -1: keep the current policy */
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * On success the return value is the task's policy, OR-ed with
+ * SCHED_RESET_ON_FORK when the task has sched_reset_on_fork set.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy
+				| (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
+	}
+	rcu_read_unlock();
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+	struct sched_param lp;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	lp.sched_priority = p->rt_priority;
+	rcu_read_unlock();
+
+	/*
+	 * This one might sleep, so it is only done after the RCU read
+	 * lock has been dropped above.
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+	cpumask_var_t cpus_allowed, new_mask;
+	struct task_struct *p;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		rcu_read_unlock();
+		put_online_cpus();
+		return -ESRCH;
+	}
+
+	/* Prevent p going away once the RCU read lock is dropped */
+	get_task_struct(p);
+	rcu_read_unlock();
+
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
+	retval = -EPERM;
+	if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
+		goto out_unlock;
+
+	retval = security_task_setscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, in_mask, cpus_allowed);	/* requested mask limited to the cpuset's */
+again:
+	retval = set_cpus_allowed_ptr(p, new_mask);
+
+	if (!retval) {
+		cpuset_cpus_allowed(p, cpus_allowed);
+		if (!cpumask_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed and try again.
+			 */
+			cpumask_copy(new_mask, cpus_allowed);
+			goto again;
+		}
+	}
+out_unlock:
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+out_put_task:
+	put_task_struct(p);
+	put_online_cpus();
+	return retval;
+}
+
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     struct cpumask *new_mask)
+{
+	if (len < cpumask_size())
+		cpumask_clear(new_mask);	/* short user buffer: zero the mask first */
+	else if (len > cpumask_size())
+		len = cpumask_size();	/* never copy more than the mask can hold */
+
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	cpumask_var_t new_mask;
+	int retval;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+	if (retval == 0)
+		retval = sched_setaffinity(pid, new_mask);
+	free_cpumask_var(new_mask);
+	return retval;
+}
+
+long sched_getaffinity(pid_t pid, struct cpumask *mask)
+{
+	struct task_struct *p;
+	unsigned long flags;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);	/* report only online CPUs */
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+out_unlock:
+	rcu_read_unlock();
+	put_online_cpus();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * @len must cover at least nr_cpu_ids bits and be a multiple of
+ * sizeof(unsigned long), otherwise -EINVAL is returned. On success the
+ * number of bytes copied (the smaller of @len and cpumask_size()) is
+ * returned.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	int ret;
+	cpumask_var_t mask;
+
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(unsigned long)-1))
+		return -EINVAL;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, mask);
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());
+
+		if (copy_to_user(user_mask_ptr, mask, retlen))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
+	free_cpumask_var(mask);
+
+	return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. If there are no
+ * other threads running on this CPU then this function will return.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+	struct rq *rq = this_rq_lock();
+
+	schedstat_inc(rq, yld_count);
+	current->sched_class->yield_task(rq);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(rq->lock);
+	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+	do_raw_spin_unlock(&rq->lock);
+	preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+static inline int should_resched(void)
+{
+	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
+}
+
+static void __cond_resched(void)
+{
+	add_preempt_count(PREEMPT_ACTIVE);	/* PREEMPT_ACTIVE is set only around __schedule() */
+	__schedule();
+	sub_preempt_count(PREEMPT_ACTIVE);
+}
+
+int __sched _cond_resched(void)
+{
+	if (should_resched()) {
+		__cond_resched();
+		return 1;	/* we did reschedule */
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */ +int __cond_resched_lock(spinlock_t *lock) +{ + int resched = should_resched(); + int ret = 0; + + lockdep_assert_held(lock); + + if (spin_needbreak(lock) || resched) { + spin_unlock(lock); + if (resched) + __cond_resched(); + else + cpu_relax(); + ret = 1; + spin_lock(lock); + } + return ret; +} +EXPORT_SYMBOL(__cond_resched_lock); + +int __sched __cond_resched_softirq(void) +{ + BUG_ON(!in_softirq()); + + if (should_resched()) { + local_bh_enable(); + __cond_resched(); + local_bh_disable(); + return 1; + } + return 0; +} +EXPORT_SYMBOL(__cond_resched_softirq); + +/** + * yield - yield the current processor to other threads. + * + * This is a shortcut for kernel-space yielding - it marks the + * thread runnable and calls sys_sched_yield(). + */ +void __sched yield(void) +{ + set_current_state(TASK_RUNNING); + sys_sched_yield(); +} +EXPORT_SYMBOL(yield); + +/** + * yield_to - yield the current processor to another thread in + * your thread group, or accelerate that thread toward the + * processor it's on. + * @p: target task + * @preempt: whether task preemption is allowed or not + * + * It's the caller's job to ensure that the target task struct + * can't go away on us before we can do any checks. + * + * Returns true if we indeed boosted the target task. + */ +bool __sched yield_to(struct task_struct *p, bool preempt) +{ + struct task_struct *curr = current; + struct rq *rq, *p_rq; + unsigned long flags; + bool yielded = 0; + + local_irq_save(flags); + rq = this_rq(); + +again: + p_rq = task_rq(p); + double_rq_lock(rq, p_rq); + while (task_rq(p) != p_rq) { + double_rq_unlock(rq, p_rq); + goto again; + } + + if (!curr->sched_class->yield_to_task) + goto out; + + if (curr->sched_class != p->sched_class) + goto out; + + if (task_running(p_rq, p) || p->state) + goto out; + + yielded = curr->sched_class->yield_to_task(rq, p, preempt); + if (yielded) { + schedstat_inc(rq, yld_count); + /* + * Make p's CPU reschedule; pick_next_entity takes care of + * fairness. 
+ */ + if (preempt && rq != p_rq) + resched_task(p_rq->curr); + } + +out: + double_rq_unlock(rq, p_rq); + local_irq_restore(flags); + + if (yielded) + schedule(); + + return yielded; +} +EXPORT_SYMBOL_GPL(yield_to); + +/* + * This task is about to go to sleep on IO. Increment rq->nr_iowait so + * that process accounting knows that this is a task in IO wait state. + */ +void __sched io_schedule(void) +{ + struct rq *rq = raw_rq(); + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); + current->in_iowait = 1; + schedule(); + current->in_iowait = 0; + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); +} +EXPORT_SYMBOL(io_schedule); + +long __sched io_schedule_timeout(long timeout) +{ + struct rq *rq = raw_rq(); + long ret; + + delayacct_blkio_start(); + atomic_inc(&rq->nr_iowait); + blk_flush_plug(current); + current->in_iowait = 1; + ret = schedule_timeout(timeout); + current->in_iowait = 0; + atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); + return ret; +} + +/** + * sys_sched_get_priority_max - return maximum RT priority. + * @policy: scheduling class. + * + * this syscall returns the maximum rt_priority that can be used + * by a given scheduling class. + */ +SYSCALL_DEFINE1(sched_get_priority_max, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = MAX_USER_RT_PRIO-1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + break; + } + return ret; +} + +/** + * sys_sched_get_priority_min - return minimum RT priority. + * @policy: scheduling class. + * + * this syscall returns the minimum rt_priority that can be used + * by a given scheduling class. 
+ */ +SYSCALL_DEFINE1(sched_get_priority_min, int, policy) +{ + int ret = -EINVAL; + + switch (policy) { + case SCHED_FIFO: + case SCHED_RR: + ret = 1; + break; + case SCHED_NORMAL: + case SCHED_BATCH: + case SCHED_IDLE: + ret = 0; + } + return ret; +} + +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + */ +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct task_struct *p; + unsigned int time_slice; + unsigned long flags; + struct rq *rq; + int retval; + struct timespec t; + + if (pid < 0) + return -EINVAL; + + retval = -ESRCH; + rcu_read_lock(); + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = security_task_getscheduler(p); + if (retval) + goto out_unlock; + + rq = task_rq_lock(p, &flags); + time_slice = p->sched_class->get_rr_interval(rq, p); + task_rq_unlock(rq, p, &flags); + + rcu_read_unlock(); + jiffies_to_timespec(time_slice, &t); + retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; + return retval; + +out_unlock: + rcu_read_unlock(); + return retval; +} + +static const char stat_nam[] = TASK_STATE_TO_CHAR_STR; + +void sched_show_task(struct task_struct *p) +{ + unsigned long free = 0; + unsigned state; + + state = p->state ? __ffs(p->state) + 1 : 0; + printk(KERN_INFO "%-15.15s %c", p->comm, + state < sizeof(stat_nam) - 1 ? 
stat_nam[state] : '?'); +#if BITS_PER_LONG == 32 + if (state == TASK_RUNNING) + printk(KERN_CONT " running "); + else + printk(KERN_CONT " %08lx ", thread_saved_pc(p)); +#else + if (state == TASK_RUNNING) + printk(KERN_CONT " running task "); + else + printk(KERN_CONT " %016lx ", thread_saved_pc(p)); +#endif +#ifdef CONFIG_DEBUG_STACK_USAGE + free = stack_not_used(p); +#endif + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); + + show_stack(p, NULL); +} + +void show_state_filter(unsigned long state_filter) +{ + struct task_struct *g, *p; + +#if BITS_PER_LONG == 32 + printk(KERN_INFO + " task PC stack pid father\n"); +#else + printk(KERN_INFO + " task PC stack pid father\n"); +#endif + read_lock(&tasklist_lock); + do_each_thread(g, p) { + /* + * reset the NMI-timeout, listing all files on a slow + * console might take a lot of time: + */ + touch_nmi_watchdog(); + if (!state_filter || (p->state & state_filter)) + sched_show_task(p); + } while_each_thread(g, p); + + touch_all_softlockup_watchdogs(); + +#ifdef CONFIG_SCHED_DEBUG + sysrq_sched_debug_show(); +#endif + read_unlock(&tasklist_lock); + /* + * Only show locks if all tasks are dumped: + */ + if (!state_filter) + debug_show_all_locks(); +} + +void __cpuinit init_idle_bootup_task(struct task_struct *idle) +{ + idle->sched_class = &idle_sched_class; +} + +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. 
+ */
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	__sched_fork(idle);
+	idle->state = TASK_RUNNING;
+	idle->se.exec_start = sched_clock();
+
+	do_set_cpus_allowed(idle, cpumask_of(cpu));	/* the idle task may only run on @cpu */
+	/*
+	 * We have a chicken and egg problem: even though we are
+	 * holding rq->lock, the cpu isn't yet set to this cpu so the
+	 * lockdep check in task_group() will fail.
+	 *
+	 * Similar case to sched_fork(). / Alternatively we could
+	 * use task_rq_lock() here and obtain the other rq->lock.
+	 *
+	 * Silence PROVE_RCU
+	 */
+	rcu_read_lock();
+	__set_task_cpu(idle, cpu);
+	rcu_read_unlock();
+
+	rq->curr = rq->idle = idle;
+#if defined(CONFIG_SMP)
+	idle->on_cpu = 1;
+#endif
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+	task_thread_info(idle)->preempt_count = 0;
+
+	/*
+	 * The idle tasks have their own, simple scheduling class:
+	 */
+	idle->sched_class = &idle_sched_class;
+	ftrace_graph_init_idle_task(idle, cpu);
+}
+
+/*
+ * In a system that switches off the HZ timer, nohz_cpu_mask
+ * indicates which cpus entered this state. This is used
+ * in the rcu update to wait only for active cpus. For systems
+ * which do not switch off the HZ timer, nohz_cpu_mask should
+ * always be CPU_BITS_NONE.
+ */
+cpumask_var_t nohz_cpu_mask;
+
+/*
+ * Increase the granularity value when there are more CPUs,
+ * because with more CPUs the 'effective latency' as visible
+ * to users decreases. But the relationship is not linear,
+ * so pick a second-best guess by going with the log2 of the
+ * number of CPUs.
+ * + * This idea comes from the SD scheduler of Con Kolivas: + */ +static int get_update_sysctl_factor(void) +{ + unsigned int nr_cpus = min_t(int, num_online_cpus(), 8); + + switch (sysctl_sched_tunable_scaling) { + case SCHED_TUNABLESCALING_NONE: + return 1; + case SCHED_TUNABLESCALING_LINEAR: + return nr_cpus; + case SCHED_TUNABLESCALING_LOG: + default: + /* log scaling doubles as the fallback for unknown settings */ + return 1 + ilog2(nr_cpus); + } +} + +/* + * Recompute the three scheduler tunables from their normalized_sysctl_* + * values, multiplied by the factor from get_update_sysctl_factor(). + */ +static void update_sysctl(void) +{ + const unsigned int scale = get_update_sysctl_factor(); + + sysctl_sched_min_granularity = scale * normalized_sysctl_sched_min_granularity; + sysctl_sched_latency = scale * normalized_sysctl_sched_latency; + sysctl_sched_wakeup_granularity = scale * normalized_sysctl_sched_wakeup_granularity; +} + +/* Thin wrapper: applies the scaled tunables via update_sysctl(). */ +static inline void sched_init_granularity(void) +{ + update_sysctl(); +} + +#ifdef CONFIG_SMP +/* + * Install @new_mask as the allowed-CPU mask of @p. When the task's scheduling + * class supplies a set_cpus_allowed hook the work is delegated to it; + * otherwise the mask is copied and rt.nr_cpus_allowed is updated here. + */ +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + const struct sched_class *class = p->sched_class; + + if (class && class->set_cpus_allowed) { + class->set_cpus_allowed(p, new_mask); + return; + } + + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = cpumask_weight(new_mask); +} + +/* + * This is how migration works: + * + * 1) we invoke migration_cpu_stop() on the target CPU using + * stop_one_cpu(). + * 2) stopper starts to run (implicitly forcing the migrated thread + * off the CPU) + * 3) it checks whether the migrated task is still in the wrong runqueue. + * 4) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 5) stopper completes and stop_one_cpu() returns and the migration + * is done. + */ + +/* + * Change a given task's CPU affinity. Migrate the thread to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. 
The + * call is not atomic; no spinlocks may be held. + */ +int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) +{ + unsigned long flags; + struct rq *rq; + unsigned int dest_cpu; + int ret = 0; + + rq = task_rq_lock(p, &flags); +/* Nothing to do when the mask is unchanged; 0 is returned. */ + if (cpumask_equal(&p->cpus_allowed, new_mask)) + goto out; +/* The new mask must contain at least one active CPU, else -EINVAL. */ + if (!cpumask_intersects(new_mask, cpu_active_mask)) { + ret = -EINVAL; + goto out; + } +/* A PF_THREAD_BOUND task may only have its mask changed by itself, else -EINVAL. */ + if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { + ret = -EINVAL; + goto out; + } + + do_set_cpus_allowed(p, new_mask); + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), new_mask)) + goto out; + + dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); + if (p->on_rq) { + struct migration_arg arg = { p, dest_cpu }; + /* Need help from migration thread: drop lock and wait. */ + task_rq_unlock(rq, p, &flags); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + tlb_migrate_finish(p->mm); + return 0; + } +out: + task_rq_unlock(rq, p, &flags); + + return ret; +} +EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); + +/* + * Move (not current) task off this cpu, onto dest cpu. We're doing + * this because either it can't run here any more (set_cpus_allowed() + * away from this CPU, or CPU going down), or because we're + * attempting to rebalance this task on exec (sched_exec). + * + * So we race with normal scheduler movements, but that's OK, as long + * as the task is no longer on this CPU. + * + * Returns non-zero if the task was migrated, had already left @src_cpu, + * or is not queued on a runqueue. Returns 0 if @dest_cpu is not active + * or is no longer in the task's allowed mask. + */ +static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) +{ + struct rq *rq_dest, *rq_src; + int ret = 0; + + if (unlikely(!cpu_active(dest_cpu))) + return ret; + + rq_src = cpu_rq(src_cpu); + rq_dest = cpu_rq(dest_cpu); +/* Lock order: the task's pi_lock first, then both runqueue locks. */ + raw_spin_lock(&p->pi_lock); + double_rq_lock(rq_src, rq_dest); + /* Already moved. */ + if (task_cpu(p) != src_cpu) + goto done; + /* Affinity changed (again). 
*/ + if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) + goto fail; + + /* + * If we're not on a rq, the next wake-up will ensure we're + * placed properly. + */ + if (p->on_rq) { + deactivate_task(rq_src, p, 0); + set_task_cpu(p, dest_cpu); + activate_task(rq_dest, p, 0); + check_preempt_curr(rq_dest, p, 0); + } +done: + ret = 1; +fail: + double_rq_unlock(rq_src, rq_dest); + raw_spin_unlock(&p->pi_lock); + return ret; +} + +/* + * migration_cpu_stop - this will be executed by a highprio stopper thread + * and performs thread migration by bumping thread off CPU then + * 'pushing' onto another runqueue. + * + * @data is a struct migration_arg; the result of __migrate_task() is + * ignored and 0 is always returned. + */ +static int migration_cpu_stop(void *data) +{ + struct migration_arg *arg = data; + + /* + * The original target cpu might have gone down and we might + * be on another cpu but it doesn't matter. + */ + local_irq_disable(); + __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); + local_irq_enable(); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Ensures that the idle task is using init_mm right before its cpu goes + * offline. + */ +void idle_task_exit(void) +{ + struct mm_struct *mm = current->active_mm; + + BUG_ON(cpu_online(smp_processor_id())); + + if (mm != &init_mm) + switch_mm(mm, &init_mm, current); + mmdrop(mm); +} + +/* + * While a dead CPU has no uninterruptible tasks queued at this point, + * it might still have a nonzero ->nr_uninterruptible counter, because + * for performance reasons the counter is not strictly tracking tasks to + * their home CPUs. So we just add the counter to another CPU's counter, + * to keep the global sum constant after CPU-down: + */ +static void migrate_nr_uninterruptible(struct rq *rq_src) +{ + struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); + + rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; + rq_src->nr_uninterruptible = 0; +} + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. 
+ */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); + rq->calc_load_active = 0; +} + +/* + * Migrate all tasks from the rq, sleeping tasks will be migrated by + * try_to_wake_up()->select_task_rq(). + * + * Called with rq->lock held even though we'er in stop_machine() and + * there's no concurrency possible, we hold the required locks anyway + * because of lock validation efforts. + */ +static void migrate_tasks(unsigned int dead_cpu) +{ + struct rq *rq = cpu_rq(dead_cpu); + struct task_struct *next, *stop = rq->stop; + int dest_cpu; + + /* + * Fudge the rq selection such that the below task selection loop + * doesn't get stuck on the currently eligible stop task. + * + * We're currently inside stop_machine() and the rq is either stuck + * in the stop_machine_cpu_stop() loop, or we're executing this code, + * either way we should never end up calling schedule() until we're + * done here. + */ + rq->stop = NULL; + + for ( ; ; ) { + /* + * There's this thread running, bail when that's the only + * remaining thread. + */ + if (rq->nr_running == 1) + break; + + next = pick_next_task(rq); + BUG_ON(!next); + next->sched_class->put_prev_task(rq, next); + + /* Find suitable destination for @next, with force if needed. 
*/ + dest_cpu = select_fallback_rq(dead_cpu, next); + raw_spin_unlock(&rq->lock); + + __migrate_task(next, dead_cpu, dest_cpu); + + raw_spin_lock(&rq->lock); + } + + rq->stop = stop; +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. 
+ */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(13); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[11], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring); + /* &table[12] is terminator */ + + return table; +} + 
+/* + * Build the sysctl directory for @cpu: one "domainN" entry per sched domain + * attached to the cpu, followed by a zeroed terminator entry. Returns NULL + * when the table itself cannot be allocated; a failed kstrdup() or child + * table allocation is left as NULL in the affected entry. + */ +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + /* size the write from the buffer itself rather than a repeated literal */ + snprintf(buf, sizeof(buf), "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +/* + * Populate sd_ctl_dir[0].child with one "cpuN" directory per possible cpu + * and register the sd_ctl_root tree. If the top-level table cannot be + * allocated nothing is registered. + */ +static void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, sizeof(buf), "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +static void unregister_sched_domain_sysctl(void) +{ + if (sd_sysctl_header) + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#else +static void register_sched_domain_sysctl(void) +{ +} +static void unregister_sched_domain_sysctl(void) +{ +} +#endif + +/* + * Mark @rq online in its root domain and notify every scheduling class that + * provides an rq_online hook. No-op if the rq is already online. + */ +static void set_rq_online(struct rq *rq) +{ + if (!rq->online) { + const struct sched_class *class; + + cpumask_set_cpu(rq->cpu, rq->rd->online); + rq->online = 1; + + for_each_class(class) { + if (class->rq_online) + class->rq_online(rq); + } + } +} + +/* + * Counterpart of set_rq_online(): the class rq_offline hooks run first, then + * the rq is cleared from its root domain's online mask. + */ +static void set_rq_offline(struct rq *rq) +{ + if (rq->online) { + const struct sched_class *class; + + for_each_class(class) { + if (class->rq_offline) + class->rq_offline(rq); + } + + cpumask_clear_cpu(rq->cpu, 
rq->rd->online); + rq->online = 0; + } +} + +/* + * migration_call - cpu hotplug notifier callback for the scheduler. + * + * CPU_UP_PREPARE: seed the rq's calc_load_update stamp. + * CPU_ONLINE: mark the rq online in its root domain (if it has one). + * CPU_DYING (CONFIG_HOTPLUG_CPU only): take the rq offline, migrate its + * tasks away and fold its nr_uninterruptible and calc_load counters + * elsewhere. + * + * The CPU_TASKS_FROZEN bit of @action is ignored. Always returns NOTIFY_OK. + */ +static int __cpuinit +migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (long)hcpu; + unsigned long flags; + struct rq *rq = cpu_rq(cpu); + + switch (action & ~CPU_TASKS_FROZEN) { + + case CPU_UP_PREPARE: + rq->calc_load_update = calc_load_update; + break; + + case CPU_ONLINE: + /* Update our root-domain */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + + set_rq_online(rq); + } + raw_spin_unlock_irqrestore(&rq->lock, flags); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DYING: + sched_ttwu_pending(); + /* Update our root-domain */ + raw_spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + migrate_tasks(cpu); + BUG_ON(rq->nr_running != 1); /* the migration thread */ + raw_spin_unlock_irqrestore(&rq->lock, flags); + + migrate_nr_uninterruptible(rq); + calc_global_load_remove(rq); + break; +#endif + } + + update_max_interval(); + + return NOTIFY_OK; +} + +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else. This has to be lower priority than + * the notifier in the perf_event subsystem, though. 
+ */ +static struct notifier_block __cpuinitdata migration_notifier = { + .notifier_call = migration_call, + .priority = CPU_PRI_MIGRATION, +}; + +static int __cpuinit sched_cpu_active(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_STARTING: + case CPU_DOWN_FAILED: + set_cpu_active((long)hcpu, true); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + set_cpu_active((long)hcpu, false); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + +static int __init migration_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + /* Initialize migration for the boot CPU */ + err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); + BUG_ON(err == NOTIFY_BAD); + migration_call(&migration_notifier, CPU_ONLINE, cpu); + register_cpu_notifier(&migration_notifier); + + /* Register cpu active notifiers */ + cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE); + cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE); + + return 0; +} +early_initcall(migration_init); +#endif + +#ifdef CONFIG_SMP + +static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ + +#ifdef CONFIG_SCHED_DEBUG + +static __read_mostly int sched_domain_debug_enabled; + +static int __init sched_domain_debug_setup(char *str) +{ + sched_domain_debug_enabled = 1; + + return 0; +} +early_param("sched_debug", sched_domain_debug_setup); + +static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, + struct cpumask *groupmask) +{ + struct sched_group *group = sd->groups; + char str[256]; + + cpulist_scnprintf(str, sizeof(str), sched_domain_span(sd)); + cpumask_clear(groupmask); + + printk(KERN_DEBUG "%*s domain %d: ", level, "", level); + + if (!(sd->flags & SD_LOAD_BALANCE)) { + printk("does not 
load-balance\n"); + if (sd->parent) + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); + return -1; + } + + printk(KERN_CONT "span %s level %s\n", str, sd->name); + + if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); + } + if (!cpumask_test_cpu(cpu, sched_group_cpus(group))) { + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); + } + + printk(KERN_DEBUG "%*s groups:", level + 1, ""); + do { + if (!group) { + printk("\n"); + printk(KERN_ERR "ERROR: group is NULL\n"); + break; + } + + if (!group->sgp->power) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); + break; + } + + if (!cpumask_weight(sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: empty group\n"); + break; + } + + if (cpumask_intersects(groupmask, sched_group_cpus(group))) { + printk(KERN_CONT "\n"); + printk(KERN_ERR "ERROR: repeated CPUs\n"); + break; + } + + cpumask_or(groupmask, groupmask, sched_group_cpus(group)); + + cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); + + printk(KERN_CONT " %s", str); + if (group->sgp->power != SCHED_POWER_SCALE) { + printk(KERN_CONT " (cpu_power = %d)", + group->sgp->power); + } + + group = group->next; + } while (group != sd->groups); + printk(KERN_CONT "\n"); + + if (!cpumask_equal(sched_domain_span(sd), groupmask)) + printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + + if (sd->parent && + !cpumask_subset(groupmask, sched_domain_span(sd->parent))) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); + return 0; +} + +static void sched_domain_debug(struct sched_domain *sd, int cpu) +{ + int level = 0; + + if (!sched_domain_debug_enabled) + return; + + if (!sd) { + printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu); + return; + } + + printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); + + for (;;) { + if 
(sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask)) + break; + level++; + sd = sd->parent; + if (!sd) + break; + } +} +#else /* !CONFIG_SCHED_DEBUG */ +# define sched_domain_debug(sd, cpu) do { } while (0) +#endif /* CONFIG_SCHED_DEBUG */ + +static int sd_degenerate(struct sched_domain *sd) +{ + if (cpumask_weight(sched_domain_span(sd)) == 1) + return 1; + + /* Following flags need at least 2 groups */ + if (sd->flags & (SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES)) { + if (sd->groups != sd->groups->next) + return 0; + } + + /* Following flags don't use groups */ + if (sd->flags & (SD_WAKE_AFFINE)) + return 0; + + return 1; +} + +static int +sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) +{ + unsigned long cflags = sd->flags, pflags = parent->flags; + + if (sd_degenerate(parent)) + return 1; + + if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent))) + return 0; + + /* Flags needing groups don't count if only 1 group in parent */ + if (parent->groups == parent->groups->next) { + pflags &= ~(SD_LOAD_BALANCE | + SD_BALANCE_NEWIDLE | + SD_BALANCE_FORK | + SD_BALANCE_EXEC | + SD_SHARE_CPUPOWER | + SD_SHARE_PKG_RESOURCES); + if (nr_node_ids == 1) + pflags &= ~SD_SERIALIZE; + } + if (~cflags & pflags) + return 0; + + return 1; +} + +static void free_rootdomain(struct rcu_head *rcu) +{ + struct root_domain *rd = container_of(rcu, struct root_domain, rcu); + + cpupri_cleanup(&rd->cpupri); + free_cpumask_var(rd->rto_mask); + free_cpumask_var(rd->online); + free_cpumask_var(rd->span); + kfree(rd); +} + +static void rq_attach_root(struct rq *rq, struct root_domain *rd) +{ + struct root_domain *old_rd = NULL; + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + + if (rq->rd) { + old_rd = rq->rd; + + if (cpumask_test_cpu(rq->cpu, old_rd->online)) + set_rq_offline(rq); + + cpumask_clear_cpu(rq->cpu, old_rd->span); + + /* + * 
If we dont want to free the old_rt yet then + * set old_rd to NULL to skip the freeing later + * in this function: + */ + if (!atomic_dec_and_test(&old_rd->refcount)) + old_rd = NULL; + } + + atomic_inc(&rd->refcount); + rq->rd = rd; + + cpumask_set_cpu(rq->cpu, rd->span); + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + if (old_rd) + call_rcu_sched(&old_rd->rcu, free_rootdomain); +} + +static int init_rootdomain(struct root_domain *rd) +{ + memset(rd, 0, sizeof(*rd)); + + if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + goto out; + if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + goto free_span; + if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + goto free_online; + + if (cpupri_init(&rd->cpupri) != 0) + goto free_rto_mask; + return 0; + +free_rto_mask: + free_cpumask_var(rd->rto_mask); +free_online: + free_cpumask_var(rd->online); +free_span: + free_cpumask_var(rd->span); +out: + return -ENOMEM; +} + +static void init_defrootdomain(void) +{ + init_rootdomain(&def_root_domain); + + atomic_set(&def_root_domain.refcount, 1); +} + +static struct root_domain *alloc_rootdomain(void) +{ + struct root_domain *rd; + + rd = kmalloc(sizeof(*rd), GFP_KERNEL); + if (!rd) + return NULL; + + if (init_rootdomain(rd) != 0) { + kfree(rd); + return NULL; + } + + return rd; +} + +static void free_sched_groups(struct sched_group *sg, int free_sgp) +{ + struct sched_group *tmp, *first; + + if (!sg) + return; + + first = sg; + do { + tmp = sg->next; + + if (free_sgp && atomic_dec_and_test(&sg->sgp->ref)) + kfree(sg->sgp); + + kfree(sg); + sg = tmp; + } while (sg != first); +} + +static void free_sched_domain(struct rcu_head *rcu) +{ + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + /* + * If its an overlapping domain it has private groups, iterate and + * nuke them all. 
+ */ + if (sd->flags & SD_OVERLAP) { + free_sched_groups(sd->groups, 1); + } else if (atomic_dec_and_test(&sd->groups->ref)) { + kfree(sd->groups->sgp); + kfree(sd->groups); + } + kfree(sd); +} + +static void destroy_sched_domain(struct sched_domain *sd, int cpu) +{ + call_rcu(&sd->rcu, free_sched_domain); +} + +static void destroy_sched_domains(struct sched_domain *sd, int cpu) +{ + for (; sd; sd = sd->parent) + destroy_sched_domain(sd, cpu); +} + +/* + * Attach the domain 'sd' to 'cpu' as its base domain. Callers must + * hold the hotplug lock. + */ +static void +cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + struct sched_domain *tmp; + + /* Remove the sched domains which do not contribute to scheduling. */ + for (tmp = sd; tmp; ) { + struct sched_domain *parent = tmp->parent; + if (!parent) + break; + + if (sd_parent_degenerate(tmp, parent)) { + tmp->parent = parent->parent; + if (parent->parent) + parent->parent->child = tmp; + destroy_sched_domain(parent, cpu); + } else + tmp = tmp->parent; + } + + if (sd && sd_degenerate(sd)) { + tmp = sd; + sd = sd->parent; + destroy_sched_domain(tmp, cpu); + if (sd) + sd->child = NULL; + } + + sched_domain_debug(sd, cpu); + + rq_attach_root(rq, rd); + tmp = rq->sd; + rcu_assign_pointer(rq->sd, sd); + destroy_sched_domains(tmp, cpu); +} + +/* cpus with isolated domains */ +static cpumask_var_t cpu_isolated_map; + +/* Setup the mask of cpus configured for isolated domains */ +static int __init isolated_cpu_setup(char *str) +{ + alloc_bootmem_cpumask_var(&cpu_isolated_map); + cpulist_parse(str, cpu_isolated_map); + return 1; +} + +__setup("isolcpus=", isolated_cpu_setup); + +#define SD_NODES_PER_DOMAIN 16 + +#ifdef CONFIG_NUMA + +/** + * find_next_best_node - find the next node to include in a sched_domain + * @node: node whose sched_domain we're building + * @used_nodes: nodes already in the sched_domain + * + * Find the next node to include in a given 
scheduling domain. Simply + * finds the closest node not already in the @used_nodes map. + * + * Should use nodemask_t. + */ +static int find_next_best_node(int node, nodemask_t *used_nodes) +{ + int best_node = -1, min_val = INT_MAX; + int offset; + + /* Visit every node id once, beginning at @node and wrapping around. */ + for (offset = 0; offset < nr_node_ids; offset++) { + int candidate = (node + offset) % nr_node_ids; + int distance; + + /* Only nodes that have cpus and are not yet used qualify. */ + if (!nr_cpus_node(candidate) || node_isset(candidate, *used_nodes)) + continue; + + /* Keep the first candidate with the smallest distance. */ + distance = node_distance(node, candidate); + if (distance >= min_val) + continue; + + min_val = distance; + best_node = candidate; + } + + if (best_node == -1) + return -1; + + /* Record the winner in @used_nodes so later calls skip it. */ + node_set(best_node, *used_nodes); + return best_node; +} + +/** + * sched_domain_node_span - get a cpumask for a node's sched_domain + * @node: node whose cpumask we're constructing + * @span: resulting cpumask + * + * Given a node, construct a good cpumask for its sched_domain to span. It + * should be one that prevents unnecessary balancing, but also spreads tasks + * out optimally. 
+ */ +static void sched_domain_node_span(int node, struct cpumask *span) +{ + nodemask_t used_nodes; + int i; + + cpumask_clear(span); + nodes_clear(used_nodes); + + cpumask_or(span, span, cpumask_of_node(node)); + node_set(node, used_nodes); + + for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { + int next_node = find_next_best_node(node, &used_nodes); + if (next_node < 0) + break; + cpumask_or(span, span, cpumask_of_node(next_node)); + } +} + +static const struct cpumask *cpu_node_mask(int cpu) +{ + lockdep_assert_held(&sched_domains_mutex); + + sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); + + return sched_domains_tmpmask; +} + +static const struct cpumask *cpu_allnodes_mask(int cpu) +{ + return cpu_possible_mask; +} +#endif /* CONFIG_NUMA */ + +static const struct cpumask *cpu_cpu_mask(int cpu) +{ + return cpumask_of_node(cpu_to_node(cpu)); +} + +int sched_smt_power_savings = 0, sched_mc_power_savings = 0; + +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; + struct sched_group_power **__percpu sgp; +}; + +struct s_data { + struct sched_domain ** __percpu sd; + struct root_domain *rd; +}; + +enum s_alloc { + sa_rootdomain, + sa_sd, + sa_sd_storage, + sa_none, +}; + +struct sched_domain_topology_level; + +typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); + +#define SDTL_OVERLAP 0x01 + +struct sched_domain_topology_level { + sched_domain_init_f init; + sched_domain_mask_f mask; + int flags; + struct sd_data data; +}; + +static int +build_overlap_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered = sched_domains_tmpmask; + struct sd_data *sdd = sd->private; + struct sched_domain *child; + int i; + + cpumask_clear(covered); + + for_each_cpu(i, span) { 
+ struct cpumask *sg_span; + + if (cpumask_test_cpu(i, covered)) + continue; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(i)); + + if (!sg) + goto fail; + + sg_span = sched_group_cpus(sg); + + child = *per_cpu_ptr(sdd->sd, i); + if (child->child) { + child = child->child; + cpumask_copy(sg_span, sched_domain_span(child)); + } else + cpumask_set_cpu(i, sg_span); + + cpumask_or(covered, covered, sg_span); + + sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); + atomic_inc(&sg->sgp->ref); + + if (cpumask_test_cpu(cpu, sg_span)) + groups = sg; + + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + last->next = first; + } + sd->groups = groups; + + return 0; + +fail: + free_sched_groups(first, 0); + + return -ENOMEM; +} + +static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg) +{ + struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); + struct sched_domain *child = sd->child; + + if (child) + cpu = cpumask_first(sched_domain_span(child)); + + if (sg) { + *sg = *per_cpu_ptr(sdd->sg, cpu); + (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu); + atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */ + } + + return cpu; +} + +/* + * build_sched_groups will build a circular linked list of the groups + * covered by the given span, and will set each group's ->cpumask correctly, + * and ->cpu_power to 0. 
+ * + * Assumes the sched_domain tree is fully constructed + */ +static int +build_sched_groups(struct sched_domain *sd, int cpu) +{ + struct sched_group *first = NULL, *last = NULL; + struct sd_data *sdd = sd->private; + const struct cpumask *span = sched_domain_span(sd); + struct cpumask *covered; + int i; +/* Point sd->groups at the group get_group() selects for @cpu and take a reference on it. */ + get_group(cpu, sdd, &sd->groups); + atomic_inc(&sd->groups->ref); +/* Only the first cpu of the span goes on to build the group list; always returns 0. */ + if (cpu != cpumask_first(sched_domain_span(sd))) + return 0; +/* sched_domains_tmpmask is used as the scratch 'covered' mask below. */ + lockdep_assert_held(&sched_domains_mutex); + covered = sched_domains_tmpmask; + + cpumask_clear(covered); + + for_each_cpu(i, span) { + struct sched_group *sg; + int group = get_group(i, sdd, &sg); + int j; +/* cpus already placed in an earlier group are skipped. */ + if (cpumask_test_cpu(i, covered)) + continue; +/* Start @sg empty, then add every cpu of the span that maps to the same group id. */ + cpumask_clear(sched_group_cpus(sg)); + sg->sgp->power = 0; + + for_each_cpu(j, span) { + if (get_group(j, sdd, NULL) != group) + continue; + + cpumask_set_cpu(j, covered); + cpumask_set_cpu(j, sched_group_cpus(sg)); + } +/* Append @sg to the list being built. */ + if (!first) + first = sg; + if (last) + last->next = sg; + last = sg; + }/* Close the list into a ring. NOTE(review): relies on the loop above having run at least once so that last is set. */ + last->next = first; + + return 0; +} + +/* + * Initialize sched groups cpu_power. + * + * cpu_power indicates the capacity of sched group, which is used while + * distributing the load between different sched groups in a sched domain. + * Typically cpu_power for all the groups in a sched domain will be same unless + * there are asymmetries in the topology. If there are asymmetries, group + * having more cpu_power will pickup more load compared to the group having + * less cpu_power. 
+ */ +static void init_sched_groups_power(int cpu, struct sched_domain *sd) +{ + struct sched_group *sg = sd->groups; + + WARN_ON(!sd || !sg); + + do { + sg->group_weight = cpumask_weight(sched_group_cpus(sg)); + sg = sg->next; + } while (sg != sd->groups); + + if (cpu != group_first_cpu(sg)) + return; + + update_group_power(sd, cpu); +} + +/* + * Initializers for schedule domains + * Non-inlined to reduce accumulated stack pressure in build_sched_domains() + */ + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(sd, type) sd->name = #type +#else +# define SD_INIT_NAME(sd, type) do { } while (0) +#endif + +#define SD_INIT_FUNC(type) \ +static noinline struct sched_domain * \ +sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ +{ \ + struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \ + *sd = SD_##type##_INIT; \ + SD_INIT_NAME(sd, type); \ + sd->private = &tl->data; \ + return sd; \ +} + +SD_INIT_FUNC(CPU) +#ifdef CONFIG_NUMA + SD_INIT_FUNC(ALLNODES) + SD_INIT_FUNC(NODE) +#endif +#ifdef CONFIG_SCHED_SMT + SD_INIT_FUNC(SIBLING) +#endif +#ifdef CONFIG_SCHED_MC + SD_INIT_FUNC(MC) +#endif +#ifdef CONFIG_SCHED_BOOK + SD_INIT_FUNC(BOOK) +#endif + +static int default_relax_domain_level = -1; +int sched_domain_level_max; + +static int __init setup_relax_domain_level(char *str) +{ + if (kstrtoint(str, 0, &default_relax_domain_level)) + pr_warn("Unable to set relax_domain_level\n"); + + return 1; +} +__setup("relax_domain_level=", setup_relax_domain_level); + +static void set_domain_attribute(struct sched_domain *sd, + struct sched_domain_attr *attr) +{ + int request; + + if (!attr || attr->relax_domain_level < 0) { + if (default_relax_domain_level < 0) + return; + else + request = default_relax_domain_level; + } else + request = attr->relax_domain_level; + if (request < sd->level) { + /* turn off idle balance on this domain */ + sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } else { + /* turn on idle balance on this domain */ + sd->flags |= 
(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); + } +} + +static void __sdt_free(const struct cpumask *cpu_map); +static int __sdt_alloc(const struct cpumask *cpu_map); + +static void __free_domain_allocs(struct s_data *d, enum s_alloc what, + const struct cpumask *cpu_map) +{ + switch (what) { + case sa_rootdomain: + if (!atomic_read(&d->rd->refcount)) + free_rootdomain(&d->rd->rcu); /* fall through */ + case sa_sd: + free_percpu(d->sd); /* fall through */ + case sa_sd_storage: + __sdt_free(cpu_map); /* fall through */ + case sa_none: + break; + } +} + +static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, + const struct cpumask *cpu_map) +{ + memset(d, 0, sizeof(*d)); + + if (__sdt_alloc(cpu_map)) + return sa_sd_storage; + d->sd = alloc_percpu(struct sched_domain *); + if (!d->sd) + return sa_sd_storage; + d->rd = alloc_rootdomain(); + if (!d->rd) + return sa_sd; + return sa_rootdomain; +} + +/* + * NULL the sd_data elements we've used to build the sched_domain and + * sched_group structure so that the subsequent __free_domain_allocs() + * will not free the data we're using. + */ +static void claim_allocations(int cpu, struct sched_domain *sd) +{ + struct sd_data *sdd = sd->private; + + WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); + *per_cpu_ptr(sdd->sd, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) + *per_cpu_ptr(sdd->sg, cpu) = NULL; + + if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref)) + *per_cpu_ptr(sdd->sgp, cpu) = NULL; +} + +#ifdef CONFIG_SCHED_SMT +static const struct cpumask *cpu_smt_mask(int cpu) +{ + return topology_thread_cpumask(cpu); +} +#endif + +/* + * Topology list, bottom-up. 
+ */ +static struct sched_domain_topology_level default_topology[] = { +#ifdef CONFIG_SCHED_SMT + { sd_init_SIBLING, cpu_smt_mask, }, +#endif +#ifdef CONFIG_SCHED_MC + { sd_init_MC, cpu_coregroup_mask, }, +#endif +#ifdef CONFIG_SCHED_BOOK + { sd_init_BOOK, cpu_book_mask, }, +#endif + { sd_init_CPU, cpu_cpu_mask, }, +#ifdef CONFIG_NUMA + { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, + { sd_init_ALLNODES, cpu_allnodes_mask, }, +#endif + { NULL, }, +}; + +static struct sched_domain_topology_level *sched_domain_topology = default_topology; + +/* + * Allocate, for every topology level, the three per-cpu pointer arrays and + * one sched_domain, sched_group and sched_group_power per cpu in @cpu_map. + * Returns 0 or -ENOMEM; on failure nothing is released here, the caller is + * expected to run __sdt_free() over the partially filled state. + */ +static int __sdt_alloc(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + sdd->sd = alloc_percpu(struct sched_domain *); + if (!sdd->sd) + return -ENOMEM; + + sdd->sg = alloc_percpu(struct sched_group *); + if (!sdd->sg) + return -ENOMEM; + + sdd->sgp = alloc_percpu(struct sched_group_power *); + if (!sdd->sgp) + return -ENOMEM; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + struct sched_group *sg; + struct sched_group_power *sgp; + + sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sd) + return -ENOMEM; + + *per_cpu_ptr(sdd->sd, j) = sd; + + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), + GFP_KERNEL, cpu_to_node(j)); + if (!sg) + return -ENOMEM; + + *per_cpu_ptr(sdd->sg, j) = sg; + + sgp = kzalloc_node(sizeof(struct sched_group_power), + GFP_KERNEL, cpu_to_node(j)); + if (!sgp) + return -ENOMEM; + + *per_cpu_ptr(sdd->sgp, j) = sgp; + } + } + + return 0; +} + +/* + * Free whatever __sdt_alloc() left behind for the cpus in @cpu_map. + * + * This also runs after a partial __sdt_alloc() failure, so any of the + * per-cpu bases may be NULL: each one is checked before per_cpu_ptr() is + * applied to it. The bases are reset to NULL once freed so that a later + * failed build cannot free a stale pointer a second time. + */ +static void __sdt_free(const struct cpumask *cpu_map) +{ + struct sched_domain_topology_level *tl; + int j; + + for (tl = sched_domain_topology; tl->init; tl++) { + struct sd_data *sdd = &tl->data; + + for_each_cpu(j, cpu_map) { + struct sched_domain *sd; + + if (sdd->sd) { + sd = *per_cpu_ptr(sdd->sd, j); + if (sd && (sd->flags & SD_OVERLAP)) + free_sched_groups(sd->groups, 0); + kfree(*per_cpu_ptr(sdd->sd, j)); + } + + if (sdd->sg) + kfree(*per_cpu_ptr(sdd->sg, j)); + if (sdd->sgp) + kfree(*per_cpu_ptr(sdd->sgp, j)); + } + free_percpu(sdd->sd); + sdd->sd = NULL; + free_percpu(sdd->sg); + sdd->sg = NULL; + free_percpu(sdd->sgp); + sdd->sgp = NULL; + } +} + 
+/* + * Initialise the domain of level @tl for @cpu, span it over the part of + * tl->mask(cpu) that lies in @cpu_map and link it above @child. Returns the + * new domain, or @child unchanged when tl->init() yields nothing. + */ +struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, + struct s_data *d, const struct cpumask *cpu_map, + struct sched_domain_attr *attr, struct sched_domain *child, + int cpu) +{ + struct sched_domain *sd = tl->init(tl, cpu); + if (!sd) + return child; + + cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); + if (child) { + sd->level = child->level + 1; + sched_domain_level_max = max(sched_domain_level_max, sd->level); + child->parent = sd; + } + sd->child = child; + set_domain_attribute(sd, attr); + + return sd; +} + +/* + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus + */ +static int build_sched_domains(const struct cpumask *cpu_map, + struct sched_domain_attr *attr) +{ + enum s_alloc alloc_state = sa_none; + struct sched_domain *sd; + struct s_data d; + int i, ret = -ENOMEM; + + alloc_state = __visit_domain_allocation_hell(&d, cpu_map); + if (alloc_state != sa_rootdomain) + goto error; + + /* Set up domains for cpus specified by the cpu_map. 
*/ + for_each_cpu(i, cpu_map) { + struct sched_domain_topology_level *tl; + + sd = NULL; + for (tl = sched_domain_topology; tl->init; tl++) { + sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i); + if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP)) + sd->flags |= SD_OVERLAP; + if (cpumask_equal(cpu_map, sched_domain_span(sd))) + break; /* this level already spans the whole map */ + } + /* Walk back down to the lowest-level domain before recording it */ + while (sd->child) + sd = sd->child; + + *per_cpu_ptr(d.sd, i) = sd; + } + + /* Build the groups for the domains */ + for_each_cpu(i, cpu_map) { + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + sd->span_weight = cpumask_weight(sched_domain_span(sd)); + if (sd->flags & SD_OVERLAP) { + if (build_overlap_sched_groups(sd, i)) + goto error; + } else { + if (build_sched_groups(sd, i)) + goto error; + } + } + } + + /* Calculate CPU power for physical packages and nodes */ + for (i = nr_cpumask_bits-1; i >= 0; i--) { + if (!cpumask_test_cpu(i, cpu_map)) + continue; + + for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { + claim_allocations(i, sd); + init_sched_groups_power(i, sd); + } + } + + /* Attach the domains */ + rcu_read_lock(); + for_each_cpu(i, cpu_map) { + sd = *per_cpu_ptr(d.sd, i); + cpu_attach_domain(sd, d.rd, i); + } + rcu_read_unlock(); + + ret = 0; +error: /* reached on success as well as on failure */ + __free_domain_allocs(&d, alloc_state, cpu_map); + return ret; +} + +static cpumask_var_t *doms_cur; /* current sched domains */ +static int ndoms_cur; /* number of sched domains in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attributes of custom domains in 'doms_cur' */ + +/* + * Special case: If a kmalloc of a doms_cur partition (array of + * cpumask) fails, then fallback to a single sched domain, + * as determined by the single cpumask fallback_doms. + */ +static cpumask_var_t fallback_doms; + +/* + * arch_update_cpu_topology lets virtualized architectures update the + * cpu core maps. It is supposed to return 1 if the topology changed + * or 0 if it stayed the same. 
+ */ +int __attribute__((weak)) arch_update_cpu_topology(void) +{ + return 0; +} + /* + * Allocate an array of 'ndoms' cpumask_var_t and every mask in it. Returns + * NULL on failure, after freeing the masks that were already allocated. + */ +cpumask_var_t *alloc_sched_domains(unsigned int ndoms) +{ + int i; + cpumask_var_t *doms; + + doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL); + if (!doms) + return NULL; + for (i = 0; i < ndoms; i++) { + if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) { + free_sched_domains(doms, i); /* only the first i masks exist */ + return NULL; + } + } + return doms; +} + /* Free the first 'ndoms' masks of 'doms', then the array itself. */ +void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) +{ + unsigned int i; + for (i = 0; i < ndoms; i++) + free_cpumask_var(doms[i]); + kfree(doms); +} + +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ +static int init_sched_domains(const struct cpumask *cpu_map) +{ + int err; + + arch_update_cpu_topology(); + ndoms_cur = 1; + doms_cur = alloc_sched_domains(ndoms_cur); + if (!doms_cur) + doms_cur = &fallback_doms; /* allocation failed: use the static fallback mask */ + cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); + dattr_cur = NULL; + err = build_sched_domains(doms_cur[0], NULL); + register_sched_domain_sysctl(); + + return err; +} + +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static void detach_destroy_domains(const struct cpumask *cpu_map) +{ + int i; + + rcu_read_lock(); + for_each_cpu(i, cpu_map) + cpu_attach_domain(NULL, &def_root_domain, i); + rcu_read_unlock(); +} + +/* handle null as "default" */ +static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, + struct sched_domain_attr *new, int idx_new) +{ + struct sched_domain_attr tmp; + + /* fast path */ + if (!new && !cur) + return 1; + + tmp = SD_ATTR_INIT; /* stands in for whichever side is NULL */ + return !memcmp(cur ? (cur + idx_cur) : &tmp, + new ? 
(new + idx_new) : &tmp, + sizeof(struct sched_domain_attr)); +} + +/* + * Partition sched domains as specified by the 'ndoms_new' + * cpumasks in the array doms_new[] of cpumasks. This compares + * doms_new[] to the current sched domain partitioning, doms_cur[]. + * It destroys each deleted domain and builds each new domain. + * + * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. + * The masks don't intersect (don't overlap.) We should setup one + * sched domain for each mask. CPUs not in any of the cpumasks will + * not be load balanced. If the same cpumask appears both in the + * current 'doms_cur' domains and in the new 'doms_new', we can leave + * it as it is. + * + * The passed in 'doms_new' should be allocated using + * alloc_sched_domains. This routine takes ownership of it and will + * free_sched_domains it when done with it. If the caller failed the + * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, + * and partition_sched_domains() will fallback to the single partition + * 'fallback_doms', it also forces the domains to be rebuilt. + * + * If doms_new == NULL it will be replaced with 'fallback_doms', set to + * cpu_active_mask minus cpu_isolated_map. + * ndoms_new == 0 is a special case for destroying existing domains, + * and it will not create the default domain. + * + * Call with hotplug lock held + */ +void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], + struct sched_domain_attr *dattr_new) +{ + int i, j, n; + int new_topology; + + mutex_lock(&sched_domains_mutex); + + /* always unregister in case we don't destroy any domains */ + unregister_sched_domain_sysctl(); + + /* Let architecture update cpu core mappings. */ + new_topology = arch_update_cpu_topology(); + /* n: how many doms_new[] entries may be compared (none without an array) */ + n = doms_new ? 
ndoms_new : 0; + + /* Destroy deleted domains */ + for (i = 0; i < ndoms_cur; i++) { + for (j = 0; j < n && !new_topology; j++) { + if (cpumask_equal(doms_cur[i], doms_new[j]) + && dattrs_equal(dattr_cur, i, dattr_new, j)) + goto match1; + } + /* no match - a current sched domain not in new doms_new[] */ + detach_destroy_domains(doms_cur[i]); +match1: + ; + } + + if (doms_new == NULL) { + ndoms_cur = 0; /* nothing can match below, so every new domain is built */ + doms_new = &fallback_doms; + cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); + WARN_ON_ONCE(dattr_new); + } + + /* Build new domains */ + for (i = 0; i < ndoms_new; i++) { + for (j = 0; j < ndoms_cur && !new_topology; j++) { + if (cpumask_equal(doms_new[i], doms_cur[j]) + && dattrs_equal(dattr_new, i, dattr_cur, j)) + goto match2; + } + /* no match - add a new doms_new */ + build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL); +match2: + ; + } + + /* Remember the new sched domains */ + if (doms_cur != &fallback_doms) + free_sched_domains(doms_cur, ndoms_cur); /* NOTE(review): ndoms_cur is 0 here when doms_new was NULL, so only the array is freed - confirm intended */ + kfree(dattr_cur); /* kfree(NULL) is safe */ + doms_cur = doms_new; + dattr_cur = dattr_new; + ndoms_cur = ndoms_new; + + register_sched_domain_sysctl(); + + mutex_unlock(&sched_domains_mutex); +} + /* Tear all domains down and rebuild them, holding the cpu hotplug reference. */ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +static void reinit_sched_domains(void) +{ + get_online_cpus(); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + + rebuild_sched_domains(); + put_online_cpus(); +} + /* + * Common store handler for the two power-savings attributes: parse an unsigned + * level from buf, store it in the smt or mc variable selected by 'smt' and + * rebuild the domains. Returns count, or -EINVAL for unparsable or too-large input. + */ +static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) +{ + unsigned int level = 0; + + if (sscanf(buf, "%u", &level) != 1) + return -EINVAL; + + /* + * level is always be positive so don't check for + * level < POWERSAVINGS_BALANCE_NONE which is 0 + * What happens on 0 or 1 byte write, + * need to check for count as well? 
+ */ + + if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) + return -EINVAL; + + if (smt) + sched_smt_power_savings = level; + else + sched_mc_power_savings = level; + + reinit_sched_domains(); /* new level only takes effect after a rebuild */ + + return count; +} + /* sysfs show/store pair for sched_mc_power_savings (store passes smt == 0) */ +#ifdef CONFIG_SCHED_MC +static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + char *page) +{ + return sprintf(page, "%u\n", sched_mc_power_savings); +} +static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 0); +} +static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, + sched_mc_power_savings_show, + sched_mc_power_savings_store); +#endif + /* sysfs show/store pair for sched_smt_power_savings (store passes smt == 1) */ +#ifdef CONFIG_SCHED_SMT +static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, + char *page) +{ + return sprintf(page, "%u\n", sched_smt_power_savings); +} +static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, + const char *buf, size_t count) +{ + return sched_power_savings_store(buf, count, 1); +} +static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644, + sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + /* + * Create the smt and mc power-savings files under cls, each only when the + * matching *_capable() check passes. Returns the first sysfs_create_file() + * error, or 0. + */ +int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; + +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + +/* + * Update cpusets according to cpu_active mask. If cpusets are + * disabled, cpuset_update_active_cpus() becomes a simple wrapper + * around partition_sched_domains(). 
+ */ +static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_ONLINE: + case CPU_DOWN_FAILED: + cpuset_update_active_cpus(); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + /* Hotplug notifier: refresh the cpusets when a cpu is about to go down. */ +static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, + void *hcpu) +{ + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + cpuset_update_active_cpus(); + return NOTIFY_OK; + default: + return NOTIFY_DONE; + } +} + /* + * Hotplug notifier for RT runtime: disable_runtime() on the cpu's rq before it + * goes down, enable_runtime() once it is online or the down attempt failed. + */ +static int update_runtime(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + disable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + enable_runtime(cpu_rq(cpu)); + return NOTIFY_OK; + + default: + return NOTIFY_DONE; + } +} + /* + * SMP scheduler bring-up: build the initial domains over cpu_active_mask, + * register the hotplug notifiers and move the calling task onto the + * non-isolated cpus. + */ +void __init sched_init_smp(void) +{ + cpumask_var_t non_isolated_cpus; + /* NOTE(review): neither allocation result is checked */ + alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); + alloc_cpumask_var(&fallback_doms, GFP_KERNEL); + + get_online_cpus(); + mutex_lock(&sched_domains_mutex); + init_sched_domains(cpu_active_mask); + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + if (cpumask_empty(non_isolated_cpus)) + cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); /* keep at least the current cpu */ + mutex_unlock(&sched_domains_mutex); + put_online_cpus(); + + hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); + hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE); + + /* RT runtime code needs to handle some hotplug events */ + hotcpu_notifier(update_runtime, 0); + + init_hrtick(); + + /* Move init over to a non-isolated CPU */ + if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + BUG(); + sched_init_granularity(); + free_cpumask_var(non_isolated_cpus); + + init_sched_rt_class(); +} +#else +void __init sched_init_smp(void) +{ + 
sched_init_granularity(); +} +#endif /* CONFIG_SMP */ + +const_debug unsigned int sysctl_timer_migration = 1; + /* Non-zero when addr is in a lock function or within the __sched text range. */ +int in_sched_functions(unsigned long addr) +{ + return in_lock_functions(addr) || + (addr >= (unsigned long)__sched_text_start + && addr < (unsigned long)__sched_text_end); +} + /* Put a cfs_rq into its initial, empty state. */ +static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +{ + cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); +#ifdef CONFIG_FAIR_GROUP_SCHED + cfs_rq->rq = rq; + /* allow initial update_cfs_load() to truncate */ +#ifdef CONFIG_SMP + cfs_rq->load_stamp = 1; +#endif +#endif + cfs_rq->min_vruntime = (u64)(-(1LL << 20)); +#ifndef CONFIG_64BIT + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +} + /* Put an rt_rq into its initial state: empty priority queues, zeroed counters. */ +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED + rt_rq->highest_prio.curr = MAX_RT_PRIO; +#ifdef CONFIG_SMP + rt_rq->highest_prio.next = MAX_RT_PRIO; +#endif +#endif +#ifdef CONFIG_SMP + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; + plist_head_init(&rt_rq->pushable_tasks); +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + rt_rq->rt_runtime = 0; + raw_spin_lock_init(&rt_rq->rt_runtime_lock); + +#ifdef CONFIG_RT_GROUP_SCHED + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; +#endif +} + /* Install cfs_rq (and se, when non-NULL) in tg's per-cpu slots for 'cpu'. */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, + struct sched_entity *se, int cpu, + struct sched_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + tg->cfs_rq[cpu] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + + tg->se[cpu] = se; + /* se could be NULL for root_task_group */ + if (!se) + return; + + if (!parent) + se->cfs_rq = &rq->cfs; + else + se->cfs_rq = 
parent->my_q; + + se->my_q = cfs_rq; + update_load_set(&se->load, 0); + se->parent = parent; +} +#endif + /* Install rt_rq (and rt_se, when non-NULL) in tg's per-cpu slots for 'cpu'. */ +#ifdef CONFIG_RT_GROUP_SCHED +static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, + struct sched_rt_entity *rt_se, int cpu, + struct sched_rt_entity *parent) +{ + struct rq *rq = cpu_rq(cpu); + + tg->rt_rq[cpu] = rt_rq; + init_rt_rq(rt_rq, rq); + rt_rq->tg = tg; + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + + tg->rt_se[cpu] = rt_se; + if (!rt_se) + return; + /* without a parent entity the group queues on the cpu's own rt_rq */ + if (!parent) + rt_se->rt_rq = &rq->rt; + else + rt_se->rt_rq = parent->my_q; + + rt_se->my_q = rt_rq; + rt_se->parent = parent; + INIT_LIST_HEAD(&rt_se->run_list); +} +#endif + /* Boot-time initialisation of the runqueues and of root_task_group. */ +void __init sched_init(void) +{ + int i, j; + unsigned long alloc_size = 0, ptr; + /* size one block that holds every per-cpu pointer array needed below */ +#ifdef CONFIG_FAIR_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_RT_GROUP_SCHED + alloc_size += 2 * nr_cpu_ids * sizeof(void **); +#endif +#ifdef CONFIG_CPUMASK_OFFSTACK + alloc_size += num_possible_cpus() * cpumask_size(); +#endif + if (alloc_size) { + ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); /* NOTE(review): result is used without a NULL check */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + root_task_group.se = (struct sched_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.cfs_rq = (struct cfs_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_RT_GROUP_SCHED + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); + +#endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_CPUMASK_OFFSTACK + for_each_possible_cpu(i) { + per_cpu(load_balance_tmpmask, i) = (void *)ptr; + ptr += cpumask_size(); + } +#endif /* CONFIG_CPUMASK_OFFSTACK */ + } + +#ifdef CONFIG_SMP + init_defrootdomain(); +#endif + + init_rt_bandwidth(&def_rt_bandwidth, + global_rt_period(), global_rt_runtime()); + +#ifdef CONFIG_RT_GROUP_SCHED + 
init_rt_bandwidth(&root_task_group.rt_bandwidth, + global_rt_period(), global_rt_runtime()); +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_CGROUP_SCHED + list_add(&root_task_group.list, &task_groups); + INIT_LIST_HEAD(&root_task_group.children); + autogroup_init(&init_task); +#endif /* CONFIG_CGROUP_SCHED */ + /* Initialise the runqueue of every possible cpu */ + for_each_possible_cpu(i) { + struct rq *rq; + + rq = cpu_rq(i); + raw_spin_lock_init(&rq->lock); + rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; + init_cfs_rq(&rq->cfs, rq); + init_rt_rq(&rq->rt, rq); +#ifdef CONFIG_FAIR_GROUP_SCHED + root_task_group.shares = root_task_group_load; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + /* + * How much cpu bandwidth does root_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * root_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if root_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting root_task_group's tasks sit + * directly in rq->cfs (i.e root_task_group->se[] = NULL). 
+ */ + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; +#ifdef CONFIG_RT_GROUP_SCHED + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); +#endif + /* start with an empty load history */ + for (j = 0; j < CPU_LOAD_IDX_MAX; j++) + rq->cpu_load[j] = 0; + + rq->last_load_update_tick = jiffies; + +#ifdef CONFIG_SMP + rq->sd = NULL; + rq->rd = NULL; + rq->cpu_power = SCHED_POWER_SCALE; + rq->post_schedule = 0; + rq->active_balance = 0; + rq->next_balance = jiffies; + rq->push_cpu = 0; + rq->cpu = i; + rq->online = 0; + rq->idle_stamp = 0; + rq->avg_idle = 2*sysctl_sched_migration_cost; + rq_attach_root(rq, &def_root_domain); /* every rq starts on the default root domain */ +#ifdef CONFIG_NO_HZ + rq->nohz_balance_kick = 0; + init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i)); +#endif +#endif + init_rq_hrtick(rq); + atomic_set(&rq->nr_iowait, 0); + } + + set_load_weight(&init_task); + +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains); +#endif + +#ifdef CONFIG_RT_MUTEXES + plist_head_init(&init_task.pi_waiters); +#endif + + /* + * The boot idle thread does lazy MMU switching as well: + */ + atomic_inc(&init_mm.mm_count); + enter_lazy_tlb(&init_mm, current); + + /* + * Make us the idle thread. Technically, schedule() should not be + * called from this thread, however somewhere below it might be, + * but because we are the idle thread, we just pick up running again + * when this runqueue becomes "idle". 
+ */ + init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + + /* + * During early bootup we pretend to be a normal task: + */ + current->sched_class = &fair_sched_class; + + /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ + zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); +#ifdef CONFIG_SMP + zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); +#ifdef CONFIG_NO_HZ + zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); + alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); + atomic_set(&nohz.load_balancer, nr_cpu_ids); + atomic_set(&nohz.first_pick_cpu, nr_cpu_ids); + atomic_set(&nohz.second_pick_cpu, nr_cpu_ids); +#endif + /* May be allocated at isolcpus cmdline parse time */ + if (cpu_isolated_map == NULL) + zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); +#endif /* SMP */ + + scheduler_running = 1; +} + /* True when the preempt count (PREEMPT_ACTIVE masked off) plus the RCU preempt depth equals preempt_offset. */ +#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +static inline int preempt_count_equals(int preempt_offset) +{ + int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); + + return (nested == preempt_offset); +} + /* Set by an early initcall; __might_sleep() reads it while system_state is SYSTEM_BOOTING. */ +static int __might_sleep_init_called; +int __init __might_sleep_init(void) +{ + __might_sleep_init_called = 1; + return 0; +} +early_initcall(__might_sleep_init); + /* + * Print a "sleeping function called from invalid context" report, at most once + * per HZ jiffies, unless the preempt count matches preempt_offset with irqs + * enabled or an oops is already in progress. + */ +void __might_sleep(const char *file, int line, int preempt_offset) +{ +#ifdef in_atomic + static unsigned long prev_jiffy; /* ratelimiting */ + + if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || + oops_in_progress) + return; + if (system_state != SYSTEM_RUNNING && + (!__might_sleep_init_called || system_state != SYSTEM_BOOTING)) + return; + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + printk(KERN_ERR + "BUG: sleeping function called from invalid context at %s:%d\n", + file, line); + printk(KERN_ERR + "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), + current->pid, current->comm); + + debug_show_held_locks(current); + if 
(irqs_disabled()) + print_irqtrace_events(current); + dump_stack(); +#endif +} +EXPORT_SYMBOL(__might_sleep); +#endif + /* Switch p to SCHED_NORMAL, taking it off and back on the runqueue if it was queued. */ +#ifdef CONFIG_MAGIC_SYSRQ +static void normalize_task(struct rq *rq, struct task_struct *p) +{ + const struct sched_class *prev_class = p->sched_class; + int old_prio = p->prio; + int on_rq; + + on_rq = p->on_rq; + if (on_rq) + deactivate_task(rq, p, 0); + __setscheduler(rq, p, SCHED_NORMAL, 0); + if (on_rq) { + activate_task(rq, p, 0); + resched_task(rq->curr); + } + + check_class_changed(rq, p, prev_class, old_prio); +} + /* + * Walk every thread under tasklist_lock: RT user tasks are switched to + * SCHED_NORMAL, other user tasks with a negative nice value are reniced to 0. + */ +void normalize_rt_tasks(void) +{ + struct task_struct *g, *p; + unsigned long flags; + struct rq *rq; + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, p) { + /* + * Only normalize user tasks: + */ + if (!p->mm) + continue; + + p->se.exec_start = 0; +#ifdef CONFIG_SCHEDSTATS + p->se.statistics.wait_start = 0; + p->se.statistics.sleep_start = 0; + p->se.statistics.block_start = 0; +#endif + + if (!rt_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: + */ + if (TASK_NICE(p) < 0 && p->mm) + set_user_nice(p, 0); + continue; + } + + raw_spin_lock(&p->pi_lock); + rq = __task_rq_lock(p); + + normalize_task(rq, p); + + __task_rq_unlock(rq); + raw_spin_unlock(&p->pi_lock); + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); +} + +#endif /* CONFIG_MAGIC_SYSRQ */ + +#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) +/* + * These functions are only useful for the IA64 MCA handling, or kdb. + * + * They can only be called when the whole system has been + * stopped - every CPU needs to be quiescent, and no scheduling + * activity can take place. Using them for anything else would + * be a serious bug, and as a result, they aren't even visible + * under any other configuration. + */ + +/** + * curr_task - return the current task for a given cpu. + * @cpu: the processor in question. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 
+ */ +struct task_struct *curr_task(int cpu) +{ + return cpu_curr(cpu); +} + +#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ + +#ifdef CONFIG_IA64 +/** + * set_curr_task - set the current task for a given cpu. + * @cpu: the processor in question. + * @p: the task pointer to set. + * + * Description: This function must only be used when non-maskable interrupts + * are serviced on a separate stack. It allows the architecture to switch the + * notion of the current task on a cpu in a non-blocking manner. This function + * must be called with all CPU's synchronized, and interrupts disabled, and + * the caller must save the original value of the current task (see + * curr_task() above) and restore that value before reenabling interrupts and + * re-starting the system. + * + * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! + */ +void set_curr_task(int cpu, struct task_struct *p) +{ + cpu_curr(cpu) = p; +} + +#endif + /* Free tg's per-cpu cfs_rq and se objects and the two pointer arrays; copes with either array being NULL. */ +#ifdef CONFIG_FAIR_GROUP_SCHED +static void free_fair_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); +} + /* + * Allocate tg's per-cpu cfs_rq/se arrays and objects and link each entity + * under parent->se[cpu]. Returns 1 on success and 0 on failure; whatever was + * already stored in tg is left in place on failure. + */ +static +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se; + int i; + + tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->cfs_rq) + goto err; + tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->se) + goto err; + + tg->shares = NICE_0_LOAD; + + for_each_possible_cpu(i) { + cfs_rq = kzalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!cfs_rq) + goto err; + + se = kzalloc_node(sizeof(struct sched_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!se) + goto err_free_rq; + + init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); + } + + return 1; + +err_free_rq: + kfree(cfs_rq); /* not yet stored in tg, so it must be freed here */ +err: + return 0; +} + +static inline void unregister_fair_sched_group(struct 
task_group *tg, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} +#else /* !CONFIG_FAIR_GROUP_SCHED */ +static inline void free_fair_sched_group(struct task_group *tg) +{ +} + +static inline +int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} + +static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + /* Free tg's per-cpu rt_rq and rt_se objects and the two pointer arrays. */ +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg) +{ + int i; + /* NOTE(review): also reached from sched_create_group()'s error path before init_rt_bandwidth() has run on tg - confirm destroy is safe on zeroed state */ + destroy_rt_bandwidth(&tg->rt_bandwidth); + + for_each_possible_cpu(i) { + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->rt_rq); + kfree(tg->rt_se); +} + /* + * Allocate tg's per-cpu rt_rq/rt_se arrays and objects and link each entity + * under parent->rt_se[cpu]. The group's bandwidth starts with the default + * period and a runtime of 0. Returns 1 on success and 0 on failure. + */ +static +int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; + int i; + + tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + init_rt_bandwidth(&tg->rt_bandwidth, + ktime_to_ns(def_rt_bandwidth.rt_period), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_rq) + goto err; + + rt_se = kzalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL, cpu_to_node(i)); + if (!rt_se) + goto err_free_rq; + + init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); + } + + return 1; + +err_free_rq: + kfree(rt_rq); /* not yet stored in tg, so it must be freed here */ +err: + return 0; +} +#else /* !CONFIG_RT_GROUP_SCHED */ +static inline void free_rt_sched_group(struct task_group *tg) +{ +} + +static inline +int 
alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) +{ + return 1; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + /* Release everything owned by a task group, then the group itself. */ +#ifdef CONFIG_CGROUP_SCHED +static void free_sched_group(struct task_group *tg) +{ + free_fair_sched_group(tg); + free_rt_sched_group(tg); + autogroup_free(tg); + kfree(tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ + struct task_group *tg; + unsigned long flags; + + tg = kzalloc(sizeof(*tg), GFP_KERNEL); + if (!tg) + return ERR_PTR(-ENOMEM); + /* NOTE(review): both allocators dereference parent, before the WARN_ON(!parent) below */ + if (!alloc_fair_sched_group(tg, parent)) + goto err; + + if (!alloc_rt_sched_group(tg, parent)) + goto err; + + spin_lock_irqsave(&task_group_lock, flags); + list_add_rcu(&tg->list, &task_groups); + + WARN_ON(!parent); /* root should already exist */ + + tg->parent = parent; + INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); + spin_unlock_irqrestore(&task_group_lock, flags); + + return tg; + +err: + free_sched_group(tg); /* copes with a partially allocated group */ + return ERR_PTR(-ENOMEM); +} + +/* rcu callback to free various structures associated with a task group */ +static void free_sched_group_rcu(struct rcu_head *rhp) +{ + /* now it should be safe to free those cfs_rqs */ + free_sched_group(container_of(rhp, struct task_group, rcu)); +} + +/* Destroy runqueue etc associated with a task group */ +void sched_destroy_group(struct task_group *tg) +{ + unsigned long flags; + int i; + + /* end participation in shares distribution */ + for_each_possible_cpu(i) + unregister_fair_sched_group(tg, i); + + spin_lock_irqsave(&task_group_lock, flags); + list_del_rcu(&tg->list); + list_del_rcu(&tg->siblings); + spin_unlock_irqrestore(&task_group_lock, flags); + + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, free_sched_group_rcu); +} + +/* change task's runqueue when it moves between groups. + * The caller of this function should have put the task in its new group + * by now. 
This function just updates tsk->se.cfs_rq and tsk->se.parent to + * reflect its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int on_rq, running; + unsigned long flags; + struct rq *rq; + + rq = task_rq_lock(tsk, &flags); + + running = task_current(rq, tsk); + on_rq = tsk->on_rq; + /* take the task off its queue while its group pointers change */ + if (on_rq) + dequeue_task(rq, tsk, 0); + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); + +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->task_move_group) + tsk->sched_class->task_move_group(tsk, on_rq); + else +#endif + set_task_rq(tsk, task_cpu(tsk)); + /* put it back in the same state it was found in */ + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) + enqueue_task(rq, tsk, 0); + + task_rq_unlock(rq, tsk, &flags); +} +#endif /* CONFIG_CGROUP_SCHED */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +static DEFINE_MUTEX(shares_mutex); + /* + * Set tg's shares, clamped to [MIN_SHARES, MAX_SHARES] after scale_load(), and + * propagate the change up the entity hierarchy on every possible cpu. Returns + * -EINVAL for a group whose se[0] is NULL, 0 otherwise. + */ +int sched_group_set_shares(struct task_group *tg, unsigned long shares) +{ + int i; + unsigned long flags; + + /* + * We can't change the weight of the root cgroup. + */ + if (!tg->se[0]) + return -EINVAL; + + shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES)); + + mutex_lock(&shares_mutex); + if (tg->shares == shares) + goto done; + + tg->shares = shares; + for_each_possible_cpu(i) { + struct rq *rq = cpu_rq(i); + struct sched_entity *se; + + se = tg->se[i]; + /* Propagate contribution to hierarchy */ + raw_spin_lock_irqsave(&rq->lock, flags); + for_each_sched_entity(se) + update_cfs_shares(group_cfs_rq(se)); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } + +done: + mutex_unlock(&shares_mutex); + return 0; +} + /* Return tg's current shares value. */ +unsigned long sched_group_shares(struct task_group *tg) +{ + return tg->shares; +} +#endif + +#ifdef CONFIG_RT_GROUP_SCHED +/* + * Ensure that the real time constraints are schedulable. 
+ */ +static DEFINE_MUTEX(rt_constraints_mutex); + +static unsigned long to_ratio(u64 period, u64 runtime) +{ + if (runtime == RUNTIME_INF) + return 1ULL << 20; + + return div64_u64(runtime << 20, period); +} + +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); + + return 0; +} + +struct rt_schedulable_data { + struct task_group *tg; + u64 rt_period; + u64 rt_runtime; +}; + +static int tg_schedulable(struct task_group *tg, void *data) +{ + struct rt_schedulable_data *d = data; + struct task_group *child; + unsigned long total, sum = 0; + u64 period, runtime; + + period = ktime_to_ns(tg->rt_bandwidth.rt_period); + runtime = tg->rt_bandwidth.rt_runtime; + + if (tg == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + /* + * Cannot have more runtime than the period. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + /* + * Ensure we don't starve existing RT tasks. + */ + if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg)) + return -EBUSY; + + total = to_ratio(period, runtime); + + /* + * Nobody can have more than the global setting allows. + */ + if (total > to_ratio(global_rt_period(), global_rt_runtime())) + return -EINVAL; + + /* + * The sum of our children's runtime should not exceed our own. 
+ */ + list_for_each_entry_rcu(child, &tg->children, siblings) { + period = ktime_to_ns(child->rt_bandwidth.rt_period); + runtime = child->rt_bandwidth.rt_runtime; + + if (child == d->tg) { + period = d->rt_period; + runtime = d->rt_runtime; + } + + sum += to_ratio(period, runtime); + } + + if (sum > total) + return -EINVAL; + + return 0; +} + +static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) +{ + struct rt_schedulable_data data = { + .tg = tg, + .rt_period = period, + .rt_runtime = runtime, + }; + + return walk_tg_tree(tg_schedulable, tg_nop, &data); +} + +static int tg_set_bandwidth(struct task_group *tg, + u64 rt_period, u64 rt_runtime) +{ + int i, err = 0; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + err = __rt_schedulable(tg, rt_period, rt_runtime); + if (err) + goto unlock; + + raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); + tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); + tg->rt_bandwidth.rt_runtime = rt_runtime; + + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = tg->rt_rq[i]; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_runtime; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); +unlock: + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return err; +} + +int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) +{ + u64 rt_runtime, rt_period; + + rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); + rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; + if (rt_runtime_us < 0) + rt_runtime = RUNTIME_INF; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_runtime(struct task_group *tg) +{ + u64 rt_runtime_us; + + if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF) + return -1; + + rt_runtime_us = tg->rt_bandwidth.rt_runtime; + do_div(rt_runtime_us, NSEC_PER_USEC); + return rt_runtime_us; +} + +int sched_group_set_rt_period(struct task_group *tg, long 
rt_period_us) +{ + u64 rt_runtime, rt_period; + + rt_period = (u64)rt_period_us * NSEC_PER_USEC; + rt_runtime = tg->rt_bandwidth.rt_runtime; + + if (rt_period == 0) + return -EINVAL; + + return tg_set_bandwidth(tg, rt_period, rt_runtime); +} + +long sched_group_rt_period(struct task_group *tg) +{ + u64 rt_period_us; + + rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period); + do_div(rt_period_us, NSEC_PER_USEC); + return rt_period_us; +} + +static int sched_rt_global_constraints(void) +{ + u64 runtime, period; + int ret = 0; + + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + runtime = global_rt_runtime(); + period = global_rt_period(); + + /* + * Sanity check on the sysctl variables. + */ + if (runtime > period && runtime != RUNTIME_INF) + return -EINVAL; + + mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + ret = __rt_schedulable(NULL, 0, 0); + read_unlock(&tasklist_lock); + mutex_unlock(&rt_constraints_mutex); + + return ret; +} + +int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) +{ + /* Don't accept realtime tasks when there is no way for them to run */ + if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0) + return 0; + + return 1; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +static int sched_rt_global_constraints(void) +{ + unsigned long flags; + int i; + + if (sysctl_sched_rt_period <= 0) + return -EINVAL; + + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. 
+ */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + + raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); + for_each_possible_cpu(i) { + struct rt_rq *rt_rq = &cpu_rq(i)->rt; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = global_rt_runtime(); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + + return 0; +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +int sched_rt_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + int old_period, old_runtime; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + old_period = sysctl_sched_rt_period; + old_runtime = sysctl_sched_rt_runtime; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (!ret && write) { + ret = sched_rt_global_constraints(); + if (ret) { + sysctl_sched_rt_period = old_period; + sysctl_sched_rt_runtime = old_runtime; + } else { + def_rt_bandwidth.rt_runtime = global_rt_runtime(); + def_rt_bandwidth.rt_period = + ns_to_ktime(global_rt_period()); + } + } + mutex_unlock(&mutex); + + return ret; +} + +#ifdef CONFIG_CGROUP_SCHED + +/* return corresponding task_group object of a cgroup */ +static inline struct task_group *cgroup_tg(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + struct task_group, css); +} + +static struct cgroup_subsys_state * +cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct task_group *tg, *parent; + + if (!cgrp->parent) { + /* This is early initialization for the top cgroup */ + return &root_task_group.css; + } + + parent = cgroup_tg(cgrp->parent); + tg = sched_create_group(parent); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + + return &tg->css; +} + +static void +cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct task_group *tg = cgroup_tg(cgrp); + + sched_destroy_group(tg); +} + +static int 
+cpu_cgroup_allow_attach(struct cgroup *cgrp, struct task_struct *tsk) +{ + const struct cred *cred = current_cred(), *tcred; + + tcred = __task_cred(tsk); + + if ((current != tsk) && !capable(CAP_SYS_NICE) && + cred->euid != tcred->uid && cred->euid != tcred->suid) + return -EACCES; + + return 0; +} + +static int +cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ +#ifdef CONFIG_RT_GROUP_SCHED + if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk)) + return -EINVAL; +#else + /* We don't support RT-tasks being in separate groups */ + if (tsk->sched_class != &fair_sched_class) + return -EINVAL; +#endif + return 0; +} + +static void +cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) +{ + sched_move_task(tsk); +} + +static void +cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct cgroup *old_cgrp, struct task_struct *task) +{ + /* + * cgroup_exit() is called in the copy_process() failure path. + * Ignore this case since the task hasn't ran yet, this avoids + * trying to poke a half freed task state from generic code. 
+ */ + if (!(task->flags & PF_EXITING)) + return; + + sched_move_task(task); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 shareval) +{ + return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); +} + +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return (u64) scale_load_down(tg->shares); +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + s64 val) +{ + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); +} + +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_runtime(cgroup_tg(cgrp)); +} + +static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_period_us) +{ + return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); +} + +static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + return sched_group_rt_period(cgroup_tg(cgrp)); +} +#endif /* CONFIG_RT_GROUP_SCHED */ + +static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "shares", + .read_u64 = cpu_shares_read_u64, + .write_u64 = cpu_shares_write_u64, + }, +#endif +#ifdef CONFIG_RT_GROUP_SCHED + { + .name = "rt_runtime_us", + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, + }, + { + .name = "rt_period_us", + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, + }, +#endif +}; + +static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) +{ + return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); +} + +struct cgroup_subsys cpu_cgroup_subsys = { + .name = "cpu", + .create = cpu_cgroup_create, + .destroy = cpu_cgroup_destroy, + .allow_attach = cpu_cgroup_allow_attach, + .can_attach_task = cpu_cgroup_can_attach_task, + .attach_task = 
cpu_cgroup_attach_task, + .exit = cpu_cgroup_exit, + .populate = cpu_cgroup_populate, + .subsys_id = cpu_cgroup_subsys_id, + .early_init = 1, +}; + +#endif /* CONFIG_CGROUP_SCHED */ + +#ifdef CONFIG_CGROUP_CPUACCT + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; + struct cpuacct *parent; + struct cpuacct_charge_calls *cpufreq_fn; + void *cpuacct_data; +}; + +static struct cpuacct *cpuacct_root; + +/* Default calls for cpufreq accounting */ +static struct cpuacct_charge_calls *cpuacct_cpufreq; +int cpuacct_register_cpufreq(struct cpuacct_charge_calls *fn) +{ + cpuacct_cpufreq = fn; + + /* + * Root node is created before platform can register callbacks, + * initialize here.
+ */ + if (cpuacct_root && fn) { + cpuacct_root->cpufreq_fn = fn; + if (fn->init) + fn->init(&cpuacct_root->cpuacct_data); + } + return 0; +} + +struct cgroup_subsys cpuacct_subsys; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_create( + struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); + int i; + + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + if (percpu_counter_init(&ca->cpustat[i], 0)) + goto out_free_counters; + + ca->cpufreq_fn = cpuacct_cpufreq; + + /* If available, have platform code initalize cpu frequency table */ + if (ca->cpufreq_fn && ca->cpufreq_fn->init) + ca->cpufreq_fn->init(&ca->cpuacct_data); + + if (cgrp->parent) + ca->parent = cgroup_ca(cgrp->parent); + else + cpuacct_root = ca; + + return &ca->css; + +out_free_counters: + while (--i >= 0) + percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void +cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int i; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) + percpu_counter_destroy(&ca->cpustat[i]); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; 
+ +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char *cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int i; + + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { + s64 val = percpu_counter_read(&ca->cpustat[i]); + val = cputime64_to_clock_t(val); + cb->fill(cb, 
cpuacct_stat_desc[i], val); + } + return 0; +} + +static int cpuacct_cpufreq_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + if (ca->cpufreq_fn && ca->cpufreq_fn->cpufreq_show) + ca->cpufreq_fn->cpufreq_show(ca->cpuacct_data, cb); + + return 0; +} + +/* return total cpu power usage (milliWatt second) of a group */ +static u64 cpuacct_powerusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + int i; + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalpower = 0; + + if (ca->cpufreq_fn && ca->cpufreq_fn->power_usage) + for_each_present_cpu(i) { + totalpower += ca->cpufreq_fn->power_usage( + ca->cpuacct_data); + } + + return totalpower; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, + { + .name = "cpufreq", + .read_map = cpuacct_cpufreq_show, + }, + { + .name = "power", + .read_u64 = cpuacct_powerusage_read + }, +}; + +static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) +{ + return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); +} + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. 
+ */ +static void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + if (unlikely(!cpuacct_subsys.active)) + return; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + for (; ca; ca = ca->parent) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + + /* Call back into platform code to account for CPU speeds */ + if (ca->cpufreq_fn && ca->cpufreq_fn->charge) + ca->cpufreq_fn->charge(ca->cpuacct_data, cputime, cpu); + } + + rcu_read_unlock(); +} + +/* + * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large + * in cputime_t units. As a result, cpuacct_update_stats calls + * percpu_counter_add with values large enough to always overflow the + * per cpu batch limit causing bad SMP scalability. + * + * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we + * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled + * and enabled. We cap it at INT_MAX which is the largest allowed batch value. + */ +#ifdef CONFIG_SMP +#define CPUACCT_BATCH \ + min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) +#else +#define CPUACCT_BATCH 0 +#endif + +/* + * Charge the system/user time to the task's accounting group. 
+ */ +static void cpuacct_update_stats(struct task_struct *tsk, + enum cpuacct_stat_index idx, cputime_t val) +{ + struct cpuacct *ca; + int batch = CPUACCT_BATCH; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(tsk); + + do { + __percpu_counter_add(&ca->cpustat[idx], val, batch); + ca = ca->parent; + } while (ca); + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .create = cpuacct_create, + .destroy = cpuacct_destroy, + .populate = cpuacct_populate, + .subsys_id = cpuacct_subsys_id, +}; +#endif /* CONFIG_CGROUP_CPUACCT */ + diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c new file mode 100644 index 00000000..429242f3 --- /dev/null +++ b/kernel/sched_autogroup.c @@ -0,0 +1,275 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +#include +#include +#include +#include + +unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; +static struct autogroup autogroup_default; +static atomic_t autogroup_seq_nr; + +static void __init autogroup_init(struct task_struct *init_task) +{ + autogroup_default.tg = &root_task_group; + kref_init(&autogroup_default.kref); + init_rwsem(&autogroup_default.lock); + init_task->signal->autogroup = &autogroup_default; +} + +static inline void autogroup_free(struct task_group *tg) +{ + kfree(tg->autogroup); +} + +static inline void autogroup_destroy(struct kref *kref) +{ + struct autogroup *ag = container_of(kref, struct autogroup, kref); + +#ifdef CONFIG_RT_GROUP_SCHED + /* We've redirected RT tasks to the root task group... 
*/ + ag->tg->rt_se = NULL; + ag->tg->rt_rq = NULL; +#endif + sched_destroy_group(ag->tg); +} + +static inline void autogroup_kref_put(struct autogroup *ag) +{ + kref_put(&ag->kref, autogroup_destroy); +} + +static inline struct autogroup *autogroup_kref_get(struct autogroup *ag) +{ + kref_get(&ag->kref); + return ag; +} + +static inline struct autogroup *autogroup_task_get(struct task_struct *p) +{ + struct autogroup *ag; + unsigned long flags; + + if (!lock_task_sighand(p, &flags)) + return autogroup_kref_get(&autogroup_default); + + ag = autogroup_kref_get(p->signal->autogroup); + unlock_task_sighand(p, &flags); + + return ag; +} + +#ifdef CONFIG_RT_GROUP_SCHED +static void free_rt_sched_group(struct task_group *tg); +#endif + +static inline struct autogroup *autogroup_create(void) +{ + struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); + struct task_group *tg; + + if (!ag) + goto out_fail; + + tg = sched_create_group(&root_task_group); + + if (IS_ERR(tg)) + goto out_free; + + kref_init(&ag->kref); + init_rwsem(&ag->lock); + ag->id = atomic_inc_return(&autogroup_seq_nr); + ag->tg = tg; +#ifdef CONFIG_RT_GROUP_SCHED + /* + * Autogroup RT tasks are redirected to the root task group + * so we don't have to move tasks around upon policy change, + * or flail around trying to allocate bandwidth on the fly. + * A bandwidth exception in __sched_setscheduler() allows + * the policy change to proceed. Thereafter, task_group() + * returns &root_task_group, so zero bandwidth is required. + */ + free_rt_sched_group(tg); + tg->rt_se = root_task_group.rt_se; + tg->rt_rq = root_task_group.rt_rq; +#endif + tg->autogroup = ag; + + return ag; + +out_free: + kfree(ag); +out_fail: + if (printk_ratelimit()) { + printk(KERN_WARNING "autogroup_create: %s failure.\n", + ag ? 
"sched_create_group()" : "kmalloc()"); + } + + return autogroup_kref_get(&autogroup_default); +} + +static inline bool +task_wants_autogroup(struct task_struct *p, struct task_group *tg) +{ + if (tg != &root_task_group) + return false; + + if (p->sched_class != &fair_sched_class) + return false; + + /* + * We can only assume the task group can't go away on us if + * autogroup_move_group() can see us on ->thread_group list. + */ + if (p->flags & PF_EXITING) + return false; + + return true; +} + +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return !!tg->autogroup; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled); + + if (enabled && task_wants_autogroup(p, tg)) + return p->signal->autogroup->tg; + + return tg; +} + +static void +autogroup_move_group(struct task_struct *p, struct autogroup *ag) +{ + struct autogroup *prev; + struct task_struct *t; + unsigned long flags; + + BUG_ON(!lock_task_sighand(p, &flags)); + + prev = p->signal->autogroup; + if (prev == ag) { + unlock_task_sighand(p, &flags); + return; + } + + p->signal->autogroup = autogroup_kref_get(ag); + + if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) + goto out; + + t = p; + do { + sched_move_task(t); + } while_each_thread(p, t); + +out: + unlock_task_sighand(p, &flags); + autogroup_kref_put(prev); +} + +/* Allocates GFP_KERNEL, cannot be called under any spinlock */ +void sched_autogroup_create_attach(struct task_struct *p) +{ + struct autogroup *ag = autogroup_create(); + + autogroup_move_group(p, ag); + /* drop extra reference added by autogroup_create() */ + autogroup_kref_put(ag); +} +EXPORT_SYMBOL(sched_autogroup_create_attach); + +/* Cannot be called under siglock. 
Currently has no users */ +void sched_autogroup_detach(struct task_struct *p) +{ + autogroup_move_group(p, &autogroup_default); +} +EXPORT_SYMBOL(sched_autogroup_detach); + +void sched_autogroup_fork(struct signal_struct *sig) +{ + sig->autogroup = autogroup_task_get(current); +} + +void sched_autogroup_exit(struct signal_struct *sig) +{ + autogroup_kref_put(sig->autogroup); +} + +static int __init setup_autogroup(char *str) +{ + sysctl_sched_autogroup_enabled = 0; + + return 1; +} + +__setup("noautogroup", setup_autogroup); + +#ifdef CONFIG_PROC_FS + +int proc_sched_autogroup_set_nice(struct task_struct *p, int *nice) +{ + static unsigned long next = INITIAL_JIFFIES; + struct autogroup *ag; + int err; + + if (*nice < -20 || *nice > 19) + return -EINVAL; + + err = security_task_setnice(current, *nice); + if (err) + return err; + + if (*nice < 0 && !can_nice(current, *nice)) + return -EPERM; + + /* this is a heavy operation taking global locks.. */ + if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) + return -EAGAIN; + + next = HZ / 10 + jiffies; + ag = autogroup_task_get(p); + + down_write(&ag->lock); + err = sched_group_set_shares(ag->tg, prio_to_weight[*nice + 20]); + if (!err) + ag->nice = *nice; + up_write(&ag->lock); + + autogroup_kref_put(ag); + + return err; +} + +void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m) +{ + struct autogroup *ag = autogroup_task_get(p); + + if (!task_group_is_autogroup(ag->tg)) + goto out; + + down_read(&ag->lock); + seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice); + up_read(&ag->lock); + +out: + autogroup_kref_put(ag); +} +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + if (!task_group_is_autogroup(tg)) + return 0; + + return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); +} +#endif /* CONFIG_SCHED_DEBUG */ + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git 
a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h new file mode 100644 index 00000000..05577055 --- /dev/null +++ b/kernel/sched_autogroup.h @@ -0,0 +1,41 @@ +#ifdef CONFIG_SCHED_AUTOGROUP + +struct autogroup { + /* + * reference doesn't mean how many thread attach to this + * autogroup now. It just stands for the number of task + * could use this autogroup. + */ + struct kref kref; + struct task_group *tg; + struct rw_semaphore lock; + unsigned long id; + int nice; +}; + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg); + +#else /* !CONFIG_SCHED_AUTOGROUP */ + +static inline void autogroup_init(struct task_struct *init_task) { } +static inline void autogroup_free(struct task_group *tg) { } +static inline bool task_group_is_autogroup(struct task_group *tg) +{ + return 0; +} + +static inline struct task_group * +autogroup_task_group(struct task_struct *p, struct task_group *tg) +{ + return tg; +} + +#ifdef CONFIG_SCHED_DEBUG +static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) +{ + return 0; +} +#endif + +#endif /* CONFIG_SCHED_AUTOGROUP */ diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c new file mode 100644 index 00000000..9d8af0b3 --- /dev/null +++ b/kernel/sched_clock.c @@ -0,0 +1,350 @@ +/* + * sched_clock for unstable cpu clocks + * + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra + * + * Updates and enhancements: + * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt + * + * Based on code by: + * Ingo Molnar + * Guillaume Chazarain + * + * + * What: + * + * cpu_clock(i) provides a fast (execution time) high resolution + * clock with bounded drift between CPUs. The value of cpu_clock(i) + * is monotonic for constant i. The timestamp returned is in nanoseconds. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! 
# + * #################################################################### + * + * There is no strict promise about the base, although it tends to start + * at 0 on boot (but people really shouldn't rely on that). + * + * cpu_clock(i) -- can be used from any context, including NMI. + * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI) + * local_clock() -- is cpu_clock() on the current cpu. + * + * How: + * + * The implementation either uses sched_clock() when + * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the + * sched_clock() is assumed to provide these properties (mostly it means + * the architecture provides a globally synchronized highres time source). + * + * Otherwise it tries to create a semi stable clock from a mixture of other + * clocks, including: + * + * - GTOD (clock monotonic) + * - sched_clock() + * - explicit idle events + * + * We use GTOD as base and use sched_clock() deltas to improve resolution. The + * deltas are filtered to provide monotonicity and to keep it within an + * expected window. + * + * Furthermore, explicit sleep and wakeup hooks allow us to account for time + * that is otherwise invisible (TSC gets stopped). + * + * + * Notes: + * + * The !IRQ-safety of sched_clock() and sched_clock_cpu() comes from things + * like cpufreq interrupts that can change the base clock (TSC) multiplier + * and cause funny jumps in time -- although the filtering provided by + * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it + * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on + * sched_clock(). + */ +#include +#include +#include +#include +#include +#include + +/* + * Scheduler clock - returns current time in nanosec units. + * This is the default implementation. + * Architectures and sub-architectures can override this.
+ */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)(jiffies - INITIAL_JIFFIES) + * (NSEC_PER_SEC / HZ); +} +EXPORT_SYMBOL_GPL(sched_clock); + +__read_mostly int sched_clock_running; + +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +__read_mostly int sched_clock_stable; + +struct sched_clock_data { + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) +{ + return &__get_cpu_var(sched_clock_data); +} + +static inline struct sched_clock_data *cpu_sdc(int cpu) +{ + return &per_cpu(sched_clock_data, cpu); +} + +void sched_clock_init(void) +{ + u64 ktime_now = ktime_to_ns(ktime_get()); + int cpu; + + for_each_possible_cpu(cpu) { + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->tick_raw = 0; + scd->tick_gtod = ktime_now; + scd->clock = ktime_now; + } + + sched_clock_running = 1; +} + +/* + * min, max except they take wrapping into account + */ + +static inline u64 wrap_min(u64 x, u64 y) +{ + return (s64)(x - y) < 0 ? x : y; +} + +static inline u64 wrap_max(u64 x, u64 y) +{ + return (s64)(x - y) > 0 ? 
x : y; +} + +/* + * update the percpu scd from the raw @now value + * + * - filter out backward motion + * - use the GTOD tick value to create a window to filter crazy TSC values + */ +static u64 sched_clock_local(struct sched_clock_data *scd) +{ + u64 now, clock, old_clock, min_clock, max_clock; + s64 delta; + +again: + now = sched_clock(); + delta = now - scd->tick_raw; + if (unlikely(delta < 0)) + delta = 0; + + old_clock = scd->clock; + + /* + * scd->clock = clamp(scd->tick_gtod + delta, + * max(scd->tick_gtod, scd->clock), + * scd->tick_gtod + TICK_NSEC); + */ + + clock = scd->tick_gtod + delta; + min_clock = wrap_max(scd->tick_gtod, old_clock); + max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); + + clock = wrap_max(clock, min_clock); + clock = wrap_min(clock, max_clock); + + if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock) + goto again; + + return clock; +} + +static u64 sched_clock_remote(struct sched_clock_data *scd) +{ + struct sched_clock_data *my_scd = this_scd(); + u64 this_clock, remote_clock; + u64 *ptr, old_val, val; + + sched_clock_local(my_scd); +again: + this_clock = my_scd->clock; + remote_clock = scd->clock; + + /* + * Use the opportunity that we have both locks + * taken to couple the two clocks: we take the + * larger time as the latest time for both + * runqueues. (this creates monotonic movement) + */ + if (likely((s64)(remote_clock - this_clock) < 0)) { + ptr = &scd->clock; + old_val = remote_clock; + val = this_clock; + } else { + /* + * Should be rare, but possible: + */ + ptr = &my_scd->clock; + old_val = this_clock; + val = remote_clock; + } + + if (cmpxchg64(ptr, old_val, val) != old_val) + goto again; + + return val; +} + +/* + * Similar to cpu_clock(), but requires local IRQs to be disabled. + * + * See cpu_clock(). 
+ */ +u64 sched_clock_cpu(int cpu) +{ + struct sched_clock_data *scd; + u64 clock; + + WARN_ON_ONCE(!irqs_disabled()); + + if (sched_clock_stable) + return sched_clock(); + + if (unlikely(!sched_clock_running)) + return 0ull; + + scd = cpu_sdc(cpu); + + if (cpu != smp_processor_id()) + clock = sched_clock_remote(scd); + else + clock = sched_clock_local(scd); + + return clock; +} + +void sched_clock_tick(void) +{ + struct sched_clock_data *scd; + u64 now, now_gtod; + + if (sched_clock_stable) + return; + + if (unlikely(!sched_clock_running)) + return; + + WARN_ON_ONCE(!irqs_disabled()); + + scd = this_scd(); + now_gtod = ktime_to_ns(ktime_get()); + now = sched_clock(); + + scd->tick_raw = now; + scd->tick_gtod = now_gtod; + sched_clock_local(scd); +} + +/* + * We are going deep-idle (irqs are disabled): + */ +void sched_clock_idle_sleep_event(void) +{ + sched_clock_cpu(smp_processor_id()); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + if (timekeeping_suspended) + return; + + sched_clock_tick(); + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); + +/* + * As outlined at the top, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +u64 cpu_clock(int cpu) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(cpu); + local_irq_restore(flags); + + return clock; +} + +/* + * Similar to cpu_clock() for the current cpu. Time will only be observed + * to be monotonic if care is taken to only compare timestamps taken on the + * same CPU.
+ * + * See cpu_clock(). + */ +u64 local_clock(void) +{ + u64 clock; + unsigned long flags; + + local_irq_save(flags); + clock = sched_clock_cpu(smp_processor_id()); + local_irq_restore(flags); + + return clock; +} + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + +u64 sched_clock_cpu(int cpu) +{ + if (unlikely(!sched_clock_running)) + return 0; + + return sched_clock(); +} + +u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +u64 local_clock(void) +{ + return sched_clock_cpu(0); +} + +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ + +EXPORT_SYMBOL_GPL(cpu_clock); +EXPORT_SYMBOL_GPL(local_clock); diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c new file mode 100644 index 00000000..2722dc1b --- /dev/null +++ b/kernel/sched_cpupri.c @@ -0,0 +1,204 @@ +/* + * kernel/sched_cpupri.c + * + * CPU priority management + * + * Copyright (C) 2007-2008 Novell + * + * Author: Gregory Haskins + * + * This code tracks the priority of each CPU so that global migration + * decisions are easy to calculate. Each CPU can be in a state as follows: + * + * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * + * going from the lowest priority to the highest. CPUs in the INVALID state + * are not eligible for routing. The system maintains this state with + * a 2 dimensional bitmap (the first for priority class, the second for cpus + * in that class). Therefore a typical application without affinity + * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit + * searches). For tasks with affinity restrictions, the algorithm has a + * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * yields the worst case search is fairly contrived. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#include +#include "sched_cpupri.h" + +/* Convert a 140 based task->prio into our 102 based cpupri */ +static int convert_prio(int prio) +{ + int cpupri; + + if (prio == CPUPRI_INVALID) + cpupri = CPUPRI_INVALID; + else if (prio == MAX_PRIO) + cpupri = CPUPRI_IDLE; + else if (prio >= MAX_RT_PRIO) + cpupri = CPUPRI_NORMAL; + else + cpupri = MAX_RT_PRIO - prio + 1; + + return cpupri; +} + +#define for_each_cpupri_active(array, idx) \ + for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) + +/** + * cpupri_find - find the best (lowest-pri) CPU in the system + * @cp: The cpupri context + * @p: The task + * @lowest_mask: A mask to fill in with selected CPUs (or NULL) + * + * Note: This function returns the recommended CPUs as calculated during the + * current invocation. By the time the call returns, the CPUs may have in + * fact changed priorities any number of times. While not ideal, it is not + * an issue of correctness since the normal rebalancer logic will correct + * any discrepancies created by racing against the uncertainty of the current + * priority configuration. + * + * Returns: 1 if a lower-priority CPU allowed for @p was found, 0 otherwise + */ +int cpupri_find(struct cpupri *cp, struct task_struct *p, + struct cpumask *lowest_mask) +{ + int idx = 0; + int task_pri = convert_prio(p->prio); + + for_each_cpupri_active(cp->pri_active, idx) { + struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; + + if (idx >= task_pri) + break; + + if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) + continue; + + if (lowest_mask) { + cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); + + /* + * We have to ensure that we have at least one bit + * still set in the array, since the map could have + * been concurrently emptied between the first and + * second reads of vec->mask. If we hit this + * condition, simply act as though we never hit this + * priority level and continue on. 
+ */ + if (cpumask_any(lowest_mask) >= nr_cpu_ids) + continue; + } + + return 1; + } + + return 0; +} + +/** + * cpupri_set - update the cpu priority setting + * @cp: The cpupri context + * @cpu: The target cpu + * @newpri: The priority (INVALID-RT99) to assign to this CPU + * + * Note: Assumes cpu_rq(cpu)->lock is locked + * + * Returns: (void) + */ +void cpupri_set(struct cpupri *cp, int cpu, int newpri) +{ + int *currpri = &cp->cpu_to_pri[cpu]; + int oldpri = *currpri; + unsigned long flags; + + newpri = convert_prio(newpri); + + BUG_ON(newpri >= CPUPRI_NR_PRIORITIES); + + if (newpri == oldpri) + return; + + /* + * If the cpu was currently mapped to a different value, we + * need to map it to the new value then remove the old value. + * Note, we must add the new value first, otherwise we risk the + * cpu being cleared from pri_active, and this cpu could be + * missed for a push or pull. + */ + if (likely(newpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; + + raw_spin_lock_irqsave(&vec->lock, flags); + + cpumask_set_cpu(cpu, vec->mask); + vec->count++; + if (vec->count == 1) + set_bit(newpri, cp->pri_active); + + raw_spin_unlock_irqrestore(&vec->lock, flags); + } + if (likely(oldpri != CPUPRI_INVALID)) { + struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; + + raw_spin_lock_irqsave(&vec->lock, flags); + + vec->count--; + if (!vec->count) + clear_bit(oldpri, cp->pri_active); + cpumask_clear_cpu(cpu, vec->mask); + + raw_spin_unlock_irqrestore(&vec->lock, flags); + } + + *currpri = newpri; +} + +/** + * cpupri_init - initialize the cpupri structure + * @cp: The cpupri context + * + * Allocates one zeroed cpumask per priority level with GFP_KERNEL. + * Returns: 0 on success, -ENOMEM if a cpumask allocation fails. 
+ */ +int cpupri_init(struct cpupri *cp) +{ + int i; + + memset(cp, 0, sizeof(*cp)); + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { + struct cpupri_vec *vec = &cp->pri_to_cpu[i]; + + raw_spin_lock_init(&vec->lock); + vec->count = 0; + if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) + goto cleanup; + } + + for_each_possible_cpu(i) + cp->cpu_to_pri[i] = CPUPRI_INVALID; + return 0; + +cleanup: + for (i--; i >= 0; i--) + free_cpumask_var(cp->pri_to_cpu[i].mask); + return -ENOMEM; +} + +/** + * cpupri_cleanup - clean up the cpupri structure + * @cp: The cpupri context + */ +void cpupri_cleanup(struct cpupri *cp) +{ + int i; + + for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) + free_cpumask_var(cp->pri_to_cpu[i].mask); +} diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h new file mode 100644 index 00000000..9fc7d386 --- /dev/null +++ b/kernel/sched_cpupri.h @@ -0,0 +1,37 @@ +#ifndef _LINUX_CPUPRI_H +#define _LINUX_CPUPRI_H + +#include + +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES) + +#define CPUPRI_INVALID -1 +#define CPUPRI_IDLE 0 +#define CPUPRI_NORMAL 1 +/* values 2-101 are RT priorities 0-99 */ + +struct cpupri_vec { + raw_spinlock_t lock; + int count; + cpumask_var_t mask; +}; + +struct cpupri { + struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; + long pri_active[CPUPRI_NR_PRI_WORDS]; + int cpu_to_pri[NR_CPUS]; +}; + +#ifdef CONFIG_SMP +int cpupri_find(struct cpupri *cp, + struct task_struct *p, struct cpumask *lowest_mask); +void cpupri_set(struct cpupri *cp, int cpu, int pri); +int cpupri_init(struct cpupri *cp); +void cpupri_cleanup(struct cpupri *cp); +#else +#define cpupri_set(cp, cpu, pri) do { } while (0) +#define cpupri_init() do { } while (0) +#endif + +#endif /* _LINUX_CPUPRI_H */ diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c new file mode 100644 index 00000000..a6710a11 --- /dev/null +++ b/kernel/sched_debug.c @@ -0,0 +1,508 @@ +/* + * kernel/sched_debug.c + * + 
* Print the CFS rbtree + * + * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(sched_debug_lock); + +/* + * This allows printing both to /proc/sched_debug and + * to the console + */ +#define SEQ_printf(m, x...) \ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +/* + * Ease the printing of nsec fields by splitting them at 1000000 ns: + */ +static long long nsec_high(unsigned long long nsec) +{ + if ((long long)nsec < 0) { + nsec = -nsec; + do_div(nsec, 1000000); + return -nsec; + } + do_div(nsec, 1000000); + + return nsec; +} + +static unsigned long nsec_low(unsigned long long nsec) +{ + if ((long long)nsec < 0) + nsec = -nsec; + + return do_div(nsec, 1000000); +} + +#define SPLIT_NS(x) nsec_high(x), nsec_low(x) + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) +{ + struct sched_entity *se = tg->se[cpu]; + if (!se) + return; + +#define P(F) \ + SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) +#define PN(F) \ + SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) + + PN(se->exec_start); + PN(se->vruntime); + PN(se->sum_exec_runtime); +#ifdef CONFIG_SCHEDSTATS + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); +#endif + P(se->load.weight); +#undef PN +#undef P +} +#endif + +#ifdef CONFIG_CGROUP_SCHED +static char group_path[PATH_MAX]; + +static char *task_group_path(struct task_group *tg) +{ + if (autogroup_path(tg, group_path, PATH_MAX)) + 
return group_path; + + /* + * May be NULL if the underlying cgroup isn't fully-created yet + */ + if (!tg->css.cgroup) { + group_path[0] = '\0'; + return group_path; + } + cgroup_path(tg->css.cgroup, group_path, PATH_MAX); + return group_path; +} +#endif + +static void +print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) +{ + if (rq->curr == p) + SEQ_printf(m, "R"); + else + SEQ_printf(m, " "); + + SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", + p->comm, p->pid, + SPLIT_NS(p->se.vruntime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); +#ifdef CONFIG_SCHEDSTATS + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.vruntime), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); +#else + SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", + 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); +#endif +#ifdef CONFIG_CGROUP_SCHED + SEQ_printf(m, " %s", task_group_path(task_group(p))); +#endif + + SEQ_printf(m, "\n"); +} + +static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) +{ + struct task_struct *g, *p; + unsigned long flags; + + SEQ_printf(m, + "\nrunnable tasks:\n" + " task PID tree-key switches prio" + " exec-runtime sum-exec sum-sleep\n" + "------------------------------------------------------" + "----------------------------------------------------\n"); + + read_lock_irqsave(&tasklist_lock, flags); + + do_each_thread(g, p) { + if (!p->on_rq || task_cpu(p) != rq_cpu) + continue; + + print_task(m, rq, p); + } while_each_thread(g, p); + + read_unlock_irqrestore(&tasklist_lock, flags); +} + +void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) +{ + s64 MIN_vruntime = -1, min_vruntime, max_vruntime = -1, + spread, rq0_min_vruntime, spread0; + struct rq *rq = cpu_rq(cpu); + struct sched_entity *last; + unsigned long flags; + +#ifdef CONFIG_FAIR_GROUP_SCHED + SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg)); +#else + SEQ_printf(m, "\ncfs_rq[%d]:\n", 
cpu); +#endif + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", + SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_lock_irqsave(&rq->lock, flags); + if (cfs_rq->rb_leftmost) + MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime; + last = __pick_last_entity(cfs_rq); + if (last) + max_vruntime = last->vruntime; + min_vruntime = cfs_rq->min_vruntime; + rq0_min_vruntime = cpu_rq(0)->cfs.min_vruntime; + raw_spin_unlock_irqrestore(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "MIN_vruntime", + SPLIT_NS(MIN_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "min_vruntime", + SPLIT_NS(min_vruntime)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "max_vruntime", + SPLIT_NS(max_vruntime)); + spread = max_vruntime - MIN_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", + SPLIT_NS(spread)); + spread0 = min_vruntime - rq0_min_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", + SPLIT_NS(spread0)); + SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", + cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", + SPLIT_NS(cfs_rq->load_avg)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", + SPLIT_NS(cfs_rq->load_period)); + SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", + cfs_rq->load_contribution); + SEQ_printf(m, " .%-30s: %d\n", "load_tg", + atomic_read(&cfs_rq->tg->load_weight)); +#endif + + print_cfs_group_stats(m, cpu, cfs_rq->tg); +#endif +} + +void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) +{ +#ifdef CONFIG_RT_GROUP_SCHED + SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg)); +#else + SEQ_printf(m, "\nrt_rq[%d]:\n", cpu); +#endif + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + 
P(rt_nr_running); + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); + +#undef PN +#undef P +} + +extern __read_mostly int sched_clock_running; + +static void print_cpu(struct seq_file *m, int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + +#ifdef CONFIG_X86 + { + unsigned int freq = cpu_khz ? : 1; + + SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n", + cpu, freq / 1000, (freq % 1000)); + } +#else + SEQ_printf(m, "\ncpu#%d\n", cpu); +#endif + +#define P(x) \ + SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) +#define PN(x) \ + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) + + P(nr_running); + SEQ_printf(m, " .%-30s: %lu\n", "load", + rq->load.weight); + P(nr_switches); + P(nr_load_updates); + P(nr_uninterruptible); + PN(next_balance); + P(curr->pid); + PN(clock); + P(cpu_load[0]); + P(cpu_load[1]); + P(cpu_load[2]); + P(cpu_load[3]); + P(cpu_load[4]); +#undef P +#undef PN + +#ifdef CONFIG_SCHEDSTATS +#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); +#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); + + P(yld_count); + + P(sched_switch); + P(sched_count); + P(sched_goidle); +#ifdef CONFIG_SMP + P64(avg_idle); +#endif + + P(ttwu_count); + P(ttwu_local); + +#undef P +#undef P64 +#endif + spin_lock_irqsave(&sched_debug_lock, flags); + print_cfs_stats(m, cpu); + print_rt_stats(m, cpu); + + rcu_read_lock(); + print_rq(m, rq, cpu); + rcu_read_unlock(); + spin_unlock_irqrestore(&sched_debug_lock, flags); +} + +static const char *sched_tunable_scaling_names[] = { + "none", + "logaritmic", + "linear" +}; + +static int sched_debug_show(struct seq_file *m, void *v) +{ + u64 ktime, sched_clk, cpu_clk; + unsigned long flags; + int cpu; + + local_irq_save(flags); + ktime = ktime_to_ns(ktime_get()); + sched_clk = sched_clock(); + cpu_clk = local_clock(); + local_irq_restore(flags); + + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + 
init_utsname()->version); + +#define P(x) \ + SEQ_printf(m, "%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, "%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(ktime); + PN(sched_clk); + PN(cpu_clk); + P(jiffies); +#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK + P(sched_clock_stable); +#endif +#undef PN +#undef P + + SEQ_printf(m, "\n"); + SEQ_printf(m, "sysctl_sched\n"); + +#define P(x) \ + SEQ_printf(m, " .%-40s: %Ld\n", #x, (long long)(x)) +#define PN(x) \ + SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) + PN(sysctl_sched_latency); + PN(sysctl_sched_min_granularity); + PN(sysctl_sched_wakeup_granularity); + P(sysctl_sched_child_runs_first); + P(sysctl_sched_features); +#undef PN +#undef P + + SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling", + sysctl_sched_tunable_scaling, + sched_tunable_scaling_names[sysctl_sched_tunable_scaling]); + + for_each_online_cpu(cpu) + print_cpu(m, cpu); + + SEQ_printf(m, "\n"); + + return 0; +} + +static void sysrq_sched_debug_show(void) +{ + sched_debug_show(NULL, NULL); +} + +static int sched_debug_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_debug_show, NULL); +} + +static const struct file_operations sched_debug_fops = { + .open = sched_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_sched_debug_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = proc_create("sched_debug", 0444, NULL, &sched_debug_fops); + if (!pe) + return -ENOMEM; + return 0; +} + +__initcall(init_sched_debug_procfs); + +void proc_sched_show_task(struct task_struct *p, struct seq_file *m) +{ + unsigned long nr_switches; + + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + get_nr_threads(p)); + SEQ_printf(m, + "---------------------------------------------------------\n"); +#define __P(F) \ + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)F) +#define P(F) \ + SEQ_printf(m, "%-35s:%21Ld\n", #F, (long long)p->F) 
+#define __PN(F) \ + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) +#define PN(F) \ + SEQ_printf(m, "%-35s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) + + PN(se.exec_start); + PN(se.vruntime); + PN(se.sum_exec_runtime); + + nr_switches = p->nvcsw + p->nivcsw; + +#ifdef CONFIG_SCHEDSTATS + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.nr_migrations); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + + { + u64 avg_atom, avg_per_cpu; + + avg_atom = p->se.sum_exec_runtime; + if (nr_switches) + do_div(avg_atom, nr_switches); + else + avg_atom = -1LL; + + avg_per_cpu = p->se.sum_exec_runtime; + if (p->se.nr_migrations) { + avg_per_cpu = div64_u64(avg_per_cpu, + p->se.nr_migrations); + } else { + avg_per_cpu = -1LL; + } + + __PN(avg_atom); + __PN(avg_per_cpu); + } +#endif + __P(nr_switches); + SEQ_printf(m, "%-35s:%21Ld\n", + "nr_voluntary_switches", (long long)p->nvcsw); + SEQ_printf(m, "%-35s:%21Ld\n", + "nr_involuntary_switches", (long long)p->nivcsw); + + P(se.load.weight); + P(policy); + P(prio); +#undef PN +#undef __PN +#undef P +#undef __P + + { + unsigned int this_cpu = raw_smp_processor_id(); + u64 t0, t1; 
+ + t0 = cpu_clock(this_cpu); + t1 = cpu_clock(this_cpu); + SEQ_printf(m, "%-35s:%21Ld\n", + "clock-delta", (long long)(t1-t0)); + } +} + +void proc_sched_set_task(struct task_struct *p) +{ +#ifdef CONFIG_SCHEDSTATS + memset(&p->se.statistics, 0, sizeof(p->se.statistics)); +#endif +} diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c new file mode 100644 index 00000000..c768588e --- /dev/null +++ b/kernel/sched_fair.c @@ -0,0 +1,4334 @@ +/* + * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH) + * + * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar + * + * Interactivity improvements by Mike Galbraith + * (C) 2007 Mike Galbraith + * + * Various enhancements by Dmitry Adamushko. + * (C) 2007 Dmitry Adamushko + * + * Group scheduling enhancements by Srivatsa Vaddagiri + * Copyright IBM Corporation, 2007 + * Author: Srivatsa Vaddagiri + * + * Scaled math optimizations by Thomas Gleixner + * Copyright (C) 2007, Thomas Gleixner + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include + +/* + * Targeted preemption latency for CPU-bound tasks: + * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds) + * + * NOTE: this latency value is not the same as the concept of + * 'timeslice length' - timeslices in CFS are of variable length + * and have no persistent notion like in traditional, time-slice + * based scheduling concepts. 
+ * + * (to see the precise effective timeslice length of your workload, + * run vmstat and monitor the context-switches (cs) field) + */ +unsigned int sysctl_sched_latency = 6000000ULL; +unsigned int normalized_sysctl_sched_latency = 6000000ULL; + +/* + * The initial- and re-scaling of tunables is configurable + * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))) + * + * Options are: + * SCHED_TUNABLESCALING_NONE - unscaled, always *1 + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus)) + * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus + */ +enum sched_tunable_scaling sysctl_sched_tunable_scaling + = SCHED_TUNABLESCALING_LOG; + +/* + * Minimal preemption granularity for CPU-bound tasks: + * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ +unsigned int sysctl_sched_min_granularity = 750000ULL; +unsigned int normalized_sysctl_sched_min_granularity = 750000ULL; + +/* + * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity + */ +static unsigned int sched_nr_latency = 8; + +/* + * After fork, child runs first. If set to 0 (default) then + * parent will (try to) run first. + */ +unsigned int sysctl_sched_child_runs_first __read_mostly; + +/* + * SCHED_OTHER wake-up granularity. + * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) + * + * This option delays the preemption effects of decoupled workloads + * and reduces their over-scheduling. Synchronous workloads will still + * have immediate wakeup/sleep latencies. + */ +unsigned int sysctl_sched_wakeup_granularity = 1000000UL; +unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; + +const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + +/* + * The exponential sliding window over which load is averaged for shares + * distribution. 
+ * (default: 10msec) + */ +unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; + +static const struct sched_class fair_sched_class; + +/************************************************************** + * CFS operations on generic schedulable entities: + */ + +#ifdef CONFIG_FAIR_GROUP_SCHED + +/* cpu runqueue to which this cfs_rq is attached */ +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return cfs_rq->rq; +} + +/* An entity is a task if it doesn't "own" a runqueue */ +#define entity_is_task(se) (!se->my_q) + +static inline struct task_struct *task_of(struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!entity_is_task(se)); +#endif + return container_of(se, struct task_struct, se); +} + +/* Walk up scheduling entities hierarchy */ +#define for_each_sched_entity(se) \ + for (; se; se = se->parent) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return p->se.cfs_rq; +} + +/* runqueue on which this entity is (to be) queued */ +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + return se->cfs_rq; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return grp->my_q; +} + +/* Given a group's cfs_rq on one cpu, return its corresponding cfs_rq on + * another cpu ('this_cpu') + */ +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return cfs_rq->tg->cfs_rq[this_cpu]; +} + +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (!cfs_rq->on_list) { + /* + * Ensure we either appear before our parent (if already + * enqueued) or force our parent to appear after us when it is + * enqueued. The fact that we always enqueue bottom-up + * reduces this to two cases. 
+ */ + if (cfs_rq->tg->parent && + cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) { + list_add_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } else { + list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, + &rq_of(cfs_rq)->leaf_cfs_rq_list); + } + + cfs_rq->on_list = 1; + } +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->on_list) { + list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + cfs_rq->on_list = 0; + } +} + +/* Iterate through all leaf cfs_rqs on a runqueue */ +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + +/* Do the two (enqueued) entities belong to the same group (same cfs_rq)? */ +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + if (se->cfs_rq == pse->cfs_rq) + return 1; + + return 0; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return se->parent; +} + +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} + +static void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ + int se_depth, pse_depth; + + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. 
+ */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(*se); + pse_depth = depth_se(*pse); + + while (se_depth > pse_depth) { + se_depth--; + *se = parent_entity(*se); + } + + while (pse_depth > se_depth) { + pse_depth--; + *pse = parent_entity(*pse); + } + + while (!is_same_group(*se, *pse)) { + *se = parent_entity(*se); + *pse = parent_entity(*pse); + } +} + +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct rq *rq_of(struct cfs_rq *cfs_rq) +{ + return container_of(cfs_rq, struct rq, cfs); +} + +#define entity_is_task(se) 1 + +#define for_each_sched_entity(se) \ + for (; se; se = NULL) + +static inline struct cfs_rq *task_cfs_rq(struct task_struct *p) +{ + return &task_rq(p)->cfs; +} + +static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se) +{ + struct task_struct *p = task_of(se); + struct rq *rq = task_rq(p); + + return &rq->cfs; +} + +/* runqueue "owned" by this group */ +static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp) +{ + return NULL; +} + +static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) +{ + return &cpu_rq(this_cpu)->cfs; +} + +static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq) +{ +} + +#define for_each_leaf_cfs_rq(rq, cfs_rq) \ + for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) + +static inline int +is_same_group(struct sched_entity *se, struct sched_entity *pse) +{ + return 1; +} + +static inline struct sched_entity *parent_entity(struct sched_entity *se) +{ + return NULL; +} + +static inline void +find_matching_se(struct sched_entity **se, struct sched_entity **pse) +{ +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + +/************************************************************** + * Scheduling class tree data structure manipulation methods: + */ + 
+static inline u64 max_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta > 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) +{ + s64 delta = (s64)(vruntime - min_vruntime); + if (delta < 0) + min_vruntime = vruntime; + + return min_vruntime; +} + +static inline int entity_before(struct sched_entity *a, + struct sched_entity *b) +{ + return (s64)(a->vruntime - b->vruntime) < 0; +} + +static inline s64 entity_key(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return se->vruntime - cfs_rq->min_vruntime; +} + +static void update_min_vruntime(struct cfs_rq *cfs_rq) +{ + u64 vruntime = cfs_rq->min_vruntime; + + if (cfs_rq->curr) + vruntime = cfs_rq->curr->vruntime; + + if (cfs_rq->rb_leftmost) { + struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, + struct sched_entity, + run_node); + + if (!cfs_rq->curr) + vruntime = se->vruntime; + else + vruntime = min_vruntime(vruntime, se->vruntime); + } + + cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime); +#ifndef CONFIG_64BIT + smp_wmb(); + cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; +#endif +} + +/* + * Enqueue an entity into the rb-tree: + */ +static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; + struct rb_node *parent = NULL; + struct sched_entity *entry; + s64 key = entity_key(cfs_rq, se); + int leftmost = 1; + + /* + * Find the right place in the rbtree: + */ + while (*link) { + parent = *link; + entry = rb_entry(parent, struct sched_entity, run_node); + /* + * We dont care about collisions. Nodes with + * the same key stay together. 
+ */ + if (key < entity_key(cfs_rq, entry)) { + link = &parent->rb_left; + } else { + link = &parent->rb_right; + leftmost = 0; + } + } + + /* + * Maintain a cache of leftmost tree entries (it is frequently + * used): + */ + if (leftmost) + cfs_rq->rb_leftmost = &se->run_node; + + rb_link_node(&se->run_node, parent, link); + rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); +} + +static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->rb_leftmost == &se->run_node) { + struct rb_node *next_node; + + next_node = rb_next(&se->run_node); + cfs_rq->rb_leftmost = next_node; + } + + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); +} + +static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *left = cfs_rq->rb_leftmost; + + if (!left) + return NULL; + + return rb_entry(left, struct sched_entity, run_node); +} + +static struct sched_entity *__pick_next_entity(struct sched_entity *se) +{ + struct rb_node *next = rb_next(&se->run_node); + + if (!next) + return NULL; + + return rb_entry(next, struct sched_entity, run_node); +} + +#ifdef CONFIG_SCHED_DEBUG +static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) +{ + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); + + if (!last) + return NULL; + + return rb_entry(last, struct sched_entity, run_node); +} + +/************************************************************** + * Scheduling class statistics methods: + */ + +int sched_proc_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + int factor = get_update_sysctl_factor(); + + if (ret || !write) + return ret; + + sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, + sysctl_sched_min_granularity); + +#define WRT_SYSCTL(name) \ + (normalized_sysctl_##name = sysctl_##name / (factor)) + WRT_SYSCTL(sched_min_granularity); + WRT_SYSCTL(sched_latency); + 
WRT_SYSCTL(sched_wakeup_granularity); +#undef WRT_SYSCTL + + return 0; +} +#endif + +/* + * delta /= w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ + if (unlikely(se->load.weight != NICE_0_LOAD)) + delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load); + + return delta; +} + +/* + * The idea is to set a period in which each task runs once. + * + * When there are too many tasks (more than sched_nr_latency) we have to stretch + * this period because otherwise the slices get too small. + * + * p = (nr <= nl) ? l : l*nr/nl + */ +static u64 __sched_period(unsigned long nr_running) +{ + u64 period = sysctl_sched_latency; + unsigned long nr_latency = sched_nr_latency; + + if (unlikely(nr_running > nr_latency)) { + period = sysctl_sched_min_granularity; + period *= nr_running; + } + + return period; +} + +/* + * We calculate the wall-time slice from the period by taking a part + * proportional to the weight. + * + * s = p*P[w/rw] + */ +static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq); + + for_each_sched_entity(se) { + struct load_weight *load; + struct load_weight lw; + + cfs_rq = cfs_rq_of(se); + load = &cfs_rq->load; + + if (unlikely(!se->on_rq)) { + lw = cfs_rq->load; + + update_load_add(&lw, se->load.weight); + load = &lw; + } + slice = calc_delta_mine(slice, se->load.weight, load); + } + return slice; +} + +/* + * We calculate the vruntime slice of a to-be-inserted task + * + * vs = s/w + */ +static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return calc_delta_fair(sched_slice(cfs_rq, se), se); +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update); +static void update_cfs_shares(struct cfs_rq *cfs_rq); + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. 
+ */ +static inline void +__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, + unsigned long delta_exec) +{ + unsigned long delta_exec_weighted; + + schedstat_set(curr->statistics.exec_max, + max((u64)delta_exec, curr->statistics.exec_max)); + + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq, exec_clock, delta_exec); + delta_exec_weighted = calc_delta_fair(delta_exec, curr); + + curr->vruntime += delta_exec_weighted; + update_min_vruntime(cfs_rq); + +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED + cfs_rq->load_unacc_exec_time += delta_exec; +#endif +} + +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + u64 now = rq_of(cfs_rq)->clock_task; + unsigned long delta_exec; + + if (unlikely(!curr)) + return; + + /* + * Get the amount of time the current task was running + * since the last time we changed load (this cannot + * overflow on 32 bits): + */ + delta_exec = (unsigned long)(now - curr->exec_start); + if (!delta_exec) + return; + + __update_curr(cfs_rq, curr, delta_exec); + curr->exec_start = now; + + if (entity_is_task(curr)) { + struct task_struct *curtask = task_of(curr); + + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); + } +} + +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock); +} + +/* + * Task is being enqueued - update stats: + */ +static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * Are we enqueueing a waiting task? 
(for current tasks + * a dequeue/enqueue event is a NOP) + */ + if (se != cfs_rq->curr) + update_stats_wait_start(cfs_rq, se); +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, + rq_of(cfs_rq)->clock - se->statistics.wait_start)); + schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); + schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + + rq_of(cfs_rq)->clock - se->statistics.wait_start); +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + trace_sched_stat_wait(task_of(se), + rq_of(cfs_rq)->clock - se->statistics.wait_start); + } +#endif + schedstat_set(se->statistics.wait_start, 0); +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * Mark the end of the wait period if dequeueing a + * waiting task: + */ + if (se != cfs_rq->curr) + update_stats_wait_end(cfs_rq, se); +} + +/* + * We are picking a new current task - update its stats: + */ +static inline void +update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* + * We are starting a new run period: + */ + se->exec_start = rq_of(cfs_rq)->clock_task; +} + +/************************************************** + * Scheduling class queueing methods: + */ + +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + +static void +account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) { + add_cfs_task_weight(cfs_rq, se->load.weight); + list_add(&se->group_node, &cfs_rq->tasks); + } + cfs_rq->nr_running++; +} 
+ +static void +account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) { + add_cfs_task_weight(cfs_rq, -se->load.weight); + list_del_init(&se->group_node); + } + cfs_rq->nr_running--; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +# ifdef CONFIG_SMP +static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, + int global_update) +{ + struct task_group *tg = cfs_rq->tg; + long load_avg; + + load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); + load_avg -= cfs_rq->load_contribution; + + if (global_update || abs(load_avg) > cfs_rq->load_contribution / 8) { + atomic_add(load_avg, &tg->load_weight); + cfs_rq->load_contribution += load_avg; + } +} + +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ + u64 period = sysctl_sched_shares_window; + u64 now, delta; + unsigned long load = cfs_rq->load.weight; + + if (cfs_rq->tg == &root_task_group) + return; + + now = rq_of(cfs_rq)->clock_task; + delta = now - cfs_rq->load_stamp; + + /* truncate load history at 4 idle periods */ + if (cfs_rq->load_stamp > cfs_rq->load_last && + now - cfs_rq->load_last > 4 * period) { + cfs_rq->load_period = 0; + cfs_rq->load_avg = 0; + delta = period - 1; + } + + cfs_rq->load_stamp = now; + cfs_rq->load_unacc_exec_time = 0; + cfs_rq->load_period += delta; + if (load) { + cfs_rq->load_last = now; + cfs_rq->load_avg += delta * load; + } + + /* consider updating load contribution on each fold or truncate */ + if (global_update || cfs_rq->load_period > period + || !cfs_rq->load_period) + update_cfs_rq_load_contribution(cfs_rq, global_update); + + while (cfs_rq->load_period > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. 
+ */ + asm("" : "+rm" (cfs_rq->load_period)); + cfs_rq->load_period /= 2; + cfs_rq->load_avg /= 2; + } + + if (!cfs_rq->curr && !cfs_rq->nr_running && !cfs_rq->load_avg) + list_del_leaf_cfs_rq(cfs_rq); +} + +static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + long load_weight, load, shares; + + load = cfs_rq->load.weight; + + load_weight = atomic_read(&tg->load_weight); + load_weight += load; + load_weight -= cfs_rq->load_contribution; + + shares = (tg->shares * load); + if (load_weight) + shares /= load_weight; + + if (shares < MIN_SHARES) + shares = MIN_SHARES; + if (shares > tg->shares) + shares = tg->shares; + + return shares; +} + +static void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) { + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq); + } +} +# else /* CONFIG_SMP */ +static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update) +{ +} + +static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + return tg->shares; +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +# endif /* CONFIG_SMP */ +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + } + + update_load_set(&se->load, weight); + + if (se->on_rq) + account_entity_enqueue(cfs_rq, se); +} + +static void update_cfs_shares(struct cfs_rq *cfs_rq) +{ + struct task_group *tg; + struct sched_entity *se; + long shares; + + tg = cfs_rq->tg; + se = tg->se[cpu_of(rq_of(cfs_rq))]; + if (!se) + return; +#ifndef CONFIG_SMP + if (likely(se->load.weight == tg->shares)) + return; +#endif + shares = calc_cfs_shares(cfs_rq, tg); + + reweight_entity(cfs_rq_of(se), se, shares); +} +#else /* CONFIG_FAIR_GROUP_SCHED */ +static void update_cfs_load(struct 
cfs_rq *cfs_rq, int global_update) +{ +} + +static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +{ +} + +static inline void update_entity_shares_tick(struct cfs_rq *cfs_rq) +{ +} +#endif /* CONFIG_FAIR_GROUP_SCHED */ + +static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHEDSTATS + struct task_struct *tsk = NULL; + + if (entity_is_task(se)) + tsk = task_of(se); + + if (se->statistics.sleep_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->statistics.sleep_max)) + se->statistics.sleep_max = delta; + + se->statistics.sleep_start = 0; + se->statistics.sum_sleep_runtime += delta; + + if (tsk) { + account_scheduler_latency(tsk, delta >> 10, 1); + trace_sched_stat_sleep(tsk, delta); + } + } + if (se->statistics.block_start) { + u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start; + + if ((s64)delta < 0) + delta = 0; + + if (unlikely(delta > se->statistics.block_max)) + se->statistics.block_max = delta; + + se->statistics.block_start = 0; + se->statistics.sum_sleep_runtime += delta; + + if (tsk) { + if (tsk->in_iowait) { + se->statistics.iowait_sum += delta; + se->statistics.iowait_count++; + trace_sched_stat_iowait(tsk, delta); + } + + /* + * Blocking time is in units of nanosecs, so shift by + * 20 to get a milliseconds-range estimation of the + * amount of time that the task spent sleeping: + */ + if (unlikely(prof_on == SLEEP_PROFILING)) { + profile_hits(SLEEP_PROFILING, + (void *)get_wchan(tsk), + delta >> 20); + } + account_scheduler_latency(tsk, delta >> 10, 0); + } + } +#endif +} + +static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +#ifdef CONFIG_SCHED_DEBUG + s64 d = se->vruntime - cfs_rq->min_vruntime; + + if (d < 0) + d = -d; + + if (d > 3*sysctl_sched_latency) + schedstat_inc(cfs_rq, nr_spread_over); +#endif +} + +static void +place_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int initial) +{ + u64 vruntime = cfs_rq->min_vruntime; + + /* + * The 'current' period is already promised to the current tasks, + * however the extra weight of the new task will slow them down a + * little, place the new task so that it fits in the slot that + * stays open at the end. + */ + if (initial && sched_feat(START_DEBIT)) + vruntime += sched_vslice(cfs_rq, se); + + /* sleeps up to a single latency don't count. */ + if (!initial) { + unsigned long thresh = sysctl_sched_latency; + + /* + * Halve their sleep time's effect, to allow + * for a gentler effect of sleepers: + */ + if (sched_feat(GENTLE_FAIR_SLEEPERS)) + thresh >>= 1; + + vruntime -= thresh; + } + + /* ensure we never gain time by being placed backwards. */ + vruntime = max_vruntime(se->vruntime, vruntime); + + se->vruntime = vruntime; +} + +static void +enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + /* + * Update the normalized vruntime before updating min_vruntime + * through callig update_curr(). + */ + if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) + se->vruntime += cfs_rq->min_vruntime; + + /* + * Update run-time statistics of the 'current'. 
+ */ + update_curr(cfs_rq); + update_cfs_load(cfs_rq, 0); + account_entity_enqueue(cfs_rq, se); + update_cfs_shares(cfs_rq); + + if (flags & ENQUEUE_WAKEUP) { + place_entity(cfs_rq, se, 0); + enqueue_sleeper(cfs_rq, se); + } + + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + se->on_rq = 1; + + if (cfs_rq->nr_running == 1) + list_add_leaf_cfs_rq(cfs_rq); +} + +static void __clear_buddies_last(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->last == se) + cfs_rq->last = NULL; + else + break; + } +} + +static void __clear_buddies_next(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->next == se) + cfs_rq->next = NULL; + else + break; + } +} + +static void __clear_buddies_skip(struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + if (cfs_rq->skip == se) + cfs_rq->skip = NULL; + else + break; + } +} + +static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (cfs_rq->last == se) + __clear_buddies_last(se); + + if (cfs_rq->next == se) + __clear_buddies_next(se); + + if (cfs_rq->skip == se) + __clear_buddies_skip(se); +} + +static void +dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ + /* + * Update run-time statistics of the 'current'. 
+ */ + update_curr(cfs_rq); + + update_stats_dequeue(cfs_rq, se); + if (flags & DEQUEUE_SLEEP) { +#ifdef CONFIG_SCHEDSTATS + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_of(cfs_rq)->clock; + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_of(cfs_rq)->clock; + } +#endif + } + + clear_buddies(cfs_rq, se); + + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; + update_cfs_load(cfs_rq, 0); + account_entity_dequeue(cfs_rq, se); + + /* + * Normalize the entity after updating the min_vruntime because the + * update can refer to the ->curr item and we need to reflect this + * movement in our normalized position. + */ + if (!(flags & DEQUEUE_SLEEP)) + se->vruntime -= cfs_rq->min_vruntime; + + update_min_vruntime(cfs_rq); + update_cfs_shares(cfs_rq); +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void +check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +{ + unsigned long ideal_runtime, delta_exec; + + ideal_runtime = sched_slice(cfs_rq, curr); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { + resched_task(rq_of(cfs_rq)->curr); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. + */ + clear_buddies(cfs_rq, curr); + return; + } + + /* + * Ensure that a task that missed wakeup preemption by a + * narrow margin doesn't have to wait for a full slice. + * This also mitigates buddy induced latencies under load. 
+ */ + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + if (delta_exec < sysctl_sched_min_granularity) + return; + + if (cfs_rq->nr_running > 1) { + struct sched_entity *se = __pick_first_entity(cfs_rq); + s64 delta = curr->vruntime - se->vruntime; + + if (delta < 0) + return; + + if (delta > ideal_runtime) + resched_task(rq_of(cfs_rq)->curr); + } +} + +static void +set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + /* 'current' is not kept within the tree. */ + if (se->on_rq) { + /* + * Any task has to be enqueued before it get to execute on + * a CPU. So account for the time it spent waiting on the + * runqueue. + */ + update_stats_wait_end(cfs_rq, se); + __dequeue_entity(cfs_rq, se); + } + + update_stats_curr_start(cfs_rq, se); + cfs_rq->curr = se; +#ifdef CONFIG_SCHEDSTATS + /* + * Track our maximum slice length, if the CPU's load is at + * least twice that of our own weight (i.e. dont track it + * when there are only lesser-weight tasks around): + */ + if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + se->statistics.slice_max = max(se->statistics.slice_max, + se->sum_exec_runtime - se->prev_sum_exec_runtime); + } +#endif + se->prev_sum_exec_runtime = se->sum_exec_runtime; +} + +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); + +/* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups + * 2) pick the "next" process, since someone really wants that to run + * 3) pick the "last" process, for cache locality + * 4) do not run the "skip" process, if something else is available + */ +static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) +{ + struct sched_entity *se = __pick_first_entity(cfs_rq); + struct sched_entity *left = se; + + /* + * Avoid running the skip buddy, if running something else can + * be done without getting too unfair. 
+ */ + if (cfs_rq->skip == se) { + struct sched_entity *second = __pick_next_entity(se); + if (second && wakeup_preempt_entity(second, left) < 1) + se = second; + } + + /* + * Prefer last buddy, try to return the CPU to a preempted task. + */ + if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) + se = cfs_rq->last; + + /* + * Someone really wants this to run. If it's not unfair, run it. + */ + if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) + se = cfs_rq->next; + + clear_buddies(cfs_rq, se); + + return se; +} + +static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) +{ + /* + * If still on the runqueue then deactivate_task() + * was not called and update_curr() has to be done: + */ + if (prev->on_rq) + update_curr(cfs_rq); + + check_spread(cfs_rq, prev); + if (prev->on_rq) { + update_stats_wait_start(cfs_rq, prev); + /* Put 'current' back into the tree. */ + __enqueue_entity(cfs_rq, prev); + } + cfs_rq->curr = NULL; +} + +static void +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + + /* + * Update share accounting for long-running entities. + */ + update_entity_shares_tick(cfs_rq); + +#ifdef CONFIG_SCHED_HRTICK + /* + * queued ticks are scheduled to match the slice, so don't bother + * validating it and just reschedule. 
+ */ + if (queued) { + resched_task(rq_of(cfs_rq)->curr); + return; + } + /* + * don't let the period tick interfere with the hrtick preemption + */ + if (!sched_feat(DOUBLE_TICK) && + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) + return; +#endif + + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) + check_preempt_tick(cfs_rq, curr); +} + +/************************************************** + * CFS operations on tasks: + */ + +#ifdef CONFIG_SCHED_HRTICK +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + WARN_ON(task_rq(p) != rq); + + if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { + u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + s64 delta = slice - ran; + + if (delta < 0) { + if (rq->curr == p) + resched_task(p); + return; + } + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + if (rq->curr != p) + delta = max_t(s64, 10000LL, delta); + + hrtick_start(rq, delta); + } +} + +/* + * called from enqueue/dequeue and updates the hrtick when the + * current task is from our class and nr_running is low enough + * to matter. + */ +static void hrtick_update(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + + if (curr->sched_class != &fair_sched_class) + return; + + if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) + hrtick_start_fair(rq, curr); +} +#else /* !CONFIG_SCHED_HRTICK */ +static inline void +hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ +} + +static inline void hrtick_update(struct rq *rq) +{ +} +#endif + +/* + * The enqueue_task method is called before nr_running is + * increased. 
Here we update the fair scheduling stats and + * then put the task into the rbtree: + */ +static void +enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + + for_each_sched_entity(se) { + if (se->on_rq) + break; + cfs_rq = cfs_rq_of(se); + enqueue_entity(cfs_rq, se, flags); + flags = ENQUEUE_WAKEUP; + } + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq); + } + + hrtick_update(rq); +} + +static void set_next_buddy(struct sched_entity *se); + +/* + * The dequeue_task method is called before nr_running is + * decreased. We remove the task from the rbtree and + * update the fair scheduling stats: + */ +static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &p->se; + int task_sleep = flags & DEQUEUE_SLEEP; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + dequeue_entity(cfs_rq, se, flags); + + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) { + /* + * Bias pick_next to pick a task from this cfs_rq, as + * p is sleeping when it is within its sched_slice. 
+ */ + if (task_sleep && parent_entity(se)) + set_next_buddy(parent_entity(se)); + break; + } + flags |= DEQUEUE_SLEEP; + } + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + update_cfs_load(cfs_rq, 0); + update_cfs_shares(cfs_rq); + } + + hrtick_update(rq); +} + +#ifdef CONFIG_SMP + +static void task_waking_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 min_vruntime; + +#ifndef CONFIG_64BIT + u64 min_vruntime_copy; + + do { + min_vruntime_copy = cfs_rq->min_vruntime_copy; + smp_rmb(); + min_vruntime = cfs_rq->min_vruntime; + } while (min_vruntime != min_vruntime_copy); +#else + min_vruntime = cfs_rq->min_vruntime; +#endif + + se->vruntime -= min_vruntime; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * effective_load() calculates the load change as seen from the root_task_group + * + * Adding load to a group doesn't make a group heavier, but can cause movement + * of group shares between cpus. Assuming the shares were perfectly aligned one + * can calculate the shift in shares. 
+ */ +static long effective_load(struct task_group *tg, int cpu, long wl, long wg) +{ + struct sched_entity *se = tg->se[cpu]; + + if (!tg->parent) + return wl; + + for_each_sched_entity(se) { + long lw, w; + + tg = se->my_q->tg; + w = se->my_q->load.weight; + + /* use this cpu's instantaneous contribution */ + lw = atomic_read(&tg->load_weight); + lw -= se->my_q->load_contribution; + lw += w + wg; + + wl += w; + + if (lw > 0 && wl < lw) + wl = (wl * tg->shares) / lw; + else + wl = tg->shares; + + /* zero point is MIN_SHARES */ + if (wl < MIN_SHARES) + wl = MIN_SHARES; + wl -= se->load.weight; + wg = 0; + } + + return wl; +} + +#else + +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) +{ + return wl; +} + +#endif + +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) +{ + s64 this_load, load; + int idx, this_cpu, prev_cpu; + unsigned long tl_per_task; + struct task_group *tg; + unsigned long weight; + int balanced; + + idx = sd->wake_idx; + this_cpu = smp_processor_id(); + prev_cpu = task_cpu(p); + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + rcu_read_lock(); + if (sync) { + tg = task_group(current); + weight = current->se.load.weight; + + this_load += effective_load(tg, this_cpu, -weight, -weight); + load += effective_load(tg, prev_cpu, 0, -weight); + } + + tg = task_group(p); + weight = p->se.load.weight; + + /* + * In low-load situations, where prev_cpu is idle and this_cpu is idle + * due to the sync cause above having dropped this_load to 0, we'll + * always have an imbalance, but there's really nothing you can do + * about that, so that's good too. + * + * Otherwise check if either cpus are near enough in load to allow this + * task to be woken on this_cpu. 
+ */ + if (this_load > 0) { + s64 this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; + rcu_read_unlock(); + + /* + * If the currently running task will sleep within + * a reasonable amount of time then attract this newly + * woken task: + */ + if (sync && balanced) + return 1; + + schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + if (balanced || + (this_load <= load && + this_load + target_load(prev_cpu, idx) <= tl_per_task)) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(sd, ttwu_move_affine); + schedstat_inc(p, se.statistics.nr_wakeups_affine); + + return 1; + } + return 0; +} + +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. 
+ */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, + int this_cpu, int load_idx) +{ + struct sched_group *idlest = NULL, *group = sd->groups; + unsigned long min_load = ULONG_MAX, this_load = 0; + int imbalance = 100 + (sd->imbalance_pct-100)/2; + + do { + unsigned long load, avg_load; + int local_group; + int i; + + /* Skip over this group if it has no CPUs allowed */ + if (!cpumask_intersects(sched_group_cpus(group), + &p->cpus_allowed)) + continue; + + local_group = cpumask_test_cpu(this_cpu, + sched_group_cpus(group)); + + /* Tally up the load of all CPUs in the group */ + avg_load = 0; + + for_each_cpu(i, sched_group_cpus(group)) { + /* Bias balancing toward cpus of our domain */ + if (local_group) + load = source_load(i, load_idx); + else + load = target_load(i, load_idx); + + avg_load += load; + } + + /* Adjust by relative CPU power of the group */ + avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power; + + if (local_group) { + this_load = avg_load; + } else if (avg_load < min_load) { + min_load = avg_load; + idlest = group; + } + } while (group = group->next, group != sd->groups); + + if (!idlest || 100*this_load < imbalance*min_load) + return NULL; + return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. + */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ + unsigned long load, min_load = ULONG_MAX; + int idlest = -1; + int i; + + /* Traverse only the allowed CPUs */ + for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { + load = weighted_cpuload(i); + + if (load < min_load || (load == min_load && i == this_cpu)) { + min_load = load; + idlest = i; + } + } + + return idlest; +} + +/* + * Try and locate an idle CPU in the sched_domain. 
+ */ +static int select_idle_sibling(struct task_struct *p, int target) +{ + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + struct sched_domain *sd; + int i; + + /* + * If the task is going to be woken-up on this cpu and if it is + * already idle, then it is the right target. + */ + if (target == cpu && idle_cpu(cpu)) + return cpu; + + /* + * If the task is going to be woken-up on the cpu where it previously + * ran and if it is currently idle, then it the right target. + */ + if (target == prev_cpu && idle_cpu(prev_cpu)) + return prev_cpu; + + /* + * Otherwise, iterate the domains and find an elegible idle cpu. + */ + rcu_read_lock(); + for_each_domain(target, sd) { + if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) + break; + + for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { + if (idle_cpu(i)) { + target = i; + break; + } + } + + /* + * Lets stop looking for an idle sibling when we reached + * the domain that spans the current cpu and prev_cpu. + */ + if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && + cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) + break; + } + rcu_read_unlock(); + + return target; +} + +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. 
+ */ +static int +select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +{ + struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + int cpu = smp_processor_id(); + int prev_cpu = task_cpu(p); + int new_cpu = cpu; + int want_affine = 0; + int want_sd = 1; + int sync = wake_flags & WF_SYNC; + + if (sd_flag & SD_BALANCE_WAKE) { + if (cpumask_test_cpu(cpu, &p->cpus_allowed)) + want_affine = 1; + new_cpu = prev_cpu; + } + + rcu_read_lock(); + for_each_domain(cpu, tmp) { + if (!(tmp->flags & SD_LOAD_BALANCE)) + continue; + + /* + * If power savings logic is enabled for a domain, see if we + * are not overloaded, if so, don't balance wider. + */ + if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { + unsigned long power = 0; + unsigned long nr_running = 0; + unsigned long capacity; + int i; + + for_each_cpu(i, sched_domain_span(tmp)) { + power += power_of(i); + nr_running += cpu_rq(i)->cfs.nr_running; + } + + capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); + + if (tmp->flags & SD_POWERSAVINGS_BALANCE) + nr_running /= 2; + + if (nr_running < capacity) + want_sd = 0; + } + + /* + * If both cpu and prev_cpu are part of this domain, + * cpu is a valid SD_WAKE_AFFINE target. 
+ */ + if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && + cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + affine_sd = tmp; + want_affine = 0; + } + + if (!want_sd && !want_affine) + break; + + if (!(tmp->flags & sd_flag)) + continue; + + if (want_sd) + sd = tmp; + } + + if (affine_sd) { + if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) + prev_cpu = cpu; + + new_cpu = select_idle_sibling(p, prev_cpu); + goto unlock; + } + + while (sd) { + int load_idx = sd->forkexec_idx; + struct sched_group *group; + int weight; + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + if (sd_flag & SD_BALANCE_WAKE) + load_idx = sd->wake_idx; + + group = find_idlest_group(sd, p, cpu, load_idx); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_cpu(group, p, cpu); + if (new_cpu == -1 || new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } +unlock: + rcu_read_unlock(); + + return new_cpu; +} +#endif /* CONFIG_SMP */ + +static unsigned long +wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +{ + unsigned long gran = sysctl_sched_wakeup_granularity; + + /* + * Since its curr running now, convert the gran from real-time + * to virtual-time in his units. + * + * By using 'se' instead of 'curr' we penalize light tasks, so + * they get preempted easier. That is, if 'se' < 'curr' then + * the resulting gran will be larger, therefore penalizing the + * lighter, if otoh 'se' > 'curr' then the resulting gran will + * be smaller, again penalizing the lighter task. + * + * This is especially important for buddies when the leftmost + * task is higher priority than the buddy. 
+ */ + return calc_delta_fair(gran, se); +} + +/* + * Should 'se' preempt 'curr'. + * + * |s1 + * |s2 + * |s3 + * g + * |<--->|c + * + * w(c, s1) = -1 + * w(c, s2) = 0 + * w(c, s3) = 1 + * + */ +static int +wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) +{ + s64 gran, vdiff = curr->vruntime - se->vruntime; + + if (vdiff <= 0) + return -1; + + gran = wakeup_gran(curr, se); + if (vdiff > gran) + return 1; + + return 0; +} + +static void set_last_buddy(struct sched_entity *se) +{ + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->last = se; +} + +static void set_next_buddy(struct sched_entity *se) +{ + if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE)) + return; + + for_each_sched_entity(se) + cfs_rq_of(se)->next = se; +} + +static void set_skip_buddy(struct sched_entity *se) +{ + for_each_sched_entity(se) + cfs_rq_of(se)->skip = se; +} + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ + struct task_struct *curr = rq->curr; + struct sched_entity *se = &curr->se, *pse = &p->se; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + int scale = cfs_rq->nr_running >= sched_nr_latency; + int next_buddy_marked = 0; + + if (unlikely(se == pse)) + return; + + if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { + set_next_buddy(pse); + next_buddy_marked = 1; + } + + /* + * We can come here with TIF_NEED_RESCHED already set from new task + * wake up path. + */ + if (test_tsk_need_resched(curr)) + return; + + /* Idle tasks are by definition preempted by non-idle tasks. 
*/ + if (unlikely(curr->policy == SCHED_IDLE) && + likely(p->policy != SCHED_IDLE)) + goto preempt; + + /* + * Batch and idle tasks do not preempt non-idle tasks (their preemption + * is driven by the tick): + */ + if (unlikely(p->policy != SCHED_NORMAL)) + return; + + + if (!sched_feat(WAKEUP_PREEMPT)) + return; + + update_curr(cfs_rq); + find_matching_se(&se, &pse); + BUG_ON(!pse); + if (wakeup_preempt_entity(se, pse) == 1) { + /* + * Bias pick_next to pick the sched entity that is + * triggering this preemption. + */ + if (!next_buddy_marked) + set_next_buddy(pse); + goto preempt; + } + + return; + +preempt: + resched_task(curr); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved + * with schedule on the ->pre_schedule() or idle_balance() + * point, either of which can * drop the rq lock. + * + * Also, during early boot the idle thread is in the fair class, + * for obvious reasons its a bad idea to schedule back to it. + */ + if (unlikely(!se->on_rq || curr == rq->idle)) + return; + + if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) + set_last_buddy(se); +} + +static struct task_struct *pick_next_task_fair(struct rq *rq) +{ + struct task_struct *p; + struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + + if (!cfs_rq->nr_running) + return NULL; + + do { + se = pick_next_entity(cfs_rq); + set_next_entity(cfs_rq, se); + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + p = task_of(se); + hrtick_start_fair(rq, p); + + return p; +} + +/* + * Account for a descheduled task: + */ +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +{ + struct sched_entity *se = &prev->se; + struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + put_prev_entity(cfs_rq, se); + } +} + +/* + * sched_yield() is very simple + * + * The magic of dealing with the ->skip buddy is in pick_next_entity. 
+ */ +static void yield_task_fair(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *se = &curr->se; + + /* + * Are we the only task in the tree? + */ + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); + + if (curr->policy != SCHED_BATCH) { + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + } + + set_skip_buddy(se); +} + +static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt) +{ + struct sched_entity *se = &p->se; + + if (!se->on_rq) + return false; + + /* Tell the scheduler that we'd really like pse to run next. */ + set_next_buddy(se); + + yield_task_fair(rq); + + return true; +} + +#ifdef CONFIG_SMP +/************************************************** + * Fair scheduling class load-balancing methods: + */ + +/* + * pull_task - move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static void pull_task(struct rq *src_rq, struct task_struct *p, + struct rq *this_rq, int this_cpu) +{ + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + check_preempt_curr(this_rq, p, 0); +} + +/* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +static +int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + int tsk_cache_hot = 0; + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. 
+ */ + if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_affine); + return 0; + } + *all_pinned = 0; + + if (task_running(rq, p)) { + schedstat_inc(p, se.statistics.nr_failed_migrations_running); + return 0; + } + + /* + * Aggressive migration if: + * 1) task is cache cold, or + * 2) too many balance attempts have failed. + */ + + tsk_cache_hot = task_hot(p, rq->clock_task, sd); + if (!tsk_cache_hot || + sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { + schedstat_inc(sd, lb_hot_gained[idle]); + schedstat_inc(p, se.statistics.nr_forced_migrations); + } +#endif + return 1; + } + + if (tsk_cache_hot) { + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; + } + return 1; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int +move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct task_struct *p, *n; + struct cfs_rq *cfs_rq; + int pinned = 0; + + for_each_leaf_cfs_rq(busiest, cfs_rq) { + list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { + + if (!can_migrate_task(p, busiest, this_cpu, + sd, idle, &pinned)) + continue; + + pull_task(busiest, p, this_rq, this_cpu); + /* + * Right now, this is only the second place pull_task() + * is called, so we can safely collect pull_task() + * stats here rather than inside pull_task(). 
+ */ + schedstat_inc(sd, lb_gained[idle]); + return 1; + } + } + + return 0; +} + +static unsigned long +balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, + struct cfs_rq *busiest_cfs_rq) +{ + int loops = 0, pulled = 0; + long rem_load_move = max_load_move; + struct task_struct *p, *n; + + if (max_load_move == 0) + goto out; + + list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { + if (loops++ > sysctl_sched_nr_migrate) + break; + + if ((p->se.load.weight >> 1) > rem_load_move || + !can_migrate_task(p, busiest, this_cpu, sd, idle, + all_pinned)) + continue; + + pull_task(busiest, p, this_rq, this_cpu); + pulled++; + rem_load_move -= p->se.load.weight; + +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ + if (idle == CPU_NEWLY_IDLE) + break; +#endif + + /* + * We only want to steal up to the prescribed amount of + * weighted load. + */ + if (rem_load_move <= 0) + break; + } +out: + /* + * Right now, this is one of only two places pull_task() is called, + * so we can safely collect pull_task() stats here rather than + * inside pull_task(). + */ + schedstat_add(sd, lb_gained[idle], pulled); + + return max_load_move - rem_load_move; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * update tg->load_weight by folding this cpu's load_avg + */ +static int update_shares_cpu(struct task_group *tg, int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned long flags; + struct rq *rq; + + if (!tg->se[cpu]) + return 0; + + rq = cpu_rq(cpu); + cfs_rq = tg->cfs_rq[cpu]; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + update_cfs_load(cfs_rq, 1); + + /* + * We need to update shares after updating tg->load_weight in + * order to adjust the weight of groups with long running tasks. 
+ */ + update_cfs_shares(cfs_rq); + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + return 0; +} + +static void update_shares(int cpu) +{ + struct cfs_rq *cfs_rq; + struct rq *rq = cpu_rq(cpu); + + rcu_read_lock(); + for_each_leaf_cfs_rq(rq, cfs_rq) + update_shares_cpu(cfs_rq->tg, cpu); + rcu_read_unlock(); +} + +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + long rem_load_move = max_load_move; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; + + rcu_read_lock(); + update_h_load(busiest_cpu); + + list_for_each_entry_rcu(tg, &task_groups, list) { + struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + unsigned long busiest_h_load = busiest_cfs_rq->h_load; + unsigned long busiest_weight = busiest_cfs_rq->load.weight; + u64 rem_load, moved_load; + + /* + * empty group + */ + if (!busiest_cfs_rq->task_weight) + continue; + + rem_load = (u64)rem_load_move * busiest_weight; + rem_load = div_u64(rem_load, busiest_h_load + 1); + + moved_load = balance_tasks(this_rq, this_cpu, busiest, + rem_load, sd, idle, all_pinned, + busiest_cfs_rq); + + if (!moved_load) + continue; + + moved_load *= busiest_h_load; + moved_load = div_u64(moved_load, busiest_weight + 1); + + rem_load_move -= moved_load; + if (rem_load_move < 0) + break; + } + rcu_read_unlock(); + + return max_load_move - rem_load_move; +} +#else +static inline void update_shares(int cpu) +{ +} + +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + &busiest->cfs); +} +#endif + +/* + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within 
domain "sd". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. + */ +static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned) +{ + unsigned long total_load_moved = 0, load_moved; + + /* + * Keep calling load_balance_fair() for the load still outstanding + * until a pass moves nothing or max_load_move has been reached. + */ + do { + load_moved = load_balance_fair(this_rq, this_cpu, busiest, + max_load_move - total_load_moved, + sd, idle, all_pinned); + + total_load_moved += load_moved; + +#ifdef CONFIG_PREEMPT + /* + * NEWIDLE balancing is a source of latency, so preemptible + * kernels will stop after the first task is pulled to minimize + * the critical section. + */ + if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) + break; + + /* Also stop as soon as either runqueue lock is contended. */ + if (raw_spin_is_contended(&this_rq->lock) || + raw_spin_is_contended(&busiest->lock)) + break; +#endif + } while (load_moved && max_load_move > total_load_moved); + + /* Report success if any load at all was moved. */ + return total_load_moved > 0; +} + +/********** Helpers for find_busiest_group ************************/ +/* + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. 
+ */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *this; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + /* Statistics of the local group */ + unsigned long this_load; + unsigned long this_load_per_task; + unsigned long this_nr_running; + unsigned long this_has_capacity; + unsigned int this_idle_cpus; + + /* Statistics of the busiest group */ + unsigned int busiest_idle_cpus; + unsigned long max_load; /* avg_load of the busiest group */ + unsigned long busiest_load_per_task; + unsigned long busiest_nr_running; + unsigned long busiest_group_capacity; + unsigned long busiest_has_capacity; + unsigned int busiest_group_weight; + + int group_imb; /* Is there imbalance in this sd */ +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) + int power_savings_balance; /* Is powersave balance needed for this sd */ + struct sched_group *group_min; /* Least loaded group in sd */ + struct sched_group *group_leader; /* Group which relieves group_min */ + unsigned long min_load_per_task; /* load_per_task in group_min */ + unsigned long leader_nr_running; /* Nr running of group_leader */ + unsigned long min_nr_running; /* Nr running of group_min */ +#endif +}; + +/* + * sg_lb_stats - stats of a sched_group required for load_balancing + */ +struct sg_lb_stats { + unsigned long avg_load; /* Avg load across the CPUs of the group */ + unsigned long group_load; /* Total load over the CPUs of the group */ + unsigned long sum_nr_running; /* Nr tasks running in the group */ + unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long group_capacity; /* Group power in SCHED_POWER_SCALE units, rounded */ + unsigned long idle_cpus; /* Nr of idle CPUs counted in the group */ + unsigned long group_weight; /* Copy of group->group_weight */ + int group_imb; /* Is there an imbalance in the group ? */ + int group_has_capacity; /* Is there extra capacity in the group? 
*/ +}; + +/** + * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. + * @group: The group whose first cpu is to be returned. + */ +static inline unsigned int group_first_cpu(struct sched_group *group) +{ + return cpumask_first(sched_group_cpus(group)); +} + +/** + * get_sd_load_idx - Obtain the load index for a given sched domain. + * @sd: The sched_domain whose load_idx is to be obtained. + * @idle: The idle status of the CPU for whose sd the load_idx is obtained. + */ +static inline int get_sd_load_idx(struct sched_domain *sd, + enum cpu_idle_type idle) +{ + int load_idx; + + switch (idle) { + case CPU_NOT_IDLE: + load_idx = sd->busy_idx; + break; + + case CPU_NEWLY_IDLE: + load_idx = sd->newidle_idx; + break; + /* Every other idle type uses the idle index. */ + default: + load_idx = sd->idle_idx; + break; + } + + return load_idx; +} + + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * init_sd_power_savings_stats - Initialize power savings statistics for + * the given sched_domain, during load balancing. + * + * @sd: Sched domain whose power-savings statistics are to be initialized. + * @sds: Variable containing the statistics for sd. + * @idle: Idle status of the CPU at which we're performing load-balancing. + */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + /* + * Busy processors will not participate in power savings + * balance. + */ + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + sds->power_savings_balance = 0; + else { + sds->power_savings_balance = 1; + sds->min_nr_running = ULONG_MAX; + sds->leader_nr_running = 0; + } +} + +/** + * update_sd_power_savings_stats - Update the power saving stats for a + * sched_domain while performing load balancing. + * + * @group: sched_group belonging to the sched_domain under consideration. 
+ * @sds: Variable containing the statistics of the sched_domain + * @local_group: Does group contain the CPU for which we're performing + * load balancing ? + * @sgs: Variable containing the statistics of the group. + */ +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + + if (!sds->power_savings_balance) + return; + + /* + * If the local group is idle or completely loaded + * no need to do power savings balance at this domain + */ + if (local_group && (sds->this_nr_running >= sgs->group_capacity || + !sds->this_nr_running)) + sds->power_savings_balance = 0; + + /* + * If a group is already running at full capacity or idle, + * don't include that group in power savings calculations + */ + if (!sds->power_savings_balance || + sgs->sum_nr_running >= sgs->group_capacity || + !sgs->sum_nr_running) + return; + + /* + * Calculate the group which has the least non-idle load. + * This is the group from where we need to pick up the load + * for saving power. On a tie in sum_nr_running the group whose + * first cpu is higher numbered is taken. + */ + if ((sgs->sum_nr_running < sds->min_nr_running) || + (sgs->sum_nr_running == sds->min_nr_running && + group_first_cpu(group) > group_first_cpu(sds->group_min))) { + sds->group_min = group; + sds->min_nr_running = sgs->sum_nr_running; + sds->min_load_per_task = sgs->sum_weighted_load / + sgs->sum_nr_running; + } + + /* + * Calculate the group which is almost near its + * capacity but still has some space to pick up some load + * from other group and save more power + */ + if (sgs->sum_nr_running + 1 > sgs->group_capacity) + return; + + /* + * The leader is the group with the most running tasks; on a tie + * the group whose first cpu is lower numbered is taken. + */ + if (sgs->sum_nr_running > sds->leader_nr_running || + (sgs->sum_nr_running == sds->leader_nr_running && + group_first_cpu(group) < group_first_cpu(sds->group_leader))) { + sds->group_leader = group; + sds->leader_nr_running = sgs->sum_nr_running; + } +} + +/** + * check_power_save_busiest_group - see if there is potential for some power-savings balance + * @sds: Variable containing 
the statistics of the sched_domain + * under consideration. + * @this_cpu: Cpu at which we're currently performing load-balancing. + * @imbalance: Variable to store the imbalance. + * + * Description: + * Check if we have potential to perform some power-savings balance. + * If yes, set the busiest group to be the least loaded group in the + * sched_domain, so that it's CPUs can be put to idle. + * + * Returns 1 if there is potential to perform power-savings balance. + * Else returns 0. + */ +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + if (!sds->power_savings_balance) + return 0; + + if (sds->this != sds->group_leader || + sds->group_leader == sds->group_min) + return 0; + + *imbalance = sds->min_load_per_task; + sds->busiest = sds->group_min; + + return 1; + +} +#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ +static inline void init_sd_power_savings_stats(struct sched_domain *sd, + struct sd_lb_stats *sds, enum cpu_idle_type idle) +{ + return; +} + +static inline void update_sd_power_savings_stats(struct sched_group *group, + struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs) +{ + return; +} + +static inline int check_power_save_busiest_group(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + return 0; +} +#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ + + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return SCHED_POWER_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ + return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = sd->span_weight; + unsigned long smt_gain = sd->smt_gain; + + smt_gain /= weight; + + return smt_gain; +} + +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ + return default_scale_smt_power(sd, cpu); +} + +unsigned long 
scale_rt_power(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 total, available; + + total = sched_avg_period() + (rq->clock - rq->age_stamp); + + if (unlikely(total < rq->rt_avg)) { + /* Ensures that power won't end up being negative */ + available = 0; + } else { + available = total - rq->rt_avg; + } + + if (unlikely((s64)total < SCHED_POWER_SCALE)) + total = SCHED_POWER_SCALE; + + total >>= SCHED_POWER_SHIFT; + + return div_u64(available, total); +} + +static void update_cpu_power(struct sched_domain *sd, int cpu) +{ + unsigned long weight = sd->span_weight; + unsigned long power = SCHED_POWER_SCALE; + struct sched_group *sdg = sd->groups; + + if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { + if (sched_feat(ARCH_POWER)) + power *= arch_scale_smt_power(sd, cpu); + else + power *= default_scale_smt_power(sd, cpu); + + power >>= SCHED_POWER_SHIFT; + } + + sdg->sgp->power_orig = power; + + if (sched_feat(ARCH_POWER)) + power *= arch_scale_freq_power(sd, cpu); + else + power *= default_scale_freq_power(sd, cpu); + + power >>= SCHED_POWER_SHIFT; + + power *= scale_rt_power(cpu); + power >>= SCHED_POWER_SHIFT; + + if (!power) + power = 1; + + cpu_rq(cpu)->cpu_power = power; + sdg->sgp->power = power; +} + +static void update_group_power(struct sched_domain *sd, int cpu) +{ + struct sched_domain *child = sd->child; + struct sched_group *group, *sdg = sd->groups; + unsigned long power; + + if (!child) { + update_cpu_power(sd, cpu); + return; + } + + power = 0; + + group = child->groups; + do { + power += group->sgp->power; + group = group->next; + } while (group != child->groups); + + sdg->sgp->power = power; +} + +/* + * Try and fix up capacity for tiny siblings, this is needed when + * things like SD_ASYM_PACKING need f_b_g to select another sibling + * which on its own isn't powerful enough. + * + * See update_sd_pick_busiest() and check_asym_packing(). 
+ */ +static inline int +fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +{ + /* + * Only siblings can have significantly less than SCHED_POWER_SCALE + */ + if (!(sd->flags & SD_SHARE_CPUPOWER)) + return 0; + + /* + * If ~90% of the cpu_power is still there, we're good. + */ + if (group->sgp->power * 32 > group->sgp->power_orig * 29) + return 1; + + return 0; +} + +/** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @sd: The sched_domain whose statistics are to be updated. + * @group: sched_group whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @load_idx: Load index of sched_domain of this_cpu for load calc. + * @local_group: Does group contain this_cpu. + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sgs: variable to hold the statistics for this group. + */ +static inline void update_sg_lb_stats(struct sched_domain *sd, + struct sched_group *group, int this_cpu, + enum cpu_idle_type idle, int load_idx, + int local_group, const struct cpumask *cpus, + int *balance, struct sg_lb_stats *sgs) +{ + unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; + int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; + unsigned long avg_load_per_task = 0; + + if (local_group) + balance_cpu = group_first_cpu(group); + + /* Tally up the load of all CPUs in the group */ + max_cpu_load = 0; + min_cpu_load = ~0UL; + max_nr_running = 0; + + for_each_cpu_and(i, sched_group_cpus(group), cpus) { + struct rq *rq = cpu_rq(i); + + /* Bias balancing toward cpus of our domain */ + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + + load = target_load(i, load_idx); + } else { + load = source_load(i, load_idx); + if (load > max_cpu_load) { + max_cpu_load = load; + max_nr_running = rq->nr_running; + } + if (min_cpu_load > load) + 
min_cpu_load = load; + } + + sgs->group_load += load; + sgs->sum_nr_running += rq->nr_running; + sgs->sum_weighted_load += weighted_cpuload(i); + if (idle_cpu(i)) + sgs->idle_cpus++; + } + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (idle != CPU_NEWLY_IDLE && local_group) { + if (balance_cpu != this_cpu) { + *balance = 0; + return; + } + update_group_power(sd, this_cpu); + } + + /* Adjust by relative CPU power of the group */ + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; + + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of a task. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + if (sgs->sum_nr_running) + avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + + if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) + sgs->group_imb = 1; + + sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, + SCHED_POWER_SCALE); + if (!sgs->group_capacity) + sgs->group_capacity = fix_small_capacity(sd, group); + sgs->group_weight = group->group_weight; + + if (sgs->group_capacity > sgs->sum_nr_running) + sgs->group_has_capacity = 1; +} + +/** + * update_sd_pick_busiest - return 1 on busiest group + * @sd: sched_domain whose statistics are to be checked + * @sds: sched_domain statistics + * @sg: sched_group candidate to be checked for being the busiest + * @sgs: sched_group statistics + * @this_cpu: the current cpu + * + * Determine if @sg is a busier group than the previously selected + * busiest group. 
+ */ +static bool update_sd_pick_busiest(struct sched_domain *sd, + struct sd_lb_stats *sds, + struct sched_group *sg, + struct sg_lb_stats *sgs, + int this_cpu) +{ + /* Never busier unless its avg_load exceeds the current maximum. */ + if (sgs->avg_load <= sds->max_load) + return false; + + /* More running tasks than the group has capacity for. */ + if (sgs->sum_nr_running > sgs->group_capacity) + return true; + + /* The group was flagged as imbalanced. */ + if (sgs->group_imb) + return true; + + /* + * ASYM_PACKING needs to move all the work to the lowest + * numbered CPUs in the group, therefore mark all groups + * higher than ourself as busy. + */ + if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && + this_cpu < group_first_cpu(sg)) { + /* No busiest group has been picked so far. */ + if (!sds->busiest) + return true; + + /* Prefer the candidate whose first cpu is lower numbered. */ + if (group_first_cpu(sds->busiest) > group_first_cpu(sg)) + return true; + } + + return false; +} + +/** + * update_sd_lb_stats - Update sched_group's statistics for load balancing. + * @sd: sched_domain whose statistics are to be updated. + * @this_cpu: Cpu for which load balance is currently performed. + * @idle: Idle status of this_cpu + * @cpus: Set of cpus considered for load balancing. + * @balance: Should we balance. + * @sds: variable to hold the statistics for this sched_domain. 
+ */ +static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, + enum cpu_idle_type idle, const struct cpumask *cpus, + int *balance, struct sd_lb_stats *sds) +{ + struct sched_domain *child = sd->child; + struct sched_group *sg = sd->groups; + struct sg_lb_stats sgs; + int load_idx, prefer_sibling = 0; + + if (child && child->flags & SD_PREFER_SIBLING) + prefer_sibling = 1; + + init_sd_power_savings_stats(sd, sds, idle); + load_idx = get_sd_load_idx(sd, idle); + + /* Walk the circular list of groups, starting at sd->groups. */ + do { + int local_group; + + local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); + memset(&sgs, 0, sizeof(sgs)); + update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, + local_group, cpus, balance, &sgs); + + /* *balance was cleared for the local group: stop here. */ + if (local_group && !(*balance)) + return; + + sds->total_load += sgs.group_load; + sds->total_pwr += sg->sgp->power; + + /* + * In case the child domain prefers tasks go to siblings + * first, lower the sg capacity to one so that we'll try + * and move all the excess tasks away. We lower the capacity + * of a group only if the local group has the capacity to fit + * these excess tasks, i.e. nr_running < group_capacity. The + * extra check prevents the case where you always pull from the + * heaviest group when it is already under-utilized (possible + * with a large weight task that outweighs the tasks on the system). 
+ */ + if (prefer_sibling && !local_group && sds->this_has_capacity) + sgs.group_capacity = min(sgs.group_capacity, 1UL); + + /* + * Note: the *_load_per_task fields receive the summed weighted + * load here; the division by nr_running is done later by the + * imbalance calculation. + */ + if (local_group) { + sds->this_load = sgs.avg_load; + sds->this = sg; + sds->this_nr_running = sgs.sum_nr_running; + sds->this_load_per_task = sgs.sum_weighted_load; + sds->this_has_capacity = sgs.group_has_capacity; + sds->this_idle_cpus = sgs.idle_cpus; + } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { + sds->max_load = sgs.avg_load; + sds->busiest = sg; + sds->busiest_nr_running = sgs.sum_nr_running; + sds->busiest_idle_cpus = sgs.idle_cpus; + sds->busiest_group_capacity = sgs.group_capacity; + sds->busiest_load_per_task = sgs.sum_weighted_load; + sds->busiest_has_capacity = sgs.group_has_capacity; + sds->busiest_group_weight = sgs.group_weight; + sds->group_imb = sgs.group_imb; + } + + update_sd_power_savings_stats(sg, sds, local_group, &sgs); + sg = sg->next; + } while (sg != sd->groups); +} + +/* Weak default: the product is 0, so no SD_ASYM_PACKING bit is returned. */ +int __weak arch_sd_sibling_asym_packing(void) +{ + return 0*SD_ASYM_PACKING; +} + +/** + * check_asym_packing - Check to see if the group is packed into the + * sched domain. + * + * This is primarily intended to be used at the sibling level. Some + * cores like POWER7 prefer to use lower numbered SMT threads. In the + * case of POWER7, it can move to lower SMT modes only when higher + * threads are idle. When in lower SMT modes, the threads will + * perform better since they share less core resources. Hence when we + * have idle threads, we want them to be the higher ones. + * + * This packing function is run on idle threads. It checks to see if + * the busiest CPU in this domain (core in the P7 case) has a higher + * CPU number than the packing function is being run on. Here we are + * assuming lower CPU number will be equivalent to a lower SMT thread + * number. + * + * Returns 1 when packing is required and a task should be moved to + * this CPU. The amount of the imbalance is returned in *imbalance. 
+ * + * @sd: The sched_domain whose packing is to be checked. + * @sds: Statistics of the sched_domain which is to be packed + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: returns amount of imbalanced due to packing. + */ +static int check_asym_packing(struct sched_domain *sd, + struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ + int busiest_cpu; + + if (!(sd->flags & SD_ASYM_PACKING)) + return 0; + + if (!sds->busiest) + return 0; + + busiest_cpu = group_first_cpu(sds->busiest); + if (this_cpu > busiest_cpu) + return 0; + + *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, + SCHED_POWER_SCALE); + return 1; +} + +/** + * fix_small_imbalance - Calculate the minor imbalance that exists + * amongst the groups of a sched_domain, during + * load balancing. + * @sds: Statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: The cpu at whose sched_domain we're performing load-balance. + * @imbalance: Variable to store the imbalance. + */ +static inline void fix_small_imbalance(struct sd_lb_stats *sds, + int this_cpu, unsigned long *imbalance) +{ /* On return *imbalance holds sds->busiest_load_per_task or is left untouched; sds->this_load_per_task is overwritten with a per-task value. */ + unsigned long tmp, pwr_now = 0, pwr_move = 0; + unsigned int imbn = 2; /* threshold multiplier; drops to 1 when the busiest tasks are heavier than ours */ + unsigned long scaled_busy_load_per_task; + + if (sds->this_nr_running) { + sds->this_load_per_task /= sds->this_nr_running; + if (sds->busiest_load_per_task > + sds->this_load_per_task) + imbn = 1; + } else + sds->this_load_per_task = + cpu_avg_load_per_task(this_cpu); + /* Busiest per-task load expressed relative to the busiest group's power. */ + scaled_busy_load_per_task = sds->busiest_load_per_task + * SCHED_POWER_SCALE; + scaled_busy_load_per_task /= sds->busiest->sgp->power; + + if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= + (scaled_busy_load_per_task * imbn)) { + *imbalance = sds->busiest_load_per_task; + return; + } + + /* + * OK, we don't have enough imbalance to justify moving tasks, + * however we may be able to increase total CPU power used by + * moving them.
+ */ + /* pwr_now: power-weighted load estimate for both groups as things stand. */ + pwr_now += sds->busiest->sgp->power * + min(sds->busiest_load_per_task, sds->max_load); + pwr_now += sds->this->sgp->power * + min(sds->this_load_per_task, sds->this_load); + pwr_now /= SCHED_POWER_SCALE; + + /* Amount of load we'd subtract */ + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / + sds->busiest->sgp->power; + if (sds->max_load > tmp) + pwr_move += sds->busiest->sgp->power * + min(sds->busiest_load_per_task, sds->max_load - tmp); + + /* Amount of load we'd add */ + if (sds->max_load * sds->busiest->sgp->power < + sds->busiest_load_per_task * SCHED_POWER_SCALE) + tmp = (sds->max_load * sds->busiest->sgp->power) / + sds->this->sgp->power; + else + tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / + sds->this->sgp->power; + pwr_move += sds->this->sgp->power * + min(sds->this_load_per_task, sds->this_load + tmp); + pwr_move /= SCHED_POWER_SCALE; + + /* Move if we gain throughput */ + if (pwr_move > pwr_now) + *imbalance = sds->busiest_load_per_task; +} + +/** + * calculate_imbalance - Calculate the amount of imbalance present within the + * groups of a given sched_domain during load balance. + * @sds: statistics of the sched_domain whose imbalance is to be calculated. + * @this_cpu: Cpu for which currently load balance is being performed. + * @imbalance: The variable to store the imbalance. + * + * As a side effect sds->busiest_load_per_task is divided by + * sds->busiest_nr_running, so that count has to be non-zero on entry. + */ +static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, + unsigned long *imbalance) +{ + unsigned long max_pull, load_above_capacity = ~0UL; /* ~0UL: no capacity limit unless one is computed below */ + + sds->busiest_load_per_task /= sds->busiest_nr_running; + if (sds->group_imb) { + sds->busiest_load_per_task = + min(sds->busiest_load_per_task, sds->avg_load); + } + + /* + * In the presence of smp nice balancing, certain scenarios can have + * max load less than avg load(as we skip the groups at or below + * its cpu_power, while calculating max_load..)
+ */ + if (sds->max_load < sds->avg_load) { + *imbalance = 0; + return fix_small_imbalance(sds, this_cpu, imbalance); + } + + if (!sds->group_imb) { + /* + * Don't want to pull so many tasks that a group would go idle. + */ + load_above_capacity = (sds->busiest_nr_running - + sds->busiest_group_capacity); + + load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); + + load_above_capacity /= sds->busiest->sgp->power; + } + + /* + * We're trying to get all the cpus to the average_load, so we don't + * want to push ourselves above the average load, nor do we wish to + * reduce the max loaded cpu below the average load. At the same time, + * we also don't want to reduce the group load below the group capacity + * (so that we can implement power-savings policies etc). Thus we look + * for the minimum possible imbalance. + * Be careful of negative numbers as they'll appear as very large values + * with unsigned longs. + */ + max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); + + /* How much load to actually move to equalise the imbalance */ + *imbalance = min(max_pull * sds->busiest->sgp->power, + (sds->avg_load - sds->this_load) * sds->this->sgp->power) + / SCHED_POWER_SCALE; + + /* + * if *imbalance is less than the average load per runnable task + * there is no guarantee that any tasks will be moved so we'll have + * a think about bumping its value to force at least one task to be + * moved + */ + if (*imbalance < sds->busiest_load_per_task) + return fix_small_imbalance(sds, this_cpu, imbalance); + +} + +/******* find_busiest_group() helpers end here *********************/ + +/** + * find_busiest_group - Returns the busiest group within the sched_domain + * if there is an imbalance. If there isn't an imbalance, and + * the user has opted for power-savings, it returns a group whose + * CPUs can be put to idle by rebalancing those tasks elsewhere, if + * such a group exists.
+ * + * Also calculates the amount of weighted load which should be moved + * to restore balance. + * + * @sd: The sched_domain whose busiest group is to be returned. + * @this_cpu: The cpu for which load balancing is currently being performed. + * @imbalance: Variable which stores amount of weighted load which should + * be moved to restore balance/put a group to idle. + * @idle: The idle status of this_cpu. + * @cpus: The set of CPUs under consideration for load-balancing. + * @balance: Pointer to a variable indicating if this_cpu + * is the appropriate cpu to perform load balancing at this_level. + * + * Returns: - the busiest group if imbalance exists. + * - If no imbalance and user has opted for power-savings balance, + * return the least loaded group whose CPUs can be + * put to idle by rebalancing its tasks onto our group. + */ +static struct sched_group * +find_busiest_group(struct sched_domain *sd, int this_cpu, + unsigned long *imbalance, enum cpu_idle_type idle, + const struct cpumask *cpus, int *balance) +{ + struct sd_lb_stats sds; + + memset(&sds, 0, sizeof(sds)); + + /* + * Compute the various statistics relevant for load balancing at + * this level. + */ + update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); + + /* + * this_cpu is not the appropriate cpu to perform load balancing at + * this level. + */ + if (!(*balance)) + goto ret; + /* An idle cpu may pull for asymmetric packing alone; check_asym_packing() sets *imbalance when it returns 1. */ + if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && + check_asym_packing(sd, &sds, this_cpu, imbalance)) + return sds.busiest; + + /* There is no busy sibling group to pull tasks from */ + if (!sds.busiest || sds.busiest_nr_running == 0) + goto out_balanced; + /* Domain-wide average load, normalised by the total cpu power. */ + sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; + + /* + * If the busiest group is imbalanced the below checks don't + * work because they assume all things are equal, which typically + * isn't true due to cpus_allowed constraints and the like.
+ */ + if (sds.group_imb) + goto force_balance; + + /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ + if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && + !sds.busiest_has_capacity) + goto force_balance; + + /* + * If the local group is more busy than the selected busiest group + * don't try and pull any tasks. + */ + if (sds.this_load >= sds.max_load) + goto out_balanced; + + /* + * Don't pull any tasks if this group is already above the domain + * average load. + */ + if (sds.this_load >= sds.avg_load) + goto out_balanced; + + if (idle == CPU_IDLE) { + /* + * This cpu is idle. If the busiest group load doesn't + * have more tasks than the number of available cpu's and + * there is no imbalance between this and busiest group + * wrt to idle cpu's, it is balanced. + */ + if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && + sds.busiest_nr_running <= sds.busiest_group_weight) + goto out_balanced; + } else { + /* + * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use + * imbalance_pct to be conservative. + */ + if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) + goto out_balanced; + } + +force_balance: + /* Looks like there is an imbalance. Compute it */ + calculate_imbalance(&sds, this_cpu, imbalance); + return sds.busiest; + +out_balanced: + /* + * There is no obvious imbalance. But check if we can do some balancing + * to save power. + */ + if (check_power_save_busiest_group(&sds, this_cpu, imbalance)) + return sds.busiest; +ret: /* nothing to balance: report zero imbalance and no group */ + *imbalance = 0; + return NULL; +} + +/* + * find_busiest_queue - find the busiest runqueue among the cpus in group.
+ */ +static struct rq * +find_busiest_queue(struct sched_domain *sd, struct sched_group *group, + enum cpu_idle_type idle, unsigned long imbalance, + const struct cpumask *cpus) +{ + struct rq *busiest = NULL, *rq; + unsigned long max_load = 0; + int i; + + for_each_cpu(i, sched_group_cpus(group)) { + unsigned long power = power_of(i); + unsigned long capacity = DIV_ROUND_CLOSEST(power, + SCHED_POWER_SCALE); + unsigned long wl; + + if (!capacity) + capacity = fix_small_capacity(sd, group); + /* Ignore cpus that are not part of this balancing pass. */ + if (!cpumask_test_cpu(i, cpus)) + continue; + + rq = cpu_rq(i); + wl = weighted_cpuload(i); + + /* + * When comparing with imbalance, use weighted_cpuload() + * which is not scaled with the cpu power. + */ + if (capacity && rq->nr_running == 1 && wl > imbalance) + continue; + + /* + * For the load comparisons with the other cpu's, consider + * the weighted_cpuload() scaled with the cpu power, so that + * the load can be moved away from the cpu that is potentially + * running at a lower capacity. + */ + wl = (wl * SCHED_POWER_SCALE) / power; + + if (wl > max_load) { + max_load = wl; + busiest = rq; + } + } + + return busiest; +} + +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + +/* Working cpumask for load_balance and load_balance_newidle. */ +static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +/* + * need_active_balance - tell load_balance(), after it failed to move any + * task, whether it should set up active balancing on the busiest cpu. + * Returns non-zero when it should. + */ +static int need_active_balance(struct sched_domain *sd, int idle, + int busiest_cpu, int this_cpu) +{ + if (idle == CPU_NEWLY_IDLE) { + + /* + * ASYM_PACKING needs to force migrate tasks from busy but + * higher numbered CPUs in order to pack all tasks in the + * lowest numbered CPUs. + */ + if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) + return 1; + + /* + * The only task running in a non-idle cpu can be moved to this + * cpu in an attempt to completely freeup the other CPU + * package.
+ * + * The package power saving logic comes from + * find_busiest_group(). If there is no imbalance, then + * f_b_g() will return NULL. However when sched_mc={1,2} then + * f_b_g() will select a group from which a running task may be + * pulled to this cpu in order to make the other package idle. + * If there is no opportunity to make a package idle and if + * there is no imbalance, then f_b_g() will return NULL and no + * action will be taken in load_balance_newidle(). + * + * Under normal task pull operation due to imbalance, there + * will be more than one task in the source run queue and + * move_tasks() will succeed. ld_moved will be true and this + * active balance code will not be triggered. + */ + if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP) + return 0; + } + + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); +} + +static int active_load_balance_cpu_stop(void *data); + +/* + * Check this_cpu to ensure it is balanced within domain. Attempt to move + * tasks if there is an imbalance. + * + * Returns the value returned by move_tasks() when tasks were moved, and 0 + * when nothing was moved, the domain was found balanced, or *balance came + * back 0 from find_busiest_group(). + */ +static int load_balance(int this_cpu, struct rq *this_rq, + struct sched_domain *sd, enum cpu_idle_type idle, + int *balance) +{ + int ld_moved, all_pinned = 0, active_balance = 0; + struct sched_group *group; + unsigned long imbalance; + struct rq *busiest; + unsigned long flags; + struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + + cpumask_copy(cpus, cpu_active_mask); + + schedstat_inc(sd, lb_count[idle]); + +redo: + group = find_busiest_group(sd, this_cpu, &imbalance, idle, + cpus, balance); + + if (*balance == 0) + goto out_balanced; + + if (!group) { + schedstat_inc(sd, lb_nobusyg[idle]); + goto out_balanced; + } + + busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); + if (!busiest) { + schedstat_inc(sd, lb_nobusyq[idle]); + goto out_balanced; + } + + BUG_ON(busiest == this_rq); + + schedstat_add(sd, lb_imbalance[idle], imbalance); + + ld_moved = 0; + if (busiest->nr_running > 1) { + /* + * Attempt to move tasks.
If find_busiest_group has found + * an imbalance but busiest->nr_running <= 1, the group is + * still unbalanced. ld_moved simply stays zero, so it is + * correctly treated as an imbalance. + */ + all_pinned = 1; + local_irq_save(flags); + double_rq_lock(this_rq, busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, idle, &all_pinned); + double_rq_unlock(this_rq, busiest); + local_irq_restore(flags); + + /* + * some other cpu did the load balance for us. + */ + if (ld_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) { + cpumask_clear_cpu(cpu_of(busiest), cpus); + if (!cpumask_empty(cpus)) + goto redo; + goto out_balanced; + } + } + + if (!ld_moved) { + schedstat_inc(sd, lb_failed[idle]); + /* + * Increment the failure counter only on periodic balance. + * We do not want newidle balance, which can be very + * frequent, pollute the failure counter causing + * excessive cache_hot migrations and active balances. + */ + if (idle != CPU_NEWLY_IDLE) + sd->nr_balance_failed++; + + if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { + raw_spin_lock_irqsave(&busiest->lock, flags); + + /* don't kick the active_load_balance_cpu_stop, + * if the curr task on busiest cpu can't be + * moved to this_cpu + */ + if (!cpumask_test_cpu(this_cpu, + &busiest->curr->cpus_allowed)) { + raw_spin_unlock_irqrestore(&busiest->lock, + flags); + all_pinned = 1; + goto out_one_pinned; + } + + /* + * ->active_balance synchronizes accesses to + * ->active_balance_work. Once set, it's cleared + * only after active load balance is finished.
+ */ + if (!busiest->active_balance) { + busiest->active_balance = 1; + busiest->push_cpu = this_cpu; + active_balance = 1; + } + raw_spin_unlock_irqrestore(&busiest->lock, flags); + + if (active_balance) + stop_one_cpu_nowait(cpu_of(busiest), + active_load_balance_cpu_stop, busiest, + &busiest->active_balance_work); + + /* + * We've kicked active balancing, reset the failure + * counter. + */ + sd->nr_balance_failed = sd->cache_nice_tries+1; + } + } else + sd->nr_balance_failed = 0; + + if (likely(!active_balance)) { + /* We were unbalanced, so reset the balancing interval */ + sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; + } + + goto out; + +out_balanced: + schedstat_inc(sd, lb_balanced[idle]); + + sd->nr_balance_failed = 0; + +out_one_pinned: + /* tune up the balancing interval */ + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) + sd->balance_interval *= 2; + + ld_moved = 0; +out: + return ld_moved; +} + +/* + * idle_balance is called by schedule() if this_cpu is about to become + * idle. Attempts to pull tasks from other CPUs. + * + * The domain walk runs with this_rq->lock dropped; the lock is re-taken + * before returning. + */ +static void idle_balance(int this_cpu, struct rq *this_rq) +{ + struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + HZ; + + this_rq->idle_stamp = this_rq->clock; + + if (this_rq->avg_idle < sysctl_sched_migration_cost) + return; + + /* + * Drop the rq->lock, but keep IRQ/preempt disabled.
+ */ + raw_spin_unlock(&this_rq->lock); + + update_shares(this_cpu); + rcu_read_lock(); + for_each_domain(this_cpu, sd) { + unsigned long interval; + int balance = 1; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (sd->flags & SD_BALANCE_NEWIDLE) { + /* If we've pulled tasks over stop searching: */ + pulled_task = load_balance(this_cpu, this_rq, + sd, CPU_NEWLY_IDLE, &balance); + } + + interval = msecs_to_jiffies(sd->balance_interval); + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + if (pulled_task) { + this_rq->idle_stamp = 0; + break; + } + } + rcu_read_unlock(); + + raw_spin_lock(&this_rq->lock); + + if (pulled_task || time_after(jiffies, this_rq->next_balance)) { + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; + } +} + +/* + * active_load_balance_cpu_stop is run by cpu stopper. It pushes + * running tasks off the busiest CPU onto idle CPUs. It requires at + * least 1 task to be running on each physical CPU where possible, and + * avoids physical / logical imbalances. + * + * @data is the busiest runqueue (struct rq *) handed over by load_balance(). + * Clears ->active_balance on that runqueue and always returns 0. + */ +static int active_load_balance_cpu_stop(void *data) +{ + struct rq *busiest_rq = data; + int busiest_cpu = cpu_of(busiest_rq); + int target_cpu = busiest_rq->push_cpu; + struct rq *target_rq = cpu_rq(target_cpu); + struct sched_domain *sd; + + raw_spin_lock_irq(&busiest_rq->lock); + + /* make sure the requested cpu hasn't gone down in the meantime */ + if (unlikely(busiest_cpu != smp_processor_id() || + !busiest_rq->active_balance)) + goto out_unlock; + + /* Is there any task to move? */ + if (busiest_rq->nr_running <= 1) + goto out_unlock; + + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup.
+ */ + BUG_ON(busiest_rq == target_rq); + + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); + + /* Search for an sd spanning us and the target CPU. */ + rcu_read_lock(); + for_each_domain(target_cpu, sd) { + if ((sd->flags & SD_LOAD_BALANCE) && + cpumask_test_cpu(busiest_cpu, sched_domain_span(sd))) + break; + } + + if (likely(sd)) { + schedstat_inc(sd, alb_count); + + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); + } + rcu_read_unlock(); + double_unlock_balance(busiest_rq, target_rq); +out_unlock: + busiest_rq->active_balance = 0; + raw_spin_unlock_irq(&busiest_rq->lock); + return 0; +} + +#ifdef CONFIG_NO_HZ + +static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb); + /* Callback stored in csd->func below: raises SCHED_SOFTIRQ; @data is unused. */ +static void trigger_sched_softirq(void *data) +{ + raise_softirq_irqoff(SCHED_SOFTIRQ); +} + /* Reset @csd so that it invokes trigger_sched_softirq() with a NULL argument. */ +static inline void init_sched_softirq_csd(struct call_single_data *csd) +{ + csd->func = trigger_sched_softirq; + csd->info = NULL; + csd->flags = 0; + csd->priv = 0; +} + +/* + * idle load balancing details + * - One of the idle CPUs nominates itself as idle load_balancer, while + * entering idle. + * - This idle load balancer CPU will also go into tickless mode when + * it is idle, just like all other idle CPUs + * - When one of the busy CPUs notice that there may be an idle rebalancing + * needed, they will kick the idle load balancer, which then does idle + * load balancing for all the idle CPUs.
+ */ +static struct { + atomic_t load_balancer; /* cpu acting as idle load balancer; values >= nr_cpu_ids mean none */ + atomic_t first_pick_cpu; /* busy cpu that kicks the ilb once it runs more than one task */ + atomic_t second_pick_cpu; /* second busy cpu; kicks as soon as it has any load */ + cpumask_var_t idle_cpus_mask; /* cpus added by select_nohz_load_balancer() when called with stop_tick set */ + cpumask_var_t grp_idle_mask; /* scratch mask filled by is_semi_idle_group() */ + unsigned long next_balance; /* in jiffy units */ +} nohz ____cacheline_aligned; + /* Return the cpu id currently stored in nohz.load_balancer. */ +int get_nohz_load_balancer(void) +{ + return atomic_read(&nohz.load_balancer); +} + +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not.
+ */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ /* nohz.grp_idle_mask is overwritten with the group's idle cpus and left set for the caller. */ + cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has at least one busy cpu + * and at least one idle cpu. + */ + if (cpumask_empty(nohz.grp_idle_mask)) + return 0; + + if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + int ilb = nr_cpu_ids; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU.
Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.idle_cpus_mask) < 2) + goto out_done; + + rcu_read_lock(); + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) { + ilb = cpumask_first(nohz.grp_idle_mask); /* mask was just filled by is_semi_idle_group() */ + goto unlock; + } + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } +unlock: + rcu_read_unlock(); + +out_done: + return ilb; +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return nr_cpu_ids; +} +#endif + +/* + * Kick a CPU to do the nohz balancing, if it is time for it. We pick the + * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle + * CPU (if there is one). + */ +static void nohz_balancer_kick(int cpu) +{ + int ilb_cpu; + + nohz.next_balance++; + + ilb_cpu = get_nohz_load_balancer(); + /* No nominated balancer: fall back to the first idle cpu, if any. */ + if (ilb_cpu >= nr_cpu_ids) { + ilb_cpu = cpumask_first(nohz.idle_cpus_mask); + if (ilb_cpu >= nr_cpu_ids) + return; + } + /* Skip the cross-call when a kick is already pending on that cpu. */ + if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { + struct call_single_data *cp; + + cpu_rq(ilb_cpu)->nohz_balance_kick = 1; + cp = &per_cpu(remote_sched_softirq_cb, cpu); + __smp_call_function_single(ilb_cpu, cp, 0); + } + return; +} + +/* + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. + * + * When the ilb owner becomes busy, we will not have new ilb owner until some + * idle CPU wakes up and goes back to idle or some busy CPU tries to kick + * idle load balancing by kicking one of the idle CPUs. + * + * Ticks are stopped for the ilb owner as well, with busy CPU kicking this + * ilb owner CPU in future (when there is a need for idle load balancing on + * behalf of all idle CPUs).
+ */ +void select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + /* stop_tick != 0: add this cpu to nohz.idle_cpus_mask and try to own the ilb role; otherwise remove it and give the role up. */ + if (stop_tick) { + if (!cpu_active(cpu)) { + if (atomic_read(&nohz.load_balancer) != cpu) + return; + + /* + * If we are going offline and still the leader, + * give up! + */ + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) + BUG(); + + return; + } + + cpumask_set_cpu(cpu, nohz.idle_cpus_mask); + /* Release whichever busy-cpu pick slot this cpu still holds. */ + if (atomic_read(&nohz.first_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); + if (atomic_read(&nohz.second_pick_cpu) == cpu) + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + + if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { + int new_ilb; + + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, + cpu) != nr_cpu_ids) + return; + + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, nr_cpu_ids); + resched_cpu(new_ilb); + return; + } + return; + } + } else { + if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask)) + return; + + cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, + nr_cpu_ids) != cpu) + BUG(); + } + return; +} +#endif + /* Taken with spin_trylock() in rebalance_domains() for domains flagged SD_SERIALIZE. */ +static DEFINE_SPINLOCK(balancing); + +static unsigned long __read_mostly max_load_balance_interval = HZ/10; /* in jiffies; upper clamp for the per-domain balance interval */ + +/* + * Scale the max load_balance interval with the number of CPUs in the system. + * This trades load-balance latency on larger machines for less cross talk. + */ +static void update_max_interval(void) +{ + max_load_balance_interval = HZ*num_online_cpus()/10; +} + +/* + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains.
+ */ +static void rebalance_domains(int cpu, enum cpu_idle_type idle) +{ + int balance = 1; + struct rq *rq = cpu_rq(cpu); + unsigned long interval; + struct sched_domain *sd; + /* Earliest time when we have to do rebalance again */ + unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; + int need_serialize; + + update_shares(cpu); + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + interval = sd->balance_interval; + if (idle != CPU_IDLE) + interval *= sd->busy_factor; /* balance less often when this cpu is not idle */ + + /* scale ms to jiffies */ + interval = msecs_to_jiffies(interval); + interval = clamp(interval, 1UL, max_load_balance_interval); + + need_serialize = sd->flags & SD_SERIALIZE; + /* If the balancing lock is contended, skip this domain but still account its next_balance. */ + if (need_serialize) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { + /* + * We've pulled tasks over, so we're no + * longer idle. + */ + idle = CPU_NOT_IDLE; + } + sd->last_balance = jiffies; + } + if (need_serialize) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) { + next_balance = sd->last_balance + interval; + update_next_balance = 1; + } + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; + } + rcu_read_unlock(); + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; +} + +#ifdef CONFIG_NO_HZ +/* + * In CONFIG_NO_HZ case, the idle balance kickee will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped.
+ */ +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) +{ + struct rq *this_rq = cpu_rq(this_cpu); + struct rq *rq; + int balance_cpu; + + if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) + return; + + for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { + if (balance_cpu == this_cpu) + continue; + + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) { + this_rq->nohz_balance_kick = 0; + break; + } + + raw_spin_lock_irq(&this_rq->lock); + update_rq_clock(this_rq); + update_cpu_load(this_rq); + raw_spin_unlock_irq(&this_rq->lock); + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(this_rq->next_balance, rq->next_balance)) + this_rq->next_balance = rq->next_balance; + } + nohz.next_balance = this_rq->next_balance; + this_rq->nohz_balance_kick = 0; +} + +/* + * Current heuristic for kicking the idle load balancer + * - first_pick_cpu is the one of the busy CPUs. It will kick + * idle load balancer when it has more than one process active. This + * eliminates the need for idle load balancing altogether when we have + * only one running process in the system (common case). + * - If there are more than one busy CPU, idle load balancer may have + * to run for active_load_balance to happen (i.e., two busy CPUs are + * SMT or core siblings and can run better if they move to different + * physical CPUs). So, second_pick_cpu is the second of the busy CPUs + * which will kick idle load balancer as soon as it has any load. 
+ */ +static inline int nohz_kick_needed(struct rq *rq, int cpu) +{ + unsigned long now = jiffies; + int ret; + int first_pick_cpu, second_pick_cpu; + /* Too early for the next nohz balance. */ + if (time_before(now, nohz.next_balance)) + return 0; + /* A cpu that was idle at the tick does not kick. */ + if (rq->idle_at_tick) + return 0; + + first_pick_cpu = atomic_read(&nohz.first_pick_cpu); + second_pick_cpu = atomic_read(&nohz.second_pick_cpu); + /* Both pick slots belong to other cpus; leave the kicking to them. */ + if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && + second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu) + return 0; + /* Claim (or confirm) the first slot, otherwise try the second one. */ + ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); + if (rq->nr_running > 1) + return 1; + } else { + ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); + if (ret == nr_cpu_ids || ret == cpu) { + if (rq->nr_running) + return 1; + } + } + return 0; +} +#else +static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } +#endif + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balance_kick set). + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(); + struct rq *this_rq = cpu_rq(this_cpu); + enum cpu_idle_type idle = this_rq->idle_at_tick ? + CPU_IDLE : CPU_NOT_IDLE; + + rebalance_domains(this_cpu, idle); + + /* + * If this cpu has a pending nohz_balance_kick, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + nohz_idle_balance(this_cpu, idle); +} + /* Non-zero when no sched_domain is attached to @cpu's runqueue. */ +static inline int on_null_domain(int cpu) +{ + return !rcu_dereference_sched(cpu_rq(cpu)->sd); +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
+ */ +static inline void trigger_load_balance(struct rq *rq, int cpu) +{ + /* Don't need to rebalance while attached to NULL domain */ + if (time_after_eq(jiffies, rq->next_balance) && + likely(!on_null_domain(cpu))) + raise_softirq(SCHED_SOFTIRQ); +#ifdef CONFIG_NO_HZ + else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) + nohz_balancer_kick(cpu); +#endif +} + /* rq_online hook of the fair class: only calls update_sysctl(); @rq is unused. */ +static void rq_online_fair(struct rq *rq) +{ + update_sysctl(); +} + /* Counterpart of rq_online_fair(): also only calls update_sysctl(). */ +static void rq_offline_fair(struct rq *rq) +{ + update_sysctl(); +} + +#else /* CONFIG_SMP */ + +/* + * on UP we do not need to balance between CPUs: + */ +static inline void idle_balance(int cpu, struct rq *rq) +{ +} + +#endif /* CONFIG_SMP */ + +/* + * scheduler tick hitting a task of our scheduling class: + */ +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) +{ + struct cfs_rq *cfs_rq; + struct sched_entity *se = &curr->se; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + entity_tick(cfs_rq, se, queued); + } +} + +/* + * called on fork with the child task as argument from the parent's context + * - child not yet on the tasklist + * - preemption disabled + */ +static void task_fork_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(current); + struct sched_entity *se = &p->se, *curr = cfs_rq->curr; + int this_cpu = smp_processor_id(); + struct rq *rq = this_rq(); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + + update_rq_clock(rq); + + if (unlikely(task_cpu(p) != this_cpu)) { + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); + } + + update_curr(cfs_rq); + /* Start the child from the vruntime of the cfs_rq's current entity before place_entity() adjusts it. */ + if (curr) + se->vruntime = curr->vruntime; + place_entity(cfs_rq, se, 1); + + if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { + /* + * Upon rescheduling, sched_class::put_prev_task() will place + * 'current' within the tree based on its new key value.
+ */ + swap(curr->vruntime, se->vruntime); + resched_task(rq->curr); + } + /* Leave vruntime relative to min_vruntime; presumably re-based when the child is enqueued (TODO confirm). */ + se->vruntime -= cfs_rq->min_vruntime; + + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + +/* + * Priority of the task has changed. Check to see if we preempt + * the current task. + */ +static void +prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) +{ + if (!p->se.on_rq) + return; + + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (rq->curr == p) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p, 0); +} + /* Called when @p leaves the fair class: normalises the vruntime of a sleeping, dequeued task. */ +static void switched_from_fair(struct rq *rq, struct task_struct *p) +{ + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + /* + * Ensure the task's vruntime is normalized, so that when its + * switched back to the fair class the enqueue_entity(.flags=0) will + * do the right thing. + * + * If it was on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it was !on_rq, then only when + * the task is sleeping will it still have non-normalized vruntime. + */ + if (!se->on_rq && p->state != TASK_RUNNING) { + /* + * Fix up our vruntime so that the current sleep doesn't + * cause 'unlimited' sleep bonus. + */ + place_entity(cfs_rq, se, 0); + se->vruntime -= cfs_rq->min_vruntime; + } +} + +/* + * We switched to the sched_fair class. + */ +static void switched_to_fair(struct rq *rq, struct task_struct *p) +{ + if (!p->se.on_rq) + return; + + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (rq->curr == p) + resched_task(rq->curr); + else + check_preempt_curr(rq, p, 0); +} + +/* Account for a task changing its policy or group.
+ * + * This routine is mostly called to set cfs_rq->curr field when a task + * migrates between groups/classes. + */ +static void set_curr_task_fair(struct rq *rq) +{ + struct sched_entity *se = &rq->curr->se; + + for_each_sched_entity(se) + set_next_entity(cfs_rq_of(se), se); +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void task_move_group_fair(struct task_struct *p, int on_rq) +{ + /* + * If the task was not on the rq at the time of this cgroup movement + * it must have been asleep, sleeping tasks keep their ->vruntime + * absolute on their old rq until wakeup (needed for the fair sleeper + * bonus in place_entity()). + * + * If it was on the rq, we've just 'preempted' it, which does convert + * ->vruntime to a relative base. + * + * Make sure both cases convert their relative position when migrating + * to another cgroup's rq. This does somewhat interfere with the + * fair sleeper stuff for the first placement, but who cares. + * + * NOTE(review): cfs_rq_of() is evaluated again after set_task_rq(), so + * the value added back is presumably the min_vruntime of the + * destination queue -- confirm against set_task_rq(). + */ + if (!on_rq) + p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; + set_task_rq(p, task_cpu(p)); + if (!on_rq) + p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; +} +#endif +/* + * Round-robin interval reported for @task: its sched_slice() on rq->cfs + * converted with NS_TO_JIFFIES(), or 0 when rq->cfs carries no load weight. + */ +static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) +{ + struct sched_entity *se = &task->se; + unsigned int rr_interval = 0; + + /* + * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise + * idle runqueue: + */ + if (rq->cfs.load.weight) + rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); + + return rr_interval; +} + +/* + * All the scheduling class methods: + */ +static const struct sched_class fair_sched_class = { + .next = &idle_sched_class, + .enqueue_task = enqueue_task_fair, + .dequeue_task = dequeue_task_fair, + .yield_task = yield_task_fair, + .yield_to_task = yield_to_task_fair, + + .check_preempt_curr = check_preempt_wakeup, + + .pick_next_task = pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, + + .rq_online = rq_online_fair, + 
.rq_offline = rq_offline_fair, + + .task_waking = task_waking_fair, +#endif + + .set_curr_task = set_curr_task_fair, + .task_tick = task_tick_fair, + .task_fork = task_fork_fair, + + .prio_changed = prio_changed_fair, + .switched_from = switched_from_fair, + .switched_to = switched_to_fair, + + .get_rr_interval = get_rr_interval_fair, + +#ifdef CONFIG_FAIR_GROUP_SCHED + .task_move_group = task_move_group_fair, +#endif +}; +/* Debug only: print every leaf cfs_rq of @cpu into @m, under rcu_read_lock(). */ +#ifdef CONFIG_SCHED_DEBUG +static void print_cfs_stats(struct seq_file *m, int cpu) +{ + struct cfs_rq *cfs_rq; + + rcu_read_lock(); + for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) + print_cfs_rq(m, cpu, cfs_rq); + rcu_read_unlock(); +} +#endif diff --git a/kernel/sched_features.h b/kernel/sched_features.h new file mode 100644 index 00000000..1e7066d7 --- /dev/null +++ b/kernel/sched_features.h @@ -0,0 +1,74 @@ +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */ +SCHED_FEAT(START_DEBIT, 1) + +/* + * Should wakeups try to preempt running tasks? + */ +SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Based on load and program behaviour, see if it makes sense to place + * a newly woken task on the same cpu as the task that woke it -- + * improve cache locality. Typically used with SYNC wakeups as + * generated by pipes and the like, see also SYNC_WAKEUPS. + */ +SCHED_FEAT(AFFINE_WAKEUPS, 1) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since it's likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, 0) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality.
+ */ +SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Consider buddies to be cache hot, decreases the likelihood of a + * cache buddy being migrated away, increases cache locality. + */ +SCHED_FEAT(CACHE_HOT_BUDDY, 1) + +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, 0) +/* NOTE(review): HRTICK, DOUBLE_TICK and LB_BIAS are not described here; see their use sites. */ +SCHED_FEAT(HRTICK, 0) +SCHED_FEAT(DOUBLE_TICK, 0) +SCHED_FEAT(LB_BIAS, 1) + +/* + * Spin-wait on mutex acquisition when the mutex owner is running on + * another cpu -- assumes that when the owner is running, it will soon + * release the lock. Decreases scheduling overhead. + */ +SCHED_FEAT(OWNER_SPIN, 1) + +/* + * Decrement CPU power based on irq activity + */ +SCHED_FEAT(NONIRQ_POWER, 1) + +/* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ +SCHED_FEAT(TTWU_QUEUE, 1) +/* NOTE(review): FORCE_SD_OVERLAP is not described here; see its use site. */ +SCHED_FEAT(FORCE_SD_OVERLAP, 0) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c new file mode 100644 index 00000000..0a518825 --- /dev/null +++ b/kernel/sched_idletask.c @@ -0,0 +1,97 @@ +/* + * idle-task scheduling class.
+ * + * (NOTE: these are not related to SCHED_IDLE tasks which are + * handled in sched_fair.c) + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) +{ + return task_cpu(p); /* IDLE tasks are never migrated */ +} +#endif /* CONFIG_SMP */ +/* + * Idle tasks are unconditionally rescheduled: + */ +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +{ + resched_task(rq->idle); +} +/* Always hands back rq->idle, after bumping the sched_goidle schedstat and calling calc_load_account_idle(). */ +static struct task_struct *pick_next_task_idle(struct rq *rq) +{ + schedstat_inc(rq, sched_goidle); + calc_load_account_idle(rq); + return rq->idle; +} + +/* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it. + * + * rq->lock is dropped around the printk()/dump_stack() pair and + * retaken before returning. + */ +static void +dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) +{ + raw_spin_unlock_irq(&rq->lock); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_lock_irq(&rq->lock); +} +/* The put_prev_task, task_tick and set_curr_task hooks of the idle class have empty bodies. */ +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_idle(struct rq *rq) +{ +} +/* Both switched_to and prio_changed hit BUG() for the idle class. */ +static void switched_to_idle(struct rq *rq, struct task_struct *p) +{ + BUG(); +} + +static void +prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) +{ + BUG(); +} +/* The idle class reports a round-robin interval of 0. */ +static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +/* + * Simple, special scheduling class for the per-CPU idle tasks: + */ +static const struct sched_class idle_sched_class = { + /* .next is NULL */ + /* no enqueue/yield_task for idle tasks */ + + /* dequeue is not valid, we print a debug message there: */ + .dequeue_task = dequeue_task_idle, + + .check_preempt_curr = check_preempt_curr_idle, + + .pick_next_task = pick_next_task_idle, + .put_prev_task = put_prev_task_idle, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_idle, +#endif + 
.set_curr_task = set_curr_task_idle, + .task_tick = task_tick_idle, + + .get_rr_interval = get_rr_interval_idle, + + .prio_changed = prio_changed_idle, + .switched_to = switched_to_idle, +}; diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c new file mode 100644 index 00000000..ac79f9e3 --- /dev/null +++ b/kernel/sched_rt.c @@ -0,0 +1,1859 @@ +/* + * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR + * policies) + */ + +#ifdef CONFIG_RT_GROUP_SCHED +/* With group scheduling, an rt entity is a task iff it owns no queue of its own (->my_q is NULL). */ +#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) +/* Map an rt entity back to the task_struct that embeds it as ->rt. */ +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(!rt_entity_is_task(rt_se)); +#endif + return container_of(rt_se, struct task_struct, rt); +} + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ + return rt_rq->rq; +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + return rt_se->rt_rq; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +/* Without group scheduling every rt entity is a task and the only rt_rq is the one embedded in struct rq. */ +#define rt_entity_is_task(rt_se) (1) + +static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) +{ + return container_of(rt_se, struct task_struct, rt); +} + +static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) +{ + return container_of(rt_rq, struct rq, rt); +} + +static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) +{ + struct task_struct *p = rt_task_of(rt_se); + struct rq *rq = task_rq(p); + + return &rq->rt; +} + +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SMP +/* Current value of the rto_count counter in the root domain of @rq. */ +static inline int rt_overloaded(struct rq *rq) +{ + return atomic_read(&rq->rd->rto_count); +} +/* Flag @rq in rd->rto_mask and bump rd->rto_count; does nothing while @rq is offline. */ +static inline void rt_set_overload(struct rq *rq) +{ + if (!rq->online) + return; + + cpumask_set_cpu(rq->cpu, rq->rd->rto_mask); + /* + * Make sure the mask is visible before we set + * the overload count. That is checked to determine + * if we should look at the mask. It would be a shame + * if we looked at the mask, but the mask was not + * updated yet.
+ */ + wmb(); + atomic_inc(&rq->rd->rto_count); +} + +static inline void rt_clear_overload(struct rq *rq) +{ + if (!rq->online) + return; + + /* the order here really doesn't matter */ + atomic_dec(&rq->rd->rto_count); + cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); +} +/* + * Keep ->overloaded in sync: the rt_rq counts as overloaded while it has + * at least one migratory task and more than one task in total. + */ +static void update_rt_migration(struct rt_rq *rt_rq) +{ + if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { + if (!rt_rq->overloaded) { + rt_set_overload(rq_of_rt_rq(rt_rq)); + rt_rq->overloaded = 1; + } + } else if (rt_rq->overloaded) { + rt_clear_overload(rq_of_rt_rq(rt_rq)); + rt_rq->overloaded = 0; + } +} +/* Account a newly queued task entity in the migration counters; group entities are ignored. */ +static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (!rt_entity_is_task(rt_se)) + return; + /* The counters live on the rt_rq embedded in the runqueue, whatever rt_rq was passed in. */ + rt_rq = &rq_of_rt_rq(rt_rq)->rt; + + rt_rq->rt_nr_total++; + if (rt_se->nr_cpus_allowed > 1) + rt_rq->rt_nr_migratory++; + + update_rt_migration(rt_rq); +} +/* Mirror of inc_rt_migration() for a task entity that leaves the queue. */ +static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (!rt_entity_is_task(rt_se)) + return; + + rt_rq = &rq_of_rt_rq(rt_rq)->rt; + + rt_rq->rt_nr_total--; + if (rt_se->nr_cpus_allowed > 1) + rt_rq->rt_nr_migratory--; + + update_rt_migration(rt_rq); +} +/* (Re)insert @p into the pushable list: delete, re-init the node with p->prio, add again. */ +static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); + plist_node_init(&p->pushable_tasks, p->prio); + plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ + plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); +} + +static inline int has_pushable_tasks(struct rq *rq) +{ + return !plist_head_empty(&rq->rt.pushable_tasks); +} + +#else +/* This #else branch provides empty stubs for the pushable-task and migration helpers. */ +static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) +{ +} + +static inline +void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + +static inline +void dec_rt_migration(struct 
sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ +} + +#endif /* CONFIG_SMP */ +/* An rt entity is queued iff its run_list node is linked into a list. */ +static inline int on_rt_rq(struct sched_rt_entity *rt_se) +{ + return !list_empty(&rt_se->run_list); +} + +#ifdef CONFIG_RT_GROUP_SCHED +/* Runtime budget of @rt_rq; an rt_rq without a task group reports RUNTIME_INF. */ +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +{ + if (!rt_rq->tg) + return RUNTIME_INF; + + return rt_rq->rt_runtime; +} +/* Period of the task group bandwidth, in nanoseconds. */ +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period); +} + +typedef struct task_group *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \ + (&iter->list != &task_groups) && \ + (rt_rq = iter->rt_rq[cpu_of(rq)]); \ + iter = list_entry_rcu(iter->list.next, typeof(*iter), list)) + +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_add_rcu(&rt_rq->leaf_rt_rq_list, + &rq_of_rt_rq(rt_rq)->leaf_rt_rq_list); +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ + list_del_rcu(&rt_rq->leaf_rt_rq_list); +} + +#define for_each_leaf_rt_rq(rt_rq, rq) \ + list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) +/* Walk from an entity upwards through its ->parent chain. */ +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = rt_se->parent) +/* The queue owned by a group entity (->my_q). */ +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return rt_se->my_q; +} + +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +/* + * If @rt_rq has runnable entities: queue the entity that represents it on + * this CPU (when there is one and it is not queued yet) and reschedule the + * running task when @rt_rq holds a numerically lower ->prio than it. + */ +static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; + struct sched_rt_entity *rt_se; + + int cpu = cpu_of(rq_of_rt_rq(rt_rq)); + + rt_se = rt_rq->tg->rt_se[cpu]; + + if (rt_rq->rt_nr_running) { + if (rt_se && !on_rt_rq(rt_se)) + enqueue_rt_entity(rt_se, false); + if (rt_rq->highest_prio.curr < curr->prio) + resched_task(curr); + } +} +/* Take the entity that represents @rt_rq on this CPU off its parent queue, if it is queued. */ +static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ + struct sched_rt_entity *rt_se; + int cpu = 
cpu_of(rq_of_rt_rq(rt_rq)); + + rt_se = rt_rq->tg->rt_se[cpu]; + + if (rt_se && on_rt_rq(rt_se)) + dequeue_rt_entity(rt_se); +} +/* A group rt_rq only counts as throttled while it has no boosted entities. */ +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted; +} +/* + * A group entity is boosted when its queue has boosted members; a task is + * boosted when its ->prio differs from its ->normal_prio. + */ +static int rt_se_boosted(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = group_rt_rq(rt_se); + struct task_struct *p; + + if (rt_rq) + return !!rt_rq->rt_nr_boosted; + + p = rt_task_of(rt_se); + return p->prio != p->normal_prio; +} +/* CPUs visited by the period timer: the root-domain span of this CPU on SMP, cpu_online_mask otherwise. */ +#ifdef CONFIG_SMP +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} +#endif +/* The rt_rq of @cpu in the task group that embeds @rt_b. */ +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu]; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &rt_rq->tg->rt_bandwidth; +} + +#else /* !CONFIG_RT_GROUP_SCHED */ +/* Single-queue variants: bandwidth comes from def_rt_bandwidth and the only rt_rq is rq->rt. */ +static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +{ + return rt_rq->rt_runtime; +} + +static inline u64 sched_rt_period(struct rt_rq *rt_rq) +{ + return ktime_to_ns(def_rt_bandwidth.rt_period); +} + +typedef struct rt_rq *rt_rq_iter_t; + +#define for_each_rt_rq(rt_rq, iter, rq) \ + for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) + +static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +static inline void list_del_leaf_rt_rq(struct rt_rq *rt_rq) +{ +} + +#define for_each_leaf_rt_rq(rt_rq, rq) \ + for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) +/* Visits the given entity exactly once. */ +#define for_each_sched_rt_entity(rt_se) \ + for (; rt_se; rt_se = NULL) + +static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +{ + return NULL; +} + +static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +{ + if (rt_rq->rt_nr_running) + resched_task(rq_of_rt_rq(rt_rq)->curr); +} + +static inline void 
sched_rt_rq_dequeue(struct rt_rq *rt_rq) +{ +} + +static inline int rt_rq_throttled(struct rt_rq *rt_rq) +{ + return rt_rq->rt_throttled; +} + +static inline const struct cpumask *sched_rt_period_mask(void) +{ + return cpu_online_mask; +} + +static inline +struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +{ + return &cpu_rq(cpu)->rt; +} + +static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +{ + return &def_rt_bandwidth; +} + +#endif /* CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_SMP +/* + * We ran out of runtime, see if we can borrow some from our neighbours. + * + * Walks the root-domain span of the current CPU under + * rt_b->rt_runtime_lock, taking each neighbour's rt_runtime_lock in turn. + * Returns non-zero when at least one neighbour had spare runtime. + */ +static int do_balance_runtime(struct rt_rq *rt_rq) +{ + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + struct root_domain *rd = cpu_rq(smp_processor_id())->rd; + int i, weight, more = 0; + u64 rt_period; + + weight = cpumask_weight(rd->span); + + raw_spin_lock(&rt_b->rt_runtime_lock); + rt_period = ktime_to_ns(rt_b->rt_period); + for_each_cpu(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + if (iter == rt_rq) + continue; + + raw_spin_lock(&iter->rt_runtime_lock); + /* + * Either all rqs have inf runtime and there's nothing to steal + * or __disable_runtime() below sets a specific rq to inf to + * indicate it's been disabled and disallow stealing. + */ + if (iter->rt_runtime == RUNTIME_INF) + goto next; + + /* + * From runqueues with spare time, take 1/n part of their + * spare time, but no more than our period.
+ */ + diff = iter->rt_runtime - iter->rt_time; + if (diff > 0) { + diff = div_u64((u64)diff, weight); + if (rt_rq->rt_runtime + diff > rt_period) + diff = rt_period - rt_rq->rt_runtime; + iter->rt_runtime -= diff; + rt_rq->rt_runtime += diff; + more = 1; + if (rt_rq->rt_runtime == rt_period) { + raw_spin_unlock(&iter->rt_runtime_lock); + break; + } + } +next: + raw_spin_unlock(&iter->rt_runtime_lock); + } + raw_spin_unlock(&rt_b->rt_runtime_lock); + + return more; +} + +/* + * Ensure this RQ takes back all the runtime it lent to its neighbours. + */ +static void __disable_runtime(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + rt_rq_iter_t iter; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_rt_rq(rt_rq, iter, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + s64 want; + int i; + + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); + /* + * Either we're all inf and nobody needs to borrow, or we're + * already disabled and thus have nothing to do, or we have + * exactly the right amount of runtime to take out. + */ + if (rt_rq->rt_runtime == RUNTIME_INF || + rt_rq->rt_runtime == rt_b->rt_runtime) + goto balanced; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + + /* + * Calculate the difference between what we started out with + * and what we currently have, that's the amount of runtime + * we lent and now have to reclaim. + */ + want = rt_b->rt_runtime - rt_rq->rt_runtime; + + /* + * Greedy reclaim, take back as much as we can. + */ + for_each_cpu(i, rd->span) { + struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); + s64 diff; + + /* + * Can't reclaim from ourselves or disabled runqueues.
+ */ + if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF) + continue; + /* want > 0: take back at most what this neighbour holds; otherwise the else branch subtracts the (non-positive) want and zeroes it. */ + raw_spin_lock(&iter->rt_runtime_lock); + if (want > 0) { + diff = min_t(s64, iter->rt_runtime, want); + iter->rt_runtime -= diff; + want -= diff; + } else { + iter->rt_runtime -= want; + want -= want; + } + raw_spin_unlock(&iter->rt_runtime_lock); + + if (!want) + break; + } + + raw_spin_lock(&rt_rq->rt_runtime_lock); + /* + * We cannot be left wanting - that would mean some runtime + * leaked out of the system. + */ + BUG_ON(want); +balanced: + /* + * Disable all the borrow logic by pretending we have inf + * runtime - in which case borrowing doesn't make sense. + */ + rt_rq->rt_runtime = RUNTIME_INF; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); + } +} +/* Locked wrapper: runs __disable_runtime() with rq->lock held via raw_spin_lock_irqsave(). */ +static void disable_runtime(struct rq *rq) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + __disable_runtime(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} +/* Undo __disable_runtime(): give every rt_rq of @rq its base runtime back and clear its usage and throttle state. */ +static void __enable_runtime(struct rq *rq) +{ + rt_rq_iter_t iter; + struct rt_rq *rt_rq; + + if (unlikely(!scheduler_running)) + return; + + /* + * Reset each runqueue's bandwidth settings + */ + for_each_rt_rq(rt_rq, iter, rq) { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); + + raw_spin_lock(&rt_b->rt_runtime_lock); + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_runtime = rt_b->rt_runtime; + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + raw_spin_unlock(&rt_b->rt_runtime_lock); + } +} +/* Locked wrapper: runs __enable_runtime() with rq->lock held via raw_spin_lock_irqsave(). */ +static void enable_runtime(struct rq *rq) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + __enable_runtime(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); +} +/* + * Try to borrow runtime when rt_time exceeds rt_runtime. The unlock/lock + * pair implies the caller holds rt_rq->rt_runtime_lock; it is dropped + * around do_balance_runtime() and retaken. Returns the result of + * do_balance_runtime(), or 0 when the budget is not exceeded. + */ +static int balance_runtime(struct rt_rq *rt_rq) +{ + int more = 0; + + if (rt_rq->rt_time > rt_rq->rt_runtime) { + raw_spin_unlock(&rt_rq->rt_runtime_lock); + more = do_balance_runtime(rt_rq); + raw_spin_lock(&rt_rq->rt_runtime_lock); + } + + return more; +} +#else /* 
!CONFIG_SMP */ +static inline int balance_runtime(struct rt_rq *rt_rq) +{ + return 0; +} +#endif /* CONFIG_SMP */ +/* + * Period-timer work for @rt_b. For each CPU in sched_rt_period_mask(), + * under rq->lock: refund up to @overrun periods of runtime to the rt_rq, + * clear its throttle once rt_time is back under the runtime, and enqueue + * it again where needed. + * + * Returns 1 when bandwidth control is off, the budget is RUNTIME_INF, or + * no visited rt_rq has pending rt_time or runnable entities; 0 otherwise. + * NOTE(review): the caller presumably uses this to decide whether the + * timer may stop -- confirm at the call site. + */ +static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) +{ + int i, idle = 1; + const struct cpumask *span; + + if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) + return 1; + + span = sched_rt_period_mask(); + for_each_cpu(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); + struct rq *rq = rq_of_rt_rq(rt_rq); + + raw_spin_lock(&rq->lock); + if (rt_rq->rt_time) { + u64 runtime; + + raw_spin_lock(&rt_rq->rt_runtime_lock); + if (rt_rq->rt_throttled) + balance_runtime(rt_rq); + runtime = rt_rq->rt_runtime; + rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); + if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { + rt_rq->rt_throttled = 0; + enqueue = 1; + + /* + * Force a clock update if the CPU was idle, + * lest wakeup -> unthrottle time accumulate. + */ + if (rt_rq->rt_nr_running && rq->curr == rq->idle) + rq->skip_clock_update = -1; + } + if (rt_rq->rt_time || rt_rq->rt_nr_running) + idle = 0; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } else if (rt_rq->rt_nr_running) { + idle = 0; + if (!rt_rq_throttled(rt_rq)) + enqueue = 1; + } + + if (enqueue) + sched_rt_rq_enqueue(rt_rq); + raw_spin_unlock(&rq->lock); + } + + return idle; +} +/* Priority of an rt entity: a group entity reports highest_prio.curr of its queue, a task its own ->prio. */ +static inline int rt_se_prio(struct sched_rt_entity *rt_se) +{ +#ifdef CONFIG_RT_GROUP_SCHED + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq) + return rt_rq->highest_prio.curr; +#endif + + return rt_task_of(rt_se)->prio; +} +/* Non-zero when @rt_rq has used up its runtime and is (now) throttled; may try to borrow runtime first. */ +static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) +{ + u64 runtime = sched_rt_runtime(rt_rq); + + if (rt_rq->rt_throttled) + return rt_rq_throttled(rt_rq); + + if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) + return 0; + + balance_runtime(rt_rq); + runtime = sched_rt_runtime(rt_rq); + if (runtime == RUNTIME_INF) + return 0; + + if (rt_rq->rt_time > runtime) { + rt_rq->rt_throttled = 1; + if 
(rt_rq_throttled(rt_rq)) { + sched_rt_rq_dequeue(rt_rq); + return 1; + } + } + + return 0; +} + +/* + * Update the current task's runtime statistics. Skip current tasks that + * are not in our scheduling class. + * + * When bandwidth control is enabled the elapsed time is also added to + * rt_time of every rt_rq on the entity chain that has a finite runtime, + * and the task is rescheduled when sched_rt_runtime_exceeded() says so. + */ +static void update_curr_rt(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + struct sched_rt_entity *rt_se = &curr->rt; + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + u64 delta_exec; + + if (curr->sched_class != &rt_sched_class) + return; + + delta_exec = rq->clock_task - curr->se.exec_start; + if (unlikely((s64)delta_exec < 0)) + delta_exec = 0; + + schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); + + curr->se.sum_exec_runtime += delta_exec; + account_group_exec_runtime(curr, delta_exec); + + curr->se.exec_start = rq->clock_task; + cpuacct_charge(curr, delta_exec); + + sched_rt_avg_update(rq, delta_exec); + + if (!rt_bandwidth_enabled()) + return; + + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + + if (sched_rt_runtime(rt_rq) != RUNTIME_INF) { + raw_spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } + } +} + +#if defined CONFIG_SMP + +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu); +/* ->prio of the task found by pick_next_highest_task_rt(), or MAX_RT_PRIO when it finds none with an RT prio. */ +static inline int next_prio(struct rq *rq) +{ + struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu); + + if (next && rt_prio(next->prio)) + return next->prio; + else + return MAX_RT_PRIO; +} +/* A task of priority @prio was queued: update highest_prio.next and, for a new top priority, cpupri. */ +static void +inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (prio < prev_prio) { + + /* + * If the new task is higher in priority than anything on the + * run-queue, we know that the previous high becomes our + * next-highest.
+ */ + rt_rq->highest_prio.next = prev_prio; + + if (rq->online) + cpupri_set(&rq->rd->cpupri, rq->cpu, prio); + + } else if (prio == rt_rq->highest_prio.curr) + /* + * If the next task is equal in priority to the highest on + * the run-queue, then we implicitly know that the next highest + * task cannot be any lower than current + */ + rt_rq->highest_prio.next = prio; + else if (prio < rt_rq->highest_prio.next) + /* + * Otherwise, we need to recompute next-highest + */ + rt_rq->highest_prio.next = next_prio(rq); +} +/* + * A task of priority @prio left @rt_rq: recompute highest_prio.next when + * needed, and publish the new top priority through cpupri_set() when it + * changed and the runqueue is online. + */ +static void +dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) +{ + struct rq *rq = rq_of_rt_rq(rt_rq); + + if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next)) + rt_rq->highest_prio.next = next_prio(rq); + + if (rq->online && rt_rq->highest_prio.curr != prev_prio) + cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); +} + +#else /* !CONFIG_SMP */ + +static inline +void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} +static inline +void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {} + +#endif /* CONFIG_SMP */ +/* highest_prio.curr caches the numerically lowest ->prio queued on the rt_rq (MAX_RT_PRIO when it is empty). */ +#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +static void +inc_rt_prio(struct rt_rq *rt_rq, int prio) +{ + int prev_prio = rt_rq->highest_prio.curr; + + if (prio < prev_prio) + rt_rq->highest_prio.curr = prio; + + inc_rt_prio_smp(rt_rq, prio, prev_prio); +} +/* Called after rt_nr_running was decremented: refresh highest_prio.curr for the entity that left. */ +static void +dec_rt_prio(struct rt_rq *rt_rq, int prio) +{ + int prev_prio = rt_rq->highest_prio.curr; + + if (rt_rq->rt_nr_running) { + + WARN_ON(prio < prev_prio); + + /* + * This may have been our highest task, and therefore + * we may have some recomputation to do + */ + if (prio == prev_prio) { + struct rt_prio_array *array = &rt_rq->active; + + rt_rq->highest_prio.curr = + sched_find_first_bit(array->bitmap); + } + + } else + rt_rq->highest_prio.curr = MAX_RT_PRIO; + + dec_rt_prio_smp(rt_rq, prio, prev_prio); +} + +#else + +static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {} +static 
inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {} + +#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */ + +#ifdef CONFIG_RT_GROUP_SCHED +/* Count a boosted entity on @rt_rq and, for an rt_rq that has a task group, call start_rt_bandwidth() on it. */ +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted++; + + if (rt_rq->tg) + start_rt_bandwidth(&rt_rq->tg->rt_bandwidth); +} +/* Drop the boosted count again; warns when an empty rt_rq still has boosted entities accounted. */ +static void +dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + if (rt_se_boosted(rt_se)) + rt_rq->rt_nr_boosted--; + + WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted); +} + +#else /* !CONFIG_RT_GROUP_SCHED */ + +static void +inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + start_rt_bandwidth(&def_rt_bandwidth); +} + +static inline +void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {} + +#endif /* CONFIG_RT_GROUP_SCHED */ +/* Account one more entity on @rt_rq: bump rt_nr_running, then update priority, migration and group bookkeeping. */ +static inline +void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + int prio = rt_se_prio(rt_se); + + WARN_ON(!rt_prio(prio)); + rt_rq->rt_nr_running++; + + inc_rt_prio(rt_rq, prio); + inc_rt_migration(rt_se, rt_rq); + inc_rt_group(rt_se, rt_rq); +} +/* Mirror of inc_rt_tasks(): rt_nr_running is decremented before the priority, migration and group updates. */ +static inline +void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) +{ + WARN_ON(!rt_prio(rt_se_prio(rt_se))); + WARN_ON(!rt_rq->rt_nr_running); + rt_rq->rt_nr_running--; + + dec_rt_prio(rt_rq, rt_se_prio(rt_se)); + dec_rt_migration(rt_se, rt_rq); + dec_rt_group(rt_se, rt_rq); +} +/* Link one entity into the priority array of its rt_rq, at the head or the tail of its priority queue. */ +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +{ + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + struct rt_prio_array *array = &rt_rq->active; + struct rt_rq *group_rq = group_rt_rq(rt_se); + struct list_head *queue = array->queue + rt_se_prio(rt_se); + + /* + * Don't enqueue the group if its throttled, or when empty. + * The latter is a consequence of the former when a child group + * get throttled and the current group doesn't have any other + * active members.
+ */ + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + return; + + if (!rt_rq->rt_nr_running) + list_add_leaf_rt_rq(rt_rq); + + if (head) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + __set_bit(rt_se_prio(rt_se), array->bitmap); + + inc_rt_tasks(rt_se, rt_rq); +} +/* Unlink one entity from its priority queue, clearing the bitmap bit when that queue becomes empty. */ +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ + struct rt_rq *rt_rq = rt_rq_of_se(rt_se); + struct rt_prio_array *array = &rt_rq->active; + + list_del_init(&rt_se->run_list); + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + dec_rt_tasks(rt_se, rt_rq); + if (!rt_rq->rt_nr_running) + list_del_leaf_rt_rq(rt_rq); +} + +/* + * Because the prio of an upper entry depends on the lower + * entries, we must remove entries top - down. + * + * The first loop walks upwards and records the reverse path in ->back; + * the second loop follows ->back from the topmost entity downwards and + * dequeues every entity that is currently queued. + */ +static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +{ + struct sched_rt_entity *back = NULL; + + for_each_sched_rt_entity(rt_se) { + rt_se->back = back; + back = rt_se; + } + + for (rt_se = back; rt_se; rt_se = rt_se->back) { + if (on_rt_rq(rt_se)) + __dequeue_rt_entity(rt_se); + } +} +/* Tear the whole entity stack down, then enqueue every level again from @rt_se upwards. */ +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +{ + dequeue_rt_stack(rt_se); + for_each_sched_rt_entity(rt_se) + __enqueue_rt_entity(rt_se, head); +} +/* Tear the stack down, then put back only the group entities whose queues still have runnable members. */ +static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +{ + dequeue_rt_stack(rt_se); + + for_each_sched_rt_entity(rt_se) { + struct rt_rq *rt_rq = group_rt_rq(rt_se); + + if (rt_rq && rt_rq->rt_nr_running) + __enqueue_rt_entity(rt_se, false); + } +} + +/* + * Adding/removing a task to/from a priority array: + */ +static void +enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_rt_entity *rt_se = &p->rt; + + if (flags & ENQUEUE_WAKEUP) + rt_se->timeout = 0; + + enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + /* Only a task that is not current on @rq and may run on more than one CPU goes on the pushable list. */ + if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); +} + +static void 
dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) +{ + struct sched_rt_entity *rt_se = &p->rt; + + update_curr_rt(rq); + dequeue_rt_entity(rt_se); + + dequeue_pushable_task(rq, p); +} + +/* + * Move a queued entity to the tail of its priority list (or to the head + * when @head is set) without the overhead of dequeue followed by enqueue. + * Entities that are not queued are left alone. + */ +static void +requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) +{ + if (on_rt_rq(rt_se)) { + struct rt_prio_array *array = &rt_rq->active; + struct list_head *queue = array->queue + rt_se_prio(rt_se); + + if (head) + list_move(&rt_se->run_list, queue); + else + list_move_tail(&rt_se->run_list, queue); + } +} +/* Requeue @p and every entity above it, each on its own rt_rq. */ +static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head) +{ + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq; + + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + requeue_rt_entity(rt_rq, rt_se, head); + } +} +/* Yield: requeue the running task at the tail (head == 0). */ +static void yield_task_rt(struct rq *rq) +{ + requeue_task_rt(rq, rq->curr, 0); +} + +#ifdef CONFIG_SMP +static int find_lowest_rq(struct task_struct *task); +/* + * Pick a CPU for @p. Anything but a SD_BALANCE_WAKE request gets the + * current CPU; a wakeup starts from task_cpu(p) and may be redirected to + * the CPU returned by find_lowest_rq(). + */ +static int +select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) +{ + struct task_struct *curr; + struct rq *rq; + int cpu; + + if (sd_flag != SD_BALANCE_WAKE) + return smp_processor_id(); + + cpu = task_cpu(p); + rq = cpu_rq(cpu); + + rcu_read_lock(); + curr = ACCESS_ONCE(rq->curr); /* unlocked access */ + + /* + * If the current task on @p's runqueue is an RT task, then + * try to see if we can wake this RT task up on another + * runqueue. Otherwise simply start this RT task + * on its current runqueue. + * + * We want to avoid overloading runqueues. If the woken + * task is a higher priority, then it will stay on this CPU + * and the lower prio task should be moved to another CPU. + * Even though this will probably make the lower prio task + * lose its cache, we do not want to bounce a higher task + * around just because it gave up its CPU, perhaps for a + * lock?
+ * + * For equal prio tasks, we just let the scheduler sort it out. + * + * Otherwise, just let it ride on the affined RQ and the + * post-schedule router will push the preempted task away + * + * This test is optimistic, if we get it wrong the load-balancer + * will have to sort it out. + */ + if (curr && unlikely(rt_task(curr)) && + (curr->rt.nr_cpus_allowed < 2 || + curr->prio <= p->prio) && + (p->rt.nr_cpus_allowed > 1)) { + int target = find_lowest_rq(p); + + if (target != -1) + cpu = target; + } + rcu_read_unlock(); + + return cpu; +} +/* + * @p has the same priority as rq->curr. Do nothing when current is + * limited to one CPU, when @p can migrate and cpupri_find() finds a + * target for it, or when cpupri_find() finds no target for current. + */ +static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) +{ + if (rq->curr->rt.nr_cpus_allowed == 1) + return; + + if (p->rt.nr_cpus_allowed != 1 + && cpupri_find(&rq->rd->cpupri, p, NULL)) + return; + + if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL)) + return; + + /* + * There appear to be other cpus that can accept + * current and none to run 'p', so let's reschedule + * to try and push current away: + */ + requeue_task_rt(rq, p, 1); + resched_task(rq->curr); +} + +#endif /* CONFIG_SMP */ + +/* + * Preempt the current task with a newly woken task if needed: + */ +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +{ + if (p->prio < rq->curr->prio) { + resched_task(rq->curr); + return; + } + +#ifdef CONFIG_SMP + /* + * If: + * + * - the newly woken task is of equal priority to the current task + * - the newly woken task is non-migratable while current is migratable + * - current will be preempted on the next reschedule + * + * we should check to see if current can readily move to a different + * cpu. If so, we will reschedule to allow the push logic to try + * to move current somewhere else, making room for our non-migratable + * task.
+ */ + if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr)) + check_preempt_equal_prio(rq, p); +#endif +} + +static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, + struct rt_rq *rt_rq) +{ + struct rt_prio_array *array = &rt_rq->active; + struct sched_rt_entity *next = NULL; + struct list_head *queue; + int idx; + + idx = sched_find_first_bit(array->bitmap); + BUG_ON(idx >= MAX_RT_PRIO); + + queue = array->queue + idx; + next = list_entry(queue->next, struct sched_rt_entity, run_list); + + return next; +} + +static struct task_struct *_pick_next_task_rt(struct rq *rq) +{ + struct sched_rt_entity *rt_se; + struct task_struct *p; + struct rt_rq *rt_rq; + + rt_rq = &rq->rt; + + if (unlikely(!rt_rq->rt_nr_running)) + return NULL; + + if (rt_rq_throttled(rt_rq)) + return NULL; + + do { + rt_se = pick_next_rt_entity(rq, rt_rq); + BUG_ON(!rt_se); + rt_rq = group_rt_rq(rt_se); + } while (rt_rq); + + p = rt_task_of(rt_se); + p->se.exec_start = rq->clock_task; + + return p; +} + +static struct task_struct *pick_next_task_rt(struct rq *rq) +{ + struct task_struct *p = _pick_next_task_rt(rq); + + /* The running task is never eligible for pushing */ + if (p) + dequeue_pushable_task(rq, p); + +#ifdef CONFIG_SMP + /* + * We detect this state here so that we can avoid taking the RQ + * lock again later if there is no need to push + */ + rq->post_schedule = has_pushable_tasks(rq); +#endif + + return p; +} + +static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +{ + update_curr_rt(rq); + p->se.exec_start = 0; + + /* + * The previous task needs to be made eligible for pushing + * if it is still active + */ + if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + enqueue_pushable_task(rq, p); +} + +#ifdef CONFIG_SMP + +/* Only try algorithms three times */ +#define RT_MAX_TRIES 3 + +static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); + +static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) +{ + if 
(!task_running(rq, p) && + (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && + (p->rt.nr_cpus_allowed > 1)) + return 1; + return 0; +} + +/* Return the second highest RT task, NULL otherwise */ +static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) +{ + struct task_struct *next = NULL; + struct sched_rt_entity *rt_se; + struct rt_prio_array *array; + struct rt_rq *rt_rq; + int idx; + + for_each_leaf_rt_rq(rt_rq, rq) { + array = &rt_rq->active; + idx = sched_find_first_bit(array->bitmap); +next_idx: + if (idx >= MAX_RT_PRIO) + continue; + if (next && next->prio < idx) + continue; + list_for_each_entry(rt_se, array->queue + idx, run_list) { + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); + if (pick_rt_task(rq, p, cpu)) { + next = p; + break; + } + } + if (!next) { + idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); + goto next_idx; + } + } + + return next; +} + +static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); + +static int find_lowest_rq(struct task_struct *task) +{ + struct sched_domain *sd; + struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); + int this_cpu = smp_processor_id(); + int cpu = task_cpu(task); + + /* Make sure the mask is initialized first */ + if (unlikely(!lowest_mask)) + return -1; + + if (task->rt.nr_cpus_allowed == 1) + return -1; /* No other targets possible */ + + if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) + return -1; /* No targets found */ + + /* + * At this point we have built a mask of cpus representing the + * lowest priority tasks in the system. Now we want to elect + * the best one based on our affinity and topology. + * + * We prioritize the last cpu that the task executed on since + * it is most likely cache-hot in that location. 
+ */ + if (cpumask_test_cpu(cpu, lowest_mask)) + return cpu; + + /* + * Otherwise, we consult the sched_domains span maps to figure + * out which cpu is logically closest to our hot cache data. + */ + if (!cpumask_test_cpu(this_cpu, lowest_mask)) + this_cpu = -1; /* Skip this_cpu opt if not among lowest */ + + rcu_read_lock(); + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; + + /* + * "this_cpu" is cheaper to preempt than a + * remote processor. + */ + if (this_cpu != -1 && + cpumask_test_cpu(this_cpu, sched_domain_span(sd))) { + rcu_read_unlock(); + return this_cpu; + } + + best_cpu = cpumask_first_and(lowest_mask, + sched_domain_span(sd)); + if (best_cpu < nr_cpu_ids) { + rcu_read_unlock(); + return best_cpu; + } + } + } + rcu_read_unlock(); + + /* + * And finally, if there were no matches within the domains + * just give the caller *something* to work with from the compatible + * locations. + */ + if (this_cpu != -1) + return this_cpu; + + cpu = cpumask_any(lowest_mask); + if (cpu < nr_cpu_ids) + return cpu; + return -1; +} + +/* Will lock the rq it finds */ +static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) +{ + struct rq *lowest_rq = NULL; + int tries; + int cpu; + + for (tries = 0; tries < RT_MAX_TRIES; tries++) { + cpu = find_lowest_rq(task); + + if ((cpu == -1) || (cpu == rq->cpu)) + break; + + lowest_rq = cpu_rq(cpu); + + /* if the prio of this runqueue changed, try again */ + if (double_lock_balance(rq, lowest_rq)) { + /* + * We had to unlock the run queue. In + * the mean time, task could have + * migrated already or had its affinity changed. + * Also make sure that it wasn't scheduled on its rq. + */ + if (unlikely(task_rq(task) != rq || + !cpumask_test_cpu(lowest_rq->cpu, + &task->cpus_allowed) || + task_running(rq, task) || + !task->on_rq)) { + + raw_spin_unlock(&lowest_rq->lock); + lowest_rq = NULL; + break; + } + } + + /* If this rq is still suitable use it. 
*/ + if (lowest_rq->rt.highest_prio.curr > task->prio) + break; + + /* try again */ + double_unlock_balance(rq, lowest_rq); + lowest_rq = NULL; + } + + return lowest_rq; +} + +static struct task_struct *pick_next_pushable_task(struct rq *rq) +{ + struct task_struct *p; + + if (!has_pushable_tasks(rq)) + return NULL; + + p = plist_first_entry(&rq->rt.pushable_tasks, + struct task_struct, pushable_tasks); + + BUG_ON(rq->cpu != task_cpu(p)); + BUG_ON(task_current(rq, p)); + BUG_ON(p->rt.nr_cpus_allowed <= 1); + + BUG_ON(!p->on_rq); + BUG_ON(!rt_task(p)); + + return p; +} + +/* + * If the current CPU has more than one RT task, see if the non + * running task can migrate over to a CPU that is running a task + * of lesser priority. + */ +static int push_rt_task(struct rq *rq) +{ + struct task_struct *next_task; + struct rq *lowest_rq; + + if (!rq->rt.overloaded) + return 0; + + next_task = pick_next_pushable_task(rq); + if (!next_task) + return 0; + +#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW + if (unlikely(task_running(rq, next_task))) + return 0; +#endif + +retry: + if (unlikely(next_task == rq->curr)) { + WARN_ON(1); + return 0; + } + + /* + * It's possible that the next_task slipped in of + * higher priority than current. If that's the case + * just reschedule current. + */ + if (unlikely(next_task->prio < rq->curr->prio)) { + resched_task(rq->curr); + return 0; + } + + /* We might release rq lock */ + get_task_struct(next_task); + + /* find_lock_lowest_rq locks the rq if found */ + lowest_rq = find_lock_lowest_rq(next_task, rq); + if (!lowest_rq) { + struct task_struct *task; + /* + * find lock_lowest_rq releases rq->lock + * so it is possible that next_task has migrated. + * + * We need to make sure that the task is still on the same + * run-queue and is also still the next task eligible for + * pushing. 
+ */ + task = pick_next_pushable_task(rq); + if (task_cpu(next_task) == rq->cpu && task == next_task) { + /* + * If we get here, the task hasn't moved at all, but + * it has failed to push. We will not try again, + * since the other cpus will pull from us when they + * are ready. + */ + dequeue_pushable_task(rq, next_task); + goto out; + } + + if (!task) + /* No more tasks, just exit */ + goto out; + + /* + * Something has shifted, try again. + */ + put_task_struct(next_task); + next_task = task; + goto retry; + } + + deactivate_task(rq, next_task, 0); + set_task_cpu(next_task, lowest_rq->cpu); + activate_task(lowest_rq, next_task, 0); + + resched_task(lowest_rq->curr); + + double_unlock_balance(rq, lowest_rq); + +out: + put_task_struct(next_task); + + return 1; +} + +static void push_rt_tasks(struct rq *rq) +{ + /* push_rt_task will return true if it moved an RT */ + while (push_rt_task(rq)) + ; +} + +static int pull_rt_task(struct rq *this_rq) +{ + int this_cpu = this_rq->cpu, ret = 0, cpu; + struct task_struct *p; + struct rq *src_rq; + + if (likely(!rt_overloaded(this_rq))) + return 0; + + for_each_cpu(cpu, this_rq->rd->rto_mask) { + if (this_cpu == cpu) + continue; + + src_rq = cpu_rq(cpu); + + /* + * Don't bother taking the src_rq->lock if the next highest + * task is known to be lower-priority than our current task. + * This may look racy, but if this value is about to go + * logically higher, the src_rq will push this task away. + * And if its going logically lower, we do not care + */ + if (src_rq->rt.highest_prio.next >= + this_rq->rt.highest_prio.curr) + continue; + + /* + * We can potentially drop this_rq's lock in + * double_lock_balance, and another CPU could + * alter this_rq + */ + double_lock_balance(this_rq, src_rq); + + /* + * Are there still pullable RT tasks? + */ + if (src_rq->rt.rt_nr_running <= 1) + goto skip; + + p = pick_next_highest_task_rt(src_rq, this_cpu); + + /* + * Do we have an RT task that preempts + * the to-be-scheduled task? 
+ */ + if (p && (p->prio < this_rq->rt.highest_prio.curr)) { + WARN_ON(p == src_rq->curr); + WARN_ON(!p->on_rq); + + /* + * There's a chance that p is higher in priority + * than what's currently running on its cpu. + * This is just that p is wakeing up and hasn't + * had a chance to schedule. We only pull + * p if it is lower in priority than the + * current task on the run queue + */ + if (p->prio < src_rq->curr->prio) + goto skip; + + ret = 1; + + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + /* + * We continue with the search, just in + * case there's an even higher prio task + * in another runqueue. (low likelihood + * but possible) + */ + } +skip: + double_unlock_balance(this_rq, src_rq); + } + + return ret; +} + +static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) +{ + /* Try to pull RT tasks here if we lower this rq's prio */ + if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio) + pull_rt_task(rq); +} + +static void post_schedule_rt(struct rq *rq) +{ + push_rt_tasks(rq); +} + +/* + * If we are not running and we are not going to reschedule soon, we should + * try to push tasks away now + */ +static void task_woken_rt(struct rq *rq, struct task_struct *p) +{ + if (!task_running(rq, p) && + !test_tsk_need_resched(rq->curr) && + has_pushable_tasks(rq) && + p->rt.nr_cpus_allowed > 1 && + rt_task(rq->curr) && + (rq->curr->rt.nr_cpus_allowed < 2 || + rq->curr->prio <= p->prio)) + push_rt_tasks(rq); +} + +static void set_cpus_allowed_rt(struct task_struct *p, + const struct cpumask *new_mask) +{ + int weight = cpumask_weight(new_mask); + + BUG_ON(!rt_task(p)); + + /* + * Update the migration status of the RQ if we have an RT task + * which is running AND changing its weight value. 
+ */ + if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) { + struct rq *rq = task_rq(p); + + if (!task_current(rq, p)) { + /* + * Make sure we dequeue this task from the pushable list + * before going further. It will either remain off of + * the list because we are no longer pushable, or it + * will be requeued. + */ + if (p->rt.nr_cpus_allowed > 1) + dequeue_pushable_task(rq, p); + + /* + * Requeue if our weight is changing and still > 1 + */ + if (weight > 1) + enqueue_pushable_task(rq, p); + + } + + if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { + rq->rt.rt_nr_migratory++; + } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { + BUG_ON(!rq->rt.rt_nr_migratory); + rq->rt.rt_nr_migratory--; + } + + update_rt_migration(&rq->rt); + } + + cpumask_copy(&p->cpus_allowed, new_mask); + p->rt.nr_cpus_allowed = weight; +} + +/* Assumes rq->lock is held */ +static void rq_online_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_set_overload(rq); + + __enable_runtime(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); +} + +/* Assumes rq->lock is held */ +static void rq_offline_rt(struct rq *rq) +{ + if (rq->rt.overloaded) + rt_clear_overload(rq); + + __disable_runtime(rq); + + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); +} + +/* + * When switch from the rt queue, we bring ourselves to a position + * that we might want to pull RT tasks from other runqueues. + */ +static void switched_from_rt(struct rq *rq, struct task_struct *p) +{ + /* + * If there are other RT tasks then we will reschedule + * and the scheduling of the other RT tasks will handle + * the balancing. But if we are the last RT task + * we may need to handle the pulling of RT tasks + * now. 
+ */ + if (p->on_rq && !rq->rt.rt_nr_running) + pull_rt_task(rq); +} + +static inline void init_sched_rt_class(void) +{ + unsigned int i; + + for_each_possible_cpu(i) + zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), + GFP_KERNEL, cpu_to_node(i)); +} +#endif /* CONFIG_SMP */ + +/* + * When switching a task to RT, we may overload the runqueue + * with RT tasks. In this case we try to push them off to + * other runqueues. + */ +static void switched_to_rt(struct rq *rq, struct task_struct *p) +{ + int check_resched = 1; + + /* + * If we are already running, then there's nothing + * that needs to be done. But if we are not running + * we may need to preempt the current running task. + * If that current running task is also an RT task + * then see if we can move to another run queue. + */ + if (p->on_rq && rq->curr != p) { +#ifdef CONFIG_SMP + if (rq->rt.overloaded && push_rt_task(rq) && + /* Don't resched if we changed runqueues */ + rq != task_rq(p)) + check_resched = 0; +#endif /* CONFIG_SMP */ + if (check_resched && p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + +/* + * Priority of the task has changed. This may cause + * us to initiate a push or pull. + */ +static void +prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) +{ + if (!p->on_rq) + return; + + if (rq->curr == p) { +#ifdef CONFIG_SMP + /* + * If our priority decreases while running, we + * may need to pull tasks to this runqueue. + */ + if (oldprio < p->prio) + pull_rt_task(rq); + /* + * If there's a higher priority task waiting to run + * then reschedule. Note, the above pull_rt_task + * can release the rq lock and p could migrate. + * Only reschedule if p is still on the same runqueue. 
+ */ + if (p->prio > rq->rt.highest_prio.curr && rq->curr == p) + resched_task(p); +#else + /* For UP simply resched on drop of prio */ + if (oldprio < p->prio) + resched_task(p); +#endif /* CONFIG_SMP */ + } else { + /* + * This task is not running, but if it is + * greater than the current running task + * then reschedule. + */ + if (p->prio < rq->curr->prio) + resched_task(rq->curr); + } +} + +static void watchdog(struct rq *rq, struct task_struct *p) +{ + unsigned long soft, hard; + + /* max may change after cur was read, this will be fixed next tick */ + soft = task_rlimit(p, RLIMIT_RTTIME); + hard = task_rlimit_max(p, RLIMIT_RTTIME); + + if (soft != RLIM_INFINITY) { + unsigned long next; + + p->rt.timeout++; + next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); + if (p->rt.timeout > next) + p->cputime_expires.sched_exp = p->se.sum_exec_runtime; + } +} + +static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) +{ + update_curr_rt(rq); + + watchdog(rq, p); + + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. 
+ */ + if (p->policy != SCHED_RR) + return; + + if (--p->rt.time_slice) + return; + + p->rt.time_slice = DEF_TIMESLICE; + + /* + * Requeue to the end of queue if we are not the only element + * on the queue: + */ + if (p->rt.run_list.prev != p->rt.run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + } +} + +static void set_curr_task_rt(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + p->se.exec_start = rq->clock_task; + + /* The running task is never eligible for pushing */ + dequeue_pushable_task(rq, p); +} + +static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) +{ + /* + * Time slice is 0 for SCHED_FIFO tasks + */ + if (task->policy == SCHED_RR) + return DEF_TIMESLICE; + else + return 0; +} + +static const struct sched_class rt_sched_class = { + .next = &fair_sched_class, + .enqueue_task = enqueue_task_rt, + .dequeue_task = dequeue_task_rt, + .yield_task = yield_task_rt, + + .check_preempt_curr = check_preempt_curr_rt, + + .pick_next_task = pick_next_task_rt, + .put_prev_task = put_prev_task_rt, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_rt, + + .set_cpus_allowed = set_cpus_allowed_rt, + .rq_online = rq_online_rt, + .rq_offline = rq_offline_rt, + .pre_schedule = pre_schedule_rt, + .post_schedule = post_schedule_rt, + .task_woken = task_woken_rt, + .switched_from = switched_from_rt, +#endif + + .set_curr_task = set_curr_task_rt, + .task_tick = task_tick_rt, + + .get_rr_interval = get_rr_interval_rt, + + .prio_changed = prio_changed_rt, + .switched_to = switched_to_rt, +}; + +#ifdef CONFIG_SCHED_DEBUG +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); + +static void print_rt_stats(struct seq_file *m, int cpu) +{ + rt_rq_iter_t iter; + struct rt_rq *rt_rq; + + rcu_read_lock(); + for_each_rt_rq(rt_rq, iter, cpu_rq(cpu)) + print_rt_rq(m, cpu, rt_rq); + rcu_read_unlock(); +} +#endif /* CONFIG_SCHED_DEBUG */ + diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h new file 
mode 100644
index 00000000..331e01bc
--- /dev/null
+++ b/kernel/sched_stats.h
@@ -0,0 +1,336 @@
+
+#ifdef CONFIG_SCHEDSTATS
+/*
+ * Bump this up when changing the output format or the meaning of an existing
+ * format, so that tools can adapt (or abort).
+ */
+#define SCHEDSTAT_VERSION 15
+
+static int show_schedstat(struct seq_file *seq, void *v)
+{
+	int cpu;
+	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;	/* 9 bytes of mask text per 32 cpus */
+	char *mask_str = kmalloc(mask_len, GFP_KERNEL);
+
+	if (mask_str == NULL)
+		return -ENOMEM;
+
+	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+	seq_printf(seq, "timestamp %lu\n", jiffies);
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+#ifdef CONFIG_SMP
+		struct sched_domain *sd;
+		int dcount = 0;
+#endif
+
+		/* runqueue-specific stats: one cpuN line per online cpu */
+		seq_printf(seq,
+		    "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+		    cpu, rq->yld_count,
+		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
+		    rq->ttwu_count, rq->ttwu_local,
+		    rq->rq_cpu_time,
+		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+
+		seq_printf(seq, "\n");
+
+#ifdef CONFIG_SMP
+		/* domain-specific stats: one domainN line per sched domain of this cpu */
+		rcu_read_lock();
+		for_each_domain(cpu, sd) {
+			enum cpu_idle_type itype;
+
+			cpumask_scnprintf(mask_str, mask_len,
+					  sched_domain_span(sd));
+			seq_printf(seq, "domain%d %s", dcount++, mask_str);
+			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+					itype++) {
+				seq_printf(seq, " %u %u %u %u %u %u %u %u",
+				    sd->lb_count[itype],
+				    sd->lb_balanced[itype],
+				    sd->lb_failed[itype],
+				    sd->lb_imbalance[itype],
+				    sd->lb_gained[itype],
+				    sd->lb_hot_gained[itype],
+				    sd->lb_nobusyq[itype],
+				    sd->lb_nobusyg[itype]);
+			}
+			seq_printf(seq,
+				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
+			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
+			    sd->ttwu_move_balance);
+		}
+		rcu_read_unlock();
+#endif
+	}
+	kfree(mask_str);
+	return 0;
+}
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);	/* one page, plus one per 32 online cpus */
+	char *buf = kmalloc(size, GFP_KERNEL);
+	struct seq_file *m;
+	int res;
+
+	if (!buf)
+		return -ENOMEM;
+	res = single_open(file, show_schedstat, NULL);
+	if (!res) {
+		m = file->private_data;
+		m->buf = buf;	/* hand the pre-sized buffer to the seq_file */
+		m->size = size;
+	} else
+		kfree(buf);	/* open failed: the buffer was never handed over */
+	return res;
+}
+
+static const struct file_operations proc_schedstat_operations = {
+	.open = schedstat_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static int __init proc_schedstat_init(void)
+{
+	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
+	return 0;
+}
+module_init(proc_schedstat_init);
+
+/*
+ * Expects runqueue lock to be held for atomicity of update.
+ * Adds @delta to the rq's run_delay and counts one arrival; a NULL rq is ignored.
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{
+	if (rq) {
+		rq->rq_sched_info.run_delay += delta;
+		rq->rq_sched_info.pcount++;
+	}
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update.
+ * Adds @delta to the rq's rq_cpu_time; a NULL rq is ignored.
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{
+	if (rq)
+		rq->rq_cpu_time += delta;
+}
+
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{
+	if (rq)	/* a NULL rq is ignored */
+		rq->rq_sched_info.run_delay += delta;
+}
+# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
+# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
+# define schedstat_set(var, val) do { var = (val); } while (0)
+#else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long long delta)
+{}
+# define schedstat_inc(rq, field) do { } while (0)
+# define schedstat_add(rq, field, amt) do { } while (0)
+# define schedstat_set(var, val) do { } while (0)
+#endif
+
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+static inline void sched_info_reset_dequeued(struct task_struct *t)
+{
+	t->sched_info.last_queued = 0;
+}
+
+/*
+ * We are interested in knowing how long it was from the *first* time a
+ * task was queued to the time that it finally hit a cpu; we call this routine
+ * from dequeue_task() to account for possible rq->clock skew across cpus. The
+ * delta taken on each cpu would annul the skew.
+ */
+static inline void sched_info_dequeued(struct task_struct *t)
+{
+	unsigned long long now = task_rq(t)->clock, delta = 0;
+
+	if (unlikely(sched_info_on()))
+		if (t->sched_info.last_queued)
+			delta = now - t->sched_info.last_queued;
+	sched_info_reset_dequeued(t);
+	t->sched_info.run_delay += delta;
+
+	rq_sched_info_dequeued(task_rq(t), delta);
+}
+
+/*
+ * Called when a task finally hits the cpu. We can now calculate how
+ * long it was waiting to run. We also note when it began so that we
+ * can keep stats on how long its timeslice is.
+ */
+static void sched_info_arrive(struct task_struct *t)
+{
+	unsigned long long now = task_rq(t)->clock, delta = 0;
+
+	if (t->sched_info.last_queued)
+		delta = now - t->sched_info.last_queued;
+	sched_info_reset_dequeued(t);
+	t->sched_info.run_delay += delta;
+	t->sched_info.last_arrival = now;
+	t->sched_info.pcount++;
+
+	rq_sched_info_arrive(task_rq(t), delta);
+}
+
+/*
+ * This function is only called from enqueue_task(), but also only updates
+ * the timestamp if it is not already set. It's assumed that
+ * sched_info_dequeued() will clear that stamp when appropriate.
+ */
+static inline void sched_info_queued(struct task_struct *t)
+{
+	if (unlikely(sched_info_on()))
+		if (!t->sched_info.last_queued)
+			t->sched_info.last_queued = task_rq(t)->clock;
+}
+
+/*
+ * Called when a process ceases being the active-running process, either
+ * voluntarily or involuntarily. Now we can calculate how long we ran.
+ * Also, if the process is still in the TASK_RUNNING state, call
+ * sched_info_queued() to mark that it has now again started waiting on
+ * the runqueue.
+ */
+static inline void sched_info_depart(struct task_struct *t)
+{
+	unsigned long long delta = task_rq(t)->clock -
+					t->sched_info.last_arrival;
+
+	rq_sched_info_depart(task_rq(t), delta);	/* delta: rq clock time since last_arrival */
+
+	if (t->state == TASK_RUNNING)
+		sched_info_queued(t);
+}
+
+/*
+ * Called when tasks are switched involuntarily due, typically, to expiring
+ * their time slice. (This may also be called when switching to or from
+ * the idle task.) We are only called when prev != next.
+ */
+static inline void
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+	struct rq *rq = task_rq(prev);
+
+	/*
+	 * prev now departs the cpu. It's not interesting to record
+	 * stats about how efficient we were at scheduling the idle
+	 * process, however.
+	 */
+	if (prev != rq->idle)
+		sched_info_depart(prev);
+
+	if (next != rq->idle)	/* likewise, no arrival is recorded for the idle task */
+		sched_info_arrive(next);
+}
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+	if (unlikely(sched_info_on()))	/* nothing is recorded unless sched_info_on() */
+		__sched_info_switch(prev, next);
+}
+#else
+#define sched_info_queued(t) do { } while (0)
+#define sched_info_reset_dequeued(t) do { } while (0)
+#define sched_info_dequeued(t) do { } while (0)
+#define sched_info_switch(t, next) do { } while (0)
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
+
+/*
+ * The following are functions that support scheduler-internal time accounting.
+ * These functions are generally called at the timer tick. None of this depends
+ * on CONFIG_SCHEDSTATS: they sit outside both #if blocks above.
+ */
+
+/**
+ * account_group_user_time - Maintain utime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the utime field of the
+ * thread_group_cputime structure.
+ *
+ * If thread group time is being maintained, get the structure for the
+ * running CPU and update the utime field there.
+ */
+static inline void account_group_user_time(struct task_struct *tsk,
+					   cputime_t cputime)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+	if (!cputimer->running)	/* group timer not running: nothing to account */
+		return;
+
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.utime =
+		cputime_add(cputimer->cputime.utime, cputime);
+	spin_unlock(&cputimer->lock);
+}
+
+/**
+ * account_group_system_time - Maintain stime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @cputime: Time value by which to increment the stime field of the
+ * thread_group_cputime structure.
+ *
+ * If thread group time is being maintained (cputimer->running is set),
+ * add @cputime to the stime field of tsk->signal->cputimer under its lock.
+ */
+static inline void account_group_system_time(struct task_struct *tsk,
+					     cputime_t cputime)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+	if (!cputimer->running)	/* group timer not running: nothing to account */
+		return;
+
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.stime =
+		cputime_add(cputimer->cputime.stime, cputime);
+	spin_unlock(&cputimer->lock);
+}
+
+/**
+ * account_group_exec_runtime - Maintain exec runtime for a thread group.
+ *
+ * @tsk: Pointer to task structure.
+ * @ns: Time value by which to increment the sum_exec_runtime field
+ * of the thread_group_cputime structure.
+ *
+ * If thread group time is being maintained (cputimer->running is set),
+ * add @ns to sum_exec_runtime of tsk->signal->cputimer under its lock.
+ */
+static inline void account_group_exec_runtime(struct task_struct *tsk,
+					      unsigned long long ns)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+
+	if (!cputimer->running)	/* group timer not running: nothing to account */
+		return;
+
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.sum_exec_runtime += ns;
+	spin_unlock(&cputimer->lock);
+}
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
new file mode 100644
index 00000000..6f437632
--- /dev/null
+++ b/kernel/sched_stoptask.c
@@ -0,0 +1,104 @@
+/*
+ * stop-task scheduling class.
+ * + * The stop task is the highest priority task in the system, it preempts + * everything and will be preempted by nothing. + * + * See kernel/stop_machine.c + */ + +#ifdef CONFIG_SMP +static int +select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) +{ + return task_cpu(p); /* stop tasks as never migrate */ +} +#endif /* CONFIG_SMP */ + +static void +check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +{ + /* we're never preempted */ +} + +static struct task_struct *pick_next_task_stop(struct rq *rq) +{ + struct task_struct *stop = rq->stop; + + if (stop && stop->on_rq) + return stop; + + return NULL; +} + +static void +enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void +dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void yield_task_stop(struct rq *rq) +{ + BUG(); /* the stop task should never yield, its pointless. */ +} + +static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) +{ +} + +static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) +{ +} + +static void set_curr_task_stop(struct rq *rq) +{ +} + +static void switched_to_stop(struct rq *rq, struct task_struct *p) +{ + BUG(); /* its impossible to change to this class */ +} + +static void +prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio) +{ + BUG(); /* how!?, what priority? 
*/ +} + +static unsigned int +get_rr_interval_stop(struct rq *rq, struct task_struct *task) +{ + return 0; +} + +/* + * Simple, special scheduling class for the per-CPU stop tasks: + */ +static const struct sched_class stop_sched_class = { + .next = &rt_sched_class, + + .enqueue_task = enqueue_task_stop, + .dequeue_task = dequeue_task_stop, + .yield_task = yield_task_stop, + + .check_preempt_curr = check_preempt_curr_stop, + + .pick_next_task = pick_next_task_stop, + .put_prev_task = put_prev_task_stop, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_stop, +#endif + + .set_curr_task = set_curr_task_stop, + .task_tick = task_tick_stop, + + .get_rr_interval = get_rr_interval_stop, + + .prio_changed = prio_changed_stop, + .switched_to = switched_to_stop, +}; diff --git a/kernel/seccomp.c b/kernel/seccomp.c new file mode 100644 index 00000000..57d4b13b --- /dev/null +++ b/kernel/seccomp.c @@ -0,0 +1,86 @@ +/* + * linux/kernel/seccomp.c + * + * Copyright 2004-2005 Andrea Arcangeli + * + * This defines a simple but solid secure-computing mode. + */ + +#include +#include +#include + +/* #define SECCOMP_DEBUG 1 */ +#define NR_SECCOMP_MODES 1 + +/* + * Secure computing mode 1 allows only read/write/exit/sigreturn. + * To be fully secure this must be combined with rlimit + * to limit the stack allocations too. 
+ */
+static int mode1_syscalls[] = {
+	__NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
+	0, /* zero terminated: the scan in __secure_computing() stops here */
+};
+
+#ifdef CONFIG_COMPAT
+static int mode1_syscalls_32[] = {
+	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
+	0, /* zero terminated: the scan in __secure_computing() stops here */
+};
+#endif
+
+void __secure_computing(int this_syscall)
+{
+	int mode = current->seccomp.mode;
+	int * syscall;
+
+	switch (mode) {
+	case 1:
+		syscall = mode1_syscalls;
+#ifdef CONFIG_COMPAT
+		if (is_compat_task())	/* compat tasks are checked against the _32 list */
+			syscall = mode1_syscalls_32;
+#endif
+		do {
+			if (*syscall == this_syscall)
+				return;	/* on the allowed list: let the syscall proceed */
+		} while (*++syscall);
+		break;
+	default:
+		BUG();	/* only mode 1 is handled here */
+	}
+
+#ifdef SECCOMP_DEBUG
+	dump_stack();
+#endif
+	do_exit(SIGKILL);	/* not on the allowed list: the task exits with SIGKILL */
+}
+
+long prctl_get_seccomp(void)
+{
+	return current->seccomp.mode;
+}
+
+long prctl_set_seccomp(unsigned long seccomp_mode)
+{
+	long ret;
+
+	/* can set it only once to be even more secure */
+	ret = -EPERM;
+	if (unlikely(current->seccomp.mode))
+		goto out;
+
+	ret = -EINVAL;	/* returned unless the requested mode is within 1..NR_SECCOMP_MODES */
+	if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
+		current->seccomp.mode = seccomp_mode;
+		set_thread_flag(TIF_SECCOMP);
+#ifdef TIF_NOTSC
+		disable_TSC();
+#endif
+		ret = 0;
+	}
+
+ out:
+	return ret;
+}
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
new file mode 100644
index 00000000..94a62c0d
--- /dev/null
+++ b/kernel/semaphore.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2008 Intel Corporation
+ * Author: Matthew Wilcox
+ *
+ * Distributed under the terms of the GNU GPL, version 2
+ *
+ * This file implements counting semaphores.
+ * A counting semaphore may be acquired 'n' times before sleeping.
+ * See mutex.c for single-acquisition sleeping locks which enforce
+ * rules which allow code to be debugged more easily.
+ */
+
+/*
+ * Some notes on the implementation:
+ *
+ * The spinlock controls access to the other members of the semaphore.
+ * down_trylock() and up() can be called from interrupt context, so we + * have to disable interrupts when taking the lock. It turns out various + * parts of the kernel expect to be able to use down() on a semaphore in + * interrupt context when they know it will succeed, so we have to use + * irqsave variants for down(), down_interruptible() and down_killable() + * too. + * + * The ->count variable represents how many more tasks can acquire this + * semaphore. If it's zero, there may be tasks waiting on the wait_list. + */ + +#include +#include +#include +#include +#include +#include +#include + +static noinline void __down(struct semaphore *sem); +static noinline int __down_interruptible(struct semaphore *sem); +static noinline int __down_killable(struct semaphore *sem); +static noinline int __down_timeout(struct semaphore *sem, long jiffies); +static noinline void __up(struct semaphore *sem); + +/** + * down - acquire the semaphore + * @sem: the semaphore to be acquired + * + * Acquires the semaphore. If no more tasks are allowed to acquire the + * semaphore, calling this function will put the task to sleep until the + * semaphore is released. + * + * Use of this function is deprecated, please use down_interruptible() or + * down_killable() instead. + */ +void down(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + __down(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(down); + +/** + * down_interruptible - acquire the semaphore unless interrupted + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a signal, this function will return -EINTR. + * If the semaphore is successfully acquired, this function returns 0. 
+ */ +int down_interruptible(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_interruptible(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_interruptible); + +/** + * down_killable - acquire the semaphore unless killed + * @sem: the semaphore to be acquired + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the sleep is interrupted by a fatal signal, this function will return + * -EINTR. If the semaphore is successfully acquired, this function returns + * 0. + */ +int down_killable(struct semaphore *sem) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_killable(sem); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_killable); + +/** + * down_trylock - try to acquire the semaphore, without waiting + * @sem: the semaphore to be acquired + * + * Try to acquire the semaphore atomically. Returns 0 if the semaphore has + * been acquired successfully or 1 if it cannot be acquired. + * + * NOTE: This return value is inverted from both spin_trylock and + * mutex_trylock! Be careful about this when converting code. + * + * Unlike mutex_trylock, this function can be used from interrupt context, + * and the semaphore can be released by any task or interrupt. 
+ */ +int down_trylock(struct semaphore *sem) +{ + unsigned long flags; + int count; + + spin_lock_irqsave(&sem->lock, flags); + count = sem->count - 1; + if (likely(count >= 0)) + sem->count = count; + spin_unlock_irqrestore(&sem->lock, flags); + + return (count < 0); +} +EXPORT_SYMBOL(down_trylock); + +/** + * down_timeout - acquire the semaphore within a specified time + * @sem: the semaphore to be acquired + * @jiffies: how long to wait before failing + * + * Attempts to acquire the semaphore. If no more tasks are allowed to + * acquire the semaphore, calling this function will put the task to sleep. + * If the semaphore is not released within the specified number of jiffies, + * this function returns -ETIME. It returns 0 if the semaphore was acquired. + */ +int down_timeout(struct semaphore *sem, long jiffies) +{ + unsigned long flags; + int result = 0; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(sem->count > 0)) + sem->count--; + else + result = __down_timeout(sem, jiffies); + spin_unlock_irqrestore(&sem->lock, flags); + + return result; +} +EXPORT_SYMBOL(down_timeout); + +/** + * up - release the semaphore + * @sem: the semaphore to release + * + * Release the semaphore. Unlike mutexes, up() may be called from any + * context and even by tasks which have never called down(). + */ +void up(struct semaphore *sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->lock, flags); + if (likely(list_empty(&sem->wait_list))) + sem->count++; + else + __up(sem); + spin_unlock_irqrestore(&sem->lock, flags); +} +EXPORT_SYMBOL(up); + +/* Functions for the contended case */ + +struct semaphore_waiter { + struct list_head list; + struct task_struct *task; + int up; +}; + +/* + * Because this function is inlined, the 'state' parameter will be + * constant, and thus optimised away by the compiler. Likewise the + * 'timeout' parameter for the cases without timeouts. 
+ */ +static inline int __sched __down_common(struct semaphore *sem, long state, + long timeout) +{ + struct task_struct *task = current; + struct semaphore_waiter waiter; + + list_add_tail(&waiter.list, &sem->wait_list); + waiter.task = task; + waiter.up = 0; + + for (;;) { + if (signal_pending_state(state, task)) + goto interrupted; + if (timeout <= 0) + goto timed_out; + __set_task_state(task, state); + spin_unlock_irq(&sem->lock); + timeout = schedule_timeout(timeout); + spin_lock_irq(&sem->lock); + if (waiter.up) + return 0; + } + + timed_out: + list_del(&waiter.list); + return -ETIME; + + interrupted: + list_del(&waiter.list); + return -EINTR; +} + +static noinline void __sched __down(struct semaphore *sem) +{ + __down_common(sem, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_interruptible(struct semaphore *sem) +{ + return __down_common(sem, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_killable(struct semaphore *sem) +{ + return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); +} + +static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) +{ + return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); +} + +static noinline void __sched __up(struct semaphore *sem) +{ + struct semaphore_waiter *waiter = list_first_entry(&sem->wait_list, + struct semaphore_waiter, list); + list_del(&waiter->list); + waiter->up = 1; + wake_up_process(waiter->task); +} diff --git a/kernel/signal.c b/kernel/signal.c new file mode 100644 index 00000000..43fee1cf --- /dev/null +++ b/kernel/signal.c @@ -0,0 +1,3120 @@ +/* + * linux/kernel/signal.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-11-02 Modified for POSIX.1b signals by Richard Henderson + * + * 2003-06-02 Jim Houston - Concurrent Computer Corp. + * Changes to use preallocated sigqueue structures + * to allow signals to be sent reliably. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define CREATE_TRACE_POINTS +#include + +#include +#include +#include +#include +#include "audit.h" /* audit_signal_info() */ + +/* + * SLAB caches for signal bits. + */ + +static struct kmem_cache *sigqueue_cachep; + +int print_fatal_signals __read_mostly; + +static void __user *sig_handler(struct task_struct *t, int sig) +{ + return t->sighand->action[sig - 1].sa.sa_handler; +} + +static int sig_handler_ignored(void __user *handler, int sig) +{ + /* Is it explicitly or implicitly ignored? */ + return handler == SIG_IGN || + (handler == SIG_DFL && sig_kernel_ignore(sig)); +} + +static int sig_task_ignored(struct task_struct *t, int sig, + int from_ancestor_ns) +{ + void __user *handler; + + handler = sig_handler(t, sig); + + if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) && + handler == SIG_DFL && !from_ancestor_ns) + return 1; + + return sig_handler_ignored(handler, sig); +} + +static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns) +{ + /* + * Blocked signals are never ignored, since the + * signal handler may change by the time it is + * unblocked. + */ + if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) + return 0; + + if (!sig_task_ignored(t, sig, from_ancestor_ns)) + return 0; + + /* + * Tracers may want to know about even ignored signals. + */ + return !tracehook_consider_ignored_signal(t, sig); +} + +/* + * Re-calculate pending state from the set of locally pending + * signals, globally pending signals, and blocked signals. 
+ */ +static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) +{ + unsigned long ready; + long i; + + switch (_NSIG_WORDS) { + default: + for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) + ready |= signal->sig[i] &~ blocked->sig[i]; + break; + + case 4: ready = signal->sig[3] &~ blocked->sig[3]; + ready |= signal->sig[2] &~ blocked->sig[2]; + ready |= signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 2: ready = signal->sig[1] &~ blocked->sig[1]; + ready |= signal->sig[0] &~ blocked->sig[0]; + break; + + case 1: ready = signal->sig[0] &~ blocked->sig[0]; + } + return ready != 0; +} + +#define PENDING(p,b) has_pending_signals(&(p)->signal, (b)) + +static int recalc_sigpending_tsk(struct task_struct *t) +{ + if ((t->group_stop & GROUP_STOP_PENDING) || + PENDING(&t->pending, &t->blocked) || + PENDING(&t->signal->shared_pending, &t->blocked)) { + set_tsk_thread_flag(t, TIF_SIGPENDING); + return 1; + } + /* + * We must never clear the flag in another thread, or in current + * when it's possible the current syscall is returning -ERESTART*. + * So we don't clear it here, and only callers who know they should do. + */ + return 0; +} + +/* + * After recalculating TIF_SIGPENDING, we need to make sure the task wakes up. + * This is superfluous when called on current, the wakeup is a harmless no-op. + */ +void recalc_sigpending_and_wake(struct task_struct *t) +{ + if (recalc_sigpending_tsk(t)) + signal_wake_up(t, 0); +} + +void recalc_sigpending(void) +{ + if (unlikely(tracehook_force_sigpending())) + set_thread_flag(TIF_SIGPENDING); + else if (!recalc_sigpending_tsk(current) && !freezing(current)) + clear_thread_flag(TIF_SIGPENDING); + +} + +/* Given the mask, find the first available signal that should be serviced. 
*/ + +#define SYNCHRONOUS_MASK \ + (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ + sigmask(SIGTRAP) | sigmask(SIGFPE)) + +int next_signal(struct sigpending *pending, sigset_t *mask) +{ + unsigned long i, *s, *m, x; + int sig = 0; + + s = pending->signal.sig; + m = mask->sig; + + /* + * Handle the first word specially: it contains the + * synchronous signals that need to be dequeued first. + */ + x = *s &~ *m; + if (x) { + if (x & SYNCHRONOUS_MASK) + x &= SYNCHRONOUS_MASK; + sig = ffz(~x) + 1; + return sig; + } + + switch (_NSIG_WORDS) { + default: + for (i = 1; i < _NSIG_WORDS; ++i) { + x = *++s &~ *++m; + if (!x) + continue; + sig = ffz(~x) + i*_NSIG_BPW + 1; + break; + } + break; + + case 2: + x = s[1] &~ m[1]; + if (!x) + break; + sig = ffz(~x) + _NSIG_BPW + 1; + break; + + case 1: + /* Nothing to do */ + break; + } + + return sig; +} + +static inline void print_dropped_signal(int sig) +{ + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); + + if (!print_fatal_signals) + return; + + if (!__ratelimit(&ratelimit_state)) + return; + + printk(KERN_INFO "%s/%d: reached RLIMIT_SIGPENDING, dropped signal %d\n", + current->comm, current->pid, sig); +} + +/** + * task_clear_group_stop_trapping - clear group stop trapping bit + * @task: target task + * + * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it + * and wake up the ptracer. Note that we don't need any further locking. + * @task->siglock guarantees that @task->parent points to the ptracer. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +static void task_clear_group_stop_trapping(struct task_struct *task) +{ + if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) { + task->group_stop &= ~GROUP_STOP_TRAPPING; + __wake_up_sync_key(&task->parent->signal->wait_chldexit, + TASK_UNINTERRUPTIBLE, 1, task); + } +} + +/** + * task_clear_group_stop_pending - clear pending group stop + * @task: target task + * + * Clear group stop states for @task. 
+ * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + */ +void task_clear_group_stop_pending(struct task_struct *task) +{ + task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME | + GROUP_STOP_DEQUEUED); +} + +/** + * task_participate_group_stop - participate in a group stop + * @task: task participating in a group stop + * + * @task has GROUP_STOP_PENDING set and is participating in a group stop. + * Group stop states are cleared and the group stop count is consumed if + * %GROUP_STOP_CONSUME was set. If the consumption completes the group + * stop, the appropriate %SIGNAL_* flags are set. + * + * CONTEXT: + * Must be called with @task->sighand->siglock held. + * + * RETURNS: + * %true if group stop completion should be notified to the parent, %false + * otherwise. + */ +static bool task_participate_group_stop(struct task_struct *task) +{ + struct signal_struct *sig = task->signal; + bool consume = task->group_stop & GROUP_STOP_CONSUME; + + WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING)); + + task_clear_group_stop_pending(task); + + if (!consume) + return false; + + if (!WARN_ON_ONCE(sig->group_stop_count == 0)) + sig->group_stop_count--; + + /* + * Tell the caller to notify completion iff we are entering into a + * fresh group stop. Read comment in do_signal_stop() for details. + */ + if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { + sig->flags = SIGNAL_STOP_STOPPED; + return true; + } + return false; +} + +/* + * allocate a new signal queue record + * - this may be called without locks if and only if t == current, otherwise an + * appropriate lock must be held to stop the target task from exiting + */ +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) +{ + struct sigqueue *q = NULL; + struct user_struct *user; + + /* + * Protect access to @t credentials. This can go away when all + * callers hold rcu read lock. 
+ */ + rcu_read_lock(); + user = get_uid(__task_cred(t)->user); + atomic_inc(&user->sigpending); + rcu_read_unlock(); + + if (override_rlimit || + atomic_read(&user->sigpending) <= + task_rlimit(t, RLIMIT_SIGPENDING)) { + q = kmem_cache_alloc(sigqueue_cachep, flags); + } else { + print_dropped_signal(sig); + } + + if (unlikely(q == NULL)) { + atomic_dec(&user->sigpending); + free_uid(user); + } else { + INIT_LIST_HEAD(&q->list); + q->flags = 0; + q->user = user; + } + + return q; +} + +static void __sigqueue_free(struct sigqueue *q) +{ + if (q->flags & SIGQUEUE_PREALLOC) + return; + atomic_dec(&q->user->sigpending); + free_uid(q->user); + kmem_cache_free(sigqueue_cachep, q); +} + +void flush_sigqueue(struct sigpending *queue) +{ + struct sigqueue *q; + + sigemptyset(&queue->signal); + while (!list_empty(&queue->list)) { + q = list_entry(queue->list.next, struct sigqueue , list); + list_del_init(&q->list); + __sigqueue_free(q); + } +} + +/* + * Flush all pending signals for a task. + */ +void __flush_signals(struct task_struct *t) +{ + clear_tsk_thread_flag(t, TIF_SIGPENDING); + flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); +} + +void flush_signals(struct task_struct *t) +{ + unsigned long flags; + + spin_lock_irqsave(&t->sighand->siglock, flags); + __flush_signals(t); + spin_unlock_irqrestore(&t->sighand->siglock, flags); +} + +static void __flush_itimer_signals(struct sigpending *pending) +{ + sigset_t signal, retain; + struct sigqueue *q, *n; + + signal = pending->signal; + sigemptyset(&retain); + + list_for_each_entry_safe(q, n, &pending->list, list) { + int sig = q->info.si_signo; + + if (likely(q->info.si_code != SI_TIMER)) { + sigaddset(&retain, sig); + } else { + sigdelset(&signal, sig); + list_del_init(&q->list); + __sigqueue_free(q); + } + } + + sigorsets(&pending->signal, &signal, &retain); +} + +void flush_itimer_signals(void) +{ + struct task_struct *tsk = current; + unsigned long flags; + + 
spin_lock_irqsave(&tsk->sighand->siglock, flags); + __flush_itimer_signals(&tsk->pending); + __flush_itimer_signals(&tsk->signal->shared_pending); + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); +} + +void ignore_signals(struct task_struct *t) +{ + int i; + + for (i = 0; i < _NSIG; ++i) + t->sighand->action[i].sa.sa_handler = SIG_IGN; + + flush_signals(t); +} + +/* + * Flush all handlers for a task. + */ + +void +flush_signal_handlers(struct task_struct *t, int force_default) +{ + int i; + struct k_sigaction *ka = &t->sighand->action[0]; + for (i = _NSIG ; i != 0 ; i--) { + if (force_default || ka->sa.sa_handler != SIG_IGN) + ka->sa.sa_handler = SIG_DFL; + ka->sa.sa_flags = 0; + sigemptyset(&ka->sa.sa_mask); + ka++; + } +} + +int unhandled_signal(struct task_struct *tsk, int sig) +{ + void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler; + if (is_global_init(tsk)) + return 1; + if (handler != SIG_IGN && handler != SIG_DFL) + return 0; + return !tracehook_consider_fatal_signal(tsk, sig); +} + +/* + * Notify the system that a driver wants to block all signals for this + * process, and wants to be notified if any signals at all were to be + * sent/acted upon. If the notifier routine returns non-zero, then the + * signal will be acted upon after all. If the notifier routine returns 0, + * then then signal will be blocked. Only one block per process is + * allowed. priv is a pointer to private data that the notifier routine + * can use to determine if the signal should be blocked or not. + */ +void +block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) +{ + unsigned long flags; + + spin_lock_irqsave(¤t->sighand->siglock, flags); + current->notifier_mask = mask; + current->notifier_data = priv; + current->notifier = notifier; + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} + +/* Notify the system that blocking has ended. 
*/ + +void +unblock_all_signals(void) +{ + unsigned long flags; + + spin_lock_irqsave(&current->sighand->siglock, flags); + current->notifier = NULL; + current->notifier_data = NULL; + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); +} + +static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) +{ + struct sigqueue *q, *first = NULL; + + /* + * Collect the siginfo appropriate to this signal. Check if + * there is another siginfo for the same signal. + */ + list_for_each_entry(q, &list->list, list) { + if (q->info.si_signo == sig) { + if (first) + goto still_pending; + first = q; + } + } + + sigdelset(&list->signal, sig); + + if (first) { +still_pending: + list_del_init(&first->list); + copy_siginfo(info, &first->info); + __sigqueue_free(first); + } else { + /* + * Ok, it wasn't in the queue. This must be + * a fast-pathed signal or we must have been + * out of queue space. So zero out the info. + */ + info->si_signo = sig; + info->si_errno = 0; + info->si_code = SI_USER; + info->si_pid = 0; + info->si_uid = 0; + } +} + +static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, + siginfo_t *info) +{ + int sig = next_signal(pending, mask); + + if (sig) { + if (current->notifier) { + if (sigismember(current->notifier_mask, sig)) { + if (!(current->notifier)(current->notifier_data)) { + clear_thread_flag(TIF_SIGPENDING); + return 0; + } + } + } + + collect_signal(sig, pending, info); + } + + return sig; +} + +/* + * Dequeue a signal and return the element to the caller, which is + * expected to free it. + * + * All callers have to hold the siglock. + */ +int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +{ + int signr; + + /* We only dequeue private signals from ourselves, we don't let + * signalfd steal them + */ + signr = __dequeue_signal(&tsk->pending, mask, info); + if (!signr) { + signr = __dequeue_signal(&tsk->signal->shared_pending, + mask, info); + /* + * itimer signal ? 
+ * + * itimers are process shared and we restart periodic + * itimers in the signal delivery path to prevent DoS + * attacks in the high resolution timer case. This is + * compliant with the old way of self-restarting + * itimers, as the SIGALRM is a legacy signal and only + * queued once. Changing the restart behaviour to + * restart the timer in the signal dequeue path is + * reducing the timer noise on heavy loaded !highres + * systems too. + */ + if (unlikely(signr == SIGALRM)) { + struct hrtimer *tmr = &tsk->signal->real_timer; + + if (!hrtimer_is_queued(tmr) && + tsk->signal->it_real_incr.tv64 != 0) { + hrtimer_forward(tmr, tmr->base->get_time(), + tsk->signal->it_real_incr); + hrtimer_restart(tmr); + } + } + } + + recalc_sigpending(); + if (!signr) + return 0; + + if (unlikely(sig_kernel_stop(signr))) { + /* + * Set a marker that we have dequeued a stop signal. Our + * caller might release the siglock and then the pending + * stop signal it is about to process is no longer in the + * pending bitmasks, but must still be cleared by a SIGCONT + * (and overruled by a SIGKILL). So those cases clear this + * shared flag after we've set it. Note that this flag may + * remain set after the signal we return is ignored or + * handled. That doesn't matter because its only purpose + * is to alert stop-signal processing code when another + * processor has come along and cleared the flag. + */ + current->group_stop |= GROUP_STOP_DEQUEUED; + } + if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { + /* + * Release the siglock to ensure proper locking order + * of timer locks outside of siglocks. Note, we leave + * irqs disabled here, since the posix-timers code is + * about to disable them again anyway. + */ + spin_unlock(&tsk->sighand->siglock); + do_schedule_next_timer(info); + spin_lock(&tsk->sighand->siglock); + } + return signr; +} + +/* + * Tell a process that it has a new active signal.. + * + * NOTE! 
we rely on the previous spin_lock to + * lock interrupts for us! We can only be called with + * "siglock" held, and the local interrupt must + * have been disabled when that got acquired! + * + * No need to set need_resched since signal event passing + * goes through ->blocked + */ +void signal_wake_up(struct task_struct *t, int resume) +{ + unsigned int mask; + + set_tsk_thread_flag(t, TIF_SIGPENDING); + + /* + * For SIGKILL, we want to wake it up in the stopped/traced/killable + * case. We don't check t->state here because there is a race with it + * executing another processor and just now entering stopped state. + * By using wake_up_state, we ensure the process will wake up and + * handle its death signal. + */ + mask = TASK_INTERRUPTIBLE; + if (resume) + mask |= TASK_WAKEKILL; + if (!wake_up_state(t, mask)) + kick_process(t); +} + +/* + * Remove signals in mask from the pending set and queue. + * Returns 1 if any signals were found. + * + * All callers must be holding the siglock. + * + * This version takes a sigset mask and looks at all signals, + * not just those in the first mask word. + */ +static int rm_from_queue_full(sigset_t *mask, struct sigpending *s) +{ + struct sigqueue *q, *n; + sigset_t m; + + sigandsets(&m, mask, &s->signal); + if (sigisemptyset(&m)) + return 0; + + sigandnsets(&s->signal, &s->signal, mask); + list_for_each_entry_safe(q, n, &s->list, list) { + if (sigismember(mask, q->info.si_signo)) { + list_del_init(&q->list); + __sigqueue_free(q); + } + } + return 1; +} +/* + * Remove signals in mask from the pending set and queue. + * Returns 1 if any signals were found. + * + * All callers must be holding the siglock. 
+ */ +static int rm_from_queue(unsigned long mask, struct sigpending *s) +{ + struct sigqueue *q, *n; + + if (!sigtestsetmask(&s->signal, mask)) + return 0; + + sigdelsetmask(&s->signal, mask); + list_for_each_entry_safe(q, n, &s->list, list) { + if (q->info.si_signo < SIGRTMIN && + (mask & sigmask(q->info.si_signo))) { + list_del_init(&q->list); + __sigqueue_free(q); + } + } + return 1; +} + +static inline int is_si_special(const struct siginfo *info) +{ + return info <= SEND_SIG_FORCED; +} + +static inline bool si_fromuser(const struct siginfo *info) +{ + return info == SEND_SIG_NOINFO || + (!is_si_special(info) && SI_FROMUSER(info)); +} + +/* + * called with RCU read lock from check_kill_permission() + */ +static int kill_ok_by_cred(struct task_struct *t) +{ + const struct cred *cred = current_cred(); + const struct cred *tcred = __task_cred(t); + + if (cred->user->user_ns == tcred->user->user_ns && + (cred->euid == tcred->suid || + cred->euid == tcred->uid || + cred->uid == tcred->suid || + cred->uid == tcred->uid)) + return 1; + + if (ns_capable(tcred->user->user_ns, CAP_KILL)) + return 1; + + return 0; +} + +/* + * Bad permissions for sending the signal + * - the caller must hold the RCU read lock + */ +static int check_kill_permission(int sig, struct siginfo *info, + struct task_struct *t) +{ + struct pid *sid; + int error; + + if (!valid_signal(sig)) + return -EINVAL; + + if (!si_fromuser(info)) + return 0; + + error = audit_signal_info(sig, t); /* Let audit system see the signal */ + if (error) + return error; + + if (!same_thread_group(current, t) && + !kill_ok_by_cred(t)) { + switch (sig) { + case SIGCONT: + sid = task_session(t); + /* + * We don't return the error if sid == NULL. The + * task was unhashed, the caller must notice this. + */ + if (!sid || sid == task_session(current)) + break; + default: + return -EPERM; + } + } + + return security_task_kill(t, info, sig, 0); +} + +/* + * Handle magic process-wide effects of stop/continue signals. 
Unlike + * the signal actions, these happen immediately at signal-generation + * time regardless of blocking, ignoring, or handling. This does the + * actual continuing for SIGCONT, but not the actual stopping for stop + * signals. The process stop is done as a signal action for SIG_DFL. + * + * Returns true if the signal should be actually delivered, otherwise + * it should be dropped. + */ +static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns) +{ + struct signal_struct *signal = p->signal; + struct task_struct *t; + + if (unlikely(signal->flags & SIGNAL_GROUP_EXIT)) { + /* + * The process is in the middle of dying, nothing to do. + */ + } else if (sig_kernel_stop(sig)) { + /* + * This is a stop signal. Remove SIGCONT from all queues. + */ + rm_from_queue(sigmask(SIGCONT), &signal->shared_pending); + t = p; + do { + rm_from_queue(sigmask(SIGCONT), &t->pending); + } while_each_thread(p, t); + } else if (sig == SIGCONT) { + unsigned int why; + /* + * Remove all stop signals from all queues, wake all threads. + */ + rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending); + t = p; + do { + task_clear_group_stop_pending(t); + rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending); + wake_up_state(t, __TASK_STOPPED); + } while_each_thread(p, t); + + /* + * Notify the parent with CLD_CONTINUED if we were stopped. + * + * If we were in the middle of a group stop, we pretend it + * was already finished, and then continued. Since SIGCHLD + * doesn't queue we report only CLD_STOPPED, as if the next + * CLD_CONTINUED was dropped. + */ + why = 0; + if (signal->flags & SIGNAL_STOP_STOPPED) + why |= SIGNAL_CLD_CONTINUED; + else if (signal->group_stop_count) + why |= SIGNAL_CLD_STOPPED; + + if (why) { + /* + * The first thread which returns from do_signal_stop() + * will take ->siglock, notice SIGNAL_CLD_MASK, and + * notify its parent. See get_signal_to_deliver(). 
+			 */
+			signal->flags = why | SIGNAL_STOP_CONTINUED;
+			signal->group_stop_count = 0;
+			signal->group_exit_code = 0;
+		}
+	}
+
+	return !sig_ignored(p, sig, from_ancestor_ns);
+}
+
+/*
+ * Decide whether thread P is a sensible one to wake for SIG.
+ *
+ * Once every thread has been asked and none wants it, the only threads
+ * left that do not block SIG are ones that are not running and already
+ * have signals pending.  They will pull from the shared queue as soon as
+ * they can, so leaving the signal on the shared queue is as good as
+ * handing it to one of them.
+ */
+static inline int wants_signal(int sig, struct task_struct *p)
+{
+	/* A thread that blocks SIG or is already exiting never takes it. */
+	if (sigismember(&p->blocked, sig) || (p->flags & PF_EXITING))
+		return 0;
+	/* SIGKILL is accepted before the stopped/traced test is applied. */
+	if (sig == SIGKILL)
+		return 1;
+	if (task_is_stopped_or_traced(p))
+		return 0;
+	if (task_curr(p))
+		return 1;
+	return !signal_pending(p);
+}
+
+/*
+ * Pick a thread to wake for SIG, which the caller has already queued, and
+ * start a group exit straight away when the signal is going to be fatal.
+ *
+ * @p is the task the signal was sent to; a non-zero @group allows any
+ * other thread of the group to be chosen when @p itself does not want it.
+ */
+static void complete_signal(int sig, struct task_struct *p, int group)
+{
+	struct signal_struct *signal = p->signal;
+	struct task_struct *t = p;
+
+	/*
+	 * The targeted thread gets first refusal, which is probably the
+	 * least surprising choice.  Only look further when it declines.
+	 */
+	if (!wants_signal(sig, p)) {
+		/*
+		 * With a single candidate there is nobody else to wake: it
+		 * will dequeue unblocked signals before it runs again.
+		 */
+		if (!group || thread_group_empty(p))
+			return;
+
+		/*
+		 * Walk the group starting at the remembered target.  Coming
+		 * full circle means no thread needs waking; any eligible
+		 * thread will see the signal in the queue soon.
+		 */
+		t = signal->curr_target;
+		while (!wants_signal(sig, t)) {
+			t = next_thread(t);
+			if (t == signal->curr_target)
+				return;
+		}
+		signal->curr_target = t;
+	}
+
+	/*
+	 * A killable thread was found.  If the signal is fatal to the whole
+	 * group and does not dump core, start the group exit now and wake
+	 * everybody, so that no thread keeps running and doing things while
+	 * a slower one still has the fatal signal pending.
+	 */
+	if (sig_fatal(p, sig) &&
+	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
+	    !sigismember(&t->real_blocked, sig) &&
+	    (sig == SIGKILL || !tracehook_consider_fatal_signal(t, sig)) &&
+	    !sig_kernel_coredump(sig)) {
+		signal->flags = SIGNAL_GROUP_EXIT;
+		signal->group_exit_code = sig;
+		signal->group_stop_count = 0;
+		t = p;
+		do {
+			task_clear_group_stop_pending(t);
+			sigaddset(&t->pending.signal, SIGKILL);
+			signal_wake_up(t, 1);
+		} while_each_thread(p, t);
+		return;
+	}
+
+	/*
+	 * The signal is already in the shared-pending queue.
+	 * Tell the chosen thread to wake up and dequeue it.
+	 */
+	signal_wake_up(t, sig == SIGKILL);
+}
+
+/*
+ * Report whether SIG is a non-realtime signal (below SIGRTMIN) that is
+ * already marked in @signals.  The result is normalised to 0 or 1.
+ */
+static inline int legacy_queue(struct sigpending *signals, int sig)
+{
+	if (sig >= SIGRTMIN)
+		return 0;
+	return sigismember(&signals->signal, sig) != 0;
+}
+
+/*
+ * Queue SIG for @t and let complete_signal() wake a thread to handle it.
+ *
+ * @info is either a real siginfo or one of the SEND_SIG_* markers, @group
+ * selects the shared pending queue over the per-thread one, and a non-zero
+ * @from_ancestor_ns makes a copied siginfo report si_pid as 0.  The caller
+ * must hold t->sighand->siglock.
+ *
+ * Returns 0 when the signal was queued or deliberately dropped, and
+ * -EAGAIN when no queue entry could be allocated for a realtime signal
+ * whose si_code is not SI_USER.
+ */
+static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
+			int group, int from_ancestor_ns)
+{
+	struct sigpending *pending;
+	struct sigqueue *q;
+	int override_rlimit;
+
+	trace_signal_generate(sig, info, t);
+
+	assert_spin_locked(&t->sighand->siglock);
+
+	if (!prepare_signal(sig, t, from_ancestor_ns))
+		return 0;
+
+	if (group)
+		pending = &t->signal->shared_pending;
+	else
+		pending = &t->pending;
+
+	/*
+	 * Short-circuit ignored signals and support queuing exactly one
+	 * non-rt signal, so that we can get more detailed information
+	 * about the cause of the signal.
+	 */
+	if (legacy_queue(pending, sig))
+		return 0;
+
+	/*
+	 * Fast path for kernel-internal things like SIGSTOP or SIGKILL:
+	 * no queue entry is allocated at all.
+	 */
+	if (info == SEND_SIG_FORCED)
+		goto out_set;
+
+	/*
+	 * Real-time signals must be queued if sent by sigqueue, or some
+	 * other real-time mechanism.  It is implementation defined whether
+	 * kill() does so.  We attempt to do so, on the principle of least
+	 * surprise, but since kill is not allowed to fail with EAGAIN when
+	 * low on memory we just make sure at least one signal gets
+	 * delivered and don't pass on the info struct.
+	 */
+	override_rlimit = sig < SIGRTMIN &&
+			  (is_si_special(info) || info->si_code >= 0);
+
+	q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE,
+			     override_rlimit);
+	if (!q) {
+		/* The SEND_SIG_* markers carry no payload that could be lost. */
+		if (is_si_special(info))
+			goto out_set;
+
+		if (sig >= SIGRTMIN && info->si_code != SI_USER) {
+			/*
+			 * Queue overflow, abort.  We may abort if the
+			 * signal was rt and sent by user using something
+			 * other than kill().
+			 */
+			trace_signal_overflow_fail(sig, group, info);
+			return -EAGAIN;
+		}
+
+		/*
+		 * This is a silent loss of information.  We still
+		 * send the signal, but the *info bits are lost.
+		 */
+		trace_signal_lose_info(sig, group, info);
+		goto out_set;
+	}
+
+	list_add_tail(&q->list, &pending->list);
+	if (info == SEND_SIG_NOINFO) {
+		q->info.si_signo = sig;
+		q->info.si_errno = 0;
+		q->info.si_code = SI_USER;
+		q->info.si_pid = task_tgid_nr_ns(current,
+						 task_active_pid_ns(t));
+		q->info.si_uid = current_uid();
+	} else if (info == SEND_SIG_PRIV) {
+		q->info.si_signo = sig;
+		q->info.si_errno = 0;
+		q->info.si_code = SI_KERNEL;
+		q->info.si_pid = 0;
+		q->info.si_uid = 0;
+	} else {
+		copy_siginfo(&q->info, info);
+		if (from_ancestor_ns)
+			q->info.si_pid = 0;
+	}
+
+out_set:
+	signalfd_notify(t, sig);
+	sigaddset(&pending->signal, sig);
+	complete_signal(sig, t, group);
+	return 0;
+}
+
+/*
+ * Compute the from_ancestor_ns flag for the sender and pass the signal on
+ * to __send_signal().  Without CONFIG_PID_NS the flag is always 0.
+ */
+static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
+			int group)
+{
+	int from_ancestor_ns = 0;
+
+#ifdef CONFIG_PID_NS
+	/*
+	 * Set for a user-originated signal when task_pid_nr_ns() yields 0
+	 * for current in the active pid namespace of @t.
+	 */
+	if (si_fromuser(info) &&
+	    !task_pid_nr_ns(current, task_active_pid_ns(t)))
+		from_ancestor_ns = 1;
+#endif
+
+	return __send_signal(sig, info, t, group, from_ancestor_ns);
+}
+
+/*
+ * Log a fatal signal for current: its name and pid, on i386 (outside UML)
+ * up to 16 bytes readable at regs->ip, and finally the register dump.
+ */
+static void print_fatal_signal(struct pt_regs *regs, int signr)
+{
+	printk("%s/%d: potentially unexpected fatal signal %d.\n",
+	       current->comm, task_pid_nr(current), signr);
+
+#if defined(__i386__) && !defined(__arch_um__)
+	printk("code at %08lx: ", regs->ip);
+	{
+		int off;
+
+		/* Stop at the first byte that cannot be read from user space. */
+		for (off = 0; off < 16; off++) {
+			unsigned char byte;
+
+			if (get_user(byte, (unsigned char *)(regs->ip + off)))
+				break;
+			printk("%02x ", byte);
+		}
+	}
+#endif
+	printk("\n");
+	preempt_disable();
+	show_regs(regs);
+	preempt_enable();
+}
+
+/* Parse the "print-fatal-signals=" boot option into print_fatal_signals. */
+static int __init setup_print_fatal_signals(char *str)
+{
+	get_option(&str, &print_fatal_signals);
+
+	return 1;
+}
+
+__setup("print-fatal-signals=", setup_print_fatal_signals);
+
+/* Send SIG to @p through send_signal() with the group flag set. */
+int __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+	return send_signal(sig, info, p, 1);
+}
+
+/* Send SIG to @t through send_signal() with the group flag clear. */
+static int specific_send_sig_info(int sig, struct siginfo *info,
+				  struct task_struct *t)
+{
+	return send_signal(sig, info, t, 0);
+}
+
+/*
+ * Take the sighand lock of @p and send SIG under it.
+ *
+ * Returns -ESRCH when the lock could not be taken, otherwise whatever
+ * send_signal() returned.
+ */
+int do_send_sig_info(int sig, struct siginfo *info, struct task_struct *p,
+			bool group)
+{
+	unsigned long flags;
+	int ret;
+
+	if (!lock_task_sighand(p, &flags))
+		return -ESRCH;
+
+	ret = send_signal(sig, info, p, group);
+	unlock_task_sighand(p, &flags);
+
+	return ret;
+}
+
+/*
+ * Force a signal that the process can't ignore: if necessary
+ * we unblock the signal and change any SIG_IGN to SIG_DFL.
+ *
+ * Note: If we unblock the signal, we always reset it to SIG_DFL,
+ * since we do not want to have a signal handler that was blocked
+ * be invoked when user space had explicitly blocked it.
+ *
+ * We don't want to have recursive SIGSEGV's etc, for example,
+ * that is why we also clear SIGNAL_UNKILLABLE.
+ *
+ * Takes t->sighand->siglock itself and returns the result of
+ * specific_send_sig_info().
+ */
+int
+force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
+{
+	struct k_sigaction *ka;
+	unsigned long irqflags;
+	int was_blocked, was_ignored, rc;
+
+	spin_lock_irqsave(&t->sighand->siglock, irqflags);
+
+	ka = &t->sighand->action[sig - 1];
+	was_ignored = (ka->sa.sa_handler == SIG_IGN);
+	was_blocked = sigismember(&t->blocked, sig);
+
+	/* Either obstacle resets the disposition to the default action. */
+	if (was_ignored || was_blocked)
+		ka->sa.sa_handler = SIG_DFL;
+	if (was_blocked) {
+		sigdelset(&t->blocked, sig);
+		recalc_sigpending_and_wake(t);
+	}
+
+	/* With the default action in place the task must be killable. */
+	if (ka->sa.sa_handler == SIG_DFL)
+		t->signal->flags &= ~SIGNAL_UNKILLABLE;
+
+	rc = specific_send_sig_info(sig, info, t);
+
+	spin_unlock_irqrestore(&t->sighand->siglock, irqflags);
+
+	return rc;
+}
+
+/*
+ * Nuke all other threads in the group.
+ */
+int zap_other_threads(struct task_struct *p)
+{
+	struct task_struct *t = p;
+	int nr_others = 0;
+
+	p->signal->group_stop_count = 0;
+
+	/* Every other thread is counted, but only live ones are killed. */
+	while_each_thread(p, t) {
+		task_clear_group_stop_pending(t);
+		nr_others++;
+
+		if (!t->exit_state) {
+			sigaddset(&t->pending.signal, SIGKILL);
+			signal_wake_up(t, 1);
+		}
+	}
+
+	return nr_others;
+}
+
+/*
+ * Lock the sighand of @tsk.
+ *
+ * On success the sighand is returned with its siglock held and the
+ * interrupt state saved in *flags.  NULL is returned, with interrupts
+ * restored, when @tsk has no sighand.  The lookup is repeated whenever
+ * tsk->sighand changed between reading it and taking the lock.
+ */
+struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
+					unsigned long *flags)
+{
+	struct sighand_struct *sh;
+
+	for (;;) {
+		local_irq_save(*flags);
+		rcu_read_lock();
+		sh = rcu_dereference(tsk->sighand);
+		if (unlikely(!sh)) {
+			rcu_read_unlock();
+			local_irq_restore(*flags);
+			return NULL;
+		}
+
+		spin_lock(&sh->siglock);
+		if (likely(sh == tsk->sighand)) {
+			rcu_read_unlock();
+			return sh;
+		}
+
+		/* The sighand was switched under us: drop it and retry. */
+		spin_unlock(&sh->siglock);
+		rcu_read_unlock();
+		local_irq_restore(*flags);
+	}
+}
+
+/*
+ * send signal info to all the members of a group
+ *
+ * The permission check runs first; the signal is only sent when it passes
+ * and @sig is non-zero.
+ */
+int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+	int ret;
+
+	rcu_read_lock();
+	ret = check_kill_permission(sig, info, p);
+	rcu_read_unlock();
+
+	if (ret || !sig)
+		return ret;
+
+	return do_send_sig_info(sig, info, p, true);
+}
+
+/*
+ * __kill_pgrp_info() sends a signal to a process group: this is what the tty
+ * control characters do (^C, ^Z etc)
+ * - the caller must hold at least a readlock on tasklist_lock
+ *
+ * Returns 0 if at least one send succeeded, otherwise the error of the
+ * last attempt, or -ESRCH when the group had no task at all.
+ */
+int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp)
+{
+	struct task_struct *p = NULL;
+	int retval = -ESRCH;
+	int success = 0;
+
+	do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
+		int err = group_send_sig_info(sig, info, p);
+
+		if (!err)
+			success = 1;
+		retval = err;
+	} while_each_pid_task(pgrp, PIDTYPE_PGID, p);
+
+	if (success)
+		return 0;
+	return retval;
+}
+
+/*
+ * Send SIG to the task identified by @pid.
+ *
+ * Returns -ESRCH when @pid maps to no task, otherwise the result of
+ * group_send_sig_info().
+ */
+int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
+{
+	int error = -ESRCH;
+	struct task_struct *p;
+
+	rcu_read_lock();
+	for (;;) {
+		p = pid_task(pid, PIDTYPE_PID);
+		if (!p)
+			break;
+
+		error = group_send_sig_info(sig, info, p);
+		/*
+		 * -ESRCH means the task was unhashed in between, so try
+		 * again.  If it is dead, pid_task() will return NULL,
+		 * if we race with de_thread() it will find the
+		 * new leader.
+		 */
+		if (likely(error != -ESRCH))
+			break;
+	}
+	rcu_read_unlock();
+
+	return error;
+}
+
+/* Look @pid up with find_vpid() under RCU and signal it via kill_pid_info(). */
+int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+{
+	int error;
+
+	rcu_read_lock();
+	error = kill_pid_info(sig, info, find_vpid(pid));
+	rcu_read_unlock();
+
+	return error;
+}
+
+/*
+ * like kill_pid_info(), but doesn't use uid/euid of "current"
+ *
+ * Returns -EINVAL for an invalid signal number, -ESRCH when the task is
+ * gone or its sighand cannot be locked, -EPERM when a user-originated
+ * signal matches none of the target's uid/suid, or the error reported by
+ * security_task_kill().  A zero @sig only performs the checks.
+ */
+int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
+		      uid_t uid, uid_t euid, u32 secid)
+{
+	const struct cred *pcred;
+	struct task_struct *p;
+	unsigned long flags;
+	int ret;
+
+	if (!valid_signal(sig))
+		return -EINVAL;
+
+	rcu_read_lock();
+
+	ret = -ESRCH;
+	p = pid_task(pid, PIDTYPE_PID);
+	if (!p)
+		goto out_unlock;
+
+	pcred = __task_cred(p);
+	ret = -EPERM;
+	if (si_fromuser(info) &&
+	    euid != pcred->suid && euid != pcred->uid &&
+	    uid != pcred->suid && uid != pcred->uid)
+		goto out_unlock;
+
+	ret = security_task_kill(p, info, sig, secid);
+	if (ret || !sig)
+		goto out_unlock;
+
+	if (!lock_task_sighand(p, &flags)) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+	ret = __send_signal(sig, info, p, 1, 0);
+	unlock_task_sighand(p, &flags);
+
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
+
+/*
+ * kill_something_info() interprets pid in interesting ways just like kill(2).
+ *
+ * POSIX specifies that kill(-1,sig) is unspecified, but what we have
+ * is probably wrong.  Should make it like BSD or SYSV.
+ */
+
+static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
+{
+	int ret;
+
+	/* pid > 0: exactly one process, no tasklist_lock needed. */
+	if (pid > 0) {
+		rcu_read_lock();
+		ret = kill_pid_info(sig, info, find_vpid(pid));
+		rcu_read_unlock();
+		return ret;
+	}
+
+	read_lock(&tasklist_lock);
+	if (pid == -1) {
+		/*
+		 * pid == -1: every process except those whose virtual pid
+		 * is 1 or lower and the caller's own thread group.
+		 */
+		struct task_struct *p;
+		int retval = 0;
+		int count = 0;
+
+		for_each_process(p) {
+			int err;
+
+			if (task_pid_vnr(p) <= 1 ||
+			    same_thread_group(p, current))
+				continue;
+
+			err = group_send_sig_info(sig, info, p);
+			++count;
+			/* -EPERM from one target never becomes the result. */
+			if (err != -EPERM)
+				retval = err;
+		}
+		ret = count ? retval : -ESRCH;
+	} else {
+		/* pid == 0: the caller's own group; pid < -1: group -pid. */
+		struct pid *pgrp;
+
+		pgrp = pid ? find_vpid(-pid) : task_pgrp(current);
+		ret = __kill_pgrp_info(sig, info, pgrp);
+	}
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+
+/*
+ * These are for backward compatibility with the rest of the kernel source.
+ */
+
+int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+{
+	/*
+	 * Legacy kernel users may pass bad values; the normal paths
+	 * validate the number in check_kill_permission() instead.
+	 */
+	if (valid_signal(sig))
+		return do_send_sig_info(sig, info, p, false);
+
+	return -EINVAL;
+}
+
+/* Map a "priv" flag onto the matching SEND_SIG_* siginfo marker. */
+#define __si_special(priv) \
+	((priv) ? SEND_SIG_PRIV : SEND_SIG_NOINFO)
+
+/* Send SIG to @p using the siginfo marker selected by @priv. */
+int
+send_sig(int sig, struct task_struct *p, int priv)
+{
+	struct siginfo *marker = __si_special(priv);
+
+	return send_sig_info(sig, marker, p);
+}
+
+/* Force SIG on @p with SEND_SIG_PRIV; the result is discarded. */
+void
+force_sig(int sig, struct task_struct *p)
+{
+	struct siginfo *marker = SEND_SIG_PRIV;
+
+	force_sig_info(sig, marker, p);
+}
+
+/*
+ * When things go south during signal handling, we
+ * will force a SIGSEGV. And if the signal that caused
+ * the problem was already a SIGSEGV, we'll want to
+ * make sure we don't even try to deliver the signal..
+ */
+int
+force_sigsegv(int sig, struct task_struct *p)
+{
+	if (sig == SIGSEGV) {
+		unsigned long flags;
+
+		/* Drop any SIGSEGV handler so the forced signal is not caught. */
+		spin_lock_irqsave(&p->sighand->siglock, flags);
+		p->sighand->action[sig - 1].sa.sa_handler = SIG_DFL;
+		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	}
+	force_sig(SIGSEGV, p);
+	return 0;
+}
+
+/*
+ * Send SIG to the process group @pid under a tasklist_lock read lock.
+ * @priv selects SEND_SIG_PRIV over SEND_SIG_NOINFO.
+ */
+int kill_pgrp(struct pid *pid, int sig, int priv)
+{
+	int ret;
+
+	read_lock(&tasklist_lock);
+	ret = __kill_pgrp_info(sig, __si_special(priv), pid);
+	read_unlock(&tasklist_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(kill_pgrp);
+
+/* Send SIG to @pid; @priv selects SEND_SIG_PRIV over SEND_SIG_NOINFO. */
+int kill_pid(struct pid *pid, int sig, int priv)
+{
+	return kill_pid_info(sig, __si_special(priv), pid);
+}
+EXPORT_SYMBOL(kill_pid);
+
+/*
+ * These functions support sending signals using preallocated sigqueue
+ * structures.  This is needed "because realtime applications cannot
+ * afford to lose notifications of asynchronous events, like timer
+ * expirations or I/O completions".  In the case of POSIX Timers
+ * we allocate the sigqueue structure from the timer_create.  If this
+ * allocation fails we are able to report the failure to the application
+ * with an EAGAIN error.
+ */
+struct sigqueue *sigqueue_alloc(void)
+{
+	struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0);
+
+	if (q)
+		q->flags |= SIGQUEUE_PREALLOC;
+
+	return q;
+}
+
+/*
+ * Release a preallocated sigqueue.  An entry that is still queued only
+ * loses its SIGQUEUE_PREALLOC flag here and is left on its list.
+ */
+void sigqueue_free(struct sigqueue *q)
+{
+	unsigned long flags;
+	spinlock_t *lock = &current->sighand->siglock;
+
+	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+	/*
+	 * We must hold ->siglock while testing q->list
+	 * to serialize with collect_signal() or with
+	 * __exit_signal()->flush_sigqueue().
+	 */
+	spin_lock_irqsave(lock, flags);
+	q->flags &= ~SIGQUEUE_PREALLOC;
+	/*
+	 * If it is queued it will be freed when dequeued,
+	 * like the "regular" sigqueue.
+	 */
+	if (!list_empty(&q->list))
+		q = NULL;
+	spin_unlock_irqrestore(lock, flags);
+
+	if (q)
+		__sigqueue_free(q);
+}
+
+/*
+ * Queue the preallocated entry @q for @t, on the shared queue when @group
+ * is non-zero and on the per-thread queue otherwise.
+ *
+ * Returns -1 when the sighand of @t could not be locked, 1 when the
+ * signal is ignored, and 0 when it was queued or, being queued already,
+ * had its overrun count incremented.
+ */
+int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
+{
+	int sig = q->info.si_signo;
+	struct sigpending *pending;
+	unsigned long flags;
+	int ret;
+
+	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
+
+	ret = -1;
+	if (!likely(lock_task_sighand(t, &flags)))
+		goto ret;
+
+	ret = 1; /* the signal is ignored */
+	if (!prepare_signal(sig, t, 0))
+		goto out;
+
+	ret = 0;
+	if (unlikely(!list_empty(&q->list))) {
+		/*
+		 * If an SI_TIMER entry is already queued just increment
+		 * the overrun count.
+		 */
+		BUG_ON(q->info.si_code != SI_TIMER);
+		q->info.si_overrun++;
+		goto out;
+	}
+	q->info.si_overrun = 0;
+
+	signalfd_notify(t, sig);
+	pending = group ? &t->signal->shared_pending : &t->pending;
+	list_add_tail(&q->list, &pending->list);
+	sigaddset(&pending->signal, sig);
+	complete_signal(sig, t, group);
+out:
+	unlock_task_sighand(t, &flags);
+ret:
+	return ret;
+}
+
+/*
+ * Let a parent know about the death of a child.
+ * For a stopped/continued status change, use do_notify_parent_cldstop instead.
+ *
+ * Returns -1 if our parent ignored us and so we've switched to
+ * self-reaping, or else @sig.
+ */
+int do_notify_parent(struct task_struct *tsk, int sig)
+{
+	struct siginfo info;
+	unsigned long flags;
+	struct sighand_struct *psig;
+	int ret = sig;
+
+	BUG_ON(sig == -1);
+
+	/* do_notify_parent_cldstop should have been called instead.  */
+	BUG_ON(task_is_stopped_or_traced(tsk));
+
+	BUG_ON(!task_ptrace(tsk) &&
+	       (tsk->group_leader != tsk || !thread_group_empty(tsk)));
+
+	info.si_signo = sig;
+	info.si_errno = 0;
+	/*
+	 * we are under tasklist_lock here so our parent is tied to
+	 * us and cannot exit and release its namespace.
+	 *
+	 * the only way it can is to switch its nsproxy with sys_unshare,
+	 * but unsharing pid namespaces is not allowed, so we'll always
+	 * see relevant namespace
+	 *
+	 * write_lock() currently calls preempt_disable() which is the
+	 * same as rcu_read_lock(), but according to Oleg, this is not
+	 * correct to rely on this
+	 */
+	rcu_read_lock();
+	info.si_pid = task_pid_nr_ns(tsk, tsk->parent->nsproxy->pid_ns);
+	info.si_uid = __task_cred(tsk)->uid;
+	rcu_read_unlock();
+
+	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
+				tsk->signal->utime));
+	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
+				tsk->signal->stime));
+
+	/*
+	 * Decode exit_code: bit 0x80 reports a dump, the low seven bits a
+	 * killing signal, and otherwise the exit status sits above bit 8.
+	 */
+	info.si_status = tsk->exit_code & 0x7f;
+	if (tsk->exit_code & 0x80)
+		info.si_code = CLD_DUMPED;
+	else if (tsk->exit_code & 0x7f)
+		info.si_code = CLD_KILLED;
+	else {
+		info.si_code = CLD_EXITED;
+		info.si_status = tsk->exit_code >> 8;
+	}
+
+	psig = tsk->parent->sighand;
+	spin_lock_irqsave(&psig->siglock, flags);
+	if (!task_ptrace(tsk) && sig == SIGCHLD &&
+	    (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+	     (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT))) {
+		/*
+		 * We are exiting and our parent doesn't care.  POSIX.1
+		 * defines special semantics for setting SIGCHLD to SIG_IGN
+		 * or setting the SA_NOCLDWAIT flag: we should be reaped
+		 * automatically and not left for our parent's wait4 call.
+		 * Rather than having the parent do it as a magic kind of
+		 * signal handler, we just set this to tell do_exit that we
+		 * can be cleaned up without becoming a zombie.  Note that
+		 * we still call __wake_up_parent in this case, because a
+		 * blocked sys_wait4 might now return -ECHILD.
+		 *
+		 * Whether we send SIGCHLD or not for SA_NOCLDWAIT
+		 * is implementation-defined: we do (if you don't want
+		 * it, just use SIG_IGN instead).
+		 */
+		ret = tsk->exit_signal = -1;
+		if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
+			sig = -1;
+	}
+	if (valid_signal(sig) && sig > 0)
+		__group_send_sig_info(sig, &info, tsk->parent);
+	__wake_up_parent(tsk, tsk->parent);
+	spin_unlock_irqrestore(&psig->siglock, flags);
+
+	return ret;
+}
+
+/**
+ * do_notify_parent_cldstop - notify parent of stopped/continued state change
+ * @tsk: task reporting the state change
+ * @for_ptracer: the notification is for ptracer
+ * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
+ *
+ * Notify @tsk's parent that the stopped/continued state has changed.  If
+ * @for_ptracer is %false, @tsk's group leader notifies to its real parent.
+ * If %true, @tsk reports to @tsk->parent which should be the ptracer.
+ *
+ * CONTEXT:
+ * Must be called with tasklist_lock at least read locked.
+ */
+static void do_notify_parent_cldstop(struct task_struct *tsk,
+				     bool for_ptracer, int why)
+{
+	struct siginfo info;
+	unsigned long flags;
+	struct task_struct *parent;
+	struct sighand_struct *sighand;
+
+	if (for_ptracer) {
+		parent = tsk->parent;
+	} else {
+		tsk = tsk->group_leader;
+		parent = tsk->real_parent;
+	}
+
+	info.si_signo = SIGCHLD;
+	info.si_errno = 0;
+	/*
+	 * see comment in do_notify_parent() about the following 4 lines
+	 */
+	rcu_read_lock();
+	info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
+	info.si_uid = __task_cred(tsk)->uid;
+	rcu_read_unlock();
+
+	info.si_utime = cputime_to_clock_t(tsk->utime);
+	info.si_stime = cputime_to_clock_t(tsk->stime);
+
+	info.si_code = why;
+	switch (why) {
+	case CLD_CONTINUED:
+		info.si_status = SIGCONT;
+		break;
+	case CLD_STOPPED:
+		info.si_status = tsk->signal->group_exit_code & 0x7f;
+		break;
+	case CLD_TRAPPED:
+		info.si_status = tsk->exit_code & 0x7f;
+		break;
+	default:
+		BUG();
+	}
+
+	sighand = parent->sighand;
+	spin_lock_irqsave(&sighand->siglock, flags);
+	if (sighand->action[SIGCHLD-1].sa.sa_handler != SIG_IGN &&
+	    !(sighand->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDSTOP))
+		__group_send_sig_info(SIGCHLD, &info, parent);
+	/*
+	 * Even if SIGCHLD is not generated, we must wake up wait4 calls.
+	 */
+	__wake_up_parent(tsk, parent);
+	spin_unlock_irqrestore(&sighand->siglock, flags);
+}
+
+/*
+ * Tell whether current may enter a ptrace stop: returns 0 when it is not
+ * ptraced or when stopping would deadlock a core dump, and 1 otherwise.
+ */
+static inline int may_ptrace_stop(void)
+{
+	if (!likely(task_ptrace(current)))
+		return 0;
+	/*
+	 * Are we in the middle of do_coredump?
+	 * If so and our tracer is also part of the coredump stopping
+	 * is a deadlock situation, and pointless because our tracer
+	 * is dead so don't allow us to stop.
+	 * If SIGKILL was already sent before the caller unlocked
+	 * ->siglock we must see ->core_state != NULL. Otherwise it
+	 * is safe to enter schedule().
+	 */
+	if (unlikely(current->mm->core_state) &&
+	    unlikely(current->mm == current->parent->mm))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Return non-zero if there is a SIGKILL that should be waking us up.
+ * Called with the siglock held.
+ */
+static int sigkill_pending(struct task_struct *tsk)
+{
+	return	sigismember(&tsk->pending.signal, SIGKILL) ||
+		sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
+}
+
+/*
+ * Test whether the target task of the usual cldstop notification - the
+ * real_parent of @child - is in the same group as the ptracer.
+ */
+static bool real_parent_is_ptracer(struct task_struct *child)
+{
+	return same_thread_group(child->parent, child->real_parent);
+}
+
+/*
+ * This must be called with current->sighand->siglock held.
+ *
+ * This should be the path for all ptrace stops.
+ * We always set current->last_siginfo while stopped here.
+ * That makes it a way to test a stopped process for
+ * being ptrace-stopped vs being job-control-stopped.
+ *
+ * If we actually decide not to stop at all because the tracer
+ * is gone, we keep current->exit_code unless clear_code.
+ */
+static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
+	__releases(&current->sighand->siglock)
+	__acquires(&current->sighand->siglock)
+{
+	bool gstop_done = false;
+
+	if (arch_ptrace_stop_needed(exit_code, info)) {
+		/*
+		 * The arch code has something special to do before a
+		 * ptrace stop.  This is allowed to block, e.g. for faults
+		 * on user stack pages.  We can't keep the siglock while
+		 * calling arch_ptrace_stop, so we must release it now.
+		 * To preserve proper semantics, we must do this before
+		 * any signal bookkeeping like checking group_stop_count.
+		 * Meanwhile, a SIGKILL could come in before we retake the
+		 * siglock.  That must prevent us from sleeping in TASK_TRACED.
+		 * So after regaining the lock, we must check for SIGKILL.
+		 */
+		spin_unlock_irq(&current->sighand->siglock);
+		arch_ptrace_stop(exit_code, info);
+		spin_lock_irq(&current->sighand->siglock);
+		if (sigkill_pending(current))
+			return;
+	}
+
+	/*
+	 * If @why is CLD_STOPPED, we're trapping to participate in a group
+	 * stop.  Do the bookkeeping.  Note that if SIGCONT was delivered
+	 * while siglock was released for the arch hook, PENDING could be
+	 * clear now.  We act as if SIGCONT is received after TASK_TRACED
+	 * is entered - ignore it.
+	 */
+	if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
+		gstop_done = task_participate_group_stop(current);
+
+	current->last_siginfo = info;
+	current->exit_code = exit_code;
+
+	/*
+	 * TRACED should be visible before TRAPPING is cleared; otherwise,
+	 * the tracer might fail do_wait().
+	 */
+	set_current_state(TASK_TRACED);
+
+	/*
+	 * We're committing to trapping.  Clearing GROUP_STOP_TRAPPING and
+	 * transition to TASK_TRACED should be atomic with respect to
+	 * siglock.  This should be done after the arch hook as siglock is
+	 * released and regrabbed across it.
+	 */
+	task_clear_group_stop_trapping(current);
+
+	spin_unlock_irq(&current->sighand->siglock);
+	read_lock(&tasklist_lock);
+	if (may_ptrace_stop()) {
+		/*
+		 * Notify parents of the stop.
+		 *
+		 * While ptraced, there are two parents - the ptracer and
+		 * the real_parent of the group_leader.  The ptracer should
+		 * know about every stop while the real parent is only
+		 * interested in the completion of group stop.  The states
+		 * for the two don't interact with each other.  Notify
+		 * separately unless they're gonna be duplicates.
+		 */
+		do_notify_parent_cldstop(current, true, why);
+		if (gstop_done && !real_parent_is_ptracer(current))
+			do_notify_parent_cldstop(current, false, why);
+
+		/*
+		 * Don't want to allow preemption here, because
+		 * sys_ptrace() needs this task to be inactive.
+		 *
+		 * XXX: implement read_unlock_no_resched().
+		 */
+		preempt_disable();
+		read_unlock(&tasklist_lock);
+		preempt_enable_no_resched();
+		schedule();
+	} else {
+		/*
+		 * By the time we got the lock, our tracer went away.
+		 * Don't drop the lock yet, another tracer may come.
+		 *
+		 * If @gstop_done, the ptracer went away between group stop
+		 * completion and here.  During detach, it would have set
+		 * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
+		 * in do_signal_stop() on return, so notifying the real
+		 * parent of the group stop completion is enough.
+		 */
+		if (gstop_done)
+			do_notify_parent_cldstop(current, false, why);
+
+		__set_current_state(TASK_RUNNING);
+		if (clear_code)
+			current->exit_code = 0;
+		read_unlock(&tasklist_lock);
+	}
+
+	/*
+	 * While in TASK_TRACED, we were considered "frozen enough".
+	 * Now that we woke up, it's crucial if we're supposed to be
+	 * frozen that we freeze now before running anything substantial.
+	 */
+	try_to_freeze();
+
+	/*
+	 * We are back.  Now reacquire the siglock before touching
+	 * last_siginfo, so that we are sure to have synchronized with
+	 * any signal-sending on another CPU that wants to examine it.
+	 */
+	spin_lock_irq(&current->sighand->siglock);
+	current->last_siginfo = NULL;
+
+	/*
+	 * Queued signals ignored us while we were stopped for tracing.
+	 * So check for any that we should take before resuming user mode.
+	 * This sets TIF_SIGPENDING, but never clears it.
+	 */
+	recalc_sigpending_tsk(current);
+}
+
+/*
+ * Report a SIGTRAP-based ptrace event for current.  @exit_code must have
+ * SIGTRAP in its low seven bits and nothing set above bit 15; it is also
+ * stored as the si_code of the generated siginfo.
+ */
+void ptrace_notify(int exit_code)
+{
+	siginfo_t info;
+
+	BUG_ON((exit_code & (0x7f | ~0xffff)) != SIGTRAP);
+
+	memset(&info, 0, sizeof info);
+	info.si_signo = SIGTRAP;
+	info.si_code = exit_code;
+	info.si_pid = task_pid_vnr(current);
+	info.si_uid = current_uid();
+
+	/* Let the debugger run.  */
+	spin_lock_irq(&current->sighand->siglock);
+	ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/*
+ * This performs the stopping for SIGSTOP and other stop signals.
+ * We have to stop all threads in the thread group.
+ * Returns non-zero if we've actually stopped and released the siglock.
+ * Returns zero if we didn't stop and still hold the siglock.
+ */
+static int do_signal_stop(int signr)
+{
+	struct signal_struct *sig = current->signal;
+
+	if (!(current->group_stop & GROUP_STOP_PENDING)) {
+		unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
+		struct task_struct *t;
+
+		/* signr will be recorded in task->group_stop for retries */
+		WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
+
+		if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
+		    unlikely(signal_group_exit(sig)))
+			return 0;
+		/*
+		 * There is no group stop already in progress.  We must
+		 * initiate one now.
+		 *
+		 * While ptraced, a task may be resumed while group stop is
+		 * still in effect and then receive a stop signal and
+		 * initiate another group stop.  This deviates from the
+		 * usual behavior as two consecutive stop signals can't
+		 * cause two group stops when !ptraced.  That is why we
+		 * also check !task_is_stopped(t) below.
+		 *
+		 * The condition can be distinguished by testing whether
+		 * SIGNAL_STOP_STOPPED is already set.  Don't generate
+		 * group_exit_code in such case.
+		 *
+		 * This is not necessary for SIGNAL_STOP_CONTINUED because
+		 * an intervening stop signal is required to cause two
+		 * continued events regardless of ptrace.
+		 */
+		if (!(sig->flags & SIGNAL_STOP_STOPPED))
+			sig->group_exit_code = signr;
+
+		current->group_stop &= ~GROUP_STOP_SIGMASK;
+		current->group_stop |= signr | gstop;
+		sig->group_stop_count = 1;
+		for (t = next_thread(current); t != current;
+		     t = next_thread(t)) {
+			/*
+			 * Setting state to TASK_STOPPED for a group
+			 * stop is always done with the siglock held,
+			 * so this check has no races.
+			 */
+			if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
+				t->group_stop &= ~GROUP_STOP_SIGMASK;
+				t->group_stop |= signr | gstop;
+				sig->group_stop_count++;
+				signal_wake_up(t, 0);
+			}
+		}
+	}
+retry:
+	if (likely(!task_ptrace(current))) {
+		int notify = 0;
+
+		/*
+		 * If there are no other threads in the group, or if there
+		 * is a group stop in progress and we are the last to stop,
+		 * report to the parent.
+		 */
+		if (task_participate_group_stop(current))
+			notify = CLD_STOPPED;
+
+		__set_current_state(TASK_STOPPED);
+		spin_unlock_irq(&current->sighand->siglock);
+
+		/*
+		 * Notify the parent of the group stop completion.  Because
+		 * we're not holding either the siglock or tasklist_lock
+		 * here, ptracer may attach in between; however, this is for
+		 * group stop and should always be delivered to the real
+		 * parent of the group leader.  The new ptracer will get
+		 * its notification when this task transitions into
+		 * TASK_TRACED.
+		 */
+		if (notify) {
+			read_lock(&tasklist_lock);
+			do_notify_parent_cldstop(current, false, notify);
+			read_unlock(&tasklist_lock);
+		}
+
+		/* Now we don't run again until woken by SIGCONT or SIGKILL */
+		schedule();
+
+		spin_lock_irq(&current->sighand->siglock);
+	} else {
+		ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
+			    CLD_STOPPED, 0, NULL);
+		current->exit_code = 0;
+	}
+
+	/*
+	 * GROUP_STOP_PENDING could be set if another group stop has
+	 * started since being woken up or ptrace wants us to transit
+	 * between TASK_STOPPED and TRACED.  Retry group stop.
+	 */
+	if (current->group_stop & GROUP_STOP_PENDING) {
+		WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
+		goto retry;
+	}
+
+	/* PTRACE_ATTACH might have raced with task killing, clear trapping */
+	task_clear_group_stop_trapping(current);
+
+	spin_unlock_irq(&current->sighand->siglock);
+
+	tracehook_finish_jctl();
+
+	return 1;
+}
+
+/*
+ * Give a ptracer the chance to inspect, replace or cancel the dequeued
+ * signal @signr.
+ *
+ * Returns @signr unchanged when current is not ptraced.  Otherwise returns
+ * the signal left in current->exit_code after the ptrace stop, or 0 when
+ * that is 0 or when the resulting signal is blocked and was requeued.
+ */
+static int ptrace_signal(int signr, siginfo_t *info,
+			 struct pt_regs *regs, void *cookie)
+{
+	if (!task_ptrace(current))
+		return signr;
+
+	ptrace_signal_deliver(regs, cookie);
+
+	/* Let the debugger run.  */
+	ptrace_stop(signr, CLD_TRAPPED, 0, info);
+
+	/* We're back.  Did the debugger cancel the sig?  */
+	signr = current->exit_code;
+	if (signr == 0)
+		return signr;
+
+	current->exit_code = 0;
+
+	/*
+	 * Update the siginfo structure if the signal has
+	 * changed.  If the debugger wanted something
+	 * specific in the siginfo structure then it should
+	 * have updated *info via PTRACE_SETSIGINFO.
+	 */
+	if (signr != info->si_signo) {
+		info->si_signo = signr;
+		info->si_errno = 0;
+		info->si_code = SI_USER;
+		info->si_pid = task_pid_vnr(current->parent);
+		info->si_uid = task_uid(current->parent);
+	}
+
+	/* If the (new) signal is now blocked, requeue it.  */
+	if (sigismember(&current->blocked, signr)) {
+		specific_send_sig_info(signr, info, current);
+		signr = 0;
+	}
+
+	return signr;
+}
+
+/*
+ * Fetch the next signal current has to handle.
+ *
+ * Signals that are ignored, and stop signals, are dealt with inside the
+ * loop.  A signal whose default action is fatal ends in do_group_exit().
+ * Returns the number of a signal that has a handler, with its action
+ * copied to @return_ka, or 0 when nothing is left to deliver.
+ */
+int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka,
+			  struct pt_regs *regs, void *cookie)
+{
+	struct sighand_struct *sighand = current->sighand;
+	struct signal_struct *signal = current->signal;
+	int signr;
+
+relock:
+	/*
+	 * We'll jump back here after any time we were stopped in TASK_STOPPED.
+	 * While in TASK_STOPPED, we were considered "frozen enough".
+	 * Now that we woke up, it's crucial if we're supposed to be
+	 * frozen that we freeze now before running anything substantial.
+	 */
+	try_to_freeze();
+
+	spin_lock_irq(&sighand->siglock);
+	/*
+	 * Every stopped thread goes here after wakeup.  Check to see if
+	 * we should notify the parent, prepare_signal(SIGCONT) encodes
+	 * the CLD_ si_code into SIGNAL_CLD_MASK bits.
+	 */
+	if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
+		struct task_struct *leader;
+		int why;
+
+		if (signal->flags & SIGNAL_CLD_CONTINUED)
+			why = CLD_CONTINUED;
+		else
+			why = CLD_STOPPED;
+
+		signal->flags &= ~SIGNAL_CLD_MASK;
+
+		spin_unlock_irq(&sighand->siglock);
+
+		/*
+		 * Notify the parent that we're continuing.  This event is
+		 * always per-process and doesn't make whole lot of sense
+		 * for ptracers, who shouldn't consume the state via
+		 * wait(2) either, but, for backward compatibility, notify
+		 * the ptracer of the group leader too unless it's gonna be
+		 * a duplicate.
+		 */
+		read_lock(&tasklist_lock);
+
+		do_notify_parent_cldstop(current, false, why);
+
+		leader = current->group_leader;
+		if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
+			do_notify_parent_cldstop(leader, true, why);
+
+		read_unlock(&tasklist_lock);
+
+		goto relock;
+	}
+
+	for (;;) {
+		struct k_sigaction *ka;
+		/*
+		 * Tracing can induce an artificial signal and choose sigaction.
+		 * The return value in @signr determines the default action,
+		 * but @info->si_signo is the signal number we will report.
+		 */
+		signr = tracehook_get_signal(current, regs, info, return_ka);
+		if (unlikely(signr < 0))
+			goto relock;
+		if (unlikely(signr != 0))
+			ka = return_ka;
+		else {
+			if (unlikely(current->group_stop &
+				     GROUP_STOP_PENDING) && do_signal_stop(0))
+				goto relock;
+
+			signr = dequeue_signal(current, &current->blocked,
+					       info);
+
+			if (!signr)
+				break; /* will return 0 */
+
+			if (signr != SIGKILL) {
+				signr = ptrace_signal(signr, info,
+						      regs, cookie);
+				if (!signr)
+					continue;
+			}
+
+			ka = &sighand->action[signr-1];
+		}
+
+		/* Trace actually delivered signals. */
+		trace_signal_deliver(signr, info, ka);
+
+		if (ka->sa.sa_handler == SIG_IGN) /* Do nothing.  */
+			continue;
+		if (ka->sa.sa_handler != SIG_DFL) {
+			/* Run the handler.  */
+			*return_ka = *ka;
+
+			if (ka->sa.sa_flags & SA_ONESHOT)
+				ka->sa.sa_handler = SIG_DFL;
+
+			break; /* will return non-zero "signr" value */
+		}
+
+		/*
+		 * Now we are doing the default action for this signal.
+		 */
+		if (sig_kernel_ignore(signr)) /* Default is nothing. */
+			continue;
+
+		/*
+		 * Global init gets no signals it doesn't want.
+		 * Container-init gets no signals it doesn't want from same
+		 * container.
+		 *
+		 * Note that if global/container-init sees a sig_kernel_only()
+		 * signal here, the signal must have been generated internally
+		 * or must have come from an ancestor namespace. In either
+		 * case, the signal cannot be dropped.
+		 */
+		if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
+				!sig_kernel_only(signr))
+			continue;
+
+		if (sig_kernel_stop(signr)) {
+			/*
+			 * The default action is to stop all threads in
+			 * the thread group.  The job control signals
+			 * do nothing in an orphaned pgrp, but SIGSTOP
+			 * always works.  Note that siglock needs to be
+			 * dropped during the call to is_orphaned_pgrp()
+			 * because of lock ordering with tasklist_lock.
+			 * This allows an intervening SIGCONT to be posted.
+			 * We need to check for that and bail out if necessary.
+			 */
+			if (signr != SIGSTOP) {
+				spin_unlock_irq(&sighand->siglock);
+
+				/* signals can be posted during this window */
+
+				if (is_current_pgrp_orphaned())
+					goto relock;
+
+				spin_lock_irq(&sighand->siglock);
+			}
+
+			if (likely(do_signal_stop(info->si_signo))) {
+				/* It released the siglock.  */
+				goto relock;
+			}
+
+			/*
+			 * We didn't actually stop, due to a race
+			 * with SIGCONT or something like that.
+			 */
+			continue;
+		}
+
+		spin_unlock_irq(&sighand->siglock);
+
+		/*
+		 * Anything else is fatal, maybe with a core dump.
+		 */
+		current->flags |= PF_SIGNALED;
+
+		if (sig_kernel_coredump(signr)) {
+			if (print_fatal_signals)
+				print_fatal_signal(regs, info->si_signo);
+			/*
+			 * If it was able to dump core, this kills all
+			 * other threads in the group and synchronizes with
+			 * their demise.  If we lost the race with another
+			 * thread getting here, it set group_exit_code
+			 * first and our do_group_exit call below will use
+			 * that value and ignore the one we pass it.
+			 */
+			do_coredump(info->si_signo, info->si_signo, regs);
+		}
+
+		/*
+		 * Death signals, no core dump.
+		 */
+		do_group_exit(info->si_signo);
+		/* NOTREACHED */
+	}
+	spin_unlock_irq(&sighand->siglock);
+	return signr;
+}
+
+/*
+ * It could be that complete_signal() picked us to notify about the
+ * group-wide signal. Other threads should be notified now to take
+ * the shared signals in @which since we will not.
+ */
+static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
+{
+	sigset_t retarget;
+	struct task_struct *t;
+
+	/* Only signals that are both shared-pending and in @which matter. */
+	sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
+	if (sigisemptyset(&retarget))
+		return;
+
+	t = tsk;
+	while_each_thread(tsk, t) {
+		if (t->flags & PF_EXITING)
+			continue;
+
+		if (!has_pending_signals(&retarget, &t->blocked))
+			continue;
+		/* Remove the signals this thread can handle. */
+		sigandsets(&retarget, &retarget, &t->blocked);
+
+		if (!signal_pending(t))
+			signal_wake_up(t, 0);
+
+		if (sigisemptyset(&retarget))
+			break;
+	}
+}
+
+/*
+ * Mark @tsk as exiting.  In a multi-threaded group that is not already in
+ * a group exit, also hand shared pending signals it would have taken to
+ * other threads and account for a pending group stop.
+ */
+void exit_signals(struct task_struct *tsk)
+{
+	int group_stop = 0;
+	sigset_t unblocked;
+
+	if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
+		tsk->flags |= PF_EXITING;
+		return;
+	}
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	/*
+	 * From now this task is not visible for group-wide signals,
+	 * see wants_signal(), do_signal_stop().
+	 */
+	tsk->flags |= PF_EXITING;
+	if (!signal_pending(tsk))
+		goto out;
+
+	unblocked = tsk->blocked;
+	signotset(&unblocked);
+	retarget_shared_pending(tsk, &unblocked);
+
+	if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
+	    task_participate_group_stop(tsk))
+		group_stop = CLD_STOPPED;
+out:
+	spin_unlock_irq(&tsk->sighand->siglock);
+
+	/*
+	 * If group stop has completed, deliver the notification.  This
+	 * should always go to the real parent of the group leader.
+	 */
+	if (unlikely(group_stop)) {
+		read_lock(&tasklist_lock);
+		do_notify_parent_cldstop(tsk, false, group_stop);
+		read_unlock(&tasklist_lock);
+	}
+}
+
+EXPORT_SYMBOL(recalc_sigpending);
+EXPORT_SYMBOL_GPL(dequeue_signal);
+EXPORT_SYMBOL(flush_signals);
+EXPORT_SYMBOL(force_sig);
+EXPORT_SYMBOL(send_sig);
+EXPORT_SYMBOL(send_sig_info);
+EXPORT_SYMBOL(sigprocmask);
+EXPORT_SYMBOL(block_all_signals);
+EXPORT_SYMBOL(unblock_all_signals);
+
+
+/*
+ * System call entry points.
+ */
+
+/**
+ *  sys_restart_syscall - restart a system call
+ */
+SYSCALL_DEFINE0(restart_syscall)
+{
+	struct restart_block *restart = &current_thread_info()->restart_block;
+	return restart->fn(restart);
+}
+
+/* Restart handler that refuses to restart: always returns -EINTR. */
+long do_no_restart_syscall(struct restart_block *param)
+{
+	return -EINTR;
+}
+
+/*
+ * Install @newset as the blocked mask of @tsk.  The newly blocked set is
+ * computed against current->blocked and recalc_sigpending() is called
+ * without a task argument, so this presumably expects @tsk to be current
+ * (the caller visible here, set_current_blocked(), passes current).
+ */
+static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
+{
+	if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+		sigset_t newblocked;
+		/* A set of now blocked but previously unblocked signals. */
+		sigandnsets(&newblocked, newset, &current->blocked);
+		retarget_shared_pending(tsk, &newblocked);
+	}
+	tsk->blocked = *newset;
+	recalc_sigpending();
+}
+
+/**
+ * set_current_blocked - change current->blocked mask
+ * @newset: new mask
+ *
+ * It is wrong to change ->blocked directly, this helper should be used
+ * to ensure the process can't miss a shared signal we are going to block.
+ */
+void set_current_blocked(const sigset_t *newset)
+{
+	struct task_struct *tsk = current;
+
+	spin_lock_irq(&tsk->sighand->siglock);
+	__set_task_blocked(tsk, newset);
+	spin_unlock_irq(&tsk->sighand->siglock);
+}
+
+/*
+ * This is also useful for kernel threads that want to temporarily
+ * (or permanently) block certain signals.
+ *
+ * NOTE! Unlike the user-mode sys_sigprocmask(), the kernel
+ * interface happily blocks "unblockable" signals like SIGKILL
+ * and friends.
+ *
+ * Returns 0 on success and -EINVAL for an unknown @how.  The previous
+ * mask is stored in @oldset, when that is non-NULL, before @how is
+ * validated.
+ */
+int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
+{
+	struct task_struct *tsk = current;
+	sigset_t newset;
+
+	/* Lockless, only current can change ->blocked, never from irq */
+	if (oldset)
+		*oldset = tsk->blocked;
+
+	switch (how) {
+	case SIG_BLOCK:
+		sigorsets(&newset, &tsk->blocked, set);
+		break;
+	case SIG_UNBLOCK:
+		sigandnsets(&newset, &tsk->blocked, set);
+		break;
+	case SIG_SETMASK:
+		newset = *set;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	set_current_blocked(&newset);
+	return 0;
+}
+
+/**
+ *  sys_rt_sigprocmask - change the list of currently blocked signals
+ *  @how: whether to add, remove, or set signals
+ *  @nset: stores pending signals
+ *  @oset: previous value of signal mask if non-null
+ *  @sigsetsize: size of sigset_t type
+ */
+SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
+		sigset_t __user *, oset, size_t, sigsetsize)
+{
+	sigset_t old_set, new_set;
+	int error;
+
+	/* XXX: Don't preclude handling different sized sigset_t's.
*/ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + old_set = current->blocked; + + if (nset) { + if (copy_from_user(&new_set, nset, sizeof(sigset_t))) + return -EFAULT; + sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + error = sigprocmask(how, &new_set, NULL); + if (error) + return error; + } + + if (oset) { + if (copy_to_user(oset, &old_set, sizeof(sigset_t))) + return -EFAULT; + } + + return 0; +} + +long do_sigpending(void __user *set, unsigned long sigsetsize) +{ + long error = -EINVAL; + sigset_t pending; + + if (sigsetsize > sizeof(sigset_t)) + goto out; + + spin_lock_irq(¤t->sighand->siglock); + sigorsets(&pending, ¤t->pending.signal, + ¤t->signal->shared_pending.signal); + spin_unlock_irq(¤t->sighand->siglock); + + /* Outside the lock because only this thread touches it. */ + sigandsets(&pending, ¤t->blocked, &pending); + + error = -EFAULT; + if (!copy_to_user(set, &pending, sigsetsize)) + error = 0; + +out: + return error; +} + +/** + * sys_rt_sigpending - examine a pending signal that has been raised + * while blocked + * @set: stores pending signals + * @sigsetsize: size of sigset_t type or larger + */ +SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize) +{ + return do_sigpending(set, sigsetsize); +} + +#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER + +int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) +{ + int err; + + if (!access_ok (VERIFY_WRITE, to, sizeof(siginfo_t))) + return -EFAULT; + if (from->si_code < 0) + return __copy_to_user(to, from, sizeof(siginfo_t)) + ? -EFAULT : 0; + /* + * If you change siginfo_t structure, please be sure + * this code is fixed accordingly. + * Please remember to update the signalfd_copyinfo() function + * inside fs/signalfd.c too, in case siginfo_t changes. + * It should never copy any pad contained in the structure + * to avoid security leaks, but must copy the generic + * 3 ints plus the relevant union member. 
+ */ + err = __put_user(from->si_signo, &to->si_signo); + err |= __put_user(from->si_errno, &to->si_errno); + err |= __put_user((short)from->si_code, &to->si_code); + switch (from->si_code & __SI_MASK) { + case __SI_KILL: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + break; + case __SI_TIMER: + err |= __put_user(from->si_tid, &to->si_tid); + err |= __put_user(from->si_overrun, &to->si_overrun); + err |= __put_user(from->si_ptr, &to->si_ptr); + break; + case __SI_POLL: + err |= __put_user(from->si_band, &to->si_band); + err |= __put_user(from->si_fd, &to->si_fd); + break; + case __SI_FAULT: + err |= __put_user(from->si_addr, &to->si_addr); +#ifdef __ARCH_SI_TRAPNO + err |= __put_user(from->si_trapno, &to->si_trapno); +#endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitly for the right codes here. + */ + if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) + err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); +#endif + break; + case __SI_CHLD: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_status, &to->si_status); + err |= __put_user(from->si_utime, &to->si_utime); + err |= __put_user(from->si_stime, &to->si_stime); + break; + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: /* But this is */ + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_ptr, &to->si_ptr); + break; + default: /* this is just in case for now ... 
*/ + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + break; + } + return err; +} + +#endif + +/** + * do_sigtimedwait - wait for queued signals specified in @which + * @which: queued signals to wait for + * @info: if non-null, the signal's siginfo is returned here + * @ts: upper bound on process time suspension + */ +int do_sigtimedwait(const sigset_t *which, siginfo_t *info, + const struct timespec *ts) +{ + struct task_struct *tsk = current; + long timeout = MAX_SCHEDULE_TIMEOUT; + sigset_t mask = *which; + int sig; + + if (ts) { + if (!timespec_valid(ts)) + return -EINVAL; + timeout = timespec_to_jiffies(ts); + /* + * We can be close to the next tick, add another one + * to ensure we will wait at least the time asked for. + */ + if (ts->tv_sec || ts->tv_nsec) + timeout++; + } + + /* + * Invert the set of allowed signals to get those we want to block. + */ + sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP)); + signotset(&mask); + + spin_lock_irq(&tsk->sighand->siglock); + sig = dequeue_signal(tsk, &mask, info); + if (!sig && timeout) { + /* + * None ready, temporarily unblock those we're interested + * while we are sleeping in so that we'll be awakened when + * they arrive. Unblocking is always fine, we can avoid + * set_current_blocked(). + */ + tsk->real_blocked = tsk->blocked; + sigandsets(&tsk->blocked, &tsk->blocked, &mask); + recalc_sigpending(); + spin_unlock_irq(&tsk->sighand->siglock); + + timeout = schedule_timeout_interruptible(timeout); + + spin_lock_irq(&tsk->sighand->siglock); + __set_task_blocked(tsk, &tsk->real_blocked); + siginitset(&tsk->real_blocked, 0); + sig = dequeue_signal(tsk, &mask, info); + } + spin_unlock_irq(&tsk->sighand->siglock); + + if (sig) + return sig; + return timeout ? 
-EINTR : -EAGAIN; +} + +/** + * sys_rt_sigtimedwait - synchronously wait for queued signals specified + * in @uthese + * @uthese: queued signals to wait for + * @uinfo: if non-null, the signal's siginfo is returned here + * @uts: upper bound on process time suspension + * @sigsetsize: size of sigset_t type + */ +SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese, + siginfo_t __user *, uinfo, const struct timespec __user *, uts, + size_t, sigsetsize) +{ + sigset_t these; + struct timespec ts; + siginfo_t info; + int ret; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&these, uthese, sizeof(these))) + return -EFAULT; + + if (uts) { + if (copy_from_user(&ts, uts, sizeof(ts))) + return -EFAULT; + } + + ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL); + + if (ret > 0 && uinfo) { + if (copy_siginfo_to_user(uinfo, &info)) + ret = -EFAULT; + } + + return ret; +} + +/** + * sys_kill - send a signal to a process + * @pid: the PID of the process + * @sig: signal to be sent + */ +SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) +{ + struct siginfo info; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_USER; + info.si_pid = task_tgid_vnr(current); + info.si_uid = current_uid(); + + return kill_something_info(sig, &info, pid); +} + +static int +do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) +{ + struct task_struct *p; + int error = -ESRCH; + + rcu_read_lock(); + p = find_task_by_vpid(pid); + if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { + error = check_kill_permission(sig, info, p); + /* + * The null signal is a permissions and process existence + * probe. No signal is actually delivered. + */ + if (!error && sig) { + error = do_send_sig_info(sig, info, p, false); + /* + * If lock_task_sighand() failed we pretend the task + * dies after receiving the signal. The window is tiny, + * and the signal is private anyway. 
+ */ + if (unlikely(error == -ESRCH)) + error = 0; + } + } + rcu_read_unlock(); + + return error; +} + +static int do_tkill(pid_t tgid, pid_t pid, int sig) +{ + struct siginfo info; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; + info.si_pid = task_tgid_vnr(current); + info.si_uid = current_uid(); + + return do_send_specific(tgid, pid, sig, &info); +} + +/** + * sys_tgkill - send signal to one specific thread + * @tgid: the thread group ID of the thread + * @pid: the PID of the thread + * @sig: signal to be sent + * + * This syscall also checks the @tgid and returns -ESRCH even if the PID + * exists but it's not belonging to the target process anymore. This + * method solves the problem of threads exiting and PIDs getting reused. + */ +SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + return do_tkill(tgid, pid, sig); +} + +/** + * sys_tkill - send signal to one specific task + * @pid: the PID of the task + * @sig: signal to be sent + * + * Send a signal to only one task, even if it's a CLONE_THREAD task. + */ +SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig) +{ + /* This is only valid for single tasks */ + if (pid <= 0) + return -EINVAL; + + return do_tkill(0, pid, sig); +} + +/** + * sys_rt_sigqueueinfo - send signal information to a signal + * @pid: the PID of the thread + * @sig: signal to be sent + * @uinfo: signal info to be sent + */ +SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) +{ + siginfo_t info; + + if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) + return -EFAULT; + + /* Not even root can pretend to send signals from the kernel. + * Nor can they impersonate a kill()/tgkill(), which adds source info. 
+ */ + if (info.si_code >= 0 || info.si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info.si_code < 0); + return -EPERM; + } + info.si_signo = sig; + + /* POSIX.1b doesn't mention process groups. */ + return kill_proc_info(sig, &info, pid); +} + +long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + /* Not even root can pretend to send signals from the kernel. + * Nor can they impersonate a kill()/tgkill(), which adds source info. + */ + if (info->si_code >= 0 || info->si_code == SI_TKILL) { + /* We used to allow any < 0 si_code */ + WARN_ON_ONCE(info->si_code < 0); + return -EPERM; + } + info->si_signo = sig; + + return do_send_specific(tgid, pid, sig, info); +} + +SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) +{ + siginfo_t info; + + if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) + return -EFAULT; + + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + +int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) +{ + struct task_struct *t = current; + struct k_sigaction *k; + sigset_t mask; + + if (!valid_signal(sig) || sig < 1 || (act && sig_kernel_only(sig))) + return -EINVAL; + + k = &t->sighand->action[sig-1]; + + spin_lock_irq(¤t->sighand->siglock); + if (oact) + *oact = *k; + + if (act) { + sigdelsetmask(&act->sa.sa_mask, + sigmask(SIGKILL) | sigmask(SIGSTOP)); + *k = *act; + /* + * POSIX 3.3.1.3: + * "Setting a signal action to SIG_IGN for a signal that is + * pending shall cause the pending signal to be discarded, + * whether or not it is blocked." 
+ * + * "Setting a signal action to SIG_DFL for a signal that is + * pending and whose default action is to ignore the signal + * (for example, SIGCHLD), shall cause the pending signal to + * be discarded, whether or not it is blocked" + */ + if (sig_handler_ignored(sig_handler(t, sig), sig)) { + sigemptyset(&mask); + sigaddset(&mask, sig); + rm_from_queue_full(&mask, &t->signal->shared_pending); + do { + rm_from_queue_full(&mask, &t->pending); + t = next_thread(t); + } while (t != current); + } + } + + spin_unlock_irq(¤t->sighand->siglock); + return 0; +} + +int +do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp) +{ + stack_t oss; + int error; + + oss.ss_sp = (void __user *) current->sas_ss_sp; + oss.ss_size = current->sas_ss_size; + oss.ss_flags = sas_ss_flags(sp); + + if (uss) { + void __user *ss_sp; + size_t ss_size; + int ss_flags; + + error = -EFAULT; + if (!access_ok(VERIFY_READ, uss, sizeof(*uss))) + goto out; + error = __get_user(ss_sp, &uss->ss_sp) | + __get_user(ss_flags, &uss->ss_flags) | + __get_user(ss_size, &uss->ss_size); + if (error) + goto out; + + error = -EPERM; + if (on_sig_stack(sp)) + goto out; + + error = -EINVAL; + /* + * Note - this code used to test ss_flags incorrectly: + * old code may have been written using ss_flags==0 + * to mean ss_flags==SS_ONSTACK (as this was the only + * way that worked) - this fix preserves that older + * mechanism. 
+ */ + if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0) + goto out; + + if (ss_flags == SS_DISABLE) { + ss_size = 0; + ss_sp = NULL; + } else { + error = -ENOMEM; + if (ss_size < MINSIGSTKSZ) + goto out; + } + + current->sas_ss_sp = (unsigned long) ss_sp; + current->sas_ss_size = ss_size; + } + + error = 0; + if (uoss) { + error = -EFAULT; + if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss))) + goto out; + error = __put_user(oss.ss_sp, &uoss->ss_sp) | + __put_user(oss.ss_size, &uoss->ss_size) | + __put_user(oss.ss_flags, &uoss->ss_flags); + } + +out: + return error; +} + +#ifdef __ARCH_WANT_SYS_SIGPENDING + +/** + * sys_sigpending - examine pending signals + * @set: where mask of pending signal is returned + */ +SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set) +{ + return do_sigpending(set, sizeof(*set)); +} + +#endif + +#ifdef __ARCH_WANT_SYS_SIGPROCMASK +/** + * sys_sigprocmask - examine and change blocked signals + * @how: whether to add, remove, or set signals + * @nset: signals to add or remove (if non-null) + * @oset: previous value of signal mask if non-null + * + * Some platforms have their own version with special arguments; + * others support only sys_rt_sigprocmask. 
+ */ + +SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset, + old_sigset_t __user *, oset) +{ + old_sigset_t old_set, new_set; + sigset_t new_blocked; + + old_set = current->blocked.sig[0]; + + if (nset) { + if (copy_from_user(&new_set, nset, sizeof(*nset))) + return -EFAULT; + new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP)); + + new_blocked = current->blocked; + + switch (how) { + case SIG_BLOCK: + sigaddsetmask(&new_blocked, new_set); + break; + case SIG_UNBLOCK: + sigdelsetmask(&new_blocked, new_set); + break; + case SIG_SETMASK: + new_blocked.sig[0] = new_set; + break; + default: + return -EINVAL; + } + + set_current_blocked(&new_blocked); + } + + if (oset) { + if (copy_to_user(oset, &old_set, sizeof(*oset))) + return -EFAULT; + } + + return 0; +} +#endif /* __ARCH_WANT_SYS_SIGPROCMASK */ + +#ifdef __ARCH_WANT_SYS_RT_SIGACTION +/** + * sys_rt_sigaction - alter an action taken by a process + * @sig: signal to be sent + * @act: new sigaction + * @oact: used to save the previous sigaction + * @sigsetsize: size of sigset_t type + */ +SYSCALL_DEFINE4(rt_sigaction, int, sig, + const struct sigaction __user *, act, + struct sigaction __user *, oact, + size_t, sigsetsize) +{ + struct k_sigaction new_sa, old_sa; + int ret = -EINVAL; + + /* XXX: Don't preclude handling different sized sigset_t's. */ + if (sigsetsize != sizeof(sigset_t)) + goto out; + + if (act) { + if (copy_from_user(&new_sa.sa, act, sizeof(new_sa.sa))) + return -EFAULT; + } + + ret = do_sigaction(sig, act ? &new_sa : NULL, oact ? &old_sa : NULL); + + if (!ret && oact) { + if (copy_to_user(oact, &old_sa.sa, sizeof(old_sa.sa))) + return -EFAULT; + } +out: + return ret; +} +#endif /* __ARCH_WANT_SYS_RT_SIGACTION */ + +#ifdef __ARCH_WANT_SYS_SGETMASK + +/* + * For backwards compatibility. Functionality superseded by sigprocmask. 
+ */ +SYSCALL_DEFINE0(sgetmask) +{ + /* SMP safe */ + return current->blocked.sig[0]; +} + +SYSCALL_DEFINE1(ssetmask, int, newmask) +{ + int old; + + spin_lock_irq(¤t->sighand->siglock); + old = current->blocked.sig[0]; + + siginitset(¤t->blocked, newmask & ~(sigmask(SIGKILL)| + sigmask(SIGSTOP))); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + return old; +} +#endif /* __ARCH_WANT_SGETMASK */ + +#ifdef __ARCH_WANT_SYS_SIGNAL +/* + * For backwards compatibility. Functionality superseded by sigaction. + */ +SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler) +{ + struct k_sigaction new_sa, old_sa; + int ret; + + new_sa.sa.sa_handler = handler; + new_sa.sa.sa_flags = SA_ONESHOT | SA_NOMASK; + sigemptyset(&new_sa.sa.sa_mask); + + ret = do_sigaction(sig, &new_sa, &old_sa); + + return ret ? ret : (unsigned long)old_sa.sa.sa_handler; +} +#endif /* __ARCH_WANT_SYS_SIGNAL */ + +#ifdef __ARCH_WANT_SYS_PAUSE + +SYSCALL_DEFINE0(pause) +{ + while (!signal_pending(current)) { + current->state = TASK_INTERRUPTIBLE; + schedule(); + } + return -ERESTARTNOHAND; +} + +#endif + +#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND +/** + * sys_rt_sigsuspend - replace the signal mask for a value with the + * @unewset value until a signal is received + * @unewset: new signal mask value + * @sigsetsize: size of sigset_t type + */ +SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize) +{ + sigset_t newset; + + /* XXX: Don't preclude handling different sized sigset_t's. 
*/ + if (sigsetsize != sizeof(sigset_t)) + return -EINVAL; + + if (copy_from_user(&newset, unewset, sizeof(newset))) + return -EFAULT; + sigdelsetmask(&newset, sigmask(SIGKILL)|sigmask(SIGSTOP)); + + spin_lock_irq(¤t->sighand->siglock); + current->saved_sigmask = current->blocked; + current->blocked = newset; + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_restore_sigmask(); + return -ERESTARTNOHAND; +} +#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ + +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + +void __init signals_init(void) +{ + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); +} + +#ifdef CONFIG_KGDB_KDB +#include +/* + * kdb_send_sig_info - Allows kdb to send signals without exposing + * signal internals. This function checks if the required locks are + * available before calling the main signal code, to avoid kdb + * deadlocks. + */ +void +kdb_send_sig_info(struct task_struct *t, struct siginfo *info) +{ + static struct task_struct *kdb_prev_t; + int sig, new_t; + if (!spin_trylock(&t->sighand->siglock)) { + kdb_printf("Can't do kill command now.\n" + "The sigmask lock is held somewhere else in " + "kernel, try again later\n"); + return; + } + spin_unlock(&t->sighand->siglock); + new_t = kdb_prev_t != t; + kdb_prev_t = t; + if (t->state != TASK_RUNNING && new_t) { + kdb_printf("Process is not RUNNING, sending a signal from " + "kdb risks deadlock\n" + "on the run queue locks. 
" + "The signal has _not_ been sent.\n" + "Reissue the kill command if you want to risk " + "the deadlock.\n"); + return; + } + sig = info->si_signo; + if (send_sig_info(sig, info, t)) + kdb_printf("Fail to deliver Signal %d to process %d.\n", + sig, t->pid); + else + kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid); +} +#endif /* CONFIG_KGDB_KDB */ diff --git a/kernel/smp.c b/kernel/smp.c new file mode 100644 index 00000000..fb67dfa8 --- /dev/null +++ b/kernel/smp.c @@ -0,0 +1,703 @@ +/* + * Generic helpers for smp ipi calls + * + * (C) Jens Axboe 2008 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static struct { + struct list_head queue; + raw_spinlock_t lock; +} call_function __cacheline_aligned_in_smp = + { + .queue = LIST_HEAD_INIT(call_function.queue), + .lock = __RAW_SPIN_LOCK_UNLOCKED(call_function.lock), + }; + +enum { + CSD_FLAG_LOCK = 0x01, +}; + +struct call_function_data { + struct call_single_data csd; + atomic_t refs; + cpumask_var_t cpumask; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); + +struct call_single_queue { + struct list_head list; + raw_spinlock_t lock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); + +static int +hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + struct call_function_data *cfd = &per_cpu(cfd_data, cpu); + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, + cpu_to_node(cpu))) + return notifier_from_errno(-ENOMEM); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + + case CPU_DEAD: + case CPU_DEAD_FROZEN: + free_cpumask_var(cfd->cpumask); + break; +#endif + }; + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata hotplug_cfd_notifier = { + 
.notifier_call = hotplug_cfd, +}; + +void __init call_function_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int i; + + for_each_possible_cpu(i) { + struct call_single_queue *q = &per_cpu(call_single_queue, i); + + raw_spin_lock_init(&q->lock); + INIT_LIST_HEAD(&q->list); + } + + hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu); + register_cpu_notifier(&hotplug_cfd_notifier); +} + +/* + * csd_lock/csd_unlock used to serialize access to per-cpu csd resources + * + * For non-synchronous ipi calls the csd can still be in use by the + * previous function call. For multi-cpu calls its even more interesting + * as we'll have to ensure no other cpu is observing our csd. + */ +static void csd_lock_wait(struct call_single_data *data) +{ + while (data->flags & CSD_FLAG_LOCK) + cpu_relax(); +} + +static void csd_lock(struct call_single_data *data) +{ + csd_lock_wait(data); + data->flags = CSD_FLAG_LOCK; + + /* + * prevent CPU from reordering the above assignment + * to ->flags with any subsequent assignments to other + * fields of the specified call_single_data structure: + */ + smp_mb(); +} + +static void csd_unlock(struct call_single_data *data) +{ + WARN_ON(!(data->flags & CSD_FLAG_LOCK)); + + /* + * ensure we're all done before releasing data: + */ + smp_mb(); + + data->flags &= ~CSD_FLAG_LOCK; +} + +/* + * Insert a previously allocated call_single_data element + * for execution on the given CPU. data must already have + * ->func, ->info, and ->flags set. 
+ */ +static +void generic_exec_single(int cpu, struct call_single_data *data, int wait) +{ + struct call_single_queue *dst = &per_cpu(call_single_queue, cpu); + unsigned long flags; + int ipi; + + raw_spin_lock_irqsave(&dst->lock, flags); + ipi = list_empty(&dst->list); + list_add_tail(&data->list, &dst->list); + raw_spin_unlock_irqrestore(&dst->lock, flags); + + /* + * The list addition should be visible before sending the IPI + * handler locks the list to pull the entry off it because of + * normal cache coherency rules implied by spinlocks. + * + * If IPIs can go out of order to the cache coherency protocol + * in an architecture, sufficient synchronisation should be added + * to arch code to make it appear to obey cache coherency WRT + * locking and barrier primitives. Generic code isn't really + * equipped to do the right thing... + */ + if (ipi) + arch_send_call_function_single_ipi(cpu); + + if (wait) + csd_lock_wait(data); +} + +/* + * Invoked by arch to handle an IPI for call function. Must be called with + * interrupts disabled. + */ +void generic_smp_call_function_interrupt(void) +{ + struct call_function_data *data; + int cpu = smp_processor_id(); + + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. + */ + WARN_ON_ONCE(!cpu_online(cpu)); + + /* + * Ensure entry is visible on call_function_queue after we have + * entered the IPI. See comment in smp_call_function_many. + * If we don't have this, then we may miss an entry on the list + * and never get another IPI to process it. + */ + smp_mb(); + + /* + * It's ok to use list_for_each_rcu() here even though we may + * delete 'pos', since list_del_rcu() doesn't clear ->next + */ + list_for_each_entry_rcu(data, &call_function.queue, csd.list) { + int refs; + smp_call_func_t func; + + /* + * Since we walk the list without any locks, we might + * see an entry that was completed, removed from the + * list and is in the process of being reused. 
+ * + * We must check that the cpu is in the cpumask before + * checking the refs, and both must be set before + * executing the callback on this cpu. + */ + + if (!cpumask_test_cpu(cpu, data->cpumask)) + continue; + + smp_rmb(); + + if (atomic_read(&data->refs) == 0) + continue; + + func = data->csd.func; /* save for later warn */ + func(data->csd.info); + + /* + * If the cpu mask is not still set then func enabled + * interrupts (BUG), and this cpu took another smp call + * function interrupt and executed func(info) twice + * on this cpu. That nested execution decremented refs. + */ + if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) { + WARN(1, "%pf enabled interrupts and double executed\n", func); + continue; + } + + refs = atomic_dec_return(&data->refs); + WARN_ON(refs < 0); + + if (refs) + continue; + + WARN_ON(!cpumask_empty(data->cpumask)); + + raw_spin_lock(&call_function.lock); + list_del_rcu(&data->csd.list); + raw_spin_unlock(&call_function.lock); + + csd_unlock(&data->csd); + } + +} + +/* + * Invoked by arch to handle an IPI for call function single. Must be + * called from the arch with interrupts disabled. + */ +void generic_smp_call_function_single_interrupt(void) +{ + struct call_single_queue *q = &__get_cpu_var(call_single_queue); + unsigned int data_flags; + LIST_HEAD(list); + + /* + * Shouldn't receive this interrupt on a cpu that is not yet online. 
+ */ + WARN_ON_ONCE(!cpu_online(smp_processor_id())); + + raw_spin_lock(&q->lock); + list_replace_init(&q->list, &list); + raw_spin_unlock(&q->lock); + + while (!list_empty(&list)) { + struct call_single_data *data; + + data = list_entry(list.next, struct call_single_data, list); + list_del(&data->list); + + /* + * 'data' can be invalid after this call if flags == 0 + * (when called through generic_exec_single()), + * so save them away before making the call: + */ + data_flags = data->flags; + + data->func(data->info); + + /* + * Unlocked CSDs are valid through generic_exec_single(): + */ + if (data_flags & CSD_FLAG_LOCK) + csd_unlock(data); + } +} + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); + +/* + * smp_call_function_single - Run a function on a specific CPU + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. + */ +int smp_call_function_single(int cpu, smp_call_func_t func, void *info, + int wait) +{ + struct call_single_data d = { + .flags = 0, + }; + unsigned long flags; + int this_cpu; + int err = 0; + + /* + * prevent preemption and reschedule on another processor, + * as well as CPU removal + */ + this_cpu = get_cpu(); + + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. 
+ */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress); + + if (cpu == this_cpu) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } else { + if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) { + struct call_single_data *data = &d; + + if (!wait) + data = &__get_cpu_var(csd_data); + + csd_lock(data); + + data->func = func; + data->info = info; + generic_exec_single(cpu, data, wait); + } else { + err = -ENXIO; /* CPU not online */ + } + } + + put_cpu(); + + return err; +} +EXPORT_SYMBOL(smp_call_function_single); + +/* + * smp_call_function_any - Run a function on any of the given cpus + * @mask: The mask of cpus it can run on. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait until function has completed. + * + * Returns 0 on success, else a negative status code (if no cpus were online). + * Note that @wait will be implicitly turned on in case of allocation failures, + * since we fall back to on-stack allocation. + * + * Selection preference: + * 1) current cpu if in @mask + * 2) any cpu of current node if in @mask + * 3) any other online cpu in @mask + */ +int smp_call_function_any(const struct cpumask *mask, + smp_call_func_t func, void *info, int wait) +{ + unsigned int cpu; + const struct cpumask *nodemask; + int ret; + + /* Try for same CPU (cheapest) */ + cpu = get_cpu(); + if (cpumask_test_cpu(cpu, mask)) + goto call; + + /* Try for same node. */ + nodemask = cpumask_of_node(cpu_to_node(cpu)); + for (cpu = cpumask_first_and(nodemask, mask); cpu < nr_cpu_ids; + cpu = cpumask_next_and(cpu, nodemask, mask)) { + if (cpu_online(cpu)) + goto call; + } + + /* Any online will do: smp_call_function_single handles nr_cpu_ids. 
*/ + cpu = cpumask_any_and(mask, cpu_online_mask); +call: + ret = smp_call_function_single(cpu, func, info, wait); + put_cpu(); + return ret; +} +EXPORT_SYMBOL_GPL(smp_call_function_any); + +/** + * __smp_call_function_single(): Run a function on a specific CPU + * @cpu: The CPU to run on. + * @data: Pre-allocated and setup data structure + * @wait: If true, wait until function has completed on specified CPU. + * + * Like smp_call_function_single(), but allow caller to pass in a + * pre-allocated data structure. Useful for embedding @data inside + * other structures, for instance. + */ +void __smp_call_function_single(int cpu, struct call_single_data *data, + int wait) +{ + unsigned int this_cpu; + unsigned long flags; + + this_cpu = get_cpu(); + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() + && !oops_in_progress); + + if (cpu == this_cpu) { + local_irq_save(flags); + data->func(data->info); + local_irq_restore(flags); + } else { + csd_lock(data); + generic_exec_single(cpu, data, wait); + } + put_cpu(); +} + +/** + * smp_call_function_many(): Run a function on a set of other CPUs. + * @mask: The set of cpus to run on (only runs on online subset). + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * If @wait is true, then returns once @func has returned. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. Preemption + * must be disabled when calling this function. 
+ */ +void smp_call_function_many(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + struct call_function_data *data; + unsigned long flags; + int refs, cpu, next_cpu, this_cpu = smp_processor_id(); + + /* + * Can deadlock when called with interrupts disabled. + * We allow cpu's that are not yet online though, as no one else can + * send smp call function interrupt to this cpu and as such deadlocks + * can't happen. + */ + WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() + && !oops_in_progress && !early_boot_irqs_disabled); + + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */ + cpu = cpumask_first_and(mask, cpu_online_mask); + if (cpu == this_cpu) + cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + + /* No online cpus? We're done. */ + if (cpu >= nr_cpu_ids) + return; + + /* Do we have another CPU which isn't us? */ + next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask); + if (next_cpu == this_cpu) + next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask); + + /* Fastpath: do that cpu by itself. */ + if (next_cpu >= nr_cpu_ids) { + smp_call_function_single(cpu, func, info, wait); + return; + } + + data = &__get_cpu_var(cfd_data); + csd_lock(&data->csd); + + /* This BUG_ON verifies our reuse assertions and can be removed */ + BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask)); + + /* + * The global call function queue list add and delete are protected + * by a lock, but the list is traversed without any lock, relying + * on the rcu list add and delete to allow safe concurrent traversal. + * We reuse the call function data without waiting for any grace + * period after some other cpu removes it from the global queue. + * This means a cpu might find our data block as it is being + * filled out. + * + * We hold off the interrupt handler on the other cpu by + * ordering our writes to the cpu mask vs our setting of the + * refs counter. 
We assert only the cpu owning the data block + * will set a bit in cpumask, and each bit will only be cleared + * by the subject cpu. Each cpu must first find its bit is + * set and then check that refs is set indicating the element is + * ready to be processed, otherwise it must skip the entry. + * + * On the previous iteration refs was set to 0 by another cpu. + * To avoid the use of transitivity, set the counter to 0 here + * so the wmb will pair with the rmb in the interrupt handler. + */ + atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */ + + data->csd.func = func; + data->csd.info = info; + + /* Ensure 0 refs is visible before mask. Also orders func and info */ + smp_wmb(); + + /* We rely on the "and" being processed before the store */ + cpumask_and(data->cpumask, mask, cpu_online_mask); + cpumask_clear_cpu(this_cpu, data->cpumask); + refs = cpumask_weight(data->cpumask); + + /* Some callers race with other cpus changing the passed mask */ + if (unlikely(!refs)) { + csd_unlock(&data->csd); + return; + } + + raw_spin_lock_irqsave(&call_function.lock, flags); + /* + * Place entry at the _HEAD_ of the list, so that any cpu still + * observing the entry in generic_smp_call_function_interrupt() + * will not miss any other list entries: + */ + list_add_rcu(&data->csd.list, &call_function.queue); + /* + * We rely on the wmb() in list_add_rcu to complete our writes + * to the cpumask before this write to refs, which indicates + * data is on the list and is ready to be processed. + */ + atomic_set(&data->refs, refs); + raw_spin_unlock_irqrestore(&call_function.lock, flags); + + /* + * Make the list addition visible before sending the ipi. + * (IPIs must obey or appear to obey normal Linux cache + * coherency rules -- see comment in generic_exec_single). 
+ */ + smp_mb(); + + /* Send a message to all CPUs in the map */ + arch_send_call_function_ipi_mask(data->cpumask); + + /* Optionally wait for the CPUs to complete */ + if (wait) + csd_lock_wait(&data->csd); +} +EXPORT_SYMBOL(smp_call_function_many); + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @wait: If true, wait (atomically) until function has completed + * on other CPUs. + * + * Returns 0. + * + * If @wait is true, then returns once @func has returned; otherwise + * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. + */ +int smp_call_function(smp_call_func_t func, void *info, int wait) +{ + preempt_disable(); + smp_call_function_many(cpu_online_mask, func, info, wait); + preempt_enable(); + + return 0; +} +EXPORT_SYMBOL(smp_call_function); + +void ipi_call_lock(void) +{ + raw_spin_lock(&call_function.lock); +} + +void ipi_call_unlock(void) +{ + raw_spin_unlock(&call_function.lock); +} + +void ipi_call_lock_irq(void) +{ + raw_spin_lock_irq(&call_function.lock); +} + +void ipi_call_unlock_irq(void) +{ + raw_spin_unlock_irq(&call_function.lock); +} +#endif /* USE_GENERIC_SMP_HELPERS */ + +/* Setup configured maximum number of CPUs to activate */ +unsigned int setup_max_cpus = NR_CPUS; +EXPORT_SYMBOL(setup_max_cpus); + + +/* + * Setup routine for controlling SMP activation + * + * Command-line option of "nosmp" or "maxcpus=0" will disable SMP + * activation entirely (the MPS table probe still happens, though). + * + * Command-line option of "maxcpus=", where is an integer + * greater than 0, limits the maximum number of CPUs activated in + * SMP mode to . 
+ */ + +void __weak arch_disable_smp_support(void) { } + +static int __init nosmp(char *str) +{ + setup_max_cpus = 0; + arch_disable_smp_support(); + + return 0; +} + +early_param("nosmp", nosmp); + +/* this is hard limit */ +static int __init nrcpus(char *str) +{ + int nr_cpus; + + get_option(&str, &nr_cpus); + if (nr_cpus > 0 && nr_cpus < nr_cpu_ids) + nr_cpu_ids = nr_cpus; + + return 0; +} + +early_param("nr_cpus", nrcpus); + +static int __init maxcpus(char *str) +{ + get_option(&str, &setup_max_cpus); + if (setup_max_cpus == 0) + arch_disable_smp_support(); + + return 0; +} + +early_param("maxcpus", maxcpus); + +/* Setup number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + +/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */ +void __init setup_nr_cpu_ids(void) +{ + nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1; +} + +/* Called by boot processor to activate the rest. */ +void __init smp_init(void) +{ + unsigned int cpu; + + /* FIXME: This should be done in userspace --RR */ + for_each_present_cpu(cpu) { + if (num_online_cpus() >= setup_max_cpus) + break; + if (!cpu_online(cpu)) + cpu_up(cpu); + } + + /* Any cleanup work */ + printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(setup_max_cpus); +} + +/* + * Call a function on all processors. May be used during early boot while + * early_boot_irqs_disabled is set. Use local_irq_save/restore() instead + * of local_irq_disable/enable(). 
+ */ +int on_each_cpu(void (*func) (void *info), void *info, int wait) +{ + unsigned long flags; + int ret = 0; + + preempt_disable(); + ret = smp_call_function(func, info, wait); + local_irq_save(flags); + func(info); + local_irq_restore(flags); + preempt_enable(); + return ret; +} +EXPORT_SYMBOL(on_each_cpu); diff --git a/kernel/softirq.c b/kernel/softirq.c new file mode 100644 index 00000000..fca82c32 --- /dev/null +++ b/kernel/softirq.c @@ -0,0 +1,933 @@ +/* + * linux/kernel/softirq.c + * + * Copyright (C) 1992 Linus Torvalds + * + * Distribute under GPLv2. + * + * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) + * + * Remote softirq infrastructure is by Jens Axboe. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include +/* + - No shared variables, all the data are CPU local. + - If a softirq needs serialization, let it serialize itself + by its own spinlocks. + - Even if softirq is serialized, only local cpu is marked for + execution. Hence, we get something sort of weak cpu binding. + Though it is still not clear, will it result in better locality + or will not. + + Examples: + - NET RX softirq. It is multithreaded and does not require + any global serialization. + - NET TX softirq. It kicks software netdevice queues, hence + it is logically serialized per device, but this serialization + is invisible to common code. + - Tasklets: serialized wrt itself. 
+ */ + +#ifndef __ARCH_IRQ_STAT +irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned; +EXPORT_SYMBOL(irq_stat); +#endif + +static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; + +DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +char *softirq_to_name[NR_SOFTIRQS] = { + "HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL", + "TASKLET", "SCHED", "HRTIMER", "RCU" +}; + +/* + * we cannot loop indefinitely here to avoid userspace starvation, + * but we also don't want to introduce a worst case 1/HZ latency + * to the pending events, so lets the scheduler to balance + * the softirq load for us. + */ +static void wakeup_softirqd(void) +{ + /* Interrupts are disabled: no need to stop preemption */ + struct task_struct *tsk = __this_cpu_read(ksoftirqd); + + if (tsk && tsk->state != TASK_RUNNING) + wake_up_process(tsk); +} + +/* + * preempt_count and SOFTIRQ_OFFSET usage: + * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving + * softirq processing. + * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * on local_bh_disable or local_bh_enable. + * This lets us distinguish between whether we are currently processing + * softirq and whether we just have bh disabled. + */ + +/* + * This one is for softirq.c-internal use, + * where hardirqs are disabled legitimately: + */ +#ifdef CONFIG_TRACE_IRQFLAGS +static void __local_bh_disable(unsigned long ip, unsigned int cnt) +{ + unsigned long flags; + + WARN_ON_ONCE(in_irq()); + + raw_local_irq_save(flags); + /* + * The preempt tracer hooks into add_preempt_count and will break + * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET + * is set and before current->softirq_enabled is cleared. + * We must manually increment preempt_count here and manually + * call the trace_preempt_off later. 
+ */ + preempt_count() += cnt; + /* + * Were softirqs turned off above: + */ + if (softirq_count() == cnt) + trace_softirqs_off(ip); + raw_local_irq_restore(flags); + + if (preempt_count() == cnt) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); +} +#else /* !CONFIG_TRACE_IRQFLAGS */ +static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) +{ + add_preempt_count(cnt); + barrier(); +} +#endif /* CONFIG_TRACE_IRQFLAGS */ + +void local_bh_disable(void) +{ + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_DISABLE_OFFSET); +} + +EXPORT_SYMBOL(local_bh_disable); + +static void __local_bh_enable(unsigned int cnt) +{ + WARN_ON_ONCE(in_irq()); + WARN_ON_ONCE(!irqs_disabled()); + + if (softirq_count() == cnt) + trace_softirqs_on((unsigned long)__builtin_return_address(0)); + sub_preempt_count(cnt); +} + +/* + * Special-case - softirqs can safely be enabled in + * cond_resched_softirq(), or by __do_softirq(), + * without processing still-pending softirqs: + */ +void _local_bh_enable(void) +{ + __local_bh_enable(SOFTIRQ_DISABLE_OFFSET); +} + +EXPORT_SYMBOL(_local_bh_enable); + +static inline void _local_bh_enable_ip(unsigned long ip) +{ + WARN_ON_ONCE(in_irq() || irqs_disabled()); +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_disable(); +#endif + /* + * Are softirqs going to be turned on now: + */ + if (softirq_count() == SOFTIRQ_DISABLE_OFFSET) + trace_softirqs_on(ip); + /* + * Keep preemption disabled until we are done with + * softirq processing: + */ + sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); + + if (unlikely(!in_interrupt() && local_softirq_pending())) + do_softirq(); + + dec_preempt_count(); +#ifdef CONFIG_TRACE_IRQFLAGS + local_irq_enable(); +#endif + preempt_check_resched(); +} + +void local_bh_enable(void) +{ + _local_bh_enable_ip((unsigned long)__builtin_return_address(0)); +} +EXPORT_SYMBOL(local_bh_enable); + +void local_bh_enable_ip(unsigned long ip) +{ + _local_bh_enable_ip(ip); +} 
+EXPORT_SYMBOL(local_bh_enable_ip);
+
+/*
+ * We restart softirq processing MAX_SOFTIRQ_RESTART times,
+ * and we fall back to softirqd after that.
+ *
+ * This number has been established via experimentation.
+ * The two things to balance are latency and fairness -
+ * we want to handle softirqs as soon as possible, but they
+ * should not be able to lock up the box.
+ */
+#define MAX_SOFTIRQ_RESTART 10
+
+/*
+ * Run the handlers of all softirqs pending on this CPU.
+ *
+ * Must be entered with hardirqs disabled (the callers in this file do so,
+ * and __local_bh_enable() warns otherwise); they are disabled again on
+ * return.  Handlers run with hardirqs enabled and with SOFTIRQ_OFFSET
+ * added to preempt_count.  Softirqs raised while handlers run are picked
+ * up by further passes, at most MAX_SOFTIRQ_RESTART passes in total;
+ * whatever is still pending after that is handed to ksoftirqd.
+ */
+asmlinkage void __do_softirq(void)
+{
+	struct softirq_action *h;
+	__u32 pending;
+	int max_restart = MAX_SOFTIRQ_RESTART;
+	int cpu;
+
+	pending = local_softirq_pending();
+	account_system_vtime(current);
+
+	__local_bh_disable((unsigned long)__builtin_return_address(0),
+				SOFTIRQ_OFFSET);
+	lockdep_softirq_enter();
+
+	cpu = smp_processor_id();
+restart:
+	/* Reset the pending bitmask before enabling irqs */
+	set_softirq_pending(0);
+
+	local_irq_enable();
+
+	h = softirq_vec;
+
+	/* Walk the snapshot bit by bit; bit N belongs to softirq_vec[N]. */
+	do {
+		if (pending & 1) {
+			unsigned int vec_nr = h - softirq_vec;
+			int prev_count = preempt_count();
+
+			kstat_incr_softirqs_this_cpu(vec_nr);
+
+			trace_softirq_entry(vec_nr);
+			h->action(h);
+			trace_softirq_exit(vec_nr);
+			/* A handler leaked a preempt/bh count: report and repair. */
+			if (unlikely(prev_count != preempt_count())) {
+				printk(KERN_ERR "huh, entered softirq %u %s %p"
+				       " with preempt_count %08x,"
+				       " exited with %08x?\n", vec_nr,
+				       softirq_to_name[vec_nr], h->action,
+				       prev_count, preempt_count());
+				preempt_count() = prev_count;
+			}
+
+			rcu_bh_qs(cpu);
+		}
+		h++;
+		pending >>= 1;
+	} while (pending);
+
+	local_irq_disable();
+
+	/* Anything raised meanwhile?  Retry a bounded number of times. */
+	pending = local_softirq_pending();
+	if (pending && --max_restart)
+		goto restart;
+
+	if (pending)
+		wakeup_softirqd();
+
+	lockdep_softirq_exit();
+
+	account_system_vtime(current);
+	__local_bh_enable(SOFTIRQ_OFFSET);
+}
+
+#ifndef __ARCH_HAS_DO_SOFTIRQ
+
+/*
+ * Generic entry point: process pending softirqs unless we are already in
+ * interrupt context (in_interrupt()), in which case nothing is done here.
+ * Hardirqs are disabled around the pending check and __do_softirq(), and
+ * the caller's irq state is restored afterwards.
+ */
+asmlinkage void do_softirq(void)
+{
+	__u32 pending;
+	unsigned long flags;
+
+	if (in_interrupt())
+		return;
+
+	local_irq_save(flags);
+
+	pending = local_softirq_pending();
+
+	if (pending)
+		__do_softirq();
+
+	local_irq_restore(flags);
+}
+
+#endif + +/* + * Enter an interrupt context. + */ +void irq_enter(void) +{ + int cpu = smp_processor_id(); + + rcu_irq_enter(); + if (idle_cpu(cpu) && !in_interrupt()) { + /* + * Prevent raise_softirq from needlessly waking up ksoftirqd + * here, as softirq will be serviced on return from interrupt. + */ + local_bh_disable(); + tick_check_idle(cpu); + _local_bh_enable(); + } + + __irq_enter(); +} + +#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + __do_softirq(); + else { + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); + wakeup_softirqd(); + __local_bh_enable(SOFTIRQ_OFFSET); + } +} +#else +static inline void invoke_softirq(void) +{ + if (!force_irqthreads) + do_softirq(); + else { + __local_bh_disable((unsigned long)__builtin_return_address(0), + SOFTIRQ_OFFSET); + wakeup_softirqd(); + __local_bh_enable(SOFTIRQ_OFFSET); + } +} +#endif + +/* + * Exit an interrupt context. Process softirqs if needed and possible: + */ +void irq_exit(void) +{ + account_system_vtime(current); + trace_hardirq_exit(); + sub_preempt_count(IRQ_EXIT_OFFSET); + if (!in_interrupt() && local_softirq_pending()) + invoke_softirq(); + + rcu_irq_exit(); +#ifdef CONFIG_NO_HZ + /* Make sure that timer wheel updates are propagated */ + if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) + tick_nohz_stop_sched_tick(0); +#endif + preempt_enable_no_resched(); +} + +/* + * This function must run with irqs disabled! + */ +inline void raise_softirq_irqoff(unsigned int nr) +{ + __raise_softirq_irqoff(nr); + + /* + * If we're in an interrupt or softirq, we're done + * (this also catches softirq-disabled code). We will + * actually run the softirq once we return from + * the irq or softirq. + * + * Otherwise we wake up ksoftirqd to make sure we + * schedule the softirq soon. 
+ */ + if (!in_interrupt()) + wakeup_softirqd(); +} + +void raise_softirq(unsigned int nr) +{ + unsigned long flags; + + local_irq_save(flags); + raise_softirq_irqoff(nr); + local_irq_restore(flags); +} + +void open_softirq(int nr, void (*action)(struct softirq_action *)) +{ + softirq_vec[nr].action = action; +} + +/* + * Tasklets + */ +struct tasklet_head +{ + struct tasklet_struct *head; + struct tasklet_struct **tail; +}; + +static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec); +static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec); + +void __tasklet_schedule(struct tasklet_struct *t) +{ + unsigned long flags; + + local_irq_save(flags); + t->next = NULL; + *__this_cpu_read(tasklet_vec.tail) = t; + __this_cpu_write(tasklet_vec.tail, &(t->next)); + raise_softirq_irqoff(TASKLET_SOFTIRQ); + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__tasklet_schedule); + +void __tasklet_hi_schedule(struct tasklet_struct *t) +{ + unsigned long flags; + + local_irq_save(flags); + t->next = NULL; + *__this_cpu_read(tasklet_hi_vec.tail) = t; + __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); + raise_softirq_irqoff(HI_SOFTIRQ); + local_irq_restore(flags); +} + +EXPORT_SYMBOL(__tasklet_hi_schedule); + +void __tasklet_hi_schedule_first(struct tasklet_struct *t) +{ + BUG_ON(!irqs_disabled()); + + t->next = __this_cpu_read(tasklet_hi_vec.head); + __this_cpu_write(tasklet_hi_vec.head, t); + __raise_softirq_irqoff(HI_SOFTIRQ); +} + +EXPORT_SYMBOL(__tasklet_hi_schedule_first); + +static void tasklet_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __this_cpu_read(tasklet_vec.head); + __this_cpu_write(tasklet_vec.head, NULL); + __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); + local_irq_enable(); + + while (list) { + struct tasklet_struct *t = list; + + list = list->next; + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + BUG(); + 
t->func(t->data); + tasklet_unlock(t); + continue; + } + tasklet_unlock(t); + } + + local_irq_disable(); + t->next = NULL; + *__this_cpu_read(tasklet_vec.tail) = t; + __this_cpu_write(tasklet_vec.tail, &(t->next)); + __raise_softirq_irqoff(TASKLET_SOFTIRQ); + local_irq_enable(); + } +} + +static void tasklet_hi_action(struct softirq_action *a) +{ + struct tasklet_struct *list; + + local_irq_disable(); + list = __this_cpu_read(tasklet_hi_vec.head); + __this_cpu_write(tasklet_hi_vec.head, NULL); + __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); + local_irq_enable(); + + while (list) { + struct tasklet_struct *t = list; + + list = list->next; + + if (tasklet_trylock(t)) { + if (!atomic_read(&t->count)) { + if (!test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) + BUG(); + t->func(t->data); + tasklet_unlock(t); + continue; + } + tasklet_unlock(t); + } + + local_irq_disable(); + t->next = NULL; + *__this_cpu_read(tasklet_hi_vec.tail) = t; + __this_cpu_write(tasklet_hi_vec.tail, &(t->next)); + __raise_softirq_irqoff(HI_SOFTIRQ); + local_irq_enable(); + } +} + + +void tasklet_init(struct tasklet_struct *t, + void (*func)(unsigned long), unsigned long data) +{ + t->next = NULL; + t->state = 0; + atomic_set(&t->count, 0); + t->func = func; + t->data = data; +} + +EXPORT_SYMBOL(tasklet_init); + +void tasklet_kill(struct tasklet_struct *t) +{ + if (in_interrupt()) + printk("Attempt to kill tasklet from interrupt\n"); + + while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { + do { + yield(); + } while (test_bit(TASKLET_STATE_SCHED, &t->state)); + } + tasklet_unlock_wait(t); + clear_bit(TASKLET_STATE_SCHED, &t->state); +} + +EXPORT_SYMBOL(tasklet_kill); + +/* + * tasklet_hrtimer + */ + +/* + * The trampoline is called when the hrtimer expires. It schedules a tasklet + * to run __tasklet_hrtimer_trampoline() which in turn will call the intended + * hrtimer callback, but from softirq context. 
+ */ +static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) +{ + struct tasklet_hrtimer *ttimer = + container_of(timer, struct tasklet_hrtimer, timer); + + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; +} + +/* + * Helper function which calls the hrtimer callback from + * tasklet/softirq context + */ +static void __tasklet_hrtimer_trampoline(unsigned long data) +{ + struct tasklet_hrtimer *ttimer = (void *)data; + enum hrtimer_restart restart; + + restart = ttimer->function(&ttimer->timer); + if (restart != HRTIMER_NORESTART) + hrtimer_restart(&ttimer->timer); +} + +/** + * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks + * @ttimer: tasklet_hrtimer which is initialized + * @function: hrtimer callback function which gets called from softirq context + * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) + * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) + */ +void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t which_clock, enum hrtimer_mode mode) +{ + hrtimer_init(&ttimer->timer, which_clock, mode); + ttimer->timer.function = __hrtimer_tasklet_trampoline; + tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, + (unsigned long)ttimer); + ttimer->function = function; +} +EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); + +/* + * Remote softirq bits + */ + +DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); +EXPORT_PER_CPU_SYMBOL(softirq_work_list); + +static void __local_trigger(struct call_single_data *cp, int softirq) +{ + struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); + + list_add_tail(&cp->list, head); + + /* Trigger the softirq only if the list was previously empty. 
*/ + if (head->next == &cp->list) + raise_softirq_irqoff(softirq); +} + +#ifdef CONFIG_USE_GENERIC_SMP_HELPERS +static void remote_softirq_receive(void *data) +{ + struct call_single_data *cp = data; + unsigned long flags; + int softirq; + + softirq = cp->priv; + + local_irq_save(flags); + __local_trigger(cp, softirq); + local_irq_restore(flags); +} + +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + if (cpu_online(cpu)) { + cp->func = remote_softirq_receive; + cp->info = cp; + cp->flags = 0; + cp->priv = softirq; + + __smp_call_function_single(cpu, cp, 0); + return 0; + } + return 1; +} +#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ +static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + return 1; +} +#endif + +/** + * __send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @this_cpu: the currently executing cpu + * @softirq: the softirq for the work + * + * Attempt to schedule softirq work on a remote cpu. If this cannot be + * done, the work is instead queued up on the local cpu. + * + * Interrupts must be disabled. + */ +void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) +{ + if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) + __local_trigger(cp, softirq); +} +EXPORT_SYMBOL(__send_remote_softirq); + +/** + * send_remote_softirq - try to schedule softirq work on a remote cpu + * @cp: private SMP call function data area + * @cpu: the remote cpu + * @softirq: the softirq for the work + * + * Like __send_remote_softirq except that disabling interrupts and + * computing the current cpu is done for the caller. 
+ */ +void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) +{ + unsigned long flags; + int this_cpu; + + local_irq_save(flags); + this_cpu = smp_processor_id(); + __send_remote_softirq(cp, cpu, this_cpu, softirq); + local_irq_restore(flags); +} +EXPORT_SYMBOL(send_remote_softirq); + +static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { + int cpu = (unsigned long) hcpu; + int i; + + local_irq_disable(); + for (i = 0; i < NR_SOFTIRQS; i++) { + struct list_head *head = &per_cpu(softirq_work_list[i], cpu); + struct list_head *local_head; + + if (list_empty(head)) + continue; + + local_head = &__get_cpu_var(softirq_work_list[i]); + list_splice_init(head, local_head); + raise_softirq_irqoff(i); + } + local_irq_enable(); + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = { + .notifier_call = remote_softirq_cpu_notify, +}; + +void __init softirq_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + int i; + + per_cpu(tasklet_vec, cpu).tail = + &per_cpu(tasklet_vec, cpu).head; + per_cpu(tasklet_hi_vec, cpu).tail = + &per_cpu(tasklet_hi_vec, cpu).head; + for (i = 0; i < NR_SOFTIRQS; i++) + INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); + } + + register_hotcpu_notifier(&remote_softirq_cpu_notifier); + + open_softirq(TASKLET_SOFTIRQ, tasklet_action); + open_softirq(HI_SOFTIRQ, tasklet_hi_action); +} + +static int run_ksoftirqd(void * __bind_cpu) +{ + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + preempt_disable(); + if (!local_softirq_pending()) { + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + } + + __set_current_state(TASK_RUNNING); + + while (local_softirq_pending()) { + /* Preempt disable stops cpu going 
offline. + If already offline, we'll be on wrong CPU: + don't process */ + if (cpu_is_offline((long)__bind_cpu)) + goto wait_to_die; + local_irq_disable(); + if (local_softirq_pending()) + __do_softirq(); + local_irq_enable(); + preempt_enable_no_resched(); + cond_resched(); + preempt_disable(); + rcu_note_context_switch((long)__bind_cpu); + } + preempt_enable(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; + +wait_to_die: + preempt_enable(); + /* Wait for kthread_stop */ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +/* + * tasklet_kill_immediate is called to remove a tasklet which can already be + * scheduled for execution on @cpu. + * + * Unlike tasklet_kill, this function removes the tasklet + * _immediately_, even if the tasklet is in TASKLET_STATE_SCHED state. + * + * When this function is called, @cpu must be in the CPU_DEAD state. + */ +void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu) +{ + struct tasklet_struct **i; + + BUG_ON(cpu_online(cpu)); + BUG_ON(test_bit(TASKLET_STATE_RUN, &t->state)); + + if (!test_bit(TASKLET_STATE_SCHED, &t->state)) + return; + + /* CPU is dead, so no lock needed. */ + for (i = &per_cpu(tasklet_vec, cpu).head; *i; i = &(*i)->next) { + if (*i == t) { + *i = t->next; + /* If this was the tail element, move the tail ptr */ + if (*i == NULL) + per_cpu(tasklet_vec, cpu).tail = i; + return; + } + } + BUG(); +} + +static void takeover_tasklets(unsigned int cpu) +{ + /* CPU is dead, so no lock needed. */ + local_irq_disable(); + + /* Find end, append list for that CPU. 
*/ + if (&per_cpu(tasklet_vec, cpu).head != per_cpu(tasklet_vec, cpu).tail) { + *__this_cpu_read(tasklet_vec.tail) = per_cpu(tasklet_vec, cpu).head; + this_cpu_write(tasklet_vec.tail, per_cpu(tasklet_vec, cpu).tail); + per_cpu(tasklet_vec, cpu).head = NULL; + per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; + } + raise_softirq_irqoff(TASKLET_SOFTIRQ); + + if (&per_cpu(tasklet_hi_vec, cpu).head != per_cpu(tasklet_hi_vec, cpu).tail) { + *__this_cpu_read(tasklet_hi_vec.tail) = per_cpu(tasklet_hi_vec, cpu).head; + __this_cpu_write(tasklet_hi_vec.tail, per_cpu(tasklet_hi_vec, cpu).tail); + per_cpu(tasklet_hi_vec, cpu).head = NULL; + per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; + } + raise_softirq_irqoff(HI_SOFTIRQ); + + local_irq_enable(); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __cpuinit cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + struct task_struct *p; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + p = kthread_create_on_node(run_ksoftirqd, + hcpu, + cpu_to_node(hotcpu), + "ksoftirqd/%d", hotcpu); + if (IS_ERR(p)) { + printk("ksoftirqd for %i failed\n", hotcpu); + return notifier_from_errno(PTR_ERR(p)); + } + kthread_bind(p, hotcpu); + per_cpu(ksoftirqd, hotcpu) = p; + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + wake_up_process(per_cpu(ksoftirqd, hotcpu)); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!per_cpu(ksoftirqd, hotcpu)) + break; + /* Unbind so it can run. Fall thru. 
*/ + kthread_bind(per_cpu(ksoftirqd, hotcpu), + cpumask_any(cpu_online_mask)); + case CPU_DEAD: + case CPU_DEAD_FROZEN: { + static const struct sched_param param = { + .sched_priority = MAX_RT_PRIO-1 + }; + + p = per_cpu(ksoftirqd, hotcpu); + per_cpu(ksoftirqd, hotcpu) = NULL; + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); + kthread_stop(p); + takeover_tasklets(hotcpu); + break; + } +#endif /* CONFIG_HOTPLUG_CPU */ + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +static __init int spawn_ksoftirqd(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + + BUG_ON(err != NOTIFY_OK); + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + return 0; +} +early_initcall(spawn_ksoftirqd); + +/* + * [ These __weak aliases are kept in a separate compilation unit, so that + * GCC does not inline them incorrectly. ] + */ + +int __init __weak early_irq_init(void) +{ + return 0; +} + +#ifdef CONFIG_GENERIC_HARDIRQS +int __init __weak arch_probe_nr_irqs(void) +{ + return NR_IRQS_LEGACY; +} + +int __init __weak arch_early_irq_init(void) +{ + return 0; +} +#endif diff --git a/kernel/spinlock.c b/kernel/spinlock.c new file mode 100644 index 00000000..be6517fb --- /dev/null +++ b/kernel/spinlock.c @@ -0,0 +1,385 @@ +/* + * Copyright (2004) Linus Torvalds + * + * Author: Zwane Mwaikambo + * + * Copyright (2004, 2005) Ingo Molnar + * + * This file contains the spinlock/rwlock implementations for the + * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) + * + * Note that some architectures have special knowledge about the + * stack frames of these functions in their profile_pc. If you + * change anything significant here that could change the stack + * frame contact the architecture maintainers. 
+ */ + +#include +#include +#include +#include +#include +#include + +/* + * If lockdep is enabled then we use the non-preemption spin-ops + * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are + * not re-enabled during lock-acquire (which the preempt-spin-ops do): + */ +#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) +/* + * The __lock_function inlines are taken from + * include/linux/spinlock_api_smp.h + */ +#else +#define raw_read_can_lock(l) read_can_lock(l) +#define raw_write_can_lock(l) write_can_lock(l) +/* + * We build the __lock_function inlines here. They are too large for + * inlining all over the place, but here is only one user per function + * which embeds them into the calling _lock_function below. + * + * This could be a long-held lock. We both prepare to spin for a long + * time (making _this_ CPU preemptable if possible), and we also signal + * towards that other CPU that it should break the lock ASAP. + */ +#define BUILD_LOCK_OPS(op, locktype) \ +void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ +{ \ + for (;;) { \ + preempt_disable(); \ + if (likely(do_raw_##op##_trylock(lock))) \ + break; \ + preempt_enable(); \ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ +} \ + \ +unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + for (;;) { \ + preempt_disable(); \ + local_irq_save(flags); \ + if (likely(do_raw_##op##_trylock(lock))) \ + break; \ + local_irq_restore(flags); \ + preempt_enable(); \ + \ + if (!(lock)->break_lock) \ + (lock)->break_lock = 1; \ + while (!raw_##op##_can_lock(lock) && (lock)->break_lock)\ + arch_##op##_relax(&lock->raw_lock); \ + } \ + (lock)->break_lock = 0; \ + return flags; \ +} \ + \ +void __lockfunc __raw_##op##_lock_irq(locktype##_t *lock) \ +{ \ + 
_raw_##op##_lock_irqsave(lock); \ +} \ + \ +void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ +{ \ + unsigned long flags; \ + \ + /* */ \ + /* Careful: we must exclude softirqs too, hence the */ \ + /* irq-disabling. We use the generic preemption-aware */ \ + /* function: */ \ + /**/ \ + flags = _raw_##op##_lock_irqsave(lock); \ + local_bh_disable(); \ + local_irq_restore(flags); \ +} \ + +/* + * Build preemption-friendly versions of the following + * lock-spinning functions: + * + * __[spin|read|write]_lock() + * __[spin|read|write]_lock_irq() + * __[spin|read|write]_lock_irqsave() + * __[spin|read|write]_lock_bh() + */ +BUILD_LOCK_OPS(spin, raw_spinlock); +BUILD_LOCK_OPS(read, rwlock); +BUILD_LOCK_OPS(write, rwlock); + +#endif + +#ifndef CONFIG_INLINE_SPIN_TRYLOCK +int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock) +{ + return __raw_spin_trylock(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock); +#endif + +#ifndef CONFIG_INLINE_SPIN_TRYLOCK_BH +int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock) +{ + return __raw_spin_trylock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_trylock_bh); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK +void __lockfunc _raw_spin_lock(raw_spinlock_t *lock) +{ + __raw_spin_lock(lock); +} +EXPORT_SYMBOL(_raw_spin_lock); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQSAVE +unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock) +{ + return __raw_spin_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_IRQ +void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock) +{ + __raw_spin_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_LOCK_BH +void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock) +{ + __raw_spin_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_lock_bh); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK +void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock) +{ + __raw_spin_unlock(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock); +#endif 
+ +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQRESTORE +void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags) +{ + __raw_spin_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_spin_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_IRQ +void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock) +{ + __raw_spin_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_SPIN_UNLOCK_BH +void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) +{ + __raw_spin_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_spin_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_TRYLOCK +int __lockfunc _raw_read_trylock(rwlock_t *lock) +{ + return __raw_read_trylock(lock); +} +EXPORT_SYMBOL(_raw_read_trylock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK +void __lockfunc _raw_read_lock(rwlock_t *lock) +{ + __raw_read_lock(lock); +} +EXPORT_SYMBOL(_raw_read_lock); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQSAVE +unsigned long __lockfunc _raw_read_lock_irqsave(rwlock_t *lock) +{ + return __raw_read_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_read_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_IRQ +void __lockfunc _raw_read_lock_irq(rwlock_t *lock) +{ + __raw_read_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_read_lock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_LOCK_BH +void __lockfunc _raw_read_lock_bh(rwlock_t *lock) +{ + __raw_read_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_read_lock_bh); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK +void __lockfunc _raw_read_unlock(rwlock_t *lock) +{ + __raw_read_unlock(lock); +} +EXPORT_SYMBOL(_raw_read_unlock); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQRESTORE +void __lockfunc _raw_read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + __raw_read_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_read_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_IRQ +void __lockfunc _raw_read_unlock_irq(rwlock_t *lock) +{ + __raw_read_unlock_irq(lock); +} 
+EXPORT_SYMBOL(_raw_read_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_READ_UNLOCK_BH +void __lockfunc _raw_read_unlock_bh(rwlock_t *lock) +{ + __raw_read_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_read_unlock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_TRYLOCK +int __lockfunc _raw_write_trylock(rwlock_t *lock) +{ + return __raw_write_trylock(lock); +} +EXPORT_SYMBOL(_raw_write_trylock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK +void __lockfunc _raw_write_lock(rwlock_t *lock) +{ + __raw_write_lock(lock); +} +EXPORT_SYMBOL(_raw_write_lock); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQSAVE +unsigned long __lockfunc _raw_write_lock_irqsave(rwlock_t *lock) +{ + return __raw_write_lock_irqsave(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irqsave); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_IRQ +void __lockfunc _raw_write_lock_irq(rwlock_t *lock) +{ + __raw_write_lock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_lock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_LOCK_BH +void __lockfunc _raw_write_lock_bh(rwlock_t *lock) +{ + __raw_write_lock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_lock_bh); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK +void __lockfunc _raw_write_unlock(rwlock_t *lock) +{ + __raw_write_unlock(lock); +} +EXPORT_SYMBOL(_raw_write_unlock); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQRESTORE +void __lockfunc _raw_write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) +{ + __raw_write_unlock_irqrestore(lock, flags); +} +EXPORT_SYMBOL(_raw_write_unlock_irqrestore); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_IRQ +void __lockfunc _raw_write_unlock_irq(rwlock_t *lock) +{ + __raw_write_unlock_irq(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_irq); +#endif + +#ifndef CONFIG_INLINE_WRITE_UNLOCK_BH +void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) +{ + __raw_write_unlock_bh(lock); +} +EXPORT_SYMBOL(_raw_write_unlock_bh); +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) +{ + 
preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nested); + +unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, + int subclass) +{ + unsigned long flags; + + local_irq_save(flags); + preempt_disable(); + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, + do_raw_spin_lock_flags, &flags); + return flags; +} +EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); + +void __lockfunc _raw_spin_lock_nest_lock(raw_spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + preempt_disable(); + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); +} +EXPORT_SYMBOL(_raw_spin_lock_nest_lock); + +#endif + +notrace int in_lock_functions(unsigned long addr) +{ + /* Linker adds these: start and end of __lockfunc functions */ + extern char __lock_text_start[], __lock_text_end[]; + + return addr >= (unsigned long)__lock_text_start + && addr < (unsigned long)__lock_text_end; +} +EXPORT_SYMBOL(in_lock_functions); diff --git a/kernel/srcu.c b/kernel/srcu.c new file mode 100644 index 00000000..73ce23fe --- /dev/null +++ b/kernel/srcu.c @@ -0,0 +1,315 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2006 + * + * Author: Paul McKenney + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->completed = 0; + mutex_init(&sp->mutex); + sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + return sp->per_cpu_ref ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * srcu_readers_active_idx -- returns approximate number of readers + * active on the specified rank of per-CPU counters. + */ + +static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + int sum; + + sum = 0; + for_each_possible_cpu(cpu) + sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; + return sum; +} + +/** + * srcu_readers_active - returns approximate number of readers. 
+ * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +static int srcu_readers_active(struct srcu_struct *sp) +{ + return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); +} + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + int sum; + + sum = srcu_readers_active(sp); + WARN_ON(sum); /* Leakage unless caller handles error. */ + if (sum != 0) + return; + free_percpu(sp->per_cpu_ref); + sp->per_cpu_ref = NULL; +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + preempt_disable(); + idx = sp->completed & 0x1; + barrier(); /* ensure compiler looks -once- at sp->completed. */ + per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; + srcu_barrier(); /* ensure compiler won't misorder critical section. */ + preempt_enable(); + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + preempt_disable(); + srcu_barrier(); /* ensure compiler won't misorder critical section. 
*/ + per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after 10 microseconds, + * we repeatedly block for 1-millisecond time periods. This approach + * has done well in testing, so there is no need for a config parameter. + */ +#define SYNCHRONIZE_SRCU_READER_DELAY 10 + +/* + * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). + */ +static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) +{ + int idx; + + idx = sp->completed; + mutex_lock(&sp->mutex); + + /* + * Check to see if someone else did the work for us while we were + * waiting to acquire the lock. We need -two- advances of + * the counter, not just one. If there was but one, we might have + * shown up -after- our helper's first synchronize_sched(), thus + * having failed to prevent CPU-reordering races with concurrent + * srcu_read_unlock()s on other CPUs (see comment below). So we + * either (1) wait for two or (2) supply the second ourselves. + */ + + if ((sp->completed - idx) >= 2) { + mutex_unlock(&sp->mutex); + return; + } + + sync_func(); /* Force memory barrier on all CPUs. */ + + /* + * The preceding synchronize_sched() ensures that any CPU that + * sees the new value of sp->completed will also see any preceding + * changes to data structures made by this CPU. This prevents + * some other CPU from reordering the accesses in its SRCU + * read-side critical section to precede the corresponding + * srcu_read_lock() -- ensuring that such references will in + * fact be protected. + * + * So it is now safe to do the flip. 
+ */ + + idx = sp->completed & 0x1; + sp->completed++; + + sync_func(); /* Force memory barrier on all CPUs. */ + + /* + * At this point, because of the preceding synchronize_sched(), + * all srcu_read_lock() calls using the old counters have completed. + * Their corresponding critical sections might well be still + * executing, but the srcu_read_lock() primitives themselves + * will have finished executing. We initially give readers + * an arbitrarily chosen 10 microseconds to get out of their + * SRCU read-side critical sections, then loop waiting 1/HZ + * seconds per iteration. The 10-microsecond value has done + * very well in testing. + */ + + if (srcu_readers_active_idx(sp, idx)) + udelay(SYNCHRONIZE_SRCU_READER_DELAY); + while (srcu_readers_active_idx(sp, idx)) + schedule_timeout_interruptible(1); + + sync_func(); /* Force memory barrier on all CPUs. */ + + /* + * The preceding synchronize_sched() forces all srcu_read_unlock() + * primitives that were executing concurrently with the preceding + * for_each_possible_cpu() loop to have completed by this point. + * More importantly, it also forces the corresponding SRCU read-side + * critical sections to have also completed, and the corresponding + * references to SRCU-protected data items to be dropped. + * + * Note: + * + * Despite what you might think at first glance, the + * preceding synchronize_sched() -must- be within the + * critical section ended by the following mutex_unlock(). + * Otherwise, a task taking the early exit can race + * with a srcu_read_unlock(), which might have executed + * just before the preceding srcu_readers_active() check, + * and whose CPU might have reordered the srcu_read_unlock() + * with the preceding critical section. In this case, there + * is nothing preventing the synchronize_sched() task that is + * taking the early exit from freeing a data structure that + * is still being referenced (out of order) by the task + * doing the srcu_read_unlock(). 
+ * + * Alternatively, the comparison with "2" on the early exit + * could be changed to "3", but this increases synchronize_srcu() + * latency for bulk loads. So the current code is preferred. + */ + + mutex_unlock(&sp->mutex); +} + +/** + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, synchronize_sched); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/** + * synchronize_srcu_expedited - like synchronize_srcu, but less patient + * @sp: srcu_struct with which to synchronize. + * + * Flip the completed counter, and wait for the old count to drain to zero. + * As with classic RCU, the updater must use some separate means of + * synchronizing concurrent updates. Can block; must be called from + * process context. + * + * Note that it is illegal to call synchronize_srcu_expedited() + * from the corresponding SRCU read-side critical section; doing so + * will result in deadlock. However, it is perfectly legal to call + * synchronize_srcu_expedited() on one srcu_struct from some other + * srcu_struct's read-side critical section. + */ +void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + __synchronize_srcu(sp, synchronize_sched_expedited); +} +EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); + +/** + * srcu_batches_completed - return batches completed. 
+ * @sp: srcu_struct on which to report batch completion. + * + * Report the number of batches, correlated with, but not necessarily + * precisely the same as, the number of grace periods that have elapsed. + */ + +long srcu_batches_completed(struct srcu_struct *sp) +{ + return sp->completed; +} +EXPORT_SYMBOL_GPL(srcu_batches_completed); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c new file mode 100644 index 00000000..eb212f8f --- /dev/null +++ b/kernel/stacktrace.c @@ -0,0 +1,37 @@ +/* + * kernel/stacktrace.c + * + * Stack trace management functions + * + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar + */ +#include +#include +#include +#include +#include + +void print_stack_trace(struct stack_trace *trace, int spaces) +{ + int i; + + if (WARN_ON(!trace->entries)) + return; + + for (i = 0; i < trace->nr_entries; i++) { + printk("%*c", 1 + spaces, ' '); + print_ip_sym(trace->entries[i]); + } +} +EXPORT_SYMBOL_GPL(print_stack_trace); + +/* + * Architectures that do not implement save_stack_trace_tsk get this + * weak alias and a once-per-bootup warning (whenever this facility + * is utilized - for example by procfs): + */ +__weak void +save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) +{ + WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk() not implemented yet.\n"); +} diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c new file mode 100644 index 00000000..0cae1cc3 --- /dev/null +++ b/kernel/stop_machine.c @@ -0,0 +1,490 @@ +/* + * kernel/stop_machine.c + * + * Copyright (C) 2008, 2005 IBM Corporation. + * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo + * + * This file is released under the GPLv2 and any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Structure to determine completion condition and record errors. 
May + * be shared by works on different cpus. + */ +struct cpu_stop_done { + atomic_t nr_todo; /* nr left to execute */ + bool executed; /* actually executed? */ + int ret; /* collected return value */ + struct completion completion; /* fired if nr_todo reaches 0 */ +}; + +/* the actual stopper, one per every possible cpu, enabled on online cpus */ +struct cpu_stopper { + spinlock_t lock; + bool enabled; /* is this stopper enabled? */ + struct list_head works; /* list of pending works */ + struct task_struct *thread; /* stopper thread */ +}; + +static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); + +static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) +{ + memset(done, 0, sizeof(*done)); + atomic_set(&done->nr_todo, nr_todo); + init_completion(&done->completion); +} + +/* signal completion unless @done is NULL */ +static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +{ + if (done) { + if (executed) + done->executed = true; + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); + } +} + +/* queue @work to @stopper. if offline, @work is completed immediately */ +static void cpu_stop_queue_work(struct cpu_stopper *stopper, + struct cpu_stop_work *work) +{ + unsigned long flags; + + spin_lock_irqsave(&stopper->lock, flags); + + if (stopper->enabled) { + list_add_tail(&work->list, &stopper->works); + wake_up_process(stopper->thread); + } else + cpu_stop_signal_done(work->done, false); + + spin_unlock_irqrestore(&stopper->lock, flags); +} + +/** + * stop_one_cpu - stop a cpu + * @cpu: cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Execute @fn(@arg) on @cpu. @fn is run in a process context with + * the highest priority preempting any task on the cpu and + * monopolizing it. This function returns after the execution is + * complete. + * + * This function doesn't guarantee @cpu stays online till @fn + * completes. 
If @cpu goes down in the middle, execution may happen + * partially or fully on different cpus. @fn should either be ready + * for that or the caller should ensure that @cpu stays online until + * this function completes. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -ENOENT if @fn(@arg) was not executed because @cpu was offline; + * otherwise, the return value of @fn. + */ +int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_done done; + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; + + cpu_stop_init_done(&done, 1); + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work); + wait_for_completion(&done.completion); + return done.executed ? done.ret : -ENOENT; +} + +/** + * stop_one_cpu_nowait - stop a cpu but don't wait for completion + * @cpu: cpu to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Similar to stop_one_cpu() but doesn't wait for completion. The + * caller is responsible for ensuring @work_buf is currently unused + * and will remain untouched until stopper starts executing @fn. + * + * CONTEXT: + * Don't care. 
+ */ +void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, + struct cpu_stop_work *work_buf) +{ + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf); +} + +DEFINE_MUTEX(stop_cpus_mutex); +/* static data for stop_cpus */ +static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work); + +int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + struct cpu_stop_work *work; + struct cpu_stop_done done; + unsigned int cpu; + + /* initialize works and done */ + for_each_cpu(cpu, cpumask) { + work = &per_cpu(stop_cpus_work, cpu); + work->fn = fn; + work->arg = arg; + work->done = &done; + } + cpu_stop_init_done(&done, cpumask_weight(cpumask)); + + /* + * Disable preemption while queueing to avoid getting + * preempted by a stopper which might wait for other stoppers + * to enter @fn which can lead to deadlock. + */ + preempt_disable(); + for_each_cpu(cpu, cpumask) + cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), + &per_cpu(stop_cpus_work, cpu)); + preempt_enable(); + + wait_for_completion(&done.completion); + return done.executed ? done.ret : -ENOENT; +} + +/** + * stop_cpus - stop multiple cpus + * @cpumask: cpus to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu, + * @fn is run in a process context with the highest priority + * preempting any task on the cpu and monopolizing it. This function + * returns after all executions are complete. + * + * This function doesn't guarantee the cpus in @cpumask stay online + * till @fn completes. If some cpus go down in the middle, execution + * on the cpu may happen partially or fully on different cpus. @fn + * should either be ready for that or the caller should ensure that + * the cpus stay online until this function completes. 
+ * + * All stop_cpus() calls are serialized making it safe for @fn to wait + * for all cpus to start executing it. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -ENOENT if @fn(@arg) was not executed at all because all cpus in + * @cpumask were offline; otherwise, 0 if all executions of @fn + * returned 0, any non zero return value if any returned non zero. + */ +int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + int ret; + + /* static works are used, process one request at a time */ + mutex_lock(&stop_cpus_mutex); + ret = __stop_cpus(cpumask, fn, arg); + mutex_unlock(&stop_cpus_mutex); + return ret; +} + +/** + * try_stop_cpus - try to stop multiple cpus + * @cpumask: cpus to stop + * @fn: function to execute + * @arg: argument to @fn + * + * Identical to stop_cpus() except that it fails with -EAGAIN if + * someone else is already using the facility. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * -EAGAIN if someone else is already stopping cpus, -ENOENT if + * @fn(@arg) was not executed at all because all cpus in @cpumask were + * offline; otherwise, 0 if all executions of @fn returned 0, any non + * zero return value if any returned non zero. 
+ */ +int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg) +{ + int ret; + + /* static works are used, process one request at a time */ + if (!mutex_trylock(&stop_cpus_mutex)) + return -EAGAIN; + ret = __stop_cpus(cpumask, fn, arg); + mutex_unlock(&stop_cpus_mutex); + return ret; +} + +static int cpu_stopper_thread(void *data) +{ + struct cpu_stopper *stopper = data; + struct cpu_stop_work *work; + int ret; + +repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + return 0; + } + + work = NULL; + spin_lock_irq(&stopper->lock); + if (!list_empty(&stopper->works)) { + work = list_first_entry(&stopper->works, + struct cpu_stop_work, list); + list_del_init(&work->list); + } + spin_unlock_irq(&stopper->lock); + + if (work) { + cpu_stop_fn_t fn = work->fn; + void *arg = work->arg; + struct cpu_stop_done *done = work->done; + char ksym_buf[KSYM_NAME_LEN] __maybe_unused; + + __set_current_state(TASK_RUNNING); + + /* cpu stop callbacks are not allowed to sleep */ + preempt_disable(); + + ret = fn(arg); + if (ret) + done->ret = ret; + + /* restore preemption and check it's still balanced */ + preempt_enable(); + WARN_ONCE(preempt_count(), + "cpu_stop: %s(%p) leaked preempt count\n", + kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, + ksym_buf), arg); + + cpu_stop_signal_done(done, true); + } else + schedule(); + + goto repeat; +} + +extern void sched_set_stop_task(int cpu, struct task_struct *stop); + +/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */ +static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + struct task_struct *p; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_UP_PREPARE: + BUG_ON(stopper->thread || stopper->enabled || + 
!list_empty(&stopper->works)); + p = kthread_create_on_node(cpu_stopper_thread, + stopper, + cpu_to_node(cpu), + "migration/%d", cpu); + if (IS_ERR(p)) + return notifier_from_errno(PTR_ERR(p)); + get_task_struct(p); + kthread_bind(p, cpu); + sched_set_stop_task(cpu, p); + stopper->thread = p; + break; + + case CPU_ONLINE: + /* strictly unnecessary, as first user will wake it */ + wake_up_process(stopper->thread); + /* mark enabled */ + spin_lock_irq(&stopper->lock); + stopper->enabled = true; + spin_unlock_irq(&stopper->lock); + break; + +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_POST_DEAD: + { + struct cpu_stop_work *work; + + sched_set_stop_task(cpu, NULL); + /* kill the stopper */ + kthread_stop(stopper->thread); + /* drain remaining works */ + spin_lock_irq(&stopper->lock); + list_for_each_entry(work, &stopper->works, list) + cpu_stop_signal_done(work->done, false); + stopper->enabled = false; + spin_unlock_irq(&stopper->lock); + /* release the stopper */ + put_task_struct(stopper->thread); + stopper->thread = NULL; + break; + } +#endif + } + + return NOTIFY_OK; +} + +/* + * Give it a higher priority so that cpu stopper is available to other + * cpu notifiers. It currently shares the same priority as sched + * migration_notifier. 
+ */ +static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = { + .notifier_call = cpu_stop_cpu_callback, + .priority = 10, +}; + +static int __init cpu_stop_init(void) +{ + void *bcpu = (void *)(long)smp_processor_id(); + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); + + spin_lock_init(&stopper->lock); + INIT_LIST_HEAD(&stopper->works); + } + + /* start one for the boot cpu */ + err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE, + bcpu); + BUG_ON(err != NOTIFY_OK); + cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); + register_cpu_notifier(&cpu_stop_cpu_notifier); + + return 0; +} +early_initcall(cpu_stop_init); + +#ifdef CONFIG_STOP_MACHINE + +/* This controls the threads on each CPU. */ +enum stopmachine_state { + /* Dummy starting state for thread. */ + STOPMACHINE_NONE, + /* Awaiting everyone to be scheduled. */ + STOPMACHINE_PREPARE, + /* Disable interrupts. */ + STOPMACHINE_DISABLE_IRQ, + /* Run the function */ + STOPMACHINE_RUN, + /* Exit */ + STOPMACHINE_EXIT, +}; + +struct stop_machine_data { + int (*fn)(void *); + void *data; + /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ + unsigned int num_threads; + const struct cpumask *active_cpus; + + enum stopmachine_state state; + atomic_t thread_ack; +}; + +static void set_state(struct stop_machine_data *smdata, + enum stopmachine_state newstate) +{ + /* Reset ack counter. */ + atomic_set(&smdata->thread_ack, smdata->num_threads); + smp_wmb(); + smdata->state = newstate; +} + +/* Last one to ack a state moves to the next state. */ +static void ack_state(struct stop_machine_data *smdata) +{ + if (atomic_dec_and_test(&smdata->thread_ack)) + set_state(smdata, smdata->state + 1); +} + +/* This is the cpu_stop function which stops the CPU. 
*/ +static int stop_machine_cpu_stop(void *data) +{ + struct stop_machine_data *smdata = data; + enum stopmachine_state curstate = STOPMACHINE_NONE; + int cpu = smp_processor_id(), err = 0; + bool is_active; + + if (!smdata->active_cpus) + is_active = cpu == cpumask_first(cpu_online_mask); + else + is_active = cpumask_test_cpu(cpu, smdata->active_cpus); + + /* Simple state machine */ + do { + /* Chill out and ensure we re-read stopmachine_state. */ + cpu_relax(); + if (smdata->state != curstate) { + curstate = smdata->state; + switch (curstate) { + case STOPMACHINE_DISABLE_IRQ: + local_irq_disable(); + hard_irq_disable(); + break; + case STOPMACHINE_RUN: + if (is_active) + err = smdata->fn(smdata->data); + break; + default: + break; + } + ack_state(smdata); + } + } while (curstate != STOPMACHINE_EXIT); + + local_irq_enable(); + return err; +} + +int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +{ + struct stop_machine_data smdata = { .fn = fn, .data = data, + .num_threads = num_online_cpus(), + .active_cpus = cpus }; + + /* Set the initial state and stop all online cpus. */ + set_state(&smdata, STOPMACHINE_PREPARE); + return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); +} + +int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) +{ + int ret; + + /* No CPUs can come up or down during this. 
*/ + get_online_cpus(); + ret = __stop_machine(fn, data, cpus); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(stop_machine); + +#endif /* CONFIG_STOP_MACHINE */ diff --git a/kernel/sys.c b/kernel/sys.c new file mode 100644 index 00000000..f88dadc8 --- /dev/null +++ b/kernel/sys.c @@ -0,0 +1,1878 @@ +/* + * linux/kernel/sys.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +/* Move somewhere else to avoid recompiling? */ +#include + +#include +#include +#include + +#ifndef SET_UNALIGN_CTL +# define SET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_UNALIGN_CTL +# define GET_UNALIGN_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEMU_CTL +# define SET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEMU_CTL +# define GET_FPEMU_CTL(a,b) (-EINVAL) +#endif +#ifndef SET_FPEXC_CTL +# define SET_FPEXC_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_FPEXC_CTL +# define GET_FPEXC_CTL(a,b) (-EINVAL) +#endif +#ifndef GET_ENDIAN +# define GET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef SET_ENDIAN +# define SET_ENDIAN(a,b) (-EINVAL) +#endif +#ifndef GET_TSC_CTL +# define GET_TSC_CTL(a) (-EINVAL) +#endif +#ifndef SET_TSC_CTL +# define SET_TSC_CTL(a) (-EINVAL) +#endif + +/* + * this is where the system-wide overflow UID and GID are defined, for + * architectures that now have 32-bit UID/GID but didn't in the past + */ + +int overflowuid = DEFAULT_OVERFLOWUID; +int overflowgid = DEFAULT_OVERFLOWGID; + +#ifdef CONFIG_UID16 +EXPORT_SYMBOL(overflowuid); +EXPORT_SYMBOL(overflowgid); +#endif + +/* + * the same as above, but for filesystems which can only store a 16-bit + * UID and 
GID. as such, this is needed on all architectures + */ + +int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; +int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; + +EXPORT_SYMBOL(fs_overflowuid); +EXPORT_SYMBOL(fs_overflowgid); + +/* + * this indicates whether you can reboot with ctrl-alt-del: the default is yes + */ + +int C_A_D = 1; +struct pid *cad_pid; +EXPORT_SYMBOL(cad_pid); + +/* + * If set, this is used for preparing the system to power off. + */ + +void (*pm_power_off_prepare)(void); + +/* + * Returns true if current's euid is same as p's uid or euid, + * or has CAP_SYS_NICE to p's user_ns. + * + * Called with rcu_read_lock, creds are safe + */ +static bool set_one_prio_perm(struct task_struct *p) +{ + const struct cred *cred = current_cred(), *pcred = __task_cred(p); + + if (pcred->user->user_ns == cred->user->user_ns && + (pcred->uid == cred->euid || + pcred->euid == cred->euid)) + return true; + if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE)) + return true; + return false; +} + +/* + * set the priority of a task + * - the caller must hold the RCU read lock + */ +static int set_one_prio(struct task_struct *p, int niceval, int error) +{ + int no_nice; + + if (!set_one_prio_perm(p)) { + error = -EPERM; + goto out; + } + if (niceval < task_nice(p) && !can_nice(p, niceval)) { + error = -EACCES; + goto out; + } + no_nice = security_task_setnice(p, niceval); + if (no_nice) { + error = no_nice; + goto out; + } + if (error == -ESRCH) + error = 0; + set_user_nice(p, niceval); +out: + return error; +} + +SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) +{ + struct task_struct *g, *p; + struct user_struct *user; + const struct cred *cred = current_cred(); + int error = -EINVAL; + struct pid *pgrp; + + if (which > PRIO_USER || which < PRIO_PROCESS) + goto out; + + /* normalize: avoid signed division (rounding problems) */ + error = -ESRCH; + if (niceval < -20) + niceval = -20; + if (niceval > 19) + niceval = 19; + + rcu_read_lock(); + 
read_lock(&tasklist_lock); + switch (which) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) + error = set_one_prio(p, niceval, error); + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + error = set_one_prio(p, niceval, error); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = (struct user_struct *) cred->user; + if (!who) + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) { + if (__task_cred(p)->uid == who) + error = set_one_prio(p, niceval, error); + } while_each_thread(g, p); + if (who != cred->uid) + free_uid(user); /* For find_user() */ + break; + } +out_unlock: + read_unlock(&tasklist_lock); + rcu_read_unlock(); +out: + return error; +} + +/* + * Ugh. To avoid negative return values, "getpriority()" will + * not return the normal nice-value, but a negated value that + * has been offset by 20 (ie it returns 40..1 instead of -20..19) + * to stay compatible. 
+ */ +SYSCALL_DEFINE2(getpriority, int, which, int, who) +{ + struct task_struct *g, *p; + struct user_struct *user; + const struct cred *cred = current_cred(); + long niceval, retval = -ESRCH; + struct pid *pgrp; + + if (which > PRIO_USER || which < PRIO_PROCESS) + return -EINVAL; + + rcu_read_lock(); + read_lock(&tasklist_lock); + switch (which) { + case PRIO_PROCESS: + if (who) + p = find_task_by_vpid(who); + else + p = current; + if (p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + break; + case PRIO_PGRP: + if (who) + pgrp = find_vpid(who); + else + pgrp = task_pgrp(current); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case PRIO_USER: + user = (struct user_struct *) cred->user; + if (!who) + who = cred->uid; + else if ((who != cred->uid) && + !(user = find_user(who))) + goto out_unlock; /* No processes for this user */ + + do_each_thread(g, p) { + if (__task_cred(p)->uid == who) { + niceval = 20 - task_nice(p); + if (niceval > retval) + retval = niceval; + } + } while_each_thread(g, p); + if (who != cred->uid) + free_uid(user); /* for find_user() */ + break; + } +out_unlock: + read_unlock(&tasklist_lock); + rcu_read_unlock(); + + return retval; +} + +/** + * emergency_restart - reboot the system + * + * Without shutting down any hardware or taking any locks + * reboot the system. This is called when we know we are in + * trouble so this is our best effort to reboot. This is + * safe to call in interrupt context. 
+ */ +void emergency_restart(void) +{ + kmsg_dump(KMSG_DUMP_EMERG); + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + +void kernel_restart_prepare(char *cmd) +{ + blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + usermodehelper_disable(); + device_shutdown(); + syscore_shutdown(); +} + +/** + * kernel_restart - reboot the system + * @cmd: pointer to buffer containing command to execute for restart + * or %NULL + * + * Shutdown everything and perform a clean reboot. + * This is not safe to call in interrupt context. + */ +void kernel_restart(char *cmd) +{ + kernel_restart_prepare(cmd); + if (!cmd) + printk(KERN_EMERG "Restarting system.\n"); + else + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + kmsg_dump(KMSG_DUMP_RESTART); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +static void kernel_shutdown_prepare(enum system_states state) +{ + blocking_notifier_call_chain(&reboot_notifier_list, + (state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL); + system_state = state; + usermodehelper_disable(); + device_shutdown(); +} +/** + * kernel_halt - halt the system + * + * Shutdown everything and perform a clean system halt. + */ +void kernel_halt(void) +{ + kernel_shutdown_prepare(SYSTEM_HALT); + syscore_shutdown(); + printk(KERN_EMERG "System halted.\n"); + kmsg_dump(KMSG_DUMP_HALT); + machine_halt(); +} + +EXPORT_SYMBOL_GPL(kernel_halt); + +/** + * kernel_power_off - power_off the system + * + * Shutdown everything and perform a clean system power_off. 
+ */ +void kernel_power_off(void) +{ + kernel_shutdown_prepare(SYSTEM_POWER_OFF); + if (pm_power_off_prepare) + pm_power_off_prepare(); + disable_nonboot_cpus(); + syscore_shutdown(); + printk(KERN_EMERG "Power down.\n"); + kmsg_dump(KMSG_DUMP_POWEROFF); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); + +static DEFINE_MUTEX(reboot_mutex); + +/* + * Reboot system call: for obvious reasons only root may call it, + * and even root needs to set up some magic numbers in the registers + * so that some mistake won't make this reboot the whole machine. + * You can also set the meaning of the ctrl-alt-del-key here. + * + * reboot doesn't sync: do that yourself before calling this. + */ +SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd, + void __user *, arg) +{ + char buffer[256]; + int ret = 0; + + /* We only trust the superuser with rebooting the system. */ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + /* For safety, we require "magic" arguments. */ + if (magic1 != LINUX_REBOOT_MAGIC1 || + (magic2 != LINUX_REBOOT_MAGIC2 && + magic2 != LINUX_REBOOT_MAGIC2A && + magic2 != LINUX_REBOOT_MAGIC2B && + magic2 != LINUX_REBOOT_MAGIC2C)) + return -EINVAL; + + /* Instead of trying to make the power_off code look like + * halt when pm_power_off is not set do it the easy way. 
+ */ + if ((cmd == LINUX_REBOOT_CMD_POWER_OFF) && !pm_power_off) + cmd = LINUX_REBOOT_CMD_HALT; + + mutex_lock(&reboot_mutex); + switch (cmd) { + case LINUX_REBOOT_CMD_RESTART: + kernel_restart(NULL); + break; + + case LINUX_REBOOT_CMD_CAD_ON: + C_A_D = 1; + break; + + case LINUX_REBOOT_CMD_CAD_OFF: + C_A_D = 0; + break; + + case LINUX_REBOOT_CMD_HALT: + kernel_halt(); + do_exit(0); + panic("cannot halt"); + + case LINUX_REBOOT_CMD_POWER_OFF: + kernel_power_off(); + do_exit(0); + break; + + case LINUX_REBOOT_CMD_RESTART2: + if (strncpy_from_user(&buffer[0], arg, sizeof(buffer) - 1) < 0) { + ret = -EFAULT; + break; + } + buffer[sizeof(buffer) - 1] = '\0'; + + kernel_restart(buffer); + break; + +#ifdef CONFIG_KEXEC + case LINUX_REBOOT_CMD_KEXEC: + ret = kernel_kexec(); + break; +#endif + +#ifdef CONFIG_HIBERNATION + case LINUX_REBOOT_CMD_SW_SUSPEND: + ret = hibernate(); + break; +#endif + + default: + ret = -EINVAL; + break; + } + mutex_unlock(&reboot_mutex); + return ret; +} + +static void deferred_cad(struct work_struct *dummy) +{ + kernel_restart(NULL); +} + +/* + * This function gets called by ctrl-alt-del - ie the keyboard interrupt. + * As it's called within an interrupt, it may NOT sync: the only choice + * is whether to reboot at once, or just ignore the ctrl-alt-del. + */ +void ctrl_alt_del(void) +{ + static DECLARE_WORK(cad_work, deferred_cad); + + if (C_A_D) + schedule_work(&cad_work); + else + kill_cad_pid(SIGINT, 1); +} + +/* + * Unprivileged users may change the real gid to the effective gid + * or vice versa. (BSD-style) + * + * If you set the real gid at all, or set the effective gid to a value not + * equal to the real gid, then the saved gid is set to the new effective gid. + * + * This makes it possible for a setgid program to completely drop its + * privileges, which is often a useful assertion to make when you are doing + * a security audit over a program. 
+ * + * The general idea is that a program which uses just setregid() will be + * 100% compatible with BSD. A program which uses just setgid() will be + * 100% compatible with POSIX with saved IDs. + * + * SMP: There are not races, the GIDs are checked only by filesystem + * operations (as far as semantic preservation is concerned). + */ +SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; + if (rgid != (gid_t) -1) { + if (old->gid == rgid || + old->egid == rgid || + nsown_capable(CAP_SETGID)) + new->gid = rgid; + else + goto error; + } + if (egid != (gid_t) -1) { + if (old->gid == egid || + old->egid == egid || + old->sgid == egid || + nsown_capable(CAP_SETGID)) + new->egid = egid; + else + goto error; + } + + if (rgid != (gid_t) -1 || + (egid != (gid_t) -1 && egid != old->gid)) + new->sgid = new->egid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * setgid() is implemented like SysV w/ SAVED_IDS + * + * SMP: Same implicit races as above. 
+ */ +SYSCALL_DEFINE1(setgid, gid_t, gid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; + if (nsown_capable(CAP_SETGID)) + new->gid = new->egid = new->sgid = new->fsgid = gid; + else if (gid == old->gid || gid == old->sgid) + new->egid = new->fsgid = gid; + else + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * change the user struct in a credentials set to match the new UID + */ +static int set_user(struct cred *new) +{ + struct user_struct *new_user; + + new_user = alloc_uid(current_user_ns(), new->uid); + if (!new_user) + return -EAGAIN; + + if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && + new_user != INIT_USER) { + free_uid(new_user); + return -EAGAIN; + } + + free_uid(new->user); + new->user = new_user; + return 0; +} + +/* + * Unprivileged users may change the real uid to the effective uid + * or vice versa. (BSD-style) + * + * If you set the real uid at all, or set the effective uid to a value not + * equal to the real uid, then the saved uid is set to the new effective uid. + * + * This makes it possible for a setuid program to completely drop its + * privileges, which is often a useful assertion to make when you are doing + * a security audit over a program. + * + * The general idea is that a program which uses just setreuid() will be + * 100% compatible with BSD. A program which uses just setuid() will be + * 100% compatible with POSIX with saved IDs. 
+ */ +SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; + if (ruid != (uid_t) -1) { + new->uid = ruid; + if (old->uid != ruid && + old->euid != ruid && + !nsown_capable(CAP_SETUID)) + goto error; + } + + if (euid != (uid_t) -1) { + new->euid = euid; + if (old->uid != euid && + old->euid != euid && + old->suid != euid && + !nsown_capable(CAP_SETUID)) + goto error; + } + + if (new->uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + if (ruid != (uid_t) -1 || + (euid != (uid_t) -1 && euid != old->uid)) + new->suid = new->euid; + new->fsuid = new->euid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_RE); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +/* + * setuid() is implemented like SysV with SAVED_IDS + * + * Note that SAVED_ID's is deficient in that a setuid root program + * like sendmail, for example, cannot set its uid to be a normal + * user and then switch back, because if you're root, setuid() sets + * the saved uid too. If you don't like this, blame the bright people + * in the POSIX committee and/or USG. Note that the BSD-style setreuid() + * will allow a root program to temporarily drop privileges and be able to + * regain them by swapping the real and effective uid. 
+ */ +SYSCALL_DEFINE1(setuid, uid_t, uid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; + if (nsown_capable(CAP_SETUID)) { + new->suid = new->uid = uid; + if (uid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + } else if (uid != old->uid && uid != new->suid) { + goto error; + } + + new->fsuid = new->euid = uid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_ID); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + + +/* + * This function implements a generic ability to update ruid, euid, + * and suid. This allows you to implement the 4.4 compatible seteuid(). + */ +SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + + old = current_cred(); + + retval = -EPERM; + if (!nsown_capable(CAP_SETUID)) { + if (ruid != (uid_t) -1 && ruid != old->uid && + ruid != old->euid && ruid != old->suid) + goto error; + if (euid != (uid_t) -1 && euid != old->uid && + euid != old->euid && euid != old->suid) + goto error; + if (suid != (uid_t) -1 && suid != old->uid && + suid != old->euid && suid != old->suid) + goto error; + } + + if (ruid != (uid_t) -1) { + new->uid = ruid; + if (ruid != old->uid) { + retval = set_user(new); + if (retval < 0) + goto error; + } + } + if (euid != (uid_t) -1) + new->euid = euid; + if (suid != (uid_t) -1) + new->suid = suid; + new->fsuid = new->euid; + + retval = security_task_fix_setuid(new, old, LSM_SETID_RES); + if (retval < 0) + goto error; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +SYSCALL_DEFINE3(getresuid, uid_t __user *, ruid, uid_t __user *, euid, uid_t __user *, suid) +{ + const struct cred *cred = current_cred(); + int retval; + + if (!(retval = 
put_user(cred->uid, ruid)) && + !(retval = put_user(cred->euid, euid))) + retval = put_user(cred->suid, suid); + + return retval; +} + +/* + * Same as above, but for rgid, egid, sgid. + */ +SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) +{ + const struct cred *old; + struct cred *new; + int retval; + + new = prepare_creds(); + if (!new) + return -ENOMEM; + old = current_cred(); + + retval = -EPERM; + if (!nsown_capable(CAP_SETGID)) { + if (rgid != (gid_t) -1 && rgid != old->gid && + rgid != old->egid && rgid != old->sgid) + goto error; + if (egid != (gid_t) -1 && egid != old->gid && + egid != old->egid && egid != old->sgid) + goto error; + if (sgid != (gid_t) -1 && sgid != old->gid && + sgid != old->egid && sgid != old->sgid) + goto error; + } + + if (rgid != (gid_t) -1) + new->gid = rgid; + if (egid != (gid_t) -1) + new->egid = egid; + if (sgid != (gid_t) -1) + new->sgid = sgid; + new->fsgid = new->egid; + + return commit_creds(new); + +error: + abort_creds(new); + return retval; +} + +SYSCALL_DEFINE3(getresgid, gid_t __user *, rgid, gid_t __user *, egid, gid_t __user *, sgid) +{ + const struct cred *cred = current_cred(); + int retval; + + if (!(retval = put_user(cred->gid, rgid)) && + !(retval = put_user(cred->egid, egid))) + retval = put_user(cred->sgid, sgid); + + return retval; +} + + +/* + * "setfsuid()" sets the fsuid - the uid used for filesystem checks. This + * is used for "access()" and for the NFS daemon (letting nfsd stay at + * whatever uid it wants to). It normally shadows "euid", except when + * explicitly set by setfsuid() or for access.. 
+ */ +SYSCALL_DEFINE1(setfsuid, uid_t, uid) +{ + const struct cred *old; + struct cred *new; + uid_t old_fsuid; + + new = prepare_creds(); + if (!new) + return current_fsuid(); + old = current_cred(); + old_fsuid = old->fsuid; + + if (uid == old->uid || uid == old->euid || + uid == old->suid || uid == old->fsuid || + nsown_capable(CAP_SETUID)) { + if (uid != old_fsuid) { + new->fsuid = uid; + if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) + goto change_okay; + } + } + + abort_creds(new); + return old_fsuid; + +change_okay: + commit_creds(new); + return old_fsuid; +} + +/* + * Samma pÃ¥ svenska.. + */ +SYSCALL_DEFINE1(setfsgid, gid_t, gid) +{ + const struct cred *old; + struct cred *new; + gid_t old_fsgid; + + new = prepare_creds(); + if (!new) + return current_fsgid(); + old = current_cred(); + old_fsgid = old->fsgid; + + if (gid == old->gid || gid == old->egid || + gid == old->sgid || gid == old->fsgid || + nsown_capable(CAP_SETGID)) { + if (gid != old_fsgid) { + new->fsgid = gid; + goto change_okay; + } + } + + abort_creds(new); + return old_fsgid; + +change_okay: + commit_creds(new); + return old_fsgid; +} + +void do_sys_times(struct tms *tms) +{ + cputime_t tgutime, tgstime, cutime, cstime; + + spin_lock_irq(¤t->sighand->siglock); + thread_group_times(current, &tgutime, &tgstime); + cutime = current->signal->cutime; + cstime = current->signal->cstime; + spin_unlock_irq(¤t->sighand->siglock); + tms->tms_utime = cputime_to_clock_t(tgutime); + tms->tms_stime = cputime_to_clock_t(tgstime); + tms->tms_cutime = cputime_to_clock_t(cutime); + tms->tms_cstime = cputime_to_clock_t(cstime); +} + +SYSCALL_DEFINE1(times, struct tms __user *, tbuf) +{ + if (tbuf) { + struct tms tmp; + + do_sys_times(&tmp); + if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) + return -EFAULT; + } + force_successful_syscall_return(); + return (long) jiffies_64_to_clock_t(get_jiffies_64()); +} + +/* + * This needs some heavy checking ... + * I just haven't the stomach for it. 
I also don't fully + * understand sessions/pgrp etc. Let somebody who does explain it. + * + * OK, I think I have the protection semantics right.... this is really + * only important on a multi-user system anyway, to make sure one user + * can't send a signal to a process owned by another. -TYT, 12/12/91 + * + * Auch. Had to add the 'did_exec' flag to conform completely to POSIX. + * LBT 04.03.94 + */ +SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid) +{ + struct task_struct *p; + struct task_struct *group_leader = current->group_leader; + struct pid *pgrp; + int err; + + if (!pid) + pid = task_pid_vnr(group_leader); + if (!pgid) + pgid = pid; + if (pgid < 0) + return -EINVAL; + rcu_read_lock(); + + /* From this point forward we keep holding onto the tasklist lock + * so that our parent does not change from under us. -DaveM + */ + write_lock_irq(&tasklist_lock); + + err = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + + err = -EINVAL; + if (!thread_group_leader(p)) + goto out; + + if (same_thread_group(p->real_parent, group_leader)) { + err = -EPERM; + if (task_session(p) != task_session(group_leader)) + goto out; + err = -EACCES; + if (p->did_exec) + goto out; + } else { + err = -ESRCH; + if (p != group_leader) + goto out; + } + + err = -EPERM; + if (p->signal->leader) + goto out; + + pgrp = task_pid(p); + if (pgid != pid) { + struct task_struct *g; + + pgrp = find_vpid(pgid); + g = pid_task(pgrp, PIDTYPE_PGID); + if (!g || task_session(g) != task_session(group_leader)) + goto out; + } + + err = security_task_setpgid(p, pgid); + if (err) + goto out; + + if (task_pgrp(p) != pgrp) + change_pid(p, PIDTYPE_PGID, pgrp); + + err = 0; +out: + /* All paths lead to here, thus we are safe. 
-DaveM */ + write_unlock_irq(&tasklist_lock); + rcu_read_unlock(); + return err; +} + +SYSCALL_DEFINE1(getpgid, pid_t, pid) +{ + struct task_struct *p; + struct pid *grp; + int retval; + + rcu_read_lock(); + if (!pid) + grp = task_pgrp(current); + else { + retval = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + grp = task_pgrp(p); + if (!grp) + goto out; + + retval = security_task_getpgid(p); + if (retval) + goto out; + } + retval = pid_vnr(grp); +out: + rcu_read_unlock(); + return retval; +} + +#ifdef __ARCH_WANT_SYS_GETPGRP + +SYSCALL_DEFINE0(getpgrp) +{ + return sys_getpgid(0); +} + +#endif + +SYSCALL_DEFINE1(getsid, pid_t, pid) +{ + struct task_struct *p; + struct pid *sid; + int retval; + + rcu_read_lock(); + if (!pid) + sid = task_session(current); + else { + retval = -ESRCH; + p = find_task_by_vpid(pid); + if (!p) + goto out; + sid = task_session(p); + if (!sid) + goto out; + + retval = security_task_getsid(p); + if (retval) + goto out; + } + retval = pid_vnr(sid); +out: + rcu_read_unlock(); + return retval; +} + +SYSCALL_DEFINE0(setsid) +{ + struct task_struct *group_leader = current->group_leader; + struct pid *sid = task_pid(group_leader); + pid_t session = pid_vnr(sid); + int err = -EPERM; + + write_lock_irq(&tasklist_lock); + /* Fail if I am already a session leader */ + if (group_leader->signal->leader) + goto out; + + /* Fail if a process group id already exists that equals the + * proposed session id. 
+ */ + if (pid_task(sid, PIDTYPE_PGID)) + goto out; + + group_leader->signal->leader = 1; + __set_special_pids(sid); + + proc_clear_tty(group_leader); + + err = session; +out: + write_unlock_irq(&tasklist_lock); + if (err > 0) { + proc_sid_connector(group_leader); + sched_autogroup_create_attach(group_leader); + } + return err; +} + +DECLARE_RWSEM(uts_sem); + +#ifdef COMPAT_UTS_MACHINE +#define override_architecture(name) \ + (personality(current->personality) == PER_LINUX32 && \ + copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ + sizeof(COMPAT_UTS_MACHINE))) +#else +#define override_architecture(name) 0 +#endif + +/* + * Work around broken programs that cannot handle "Linux 3.0". + * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + */ +static int override_release(char __user *release, int len) +{ + int ret = 0; + char buf[65]; + + if (current->personality & UNAME26) { + char *rest = UTS_RELEASE; + int ndots = 0; + unsigned v; + + while (*rest) { + if (*rest == '.' && ++ndots >= 3) + break; + if (!isdigit(*rest) && *rest != '.') + break; + rest++; + } + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + snprintf(buf, len, "2.6.%u%s", v, rest); + ret = copy_to_user(release, buf, len); + } + return ret; +} + +SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) +{ + int errno = 0; + + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof *name)) + errno = -EFAULT; + up_read(&uts_sem); + + if (!errno && override_release(name->release, sizeof(name->release))) + errno = -EFAULT; + if (!errno && override_architecture(name)) + errno = -EFAULT; + return errno; +} + +#ifdef __ARCH_WANT_SYS_OLD_UNAME +/* + * Old cruft + */ +SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) +{ + int error = 0; + + if (!name) + return -EFAULT; + + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof(*name))) + error = -EFAULT; + up_read(&uts_sem); + + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; + if 
(!error && override_architecture(name)) + error = -EFAULT; + return error; +} + +SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) +{ + int error; + + if (!name) + return -EFAULT; + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) + return -EFAULT; + + down_read(&uts_sem); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + if (!error && override_release(name->release, sizeof(name->release))) + error = -EFAULT; + return error ? 
-EFAULT : 0; +} +#endif + +SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) +{ + int errno; + char tmp[__NEW_UTS_LEN]; + + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { + struct new_utsname *u = utsname(); + + memcpy(u->nodename, tmp, len); + memset(u->nodename + len, 0, sizeof(u->nodename) - len); + errno = 0; + } + up_write(&uts_sem); + return errno; +} + +#ifdef __ARCH_WANT_SYS_GETHOSTNAME + +SYSCALL_DEFINE2(gethostname, char __user *, name, int, len) +{ + int i, errno; + struct new_utsname *u; + + if (len < 0) + return -EINVAL; + down_read(&uts_sem); + u = utsname(); + i = 1 + strlen(u->nodename); + if (i > len) + i = len; + errno = 0; + if (copy_to_user(name, u->nodename, i)) + errno = -EFAULT; + up_read(&uts_sem); + return errno; +} + +#endif + +/* + * Only setdomainname; getdomainname can be implemented by calling + * uname() + */ +SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len) +{ + int errno; + char tmp[__NEW_UTS_LEN]; + + if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + if (len < 0 || len > __NEW_UTS_LEN) + return -EINVAL; + + down_write(&uts_sem); + errno = -EFAULT; + if (!copy_from_user(tmp, name, len)) { + struct new_utsname *u = utsname(); + + memcpy(u->domainname, tmp, len); + memset(u->domainname + len, 0, sizeof(u->domainname) - len); + errno = 0; + } + up_write(&uts_sem); + return errno; +} + +SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit value; + int ret; + + ret = do_prlimit(current, resource, NULL, &value); + if (!ret) + ret = copy_to_user(rlim, &value, sizeof(*rlim)) ? -EFAULT : 0; + + return ret; +} + +#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT + +/* + * Back compatibility for getrlimit. Needed for some apps. 
+ */ + +SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, + struct rlimit __user *, rlim) +{ + struct rlimit x; + if (resource >= RLIM_NLIMITS) + return -EINVAL; + + task_lock(current->group_leader); + x = current->signal->rlim[resource]; + task_unlock(current->group_leader); + if (x.rlim_cur > 0x7FFFFFFF) + x.rlim_cur = 0x7FFFFFFF; + if (x.rlim_max > 0x7FFFFFFF) + x.rlim_max = 0x7FFFFFFF; + return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; +} + +#endif + +static inline bool rlim64_is_infinity(__u64 rlim64) +{ +#if BITS_PER_LONG < 64 + return rlim64 >= ULONG_MAX; +#else + return rlim64 == RLIM64_INFINITY; +#endif +} + +static void rlim_to_rlim64(const struct rlimit *rlim, struct rlimit64 *rlim64) +{ + if (rlim->rlim_cur == RLIM_INFINITY) + rlim64->rlim_cur = RLIM64_INFINITY; + else + rlim64->rlim_cur = rlim->rlim_cur; + if (rlim->rlim_max == RLIM_INFINITY) + rlim64->rlim_max = RLIM64_INFINITY; + else + rlim64->rlim_max = rlim->rlim_max; +} + +static void rlim64_to_rlim(const struct rlimit64 *rlim64, struct rlimit *rlim) +{ + if (rlim64_is_infinity(rlim64->rlim_cur)) + rlim->rlim_cur = RLIM_INFINITY; + else + rlim->rlim_cur = (unsigned long)rlim64->rlim_cur; + if (rlim64_is_infinity(rlim64->rlim_max)) + rlim->rlim_max = RLIM_INFINITY; + else + rlim->rlim_max = (unsigned long)rlim64->rlim_max; +} + +/* make sure you are allowed to change @tsk limits before calling this */ +int do_prlimit(struct task_struct *tsk, unsigned int resource, + struct rlimit *new_rlim, struct rlimit *old_rlim) +{ + struct rlimit *rlim; + int retval = 0; + + if (resource >= RLIM_NLIMITS) + return -EINVAL; + if (new_rlim) { + if (new_rlim->rlim_cur > new_rlim->rlim_max) + return -EINVAL; + if (resource == RLIMIT_NOFILE && + new_rlim->rlim_max > sysctl_nr_open) + return -EPERM; + } + + /* protect tsk->signal and tsk->sighand from disappearing */ + read_lock(&tasklist_lock); + if (!tsk->sighand) { + retval = -ESRCH; + goto out; + } + + rlim = tsk->signal->rlim + resource; + 
task_lock(tsk->group_leader); + if (new_rlim) { + /* Keep the capable check against init_user_ns until + cgroups can contain all limits */ + if (new_rlim->rlim_max > rlim->rlim_max && + !capable(CAP_SYS_RESOURCE)) + retval = -EPERM; + if (!retval) + retval = security_task_setrlimit(tsk->group_leader, + resource, new_rlim); + if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) { + /* + * The caller is asking for an immediate RLIMIT_CPU + * expiry. But we use the zero value to mean "it was + * never set". So let's cheat and make it one second + * instead + */ + new_rlim->rlim_cur = 1; + } + } + if (!retval) { + if (old_rlim) + *old_rlim = *rlim; + if (new_rlim) + *rlim = *new_rlim; + } + task_unlock(tsk->group_leader); + + /* + * RLIMIT_CPU handling. Note that the kernel fails to return an error + * code if it rejected the user's attempt to set RLIMIT_CPU. This is a + * very long-standing error, and fixing it now risks breakage of + * applications, so we live with it + */ + if (!retval && new_rlim && resource == RLIMIT_CPU && + new_rlim->rlim_cur != RLIM_INFINITY) + update_rlimit_cpu(tsk, new_rlim->rlim_cur); +out: + read_unlock(&tasklist_lock); + return retval; +} + +/* rcu lock must be held */ +static int check_prlimit_permission(struct task_struct *task) +{ + const struct cred *cred = current_cred(), *tcred; + + if (current == task) + return 0; + + tcred = __task_cred(task); + if (cred->user->user_ns == tcred->user->user_ns && + (cred->uid == tcred->euid && + cred->uid == tcred->suid && + cred->uid == tcred->uid && + cred->gid == tcred->egid && + cred->gid == tcred->sgid && + cred->gid == tcred->gid)) + return 0; + if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE)) + return 0; + + return -EPERM; +} + +SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, + const struct rlimit64 __user *, new_rlim, + struct rlimit64 __user *, old_rlim) +{ + struct rlimit64 old64, new64; + struct rlimit old, new; + struct task_struct *tsk; + int ret; + + if 
(new_rlim) { + if (copy_from_user(&new64, new_rlim, sizeof(new64))) + return -EFAULT; + rlim64_to_rlim(&new64, &new); + } + + rcu_read_lock(); + tsk = pid ? find_task_by_vpid(pid) : current; + if (!tsk) { + rcu_read_unlock(); + return -ESRCH; + } + ret = check_prlimit_permission(tsk); + if (ret) { + rcu_read_unlock(); + return ret; + } + get_task_struct(tsk); + rcu_read_unlock(); + + ret = do_prlimit(tsk, resource, new_rlim ? &new : NULL, + old_rlim ? &old : NULL); + + if (!ret && old_rlim) { + rlim_to_rlim64(&old, &old64); + if (copy_to_user(old_rlim, &old64, sizeof(old64))) + ret = -EFAULT; + } + + put_task_struct(tsk); + return ret; +} + +SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim) +{ + struct rlimit new_rlim; + + if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) + return -EFAULT; + return do_prlimit(current, resource, &new_rlim, NULL); +} + +/* + * It would make sense to put struct rusage in the task_struct, + * except that would make the task_struct be *really big*. After + * task_struct gets moved into malloc'ed memory, it would + * make sense to do this. It will make moving the rest of the information + * a lot simpler! (Which we're not doing right now because we're not + * measuring them yet). + * + * When sampling multiple threads for RUSAGE_SELF, under SMP we might have + * races with threads incrementing their own counters. But since word + * reads are atomic, we either get new values or old values and we don't + * care which for the sums. We always take the siglock to protect reading + * the c* fields from p->signal from races with exit.c updating those + * fields when reaping, so a sample either gets all the additions of a + * given child after it's reaped, or none so this sample is before reaping. + * + * Locking: + * We need to take the siglock for CHILDREN, SELF and BOTH + * for the cases current multithreaded, non-current single threaded, + * and non-current multithreaded.
Thread traversal is now safe with + * the siglock held. + * Strictly speaking, we do not need to take the siglock if we are current and + * single threaded, as no one else can take our signal_struct away, no one + * else can reap the children to update signal->c* counters, and no one else + * can race with the signal-> fields. If we do not take any lock, the + * signal-> fields could be read out of order while another thread was just + * exiting. So we should place a read memory barrier when we avoid the lock. + * On the writer side, write memory barrier is implied in __exit_signal + * as __exit_signal releases the siglock spinlock after updating the signal-> + * fields. But we don't do this yet to keep things simple. + * + */ + +static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r) +{ + r->ru_nvcsw += t->nvcsw; + r->ru_nivcsw += t->nivcsw; + r->ru_minflt += t->min_flt; + r->ru_majflt += t->maj_flt; + r->ru_inblock += task_io_get_inblock(t); + r->ru_oublock += task_io_get_oublock(t); +} + +static void k_getrusage(struct task_struct *p, int who, struct rusage *r) +{ + struct task_struct *t; + unsigned long flags; + cputime_t tgutime, tgstime, utime, stime; + unsigned long maxrss = 0; + + memset((char *) r, 0, sizeof *r); + utime = stime = cputime_zero; + + if (who == RUSAGE_THREAD) { + task_times(current, &utime, &stime); + accumulate_thread_rusage(p, r); + maxrss = p->signal->maxrss; + goto out; + } + + if (!lock_task_sighand(p, &flags)) + return; + + switch (who) { + case RUSAGE_BOTH: + case RUSAGE_CHILDREN: + utime = p->signal->cutime; + stime = p->signal->cstime; + r->ru_nvcsw = p->signal->cnvcsw; + r->ru_nivcsw = p->signal->cnivcsw; + r->ru_minflt = p->signal->cmin_flt; + r->ru_majflt = p->signal->cmaj_flt; + r->ru_inblock = p->signal->cinblock; + r->ru_oublock = p->signal->coublock; + maxrss = p->signal->cmaxrss; + + if (who == RUSAGE_CHILDREN) + break; + + case RUSAGE_SELF: + thread_group_times(p, &tgutime, &tgstime); + utime =
cputime_add(utime, tgutime); + stime = cputime_add(stime, tgstime); + r->ru_nvcsw += p->signal->nvcsw; + r->ru_nivcsw += p->signal->nivcsw; + r->ru_minflt += p->signal->min_flt; + r->ru_majflt += p->signal->maj_flt; + r->ru_inblock += p->signal->inblock; + r->ru_oublock += p->signal->oublock; + if (maxrss < p->signal->maxrss) + maxrss = p->signal->maxrss; + t = p; + do { + accumulate_thread_rusage(t, r); + t = next_thread(t); + } while (t != p); + break; + + default: + BUG(); + } + unlock_task_sighand(p, &flags); + +out: + cputime_to_timeval(utime, &r->ru_utime); + cputime_to_timeval(stime, &r->ru_stime); + + if (who != RUSAGE_CHILDREN) { + struct mm_struct *mm = get_task_mm(p); + if (mm) { + setmax_mm_hiwater_rss(&maxrss, mm); + mmput(mm); + } + } + r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */ +} + +int getrusage(struct task_struct *p, int who, struct rusage __user *ru) +{ + struct rusage r; + k_getrusage(p, who, &r); + return copy_to_user(ru, &r, sizeof(r)) ? 
-EFAULT : 0; +} + +SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru) +{ + if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN && + who != RUSAGE_THREAD) + return -EINVAL; + return getrusage(current, who, ru); +} + +SYSCALL_DEFINE1(umask, int, mask) +{ + mask = xchg(&current->fs->umask, mask & S_IRWXUGO); + return mask; +} + +SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + unsigned long, arg4, unsigned long, arg5) +{ + struct task_struct *me = current; + unsigned char comm[sizeof(me->comm)]; + long error; + + error = security_task_prctl(option, arg2, arg3, arg4, arg5); + if (error != -ENOSYS) + return error; + + error = 0; + switch (option) { + case PR_SET_PDEATHSIG: + if (!valid_signal(arg2)) { + error = -EINVAL; + break; + } + me->pdeath_signal = arg2; + error = 0; + break; + case PR_GET_PDEATHSIG: + error = put_user(me->pdeath_signal, (int __user *)arg2); + break; + case PR_GET_DUMPABLE: + error = get_dumpable(me->mm); + break; + case PR_SET_DUMPABLE: + if (arg2 < 0 || arg2 > 1) { + error = -EINVAL; + break; + } + set_dumpable(me->mm, arg2); + error = 0; + break; + + case PR_SET_UNALIGN: + error = SET_UNALIGN_CTL(me, arg2); + break; + case PR_GET_UNALIGN: + error = GET_UNALIGN_CTL(me, arg2); + break; + case PR_SET_FPEMU: + error = SET_FPEMU_CTL(me, arg2); + break; + case PR_GET_FPEMU: + error = GET_FPEMU_CTL(me, arg2); + break; + case PR_SET_FPEXC: + error = SET_FPEXC_CTL(me, arg2); + break; + case PR_GET_FPEXC: + error = GET_FPEXC_CTL(me, arg2); + break; + case PR_GET_TIMING: + error = PR_TIMING_STATISTICAL; + break; + case PR_SET_TIMING: + if (arg2 != PR_TIMING_STATISTICAL) + error = -EINVAL; + else + error = 0; + break; + + case PR_SET_NAME: + comm[sizeof(me->comm)-1] = 0; + if (strncpy_from_user(comm, (char __user *)arg2, + sizeof(me->comm) - 1) < 0) + return -EFAULT; + set_task_comm(me, comm); + return 0; + case PR_GET_NAME: + get_task_comm(comm, me); + if (copy_to_user((char __user *)arg2, comm, + sizeof(comm))) +
return -EFAULT; + return 0; + case PR_GET_ENDIAN: + error = GET_ENDIAN(me, arg2); + break; + case PR_SET_ENDIAN: + error = SET_ENDIAN(me, arg2); + break; + + case PR_GET_SECCOMP: + error = prctl_get_seccomp(); + break; + case PR_SET_SECCOMP: + error = prctl_set_seccomp(arg2); + break; + case PR_GET_TSC: + error = GET_TSC_CTL(arg2); + break; + case PR_SET_TSC: + error = SET_TSC_CTL(arg2); + break; + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); + break; + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); + break; + case PR_GET_TIMERSLACK: + error = current->timer_slack_ns; + break; + case PR_SET_TIMERSLACK: + if (arg2 <= 0) + current->timer_slack_ns = + current->default_timer_slack_ns; + else + current->timer_slack_ns = arg2; + error = 0; + break; + case PR_MCE_KILL: + if (arg4 | arg5) + return -EINVAL; + switch (arg2) { + case PR_MCE_KILL_CLEAR: + if (arg3 != 0) + return -EINVAL; + current->flags &= ~PF_MCE_PROCESS; + break; + case PR_MCE_KILL_SET: + current->flags |= PF_MCE_PROCESS; + if (arg3 == PR_MCE_KILL_EARLY) + current->flags |= PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_LATE) + current->flags &= ~PF_MCE_EARLY; + else if (arg3 == PR_MCE_KILL_DEFAULT) + current->flags &= + ~(PF_MCE_EARLY|PF_MCE_PROCESS); + else + return -EINVAL; + break; + default: + return -EINVAL; + } + error = 0; + break; + case PR_MCE_KILL_GET: + if (arg2 | arg3 | arg4 | arg5) + return -EINVAL; + if (current->flags & PF_MCE_PROCESS) + error = (current->flags & PF_MCE_EARLY) ? + PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; + else + error = PR_MCE_KILL_DEFAULT; + break; + default: + error = -EINVAL; + break; + } + return error; +} + +SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) +{ + int err = 0; + int cpu = raw_smp_processor_id(); + if (cpup) + err |= put_user(cpu, cpup); + if (nodep) + err |= put_user(cpu_to_node(cpu), nodep); + return err ? 
-EFAULT : 0; +} + +char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; + +static void argv_cleanup(struct subprocess_info *info) +{ + argv_free(info->argv); +} + +/** + * orderly_poweroff - Trigger an orderly system poweroff + * @force: force poweroff if command execution fails + * + * This may be called from any context to trigger a system shutdown. + * If the orderly shutdown fails, it will force an immediate shutdown. + */ +int orderly_poweroff(bool force) +{ + int argc; + char **argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); + static char *envp[] = { + "HOME=/", + "PATH=/sbin:/bin:/usr/sbin:/usr/bin", + NULL + }; + int ret = -ENOMEM; + struct subprocess_info *info; + + if (argv == NULL) { + printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", + __func__, poweroff_cmd); + goto out; + } + + info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC); + if (info == NULL) { + argv_free(argv); + goto out; + } + + call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); + + ret = call_usermodehelper_exec(info, UMH_NO_WAIT); + + out: + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + + /* I guess this should try to kick off some daemon to + sync and poweroff asap. Or not even bother syncing + if we're doing an emergency shutdown? */ + emergency_sync(); + kernel_power_off(); + } + + return ret; +} +EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c new file mode 100644 index 00000000..62cbc887 --- /dev/null +++ b/kernel/sys_ni.c @@ -0,0 +1,202 @@ + +#include <linux/linkage.h> +#include <linux/errno.h> + +#include <asm/unistd.h> + +/* we can't #include <linux/syscalls.h> here, + but tell gcc to not warn with -Wmissing-prototypes */ +asmlinkage long sys_ni_syscall(void); + +/* + * Non-implemented system calls get redirected here.
+ */ +asmlinkage long sys_ni_syscall(void) +{ + return -ENOSYS; +} + +cond_syscall(sys_nfsservctl); +cond_syscall(sys_quotactl); +cond_syscall(sys32_quotactl); +cond_syscall(sys_acct); +cond_syscall(sys_lookup_dcookie); +cond_syscall(sys_swapon); +cond_syscall(sys_swapoff); +cond_syscall(sys_kexec_load); +cond_syscall(compat_sys_kexec_load); +cond_syscall(sys_init_module); +cond_syscall(sys_delete_module); +cond_syscall(sys_socketpair); +cond_syscall(sys_bind); +cond_syscall(sys_listen); +cond_syscall(sys_accept); +cond_syscall(sys_accept4); +cond_syscall(sys_connect); +cond_syscall(sys_getsockname); +cond_syscall(sys_getpeername); +cond_syscall(sys_sendto); +cond_syscall(sys_send); +cond_syscall(sys_recvfrom); +cond_syscall(sys_recv); +cond_syscall(sys_socket); +cond_syscall(sys_setsockopt); +cond_syscall(compat_sys_setsockopt); +cond_syscall(sys_getsockopt); +cond_syscall(compat_sys_getsockopt); +cond_syscall(sys_shutdown); +cond_syscall(sys_sendmsg); +cond_syscall(sys_sendmmsg); +cond_syscall(compat_sys_sendmsg); +cond_syscall(compat_sys_sendmmsg); +cond_syscall(sys_recvmsg); +cond_syscall(sys_recvmmsg); +cond_syscall(compat_sys_recvmsg); +cond_syscall(compat_sys_recv); +cond_syscall(compat_sys_recvfrom); +cond_syscall(compat_sys_recvmmsg); +cond_syscall(sys_socketcall); +cond_syscall(sys_futex); +cond_syscall(compat_sys_futex); +cond_syscall(sys_set_robust_list); +cond_syscall(compat_sys_set_robust_list); +cond_syscall(sys_get_robust_list); +cond_syscall(compat_sys_get_robust_list); +cond_syscall(sys_epoll_create); +cond_syscall(sys_epoll_create1); +cond_syscall(sys_epoll_ctl); +cond_syscall(sys_epoll_wait); +cond_syscall(sys_epoll_pwait); +cond_syscall(compat_sys_epoll_pwait); +cond_syscall(sys_semget); +cond_syscall(sys_semop); +cond_syscall(sys_semtimedop); +cond_syscall(compat_sys_semtimedop); +cond_syscall(sys_semctl); +cond_syscall(compat_sys_semctl); +cond_syscall(sys_msgget); +cond_syscall(sys_msgsnd); +cond_syscall(compat_sys_msgsnd); 
+cond_syscall(sys_msgrcv); +cond_syscall(compat_sys_msgrcv); +cond_syscall(sys_msgctl); +cond_syscall(compat_sys_msgctl); +cond_syscall(sys_shmget); +cond_syscall(sys_shmat); +cond_syscall(compat_sys_shmat); +cond_syscall(sys_shmdt); +cond_syscall(sys_shmctl); +cond_syscall(compat_sys_shmctl); +cond_syscall(sys_mq_open); +cond_syscall(sys_mq_unlink); +cond_syscall(sys_mq_timedsend); +cond_syscall(sys_mq_timedreceive); +cond_syscall(sys_mq_notify); +cond_syscall(sys_mq_getsetattr); +cond_syscall(compat_sys_mq_open); +cond_syscall(compat_sys_mq_timedsend); +cond_syscall(compat_sys_mq_timedreceive); +cond_syscall(compat_sys_mq_notify); +cond_syscall(compat_sys_mq_getsetattr); +cond_syscall(sys_mbind); +cond_syscall(sys_get_mempolicy); +cond_syscall(sys_set_mempolicy); +cond_syscall(compat_sys_mbind); +cond_syscall(compat_sys_get_mempolicy); +cond_syscall(compat_sys_set_mempolicy); +cond_syscall(sys_add_key); +cond_syscall(sys_request_key); +cond_syscall(sys_keyctl); +cond_syscall(compat_sys_keyctl); +cond_syscall(compat_sys_socketcall); +cond_syscall(sys_inotify_init); +cond_syscall(sys_inotify_init1); +cond_syscall(sys_inotify_add_watch); +cond_syscall(sys_inotify_rm_watch); +cond_syscall(sys_migrate_pages); +cond_syscall(sys_move_pages); +cond_syscall(sys_chown16); +cond_syscall(sys_fchown16); +cond_syscall(sys_getegid16); +cond_syscall(sys_geteuid16); +cond_syscall(sys_getgid16); +cond_syscall(sys_getgroups16); +cond_syscall(sys_getresgid16); +cond_syscall(sys_getresuid16); +cond_syscall(sys_getuid16); +cond_syscall(sys_lchown16); +cond_syscall(sys_setfsgid16); +cond_syscall(sys_setfsuid16); +cond_syscall(sys_setgid16); +cond_syscall(sys_setgroups16); +cond_syscall(sys_setregid16); +cond_syscall(sys_setresgid16); +cond_syscall(sys_setresuid16); +cond_syscall(sys_setreuid16); +cond_syscall(sys_setuid16); +cond_syscall(sys_vm86old); +cond_syscall(sys_vm86); +cond_syscall(sys_ipc); +cond_syscall(compat_sys_ipc); +cond_syscall(compat_sys_sysctl); 
+cond_syscall(sys_flock); +cond_syscall(sys_io_setup); +cond_syscall(sys_io_destroy); +cond_syscall(sys_io_submit); +cond_syscall(sys_io_cancel); +cond_syscall(sys_io_getevents); +cond_syscall(sys_syslog); + +/* arch-specific weak syscall entries */ +cond_syscall(sys_pciconfig_read); +cond_syscall(sys_pciconfig_write); +cond_syscall(sys_pciconfig_iobase); +cond_syscall(sys32_ipc); +cond_syscall(ppc_rtas); +cond_syscall(sys_spu_run); +cond_syscall(sys_spu_create); +cond_syscall(sys_subpage_prot); + +/* mmu depending weak syscall entries */ +cond_syscall(sys_mprotect); +cond_syscall(sys_msync); +cond_syscall(sys_mlock); +cond_syscall(sys_munlock); +cond_syscall(sys_mlockall); +cond_syscall(sys_munlockall); +cond_syscall(sys_mincore); +cond_syscall(sys_madvise); +cond_syscall(sys_mremap); +cond_syscall(sys_remap_file_pages); +cond_syscall(compat_sys_move_pages); +cond_syscall(compat_sys_migrate_pages); + +/* block-layer dependent */ +cond_syscall(sys_bdflush); +cond_syscall(sys_ioprio_set); +cond_syscall(sys_ioprio_get); + +/* New file descriptors */ +cond_syscall(sys_signalfd); +cond_syscall(sys_signalfd4); +cond_syscall(compat_sys_signalfd); +cond_syscall(compat_sys_signalfd4); +cond_syscall(sys_timerfd_create); +cond_syscall(sys_timerfd_settime); +cond_syscall(sys_timerfd_gettime); +cond_syscall(compat_sys_timerfd_settime); +cond_syscall(compat_sys_timerfd_gettime); +cond_syscall(sys_eventfd); +cond_syscall(sys_eventfd2); + +/* performance counters: */ +cond_syscall(sys_perf_event_open); + +/* fanotify! 
*/ +cond_syscall(sys_fanotify_init); +cond_syscall(sys_fanotify_mark); + +/* open by handle */ +cond_syscall(sys_name_to_handle_at); +cond_syscall(sys_open_by_handle_at); +cond_syscall(compat_sys_open_by_handle_at); diff --git a/kernel/sysctl.c b/kernel/sysctl.c new file mode 100644 index 00000000..5b6afb27 --- /dev/null +++ b/kernel/sysctl.c @@ -0,0 +1,3006 @@ +/* + * sysctl.c: General linux system control interface + * + * Begun 24 March 1995, Stephen Tweedie + * Added /proc support, Dec 1995 + * Added bdflush entry and intvec min/max checking, 2/23/96, Tom Dyas. + * Added hooks for /proc/sys/net (minor, minor patch), 96/4/1, Mike Shaver. + * Added kernel/java-{interpreter,appletviewer}, 96/5/10, Mike Shaver. + * Dynamic registration fixes, Stephen Tweedie. + * Added kswapd-interval, ctrl-alt-del, printk stuff, 1/8/97, Chris Horn. + * Made sysctl support optional via CONFIG_SYSCTL, 1/10/97, Chris + * Horn. + * Added proc_doulongvec_ms_jiffies_minmax, 09/08/99, Carlos H. Bauer. + * Added proc_doulongvec_minmax, 09/08/99, Carlos H. Bauer. + * Changed linked lists to use list.h instead of lists.h, 02/24/00, Bill + * Wendling. + * The list_for_each() macro wasn't appropriate for the sysctl loop. 
+ * Removed it and replaced it with older style, 03/23/00, Bill Wendling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef CONFIG_X86 +#include +#include +#include +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT +#include +#endif +#ifdef CONFIG_RT_MUTEXES +#include +#endif +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) +#include +#endif +#ifdef CONFIG_CHR_DEV_SG +#include +#endif + +#ifdef CONFIG_LOCKUP_DETECTOR +#include +#endif + + +#if defined(CONFIG_SYSCTL) + +/* External variables not in a header file. */ +extern int sysctl_overcommit_memory; +extern int sysctl_overcommit_ratio; +extern int max_threads; +extern int core_uses_pid; +extern int suid_dumpable; +extern char core_pattern[]; +extern unsigned int core_pipe_limit; +extern int pid_max; +extern int min_free_kbytes; +extern int min_free_order_shift; +extern int pid_max_min, pid_max_max; +extern int sysctl_drop_caches; +extern int percpu_pagelist_fraction; +extern int compat_log; +extern int latencytop_enabled; +extern int sysctl_nr_open_min, sysctl_nr_open_max; +#ifndef CONFIG_MMU +extern int sysctl_nr_trim_pages; +#endif +#ifdef CONFIG_BLOCK +extern int blk_iopoll_enabled; +#endif + +/* Constants used for minimum and maximum */ +#ifdef CONFIG_LOCKUP_DETECTOR +static int sixty = 60; +static int neg_one = -1; +#endif + +static int zero; +static int __maybe_unused one = 1; +static int __maybe_unused two = 2; +static int __maybe_unused three = 3; +static unsigned long one_ul = 1; +static int one_hundred = 100; +#ifdef CONFIG_PRINTK +static int ten_thousand = 10000; +#endif + +/* this is needed for the proc_doulongvec_minmax of 
vm_dirty_bytes */ +static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; + +/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +static int maxolduid = 65535; +static int minolduid; +static int min_percpu_pagelist_fract = 8; + +static int ngroups_max = NGROUPS_MAX; + +#ifdef CONFIG_INOTIFY_USER +#include +#endif +#ifdef CONFIG_SPARC +#include +#endif + +#ifdef CONFIG_SPARC64 +extern int sysctl_tsb_ratio; +#endif + +#ifdef __hppa__ +extern int pwrsw_enabled; +extern int unaligned_enabled; +#endif + +#ifdef CONFIG_S390 +#ifdef CONFIG_MATHEMU +extern int sysctl_ieee_emulation_warnings; +#endif +extern int sysctl_userprocess_debug; +extern int spin_retry; +#endif + +#ifdef CONFIG_IA64 +extern int no_unaligned_warning; +extern int unaligned_dump_stack; +#endif + +#ifdef CONFIG_PROC_SYSCTL +static int proc_do_cad_pid(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_taint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + +#ifdef CONFIG_PRINTK +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif + +#ifdef CONFIG_MAGIC_SYSRQ +/* Note: sysrq code uses its own private copy */ +static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE; + +static int sysrq_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int error; + + error = proc_dointvec(table, write, buffer, lenp, ppos); + if (error) + return error; + + if (write) + sysrq_toggle_support(__sysrq_enabled); + + return 0; +} + +#endif + +static struct ctl_table root_table[]; +static struct ctl_table_root sysctl_table_root; +static struct ctl_table_header root_table_header = { + {{.count = 1, + .ctl_table = root_table, + .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),}}, + .root = &sysctl_table_root, + .set = &sysctl_table_root.default_set, +}; +static
struct ctl_table_root sysctl_table_root = { + .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), + .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry), +}; + +static struct ctl_table kern_table[]; +static struct ctl_table vm_table[]; +static struct ctl_table fs_table[]; +static struct ctl_table debug_table[]; +static struct ctl_table dev_table[]; +extern struct ctl_table random_table[]; +#ifdef CONFIG_EPOLL +extern struct ctl_table epoll_table[]; +#endif + +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT +int sysctl_legacy_va_layout; +#endif + +/* The default sysctl tables: */ + +static struct ctl_table root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = kern_table, + }, + { + .procname = "vm", + .mode = 0555, + .child = vm_table, + }, + { + .procname = "fs", + .mode = 0555, + .child = fs_table, + }, + { + .procname = "debug", + .mode = 0555, + .child = debug_table, + }, + { + .procname = "dev", + .mode = 0555, + .child = dev_table, + }, + { } +}; + +#ifdef CONFIG_SCHED_DEBUG +static int min_sched_granularity_ns = 100000; /* 100 usecs */ +static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_wakeup_granularity_ns; /* 0 usecs */ +static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ +static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; +#endif + +#ifdef CONFIG_COMPACTION +static int min_extfrag_threshold; +static int max_extfrag_threshold = 1000; +#endif + +static struct ctl_table kern_table[] = { + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SCHED_DEBUG + { + .procname = "sched_min_granularity_ns", + .data = &sysctl_sched_min_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + 
.extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_latency_ns", + .data = &sysctl_sched_latency, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_granularity_ns, + .extra2 = &max_sched_granularity_ns, + }, + { + .procname = "sched_wakeup_granularity_ns", + .data = &sysctl_sched_wakeup_granularity, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_wakeup_granularity_ns, + .extra2 = &max_wakeup_granularity_ns, + }, + { + .procname = "sched_tunable_scaling", + .data = &sysctl_sched_tunable_scaling, + .maxlen = sizeof(enum sched_tunable_scaling), + .mode = 0644, + .proc_handler = sched_proc_update_handler, + .extra1 = &min_sched_tunable_scaling, + .extra2 = &max_sched_tunable_scaling, + }, + { + .procname = "sched_migration_cost", + .data = &sysctl_sched_migration_cost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_nr_migrate", + .data = &sysctl_sched_nr_migrate, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_time_avg", + .data = &sysctl_sched_time_avg, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_shares_window", + .data = &sysctl_sched_shares_window, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + { + .procname = "sched_rt_period_us", + .data = &sysctl_sched_rt_period, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_rt_handler, + }, + { + .procname = "sched_rt_runtime_us", + .data = &sysctl_sched_rt_runtime, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = sched_rt_handler, + }, +#ifdef CONFIG_SCHED_AUTOGROUP + { + .procname = "sched_autogroup_enabled", + .data = &sysctl_sched_autogroup_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif +#ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", + .data = &prove_locking, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_LOCK_STAT + { + .procname = "lock_stat", + .data = &lock_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "panic", + .data = &panic_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_uses_pid", + .data = &core_uses_pid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "core_pattern", + .data = core_pattern, + .maxlen = CORENAME_MAX_SIZE, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "core_pipe_limit", + .data = &core_pipe_limit, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = proc_taint, + }, +#endif +#ifdef CONFIG_LATENCYTOP + { + .procname = "latencytop", + .data = &latencytop_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BLK_DEV_INITRD + { + .procname = "real-root-dev", + .data = &real_root_dev, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "print-fatal-signals", + .data = &print_fatal_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_SPARC + { + .procname = "reboot-cmd", + .data = reboot_command, + .maxlen = 256, + .mode = 0644, + .proc_handler = proc_dostring, 
+ }, + { + .procname = "stop-a", + .data = &stop_a_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "scons-poweroff", + .data = &scons_pwroff, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_SPARC64 + { + .procname = "tsb-ratio", + .data = &sysctl_tsb_ratio, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef __hppa__ + { + .procname = "soft-power", + .data = &pwrsw_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "unaligned-trap", + .data = &unaligned_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "ctrl-alt-del", + .data = &C_A_D, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#ifdef CONFIG_FUNCTION_TRACER + { + .procname = "ftrace_enabled", + .data = &ftrace_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = ftrace_enable_sysctl, + }, +#endif +#ifdef CONFIG_STACK_TRACER + { + .procname = "stack_tracer_enabled", + .data = &stack_tracer_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = stack_trace_sysctl, + }, +#endif +#ifdef CONFIG_TRACING + { + .procname = "ftrace_dump_on_oops", + .data = &ftrace_dump_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MODULES + { + .procname = "modprobe", + .data = &modprobe_path, + .maxlen = KMOD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, + { + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, +#endif +#ifdef CONFIG_HOTPLUG + { + .procname = "hotplug", + .data = &uevent_helper, + .maxlen = UEVENT_HELPER_PATH_LEN, + 
.mode = 0644, + .proc_handler = proc_dostring, + }, +#endif +#ifdef CONFIG_CHR_DEV_SG + { + .procname = "sg-big-buff", + .data = &sg_big_buff, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BSD_PROCESS_ACCT + { + .procname = "acct", + .data = &acct_parm, + .maxlen = 3*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MAGIC_SYSRQ + { + .procname = "sysrq", + .data = &__sysrq_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = sysrq_sysctl_handler, + }, +#endif +#ifdef CONFIG_PROC_SYSCTL + { + .procname = "cad_pid", + .data = NULL, + .maxlen = sizeof (int), + .mode = 0600, + .proc_handler = proc_do_cad_pid, + }, +#endif + { + .procname = "threads-max", + .data = &max_threads, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "random", + .mode = 0555, + .child = random_table, + }, + { + .procname = "usermodehelper", + .mode = 0555, + .child = usermodehelper_table, + }, + { + .procname = "overflowuid", + .data = &overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_S390 +#ifdef CONFIG_MATHEMU + { + .procname = "ieee_emulation_warnings", + .data = &sysctl_ieee_emulation_warnings, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "userprocess_debug", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "pid_max", + .data = &pid_max, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &pid_max_min, + .extra2 = &pid_max_max, + }, + { + 
.procname = "panic_on_oops", + .data = &panic_on_oops, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#if defined CONFIG_PRINTK + { + .procname = "printk", + .data = &console_loglevel, + .maxlen = 4*sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_ratelimit", + .data = &printk_ratelimit_state.interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "printk_ratelimit_burst", + .data = &printk_ratelimit_state.burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "printk_delay", + .data = &printk_delay_msec, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &ten_thousand, + }, + { + .procname = "dmesg_restrict", + .data = &dmesg_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "kptr_restrict", + .data = &kptr_restrict, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax_sysadmin, + .extra1 = &zero, + .extra2 = &two, + }, +#endif + { + .procname = "ngroups_max", + .data = &ngroups_max, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, +#if defined(CONFIG_LOCKUP_DETECTOR) + { + .procname = "watchdog", + .data = &watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "watchdog_thresh", + .data = &watchdog_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dowatchdog, + .extra1 = &neg_one, + .extra2 = &sixty, + }, + { + .procname = "softlockup_panic", + .data = &softlockup_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "nmi_watchdog", + .data = 
&watchdog_enabled, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dowatchdog, + .extra1 = &zero, + .extra2 = &one, + }, +#endif +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) + { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_X86) + { + .procname = "panic_on_unrecovered_nmi", + .data = &panic_on_unrecovered_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "panic_on_io_nmi", + .data = &panic_on_io_nmi, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_type", + .data = &bootloader_type, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = proc_dointvec, + }, + { + .procname = "kstack_depth_to_print", + .data = &kstack_depth_to_print, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "io_delay_type", + .data = &io_delay_type, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_MMU) + { + .procname = "randomize_va_space", + .data = &randomize_va_space, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#if defined(CONFIG_ACPI_SLEEP) && defined(CONFIG_X86) + { + .procname = "acpi_video_flags", + .data = &acpi_realmode_flags, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_IA64 + { + .procname = "ignore-unaligned-usertrap", + .data = &no_unaligned_warning, 
+ .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "unaligned-dump-stack", + .data = &unaligned_dump_stack, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DETECT_HUNG_TASK + { + .procname = "hung_task_panic", + .data = &sysctl_hung_task_panic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "hung_task_check_count", + .data = &sysctl_hung_task_check_count, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "hung_task_timeout_secs", + .data = &sysctl_hung_task_timeout_secs, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_dohung_task_timeout_secs, + }, + { + .procname = "hung_task_warnings", + .data = &sysctl_hung_task_warnings, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif +#ifdef CONFIG_COMPAT + { + .procname = "compat-log", + .data = &compat_log, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_RT_MUTEXES + { + .procname = "max_lock_depth", + .data = &max_lock_depth, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { + .procname = "poweroff_cmd", + .data = &poweroff_cmd, + .maxlen = POWEROFF_CMD_PATH_LEN, + .mode = 0644, + .proc_handler = proc_dostring, + }, +#ifdef CONFIG_KEYS + { + .procname = "keys", + .mode = 0555, + .child = key_sysctls, + }, +#endif +#ifdef CONFIG_RCU_TORTURE_TEST + { + .procname = "rcutorture_runnable", + .data = &rcutorture_runnable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! 
+ */ + { + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), + .mode = 0644, + .proc_handler = perf_proc_update_handler, + }, +#endif +#ifdef CONFIG_KMEMCHECK + { + .procname = "kmemcheck", + .data = &kmemcheck_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_BLOCK + { + .procname = "blk_iopoll", + .data = &blk_iopoll_enabled, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif + { } +}; + +static struct ctl_table vm_table[] = { + { + .procname = "overcommit_memory", + .data = &sysctl_overcommit_memory, + .maxlen = sizeof(sysctl_overcommit_memory), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "panic_on_oom", + .data = &sysctl_panic_on_oom, + .maxlen = sizeof(sysctl_panic_on_oom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, + { + .procname = "oom_kill_allocating_task", + .data = &sysctl_oom_kill_allocating_task, + .maxlen = sizeof(sysctl_oom_kill_allocating_task), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "oom_dump_tasks", + .data = &sysctl_oom_dump_tasks, + .maxlen = sizeof(sysctl_oom_dump_tasks), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "overcommit_ratio", + .data = &sysctl_overcommit_ratio, + .maxlen = sizeof(sysctl_overcommit_ratio), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "page-cluster", + .data = &page_cluster, + 
.maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "dirty_background_ratio", + .data = &dirty_background_ratio, + .maxlen = sizeof(dirty_background_ratio), + .mode = 0644, + .proc_handler = dirty_background_ratio_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_background_bytes", + .data = &dirty_background_bytes, + .maxlen = sizeof(dirty_background_bytes), + .mode = 0644, + .proc_handler = dirty_background_bytes_handler, + .extra1 = &one_ul, + }, + { + .procname = "dirty_ratio", + .data = &vm_dirty_ratio, + .maxlen = sizeof(vm_dirty_ratio), + .mode = 0644, + .proc_handler = dirty_ratio_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "dirty_bytes", + .data = &vm_dirty_bytes, + .maxlen = sizeof(vm_dirty_bytes), + .mode = 0644, + .proc_handler = dirty_bytes_handler, + .extra1 = &dirty_bytes_min, + }, + { + .procname = "dirty_writeback_centisecs", + .data = &dirty_writeback_interval, + .maxlen = sizeof(dirty_writeback_interval), + .mode = 0644, + .proc_handler = dirty_writeback_centisecs_handler, + }, + { + .procname = "dirty_expire_centisecs", + .data = &dirty_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "nr_pdflush_threads", + .data = &nr_pdflush_threads, + .maxlen = sizeof nr_pdflush_threads, + .mode = 0444 /* read-only*/, + .proc_handler = proc_dointvec, + }, + { + .procname = "swappiness", + .data = &vm_swappiness, + .maxlen = sizeof(vm_swappiness), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#ifdef CONFIG_HUGETLB_PAGE + { + .procname = "nr_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#ifdef CONFIG_NUMA + 
{ + .procname = "nr_hugepages_mempolicy", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &hugetlb_mempolicy_sysctl_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#endif + { + .procname = "hugetlb_shm_group", + .data = &sysctl_hugetlb_shm_group, + .maxlen = sizeof(gid_t), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "hugepages_treat_as_movable", + .data = &hugepages_treat_as_movable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = hugetlb_treat_movable_handler, + }, + { + .procname = "nr_overcommit_hugepages", + .data = NULL, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = hugetlb_overcommit_handler, + .extra1 = (void *)&hugetlb_zero, + .extra2 = (void *)&hugetlb_infinity, + }, +#endif + { + .procname = "lowmem_reserve_ratio", + .data = &sysctl_lowmem_reserve_ratio, + .maxlen = sizeof(sysctl_lowmem_reserve_ratio), + .mode = 0644, + .proc_handler = lowmem_reserve_ratio_sysctl_handler, + }, + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = &one, + .extra2 = &three, + }, +#ifdef CONFIG_COMPACTION + { + .procname = "compact_memory", + .data = &sysctl_compact_memory, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = sysctl_compaction_handler, + }, + { + .procname = "extfrag_threshold", + .data = &sysctl_extfrag_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_extfrag_handler, + .extra1 = &min_extfrag_threshold, + .extra2 = &max_extfrag_threshold, + }, + +#endif /* CONFIG_COMPACTION */ + { + .procname = "min_free_kbytes", + .data = &min_free_kbytes, + .maxlen = sizeof(min_free_kbytes), + .mode = 0644, + .proc_handler = min_free_kbytes_sysctl_handler, + .extra1 = &zero, + }, + { + .procname = "min_free_order_shift", + .data = &min_free_order_shift, + .maxlen = sizeof(min_free_order_shift), 
+ .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .procname = "percpu_pagelist_fraction", + .data = &percpu_pagelist_fraction, + .maxlen = sizeof(percpu_pagelist_fraction), + .mode = 0644, + .proc_handler = percpu_pagelist_fraction_sysctl_handler, + .extra1 = &min_percpu_pagelist_fract, + }, +#ifdef CONFIG_MMU + { + .procname = "max_map_count", + .data = &sysctl_max_map_count, + .maxlen = sizeof(sysctl_max_map_count), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, +#else + { + .procname = "nr_trim_pages", + .data = &sysctl_nr_trim_pages, + .maxlen = sizeof(sysctl_nr_trim_pages), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, +#endif + { + .procname = "laptop_mode", + .data = &laptop_mode, + .maxlen = sizeof(laptop_mode), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "block_dump", + .data = &block_dump, + .maxlen = sizeof(block_dump), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, +#ifdef HAVE_ARCH_PICK_MMAP_LAYOUT + { + .procname = "legacy_va_layout", + .data = &sysctl_legacy_va_layout, + .maxlen = sizeof(sysctl_legacy_va_layout), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "zone_reclaim_mode", + .data = &zone_reclaim_mode, + .maxlen = sizeof(zone_reclaim_mode), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, + { + .procname = "min_unmapped_ratio", + .data = &sysctl_min_unmapped_ratio, + .maxlen = sizeof(sysctl_min_unmapped_ratio), + .mode = 0644, + .proc_handler = sysctl_min_unmapped_ratio_sysctl_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, + { + .procname = "min_slab_ratio", + .data = &sysctl_min_slab_ratio, + .maxlen = 
sizeof(sysctl_min_slab_ratio), + .mode = 0644, + .proc_handler = sysctl_min_slab_ratio_sysctl_handler, + .extra1 = &zero, + .extra2 = &one_hundred, + }, +#endif +#ifdef CONFIG_SMP + { + .procname = "stat_interval", + .data = &sysctl_stat_interval, + .maxlen = sizeof(sysctl_stat_interval), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, +#endif +#ifdef CONFIG_MMU + { + .procname = "mmap_min_addr", + .data = &dac_mmap_min_addr, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = mmap_min_addr_handler, + }, +#endif +#ifdef CONFIG_NUMA + { + .procname = "numa_zonelist_order", + .data = &numa_zonelist_order, + .maxlen = NUMA_ZONELIST_ORDER_LEN, + .mode = 0644, + .proc_handler = numa_zonelist_order_handler, + }, +#endif +#if (defined(CONFIG_X86_32) && !defined(CONFIG_UML))|| \ + (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) + { + .procname = "vdso_enabled", + .data = &vdso_enabled, + .maxlen = sizeof(vdso_enabled), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = &zero, + }, +#endif +#ifdef CONFIG_HIGHMEM + { + .procname = "highmem_is_dirtyable", + .data = &vm_highmem_is_dirtyable, + .maxlen = sizeof(vm_highmem_is_dirtyable), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + { + .procname = "scan_unevictable_pages", + .data = &scan_unevictable_pages, + .maxlen = sizeof(scan_unevictable_pages), + .mode = 0644, + .proc_handler = scan_unevictable_handler, + }, +#ifdef CONFIG_MEMORY_FAILURE + { + .procname = "memory_failure_early_kill", + .data = &sysctl_memory_failure_early_kill, + .maxlen = sizeof(sysctl_memory_failure_early_kill), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "memory_failure_recovery", + .data = &sysctl_memory_failure_recovery, + .maxlen = sizeof(sysctl_memory_failure_recovery), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, 
+#endif + { } +}; + +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) +static struct ctl_table binfmt_misc_table[] = { + { } +}; +#endif + +static struct ctl_table fs_table[] = { + { + .procname = "inode-nr", + .data = &inodes_stat, + .maxlen = 2*sizeof(int), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "inode-state", + .data = &inodes_stat, + .maxlen = 7*sizeof(int), + .mode = 0444, + .proc_handler = proc_nr_inodes, + }, + { + .procname = "file-nr", + .data = &files_stat, + .maxlen = sizeof(files_stat), + .mode = 0444, + .proc_handler = proc_nr_files, + }, + { + .procname = "file-max", + .data = &files_stat.max_files, + .maxlen = sizeof(files_stat.max_files), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "nr_open", + .data = &sysctl_nr_open, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &sysctl_nr_open_min, + .extra2 = &sysctl_nr_open_max, + }, + { + .procname = "dentry-state", + .data = &dentry_stat, + .maxlen = 6*sizeof(int), + .mode = 0444, + .proc_handler = proc_nr_dentry, + }, + { + .procname = "overflowuid", + .data = &fs_overflowuid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, + { + .procname = "overflowgid", + .data = &fs_overflowgid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &minolduid, + .extra2 = &maxolduid, + }, +#ifdef CONFIG_FILE_LOCKING + { + .procname = "leases-enable", + .data = &leases_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_DNOTIFY + { + .procname = "dir-notify-enable", + .data = &dir_notify_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_MMU +#ifdef CONFIG_FILE_LOCKING + { + .procname = "lease-break-time", + .data = &lease_break_time, + .maxlen = 
sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, +#endif +#ifdef CONFIG_AIO + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, +#endif /* CONFIG_AIO */ +#ifdef CONFIG_INOTIFY_USER + { + .procname = "inotify", + .mode = 0555, + .child = inotify_table, + }, +#endif +#ifdef CONFIG_EPOLL + { + .procname = "epoll", + .mode = 0555, + .child = epoll_table, + }, +#endif +#endif + { + .procname = "suid_dumpable", + .data = &suid_dumpable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, + }, +#if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE) + { + .procname = "binfmt_misc", + .mode = 0555, + .child = binfmt_misc_table, + }, +#endif + { + .procname = "pipe-max-size", + .data = &pipe_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, + }, + { } +}; + +static struct ctl_table debug_table[] = { +#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \ + defined(CONFIG_S390) || defined(CONFIG_TILE) + { + .procname = "exception-trace", + .data = &show_unhandled_signals, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, +#endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = &zero, + .extra2 = &one, + }, +#endif + { } +}; + +static struct ctl_table dev_table[] = { + { } +}; + +static DEFINE_SPINLOCK(sysctl_lock); + +/* called under sysctl_lock */ +static int use_table(struct ctl_table_header *p) +{ + if (unlikely(p->unregistering)) + return 0; + p->used++; + return 1; 
+} + +/* called under sysctl_lock */ +static void unuse_table(struct ctl_table_header *p) +{ + if (!--p->used) + if (unlikely(p->unregistering)) + complete(p->unregistering); +} + +/* called under sysctl_lock, will reacquire if has to wait */ +static void start_unregistering(struct ctl_table_header *p) +{ + /* + * if p->used is 0, nobody will ever touch that entry again; + * we'll eliminate all paths to it before dropping sysctl_lock + */ + if (unlikely(p->used)) { + struct completion wait; + init_completion(&wait); + p->unregistering = &wait; + spin_unlock(&sysctl_lock); + wait_for_completion(&wait); + spin_lock(&sysctl_lock); + } else { + /* anything non-NULL; we'll never dereference it */ + p->unregistering = ERR_PTR(-EINVAL); + } + /* + * do not remove from the list until nobody holds it; walking the + * list in do_sysctl() relies on that. + */ + list_del_init(&p->ctl_entry); +} + +void sysctl_head_get(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + head->count++; + spin_unlock(&sysctl_lock); +} + +static void free_head(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct ctl_table_header, rcu)); +} + +void sysctl_head_put(struct ctl_table_header *head) +{ + spin_lock(&sysctl_lock); + if (!--head->count) + call_rcu(&head->rcu, free_head); + spin_unlock(&sysctl_lock); +} + +struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head) +{ + if (!head) + BUG(); + spin_lock(&sysctl_lock); + if (!use_table(head)) + head = ERR_PTR(-ENOENT); + spin_unlock(&sysctl_lock); + return head; +} + +void sysctl_head_finish(struct ctl_table_header *head) +{ + if (!head) + return; + spin_lock(&sysctl_lock); + unuse_table(head); + spin_unlock(&sysctl_lock); +} + +static struct ctl_table_set * +lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = &root->default_set; + if (root->lookup) + set = root->lookup(root, namespaces); + return set; +} + +static struct list_head * +lookup_header_list(struct 
ctl_table_root *root, struct nsproxy *namespaces) +{ + struct ctl_table_set *set = lookup_header_set(root, namespaces); + return &set->list; +} + +struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, + struct ctl_table_header *prev) +{ + struct ctl_table_root *root; + struct list_head *header_list; + struct ctl_table_header *head; + struct list_head *tmp; + + spin_lock(&sysctl_lock); + if (prev) { + head = prev; + tmp = &prev->ctl_entry; + unuse_table(prev); + goto next; + } + tmp = &root_table_header.ctl_entry; + for (;;) { + head = list_entry(tmp, struct ctl_table_header, ctl_entry); + + if (!use_table(head)) + goto next; + spin_unlock(&sysctl_lock); + return head; + next: + root = head->root; + tmp = tmp->next; + header_list = lookup_header_list(root, namespaces); + if (tmp != header_list) + continue; + + do { + root = list_entry(root->root_list.next, + struct ctl_table_root, root_list); + if (root == &sysctl_table_root) + goto out; + header_list = lookup_header_list(root, namespaces); + } while (list_empty(header_list)); + tmp = header_list->next; + } +out: + spin_unlock(&sysctl_lock); + return NULL; +} + +struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) +{ + return __sysctl_head_next(current->nsproxy, prev); +} + +void register_sysctl_root(struct ctl_table_root *root) +{ + spin_lock(&sysctl_lock); + list_add_tail(&root->root_list, &sysctl_table_root.root_list); + spin_unlock(&sysctl_lock); +} + +/* + * sysctl_perm does NOT grant the superuser all rights automatically, because + * some sysctl variables are readonly even to root. 
+ */ + +static int test_perm(int mode, int op) +{ + if (!current_euid()) + mode >>= 6; + else if (in_egroup_p(0)) + mode >>= 3; + if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0) + return 0; + return -EACCES; +} + +int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) +{ + int mode; + + if (root->permissions) + mode = root->permissions(root, current->nsproxy, table); + else + mode = table->mode; + + return test_perm(mode, op); +} + +static void sysctl_set_parent(struct ctl_table *parent, struct ctl_table *table) +{ + for (; table->procname; table++) { + table->parent = parent; + if (table->child) + sysctl_set_parent(table, table->child); + } +} + +static __init int sysctl_init(void) +{ + sysctl_set_parent(NULL, root_table); +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK + sysctl_check_table(current->nsproxy, root_table); +#endif + return 0; +} + +core_initcall(sysctl_init); + +static struct ctl_table *is_branch_in(struct ctl_table *branch, + struct ctl_table *table) +{ + struct ctl_table *p; + const char *s = branch->procname; + + /* branch should have named subdirectory as its first element */ + if (!s || !branch->child) + return NULL; + + /* ... 
and nothing else */ + if (branch[1].procname) + return NULL; + + /* table should contain subdirectory with the same name */ + for (p = table; p->procname; p++) { + if (!p->child) + continue; + if (p->procname && strcmp(p->procname, s) == 0) + return p; + } + return NULL; +} + +/* see if attaching q to p would be an improvement */ +static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q) +{ + struct ctl_table *to = p->ctl_table, *by = q->ctl_table; + struct ctl_table *next; + int is_better = 0; + int not_in_parent = !p->attached_by; + + while ((next = is_branch_in(by, to)) != NULL) { + if (by == q->attached_by) + is_better = 1; + if (to == p->attached_by) + not_in_parent = 1; + by = by->child; + to = next->child; + } + + if (is_better && not_in_parent) { + q->attached_by = by; + q->attached_to = to; + q->parent = p; + } +} + +/** + * __register_sysctl_paths - register a sysctl hierarchy + * @root: List of sysctl headers to register on + * @namespaces: Data to compute which lists of sysctl entries are visible + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * The members of the &struct ctl_table structure are used as follows: + * + * procname - the name of the sysctl file under /proc/sys. Set to %NULL to not + * enter a sysctl file + * + * data - a pointer to data for use by proc_handler + * + * maxlen - the maximum size in bytes of the data + * + * mode - the file permissions for the /proc/sys file, and for sysctl(2) + * + * child - a pointer to the child sysctl table if this entry is a directory, or + * %NULL. 
+ * + * proc_handler - the text handler routine (described below) + * + * de - for internal use by the sysctl routines + * + * extra1, extra2 - extra pointers usable by the proc handler routines + * + * Leaf nodes in the sysctl tree will be represented by a single file + * under /proc; non-leaf nodes will be represented by directories. + * + * sysctl(2) can automatically manage read and write requests through + * the sysctl table. The data and maxlen fields of the ctl_table + * struct enable minimal validation of the values being written to be + * performed, and the mode field allows minimal authentication. + * + * There must be a proc_handler routine for any terminal nodes + * mirrored under /proc/sys (non-terminals are handled by a built-in + * directory handler). Several default handlers are available to + * cover common cases - + * + * proc_dostring(), proc_dointvec(), proc_dointvec_jiffies(), + * proc_dointvec_userhz_jiffies(), proc_dointvec_minmax(), + * proc_doulongvec_ms_jiffies_minmax(), proc_doulongvec_minmax() + * + * It is the handler's job to read the input buffer from user memory + * and process it. The handler should return 0 on success. + * + * This routine returns %NULL on a failure to register, and a pointer + * to the table header on success. + */ +struct ctl_table_header *__register_sysctl_paths( + struct ctl_table_root *root, + struct nsproxy *namespaces, + const struct ctl_path *path, struct ctl_table *table) +{ + struct ctl_table_header *header; + struct ctl_table *new, **prevp; + unsigned int n, npath; + struct ctl_table_set *set; + + /* Count the path components */ + for (npath = 0; path[npath].procname; ++npath) + ; + + /* + * For each path component, allocate a 2-element ctl_table array. + * The first array element will be filled with the sysctl entry + * for this, the second will be the sentinel (procname == 0). 
+ * + * We allocate everything in one go so that we don't have to + * worry about freeing additional memory in unregister_sysctl_table. + */ + header = kzalloc(sizeof(struct ctl_table_header) + + (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); + if (!header) + return NULL; + + new = (struct ctl_table *) (header + 1); + + /* Now connect the dots */ + prevp = &header->ctl_table; + for (n = 0; n < npath; ++n, ++path) { + /* Copy the procname */ + new->procname = path->procname; + new->mode = 0555; + + *prevp = new; + prevp = &new->child; + + new += 2; + } + *prevp = table; + header->ctl_table_arg = table; + + INIT_LIST_HEAD(&header->ctl_entry); + header->used = 0; + header->unregistering = NULL; + header->root = root; + sysctl_set_parent(NULL, header->ctl_table); + header->count = 1; +#ifdef CONFIG_SYSCTL_SYSCALL_CHECK + if (sysctl_check_table(namespaces, header->ctl_table)) { + kfree(header); + return NULL; + } +#endif + spin_lock(&sysctl_lock); + header->set = lookup_header_set(root, namespaces); + header->attached_by = header->ctl_table; + header->attached_to = root_table; + header->parent = &root_table_header; + for (set = header->set; set; set = set->parent) { + struct ctl_table_header *p; + list_for_each_entry(p, &set->list, ctl_entry) { + if (p->unregistering) + continue; + try_attach(p, header); + } + } + header->parent->count++; + list_add_tail(&header->ctl_entry, &header->set->list); + spin_unlock(&sysctl_lock); + + return header; +} + +/** + * register_sysctl_table_path - register a sysctl table hierarchy + * @path: The path to the directory the sysctl table is in. + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See __register_sysctl_paths for more details. 
+ */ +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, + path, table); +} + +/** + * register_sysctl_table - register a sysctl table hierarchy + * @table: the top-level table structure + * + * Register a sysctl table hierarchy. @table should be a filled in ctl_table + * array. A completely 0 filled entry terminates the table. + * + * See register_sysctl_paths for more details. + */ +struct ctl_table_header *register_sysctl_table(struct ctl_table *table) +{ + static const struct ctl_path null_path[] = { {} }; + + return register_sysctl_paths(null_path, table); +} + +/** + * unregister_sysctl_table - unregister a sysctl table hierarchy + * @header: the header returned from register_sysctl_table + * + * Unregisters the sysctl table and all children. proc entries may not + * actually be removed until they are no longer used by anyone. + */ +void unregister_sysctl_table(struct ctl_table_header * header) +{ + might_sleep(); + + if (header == NULL) + return; + + spin_lock(&sysctl_lock); + start_unregistering(header); + if (!--header->parent->count) { + WARN_ON(1); + call_rcu(&header->parent->rcu, free_head); + } + if (!--header->count) + call_rcu(&header->rcu, free_head); + spin_unlock(&sysctl_lock); +} + +int sysctl_is_seen(struct ctl_table_header *p) +{ + struct ctl_table_set *set = p->set; + int res; + spin_lock(&sysctl_lock); + if (p->unregistering) + res = 0; + else if (!set->is_seen) + res = 1; + else + res = set->is_seen(set); + spin_unlock(&sysctl_lock); + return res; +} + +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ + INIT_LIST_HEAD(&p->list); + p->parent = parent ? 
parent : &sysctl_table_root.default_set; + p->is_seen = is_seen; +} + +#else /* !CONFIG_SYSCTL */ +struct ctl_table_header *register_sysctl_table(struct ctl_table * table) +{ + return NULL; +} + +struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, + struct ctl_table *table) +{ + return NULL; +} + +void unregister_sysctl_table(struct ctl_table_header * table) +{ +} + +void setup_sysctl_set(struct ctl_table_set *p, + struct ctl_table_set *parent, + int (*is_seen)(struct ctl_table_set *)) +{ +} + +void sysctl_head_put(struct ctl_table_header *head) +{ +} + +#endif /* CONFIG_SYSCTL */ + +/* + * /proc/sys support + */ + +#ifdef CONFIG_PROC_SYSCTL + +static int _proc_do_string(void* data, int maxlen, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + size_t len; + char __user *p; + char c; + + if (!data || !maxlen || !*lenp) { + *lenp = 0; + return 0; + } + + if (write) { + len = 0; + p = buffer; + while (len < *lenp) { + if (get_user(c, p++)) + return -EFAULT; + if (c == 0 || c == '\n') + break; + len++; + } + if (len >= maxlen) + len = maxlen-1; + if(copy_from_user(data, buffer, len)) + return -EFAULT; + ((char *) data)[len] = 0; + *ppos += *lenp; + } else { + len = strlen(data); + if (len > maxlen) + len = maxlen; + + if (*ppos > len) { + *lenp = 0; + return 0; + } + + data += *ppos; + len -= *ppos; + + if (len > *lenp) + len = *lenp; + if (len) + if(copy_to_user(buffer, data, len)) + return -EFAULT; + if (len < *lenp) { + if(put_user('\n', ((char __user *) buffer) + len)) + return -EFAULT; + len++; + } + *lenp = len; + *ppos += len; + } + return 0; +} + +/** + * proc_dostring - read a string sysctl + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes a string from/to the user buffer. 
If the kernel + * buffer provided is not large enough to hold the string, the + * string is truncated. The copied string is %NULL-terminated. + * If the string is being read by the user process, it is copied + * and a newline '\n' is added. It is truncated if the buffer is + * not large enough. + * + * Returns 0 on success. + */ +int proc_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return _proc_do_string(table->data, table->maxlen, write, + buffer, lenp, ppos); +} + +static size_t proc_skip_spaces(char **buf) +{ + size_t ret; + char *tmp = skip_spaces(*buf); + ret = tmp - *buf; + *buf = tmp; + return ret; +} + +static void proc_skip_char(char **buf, size_t *size, const char v) +{ + while (*size) { + if (**buf != v) + break; + (*size)--; + (*buf)++; + } +} + +#define TMPBUFLEN 22 +/** + * proc_get_long - reads an ASCII formatted integer from a user buffer + * + * @buf: a kernel buffer + * @size: size of the kernel buffer + * @val: this is where the number will be stored + * @neg: set to %TRUE if number is negative + * @perm_tr: a vector which contains the allowed trailers + * @perm_tr_len: size of the perm_tr vector + * @tr: pointer to store the trailer character + * + * In case of success %0 is returned and @buf and @size are updated with + * the amount of bytes read. If @tr is non-NULL and a trailing + * character exists (size is non-zero after returning from this + * function), @tr is updated with the trailing character. 
+ */
+static int proc_get_long(char **buf, size_t *size,
+			  unsigned long *val, bool *neg,
+			  const char *perm_tr, unsigned perm_tr_len, char *tr)
+{
+	int len;
+	char *p, tmp[TMPBUFLEN];
+
+	if (!*size)
+		return -EINVAL;
+
+	len = *size;
+	if (len > TMPBUFLEN - 1)
+		len = TMPBUFLEN - 1;
+
+	memcpy(tmp, *buf, len);
+
+	tmp[len] = 0;
+	p = tmp;
+	if (*p == '-' && *size > 1) {
+		*neg = true;
+		p++;
+	} else
+		*neg = false;
+	if (!isdigit(*p))
+		return -EINVAL;
+
+	*val = simple_strtoul(p, &p, 0);
+
+	len = p - tmp;
+
+	/* We don't know if the next char is whitespace thus we may accept
+	 * invalid integers (e.g. 1234...a) or two integers instead of one
+	 * (e.g. 123...1). So lets not allow such large numbers. */
+	if (len == TMPBUFLEN - 1)
+		return -EINVAL;
+
+	if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
+		return -EINVAL;
+
+	if (tr && (len < *size))
+		*tr = *p;
+
+	*buf += len;
+	*size -= len;
+
+	return 0;
+}
+
+/**
+ * proc_put_long - converts an integer to a decimal ASCII formatted string
+ *
+ * @buf: the user buffer
+ * @size: the size of the user buffer
+ * @val: the integer to be converted
+ * @neg: sign of the number, %TRUE for negative
+ *
+ * In case of success %0 is returned and @buf and @size are updated with
+ * the amount of bytes written.
+ */
+static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
+			  bool neg)
+{
+	int len;
+	char tmp[TMPBUFLEN], *p = tmp;
+
+	sprintf(p, "%s%lu", neg ? "-" : "", val);
+	len = strlen(tmp);
+	if (len > *size)
+		len = *size;
+	if (copy_to_user(*buf, tmp, len))
+		return -EFAULT;
+	*size -= len;
+	*buf += len;
+	return 0;
+}
+#undef TMPBUFLEN
+
+static int proc_put_char(void __user **buf, size_t *size, char c)
+{
+	if (*size) {
+		char __user **buffer = (char __user **)buf;
+		if (put_user(c, *buffer))
+			return -EFAULT;
+		(*size)--, (*buffer)++;
+		*buf = *buffer;
+	}
+	return 0;
+}
+
+/*
+ * do_proc_dointvec_conv - default converter between an int and the
+ * sign/magnitude pair used by the text parser and formatter.
+ *
+ * @negp:  sign flag; read on write, set on read
+ * @lvalp: magnitude; read on write, set on read
+ * @valp:  the int being written to (write) or read from (read)
+ * @write: non-zero to store (*negp, *lvalp) into *valp, zero for the reverse
+ * @data:  unused here
+ *
+ * Returns 0 on success.  On write, returns -EINVAL when the magnitude does
+ * not fit in an int, instead of silently truncating it; *valp is left
+ * untouched in that case.
+ */
+static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
+				 int *valp,
+				 int write, void *data)
+{
+	if (write) {
+		if (*negp) {
+			/* INT_MAX + 1 is the magnitude of INT_MIN. */
+			if (*lvalp > (unsigned long) INT_MAX + 1)
+				return -EINVAL;
+			*valp = -*lvalp;
+		} else {
+			if (*lvalp > (unsigned long) INT_MAX)
+				return -EINVAL;
+			*valp = *lvalp;
+		}
+	} else {
+		int val = *valp;
+		if (val < 0) {
+			*negp = true;
+			/*
+			 * Negate after widening: -val overflows a signed
+			 * int when val == INT_MIN.
+			 */
+			*lvalp = -(unsigned long)val;
+		} else {
+			*negp = false;
+			*lvalp = (unsigned long)val;
+		}
+	}
+	return 0;
+}
+
+static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
+
+static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
+		  int write, void __user *buffer,
+		  size_t *lenp, loff_t *ppos,
+		  int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
+			      int write, void *data),
+		  void *data)
+{
+	int *i, vleft, first = 1, err = 0;
+	unsigned long page = 0;
+	size_t left;
+	char *kbuf;
+
+	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+		*lenp = 0;
+		return 0;
+	}
+
+	i = (int *) tbl_data;
+	vleft = table->maxlen / sizeof(*i);
+	left = *lenp;
+
+	if (!conv)
+		conv = do_proc_dointvec_conv;
+
+	if (write) {
+		if (left > PAGE_SIZE - 1)
+			left = PAGE_SIZE - 1;
+		page = __get_free_page(GFP_TEMPORARY);
+		kbuf = (char *) page;
+		if (!kbuf)
+			return -ENOMEM;
+		if (copy_from_user(kbuf, buffer, left)) {
+			err = -EFAULT;
+			goto free;
+		}
+		kbuf[left] = 0;
+	}
+
+	for (; left && vleft--; i++, first=0) {
+		unsigned long lval;
+		bool neg;
+
+		if (write) {
+			left -= proc_skip_spaces(&kbuf);
+
+			if (!left)
+				break;
+			err = proc_get_long(&kbuf, &left, &lval, &neg,
+					     proc_wspace_sep,
+					     sizeof(proc_wspace_sep), NULL);
+			if (err)
+				break;
+			if (conv(&neg, &lval, i, 1, data)) {
+				err = -EINVAL;
+				break;
+			}
+ } else { + if (conv(&neg, &lval, i, 0, data)) { + err = -EINVAL; + break; + } + if (!first) + err = proc_put_char(&buffer, &left, '\t'); + if (err) + break; + err = proc_put_long(&buffer, &left, lval, neg); + if (err) + break; + } + } + + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err && left) + left -= proc_skip_spaces(&kbuf); +free: + if (write) { + free_page(page); + if (first) + return err ? : -EINVAL; + } + *lenp -= left; + *ppos += *lenp; + return err; +} + +static int do_proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos, + int (*conv)(bool *negp, unsigned long *lvalp, int *valp, + int write, void *data), + void *data) +{ + return __do_proc_dointvec(table->data, table, write, + buffer, lenp, ppos, conv, data); +} + +/** + * proc_dointvec - read a vector of integers + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * Returns 0 on success. + */ +int proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table,write,buffer,lenp,ppos, + NULL,NULL); +} + +/* + * Taint values can only be increased + * This means we can safely use a temporary. + */ +static int proc_taint(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + unsigned long tmptaint = get_taint(); + int err; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &tmptaint; + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write) { + /* + * Poor man's atomic or. 
Not worth adding a primitive + * to everyone's atomic.h for this + */ + int i; + for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { + if ((tmptaint >> i) & 1) + add_taint(i); + } + } + + return err; +} + +#ifdef CONFIG_PRINTK +static int proc_dointvec_minmax_sysadmin(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +#endif + +struct do_proc_dointvec_minmax_conv_param { + int *min; + int *max; +}; + +static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + struct do_proc_dointvec_minmax_conv_param *param = data; + if (write) { + int val = *negp ? -*lvalp : *lvalp; + if ((param->min && *param->min > val) || + (param->max && *param->max < val)) + return -EINVAL; + *valp = val; + } else { + int val = *valp; + if (val < 0) { + *negp = true; + *lvalp = (unsigned long)-val; + } else { + *negp = false; + *lvalp = (unsigned long)val; + } + } + return 0; +} + +/** + * proc_dointvec_minmax - read a vector of integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. 
+ */ +int proc_dointvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct do_proc_dointvec_minmax_conv_param param = { + .min = (int *) table->extra1, + .max = (int *) table->extra2, + }; + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_dointvec_minmax_conv, ¶m); +} + +static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + unsigned long *i, *min, *max; + int vleft, first = 1, err = 0; + unsigned long page = 0; + size_t left; + char *kbuf; + + if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + i = (unsigned long *) data; + min = (unsigned long *) table->extra1; + max = (unsigned long *) table->extra2; + vleft = table->maxlen / sizeof(unsigned long); + left = *lenp; + + if (write) { + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + err = -EFAULT; + goto free; + } + kbuf[left] = 0; + } + + for (; left && vleft--; i++, first = 0) { + unsigned long val; + + if (write) { + bool neg; + + left -= proc_skip_spaces(&kbuf); + + err = proc_get_long(&kbuf, &left, &val, &neg, + proc_wspace_sep, + sizeof(proc_wspace_sep), NULL); + if (err) + break; + if (neg) + continue; + if ((min && val < *min) || (max && val > *max)) + continue; + *i = val; + } else { + val = convdiv * (*i) / convmul; + if (!first) + err = proc_put_char(&buffer, &left, '\t'); + err = proc_put_long(&buffer, &left, val, false); + if (err) + break; + } + } + + if (!write && !first && left && !err) + err = proc_put_char(&buffer, &left, '\n'); + if (write && !err) + left -= proc_skip_spaces(&kbuf); +free: + if (write) { + free_page(page); + if (first) + return err ? 
: -EINVAL; + } + *lenp -= left; + *ppos += *lenp; + return err; +} + +static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos, + unsigned long convmul, + unsigned long convdiv) +{ + return __do_proc_doulongvec_minmax(table->data, table, write, + buffer, lenp, ppos, convmul, convdiv); +} + +/** + * proc_doulongvec_minmax - read a vector of long integers with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. + */ +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); +} + +/** + * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned long) unsigned long + * values from/to the user buffer, treated as an ASCII string. The values + * are treated as milliseconds, and converted to jiffies when they are stored. + * + * This routine will ensure the values are within the range specified by + * table->extra1 (min) and table->extra2 (max). + * + * Returns 0 on success. 
+ */ +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return do_proc_doulongvec_minmax(table, write, buffer, + lenp, ppos, HZ, 1000l); +} + + +static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (*lvalp > LONG_MAX / HZ) + return 1; + *valp = *negp ? -(*lvalp*HZ) : (*lvalp*HZ); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = lval / HZ; + } + return 0; +} + +static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + if (USER_HZ < HZ && *lvalp > (LONG_MAX / HZ) * USER_HZ) + return 1; + *valp = clock_t_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_clock_t(lval); + } + return 0; +} + +static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp, + int *valp, + int write, void *data) +{ + if (write) { + *valp = msecs_to_jiffies(*negp ? -*lvalp : *lvalp); + } else { + int val = *valp; + unsigned long lval; + if (val < 0) { + *negp = true; + lval = (unsigned long)-val; + } else { + *negp = false; + lval = (unsigned long)val; + } + *lvalp = jiffies_to_msecs(lval); + } + return 0; +} + +/** + * proc_dointvec_jiffies - read a vector of integers as seconds + * @table: the sysctl table + * @write: %TRUE if this is a write to the sysctl file + * @buffer: the user buffer + * @lenp: the size of the user buffer + * @ppos: file position + * + * Reads/writes up to table->maxlen/sizeof(unsigned int) integer + * values from/to the user buffer, treated as an ASCII string. 
+ * The values read are assumed to be in seconds, and are converted into
+ * jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_jiffies(struct ctl_table *table, int write,
+			  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
+		    	    do_proc_dointvec_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: pointer to the file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/USER_HZ seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
+				 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+    return do_proc_dointvec(table,write,buffer,lenp,ppos,
+		    	    do_proc_dointvec_userhz_jiffies_conv,NULL);
+}
+
+/**
+ * proc_dointvec_ms_jiffies - read a vector of integers as milliseconds
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) integer
+ * values from/to the user buffer, treated as an ASCII string.
+ * The values read are assumed to be in 1/1000 seconds, and
+ * are converted into jiffies.
+ *
+ * Returns 0 on success.
+ */
+int proc_dointvec_ms_jiffies(struct ctl_table *table, int write,
+			     void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+				do_proc_dointvec_ms_jiffies_conv, NULL);
+}
+
+/*
+ * proc_do_cad_pid - read or replace cad_pid through its numeric pid.
+ *
+ * The current value is exposed as pid_vnr(cad_pid) in a temporary that is
+ * run through __do_proc_dointvec() with the default converter.  Reads, and
+ * any failure, return that call's result directly.  After a successful
+ * write the new number is looked up with find_get_pid(); -ESRCH is returned
+ * if the lookup yields NULL.  Otherwise the new pid is swapped into cad_pid
+ * with xchg() and the previous one is released with put_pid().
+ */
+static int proc_do_cad_pid(struct ctl_table *table, int write,
+			   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct pid *new_pid;
+	pid_t tmp;
+	int r;
+
+	tmp = pid_vnr(cad_pid);
+
+	r = __do_proc_dointvec(&tmp, table, write, buffer,
+			       lenp, ppos, NULL, NULL);
+	if (r || !write)
+		return r;
+
+	new_pid = find_get_pid(tmp);
+	if (!new_pid)
+		return -ESRCH;
+
+	put_pid(xchg(&cad_pid, new_pid));
+	return 0;
+}
+
+/**
+ * proc_do_large_bitmap - read/write from/to a large bitmap
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * The bitmap is stored at table->data and the bitmap length (in bits)
+ * in table->maxlen.
+ *
+ * We use a range comma separated format (e.g. 1,3-4,10-10) so that
+ * large bitmaps may be represented in a compact manner. Writing into
+ * the file will clear the bitmap then update it with the given input.
+ *
+ * Returns 0 on success.
+ */ +int proc_do_large_bitmap(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err = 0; + bool first = 1; + size_t left = *lenp; + unsigned long bitmap_len = table->maxlen; + unsigned long *bitmap = (unsigned long *) table->data; + unsigned long *tmp_bitmap = NULL; + char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c; + + if (!bitmap_len || !left || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + unsigned long page = 0; + char *kbuf; + + if (left > PAGE_SIZE - 1) + left = PAGE_SIZE - 1; + + page = __get_free_page(GFP_TEMPORARY); + kbuf = (char *) page; + if (!kbuf) + return -ENOMEM; + if (copy_from_user(kbuf, buffer, left)) { + free_page(page); + return -EFAULT; + } + kbuf[left] = 0; + + tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), + GFP_KERNEL); + if (!tmp_bitmap) { + free_page(page); + return -ENOMEM; + } + proc_skip_char(&kbuf, &left, '\n'); + while (!err && left) { + unsigned long val_a, val_b; + bool neg; + + err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + sizeof(tr_a), &c); + if (err) + break; + if (val_a >= bitmap_len || neg) { + err = -EINVAL; + break; + } + + val_b = val_a; + if (left) { + kbuf++; + left--; + } + + if (c == '-') { + err = proc_get_long(&kbuf, &left, &val_b, + &neg, tr_b, sizeof(tr_b), + &c); + if (err) + break; + if (val_b >= bitmap_len || neg || + val_a > val_b) { + err = -EINVAL; + break; + } + if (left) { + kbuf++; + left--; + } + } + + while (val_a <= val_b) + set_bit(val_a++, tmp_bitmap); + + first = 0; + proc_skip_char(&kbuf, &left, '\n'); + } + free_page(page); + } else { + unsigned long bit_a, bit_b = 0; + + while (left) { + bit_a = find_next_bit(bitmap, bitmap_len, bit_b); + if (bit_a >= bitmap_len) + break; + bit_b = find_next_zero_bit(bitmap, bitmap_len, + bit_a + 1) - 1; + + if (!first) { + err = proc_put_char(&buffer, &left, ','); + if (err) + break; + } + err = proc_put_long(&buffer, &left, bit_a, false); + if 
(err) + break; + if (bit_a != bit_b) { + err = proc_put_char(&buffer, &left, '-'); + if (err) + break; + err = proc_put_long(&buffer, &left, bit_b, false); + if (err) + break; + } + + first = 0; bit_b++; + } + if (!err) + err = proc_put_char(&buffer, &left, '\n'); + } + + if (!err) { + if (write) { + if (*ppos) + bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len); + else + memcpy(bitmap, tmp_bitmap, + BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long)); + } + kfree(tmp_bitmap); + *lenp -= left; + *ppos += *lenp; + return 0; + } else { + kfree(tmp_bitmap); + return err; + } +} + +#else /* CONFIG_PROC_SYSCTL */ + +int proc_dostring(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_minmax(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + +int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + + +#endif /* CONFIG_PROC_SYSCTL */ + +/* + * No sense putting this after each symbol definition, twice, + * exception granted :-) + */ +EXPORT_SYMBOL(proc_dointvec); +EXPORT_SYMBOL(proc_dointvec_jiffies); +EXPORT_SYMBOL(proc_dointvec_minmax); 
+EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); +EXPORT_SYMBOL(proc_dointvec_ms_jiffies); +EXPORT_SYMBOL(proc_dostring); +EXPORT_SYMBOL(proc_doulongvec_minmax); +EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); +EXPORT_SYMBOL(register_sysctl_table); +EXPORT_SYMBOL(register_sysctl_paths); +EXPORT_SYMBOL(unregister_sysctl_table); diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c new file mode 100644 index 00000000..e055e8b5 --- /dev/null +++ b/kernel/sysctl_binary.c @@ -0,0 +1,1519 @@ +#include +#include +#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_SYSCTL_SYSCALL + +struct bin_table; +typedef ssize_t bin_convert_t(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen); + +static bin_convert_t bin_dir; +static bin_convert_t bin_string; +static bin_convert_t bin_intvec; +static bin_convert_t bin_ulongvec; +static bin_convert_t bin_uuid; +static bin_convert_t bin_dn_node_address; + +#define CTL_DIR bin_dir +#define CTL_STR bin_string +#define CTL_INT bin_intvec +#define CTL_ULONG bin_ulongvec +#define CTL_UUID bin_uuid +#define CTL_DNADR bin_dn_node_address + +#define BUFSZ 256 + +struct bin_table { + bin_convert_t *convert; + int ctl_name; + const char *procname; + const struct bin_table *child; +}; + +static const struct bin_table bin_random_table[] = { + { CTL_INT, RANDOM_POOLSIZE, "poolsize" }, + { CTL_INT, RANDOM_ENTROPY_COUNT, "entropy_avail" }, + { CTL_INT, RANDOM_READ_THRESH, "read_wakeup_threshold" }, + { CTL_INT, RANDOM_WRITE_THRESH, "write_wakeup_threshold" }, + { CTL_UUID, RANDOM_BOOT_ID, "boot_id" }, + { CTL_UUID, RANDOM_UUID, "uuid" }, + {} +}; + +static const struct bin_table bin_pty_table[] = { + { CTL_INT, PTY_MAX, "max" }, + { CTL_INT, PTY_NR, "nr" }, + {} +}; + +static const struct bin_table bin_kern_table[] = { + { CTL_STR, KERN_OSTYPE, "ostype" }, 
+ { CTL_STR, KERN_OSRELEASE, "osrelease" }, + /* KERN_OSREV not used */ + { CTL_STR, KERN_VERSION, "version" }, + /* KERN_SECUREMASK not used */ + /* KERN_PROF not used */ + { CTL_STR, KERN_NODENAME, "hostname" }, + { CTL_STR, KERN_DOMAINNAME, "domainname" }, + + { CTL_INT, KERN_PANIC, "panic" }, + { CTL_INT, KERN_REALROOTDEV, "real-root-dev" }, + + { CTL_STR, KERN_SPARC_REBOOT, "reboot-cmd" }, + { CTL_INT, KERN_CTLALTDEL, "ctrl-alt-del" }, + { CTL_INT, KERN_PRINTK, "printk" }, + + /* KERN_NAMETRANS not used */ + /* KERN_PPC_HTABRECLAIM not used */ + /* KERN_PPC_ZEROPAGED not used */ + { CTL_INT, KERN_PPC_POWERSAVE_NAP, "powersave-nap" }, + + { CTL_STR, KERN_MODPROBE, "modprobe" }, + { CTL_INT, KERN_SG_BIG_BUFF, "sg-big-buff" }, + { CTL_INT, KERN_ACCT, "acct" }, + /* KERN_PPC_L2CR "l2cr" no longer used */ + + /* KERN_RTSIGNR not used */ + /* KERN_RTSIGMAX not used */ + + { CTL_ULONG, KERN_SHMMAX, "shmmax" }, + { CTL_INT, KERN_MSGMAX, "msgmax" }, + { CTL_INT, KERN_MSGMNB, "msgmnb" }, + /* KERN_MSGPOOL not used*/ + { CTL_INT, KERN_SYSRQ, "sysrq" }, + { CTL_INT, KERN_MAX_THREADS, "threads-max" }, + { CTL_DIR, KERN_RANDOM, "random", bin_random_table }, + { CTL_ULONG, KERN_SHMALL, "shmall" }, + { CTL_INT, KERN_MSGMNI, "msgmni" }, + { CTL_INT, KERN_SEM, "sem" }, + { CTL_INT, KERN_SPARC_STOP_A, "stop-a" }, + { CTL_INT, KERN_SHMMNI, "shmmni" }, + + { CTL_INT, KERN_OVERFLOWUID, "overflowuid" }, + { CTL_INT, KERN_OVERFLOWGID, "overflowgid" }, + + { CTL_STR, KERN_HOTPLUG, "hotplug", }, + { CTL_INT, KERN_IEEE_EMULATION_WARNINGS, "ieee_emulation_warnings" }, + + { CTL_INT, KERN_S390_USER_DEBUG_LOGGING, "userprocess_debug" }, + { CTL_INT, KERN_CORE_USES_PID, "core_uses_pid" }, + /* KERN_TAINTED "tainted" no longer used */ + { CTL_INT, KERN_CADPID, "cad_pid" }, + { CTL_INT, KERN_PIDMAX, "pid_max" }, + { CTL_STR, KERN_CORE_PATTERN, "core_pattern" }, + { CTL_INT, KERN_PANIC_ON_OOPS, "panic_on_oops" }, + { CTL_INT, KERN_HPPA_PWRSW, "soft-power" }, + { CTL_INT, KERN_HPPA_UNALIGNED, 
"unaligned-trap" }, + + { CTL_INT, KERN_PRINTK_RATELIMIT, "printk_ratelimit" }, + { CTL_INT, KERN_PRINTK_RATELIMIT_BURST, "printk_ratelimit_burst" }, + + { CTL_DIR, KERN_PTY, "pty", bin_pty_table }, + { CTL_INT, KERN_NGROUPS_MAX, "ngroups_max" }, + { CTL_INT, KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, + /* KERN_HZ_TIMER "hz_timer" no longer used */ + { CTL_INT, KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, + { CTL_INT, KERN_BOOTLOADER_TYPE, "bootloader_type" }, + { CTL_INT, KERN_RANDOMIZE, "randomize_va_space" }, + + { CTL_INT, KERN_SPIN_RETRY, "spin_retry" }, + /* KERN_ACPI_VIDEO_FLAGS "acpi_video_flags" no longer used */ + { CTL_INT, KERN_IA64_UNALIGNED, "ignore-unaligned-usertrap" }, + { CTL_INT, KERN_COMPAT_LOG, "compat-log" }, + { CTL_INT, KERN_MAX_LOCK_DEPTH, "max_lock_depth" }, + { CTL_INT, KERN_PANIC_ON_NMI, "panic_on_unrecovered_nmi" }, + {} +}; + +static const struct bin_table bin_vm_table[] = { + { CTL_INT, VM_OVERCOMMIT_MEMORY, "overcommit_memory" }, + { CTL_INT, VM_PAGE_CLUSTER, "page-cluster" }, + { CTL_INT, VM_DIRTY_BACKGROUND, "dirty_background_ratio" }, + { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, + /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ + /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ + { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, + /* VM_PAGEBUF unused */ + /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ + { CTL_INT, VM_SWAPPINESS, "swappiness" }, + { CTL_INT, VM_LOWMEM_RESERVE_RATIO, "lowmem_reserve_ratio" }, + { CTL_INT, VM_MIN_FREE_KBYTES, "min_free_kbytes" }, + { CTL_INT, VM_MAX_MAP_COUNT, "max_map_count" }, + { CTL_INT, VM_LAPTOP_MODE, "laptop_mode" }, + { CTL_INT, VM_BLOCK_DUMP, "block_dump" }, + { CTL_INT, VM_HUGETLB_GROUP, "hugetlb_shm_group" }, + { CTL_INT, VM_VFS_CACHE_PRESSURE, "vfs_cache_pressure" }, + { CTL_INT, VM_LEGACY_VA_LAYOUT, "legacy_va_layout" }, + /* VM_SWAP_TOKEN_TIMEOUT unused */ + { CTL_INT, 
VM_DROP_PAGECACHE, "drop_caches" }, + { CTL_INT, VM_PERCPU_PAGELIST_FRACTION, "percpu_pagelist_fraction" }, + { CTL_INT, VM_ZONE_RECLAIM_MODE, "zone_reclaim_mode" }, + { CTL_INT, VM_MIN_UNMAPPED, "min_unmapped_ratio" }, + { CTL_INT, VM_PANIC_ON_OOM, "panic_on_oom" }, + { CTL_INT, VM_VDSO_ENABLED, "vdso_enabled" }, + { CTL_INT, VM_MIN_SLAB, "min_slab_ratio" }, + + {} +}; + +static const struct bin_table bin_net_core_table[] = { + { CTL_INT, NET_CORE_WMEM_MAX, "wmem_max" }, + { CTL_INT, NET_CORE_RMEM_MAX, "rmem_max" }, + { CTL_INT, NET_CORE_WMEM_DEFAULT, "wmem_default" }, + { CTL_INT, NET_CORE_RMEM_DEFAULT, "rmem_default" }, + /* NET_CORE_DESTROY_DELAY unused */ + { CTL_INT, NET_CORE_MAX_BACKLOG, "netdev_max_backlog" }, + /* NET_CORE_FASTROUTE unused */ + { CTL_INT, NET_CORE_MSG_COST, "message_cost" }, + { CTL_INT, NET_CORE_MSG_BURST, "message_burst" }, + { CTL_INT, NET_CORE_OPTMEM_MAX, "optmem_max" }, + /* NET_CORE_HOT_LIST_LENGTH unused */ + /* NET_CORE_DIVERT_VERSION unused */ + /* NET_CORE_NO_CONG_THRESH unused */ + /* NET_CORE_NO_CONG unused */ + /* NET_CORE_LO_CONG unused */ + /* NET_CORE_MOD_CONG unused */ + { CTL_INT, NET_CORE_DEV_WEIGHT, "dev_weight" }, + { CTL_INT, NET_CORE_SOMAXCONN, "somaxconn" }, + { CTL_INT, NET_CORE_BUDGET, "netdev_budget" }, + { CTL_INT, NET_CORE_AEVENT_ETIME, "xfrm_aevent_etime" }, + { CTL_INT, NET_CORE_AEVENT_RSEQTH, "xfrm_aevent_rseqth" }, + { CTL_INT, NET_CORE_WARNINGS, "warnings" }, + {}, +}; + +static const struct bin_table bin_net_unix_table[] = { + /* NET_UNIX_DESTROY_DELAY unused */ + /* NET_UNIX_DELETE_DELAY unused */ + { CTL_INT, NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen" }, + {} +}; + +static const struct bin_table bin_net_ipv4_route_table[] = { + { CTL_INT, NET_IPV4_ROUTE_FLUSH, "flush" }, + /* NET_IPV4_ROUTE_MIN_DELAY "min_delay" no longer used */ + /* NET_IPV4_ROUTE_MAX_DELAY "max_delay" no longer used */ + { CTL_INT, NET_IPV4_ROUTE_GC_THRESH, "gc_thresh" }, + { CTL_INT, NET_IPV4_ROUTE_MAX_SIZE, "max_size" }, + { 
CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, + { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, + { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, + { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, + { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, + { CTL_INT, NET_IPV4_ROUTE_ERROR_COST, "error_cost" }, + { CTL_INT, NET_IPV4_ROUTE_ERROR_BURST, "error_burst" }, + { CTL_INT, NET_IPV4_ROUTE_GC_ELASTICITY, "gc_elasticity" }, + { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, + { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, + { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, + {} +}; + +static const struct bin_table bin_net_ipv4_conf_vars_table[] = { + { CTL_INT, NET_IPV4_CONF_FORWARDING, "forwarding" }, + { CTL_INT, NET_IPV4_CONF_MC_FORWARDING, "mc_forwarding" }, + + { CTL_INT, NET_IPV4_CONF_ACCEPT_REDIRECTS, "accept_redirects" }, + { CTL_INT, NET_IPV4_CONF_SECURE_REDIRECTS, "secure_redirects" }, + { CTL_INT, NET_IPV4_CONF_SEND_REDIRECTS, "send_redirects" }, + { CTL_INT, NET_IPV4_CONF_SHARED_MEDIA, "shared_media" }, + { CTL_INT, NET_IPV4_CONF_RP_FILTER, "rp_filter" }, + { CTL_INT, NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + { CTL_INT, NET_IPV4_CONF_PROXY_ARP, "proxy_arp" }, + { CTL_INT, NET_IPV4_CONF_MEDIUM_ID, "medium_id" }, + { CTL_INT, NET_IPV4_CONF_BOOTP_RELAY, "bootp_relay" }, + { CTL_INT, NET_IPV4_CONF_LOG_MARTIANS, "log_martians" }, + { CTL_INT, NET_IPV4_CONF_TAG, "tag" }, + { CTL_INT, NET_IPV4_CONF_ARPFILTER, "arp_filter" }, + { CTL_INT, NET_IPV4_CONF_ARP_ANNOUNCE, "arp_announce" }, + { CTL_INT, NET_IPV4_CONF_ARP_IGNORE, "arp_ignore" }, + { CTL_INT, NET_IPV4_CONF_ARP_ACCEPT, "arp_accept" }, + { CTL_INT, NET_IPV4_CONF_ARP_NOTIFY, "arp_notify" }, + + { CTL_INT, NET_IPV4_CONF_NOXFRM, "disable_xfrm" }, + { CTL_INT, NET_IPV4_CONF_NOPOLICY, 
"disable_policy" }, + { CTL_INT, NET_IPV4_CONF_FORCE_IGMP_VERSION, "force_igmp_version" }, + { CTL_INT, NET_IPV4_CONF_PROMOTE_SECONDARIES, "promote_secondaries" }, + {} +}; + +static const struct bin_table bin_net_ipv4_conf_table[] = { + { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv4_conf_vars_table }, + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv4_conf_vars_table }, + { CTL_DIR, 0, NULL, bin_net_ipv4_conf_vars_table }, + {} +}; + +static const struct bin_table bin_net_neigh_vars_table[] = { + { CTL_INT, NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, + { CTL_INT, NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, + { CTL_INT, NET_NEIGH_APP_SOLICIT, "app_solicit" }, + /* NET_NEIGH_RETRANS_TIME "retrans_time" no longer used */ + { CTL_INT, NET_NEIGH_REACHABLE_TIME, "base_reachable_time" }, + { CTL_INT, NET_NEIGH_DELAY_PROBE_TIME, "delay_first_probe_time" }, + { CTL_INT, NET_NEIGH_GC_STALE_TIME, "gc_stale_time" }, + { CTL_INT, NET_NEIGH_UNRES_QLEN, "unres_qlen" }, + { CTL_INT, NET_NEIGH_PROXY_QLEN, "proxy_qlen" }, + /* NET_NEIGH_ANYCAST_DELAY "anycast_delay" no longer used */ + /* NET_NEIGH_PROXY_DELAY "proxy_delay" no longer used */ + /* NET_NEIGH_LOCKTIME "locktime" no longer used */ + { CTL_INT, NET_NEIGH_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_NEIGH_GC_THRESH1, "gc_thresh1" }, + { CTL_INT, NET_NEIGH_GC_THRESH2, "gc_thresh2" }, + { CTL_INT, NET_NEIGH_GC_THRESH3, "gc_thresh3" }, + { CTL_INT, NET_NEIGH_RETRANS_TIME_MS, "retrans_time_ms" }, + { CTL_INT, NET_NEIGH_REACHABLE_TIME_MS, "base_reachable_time_ms" }, + {} +}; + +static const struct bin_table bin_net_neigh_table[] = { + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_neigh_vars_table }, + { CTL_DIR, 0, NULL, bin_net_neigh_vars_table }, + {} +}; + +static const struct bin_table bin_net_ipv4_netfilter_table[] = { + { CTL_INT, NET_IPV4_NF_CONNTRACK_MAX, "ip_conntrack_max" }, + + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "ip_conntrack_tcp_timeout_syn_sent" no longer used */ + /* 
NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "ip_conntrack_tcp_timeout_syn_recv" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "ip_conntrack_tcp_timeout_established" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "ip_conntrack_tcp_timeout_fin_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "ip_conntrack_tcp_timeout_close_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "ip_conntrack_tcp_timeout_last_ack" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "ip_conntrack_tcp_timeout_time_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "ip_conntrack_tcp_timeout_close" no longer used */ + + /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT "ip_conntrack_udp_timeout" no longer used */ + /* NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM "ip_conntrack_udp_timeout_stream" no longer used */ + /* NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT "ip_conntrack_icmp_timeout" no longer used */ + /* NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT "ip_conntrack_generic_timeout" no longer used */ + + { CTL_INT, NET_IPV4_NF_CONNTRACK_BUCKETS, "ip_conntrack_buckets" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_LOG_INVALID, "ip_conntrack_log_invalid" }, + /* NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "ip_conntrack_tcp_timeout_max_retrans" no longer used */ + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_LOOSE, "ip_conntrack_tcp_loose" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, "ip_conntrack_tcp_be_liberal" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, "ip_conntrack_tcp_max_retrans" }, + + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "ip_conntrack_sctp_timeout_closed" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "ip_conntrack_sctp_timeout_cookie_wait" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "ip_conntrack_sctp_timeout_cookie_echoed" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "ip_conntrack_sctp_timeout_established" no 
longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "ip_conntrack_sctp_timeout_shutdown_sent" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "ip_conntrack_sctp_timeout_shutdown_recd" no longer used */ + /* NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "ip_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ + + { CTL_INT, NET_IPV4_NF_CONNTRACK_COUNT, "ip_conntrack_count" }, + { CTL_INT, NET_IPV4_NF_CONNTRACK_CHECKSUM, "ip_conntrack_checksum" }, + {} +}; + +static const struct bin_table bin_net_ipv4_table[] = { + {CTL_INT, NET_IPV4_FORWARD, "ip_forward" }, + + { CTL_DIR, NET_IPV4_CONF, "conf", bin_net_ipv4_conf_table }, + { CTL_DIR, NET_IPV4_NEIGH, "neigh", bin_net_neigh_table }, + { CTL_DIR, NET_IPV4_ROUTE, "route", bin_net_ipv4_route_table }, + /* NET_IPV4_FIB_HASH unused */ + { CTL_DIR, NET_IPV4_NETFILTER, "netfilter", bin_net_ipv4_netfilter_table }, + + { CTL_INT, NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, + { CTL_INT, NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, + { CTL_INT, NET_IPV4_TCP_SACK, "tcp_sack" }, + { CTL_INT, NET_IPV4_TCP_RETRANS_COLLAPSE, "tcp_retrans_collapse" }, + { CTL_INT, NET_IPV4_DEFAULT_TTL, "ip_default_ttl" }, + /* NET_IPV4_AUTOCONFIG unused */ + { CTL_INT, NET_IPV4_NO_PMTU_DISC, "ip_no_pmtu_disc" }, + { CTL_INT, NET_IPV4_NONLOCAL_BIND, "ip_nonlocal_bind" }, + { CTL_INT, NET_IPV4_TCP_SYN_RETRIES, "tcp_syn_retries" }, + { CTL_INT, NET_TCP_SYNACK_RETRIES, "tcp_synack_retries" }, + { CTL_INT, NET_TCP_MAX_ORPHANS, "tcp_max_orphans" }, + { CTL_INT, NET_TCP_MAX_TW_BUCKETS, "tcp_max_tw_buckets" }, + { CTL_INT, NET_IPV4_DYNADDR, "ip_dynaddr" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_TIME, "tcp_keepalive_time" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_PROBES, "tcp_keepalive_probes" }, + { CTL_INT, NET_IPV4_TCP_KEEPALIVE_INTVL, "tcp_keepalive_intvl" }, + { CTL_INT, NET_IPV4_TCP_RETRIES1, "tcp_retries1" }, + { CTL_INT, NET_IPV4_TCP_RETRIES2, "tcp_retries2" }, + { CTL_INT, 
NET_IPV4_TCP_FIN_TIMEOUT, "tcp_fin_timeout" }, + { CTL_INT, NET_TCP_SYNCOOKIES, "tcp_syncookies" }, + { CTL_INT, NET_TCP_TW_RECYCLE, "tcp_tw_recycle" }, + { CTL_INT, NET_TCP_ABORT_ON_OVERFLOW, "tcp_abort_on_overflow" }, + { CTL_INT, NET_TCP_STDURG, "tcp_stdurg" }, + { CTL_INT, NET_TCP_RFC1337, "tcp_rfc1337" }, + { CTL_INT, NET_TCP_MAX_SYN_BACKLOG, "tcp_max_syn_backlog" }, + { CTL_INT, NET_IPV4_LOCAL_PORT_RANGE, "ip_local_port_range" }, + { CTL_INT, NET_IPV4_IGMP_MAX_MEMBERSHIPS, "igmp_max_memberships" }, + { CTL_INT, NET_IPV4_IGMP_MAX_MSF, "igmp_max_msf" }, + { CTL_INT, NET_IPV4_INET_PEER_THRESHOLD, "inet_peer_threshold" }, + { CTL_INT, NET_IPV4_INET_PEER_MINTTL, "inet_peer_minttl" }, + { CTL_INT, NET_IPV4_INET_PEER_MAXTTL, "inet_peer_maxttl" }, + { CTL_INT, NET_IPV4_INET_PEER_GC_MINTIME, "inet_peer_gc_mintime" }, + { CTL_INT, NET_IPV4_INET_PEER_GC_MAXTIME, "inet_peer_gc_maxtime" }, + { CTL_INT, NET_TCP_ORPHAN_RETRIES, "tcp_orphan_retries" }, + { CTL_INT, NET_TCP_FACK, "tcp_fack" }, + { CTL_INT, NET_TCP_REORDERING, "tcp_reordering" }, + { CTL_INT, NET_TCP_ECN, "tcp_ecn" }, + { CTL_INT, NET_TCP_DSACK, "tcp_dsack" }, + { CTL_INT, NET_TCP_MEM, "tcp_mem" }, + { CTL_INT, NET_TCP_WMEM, "tcp_wmem" }, + { CTL_INT, NET_TCP_RMEM, "tcp_rmem" }, + { CTL_INT, NET_TCP_APP_WIN, "tcp_app_win" }, + { CTL_INT, NET_TCP_ADV_WIN_SCALE, "tcp_adv_win_scale" }, + { CTL_INT, NET_TCP_TW_REUSE, "tcp_tw_reuse" }, + { CTL_INT, NET_TCP_FRTO, "tcp_frto" }, + { CTL_INT, NET_TCP_FRTO_RESPONSE, "tcp_frto_response" }, + { CTL_INT, NET_TCP_LOW_LATENCY, "tcp_low_latency" }, + { CTL_INT, NET_TCP_NO_METRICS_SAVE, "tcp_no_metrics_save" }, + { CTL_INT, NET_TCP_MODERATE_RCVBUF, "tcp_moderate_rcvbuf" }, + { CTL_INT, NET_TCP_TSO_WIN_DIVISOR, "tcp_tso_win_divisor" }, + { CTL_STR, NET_TCP_CONG_CONTROL, "tcp_congestion_control" }, + { CTL_INT, NET_TCP_ABC, "tcp_abc" }, + { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, + { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, + { CTL_INT, 
NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, + { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" }, + { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, + { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, + { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, + { CTL_INT, NET_CIPSOV4_RBM_OPTFMT, "cipso_rbm_optfmt" }, + { CTL_INT, NET_CIPSOV4_RBM_STRICTVALID, "cipso_rbm_strictvalid" }, + /* NET_TCP_AVAIL_CONG_CONTROL "tcp_available_congestion_control" no longer used */ + { CTL_STR, NET_TCP_ALLOWED_CONG_CONTROL, "tcp_allowed_congestion_control" }, + { CTL_INT, NET_TCP_MAX_SSTHRESH, "tcp_max_ssthresh" }, + + { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_ALL, "icmp_echo_ignore_all" }, + { CTL_INT, NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, "icmp_echo_ignore_broadcasts" }, + { CTL_INT, NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, "icmp_ignore_bogus_error_responses" }, + { CTL_INT, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR, "icmp_errors_use_inbound_ifaddr" }, + { CTL_INT, NET_IPV4_ICMP_RATELIMIT, "icmp_ratelimit" }, + { CTL_INT, NET_IPV4_ICMP_RATEMASK, "icmp_ratemask" }, + + { CTL_INT, NET_IPV4_IPFRAG_HIGH_THRESH, "ipfrag_high_thresh" }, + { CTL_INT, NET_IPV4_IPFRAG_LOW_THRESH, "ipfrag_low_thresh" }, + { CTL_INT, NET_IPV4_IPFRAG_TIME, "ipfrag_time" }, + + { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, + /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ + + { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, + + /* NET_TCP_DEFAULT_WIN_SCALE unused */ + /* NET_TCP_BIC_BETA unused */ + /* NET_IPV4_TCP_MAX_KA_PROBES unused */ + /* NET_IPV4_IP_MASQ_DEBUG unused */ + /* NET_TCP_SYN_TAILDROP unused */ + /* NET_IPV4_ICMP_SOURCEQUENCH_RATE unused */ + /* NET_IPV4_ICMP_DESTUNREACH_RATE unused */ + /* NET_IPV4_ICMP_TIMEEXCEED_RATE unused */ + /* NET_IPV4_ICMP_PARAMPROB_RATE unused */ + /* NET_IPV4_ICMP_ECHOREPLY_RATE unused */ + /* NET_IPV4_ALWAYS_DEFRAG unused */ + {} 
+}; + +static const struct bin_table bin_net_ipx_table[] = { + { CTL_INT, NET_IPX_PPROP_BROADCASTING, "ipx_pprop_broadcasting" }, + /* NET_IPX_FORWARDING unused */ + {} +}; + +static const struct bin_table bin_net_atalk_table[] = { + { CTL_INT, NET_ATALK_AARP_EXPIRY_TIME, "aarp-expiry-time" }, + { CTL_INT, NET_ATALK_AARP_TICK_TIME, "aarp-tick-time" }, + { CTL_INT, NET_ATALK_AARP_RETRANSMIT_LIMIT, "aarp-retransmit-limit" }, + { CTL_INT, NET_ATALK_AARP_RESOLVE_TIME, "aarp-resolve-time" }, + {}, +}; + +static const struct bin_table bin_net_netrom_table[] = { + { CTL_INT, NET_NETROM_DEFAULT_PATH_QUALITY, "default_path_quality" }, + { CTL_INT, NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, "obsolescence_count_initialiser" }, + { CTL_INT, NET_NETROM_NETWORK_TTL_INITIALISER, "network_ttl_initialiser" }, + { CTL_INT, NET_NETROM_TRANSPORT_TIMEOUT, "transport_timeout" }, + { CTL_INT, NET_NETROM_TRANSPORT_MAXIMUM_TRIES, "transport_maximum_tries" }, + { CTL_INT, NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, "transport_acknowledge_delay" }, + { CTL_INT, NET_NETROM_TRANSPORT_BUSY_DELAY, "transport_busy_delay" }, + { CTL_INT, NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, "transport_requested_window_size" }, + { CTL_INT, NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, "transport_no_activity_timeout" }, + { CTL_INT, NET_NETROM_ROUTING_CONTROL, "routing_control" }, + { CTL_INT, NET_NETROM_LINK_FAILS_COUNT, "link_fails_count" }, + { CTL_INT, NET_NETROM_RESET, "reset" }, + {} +}; + +static const struct bin_table bin_net_ax25_param_table[] = { + { CTL_INT, NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, + { CTL_INT, NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, + { CTL_INT, NET_AX25_BACKOFF_TYPE, "backoff_type" }, + { CTL_INT, NET_AX25_CONNECT_MODE, "connect_mode" }, + { CTL_INT, NET_AX25_STANDARD_WINDOW, "standard_window_size" }, + { CTL_INT, NET_AX25_EXTENDED_WINDOW, "extended_window_size" }, + { CTL_INT, NET_AX25_T1_TIMEOUT, "t1_timeout" }, + { CTL_INT, NET_AX25_T2_TIMEOUT, "t2_timeout" }, + { 
CTL_INT, NET_AX25_T3_TIMEOUT, "t3_timeout" }, + { CTL_INT, NET_AX25_IDLE_TIMEOUT, "idle_timeout" }, + { CTL_INT, NET_AX25_N2, "maximum_retry_count" }, + { CTL_INT, NET_AX25_PACLEN, "maximum_packet_length" }, + { CTL_INT, NET_AX25_PROTOCOL, "protocol" }, + { CTL_INT, NET_AX25_DAMA_SLAVE_TIMEOUT, "dama_slave_timeout" }, + {} +}; + +static const struct bin_table bin_net_ax25_table[] = { + { CTL_DIR, 0, NULL, bin_net_ax25_param_table }, + {} +}; + +static const struct bin_table bin_net_rose_table[] = { + { CTL_INT, NET_ROSE_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, + { CTL_INT, NET_ROSE_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, + { CTL_INT, NET_ROSE_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, + { CTL_INT, NET_ROSE_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, + { CTL_INT, NET_ROSE_ACK_HOLD_BACK_TIMEOUT, "acknowledge_hold_back_timeout" }, + { CTL_INT, NET_ROSE_ROUTING_CONTROL, "routing_control" }, + { CTL_INT, NET_ROSE_LINK_FAIL_TIMEOUT, "link_fail_timeout" }, + { CTL_INT, NET_ROSE_MAX_VCS, "maximum_virtual_circuits" }, + { CTL_INT, NET_ROSE_WINDOW_SIZE, "window_size" }, + { CTL_INT, NET_ROSE_NO_ACTIVITY_TIMEOUT, "no_activity_timeout" }, + {} +}; + +static const struct bin_table bin_net_ipv6_conf_var_table[] = { + { CTL_INT, NET_IPV6_FORWARDING, "forwarding" }, + { CTL_INT, NET_IPV6_HOP_LIMIT, "hop_limit" }, + { CTL_INT, NET_IPV6_MTU, "mtu" }, + { CTL_INT, NET_IPV6_ACCEPT_RA, "accept_ra" }, + { CTL_INT, NET_IPV6_ACCEPT_REDIRECTS, "accept_redirects" }, + { CTL_INT, NET_IPV6_AUTOCONF, "autoconf" }, + { CTL_INT, NET_IPV6_DAD_TRANSMITS, "dad_transmits" }, + { CTL_INT, NET_IPV6_RTR_SOLICITS, "router_solicitations" }, + { CTL_INT, NET_IPV6_RTR_SOLICIT_INTERVAL, "router_solicitation_interval" }, + { CTL_INT, NET_IPV6_RTR_SOLICIT_DELAY, "router_solicitation_delay" }, + { CTL_INT, NET_IPV6_USE_TEMPADDR, "use_tempaddr" }, + { CTL_INT, NET_IPV6_TEMP_VALID_LFT, "temp_valid_lft" }, + { CTL_INT, NET_IPV6_TEMP_PREFERED_LFT, "temp_prefered_lft" }, + { 
CTL_INT, NET_IPV6_REGEN_MAX_RETRY, "regen_max_retry" }, + { CTL_INT, NET_IPV6_MAX_DESYNC_FACTOR, "max_desync_factor" }, + { CTL_INT, NET_IPV6_MAX_ADDRESSES, "max_addresses" }, + { CTL_INT, NET_IPV6_FORCE_MLD_VERSION, "force_mld_version" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_DEFRTR, "accept_ra_defrtr" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_PINFO, "accept_ra_pinfo" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_RTR_PREF, "accept_ra_rtr_pref" }, + { CTL_INT, NET_IPV6_RTR_PROBE_INTERVAL, "router_probe_interval" }, + { CTL_INT, NET_IPV6_ACCEPT_RA_RT_INFO_MAX_PLEN, "accept_ra_rt_info_max_plen" }, + { CTL_INT, NET_IPV6_PROXY_NDP, "proxy_ndp" }, + { CTL_INT, NET_IPV6_ACCEPT_SOURCE_ROUTE, "accept_source_route" }, + {} +}; + +static const struct bin_table bin_net_ipv6_conf_table[] = { + { CTL_DIR, NET_PROTO_CONF_ALL, "all", bin_net_ipv6_conf_var_table }, + { CTL_DIR, NET_PROTO_CONF_DEFAULT, "default", bin_net_ipv6_conf_var_table }, + { CTL_DIR, 0, NULL, bin_net_ipv6_conf_var_table }, + {} +}; + +static const struct bin_table bin_net_ipv6_route_table[] = { + /* NET_IPV6_ROUTE_FLUSH "flush" no longer used */ + { CTL_INT, NET_IPV6_ROUTE_GC_THRESH, "gc_thresh" }, + { CTL_INT, NET_IPV6_ROUTE_MAX_SIZE, "max_size" }, + { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, + { CTL_INT, NET_IPV6_ROUTE_GC_TIMEOUT, "gc_timeout" }, + { CTL_INT, NET_IPV6_ROUTE_GC_INTERVAL, "gc_interval" }, + { CTL_INT, NET_IPV6_ROUTE_GC_ELASTICITY, "gc_elasticity" }, + { CTL_INT, NET_IPV6_ROUTE_MTU_EXPIRES, "mtu_expires" }, + { CTL_INT, NET_IPV6_ROUTE_MIN_ADVMSS, "min_adv_mss" }, + { CTL_INT, NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, + {} +}; + +static const struct bin_table bin_net_ipv6_icmp_table[] = { + { CTL_INT, NET_IPV6_ICMP_RATELIMIT, "ratelimit" }, + {} +}; + +static const struct bin_table bin_net_ipv6_table[] = { + { CTL_DIR, NET_IPV6_CONF, "conf", bin_net_ipv6_conf_table }, + { CTL_DIR, NET_IPV6_NEIGH, "neigh", bin_net_neigh_table }, + { CTL_DIR, NET_IPV6_ROUTE, "route", 
bin_net_ipv6_route_table }, + { CTL_DIR, NET_IPV6_ICMP, "icmp", bin_net_ipv6_icmp_table }, + { CTL_INT, NET_IPV6_BINDV6ONLY, "bindv6only" }, + { CTL_INT, NET_IPV6_IP6FRAG_HIGH_THRESH, "ip6frag_high_thresh" }, + { CTL_INT, NET_IPV6_IP6FRAG_LOW_THRESH, "ip6frag_low_thresh" }, + { CTL_INT, NET_IPV6_IP6FRAG_TIME, "ip6frag_time" }, + { CTL_INT, NET_IPV6_IP6FRAG_SECRET_INTERVAL, "ip6frag_secret_interval" }, + { CTL_INT, NET_IPV6_MLD_MAX_MSF, "mld_max_msf" }, + { CTL_INT, 2088 /* IPQ_QMAX */, "ip6_queue_maxlen" }, + {} +}; + +static const struct bin_table bin_net_x25_table[] = { + { CTL_INT, NET_X25_RESTART_REQUEST_TIMEOUT, "restart_request_timeout" }, + { CTL_INT, NET_X25_CALL_REQUEST_TIMEOUT, "call_request_timeout" }, + { CTL_INT, NET_X25_RESET_REQUEST_TIMEOUT, "reset_request_timeout" }, + { CTL_INT, NET_X25_CLEAR_REQUEST_TIMEOUT, "clear_request_timeout" }, + { CTL_INT, NET_X25_ACK_HOLD_BACK_TIMEOUT, "acknowledgement_hold_back_timeout" }, + { CTL_INT, NET_X25_FORWARD, "x25_forward" }, + {} +}; + +static const struct bin_table bin_net_tr_table[] = { + { CTL_INT, NET_TR_RIF_TIMEOUT, "rif_timeout" }, + {} +}; + + +static const struct bin_table bin_net_decnet_conf_vars[] = { + { CTL_INT, NET_DECNET_CONF_DEV_FORWARDING, "forwarding" }, + { CTL_INT, NET_DECNET_CONF_DEV_PRIORITY, "priority" }, + { CTL_INT, NET_DECNET_CONF_DEV_T2, "t2" }, + { CTL_INT, NET_DECNET_CONF_DEV_T3, "t3" }, + {} +}; + +static const struct bin_table bin_net_decnet_conf[] = { + { CTL_DIR, NET_DECNET_CONF_ETHER, "ethernet", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_GRE, "ipgre", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_X25, "x25", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_PPP, "ppp", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_DDCMP, "ddcmp", bin_net_decnet_conf_vars }, + { CTL_DIR, NET_DECNET_CONF_LOOPBACK, "loopback", bin_net_decnet_conf_vars }, + { CTL_DIR, 0, NULL, bin_net_decnet_conf_vars }, + {} +}; + +static const struct bin_table 
bin_net_decnet_table[] = { + { CTL_DIR, NET_DECNET_CONF, "conf", bin_net_decnet_conf }, + { CTL_DNADR, NET_DECNET_NODE_ADDRESS, "node_address" }, + { CTL_STR, NET_DECNET_NODE_NAME, "node_name" }, + { CTL_STR, NET_DECNET_DEFAULT_DEVICE, "default_device" }, + { CTL_INT, NET_DECNET_TIME_WAIT, "time_wait" }, + { CTL_INT, NET_DECNET_DN_COUNT, "dn_count" }, + { CTL_INT, NET_DECNET_DI_COUNT, "di_count" }, + { CTL_INT, NET_DECNET_DR_COUNT, "dr_count" }, + { CTL_INT, NET_DECNET_DST_GC_INTERVAL, "dst_gc_interval" }, + { CTL_INT, NET_DECNET_NO_FC_MAX_CWND, "no_fc_max_cwnd" }, + { CTL_INT, NET_DECNET_MEM, "decnet_mem" }, + { CTL_INT, NET_DECNET_RMEM, "decnet_rmem" }, + { CTL_INT, NET_DECNET_WMEM, "decnet_wmem" }, + { CTL_INT, NET_DECNET_DEBUG_LEVEL, "debug" }, + {} +}; + +static const struct bin_table bin_net_sctp_table[] = { + { CTL_INT, NET_SCTP_RTO_INITIAL, "rto_initial" }, + { CTL_INT, NET_SCTP_RTO_MIN, "rto_min" }, + { CTL_INT, NET_SCTP_RTO_MAX, "rto_max" }, + { CTL_INT, NET_SCTP_RTO_ALPHA, "rto_alpha_exp_divisor" }, + { CTL_INT, NET_SCTP_RTO_BETA, "rto_beta_exp_divisor" }, + { CTL_INT, NET_SCTP_VALID_COOKIE_LIFE, "valid_cookie_life" }, + { CTL_INT, NET_SCTP_ASSOCIATION_MAX_RETRANS, "association_max_retrans" }, + { CTL_INT, NET_SCTP_PATH_MAX_RETRANS, "path_max_retrans" }, + { CTL_INT, NET_SCTP_MAX_INIT_RETRANSMITS, "max_init_retransmits" }, + { CTL_INT, NET_SCTP_HB_INTERVAL, "hb_interval" }, + { CTL_INT, NET_SCTP_PRESERVE_ENABLE, "cookie_preserve_enable" }, + { CTL_INT, NET_SCTP_MAX_BURST, "max_burst" }, + { CTL_INT, NET_SCTP_ADDIP_ENABLE, "addip_enable" }, + { CTL_INT, NET_SCTP_PRSCTP_ENABLE, "prsctp_enable" }, + { CTL_INT, NET_SCTP_SNDBUF_POLICY, "sndbuf_policy" }, + { CTL_INT, NET_SCTP_SACK_TIMEOUT, "sack_timeout" }, + { CTL_INT, NET_SCTP_RCVBUF_POLICY, "rcvbuf_policy" }, + {} +}; + +static const struct bin_table bin_net_llc_llc2_timeout_table[] = { + { CTL_INT, NET_LLC2_ACK_TIMEOUT, "ack" }, + { CTL_INT, NET_LLC2_P_TIMEOUT, "p" }, + { CTL_INT, NET_LLC2_REJ_TIMEOUT, 
"rej" }, + { CTL_INT, NET_LLC2_BUSY_TIMEOUT, "busy" }, + {} +}; + +static const struct bin_table bin_net_llc_station_table[] = { + { CTL_INT, NET_LLC_STATION_ACK_TIMEOUT, "ack_timeout" }, + {} +}; + +static const struct bin_table bin_net_llc_llc2_table[] = { + { CTL_DIR, NET_LLC2, "timeout", bin_net_llc_llc2_timeout_table }, + {} +}; + +static const struct bin_table bin_net_llc_table[] = { + { CTL_DIR, NET_LLC2, "llc2", bin_net_llc_llc2_table }, + { CTL_DIR, NET_LLC_STATION, "station", bin_net_llc_station_table }, + {} +}; + +static const struct bin_table bin_net_netfilter_table[] = { + { CTL_INT, NET_NF_CONNTRACK_MAX, "nf_conntrack_max" }, + /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT "nf_conntrack_tcp_timeout_syn_sent" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV "nf_conntrack_tcp_timeout_syn_recv" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED "nf_conntrack_tcp_timeout_established" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT "nf_conntrack_tcp_timeout_fin_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT "nf_conntrack_tcp_timeout_close_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK "nf_conntrack_tcp_timeout_last_ack" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT "nf_conntrack_tcp_timeout_time_wait" no longer used */ + /* NET_NF_CONNTRACK_TCP_TIMEOUT_CLOSE "nf_conntrack_tcp_timeout_close" no longer used */ + /* NET_NF_CONNTRACK_UDP_TIMEOUT "nf_conntrack_udp_timeout" no longer used */ + /* NET_NF_CONNTRACK_UDP_TIMEOUT_STREAM "nf_conntrack_udp_timeout_stream" no longer used */ + /* NET_NF_CONNTRACK_ICMP_TIMEOUT "nf_conntrack_icmp_timeout" no longer used */ + /* NET_NF_CONNTRACK_GENERIC_TIMEOUT "nf_conntrack_generic_timeout" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_BUCKETS, "nf_conntrack_buckets" }, + { CTL_INT, NET_NF_CONNTRACK_LOG_INVALID, "nf_conntrack_log_invalid" }, + /* NET_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS "nf_conntrack_tcp_timeout_max_retrans" no 
longer used */ + { CTL_INT, NET_NF_CONNTRACK_TCP_LOOSE, "nf_conntrack_tcp_loose" }, + { CTL_INT, NET_NF_CONNTRACK_TCP_BE_LIBERAL, "nf_conntrack_tcp_be_liberal" }, + { CTL_INT, NET_NF_CONNTRACK_TCP_MAX_RETRANS, "nf_conntrack_tcp_max_retrans" }, + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED "nf_conntrack_sctp_timeout_closed" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT "nf_conntrack_sctp_timeout_cookie_wait" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED "nf_conntrack_sctp_timeout_cookie_echoed" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED "nf_conntrack_sctp_timeout_established" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT "nf_conntrack_sctp_timeout_shutdown_sent" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD "nf_conntrack_sctp_timeout_shutdown_recd" no longer used */ + /* NET_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT "nf_conntrack_sctp_timeout_shutdown_ack_sent" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_COUNT, "nf_conntrack_count" }, + /* NET_NF_CONNTRACK_ICMPV6_TIMEOUT "nf_conntrack_icmpv6_timeout" no longer used */ + /* NET_NF_CONNTRACK_FRAG6_TIMEOUT "nf_conntrack_frag6_timeout" no longer used */ + { CTL_INT, NET_NF_CONNTRACK_FRAG6_LOW_THRESH, "nf_conntrack_frag6_low_thresh" }, + { CTL_INT, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH, "nf_conntrack_frag6_high_thresh" }, + { CTL_INT, NET_NF_CONNTRACK_CHECKSUM, "nf_conntrack_checksum" }, + + {} +}; + +static const struct bin_table bin_net_irda_table[] = { + { CTL_INT, NET_IRDA_DISCOVERY, "discovery" }, + { CTL_STR, NET_IRDA_DEVNAME, "devname" }, + { CTL_INT, NET_IRDA_DEBUG, "debug" }, + { CTL_INT, NET_IRDA_FAST_POLL, "fast_poll_increase" }, + { CTL_INT, NET_IRDA_DISCOVERY_SLOTS, "discovery_slots" }, + { CTL_INT, NET_IRDA_DISCOVERY_TIMEOUT, "discovery_timeout" }, + { CTL_INT, NET_IRDA_SLOT_TIMEOUT, "slot_timeout" }, + { CTL_INT, NET_IRDA_MAX_BAUD_RATE, "max_baud_rate" }, + { CTL_INT, NET_IRDA_MIN_TX_TURN_TIME, 
"min_tx_turn_time" }, + { CTL_INT, NET_IRDA_MAX_TX_DATA_SIZE, "max_tx_data_size" }, + { CTL_INT, NET_IRDA_MAX_TX_WINDOW, "max_tx_window" }, + { CTL_INT, NET_IRDA_MAX_NOREPLY_TIME, "max_noreply_time" }, + { CTL_INT, NET_IRDA_WARN_NOREPLY_TIME, "warn_noreply_time" }, + { CTL_INT, NET_IRDA_LAP_KEEPALIVE_TIME, "lap_keepalive_time" }, + {} +}; + +static const struct bin_table bin_net_table[] = { + { CTL_DIR, NET_CORE, "core", bin_net_core_table }, + /* NET_ETHER not used */ + /* NET_802 not used */ + { CTL_DIR, NET_UNIX, "unix", bin_net_unix_table }, + { CTL_DIR, NET_IPV4, "ipv4", bin_net_ipv4_table }, + { CTL_DIR, NET_IPX, "ipx", bin_net_ipx_table }, + { CTL_DIR, NET_ATALK, "appletalk", bin_net_atalk_table }, + { CTL_DIR, NET_NETROM, "netrom", bin_net_netrom_table }, + { CTL_DIR, NET_AX25, "ax25", bin_net_ax25_table }, + /* NET_BRIDGE "bridge" no longer used */ + { CTL_DIR, NET_ROSE, "rose", bin_net_rose_table }, + { CTL_DIR, NET_IPV6, "ipv6", bin_net_ipv6_table }, + { CTL_DIR, NET_X25, "x25", bin_net_x25_table }, + { CTL_DIR, NET_TR, "token-ring", bin_net_tr_table }, + { CTL_DIR, NET_DECNET, "decnet", bin_net_decnet_table }, + /* NET_ECONET not used */ + { CTL_DIR, NET_SCTP, "sctp", bin_net_sctp_table }, + { CTL_DIR, NET_LLC, "llc", bin_net_llc_table }, + { CTL_DIR, NET_NETFILTER, "netfilter", bin_net_netfilter_table }, + /* NET_DCCP "dccp" no longer used */ + { CTL_DIR, NET_IRDA, "irda", bin_net_irda_table }, + { CTL_INT, 2089, "nf_conntrack_max" }, + {} +}; + +static const struct bin_table bin_fs_quota_table[] = { + { CTL_INT, FS_DQ_LOOKUPS, "lookups" }, + { CTL_INT, FS_DQ_DROPS, "drops" }, + { CTL_INT, FS_DQ_READS, "reads" }, + { CTL_INT, FS_DQ_WRITES, "writes" }, + { CTL_INT, FS_DQ_CACHE_HITS, "cache_hits" }, + { CTL_INT, FS_DQ_ALLOCATED, "allocated_dquots" }, + { CTL_INT, FS_DQ_FREE, "free_dquots" }, + { CTL_INT, FS_DQ_SYNCS, "syncs" }, + { CTL_INT, FS_DQ_WARNINGS, "warnings" }, + {} +}; + +static const struct bin_table bin_fs_xfs_table[] = { + { CTL_INT, 
XFS_SGID_INHERIT, "irix_sgid_inherit" }, + { CTL_INT, XFS_SYMLINK_MODE, "irix_symlink_mode" }, + { CTL_INT, XFS_PANIC_MASK, "panic_mask" }, + + { CTL_INT, XFS_ERRLEVEL, "error_level" }, + { CTL_INT, XFS_SYNCD_TIMER, "xfssyncd_centisecs" }, + { CTL_INT, XFS_INHERIT_SYNC, "inherit_sync" }, + { CTL_INT, XFS_INHERIT_NODUMP, "inherit_nodump" }, + { CTL_INT, XFS_INHERIT_NOATIME, "inherit_noatime" }, + { CTL_INT, XFS_BUF_TIMER, "xfsbufd_centisecs" }, + { CTL_INT, XFS_BUF_AGE, "age_buffer_centisecs" }, + { CTL_INT, XFS_INHERIT_NOSYM, "inherit_nosymlinks" }, + { CTL_INT, XFS_ROTORSTEP, "rotorstep" }, + { CTL_INT, XFS_INHERIT_NODFRG, "inherit_nodefrag" }, + { CTL_INT, XFS_FILESTREAM_TIMER, "filestream_centisecs" }, + { CTL_INT, XFS_STATS_CLEAR, "stats_clear" }, + {} +}; + +static const struct bin_table bin_fs_ocfs2_nm_table[] = { + { CTL_STR, 1, "hb_ctl_path" }, + {} +}; + +static const struct bin_table bin_fs_ocfs2_table[] = { + { CTL_DIR, 1, "nm", bin_fs_ocfs2_nm_table }, + {} +}; + +static const struct bin_table bin_inotify_table[] = { + { CTL_INT, INOTIFY_MAX_USER_INSTANCES, "max_user_instances" }, + { CTL_INT, INOTIFY_MAX_USER_WATCHES, "max_user_watches" }, + { CTL_INT, INOTIFY_MAX_QUEUED_EVENTS, "max_queued_events" }, + {} +}; + +static const struct bin_table bin_fs_table[] = { + { CTL_INT, FS_NRINODE, "inode-nr" }, + { CTL_INT, FS_STATINODE, "inode-state" }, + /* FS_MAXINODE unused */ + /* FS_NRDQUOT unused */ + /* FS_MAXDQUOT unused */ + /* FS_NRFILE "file-nr" no longer used */ + { CTL_INT, FS_MAXFILE, "file-max" }, + { CTL_INT, FS_DENTRY, "dentry-state" }, + /* FS_NRSUPER unused */ + /* FS_MAXUPSER unused */ + { CTL_INT, FS_OVERFLOWUID, "overflowuid" }, + { CTL_INT, FS_OVERFLOWGID, "overflowgid" }, + { CTL_INT, FS_LEASES, "leases-enable" }, + { CTL_INT, FS_DIR_NOTIFY, "dir-notify-enable" }, + { CTL_INT, FS_LEASE_TIME, "lease-break-time" }, + { CTL_DIR, FS_DQSTATS, "quota", bin_fs_quota_table }, + { CTL_DIR, FS_XFS, "xfs", bin_fs_xfs_table }, + { CTL_ULONG, 
FS_AIO_NR, "aio-nr" }, + { CTL_ULONG, FS_AIO_MAX_NR, "aio-max-nr" }, + { CTL_DIR, FS_INOTIFY, "inotify", bin_inotify_table }, + { CTL_DIR, FS_OCFS2, "ocfs2", bin_fs_ocfs2_table }, + { CTL_INT, KERN_SETUID_DUMPABLE, "suid_dumpable" }, + {} +}; + +static const struct bin_table bin_ipmi_table[] = { + { CTL_INT, DEV_IPMI_POWEROFF_POWERCYCLE, "poweroff_powercycle" }, + {} +}; + +static const struct bin_table bin_mac_hid_files[] = { + /* DEV_MAC_HID_KEYBOARD_SENDS_LINUX_KEYCODES unused */ + /* DEV_MAC_HID_KEYBOARD_LOCK_KEYCODES unused */ + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON_EMULATION, "mouse_button_emulation" }, + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON2_KEYCODE, "mouse_button2_keycode" }, + { CTL_INT, DEV_MAC_HID_MOUSE_BUTTON3_KEYCODE, "mouse_button3_keycode" }, + /* DEV_MAC_HID_ADB_MOUSE_SENDS_KEYCODES unused */ + {} +}; + +static const struct bin_table bin_raid_table[] = { + { CTL_INT, DEV_RAID_SPEED_LIMIT_MIN, "speed_limit_min" }, + { CTL_INT, DEV_RAID_SPEED_LIMIT_MAX, "speed_limit_max" }, + {} +}; + +static const struct bin_table bin_scsi_table[] = { + { CTL_INT, DEV_SCSI_LOGGING_LEVEL, "logging_level" }, + {} +}; + +static const struct bin_table bin_dev_table[] = { + /* DEV_CDROM "cdrom" no longer used */ + /* DEV_HWMON unused */ + /* DEV_PARPORT "parport" no longer used */ + { CTL_DIR, DEV_RAID, "raid", bin_raid_table }, + { CTL_DIR, DEV_MAC_HID, "mac_hid", bin_mac_hid_files }, + { CTL_DIR, DEV_SCSI, "scsi", bin_scsi_table }, + { CTL_DIR, DEV_IPMI, "ipmi", bin_ipmi_table }, + {} +}; + +static const struct bin_table bin_bus_isa_table[] = { + { CTL_INT, BUS_ISA_MEM_BASE, "membase" }, + { CTL_INT, BUS_ISA_PORT_BASE, "portbase" }, + { CTL_INT, BUS_ISA_PORT_SHIFT, "portshift" }, + {} +}; + +static const struct bin_table bin_bus_table[] = { + { CTL_DIR, CTL_BUS_ISA, "isa", bin_bus_isa_table }, + {} +}; + + +static const struct bin_table bin_s390dbf_table[] = { + { CTL_INT, 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, + { CTL_INT, 5679 /* CTL_S390DBF_ACTIVE */, 
"debug_active" }, + {} +}; + +static const struct bin_table bin_sunrpc_table[] = { + /* CTL_RPCDEBUG "rpc_debug" no longer used */ + /* CTL_NFSDEBUG "nfs_debug" no longer used */ + /* CTL_NFSDDEBUG "nfsd_debug" no longer used */ + /* CTL_NLMDEBUG "nlm_debug" no longer used */ + + { CTL_INT, CTL_SLOTTABLE_UDP, "udp_slot_table_entries" }, + { CTL_INT, CTL_SLOTTABLE_TCP, "tcp_slot_table_entries" }, + { CTL_INT, CTL_MIN_RESVPORT, "min_resvport" }, + { CTL_INT, CTL_MAX_RESVPORT, "max_resvport" }, + {} +}; + +static const struct bin_table bin_pm_table[] = { + /* frv specific */ + /* 1 == CTL_PM_SUSPEND "suspend" no longer used" */ + { CTL_INT, 2 /* CTL_PM_CMODE */, "cmode" }, + { CTL_INT, 3 /* CTL_PM_P0 */, "p0" }, + { CTL_INT, 4 /* CTL_PM_CM */, "cm" }, + {} +}; + +static const struct bin_table bin_root_table[] = { + { CTL_DIR, CTL_KERN, "kernel", bin_kern_table }, + { CTL_DIR, CTL_VM, "vm", bin_vm_table }, + { CTL_DIR, CTL_NET, "net", bin_net_table }, + /* CTL_PROC not used */ + { CTL_DIR, CTL_FS, "fs", bin_fs_table }, + /* CTL_DEBUG "debug" no longer used */ + { CTL_DIR, CTL_DEV, "dev", bin_dev_table }, + { CTL_DIR, CTL_BUS, "bus", bin_bus_table }, + { CTL_DIR, CTL_ABI, "abi" }, + /* CTL_CPU not used */ + /* CTL_ARLAN "arlan" no longer used */ + { CTL_DIR, CTL_S390DBF, "s390dbf", bin_s390dbf_table }, + { CTL_DIR, CTL_SUNRPC, "sunrpc", bin_sunrpc_table }, + { CTL_DIR, CTL_PM, "pm", bin_pm_table }, + {} +}; + +static ssize_t bin_dir(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + return -ENOTDIR; +} + + +static ssize_t bin_string(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + ssize_t result, copied = 0; + + if (oldval && oldlen) { + char __user *lastp; + loff_t pos = 0; + int ch; + + result = vfs_read(file, oldval, oldlen, &pos); + if (result < 0) + goto out; + + copied = result; + lastp = oldval + copied - 1; + + result = -EFAULT; + if (get_user(ch, lastp)) + goto out; 
+ + /* Trim off the trailing newline */ + if (ch == '\n') { + result = -EFAULT; + if (put_user('\0', lastp)) + goto out; + copied -= 1; + } + } + + if (newval && newlen) { + loff_t pos = 0; + + result = vfs_write(file, newval, newlen, &pos); + if (result < 0) + goto out; + } + + result = copied; +out: + return result; +} + +static ssize_t bin_intvec(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t copied = 0; + char *buffer; + ssize_t result; + + result = -ENOMEM; + buffer = kmalloc(BUFSZ, GFP_KERNEL); + if (!buffer) + goto out; + + if (oldval && oldlen) { + unsigned __user *vec = oldval; + size_t length = oldlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buffer, BUFSZ - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + + str = buffer; + end = str + result; + *end++ = '\0'; + for (i = 0; i < length; i++) { + unsigned long value; + + value = simple_strtoul(str, &str, 10); + while (isspace(*str)) + str++; + + result = -EFAULT; + if (put_user(value, vec + i)) + goto out_kfree; + + copied += sizeof(*vec); + if (!isdigit(*str)) + break; + } + } + + if (newval && newlen) { + unsigned __user *vec = newval; + size_t length = newlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + str = buffer; + end = str + BUFSZ; + for (i = 0; i < length; i++) { + unsigned long value; + + result = -EFAULT; + if (get_user(value, vec + i)) + goto out_kfree; + + str += snprintf(str, end - str, "%lu\t", value); + } + + set_fs(KERNEL_DS); + result = vfs_write(file, buffer, str - buffer, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + } + result = copied; +out_kfree: + kfree(buffer); +out: + return result; +} + +static ssize_t bin_ulongvec(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t copied = 0; + 
char *buffer; + ssize_t result; + + result = -ENOMEM; + buffer = kmalloc(BUFSZ, GFP_KERNEL); + if (!buffer) + goto out; + + if (oldval && oldlen) { + unsigned long __user *vec = oldval; + size_t length = oldlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buffer, BUFSZ - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + + str = buffer; + end = str + result; + *end++ = '\0'; + for (i = 0; i < length; i++) { + unsigned long value; + + value = simple_strtoul(str, &str, 10); + while (isspace(*str)) + str++; + + result = -EFAULT; + if (put_user(value, vec + i)) + goto out_kfree; + + copied += sizeof(*vec); + if (!isdigit(*str)) + break; + } + } + + if (newval && newlen) { + unsigned long __user *vec = newval; + size_t length = newlen / sizeof(*vec); + loff_t pos = 0; + char *str, *end; + int i; + + str = buffer; + end = str + BUFSZ; + for (i = 0; i < length; i++) { + unsigned long value; + + result = -EFAULT; + if (get_user(value, vec + i)) + goto out_kfree; + + /* + * scnprintf() returns the bytes actually stored, so str + * can never advance past end on a long vector. + */ + str += scnprintf(str, end - str, "%lu\t", value); + } + + set_fs(KERNEL_DS); + result = vfs_write(file, buffer, str - buffer, &pos); + set_fs(old_fs); + if (result < 0) + goto out_kfree; + } + result = copied; +out_kfree: + kfree(buffer); +out: + return result; +} + +static ssize_t bin_uuid(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t result, copied = 0; + + /* Only supports reads */ + if (oldval && oldlen) { + loff_t pos = 0; + char buf[40], *str = buf; + unsigned char uuid[16]; + int i; + + set_fs(KERNEL_DS); + result = vfs_read(file, buf, sizeof(buf) - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + + buf[result] = '\0'; + + /* Convert the uuid from a string to binary */ + for (i = 0; i < 16; i++) { + result = -EIO; + if (!isxdigit(str[0]) || !isxdigit(str[1])) + goto out; + + uuid[i] = (hex_to_bin(str[0]) << 4) | +
hex_to_bin(str[1]); + str += 2; + if (*str == '-') + str++; + } + + if (oldlen > 16) + oldlen = 16; + + result = -EFAULT; + if (copy_to_user(oldval, uuid, oldlen)) + goto out; + + copied = oldlen; + } + result = copied; +out: + return result; +} + +/* + * Convert between the "area.node" text of the proc file and a packed + * little-endian 16-bit DECnet address ((area << 10) | node) in user memory. + * Returns the number of bytes stored in oldval, or a negative errno. + */ +static ssize_t bin_dn_node_address(struct file *file, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + mm_segment_t old_fs = get_fs(); + ssize_t result, copied = 0; + + if (oldval && oldlen) { + loff_t pos = 0; + char buf[15], *nodep; + unsigned long area, node; + __le16 dnaddr; + + set_fs(KERNEL_DS); + result = vfs_read(file, buf, sizeof(buf) - 1, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + + buf[result] = '\0'; + + /* Convert the decnet address to binary */ + result = -EIO; + /* Test for a missing '.' before stepping past it */ + nodep = strchr(buf, '.'); + if (!nodep) + goto out; + ++nodep; + + area = simple_strtoul(buf, NULL, 10); + node = simple_strtoul(nodep, NULL, 10); + + result = -EIO; + if ((area > 63)||(node > 1023)) + goto out; + + dnaddr = cpu_to_le16((area << 10) | node); + + result = -EFAULT; + if (put_user(dnaddr, (__le16 __user *)oldval)) + goto out; + + copied = sizeof(dnaddr); + } + + if (newval && newlen) { + loff_t pos = 0; + __le16 dnaddr; + char buf[15]; + int len; + + result = -EINVAL; + if (newlen != sizeof(dnaddr)) + goto out; + + result = -EFAULT; + if (get_user(dnaddr, (__le16 __user *)newval)) + goto out; + + len = snprintf(buf, sizeof(buf), "%hu.%hu", + le16_to_cpu(dnaddr) >> 10, + le16_to_cpu(dnaddr) & 0x3ff); + + set_fs(KERNEL_DS); + result = vfs_write(file, buf, len, &pos); + set_fs(old_fs); + if (result < 0) + goto out; + } + + result = copied; +out: + return result; +} + +static const struct bin_table *get_sysctl(const int *name, int nlen, char *path) +{ + const struct bin_table *table = &bin_root_table[0]; + int ctl_name; + + /* The binary sysctl tables have a small maximum depth so + * there is no danger of overflowing our path as it PATH_MAX + * bytes long.
+ */ + memcpy(path, "sys/", 4); + path += 4; + +repeat: + if (!nlen) + return ERR_PTR(-ENOTDIR); + ctl_name = *name; + name++; + nlen--; + for ( ; table->convert; table++) { + int len = 0; + + /* + * For a wild card entry map from ifindex to network + * device name. + */ + if (!table->ctl_name) { +#ifdef CONFIG_NET + struct net *net = current->nsproxy->net_ns; + struct net_device *dev; + dev = dev_get_by_index(net, ctl_name); + if (dev) { + len = strlen(dev->name); + memcpy(path, dev->name, len); + dev_put(dev); + } +#endif + /* Use the well known sysctl number to proc name mapping */ + } else if (ctl_name == table->ctl_name) { + len = strlen(table->procname); + memcpy(path, table->procname, len); + } + if (len) { + path += len; + if (table->child) { + *path++ = '/'; + table = table->child; + goto repeat; + } + *path = '\0'; + return table; + } + } + return ERR_PTR(-ENOTDIR); +} + +static char *sysctl_getname(const int *name, int nlen, const struct bin_table **tablep) +{ + char *tmp, *result; + + result = ERR_PTR(-ENOMEM); + tmp = __getname(); + if (tmp) { + const struct bin_table *table = get_sysctl(name, nlen, tmp); + result = tmp; + *tablep = table; + if (IS_ERR(table)) { + __putname(tmp); + result = ERR_CAST(table); + } + } + return result; +} + +static ssize_t binary_sysctl(const int *name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + const struct bin_table *table = NULL; + struct vfsmount *mnt; + struct file *file; + ssize_t result; + char *pathname; + int flags; + + pathname = sysctl_getname(name, nlen, &table); + result = PTR_ERR(pathname); + if (IS_ERR(pathname)) + goto out; + + /* How should the sysctl be accessed? 
*/ + if (oldval && oldlen && newval && newlen) { + flags = O_RDWR; + } else if (newval && newlen) { + flags = O_WRONLY; + } else if (oldval && oldlen) { + flags = O_RDONLY; + } else { + result = 0; + goto out_putname; + } + + mnt = current->nsproxy->pid_ns->proc_mnt; + file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + result = PTR_ERR(file); + if (IS_ERR(file)) + goto out_putname; + + result = table->convert(file, oldval, oldlen, newval, newlen); + + fput(file); +out_putname: + __putname(pathname); +out: + return result; +} + + +#else /* CONFIG_SYSCTL_SYSCALL */ + +static ssize_t binary_sysctl(const int *name, int nlen, + void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) +{ + return -ENOSYS; +} + +#endif /* CONFIG_SYSCTL_SYSCALL */ + + +static void deprecated_sysctl_warning(const int *name, int nlen) +{ + int i; + + /* + * CTL_KERN/KERN_VERSION is used by older glibc and cannot + * ever go away. + */ + if (name[0] == CTL_KERN && name[1] == KERN_VERSION) + return; + + if (printk_ratelimit()) { + printk(KERN_INFO + "warning: process `%s' used the deprecated sysctl " + "system call with ", current->comm); + for (i = 0; i < nlen; i++) + printk("%d.", name[i]); + printk("\n"); + } + return; +} + +#define WARN_ONCE_HASH_BITS 8 +#define WARN_ONCE_HASH_SIZE (1<nlen. 
*/ + if (nlen < 0 || nlen > CTL_MAXNAME) + return -ENOTDIR; + /* Read in the sysctl name for simplicity */ + for (i = 0; i < nlen; i++) + if (get_user(name[i], args_name + i)) + return -EFAULT; + + warn_on_bintable(name, nlen); + + return binary_sysctl(name, nlen, oldval, oldlen, newval, newlen); +} + +SYSCALL_DEFINE1(sysctl, struct __sysctl_args __user *, args) +{ + struct __sysctl_args tmp; + size_t oldlen = 0; + ssize_t result; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + if (tmp.oldval && !tmp.oldlenp) + return -EFAULT; + + if (tmp.oldlenp && get_user(oldlen, tmp.oldlenp)) + return -EFAULT; + + result = do_sysctl(tmp.name, tmp.nlen, tmp.oldval, oldlen, + tmp.newval, tmp.newlen); + + if (result >= 0) { + oldlen = result; + result = 0; + } + + if (tmp.oldlenp && put_user(oldlen, tmp.oldlenp)) + return -EFAULT; + + return result; +} + + +#ifdef CONFIG_COMPAT +#include + +struct compat_sysctl_args { + compat_uptr_t name; + int nlen; + compat_uptr_t oldval; + compat_uptr_t oldlenp; + compat_uptr_t newval; + compat_size_t newlen; + compat_ulong_t __unused[4]; +}; + +asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args) +{ + struct compat_sysctl_args tmp; + compat_size_t __user *compat_oldlenp; + size_t oldlen = 0; + ssize_t result; + + if (copy_from_user(&tmp, args, sizeof(tmp))) + return -EFAULT; + + if (tmp.oldval && !tmp.oldlenp) + return -EFAULT; + + compat_oldlenp = compat_ptr(tmp.oldlenp); + if (compat_oldlenp && get_user(oldlen, compat_oldlenp)) + return -EFAULT; + + result = do_sysctl(compat_ptr(tmp.name), tmp.nlen, + compat_ptr(tmp.oldval), oldlen, + compat_ptr(tmp.newval), tmp.newlen); + + if (result >= 0) { + oldlen = result; + result = 0; + } + + if (compat_oldlenp && put_user(oldlen, compat_oldlenp)) + return -EFAULT; + + return result; +} + +#endif /* CONFIG_COMPAT */ diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c new file mode 100644 index 00000000..4e4932a7 --- /dev/null +++ 
b/kernel/sysctl_check.c @@ -0,0 +1,160 @@ +#include +#include +#include "../fs/xfs/linux-2.6/xfs_sysctl.h" +#include +#include +#include + + +static int sysctl_depth(struct ctl_table *table) +{ + struct ctl_table *tmp; + int depth; + + depth = 0; + for (tmp = table; tmp->parent; tmp = tmp->parent) + depth++; + + return depth; +} + +static struct ctl_table *sysctl_parent(struct ctl_table *table, int n) +{ + int i; + + for (i = 0; table && i < n; i++) + table = table->parent; + + return table; +} + + +static void sysctl_print_path(struct ctl_table *table) +{ + struct ctl_table *tmp; + int depth, i; + depth = sysctl_depth(table); + if (table->procname) { + for (i = depth; i >= 0; i--) { + tmp = sysctl_parent(table, i); + printk("/%s", tmp->procname?tmp->procname:""); + } + } + printk(" "); +} + +static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, + struct ctl_table *table) +{ + struct ctl_table_header *head; + struct ctl_table *ref, *test; + int depth, cur_depth; + + depth = sysctl_depth(table); + + for (head = __sysctl_head_next(namespaces, NULL); head; + head = __sysctl_head_next(namespaces, head)) { + cur_depth = depth; + ref = head->ctl_table; +repeat: + test = sysctl_parent(table, cur_depth); + for (; ref->procname; ref++) { + int match = 0; + if (cur_depth && !ref->child) + continue; + + if (test->procname && ref->procname && + (strcmp(test->procname, ref->procname) == 0)) + match++; + + if (match) { + if (cur_depth != 0) { + cur_depth--; + ref = ref->child; + goto repeat; + } + goto out; + } + } + } + ref = NULL; +out: + sysctl_head_finish(head); + return ref; +} + +static void set_fail(const char **fail, struct ctl_table *table, const char *str) +{ + if (*fail) { + printk(KERN_ERR "sysctl table check failed: "); + sysctl_print_path(table); + printk(" %s\n", *fail); + dump_stack(); + } + *fail = str; +} + +static void sysctl_check_leaf(struct nsproxy *namespaces, + struct ctl_table *table, const char **fail) +{ + struct ctl_table *ref; + + 
ref = sysctl_check_lookup(namespaces, table); + if (ref && (ref != table)) + set_fail(fail, table, "Sysctl already exists"); +} + +int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) +{ + int error = 0; + for (; table->procname; table++) { + const char *fail = NULL; + + if (table->parent) { + if (!table->parent->procname) + set_fail(&fail, table, "Parent without procname"); + } + if (table->child) { + if (table->data) + set_fail(&fail, table, "Directory with data?"); + if (table->maxlen) + set_fail(&fail, table, "Directory with maxlen?"); + if ((table->mode & (S_IRUGO|S_IXUGO)) != table->mode) + set_fail(&fail, table, "Writable sysctl directory"); + if (table->proc_handler) + set_fail(&fail, table, "Directory with proc_handler"); + if (table->extra1) + set_fail(&fail, table, "Directory with extra1"); + if (table->extra2) + set_fail(&fail, table, "Directory with extra2"); + } else { + if ((table->proc_handler == proc_dostring) || + (table->proc_handler == proc_dointvec) || + (table->proc_handler == proc_dointvec_minmax) || + (table->proc_handler == proc_dointvec_jiffies) || + (table->proc_handler == proc_dointvec_userhz_jiffies) || + (table->proc_handler == proc_dointvec_ms_jiffies) || + (table->proc_handler == proc_doulongvec_minmax) || + (table->proc_handler == proc_doulongvec_ms_jiffies_minmax)) { + if (!table->data) + set_fail(&fail, table, "No data"); + if (!table->maxlen) + set_fail(&fail, table, "No maxlen"); + } +#ifdef CONFIG_PROC_SYSCTL + if (!table->proc_handler) + set_fail(&fail, table, "No proc_handler"); +#endif + sysctl_check_leaf(namespaces, table, &fail); + } + if (table->mode > 0777) + set_fail(&fail, table, "bogus .mode"); + if (fail) { + set_fail(&fail, table, NULL); + error = -EINVAL; + } + if (table->child) + error |= sysctl_check_table(namespaces, table->child); + } + return error; +} diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 00000000..8d597b19 --- /dev/null +++ b/kernel/taskstats.c 
@@ -0,0 +1,711 @@ +/* + * taskstats.c - Export per-task statistics to userland + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + +static DEFINE_PER_CPU(__u32, taskstats_seqnum); +static int family_registered; +struct kmem_cache *taskstats_cache; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, +}; + +static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { + [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { + [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, +}; + +struct listener { + struct list_head list; + pid_t pid; + char valid; +}; + +struct listener_list { + struct rw_semaphore sem; + struct list_head list; +}; +static DEFINE_PER_CPU(struct listener_list, listener_array); + 
+enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; + +static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, + size_t size) +{ + struct sk_buff *skb; + void *reply; + + /* + * If new attributes are added, please revisit this allocation + */ + skb = genlmsg_new(size, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + if (!info) { + int seq = this_cpu_inc_return(taskstats_seqnum) - 1; + + reply = genlmsg_put(skb, 0, seq, &family, 0, cmd); + } else + reply = genlmsg_put_reply(skb, info, &family, 0, cmd); + if (reply == NULL) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + return 0; +} + +/* + * Send taskstats data in @skb to listener with nl_pid @pid + */ +static int send_reply(struct sk_buff *skb, struct genl_info *info) +{ + struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); + void *reply = genlmsg_data(genlhdr); + int rc; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + return genlmsg_reply(skb, info); +} + +/* + * Send taskstats data in @skb to listeners registered for @cpu's exit data + */ +static void send_cpu_listeners(struct sk_buff *skb, + struct listener_list *listeners) +{ + struct genlmsghdr *genlhdr = nlmsg_data(nlmsg_hdr(skb)); + struct listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, delcount = 0; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return; + } + + rc = 0; + down_read(&listeners->sem); + list_for_each_entry(s, &listeners->list, list) { + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) + break; + } + rc = genlmsg_unicast(&init_net, skb_cur, s->pid); + if (rc == -ECONNREFUSED) { + s->valid = 0; + delcount++; + } + skb_cur = skb_next; + } + up_read(&listeners->sem); + + if (skb_cur) + nlmsg_free(skb_cur); + + if (!delcount) + return; + + /* Delete invalidated entries */ + 
down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (!s->valid) { + list_del(&s->list); + kfree(s); + } + } + up_write(&listeners->sem); +} + +static void fill_stats(struct task_struct *tsk, struct taskstats *stats) +{ + memset(stats, 0, sizeof(*stats)); + /* + * Each accounting subsystem adds calls to its functions to + * fill in relevant parts of struct taskstsats as follows + * + * per-task-foo(stats, tsk); + */ + + delayacct_add_tsk(stats, tsk); + + /* fill in basic acct fields */ + stats->version = TASKSTATS_VERSION; + stats->nvcsw = tsk->nvcsw; + stats->nivcsw = tsk->nivcsw; + bacct_add_tsk(stats, tsk); + + /* fill in extended acct fields */ + xacct_add_tsk(stats, tsk); +} + +static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) +{ + struct task_struct *tsk; + + rcu_read_lock(); + tsk = find_task_by_vpid(pid); + if (tsk) + get_task_struct(tsk); + rcu_read_unlock(); + if (!tsk) + return -ESRCH; + fill_stats(tsk, stats); + put_task_struct(tsk); + return 0; +} + +static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) +{ + struct task_struct *tsk, *first; + unsigned long flags; + int rc = -ESRCH; + + /* + * Add additional stats from live tasks except zombie thread group + * leaders who are already counted with the dead tasks + */ + rcu_read_lock(); + first = find_task_by_vpid(tgid); + + if (!first || !lock_task_sighand(first, &flags)) + goto out; + + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + else + memset(stats, 0, sizeof(*stats)); + + tsk = first; + do { + if (tsk->exit_state) + continue; + /* + * Accounting subsystem can call its functions here to + * fill in relevant parts of struct taskstsats as follows + * + * per-task-foo(stats, tsk); + */ + delayacct_add_tsk(stats, tsk); + + stats->nvcsw += tsk->nvcsw; + stats->nivcsw += tsk->nivcsw; + } while_each_thread(first, tsk); + + unlock_task_sighand(first, &flags); + rc = 0; +out: + rcu_read_unlock(); + + 
stats->version = TASKSTATS_VERSION; + /* + * Accounting subsystems can also add calls here to modify + * fields of taskstats. + */ + return rc; +} + +static void fill_tgid_exit(struct task_struct *tsk) +{ + unsigned long flags; + + spin_lock_irqsave(&tsk->sighand->siglock, flags); + if (!tsk->signal->stats) + goto ret; + + /* + * Each accounting subsystem calls its functions here to + * accumulate its per-task stats for tsk, into the per-tgid structure + * + * per-task-foo(tsk->signal->stats, tsk); + */ + delayacct_add_tsk(tsk->signal->stats, tsk); +ret: + spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + return; +} + +/* + * Add (isadd == REGISTER) or remove a listener entry for @pid on the + * per-cpu listener list of every cpu in @mask. Returns 0 on success, + * -EINVAL if @mask is not a subset of the possible cpus, or -ENOMEM if an + * entry cannot be allocated (entries for @pid on the cpus in @mask are + * removed again in that case). + */ +static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd) +{ + struct listener_list *listeners; + struct listener *s, *tmp, *s2; + unsigned int cpu; + int ret = 0; + + if (!cpumask_subset(mask, cpu_possible_mask)) + return -EINVAL; + + s = NULL; + if (isadd == REGISTER) { + for_each_cpu(cpu, mask) { + if (!s) + s = kmalloc_node(sizeof(struct listener), + GFP_KERNEL, cpu_to_node(cpu)); + if (!s) { + /* Report the failure instead of returning 0 */ + ret = -ENOMEM; + goto cleanup; + } + s->pid = pid; + INIT_LIST_HEAD(&s->list); + s->valid = 1; + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s2, tmp, &listeners->list, list) { + if (s2->pid == pid) + goto next_cpu; + } + list_add(&s->list, &listeners->list); + s = NULL; +next_cpu: + up_write(&listeners->sem); + } + kfree(s); + return 0; + } + + /* Deregister or cleanup */ +cleanup: + for_each_cpu(cpu, mask) { + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return ret; +} + +static int parse(struct nlattr *na, struct cpumask *mask) +{ + char *data; + int len; + int ret; + + if (na == NULL) + return 1; + len = nla_len(na); + if (len > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + if (len < 1) + return -EINVAL; + data =
kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + nla_strlcpy(data, na, len); + ret = cpulist_parse(data, mask); + kfree(data); + return ret; +} + +#if defined(CONFIG_64BIT) && !defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#define TASKSTATS_NEEDS_PADDING 1 +#endif + +static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid) +{ + struct nlattr *na, *ret; + int aggr; + + aggr = (type == TASKSTATS_TYPE_PID) + ? TASKSTATS_TYPE_AGGR_PID + : TASKSTATS_TYPE_AGGR_TGID; + + /* + * The taskstats structure is internally aligned on 8 byte + * boundaries but the layout of the aggregrate reply, with + * two NLA headers and the pid (each 4 bytes), actually + * force the entire structure to be unaligned. This causes + * the kernel to issue unaligned access warnings on some + * architectures like ia64. Unfortunately, some software out there + * doesn't properly unroll the NLA packet and assumes that the start + * of the taskstats structure will always be 20 bytes from the start + * of the netlink payload. Aligning the start of the taskstats + * structure breaks this software, which we don't want. So, for now + * the alignment only happens on architectures that require it + * and those users will have to update to fixed versions of those + * packages. Space is reserved in the packet only when needed. + * This ifdef should be removed in several years e.g. 2012 once + * we can be confident that fixed versions are installed on most + * systems. We add the padding before the aggregate since the + * aggregate is already a defined type. 
+ */ +#ifdef TASKSTATS_NEEDS_PADDING + if (nla_put(skb, TASKSTATS_TYPE_NULL, 0, NULL) < 0) + goto err; +#endif + na = nla_nest_start(skb, aggr); + if (!na) + goto err; + + if (nla_put(skb, type, sizeof(pid), &pid) < 0) + goto err; + ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats)); + if (!ret) + goto err; + nla_nest_end(skb, na); + + return nla_data(ret); +err: + return NULL; +} + +/* + * Handle CGROUPSTATS_CMD_GET: look up the file for the fd carried in + * CGROUPSTATS_CMD_ATTR_FD, fill a struct cgroupstats for its dentry with + * cgroupstats_build() and send it back as a CGROUPSTATS_CMD_NEW reply. + * Returns 0 or a negative errno; an fd that does not resolve returns 0. + */ +static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct cgroupstats *stats; + struct nlattr *na; + size_t size; + u32 fd; + struct file *file; + int fput_needed; + + na = info->attrs[CGROUPSTATS_CMD_ATTR_FD]; + if (!na) + return -EINVAL; + + fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); + file = fget_light(fd, &fput_needed); + if (!file) + return 0; + + size = nla_total_size(sizeof(struct cgroupstats)); + + rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, + size); + if (rc < 0) + goto err; + + na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, + sizeof(struct cgroupstats)); + /* Do not hand a failed reservation to nla_data() */ + if (na == NULL) { + nlmsg_free(rep_skb); + rc = -EMSGSIZE; + goto err; + } + stats = nla_data(na); + memset(stats, 0, sizeof(*stats)); + + rc = cgroupstats_build(stats, file->f_dentry); + if (rc < 0) { + nlmsg_free(rep_skb); + goto err; + } + + rc = send_reply(rep_skb, info); + +err: + fput_light(file, fput_needed); + return rc; +} + +static int cmd_attr_register_cpumask(struct genl_info *info) +{ + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask); + if (rc < 0) + goto out; + rc = add_del_listener(info->snd_pid, mask, REGISTER); +out: + free_cpumask_var(mask); + return rc; +} + +static int cmd_attr_deregister_cpumask(struct genl_info *info) +{ + cpumask_var_t mask; + int rc; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask); + if (rc < 0) + goto out; + rc =
add_del_listener(info->snd_pid, mask, DEREGISTER); +out: + free_cpumask_var(mask); + return rc; +} + +static size_t taskstats_packet_size(void) +{ + size_t size; + + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); +#ifdef TASKSTATS_NEEDS_PADDING + size += nla_total_size(0); /* Padding for alignment */ +#endif + return size; +} + +static int cmd_attr_pid(struct genl_info *info) +{ + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + u32 pid; + int rc; + + size = taskstats_packet_size(); + + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); + if (rc < 0) + return rc; + + rc = -EINVAL; + pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid); + if (!stats) + goto err; + + rc = fill_stats_for_pid(pid, stats); + if (rc < 0) + goto err; + return send_reply(rep_skb, info); +err: + nlmsg_free(rep_skb); + return rc; +} + +static int cmd_attr_tgid(struct genl_info *info) +{ + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + u32 tgid; + int rc; + + size = taskstats_packet_size(); + + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size); + if (rc < 0) + return rc; + + rc = -EINVAL; + tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid); + if (!stats) + goto err; + + rc = fill_stats_for_tgid(tgid, stats); + if (rc < 0) + goto err; + return send_reply(rep_skb, info); +err: + nlmsg_free(rep_skb); + return rc; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK]) + return cmd_attr_register_cpumask(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK]) + return cmd_attr_deregister_cpumask(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_PID]) + return cmd_attr_pid(info); + else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) + return cmd_attr_tgid(info); + else + return 
-EINVAL; +} + +static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk) +{ + struct signal_struct *sig = tsk->signal; + struct taskstats *stats; + + if (sig->stats || thread_group_empty(tsk)) + goto ret; + + /* No problem if kmem_cache_zalloc() fails */ + stats = kmem_cache_zalloc(taskstats_cache, GFP_KERNEL); + + spin_lock_irq(&tsk->sighand->siglock); + if (!sig->stats) { + sig->stats = stats; + stats = NULL; + } + spin_unlock_irq(&tsk->sighand->siglock); + + if (stats) + kmem_cache_free(taskstats_cache, stats); +ret: + return sig->stats; +} + +/* Send pid data out on exit */ +void taskstats_exit(struct task_struct *tsk, int group_dead) +{ + int rc; + struct listener_list *listeners; + struct taskstats *stats; + struct sk_buff *rep_skb; + size_t size; + int is_thread_group; + + if (!family_registered) + return; + + /* + * Size includes space for nested attributes + */ + size = taskstats_packet_size(); + + is_thread_group = !!taskstats_tgid_alloc(tsk); + if (is_thread_group) { + /* PID + STATS + TGID + STATS */ + size = 2 * size; + /* fill the tsk->signal->stats structure */ + fill_tgid_exit(tsk); + } + + listeners = __this_cpu_ptr(&listener_array); + if (list_empty(&listeners->list)) + return; + + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, size); + if (rc < 0) + return; + + stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, tsk->pid); + if (!stats) + goto err; + + fill_stats(tsk, stats); + + /* + * Doesn't matter if tsk is the leader or the last group member leaving + */ + if (!is_thread_group || !group_dead) + goto send; + + stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tsk->tgid); + if (!stats) + goto err; + + memcpy(stats, tsk->signal->stats, sizeof(*stats)); + +send: + send_cpu_listeners(rep_skb, listeners); + return; +err: + nlmsg_free(rep_skb); +} + +static struct genl_ops taskstats_ops = { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_user_cmd, + .policy = taskstats_cmd_get_policy, + .flags = GENL_ADMIN_PERM, +}; + +static struct 
genl_ops cgroupstats_ops = { + .cmd = CGROUPSTATS_CMD_GET, + .doit = cgroupstats_user_cmd, + .policy = cgroupstats_cmd_get_policy, +}; + +/* Needed early in initialization */ +void __init taskstats_init_early(void) +{ + unsigned int i; + + taskstats_cache = KMEM_CACHE(taskstats, SLAB_PANIC); + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } +} + +static int __init taskstats_init(void) +{ + int rc; + + rc = genl_register_family(&family); + if (rc) + return rc; + + rc = genl_register_ops(&family, &taskstats_ops); + if (rc < 0) + goto err; + + rc = genl_register_ops(&family, &cgroupstats_ops); + if (rc < 0) + goto err_cgroup_ops; + + family_registered = 1; + pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION); + return 0; +err_cgroup_ops: + genl_unregister_ops(&family, &taskstats_ops); +err: + genl_unregister_family(&family); + return rc; +} + +/* + * late initcall ensures initialization of statistics collection + * mechanisms precedes initialization of the taskstats interface + */ +late_initcall(taskstats_init); diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c new file mode 100644 index 00000000..f8b11a28 --- /dev/null +++ b/kernel/test_kprobes.c @@ -0,0 +1,414 @@ +/* + * test_kprobes.c - simple sanity test for *probes + * + * Copyright IBM Corp. 2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. 
+ */ + +#include +#include +#include + +#define div_factor 3 + +static u32 rand1, preh_val, posth_val, jph_val; +static int errors, handler_errors, num_tests; +static u32 (*target)(u32 value); +static u32 (*target2)(u32 value); + +static noinline u32 kprobe_target(u32 value) +{ + return (value / div_factor); +} + +static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + preh_val = (rand1 / div_factor); + return 0; +} + +static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + if (preh_val != (rand1 / div_factor)) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in post_handler\n"); + } + posth_val = preh_val + div_factor; +} + +static struct kprobe kp = { + .symbol_name = "kprobe_target", + .pre_handler = kp_pre_handler, + .post_handler = kp_post_handler +}; + +static int test_kprobe(void) +{ + int ret; + + ret = register_kprobe(&kp); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kprobe returned %d\n", ret); + return ret; + } + + ret = target(rand1); + unregister_kprobe(&kp); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler not called\n"); + handler_errors++; + } + + return 0; +} + +static noinline u32 kprobe_target2(u32 value) +{ + return (value / div_factor) + 1; +} + +static int kp_pre_handler2(struct kprobe *p, struct pt_regs *regs) +{ + preh_val = (rand1 / div_factor) + 1; + return 0; +} + +static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs, + unsigned long flags) +{ + if (preh_val != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in post_handler2\n"); + } + posth_val = preh_val + div_factor; +} + +static struct kprobe kp2 = { + .symbol_name = "kprobe_target2", + .pre_handler = 
kp_pre_handler2, + .post_handler = kp_post_handler2 +}; + +static int test_kprobes(void) +{ + int ret; + struct kprobe *kps[2] = {&kp, &kp2}; + + /* addr and flags should be cleared for reusing kprobe. */ + kp.addr = NULL; + kp.flags = 0; + ret = register_kprobes(kps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kprobes returned %d\n", ret); + return ret; + } + + preh_val = 0; + posth_val = 0; + ret = target(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler not called\n"); + handler_errors++; + } + + preh_val = 0; + posth_val = 0; + ret = target2(rand1); + + if (preh_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe pre_handler2 not called\n"); + handler_errors++; + } + + if (posth_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kprobe post_handler2 not called\n"); + handler_errors++; + } + + unregister_kprobes(kps, 2); + return 0; + +} + +static u32 j_kprobe_target(u32 value) +{ + if (value != rand1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in jprobe handler\n"); + } + + jph_val = rand1; + jprobe_return(); + return 0; +} + +static struct jprobe jp = { + .entry = j_kprobe_target, + .kp.symbol_name = "kprobe_target" +}; + +static int test_jprobe(void) +{ + int ret; + + ret = register_jprobe(&jp); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_jprobe returned %d\n", ret); + return ret; + } + + ret = target(rand1); + unregister_jprobe(&jp); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler not called\n"); + handler_errors++; + } + + return 0; +} + +static struct jprobe jp2 = { + .entry = j_kprobe_target, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_jprobes(void) +{ + int ret; + struct jprobe 
*jps[2] = {&jp, &jp2}; + + /* addr and flags should be cleared for reusing kprobe. */ + jp.kp.addr = NULL; + jp.kp.flags = 0; + ret = register_jprobes(jps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_jprobes returned %d\n", ret); + return ret; + } + + jph_val = 0; + ret = target(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler not called\n"); + handler_errors++; + } + + jph_val = 0; + ret = target2(rand1); + if (jph_val == 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "jprobe handler2 not called\n"); + handler_errors++; + } + unregister_jprobes(jps, 2); + + return 0; +} +#ifdef CONFIG_KRETPROBES +static u32 krph_val; + +static int entry_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + krph_val = (rand1 / div_factor); + return 0; +} + +static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long ret = regs_return_value(regs); + + if (ret != (rand1 / div_factor)) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in kretprobe handler\n"); + } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } + + krph_val = rand1; + return 0; +} + +static struct kretprobe rp = { + .handler = return_handler, + .entry_handler = entry_handler, + .kp.symbol_name = "kprobe_target" +}; + +static int test_kretprobe(void) +{ + int ret; + + ret = register_kretprobe(&rp); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kretprobe returned %d\n", ret); + return ret; + } + + ret = target(rand1); + unregister_kretprobe(&rp); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler not called\n"); + handler_errors++; + } + + return 0; +} + +static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + unsigned long ret = regs_return_value(regs); 
+ + if (ret != (rand1 / div_factor) + 1) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "incorrect value in kretprobe handler2\n"); + } + if (krph_val == 0) { + handler_errors++; + printk(KERN_ERR "Kprobe smoke test failed: " + "call to kretprobe entry handler failed\n"); + } + + krph_val = rand1; + return 0; +} + +static struct kretprobe rp2 = { + .handler = return_handler2, + .entry_handler = entry_handler, + .kp.symbol_name = "kprobe_target2" +}; + +static int test_kretprobes(void) +{ + int ret; + struct kretprobe *rps[2] = {&rp, &rp2}; + + /* addr and flags should be cleared for reusing kprobe. */ + rp.kp.addr = NULL; + rp.kp.flags = 0; + ret = register_kretprobes(rps, 2); + if (ret < 0) { + printk(KERN_ERR "Kprobe smoke test failed: " + "register_kretprobe returned %d\n", ret); + return ret; + } + + krph_val = 0; + ret = target(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler not called\n"); + handler_errors++; + } + + krph_val = 0; + ret = target2(rand1); + if (krph_val != rand1) { + printk(KERN_ERR "Kprobe smoke test failed: " + "kretprobe handler2 not called\n"); + handler_errors++; + } + unregister_kretprobes(rps, 2); + return 0; +} +#endif /* CONFIG_KRETPROBES */ + +int init_test_probes(void) +{ + int ret; + + target = kprobe_target; + target2 = kprobe_target2; + + do { + rand1 = random32(); + } while (rand1 <= div_factor); + + printk(KERN_INFO "Kprobe smoke test started\n"); + num_tests++; + ret = test_kprobe(); + if (ret < 0) + errors++; + + num_tests++; + ret = test_kprobes(); + if (ret < 0) + errors++; + + num_tests++; + ret = test_jprobe(); + if (ret < 0) + errors++; + + num_tests++; + ret = test_jprobes(); + if (ret < 0) + errors++; + +#ifdef CONFIG_KRETPROBES + num_tests++; + ret = test_kretprobe(); + if (ret < 0) + errors++; + + num_tests++; + ret = test_kretprobes(); + if (ret < 0) + errors++; +#endif /* CONFIG_KRETPROBES */ + + if (errors) + printk(KERN_ERR "BUG: Kprobe 
smoke test: %d out of " + "%d tests failed\n", errors, num_tests); + else if (handler_errors) + printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " + "running handlers\n", handler_errors); + else + printk(KERN_INFO "Kprobe smoke test passed successfully\n"); + + return 0; +} diff --git a/kernel/time.c b/kernel/time.c new file mode 100644 index 00000000..d7760621 --- /dev/null +++ b/kernel/time.c @@ -0,0 +1,711 @@ +/* + * linux/kernel/time.c + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * This file contains the interface functions for the various + * time related system calls: time, stime, gettimeofday, settimeofday, + * adjtime + */ +/* + * Modification history kernel/time.c + * + * 1993-09-02 Philip Gladstone + * Created file with time related functions from sched.c and adjtimex() + * 1993-10-08 Torsten Duwe + * adjtime interface update and CMOS clock write code + * 1995-08-13 Torsten Duwe + * kernel PLL updated to 1994-12-13 specs (rfc-1589) + * 1999-01-16 Ulrich Windl + * Introduced error checking for many cases in adjtimex(). + * Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * Allow time_constant larger than MAXTC(6) for NTP v4 (MAXTC == 10) + * (Even though the technical memorandum forbids it) + * 2004-07-14 Christoph Lameter + * Added getnstimeofday to allow the posix timer functions to return + * with nanosecond accuracy + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "timeconst.h" + +/* + * The timezone where the local system is located. Used as a default by some + * programs who obtain this value by using gettimeofday. + */ +struct timezone sys_tz; + +EXPORT_SYMBOL(sys_tz); + +#ifdef __ARCH_WANT_SYS_TIME + +/* + * sys_time() can be implemented in user-level using + * sys_gettimeofday(). Is this for backwards compatibility? 
If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ +SYSCALL_DEFINE1(time, time_t __user *, tloc) +{ + time_t i = get_seconds(); + + if (tloc) { + if (put_user(i,tloc)) + return -EFAULT; + } + force_successful_syscall_return(); + return i; +} + +/* + * sys_stime() can be implemented in user-level using + * sys_settimeofday(). Is this for backwards compatibility? If so, + * why not move it into the appropriate arch directory (for those + * architectures that need it). + */ + +SYSCALL_DEFINE1(stime, time_t __user *, tptr) +{ + struct timespec tv; + int err; + + if (get_user(tv.tv_sec, tptr)) + return -EFAULT; + + tv.tv_nsec = 0; + + err = security_settime(&tv, NULL); + if (err) + return err; + + do_settimeofday(&tv); + return 0; +} + +#endif /* __ARCH_WANT_SYS_TIME */ + +SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + if (likely(tv != NULL)) { + struct timeval ktv; + do_gettimeofday(&ktv); + if (copy_to_user(tv, &ktv, sizeof(ktv))) + return -EFAULT; + } + if (unlikely(tz != NULL)) { + if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) + return -EFAULT; + } + return 0; +} + +/* + * Adjust the time obtained from the CMOS to be UTC time instead of + * local time. + * + * This is ugly, but preferable to the alternatives. Otherwise we + * would either need to write a program to do it in /etc/rc (and risk + * confusion if the program gets run more than once; it would also be + * hard to make the program warp the clock precisely n hours) or + * compile in the timezone information into the kernel. Bad, bad.... + * + * - TYT, 1992-01-01 + * + * The best thing to do is to keep the CMOS clock in universal time (UTC) + * as real UNIX machines always do it. This avoids all headaches about + * daylight saving times and warping kernel clocks. 
+ */ +static inline void warp_clock(void) +{ + struct timespec adjust; + + adjust = current_kernel_time(); + adjust.tv_sec += sys_tz.tz_minuteswest * 60; + do_settimeofday(&adjust); +} + +/* + * In case for some reason the CMOS clock has not already been running + * in UTC, but in some local time: The first time we set the timezone, + * we will warp the clock so that it is ticking UTC time instead of + * local time. Presumably, if someone is setting the timezone then we + * are running in an environment where the programs understand about + * timezones. This should be done at boot time in the /etc/rc script, + * as soon as possible, so that the clock can be set right. Otherwise, + * various programs will get confused when the clock gets warped. + */ + +int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) +{ + static int firsttime = 1; + int error = 0; + + if (tv && !timespec_valid(tv)) + return -EINVAL; + + error = security_settime(tv, tz); + if (error) + return error; + + if (tz) { + /* SMP safe, global irq locking makes it work. */ + sys_tz = *tz; + update_vsyscall_tz(); + if (firsttime) { + firsttime = 0; + if (!tv) + warp_clock(); + } + } + if (tv) + { + /* SMP safe, again the code in arch/foo/time.c should + * globally block out interrupts when it runs. + */ + return do_settimeofday(tv); + } + return 0; +} + +SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, + struct timezone __user *, tz) +{ + struct timeval user_tv; + struct timespec new_ts; + struct timezone new_tz; + + if (tv) { + if (copy_from_user(&user_tv, tv, sizeof(*tv))) + return -EFAULT; + new_ts.tv_sec = user_tv.tv_sec; + new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; + } + if (tz) { + if (copy_from_user(&new_tz, tz, sizeof(*tz))) + return -EFAULT; + } + + return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); +} + +SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) +{ + struct timex txc; /* Local copy of parameter */ + int ret; + + /* Copy the user data space into the kernel copy + * structure. But bear in mind that the structures + * may change + */ + if(copy_from_user(&txc, txc_p, sizeof(struct timex))) + return -EFAULT; + ret = do_adjtimex(&txc); + return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; +} + +/** + * current_fs_time - Return FS time + * @sb: Superblock. + * + * Return the current time truncated to the time granularity supported by + * the fs. + */ +struct timespec current_fs_time(struct super_block *sb) +{ + struct timespec now = current_kernel_time(); + return timespec_trunc(now, sb->s_time_gran); +} +EXPORT_SYMBOL(current_fs_time); + +/* + * Convert jiffies to milliseconds and back. + * + * Avoid unnecessary multiplications/divisions in the + * two most common HZ cases: + */ +inline unsigned int jiffies_to_msecs(const unsigned long j) +{ +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + return (MSEC_PER_SEC / HZ) * j; +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_MSEC_MUL32 * j) >> HZ_TO_MSEC_SHR32; +# else + return (j * HZ_TO_MSEC_NUM) / HZ_TO_MSEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_msecs); + +inline unsigned int jiffies_to_usecs(const unsigned long j) +{ +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (USEC_PER_SEC / HZ) * j; +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC); +#else +# if BITS_PER_LONG == 32 + return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; +# else + return (j * HZ_TO_USEC_NUM) / HZ_TO_USEC_DEN; +# endif +#endif +} +EXPORT_SYMBOL(jiffies_to_usecs); + +/** + * timespec_trunc - Truncate timespec to a granularity + * @t: Timespec + * @gran: Granularity in ns. 
+ * + * Truncate a timespec to a granularity. gran must be smaller than a second. + * Always rounds down. + * + * This function should be only used for timestamps returned by + * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because + * it doesn't handle the better resolution of the latter. + */ +struct timespec timespec_trunc(struct timespec t, unsigned gran) +{ + /* + * Division is pretty slow so avoid it for common cases. + * Currently current_kernel_time() never returns better than + * jiffies resolution. Exploit that. + */ + if (gran <= jiffies_to_usecs(1) * 1000) { + /* nothing */ + } else if (gran == 1000000000) { + t.tv_nsec = 0; + } else { + t.tv_nsec -= t.tv_nsec % gran; + } + return t; +} +EXPORT_SYMBOL(timespec_trunc); + +/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. + * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 + * => year=1980, mon=12, day=31, hour=23, min=59, sec=59. + * + * [For the Julian calendar (which was used in Russia before 1917, + * Britain & colonies before 1752, anywhere else before 1582, + * and is still in use by some communities) leave out the + * -year/100+year/400 terms, and add 10.] + * + * This algorithm was first published by Gauss (I think). + * + * WARNING: this function will overflow on 2106-02-07 06:28:16 on + * machines where long is 32-bit! 
(However, as time_t is signed, we + * will already get problems at other places on 2038-01-19 03:14:08) + */ +unsigned long +mktime(const unsigned int year0, const unsigned int mon0, + const unsigned int day, const unsigned int hour, + const unsigned int min, const unsigned int sec) +{ + unsigned int mon = mon0, year = year0; + + /* 1..12 -> 11,12,1..10 */ + if (0 >= (int) (mon -= 2)) { + mon += 12; /* Puts Feb last since it has leap day */ + year -= 1; + } + + return ((((unsigned long) + (year/4 - year/100 + year/400 + 367*mon/12 + day) + + year*365 - 719499 + )*24 + hour /* now have hours */ + )*60 + min /* now have minutes */ + )*60 + sec; /* finally seconds */ +} + +EXPORT_SYMBOL(mktime); + +/** + * set_normalized_timespec - set timespec sec and nsec parts and normalize + * + * @ts: pointer to timespec variable to be set + * @sec: seconds to set + * @nsec: nanoseconds to set + * + * Set seconds and nanoseconds field of a timespec variable and + * normalize to the timespec storage format + * + * Note: The tv_nsec part is always in the range of + * 0 <= tv_nsec < NSEC_PER_SEC + * For negative values only the tv_sec field is negative ! + */ +void set_normalized_timespec(struct timespec *ts, time_t sec, s64 nsec) +{ + while (nsec >= NSEC_PER_SEC) { + /* + * The following asm() prevents the compiler from + * optimising this loop into a modulo operation. See + * also __iter_div_u64_rem() in include/linux/time.h + */ + asm("" : "+rm"(nsec)); + nsec -= NSEC_PER_SEC; + ++sec; + } + while (nsec < 0) { + asm("" : "+rm"(nsec)); + nsec += NSEC_PER_SEC; + --sec; + } + ts->tv_sec = sec; + ts->tv_nsec = nsec; +} +EXPORT_SYMBOL(set_normalized_timespec); + +/** + * ns_to_timespec - Convert nanoseconds to timespec + * @nsec: the nanoseconds value to be converted + * + * Returns the timespec representation of the nsec parameter. 
+ */ +struct timespec ns_to_timespec(const s64 nsec) +{ + struct timespec ts; + s32 rem; + + if (!nsec) + return (struct timespec) {0, 0}; + + ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem); + if (unlikely(rem < 0)) { + ts.tv_sec--; + rem += NSEC_PER_SEC; + } + ts.tv_nsec = rem; + + return ts; +} +EXPORT_SYMBOL(ns_to_timespec); + +/** + * ns_to_timeval - Convert nanoseconds to timeval + * @nsec: the nanoseconds value to be converted + * + * Returns the timeval representation of the nsec parameter. + */ +struct timeval ns_to_timeval(const s64 nsec) +{ + struct timespec ts = ns_to_timespec(nsec); + struct timeval tv; + + tv.tv_sec = ts.tv_sec; + tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000; + + return tv; +} +EXPORT_SYMBOL(ns_to_timeval); + +/* + * When we convert to jiffies then we interpret incoming values + * the following way: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor + * + * We must also be careful about 32-bit overflows. + */ +unsigned long msecs_to_jiffies(const unsigned int m) +{ + /* + * Negative value, means infinite timeout: + */ + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) + /* + * HZ is equal to or smaller than 1000, and 1000 is a nice + * round multiple of HZ, divide with the factor between them, + * but round upwards: + */ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) + /* + * HZ is larger than 1000, and HZ is a nice round multiple of + * 1000 - simply multiply with the factor between them. 
+ * + * But first make sure the multiplication result cannot + * overflow: + */ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return m * (HZ / MSEC_PER_SEC); +#else + /* + * Generic case - multiply, round and divide. But first + * check that if we are doing a net multiplication, that + * we wouldn't overflow: + */ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) + >> MSEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(msecs_to_jiffies); + +unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) + return u * (HZ / USEC_PER_SEC); +#else + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; +#endif +} +EXPORT_SYMBOL(usecs_to_jiffies); + +/* + * The TICK_NSEC - 1 rounds up the value to the next resolution. Note + * that a remainder subtract here would not do the right thing as the + * resolution values don't fall on second boundaries. I.e. the line: + * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. + * + * Rather, we just shift the bits off the right. + * + * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec + * value to a scaled second value. 
+ */ +unsigned long +timespec_to_jiffies(const struct timespec *value) +{ + unsigned long sec = value->tv_sec; + long nsec = value->tv_nsec + TICK_NSEC - 1; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + nsec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)nsec * NSEC_CONVERSION) >> + (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; + +} +EXPORT_SYMBOL(timespec_to_jiffies); + +void +jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. + */ + u32 rem; + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_nsec = rem; +} +EXPORT_SYMBOL(jiffies_to_timespec); + +/* Same for "timeval" + * + * Well, almost. The problem here is that the real system resolution is + * in nanoseconds and the value being converted is in micro seconds. + * Also for some machines (those that use HZ = 1024, in-particular), + * there is a LARGE error in the tick size in microseconds. + + * The solution we use is to do the rounding AFTER we convert the + * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. + * Instruction wise, this should cost only an additional add with carry + * instruction above the way it was done above. + */ +unsigned long +timeval_to_jiffies(const struct timeval *value) +{ + unsigned long sec = value->tv_sec; + long usec = value->tv_usec; + + if (sec >= MAX_SEC_IN_JIFFIES){ + sec = MAX_SEC_IN_JIFFIES; + usec = 0; + } + return (((u64)sec * SEC_CONVERSION) + + (((u64)usec * USEC_CONVERSION + USEC_ROUND) >> + (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; +} +EXPORT_SYMBOL(timeval_to_jiffies); + +void jiffies_to_timeval(const unsigned long jiffies, struct timeval *value) +{ + /* + * Convert jiffies to nanoseconds and separate with + * one divide. 
+ */ + u32 rem; + + value->tv_sec = div_u64_rem((u64)jiffies * TICK_NSEC, + NSEC_PER_SEC, &rem); + value->tv_usec = rem / NSEC_PER_USEC; +} +EXPORT_SYMBOL(jiffies_to_timeval); + +/* + * Convert jiffies/jiffies_64 to clock_t and back. + */ +clock_t jiffies_to_clock_t(unsigned long x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + return x * (USER_HZ / HZ); +# else + return x / (HZ / USER_HZ); +# endif +#else + return div_u64((u64)x * TICK_NSEC, NSEC_PER_SEC / USER_HZ); +#endif +} +EXPORT_SYMBOL(jiffies_to_clock_t); + +unsigned long clock_t_to_jiffies(unsigned long x) +{ +#if (HZ % USER_HZ)==0 + if (x >= ~0UL / (HZ / USER_HZ)) + return ~0UL; + return x * (HZ / USER_HZ); +#else + /* Don't worry about loss of precision here .. */ + if (x >= ~0UL / HZ * USER_HZ) + return ~0UL; + + /* .. but do try to contain it here */ + return div_u64((u64)x * HZ, USER_HZ); +#endif +} +EXPORT_SYMBOL(clock_t_to_jiffies); + +u64 jiffies_64_to_clock_t(u64 x) +{ +#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0 +# if HZ < USER_HZ + x = div_u64(x * USER_HZ, HZ); +# elif HZ > USER_HZ + x = div_u64(x, HZ / USER_HZ); +# else + /* Nothing to do */ +# endif +#else + /* + * There are better ways that don't overflow early, + * but even this doesn't overflow in hundreds of years + * in 64 bits, so.. + */ + x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ)); +#endif + return x; +} +EXPORT_SYMBOL(jiffies_64_to_clock_t); + +u64 nsec_to_clock_t(u64 x) +{ +#if (NSEC_PER_SEC % USER_HZ) == 0 + return div_u64(x, NSEC_PER_SEC / USER_HZ); +#elif (USER_HZ % 512) == 0 + return div_u64(x * USER_HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * max relative error 5.7e-8 (1.8s per year) for USER_HZ <= 1024, + * overflow after 64.99 years. + * exact for HZ=60, 72, 90, 120, 144, 180, 300, 600, 900, ... 
+ */ + return div_u64(x * 9, (9ull * NSEC_PER_SEC + (USER_HZ / 2)) / USER_HZ); +#endif +} + +/** + * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64 + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +u64 nsecs_to_jiffies64(u64 n) +{ +#if (NSEC_PER_SEC % HZ) == 0 + /* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */ + return div_u64(n, NSEC_PER_SEC / HZ); +#elif (HZ % 512) == 0 + /* overflow after 292 years if HZ = 1024 */ + return div_u64(n * HZ / 512, NSEC_PER_SEC / 512); +#else + /* + * Generic case - optimized for cases where HZ is a multiple of 3. + * overflow after 64.99 years, exact for HZ = 60, 72, 90, 120 etc. + */ + return div_u64(n * 9, (9ull * NSEC_PER_SEC + HZ / 2) / HZ); +#endif +} + +/** + * nsecs_to_jiffies - Convert nsecs in u64 to jiffies + * + * @n: nsecs in u64 + * + * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64. + * And this doesn't return MAX_JIFFY_OFFSET since this function is designed + * for scheduler, not for use in device drivers to calculate timeout value. + * + * note: + * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512) + * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years + */ +unsigned long nsecs_to_jiffies(u64 n) +{ + return (unsigned long)nsecs_to_jiffies64(n); +} + +/* + * Add two timespec values and do a safety check for overflow. 
+ * It's assumed that both values are valid (>= 0) + */ +struct timespec timespec_add_safe(const struct timespec lhs, + const struct timespec rhs) +{ + struct timespec res; + + set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec, + lhs.tv_nsec + rhs.tv_nsec); + + if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec) + res.tv_sec = TIME_T_MAX; + + return res; +} diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig new file mode 100644 index 00000000..f06a8a36 --- /dev/null +++ b/kernel/time/Kconfig @@ -0,0 +1,29 @@ +# +# Timer subsystem related configuration options +# +config TICK_ONESHOT + bool + +config NO_HZ + bool "Tickless System (Dynamic Ticks)" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables a tickless system: timer interrupts will + only trigger on an as-needed basis both when the system is + busy and when the system is idle. + +config HIGH_RES_TIMERS + bool "High Resolution Timer Support" + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS + select TICK_ONESHOT + help + This option enables high resolution timer support. If your + hardware is not capable then this option only increases + the size of the kernel image. 
+ +config GENERIC_CLOCKEVENTS_BUILD + bool + default y + depends on GENERIC_CLOCKEVENTS || GENERIC_CLOCKEVENTS_MIGR + diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 00000000..cae2ad74 --- /dev/null +++ b/kernel/time/Makefile @@ -0,0 +1,9 @@ +obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o +obj-y += timeconv.o posix-clock.o #alarmtimer.o + +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += tick-broadcast.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o +obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TIMER_STATS) += timer_stats.o diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c new file mode 100644 index 00000000..8b70c769 --- /dev/null +++ b/kernel/time/alarmtimer.c @@ -0,0 +1,728 @@ +/* + * Alarmtimer interface + * + * This interface provides a timer which is similar to hrtimers, + * but triggers a RTC alarm if the box is suspended. + * + * This interface is influenced by the Android RTC Alarm timer + * interface. + * + * Copyright (C) 2010 IBM Corporation + * + * Author: John Stultz + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * struct alarm_base - Alarm timer bases + * @lock: Lock for synchronized access to the base + * @timerqueue: Timerqueue head managing the list of events + * @timer: hrtimer used to schedule events while running + * @gettime: Function to read the time correlating to the base + * @base_clockid: clockid for the base + */ +static struct alarm_base { + spinlock_t lock; + struct timerqueue_head timerqueue; + struct hrtimer timer; + ktime_t (*gettime)(void); + clockid_t base_clockid; +} alarm_bases[ALARM_NUMTYPE]; + +/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */ +static ktime_t freezer_delta; +static DEFINE_SPINLOCK(freezer_delta_lock); + +#ifdef CONFIG_RTC_CLASS +/* rtc timer and device for setting alarm wakeups at suspend */ +static struct rtc_timer rtctimer; +static struct rtc_device *rtcdev; +static DEFINE_SPINLOCK(rtcdev_lock); + +/** + * has_wakealarm - check rtc device has wakealarm ability + * @dev: current device + * @name_ptr: name to be returned + * + * This helper function checks to see if the rtc device can wake + * from suspend. + */ +static int has_wakealarm(struct device *dev, void *name_ptr) +{ + struct rtc_device *candidate = to_rtc_device(dev); + + if (!candidate->ops->set_alarm) + return 0; + if (!device_may_wakeup(candidate->dev.parent)) + return 0; + + *(const char **)name_ptr = dev_name(dev); + return 1; +} + +/** + * alarmtimer_get_rtcdev - Return selected rtcdevice + * + * This function returns the rtc device to use for wakealarms. + * If one has not already been chosen, it checks to see if a + * functional rtc device is available. 
+ */ +static struct rtc_device *alarmtimer_get_rtcdev(void) +{ + struct device *dev; + char *str; + unsigned long flags; + struct rtc_device *ret; + + spin_lock_irqsave(&rtcdev_lock, flags); + if (!rtcdev) { + /* Find an rtc device and init the rtc_timer */ + dev = class_find_device(rtc_class, NULL, &str, has_wakealarm); + /* If we have a device then str is valid. See has_wakealarm() */ + if (dev) { + rtcdev = rtc_class_open(str); + /* + * Drop the reference we got in class_find_device, + * rtc_class_open takes its own. + */ + put_device(dev); + rtc_timer_init(&rtctimer, NULL, NULL); + } + } + ret = rtcdev; + spin_unlock_irqrestore(&rtcdev_lock, flags); + + return ret; +} +#else +#define alarmtimer_get_rtcdev() (0) +#define rtcdev (0) +#endif + + +/** + * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue + * @base: pointer to the base where the timer is being run + * @alarm: pointer to alarm being enqueued. + * + * Adds alarm to an alarm_base timerqueue and if necessary sets + * an hrtimer to run. + * + * Must hold base->lock when calling. + */ +static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm) +{ + timerqueue_add(&base->timerqueue, &alarm->node); + if (&alarm->node == timerqueue_getnext(&base->timerqueue)) { + hrtimer_try_to_cancel(&base->timer); + hrtimer_start(&base->timer, alarm->node.expires, + HRTIMER_MODE_ABS); + } +} + +/** + * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue + * @base: pointer to the base where the timer is running + * @alarm: pointer to alarm being removed + * + * Removes alarm from an alarm_base timerqueue and if necessary sets + * a new timer to run. + * + * Must hold base->lock when calling. 
+ */ +static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm) +{ + struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue); + + timerqueue_del(&base->timerqueue, &alarm->node); + if (next == &alarm->node) { + hrtimer_try_to_cancel(&base->timer); + next = timerqueue_getnext(&base->timerqueue); + if (!next) + return; + hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS); + } +} + + +/** + * alarmtimer_fired - Handles alarm hrtimer being fired. + * @timer: pointer to hrtimer being run + * + * When an alarm timer fires, this runs through the timerqueue to + * see which alarms expired, and runs those. If there are more alarm + * timers queued for the future, we set the hrtimer to fire + * when the next future alarm timer expires. + */ +static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer) +{ + struct alarm_base *base = container_of(timer, struct alarm_base, timer); + struct timerqueue_node *next; + unsigned long flags; + ktime_t now; + int ret = HRTIMER_NORESTART; + + spin_lock_irqsave(&base->lock, flags); + now = base->gettime(); + while ((next = timerqueue_getnext(&base->timerqueue))) { + struct alarm *alarm; + ktime_t expired = next->expires; + + if (expired.tv64 > now.tv64) + break; + + alarm = container_of(next, struct alarm, node); + + timerqueue_del(&base->timerqueue, &alarm->node); + alarm->enabled = 0; + /* Re-add periodic timers */ + if (alarm->period.tv64) { + alarm->node.expires = ktime_add(expired, alarm->period); + timerqueue_add(&base->timerqueue, &alarm->node); + alarm->enabled = 1; + } + spin_unlock_irqrestore(&base->lock, flags); + if (alarm->function) + alarm->function(alarm); + spin_lock_irqsave(&base->lock, flags); + } + + if (next) { + hrtimer_set_expires(&base->timer, next->expires); + ret = HRTIMER_RESTART; + } + spin_unlock_irqrestore(&base->lock, flags); + + return ret; + +} + +#ifdef CONFIG_RTC_CLASS +/** + * alarmtimer_suspend - Suspend time callback + * @dev: unused + * @state: 
unused
+ *
+ * When we are going into suspend, we look through the bases
+ * to see which is the soonest timer to expire. We then
+ * set an rtc timer to fire that far into the future, which
+ * will wake us from suspend.
+ */
+static int alarmtimer_suspend(struct device *dev)
+{
+	struct rtc_time tm;
+	ktime_t min, now;
+	unsigned long flags;
+	struct rtc_device *rtc;
+	int i;
+
+	spin_lock_irqsave(&freezer_delta_lock, flags);
+	min = freezer_delta;
+	freezer_delta = ktime_set(0, 0);
+	spin_unlock_irqrestore(&freezer_delta_lock, flags);
+
+	rtc = rtcdev;
+	/* If we have no rtcdev, just return */
+	if (!rtc)
+		return 0;
+
+	/*
+	 * Find the soonest timer to expire across all alarm bases. min
+	 * starts out as the delta recorded by alarmtimer_freezerset(),
+	 * with 0 meaning "nothing recorded".
+	 */
+	for (i = 0; i < ALARM_NUMTYPE; i++) {
+		struct alarm_base *base = &alarm_bases[i];
+		struct timerqueue_node *next;
+		ktime_t delta;
+
+		spin_lock_irqsave(&base->lock, flags);
+		next = timerqueue_getnext(&base->timerqueue);
+		spin_unlock_irqrestore(&base->lock, flags);
+		if (!next)
+			continue;
+		delta = ktime_sub(next->expires, base->gettime());
+		if (!min.tv64 || (delta.tv64 < min.tv64))
+			min = delta;
+	}
+	if (min.tv64 == 0)
+		return 0;
+
+	/* XXX - Should we enforce a minimum sleep time? */
+	WARN_ON(min.tv64 < NSEC_PER_SEC);
+
+	/* Setup an rtc timer to fire that far in the future */
+	rtc_timer_cancel(rtc, &rtctimer);
+	rtc_read_time(rtc, &tm);
+	now = rtc_tm_to_ktime(tm);
+	now = ktime_add(now, min);
+
+	rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+
+	return 0;
+}
+#else
+static int alarmtimer_suspend(struct device *dev)
+{
+	return 0;
+}
+#endif
+/**
+ * alarmtimer_freezerset - Record a pending expiry for the suspend path
+ * @absexp: absolute expiration time on the clock of @type
+ * @type: alarm base the expiration time refers to
+ *
+ * Converts @absexp into a delta from the current time of the base and
+ * stores it in freezer_delta when no delta is stored yet (zero) or when
+ * the new delta is smaller. alarmtimer_suspend() reads and clears
+ * freezer_delta.
+ */
+static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
+{
+	ktime_t delta;
+	unsigned long flags;
+	struct alarm_base *base = &alarm_bases[type];
+
+	delta = ktime_sub(absexp, base->gettime());
+
+	spin_lock_irqsave(&freezer_delta_lock, flags);
+	if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
+		freezer_delta = delta;
+	spin_unlock_irqrestore(&freezer_delta_lock, flags);
+}
+
+
+/**
+ * alarm_init - Initialize an alarm structure
+ * @alarm: ptr to alarm to be initialized
+ * @type: the type of the alarm
+ * @function: callback that is run when the alarm fires
+ *
+ * The alarm starts out disabled and with a zero period.
+ */
+void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
+		void (*function)(struct alarm *))
+{
+	timerqueue_init(&alarm->node);
+	alarm->period = ktime_set(0, 0);
+	alarm->function = function;
+	alarm->type = type;
+	alarm->enabled = 0;
+}
+
+/**
+ * alarm_start - Sets an alarm to fire
+ * @alarm: ptr to alarm to set
+ * @start: time to run the alarm
+ * @period: period at which the alarm will recur
+ *
+ * An alarm that is already enabled is removed from its base first and
+ * then queued again with the new expiry and period.
+ */
+void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
+{
+	struct alarm_base *base = &alarm_bases[alarm->type];
+	unsigned long flags;
+
+	spin_lock_irqsave(&base->lock, flags);
+	if (alarm->enabled)
+		alarmtimer_remove(base, alarm);
+	alarm->node.expires = start;
+	alarm->period = period;
+	alarmtimer_enqueue(base, alarm);
+	alarm->enabled = 1;
+	spin_unlock_irqrestore(&base->lock, flags);
+}
+
+/**
+ * alarm_cancel - Tries to cancel an alarm timer
+ * @alarm: ptr to alarm to be canceled
+ *
+ * Dequeues the alarm if it is enabled and marks it disabled.
+ */
+void alarm_cancel(struct alarm *alarm)
+{
+	struct alarm_base *base = &alarm_bases[alarm->type];
+	unsigned long flags;
+
+	spin_lock_irqsave(&base->lock, flags);
+	if (alarm->enabled)
+		alarmtimer_remove(base, alarm);
+	alarm->enabled = 0;
+	spin_unlock_irqrestore(&base->lock, flags);
+}
+
+
+/**
+ * clock2alarm - helper that converts from clockid to alarmtypes
+ * @clockid: clockid.
+ *
+ * Returns ALARM_REALTIME for CLOCK_REALTIME_ALARM, ALARM_BOOTTIME for
+ * CLOCK_BOOTTIME_ALARM and -1 for any other clockid.
+ *
+ * NOTE(review): the callers in this file use the result as an index
+ * into alarm_bases[] without checking for -1; presumably only the two
+ * _ALARM clockids ever reach them - confirm.
+ */
+static enum alarmtimer_type clock2alarm(clockid_t clockid)
+{
+	if (clockid == CLOCK_REALTIME_ALARM)
+		return ALARM_REALTIME;
+	if (clockid == CLOCK_BOOTTIME_ALARM)
+		return ALARM_BOOTTIME;
+	return -1;
+}
+
+/**
+ * alarm_handle_timer - Callback for posix timers
+ * @alarm: alarm that fired
+ *
+ * Posix timer callback for expired alarm timers. The overrun count of
+ * the owning k_itimer is incremented when posix_timer_event() returns
+ * non-zero.
+ */
+static void alarm_handle_timer(struct alarm *alarm)
+{
+	struct k_itimer *ptr = container_of(alarm, struct k_itimer,
+						it.alarmtimer);
+	if (posix_timer_event(ptr, 0) != 0)
+		ptr->it_overrun++;
+}
+
+/**
+ * alarm_clock_getres - posix getres interface
+ * @which_clock: clockid
+ * @tp: timespec to fill
+ *
+ * Returns the granularity of underlying alarm base clock, or
+ * -ENOTSUPP when alarmtimer_get_rtcdev() finds no rtc device.
+ */
+static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
+{
+	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
+
+	if (!alarmtimer_get_rtcdev())
+		return -ENOTSUPP;
+
+	return hrtimer_get_res(baseid, tp);
+}
+
+/**
+ * alarm_clock_get - posix clock_get interface
+ * @which_clock: clockid
+ * @tp: timespec to fill.
+ *
+ * Provides the underlying alarm base time. Returns 0 on success and
+ * -ENOTSUPP when alarmtimer_get_rtcdev() finds no rtc device.
+ */
+static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+	struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+
+	if (!alarmtimer_get_rtcdev())
+		return -ENOTSUPP;
+
+	*tp = ktime_to_timespec(base->gettime());
+	return 0;
+}
+
+/**
+ * alarm_timer_create - posix timer_create interface
+ * @new_timer: k_itimer pointer to manage
+ *
+ * Initializes the k_itimer structure.
+ */ +static int alarm_timer_create(struct k_itimer *new_timer) +{ + enum alarmtimer_type type; + struct alarm_base *base; + + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + type = clock2alarm(new_timer->it_clock); + base = &alarm_bases[type]; + alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer); + return 0; +} + +/** + * alarm_timer_get - posix timer_get interface + * @new_timer: k_itimer pointer + * @cur_setting: itimerspec data to fill + * + * Copies the itimerspec data out from the k_itimer + */ +static void alarm_timer_get(struct k_itimer *timr, + struct itimerspec *cur_setting) +{ + memset(cur_setting, 0, sizeof(struct itimerspec)); + + cur_setting->it_interval = + ktime_to_timespec(timr->it.alarmtimer.period); + cur_setting->it_value = + ktime_to_timespec(timr->it.alarmtimer.node.expires); + return; +} + +/** + * alarm_timer_del - posix timer_del interface + * @timr: k_itimer pointer to be deleted + * + * Cancels any programmed alarms for the given timer. + */ +static int alarm_timer_del(struct k_itimer *timr) +{ + if (!rtcdev) + return -ENOTSUPP; + + alarm_cancel(&timr->it.alarmtimer); + return 0; +} + +/** + * alarm_timer_set - posix timer_set interface + * @timr: k_itimer pointer to be deleted + * @flags: timer flags + * @new_setting: itimerspec to be used + * @old_setting: itimerspec being replaced + * + * Sets the timer to new_setting, and starts the timer. + */ +static int alarm_timer_set(struct k_itimer *timr, int flags, + struct itimerspec *new_setting, + struct itimerspec *old_setting) +{ + if (!rtcdev) + return -ENOTSUPP; + + /* + * XXX HACK! Currently we can DOS a system if the interval + * period on alarmtimers is too small. Cap the interval here + * to 100us and solve this properly in a future patch! 
-jstultz + */ + if ((new_setting->it_interval.tv_sec == 0) && + (new_setting->it_interval.tv_nsec < 100000)) + new_setting->it_interval.tv_nsec = 100000; + + if (old_setting) + alarm_timer_get(timr, old_setting); + + /* If the timer was already set, cancel it */ + alarm_cancel(&timr->it.alarmtimer); + + /* start the timer */ + alarm_start(&timr->it.alarmtimer, + timespec_to_ktime(new_setting->it_value), + timespec_to_ktime(new_setting->it_interval)); + return 0; +} + +/** + * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep + * @alarm: ptr to alarm that fired + * + * Wakes up the task that set the alarmtimer + */ +static void alarmtimer_nsleep_wakeup(struct alarm *alarm) +{ + struct task_struct *task = (struct task_struct *)alarm->data; + + alarm->data = NULL; + if (task) + wake_up_process(task); +} + +/** + * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation + * @alarm: ptr to alarmtimer + * @absexp: absolute expiration time + * + * Sets the alarm timer and sleeps until it is fired or interrupted. 
+ */
+static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
+{
+	alarm->data = (void *)current;
+	do {
+		set_current_state(TASK_INTERRUPTIBLE);
+		alarm_start(alarm, absexp, ktime_set(0, 0));
+		if (likely(alarm->data))
+			schedule();
+
+		alarm_cancel(alarm);
+	} while (alarm->data && !signal_pending(current));
+
+	__set_current_state(TASK_RUNNING);
+
+	return (alarm->data == NULL);
+}
+
+
+/**
+ * update_rmtp - Update remaining timespec value
+ * @exp: expiration time
+ * @type: timer type
+ * @rmtp: user pointer to remaining timespec value
+ *
+ * Helper function that fills in rmtp value with time between
+ * now and the exp value
+ *
+ * Returns 0 when @exp is not in the future (nothing is copied),
+ * -EFAULT when the copy to user space fails, and 1 when the remaining
+ * time was written to @rmtp.
+ */
+static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
+			struct timespec __user *rmtp)
+{
+	struct timespec rmt;
+	ktime_t rem;
+
+	rem = ktime_sub(exp, alarm_bases[type].gettime());
+
+	if (rem.tv64 <= 0)
+		return 0;
+	rmt = ktime_to_timespec(rem);
+
+	if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+		return -EFAULT;
+
+	return 1;
+
+}
+
+/**
+ * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
+ * @restart: ptr to restart block
+ *
+ * Handles restarted clock_nanosleep calls
+ *
+ * The alarm type, the absolute expiry and the user rmtp pointer are
+ * taken from restart->nanosleep. Returns 0 when the sleep completed
+ * or no time remains, the error from update_rmtp(), or
+ * -ERESTART_RESTARTBLOCK when the sleep was interrupted again.
+ */
+static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
+{
+	enum alarmtimer_type type = restart->nanosleep.clockid;
+	ktime_t exp;
+	struct timespec __user *rmtp;
+	struct alarm alarm;
+	int ret = 0;
+
+	exp.tv64 = restart->nanosleep.expires;
+	alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+
+	if (alarmtimer_do_nsleep(&alarm, exp))
+		goto out;
+
+	if (freezing(current))
+		alarmtimer_freezerset(exp, type);
+
+	rmtp = restart->nanosleep.rmtp;
+	if (rmtp) {
+		ret = update_rmtp(exp, type, rmtp);
+		if (ret <= 0)
+			goto out;
+	}
+
+
+	/* The other values in restart are already filled in */
+	ret = -ERESTART_RESTARTBLOCK;
+out:
+	return ret;
+}
+
+/**
+ * alarm_timer_nsleep - alarmtimer nanosleep
+ * @which_clock: clockid
+ * @flags: determins abstime or relative
+ * @tsreq: requested sleep time (abs or
rel) + * @rmtp: remaining sleep time saved + * + * Handles clock_nanosleep calls against _ALARM clockids + */ +static int alarm_timer_nsleep(const clockid_t which_clock, int flags, + struct timespec *tsreq, struct timespec __user *rmtp) +{ + enum alarmtimer_type type = clock2alarm(which_clock); + struct alarm alarm; + ktime_t exp; + int ret = 0; + struct restart_block *restart; + + if (!alarmtimer_get_rtcdev()) + return -ENOTSUPP; + + if (!capable(CAP_WAKE_ALARM)) + return -EPERM; + + alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); + + exp = timespec_to_ktime(*tsreq); + /* Convert (if necessary) to absolute time */ + if (flags != TIMER_ABSTIME) { + ktime_t now = alarm_bases[type].gettime(); + exp = ktime_add(now, exp); + } + + if (alarmtimer_do_nsleep(&alarm, exp)) + goto out; + + if (freezing(current)) + alarmtimer_freezerset(exp, type); + + /* abs timers don't set remaining time or restart */ + if (flags == TIMER_ABSTIME) { + ret = -ERESTARTNOHAND; + goto out; + } + + if (rmtp) { + ret = update_rmtp(exp, type, rmtp); + if (ret <= 0) + goto out; + } + + restart = ¤t_thread_info()->restart_block; + restart->fn = alarm_timer_nsleep_restart; + restart->nanosleep.clockid = type; + restart->nanosleep.expires = exp.tv64; + restart->nanosleep.rmtp = rmtp; + ret = -ERESTART_RESTARTBLOCK; + +out: + return ret; +} + + +/* Suspend hook structures */ +static const struct dev_pm_ops alarmtimer_pm_ops = { + .suspend = alarmtimer_suspend, +}; + +static struct platform_driver alarmtimer_driver = { + .driver = { + .name = "alarmtimer", + .pm = &alarmtimer_pm_ops, + } +}; + +/** + * alarmtimer_init - Initialize alarm timer code + * + * This function initializes the alarm bases and registers + * the posix clock ids. 
+ */
+static int __init alarmtimer_init(void)
+{
+	int error = 0;
+	int i;
+	struct k_clock alarm_clock = {
+		.clock_getres	= alarm_clock_getres,
+		.clock_get	= alarm_clock_get,
+		.timer_create	= alarm_timer_create,
+		.timer_set	= alarm_timer_set,
+		.timer_del	= alarm_timer_del,
+		.timer_get	= alarm_timer_get,
+		.nsleep		= alarm_timer_nsleep,
+	};
+
+	posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
+	posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
+
+	/*
+	 * Initialize alarm bases: each base gets its underlying clockid
+	 * and time getter, an empty timerqueue, its lock and an absolute
+	 * mode hrtimer whose callback is alarmtimer_fired().
+	 */
+	alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
+	alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+	alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
+	alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
+	for (i = 0; i < ALARM_NUMTYPE; i++) {
+		timerqueue_init_head(&alarm_bases[i].timerqueue);
+		spin_lock_init(&alarm_bases[i].lock);
+		hrtimer_init(&alarm_bases[i].timer,
+				alarm_bases[i].base_clockid,
+				HRTIMER_MODE_ABS);
+		alarm_bases[i].timer.function = alarmtimer_fired;
+	}
+	error = platform_driver_register(&alarmtimer_driver);
+	platform_device_register_simple("alarmtimer", -1, NULL, 0);
+	/*
+	 * NOTE(review): the result of platform_device_register_simple()
+	 * is discarded, and the device is registered even when the driver
+	 * registration above failed. Only the driver error is returned.
+	 */
+	return error;
+}
+device_initcall(alarmtimer_init);
+
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
new file mode 100644
index 00000000..e4c699df
--- /dev/null
+++ b/kernel/time/clockevents.c
@@ -0,0 +1,341 @@
+/*
+ * linux/kernel/time/clockevents.c
+ *
+ * This file contains functions which manage clock event devices.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "tick-internal.h"
+
+/* The registered clock event devices */
+static LIST_HEAD(clockevent_devices);
+static LIST_HEAD(clockevents_released);
+
+/* Notification for clock events */
+static RAW_NOTIFIER_HEAD(clockevents_chain);
+
+/* Protection for the above */
+static DEFINE_RAW_SPINLOCK(clockevents_lock);
+
+/**
+ * clockevent_delta2ns - Convert a latch value (device ticks) to nanoseconds
+ * @latch: value to convert
+ * @evt: pointer to clock event device descriptor
+ *
+ * Math helper, returns latch value converted to nanoseconds (bound checked)
+ *
+ * The value is computed as (latch << shift) / mult and clamped to the
+ * range 1000 .. KTIME_MAX. A zero evt->mult is replaced by 1 (with a
+ * warning) before the division.
+ */
+u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
+{
+	u64 clc = (u64) latch << evt->shift;
+
+	if (unlikely(!evt->mult)) {
+		evt->mult = 1;
+		WARN_ON(1);
+	}
+
+	do_div(clc, evt->mult);
+	if (clc < 1000)
+		clc = 1000;
+	if (clc > KTIME_MAX)
+		clc = KTIME_MAX;
+
+	return clc;
+}
+EXPORT_SYMBOL_GPL(clockevent_delta2ns);
+
+/**
+ * clockevents_set_mode - set the operating mode of a clock event device
+ * @dev: device to modify
+ * @mode: new mode
+ *
+ * The set_mode() callback of the device is only invoked when @mode
+ * differs from the mode the device is currently in.
+ *
+ * Must be called with interrupts disabled !
+ */
+void clockevents_set_mode(struct clock_event_device *dev,
+				 enum clock_event_mode mode)
+{
+	if (dev->mode != mode) {
+		dev->set_mode(mode, dev);
+		dev->mode = mode;
+
+		/*
+		 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
+		 * on it, so fix it up and emit a warning:
+		 */
+		if (mode == CLOCK_EVT_MODE_ONESHOT) {
+			if (unlikely(!dev->mult)) {
+				dev->mult = 1;
+				WARN_ON(1);
+			}
+		}
+	}
+}
+
+/**
+ * clockevents_shutdown - shutdown the device and clear next_event
+ * @dev: device to shutdown
+ *
+ * next_event is set to KTIME_MAX.
+ */
+void clockevents_shutdown(struct clock_event_device *dev)
+{
+	clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
+	dev->next_event.tv64 = KTIME_MAX;
+}
+
+/**
+ * clockevents_program_event - Reprogram the clock event device.
+ * @expires: absolute expiry time (monotonic clock)
+ *
+ * Returns 0 on success, -ETIME when the event is in the past.
+ */
+int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
+			      ktime_t now)
+{
+	unsigned long long clc;
+	int64_t delta;
+
+	if (unlikely(expires.tv64 < 0)) {
+		WARN_ON_ONCE(1);
+		return -ETIME;
+	}
+
+	delta = ktime_to_ns(ktime_sub(expires, now));
+
+	if (delta <= 0)
+		return -ETIME;
+
+	dev->next_event = expires;
+
+	if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN)
+		return 0;
+
+	if (delta > dev->max_delta_ns)
+		delta = dev->max_delta_ns;
+	if (delta < dev->min_delta_ns)
+		delta = dev->min_delta_ns;
+
+	clc = delta * dev->mult;
+	clc >>= dev->shift;
+
+	return dev->set_next_event((unsigned long) clc, dev);
+}
+
+/**
+ * clockevents_register_notifier - register a clock events change listener
+ * @nb: notifier block to add to the clockevents chain
+ *
+ * The chain is modified under clockevents_lock. Returns the result of
+ * raw_notifier_chain_register().
+ */
+int clockevents_register_notifier(struct notifier_block *nb)
+{
+	unsigned long flags;
+	int ret;
+
+	raw_spin_lock_irqsave(&clockevents_lock, flags);
+	ret = raw_notifier_chain_register(&clockevents_chain, nb);
+	raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+
+	return ret;
+}
+
+/*
+ * Notify about a clock event change. Called with clockevents_lock
+ * held. @reason and @dev are handed unchanged to every notifier on
+ * clockevents_chain.
+ */
+static void clockevents_do_notify(unsigned long reason, void *dev)
+{
+	raw_notifier_call_chain(&clockevents_chain, reason, dev);
+}
+
+/*
+ * Called after a notify add to make devices available which were
+ * released from the notifier call.
+ */
+static void clockevents_notify_released(void)
+{
+	struct clock_event_device *dev;
+
+	while (!list_empty(&clockevents_released)) {
+		dev = list_entry(clockevents_released.next,
+				 struct clock_event_device, list);
+		list_del(&dev->list);
+		list_add(&dev->list, &clockevent_devices);
+		clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+	}
+}
+
+/**
+ * clockevents_register_device - register a clock event device
+ * @dev: device to register
+ *
+ * The device must be in CLOCK_EVT_MODE_UNUSED. When no cpumask is set
+ * the mask of the current CPU is used, with a warning if more than one
+ * CPU is possible. The device is added to clockevent_devices, an ADD
+ * notification is sent, and devices on the released list are then
+ * re-added and announced as well.
+ */
+void clockevents_register_device(struct clock_event_device *dev)
+{
+	unsigned long flags;
+
+	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+	if (!dev->cpumask) {
+		WARN_ON(num_possible_cpus() > 1);
+		dev->cpumask = cpumask_of(smp_processor_id());
+	}
+
+	raw_spin_lock_irqsave(&clockevents_lock, flags);
+
+	list_add(&dev->list, &clockevent_devices);
+	clockevents_do_notify(CLOCK_EVT_NOTIFY_ADD, dev);
+	clockevents_notify_released();
+
+	raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+}
+EXPORT_SYMBOL_GPL(clockevents_register_device);
+/*
+ * clockevents_config - compute the conversion factors of a device
+ * @dev: device to configure
+ * @freq: the clock frequency
+ *
+ * Does nothing for devices without CLOCK_EVT_FEAT_ONESHOT. Otherwise
+ * mult/shift are calculated for @freq, and min_delta_ns/max_delta_ns
+ * are derived from min_delta_ticks/max_delta_ticks.
+ */
+static void clockevents_config(struct clock_event_device *dev,
+			       u32 freq)
+{
+	u64 sec;
+
+	if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+		return;
+
+	/*
+	 * Calculate the maximum number of seconds we can sleep. Limit
+	 * to 10 minutes for hardware which can program more than
+	 * 32bit ticks so we still get reasonable conversion values.
+	 */
+	sec = dev->max_delta_ticks;
+	do_div(sec, freq);
+	if (!sec)
+		sec = 1;
+	else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
+		sec = 600;
+
+	clockevents_calc_mult_shift(dev, freq, sec);
+	dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
+	dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
+}
+
+/**
+ * clockevents_config_and_register - Configure and register a clock event device
+ * @dev: device to register
+ * @freq: The clock frequency
+ * @min_delta: The minimum clock ticks to program in oneshot mode
+ * @max_delta: The maximum clock ticks to program in oneshot mode
+ *
+ * min/max_delta can be 0 for devices which do not support oneshot mode.
+ */
+void clockevents_config_and_register(struct clock_event_device *dev,
+				     u32 freq, unsigned long min_delta,
+				     unsigned long max_delta)
+{
+	dev->min_delta_ticks = min_delta;
+	dev->max_delta_ticks = max_delta;
+	clockevents_config(dev, freq);
+	clockevents_register_device(dev);
+}
+
+/**
+ * clockevents_update_freq - Update frequency and reprogram a clock event device.
+ * @dev: device to modify
+ * @freq: new device frequency
+ *
+ * Reconfigure and reprogram a clock event device in oneshot
+ * mode. Must be called on the cpu for which the device delivers per
+ * cpu timer events with interrupts disabled! Returns 0 on success,
+ * -ETIME when the event is in the past.
+ *
+ * A device that is not in oneshot mode is only reconfigured; it is
+ * not reprogrammed and 0 is returned.
+ */
+int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
+{
+	clockevents_config(dev, freq);
+
+	if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+		return 0;
+
+	return clockevents_program_event(dev, dev->next_event, ktime_get());
+}
+
+/*
+ * Noop handler when we shut down an event device
+ */
+void clockevents_handle_noop(struct clock_event_device *dev)
+{
+}
+
+/**
+ * clockevents_exchange_device - release and request clock devices
+ * @old: device to release (can be NULL)
+ * @new: device to request (can be NULL)
+ *
+ * Called from the notifier chain.
clockevents_lock is held already
+ */
+void clockevents_exchange_device(struct clock_event_device *old,
+				 struct clock_event_device *new)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	/*
+	 * Caller releases a clock event device. We queue it into the
+	 * released list and do a notify add later.
+	 */
+	if (old) {
+		clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
+		list_del(&old->list);
+		list_add(&old->list, &clockevents_released);
+	}
+
+	if (new) {
+		BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
+		clockevents_shutdown(new);
+	}
+	local_irq_restore(flags);
+}
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS
+/**
+ * clockevents_notify - notification about relevant events
+ * @reason: notification reason, passed on to the notifier chain
+ * @arg: reason specific argument; for CLOCK_EVT_NOTIFY_CPU_DEAD it is
+ *	 read as a pointer to the int number of the dead cpu
+ *
+ * Runs the notifier chain under clockevents_lock. For
+ * CLOCK_EVT_NOTIFY_CPU_DEAD the released list is emptied afterwards,
+ * and devices whose cpumask contains only the dead cpu are removed
+ * from clockevent_devices unless they are the broadcast device.
+ */
+void clockevents_notify(unsigned long reason, void *arg)
+{
+	struct clock_event_device *dev, *tmp;
+	unsigned long flags;
+	int cpu;
+
+	raw_spin_lock_irqsave(&clockevents_lock, flags);
+	clockevents_do_notify(reason, arg);
+
+	switch (reason) {
+	case CLOCK_EVT_NOTIFY_CPU_DEAD:
+		/*
+		 * Unregister the clock event devices which were
+		 * released from the users in the notify chain.
+		 */
+		list_for_each_entry_safe(dev, tmp, &clockevents_released, list)
+			list_del(&dev->list);
+		/*
+		 * Now check whether the CPU has left unused per cpu devices
+		 */
+		cpu = *((int *)arg);
+		list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) {
+			if (cpumask_test_cpu(cpu, dev->cpumask) &&
+			    cpumask_weight(dev->cpumask) == 1 &&
+			    !tick_is_broadcast_device(dev)) {
+				BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
+				list_del(&dev->list);
+			}
+		}
+		break;
+	default:
+		break;
+	}
+	raw_spin_unlock_irqrestore(&clockevents_lock, flags);
+}
+EXPORT_SYMBOL_GPL(clockevents_notify);
+#endif
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 00000000..8b270063
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,916 @@
+/*
+ * linux/kernel/time/clocksource.c
+ *
+ * This file contains the functions which manage clocksource drivers.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * TODO WishList:
+ * o Allow clocksource drivers to be unregistered
+ */
+
+#include
+#include
+#include
+#include
+#include /* for spin_unlock_irq() using preempt_count() m68k */
+#include
+#include
+/**
+ * timecounter_init - initialize a time counter
+ * @tc: time counter to initialize
+ * @cc: cycle counter that @tc reads from
+ * @start_tstamp: initial value of the nanosecond count
+ *
+ * Reads the cycle counter once so that later reads have a cycle_last
+ * value to measure their delta against.
+ */
+void timecounter_init(struct timecounter *tc,
+		      const struct cyclecounter *cc,
+		      u64 start_tstamp)
+{
+	tc->cc = cc;
+	tc->cycle_last = cc->read(cc);
+	tc->nsec = start_tstamp;
+}
+EXPORT_SYMBOL_GPL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc: Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+	cycle_t cycle_now, cycle_delta;
+	u64 ns_offset;
+
+	/* read cycle counter: */
+	cycle_now = tc->cc->read(tc->cc);
+
+	/*
+	 * calculate the delta since the last timecounter_read_delta();
+	 * masking with the counter mask handles a single wrap-around:
+	 */
+	cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
+
+	/* convert to nanoseconds: */
+	ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
+
+	/* update time stamp of timecounter_read_delta() call: */
+	tc->cycle_last = cycle_now;
+
+	return ns_offset;
+}
+/**
+ * timecounter_read - return the current nanosecond count
+ * @tc: Pointer to time counter
+ *
+ * Adds the nanoseconds elapsed since the previous read to tc->nsec
+ * and returns the updated value.
+ */
+u64 timecounter_read(struct timecounter *tc)
+{
+	u64 nsec;
+
+	/* increment time by nanoseconds since last call */
+	nsec = timecounter_read_delta(tc);
+	nsec += tc->nsec;
+	tc->nsec = nsec;
+
+	return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_read);
+/**
+ * timecounter_cyc2time - convert a cycle time stamp to nanoseconds
+ * @tc: Pointer to time counter
+ * @cycle_tstamp: cycle counter value to convert
+ *
+ * Expresses @cycle_tstamp relative to tc->cycle_last and tc->nsec.
+ * @tc itself is not modified.
+ */
+u64 timecounter_cyc2time(struct timecounter *tc,
+			 cycle_t cycle_tstamp)
+{
+	u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
+	u64 nsec;
+
+	/*
+	 * Instead of always treating cycle_tstamp as more recent
+	 * than tc->cycle_last, detect when it is too far in the
+	 * future and treat it as old time stamp instead.
+	 */
+	if (cycle_delta > tc->cc->mask / 2) {
+		cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
+		nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
+	} else {
+		nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
+	}
+
+	return nsec;
+}
+EXPORT_SYMBOL_GPL(timecounter_cyc2time);
+
+/**
+ * clocks_calc_mult_shift - calculate mult/shift factors for scaled math of clocks
+ * @mult: pointer to mult variable
+ * @shift: pointer to shift variable
+ * @from: frequency to convert from
+ * @to: frequency to convert to
+ * @maxsec: guaranteed runtime conversion range in seconds
+ *
+ * The function evaluates the shift/mult pair for the scaled math
+ * operations of clocksources and clockevents.
+ *
+ * @to and @from are frequency values in HZ. For clock sources @to is
+ * NSEC_PER_SEC == 1GHz and @from is the counter frequency.
For clock
+ * event @to is the counter frequency and @from is NSEC_PER_SEC.
+ *
+ * The @maxsec conversion range argument controls the time frame in
+ * seconds which must be covered by the runtime conversion with the
+ * calculated mult and shift factors. This guarantees that no 64bit
+ * overflow happens when the input value of the conversion is
+ * multiplied with the calculated mult factor. Larger ranges may
+ * reduce the conversion accuracy by chosing smaller mult and shift
+ * factors.
+ */
+void
+clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 maxsec)
+{
+	u64 tmp;
+	u32 sft, sftacc= 32;
+
+	/*
+	 * Calculate the shift factor which is limiting the conversion
+	 * range: sftacc starts at 32 and loses one bit for every
+	 * significant bit of (maxsec * from) above bit 31.
+	 */
+	tmp = ((u64)maxsec * from) >> 32;
+	while (tmp) {
+		tmp >>=1;
+		sftacc--;
+	}
+
+	/*
+	 * Find the conversion shift/mult pair which has the best
+	 * accuracy and fits the maxsec conversion range: try shifts
+	 * from 32 downwards and stop at the first one whose rounded
+	 * mult value fits into sftacc bits.
+	 */
+	for (sft = 32; sft > 0; sft--) {
+		tmp = (u64) to << sft;
+		tmp += from / 2;
+		do_div(tmp, from);
+		if ((tmp >> sftacc) == 0)
+			break;
+	}
+	*mult = tmp;
+	*shift = sft;
+}
+
+/*[Clocksource internal variables]---------
+ * curr_clocksource:
+ *	currently selected clocksource.
+ * clocksource_list:
+ *	linked list with the registered clocksources
+ * clocksource_mutex:
+ *	protects manipulations to curr_clocksource and the clocksource_list
+ * override_name:
+ *	Name of the user-specified clocksource.
+ */
+static struct clocksource *curr_clocksource;
+static LIST_HEAD(clocksource_list);
+static DEFINE_MUTEX(clocksource_mutex);
+static char override_name[32];
+static int finished_booting;
+
+#ifdef CONFIG_CLOCKSOURCE_WATCHDOG
+static void clocksource_watchdog_work(struct work_struct *work);
+
+static LIST_HEAD(watchdog_list);
+static struct clocksource *watchdog;
+static struct timer_list watchdog_timer;
+static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
+static DEFINE_SPINLOCK(watchdog_lock);
+static int watchdog_running;
+
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
+/*
+ * Interval: 0.5sec Threshold: 0.0625s
+ *
+ * WATCHDOG_INTERVAL is in jiffies (HZ / 2), WATCHDOG_THRESHOLD is in
+ * nanoseconds (NSEC_PER_SEC / 16).
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
+
+static void clocksource_watchdog_work(struct work_struct *work)
+{
+	/*
+	 * If kthread_run fails the next watchdog scan over the
+	 * watchdog_list will find the unstable clock again.
+	 */
+	kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+/*
+ * Flag @cs as unstable: it loses the VALID_FOR_HRES and WATCHDOG flags
+ * and gains CLOCK_SOURCE_UNSTABLE. The watchdog work is only scheduled
+ * once finished_booting is set.
+ */
+static void __clocksource_unstable(struct clocksource *cs)
+{
+	cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
+	cs->flags |= CLOCK_SOURCE_UNSTABLE;
+	if (finished_booting)
+		schedule_work(&watchdog_work);
+}
+/*
+ * Log the measured deviation @delta (in ns) of @cs and flag it as
+ * unstable.
+ */
+static void clocksource_unstable(struct clocksource *cs, int64_t delta)
+{
+	printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n",
+	       cs->name, delta);
+	__clocksource_unstable(cs);
+}
+
+/**
+ * clocksource_mark_unstable - mark clocksource unstable via watchdog
+ * @cs: clocksource to be marked unstable
+ *
+ * This function is called instead of clocksource_change_rating from
+ * cpu hotplug code to avoid a deadlock between the clocksource mutex
+ * and the cpu hotplug mutex. It defers the update of the clocksource
+ * to the watchdog thread.
+ */
+void clocksource_mark_unstable(struct clocksource *cs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&watchdog_lock, flags);
+	if (!(cs->flags & CLOCK_SOURCE_UNSTABLE)) {
+		if (list_empty(&cs->wd_list))
+			list_add(&cs->wd_list, &watchdog_list);
+		__clocksource_unstable(cs);
+	}
+	spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+/*
+ * Watchdog timer callback. Every clocksource on watchdog_list is read
+ * together with the watchdog clocksource; the elapsed nanoseconds of
+ * both since the previous run are compared and a clocksource deviating
+ * by more than WATCHDOG_THRESHOLD is marked unstable. The timer then
+ * re-arms itself on the next online CPU.
+ */
+static void clocksource_watchdog(unsigned long data)
+{
+	struct clocksource *cs;
+	cycle_t csnow, wdnow;
+	int64_t wd_nsec, cs_nsec;
+	int next_cpu;
+
+	spin_lock(&watchdog_lock);
+	if (!watchdog_running)
+		goto out;
+
+	list_for_each_entry(cs, &watchdog_list, wd_list) {
+
+		/* Clocksource already marked unstable? */
+		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+			if (finished_booting)
+				schedule_work(&watchdog_work);
+			continue;
+		}
+
+		local_irq_disable();
+		csnow = cs->read(cs);
+		wdnow = watchdog->read(watchdog);
+		local_irq_enable();
+
+		/*
+		 * Clocksource initialized ? If not, only record the
+		 * first pair of readings; there is no delta to check yet.
+		 */
+		if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
+			cs->flags |= CLOCK_SOURCE_WATCHDOG;
+			cs->wd_last = wdnow;
+			cs->cs_last = csnow;
+			continue;
+		}
+
+		wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
+					     watchdog->mult, watchdog->shift);
+
+		cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
+					     cs->mask, cs->mult, cs->shift);
+		cs->cs_last = csnow;
+		cs->wd_last = wdnow;
+
+		/* Check the deviation from the watchdog clocksource. */
+		if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
+			clocksource_unstable(cs, cs_nsec - wd_nsec);
+			continue;
+		}
+
+		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) &&
+		    (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) &&
+		    (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) {
+			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+			/*
+			 * We just marked the clocksource as highres-capable,
+			 * notify the rest of the system as well so that we
+			 * transition into high-res mode:
+			 */
+			tick_clock_notify();
+		}
+	}
+
+	/*
+	 * Cycle through CPUs to check if the CPUs stay synchronized
+	 * to each other.
+	 */
+	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
+	if (next_cpu >= nr_cpu_ids)
+		next_cpu = cpumask_first(cpu_online_mask);
+	watchdog_timer.expires += WATCHDOG_INTERVAL;
+	add_timer_on(&watchdog_timer, next_cpu);
+out:
+	spin_unlock(&watchdog_lock);
+}
+/*
+ * Start the watchdog timer unless it is already running, there is no
+ * watchdog clocksource, or there is nothing on watchdog_list.
+ */
+static inline void clocksource_start_watchdog(void)
+{
+	if (watchdog_running || !watchdog || list_empty(&watchdog_list))
+		return;
+	init_timer(&watchdog_timer);
+	watchdog_timer.function = clocksource_watchdog;
+	watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
+	add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
+	watchdog_running = 1;
+}
+/*
+ * Stop the watchdog timer when it is running and either the watchdog
+ * clocksource is gone or watchdog_list is empty.
+ */
+static inline void clocksource_stop_watchdog(void)
+{
+	if (!watchdog_running || (watchdog && !list_empty(&watchdog_list)))
+		return;
+	del_timer(&watchdog_timer);
+	watchdog_running = 0;
+}
+/*
+ * Clear CLOCK_SOURCE_WATCHDOG on every watched clocksource so that the
+ * next watchdog run records fresh readings instead of comparing them.
+ */
+static inline void clocksource_reset_watchdog(void)
+{
+	struct clocksource *cs;
+
+	list_for_each_entry(cs, &watchdog_list, wd_list)
+		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
+static void clocksource_resume_watchdog(void)
+{
+	unsigned long flags;
+
+	/*
+	 * We use trylock here to avoid a potential dead lock when
+	 * kgdb calls this code after the kernel has been stopped with
+	 * watchdog_lock held. When watchdog_lock is held we just
+	 * return and accept, that the watchdog might trigger and mark
+	 * the monitored clock source (usually TSC) unstable.
+	 *
+	 * This does not affect the other caller clocksource_resume()
+	 * because at this point the kernel is UP, interrupts are
+	 * disabled and nothing can hold watchdog_lock.
+	 */
+	if (!spin_trylock_irqsave(&watchdog_lock, flags))
+		return;
+	clocksource_reset_watchdog();
+	spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&watchdog_lock, flags);
+	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+		/* cs is a clocksource to be watched. */
+		list_add(&cs->wd_list, &watchdog_list);
+		cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+	} else {
+		/* cs is a watchdog. */
+		if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+			cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+		/* Pick the best watchdog. */
+		if (!watchdog || cs->rating > watchdog->rating) {
+			watchdog = cs;
+			/* Reset watchdog cycles */
+			clocksource_reset_watchdog();
+		}
+	}
+	/* Check if the watchdog timer needs to be started. */
+	clocksource_start_watchdog();
+	spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+
+static void clocksource_dequeue_watchdog(struct clocksource *cs)
+{
+	struct clocksource *tmp;
+	unsigned long flags;
+
+	spin_lock_irqsave(&watchdog_lock, flags);
+	if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) {
+		/* cs is a watched clocksource. */
+		list_del_init(&cs->wd_list);
+	} else if (cs == watchdog) {
+		/* Reset watchdog cycles */
+		clocksource_reset_watchdog();
+		/*
+		 * Current watchdog is removed. Find an alternative: the
+		 * highest rated remaining clocksource that does not
+		 * itself need verification.
+		 */
+		watchdog = NULL;
+		list_for_each_entry(tmp, &clocksource_list, list) {
+			if (tmp == cs || tmp->flags & CLOCK_SOURCE_MUST_VERIFY)
+				continue;
+			if (!watchdog || tmp->rating > watchdog->rating)
+				watchdog = tmp;
+		}
+	}
+	cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+	/* Check if the watchdog timer needs to be stopped. */
+	clocksource_stop_watchdog();
+	spin_unlock_irqrestore(&watchdog_lock, flags);
+}
+/*
+ * Thread started from the watchdog work: moves every clocksource that
+ * is flagged unstable off watchdog_list and sets its rating to 0. The
+ * rating change is done with clocksource_mutex held but outside of
+ * watchdog_lock.
+ */
+static int clocksource_watchdog_kthread(void *data)
+{
+	struct clocksource *cs, *tmp;
+	unsigned long flags;
+	LIST_HEAD(unstable);
+
+	mutex_lock(&clocksource_mutex);
+	spin_lock_irqsave(&watchdog_lock, flags);
+	list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list)
+		if (cs->flags & CLOCK_SOURCE_UNSTABLE) {
+			list_del_init(&cs->wd_list);
+			list_add(&cs->wd_list, &unstable);
+		}
+	/* Check if the watchdog timer needs to be stopped. */
+	clocksource_stop_watchdog();
+	spin_unlock_irqrestore(&watchdog_lock, flags);
+
+	/* Needs to be done outside of watchdog lock */
+	list_for_each_entry_safe(cs, tmp, &unstable, wd_list) {
+		list_del_init(&cs->wd_list);
+		__clocksource_change_rating(cs, 0);
+	}
+	mutex_unlock(&clocksource_mutex);
+	return 0;
+}
+
+#else /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+static void clocksource_enqueue_watchdog(struct clocksource *cs)
+{
+	if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
+		cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
+}
+
+static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
+static inline void clocksource_resume_watchdog(void) { }
+static inline int clocksource_watchdog_kthread(void *data) { return 0; }
+
+#endif /* CONFIG_CLOCKSOURCE_WATCHDOG */
+
+/**
+ * clocksource_suspend - suspend the clocksource(s)
+ *
+ * Calls the suspend callback of every registered clocksource that has
+ * one, walking clocksource_list in reverse order.
+ */
+void clocksource_suspend(void)
+{
+	struct clocksource *cs;
+
+	list_for_each_entry_reverse(cs, &clocksource_list, list)
+		if (cs->suspend)
+			cs->suspend(cs);
+}
+
+/**
+ * clocksource_resume - resume the clocksource(s)
+ *
+ * Calls the resume callback of every registered clocksource that has
+ * one, in list order, and then resets the watchdog state.
+ */
+void clocksource_resume(void)
+{
+	struct clocksource *cs;
+
+	list_for_each_entry(cs, &clocksource_list, list)
+		if (cs->resume)
+			cs->resume(cs);
+
+	clocksource_resume_watchdog();
+}
+
+/**
+ * clocksource_touch_watchdog - Update watchdog
+ *
+ * Update the watchdog after exception contexts such as kgdb so as not
+ * to incorrectly trip the watchdog. This might fail when the kernel
+ * was stopped in code which holds watchdog_lock.
+ */
+void clocksource_touch_watchdog(void)
+{
+	clocksource_resume_watchdog();
+}
+
+/**
+ * clocksource_max_deferment - Returns max time the clocksource can be deferred
+ * @cs: Pointer to clocksource
+ *
+ */
+static u64 clocksource_max_deferment(struct clocksource *cs)
+{
+	u64 max_nsecs, max_cycles;
+
+	/*
+	 * Calculate the maximum number of cycles that we can pass to the
+	 * cyc2ns function without overflowing a 64-bit signed result.
The + * maximum number of cycles is equal to ULLONG_MAX/cs->mult which + * is equivalent to the below. + * max_cycles < (2^63)/cs->mult + * max_cycles < 2^(log2((2^63)/cs->mult)) + * max_cycles < 2^(log2(2^63) - log2(cs->mult)) + * max_cycles < 2^(63 - log2(cs->mult)) + * max_cycles < 1 << (63 - log2(cs->mult)) + * Please note that we add 1 to the result of the log2 to account for + * any rounding errors, ensure the above inequality is satisfied and + * no overflow will occur. + */ + max_cycles = 1ULL << (63 - (ilog2(cs->mult) + 1)); + + /* + * The actual maximum number of cycles we can defer the clocksource is + * determined by the minimum of max_cycles and cs->mask. + */ + max_cycles = min_t(u64, max_cycles, (u64) cs->mask); + max_nsecs = clocksource_cyc2ns(max_cycles, cs->mult, cs->shift); + + /* + * To ensure that the clocksource does not wrap whilst we are idle, + * limit the time the clocksource can be deferred by 12.5%. Please + * note a margin of 12.5% is used because this can be computed with + * a shift, versus say 10% which would require division. + */ + return max_nsecs - (max_nsecs >> 3); +} + +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET + +/** + * clocksource_select - Select the best clocksource available + * + * Private function. Must hold clocksource_mutex when called. + * + * Select the clocksource with the best rating, or the clocksource, + * which is selected by userspace override. + */ +static void clocksource_select(void) +{ + struct clocksource *best, *cs; + + if (!finished_booting || list_empty(&clocksource_list)) + return; + /* First clocksource on the list has the best rating. */ + best = list_first_entry(&clocksource_list, struct clocksource, list); + /* Check for the override clocksource. 
*/ + list_for_each_entry(cs, &clocksource_list, list) { + if (strcmp(cs->name, override_name) != 0) + continue; + /* + * Check to make sure we don't switch to a non-highres + * capable clocksource if the tick code is in oneshot + * mode (highres or nohz) + */ + if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && + tick_oneshot_mode_active()) { + /* Override clocksource cannot be used. */ + printk(KERN_WARNING "Override clocksource %s is not " + "HRT compatible. Cannot switch while in " + "HRT/NOHZ mode\n", cs->name); + override_name[0] = 0; + } else + /* Override clocksource can be used. */ + best = cs; + break; + } + if (curr_clocksource != best) { + printk(KERN_INFO "Switching to clocksource %s\n", best->name); + curr_clocksource = best; + timekeeping_notify(curr_clocksource); + } +} + +#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ + +static inline void clocksource_select(void) { } + +#endif + +/* + * clocksource_done_booting - Called near the end of core bootup + * + * Hack to avoid lots of clocksource churn at boot time. + * We use fs_initcall because we want this to start before + * device_initcall but after subsys_initcall. 
+ */
+static int __init clocksource_done_booting(void)
+{
+	mutex_lock(&clocksource_mutex);
+	curr_clocksource = clocksource_default_clock();
+	mutex_unlock(&clocksource_mutex);
+
+	finished_booting = 1;
+
+	/*
+	 * Run the watchdog first to eliminate unstable clock sources
+	 */
+	clocksource_watchdog_kthread(NULL);
+
+	mutex_lock(&clocksource_mutex);
+	clocksource_select();
+	mutex_unlock(&clocksource_mutex);
+	return 0;
+}
+fs_initcall(clocksource_done_booting);
+
+/*
+ * Enqueue the clocksource sorted by rating
+ *
+ * The list is kept in descending rating order.  The new entry is inserted
+ * after the last existing entry whose rating is greater than or equal to
+ * its own, or at the list head when there is no such entry.
+ */
+static void clocksource_enqueue(struct clocksource *cs)
+{
+	struct list_head *entry = &clocksource_list;
+	struct clocksource *tmp;
+
+	list_for_each_entry(tmp, &clocksource_list, list)
+		/* Keep track of the place, where to insert */
+		if (tmp->rating >= cs->rating)
+			entry = &tmp->list;
+	list_add(&cs->list, entry);
+}
+
+/**
+ * __clocksource_updatefreq_scale - Used to update clocksource with new freq
+ * @cs:		clocksource to be updated
+ * @scale:	Scale factor multiplied against freq to get clocksource hz
+ * @freq:	clocksource frequency (cycles per second) divided by scale
+ *
+ * Recomputes cs->mult, cs->shift and cs->max_idle_ns.
+ *
+ * This should only be called from the clocksource->enable() method.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions.
+ */
+void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+	u64 sec;
+
+	/*
+	 * Calc the maximum number of seconds which we can run before
+	 * wrapping around. For clocksources which have a mask > 32bit
+	 * we need to limit the max sleep time to have a good
+	 * conversion precision. 10 minutes is still a reasonable
+	 * amount. That results in a shift value of 24 for a
+	 * clocksource with mask >= 40bit and f >= 4GHz. That maps to
+	 * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
+	 * margin as we do in clocksource_max_deferment()
+	 *
+	 * Note: freq and scale are used as divisors below; this function
+	 * does not check them for zero.
+	 */
+	sec = (cs->mask - (cs->mask >> 3));
+	do_div(sec, freq);
+	do_div(sec, scale);
+	if (!sec)
+		sec = 1;
+	else if (sec > 600 && cs->mask > UINT_MAX)
+		sec = 600;
+
+	clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
+			       NSEC_PER_SEC / scale, sec * scale);
+	cs->max_idle_ns = clocksource_max_deferment(cs);
+}
+EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
+
+/**
+ * __clocksource_register_scale - Used to install new clocksources
+ * @cs:		clocksource to be registered
+ * @scale:	Scale factor multiplied against freq to get clocksource hz
+ * @freq:	clocksource frequency (cycles per second) divided by scale
+ *
+ * Returns zero; this function has no failure path.
+ *
+ * This *SHOULD NOT* be called directly! Please use the
+ * clocksource_register_hz() or clocksource_register_khz helper functions.
+ */
+int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
+{
+
+	/* Initialize mult/shift and max_idle_ns */
+	__clocksource_updatefreq_scale(cs, scale, freq);
+
+	/* Add clocksource to the clocksource list */
+	mutex_lock(&clocksource_mutex);
+	clocksource_enqueue(cs);
+	clocksource_enqueue_watchdog(cs);
+	clocksource_select();
+	mutex_unlock(&clocksource_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(__clocksource_register_scale);
+
+
+/**
+ * clocksource_register - Used to install new clocksources
+ * @cs:		clocksource to be registered
+ *
+ * Returns zero; this function has no failure path.
+ */
+int clocksource_register(struct clocksource *cs)
+{
+	/*
+	 * calculate max idle time permitted for this clocksource; unlike
+	 * __clocksource_register_scale(), cs->mult and cs->shift are used
+	 * as supplied by the caller.
+	 */
+	cs->max_idle_ns = clocksource_max_deferment(cs);
+
+	mutex_lock(&clocksource_mutex);
+	clocksource_enqueue(cs);
+	clocksource_enqueue_watchdog(cs);
+	clocksource_select();
+	mutex_unlock(&clocksource_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(clocksource_register);
+
+/*
+ * Re-sort cs into clocksource_list under its new rating and re-run the
+ * selection.  Does not take clocksource_mutex itself; the wrapper
+ * clocksource_change_rating() takes it around this call.
+ */
+static void __clocksource_change_rating(struct clocksource *cs, int rating)
+{
+	list_del(&cs->list);
+	cs->rating = rating;
+	clocksource_enqueue(cs);
+	clocksource_select();
+}
+
+/**
+ * clocksource_change_rating - Change the rating of a registered clocksource
+ * @cs:		clocksource whose rating changes
+ * @rating:	new rating value
+ */
+void clocksource_change_rating(struct clocksource *cs, int rating)
+{
+	mutex_lock(&clocksource_mutex);
+	__clocksource_change_rating(cs, rating);
+	mutex_unlock(&clocksource_mutex);
+}
+EXPORT_SYMBOL(clocksource_change_rating);
+
+/**
+ * clocksource_unregister - remove a registered clocksource
+ * @cs:		clocksource to be removed
+ *
+ * Detaches @cs from the watchdog, unlinks it from clocksource_list and
+ * re-runs the clocksource selection.
+ */
+void clocksource_unregister(struct clocksource *cs)
+{
+	mutex_lock(&clocksource_mutex);
+	clocksource_dequeue_watchdog(cs);
+	list_del(&cs->list);
+	clocksource_select();
+	mutex_unlock(&clocksource_mutex);
+}
+EXPORT_SYMBOL(clocksource_unregister);
+
+#ifdef CONFIG_SYSFS
+/**
+ * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * @dev:	unused
+ * @buf:	char buffer to be filled with clocksource list
+ *
+ * Provides sysfs interface for listing current clocksource.
+ */ +static ssize_t +sysfs_show_current_clocksources(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + ssize_t count = 0; + + mutex_lock(&clocksource_mutex); + count = snprintf(buf, PAGE_SIZE, "%s\n", curr_clocksource->name); + mutex_unlock(&clocksource_mutex); + + return count; +} + +/** + * sysfs_override_clocksource - interface for manually overriding clocksource + * @dev: unused + * @buf: name of override clocksource + * @count: length of buffer + * + * Takes input from sysfs interface for manually overriding the default + * clocksource selection. + */ +static ssize_t sysfs_override_clocksource(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + size_t ret = count; + + /* strings from sysfs write are not 0 terminated! */ + if (count >= sizeof(override_name)) + return -EINVAL; + + /* strip of \n: */ + if (buf[count-1] == '\n') + count--; + + mutex_lock(&clocksource_mutex); + + if (count > 0) + memcpy(override_name, buf, count); + override_name[count] = 0; + clocksource_select(); + + mutex_unlock(&clocksource_mutex); + + return ret; +} + +/** + * sysfs_show_available_clocksources - sysfs interface for listing clocksource + * @dev: unused + * @buf: char buffer to be filled with clocksource list + * + * Provides sysfs interface for listing registered clocksources + */ +static ssize_t +sysfs_show_available_clocksources(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) +{ + struct clocksource *src; + ssize_t count = 0; + + mutex_lock(&clocksource_mutex); + list_for_each_entry(src, &clocksource_list, list) { + /* + * Don't show non-HRES clocksource if the tick code is + * in one shot mode (highres=on or nohz=on) + */ + if (!tick_oneshot_mode_active() || + (src->flags & CLOCK_SOURCE_VALID_FOR_HRES)) + count += snprintf(buf + count, + max((ssize_t)PAGE_SIZE - count, (ssize_t)0), + "%s ", src->name); + } + mutex_unlock(&clocksource_mutex); + + count += snprintf(buf + count, + 
max((ssize_t)PAGE_SIZE - count, (ssize_t)0), "\n"); + + return count; +} + +/* + * Sysfs setup bits: + */ +static SYSDEV_ATTR(current_clocksource, 0644, sysfs_show_current_clocksources, + sysfs_override_clocksource); + +static SYSDEV_ATTR(available_clocksource, 0444, + sysfs_show_available_clocksources, NULL); + +static struct sysdev_class clocksource_sysclass = { + .name = "clocksource", +}; + +static struct sys_device device_clocksource = { + .id = 0, + .cls = &clocksource_sysclass, +}; + +static int __init init_clocksource_sysfs(void) +{ + int error = sysdev_class_register(&clocksource_sysclass); + + if (!error) + error = sysdev_register(&device_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_current_clocksource); + if (!error) + error = sysdev_create_file( + &device_clocksource, + &attr_available_clocksource); + return error; +} + +device_initcall(init_clocksource_sysfs); +#endif /* CONFIG_SYSFS */ + +/** + * boot_override_clocksource - boot clock override + * @str: override name + * + * Takes a clocksource= boot argument and uses it + * as the clocksource override name. + */ +static int __init boot_override_clocksource(char* str) +{ + mutex_lock(&clocksource_mutex); + if (str) + strlcpy(override_name, str, sizeof(override_name)); + mutex_unlock(&clocksource_mutex); + return 1; +} + +__setup("clocksource=", boot_override_clocksource); + +/** + * boot_override_clock - Compatibility layer for deprecated boot option + * @str: override name + * + * DEPRECATED! Takes a clock= boot argument and uses it + * as the clocksource override name + */ +static int __init boot_override_clock(char* str) +{ + if (!strcmp(str, "pmtmr")) { + printk("Warning: clock=pmtmr is deprecated. " + "Use clocksource=acpi_pm.\n"); + return boot_override_clocksource("acpi_pm"); + } + printk("Warning! clock= boot option is deprecated. 
" + "Use clocksource=xyz\n"); + return boot_override_clocksource(str); +} + +__setup("clock=", boot_override_clock); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 00000000..a470154e --- /dev/null +++ b/kernel/time/jiffies.c @@ -0,0 +1,97 @@ +/*********************************************************************** +* linux/kernel/time/jiffies.c +* +* This file contains the jiffies based clocksource. +* +* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) +* +* This program is free software; you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation; either version 2 of the License, or +* (at your option) any later version. +* +* This program is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with this program; if not, write to the Free Software +* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +* +************************************************************************/ +#include +#include +#include +#include + +#include "tick-internal.h" + +/* The Jiffies based clocksource is the lowest common + * denominator clock source which should function on + * all systems. It has the same coarse resolution as + * the timer interrupt frequency HZ and it suffers + * inaccuracies caused by missed or lost timer + * interrupts and the inability for the timer + * interrupt hardware to accuratly tick at the + * requested HZ value. It is also not recommended + * for "tick-less" systems. + */ +#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) + +/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier + * conversion, the .shift value could be zero. 
However + * this would make NTP adjustments impossible as they are + * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to + * shift both the nominator and denominator the same + * amount, and give ntp adjustments in units of 1/2^8 + * + * The value 8 is somewhat carefully chosen, as anything + * larger can result in overflows. NSEC_PER_JIFFY grows as + * HZ shrinks, so values greater than 8 overflow 32bits when + * HZ=100. + */ +#define JIFFIES_SHIFT 8 + +static cycle_t jiffies_read(struct clocksource *cs) +{ + return (cycle_t) jiffies; +} + +struct clocksource clocksource_jiffies = { + .name = "jiffies", + .rating = 1, /* lowest valid rating*/ + .read = jiffies_read, + .mask = 0xffffffff, /*32bits*/ + .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ + .shift = JIFFIES_SHIFT, +}; + +#if (BITS_PER_LONG < 64) +u64 get_jiffies_64(void) +{ + unsigned long seq; + u64 ret; + + do { + seq = read_seqbegin(&xtime_lock); + ret = jiffies_64; + } while (read_seqretry(&xtime_lock, seq)); + return ret; +} +EXPORT_SYMBOL(get_jiffies_64); +#endif + +EXPORT_SYMBOL(jiffies); + +static int __init init_jiffies_clocksource(void) +{ + return clocksource_register(&clocksource_jiffies); +} + +core_initcall(init_jiffies_clocksource); + +struct clocksource * __init __weak clocksource_default_clock(void) +{ + return &clocksource_jiffies; +} diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c new file mode 100644 index 00000000..4b85a7a7 --- /dev/null +++ b/kernel/time/ntp.c @@ -0,0 +1,972 @@ +/* + * NTP state machine interfaces and logic. + * + * This code was mainly moved from kernel/timer.c and kernel/time.c + * Please see those files for relevant copyright info and historical + * changelogs. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tick-internal.h" + +/* + * NTP timekeeping variables: + */ + +/* USER_HZ period (usecs): */ +unsigned long tick_usec = TICK_USEC; + +/* ACTHZ period (nsecs): */ +unsigned long tick_nsec; + +u64 tick_length; +static u64 tick_length_base; + +static struct hrtimer leap_timer; + +#define MAX_TICKADJ 500LL /* usecs */ +#define MAX_TICKADJ_SCALED \ + (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) + +/* + * phase-lock loop variables + */ + +/* + * clock synchronization status + * + * (TIME_ERROR prevents overwriting the CMOS clock) + */ +static int time_state = TIME_OK; + +/* clock status bits: */ +int time_status = STA_UNSYNC; + +/* TAI offset (secs): */ +static long time_tai; + +/* time adjustment (nsecs): */ +static s64 time_offset; + +/* pll time constant: */ +static long time_constant = 2; + +/* maximum error (usecs): */ +static long time_maxerror = NTP_PHASE_LIMIT; + +/* estimated error (usecs): */ +static long time_esterror = NTP_PHASE_LIMIT; + +/* frequency offset (scaled nsecs/secs): */ +static s64 time_freq; + +/* time at last adjustment (secs): */ +static long time_reftime; + +static long time_adjust; + +/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ +static s64 ntp_tick_adj; + +#ifdef CONFIG_NTP_PPS + +/* + * The following variables are used when a pulse-per-second (PPS) signal + * is available. They establish the engineering parameters of the clock + * discipline loop when controlled by the PPS signal. 
+ */ +#define PPS_VALID 10 /* PPS signal watchdog max (s) */ +#define PPS_POPCORN 4 /* popcorn spike threshold (shift) */ +#define PPS_INTMIN 2 /* min freq interval (s) (shift) */ +#define PPS_INTMAX 8 /* max freq interval (s) (shift) */ +#define PPS_INTCOUNT 4 /* number of consecutive good intervals to + increase pps_shift or consecutive bad + intervals to decrease it */ +#define PPS_MAXWANDER 100000 /* max PPS freq wander (ns/s) */ + +static int pps_valid; /* signal watchdog counter */ +static long pps_tf[3]; /* phase median filter */ +static long pps_jitter; /* current jitter (ns) */ +static struct timespec pps_fbase; /* beginning of the last freq interval */ +static int pps_shift; /* current interval duration (s) (shift) */ +static int pps_intcnt; /* interval counter */ +static s64 pps_freq; /* frequency offset (scaled ns/s) */ +static long pps_stabil; /* current stability (scaled ns/s) */ + +/* + * PPS signal quality monitors + */ +static long pps_calcnt; /* calibration intervals */ +static long pps_jitcnt; /* jitter limit exceeded */ +static long pps_stbcnt; /* stability limit exceeded */ +static long pps_errcnt; /* calibration errors */ + + +/* PPS kernel consumer compensates the whole phase error immediately. + * Otherwise, reduce the offset by a fixed factor times the time constant. 
+ */ +static inline s64 ntp_offset_chunk(s64 offset) +{ + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + return offset; + else + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) +{ + /* the PPS calibration interval may end + surprisingly early */ + pps_shift = PPS_INTMIN; + pps_intcnt = 0; +} + +/** + * pps_clear - Clears the PPS state variables + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_clear(void) +{ + pps_reset_freq_interval(); + pps_tf[0] = 0; + pps_tf[1] = 0; + pps_tf[2] = 0; + pps_fbase.tv_sec = pps_fbase.tv_nsec = 0; + pps_freq = 0; +} + +/* Decrease pps_valid to indicate that another second has passed since + * the last PPS signal. When it reaches 0, indicate that PPS signal is + * missing. + * + * Must be called while holding a write on the xtime_lock + */ +static inline void pps_dec_valid(void) +{ + if (pps_valid > 0) + pps_valid--; + else { + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + pps_clear(); + } +} + +static inline void pps_set_freq(s64 freq) +{ + pps_freq = freq; +} + +static inline int is_error_status(int status) +{ + return (time_status & (STA_UNSYNC|STA_CLOCKERR)) + /* PPS signal lost when either PPS time or + * PPS frequency synchronization requested + */ + || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) + && !(time_status & STA_PPSSIGNAL)) + /* PPS jitter exceeded when + * PPS time synchronization requested */ + || ((time_status & (STA_PPSTIME|STA_PPSJITTER)) + == (STA_PPSTIME|STA_PPSJITTER)) + /* PPS wander exceeded or calibration error when + * PPS frequency synchronization requested + */ + || ((time_status & STA_PPSFREQ) + && (time_status & (STA_PPSWANDER|STA_PPSERROR))); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + txc->ppsfreq = shift_right((pps_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->jitter = pps_jitter; + if 
(!(time_status & STA_NANO)) + txc->jitter /= NSEC_PER_USEC; + txc->shift = pps_shift; + txc->stabil = pps_stabil; + txc->jitcnt = pps_jitcnt; + txc->calcnt = pps_calcnt; + txc->errcnt = pps_errcnt; + txc->stbcnt = pps_stbcnt; +} + +#else /* !CONFIG_NTP_PPS */ + +static inline s64 ntp_offset_chunk(s64 offset) +{ + return shift_right(offset, SHIFT_PLL + time_constant); +} + +static inline void pps_reset_freq_interval(void) {} +static inline void pps_clear(void) {} +static inline void pps_dec_valid(void) {} +static inline void pps_set_freq(s64 freq) {} + +static inline int is_error_status(int status) +{ + return status & (STA_UNSYNC|STA_CLOCKERR); +} + +static inline void pps_fill_timex(struct timex *txc) +{ + /* PPS is not implemented, so these are zero */ + txc->ppsfreq = 0; + txc->jitter = 0; + txc->shift = 0; + txc->stabil = 0; + txc->jitcnt = 0; + txc->calcnt = 0; + txc->errcnt = 0; + txc->stbcnt = 0; +} + +#endif /* CONFIG_NTP_PPS */ + +/* + * NTP methods: + */ + +/* + * Update (tick_length, tick_length_base, tick_nsec), based + * on (tick_usec, ntp_tick_adj, time_freq): + */ +static void ntp_update_frequency(void) +{ + u64 second_length; + u64 new_base; + + second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) + << NTP_SCALE_SHIFT; + + second_length += ntp_tick_adj; + second_length += time_freq; + + tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT; + new_base = div_u64(second_length, NTP_INTERVAL_FREQ); + + /* + * Don't wait for the next second_overflow, apply + * the change to the tick length immediately: + */ + tick_length += new_base - tick_length_base; + tick_length_base = new_base; +} + +static inline s64 ntp_update_offset_fll(s64 offset64, long secs) +{ + time_status &= ~STA_MODE; + + if (secs < MINSEC) + return 0; + + if (!(time_status & STA_FLL) && (secs <= MAXSEC)) + return 0; + + time_status |= STA_MODE; + + return div64_long(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs); +} + +static void ntp_update_offset(long offset) +{ + s64 
freq_adj; + s64 offset64; + long secs; + + if (!(time_status & STA_PLL)) + return; + + if (!(time_status & STA_NANO)) + offset *= NSEC_PER_USEC; + + /* + * Scale the phase adjustment and + * clamp to the operating range. + */ + offset = min(offset, MAXPHASE); + offset = max(offset, -MAXPHASE); + + /* + * Select how the frequency is to be controlled + * and in which mode (PLL or FLL). + */ + secs = get_seconds() - time_reftime; + if (unlikely(time_status & STA_FREQHOLD)) + secs = 0; + + time_reftime = get_seconds(); + + offset64 = offset; + freq_adj = ntp_update_offset_fll(offset64, secs); + + /* + * Clamp update interval to reduce PLL gain with low + * sampling rate (e.g. intermittent network connection) + * to avoid instability. + */ + if (unlikely(secs > 1 << (SHIFT_PLL + 1 + time_constant))) + secs = 1 << (SHIFT_PLL + 1 + time_constant); + + freq_adj += (offset64 * secs) << + (NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant)); + + freq_adj = min(freq_adj + time_freq, MAXFREQ_SCALED); + + time_freq = max(freq_adj, -MAXFREQ_SCALED); + + time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ); +} + +/** + * ntp_clear - Clears the NTP state variables + * + * Must be called while holding a write on the xtime_lock + */ +void ntp_clear(void) +{ + time_adjust = 0; /* stop active adjtime() */ + time_status |= STA_UNSYNC; + time_maxerror = NTP_PHASE_LIMIT; + time_esterror = NTP_PHASE_LIMIT; + + ntp_update_frequency(); + + tick_length = tick_length_base; + time_offset = 0; + + /* Clear PPS state variables */ + pps_clear(); +} + +/* + * Leap second processing. If in leap-insert state at the end of the + * day, the system clock is set back one second; if in leap-delete + * state, the system clock is set ahead one second. 
+ */
+static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
+{
+	/* Only the TIME_INS case re-arms the timer (see below). */
+	enum hrtimer_restart res = HRTIMER_NORESTART;
+
+	write_seqlock(&xtime_lock);
+
+	switch (time_state) {
+	case TIME_OK:
+		break;
+	case TIME_INS:
+		timekeeping_leap_insert(-1);
+		time_state = TIME_OOP;
+		printk(KERN_NOTICE
+			"Clock: inserting leap second 23:59:60 UTC\n");
+		/* fire again one second later to leave the TIME_OOP state */
+		hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
+		res = HRTIMER_RESTART;
+		break;
+	case TIME_DEL:
+		timekeeping_leap_insert(1);
+		time_tai--;
+		time_state = TIME_WAIT;
+		printk(KERN_NOTICE
+			"Clock: deleting leap second 23:59:59 UTC\n");
+		break;
+	case TIME_OOP:
+		time_tai++;
+		time_state = TIME_WAIT;
+		/* fall through */
+	case TIME_WAIT:
+		/* stay in TIME_WAIT until both STA_INS and STA_DEL are clear */
+		if (!(time_status & (STA_INS | STA_DEL)))
+			time_state = TIME_OK;
+		break;
+	}
+
+	write_sequnlock(&xtime_lock);
+
+	return res;
+}
+
+/*
+ * this routine handles the overflow of the microsecond field
+ *
+ * The tricky bits of code to handle the accurate clock support
+ * were provided by Dave Mills (Mills@UDEL.EDU) of NTP fame.
+ * They were originally developed for SUN and DEC kernels.
+ * All the kudos should go to Dave for this stuff.
+ */
+void second_overflow(void)
+{
+	s64 delta;
+
+	/* Bump the maxerror field */
+	time_maxerror += MAXFREQ / NSEC_PER_USEC;
+	if (time_maxerror > NTP_PHASE_LIMIT) {
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_status |= STA_UNSYNC;
+	}
+
+	/* Compute the phase adjustment for the next second */
+	tick_length	 = tick_length_base;
+
+	delta		 = ntp_offset_chunk(time_offset);
+	time_offset	-= delta;
+	tick_length	+= delta;
+
+	/* Check PPS signal */
+	pps_dec_valid();
+
+	if (!time_adjust)
+		return;
+
+	/* Apply at most MAX_TICKADJ usecs of a pending adjtime() per call. */
+	if (time_adjust > MAX_TICKADJ) {
+		time_adjust -= MAX_TICKADJ;
+		tick_length += MAX_TICKADJ_SCALED;
+		return;
+	}
+
+	if (time_adjust < -MAX_TICKADJ) {
+		time_adjust += MAX_TICKADJ;
+		tick_length -= MAX_TICKADJ_SCALED;
+		return;
+	}
+
+	/* The remainder fits into one step: apply it and finish. */
+	tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
+							 << NTP_SCALE_SHIFT;
+	time_adjust = 0;
+}
+
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+
+/* Disable the cmos update - used by virtualization and embedded */
+int no_sync_cmos_clock  __read_mostly;
+
+static void sync_cmos_clock(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
+
+static void sync_cmos_clock(struct work_struct *work)
+{
+	struct timespec now, next;
+	int fail = 1;
+
+	/*
+	 * If we have an externally synchronized Linux clock, then update
+	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
+	 * called as close as possible to 500 ms before the new second starts.
+	 * This code is run on a timer.  If the clock is set, that timer
+	 * may not expire at the correct time.  Thus, we adjust...
+	 */
+	if (!ntp_synced()) {
+		/*
+		 * Not synced, exit, do not restart a timer (if one is
+		 * running, let it run out).
+		 */
+		return;
+	}
+
+	getnstimeofday(&now);
+	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
+		fail = update_persistent_clock(now);
+
+	next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
+	if (next.tv_nsec <= 0)
+		next.tv_nsec += NSEC_PER_SEC;
+
+	/* After a successful update wait 659 more seconds, else retry soon. */
+	if (!fail)
+		next.tv_sec = 659;
+	else
+		next.tv_sec = 0;
+
+	if (next.tv_nsec >= NSEC_PER_SEC) {
+		next.tv_sec++;
+		next.tv_nsec -= NSEC_PER_SEC;
+	}
+	schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
+}
+
+static void notify_cmos_timer(void)
+{
+	if (!no_sync_cmos_clock)
+		schedule_delayed_work(&sync_cmos_work, 0);
+}
+
+#else
+static inline void notify_cmos_timer(void) { }
+#endif
+
+/*
+ * Start the leap seconds timer:
+ *
+ * Sets time_state from the STA_INS/STA_DEL bits (STA_INS is checked
+ * first) and arms leap_timer at an absolute expiry computed from
+ * ts->tv_sec and the 86400 second day length.  Does nothing when
+ * neither bit is set.
+ */
+static inline void ntp_start_leap_timer(struct timespec *ts)
+{
+	long now = ts->tv_sec;
+
+	if (time_status & STA_INS) {
+		time_state = TIME_INS;
+		now += 86400 - now % 86400;
+		hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
+
+		return;
+	}
+
+	if (time_status & STA_DEL) {
+		time_state = TIME_DEL;
+		now += 86400 - (now + 1) % 86400;
+		hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
+	}
+}
+
+/*
+ * Propagate a new txc->status value into the NTP state:
+ */
+static inline void process_adj_status(struct timex *txc, struct timespec *ts)
+{
+	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
+		time_state = TIME_OK;
+		time_status = STA_UNSYNC;
+		/* restart PPS frequency calibration */
+		pps_reset_freq_interval();
+	}
+
+	/*
+	 * If we turn on PLL adjustments then reset the
+	 * reference time to current time.
+	 */
+	if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
+		time_reftime = get_seconds();
+
+	/* only set allowed bits */
+	time_status &= STA_RONLY;
+	time_status |= txc->status & ~STA_RONLY;
+
+	switch (time_state) {
+	case TIME_OK:
+		ntp_start_leap_timer(ts);
+		break;
+	case TIME_INS:
+	case TIME_DEL:
+		time_state = TIME_OK;
+		ntp_start_leap_timer(ts);
+		/*
+		 * No break here: control falls through into TIME_WAIT.
+		 * The check below cannot change the outcome, because
+		 * time_state is already TIME_OK whenever neither STA_INS
+		 * nor STA_DEL is set.
+		 */
+	case TIME_WAIT:
+		if (!(time_status & (STA_INS | STA_DEL)))
+			time_state = TIME_OK;
+		break;
+	case TIME_OOP:
+		hrtimer_restart(&leap_timer);
+		break;
+	}
+}
+/*
+ * Called with the xtime lock held, so we can access and modify
+ * all the global NTP state:
+ */
+static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
+{
+	if (txc->modes & ADJ_STATUS)
+		process_adj_status(txc, ts);
+
+	if (txc->modes & ADJ_NANO)
+		time_status |= STA_NANO;
+
+	if (txc->modes & ADJ_MICRO)
+		time_status &= ~STA_NANO;
+
+	if (txc->modes & ADJ_FREQUENCY) {
+		time_freq = txc->freq * PPM_SCALE;
+		time_freq = min(time_freq, MAXFREQ_SCALED);
+		time_freq = max(time_freq, -MAXFREQ_SCALED);
+		/* update pps_freq */
+		pps_set_freq(time_freq);
+	}
+
+	if (txc->modes & ADJ_MAXERROR)
+		time_maxerror = txc->maxerror;
+
+	if (txc->modes & ADJ_ESTERROR)
+		time_esterror = txc->esterror;
+
+	if (txc->modes & ADJ_TIMECONST) {
+		time_constant = txc->constant;
+		if (!(time_status & STA_NANO))
+			time_constant += 4;
+		/* clamp to the range 0..MAXTC */
+		time_constant = min(time_constant, (long)MAXTC);
+		time_constant = max(time_constant, 0l);
+	}
+
+	/* txc->constant carries the TAI offset when ADJ_TAI is set */
+	if (txc->modes & ADJ_TAI && txc->constant > 0)
+		time_tai = txc->constant;
+
+	if (txc->modes & ADJ_OFFSET)
+		ntp_update_offset(txc->offset);
+
+	if (txc->modes & ADJ_TICK)
+		tick_usec = txc->tick;
+
+	/* these modes change the inputs of ntp_update_frequency() */
+	if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+		ntp_update_frequency();
+}
+
+/*
+ * adjtimex mainly allows reading (and writing, if superuser) of
+ * kernel time-keeping variables. used by xntpd.
+ */ +int do_adjtimex(struct timex *txc) +{ + struct timespec ts; + int result; + + /* Validate the data before disabling interrupts */ + if (txc->modes & ADJ_ADJTIME) { + /* singleshot must not be used with any other mode bits */ + if (!(txc->modes & ADJ_OFFSET_SINGLESHOT)) + return -EINVAL; + if (!(txc->modes & ADJ_OFFSET_READONLY) && + !capable(CAP_SYS_TIME)) + return -EPERM; + } else { + /* In order to modify anything, you gotta be super-user! */ + if (txc->modes && !capable(CAP_SYS_TIME)) + return -EPERM; + + /* + * if the quartz is off by more than 10% then + * something is VERY wrong! + */ + if (txc->modes & ADJ_TICK && + (txc->tick < 900000/USER_HZ || + txc->tick > 1100000/USER_HZ)) + return -EINVAL; + + if (txc->modes & ADJ_STATUS && time_state != TIME_OK) + hrtimer_cancel(&leap_timer); + } + + if (txc->modes & ADJ_SETOFFSET) { + struct timespec delta; + delta.tv_sec = txc->time.tv_sec; + delta.tv_nsec = txc->time.tv_usec; + if (!capable(CAP_SYS_TIME)) + return -EPERM; + if (!(txc->modes & ADJ_NANO)) + delta.tv_nsec *= 1000; + result = timekeeping_inject_offset(&delta); + if (result) + return result; + } + + getnstimeofday(&ts); + + write_seqlock_irq(&xtime_lock); + + if (txc->modes & ADJ_ADJTIME) { + long save_adjust = time_adjust; + + if (!(txc->modes & ADJ_OFFSET_READONLY)) { + /* adjtime() is independent from ntp_adjtime() */ + time_adjust = txc->offset; + ntp_update_frequency(); + } + txc->offset = save_adjust; + } else { + + /* If there are input parameters, then process them: */ + if (txc->modes) + process_adjtimex_modes(txc, &ts); + + txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ, + NTP_SCALE_SHIFT); + if (!(time_status & STA_NANO)) + txc->offset /= NSEC_PER_USEC; + } + + result = time_state; /* mostly `TIME_OK' */ + /* check for errors */ + if (is_error_status(time_status)) + result = TIME_ERROR; + + txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) * + PPM_SCALE_INV, NTP_SCALE_SHIFT); + txc->maxerror = time_maxerror; + 
txc->esterror = time_esterror; + txc->status = time_status; + txc->constant = time_constant; + txc->precision = 1; + txc->tolerance = MAXFREQ_SCALED / PPM_SCALE; + txc->tick = tick_usec; + txc->tai = time_tai; + + /* fill PPS status fields */ + pps_fill_timex(txc); + + write_sequnlock_irq(&xtime_lock); + + txc->time.tv_sec = ts.tv_sec; + txc->time.tv_usec = ts.tv_nsec; + if (!(time_status & STA_NANO)) + txc->time.tv_usec /= NSEC_PER_USEC; + + notify_cmos_timer(); + + return result; +} + +#ifdef CONFIG_NTP_PPS + +/* actually struct pps_normtime is good old struct timespec, but it is + * semantically different (and it is the reason why it was invented): + * pps_normtime.nsec has a range of ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] + * while timespec.tv_nsec has a range of [0, NSEC_PER_SEC) */ +struct pps_normtime { + __kernel_time_t sec; /* seconds */ + long nsec; /* nanoseconds */ +}; + +/* normalize the timestamp so that nsec is in the + ( -NSEC_PER_SEC / 2, NSEC_PER_SEC / 2 ] interval */ +static inline struct pps_normtime pps_normalize_ts(struct timespec ts) +{ + struct pps_normtime norm = { + .sec = ts.tv_sec, + .nsec = ts.tv_nsec + }; + + if (norm.nsec > (NSEC_PER_SEC >> 1)) { + norm.nsec -= NSEC_PER_SEC; + norm.sec++; + } + + return norm; +} + +/* get current phase correction and jitter */ +static inline long pps_phase_filter_get(long *jitter) +{ + *jitter = pps_tf[0] - pps_tf[1]; + if (*jitter < 0) + *jitter = -*jitter; + + /* TODO: test various filters */ + return pps_tf[0]; +} + +/* add the sample to the phase filter */ +static inline void pps_phase_filter_add(long err) +{ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = err; +} + +/* decrease frequency calibration interval length. + * It is halved after four consecutive unstable intervals. 
+ */
+static inline void pps_dec_freq_interval(void)
+{
+	if (--pps_intcnt <= -PPS_INTCOUNT) {
+		pps_intcnt = -PPS_INTCOUNT;	/* clamp the counter */
+		if (pps_shift > PPS_INTMIN) {
+			pps_shift--;	/* shorter interval, restart count */
+			pps_intcnt = 0;
+		}
+	}
+}
+
+/* increase frequency calibration interval length.
+ * It is doubled after four consecutive stable intervals.
+ */
+static inline void pps_inc_freq_interval(void)
+{
+	if (++pps_intcnt >= PPS_INTCOUNT) {
+		pps_intcnt = PPS_INTCOUNT;	/* clamp the counter */
+		if (pps_shift < PPS_INTMAX) {
+			pps_shift++;	/* longer interval, restart count */
+			pps_intcnt = 0;
+		}
+	}
+}
+
+/* update clock frequency based on MONOTONIC_RAW clock PPS signal
+ * timestamps
+ *
+ * At the end of the calibration interval the difference between the
+ * first and last MONOTONIC_RAW clock timestamps divided by the length
+ * of the interval becomes the frequency update. If the interval was
+ * too long, the data are discarded.
+ * Returns the difference between old and new frequency values.
+ *
+ * In the discard case the return value is 0 and pps_freq is left as is.
+ */
+static long hardpps_update_freq(struct pps_normtime freq_norm)
+{
+	long delta, delta_mod;
+	s64 ftemp;
+
+	/* check if the frequency interval was too long */
+	if (freq_norm.sec > (2 << pps_shift)) {
+		time_status |= STA_PPSERROR;
+		pps_errcnt++;
+		pps_dec_freq_interval();
+		pr_err("hardpps: PPSERROR: interval too long - %ld s\n",
+				freq_norm.sec);
+		return 0;
+	}
+
+	/* here the raw frequency offset and wander (stability) is
+	 * calculated. If the wander is less than the wander threshold
+	 * the interval is increased; otherwise it is decreased.
+	 */
+	ftemp = div_s64(((s64)(-freq_norm.nsec)) << NTP_SCALE_SHIFT,
+			freq_norm.sec);
+	delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT);
+	pps_freq = ftemp;
+	if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) {
+		pr_warning("hardpps: PPSWANDER: change=%ld\n", delta);
+		time_status |= STA_PPSWANDER;
+		pps_stbcnt++;
+		pps_dec_freq_interval();
+	} else {	/* good sample */
+		pps_inc_freq_interval();
+	}
+
+	/* the stability metric is calculated as the average of recent
+	 * frequency changes, but is used only for performance
+	 * monitoring
+	 */
+	delta_mod = delta;
+	if (delta_mod < 0)
+		delta_mod = -delta_mod;
+	pps_stabil += (div_s64(((s64)delta_mod) <<
+				(NTP_SCALE_SHIFT - SHIFT_USEC),
+				NSEC_PER_USEC) - pps_stabil) >> PPS_INTMIN;
+
+	/* if enabled, the system clock frequency is updated */
+	if ((time_status & STA_PPSFREQ) != 0 &&
+	    (time_status & STA_FREQHOLD) == 0) {
+		time_freq = pps_freq;
+		ntp_update_frequency();
+	}
+
+	return delta;
+}
+
+/* correct REALTIME clock phase error against PPS signal
+ *
+ * @error is negated and pushed into the pps_tf[] history. The value
+ * read back becomes the new time_offset only when the jitter is within
+ * (pps_jitter << PPS_POPCORN) and STA_PPSTIME is set. pps_jitter itself
+ * is updated on every call.
+ */
+static void hardpps_update_phase(long error)
+{
+	long correction = -error;
+	long jitter;
+
+	/* add the sample to the median filter */
+	pps_phase_filter_add(correction);
+	correction = pps_phase_filter_get(&jitter);
+
+	/* Nominal jitter is due to PPS signal noise. If it exceeds the
+	 * threshold, the sample is discarded; otherwise, if so enabled,
+	 * the time offset is updated.
+	 */
+	if (jitter > (pps_jitter << PPS_POPCORN)) {
+		pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n",
+		       jitter, (pps_jitter << PPS_POPCORN));
+		time_status |= STA_PPSJITTER;
+		pps_jitcnt++;
+	} else if (time_status & STA_PPSTIME) {
+		/* correct the time using the phase offset */
+		time_offset = div_s64(((s64)correction) << NTP_SCALE_SHIFT,
+				NTP_INTERVAL_FREQ);
+		/* cancel running adjtime() */
+		time_adjust = 0;
+	}
+	/* update jitter */
+	pps_jitter += (jitter - pps_jitter) >> PPS_INTMIN;
+}
+
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS signal arrival in order to
+ * discipline the CPU clock oscillator to the PPS signal. It takes two
+ * parameters: REALTIME and MONOTONIC_RAW clock timestamps. The former
+ * is used to correct clock phase error and the latter is used to
+ * correct the frequency.
+ *
+ * This code is based on David Mills's reference nanokernel
+ * implementation. It was mostly rewritten but keeps the same idea.
+ */
+void hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts)
+{
+	struct pps_normtime pts_norm, freq_norm;
+	unsigned long flags;
+
+	pts_norm = pps_normalize_ts(*phase_ts);
+	/* all NTP/PPS state below is updated under xtime_lock */
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	/* clear the error bits, they will be set again if needed */
+	time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+
+	/* indicate signal presence */
+	time_status |= STA_PPSSIGNAL;
+	pps_valid = PPS_VALID;
+
+	/* when called for the first time,
+	 * just start the frequency interval */
+	if (unlikely(pps_fbase.tv_sec == 0)) {
+		pps_fbase = *raw_ts;
+		write_sequnlock_irqrestore(&xtime_lock, flags);
+		return;
+	}
+
+	/* ok, now we have a base for frequency calculation */
+	freq_norm = pps_normalize_ts(timespec_sub(*raw_ts, pps_fbase));
+
+	/* check that the signal is in the range
+	 * [1s - MAXFREQ us, 1s + MAXFREQ us], otherwise reject it */
+	if ((freq_norm.sec == 0) ||
+			(freq_norm.nsec > MAXFREQ * freq_norm.sec) ||
+			(freq_norm.nsec < -MAXFREQ * freq_norm.sec)) {
+		time_status |= STA_PPSJITTER;
+		/* restart the frequency calibration interval */
+		pps_fbase = *raw_ts;
+		write_sequnlock_irqrestore(&xtime_lock, flags);
+		pr_err("hardpps: PPSJITTER: bad pulse\n"); /* logged after unlock */
+		return;
+	}
+
+	/* signal is ok */
+
+	/* check if the current frequency interval is finished */
+	if (freq_norm.sec >= (1 << pps_shift)) {
+		pps_calcnt++;
+		/* restart the frequency calibration interval */
+		pps_fbase = *raw_ts;
+		hardpps_update_freq(freq_norm);
+	}
+	/* the phase is corrected on every accepted pulse */
+	hardpps_update_phase(pts_norm.nsec);
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+EXPORT_SYMBOL(hardpps);
+
+#endif /* CONFIG_NTP_PPS */
+/*
+ * Handler for the "ntp_tick_adj=" boot parameter: parses the value with
+ * simple_strtol() (base 0) and stores it in ntp_tick_adj shifted left by
+ * NTP_SCALE_SHIFT. Always returns 1.
+ */
+static int __init ntp_tick_adj_setup(char *str)
+{
+	ntp_tick_adj = simple_strtol(str, NULL, 0);
+	ntp_tick_adj <<= NTP_SCALE_SHIFT;
+
+	return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
+/*
+ * Boot-time NTP setup: resets the NTP state through ntp_clear() and
+ * initializes leap_timer as an absolute CLOCK_REALTIME hrtimer whose
+ * callback is ntp_leap_second().
+ */
+void __init ntp_init(void)
+{
+	ntp_clear();
+	hrtimer_init(&leap_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	leap_timer.function = ntp_leap_second;
+}
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 00000000..c340ca65
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,445 @@
+/*
+ * posix-clock.c - support for dynamic clock devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include /* NOTE(review): the include targets are missing in this dump */
+#include
+#include
+#include
+#include
+#include
+
+static void delete_clock(struct kref *kref);
+
+/*
+ * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
+ */ +static struct posix_clock *get_posix_clock(struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + + down_read(&clk->rwsem); + + if (!clk->zombie) + return clk; + + up_read(&clk->rwsem); + + return NULL; +} + +static void put_posix_clock(struct posix_clock *clk) +{ + up_read(&clk->rwsem); +} + +static ssize_t posix_clock_read(struct file *fp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -EINVAL; + + if (!clk) + return -ENODEV; + + if (clk->ops.read) + err = clk->ops.read(clk, fp->f_flags, buf, count); + + put_posix_clock(clk); + + return err; +} + +static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +{ + struct posix_clock *clk = get_posix_clock(fp); + int result = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.poll) + result = clk->ops.poll(clk, fp, wait); + + put_posix_clock(clk); + + return result; +} + +static int posix_clock_fasync(int fd, struct file *fp, int on) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = 0; + + if (!clk) + return -ENODEV; + + if (clk->ops.fasync) + err = clk->ops.fasync(clk, fd, fp, on); + + put_posix_clock(clk); + + return err; +} + +static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENODEV; + + if (!clk) + return -ENODEV; + + if (clk->ops.mmap) + err = clk->ops.mmap(clk, vma); + + put_posix_clock(clk); + + return err; +} + +static long posix_clock_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} + +#ifdef CONFIG_COMPAT +static long posix_clock_compat_ioctl(struct file *fp, + unsigned int cmd, unsigned long arg) +{ + struct posix_clock *clk = get_posix_clock(fp); + int err = -ENOTTY; + + if (!clk) + 
return -ENODEV; + + if (clk->ops.ioctl) + err = clk->ops.ioctl(clk, cmd, arg); + + put_posix_clock(clk); + + return err; +} +#endif + +static int posix_clock_open(struct inode *inode, struct file *fp) +{ + int err; + struct posix_clock *clk = + container_of(inode->i_cdev, struct posix_clock, cdev); + + down_read(&clk->rwsem); + + if (clk->zombie) { + err = -ENODEV; + goto out; + } + if (clk->ops.open) + err = clk->ops.open(clk, fp->f_mode); + else + err = 0; + + if (!err) { + kref_get(&clk->kref); + fp->private_data = clk; + } +out: + up_read(&clk->rwsem); + return err; +} + +static int posix_clock_release(struct inode *inode, struct file *fp) +{ + struct posix_clock *clk = fp->private_data; + int err = 0; + + if (clk->ops.release) + err = clk->ops.release(clk); + + kref_put(&clk->kref, delete_clock); + + fp->private_data = NULL; + + return err; +} + +static const struct file_operations posix_clock_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = posix_clock_read, + .poll = posix_clock_poll, + .unlocked_ioctl = posix_clock_ioctl, + .open = posix_clock_open, + .release = posix_clock_release, + .fasync = posix_clock_fasync, + .mmap = posix_clock_mmap, +#ifdef CONFIG_COMPAT + .compat_ioctl = posix_clock_compat_ioctl, +#endif +}; + +int posix_clock_register(struct posix_clock *clk, dev_t devid) +{ + int err; + + kref_init(&clk->kref); + init_rwsem(&clk->rwsem); + + cdev_init(&clk->cdev, &posix_clock_file_operations); + clk->cdev.owner = clk->ops.owner; + err = cdev_add(&clk->cdev, devid, 1); + + return err; +} +EXPORT_SYMBOL_GPL(posix_clock_register); + +static void delete_clock(struct kref *kref) +{ + struct posix_clock *clk = container_of(kref, struct posix_clock, kref); + + if (clk->release) + clk->release(clk); +} + +void posix_clock_unregister(struct posix_clock *clk) +{ + cdev_del(&clk->cdev); + + down_write(&clk->rwsem); + clk->zombie = true; + up_write(&clk->rwsem); + + kref_put(&clk->kref, delete_clock); +} 
+EXPORT_SYMBOL_GPL(posix_clock_unregister);
+/* An open clock file together with the posix_clock resolved from it */
+struct posix_clock_desc {
+	struct file *fp;
+	struct posix_clock *clk;
+};
+/*
+ * Resolve clock id @id to its open file and posix_clock. On success (0)
+ * @cd holds a file reference from fget() and the clock as returned by
+ * get_posix_clock(); release both with put_clock_desc(). Returns -EINVAL
+ * when the descriptor is not open or is not a posix clock file, -ENODEV
+ * when get_posix_clock() fails. The file reference is dropped on error.
+ */
+static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
+{
+	struct file *fp = fget(CLOCKID_TO_FD(id));
+	int err = -EINVAL;
+
+	if (!fp)
+		return err;
+	/* only files opened through posix_clock_open() qualify */
+	if (fp->f_op->open != posix_clock_open || !fp->private_data)
+		goto out;
+
+	cd->fp = fp;
+	cd->clk = get_posix_clock(fp);
+
+	err = cd->clk ? 0 : -ENODEV;
+out:
+	if (err)
+		fput(fp);
+	return err;
+}
+/* Undo a successful get_clock_desc(): release the clock, then the file */
+static void put_clock_desc(struct posix_clock_desc *cd)
+{
+	put_posix_clock(cd->clk);
+	fput(cd->fp);
+}
+/*
+ * clock_adj hook: forwards to ops.clock_adjtime. Returns -EACCES when the
+ * file was not opened for writing, -EOPNOTSUPP when the op is missing.
+ */
+static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+{
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+		err = -EACCES;
+		goto out;
+	}
+
+	if (cd.clk->ops.clock_adjtime)
+		err = cd.clk->ops.clock_adjtime(cd.clk, tx);
+	else
+		err = -EOPNOTSUPP;
+out:
+	put_clock_desc(&cd);
+
+	return err;
+}
+/* clock_get hook: forwards to ops.clock_gettime, -EOPNOTSUPP if missing */
+static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.clock_gettime)
+		err = cd.clk->ops.clock_gettime(cd.clk, ts);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+/* clock_getres hook: forwards to ops.clock_getres, -EOPNOTSUPP if missing */
+static int pc_clock_getres(clockid_t id, struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.clock_getres)
+		err = cd.clk->ops.clock_getres(cd.clk, ts);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+/*
+ * clock_set hook: forwards to ops.clock_settime. Returns -EACCES when the
+ * file was not opened for writing, -EOPNOTSUPP when the op is missing.
+ */
+static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+{
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+		err = -EACCES;
+		goto out;
+	}
+
+	if (cd.clk->ops.clock_settime)
+		err = cd.clk->ops.clock_settime(cd.clk, ts);
+	else
+		err = -EOPNOTSUPP;
+out:
+	put_clock_desc(&cd);
+
+	return err;
+}
+/* timer_create hook: forwards to ops.timer_create, -EOPNOTSUPP if missing */
+static int pc_timer_create(struct k_itimer *kit)
+{
+	clockid_t id = kit->it_clock;
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.timer_create)
+		err = cd.clk->ops.timer_create(cd.clk, kit);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+/* timer_del hook: forwards to ops.timer_delete, -EOPNOTSUPP if missing */
+static int pc_timer_delete(struct k_itimer *kit)
+{
+	clockid_t id = kit->it_clock;
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.timer_delete)
+		err = cd.clk->ops.timer_delete(cd.clk, kit);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+/*
+ * timer_get hook: forwards to ops.timer_gettime. Has no way to report an
+ * error: @ts is left untouched when the lookup fails or the op is missing.
+ */
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+{
+	clockid_t id = kit->it_clock;
+	struct posix_clock_desc cd;
+
+	if (get_clock_desc(id, &cd))
+		return;
+
+	if (cd.clk->ops.timer_gettime)
+		cd.clk->ops.timer_gettime(cd.clk, kit, ts);
+
+	put_clock_desc(&cd);
+}
+/* timer_set hook: forwards to ops.timer_settime, -EOPNOTSUPP if missing */
+static int pc_timer_settime(struct k_itimer *kit, int flags,
+			    struct itimerspec *ts, struct itimerspec *old)
+{
+	clockid_t id = kit->it_clock;
+	struct posix_clock_desc cd;
+	int err;
+
+	err = get_clock_desc(id, &cd);
+	if (err)
+		return err;
+
+	if (cd.clk->ops.timer_settime)
+		err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
+	else
+		err = -EOPNOTSUPP;
+
+	put_clock_desc(&cd);
+
+	return err;
+}
+/* k_clock operations table wiring up the pc_* wrappers above */
+struct k_clock clock_posix_dynamic = {
+	.clock_getres	= pc_clock_getres,
+	.clock_set	= pc_clock_settime,
+	.clock_get	= pc_clock_gettime,
+	.clock_adj	= pc_clock_adjtime,
+	.timer_create	= pc_timer_create,
+	.timer_set	= pc_timer_settime,
+	.timer_del	= pc_timer_delete,
+	.timer_get	= pc_timer_gettime,
+};
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
new file mode 100644
index 00000000..56f70434
--- /dev/null
+++ b/kernel/time/tick-broadcast.c
@@ -0,0 +1,701 @@
+/*
+ * 
linux/kernel/time/tick-broadcast.c
+ *
+ * This file contains functions which emulate a local clock-event
+ * device via a broadcast event source.
+ *
+ * Copyright(C) 2005-2006, Thomas Gleixner
+ * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar
+ * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner
+ *
+ * This code is licenced under the GPL version 2. For details see
+ * kernel-base/COPYING.
+ */
+#include /* NOTE(review): the include targets are missing in this dump */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "tick-internal.h"
+
+/*
+ * Broadcast support for broken x86 hardware, where the local apic
+ * timer stops in C3 state.
+ */
+
+static struct tick_device tick_broadcast_device;
+/* FIXME: Use cpumask_var_t. */
+static DECLARE_BITMAP(tick_broadcast_mask, NR_CPUS);
+static DECLARE_BITMAP(tmpmask, NR_CPUS);
+static DEFINE_RAW_SPINLOCK(tick_broadcast_lock);
+static int tick_broadcast_force;
+
+#ifdef CONFIG_TICK_ONESHOT
+static void tick_broadcast_clear_oneshot(int cpu);
+#else
+static inline void tick_broadcast_clear_oneshot(int cpu) { }
+#endif
+
+/*
+ * Debugging: see timer_list.c
+ */
+struct tick_device *tick_get_broadcast_device(void)
+{
+	return &tick_broadcast_device;
+}
+/* Return the bitmap of CPUs served by the broadcast device as a cpumask */
+struct cpumask *tick_get_broadcast_mask(void)
+{
+	return to_cpumask(tick_broadcast_mask);
+}
+
+/*
+ * Start the device in periodic mode
+ *
+ * A NULL @bc is tolerated and ignored.
+ */
+static void tick_broadcast_start_periodic(struct clock_event_device *bc)
+{
+	if (bc)
+		tick_setup_periodic(bc, 1);
+}
+
+/*
+ * Check, if the device can be utilized as broadcast device:
+ *
+ * @dev is rejected (return 0) when the current broadcast device has an
+ * equal or higher rating, or when @dev itself has CLOCK_EVT_FEAT_C3STOP.
+ * Otherwise it replaces the current broadcast device, is started in
+ * periodic mode if the broadcast mask is not empty, and 1 is returned.
+ */
+int tick_check_broadcast_device(struct clock_event_device *dev)
+{
+	if ((tick_broadcast_device.evtdev &&
+	     tick_broadcast_device.evtdev->rating >= dev->rating) ||
+	     (dev->features & CLOCK_EVT_FEAT_C3STOP))
+		return 0;
+
+	clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
+	tick_broadcast_device.evtdev = dev;
+	if (!cpumask_empty(tick_get_broadcast_mask()))
+		tick_broadcast_start_periodic(dev);
+	return 1;
+}
+
+/*
+ * Check, if the device is the broadcast device
+ */
+int tick_is_broadcast_device(struct clock_event_device *dev)
+{
+	return (dev && tick_broadcast_device.evtdev == dev);
+}
+
+/*
+ * Check, if the device is dysfunctional and a place holder, which
+ * needs to be handled by the broadcast device.
+ *
+ * Returns 1 when @dev is such a placeholder (@cpu is then added to the
+ * broadcast mask), 0 otherwise. Runs under tick_broadcast_lock.
+ */
+int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
+{
+	unsigned long flags;
+	int ret = 0;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	/*
+	 * Devices might be registered with both periodic and oneshot
+	 * mode disabled. This signals, that the device needs to be
+	 * operated from the broadcast device and is a placeholder for
+	 * the cpu local device.
+	 */
+	if (!tick_device_is_functional(dev)) {
+		dev->event_handler = tick_handle_periodic;
+		cpumask_set_cpu(cpu, tick_get_broadcast_mask());
+		tick_broadcast_start_periodic(tick_broadcast_device.evtdev);
+		ret = 1;
+	} else {
+		/*
+		 * When the new device is not affected by the stop
+		 * feature and the cpu is marked in the broadcast mask
+		 * then clear the broadcast bit.
+		 */
+		if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) {
+			int cpu = smp_processor_id(); /* NOTE(review): shadows the 'cpu' parameter */
+
+			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+			tick_broadcast_clear_oneshot(cpu);
+		}
+	}
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+	return ret;
+}
+
+/*
+ * Broadcast the event to the cpus, which are set in the mask (mangled).
+ *
+ * If the current cpu is in @mask its bit is cleared and its event
+ * handler is called directly; the remaining cpus are reached through the
+ * broadcast function of the first remaining cpu's device.
+ */
+static void tick_do_broadcast(struct cpumask *mask)
+{
+	int cpu = smp_processor_id();
+	struct tick_device *td;
+
+	/*
+	 * Check, if the current cpu is in the mask
+	 */
+	if (cpumask_test_cpu(cpu, mask)) {
+		cpumask_clear_cpu(cpu, mask);
+		td = &per_cpu(tick_cpu_device, cpu);
+		td->evtdev->event_handler(td->evtdev);
+	}
+
+	if (!cpumask_empty(mask)) {
+		/*
+		 * It might be necessary to actually check whether the devices
+		 * have different broadcast functions. For now, just use the
+		 * one of the first device. This works as long as we have this
+		 * misfeature only on x86 (lapic)
+		 */
+		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
+		td->evtdev->broadcast(mask);
+	}
+}
+
+/*
+ * Periodic broadcast:
+ * - invoke the broadcast handlers
+ *
+ * The target set is the broadcast mask restricted to online cpus, built
+ * in the shared tmpmask under tick_broadcast_lock.
+ */
+static void tick_do_periodic_broadcast(void)
+{
+	raw_spin_lock(&tick_broadcast_lock);
+
+	cpumask_and(to_cpumask(tmpmask),
+		    cpu_online_mask, tick_get_broadcast_mask());
+	tick_do_broadcast(to_cpumask(tmpmask));
+
+	raw_spin_unlock(&tick_broadcast_lock);
+}
+
+/*
+ * Event handler for periodic broadcast ticks
+ */
+static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
+{
+	ktime_t next;
+
+	tick_do_periodic_broadcast();
+
+	/*
+	 * The device is in periodic mode. No reprogramming necessary:
+	 */
+	if (dev->mode == CLOCK_EVT_MODE_PERIODIC)
+		return;
+
+	/*
+	 * Setup the next period for devices, which do not have
+	 * periodic mode. We read dev->next_event first and add to it
+	 * when the event already expired. clockevents_program_event()
+	 * sets dev->next_event only when the event is really
+	 * programmed to the device.
+	 */
+	for (next = dev->next_event; ;) {
+		next = ktime_add(next, tick_period);
+		/* a zero return ends the loop; otherwise broadcast and retry */
+		if (!clockevents_program_event(dev, next, ktime_get()))
+			return;
+		tick_do_periodic_broadcast();
+	}
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state, where
+ * affected devices might stop
+ *
+ * Adds the current cpu to, or removes it from, the broadcast mask
+ * according to *reason, and starts or shuts down the broadcast device
+ * when the mask changes between empty and non-empty.
+ */
+static void tick_do_broadcast_on_off(unsigned long *reason)
+{
+	struct clock_event_device *bc, *dev;
+	struct tick_device *td;
+	unsigned long flags;
+	int cpu, bc_stopped;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	cpu = smp_processor_id();
+	td = &per_cpu(tick_cpu_device, cpu);
+	dev = td->evtdev;
+	bc = tick_broadcast_device.evtdev;
+
+	/*
+	 * Is the device not affected by the powerstate ?
+	 */
+	if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP))
+		goto out;
+
+	if (!tick_device_is_functional(dev))
+		goto out;
+	/* remember whether the mask was empty before this request */
+	bc_stopped = cpumask_empty(tick_get_broadcast_mask());
+
+	switch (*reason) {
+	case CLOCK_EVT_NOTIFY_BROADCAST_ON:
+	case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
+		if (!cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
+			cpumask_set_cpu(cpu, tick_get_broadcast_mask());
+			if (tick_broadcast_device.mode ==
+			    TICKDEV_MODE_PERIODIC)
+				clockevents_shutdown(dev);
+		}
+		if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
+			tick_broadcast_force = 1;
+		break;
+	case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
+		if (!tick_broadcast_force &&
+		    cpumask_test_cpu(cpu, tick_get_broadcast_mask())) {
+			cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+			if (tick_broadcast_device.mode ==
+			    TICKDEV_MODE_PERIODIC)
+				tick_setup_periodic(dev, 0);
+		}
+		break;
+	}
+	/* stop or start the broadcast device on an empty/non-empty change */
+	if (cpumask_empty(tick_get_broadcast_mask())) {
+		if (!bc_stopped)
+			clockevents_shutdown(bc);
+	} else if (bc_stopped) {
+		if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
+			tick_broadcast_start_periodic(bc);
+		else
+			tick_broadcast_setup_oneshot(bc);
+	}
+out:
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+
+/*
+ * Powerstate information: The system enters/leaves a state, where
+ * affected devices might stop.
+ */
+void tick_broadcast_on_off(unsigned long reason, int *oncpu)
+{
+	if (!cpumask_test_cpu(*oncpu, cpu_online_mask))
+		printk(KERN_ERR "tick-broadcast: ignoring broadcast for "
+		       "offline CPU #%d\n", *oncpu);
+	else
+		tick_do_broadcast_on_off(&reason);
+}
+
+/*
+ * Set the periodic handler depending on broadcast on/off
+ *
+ * A zero @broadcast selects tick_handle_periodic, anything else selects
+ * tick_handle_periodic_broadcast.
+ */
+void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast)
+{
+	if (!broadcast)
+		dev->event_handler = tick_handle_periodic;
+	else
+		dev->event_handler = tick_handle_periodic_broadcast;
+}
+
+/*
+ * Remove a CPU from broadcasting
+ *
+ * In periodic mode the broadcast device is shut down once the broadcast
+ * mask has become empty.
+ */
+void tick_shutdown_broadcast(unsigned int *cpup)
+{
+	struct clock_event_device *bc;
+	unsigned long flags;
+	unsigned int cpu = *cpup;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	bc = tick_broadcast_device.evtdev;
+	cpumask_clear_cpu(cpu, tick_get_broadcast_mask());
+
+	if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
+		if (bc && cpumask_empty(tick_get_broadcast_mask()))
+			clockevents_shutdown(bc);
+	}
+
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+/* Shut down the broadcast device, if there is one, under the lock */
+void tick_suspend_broadcast(void)
+{
+	struct clock_event_device *bc;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	bc = tick_broadcast_device.evtdev;
+	if (bc)
+		clockevents_shutdown(bc);
+
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+}
+/*
+ * Put the broadcast device, if any, into CLOCK_EVT_MODE_RESUME and restart
+ * it according to the broadcast mode. In periodic mode the return value
+ * tells whether the current cpu is in the broadcast mask; in oneshot mode
+ * it is the result of tick_resume_broadcast_oneshot(). Returns 0 when
+ * there is no broadcast device.
+ */
+int tick_resume_broadcast(void)
+{
+	struct clock_event_device *bc;
+	unsigned long flags;
+	int broadcast = 0;
+
+	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
+
+	bc = tick_broadcast_device.evtdev;
+
+	if (bc) {
+		clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME);
+
+		switch (tick_broadcast_device.mode) {
+		case TICKDEV_MODE_PERIODIC:
+			if (!cpumask_empty(tick_get_broadcast_mask()))
+				tick_broadcast_start_periodic(bc);
+			broadcast = cpumask_test_cpu(smp_processor_id(),
+						     tick_get_broadcast_mask());
+			break;
+		case TICKDEV_MODE_ONESHOT:
+			broadcast = tick_resume_broadcast_oneshot(bc);
+			break;
+		}
+	}
+	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
+
+	return broadcast;
+}
+
+
+#ifdef CONFIG_TICK_ONESHOT
+
+/* FIXME: use cpumask_var_t. */
+static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
+static DECLARE_BITMAP(tick_broadcast_pending, NR_CPUS);
+static DECLARE_BITMAP(tick_force_broadcast_mask, NR_CPUS);
+
+/*
+ * Exposed for debugging: see timer_list.c
+ */
+struct cpumask *tick_get_broadcast_oneshot_mask(void)
+{
+	return to_cpumask(tick_broadcast_oneshot_mask);
+}
+/* Program the broadcast device for @expires via tick_dev_program_event() */
+static int tick_broadcast_set_event(ktime_t expires, int force)
+{
+	struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+	return tick_dev_program_event(bc, expires, force);
+}
+
+/*
+ * Called before going idle with interrupts disabled. Checks whether a
+ * broadcast event from the other core is about to happen.
+ *
+ * The test is on the current cpu's bit in tick_force_broadcast_mask.
+ */
+int tick_check_broadcast_pending(void)
+{
+	return test_bit(smp_processor_id(), tick_force_broadcast_mask);
+}
+/* Switch the broadcast device to oneshot mode on resume; always returns 0 */
+int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
+{
+	clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
+	return 0;
+}
+
+/*
+ * Called from irq_enter() when idle was interrupted to reenable the
+ * per cpu device.
+ */ +void tick_check_oneshot_broadcast(int cpu) +{ + if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) { + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT); + } +} + +/* + * Handle oneshot mode broadcasting + */ +static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) +{ + struct tick_device *td; + ktime_t now, next_event; + int cpu; + + raw_spin_lock(&tick_broadcast_lock); +again: + dev->next_event.tv64 = KTIME_MAX; + next_event.tv64 = KTIME_MAX; + cpumask_clear(to_cpumask(tmpmask)); + now = ktime_get(); + /* Find all expired events */ + for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev->next_event.tv64 <= now.tv64) { + cpumask_set_cpu(cpu, to_cpumask(tmpmask)); + /* + * Mark the remote cpu in the pending mask, so + * it can avoid reprogramming the cpu local + * timer in tick_broadcast_oneshot_control(). + */ + set_bit(cpu, tick_broadcast_pending); + } else if (td->evtdev->next_event.tv64 < next_event.tv64) + next_event.tv64 = td->evtdev->next_event.tv64; + } + + /* Take care of enforced broadcast requests */ + for_each_cpu(cpu, to_cpumask(tick_force_broadcast_mask)) { + set_bit(cpu, tmpmask); + clear_bit(cpu, tick_force_broadcast_mask); + } + + /* + * Wakeup the cpus which have an expired event. + */ + tick_do_broadcast(to_cpumask(tmpmask)); + + /* + * Two reasons for reprogram: + * + * - The global event did not expire any CPU local + * events. This happens in dyntick mode, as the maximum PIT + * delta is quite small. + * + * - There are pending events on sleeping CPUs which were not + * in the event mask + */ + if (next_event.tv64 != KTIME_MAX) { + /* + * Rearm the broadcast device. 
If event expired, + * repeat the above + */ + if (tick_broadcast_set_event(next_event, 0)) + goto again; + } + raw_spin_unlock(&tick_broadcast_lock); +} + +/* + * Powerstate information: The system enters/leaves a state, where + * affected devices might stop + */ +void tick_broadcast_oneshot_control(unsigned long reason) +{ + struct clock_event_device *bc, *dev; + struct tick_device *td; + unsigned long flags; + ktime_t now; + int cpu; + + /* + * Periodic mode does not care about the enter/exit of power + * states + */ + if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) + return; + + /* + * We are called with preemption disabled from the depth of the + * idle code, so we can't be moved away. + */ + cpu = smp_processor_id(); + td = &per_cpu(tick_cpu_device, cpu); + dev = td->evtdev; + + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return; + + bc = tick_broadcast_device.evtdev; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + WARN_ON_ONCE(test_bit(cpu, tick_broadcast_pending)); + WARN_ON_ONCE(test_bit(cpu, tick_force_broadcast_mask)); + if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask()); + clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + if (dev->next_event.tv64 < bc->next_event.tv64) + tick_broadcast_set_event(dev->next_event, 1); + } + } else { + if (cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) { + cpumask_clear_cpu(cpu, + tick_get_broadcast_oneshot_mask()); + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + if (dev->next_event.tv64 == KTIME_MAX) + goto out; + /* + * The cpu handling the broadcast timer marked + * this cpu in the broadcast pending mask and + * fired the broadcast IPI. So we are going to + * handle the expired event anyway via the + * broadcast IPI handler. No need to reprogram + * the timer with an already expired event.
+ */ + if (test_and_clear_bit(cpu, tick_broadcast_pending)) + goto out; + /* + * If the pending bit is not set, then we are + * either the CPU handling the broadcast + * interrupt or we got woken by something else. + * + * We are no longer in the broadcast mask, so + * if the cpu local expiry time is already + * reached, we would reprogram the cpu local + * timer with an already expired event. + * + * This can lead to a ping-pong when we return + * to idle and therefore rearm the broadcast + * timer before the cpu local timer was able + * to fire. This happens because the forced + * reprogramming makes sure that the event + * will happen in the future and depending on + * the min_delta setting this might be far + * enough out that the ping-pong starts. + * + * If the cpu local next_event has expired + * then we know that the broadcast timer + * next_event has expired as well and + * broadcast is about to be handled. So we + * avoid reprogramming and enforce that the + * broadcast handler, which did not run yet, + * will invoke the cpu local handler. + * + * We cannot call the handler directly from + * here, because we might be in a NOHZ phase + * and we did not go through the irq_enter() + * nohz fixups. + */ + now = ktime_get(); + if (dev->next_event.tv64 <= now.tv64) + set_bit(cpu, tick_force_broadcast_mask); + /* + * We got woken by something else. Reprogram + * the cpu local timer device.
+ */ + tick_program_event(dev->next_event, 1); + } + } +out: + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Reset the one shot broadcast for a cpu + * + * Called with tick_broadcast_lock held + */ +static void tick_broadcast_clear_oneshot(int cpu) +{ + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); +} + +static void tick_broadcast_init_next_event(struct cpumask *mask, + ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu(cpu, mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + +/** + * tick_broadcast_setup_oneshot - setup the broadcast device + */ +void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + int cpu = smp_processor_id(); + + /* Set it up only once ! */ + if (bc->event_handler != tick_handle_oneshot_broadcast) { + int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + + /* Take the do_timer update */ + tick_do_timer_cpu = cpu; + + /* + * We must be careful here. There might be other CPUs + * waiting for periodic broadcast. We need to set the + * oneshot_mask bits for those and program the + * broadcast device to fire. + */ + cpumask_copy(to_cpumask(tmpmask), tick_get_broadcast_mask()); + cpumask_clear_cpu(cpu, to_cpumask(tmpmask)); + cpumask_or(tick_get_broadcast_oneshot_mask(), + tick_get_broadcast_oneshot_mask(), + to_cpumask(tmpmask)); + + if (was_periodic && !cpumask_empty(to_cpumask(tmpmask))) { + tick_broadcast_init_next_event(to_cpumask(tmpmask), + tick_next_period); + tick_broadcast_set_event(tick_next_period, 1); + } else + bc->next_event.tv64 = KTIME_MAX; + } else { + /* + * The first cpu which switches to oneshot mode sets + * the bit for all other cpus which are in the general + * (periodic) broadcast mask. 
So the bit is set and + * would prevent the first broadcast enter after this + * to program the bc device. + */ + tick_broadcast_clear_oneshot(cpu); + } +} + +/* + * Select oneshot operating mode for the broadcast device + */ +void tick_broadcast_switch_to_oneshot(void) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + tick_broadcast_device.mode = TICKDEV_MODE_ONESHOT; + bc = tick_broadcast_device.evtdev; + if (bc) + tick_broadcast_setup_oneshot(bc); + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + + +/* + * Remove a dead CPU from broadcasting + */ +void tick_shutdown_broadcast_oneshot(unsigned int *cpup) +{ + unsigned long flags; + unsigned int cpu = *cpup; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + + /* + * Clear the broadcast mask flag for the dead cpu, but do not + * stop the broadcast device! + */ + cpumask_clear_cpu(cpu, tick_get_broadcast_oneshot_mask()); + + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} + +/* + * Check, whether the broadcast device is in one shot mode + */ +int tick_broadcast_oneshot_active(void) +{ + return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT; +} + +/* + * Check whether the broadcast device supports oneshot. + */ +bool tick_broadcast_oneshot_available(void) +{ + struct clock_event_device *bc = tick_broadcast_device.evtdev; + + return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false; +} + +#endif diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c new file mode 100644 index 00000000..119528de --- /dev/null +++ b/kernel/time/tick-common.c @@ -0,0 +1,419 @@ +/* + * linux/kernel/time/tick-common.c + * + * This file contains the base functions to manage periodic tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. 
For details see + * kernel-base/COPYING. + */ +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tick-internal.h" + +/* + * Tick devices + */ +DEFINE_PER_CPU(struct tick_device, tick_cpu_device); +/* + * Tick next event: keeps track of the tick time + */ +ktime_t tick_next_period; +ktime_t tick_period; +int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT; +static DEFINE_RAW_SPINLOCK(tick_device_lock); + +/* + * Debugging: see timer_list.c + */ +struct tick_device *tick_get_device(int cpu) +{ + return &per_cpu(tick_cpu_device, cpu); +} + +/** + * tick_is_oneshot_available - check for a oneshot capable event device + */ +int tick_is_oneshot_available(void) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return 0; + if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) + return 1; + return tick_broadcast_oneshot_available(); +} + +/* + * Periodic tick + */ +static void tick_periodic(int cpu) +{ + if (tick_do_timer_cpu == cpu) { + write_seqlock(&xtime_lock); + + /* Keep track of the next tick event */ + tick_next_period = ktime_add(tick_next_period, tick_period); + + do_timer(1); + write_sequnlock(&xtime_lock); + } + + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} + +/* + * Event handler for periodic ticks + */ +void tick_handle_periodic(struct clock_event_device *dev) +{ + int cpu = smp_processor_id(); + ktime_t next; + + tick_periodic(cpu); + + if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + return; + /* + * Setup the next period for devices, which do not have + * periodic mode: + */ + next = ktime_add(dev->next_event, tick_period); + for (;;) { + if (!clockevents_program_event(dev, next, ktime_get())) + return; + /* + * Have to be careful here. If we're in oneshot mode, + * before we call tick_periodic() in a loop, we need + * to be sure we're using a real hardware clocksource. 
+ * Otherwise we could get trapped in an infinite + * loop, as the tick_periodic() increments jiffies, + * which then will increment time, possibly causing + * the loop to trigger again and again. + */ + if (timekeeping_valid_for_hres()) + tick_periodic(cpu); + next = ktime_add(next, tick_period); + } +} + +/* + * Setup the device for a periodic tick + */ +void tick_setup_periodic(struct clock_event_device *dev, int broadcast) +{ + tick_set_periodic_handler(dev, broadcast); + + /* Broadcast setup ? */ + if (!tick_device_is_functional(dev)) + return; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !tick_broadcast_oneshot_active()) { + clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + } else { + unsigned long seq; + ktime_t next; + + do { + seq = read_seqbegin(&xtime_lock); + next = tick_next_period; + } while (read_seqretry(&xtime_lock, seq)); + + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + + for (;;) { + if (!clockevents_program_event(dev, next, ktime_get())) + return; + next = ktime_add(next, tick_period); + } + } +} + +/* + * Setup the tick device + */ +static void tick_setup_device(struct tick_device *td, + struct clock_event_device *newdev, int cpu, + const struct cpumask *cpumask) +{ + ktime_t next_event; + void (*handler)(struct clock_event_device *) = NULL; + + /* + * First device setup ? + */ + if (!td->evtdev) { + /* + * If no cpu took the do_timer update, assign it to + * this cpu: + */ + if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) { + tick_do_timer_cpu = cpu; + tick_next_period = ktime_get(); + tick_period = ktime_set(0, NSEC_PER_SEC / HZ); + } + + /* + * Startup in periodic mode first.
+ */ + td->mode = TICKDEV_MODE_PERIODIC; + } else { + handler = td->evtdev->event_handler; + next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; + } + + td->evtdev = newdev; + + /* + * When the device is not per cpu, pin the interrupt to the + * current cpu: + */ + if (!cpumask_equal(newdev->cpumask, cpumask)) + irq_set_affinity(newdev->irq, cpumask); + + /* + * When global broadcasting is active, check if the current + * device is registered as a placeholder for broadcast mode. + * This allows us to handle this x86 misfeature in a generic + * way. + */ + if (tick_device_uses_broadcast(newdev, cpu)) + return; + + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(newdev, 0); + else + tick_setup_oneshot(newdev, handler, next_event); +} + +/* + * Check, if the new registered device should be used. + */ +static int tick_check_new_device(struct clock_event_device *newdev) +{ + struct clock_event_device *curdev; + struct tick_device *td; + int cpu, ret = NOTIFY_OK; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_device_lock, flags); + + cpu = smp_processor_id(); + if (!cpumask_test_cpu(cpu, newdev->cpumask)) + goto out_bc; + + td = &per_cpu(tick_cpu_device, cpu); + curdev = td->evtdev; + + /* cpu local device ? */ + if (!cpumask_equal(newdev->cpumask, cpumask_of(cpu))) { + + /* + * If the cpu affinity of the device interrupt can not + * be set, ignore it. + */ + if (!irq_can_set_affinity(newdev->irq)) + goto out_bc; + + /* + * If we have a cpu local device already, do not replace it + * by a non cpu local device + */ + if (curdev && cpumask_equal(curdev->cpumask, cpumask_of(cpu))) + goto out_bc; + } + + /* + * If we have an active device, then check the rating and the oneshot + * feature. + */ + if (curdev) { + /* + * Prefer one shot capable devices ! 
+ */ + if ((curdev->features & CLOCK_EVT_FEAT_ONESHOT) && + !(newdev->features & CLOCK_EVT_FEAT_ONESHOT)) + goto out_bc; + /* + * Check the rating + */ + if (curdev->rating >= newdev->rating) + goto out_bc; + } + + /* + * Replace the possibly existing device by the new + * device. If the current device is the broadcast device, do + * not give it back to the clockevents layer ! + */ + if (tick_is_broadcast_device(curdev)) { + clockevents_shutdown(curdev); + curdev = NULL; + } + clockevents_exchange_device(curdev, newdev); + tick_setup_device(td, newdev, cpu, cpumask_of(cpu)); + if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) + tick_oneshot_notify(); + + raw_spin_unlock_irqrestore(&tick_device_lock, flags); + return NOTIFY_STOP; + +out_bc: + /* + * Can the new device be used as a broadcast device ? + */ + if (tick_check_broadcast_device(newdev)) + ret = NOTIFY_STOP; + + raw_spin_unlock_irqrestore(&tick_device_lock, flags); + + return ret; +} + +/* + * Transfer the do_timer job away from a dying cpu. + * + * Called with interrupts disabled. + */ +static void tick_handover_do_timer(int *cpup) +{ + if (*cpup == tick_do_timer_cpu) { + int cpu = cpumask_first(cpu_online_mask); + + tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : + TICK_DO_TIMER_NONE; + } +} + +/* + * Shutdown an event device on a given cpu: + * + * This is called on a live CPU, when a CPU is dead. So we cannot + * access the hardware device itself. + * We just set the mode and remove it from the lists. + */ +static void tick_shutdown(unsigned int *cpup) +{ + struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); + struct clock_event_device *dev = td->evtdev; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_device_lock, flags); + td->mode = TICKDEV_MODE_PERIODIC; + if (dev) { + /* + * Prevent that the clock events layer tries to call + * the set mode function!
+ */ + dev->mode = CLOCK_EVT_MODE_UNUSED; + clockevents_exchange_device(dev, NULL); + td->evtdev = NULL; + } + raw_spin_unlock_irqrestore(&tick_device_lock, flags); +} + +static void tick_suspend(void) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + unsigned long flags; + + raw_spin_lock_irqsave(&tick_device_lock, flags); + clockevents_shutdown(td->evtdev); + raw_spin_unlock_irqrestore(&tick_device_lock, flags); +} + +static void tick_resume(void) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + unsigned long flags; + int broadcast = tick_resume_broadcast(); + + raw_spin_lock_irqsave(&tick_device_lock, flags); + clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + + if (!broadcast) { + if (td->mode == TICKDEV_MODE_PERIODIC) + tick_setup_periodic(td->evtdev, 0); + else + tick_resume_oneshot(); + } + raw_spin_unlock_irqrestore(&tick_device_lock, flags); +} + +/* + * Notification about clock event devices + */ +static int tick_notify(struct notifier_block *nb, unsigned long reason, + void *dev) +{ + switch (reason) { + + case CLOCK_EVT_NOTIFY_ADD: + return tick_check_new_device(dev); + + case CLOCK_EVT_NOTIFY_BROADCAST_ON: + case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + tick_broadcast_on_off(reason, dev); + break; + + case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: + tick_broadcast_oneshot_control(reason); + break; + + case CLOCK_EVT_NOTIFY_CPU_DYING: + tick_handover_do_timer(dev); + break; + + case CLOCK_EVT_NOTIFY_CPU_DEAD: + tick_shutdown_broadcast_oneshot(dev); + tick_shutdown_broadcast(dev); + tick_shutdown(dev); + break; + + case CLOCK_EVT_NOTIFY_SUSPEND: + tick_suspend(); + tick_suspend_broadcast(); + break; + + case CLOCK_EVT_NOTIFY_RESUME: + tick_resume(); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block tick_notifier = { + .notifier_call = tick_notify, +}; + +/** + * tick_init - initialize the tick control + * + * 
Register the notifier with the clockevents framework + */ +void __init tick_init(void) +{ + clockevents_register_notifier(&tick_notifier); +} diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h new file mode 100644 index 00000000..1009b06d --- /dev/null +++ b/kernel/time/tick-internal.h @@ -0,0 +1,146 @@ +/* + * tick internal variable and functions used by low/high res code + */ +#include +#include + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD + +#define TICK_DO_TIMER_NONE -1 +#define TICK_DO_TIMER_BOOT -2 + +DECLARE_PER_CPU(struct tick_device, tick_cpu_device); +extern ktime_t tick_next_period; +extern ktime_t tick_period; +extern int tick_do_timer_cpu __read_mostly; + +extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); +extern void tick_handle_periodic(struct clock_event_device *dev); + +extern void clockevents_shutdown(struct clock_event_device *dev); + +/* + * NO_HZ / high resolution timer shared code + */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_dev_program_event(struct clock_event_device *dev, + ktime_t expires, int force); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); +extern void tick_resume_oneshot(void); +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +extern void tick_broadcast_oneshot_control(unsigned long reason); +extern void tick_broadcast_switch_to_oneshot(void); +extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); +extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast(int cpu); +bool tick_broadcast_oneshot_available(void); +# else /* BROADCAST 
*/ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast(int cpu) { } +static inline bool tick_broadcast_oneshot_available(void) { return true; } +# endif /* !BROADCAST */ + +#else /* !ONESHOT */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) +{ + BUG(); +} +static inline void tick_resume_oneshot(void) +{ + BUG(); +} +static inline int tick_program_event(ktime_t expires, int force) +{ + return 0; +} +static inline void tick_oneshot_notify(void) { } +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) +{ + BUG(); +} +static inline void tick_broadcast_oneshot_control(unsigned long reason) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) +{ + return 0; +} +static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline bool tick_broadcast_oneshot_available(void) { return false; } +#endif /* !TICK_ONESHOT */ + +/* + * Broadcasting support + */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); +extern int tick_check_broadcast_device(struct clock_event_device *dev); +extern int tick_is_broadcast_device(struct clock_event_device *dev); +extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); +extern void tick_shutdown_broadcast(unsigned int *cpup); +extern void tick_suspend_broadcast(void); +extern int tick_resume_broadcast(void); + +extern void 
+tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); + +#else /* !BROADCAST */ + +static inline int tick_check_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} + +static inline int tick_is_broadcast_device(struct clock_event_device *dev) +{ + return 0; +} +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, + int cpu) +{ + return 0; +} +static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } +static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } +static inline void tick_shutdown_broadcast(unsigned int *cpup) { } +static inline void tick_suspend_broadcast(void) { } +static inline int tick_resume_broadcast(void) { return 0; } + +/* + * Set the periodic handler in non broadcast mode + */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, + int broadcast) +{ + dev->event_handler = tick_handle_periodic; +} +#endif /* !BROADCAST */ + +/* + * Check, if the device is functional or a dummy for broadcast + */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} + +#endif + +extern void do_timer(unsigned long ticks); +extern seqlock_t xtime_lock; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c new file mode 100644 index 00000000..2d04411a --- /dev/null +++ b/kernel/time/tick-oneshot.c @@ -0,0 +1,185 @@ +/* + * linux/kernel/time/tick-oneshot.c + * + * This file contains functions which manage high resolution tick + * related events. + * + * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007, Timesys Corp., Thomas Gleixner + * + * This code is licenced under the GPL version 2. For details see + * kernel-base/COPYING. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include "tick-internal.h" + +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +static int tick_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) + return -ETIME; + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + +/** + * tick_program_event internal worker function + */ +int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, + int force) +{ + ktime_t now = ktime_get(); + int i; + + for (i = 0;;) { + int ret = clockevents_program_event(dev, expires, now); + + if (!ret || !force) + return ret; + + dev->retries++; + /* + * We tried 3 times to program the device with the given + * min_delta_ns. If that's not working then we increase it + * and emit a warning. + */ + if (++i > 2) { + /* Increase the min. delta and try again */ + if (tick_increase_min_delta(dev)) { + /* + * Get out of the loop if min_delta_ns + * hit the limit already. That's + * better than staying here forever. + * + * We clear next_event so we have a + * chance that the box survives. + */ + printk(KERN_WARNING + "CE: Reprogramming failure. 
Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } + i = 0; + } + + now = ktime_get(); + expires = ktime_add_ns(now, dev->min_delta_ns); + } +} + +/** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + + return tick_dev_program_event(dev, expires, force); +} + +/** + * tick_resume_oneshot - resume oneshot mode + */ +void tick_resume_oneshot(void) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + tick_program_event(ktime_get(), 1); +} + +/** + * tick_setup_oneshot - setup the event device for oneshot mode (hres or nohz) + */ +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t next_event) +{ + newdev->event_handler = handler; + clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); + tick_dev_program_event(newdev, next_event, 1); +} + +/** + * tick_switch_to_oneshot - switch to oneshot mode + */ +int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) +{ + struct tick_device *td = &__get_cpu_var(tick_cpu_device); + struct clock_event_device *dev = td->evtdev; + + if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || + !tick_device_is_functional(dev)) { + + printk(KERN_INFO "Clockevents: " + "could not switch to one-shot mode:"); + if (!dev) { + printk(" no tick device\n"); + } else { + if (!tick_device_is_functional(dev)) + printk(" %s is not functional.\n", dev->name); + else + printk(" %s does not support one-shot mode.\n", + dev->name); + } + return -EINVAL; + } + + td->mode = TICKDEV_MODE_ONESHOT; + dev->event_handler = handler; + clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + tick_broadcast_switch_to_oneshot(); + return 0; +} + +/** + * tick_oneshot_mode_active - check whether the system is in oneshot mode + * + * returns 1 when
either nohz or highres are enabled. otherwise 0. + */ +int tick_oneshot_mode_active(void) +{ + unsigned long flags; + int ret; + + local_irq_save(flags); + ret = __this_cpu_read(tick_cpu_device.mode) == TICKDEV_MODE_ONESHOT; + local_irq_restore(flags); + + return ret; +} + +#ifdef CONFIG_HIGH_RES_TIMERS +/** + * tick_init_highres - switch to high resolution mode + * + * Called with interrupts disabled. + */ +int tick_init_highres(void) +{ + return tick_switch_to_oneshot(hrtimer_interrupt); +} +#endif diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c new file mode 100644 index 00000000..032f1934 --- /dev/null +++ b/kernel/time/tick-sched.c @@ -0,0 +1,861 @@ +/* + * linux/kernel/time/tick-sched.c + * + * Copyright(C) 2005-2006, Thomas Gleixner + * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner + * + * No idle tick implementation for low and high resolution timers + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * Distribute under GPLv2. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "tick-internal.h" + +/* + * Per cpu nohz control structure + */ +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); + +/* + * The time, when the last jiffy update happened. Protected by xtime_lock. + */ +static ktime_t last_jiffies_update; + +struct tick_sched *tick_get_tick_sched(int cpu) +{ + return &per_cpu(tick_cpu_sched, cpu); +} + +/* + * Must be called with interrupts disabled ! 
+ */ +static void tick_do_update_jiffies64(ktime_t now) +{ + unsigned long ticks = 0; + ktime_t delta; + + /* + * Do a quick check without holding xtime_lock: + */ + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 < tick_period.tv64) + return; + + /* Reevaluate with xtime_lock held */ + write_seqlock(&xtime_lock); + + delta = ktime_sub(now, last_jiffies_update); + if (delta.tv64 >= tick_period.tv64) { + + delta = ktime_sub(delta, tick_period); + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); + + /* Slow path for long timeouts: more than one period elapsed, account the extra ticks at once */ + if (unlikely(delta.tv64 >= tick_period.tv64)) { + s64 incr = ktime_to_ns(tick_period); + + ticks = ktime_divns(delta, incr); + + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } + do_timer(++ticks); + + /* Keep the tick_next_period variable up to date */ + tick_next_period = ktime_add(last_jiffies_update, tick_period); + } + write_sequnlock(&xtime_lock); +} + +/* + * Initialize and retrieve the time of the last jiffies update. + */ +static ktime_t tick_init_jiffy_update(void) +{ + ktime_t period; + + write_seqlock(&xtime_lock); + /* Did we start the jiffies update yet ? */ + if (last_jiffies_update.tv64 == 0) + last_jiffies_update = tick_next_period; + period = last_jiffies_update; + write_sequnlock(&xtime_lock); + return period; +} + +/* + * NOHZ - aka dynamic tick functionality + */ +#ifdef CONFIG_NO_HZ +/* + * NO HZ enabled ?
+ */ +static int tick_nohz_enabled __read_mostly = 1; + +/* + * Enable / Disable tickless mode + */ +static int __init setup_tick_nohz(char *str) +{ + if (!strcmp(str, "off")) + tick_nohz_enabled = 0; + else if (!strcmp(str, "on")) + tick_nohz_enabled = 1; + else + return 0; + return 1; +} + +__setup("nohz=", setup_tick_nohz); + +/** + * tick_nohz_update_jiffies - update jiffies when idle was interrupted + * + * Called from interrupt entry when the CPU was idle + * + * In case the sched_tick was stopped on this CPU, we have to check if jiffies + * must be updated. Otherwise an interrupt handler could use a stale jiffy + * value. We do this unconditionally on any cpu, as we don't know whether the + * cpu, which has the update task assigned is in a long sleep. + */ +static void tick_nohz_update_jiffies(ktime_t now) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + unsigned long flags; + + cpumask_clear_cpu(cpu, nohz_cpu_mask); + ts->idle_waketime = now; + + local_irq_save(flags); + tick_do_update_jiffies64(now); + local_irq_restore(flags); + + touch_softlockup_watchdog(); +} + +/* + * Updates the per cpu time idle statistics counters + */ +static void +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) +{ + ktime_t delta; + + if (ts->idle_active) { + delta = ktime_sub(now, ts->idle_entrytime); + ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); + if (nr_iowait_cpu(cpu) > 0) + ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); + ts->idle_entrytime = now; + } + + if (last_update_time) + *last_update_time = ktime_to_us(now); + +} + +static void tick_nohz_stop_idle(int cpu, ktime_t now) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + update_ts_time_stats(cpu, ts, now, NULL); + ts->idle_active = 0; + + sched_clock_idle_wakeup_event(0); +} + +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) +{ + ktime_t now; + + now = ktime_get(); + + 
update_ts_time_stats(cpu, ts, now, NULL); + + ts->idle_entrytime = now; + ts->idle_active = 1; + sched_clock_idle_sleep_event(); + return now; +} + +/** + * get_cpu_idle_time_us - get the total idle time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in + * + * Return the cumulative idle time (since boot) for a given + * CPU, in microseconds. The idle time returned includes + * the iowait time (unlike what "top" and co report). + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled. + */ +u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!tick_nohz_enabled) + return -1; + + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); + + return ktime_to_us(ts->idle_sleeptime); +} +EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); + +/** + * get_cpu_iowait_time_us - get the total iowait time of a cpu + * @cpu: CPU number to query + * @last_update_time: variable to store update time in + * + * Return the cumulative iowait time (since boot) for a given + * CPU, in microseconds. + * + * This time is measured via accounting rather than sampling, + * and is as accurate as ktime_get() is. + * + * This function returns -1 if NOHZ is not enabled.
+ */ +u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + + if (!tick_nohz_enabled) + return -1; + + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); + + return ktime_to_us(ts->iowait_sleeptime); +} +EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); + +/** + * tick_nohz_stop_sched_tick - stop the idle tick from the idle task + * @inidle: if zero and ts->inidle is not already set, the function + * returns without doing anything (presumably zero is passed + * from irq_exit() and nonzero from the idle loop) + * + * When the next event is more than a tick into the future, stop the idle tick. + * Called either from the idle loop or from irq_exit() when an idle period was + * just interrupted by an interrupt which did not cause a reschedule. + */ +void tick_nohz_stop_sched_tick(int inidle) +{ + unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; + struct tick_sched *ts; + ktime_t last_update, expires, now; + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + u64 time_delta; + int cpu; + + local_irq_save(flags); + + cpu = smp_processor_id(); + ts = &per_cpu(tick_cpu_sched, cpu); + + /* + * Call to tick_nohz_start_idle stops the last_update_time from being + * updated. Thus, it must not be called in the event we are called from + * irq_exit() with the prior state different than idle. + */ + if (!inidle && !ts->inidle) + goto end; + + /* + * Set ts->inidle unconditionally. Even if the system did not + * switch to NOHZ mode the cpu frequency governors rely on the + * update of the idle time accounting in tick_nohz_start_idle(). + */ + ts->inidle = 1; + + now = tick_nohz_start_idle(cpu, ts); + + /* + * If this cpu is offline and it is the one which updates + * jiffies, then give up the assignment and let it be taken by + * the cpu which runs the tick timer next. If we don't drop + * this here the jiffies might be stale and do_timer() never + * invoked. 
+ */ + if (unlikely(!cpu_online(cpu))) { + if (cpu == tick_do_timer_cpu) + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + } + + if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) + goto end; + + if (need_resched()) + goto end; + + if (unlikely(local_softirq_pending() && cpu_online(cpu))) { + static int ratelimit; + + if (ratelimit < 10) { + printk(KERN_ERR "NOHZ: local_softirq_pending %02x\n", + (unsigned int) local_softirq_pending()); + ratelimit++; + } + goto end; + } + + ts->idle_calls++; + /* + * Read jiffies and the time when jiffies were updated last, together + * with the maximum deferment reported by the timekeeping code, in one + * xtime_lock read section. + */ + do { + seq = read_seqbegin(&xtime_lock); + last_update = last_jiffies_update; + last_jiffies = jiffies; + time_delta = timekeeping_max_deferment(); + } while (read_seqretry(&xtime_lock, seq)); + + if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + arch_needs_cpu(cpu)) { + next_jiffies = last_jiffies + 1; + delta_jiffies = 1; + } else { + /* Get the next timer wheel timer */ + next_jiffies = get_next_timer_interrupt(last_jiffies); + delta_jiffies = next_jiffies - last_jiffies; + } + /* + * Do not stop the tick, if we are only one off + * or if the cpu is required for rcu, printk or the architecture + * (delta_jiffies was forced to 1 above in those cases) + */ + if (!ts->tick_stopped && delta_jiffies == 1) + goto out; + + /* Schedule the tick, if we are at least one jiffy off */ + if ((long)delta_jiffies >= 1) { + + /* + * If this cpu is the one which updates jiffies, then + * give up the assignment and let it be taken by the + * cpu which runs the tick timer next, which might be + * this cpu as well. If we don't drop this here the + * jiffies might be stale and do_timer() never + * invoked. Keep track of the fact that it was the one + * which had the do_timer() duty last. If this cpu is + * the one which had the do_timer() duty last, we + * limit the sleep time to the timekeeping + * max_deferement value which we retrieved + * above. Otherwise we can sleep as long as we want. 
+ */ + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + time_delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + time_delta = KTIME_MAX; + } + + /* + * Calculate the expiry time for the next timer wheel + * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals + * that there is no timer pending or at least extremely + * far into the future (12 days for HZ=1000). In this + * case we set the expiry to the end of time. + */ + if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { + /* + * Calculate the time delta for the next timer event. + * If the time delta exceeds the maximum time delta + * permitted by the current clocksource then adjust + * the time delta accordingly to ensure the + * clocksource does not wrap. + */ + time_delta = min_t(u64, time_delta, + tick_period.tv64 * delta_jiffies); + } + + if (time_delta < KTIME_MAX) + expires = ktime_add_ns(last_update, time_delta); + else + expires.tv64 = KTIME_MAX; + + if (delta_jiffies > 1) + cpumask_set_cpu(cpu, nohz_cpu_mask); + + /* Skip reprogramming of the event if it has not changed */ + if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) + goto out; + + /* + * tick_nohz_stop_sched_tick can be called several times before + * tick_nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in tick_nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + select_nohz_load_balancer(1); + + ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; + ts->idle_jiffies = last_jiffies; + rcu_enter_nohz(); + } + + ts->idle_sleeps++; + + /* Record the planned expiry time */ + ts->idle_expires = expires; + + /* + * If the expiration time == KTIME_MAX, then + * in this case we simply stop the tick timer. 
+ */ + if (unlikely(expires.tv64 == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + goto out; + } + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start(&ts->sched_timer, expires, + HRTIMER_MODE_ABS_PINNED); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + goto out; + } else if (!tick_program_event(expires, 0)) + goto out; + /* + * We are past the event already. So we crossed a + * jiffy boundary. Update jiffies and raise the + * softirq. + */ + tick_do_update_jiffies64(ktime_get()); + cpumask_clear_cpu(cpu, nohz_cpu_mask); + } + raise_softirq_irqoff(TIMER_SOFTIRQ); +out: + ts->next_jiffies = next_jiffies; + ts->last_jiffies = last_jiffies; + ts->sleep_length = ktime_sub(dev->next_event, now); +end: + local_irq_restore(flags); +} + +/** + * tick_nohz_get_sleep_length - return the length of the current sleep + * + * Called from power state control code with interrupts disabled + * + * Returns ts->sleep_length of the current cpu, which + * tick_nohz_stop_sched_tick() sets to dev->next_event minus the time + * returned by tick_nohz_start_idle(). + */ +ktime_t tick_nohz_get_sleep_length(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + return ts->sleep_length; +} + +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->idle_tick); + + while (1) { + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + } else { + if (!tick_program_event( + hrtimer_get_expires(&ts->sched_timer), 0)) + break; + } + /* Reread time and update jiffies */ + now = ktime_get(); + tick_do_update_jiffies64(now); + } +} + +/** + * tick_nohz_restart_sched_tick - restart the idle tick from the idle task + * + * Restart the idle tick when the CPU is woken up from idle + */ +void 
tick_nohz_restart_sched_tick(void) +{ + int cpu = smp_processor_id(); + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + unsigned long ticks; +#endif + ktime_t now; + + local_irq_disable(); + if (ts->idle_active || (ts->inidle && ts->tick_stopped)) + now = ktime_get(); + + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + + if (!ts->inidle || !ts->tick_stopped) { + ts->inidle = 0; + local_irq_enable(); + return; + } + + ts->inidle = 0; + + rcu_exit_nohz(); + + /* Update jiffies first */ + select_nohz_load_balancer(0); + tick_do_update_jiffies64(now); + cpumask_clear_cpu(cpu, nohz_cpu_mask); + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING + /* + * We stopped the tick in idle. Update process times would miss the + * time we slept as update_process_times does only a 1 tick + * accounting. Enforce that this is accounted to idle! + */ + ticks = jiffies - ts->idle_jiffies; + /* + * We might be one off. Do not randomly account a huge number of ticks! + * ticks is unsigned, so a value of LONG_MAX or more means that the + * subtraction above wrapped around; nothing is accounted then. + */ + if (ticks && ticks < LONG_MAX) + account_idle_ticks(ticks); +#endif + + touch_softlockup_watchdog(); + /* + * Cancel the scheduled timer and restore the tick + */ + ts->tick_stopped = 0; + ts->idle_exittime = now; + + tick_nohz_restart(ts, now); + + local_irq_enable(); +} + +static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) +{ + hrtimer_forward(&ts->sched_timer, now, tick_period); + return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); +} + +/* + * The nohz low res interrupt handler + */ +static void tick_nohz_handler(struct clock_event_device *dev) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + struct pt_regs *regs = get_irq_regs(); + int cpu = smp_processor_id(); + ktime_t now = ktime_get(); + + dev->next_event.tv64 = KTIME_MAX; + + /* + * Check if the do_timer duty was dropped. We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. 
If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + tick_do_timer_cpu = cpu; + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); + + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on completely idle SMP systems while + * waiting on the login prompt. We also increment the "start + * of idle" jiffy stamp so the idle accounting adjustment we + * do when we go busy again does not account too many ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + + while (tick_nohz_reprogram(ts, now)) { + now = ktime_get(); + tick_do_update_jiffies64(now); + } +} + +/** + * tick_nohz_switch_to_nohz - switch to nohz mode + * + * Does nothing when tick_nohz_enabled is clear or when + * tick_switch_to_oneshot() fails. Otherwise sets this cpu to + * NOHZ_MODE_LOWRES and programs the first tick event, advancing by + * tick_period until tick_program_event() succeeds. + */ +static void tick_nohz_switch_to_nohz(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t next; + + if (!tick_nohz_enabled) + return; + + local_irq_disable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) { + local_irq_enable(); + return; + } + + ts->nohz_mode = NOHZ_MODE_LOWRES; + + /* + * Recycle the hrtimer in ts, so we can share the + * hrtimer_forward with the highres code. + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + /* Get the next period */ + next = tick_init_jiffy_update(); + + for (;;) { + hrtimer_set_expires(&ts->sched_timer, next); + if (!tick_program_event(next, 0)) + break; + next = ktime_add(next, tick_period); + } + local_irq_enable(); +} + +/* + * When NOHZ is enabled and the tick is stopped, we need to kick the + * tick timer from irq_enter() so that the jiffies update is kept + * alive during long running softirqs. 
That's ugly as hell, but + * correctness is key even if we need to fix the offending softirq in + * the first place. + * + * Note, this is different to tick_nohz_restart. We just kick the + * timer and do not touch the other magic bits which need to be done + * when idle is left. + */ +static void tick_nohz_kick_tick(int cpu, ktime_t now) +{ +#if 0 + /* + * Switch back to 2.6.27 behaviour: the code below is compiled out, + * so this function currently does nothing. + */ + + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t delta; + + /* + * Do not touch the tick device, when the next expiry is either + * already reached or less than or equal to the tick period. + */ + delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now); + if (delta.tv64 <= tick_period.tv64) + return; + + tick_nohz_restart(ts, now); +#endif +} + +static inline void tick_check_nohz(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + ktime_t now; + + if (!ts->idle_active && !ts->tick_stopped) + return; + now = ktime_get(); + if (ts->idle_active) + tick_nohz_stop_idle(cpu, now); + if (ts->tick_stopped) { + tick_nohz_update_jiffies(now); + tick_nohz_kick_tick(cpu, now); + } +} + +#else + +static inline void tick_nohz_switch_to_nohz(void) { } +static inline void tick_check_nohz(int cpu) { } + +#endif /* NO_HZ */ + +/* + * Called from irq_enter to notify about the possible interruption of idle(). + * Runs the oneshot broadcast check first, then the NOHZ idle check. + */ +void tick_check_idle(int cpu) +{ + tick_check_oneshot_broadcast(cpu); + tick_check_nohz(cpu); +} + +/* + * High resolution timer specific code + */ +#ifdef CONFIG_HIGH_RES_TIMERS +/* + * We rearm the timer until we get disabled by the idle code. + * Called with interrupts disabled and timer->base->cpu_base->lock held. + */ +static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) +{ + struct tick_sched *ts = + container_of(timer, struct tick_sched, sched_timer); + struct pt_regs *regs = get_irq_regs(); + ktime_t now = ktime_get(); + int cpu = smp_processor_id(); + +#ifdef CONFIG_NO_HZ + /* + * Check if the do_timer duty was dropped. 
We don't care about + * concurrency: This happens only when the cpu in charge went + * into a long sleep. If two cpus happen to assign themself to + * this duty, then the jiffies update is still serialized by + * xtime_lock. + */ + if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) + tick_do_timer_cpu = cpu; +#endif + + /* Check, if the jiffies need an update */ + if (tick_do_timer_cpu == cpu) + tick_do_update_jiffies64(now); + + /* + * Do not call, when we are not in irq context and have + * no valid regs pointer + */ + if (regs) { + /* + * When we are idle and the tick is stopped, we have to touch + * the watchdog as we might not schedule for a really long + * time. This happens on completely idle SMP systems while + * waiting on the login prompt. We also increment the "start of + * idle" jiffy stamp so the idle accounting adjustment we do + * when we go busy again does not account too many ticks. + */ + if (ts->tick_stopped) { + touch_softlockup_watchdog(); + ts->idle_jiffies++; + } + update_process_times(user_mode(regs)); + profile_tick(CPU_PROFILING); + } + + hrtimer_forward(timer, now, tick_period); + + return HRTIMER_RESTART; +} + +/** + * tick_setup_sched_timer - setup the tick emulation timer + * + * Initializes ts->sched_timer of the current cpu with tick_sched_timer() + * as its callback and starts it at the next tick period. When + * CONFIG_NO_HZ is set and tick_nohz_enabled is nonzero, the cpu is also + * switched to NOHZ_MODE_HIGHRES. + */ +void tick_setup_sched_timer(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + ktime_t now = ktime_get(); + + /* + * Emulate tick processing via per-CPU hrtimers: + */ + hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + ts->sched_timer.function = tick_sched_timer; + + /* Get the next period (per cpu) */ + hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + + for (;;) { + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start_expires(&ts->sched_timer, + HRTIMER_MODE_ABS_PINNED); + /* Check, if the timer was already in the past */ + if (hrtimer_active(&ts->sched_timer)) + break; + now = ktime_get(); + } + +#ifdef CONFIG_NO_HZ + if (tick_nohz_enabled) + ts->nohz_mode = NOHZ_MODE_HIGHRES; +#endif +} 
+#endif /* HIGH_RES_TIMERS */ + +#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS +void tick_cancel_sched_timer(int cpu) +{ + struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + +# ifdef CONFIG_HIGH_RES_TIMERS + if (ts->sched_timer.base) + hrtimer_cancel(&ts->sched_timer); +# endif + + ts->nohz_mode = NOHZ_MODE_INACTIVE; +} +#endif + +/** + * Async notification about clocksource changes: sets bit 0 of + * check_clocks for every possible cpu. + */ +void tick_clock_notify(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + set_bit(0, &per_cpu(tick_cpu_sched, cpu).check_clocks); +} + +/* + * Async notification about clock event changes: sets bit 0 of + * check_clocks for the current cpu only. + */ +void tick_oneshot_notify(void) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + set_bit(0, &ts->check_clocks); +} + +/** + * Check, if a change happened, which makes oneshot possible. + * + * Called cyclic from the hrtimer softirq (driven by the timer + * softirq). allow_nohz signals, that we can switch into low-res nohz + * mode, because high resolution timers are disabled (either compile + * or runtime). + * + * Returns 1 only when a change is pending, nohz mode is still inactive, + * oneshot mode is usable and allow_nohz is zero. Under the same + * conditions with a nonzero allow_nohz, tick_nohz_switch_to_nohz() is + * called and 0 is returned. In every other case 0 is returned. + */ +int tick_check_oneshot_change(int allow_nohz) +{ + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); + + if (!test_and_clear_bit(0, &ts->check_clocks)) + return 0; + + if (ts->nohz_mode != NOHZ_MODE_INACTIVE) + return 0; + + if (!timekeeping_valid_for_hres() || !tick_is_oneshot_available()) + return 0; + + if (!allow_nohz) + return 1; + + tick_nohz_switch_to_nohz(); + return 0; +} diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c new file mode 100644 index 00000000..a9ae3699 --- /dev/null +++ b/kernel/time/timecompare.c @@ -0,0 +1,193 @@ +/* + * Copyright (C) 2009 Intel Corporation. + * Author: Patrick Ohly + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include + +/* + * fixed point arithmetic scale factor for skew + * + * Usually one would measure skew in ppb (parts per billion, 1e9), but + * using a power of 2 (here 2^30) simplifies the math. + */ +#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30) + +ktime_t timecompare_transform(struct timecompare *sync, + u64 source_tstamp) +{ + u64 nsec; + + nsec = source_tstamp + sync->offset; + nsec += (s64)(source_tstamp - sync->last_update) * sync->skew / + TIMECOMPARE_SKEW_RESOLUTION; + + return ns_to_ktime(nsec); +} +EXPORT_SYMBOL_GPL(timecompare_transform); + +int timecompare_offset(struct timecompare *sync, + s64 *offset, + u64 *source_tstamp) +{ + u64 start_source = 0, end_source = 0; + struct { + s64 offset; + s64 duration_target; + } buffer[10], sample, *samples; + int counter = 0, i; + int used; + int index; + int num_samples = sync->num_samples; + + if (num_samples > ARRAY_SIZE(buffer)) { + samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC); + if (!samples) { + samples = buffer; + num_samples = ARRAY_SIZE(buffer); + } + } else { + samples = buffer; + } + + /* + * run until we have enough valid samples, but do not try forever + * (the loop gives up after 100000 iterations) + */ + i = 0; + counter = 0; + while (1) { + u64 ts; + ktime_t start, end; + + start = sync->target(); + ts = timecounter_read(sync->source); + end = sync->target(); + + if (!i) + start_source = ts; + + /* ignore negative durations */ + sample.duration_target = ktime_to_ns(ktime_sub(end, start)); + if (sample.duration_target >= 0) { + /* + * assume 
symetric delay to and from source: + * average target time corresponds to measured + * source time + */ + sample.offset = + (ktime_to_ns(end) + ktime_to_ns(start)) / 2 - + ts; + + /* simple insertion sort based on duration, shortest first */ + index = counter - 1; + while (index >= 0) { + if (samples[index].duration_target < + sample.duration_target) + break; + samples[index + 1] = samples[index]; + index--; + } + samples[index + 1] = sample; + counter++; + } + + i++; + if (counter >= num_samples || i >= 100000) { + end_source = ts; + break; + } + } + + *source_tstamp = (end_source + start_source) / 2; + + /* + * remove outliers by only using the 75% of the samples with the + * shortest durations; with fewer than two samples use them all + */ + used = counter * 3 / 4; + if (!used) + used = counter; + if (used) { + /* calculate average */ + s64 off = 0; + for (index = 0; index < used; index++) + off += samples[index].offset; + *offset = div_s64(off, used); + } + + if (samples && samples != buffer) + kfree(samples); + + return used; +} +EXPORT_SYMBOL_GPL(timecompare_offset); + +void __timecompare_update(struct timecompare *sync, + u64 source_tstamp) +{ + s64 offset; + u64 average_time; + + if (!timecompare_offset(sync, &offset, &average_time)) + return; + + if (!sync->last_update) { + sync->last_update = average_time; + sync->offset = offset; + sync->skew = 0; + } else { + s64 delta_nsec = average_time - sync->last_update; + + /* avoid division by negative or small deltas */ + if (delta_nsec >= 10000) { + s64 delta_offset_nsec = offset - sync->offset; + s64 skew; /* delta_offset_nsec * + TIMECOMPARE_SKEW_RESOLUTION / + delta_nsec */ + u64 divisor; + + /* div_s64() is limited to 32 bit divisor */ + skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION; + divisor = delta_nsec; + while (unlikely(divisor >= ((s64)1) << 32)) { + /* divide both by 2; beware, right shift + of negative value has an implementation-defined + result and can only be used for + the positive divisor */ + skew = div_s64(skew, 2); + divisor >>= 1; + } + skew = div_s64(skew, divisor); + + /* + * Calculate new 
overall skew as 4/16 the + * old value and 12/16 the new one. This is + * a rather arbitrary tradeoff between + * only using the latest measurement (0/16 and + * 16/16) and even more weight on past measurements. + */ +#define TIMECOMPARE_NEW_SKEW_PER_16 12 + sync->skew = + div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) * + sync->skew + + TIMECOMPARE_NEW_SKEW_PER_16 * skew, + 16); + sync->last_update = average_time; + sync->offset = offset; + } + } +} +EXPORT_SYMBOL_GPL(__timecompare_update); diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c new file mode 100644 index 00000000..86628e75 --- /dev/null +++ b/kernel/time/timeconv.c @@ -0,0 +1,127 @@ +/* + * Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc. + * This file is part of the GNU C Library. + * Contributed by Paul Eggert (eggert@twinsun.com). + * + * The GNU C Library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * The GNU C Library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the GNU C Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* + * Converts the calendar time to broken-down time representation + * Based on code from glibc-2.6 + * + * 2009-7-14: + * Moved from glibc-2.6 to kernel by Zhaolei + */ + +#include +#include + +/* + * Nonzero if YEAR is a leap year (every 4 years, + * except every 100th isn't, and every 400th is). 
+ */ +static int __isleap(long year) +{ + return (year) % 4 == 0 && ((year) % 100 != 0 || (year) % 400 == 0); +} + +/* floor division for long: rounds toward negative infinity (for b > 0) */ +static long math_div(long a, long b) +{ + return a / b - (a % b < 0); +} + +/* + * How many leap years there are in the range y1 up to, but not including, + * y2; y1 must be less than or equal to y2 + */ +static long leaps_between(long y1, long y2) +{ + long leaps1 = math_div(y1 - 1, 4) - math_div(y1 - 1, 100) + + math_div(y1 - 1, 400); + long leaps2 = math_div(y2 - 1, 4) - math_div(y2 - 1, 100) + + math_div(y2 - 1, 400); + return leaps2 - leaps1; +} + +/* How many days come before each month (0-12). */ +static const unsigned short __mon_yday[2][13] = { + /* Normal years. */ + {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365}, + /* Leap years. */ + {0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335, 366} +}; + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (SECS_PER_HOUR * 24) + +/** + * time_to_tm - converts the calendar time to local broken-down time + * + * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, + * Coordinated Universal Time (UTC). + * @offset: offset in seconds to add to totalsecs. + * @result: pointer to struct tm variable to receive broken-down time + */ +void time_to_tm(time_t totalsecs, int offset, struct tm *result) +{ + long days, rem, y; + const unsigned short *ip; + + days = totalsecs / SECS_PER_DAY; + rem = totalsecs % SECS_PER_DAY; + rem += offset; + while (rem < 0) { + rem += SECS_PER_DAY; + --days; + } + while (rem >= SECS_PER_DAY) { + rem -= SECS_PER_DAY; + ++days; + } + + result->tm_hour = rem / SECS_PER_HOUR; + rem %= SECS_PER_HOUR; + result->tm_min = rem / 60; + result->tm_sec = rem % 60; + + /* January 1, 1970 was a Thursday. */ + result->tm_wday = (4 + days) % 7; + if (result->tm_wday < 0) + result->tm_wday += 7; + + y = 1970; + + while (days < 0 || days >= (__isleap(y) ? 366 : 365)) { + /* Guess a corrected year, assuming 365 days per year. 
*/ + long yg = y + math_div(days, 365); + + /* + * Adjust DAYS and Y to match the guessed year: 365 days per + * guessed year plus the leap days in between. + */ + days -= (yg - y) * 365 + leaps_between(y, yg); + y = yg; + } + + result->tm_year = y - 1900; + + result->tm_yday = days; + + ip = __mon_yday[__isleap(y)]; + for (y = 11; days < ip[y]; y--) + continue; + days -= ip[y]; + + result->tm_mon = y; + result->tm_mday = days + 1; +} +EXPORT_SYMBOL(time_to_tm); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c new file mode 100644 index 00000000..9b28d040 --- /dev/null +++ b/kernel/time/timekeeping.c @@ -0,0 +1,1137 @@ +/* + * linux/kernel/time/timekeeping.c + * + * Kernel timekeeping code and accessor functions + * + * This code was moved from linux/kernel/timer.c. + * Please see that file for copyright and history logs. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Structure holding internal timekeeping values. */ +struct timekeeper { + /* Current clocksource used for timekeeping. */ + struct clocksource *clock; + /* The shift value of the current clocksource. */ + int shift; + + /* Number of clock cycles in one NTP interval. */ + cycle_t cycle_interval; + /* Number of clock shifted nanoseconds in one NTP interval. */ + u64 xtime_interval; + /* shifted nanoseconds left over when rounding cycle_interval */ + s64 xtime_remainder; + /* Raw nanoseconds accumulated per NTP interval. */ + u32 raw_interval; + + /* Clock shifted nanoseconds remainder not stored in xtime.tv_nsec. */ + u64 xtime_nsec; + /* Difference between accumulated time and NTP time in ntp + * shifted nanoseconds. */ + s64 ntp_error; + /* Shift conversion between clock shifted nanoseconds and + * ntp shifted nanoseconds. */ + int ntp_error_shift; + /* NTP adjusted clock multiplier */ + u32 mult; +}; + +static struct timekeeper timekeeper; + +/** + * timekeeper_setup_internals - Set up internals to use clocksource clock. 
+ * + * @clock: Pointer to clocksource. + * + * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment + * pair and interval request. + * + * Unless you're the timekeeping code, you should not be using this! + */ +static void timekeeper_setup_internals(struct clocksource *clock) +{ + cycle_t interval; + u64 tmp, ntpinterval; + + timekeeper.clock = clock; + clock->cycle_last = clock->read(clock); + + /* + * Do the ns -> cycle conversion first, using original mult + * (rounded to nearest, and never less than one cycle) + */ + tmp = NTP_INTERVAL_LENGTH; + tmp <<= clock->shift; + ntpinterval = tmp; + tmp += clock->mult/2; + do_div(tmp, clock->mult); + if (tmp == 0) + tmp = 1; + + interval = (cycle_t) tmp; + timekeeper.cycle_interval = interval; + + /* Go back from cycles -> shifted ns */ + timekeeper.xtime_interval = (u64) interval * clock->mult; + timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval; + timekeeper.raw_interval = + ((u64) interval * clock->mult) >> clock->shift; + + timekeeper.xtime_nsec = 0; + timekeeper.shift = clock->shift; + + timekeeper.ntp_error = 0; + timekeeper.ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; + + /* + * The timekeeper keeps its own mult value for the currently + * active clocksource. This value will be adjusted via NTP + * to counteract clock drifting. + */ + timekeeper.mult = clock->mult; +} + +/* Timekeeper helper functions. */ +static inline s64 timekeeping_get_ns(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* return delta convert to nanoseconds using ntp adjusted mult. 
*/ + return clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); +} + +static inline s64 timekeeping_get_ns_raw(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + + /* read clocksource: */ + clock = timekeeper.clock; + cycle_now = clock->read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* + * return delta converted to nanoseconds using the mult and shift of + * the clocksource itself, i.e. without the NTP adjustment. + */ + return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); +} + +/* + * This seqlock protects us from races in SMP while + * playing with xtime. + */ +__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); + + +/* + * The current time + * wall_to_monotonic is what we need to add to xtime (or xtime corrected + * for sub-jiffy times) to get to monotonic time. Monotonic is pegged + * at zero at system boot time, so wall_to_monotonic will be negative, + * however, we will ALWAYS keep the tv_nsec part positive so we can use + * the usual normalization. + * + * wall_to_monotonic is moved after resume from suspend for the monotonic + * time not to jump. We need to add total_sleep_time to wall_to_monotonic + * to get the real boot based time offset. + * + * - wall_to_monotonic is no longer the boot time, getboottime must be + * used instead. + */ +static struct timespec xtime __attribute__ ((aligned (16))); +static struct timespec wall_to_monotonic __attribute__ ((aligned (16))); +static struct timespec total_sleep_time; + +/* + * The raw monotonic time for the CLOCK_MONOTONIC_RAW posix clock. 
+ */ +static struct timespec raw_time; + +/* flag indicating whether timekeeping is suspended */ +int __read_mostly timekeeping_suspended; + +/* + * Apply a leap second: leapsecond is added to xtime.tv_sec and subtracted + * from wall_to_monotonic.tv_sec, so the sum of the two is unchanged; the + * vsyscall data is updated afterwards. + * + * must hold xtime_lock + */ +void timekeeping_leap_insert(int leapsecond) +{ + xtime.tv_sec += leapsecond; + wall_to_monotonic.tv_sec -= leapsecond; + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); +} + +/** + * timekeeping_forward_now - update clock to the current time + * + * Forward the current clock to update its state since the last call to + * update_wall_time(). This is useful before significant clock changes, + * as it avoids having to deal with this time offset explicitly. + * + * The elapsed cycles are added to xtime using the NTP adjusted + * timekeeper.mult (plus arch_gettimeoffset()) and to raw_time using + * the unadjusted clock->mult. + */ +static void timekeeping_forward_now(void) +{ + cycle_t cycle_now, cycle_delta; + struct clocksource *clock; + s64 nsec; + + clock = timekeeper.clock; + cycle_now = clock->read(clock); + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + clock->cycle_last = cycle_now; + + nsec = clocksource_cyc2ns(cycle_delta, timekeeper.mult, + timekeeper.shift); + + /* If arch requires, add in gettimeoffset() */ + nsec += arch_gettimeoffset(); + + timespec_add_ns(&xtime, nsec); + + nsec = clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); + timespec_add_ns(&raw_time, nsec); +} + +/** + * getnstimeofday - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set + * + * Returns the time of day in a timespec. 
+ *
+ * The result is xtime plus the nanoseconds from timekeeping_get_ns() and
+ * arch_gettimeoffset(), sampled consistently under the xtime_lock seqlock.
+ * Warns if called while timekeeping_suspended is set.
+ */
+void getnstimeofday(struct timespec *ts)
+{
+	unsigned long seq;
+	s64 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	/* retry until no writer touched xtime_lock while we sampled */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		*ts = xtime;
+		nsecs = timekeeping_get_ns();
+
+		/* If arch requires, add in gettimeoffset() */
+		nsecs += arch_gettimeoffset();
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	timespec_add_ns(ts, nsecs);
+}
+
+EXPORT_SYMBOL(getnstimeofday);
+
+/*
+ * ktime_get - monotonic time as a ktime_t
+ *
+ * Sums xtime and wall_to_monotonic with the nanoseconds from
+ * timekeeping_get_ns() and arch_gettimeoffset(), all read inside one
+ * xtime_lock read section.
+ */
+ktime_t ktime_get(void)
+{
+	unsigned int seq;
+	s64 secs, nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		secs = xtime.tv_sec + wall_to_monotonic.tv_sec;
+		nsecs = xtime.tv_nsec + wall_to_monotonic.tv_nsec;
+		nsecs += timekeeping_get_ns();
+		/* If arch requires, add in gettimeoffset() */
+		nsecs += arch_gettimeoffset();
+
+	} while (read_seqretry(&xtime_lock, seq));
+	/*
+	 * Use ktime_set/ktime_add_ns to create a proper ktime on
+	 * 32-bit architectures without CONFIG_KTIME_SCALAR.
+	 */
+	return ktime_add_ns(ktime_set(secs, 0), nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get);
+
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ * @ts:		pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by @ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+	struct timespec tomono;
+	unsigned int seq;
+	s64 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	/* snapshot xtime, the offset and the sub-tick ns in one read section */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		*ts = xtime;
+		tomono = wall_to_monotonic;
+		nsecs = timekeeping_get_ns();
+		/* If arch requires, add in gettimeoffset() */
+		nsecs += arch_gettimeoffset();
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	/* the summed tv_nsec may exceed one second; normalize it */
+	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+				ts->tv_nsec + tomono.tv_nsec + nsecs);
+}
+EXPORT_SYMBOL_GPL(ktime_get_ts);
+
+#ifdef CONFIG_NTP_PPS
+
+/**
+ * getnstime_raw_and_real - get day and raw monotonic time in timespec format
+ * @ts_raw:	pointer to the timespec to be set to raw monotonic time
+ * @ts_real:	pointer to the timespec to be set to the time of day
+ *
+ * This function reads both the time of day and raw monotonic time at the
+ * same time atomically and stores the resulting timestamps in timespec
+ * format.
+ */
+void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
+{
+	unsigned long seq;
+	s64 nsecs_raw, nsecs_real;
+
+	WARN_ON_ONCE(timekeeping_suspended);
+
+	do {
+		u32 arch_offset;
+
+		seq = read_seqbegin(&xtime_lock);
+
+		*ts_raw = raw_time;
+		*ts_real = xtime;
+
+		nsecs_raw = timekeeping_get_ns_raw();
+		nsecs_real = timekeeping_get_ns();
+
+		/* If arch requires, add in gettimeoffset() */
+		/* read once so both timestamps get the same offset */
+		arch_offset = arch_gettimeoffset();
+		nsecs_raw += arch_offset;
+		nsecs_real += arch_offset;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	timespec_add_ns(ts_raw, nsecs_raw);
+	timespec_add_ns(ts_real, nsecs_real);
+}
+EXPORT_SYMBOL(getnstime_raw_and_real);
+
+#endif /* CONFIG_NTP_PPS */
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv:		pointer to the timeval to be set
+ *
+ * Thin wrapper around getnstimeofday(); the nanosecond part is
+ * truncated (not rounded) to microseconds.
+ *
+ * NOTE: Users should be converted to using getnstimeofday()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+	struct timespec now;
+
+	getnstimeofday(&now);
+	tv->tv_sec = now.tv_sec;
+	tv->tv_usec = now.tv_nsec/1000;
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv:		pointer to the timespec variable containing the new time
+ *
+ * Sets the time of day to the new time and update NTP and notify hrtimers
+ *
+ * Returns 0 on success, or -EINVAL when @tv->tv_nsec is outside
+ * [0, NSEC_PER_SEC).
+ */
+int do_settimeofday(const struct timespec *tv)
+{
+	struct timespec ts_delta;
+	unsigned long flags;
+
+	/* the unsigned compare also rejects a negative tv_nsec */
+	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	/* bring xtime up to "now" before computing the step */
+	timekeeping_forward_now();
+
+	/* move wall_to_monotonic the opposite way so xtime + wtm is unchanged */
+	ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
+	ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
+	wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
+
+	xtime = *tv;
+
+	timekeeper.ntp_error = 0;
+	ntp_clear();
+
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+
+	return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+
+/**
+ * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @ts:		pointer to the timespec variable containing the offset
+ *
+ * Adds or subtracts an offset value from the current time.
+ *
+ * Returns 0 on success, or -EINVAL when the tv_nsec field of the offset
+ * is outside [0, NSEC_PER_SEC).
+ */
+int timekeeping_inject_offset(struct timespec *ts)
+{
+	unsigned long flags;
+
+	/* the unsigned compare also rejects a negative tv_nsec */
+	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+		return -EINVAL;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	timekeeping_forward_now();
+
+	/* shift wall time; the opposite shift keeps xtime + wtm unchanged */
+	xtime = timespec_add(xtime, *ts);
+	wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
+
+	timekeeper.ntp_error = 0;
+	ntp_clear();
+
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+
+	return 0;
+}
+EXPORT_SYMBOL(timekeeping_inject_offset);
+
+/**
+ * change_clocksource - Swaps clocksources if a new one is available
+ *
+ * Accumulates current time interval and initializes new clocksource
+ *
+ * @data is the struct clocksource to switch to. Always returns 0; when
+ * the new clocksource's enable() hook returns non-zero the old
+ * clocksource stays installed.
+ */
+static int change_clocksource(void *data)
+{
+	struct clocksource *new, *old;
+
+	new = (struct clocksource *) data;
+
+	timekeeping_forward_now();
+	/* a missing enable() hook counts as success */
+	if (!new->enable || new->enable(new) == 0) {
+		old = timekeeper.clock;
+		timekeeper_setup_internals(new);
+		if (old->disable)
+			old->disable(old);
+	}
+	return 0;
+}
+
+/**
+ * timekeeping_notify - Install a new clock source
+ * @clock:		pointer to the clock source
+ *
+ * This function is called from clocksource.c after a new, better clock
+ * source has been registered. The caller holds the clocksource_mutex.
+ *
+ * Does nothing when @clock is already the active clocksource; otherwise
+ * the switch runs in change_clocksource() via stop_machine().
+ */
+void timekeeping_notify(struct clocksource *clock)
+{
+	if (timekeeper.clock == clock)
+		return;
+	stop_machine(change_clocksource, clock, NULL);
+	tick_clock_notify();
+}
+
+/**
+ * ktime_get_real - get the real (wall-) time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get_real(void)
+{
+	struct timespec now;
+
+	getnstimeofday(&now);
+
+	return timespec_to_ktime(now);
+}
+EXPORT_SYMBOL_GPL(ktime_get_real);
+
+/**
+ * getrawmonotonic - Returns the raw monotonic time in a timespec
+ * @ts:		pointer to the timespec to be set
+ *
+ * Returns the raw monotonic time (completely un-modified by ntp)
+ */
+void getrawmonotonic(struct timespec *ts)
+{
+	unsigned long seq;
+	s64 nsecs;
+
+	/* raw_time and the raw delta must come from the same read section */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		nsecs = timekeeping_get_ns_raw();
+		*ts = raw_time;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	timespec_add_ns(ts, nsecs);
+}
+EXPORT_SYMBOL(getrawmonotonic);
+
+
+/**
+ * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
+ *
+ * Returns the CLOCK_SOURCE_VALID_FOR_HRES bit of the current
+ * clocksource's flags (non-zero when set), read under xtime_lock.
+ */
+int timekeeping_valid_for_hres(void)
+{
+	unsigned long seq;
+	int ret;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		ret = timekeeper.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	return ret;
+}
+
+/**
+ * timekeeping_max_deferment - Returns max time the clocksource can be deferred
+ *
+ * Caller must observe xtime_lock via read_seqbegin/read_seqretry to
+ * ensure that the clocksource does not change!
+ */
+u64 timekeeping_max_deferment(void)
+{
+	return timekeeper.clock->max_idle_ns;
+}
+
+/**
+ * read_persistent_clock -  Return time from the persistent clock.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Reads the time from the battery backed persistent clock.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ *
+ *  XXX - Do be sure to remove it once all arches implement it.
+ */
+void __attribute__((weak)) read_persistent_clock(struct timespec *ts)
+{
+	ts->tv_sec = 0;
+	ts->tv_nsec = 0;
+}
+
+/**
+ * read_boot_clock -  Return time of the system start.
+ *
+ * Weak dummy function for arches that do not yet support it.
+ * Function to read the exact time the system has been started.
+ * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ *
+ *  XXX - Do be sure to remove it once all arches implement it.
+ */
+void __attribute__((weak)) read_boot_clock(struct timespec *ts)
+{
+	ts->tv_sec = 0;
+	ts->tv_nsec = 0;
+}
+
+/*
+ * timekeeping_init - Initializes the clocksource and common timekeeping values
+ *
+ * Seeds xtime from read_persistent_clock(), zeroes raw_time and
+ * total_sleep_time, and sets wall_to_monotonic to the negated boot time.
+ */
+void __init timekeeping_init(void)
+{
+	struct clocksource *clock;
+	unsigned long flags;
+	struct timespec now, boot;
+
+	/* both clocks are read before xtime_lock is taken */
+	read_persistent_clock(&now);
+	read_boot_clock(&boot);
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	ntp_init();
+
+	clock = clocksource_default_clock();
+	if (clock->enable)
+		clock->enable(clock);
+	timekeeper_setup_internals(clock);
+
+	xtime.tv_sec = now.tv_sec;
+	xtime.tv_nsec = now.tv_nsec;
+	raw_time.tv_sec = 0;
+	raw_time.tv_nsec = 0;
+	/* an all-zero boot clock means "unsupported": use the current time */
+	if (boot.tv_sec == 0 && boot.tv_nsec == 0) {
+		boot.tv_sec = xtime.tv_sec;
+		boot.tv_nsec = xtime.tv_nsec;
+	}
+	set_normalized_timespec(&wall_to_monotonic,
+				-boot.tv_sec, -boot.tv_nsec);
+	total_sleep_time.tv_sec = 0;
+	total_sleep_time.tv_nsec = 0;
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+}
+
+/* persistent-clock reading taken in timekeeping_suspend() */
+static struct timespec timekeeping_suspend_time;
+
+/**
+ * __timekeeping_inject_sleeptime - Internal function to add sleep interval
+ * @delta: pointer to a timespec delta value
+ *
+ * Takes a timespec offset measuring a suspend interval and properly
+ * adds the sleep offset to the timekeeping variables.
+ *
+ * Takes no lock itself; both callers in this file hold xtime_lock for
+ * writing. An invalid @delta is logged and ignored.
+ */
+static void __timekeeping_inject_sleeptime(struct timespec *delta)
+{
+	if (!timespec_valid(delta)) {
+		printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid "
+					"sleep delta value!\n");
+		return;
+	}
+
+	/* wall time jumps forward; xtime + wall_to_monotonic does not */
+	xtime = timespec_add(xtime, *delta);
+	wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
+	total_sleep_time = timespec_add(total_sleep_time, *delta);
+}
+
+
+/**
+ * timekeeping_inject_sleeptime - Adds suspend interval to timekeeping values
+ * @delta: pointer to a timespec delta value
+ *
+ * This hook is for architectures that cannot support read_persistent_clock
+ * because their RTC/persistent clock is only accessible when irqs are enabled.
+ *
+ * This function should only be called by rtc_resume(), and allows
+ * a suspend offset to be injected into the timekeeping values.
+ */
+void timekeeping_inject_sleeptime(struct timespec *delta)
+{
+	unsigned long flags;
+	struct timespec ts;
+
+	/* Make sure we don't set the clock twice */
+	/* a non-zero persistent clock is handled in timekeeping_resume() */
+	read_persistent_clock(&ts);
+	if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
+		return;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	timekeeping_forward_now();
+
+	__timekeeping_inject_sleeptime(delta);
+
+	timekeeper.ntp_error = 0;
+	ntp_clear();
+	update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+				timekeeper.mult);
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* signal hrtimers about time change */
+	clock_was_set();
+}
+
+
+/**
+ * timekeeping_resume - Resumes the generic timekeeping subsystem.
+ *
+ * This is for the generic clocksource timekeeping.
+ * xtime/wall_to_monotonic/jiffies/etc are
+ * still managed by arch specific suspend/resume code.
+ */
+static void timekeeping_resume(void)
+{
+	unsigned long flags;
+	struct timespec ts;
+
+	read_persistent_clock(&ts);
+
+	clocksource_resume();
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	/* only inject sleep time if the persistent clock moved forward */
+	if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
+		ts = timespec_sub(ts, timekeeping_suspend_time);
+		__timekeeping_inject_sleeptime(&ts);
+	}
+	/* re-base the last cycle value */
+	timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
+	timekeeper.ntp_error = 0;
+	timekeeping_suspended = 0;
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	touch_softlockup_watchdog();
+
+	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
+
+	/* Resume hrtimers */
+	hrtimers_resume();
+}
+
+/*
+ * timekeeping_suspend - syscore suspend callback
+ *
+ * Records the persistent clock in timekeeping_suspend_time, accumulates
+ * time up to now and sets timekeeping_suspended. Always returns 0.
+ */
+static int timekeeping_suspend(void)
+{
+	unsigned long flags;
+
+	read_persistent_clock(&timekeeping_suspend_time);
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	timekeeping_forward_now();
+	timekeeping_suspended = 1;
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+	clocksource_suspend();
+
+	return 0;
+}
+
+/* syscore resume/suspend callbacks for timekeeping */
+static struct syscore_ops timekeeping_syscore_ops = {
+	.resume		= timekeeping_resume,
+	.suspend	= timekeeping_suspend,
+};
+
+static int __init timekeeping_init_ops(void)
+{
+	register_syscore_ops(&timekeeping_syscore_ops);
+	return 0;
+}
+
+device_initcall(timekeeping_init_ops);
+
+/*
+ * If the error is already larger, we look ahead even further
+ * to compensate for late or lost adjustments.
+ *
+ * @interval and @offset are scaled in place (negated when the error is
+ * negative, then shifted left by the computed adjustment shift). The
+ * return value is the signed multiplier adjustment, +/-(1 << adj).
+ */
+static __always_inline int timekeeping_bigadjust(s64 error, s64 *interval,
+						 s64 *offset)
+{
+	s64 tick_error, i;
+	u32 look_ahead, adj;
+	s32 error2, mult;
+
+	/*
+	 * Use the current error value to determine how much to look ahead.
+	 * The larger the error the slower we adjust for it to avoid problems
+	 * with losing too many ticks, otherwise we would overadjust and
+	 * produce an even larger error.  The smaller the adjustment the
+	 * faster we try to adjust for it, as lost ticks can do less harm
+	 * here.  This is tuned so that an error of about 1 msec is adjusted
+	 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
+	 */
+	/* note: derived from timekeeper.ntp_error, not from @error */
+	error2 = timekeeper.ntp_error >> (NTP_SCALE_SHIFT + 22 - 2 * SHIFT_HZ);
+	error2 = abs(error2);
+	for (look_ahead = 0; error2 > 0; look_ahead++)
+		error2 >>= 2;
+
+	/*
+	 * Now calculate the error in (1 << look_ahead) ticks, but first
+	 * remove the single look ahead already included in the error.
+	 */
+	tick_error = tick_length >> (timekeeper.ntp_error_shift + 1);
+	tick_error -= timekeeper.xtime_interval >> 1;
+	error = ((error - tick_error) >> look_ahead) + tick_error;
+
+	/* Finally calculate the adjustment shift value.  */
+	i = *interval;
+	mult = 1;
+	if (error < 0) {
+		error = -error;
+		*interval = -*interval;
+		*offset = -*offset;
+		mult = -1;
+	}
+	/* smallest adj such that (error >> adj) <= original interval */
+	for (adj = 0; error > i; adj++)
+		error >>= 1;
+
+	*interval <<= adj;
+	*offset <<= adj;
+	return mult << adj;
+}
+
+/*
+ * Adjust the multiplier to reduce the error value,
+ * this is optimized for the most common adjustments of -1,0,1,
+ * for other values we can do a bit more work.
+ *
+ * @offset is the number of clocksource cycles not yet accumulated; it is
+ * used to keep xtime_nsec consistent when timekeeper.mult changes.
+ */
+static void timekeeping_adjust(s64 offset)
+{
+	s64 error, interval = timekeeper.cycle_interval;
+	int adj;
+
+	error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
+	if (error > interval) {
+		error >>= 2;
+		/* common case: a single +1 step of mult is enough */
+		if (likely(error <= interval))
+			adj = 1;
+		else
+			adj = timekeeping_bigadjust(error, &interval, &offset);
+	} else if (error < -interval) {
+		error >>= 2;
+		/* common case: a single -1 step of mult is enough */
+		if (likely(error >= -interval)) {
+			adj = -1;
+			interval = -interval;
+			offset = -offset;
+		} else
+			adj = timekeeping_bigadjust(error, &interval, &offset);
+	} else
+		return;	/* error within +/- one interval: nothing to do */
+
+	/* interval/offset now carry the sign and scale matching adj */
+	timekeeper.mult += adj;
+	timekeeper.xtime_interval += interval;
+	timekeeper.xtime_nsec -= offset;
+	timekeeper.ntp_error -= (interval - offset) <<
+				timekeeper.ntp_error_shift;
+}
+
+
+/**
+ * logarithmic_accumulation - shifted accumulation of cycles
+ *
+ * This function accumulates a shifted interval of cycles
+ * into a shifted interval of nanoseconds. Allows for O(log) accumulation
+ * loop.
+ *
+ * Returns the unconsumed cycles.
+ */ +static cycle_t logarithmic_accumulation(cycle_t offset, int shift) +{ + u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift; + u64 raw_nsecs; + + /* If the offset is smaller then a shifted interval, do nothing */ + if (offset < timekeeper.cycle_interval<cycle_last += timekeeper.cycle_interval << shift; + + timekeeper.xtime_nsec += timekeeper.xtime_interval << shift; + while (timekeeper.xtime_nsec >= nsecps) { + timekeeper.xtime_nsec -= nsecps; + xtime.tv_sec++; + second_overflow(); + } + + /* Accumulate raw time */ + raw_nsecs = timekeeper.raw_interval << shift; + raw_nsecs += raw_time.tv_nsec; + if (raw_nsecs >= NSEC_PER_SEC) { + u64 raw_secs = raw_nsecs; + raw_nsecs = do_div(raw_secs, NSEC_PER_SEC); + raw_time.tv_sec += raw_secs; + } + raw_time.tv_nsec = raw_nsecs; + + /* Accumulate error between NTP and clock interval */ + timekeeper.ntp_error += tick_length << shift; + timekeeper.ntp_error -= + (timekeeper.xtime_interval + timekeeper.xtime_remainder) << + (timekeeper.ntp_error_shift + shift); + + return offset; +} + + +/** + * update_wall_time - Uses the current clocksource to increment the wall time + * + * Called from the timer interrupt, must hold a write on xtime_lock. + */ +static void update_wall_time(void) +{ + struct clocksource *clock; + cycle_t offset; + int shift = 0, maxshift; + + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; + + clock = timekeeper.clock; + +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET + offset = timekeeper.cycle_interval; +#else + offset = (clock->read(clock) - clock->cycle_last) & clock->mask; +#endif + timekeeper.xtime_nsec = (s64)xtime.tv_nsec << timekeeper.shift; + + /* + * With NO_HZ we may have to accumulate many cycle_intervals + * (think "ticks") worth of time at once. To do this efficiently, + * we calculate the largest doubling multiple of cycle_intervals + * that is smaller then the offset. 
We then accumulate that + * chunk in one go, and then try to consume the next smaller + * doubled multiple. + */ + shift = ilog2(offset) - ilog2(timekeeper.cycle_interval); + shift = max(0, shift); + /* Bound shift to one less then what overflows tick_length */ + maxshift = (8*sizeof(tick_length) - (ilog2(tick_length)+1)) - 1; + shift = min(shift, maxshift); + while (offset >= timekeeper.cycle_interval) { + offset = logarithmic_accumulation(offset, shift); + if(offset < timekeeper.cycle_interval<> timekeeper.shift) + 1; + timekeeper.xtime_nsec -= (s64) xtime.tv_nsec << timekeeper.shift; + timekeeper.ntp_error += timekeeper.xtime_nsec << + timekeeper.ntp_error_shift; + + /* + * Finally, make sure that after the rounding + * xtime.tv_nsec isn't larger then NSEC_PER_SEC + */ + if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) { + xtime.tv_nsec -= NSEC_PER_SEC; + xtime.tv_sec++; + second_overflow(); + } + + /* check to see if there is a new clocksource to use */ + update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock, + timekeeper.mult); +} + +/** + * getboottime - Return the real time of system boot. + * @ts: pointer to the timespec to be set + * + * Returns the wall-time of boot in a timespec. + * + * This is based on the wall_to_monotonic offset and the total suspend + * time. Calls to settimeofday will affect the value returned (which + * basically means that however wrong your real time clock is at boot time, + * you get the right time here). + */ +void getboottime(struct timespec *ts) +{ + struct timespec boottime = { + .tv_sec = wall_to_monotonic.tv_sec + total_sleep_time.tv_sec, + .tv_nsec = wall_to_monotonic.tv_nsec + total_sleep_time.tv_nsec + }; + + set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); +} +EXPORT_SYMBOL_GPL(getboottime); + + +/** + * get_monotonic_boottime - Returns monotonic time since boot + * @ts: pointer to the timespec to be set + * + * Returns the monotonic time since boot in a timespec. 
+ *
+ * This is similar to CLOCK_MONOTONIC/ktime_get_ts, but also
+ * includes the time spent in suspend.
+ *
+ * NOTE(review): unlike ktime_get_ts(), arch_gettimeoffset() is not added
+ * to the result here - confirm this is intended.
+ */
+void get_monotonic_boottime(struct timespec *ts)
+{
+	struct timespec tomono, sleep;
+	unsigned int seq;
+	s64 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	/* snapshot all inputs inside one xtime_lock read section */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		*ts = xtime;
+		tomono = wall_to_monotonic;
+		sleep = total_sleep_time;
+		nsecs = timekeeping_get_ns();
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
+			ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
+}
+EXPORT_SYMBOL_GPL(get_monotonic_boottime);
+
+/**
+ * ktime_get_boottime - Returns monotonic time since boot in a ktime
+ *
+ * Returns the monotonic time since boot in a ktime
+ *
+ * This is similar to CLOCK_MONOTONIC/ktime_get, but also
+ * includes the time spent in suspend.
+ */
+ktime_t ktime_get_boottime(void)
+{
+	struct timespec ts;
+
+	get_monotonic_boottime(&ts);
+	return timespec_to_ktime(ts);
+}
+EXPORT_SYMBOL_GPL(ktime_get_boottime);
+
+/**
+ * monotonic_to_bootbased - Convert the monotonic time to boot based.
+ * @ts:		pointer to the timespec to be converted
+ *
+ * Adds total_sleep_time to @ts in place; total_sleep_time is read
+ * without sampling xtime_lock.
+ */
+void monotonic_to_bootbased(struct timespec *ts)
+{
+	*ts = timespec_add(*ts, total_sleep_time);
+}
+EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
+
+/* Seconds part of xtime, read without sampling xtime_lock. */
+unsigned long get_seconds(void)
+{
+	return xtime.tv_sec;
+}
+EXPORT_SYMBOL(get_seconds);
+
+/*
+ * Copy of xtime taken without sampling xtime_lock.
+ * NOTE(review): presumably for callers that already hold xtime_lock -
+ * verify against the callers.
+ */
+struct timespec __current_kernel_time(void)
+{
+	return xtime;
+}
+
+/* Consistent copy of xtime, without the sub-tick clocksource delta. */
+struct timespec current_kernel_time(void)
+{
+	struct timespec now;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		now = xtime;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	return now;
+}
+EXPORT_SYMBOL(current_kernel_time);
+
+/*
+ * xtime + wall_to_monotonic, normalized, without the sub-tick
+ * clocksource delta.
+ */
+struct timespec get_monotonic_coarse(void)
+{
+	struct timespec now, mono;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		now = xtime;
+		mono = wall_to_monotonic;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	set_normalized_timespec(&now, now.tv_sec + mono.tv_sec,
+				now.tv_nsec + mono.tv_nsec);
+	return now;
+}
+
+/*
+ * The 64-bit jiffies value is not atomic - you MUST NOT read it
+ * without sampling the sequence number in xtime_lock.
+ * jiffies is defined in the linker script...
+ */
+void do_timer(unsigned long ticks)
+{
+	jiffies_64 += ticks;
+	update_wall_time();
+	calc_global_load(ticks);
+}
+
+/**
+ * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
+ *    and sleep offsets.
+ * @xtim: pointer to timespec to be set with xtime + * @wtom: pointer to timespec to be set with wall_to_monotonic + * @sleep: pointer to timespec to be set with time in suspend + */ +void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim, + struct timespec *wtom, struct timespec *sleep) +{ + unsigned long seq; + + do { + seq = read_seqbegin(&xtime_lock); + *xtim = xtime; + *wtom = wall_to_monotonic; + *sleep = total_sleep_time; + } while (read_seqretry(&xtime_lock, seq)); +} + +/** + * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format + */ +ktime_t ktime_get_monotonic_offset(void) +{ + unsigned long seq; + struct timespec wtom; + + do { + seq = read_seqbegin(&xtime_lock); + wtom = wall_to_monotonic; + } while (read_seqretry(&xtime_lock, seq)); + return timespec_to_ktime(wtom); +} + +/** + * xtime_update() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * Must be called with interrupts disabled. + */ +void xtime_update(unsigned long ticks) +{ + write_seqlock(&xtime_lock); + do_timer(ticks); + write_sequnlock(&xtime_lock); +} diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c new file mode 100644 index 00000000..32584555 --- /dev/null +++ b/kernel/time/timer_list.c @@ -0,0 +1,301 @@ +/* + * kernel/time/timer_list.c + * + * List pending timers + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +/* + * This allows printing both to /proc/timer_list and + * to the console (on SysRq-Q): + */ +#define SEQ_printf(m, x...) 
\ + do { \ + if (m) \ + seq_printf(m, x); \ + else \ + printk(x); \ + } while (0) + +static void print_name_offset(struct seq_file *m, void *sym) +{ + char symname[KSYM_NAME_LEN]; + + if (lookup_symbol_name((unsigned long)sym, symname) < 0) + SEQ_printf(m, "<%pK>", sym); + else + SEQ_printf(m, "%s", symname); +} + +static void +print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, + int idx, u64 now) +{ +#ifdef CONFIG_TIMER_STATS + char tmp[TASK_COMM_LEN + 1]; +#endif + SEQ_printf(m, " #%d: ", idx); + print_name_offset(m, taddr); + SEQ_printf(m, ", "); + print_name_offset(m, timer->function); + SEQ_printf(m, ", S:%02lx", timer->state); +#ifdef CONFIG_TIMER_STATS + SEQ_printf(m, ", "); + print_name_offset(m, timer->start_site); + memcpy(tmp, timer->start_comm, TASK_COMM_LEN); + tmp[TASK_COMM_LEN] = 0; + SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); +#endif + SEQ_printf(m, "\n"); + SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", + (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), + (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)), + (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now), + (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); +} + +static void +print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, + u64 now) +{ + struct hrtimer *timer, tmp; + unsigned long next = 0, i; + struct timerqueue_node *curr; + unsigned long flags; + +next_one: + i = 0; + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); + + curr = timerqueue_getnext(&base->active); + /* + * Crude but we have to do this O(N*N) thing, because + * we have to unlock the base when printing: + */ + while (curr && i < next) { + curr = timerqueue_iterate_next(curr); + i++; + } + + if (curr) { + + timer = container_of(curr, struct hrtimer, node); + tmp = *timer; + raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); + + print_timer(m, timer, &tmp, i, now); + next++; + goto next_one; + } 
+ raw_spin_unlock_irqrestore(&base->cpu_base->lock, flags); +} + +static void +print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) +{ + SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .index: %d\n", + base->index); + SEQ_printf(m, " .resolution: %Lu nsecs\n", + (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .get_time: "); + print_name_offset(m, base->get_time); + SEQ_printf(m, "\n"); +#ifdef CONFIG_HIGH_RES_TIMERS + SEQ_printf(m, " .offset: %Lu nsecs\n", + (unsigned long long) ktime_to_ns(base->offset)); +#endif + SEQ_printf(m, "active timers:\n"); + print_active_timers(m, base, now); +} + +static void print_cpu(struct seq_file *m, int cpu, u64 now) +{ + struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); + int i; + + SEQ_printf(m, "\n"); + SEQ_printf(m, "cpu: %d\n", cpu); + for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { + SEQ_printf(m, " clock %d:\n", i); + print_base(m, cpu_base->clock_base + i, now); + } +#define P(x) \ + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(cpu_base->x)) +#define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(cpu_base->x))) + +#ifdef CONFIG_HIGH_RES_TIMERS + P_ns(expires_next); + P(hres_active); + P(nr_events); + P(nr_retries); + P(nr_hangs); + P_ns(max_hang_time); +#endif +#undef P +#undef P_ns + +#ifdef CONFIG_TICK_ONESHOT +# define P(x) \ + SEQ_printf(m, " .%-15s: %Lu\n", #x, \ + (unsigned long long)(ts->x)) +# define P_ns(x) \ + SEQ_printf(m, " .%-15s: %Lu nsecs\n", #x, \ + (unsigned long long)(ktime_to_ns(ts->x))) + { + struct tick_sched *ts = tick_get_tick_sched(cpu); + P(nohz_mode); + P_ns(idle_tick); + P(tick_stopped); + P(idle_jiffies); + P(idle_calls); + P(idle_sleeps); + P_ns(idle_entrytime); + P_ns(idle_waketime); + P_ns(idle_exittime); + P_ns(idle_sleeptime); + P_ns(iowait_sleeptime); + P(last_jiffies); + P(next_jiffies); + P_ns(idle_expires); + SEQ_printf(m, "jiffies: %Lu\n", + (unsigned long 
long)jiffies); + } +#endif + +#undef P +#undef P_ns +} + +#ifdef CONFIG_GENERIC_CLOCKEVENTS +static void +print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) +{ + struct clock_event_device *dev = td->evtdev; + + SEQ_printf(m, "\n"); + SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); + if (cpu < 0) + SEQ_printf(m, "Broadcast device\n"); + else + SEQ_printf(m, "Per CPU device: %d\n", cpu); + + SEQ_printf(m, "Clock Event Device: "); + if (!dev) { + SEQ_printf(m, "\n"); + return; + } + SEQ_printf(m, "%s\n", dev->name); + SEQ_printf(m, " max_delta_ns: %llu\n", + (unsigned long long) dev->max_delta_ns); + SEQ_printf(m, " min_delta_ns: %llu\n", + (unsigned long long) dev->min_delta_ns); + SEQ_printf(m, " mult: %u\n", dev->mult); + SEQ_printf(m, " shift: %u\n", dev->shift); + SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " next_event: %Ld nsecs\n", + (unsigned long long) ktime_to_ns(dev->next_event)); + + SEQ_printf(m, " set_next_event: "); + print_name_offset(m, dev->set_next_event); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + + SEQ_printf(m, " event_handler: "); + print_name_offset(m, dev->event_handler); + SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); +} + +static void timer_list_show_tickdevices(struct seq_file *m) +{ + int cpu; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + print_tickdevice(m, tick_get_broadcast_device(), -1); + SEQ_printf(m, "tick_broadcast_mask: %08lx\n", + cpumask_bits(tick_get_broadcast_mask())[0]); +#ifdef CONFIG_TICK_ONESHOT + SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", + cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); +#endif + SEQ_printf(m, "\n"); +#endif + for_each_online_cpu(cpu) + print_tickdevice(m, tick_get_device(cpu), cpu); + SEQ_printf(m, "\n"); +} +#else +static void timer_list_show_tickdevices(struct seq_file *m) { } +#endif + +static int timer_list_show(struct seq_file *m, void *v) +{ 
+ u64 now = ktime_to_ns(ktime_get()); + int cpu; + + SEQ_printf(m, "Timer List Version: v0.6\n"); + SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); + SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); + + for_each_online_cpu(cpu) + print_cpu(m, cpu, now); + + SEQ_printf(m, "\n"); + timer_list_show_tickdevices(m); + + return 0; +} + +void sysrq_timer_list_show(void) +{ + timer_list_show(NULL, NULL); +} + +static int timer_list_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, timer_list_show, NULL); +} + +static const struct file_operations timer_list_fops = { + .open = timer_list_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init init_timer_list_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = proc_create("timer_list", 0444, NULL, &timer_list_fops); + if (!pe) + return -ENOMEM; + return 0; +} +__initcall(init_timer_list_procfs); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c new file mode 100644 index 00000000..a5d0a3a8 --- /dev/null +++ b/kernel/time/timer_stats.c @@ -0,0 +1,425 @@ +/* + * kernel/time/timer_stats.c + * + * Collect timer usage statistics. + * + * Copyright(C) 2006, Red Hat, Inc., Ingo Molnar + * Copyright(C) 2006 Timesys Corp., Thomas Gleixner + * + * timer_stats is based on timer_top, a similar functionality which was part of + * Con Kolivas dyntick patch set. It was developed by Daniel Petrini at the + * Instituto Nokia de Tecnologia - INdT - Manaus. timer_top's design was based + * on dynamic allocation of the statistics entries and linear search based + * lookup combined with a global lock, rather than the static array, hash + * and per-CPU locking which is used by timer_stats. It was written for the + * pre hrtimer kernel code and therefore did not take hrtimers into account. + * Nevertheless it provided the base for the timer_stats implementation and + * was a helpful source of inspiration. 
Kudos to Daniel and the Nokia folks + * for this effort. + * + * timer_top.c is + * Copyright (C) 2005 Instituto Nokia de Tecnologia - INdT - Manaus + * Written by Daniel Petrini + * timer_top.c was released under the GNU General Public License version 2 + * + * We export the addresses and counting of timer functions being called, + * the pid and cmdline from the owner process if applicable. + * + * Start/stop data collection: + * # echo [1|0] >/proc/timer_stats + * + * Display the information collected so far: + * # cat /proc/timer_stats + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include + +/* + * This is our basic unit of interest: a timer expiry event identified + * by the timer, its start/expire functions and the PID of the task that + * started the timer. We count the number of times an event happens: + */ +struct entry { + /* + * Hash list: + */ + struct entry *next; + + /* + * Hash keys: + */ + void *timer; + void *start_func; + void *expire_func; + pid_t pid; + + /* + * Number of timeout events: + */ + unsigned long count; + unsigned int timer_flag; + + /* + * We save the command-line string to preserve + * this information past task exit: + */ + char comm[TASK_COMM_LEN + 1]; + +} ____cacheline_aligned_in_smp; + +/* + * Spinlock protecting the tables - not taken during lookup: + */ +static DEFINE_SPINLOCK(table_lock); + +/* + * Per-CPU lookup locks for fast hash lookup: + */ +static DEFINE_PER_CPU(raw_spinlock_t, tstats_lookup_lock); + +/* + * Mutex to serialize state changes with show-stats activities: + */ +static DEFINE_MUTEX(show_mutex); + +/* + * Collection status, active/inactive: + */ +int __read_mostly timer_stats_active; + +/* + * Beginning/end timestamps of measurement: + */ +static ktime_t time_start, time_stop; + +/* + * tstat 
entry structs only get allocated while collection is + * active and never freed during that time - this simplifies + * things quite a bit. + * + * They get freed when a new collection period is started. + */ +#define MAX_ENTRIES_BITS 10 +#define MAX_ENTRIES (1UL << MAX_ENTRIES_BITS) + +static unsigned long nr_entries; +static struct entry entries[MAX_ENTRIES]; + +static atomic_t overflow_count; + +/* + * The entries are in a hash-table, for fast lookup: + */ +#define TSTAT_HASH_BITS (MAX_ENTRIES_BITS - 1) +#define TSTAT_HASH_SIZE (1UL << TSTAT_HASH_BITS) +#define TSTAT_HASH_MASK (TSTAT_HASH_SIZE - 1) + +#define __tstat_hashfn(entry) \ + (((unsigned long)(entry)->timer ^ \ + (unsigned long)(entry)->start_func ^ \ + (unsigned long)(entry)->expire_func ^ \ + (unsigned long)(entry)->pid ) & TSTAT_HASH_MASK) + +#define tstat_hashentry(entry) (tstat_hash_table + __tstat_hashfn(entry)) + +static struct entry *tstat_hash_table[TSTAT_HASH_SIZE] __read_mostly; + +static void reset_entries(void) +{ + nr_entries = 0; + memset(entries, 0, sizeof(entries)); + memset(tstat_hash_table, 0, sizeof(tstat_hash_table)); + atomic_set(&overflow_count, 0); +} + +static struct entry *alloc_entry(void) +{ + if (nr_entries >= MAX_ENTRIES) + return NULL; + + return entries + nr_entries++; +} + +static int match_entries(struct entry *entry1, struct entry *entry2) +{ + return entry1->timer == entry2->timer && + entry1->start_func == entry2->start_func && + entry1->expire_func == entry2->expire_func && + entry1->pid == entry2->pid; +} + +/* + * Look up whether an entry matching this item is present + * in the hash already. 
Must be called with irqs off and the + * lookup lock held: + */ +static struct entry *tstat_lookup(struct entry *entry, char *comm) +{ + struct entry **head, *curr, *prev; + + head = tstat_hashentry(entry); + curr = *head; + + /* + * The fastpath is when the entry is already hashed, + * we do this with the lookup lock held, but with the + * table lock not held: + */ + while (curr) { + if (match_entries(curr, entry)) + return curr; + + curr = curr->next; + } + /* + * Slowpath: allocate, set up and link a new hash entry: + */ + prev = NULL; + curr = *head; + + spin_lock(&table_lock); + /* + * Make sure we have not raced with another CPU: + */ + while (curr) { + if (match_entries(curr, entry)) + goto out_unlock; + + prev = curr; + curr = curr->next; + } + + curr = alloc_entry(); + if (curr) { + *curr = *entry; + curr->count = 0; + curr->next = NULL; + memcpy(curr->comm, comm, TASK_COMM_LEN); + + smp_mb(); /* Ensure that curr is initialized before insert */ + + if (prev) + prev->next = curr; + else + *head = curr; + } + out_unlock: + spin_unlock(&table_lock); + + return curr; +} + +/** + * timer_stats_update_stats - Update the statistics for a timer. + * @timer: pointer to either a timer_list or a hrtimer + * @pid: the pid of the task which set up the timer + * @startf: pointer to the function which did the timer setup + * @timerf: pointer to the timer callback function of the timer + * @comm: name of the process which set up the timer + * + * When the timer is already registered, then the event counter is + * incremented. Otherwise the timer is registered in a free slot. 
+ */ +void timer_stats_update_stats(void *timer, pid_t pid, void *startf, + void *timerf, char *comm, + unsigned int timer_flag) +{ + /* + * It doesn't matter which lock we take: + */ + raw_spinlock_t *lock; + struct entry *entry, input; + unsigned long flags; + + if (likely(!timer_stats_active)) + return; + + lock = &per_cpu(tstats_lookup_lock, raw_smp_processor_id()); + + input.timer = timer; + input.start_func = startf; + input.expire_func = timerf; + input.pid = pid; + input.timer_flag = timer_flag; + + raw_spin_lock_irqsave(lock, flags); + if (!timer_stats_active) + goto out_unlock; + + entry = tstat_lookup(&input, comm); + if (likely(entry)) + entry->count++; + else + atomic_inc(&overflow_count); + + out_unlock: + raw_spin_unlock_irqrestore(lock, flags); +} + +static void print_name_offset(struct seq_file *m, unsigned long addr) +{ + char symname[KSYM_NAME_LEN]; + + if (lookup_symbol_name(addr, symname) < 0) + seq_printf(m, "<%p>", (void *)addr); + else + seq_printf(m, "%s", symname); +} + +static int tstats_show(struct seq_file *m, void *v) +{ + struct timespec period; + struct entry *entry; + unsigned long ms; + long events = 0; + ktime_t time; + int i; + + mutex_lock(&show_mutex); + /* + * If still active then calculate up to now: + */ + if (timer_stats_active) + time_stop = ktime_get(); + + time = ktime_sub(time_stop, time_start); + + period = ktime_to_timespec(time); + ms = period.tv_nsec / 1000000; + + seq_puts(m, "Timer Stats Version: v0.2\n"); + seq_printf(m, "Sample period: %ld.%03ld s\n", period.tv_sec, ms); + if (atomic_read(&overflow_count)) + seq_printf(m, "Overflow: %d entries\n", + atomic_read(&overflow_count)); + + for (i = 0; i < nr_entries; i++) { + entry = entries + i; + if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + seq_printf(m, "%4luD, %5d %-16s ", + entry->count, entry->pid, entry->comm); + } else { + seq_printf(m, " %4lu, %5d %-16s ", + entry->count, entry->pid, entry->comm); + } + + print_name_offset(m, (unsigned 
long)entry->start_func); + seq_puts(m, " ("); + print_name_offset(m, (unsigned long)entry->expire_func); + seq_puts(m, ")\n"); + + events += entry->count; + } + + ms += period.tv_sec * 1000; + if (!ms) + ms = 1; + + if (events && period.tv_sec) + seq_printf(m, "%ld total events, %ld.%03ld events/sec\n", + events, events * 1000 / ms, + (events * 1000000 / ms) % 1000); + else + seq_printf(m, "%ld total events\n", events); + + mutex_unlock(&show_mutex); + + return 0; +} + +/* + * After a state change, make sure all concurrent lookup/update + * activities have stopped: + */ +static void sync_access(void) +{ + unsigned long flags; + int cpu; + + for_each_online_cpu(cpu) { + raw_spinlock_t *lock = &per_cpu(tstats_lookup_lock, cpu); + + raw_spin_lock_irqsave(lock, flags); + /* nothing */ + raw_spin_unlock_irqrestore(lock, flags); + } +} + +static ssize_t tstats_write(struct file *file, const char __user *buf, + size_t count, loff_t *offs) +{ + char ctl[2]; + + if (count != 2 || *offs) + return -EINVAL; + + if (copy_from_user(ctl, buf, count)) + return -EFAULT; + + mutex_lock(&show_mutex); + switch (ctl[0]) { + case '0': + if (timer_stats_active) { + timer_stats_active = 0; + time_stop = ktime_get(); + sync_access(); + } + break; + case '1': + if (!timer_stats_active) { + reset_entries(); + time_start = ktime_get(); + smp_mb(); + timer_stats_active = 1; + } + break; + default: + count = -EINVAL; + } + mutex_unlock(&show_mutex); + + return count; +} + +static int tstats_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, tstats_show, NULL); +} + +static const struct file_operations tstats_fops = { + .open = tstats_open, + .read = seq_read, + .write = tstats_write, + .llseek = seq_lseek, + .release = single_release, +}; + +void __init init_timer_stats(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(&per_cpu(tstats_lookup_lock, cpu)); +} + +static int __init init_tstats_procfs(void) +{ + struct proc_dir_entry *pe; + + pe = 
proc_create("timer_stats", 0644, NULL, &tstats_fops); + if (!pe) + return -ENOMEM; + return 0; +} +__initcall(init_tstats_procfs); diff --git a/kernel/timeconst.pl b/kernel/timeconst.pl new file mode 100644 index 00000000..eb51d76e --- /dev/null +++ b/kernel/timeconst.pl @@ -0,0 +1,378 @@ +#!/usr/bin/perl +# ----------------------------------------------------------------------- +# +# Copyright 2007-2008 rPath, Inc. - All Rights Reserved +# +# This file is part of the Linux kernel, and is made available under +# the terms of the GNU General Public License version 2 or (at your +# option) any later version; incorporated herein by reference. +# +# ----------------------------------------------------------------------- +# + +# +# Usage: timeconst.pl HZ > timeconst.h +# + +# Precomputed values for systems without Math::BigInt +# Generated by: +# timeconst.pl --can 24 32 48 64 100 122 128 200 250 256 300 512 1000 1024 1200 +%canned_values = ( + 24 => [ + '0xa6aaaaab','0x2aaaaaa',26, + 125,3, + '0xc49ba5e4','0x1fbe76c8b4',37, + 3,125, + '0xa2c2aaab','0xaaaa',16, + 125000,3, + '0xc9539b89','0x7fffbce4217d',47, + 3,125000, + ], 32 => [ + '0xfa000000','0x6000000',27, + 125,4, + '0x83126e98','0xfdf3b645a',36, + 4,125, + '0xf4240000','0x0',17, + 31250,1, + '0x8637bd06','0x3fff79c842fa',46, + 1,31250, + ], 48 => [ + '0xa6aaaaab','0x6aaaaaa',27, + 125,6, + '0xc49ba5e4','0xfdf3b645a',36, + 6,125, + '0xa2c2aaab','0x15555',17, + 62500,3, + '0xc9539b89','0x3fffbce4217d',46, + 3,62500, + ], 64 => [ + '0xfa000000','0xe000000',28, + 125,8, + '0x83126e98','0x7ef9db22d',35, + 8,125, + '0xf4240000','0x0',18, + 15625,1, + '0x8637bd06','0x1fff79c842fa',45, + 1,15625, + ], 100 => [ + '0xa0000000','0x0',28, + 10,1, + '0xcccccccd','0x733333333',35, + 1,10, + '0x9c400000','0x0',18, + 10000,1, + '0xd1b71759','0x1fff2e48e8a7',45, + 1,10000, + ], 122 => [ + '0x8325c53f','0xfbcda3a',28, + 500,61, + '0xf9db22d1','0x7fbe76c8b',35, + 61,500, + '0x8012e2a0','0x3ef36',18, + 500000,61, + 
'0xffda4053','0x1ffffbce4217',45, + 61,500000, + ], 128 => [ + '0xfa000000','0x1e000000',29, + 125,16, + '0x83126e98','0x3f7ced916',34, + 16,125, + '0xf4240000','0x40000',19, + 15625,2, + '0x8637bd06','0xfffbce4217d',44, + 2,15625, + ], 200 => [ + '0xa0000000','0x0',29, + 5,1, + '0xcccccccd','0x333333333',34, + 1,5, + '0x9c400000','0x0',19, + 5000,1, + '0xd1b71759','0xfff2e48e8a7',44, + 1,5000, + ], 250 => [ + '0x80000000','0x0',29, + 4,1, + '0x80000000','0x180000000',33, + 1,4, + '0xfa000000','0x0',20, + 4000,1, + '0x83126e98','0x7ff7ced9168',43, + 1,4000, + ], 256 => [ + '0xfa000000','0x3e000000',30, + 125,32, + '0x83126e98','0x1fbe76c8b',33, + 32,125, + '0xf4240000','0xc0000',20, + 15625,4, + '0x8637bd06','0x7ffde7210be',43, + 4,15625, + ], 300 => [ + '0xd5555556','0x2aaaaaaa',30, + 10,3, + '0x9999999a','0x1cccccccc',33, + 3,10, + '0xd0555556','0xaaaaa',20, + 10000,3, + '0x9d495183','0x7ffcb923a29',43, + 3,10000, + ], 512 => [ + '0xfa000000','0x7e000000',31, + 125,64, + '0x83126e98','0xfdf3b645',32, + 64,125, + '0xf4240000','0x1c0000',21, + 15625,8, + '0x8637bd06','0x3ffef39085f',42, + 8,15625, + ], 1000 => [ + '0x80000000','0x0',31, + 1,1, + '0x80000000','0x0',31, + 1,1, + '0xfa000000','0x0',22, + 1000,1, + '0x83126e98','0x1ff7ced9168',41, + 1,1000, + ], 1024 => [ + '0xfa000000','0xfe000000',32, + 125,128, + '0x83126e98','0x7ef9db22',31, + 128,125, + '0xf4240000','0x3c0000',22, + 15625,16, + '0x8637bd06','0x1fff79c842f',41, + 16,15625, + ], 1200 => [ + '0xd5555556','0xd5555555',32, + 5,6, + '0x9999999a','0x66666666',31, + 6,5, + '0xd0555556','0x2aaaaa',22, + 2500,3, + '0x9d495183','0x1ffcb923a29',41, + 3,2500, + ] +); + +$has_bigint = eval 'use Math::BigInt qw(bgcd); 1;'; + +sub bint($) +{ + my($x) = @_; + return Math::BigInt->new($x); +} + +# +# Constants for division by reciprocal multiplication. 
+# (bits, numerator, denominator) +# +sub fmul($$$) +{ + my ($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + return scalar (($n << $b)+$d-bint(1))/$d; +} + +sub fadj($$$) +{ + my($b,$n,$d) = @_; + + $n = bint($n); + $d = bint($d); + + $d = $d/bgcd($n, $d); + return scalar (($d-bint(1)) << $b)/$d; +} + +sub fmuls($$$) { + my($b,$n,$d) = @_; + my($s,$m); + my($thres) = bint(1) << ($b-1); + + $n = bint($n); + $d = bint($d); + + for ($s = 0; 1; $s++) { + $m = fmul($s,$n,$d); + return $s if ($m >= $thres); + } + return 0; +} + +# Generate a hex value if the result fits in 64 bits; +# otherwise skip. +sub bignum_hex($) { + my($x) = @_; + my $s = $x->as_hex(); + + return (length($s) > 18) ? undef : $s; +} + +# Provides mul, adj, and shr factors for a specific +# (bit, time, hz) combination +sub muladj($$$) { + my($b, $t, $hz) = @_; + my $s = fmuls($b, $t, $hz); + my $m = fmul($s, $t, $hz); + my $a = fadj($s, $t, $hz); + return (bignum_hex($m), bignum_hex($a), $s); +} + +# Provides numerator, denominator values +sub numden($$) { + my($n, $d) = @_; + my $g = bgcd($n, $d); + return ($n/$g, $d/$g); +} + +# All values for a specific (time, hz) combo +sub conversions($$) { + my ($t, $hz) = @_; + my @val = (); + + # HZ_TO_xx + push(@val, muladj(32, $t, $hz)); + push(@val, numden($t, $hz)); + + # xx_TO_HZ + push(@val, muladj(32, $hz, $t)); + push(@val, numden($hz, $t)); + + return @val; +} + +sub compute_values($) { + my($hz) = @_; + my @val = (); + my $s, $m, $a, $g; + + if (!$has_bigint) { + die "$0: HZ == $hz not canned and ". 
+ "Math::BigInt not available\n"; + } + + # MSEC conversions + push(@val, conversions(1000, $hz)); + + # USEC conversions + push(@val, conversions(1000000, $hz)); + + return @val; +} + +sub outputval($$) +{ + my($name, $val) = @_; + my $csuf; + + if (defined($val)) { + if ($name !~ /SHR/) { + $val = "U64_C($val)"; + } + printf "#define %-23s %s\n", $name.$csuf, $val.$csuf; + } +} + +sub output($@) +{ + my($hz, @val) = @_; + my $pfx, $bit, $suf, $s, $m, $a; + + print "/* Automatically generated by kernel/timeconst.pl */\n"; + print "/* Conversion constants for HZ == $hz */\n"; + print "\n"; + print "#ifndef KERNEL_TIMECONST_H\n"; + print "#define KERNEL_TIMECONST_H\n"; + print "\n"; + + # The generated header tests HZ and wraps constants in U64_C(), so it + # must pull in the headers that define them. + print "#include <linux/param.h>\n"; + print "#include <linux/types.h>\n"; + + print "\n"; + print "#if HZ != $hz\n"; + print "#error \"kernel/timeconst.h has the wrong HZ value!\"\n"; + print "#endif\n"; + print "\n"; + + foreach $pfx ('HZ_TO_MSEC','MSEC_TO_HZ', + 'HZ_TO_USEC','USEC_TO_HZ') { + foreach $bit (32) { + foreach $suf ('MUL', 'ADJ', 'SHR') { + outputval("${pfx}_$suf$bit", shift(@val)); + } + } + foreach $suf ('NUM', 'DEN') { + outputval("${pfx}_$suf", shift(@val)); + } + } + + print "\n"; + print "#endif /* KERNEL_TIMECONST_H */\n"; +} + +# Pretty-print Perl values +sub perlvals(@) { + my $v; + my @l = (); + + foreach $v (@_) { + if (!defined($v)) { + push(@l, 'undef'); + } elsif ($v =~ /^0x/) { + push(@l, "\'".$v."\'"); + } else { + push(@l, $v.''); + } + } + return join(',', @l); +} + +($hz) = @ARGV; + +# Use this to generate the %canned_values structure +if ($hz eq '--can') { + shift(@ARGV); + @hzlist = sort {$a <=> $b} (@ARGV); + + print "# Precomputed values for systems without Math::BigInt\n"; + print "# Generated by:\n"; + print "# timeconst.pl --can ", join(' ', @hzlist), "\n"; + print "\%canned_values = (\n"; + my $pf = "\t"; + foreach $hz (@hzlist) { + my @values = compute_values($hz); + print "$pf$hz => [\n"; + while (scalar(@values)) { + my $bit; + foreach $bit (32) { + my $m = 
shift(@values); + my $a = shift(@values); + my $s = shift(@values); + print "\t\t", perlvals($m,$a,$s), ",\n"; + } + my $n = shift(@values); + my $d = shift(@values); + print "\t\t", perlvals($n,$d), ",\n"; + } + print "\t]"; + $pf = ', '; + } + print "\n);\n"; +} else { + $hz += 0; # Force to number + if ($hz < 1) { + die "Usage: $0 HZ\n"; + } + + @val = @{$canned_values{$hz}}; + # No canned entry for this HZ leaves @val empty; test emptiness + # directly, since defined(@array) is rejected by newer Perl releases. + if (!@val) { + @val = compute_values($hz); + } + output($hz, @val); +} +exit 0; diff --git a/kernel/timer.c b/kernel/timer.c new file mode 100644 index 00000000..8cff3611 --- /dev/null +++ b/kernel/timer.c @@ -0,0 +1,1792 @@ +/* + * linux/kernel/timer.c + * + * Kernel internal timers, basic process system calls + * + * Copyright (C) 1991, 1992 Linus Torvalds + * + * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. + * + * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 + * "A Kernel Model for Precision Timekeeping" by Dave Mills + * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to + * serialize accesses to xtime/lost_ticks). + * Copyright (C) 1998 Andrea Arcangeli + * 1999-03-10 Improved NTP compatibility by Ulrich Windl + * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love + * 2000-10-05 Implemented scalable SMP per-CPU timer handling. + * Copyright (C) 2000, 2001, 2002 Ingo Molnar + * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; + +EXPORT_SYMBOL(jiffies_64); + +/* + * per-CPU timer vector definitions: + */ +#define TVN_BITS (CONFIG_BASE_SMALL ? 4 : 6) +#define TVR_BITS (CONFIG_BASE_SMALL ? 
6 : 8) +#define TVN_SIZE (1 << TVN_BITS) +#define TVR_SIZE (1 << TVR_BITS) +#define TVN_MASK (TVN_SIZE - 1) +#define TVR_MASK (TVR_SIZE - 1) + +struct tvec { + struct list_head vec[TVN_SIZE]; +}; + +struct tvec_root { + struct list_head vec[TVR_SIZE]; +}; + +struct tvec_base { + spinlock_t lock; + struct timer_list *running_timer; + unsigned long timer_jiffies; + unsigned long next_timer; + struct tvec_root tv1; + struct tvec tv2; + struct tvec tv3; + struct tvec tv4; + struct tvec tv5; +} ____cacheline_aligned; + +struct tvec_base boot_tvec_bases; +EXPORT_SYMBOL(boot_tvec_bases); +static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; + +/* Functions below help us manage 'deferrable' flag */ +static inline unsigned int tbase_get_deferrable(struct tvec_base *base) +{ + return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); +} + +static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +{ + return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); +} + +static inline void timer_set_deferrable(struct timer_list *timer) +{ + timer->base = TBASE_MAKE_DEFERRED(timer->base); +} + +static inline void +timer_set_base(struct timer_list *timer, struct tvec_base *new_base) +{ + timer->base = (struct tvec_base *)((unsigned long)(new_base) | + tbase_get_deferrable(timer->base)); +} + +static unsigned long round_jiffies_common(unsigned long j, int cpu, + bool force_up) +{ + int rem; + unsigned long original = j; + + /* + * We don't want all cpus firing their timers at once hitting the + * same lock or cachelines, so we skew each extra cpu with an extra + * 3 jiffies. This 3 jiffies came originally from the mm/ code which + * already did this. + * The skew is done by adding 3*cpunr, then round, then subtract this + * extra offset again. 
+ */ + j += cpu * 3; + + rem = j % HZ; + + /* + * If the target jiffie is just after a whole second (which can happen + * due to delays of the timer irq, long irq off times etc etc) then + * we should round down to the whole second, not up. Use 1/4th second + * as cutoff for this rounding as an extreme upper bound for this. + * But never round down if @force_up is set. + */ + if (rem < HZ/4 && !force_up) /* round down */ + j = j - rem; + else /* round up */ + j = j - rem + HZ; + + /* now that we have rounded, subtract the extra skew again */ + j -= cpu * 3; + + /* + * Rounding may have eaten the timeout entirely; fall back to the + * caller's value then. Use the wrap-safe time_before_eq() rather + * than a raw "<=" so the check stays correct when jiffies wraps. + */ + if (time_before_eq(j, jiffies)) + return original; + return j; +} + +/** + * __round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. 
+ */ +unsigned long __round_jiffies(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, false); +} +EXPORT_SYMBOL_GPL(__round_jiffies); + +/** + * __round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The exact rounding is skewed for each processor to avoid all + * processors firing at the exact same time, which could lead + * to lock contention or spurious cache line bouncing. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long __round_jiffies_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, false) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_relative); + +/** + * round_jiffies - function to round jiffies to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * round_jiffies() rounds an absolute time in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. 
+ * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), false); +} +EXPORT_SYMBOL_GPL(round_jiffies); + +/** + * round_jiffies_relative - function to round jiffies to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * round_jiffies_relative() rounds a time delta in the future (in jiffies) + * up or down to (approximately) full seconds. This is useful for timers + * for which the exact time they fire does not matter too much, as long as + * they fire approximately every X seconds. + * + * By rounding these timers to whole seconds, all such timers will fire + * at the same time, rather than at various times spread out. The goal + * of this is to have the CPU wake up less, which saves power. + * + * The return value is the rounded version of the @j parameter. + */ +unsigned long round_jiffies_relative(unsigned long j) +{ + return __round_jiffies_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_relative); + +/** + * __round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up(unsigned long j, int cpu) +{ + return round_jiffies_common(j, cpu, true); +} +EXPORT_SYMBOL_GPL(__round_jiffies_up); + +/** + * __round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * @cpu: the processor number on which the timeout will happen + * + * This is the same as __round_jiffies_relative() except that it will never + * round down. 
This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) +{ + unsigned long j0 = jiffies; + + /* Use j0 because jiffies might change while we run */ + return round_jiffies_common(j + j0, cpu, true) - j0; +} +EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); + +/** + * round_jiffies_up - function to round jiffies up to a full second + * @j: the time in (absolute) jiffies that should be rounded + * + * This is the same as round_jiffies() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up(unsigned long j) +{ + return round_jiffies_common(j, raw_smp_processor_id(), true); +} +EXPORT_SYMBOL_GPL(round_jiffies_up); + +/** + * round_jiffies_up_relative - function to round jiffies up to a full second + * @j: the time in (relative) jiffies that should be rounded + * + * This is the same as round_jiffies_relative() except that it will never + * round down. This is useful for timeouts for which the exact time + * of firing does not matter too much, as long as they don't fire too + * early. + */ +unsigned long round_jiffies_up_relative(unsigned long j) +{ + return __round_jiffies_up_relative(j, raw_smp_processor_id()); +} +EXPORT_SYMBOL_GPL(round_jiffies_up_relative); + +/** + * set_timer_slack - set the allowed slack for a timer + * @timer: the timer to be modified + * @slack_hz: the amount of time (in jiffies) allowed for rounding + * + * Set the amount of time, in jiffies, that a certain timer has + * in terms of slack. By setting this value, the timer subsystem + * will schedule the actual timer somewhere between + * the time mod_timer() asks for, and that time plus the slack. + * + * By setting the slack to -1, a percentage of the delay is used + * instead. 
+ */ +void set_timer_slack(struct timer_list *timer, int slack_hz) +{ + timer->slack = slack_hz; +} +EXPORT_SYMBOL_GPL(set_timer_slack); + +static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) +{ + unsigned long expires = timer->expires; + unsigned long idx = expires - base->timer_jiffies; + struct list_head *vec; + + if (idx < TVR_SIZE) { + int i = expires & TVR_MASK; + vec = base->tv1.vec + i; + } else if (idx < 1 << (TVR_BITS + TVN_BITS)) { + int i = (expires >> TVR_BITS) & TVN_MASK; + vec = base->tv2.vec + i; + } else if (idx < 1 << (TVR_BITS + 2 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + TVN_BITS)) & TVN_MASK; + vec = base->tv3.vec + i; + } else if (idx < 1 << (TVR_BITS + 3 * TVN_BITS)) { + int i = (expires >> (TVR_BITS + 2 * TVN_BITS)) & TVN_MASK; + vec = base->tv4.vec + i; + } else if ((signed long) idx < 0) { + /* + * Can happen if you add a timer with expires == jiffies, + * or you set a timer to go off in the past + */ + vec = base->tv1.vec + (base->timer_jiffies & TVR_MASK); + } else { + int i; + /* If the timeout is larger than 0xffffffff on 64-bit + * architectures then we use the maximum timeout: + */ + if (idx > 0xffffffffUL) { + idx = 0xffffffffUL; + expires = idx + base->timer_jiffies; + } + i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; + vec = base->tv5.vec + i; + } + /* + * Timers are FIFO: + */ + list_add_tail(&timer->entry, vec); +} + +#ifdef CONFIG_TIMER_STATS +void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) +{ + if (timer->start_site) + return; + + timer->start_site = addr; + memcpy(timer->start_comm, current->comm, TASK_COMM_LEN); + timer->start_pid = current->pid; +} + +static void timer_stats_account_timer(struct timer_list *timer) +{ + unsigned int flag = 0; + + if (likely(!timer->start_site)) + return; + if (unlikely(tbase_get_deferrable(timer->base))) + flag |= TIMER_STATS_FLAG_DEFERRABLE; + + timer_stats_update_stats(timer, timer->start_pid, timer->start_site, + 
timer->function, timer->start_comm, flag); +} + +#else +static void timer_stats_account_timer(struct timer_list *timer) {} +#endif + +#ifdef CONFIG_DEBUG_OBJECTS_TIMERS + +static struct debug_obj_descr timer_debug_descr; + +static void *timer_debug_hint(void *addr) +{ + return ((struct timer_list *) addr)->function; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int timer_fixup_init(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_init(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int timer_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. The timer was + * statically initialized. We just make sure that it + * is tracked in the object tracker. 
+ */ + if (timer->entry.next == NULL && + timer->entry.prev == TIMER_ENTRY_STATIC) { + debug_object_init(timer, &timer_debug_descr); + debug_object_activate(timer, &timer_debug_descr); + return 0; + } else { + WARN_ON_ONCE(1); + } + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int timer_fixup_free(void *addr, enum debug_obj_state state) +{ + struct timer_list *timer = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + del_timer_sync(timer); + debug_object_free(timer, &timer_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr timer_debug_descr = { + .name = "timer_list", + .debug_hint = timer_debug_hint, + .fixup_init = timer_fixup_init, + .fixup_activate = timer_fixup_activate, + .fixup_free = timer_fixup_free, +}; + +static inline void debug_timer_init(struct timer_list *timer) +{ + debug_object_init(timer, &timer_debug_descr); +} + +static inline void debug_timer_activate(struct timer_list *timer) +{ + debug_object_activate(timer, &timer_debug_descr); +} + +static inline void debug_timer_deactivate(struct timer_list *timer) +{ + debug_object_deactivate(timer, &timer_debug_descr); +} + +static inline void debug_timer_free(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} + +static void __init_timer(struct timer_list *timer, + const char *name, + struct lock_class_key *key); + +void init_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) +{ + debug_object_init_on_stack(timer, &timer_debug_descr); + __init_timer(timer, name, key); +} +EXPORT_SYMBOL_GPL(init_timer_on_stack_key); + +void destroy_timer_on_stack(struct timer_list *timer) +{ + debug_object_free(timer, &timer_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_timer_on_stack); + +#else +static inline void debug_timer_init(struct timer_list *timer) { } +static inline void 
debug_timer_activate(struct timer_list *timer) { } +static inline void debug_timer_deactivate(struct timer_list *timer) { } +#endif + +static inline void debug_init(struct timer_list *timer) +{ + debug_timer_init(timer); + trace_timer_init(timer); +} + +static inline void +debug_activate(struct timer_list *timer, unsigned long expires) +{ + debug_timer_activate(timer); + trace_timer_start(timer, expires); +} + +static inline void debug_deactivate(struct timer_list *timer) +{ + debug_timer_deactivate(timer); + trace_timer_cancel(timer); +} + +static void __init_timer(struct timer_list *timer, + const char *name, + struct lock_class_key *key) +{ + timer->entry.next = NULL; + timer->base = __raw_get_cpu_var(tvec_bases); + timer->slack = -1; +#ifdef CONFIG_TIMER_STATS + timer->start_site = NULL; + timer->start_pid = -1; + memset(timer->start_comm, 0, TASK_COMM_LEN); +#endif + lockdep_init_map(&timer->lockdep_map, name, key, 0); +} + +void setup_deferrable_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer_on_stack_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); + +/** + * init_timer_key - initialize a timer + * @timer: the timer to be initialized + * @name: name of the timer + * @key: lockdep class key of the fake lock used for tracking timer + * sync lock dependencies + * + * init_timer_key() must be done to a timer prior calling *any* of the + * other timer functions. 
+ */ +void init_timer_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) +{ + debug_init(timer); + __init_timer(timer, name, key); +} +EXPORT_SYMBOL(init_timer_key); + +void init_timer_deferrable_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key) +{ + init_timer_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL(init_timer_deferrable_key); + +static inline void detach_timer(struct timer_list *timer, + int clear_pending) +{ + struct list_head *entry = &timer->entry; + + debug_deactivate(timer); + + __list_del(entry->prev, entry->next); + if (clear_pending) + entry->next = NULL; + entry->prev = LIST_POISON2; +} + +/* + * We are using hashed locking: holding per_cpu(tvec_bases).lock + * means that all timers which are tied to this base via timer->base are + * locked, and the base itself is locked too. + * + * So __run_timers/migrate_timers can safely modify all timers which could + * be found on ->tvX lists. + * + * When the timer's base is locked, and the timer removed from list, it is + * possible to set timer->base = NULL and drop the lock: the timer remains + * locked. 
+ */ +static struct tvec_base *lock_timer_base(struct timer_list *timer, + unsigned long *flags) + __acquires(timer->base->lock) +{ + struct tvec_base *base; + + for (;;) { + struct tvec_base *prelock_base = timer->base; + base = tbase_get_base(prelock_base); + if (likely(base != NULL)) { + spin_lock_irqsave(&base->lock, *flags); + if (likely(prelock_base == timer->base)) + return base; + /* The timer has migrated to another CPU */ + spin_unlock_irqrestore(&base->lock, *flags); + } + cpu_relax(); + } +} + +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, + bool pending_only, int pinned) +{ + struct tvec_base *base, *new_base; + unsigned long flags; + int ret = 0 , cpu; + + timer_stats_timer_set_start_info(timer); + BUG_ON(!timer->function); + + base = lock_timer_base(timer, &flags); + + if (timer_pending(timer)) { + detach_timer(timer, 0); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; + ret = 1; + } else { + if (pending_only) + goto out_unlock; + } + + debug_activate(timer, expires); + + cpu = smp_processor_id(); + +#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) + if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) + cpu = get_nohz_timer_target(); +#endif + new_base = per_cpu(tvec_bases, cpu); + + if (base != new_base) { + /* + * We are trying to schedule the timer on the local CPU. + * However we can't change timer's base while it is running, + * otherwise del_timer_sync() can't detect that the timer's + * handler yet has not finished. This also guarantees that + * the timer is serialized wrt itself. 
+ */ + if (likely(base->running_timer != timer)) { + /* See the comment in lock_timer_base() */ + timer_set_base(timer, NULL); + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + timer_set_base(timer, base); + } + } + + timer->expires = expires; + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; + internal_add_timer(base, timer); + +out_unlock: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} + +/** + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. + */ +int mod_timer_pending(struct timer_list *timer, unsigned long expires) +{ + return __mod_timer(timer, expires, true, TIMER_NOT_PINNED); +} +EXPORT_SYMBOL(mod_timer_pending); + +/* + * Decide where to put the timer while taking the slack into account + * + * Algorithm: + * 1) calculate the maximum (absolute) time + * 2) calculate the highest bit where the expires and new max are different + * 3) use this bit to make a mask + * 4) use the bitmask to round down the maximum time, so that all last + * bits are zeros + */ +static inline +unsigned long apply_slack(struct timer_list *timer, unsigned long expires) +{ + unsigned long expires_limit, mask; + int bit; + + if (timer->slack >= 0) { + expires_limit = expires + timer->slack; + } else { + long delta = expires - jiffies; + + if (delta < 256) + return expires; + + expires_limit = expires + delta / 256; + } + mask = expires ^ expires_limit; + if (mask == 0) + return expires; + + bit = find_last_bit(&mask, BITS_PER_LONG); + + mask = (1UL << bit) - 1; /* 1UL: bit can exceed 30, an int shift would overflow */ + + expires_limit = expires_limit & ~(mask); + + return expires_limit; +} + +/** + * mod_timer - modify a timer's timeout + *
@timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer() is a more efficient way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * + * mod_timer(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + * + * Note that if there are multiple unserialized concurrent users of the + * same timer, then mod_timer() is the only safe way to modify the timeout, + * since add_timer() cannot modify an already running timer. + * + * The function returns whether it has modified a pending timer or not. + * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an + * active timer returns 1.) + */ +int mod_timer(struct timer_list *timer, unsigned long expires) +{ + expires = apply_slack(timer, expires); + + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer_pending(timer) && timer->expires == expires) + return 1; + + return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); +} +EXPORT_SYMBOL(mod_timer); + +/** + * mod_timer_pinned - modify a timer's timeout + * @timer: the timer to be modified + * @expires: new timeout in jiffies + * + * mod_timer_pinned() is a way to update the expire field of an + * active timer (if the timer is inactive it will be activated) + * and not allow the timer to be migrated to a different CPU. 
+ * + * mod_timer_pinned(timer, expires) is equivalent to: + * + * del_timer(timer); timer->expires = expires; add_timer(timer); + */ +int mod_timer_pinned(struct timer_list *timer, unsigned long expires) +{ + if (timer->expires == expires && timer_pending(timer)) + return 1; + + return __mod_timer(timer, expires, false, TIMER_PINNED); +} +EXPORT_SYMBOL(mod_timer_pinned); + +/** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_activate(timer, timer->expires); + if (time_before(timer->expires, base->next_timer) && + !tbase_get_deferrable(timer->base)) + base->next_timer = timer->expires; + internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. 
This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); +} +EXPORT_SYMBOL_GPL(add_timer_on); + +/** + * del_timer - deactivate a timer. + * @timer: the timer to be deactivated + * + * del_timer() deactivates a timer - this works on both active and inactive + * timers. + * + * The function returns whether it has deactivated a pending timer or not. + * (ie. del_timer() of an inactive timer returns 0, del_timer() of an + * active timer returns 1.) + */ +int del_timer(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = 0; + + timer_stats_timer_clear_start_info(timer); + if (timer_pending(timer)) { + base = lock_timer_base(timer, &flags); + if (timer_pending(timer)) { + detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; + ret = 1; + } + spin_unlock_irqrestore(&base->lock, flags); + } + + return ret; +} +EXPORT_SYMBOL(del_timer); + +/** + * try_to_del_timer_sync - Try to deactivate a timer + * @timer: timer do del + * + * This function tries to deactivate a timer. Upon successful (ret >= 0) + * exit the timer is not queued and the handler is not running on any CPU.
+ */ +int try_to_del_timer_sync(struct timer_list *timer) +{ + struct tvec_base *base; + unsigned long flags; + int ret = -1; + + base = lock_timer_base(timer, &flags); + + if (base->running_timer == timer) + goto out; + + timer_stats_timer_clear_start_info(timer); + ret = 0; + if (timer_pending(timer)) { + detach_timer(timer, 1); + if (timer->expires == base->next_timer && + !tbase_get_deferrable(timer->base)) + base->next_timer = base->timer_jiffies; + ret = 1; + } +out: + spin_unlock_irqrestore(&base->lock, flags); + + return ret; +} +EXPORT_SYMBOL(try_to_del_timer_sync); + +#ifdef CONFIG_SMP +/** + * del_timer_sync - deactivate a timer and wait for the handler to finish. + * @timer: the timer to be deactivated + * + * This function only differs from del_timer() on SMP: besides deactivating + * the timer it also makes sure the handler has finished executing on other + * CPUs. + * + * Synchronization rules: Callers must prevent restarting of the timer, + * otherwise this function is meaningless. It must not be called from + * interrupt contexts. The caller must not hold locks which would prevent + * completion of the timer's handler. The timer's handler must not call + * add_timer_on(). Upon exit the timer is not queued and the handler is + * not running on any CPU. + * + * Note: You must not hold locks that are held in interrupt context + * while calling this function. Even if the lock has nothing to do + * with the timer in question. Here's why: + * + * CPU0 CPU1 + * ---- ---- + * + * call_timer_fn(); + * base->running_timer = mytimer; + * spin_lock_irq(somelock); + * + * spin_lock(somelock); + * del_timer_sync(mytimer); + * while (base->running_timer == mytimer); + * + * Now del_timer_sync() will never return and never release somelock. + * The interrupt on the other CPU is waiting to grab somelock but + * it has interrupted the softirq that CPU0 is waiting to finish. + * + * The function returns whether it has deactivated a pending timer or not. 
+ */ +int del_timer_sync(struct timer_list *timer) +{ +#ifdef CONFIG_LOCKDEP + unsigned long flags; + + /* + * If lockdep gives a backtrace here, please reference + * the synchronization rules above. + */ + local_irq_save(flags); + lock_map_acquire(&timer->lockdep_map); + lock_map_release(&timer->lockdep_map); + local_irq_restore(flags); +#endif + /* + * don't use it in hardirq context, because it + * could lead to deadlock. + */ + WARN_ON(in_irq()); + for (;;) { + int ret = try_to_del_timer_sync(timer); + if (ret >= 0) + return ret; + cpu_relax(); + } +} +EXPORT_SYMBOL(del_timer_sync); +#endif + +static int cascade(struct tvec_base *base, struct tvec *tv, int index) +{ + /* cascade all the timers from tv up one level */ + struct timer_list *timer, *tmp; + struct list_head tv_list; + + list_replace_init(tv->vec + index, &tv_list); + + /* + * We are removing _all_ timers from the list, so we + * don't have to detach them individually. + */ + list_for_each_entry_safe(timer, tmp, &tv_list, entry) { + BUG_ON(tbase_get_base(timer->base) != base); + internal_add_timer(base, timer); + } + + return index; +} + +static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), + unsigned long data) +{ + int preempt_count = preempt_count(); + +#ifdef CONFIG_LOCKDEP + /* + * It is permissible to free the timer from inside the + * function that is called from it, this we need to take into + * account for lockdep too. To avoid bogus "held lock freed" + * warnings as well as problems when looking into + * timer->lockdep_map, make a copy and use that here. + */ + struct lockdep_map lockdep_map = timer->lockdep_map; +#endif + /* + * Couple the lock chain with the lock chain at + * del_timer_sync() by acquiring the lock_map around the fn() + * call here and in del_timer_sync(). 
+ */ + lock_map_acquire(&lockdep_map); + + trace_timer_expire_entry(timer); + fn(data); + trace_timer_expire_exit(timer); + + lock_map_release(&lockdep_map); + + if (preempt_count != preempt_count()) { + WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", + fn, preempt_count, preempt_count()); + /* + * Restore the preempt count. That gives us a decent + * chance to survive and extract information. If the + * callback kept a lock held, bad luck, but not worse + * than the BUG() we had. + */ + preempt_count() = preempt_count; + } +} + +#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) + +/** + * __run_timers - run all expired timers (if any) on this CPU. + * @base: the timer vector to be processed. + * + * This function cascades all vectors and executes all expired timer + * vectors. + */ +static inline void __run_timers(struct tvec_base *base) +{ + struct timer_list *timer; + + spin_lock_irq(&base->lock); + while (time_after_eq(jiffies, base->timer_jiffies)) { + struct list_head work_list; + struct list_head *head = &work_list; + int index = base->timer_jiffies & TVR_MASK; + + /* + * Cascade timers: + */ + if (!index && + (!cascade(base, &base->tv2, INDEX(0))) && + (!cascade(base, &base->tv3, INDEX(1))) && + !cascade(base, &base->tv4, INDEX(2))) + cascade(base, &base->tv5, INDEX(3)); + ++base->timer_jiffies; + list_replace_init(base->tv1.vec + index, &work_list); + while (!list_empty(head)) { + void (*fn)(unsigned long); + unsigned long data; + + timer = list_first_entry(head, struct timer_list,entry); + fn = timer->function; + data = timer->data; + + timer_stats_account_timer(timer); + + base->running_timer = timer; + detach_timer(timer, 1); + + spin_unlock_irq(&base->lock); + call_timer_fn(timer, fn, data); + spin_lock_irq(&base->lock); + } + } + base->running_timer = NULL; + spin_unlock_irq(&base->lock); +} + +#ifdef CONFIG_NO_HZ +/* + * Find out when the next timer event is due to happen. 
This + * is used on S/390 to stop all activity when a CPU is idle. + * This function needs to be called with interrupts disabled. + */ +static unsigned long __next_timer_interrupt(struct tvec_base *base) +{ + unsigned long timer_jiffies = base->timer_jiffies; + unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; + int index, slot, array, found = 0; + struct timer_list *nte; + struct tvec *varray[4]; + + /* Look for timer events in tv1. */ + index = slot = timer_jiffies & TVR_MASK; + do { + list_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; + expires = nte->expires; + /* Look at the cascade bucket(s)? */ + if (!index || slot < index) + goto cascade; + return expires; + } + slot = (slot + 1) & TVR_MASK; + } while (slot != index); + +cascade: + /* Calculate the next cascade event */ + if (index) + timer_jiffies += TVR_SIZE - index; + timer_jiffies >>= TVR_BITS; + + /* Check tv2-tv5. */ + varray[0] = &base->tv2; + varray[1] = &base->tv3; + varray[2] = &base->tv4; + varray[3] = &base->tv5; + + for (array = 0; array < 4; array++) { + struct tvec *varp = varray[array]; + + index = slot = timer_jiffies & TVN_MASK; + do { + list_for_each_entry(nte, varp->vec + slot, entry) { + if (tbase_get_deferrable(nte->base)) + continue; + + found = 1; + if (time_before(nte->expires, expires)) + expires = nte->expires; + } + /* + * Do we still search for the first timer or are + * we looking up the cascade buckets ? + */ + if (found) { + /* Look at the cascade bucket(s)? 
*/ + if (!index || slot < index) + break; + return expires; + } + slot = (slot + 1) & TVN_MASK; + } while (slot != index); + + if (index) + timer_jiffies += TVN_SIZE - index; + timer_jiffies >>= TVN_BITS; + } + return expires; +} + +/* + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +static unsigned long cmp_next_hrtimer_event(unsigned long now, + unsigned long expires) +{ + ktime_t hr_delta = hrtimer_get_next_event(); + struct timespec tsdelta; + unsigned long delta; + + if (hr_delta.tv64 == KTIME_MAX) + return expires; + + /* + * Expired timer available, let it expire in the next tick + */ + if (hr_delta.tv64 <= 0) + return now + 1; + + tsdelta = ktime_to_timespec(hr_delta); + delta = timespec_to_jiffies(&tsdelta); + + /* + * Limit the delta to the max value, which is checked in + * tick_nohz_stop_sched_tick(): + */ + if (delta > NEXT_TIMER_MAX_DELTA) + delta = NEXT_TIMER_MAX_DELTA; + + /* + * Take rounding errors in to account and make sure, that it + * expires in the next tick. Otherwise we go into an endless + * ping pong due to tick_nohz_stop_sched_tick() retriggering + * the timer softirq + */ + if (delta < 1) + delta = 1; + now += delta; + if (time_before(now, expires)) + return now; + return expires; +} + +/** + * get_next_timer_interrupt - return the jiffy of the next pending timer + * @now: current time (in jiffies) + */ +unsigned long get_next_timer_interrupt(unsigned long now) +{ + struct tvec_base *base = __this_cpu_read(tvec_bases); + unsigned long expires; + + /* + * Pretend that there is no timer pending if the cpu is offline. + * Possible pending timers will be migrated later to an active cpu. 
+ */ + if (cpu_is_offline(smp_processor_id())) + return now + NEXT_TIMER_MAX_DELTA; + spin_lock(&base->lock); + if (time_before_eq(base->next_timer, base->timer_jiffies)) + base->next_timer = __next_timer_interrupt(base); + expires = base->next_timer; + spin_unlock(&base->lock); + + if (time_before_eq(expires, now)) + return now; + + return cmp_next_hrtimer_event(now, expires); +} +#endif + +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + int cpu = smp_processor_id(); + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_check_callbacks(cpu, user_tick); + printk_tick(); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_run(); +#endif + scheduler_tick(); + run_posix_cpu_timers(p); +} + +/* + * This function runs timers and the timer-tq in bottom half context. + */ +static void run_timer_softirq(struct softirq_action *h) +{ + struct tvec_base *base = __this_cpu_read(tvec_bases); + + hrtimer_run_pending(); + + if (time_after_eq(jiffies, base->timer_jiffies)) + __run_timers(base); +} + +/* + * Called by the local, per-CPU timer interrupt on SMP. + */ +void run_local_timers(void) +{ + hrtimer_run_queues(); + raise_softirq(TIMER_SOFTIRQ); +} + +#ifdef __ARCH_WANT_SYS_ALARM + +/* + * For backwards compatibility? This can be done in libc so Alpha + * and all newer ports shouldn't need it. + */ +SYSCALL_DEFINE1(alarm, unsigned int, seconds) +{ + return alarm_setitimer(seconds); +} + +#endif + +#ifndef __alpha__ + +/* + * The Alpha uses getxpid, getxuid, and getxgid instead. Maybe this + * should be moved into arch/i386 instead? + */ + +/** + * sys_getpid - return the thread group id of the current process + * + * Note, despite the name, this returns the tgid not the pid. 
The tgid and + * the pid are identical unless CLONE_THREAD was specified on clone() in + * which case the tgid is the same in all threads of the same group. + * + * This is SMP safe as current->tgid does not change. + */ +SYSCALL_DEFINE0(getpid) +{ + return task_tgid_vnr(current); +} + +/* + * Accessing ->real_parent is not SMP-safe, it could + * change from under us. However, we can use a stale + * value of ->real_parent under rcu_read_lock(), see + * release_task()->call_rcu(delayed_put_task_struct). + */ +SYSCALL_DEFINE0(getppid) +{ + int pid; + + rcu_read_lock(); + pid = task_tgid_vnr(current->real_parent); + rcu_read_unlock(); + + return pid; +} + +SYSCALL_DEFINE0(getuid) +{ + /* Only we change this so SMP safe */ + return current_uid(); +} + +SYSCALL_DEFINE0(geteuid) +{ + /* Only we change this so SMP safe */ + return current_euid(); +} + +SYSCALL_DEFINE0(getgid) +{ + /* Only we change this so SMP safe */ + return current_gid(); +} + +SYSCALL_DEFINE0(getegid) +{ + /* Only we change this so SMP safe */ + return current_egid(); +} + +#endif + +static void process_timeout(unsigned long __data) +{ + wake_up_process((struct task_struct *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. 
+ * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long __sched schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative values + * but I'd like to return a valid offset (>=0) to allow + * the caller to do everything it wants with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happen anyway). You just have the printk() + * that will tell you if something has gone wrong and where. + */ + if (timeout < 0) { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx\n", timeout); + dump_stack(); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); + __mod_timer(&timer, expire, false, TIMER_NOT_PINNED); + schedule(); + del_singleshot_timer_sync(&timer); + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer); + + timeout = expire - jiffies; + + out: + return timeout < 0 ? 0 : timeout; +} +EXPORT_SYMBOL(schedule_timeout); + +/* + * We can use __set_current_state() here because schedule_timeout() calls + * schedule() unconditionally.
+ */ +signed long __sched schedule_timeout_interruptible(signed long timeout) +{ + __set_current_state(TASK_INTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_interruptible); + +signed long __sched schedule_timeout_killable(signed long timeout) +{ + __set_current_state(TASK_KILLABLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_killable); + +signed long __sched schedule_timeout_uninterruptible(signed long timeout) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_uninterruptible); + +/* Thread ID - the internal kernel "pid" */ +SYSCALL_DEFINE0(gettid) +{ + return task_pid_vnr(current); +} + +/** + * do_sysinfo - fill in sysinfo struct + * @info: pointer to buffer to fill + */ +int do_sysinfo(struct sysinfo *info) +{ + unsigned long mem_total, sav_total; + unsigned int mem_unit, bitcount; + struct timespec tp; + + memset(info, 0, sizeof(struct sysinfo)); + + ktime_get_ts(&tp); + monotonic_to_bootbased(&tp); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); + + info->procs = nr_threads; + + si_meminfo(info); + si_swapinfo(info); + + /* + * If the sum of all the available memory (i.e. ram + swap) + * is less than can be stored in a 32 bit unsigned long then + * we can be binary compatible with 2.2.x kernels. If not, + * well, in that case 2.2.x was broken anyways... + * + * -Erik Andersen + */ + + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) + goto out; + bitcount = 0; + mem_unit = info->mem_unit; + while (mem_unit > 1) { + bitcount++; + mem_unit >>= 1; + sav_total = mem_total; + mem_total <<= 1; + if (mem_total < sav_total) + goto out; + } + + /* + * If mem_total did not overflow, multiply all memory values by + * info->mem_unit and set it to 1. 
This leaves things compatible + * with 2.2.x, and also retains compatibility with earlier 2.4.x + * kernels... + */ + + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; + +out: + return 0; +} + +SYSCALL_DEFINE1(sysinfo, struct sysinfo __user *, info) +{ + struct sysinfo val; + + do_sysinfo(&val); + + if (copy_to_user(info, &val, sizeof(struct sysinfo))) + return -EFAULT; + + return 0; +} + +static int __cpuinit init_timers_cpu(int cpu) +{ + int j; + struct tvec_base *base; + static char __cpuinitdata tvec_base_done[NR_CPUS]; + + if (!tvec_base_done[cpu]) { + static char boot_done; + + if (boot_done) { + /* + * The APs use this path later in boot + */ + base = kmalloc_node(sizeof(*base), + GFP_KERNEL | __GFP_ZERO, + cpu_to_node(cpu)); + if (!base) + return -ENOMEM; + + /* Make sure that tvec_base is 2 byte aligned */ + if (tbase_get_deferrable(base)) { + WARN_ON(1); + kfree(base); + return -ENOMEM; + } + per_cpu(tvec_bases, cpu) = base; + } else { + /* + * This is for the boot CPU - we use compile-time + * static initialisation because per-cpu memory isn't + * ready yet and because the memory allocators are not + * initialised either. 
+ */ + boot_done = 1; + base = &boot_tvec_bases; + } + tvec_base_done[cpu] = 1; + } else { + base = per_cpu(tvec_bases, cpu); + } + + spin_lock_init(&base->lock); + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; + return 0; +} + +#ifdef CONFIG_HOTPLUG_CPU +static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +{ + struct timer_list *timer; + + while (!list_empty(head)) { + timer = list_first_entry(head, struct timer_list, entry); + detach_timer(timer, 0); + timer_set_base(timer, new_base); + if (time_before(timer->expires, new_base->next_timer) && + !tbase_get_deferrable(timer->base)) + new_base->next_timer = timer->expires; + internal_add_timer(new_base, timer); + } +} + +static void __cpuinit migrate_timers(int cpu) +{ + struct tvec_base *old_base; + struct tvec_base *new_base; + int i; + + BUG_ON(cpu_online(cpu)); + old_base = per_cpu(tvec_bases, cpu); + new_base = get_cpu_var(tvec_bases); + /* + * The caller is globally serialized and nobody else + * takes two locks at once, deadlock is not possible. 
+ */ + spin_lock_irq(&new_base->lock); + spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); + + for (i = 0; i < TVR_SIZE; i++) + migrate_timer_list(new_base, old_base->tv1.vec + i); + for (i = 0; i < TVN_SIZE; i++) { + migrate_timer_list(new_base, old_base->tv2.vec + i); + migrate_timer_list(new_base, old_base->tv3.vec + i); + migrate_timer_list(new_base, old_base->tv4.vec + i); + migrate_timer_list(new_base, old_base->tv5.vec + i); + } + + spin_unlock(&old_base->lock); + spin_unlock_irq(&new_base->lock); + put_cpu_var(tvec_bases); +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __cpuinit timer_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + long cpu = (long)hcpu; + int err; + + switch(action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + err = init_timers_cpu(cpu); + if (err < 0) + return notifier_from_errno(err); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + case CPU_DEAD_FROZEN: + migrate_timers(cpu); + break; +#endif + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata timers_nb = { + .notifier_call = timer_cpu_notify, +}; + + +void __init init_timers(void) +{ + int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + + init_timer_stats(); + + BUG_ON(err != NOTIFY_OK); + register_cpu_notifier(&timers_nb); + open_softirq(TIMER_SOFTIRQ, run_timer_softirq); +} + +/** + * msleep - sleep safely even with waitqueue interruptions + * @msecs: Time in milliseconds to sleep for + */ +void msleep(unsigned int msecs) +{ + unsigned long timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +} + +EXPORT_SYMBOL(msleep); + +/** + * msleep_interruptible - sleep waiting for signals + * @msecs: Time in milliseconds to sleep for + */ +unsigned long msleep_interruptible(unsigned int msecs) +{ + unsigned long timeout = 
msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); + return jiffies_to_msecs(timeout); +} + +EXPORT_SYMBOL(msleep_interruptible); + +static int __sched do_usleep_range(unsigned long min, unsigned long max) +{ + ktime_t kmin; + unsigned long delta; + + kmin = ktime_set(0, min * NSEC_PER_USEC); + delta = (max - min) * NSEC_PER_USEC; + return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); +} + +/** + * usleep_range - Drop in replacement for udelay where wakeup is flexible + * @min: Minimum time in usecs to sleep + * @max: Maximum time in usecs to sleep + */ +void usleep_range(unsigned long min, unsigned long max) +{ + __set_current_state(TASK_UNINTERRUPTIBLE); + do_usleep_range(min, max); +} +EXPORT_SYMBOL(usleep_range); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 00000000..2ad39e55 --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,493 @@ +# +# Architectures that offer an FUNCTION_TRACER implementation should +# select HAVE_FUNCTION_TRACER: +# + +config USER_STACKTRACE_SUPPORT + bool + +config NOP_TRACER + bool + +config HAVE_FTRACE_NMI_ENTER + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FUNCTION_TRACER + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FUNCTION_GRAPH_TRACER + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FUNCTION_GRAPH_FP_TEST + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FUNCTION_TRACE_MCOUNT_TEST + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_DYNAMIC_FTRACE + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_FTRACE_MCOUNT_RECORD + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_SYSCALL_TRACEPOINTS + bool + help + See Documentation/trace/ftrace-design.txt + +config HAVE_C_RECORDMCOUNT + bool + help + C version of recordmcount available? 
+ +config TRACER_MAX_TRACE + bool + +config RING_BUFFER + bool + +config FTRACE_NMI_ENTER + bool + depends on HAVE_FTRACE_NMI_ENTER + default y + +config EVENT_TRACING + select CONTEXT_SWITCH_TRACER + bool + +config EVENT_POWER_TRACING_DEPRECATED + depends on EVENT_TRACING + bool "Deprecated power event trace API, to be removed" + default y + help + Provides old power event types: + C-state/idle accounting events: + power:power_start + power:power_end + and old cpufreq accounting event: + power:power_frequency + This is for userspace compatibility + and will vanish after 5 kernel iterations, + namely 2.6.41. + +config CONTEXT_SWITCH_TRACER + bool + +config RING_BUFFER_ALLOW_SWAP + bool + help + Allow the use of ring_buffer_swap_cpu. + Adds a very slight overhead to tracing when enabled. + +# All tracer options should select GENERIC_TRACER. For those options that are +# enabled by all tracers (context switch and event tracer) they select TRACING. +# This allows those options to appear when no other tracer is selected. But the +# options do not appear when something else selects it. We need the two options +# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the +# hiding of the automatic options. + +config TRACING + bool + select DEBUG_FS + select RING_BUFFER + select STACKTRACE if STACKTRACE_SUPPORT + select TRACEPOINTS + select NOP_TRACER + select BINARY_PRINTF + select EVENT_TRACING + +config GENERIC_TRACER + bool + select TRACING + +# +# Minimum requirements an architecture has to meet for us to +# be able to offer generic tracing facilities: +# +config TRACING_SUPPORT + bool + # PPC32 has no irqflags tracing support, but it can use most of the + # tracers anyway, they were tested to build and work. Note that new + # exceptions to this list aren't welcomed, better implement the + # irqflags tracing for your architecture. 
+ depends on TRACE_IRQFLAGS_SUPPORT || PPC32 + depends on STACKTRACE_SUPPORT + default y + +if TRACING_SUPPORT + +menuconfig FTRACE + bool "Tracers" + default y if DEBUG_KERNEL + help + Enable the kernel tracing infrastructure. + +if FTRACE + +config FUNCTION_TRACER + bool "Kernel Function Tracer" + depends on HAVE_FUNCTION_TRACER + select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE + select KALLSYMS + select GENERIC_TRACER + select CONTEXT_SWITCH_TRACER + help + Enable the kernel to trace every kernel function. This is done + by using a compiler feature to insert a small, 5-byte No-Operation + instruction at the beginning of every kernel function, which NOP + sequence is then dynamically patched into a tracer call when + tracing is enabled by the administrator. If it's runtime disabled + (the bootup default), then the overhead of the instructions is very + small and not measurable even in micro-benchmarks. + +config FUNCTION_GRAPH_TRACER + bool "Kernel Function Graph Tracer" + depends on HAVE_FUNCTION_GRAPH_TRACER + depends on FUNCTION_TRACER + depends on !X86_32 || !CC_OPTIMIZE_FOR_SIZE + default y + help + Enable the kernel to trace a function at both its return + and its entry. + Its first purpose is to trace the duration of functions and + draw a call graph for each thread with some information like + the return value. This is done by setting the current return + address on the current task structure into a stack of calls. + + +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on !ARCH_USES_GETTIMEOFFSET + select TRACE_IRQFLAGS + select GENERIC_TRACER + select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. 
+ + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + + (Note that kernel size and overhead increase with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on !ARCH_USES_GETTIMEOFFSET + depends on PREEMPT + select GENERIC_TRACER + select TRACER_MAX_TRACE + select RING_BUFFER_ALLOW_SWAP + help + This option measures the time spent in preemption-off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /sys/kernel/debug/tracing/tracing_max_latency + + (Note that kernel size and overhead increase with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) + +config SCHED_TRACER + bool "Scheduling Latency Tracer" + select GENERIC_TRACER + select CONTEXT_SWITCH_TRACER + select TRACER_MAX_TRACE + help + This tracer tracks the latency of the highest priority task + to be scheduled in, starting from the point it has woken up. + +config ENABLE_DEFAULT_TRACERS + bool "Trace process context switches and events" + depends on !GENERIC_TRACER + select TRACING + help + This tracer hooks to various trace points in the kernel, + allowing the user to pick and choose which trace point they + want to trace. It also includes the sched_switch tracer plugin. + +config FTRACE_SYSCALLS + bool "Trace syscalls" + depends on HAVE_SYSCALL_TRACEPOINTS + select GENERIC_TRACER + select KALLSYMS + help + Basic tracer to catch the syscall entry and exit events. + +config TRACE_BRANCH_PROFILING + bool + select GENERIC_TRACER + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. 
It will add hooks + into the C conditionals to test which path a branch takes. + + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if-statement in the + kernel. This profiler will also enable the likely/unlikely + profiler. + + Either of the above profilers adds a bit of overhead to the system. + If unsure, choose "No branch profiling". + +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES + bool "Trace likely/unlikely profiler" + select TRACE_BRANCH_PROFILING + help + This tracer profiles all the likely and unlikely macros + in the kernel. It will display the results in: + + /sys/kernel/debug/tracing/trace_stat/branch_annotated + + Note: this will add a significant overhead; only turn this + on if you need to profile the system's use of these macros. + +config PROFILE_ALL_BRANCHES + bool "Profile all if conditionals" + select TRACE_BRANCH_PROFILING + help + This tracer profiles all branch conditions. Every if () + taken in the kernel is recorded whether it hit or miss. + The results will be displayed in: + + /sys/kernel/debug/tracing/trace_stat/branch_all + + This option also enables the likely/unlikely profiler. + + This configuration, when enabled, will impose a great overhead + on the system. This should only be enabled when the system + is to be analyzed in much detail. +endchoice + +config TRACING_BRANCHES + bool + help + Selected by tracers that will trace the likely and unlikely + conditions. This prevents the tracers themselves from being + profiled. Profiling the tracing infrastructure can only happen + when the likelys and unlikelys are not being traced.
+ +config BRANCH_TRACER + bool "Trace likely/unlikely instances" + depends on TRACE_BRANCH_PROFILING + select TRACING_BRANCHES + help + This traces the events of likely and unlikely condition + calls in the kernel. The difference between this and the + "Trace likely/unlikely profiler" is that this is not a + histogram of the callers, but actually places the calling + events into a running trace buffer to see when and where the + events happened, as well as their results. + + Say N if unsure. + +config STACK_TRACER + bool "Trace max stack" + depends on HAVE_FUNCTION_TRACER + select FUNCTION_TRACER + select STACKTRACE + select KALLSYMS + help + This special tracer records the maximum stack footprint of the + kernel and displays it in /sys/kernel/debug/tracing/stack_trace. + + This tracer works by hooking into every function call that the + kernel executes, and keeping a maximum stack depth value and + stack-trace saved. If this is configured with DYNAMIC_FTRACE + then it will not have any overhead while the stack tracer + is disabled. + + To enable the stack tracer on bootup, pass in 'stacktrace' + on the kernel command line. + + The stack tracer can also be enabled or disabled via the + sysctl kernel.stack_tracer_enabled + + Say N if unsure. + +config BLK_DEV_IO_TRACE + bool "Support for tracing block IO actions" + depends on SYSFS + depends on BLOCK + select RELAY + select DEBUG_FS + select TRACEPOINTS + select GENERIC_TRACER + select STACKTRACE + help + Say Y here if you want to be able to trace the block layer actions + on a given queue. Tracing allows you to see any traffic happening + on a block device queue. 
For more information (and the userspace + support tools needed), fetch the blktrace tools from: + + git://git.kernel.dk/blktrace.git + + Tracing also is possible using the ftrace interface, e.g.: + + echo 1 > /sys/block/sda/sda1/trace/enable + echo blk > /sys/kernel/debug/tracing/current_tracer + cat /sys/kernel/debug/tracing/trace_pipe + + If unsure, say N. + +config KPROBE_EVENT + depends on KPROBES + depends on HAVE_REGS_AND_STACK_ACCESS_API + bool "Enable kprobes-based dynamic events" + select TRACING + default y + help + This allows the user to add tracing events (similar to tracepoints) + on the fly via the ftrace interface. See + Documentation/trace/kprobetrace.txt for more details. + + Those events can be inserted wherever kprobes can probe, and record + various register and memory values. + + This option is also required by perf-probe subcommand of perf tools. + If you want to use perf tools, this option is strongly recommended. + +config DYNAMIC_FTRACE + bool "enable/disable ftrace tracepoints dynamically" + depends on FUNCTION_TRACER + depends on HAVE_DYNAMIC_FTRACE + default y + help + This option will modify all the calls to ftrace dynamically + (will patch them out of the binary image and replace them + with a No-Op instruction) as they are called. A table is + created to dynamically enable them again. + + This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but + otherwise has native performance as long as no tracing is active. + + The changes to the code are done by a kernel thread that + wakes up once a second and checks to see if any ftrace calls + were made. If so, it runs stop_machine (stops all CPUS) + and modifies the code to jump over the call to ftrace. + +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on FUNCTION_TRACER + default n + help + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. 
+ When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A "functions" file is created in + the trace_stat directory; this file shows the list of functions that + have been hit and their counters. + + If in doubt, say N. + +config FTRACE_MCOUNT_RECORD + def_bool y + depends on DYNAMIC_FTRACE + depends on HAVE_FTRACE_MCOUNT_RECORD + +config FTRACE_SELFTEST + bool + +config FTRACE_STARTUP_TEST + bool "Perform a startup test on ftrace" + depends on GENERIC_TRACER + select FTRACE_SELFTEST + help + This option performs a series of startup tests on ftrace. On bootup + a series of tests are made to verify that the tracer is + functioning properly. It will do tests on all the configured + tracers of ftrace. + +config EVENT_TRACE_TEST_SYSCALLS + bool "Run selftest on syscall events" + depends on FTRACE_STARTUP_TEST + help + This option will also enable testing every syscall event. + It only enables the event and disables it and runs various loads + with the event enabled. This adds a bit more time for kernel boot + up since it runs this on every system call defined. + + TBD - enable a way to actually call the syscalls as we test their + events + +config MMIOTRACE + bool "Memory mapped IO tracing" + depends on HAVE_MMIOTRACE_SUPPORT && PCI + select GENERIC_TRACER + help + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. Tracing is disabled by + default and can be enabled at run-time. + + See Documentation/trace/mmiotrace.txt. + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM.
+ + Say N, unless you absolutely know what you are doing. + +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and benchmark it. + It creates its own ring buffer such that it will not interfere with + any other users of the ring buffer (such as ftrace). It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N. + +endif # FTRACE + +endif # TRACING_SUPPORT + diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 00000000..761c510a --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,60 @@ + +# Do not instrument the tracer itself: + +ifdef CONFIG_FUNCTION_TRACER +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS)) + +# selftest needs instrumentation +CFLAGS_trace_selftest_dynamic.o = -pg +obj-y += trace_selftest_dynamic.o +endif + +# If unlikely tracing is enabled, do not trace these files +ifdef CONFIG_TRACING_BRANCHES +KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING +endif + +# +# Make the trace clocks available generally: it's infrastructure +# relied on by ptrace for example: +# +obj-y += trace_clock.o + +obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o +obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o + +obj-$(CONFIG_TRACING) += trace.o +obj-$(CONFIG_TRACING) += trace_output.o +obj-$(CONFIG_TRACING) += trace_stat.o +obj-$(CONFIG_TRACING) += trace_printk.o +obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o +obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += 
trace_irqsoff.o +obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_NOP_TRACER) += trace_nop.o +obj-$(CONFIG_STACK_TRACER) += trace_stack.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o +obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o +obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o +obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o +obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o +ifeq ($(CONFIG_BLOCK),y) +obj-$(CONFIG_EVENT_TRACING) += blktrace.o +endif +obj-$(CONFIG_EVENT_TRACING) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o +obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o +endif +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o +obj-$(CONFIG_TRACEPOINTS) += power-traces.o +ifeq ($(CONFIG_TRACING),y) +obj-$(CONFIG_KGDB_KDB) += trace_kdb.o +endif + +libftrace-y := ftrace.o diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c new file mode 100644 index 00000000..6957aa29 --- /dev/null +++ b/kernel/trace/blktrace.c @@ -0,0 +1,1813 @@ +/* + * Copyright (C) 2006 Jens Axboe + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "trace_output.h" + +#ifdef CONFIG_BLK_DEV_IO_TRACE + +static unsigned int blktrace_seq __read_mostly = 1; + +static struct trace_array *blk_tr; +static bool blk_tracer_enabled __read_mostly; + +/* Select an alternative, minimalistic output than the original one */ +#define TRACE_BLK_OPT_CLASSIC 0x1 + +static struct tracer_opt blk_tracer_opts[] = { + /* Default disable the minimalistic output */ + { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, + { } +}; + +static struct tracer_flags blk_tracer_flags = { + .val = 0, + .opts = blk_tracer_opts, +}; + +/* Global reference count of probes */ +static atomic_t blk_probes_ref = ATOMIC_INIT(0); + +static void blk_register_tracepoints(void); +static void blk_unregister_tracepoints(void); + +/* + * Send out a notify message. 
/*
 * Emit one note record for @bt: a struct blk_io_trace header followed by
 * @len bytes copied from @data.
 *
 * @bt:     trace the note belongs to (supplies the device and relay channel)
 * @pid:    pid stored in the record
 * @action: note type stored in the record's action field
 * @data:   payload copied directly behind the header
 * @len:    payload length in bytes
 *
 * The record goes to the ftrace ring buffer of blk_tr when
 * blk_tracer_enabled is set, otherwise to bt->rchan.  If space cannot be
 * reserved, or there is no relay channel, the note is silently dropped.
 * The callers visible in this file invoke it with local interrupts
 * disabled.
 */
static void trace_note(struct blk_trace *bt, pid_t pid, int action,
		       const void *data, size_t len)
{
	struct blk_io_trace *t;
	struct ring_buffer_event *event = NULL;
	struct ring_buffer *buffer = NULL;
	int pc = 0;
	int cpu = smp_processor_id();
	/* Sample the flag once so reserve and commit agree on the path. */
	bool blk_tracer = blk_tracer_enabled;

	if (blk_tracer) {
		/* ftrace path: reserve header + payload in the ring buffer. */
		buffer = blk_tr->buffer;
		pc = preempt_count();
		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
						  sizeof(*t) + len,
						  0, pc);
		if (!event)
			return;
		t = ring_buffer_event_data(event);
		/* Skip the relay-only fields (magic, time) below. */
		goto record_it;
	}

	/* Relay path: nothing to write to without a channel. */
	if (!bt->rchan)
		return;

	t = relay_reserve(bt->rchan, sizeof(*t) + len);
	if (t) {
		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->time = ktime_to_ns(ktime_get());
record_it:
		/* Common tail shared by both paths. */
		t->device = bt->dev;
		t->action = action;
		t->pid = pid;
		t->cpu = cpu;
		t->pdu_len = len;
		/* Payload lives immediately after the fixed-size header. */
		memcpy((void *) t + sizeof(*t), data, len);

		/* Only the ring-buffer reservation needs an explicit commit. */
		if (blk_tracer)
			trace_buffer_unlock_commit(buffer, event, 0, pc);
	}
}
+ */ + if (!(bt->act_mask & BLK_TC_NOTIFY)) + return; + + local_irq_save(flags); + buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); + va_start(args, fmt); + n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); + va_end(args); + + trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(__trace_note_message); + +static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, + pid_t pid) +{ + if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) + return 1; + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) + return 1; + if (bt->pid && pid != bt->pid) + return 1; + + return 0; +} + +/* + * Data direction bit lookup + */ +static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), + BLK_TC_ACT(BLK_TC_WRITE) }; + +#define BLK_TC_RAHEAD BLK_TC_AHEAD + +/* The ilog2() calls fall out because they're constant */ +#define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ + (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) + +/* + * The worker for the various blk_add_trace*() types. Fills out a + * blk_io_trace structure and places it in a per-cpu subbuffer. 
/*
 * Build one blk_io_trace record (plus an optional @pdu_len byte payload)
 * and write it either to the ftrace ring buffer of blk_tr, when
 * blk_tracer_enabled is set, or to the relay channel of @bt.
 *
 * @bt:       trace descriptor of the queue
 * @sector:   sector stored in the record and checked by act_log_check()
 * @bytes:    byte count stored in the record
 * @rw:       request flags; direction and SYNC/RAHEAD/META/DISCARD bits
 *            are folded into the action word
 * @what:     base action word
 * @error:    error value stored in the record
 * @pdu_len:  payload length, 0 for none
 * @pdu_data: payload, copied only when @pdu_len is non-zero
 */
static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
{
	struct task_struct *tsk = current;
	struct ring_buffer_event *event = NULL;
	struct ring_buffer *buffer = NULL;
	struct blk_io_trace *t;
	unsigned long flags = 0;
	unsigned long *sequence;
	pid_t pid;
	int cpu, pc = 0;
	/* Sample the flag once so reserve and commit agree on the path. */
	bool blk_tracer = blk_tracer_enabled;

	/* Nothing to do unless the trace is running or the tracer is on. */
	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
		return;

	/* Fold data direction and request flag bits into the action word. */
	what |= ddir_act[rw & WRITE];
	what |= MASK_TC_BIT(rw, SYNC);
	what |= MASK_TC_BIT(rw, RAHEAD);
	what |= MASK_TC_BIT(rw, META);
	what |= MASK_TC_BIT(rw, DISCARD);

	pid = tsk->pid;
	/* Apply the action-mask / LBA-range / pid filters of this trace. */
	if (act_log_check(bt, what, sector, pid))
		return;
	cpu = raw_smp_processor_id();

	if (blk_tracer) {
		/*
		 * ftrace path: interrupts are not disabled here, so every
		 * exit from this path returns without local_irq_restore().
		 */
		tracing_record_cmdline(current);

		buffer = blk_tr->buffer;
		pc = preempt_count();
		event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
						  sizeof(*t) + pdu_len,
						  0, pc);
		if (!event)
			return;
		t = ring_buffer_event_data(event);
		/* Skip the relay-only fields (magic, sequence, time). */
		goto record_it;
	}

	/*
	 * A word about the locking here - we disable interrupts to reserve
	 * some space in the relay per-cpu buffer, to prevent an irq
	 * from coming in and stepping on our toes.
	 */
	local_irq_save(flags);

	/*
	 * The task's btrace_seq differs from the global blktrace_seq:
	 * emit a process note for it first (trace_note_tsk() also updates
	 * tsk->btrace_seq).
	 */
	if (unlikely(tsk->btrace_seq != blktrace_seq))
		trace_note_tsk(bt, tsk);

	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
	if (t) {
		sequence = per_cpu_ptr(bt->sequence, cpu);

		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
		t->sequence = ++(*sequence);
		t->time = ktime_to_ns(ktime_get());
record_it:
		/*
		 * These two are not needed in ftrace as they are in the
		 * generic trace_entry, filled by tracing_generic_entry_update,
		 * but for the trace_event->bin() synthesizer benefit we do it
		 * here too.
		 */
		t->cpu = cpu;
		t->pid = pid;

		t->sector = sector;
		t->bytes = bytes;
		t->action = what;
		t->device = bt->dev;
		t->error = error;
		t->pdu_len = pdu_len;

		/* Payload lives immediately after the fixed-size header. */
		if (pdu_len)
			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);

		if (blk_tracer) {
			trace_buffer_unlock_commit(buffer, event, 0, pc);
			/* irqs were never saved on this path - see above. */
			return;
		}
	}

	/* Relay path only; also reached when relay_reserve() failed. */
	local_irq_restore(flags);
}
blk_trace *bt; + + if (count >= BLK_TN_MAX_MSG) + return -EINVAL; + + msg = kmalloc(count + 1, GFP_KERNEL); + if (msg == NULL) + return -ENOMEM; + + if (copy_from_user(msg, buffer, count)) { + kfree(msg); + return -EFAULT; + } + + msg[count] = '\0'; + bt = filp->private_data; + __trace_note_message(bt, "%s", msg); + kfree(msg); + + return count; +} + +static const struct file_operations blk_msg_fops = { + .owner = THIS_MODULE, + .open = blk_msg_open, + .write = blk_msg_write, + .llseek = noop_llseek, +}; + +/* + * Keep track of how many times we encountered a full subbuffer, to aid + * the user space app in telling how many lost events there were. + */ +static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + struct blk_trace *bt; + + if (!relay_buf_full(buf)) + return 1; + + bt = buf->chan->private_data; + atomic_inc(&bt->dropped); + return 0; +} + +static int blk_remove_buf_file_callback(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +static struct dentry *blk_create_buf_file_callback(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +static struct rchan_callbacks blk_relay_callbacks = { + .subbuf_start = blk_subbuf_start_callback, + .create_buf_file = blk_create_buf_file_callback, + .remove_buf_file = blk_remove_buf_file_callback, +}; + +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + struct hd_struct *part = NULL; + + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + +/* + * Setup everything required to start tracing + */ +int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + struct 
blk_user_trace_setup *buts) +{ + struct blk_trace *old_bt, *bt = NULL; + struct dentry *dir = NULL; + int ret, i; + + if (!buts->buf_size || !buts->buf_nr) + return -EINVAL; + + strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); + buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; + + /* + * some device names have larger paths - convert the slashes + * to underscores for this to work as expected + */ + for (i = 0; i < strlen(buts->name); i++) + if (buts->name[i] == '/') + buts->name[i] = '_'; + + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + return -ENOMEM; + + ret = -ENOMEM; + bt->sequence = alloc_percpu(unsigned long); + if (!bt->sequence) + goto err; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto err; + + ret = -ENOENT; + + mutex_lock(&blk_tree_mutex); + if (!blk_tree_root) { + blk_tree_root = debugfs_create_dir("block", NULL); + if (!blk_tree_root) { + mutex_unlock(&blk_tree_mutex); + goto err; + } + } + mutex_unlock(&blk_tree_mutex); + + dir = debugfs_create_dir(buts->name, blk_tree_root); + + if (!dir) + goto err; + + bt->dir = dir; + bt->dev = dev; + atomic_set(&bt->dropped, 0); + + ret = -EIO; + bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, + &blk_dropped_fops); + if (!bt->dropped_file) + goto err; + + bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); + if (!bt->msg_file) + goto err; + + bt->rchan = relay_open("trace", dir, buts->buf_size, + buts->buf_nr, &blk_relay_callbacks, bt); + if (!bt->rchan) + goto err; + + bt->act_mask = buts->act_mask; + if (!bt->act_mask) + bt->act_mask = (u16) -1; + + blk_trace_setup_lba(bt, bdev); + + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; + + bt->pid = buts->pid; + bt->trace_state = Blktrace_setup; + + ret = -EBUSY; + old_bt = xchg(&q->blk_trace, bt); + if (old_bt) { + (void) xchg(&q->blk_trace, old_bt); + goto err; + } + + if 
(atomic_inc_return(&blk_probes_ref) == 1) + blk_register_tracepoints(); + + return 0; +err: + blk_trace_free(bt); + return ret; +} + +int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + int ret; + + ret = copy_from_user(&buts, arg, sizeof(buts)); + if (ret) + return -EFAULT; + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts, sizeof(buts))) { + blk_trace_remove(q); + return -EFAULT; + } + return 0; +} +EXPORT_SYMBOL_GPL(blk_trace_setup); + +#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) +static int compat_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + char __user *arg) +{ + struct blk_user_trace_setup buts; + struct compat_blk_user_trace_setup cbuts; + int ret; + + if (copy_from_user(&cbuts, arg, sizeof(cbuts))) + return -EFAULT; + + buts = (struct blk_user_trace_setup) { + .act_mask = cbuts.act_mask, + .buf_size = cbuts.buf_size, + .buf_nr = cbuts.buf_nr, + .start_lba = cbuts.start_lba, + .end_lba = cbuts.end_lba, + .pid = cbuts.pid, + }; + memcpy(&buts.name, &cbuts.name, 32); + + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); + if (ret) + return ret; + + if (copy_to_user(arg, &buts.name, 32)) { + blk_trace_remove(q); + return -EFAULT; + } + + return 0; +} +#endif + +int blk_trace_startstop(struct request_queue *q, int start) +{ + int ret; + struct blk_trace *bt = q->blk_trace; + + if (bt == NULL) + return -EINVAL; + + /* + * For starting a trace, we can transition from a setup or stopped + * trace. 
int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
{
	struct request_queue *q;
	int ret, start = 0;
	char b[BDEVNAME_SIZE];

	/* A device without a request queue cannot be traced. */
	q = bdev_get_queue(bdev);
	if (!q)
		return -ENXIO;

	/* Every command below runs with bdev->bd_mutex held. */
	mutex_lock(&bdev->bd_mutex);

	switch (cmd) {
	case BLKTRACESETUP:
		/* The device name is passed on as the trace name. */
		bdevname(bdev, b);
		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
		break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
	case BLKTRACESETUP32:
		/* Same as above, but @arg uses the compat structure layout. */
		bdevname(bdev, b);
		ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
		break;
#endif
	case BLKTRACESTART:
		start = 1;
		/* fall through - START and STOP differ only in 'start' */
	case BLKTRACESTOP:
		ret = blk_trace_startstop(q, start);
		break;
	case BLKTRACETEARDOWN:
		ret = blk_trace_remove(q);
		break;
	default:
		/* Not a blktrace command. */
		ret = -ENOTTY;
		break;
	}

	mutex_unlock(&bdev->bd_mutex);
	return ret;
}
+ * + **/ +static void blk_add_trace_rq(struct request_queue *q, struct request *rq, + u32 what) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + what |= BLK_TC_ACT(BLK_TC_PC); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, + what, rq->errors, rq->cmd_len, rq->cmd); + } else { + what |= BLK_TC_ACT(BLK_TC_FS); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq->cmd_flags, what, rq->errors, 0, NULL); + } +} + +static void blk_add_trace_rq_abort(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ABORT); +} + +static void blk_add_trace_rq_insert(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_INSERT); +} + +static void blk_add_trace_rq_issue(void *ignore, + struct request_queue *q, struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_ISSUE); +} + +static void blk_add_trace_rq_requeue(void *ignore, + struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); +} + +static void blk_add_trace_rq_complete(void *ignore, + struct request_queue *q, + struct request *rq) +{ + blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); +} + +/** + * blk_add_trace_bio - Add a trace for a bio oriented action + * @q: queue the io is for + * @bio: the source bio + * @what: the action + * @error: error, if any + * + * Description: + * Records an action against a bio. Will log the bio offset + size. 
+ * + **/ +static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, + u32 what, int error) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (!error && !bio_flagged(bio, BIO_UPTODATE)) + error = EIO; + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, + error, 0, NULL); +} + +static void blk_add_trace_bio_bounce(void *ignore, + struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); +} + +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio, + int error) +{ + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); +} + +static void blk_add_trace_bio_backmerge(void *ignore, + struct request_queue *q, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); +} + +static void blk_add_trace_bio_frontmerge(void *ignore, + struct request_queue *q, + struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); +} + +static void blk_add_trace_bio_queue(void *ignore, + struct request_queue *q, struct bio *bio) +{ + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); +} + +static void blk_add_trace_getrq(void *ignore, + struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); + } +} + + +static void blk_add_trace_sleeprq(void *ignore, + struct request_queue *q, + struct bio *bio, int rw) +{ + if (bio) + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); + else { + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, + 0, 0, NULL); + } +} + +static void blk_add_trace_plug(void *ignore, struct request_queue *q) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) + __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); +} + +static void blk_add_trace_unplug(void *ignore, struct request_queue *q, + unsigned int 
depth, bool explicit) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(depth); + u32 what; + + if (explicit) + what = BLK_TA_UNPLUG_IO; + else + what = BLK_TA_UNPLUG_TIMER; + + __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu); + } +} + +static void blk_add_trace_split(void *ignore, + struct request_queue *q, struct bio *bio, + unsigned int pdu) +{ + struct blk_trace *bt = q->blk_trace; + + if (bt) { + __be64 rpdu = cpu_to_be64(pdu); + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), + sizeof(rpdu), &rpdu); + } +} + +/** + * blk_add_trace_bio_remap - Add a trace for a bio-remap operation + * @ignore: trace callback data parameter (not used) + * @q: queue the io is for + * @bio: the source bio + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper or raid target sometimes need to split a bio because + * it spans a stripe (or similar). Add a trace for that action. + * + **/ +static void blk_add_trace_bio_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), + sizeof(r), &r); +} + +/** + * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @ignore: trace callback data parameter (not used) + * @q: queue the io is for + * @rq: the source request + * @dev: target device + * @from: source sector + * + * Description: + * Device mapper remaps request to other devices. + * Add a trace for that action. 
+ * + **/ +static void blk_add_trace_rq_remap(void *ignore, + struct request_queue *q, + struct request *rq, dev_t dev, + sector_t from) +{ + struct blk_trace *bt = q->blk_trace; + struct blk_io_trace_remap r; + + if (likely(!bt)) + return; + + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); + r.sector_from = cpu_to_be64(from); + + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), + rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, + sizeof(r), &r); +} + +/** + * blk_add_driver_data - Add binary message with driver-specific data + * @q: queue the io is for + * @rq: io request + * @data: driver-specific data + * @len: length of driver-specific data + * + * Description: + * Some drivers might want to write driver-specific data per request. + * + **/ +void blk_add_driver_data(struct request_queue *q, + struct request *rq, + void *data, size_t len) +{ + struct blk_trace *bt = q->blk_trace; + + if (likely(!bt)) + return; + + if (rq->cmd_type == REQ_TYPE_BLOCK_PC) + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); + else + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); +} +EXPORT_SYMBOL_GPL(blk_add_driver_data); + +static void blk_register_tracepoints(void) +{ + int ret; + + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + WARN_ON(ret); + ret = 
register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + WARN_ON(ret); + ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); + WARN_ON(ret); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); + WARN_ON(ret); + ret = register_trace_block_plug(blk_add_trace_plug, NULL); + WARN_ON(ret); + ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); + WARN_ON(ret); + ret = register_trace_block_split(blk_add_trace_split, NULL); + WARN_ON(ret); + ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); + WARN_ON(ret); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + WARN_ON(ret); +} + +static void blk_unregister_tracepoints(void) +{ + unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); + unregister_trace_block_split(blk_add_trace_split, NULL); + unregister_trace_block_unplug(blk_add_trace_unplug, NULL); + unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); + unregister_trace_block_getrq(blk_add_trace_getrq, NULL); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + 
unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); + + tracepoint_synchronize_unregister(); +} + +/* + * struct blk_io_tracer formatting routines + */ + +static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) +{ + int i = 0; + int tc = t->action >> BLK_TC_SHIFT; + + if (t->action == BLK_TN_MESSAGE) { + rwbs[i++] = 'N'; + goto out; + } + + if (tc & BLK_TC_DISCARD) + rwbs[i++] = 'D'; + else if (tc & BLK_TC_WRITE) + rwbs[i++] = 'W'; + else if (t->bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (tc & BLK_TC_AHEAD) + rwbs[i++] = 'A'; + if (tc & BLK_TC_BARRIER) + rwbs[i++] = 'B'; + if (tc & BLK_TC_SYNC) + rwbs[i++] = 'S'; + if (tc & BLK_TC_META) + rwbs[i++] = 'M'; +out: + rwbs[i] = '\0'; +} + +static inline +const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) +{ + return (const struct blk_io_trace *)ent; +} + +static inline const void *pdu_start(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent) + 1; +} + +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + +static inline u32 t_sec(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes >> 9; +} + +static inline unsigned long long t_sector(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->sector; +} + +static inline __u16 t_error(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->error; +} + +static __u64 get_pdu_int(const struct trace_entry *ent) +{ + const __u64 *val = pdu_start(ent); + return be64_to_cpu(*val); +} + +static void get_pdu_remap(const struct trace_entry *ent, + struct blk_io_trace_remap *r) +{ + const struct blk_io_trace_remap *__r = pdu_start(ent); + __u64 sector_from = __r->sector_from; + + r->device_from = be32_to_cpu(__r->device_from); + r->device_to = be32_to_cpu(__r->device_to); + r->sector_from = be64_to_cpu(sector_from); +} + 
+typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); /* common signature of the two line-prefix printers below; print_one_line() calls one of them through a pointer of this type */ +/* Classic-format line prefix: device major,minor, cpu, timestamp as seconds.nanoseconds, pid, action name and rwbs flags. Returns the trace_seq_printf() result. */ +static int blk_log_action_classic(struct trace_iterator *iter, const char *act) +{ + char rwbs[6]; /* filled in by fill_rwbs() */ + unsigned long long ts = iter->ts; + unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); /* do_div() leaves the quotient (whole seconds) in ts and returns the remainder */ + unsigned secs = (unsigned long)ts; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + + fill_rwbs(rwbs, t); + + return trace_seq_printf(&iter->seq, + "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", + MAJOR(t->device), MINOR(t->device), iter->cpu, + secs, nsec_rem, iter->ent->pid, act, rwbs); +} +/* Short line prefix: device major,minor, action name and rwbs flags only. Returns the trace_seq_printf() result. */ +static int blk_log_action(struct trace_iterator *iter, const char *act) +{ + char rwbs[6]; + const struct blk_io_trace *t = te_blk_io_trace(iter->ent); + + fill_rwbs(rwbs, t); + return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", + MAJOR(t->device), MINOR(t->device), act, rwbs); +} +/* Hex-dump the pdu_len payload bytes that follow the blk_io_trace header, wrapped in parentheses; a trailing run of zero bytes is cut short with a .. marker. Returns 1 when there is no payload and 0 when a trace_seq write fails, otherwise the result of the closing trace_seq_puts(). */ +static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ + const unsigned char *pdu_buf; + int pdu_len; + int i, end, ret; + + pdu_buf = pdu_start(ent); + pdu_len = te_blk_io_trace(ent)->pdu_len; + + if (!pdu_len) + return 1; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; /* end is now one past the last non-zero byte */ + + if (!trace_seq_putc(s, '(')) + return 0; + + for (i = 0; i < pdu_len; i++) { + + ret = trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + if (!ret) + return ret; + + /* + * stop when the rest is just zeroes and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) + return trace_seq_puts(s, " ..) 
"); + } + + return trace_seq_puts(s, ") "); +} +/* Default body printer. BLK_TC_PC entries get the byte count, the PDU dump and the task name in brackets; other entries get sector + size in 512-byte units and the task name, or only the task name when that size is zero. The task name is looked up from the entry pid with trace_find_cmdline(). A zero return is reported as a partial line by print_one_line(). */ +static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = trace_seq_printf(s, "%u ", t_bytes(ent)); + if (!ret) + return 0; + ret = blk_log_dump_pdu(s, ent); + if (!ret) + return 0; + return trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); + } +} +/* Body printer that ends the line with the entry error value in brackets instead of the task name. BLK_TC_PC entries print the PDU dump first; other entries always print the sector and add the size only when it is non-zero. */ +static int blk_log_with_error(struct trace_seq *s, + const struct trace_entry *ent) +{ + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = blk_log_dump_pdu(s, ent); + if (ret) + return trace_seq_printf(s, "[%d]\n", t_error(ent)); + return 0; + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } +} +/* Body printer for remap entries: sector + size, then the source device as (major,minor) and the source sector, both decoded from the PDU by get_pdu_remap(). */ +static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) +{ + struct blk_io_trace_remap r = { .device_from = 0, }; + + get_pdu_remap(ent, &r); + return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); +} +/* Body printer for plug entries: only the task name in brackets. */ +static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return trace_seq_printf(s, "[%s]\n", cmd); +} +/* Body printer for unplug entries: task name followed by the 64-bit PDU value (the depth stored by blk_add_trace_unplug()). */ +static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); +} +/* Body printer for split entries: sector, the 64-bit PDU value, then the task name. */ +static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) +{ + char cmd[TASK_COMM_LEN]; + + trace_find_cmdline(ent->pid, cmd); + + return 
trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), + get_pdu_int(ent), cmd); +} +/* Body printer for BLK_TN_MESSAGE entries: copy the pdu_len payload bytes that follow the header into the output, then end the line. */ +static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) +{ + int ret; + const struct blk_io_trace *t = te_blk_io_trace(ent); + + ret = trace_seq_putmem(s, t + 1, t->pdu_len); + if (ret) + return trace_seq_putc(s, '\n'); + return ret; +} + +/* + * struct tracer operations + */ +/* print_header callback: emits the column header, and does nothing unless the classic output option is set. */ +static void blk_tracer_print_header(struct seq_file *m) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return; + seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" + "# | | | | | |\n"); +} +/* start callback: sets blk_tracer_enabled. */ +static void blk_tracer_start(struct trace_array *tr) +{ + blk_tracer_enabled = true; +} +/* init callback: records the trace array in blk_tr and starts the tracer. Always returns 0. */ +static int blk_tracer_init(struct trace_array *tr) +{ + blk_tr = tr; + blk_tracer_start(tr); + return 0; +} +/* stop callback: clears blk_tracer_enabled. */ +static void blk_tracer_stop(struct trace_array *tr) +{ + blk_tracer_enabled = false; +} +/* reset callback: same effect as blk_tracer_stop(). */ +static void blk_tracer_reset(struct trace_array *tr) +{ + blk_tracer_stop(tr); +} +/* Per-action output table indexed by the action number: act[0] is the short name, act[1] the name used when TRACE_ITER_VERBOSE is set, print formats the body of the line. */ +static const struct { + const char *act[2]; + int (*print)(struct trace_seq *s, const struct trace_entry *ent); +} what2act[] = { + [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, + [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, + [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, + [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, + [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, + [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, + [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, + [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, + [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, + [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, + [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug }, + [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, + [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, + [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, + [__BLK_TA_REMAP] 
= {{ "A", "remap" }, blk_log_remap }, +}; +/* Format one entry as text: a prefix from blk_log_action_classic() when classic is true, otherwise from blk_log_action(), followed by the body printer selected from what2act[]. Message notes and out-of-range actions are handled separately. Returns TRACE_TYPE_HANDLED, or TRACE_TYPE_PARTIAL_LINE when a printer returned 0. */ +static enum print_line_t print_one_line(struct trace_iterator *iter, + bool classic) +{ + struct trace_seq *s = &iter->seq; + const struct blk_io_trace *t; + u16 what; + int ret; + bool long_act; + blk_log_action_t *log_action; + + t = te_blk_io_trace(iter->ent); + what = t->action & ((1 << BLK_TC_SHIFT) - 1); /* bits below BLK_TC_SHIFT are the what2act[] index */ + long_act = !!(trace_flags & TRACE_ITER_VERBOSE); + log_action = classic ? &blk_log_action_classic : &blk_log_action; + + if (t->action == BLK_TN_MESSAGE) { + ret = log_action(iter, long_act ? "message" : "m"); + if (ret) + ret = blk_log_msg(s, iter->ent); + goto out; + } + + if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) + ret = trace_seq_printf(s, "Unknown action %x\n", what); + else { + ret = log_action(iter, what2act[what].act[long_act]); + if (ret) + ret = what2act[what].print(s, iter->ent); + } +out: + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} +/* Text output callback installed in trace_blk_event_funcs; always uses the short prefix. */ +static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + return print_one_line(iter, false); +} +/* Binary output helper: writes a blk_io_trace record whose fields before sector are rebuilt here (magic with version, time taken from iter->ts, everything else zero), followed by the stored entry from sector through the end of its PDU. Returns 0 when the first write fails, otherwise the result of the second write. */ +static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; + const int offset = offsetof(struct blk_io_trace, sector); + struct blk_io_trace old = { + .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, + .time = iter->ts, + }; + + if (!trace_seq_putmem(s, &old, offset)) + return 0; + return trace_seq_putmem(s, &t->sector, + sizeof(old) - offset + t->pdu_len); +} +/* Binary output callback installed in trace_blk_event_funcs. */ +static enum print_line_t +blk_trace_event_print_binary(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return blk_trace_synthesize_old_trace(iter) ? 
+ TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; +} +/* print_line callback of the blk tracer: formats the entry with the classic prefix when the classic option is set, otherwise returns TRACE_TYPE_UNHANDLED. */ +static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) +{ + if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) + return TRACE_TYPE_UNHANDLED; + + return print_one_line(iter, true); +} +/* set_flag callback: switching the classic option on clears TRACE_ITER_CONTEXT_INFO in trace_flags and switching it off sets it again. Other bits are ignored. Always returns 0. */ +static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} +/* The blk tracer: wires the callbacks above and the blk_tracer_flags option set into one struct tracer. */ +static struct tracer blk_tracer __read_mostly = { + .name = "blk", + .init = blk_tracer_init, + .reset = blk_tracer_reset, + .start = blk_tracer_start, + .stop = blk_tracer_stop, + .print_header = blk_tracer_print_header, + .print_line = blk_tracer_print_line, + .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, +}; +/* Text and binary output callbacks for TRACE_BLK entries. */ +static struct trace_event_functions trace_blk_event_funcs = { + .trace = blk_trace_event_print, + .binary = blk_trace_event_print_binary, +}; +/* Event descriptor that ties the TRACE_BLK entry type to the callbacks above. */ +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .funcs = &trace_blk_event_funcs, +}; +/* Initcall: register the TRACE_BLK event and then the blk tracer, undoing the event registration if the tracer cannot be registered. Returns 0 on success and 1 on failure. */ +static int __init init_blk_tracer(void) +{ + if (!register_ftrace_event(&trace_blk_event)) { + pr_warning("Warning: could not register block events\n"); + return 1; + } + + if (register_tracer(&blk_tracer) != 0) { + pr_warning("Warning: could not register the block tracer\n"); + unregister_ftrace_event(&trace_blk_event); + return 1; + } + + return 0; +} + +device_initcall(init_blk_tracer); +/* Detach the blk_trace from q and free it, unregistering the tracepoints when blk_probes_ref drops to zero. Returns 0, or -EINVAL when q had no trace attached. */ +static int blk_trace_remove_queue(struct request_queue *q) +{ + struct blk_trace *bt; + + bt = xchg(&q->blk_trace, NULL); + if (bt == NULL) + return -EINVAL; + + if (atomic_dec_and_test(&blk_probes_ref)) + blk_unregister_tracepoints(); + + blk_trace_free(bt); + return 0; +} + +/* + * Setup everything required to start tracing: allocate a blk_trace with + * every act_mask bit set and install it on q, registering the tracepoints + * when this is the first user counted in blk_probes_ref. + * Returns 0, -ENOMEM on allocation failure, or -EBUSY when q already has + * a trace attached. + */ +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) +{ + struct blk_trace *old_bt, *bt = NULL; + 
int ret = -ENOMEM; + + bt = kzalloc(sizeof(*bt), GFP_KERNEL); + if (!bt) + return -ENOMEM; + + bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); + if (!bt->msg_data) + goto free_bt; + + bt->dev = bdev->bd_dev; + bt->act_mask = (u16)-1; + + blk_trace_setup_lba(bt, bdev); + + old_bt = xchg(&q->blk_trace, bt); + if (old_bt != NULL) { /* q already had a trace: put it back and fail */ + (void)xchg(&q->blk_trace, old_bt); + ret = -EBUSY; + goto free_bt; + } + + if (atomic_inc_return(&blk_probes_ref) == 1) + blk_register_tracepoints(); + return 0; + +free_bt: + blk_trace_free(bt); + return ret; +} + +/* + * sysfs interface to enable and configure tracing + */ + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf); +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count); +#define BLK_TRACE_DEVICE_ATTR(_name) \ + DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ + sysfs_blk_trace_attr_show, \ + sysfs_blk_trace_attr_store) +/* One attribute per tunable; all five share the show and store handlers declared above. */ +static BLK_TRACE_DEVICE_ATTR(enable); +static BLK_TRACE_DEVICE_ATTR(act_mask); +static BLK_TRACE_DEVICE_ATTR(pid); +static BLK_TRACE_DEVICE_ATTR(start_lba); +static BLK_TRACE_DEVICE_ATTR(end_lba); +/* NULL-terminated attribute list for the group below. */ +static struct attribute *blk_trace_attrs[] = { + &dev_attr_enable.attr, + &dev_attr_act_mask.attr, + &dev_attr_pid.attr, + &dev_attr_start_lba.attr, + &dev_attr_end_lba.attr, + NULL +}; +/* Attribute group named trace; created and removed by blk_trace_init_sysfs() and blk_trace_remove_sysfs(). */ +struct attribute_group blk_trace_attr_group = { + .name = "trace", + .attrs = blk_trace_attrs, +}; +/* Action category bits and the names that blk_trace_str2mask() accepts and blk_trace_mask2str() prints for them. */ +static const struct { + int mask; + const char *str; +} mask_maps[] = { + { BLK_TC_READ, "read" }, + { BLK_TC_WRITE, "write" }, + { BLK_TC_BARRIER, "barrier" }, + { BLK_TC_SYNC, "sync" }, + { BLK_TC_QUEUE, "queue" }, + { BLK_TC_REQUEUE, "requeue" }, + { BLK_TC_ISSUE, "issue" }, + { BLK_TC_COMPLETE, "complete" }, + { BLK_TC_FS, "fs" }, + { BLK_TC_PC, "pc" }, + { BLK_TC_AHEAD, "ahead" }, + { BLK_TC_META, "meta" }, + { BLK_TC_DISCARD, "discard" }, + { BLK_TC_DRV_DATA, "drv_data" }, +}; + 
+static int blk_trace_str2mask(const char *str) +{ + int i; + int mask = 0; + char *buf, *s, *token; + + buf = kstrdup(str, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + s = strstrip(buf); + + while (1) { + token = strsep(&s, ","); + if (token == NULL) + break; + + if (*token == '\0') + continue; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (strcasecmp(token, mask_maps[i].str) == 0) { + mask |= mask_maps[i].mask; + break; + } + } + if (i == ARRAY_SIZE(mask_maps)) { + mask = -EINVAL; + break; + } + } + kfree(buf); + + return mask; +} + +static ssize_t blk_trace_mask2str(char *buf, int mask) +{ + int i; + char *p = buf; + + for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { + if (mask & mask_maps[i].mask) { + p += sprintf(p, "%s%s", + (p == buf) ? "" : ",", mask_maps[i].str); + } + } + *p++ = '\n'; + + return p - buf; +} + +static struct request_queue *blk_trace_get_queue(struct block_device *bdev) +{ + if (bdev->bd_disk == NULL) + return NULL; + + return bdev_get_queue(bdev); +} + +static ssize_t sysfs_blk_trace_attr_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q; + struct block_device *bdev; + ssize_t ret = -ENXIO; + + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out; + + q = blk_trace_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + ret = sprintf(buf, "%u\n", !!q->blk_trace); + goto out_unlock_bdev; + } + + if (q->blk_trace == NULL) + ret = sprintf(buf, "disabled\n"); + else if (attr == &dev_attr_act_mask) + ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); + else if (attr == &dev_attr_pid) + ret = sprintf(buf, "%u\n", q->blk_trace->pid); + else if (attr == &dev_attr_start_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); + else if (attr == &dev_attr_end_lba) + ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); + +out_unlock_bdev: + 
mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out: + return ret; +} + +static ssize_t sysfs_blk_trace_attr_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct block_device *bdev; + struct request_queue *q; + struct hd_struct *p; + u64 value; + ssize_t ret = -EINVAL; + + if (count == 0) + goto out; + + if (attr == &dev_attr_act_mask) { + if (sscanf(buf, "%llx", &value) != 1) { + /* Assume it is a list of trace category names */ + ret = blk_trace_str2mask(buf); + if (ret < 0) + goto out; + value = ret; + } + } else if (sscanf(buf, "%llu", &value) != 1) + goto out; + + ret = -ENXIO; + + p = dev_to_part(dev); + bdev = bdget(part_devt(p)); + if (bdev == NULL) + goto out; + + q = blk_trace_get_queue(bdev); + if (q == NULL) + goto out_bdput; + + mutex_lock(&bdev->bd_mutex); + + if (attr == &dev_attr_enable) { + if (value) + ret = blk_trace_setup_queue(q, bdev); + else + ret = blk_trace_remove_queue(q); + goto out_unlock_bdev; + } + + ret = 0; + if (q->blk_trace == NULL) + ret = blk_trace_setup_queue(q, bdev); + + if (ret == 0) { + if (attr == &dev_attr_act_mask) + q->blk_trace->act_mask = value; + else if (attr == &dev_attr_pid) + q->blk_trace->pid = value; + else if (attr == &dev_attr_start_lba) + q->blk_trace->start_lba = value; + else if (attr == &dev_attr_end_lba) + q->blk_trace->end_lba = value; + } + +out_unlock_bdev: + mutex_unlock(&bdev->bd_mutex); +out_bdput: + bdput(bdev); +out: + return ret ? 
ret : count; +} + +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + +void blk_trace_remove_sysfs(struct device *dev) +{ + sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); +} + +#endif /* CONFIG_BLK_DEV_IO_TRACE */ + +#ifdef CONFIG_EVENT_TRACING + +void blk_dump_cmd(char *buf, struct request *rq) +{ + int i, end; + int len = rq->cmd_len; + unsigned char *cmd = rq->cmd; + + if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { + buf[0] = '\0'; + return; + } + + for (end = len - 1; end >= 0; end--) + if (cmd[end]) + break; + end++; + + for (i = 0; i < len; i++) { + buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); + if (i == end && end != len - 1) { + sprintf(buf, " .."); + break; + } + } +} + +void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) +{ + int i = 0; + + if (rw & WRITE) + rwbs[i++] = 'W'; + else if (rw & REQ_DISCARD) + rwbs[i++] = 'D'; + else if (bytes) + rwbs[i++] = 'R'; + else + rwbs[i++] = 'N'; + + if (rw & REQ_RAHEAD) + rwbs[i++] = 'A'; + if (rw & REQ_SYNC) + rwbs[i++] = 'S'; + if (rw & REQ_META) + rwbs[i++] = 'M'; + if (rw & REQ_SECURE) + rwbs[i++] = 'E'; + + rwbs[i] = '\0'; +} + +#endif /* CONFIG_EVENT_TRACING */ + diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 00000000..9f8e2e11 --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,4214 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. 
+ * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "trace_output.h" +#include "trace_stat.h" + +#define FTRACE_WARN_ON(cond) \ + ({ \ + int ___r = cond; \ + if (WARN_ON(___r)) \ + ftrace_kill(); \ + ___r; \ + }) + +#define FTRACE_WARN_ON_ONCE(cond) \ + ({ \ + int ___r = cond; \ + if (WARN_ON_ONCE(___r)) \ + ftrace_kill(); \ + ___r; \ + }) + +/* hash bits for specific function selection */ +#define FTRACE_HASH_BITS 7 +#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS) +#define FTRACE_HASH_DEFAULT_BITS 10 +#define FTRACE_HASH_MAX_BITS 12 + +/* ftrace_enabled is a method to turn ftrace on or off */ +int ftrace_enabled __read_mostly; +static int last_ftrace_enabled; + +/* Quick disabling of function tracer. */ +int function_trace_stop; + +/* List for set_ftrace_pid's pids. */ +LIST_HEAD(ftrace_pids); +struct ftrace_pid { + struct list_head list; + struct pid *pid; +}; + +/* + * ftrace_disabled is set when an anomaly is discovered. + * ftrace_disabled is much stronger than ftrace_enabled. 
+ */ +static int ftrace_disabled __read_mostly; + +static DEFINE_MUTEX(ftrace_lock); + +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end; +static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; +ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; +static struct ftrace_ops global_ops; + +static void +ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip); + +/* + * Traverse the ftrace_global_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_global_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ +static void ftrace_global_list_func(unsigned long ip, + unsigned long parent_ip) +{ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) + return; + + trace_recursion_set(TRACE_GLOBAL_BIT); + op = rcu_dereference_raw(ftrace_global_list); /*see above*/ + while (op != &ftrace_list_end) { + op->func(ip, parent_ip); + op = rcu_dereference_raw(op->next); /*see above*/ + }; + trace_recursion_clear(TRACE_GLOBAL_BIT); +} + +static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip) +{ + if (!test_tsk_trace_trace(current)) + return; + + ftrace_pid_function(ip, parent_ip); +} + +static void set_ftrace_pid_function(ftrace_func_t func) +{ + /* do not set ftrace_pid_function to itself! 
*/ + if (func != ftrace_pid_func) + ftrace_pid_function = func; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; + __ftrace_trace_function = ftrace_stub; + ftrace_pid_function = ftrace_stub; +} + +#ifndef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST +/* + * For those archs that do not test ftrace_trace_stop in their + * mcount call site, we need to do it from C. + */ +static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip) +{ + if (function_trace_stop) + return; + + __ftrace_trace_function(ip, parent_ip); +} +#endif + +static void update_global_ops(void) +{ + ftrace_func_t func; + + /* + * If there's only one function registered, then call that + * function directly. Otherwise, we need to iterate over the + * registered callers. + */ + if (ftrace_global_list == &ftrace_list_end || + ftrace_global_list->next == &ftrace_list_end) + func = ftrace_global_list->func; + else + func = ftrace_global_list_func; + + /* If we filter on pids, update to use the pid function */ + if (!list_empty(&ftrace_pids)) { + set_ftrace_pid_function(func); + func = ftrace_pid_func; + } + + global_ops.func = func; +} + +static void update_ftrace_function(void) +{ + ftrace_func_t func; + + update_global_ops(); + + /* + * If we are at the end of the list and this ops is + * not dynamic, then have the mcount trampoline call + * the function directly + */ + if (ftrace_ops_list == &ftrace_list_end || + (ftrace_ops_list->next == &ftrace_list_end && + !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC))) + func = ftrace_ops_list->func; + else + func = ftrace_ops_list_func; + +#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST + ftrace_trace_function = func; +#else + __ftrace_trace_function = func; + ftrace_trace_function = ftrace_test_stop_func; +#endif +} + +static void add_ftrace_ops(struct ftrace_ops 
**list, struct ftrace_ops *ops) +{ + ops->next = *list; + /* + * We are entering ops into the list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the list. + */ + rcu_assign_pointer(*list, ops); +} + +static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) +{ + struct ftrace_ops **p; + + /* + * If we are removing the last function, then simply point + * to the ftrace_stub. + */ + if (*list == ops && ops->next == &ftrace_list_end) { + *list = &ftrace_list_end; + return 0; + } + + for (p = list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) + return -1; + + *p = (*p)->next; + return 0; +} + +static int __register_ftrace_function(struct ftrace_ops *ops) +{ + if (ftrace_disabled) + return -ENODEV; + + if (FTRACE_WARN_ON(ops == &global_ops)) + return -EINVAL; + + if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED)) + return -EBUSY; + + if (!core_kernel_data((unsigned long)ops)) + ops->flags |= FTRACE_OPS_FL_DYNAMIC; + + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + int first = ftrace_global_list == &ftrace_list_end; + add_ftrace_ops(&ftrace_global_list, ops); + ops->flags |= FTRACE_OPS_FL_ENABLED; + if (first) + add_ftrace_ops(&ftrace_ops_list, &global_ops); + } else + add_ftrace_ops(&ftrace_ops_list, ops); + + if (ftrace_enabled) + update_ftrace_function(); + + return 0; +} + +static int __unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + if (ftrace_disabled) + return -ENODEV; + + if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) + return -EBUSY; + + if (FTRACE_WARN_ON(ops == &global_ops)) + return -EINVAL; + + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ret = remove_ftrace_ops(&ftrace_global_list, ops); + if (!ret && ftrace_global_list == &ftrace_list_end) + ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops); + if (!ret) + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + } else + ret = 
remove_ftrace_ops(&ftrace_ops_list, ops); + + if (ret < 0) + return ret; + + if (ftrace_enabled) + update_ftrace_function(); + + /* + * Dynamic ops may be freed, we must make sure that all + * callers are done before leaving this function. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC) + synchronize_sched(); + + return 0; +} + +static void ftrace_update_pid_func(void) +{ + /* Only do something if we are tracing something */ + if (ftrace_trace_function == ftrace_stub) + return; + + update_ftrace_function(); +} + +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; + unsigned long long time_squared; +#endif +}; + +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; +}; + +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +static int ftrace_profile_bits __read_mostly; +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ +static DEFINE_MUTEX(ftrace_profile_lock); + +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static void * +function_stat_next(void *v, int idx) +{ + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; + + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + + again: + if (idx != 0) + rec++; + + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + if 
(!rec->counter) + goto again; + } + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if (a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} +#endif + +static int function_stat_headers(struct seq_file *m) +{ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " Function " + "Hit Time Avg s^2\n" + " -------- " + "--- ---- --- ---\n"); +#else + seq_printf(m, " Function Hit\n" + " -------- ---\n"); +#endif + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_profile *rec = v; + char str[KSYM_SYMBOL_LEN]; + int ret = 0; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static struct trace_seq s; + unsigned long long avg; + unsigned long long stddev; +#endif + mutex_lock(&ftrace_profile_lock); + + /* we raced with function_profile_reset() */ + if (unlikely(rec->counter == 0)) { + ret = -EBUSY; + goto out; + } + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " "); + avg = rec->time; + do_div(avg, rec->counter); + + /* Sample standard deviation (s^2) */ + if (rec->counter <= 1) + stddev = 0; + else { + stddev = rec->time_squared - rec->counter * avg * avg; + /* + * Divide 
only 1000 for ns^2 -> us^2 conversion. + * trace_print_graph_duration will divide 1000 again. + */ + do_div(stddev, (rec->counter - 1) * 1000); + } + + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(stddev, &s); + trace_print_seq(m, &s); +#endif + seq_putc(m, '\n'); +out: + mutex_unlock(&ftrace_profile_lock); + + return ret; +} + +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + + pg = stat->pages = stat->start; + + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; + } + + memset(stat->hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} + +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) +{ + struct ftrace_profile_page *pg; + int functions; + int pages; + int i; + + /* If we already allocated, do nothing */ + if (stat->pages) + return 0; + + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) + return -ENOMEM; + +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. 
+ */ + functions = 20000; +#endif + + pg = stat->start = stat->pages; + + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 0; i < pages; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + if (!pg->next) + goto out_free; + pg = pg->next; + } + + return 0; + + out_free: + /* + * stat->pages still aliases stat->start here, so walking the list + * from stat->start releases every page including the first one. + * Do not free stat->pages again afterwards: that is a double free. + */ + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; +} + +static int ftrace_profile_init_cpu(int cpu) +{ + struct ftrace_profile_stat *stat; + int size; + + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(stat); + return 0; + } + + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; + + stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!stat->hash) + return -ENOMEM; + + if (!ftrace_profile_bits) { + size--; + + for (; size; size >>= 1) + ftrace_profile_bits++; + } + + /* Preallocate the function profiling pages */ + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; + return -ENOMEM; + } + + return 0; +} + +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_online_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + +/* interrupts must be disabled */ +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &stat->hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + return rec; + } + + return 
NULL; +} + +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); +} + +/* + * The memory is already allocated, this simply finds a new record to use. + */ +static struct ftrace_profile * +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion (from NMIs) */ + if (atomic_inc_return(&stat->disabled) != 1) + goto out; + + /* + * Try to find the function again since an NMI + * could have added it + */ + rec = ftrace_find_profiled_func(stat, ip); + if (rec) + goto out; + + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; + } + + rec = &stat->pages->records[stat->pages->index++]; + rec->ip = ip; + ftrace_add_profile(stat, rec); + + out: + atomic_dec(&stat->disabled); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_profile_stat *stat; + struct ftrace_profile *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash || !ftrace_profile_enabled) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); + if (!rec) { + rec = ftrace_profile_alloc(stat, ip); + if (!rec) + goto out; + } + + rec->counter++; + out: + local_irq_restore(flags); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + function_profile_call(trace->func, 0); + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + struct ftrace_profile_stat *stat; + unsigned long long calltime; + struct ftrace_profile *rec; + unsigned long flags; + + local_irq_save(flags); + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash || 
!ftrace_profile_enabled) + goto out; + + /* If the calltime was zero'd ignore it */ + if (!trace->calltime) + goto out; + + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + + rec = ftrace_find_profiled_func(stat, trace->func); + if (rec) { + rec->time += calltime; + rec->time_squared += calltime * calltime; + } + + out: + local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&profile_graph_return, + &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(); +} +#else +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; /* big enough to hold a number */ + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + + ret = register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 
0; + /* + * unregister_ftrace_profiler calls stop_machine + * so this acts like an synchronize_sched. + */ + unregister_ftrace_profiler(); + } + } + out: + mutex_unlock(&ftrace_profile_lock); + + *ppos += cnt; + + return cnt; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; /* big enough to hold a number */ + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, + .llseek = default_llseek, +}; + +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct ftrace_profile_stat *stat; + struct dentry *entry; + char *name; + int ret; + int cpu; + + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + /* allocate enough for function name + cpu number */ + name = kmalloc(32, GFP_KERNEL); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. 
+ */ + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + snprintf(name, 32, "function%d", cpu); + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + +static struct pid * const ftrace_swapper_pid = &init_struct_pid; + +#ifdef CONFIG_DYNAMIC_FTRACE + +#ifndef CONFIG_FTRACE_MCOUNT_RECORD +# error Dynamic ftrace depends on MCOUNT_RECORD +#endif + +static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; + +struct ftrace_func_probe { + struct hlist_node node; + struct ftrace_probe_ops *ops; + unsigned long flags; + unsigned long ip; + void *data; + struct rcu_head rcu; +}; + +enum { + FTRACE_UPDATE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), +}; +struct ftrace_func_entry { + struct hlist_node hlist; + unsigned long ip; +}; + +struct ftrace_hash { + unsigned long size_bits; + struct hlist_head *buckets; + unsigned long count; + struct rcu_head rcu; +}; + +/* + * We make these constant because no one should touch them, + * but they are used as the default "empty hash", to avoid allocating + * it all the time. These are in a read only section such that if + * anyone does try to modify it, it will cause an exception. 
+ */ +static const struct hlist_head empty_buckets[1]; +static const struct ftrace_hash empty_hash = { + .buckets = (struct hlist_head *)empty_buckets, +}; +#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) + +static struct ftrace_ops global_ops = { + .func = ftrace_stub, + .notrace_hash = EMPTY_HASH, + .filter_hash = EMPTY_HASH, +}; + +static struct dyn_ftrace *ftrace_new_addrs; + +static DEFINE_MUTEX(ftrace_regex_lock); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct dyn_ftrace records[]; +}; + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +static struct dyn_ftrace *ftrace_free_records; + +static struct ftrace_func_entry * +ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip) +{ + unsigned long key; + struct ftrace_func_entry *entry; + struct hlist_head *hhd; + struct hlist_node *n; + + if (!hash->count) + return NULL; + + if (hash->size_bits > 0) + key = hash_long(ip, hash->size_bits); + else + key = 0; + + hhd = &hash->buckets[key]; + + hlist_for_each_entry_rcu(entry, n, hhd, hlist) { + if (entry->ip == ip) + return entry; + } + return NULL; +} + +static void __add_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + struct hlist_head *hhd; + unsigned long key; + + if (hash->size_bits) + key = hash_long(entry->ip, hash->size_bits); + else + key = 0; + + hhd = &hash->buckets[key]; + hlist_add_head(&entry->hlist, hhd); + hash->count++; +} + +static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip) +{ + struct ftrace_func_entry *entry; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + return -ENOMEM; + + entry->ip = ip; + __add_hash_entry(hash, entry); + + return 0; +} + +static void +free_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry 
*entry) +{ + hlist_del(&entry->hlist); + kfree(entry); + hash->count--; +} + +static void +remove_hash_entry(struct ftrace_hash *hash, + struct ftrace_func_entry *entry) +{ + hlist_del(&entry->hlist); + hash->count--; +} + +static void ftrace_hash_clear(struct ftrace_hash *hash) +{ + struct hlist_head *hhd; + struct hlist_node *tp, *tn; + struct ftrace_func_entry *entry; + int size = 1 << hash->size_bits; + int i; + + if (!hash->count) + return; + + for (i = 0; i < size; i++) { + hhd = &hash->buckets[i]; + hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) + free_hash_entry(hash, entry); + } + FTRACE_WARN_ON(hash->count); +} + +static void free_ftrace_hash(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + ftrace_hash_clear(hash); + kfree(hash->buckets); + kfree(hash); +} + +static void __free_ftrace_hash_rcu(struct rcu_head *rcu) +{ + struct ftrace_hash *hash; + + hash = container_of(rcu, struct ftrace_hash, rcu); + free_ftrace_hash(hash); +} + +static void free_ftrace_hash_rcu(struct ftrace_hash *hash) +{ + if (!hash || hash == EMPTY_HASH) + return; + call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu); +} + +static struct ftrace_hash *alloc_ftrace_hash(int size_bits) +{ + struct ftrace_hash *hash; + int size; + + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return NULL; + + size = 1 << size_bits; + hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL); + + if (!hash->buckets) { + kfree(hash); + return NULL; + } + + hash->size_bits = size_bits; + + return hash; +} + +static struct ftrace_hash * +alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash) +{ + struct ftrace_func_entry *entry; + struct ftrace_hash *new_hash; + struct hlist_node *tp; + int size; + int ret; + int i; + + new_hash = alloc_ftrace_hash(size_bits); + if (!new_hash) + return NULL; + + /* Empty hash? 
*/ + if (!hash || !hash->count) + return new_hash; + + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) { + ret = add_hash_entry(new_hash, entry->ip); + if (ret < 0) + goto free_hash; + } + } + + FTRACE_WARN_ON(new_hash->count != hash->count); + + return new_hash; + + free_hash: + free_ftrace_hash(new_hash); + return NULL; +} + +static void +ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); +static void +ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); + +static int +ftrace_hash_move(struct ftrace_ops *ops, int enable, + struct ftrace_hash **dst, struct ftrace_hash *src) +{ + struct ftrace_func_entry *entry; + struct hlist_node *tp, *tn; + struct hlist_head *hhd; + struct ftrace_hash *old_hash; + struct ftrace_hash *new_hash; + unsigned long key; + int size = src->count; + int bits = 0; + int ret; + int i; + + /* + * Remove the current set, update the hash and add + * them back. + */ + ftrace_hash_rec_disable(ops, enable); + + /* + * If the new source is empty, just free dst and assign it + * the empty_hash. 
+ */ + if (!src->count) { + free_ftrace_hash_rcu(*dst); + rcu_assign_pointer(*dst, EMPTY_HASH); + /* + * The records were disabled above against the old hash; + * they still need to be re-enabled against the (now empty) + * hash, otherwise the record ref counts are left unbalanced. + */ + ret = 0; + goto out; + } + + /* + * Make the hash size about 1/2 the # found + */ + for (size /= 2; size; size >>= 1) + bits++; + + /* Don't allocate too much */ + if (bits > FTRACE_HASH_MAX_BITS) + bits = FTRACE_HASH_MAX_BITS; + + ret = -ENOMEM; + new_hash = alloc_ftrace_hash(bits); + if (!new_hash) + goto out; + + size = 1 << src->size_bits; + for (i = 0; i < size; i++) { + hhd = &src->buckets[i]; + hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) { + if (bits > 0) + key = hash_long(entry->ip, bits); + else + key = 0; + remove_hash_entry(src, entry); + __add_hash_entry(new_hash, entry); + } + } + + old_hash = *dst; + rcu_assign_pointer(*dst, new_hash); + free_ftrace_hash_rcu(old_hash); + + ret = 0; + out: + /* + * Enable regardless of ret: + * On success, we enable the new hash. + * On failure, we re-enable the original hash. + */ + ftrace_hash_rec_enable(ops, enable); + + return ret; +} + +/* + * Test the hashes for this ops to see if we want to call + * the ops->func or not. + * + * It's a match if the ip is in the ops->filter_hash or + * the filter_hash does not exist or is empty, + * AND + * the ip is not in the ops->notrace_hash. + * + * This needs to be called with preemption disabled as + * the hashes are freed with call_rcu_sched(). + */ +static int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +{ + struct ftrace_hash *filter_hash; + struct ftrace_hash *notrace_hash; + int ret; + + filter_hash = rcu_dereference_raw(ops->filter_hash); + notrace_hash = rcu_dereference_raw(ops->notrace_hash); + + if ((!filter_hash || !filter_hash->count || + ftrace_lookup_ip(filter_hash, ip)) && + (!notrace_hash || !notrace_hash->count || + !ftrace_lookup_ip(notrace_hash, ip))) + ret = 1; + else + ret = 0; + + return ret; +} + +/* + * This is a double for. Do not use 'break' to break out of the loop, + * you must use a goto. 
+ */ +#define do_for_each_ftrace_rec(pg, rec) \ + for (pg = ftrace_pages_start; pg; pg = pg->next) { \ + int _____i; \ + for (_____i = 0; _____i < pg->index; _____i++) { \ + rec = &pg->records[_____i]; + +#define while_for_each_ftrace_rec() \ + } \ + } + +static void __ftrace_hash_rec_update(struct ftrace_ops *ops, + int filter_hash, + bool inc) +{ + struct ftrace_hash *hash; + struct ftrace_hash *other_hash; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int count = 0; + int all = 0; + + /* Only update if the ops has been registered */ + if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) + return; + + /* + * In the filter_hash case: + * If the count is zero, we update all records. + * Otherwise we just update the items in the hash. + * + * In the notrace_hash case: + * We enable the update in the hash. + * As disabling notrace means enabling the tracing, + * and enabling notrace means disabling, the inc variable + * gets inversed. + */ + if (filter_hash) { + hash = ops->filter_hash; + other_hash = ops->notrace_hash; + if (!hash || !hash->count) + all = 1; + } else { + inc = !inc; + hash = ops->notrace_hash; + other_hash = ops->filter_hash; + /* + * If the notrace hash has no items, + * then there's nothing to do. + */ + if (hash && !hash->count) + return; + } + + do_for_each_ftrace_rec(pg, rec) { + int in_other_hash = 0; + int in_hash = 0; + int match = 0; + + if (all) { + /* + * Only the filter_hash affects all records. + * Update if the record is not in the notrace hash. 
+ */ + if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip)) + match = 1; + } else { + in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip); + in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip); + + /* + * + */ + if (filter_hash && in_hash && !in_other_hash) + match = 1; + else if (!filter_hash && in_hash && + (in_other_hash || !other_hash->count)) + match = 1; + } + if (!match) + continue; + + if (inc) { + rec->flags++; + if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX)) + return; + } else { + if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0)) + return; + rec->flags--; + } + count++; + /* Shortcut, if we handled all records, we are done. */ + if (!all && count == hash->count) + return; + } while_for_each_ftrace_rec(); +} + +static void ftrace_hash_rec_disable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 0); +} + +static void ftrace_hash_rec_enable(struct ftrace_ops *ops, + int filter_hash) +{ + __ftrace_hash_rec_update(ops, filter_hash, 1); +} + +static void ftrace_free_rec(struct dyn_ftrace *rec) +{ + rec->freelist = ftrace_free_records; + ftrace_free_records = rec; + rec->flags |= FTRACE_FL_FREE; +} + +static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) +{ + struct dyn_ftrace *rec; + + /* First check for freed records */ + if (ftrace_free_records) { + rec = ftrace_free_records; + + if (unlikely(!(rec->flags & FTRACE_FL_FREE))) { + FTRACE_WARN_ON_ONCE(1); + ftrace_free_records = NULL; + return NULL; + } + + ftrace_free_records = rec->freelist; + memset(rec, 0, sizeof(*rec)); + return rec; + } + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) { + /* allocate another page */ + ftrace_pages->next = + (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages->next) + return NULL; + } + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + +static struct dyn_ftrace * 
+ftrace_record_ip(unsigned long ip) +{ + struct dyn_ftrace *rec; + + if (ftrace_disabled) + return NULL; + + rec = ftrace_alloc_dyn_node(ip); + if (!rec) + return NULL; + + rec->ip = ip; + rec->newlist = ftrace_new_addrs; + ftrace_new_addrs = rec; + + return rec; +} + +static void print_ip_ins(const char *fmt, unsigned char *p) +{ + int i; + + printk(KERN_CONT "%s", fmt); + + for (i = 0; i < MCOUNT_INSN_SIZE; i++) + printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); +} + +static void ftrace_bug(int failed, unsigned long ip) +{ + switch (failed) { + case -EFAULT: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on modifying "); + print_ip_sym(ip); + break; + case -EINVAL: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace failed to modify "); + print_ip_sym(ip); + print_ip_ins(" actual: ", (unsigned char *)ip); + printk(KERN_CONT "\n"); + break; + case -EPERM: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on writing "); + print_ip_sym(ip); + break; + default: + FTRACE_WARN_ON_ONCE(1); + pr_info("ftrace faulted on unknown error "); + print_ip_sym(ip); + } +} + + +/* Return 1 if the address range is reserved for ftrace */ +int ftrace_text_reserved(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + if (rec->ip <= (unsigned long)end && + rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start) + return 1; + } while_for_each_ftrace_rec(); + return 0; +} + + +static int +__ftrace_replace_code(struct dyn_ftrace *rec, int update) +{ + unsigned long ftrace_addr; + unsigned long flag = 0UL; + + ftrace_addr = (unsigned long)FTRACE_ADDR; + + /* + * If we are updating calls: + * + * If the record has a ref count, then we need to enable it + * because someone is using it. + * + * Otherwise we make sure its disabled. + * + * If we are disabling calls, then disable all records that + * are enabled. 
+ */ + if (update && (rec->flags & ~FTRACE_FL_MASK)) + flag = FTRACE_FL_ENABLED; + + /* If the state of this record hasn't changed, then do nothing */ + if ((rec->flags & FTRACE_FL_ENABLED) == flag) + return 0; + + if (flag) { + rec->flags |= FTRACE_FL_ENABLED; + return ftrace_make_call(rec, ftrace_addr); + } + + rec->flags &= ~FTRACE_FL_ENABLED; + return ftrace_make_nop(NULL, rec, ftrace_addr); +} + +static void ftrace_replace_code(int update) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int failed; + + if (unlikely(ftrace_disabled)) + return; + + do_for_each_ftrace_rec(pg, rec) { + /* Skip over free records */ + if (rec->flags & FTRACE_FL_FREE) + continue; + + failed = __ftrace_replace_code(rec, update); + if (failed) { + ftrace_bug(failed, rec->ip); + /* Stop processing */ + return; + } + } while_for_each_ftrace_rec(); +} + +static int +ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) +{ + unsigned long ip; + int ret; + + ip = rec->ip; + + if (unlikely(ftrace_disabled)) + return 0; + + ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); + if (ret) { + ftrace_bug(ret, ip); + return 0; + } + return 1; +} + +/* + * archs can override this function if they must do something + * before the modifying code is performed. + */ +int __weak ftrace_arch_code_modify_prepare(void) +{ + return 0; +} + +/* + * archs can override this function if they must do something + * after the modifying code is performed. 
+ */ +int __weak ftrace_arch_code_modify_post_process(void) +{ + return 0; +} + +static int __ftrace_modify_code(void *data) +{ + int *command = data; + + if (*command & FTRACE_UPDATE_CALLS) + ftrace_replace_code(1); + else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_START_FUNC_RET) + ftrace_enable_ftrace_graph_caller(); + else if (*command & FTRACE_STOP_FUNC_RET) + ftrace_disable_ftrace_graph_caller(); + + return 0; +} + +static void ftrace_run_update_code(int command) +{ + int ret; + + ret = ftrace_arch_code_modify_prepare(); + FTRACE_WARN_ON(ret); + if (ret) + return; + + stop_machine(__ftrace_modify_code, &command, NULL); + + ret = ftrace_arch_code_modify_post_process(); + FTRACE_WARN_ON(ret); +} + +static ftrace_func_t saved_ftrace_func; +static int ftrace_start_up; +static int global_start_up; + +static void ftrace_startup_enable(int command) +{ + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + return; + + ftrace_run_update_code(command); +} + +static int ftrace_startup(struct ftrace_ops *ops, int command) +{ + bool hash_enable = true; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + ftrace_start_up++; + command |= FTRACE_UPDATE_CALLS; + + /* ops marked global share the filter hashes */ + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ops = &global_ops; + /* Don't update hash if global is already set */ + if (global_start_up) + hash_enable = false; + global_start_up++; + } + + ops->flags |= FTRACE_OPS_FL_ENABLED; + if (hash_enable) + ftrace_hash_rec_enable(ops, 1); + + ftrace_startup_enable(command); + + return 0; +} + +static void ftrace_shutdown(struct ftrace_ops *ops, int command) +{ + bool hash_disable = true; + + if (unlikely(ftrace_disabled)) + return; + + ftrace_start_up--; + /* + * 
Just warn in case of unbalance, no need to kill ftrace, it's not + * critical but the ftrace_call callers may be never nopped again after + * further ftrace uses. + */ + WARN_ON_ONCE(ftrace_start_up < 0); + + if (ops->flags & FTRACE_OPS_FL_GLOBAL) { + ops = &global_ops; + global_start_up--; + WARN_ON_ONCE(global_start_up < 0); + /* Don't update hash if global still has users */ + if (global_start_up) { + WARN_ON_ONCE(!ftrace_start_up); + hash_disable = false; + } + } + + if (hash_disable) + ftrace_hash_rec_disable(ops, 1); + + if (ops != &global_ops || !global_start_up) + ops->flags &= ~FTRACE_OPS_FL_ENABLED; + + command |= FTRACE_UPDATE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) + return; + + ftrace_run_update_code(command); +} + +static void ftrace_startup_sysctl(void) +{ + if (unlikely(ftrace_disabled)) + return; + + /* Force update next time */ + saved_ftrace_func = NULL; + /* ftrace_start_up is true if we want ftrace running */ + if (ftrace_start_up) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); +} + +static void ftrace_shutdown_sysctl(void) +{ + if (unlikely(ftrace_disabled)) + return; + + /* ftrace_start_up is true if ftrace is running */ + if (ftrace_start_up) + ftrace_run_update_code(FTRACE_DISABLE_CALLS); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int ops_traces_mod(struct ftrace_ops *ops) +{ + struct ftrace_hash *hash; + + hash = ops->filter_hash; + return !!(!hash || !hash->count); +} + +static int ftrace_update_code(struct module *mod) +{ + struct dyn_ftrace *p; + cycle_t start, stop; + unsigned long ref = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are set to trace all functions. 
+ * If they are, we need to enable the module functions as well + * as update the reference counts for those function records. + */ + if (mod) { + struct ftrace_ops *ops; + + for (ops = ftrace_ops_list; + ops != &ftrace_list_end; ops = ops->next) { + if (ops->flags & FTRACE_OPS_FL_ENABLED && + ops_traces_mod(ops)) + ref++; + } + } + + start = ftrace_now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + while (ftrace_new_addrs) { + + /* If something went wrong, bail without enabling anything */ + if (unlikely(ftrace_disabled)) + return -1; + + p = ftrace_new_addrs; + ftrace_new_addrs = p->newlist; + p->flags = ref; + + /* + * Do the initial record conversion from mcount jump + * to the NOP instructions. + */ + if (!ftrace_code_disable(mod, p)) { + ftrace_free_rec(p); + /* Game over */ + break; + } + + ftrace_update_cnt++; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + */ + if (ftrace_start_up && ref) { + int failed = __ftrace_replace_code(p, 1); + if (failed) { + ftrace_bug(failed, p->ip); + ftrace_free_rec(p); + } + } + } + + stop = ftrace_now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + + return 0; +} + +static int __init ftrace_dyn_table_alloc(unsigned long num_to_init) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. 
+ * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = num_to_init / ENTRIES_PER_PAGE; + pr_info("ftrace: allocating %ld entries in %d pages\n", + num_to_init, cnt + 1); + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + +enum { + FTRACE_ITER_FILTER = (1 << 0), + FTRACE_ITER_NOTRACE = (1 << 1), + FTRACE_ITER_PRINTALL = (1 << 2), + FTRACE_ITER_HASH = (1 << 3), + FTRACE_ITER_ENABLED = (1 << 4), +}; + +#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ + +struct ftrace_iterator { + loff_t pos; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + struct ftrace_hash *hash; + struct ftrace_ops *ops; + int hidx; + int idx; + unsigned flags; +}; + +static void * +t_hash_next(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct hlist_node *hnd = NULL; + struct hlist_head *hhd; + + (*pos)++; + iter->pos = *pos; + + if (iter->probe) + hnd = &iter->probe->node; + retry: + if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + return NULL; + + hhd = &ftrace_func_hash[iter->hidx]; + + if (hlist_empty(hhd)) { + iter->hidx++; + hnd = NULL; + goto retry; + } + + if (!hnd) + hnd = hhd->first; + else { + hnd = hnd->next; + if (!hnd) { + iter->hidx++; + goto retry; + } + } + + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; +} + +static void *t_hash_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + void *p = NULL; + loff_t l; + + if (iter->func_pos > *pos) + return NULL; + + iter->hidx = 0; + for (l = 0; l <= (*pos - iter->func_pos); ) { + p = t_hash_next(m, &l); + if (!p) + break; + } + if (!p) + return NULL; + + /* Only set this if we have 
an item */ + iter->flags |= FTRACE_ITER_HASH; + + return iter; +} + +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) +{ + struct ftrace_func_probe *rec; + + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; + + if (rec->ops->print) + return rec->ops->print(m, rec->ip, rec->ops, rec->data); + + seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func); + + if (rec->data) + seq_printf(m, ":%p", rec->data); + seq_putc(m, '\n'); + + return 0; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = &global_ops; + struct dyn_ftrace *rec = NULL; + + if (unlikely(ftrace_disabled)) + return NULL; + + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_next(m, pos); + + (*pos)++; + iter->pos = iter->func_pos = *pos; + + if (iter->flags & FTRACE_ITER_PRINTALL) + return t_hash_start(m, pos); + + retry: + if (iter->idx >= iter->pg->index) { + if (iter->pg->next) { + iter->pg = iter->pg->next; + iter->idx = 0; + goto retry; + } + } else { + rec = &iter->pg->records[iter->idx++]; + if ((rec->flags & FTRACE_FL_FREE) || + + ((iter->flags & FTRACE_ITER_FILTER) && + !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || + + ((iter->flags & FTRACE_ITER_NOTRACE) && + !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || + + ((iter->flags & FTRACE_ITER_ENABLED) && + !(rec->flags & ~FTRACE_FL_MASK))) { + + rec = NULL; + goto retry; + } + } + + if (!rec) + return t_hash_start(m, pos); + + iter->func = rec; + + return iter; +} + +/* + * Rewind the iterator after a backwards seek: zero both positions and + * clear the PRINTALL and HASH state flags so that t_start() begins again + * from the function records. The two flags are distinct bits, so they + * must be OR-ed together to build the mask; AND-ing them yields 0 and + * clears nothing. + */ +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH); +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_iterator *iter = m->private; + struct ftrace_ops *ops = &global_ops; + void *p = NULL; + loff_t l; + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + return NULL; + + /* + * If an lseek was 
done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + + /* + * For set_ftrace_filter reading, if we have the filter + * off, we can short cut and just print out that all + * functions are enabled. + */ + if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) { + if (*pos > 0) + return t_hash_start(m, pos); + iter->flags |= FTRACE_ITER_PRINTALL; + /* reset in case of seek/pread */ + iter->flags &= ~FTRACE_ITER_HASH; + return iter; + } + + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_start(m, pos); + + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. + */ + iter->pg = ftrace_pages_start; + iter->idx = 0; + for (l = 0; l <= *pos; ) { + p = t_next(m, p, &l); + if (!p) + break; + } + + if (!p) { + if (iter->flags & FTRACE_ITER_FILTER) + return t_hash_start(m, pos); + + return NULL; + } + + return iter; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_iterator *iter = m->private; + struct dyn_ftrace *rec; + + if (iter->flags & FTRACE_ITER_HASH) + return t_hash_show(m, iter); + + if (iter->flags & FTRACE_ITER_PRINTALL) { + seq_printf(m, "#### all functions enabled ####\n"); + return 0; + } + + rec = iter->func; + + if (!rec) + return 0; + + seq_printf(m, "%ps", (void *)rec->ip); + if (iter->flags & FTRACE_ITER_ENABLED) + seq_printf(m, " (%ld)", + rec->flags & ~FTRACE_FL_MASK); + seq_printf(m, "\n"); + + return 0; +} + +static const struct seq_operations show_ftrace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int +ftrace_avail_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) 
+ return -ENOMEM; + + iter->pg = ftrace_pages_start; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = iter; + } else { + kfree(iter); + } + + return ret; +} + +static int +ftrace_enabled_open(struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + int ret; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct seq_file *m = file->private_data; + + m->private = iter; + } else { + kfree(iter); + } + + return ret; +} + +static void ftrace_filter_reset(struct ftrace_hash *hash) +{ + mutex_lock(&ftrace_lock); + ftrace_hash_clear(hash); + mutex_unlock(&ftrace_lock); +} + +static int +ftrace_regex_open(struct ftrace_ops *ops, int flag, + struct inode *inode, struct file *file) +{ + struct ftrace_iterator *iter; + struct ftrace_hash *hash; + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { + kfree(iter); + return -ENOMEM; + } + + if (flag & FTRACE_ITER_NOTRACE) + hash = ops->notrace_hash; + else + hash = ops->filter_hash; + + iter->ops = ops; + iter->flags = flag; + + if (file->f_mode & FMODE_WRITE) { + mutex_lock(&ftrace_lock); + iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash); + mutex_unlock(&ftrace_lock); + + if (!iter->hash) { + trace_parser_put(&iter->parser); + kfree(iter); + return -ENOMEM; + } + } + + mutex_lock(&ftrace_regex_lock); + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_filter_reset(iter->hash); + + if (file->f_mode & FMODE_READ) { + iter->pg = ftrace_pages_start; + + ret = seq_open(file, &show_ftrace_seq_ops); + if (!ret) { + struct 
seq_file *m = file->private_data; + m->private = iter; + } else { + /* Failed */ + free_ftrace_hash(iter->hash); + trace_parser_put(&iter->parser); + kfree(iter); + } + } else + file->private_data = iter; + mutex_unlock(&ftrace_regex_lock); + + return ret; +} + +static int +ftrace_filter_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER, + inode, file); +} + +static int +ftrace_notrace_open(struct inode *inode, struct file *file) +{ + return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE, + inode, file); +} + +static loff_t +ftrace_regex_lseek(struct file *file, loff_t offset, int origin) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, origin); + else + file->f_pos = ret = 1; + + return ret; +} + +static int ftrace_match(char *str, char *regex, int len, int type) +{ + int matched = 0; + int slen; + + switch (type) { + case MATCH_FULL: + if (strcmp(str, regex) == 0) + matched = 1; + break; + case MATCH_FRONT_ONLY: + if (strncmp(str, regex, len) == 0) + matched = 1; + break; + case MATCH_MIDDLE_ONLY: + if (strstr(str, regex)) + matched = 1; + break; + case MATCH_END_ONLY: + slen = strlen(str); + if (slen >= len && memcmp(str + slen - len, regex, len) == 0) + matched = 1; + break; + } + + return matched; +} + +static int +enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not) +{ + struct ftrace_func_entry *entry; + int ret = 0; + + entry = ftrace_lookup_ip(hash, rec->ip); + if (not) { + /* Do nothing if it doesn't exist */ + if (!entry) + return 0; + + free_hash_entry(hash, entry); + } else { + /* Do nothing if it exists */ + if (entry) + return 0; + + ret = add_hash_entry(hash, rec->ip); + } + return ret; +} + +static int +ftrace_match_record(struct dyn_ftrace *rec, char *mod, + char *regex, int len, int type) +{ + char str[KSYM_SYMBOL_LEN]; + char *modname; + + kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); + + if (mod) { + /* module lookup 
requires matching the module */ + if (!modname || strcmp(modname, mod)) + return 0; + + /* blank search means to match all funcs in the mod */ + if (!len) + return 1; + } + + return ftrace_match(str, regex, len, type); +} + +/* + * Walk every ftrace record under ftrace_lock and hand the ones whose + * symbol (and module, when @mod is given) match @buff to enter_record(). + * A non-empty @buff is parsed by filter_parse_regex(), which also + * overwrites @not. Returns 1 if anything matched, 0 if nothing did, or + * the negative error from enter_record(). + */ +static int +match_records(struct ftrace_hash *hash, char *buff, + int len, char *mod, int not) +{ + unsigned search_len = 0; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type = MATCH_FULL; + char *search = buff; + int found = 0; + int ret; + + if (len) { + type = filter_parse_regex(buff, len, &search, &not); + search_len = strlen(search); + } + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { + + if (ftrace_match_record(rec, mod, search, search_len, type)) { + ret = enter_record(hash, rec, not); + if (ret < 0) { + found = ret; + goto out_unlock; + } + found = 1; + } + } while_for_each_ftrace_rec(); + out_unlock: + mutex_unlock(&ftrace_lock); + + return found; +} + +static int +ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) +{ + return match_records(hash, buff, len, NULL, 0); +} + +static int +ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod) +{ + int not = 0; + + /* blank or '*' mean the same */ + if (strcmp(buff, "*") == 0) + buff[0] = 0; + + /* handle the case of 'dont filter this module' */ + if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) { + buff[0] = 0; + not = 1; + } + + return match_records(hash, buff, strlen(buff), mod, not); +} + +/* + * We register the module command as a template to show others how + * to register a command as well. + */ + +static int +ftrace_mod_callback(struct ftrace_hash *hash, + char *func, char *cmd, char *param, int enable) +{ + char *mod; + int ret = -EINVAL; + + /* + * cmd == 'mod' because we only registered this func + * for the 'mod' ftrace_func_command. 
+ * But if you register one func with multiple commands, + * you can tell which command was used by the cmd + * parameter. + */ + + /* we must have a module name */ + if (!param) + return ret; + + /* + * strsep() needs the address of the cursor: it cuts the module + * name off the front of param at the first ':'. Whatever follows + * is not used here. + */ + mod = strsep(&param, ":"); + if (!strlen(mod)) + return ret; + + ret = ftrace_match_module_records(hash, func, mod); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + + return 0; +} + +static struct ftrace_func_command ftrace_mod_cmd = { + .name = "mod", + .func = ftrace_mod_callback, +}; + +static int __init ftrace_mod_cmd_init(void) +{ + return register_ftrace_command(&ftrace_mod_cmd); +} +device_initcall(ftrace_mod_cmd_init); + +static void +function_trace_probe_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_func_probe *entry; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + + key = hash_long(ip, FTRACE_HASH_BITS); + + hhd = &ftrace_func_hash[key]; + + if (hlist_empty(hhd)) + return; + + /* + * Disable preemption for these calls to prevent a RCU grace + * period. This syncs the hash iteration and freeing of items + * on the hash. rcu_read_lock is too dangerous here. + */ + preempt_disable_notrace(); + hlist_for_each_entry_rcu(entry, n, hhd, node) { + if (entry->ip == ip) + entry->ops->func(ip, parent_ip, &entry->data); + } + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_probe_ops __read_mostly = +{ + .func = function_trace_probe_call, +}; + +static int ftrace_probe_registered; + +static void __enable_ftrace_function_probe(void) +{ + int ret; + int i; + + if (ftrace_probe_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + break; + } + /* Nothing registered? 
*/ + if (i == FTRACE_FUNC_HASHSIZE) + return; + + ret = __register_ftrace_function(&trace_probe_ops); + if (!ret) + ret = ftrace_startup(&trace_probe_ops, 0); + + ftrace_probe_registered = 1; +} + +static void __disable_ftrace_function_probe(void) +{ + int ret; + int i; + + if (!ftrace_probe_registered) + return; + + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + if (hhd->first) + return; + } + + /* no more funcs left */ + ret = __unregister_ftrace_function(&trace_probe_ops); + if (!ret) + ftrace_shutdown(&trace_probe_ops, 0); + + ftrace_probe_registered = 0; +} + + +static void ftrace_free_entry_rcu(struct rcu_head *rhp) +{ + struct ftrace_func_probe *entry = + container_of(rhp, struct ftrace_func_probe, rcu); + + if (entry->ops->free) + entry->ops->free(&entry->data); + kfree(entry); +} + + +/* + * Attach @ops (with @data) to every ftrace record whose symbol matches + * @glob. Returns the count of matched records an entry was allocated + * for (0 if ftrace is disabled), -ENOMEM when the very first allocation + * fails, or -EINVAL for a negated ('!') glob. + */ +int +register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_probe *entry; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + int type, len, not; + unsigned long key; + int count = 0; + char *search; + + type = filter_parse_regex(glob, strlen(glob), &search, &not); + len = strlen(search); + + /* we do not support '!' for function probes */ + if (WARN_ON(not)) + return -EINVAL; + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { + + if (!ftrace_match_record(rec, NULL, search, len, type)) + continue; + + entry = kmalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) { + /* If we did not process any, then return error */ + if (!count) + count = -ENOMEM; + goto out_unlock; + } + + count++; + + entry->data = data; + + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. 
+ */ + if (ops->callback) { + if (ops->callback(rec->ip, &entry->data) < 0) { + /* caller does not like this func */ + kfree(entry); + continue; + } + } + + entry->ops = ops; + entry->ip = rec->ip; + + key = hash_long(entry->ip, FTRACE_HASH_BITS); + hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + + } while_for_each_ftrace_rec(); + __enable_ftrace_function_probe(); + + out_unlock: + mutex_unlock(&ftrace_lock); + + return count; +} + +enum { + PROBE_TEST_FUNC = 1, + PROBE_TEST_DATA = 2 +}; + +/* + * Remove probe entries from ftrace_func_hash. @flags selects which of + * @ops (PROBE_TEST_FUNC) and @data (PROBE_TEST_DATA) an entry must match; + * a NULL, empty or "*" @glob matches every symbol. Unlinked entries are + * handed to call_rcu() and freed by ftrace_free_entry_rcu(). + */ +static void +__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data, int flags) +{ + struct ftrace_func_probe *entry; + struct hlist_node *n, *tmp; + char str[KSYM_SYMBOL_LEN]; + int type = MATCH_FULL; + int i, len = 0; + char *search; + + if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) + glob = NULL; + else if (glob) { + int not; + + type = filter_parse_regex(glob, strlen(glob), &search, &not); + len = strlen(search); + + /* we do not support '!' for function probes */ + if (WARN_ON(not)) + return; + } + + mutex_lock(&ftrace_lock); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { + struct hlist_head *hhd = &ftrace_func_hash[i]; + + hlist_for_each_entry_safe(entry, n, tmp, hhd, node) { + + /* break up if statements for readability */ + if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) + continue; + + if ((flags & PROBE_TEST_DATA) && entry->data != data) + continue; + + /* do this last, since it is the most expensive */ + if (glob) { + kallsyms_lookup(entry->ip, NULL, NULL, + NULL, str); + /* match the parsed pattern that len was measured from */ + if (!ftrace_match(str, search, len, type)) + continue; + } + + hlist_del(&entry->node); + call_rcu(&entry->rcu, ftrace_free_entry_rcu); + } + } + __disable_ftrace_function_probe(); + mutex_unlock(&ftrace_lock); +} + +void +unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, + void *data) +{ + __unregister_ftrace_function_probe(glob, ops, data, + PROBE_TEST_FUNC | PROBE_TEST_DATA); +} + +void 
+unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) +{ + __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); +} + +void unregister_ftrace_function_probe_all(char *glob) +{ + __unregister_ftrace_function_probe(glob, NULL, NULL, 0); +} + +static LIST_HEAD(ftrace_commands); +static DEFINE_MUTEX(ftrace_cmd_mutex); + +int register_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p; + int ret = 0; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = -EBUSY; + goto out_unlock; + } + } + list_add(&cmd->list, &ftrace_commands); + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +int unregister_ftrace_command(struct ftrace_func_command *cmd) +{ + struct ftrace_func_command *p, *n; + int ret = -ENODEV; + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry_safe(p, n, &ftrace_commands, list) { + if (strcmp(cmd->name, p->name) == 0) { + ret = 0; + list_del_init(&p->list); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +static int ftrace_process_regex(struct ftrace_hash *hash, + char *buff, int len, int enable) +{ + char *func, *command, *next = buff; + struct ftrace_func_command *p; + int ret = -EINVAL; + + func = strsep(&next, ":"); + + if (!next) { + ret = ftrace_match_records(hash, func, len); + if (!ret) + ret = -EINVAL; + if (ret < 0) + return ret; + return 0; + } + + /* command found */ + + command = strsep(&next, ":"); + + mutex_lock(&ftrace_cmd_mutex); + list_for_each_entry(p, &ftrace_commands, list) { + if (strcmp(p->name, command) == 0) { + ret = p->func(hash, func, command, next, enable); + goto out_unlock; + } + } + out_unlock: + mutex_unlock(&ftrace_cmd_mutex); + + return ret; +} + +static ssize_t +ftrace_regex_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos, int enable) +{ + struct ftrace_iterator *iter; + 
struct trace_parser *parser; + ssize_t ret, read; + + if (!cnt) + return 0; + + mutex_lock(&ftrace_regex_lock); + + ret = -ENODEV; + if (unlikely(ftrace_disabled)) + goto out_unlock; + + if (file->f_mode & FMODE_READ) { + struct seq_file *m = file->private_data; + iter = m->private; + } else + iter = file->private_data; + + parser = &iter->parser; + read = trace_get_user(parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded(parser) && + !trace_parser_cont(parser)) { + ret = ftrace_process_regex(iter->hash, parser->buffer, + parser->idx, enable); + trace_parser_clear(parser); + if (ret) + goto out_unlock; + } + + ret = read; +out_unlock: + mutex_unlock(&ftrace_regex_lock); + + return ret; +} + +static ssize_t +ftrace_filter_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 1); +} + +static ssize_t +ftrace_notrace_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return ftrace_regex_write(file, ubuf, cnt, ppos, 0); +} + +static int +ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len, + int reset, int enable) +{ + struct ftrace_hash **orig_hash; + struct ftrace_hash *hash; + int ret; + + /* All global ops uses the global ops filters */ + if (ops->flags & FTRACE_OPS_FL_GLOBAL) + ops = &global_ops; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + if (enable) + orig_hash = &ops->filter_hash; + else + orig_hash = &ops->notrace_hash; + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + return -ENOMEM; + + mutex_lock(&ftrace_regex_lock); + if (reset) + ftrace_filter_reset(hash); + if (buf) + ftrace_match_records(hash, buf, len); + + mutex_lock(&ftrace_lock); + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret && ops->flags & FTRACE_OPS_FL_ENABLED + && ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); + + mutex_unlock(&ftrace_lock); + + 
mutex_unlock(&ftrace_regex_lock); + + free_ftrace_hash(hash); + return ret; +} + +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. + */ +void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_set_regex(ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_filter); + +/** + * ftrace_set_notrace - set a function to not trace in ftrace + * @ops - the ops to set the notrace filter with + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, + int len, int reset) +{ + ftrace_set_regex(ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_notrace); +/** + * ftrace_set_filter - set a function to filter on in ftrace + * @ops - the ops to set the filter with + * @buf - the string that holds the function filter text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Filters denote which functions should be enabled when tracing is enabled. + * If @buf is NULL and reset is set, all functions will be enabled for tracing. 
+ */ +void ftrace_set_global_filter(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(&global_ops, buf, len, reset, 1); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_filter); + +/** + * ftrace_set_global_notrace - set a function to not trace in ftrace (global ops) + * @buf - the string that holds the function notrace text. + * @len - the length of the string. + * @reset - non zero to reset all filters before applying this filter. + * + * Notrace Filters denote which functions should not be enabled when tracing + * is enabled. If @buf is NULL and reset is set, all functions will be enabled + * for tracing. + */ +void ftrace_set_global_notrace(unsigned char *buf, int len, int reset) +{ + ftrace_set_regex(&global_ops, buf, len, reset, 0); +} +EXPORT_SYMBOL_GPL(ftrace_set_global_notrace); + +/* + * command line interface to allow users to set filters on boot up. + */ +#define FTRACE_FILTER_SIZE COMMAND_LINE_SIZE +static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata; +static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; + +/* + * Save the "ftrace_notrace=" boot argument. strlcpy() is used so the + * saved copy is always NUL-terminated, matching set_graph_function(). + */ +static int __init set_ftrace_notrace(char *str) +{ + strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_notrace=", set_ftrace_notrace); + +/* + * Save the "ftrace_filter=" boot argument. strlcpy() is used so the + * saved copy is always NUL-terminated, matching set_graph_function(). + */ +static int __init set_ftrace_filter(char *str) +{ + strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_filter=", set_ftrace_filter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); + +static int __init set_graph_function(char *str) +{ + strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); + return 1; +} +__setup("ftrace_graph_filter=", set_graph_function); + +static void __init set_ftrace_early_graph(char *buf) +{ + int ret; + char *func; + + while (buf) { + func = strsep(&buf, ","); + /* we allow only one expression at a time */ + ret = 
ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + func); + if (ret) + printk(KERN_DEBUG "ftrace: function %s not " + "traceable\n", func); + } +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +static void __init +set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable) +{ + char *func; + + while (buf) { + func = strsep(&buf, ","); + ftrace_set_regex(ops, func, strlen(func), 0, enable); + } +} + +static void __init set_ftrace_early_filters(void) +{ + if (ftrace_filter_buf[0]) + set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1); + if (ftrace_notrace_buf[0]) + set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0); +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + if (ftrace_graph_buf[0]) + set_ftrace_early_graph(ftrace_graph_buf); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ +} + +static int +ftrace_regex_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_iterator *iter; + struct ftrace_hash **orig_hash; + struct trace_parser *parser; + int filter_hash; + int ret; + + mutex_lock(&ftrace_regex_lock); + if (file->f_mode & FMODE_READ) { + iter = m->private; + + seq_release(inode, file); + } else + iter = file->private_data; + + parser = &iter->parser; + if (trace_parser_loaded(parser)) { + parser->buffer[parser->idx] = 0; + ftrace_match_records(iter->hash, parser->buffer, parser->idx); + } + + trace_parser_put(parser); + + if (file->f_mode & FMODE_WRITE) { + filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); + + if (filter_hash) + orig_hash = &iter->ops->filter_hash; + else + orig_hash = &iter->ops->notrace_hash; + + mutex_lock(&ftrace_lock); + ret = ftrace_hash_move(iter->ops, filter_hash, + orig_hash, iter->hash); + if (!ret && (iter->ops->flags & FTRACE_OPS_FL_ENABLED) + && ftrace_enabled) + ftrace_run_update_code(FTRACE_UPDATE_CALLS); + + mutex_unlock(&ftrace_lock); + } + free_ftrace_hash(iter->hash); + kfree(iter); + + mutex_unlock(&ftrace_regex_lock); + return 0; 
+} + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_avail_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations ftrace_enabled_fops = { + .open = ftrace_enabled_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static const struct file_operations ftrace_filter_fops = { + .open = ftrace_filter_open, + .read = seq_read, + .write = ftrace_filter_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_regex_release, +}; + +static const struct file_operations ftrace_notrace_fops = { + .open = ftrace_notrace_open, + .read = seq_read, + .write = ftrace_notrace_write, + .llseek = ftrace_regex_lseek, + .release = ftrace_regex_release, +}; + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static DEFINE_MUTEX(graph_lock); + +int ftrace_graph_count; +int ftrace_graph_filter_enabled; +unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; + +static void * +__g_next(struct seq_file *m, loff_t *pos) +{ + if (*pos >= ftrace_graph_count) + return NULL; + return &ftrace_graph_funcs[*pos]; +} + +static void * +g_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __g_next(m, pos); +} + +static void *g_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&graph_lock); + + /* Nothing, tell g_show to print all functions are enabled */ + if (!ftrace_graph_filter_enabled && !*pos) + return (void *)1; + + return __g_next(m, pos); +} + +static void g_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&graph_lock); +} + +static int g_show(struct seq_file *m, void *v) +{ + unsigned long *ptr = v; + + if (!ptr) + return 0; + + if (ptr == (unsigned long *)1) { + seq_printf(m, "#### all functions enabled ####\n"); + return 0; + } + + seq_printf(m, "%ps\n", (void *)*ptr); + + return 0; +} + +static const struct seq_operations ftrace_graph_seq_ops = { + .start = g_start, + .next = g_next, + .stop = g_stop, + .show = g_show, 
+}; + +static int +ftrace_graph_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if (unlikely(ftrace_disabled)) + return -ENODEV; + + mutex_lock(&graph_lock); + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + ftrace_graph_filter_enabled = 0; + ftrace_graph_count = 0; + memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); + } + mutex_unlock(&graph_lock); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_graph_seq_ops); + + return ret; +} + +static int +ftrace_graph_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static int +ftrace_set_func(unsigned long *array, int *idx, char *buffer) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + int search_len; + int fail = 1; + int type, not; + char *search; + bool exists; + int i; + + /* decode regex */ + type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); + if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + return -EBUSY; + + search_len = strlen(search); + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) { + mutex_unlock(&ftrace_lock); + return -ENODEV; + } + + do_for_each_ftrace_rec(pg, rec) { + + if (rec->flags & FTRACE_FL_FREE) + continue; + + if (ftrace_match_record(rec, NULL, search, search_len, type)) { + /* if it is in the array */ + exists = false; + for (i = 0; i < *idx; i++) { + if (array[i] == rec->ip) { + exists = true; + break; + } + } + + if (!not) { + fail = 0; + if (!exists) { + array[(*idx)++] = rec->ip; + if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + goto out; + } + } else { + if (exists) { + array[i] = array[--(*idx)]; + array[*idx] = 0; + fail = 0; + } + } + } + } while_for_each_ftrace_rec(); +out: + mutex_unlock(&ftrace_lock); + + if (fail) + return -EINVAL; + + ftrace_graph_filter_enabled = 1; + return 0; +} + +static ssize_t +ftrace_graph_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_parser 
parser; + ssize_t read, ret; + + if (!cnt) + return 0; + + mutex_lock(&graph_lock); + + if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { + ret = -ENOMEM; + goto out_unlock; + } + + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded((&parser))) { + parser.buffer[parser.idx] = 0; + + /* we allow only one expression at a time */ + ret = ftrace_set_func(ftrace_graph_funcs, &ftrace_graph_count, + parser.buffer); + if (ret) + goto out_free; + } + + ret = read; + +out_free: + trace_parser_put(&parser); +out_unlock: + mutex_unlock(&graph_lock); + + return ret; +} + +static const struct file_operations ftrace_graph_fops = { + .open = ftrace_graph_open, + .read = seq_read, + .write = ftrace_graph_write, + .release = ftrace_graph_release, + .llseek = seq_lseek, +}; +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/* + * Create the dynamic-ftrace control files under @d_tracer. + * The listing files are read-only (0444); the filter files accept + * writes and are therefore created 0644. Always returns 0. + */ +static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) +{ + + trace_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); + + trace_create_file("enabled_functions", 0444, + d_tracer, NULL, &ftrace_enabled_fops); + + trace_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); + + trace_create_file("set_ftrace_notrace", 0644, d_tracer, + NULL, &ftrace_notrace_fops); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* ftrace_graph_fops has a .write handler, so the file must be writable */ + trace_create_file("set_graph_function", 0644, d_tracer, + NULL, + &ftrace_graph_fops); +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + return 0; +} + +static int ftrace_process_locs(struct module *mod, + unsigned long *start, + unsigned long *end) +{ + unsigned long *p; + unsigned long addr; + unsigned long flags; + + mutex_lock(&ftrace_lock); + p = start; + while (p < end) { + addr = ftrace_call_adjust(*p++); + /* + * Some architecture linkers will pad between + * the different mcount_loc sections of different + * object files to satisfy alignments. + * Skip any NULL pointers. 
+ */ + if (!addr) + continue; + ftrace_record_ip(addr); + } + + /* + * Disable interrupts to prevent interrupts from executing + * code that is being modified. + */ + local_irq_save(flags); + ftrace_update_code(mod); + local_irq_restore(flags); + mutex_unlock(&ftrace_lock); + + return 0; +} + +#ifdef CONFIG_MODULES +void ftrace_release_mod(struct module *mod) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + mutex_lock(&ftrace_lock); + + if (ftrace_disabled) + goto out_unlock; + + do_for_each_ftrace_rec(pg, rec) { + if (within_module_core(rec->ip, mod)) { + /* + * rec->ip is changed in ftrace_free_rec(), so a record that + * was already freed should not match within_module_core(). + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); + ftrace_free_rec(rec); + } + } while_for_each_ftrace_rec(); + out_unlock: + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) +{ + if (ftrace_disabled || start == end) + return; + ftrace_process_locs(mod, start, end); +} + +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + case MODULE_STATE_GOING: + ftrace_release_mod(mod); + break; + } + + return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, + .priority = 0, +}; + +extern unsigned long __start_mcount_loc[]; +extern unsigned long __stop_mcount_loc[]; + +void __init ftrace_init(void) +{ + unsigned long count, addr, flags; + int ret; + + /* Keep the ftrace pointer to the stub */ + addr = (unsigned long)ftrace_stub; + + local_irq_save(flags); + ftrace_dyn_arch_init(&addr); + local_irq_restore(flags); 
+ + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + goto failed; + + count = __stop_mcount_loc - __start_mcount_loc; + + ret = ftrace_dyn_table_alloc(count); + if (ret) + goto failed; + + last_ftrace_enabled = ftrace_enabled = 1; + + ret = ftrace_process_locs(NULL, + __start_mcount_loc, + __stop_mcount_loc); + + ret = register_module_notifier(&ftrace_module_nb); + if (ret) + pr_warning("Failed to register trace ftrace module notifier\n"); + + set_ftrace_early_filters(); + + return; + failed: + ftrace_disabled = 1; +} + +#else + +static struct ftrace_ops global_ops = { + .func = ftrace_stub, +}; + +static int __init ftrace_nodyn_init(void) +{ + ftrace_enabled = 1; + return 0; +} +device_initcall(ftrace_nodyn_init); + +static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } +static inline void ftrace_startup_enable(int command) { } +/* Keep as macros so we do not need to define the commands */ +# define ftrace_startup(ops, command) \ + ({ \ + (ops)->flags |= FTRACE_OPS_FL_ENABLED; \ + 0; \ + }) +# define ftrace_shutdown(ops, command) do { } while (0) +# define ftrace_startup_sysctl() do { } while (0) +# define ftrace_shutdown_sysctl() do { } while (0) + +static inline int +ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip) +{ + return 1; +} + +#endif /* CONFIG_DYNAMIC_FTRACE */ + +static void +ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op; + + if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) + return; + + trace_recursion_set(TRACE_INTERNAL_BIT); + /* + * Some of the ops may be dynamically allocated, + * they must be freed after a synchronize_sched(). 
+ */ + preempt_disable_notrace(); + op = rcu_dereference_raw(ftrace_ops_list); + while (op != &ftrace_list_end) { + if (ftrace_ops_test(op, ip)) + op->func(ip, parent_ip); + op = rcu_dereference_raw(op->next); + }; + preempt_enable_notrace(); + trace_recursion_clear(TRACE_INTERNAL_BIT); +} + +static void clear_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + clear_tsk_trace_trace(p); + } + put_online_cpus(); +} + +static void set_ftrace_swapper(void) +{ + struct task_struct *p; + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) { + p = idle_task(cpu); + set_tsk_trace_trace(p); + } + put_online_cpus(); +} + +static void clear_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + rcu_read_lock(); + do_each_pid_task(pid, PIDTYPE_PID, p) { + clear_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); + rcu_read_unlock(); + + put_pid(pid); +} + +static void set_ftrace_pid(struct pid *pid) +{ + struct task_struct *p; + + rcu_read_lock(); + do_each_pid_task(pid, PIDTYPE_PID, p) { + set_tsk_trace_trace(p); + } while_each_pid_task(pid, PIDTYPE_PID, p); + rcu_read_unlock(); +} + +static void clear_ftrace_pid_task(struct pid *pid) +{ + if (pid == ftrace_swapper_pid) + clear_ftrace_swapper(); + else + clear_ftrace_pid(pid); +} + +static void set_ftrace_pid_task(struct pid *pid) +{ + if (pid == ftrace_swapper_pid) + set_ftrace_swapper(); + else + set_ftrace_pid(pid); +} + +static int ftrace_pid_add(int p) +{ + struct pid *pid; + struct ftrace_pid *fpid; + int ret = -EINVAL; + + mutex_lock(&ftrace_lock); + + if (!p) + pid = ftrace_swapper_pid; + else + pid = find_get_pid(p); + + if (!pid) + goto out; + + ret = 0; + + list_for_each_entry(fpid, &ftrace_pids, list) + if (fpid->pid == pid) + goto out_put; + + ret = -ENOMEM; + + fpid = kmalloc(sizeof(*fpid), GFP_KERNEL); + if (!fpid) + goto out_put; + + list_add(&fpid->list, &ftrace_pids); + fpid->pid = pid; + + 
set_ftrace_pid_task(pid); + + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); + return 0; + +out_put: + if (pid != ftrace_swapper_pid) + put_pid(pid); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +static void ftrace_pid_reset(void) +{ + struct ftrace_pid *fpid, *safe; + + mutex_lock(&ftrace_lock); + list_for_each_entry_safe(fpid, safe, &ftrace_pids, list) { + struct pid *pid = fpid->pid; + + clear_ftrace_pid_task(pid); + + list_del(&fpid->list); + kfree(fpid); + } + + ftrace_update_pid_func(); + ftrace_startup_enable(0); + + mutex_unlock(&ftrace_lock); +} + +static void *fpid_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ftrace_lock); + + if (list_empty(&ftrace_pids) && (!*pos)) + return (void *) 1; + + return seq_list_start(&ftrace_pids, *pos); +} + +static void *fpid_next(struct seq_file *m, void *v, loff_t *pos) +{ + if (v == (void *)1) + return NULL; + + return seq_list_next(v, &ftrace_pids, pos); +} + +static void fpid_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&ftrace_lock); +} + +static int fpid_show(struct seq_file *m, void *v) +{ + const struct ftrace_pid *fpid = list_entry(v, struct ftrace_pid, list); + + if (v == (void *)1) { + seq_printf(m, "no pid\n"); + return 0; + } + + if (fpid->pid == ftrace_swapper_pid) + seq_printf(m, "swapper tasks\n"); + else + seq_printf(m, "%u\n", pid_vnr(fpid->pid)); + + return 0; +} + +static const struct seq_operations ftrace_pid_sops = { + .start = fpid_start, + .next = fpid_next, + .stop = fpid_stop, + .show = fpid_show, +}; + +static int +ftrace_pid_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_pid_reset(); + + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &ftrace_pid_sops); + + return ret; +} + +static ssize_t +ftrace_pid_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64], *tmp; + long val; + int ret; + + 
if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* + * Allow "echo > set_ftrace_pid" or "echo -n '' > set_ftrace_pid" + * to clean the filter quietly. + */ + tmp = strstrip(buf); + if (strlen(tmp) == 0) + return 1; + + ret = strict_strtol(tmp, 10, &val); + if (ret < 0) + return ret; + + ret = ftrace_pid_add(val); + + return ret ? ret : cnt; +} + +static int +ftrace_pid_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + + return 0; +} + +static const struct file_operations ftrace_pid_fops = { + .open = ftrace_pid_open, + .write = ftrace_pid_write, + .read = seq_read, + .llseek = seq_lseek, + .release = ftrace_pid_release, +}; + +static __init int ftrace_init_debugfs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + ftrace_init_dyn_debugfs(d_tracer); + + trace_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); + + ftrace_profile_debugfs(d_tracer); + + return 0; +} +fs_initcall(ftrace_init_debugfs); + +/** + * ftrace_kill - kill ftrace + * + * This function should be used by panic code. It stops ftrace + * but in a not so nice way. If you need to simply kill ftrace + * from a non-atomic section, use ftrace_kill. + */ +void ftrace_kill(void) +{ + ftrace_disabled = 1; + ftrace_enabled = 0; + clear_ftrace_function(); +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. 
+ */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + int ret = -1; + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out_unlock; + + ret = __register_ftrace_function(ops); + if (!ret) + ret = ftrace_startup(ops, 0); + + + out_unlock: + mutex_unlock(&ftrace_lock); + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_function); + +/** + * unregister_ftrace_function - unregister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + mutex_lock(&ftrace_lock); + ret = __unregister_ftrace_function(ops); + if (!ret) + ftrace_shutdown(ops, 0); + mutex_unlock(&ftrace_lock); + + return ret; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_function); + +int +ftrace_enable_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret = -ENODEV; + + mutex_lock(&ftrace_lock); + + if (unlikely(ftrace_disabled)) + goto out; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) + goto out; + + last_ftrace_enabled = !!ftrace_enabled; + + if (ftrace_enabled) { + + ftrace_startup_sysctl(); + + /* we are starting ftrace again */ + if (ftrace_ops_list != &ftrace_list_end) { + if (ftrace_ops_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_ops_list->func; + else + ftrace_trace_function = ftrace_ops_list_func; + } + + } else { + /* stopping ftrace calls (just send to ftrace_stub) */ + ftrace_trace_function = ftrace_stub; + + ftrace_shutdown_sysctl(); + } + + out: + mutex_unlock(&ftrace_lock); + return ret; +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +static int ftrace_graph_active; +static struct notifier_block ftrace_suspend_notifier; + +int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) +{ + return 0; +} + +/* The callbacks that hook 
a function */ +trace_func_graph_ret_t ftrace_graph_return = + (trace_func_graph_ret_t)ftrace_stub; +trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub; + +/* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */ +static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) +{ + int i; + int ret = 0; + unsigned long flags; + int start = 0, end = FTRACE_RETSTACK_ALLOC_SIZE; + struct task_struct *g, *t; + + for (i = 0; i < FTRACE_RETSTACK_ALLOC_SIZE; i++) { + ret_stack_list[i] = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack_list[i]) { + start = 0; + end = i; + ret = -ENOMEM; + goto free; + } + } + + read_lock_irqsave(&tasklist_lock, flags); + do_each_thread(g, t) { + if (start == end) { + ret = -EAGAIN; + goto unlock; + } + + if (t->ret_stack == NULL) { + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->curr_ret_stack = -1; + /* Make sure the tasks see the -1 first: */ + smp_wmb(); + t->ret_stack = ret_stack_list[start++]; + } + } while_each_thread(g, t); + +unlock: + read_unlock_irqrestore(&tasklist_lock, flags); +free: + for (i = start; i < end; i++) + kfree(ret_stack_list[i]); + return ret; +} + +static void +ftrace_graph_probe_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) +{ + unsigned long long timestamp; + int index; + + /* + * Does the user want to count the time a function was asleep. + * If so, do not update the time stamps. + */ + if (trace_flags & TRACE_ITER_SLEEP_TIME) + return; + + timestamp = trace_clock_local(); + + prev->ftrace_timestamp = timestamp; + + /* only process tasks that we timestamped */ + if (!next->ftrace_timestamp) + return; + + /* + * Update all the counters in next to make up for the + * time next was sleeping. 
+ */ + timestamp -= next->ftrace_timestamp; + + for (index = next->curr_ret_stack; index >= 0; index--) + next->ret_stack[index].calltime += timestamp; +} + +/* Allocate a return stack for each task */ +static int start_graph_tracing(void) +{ + struct ftrace_ret_stack **ret_stack_list; + int ret, cpu; + + ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE * + sizeof(struct ftrace_ret_stack *), + GFP_KERNEL); + + if (!ret_stack_list) + return -ENOMEM; + + /* The cpu_boot init_task->ret_stack will never be freed */ + for_each_online_cpu(cpu) { + if (!idle_task(cpu)->ret_stack) + ftrace_graph_init_idle_task(idle_task(cpu), cpu); + } + + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); + + if (!ret) { + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + if (ret) + pr_info("ftrace_graph: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + } + + kfree(ret_stack_list); + return ret; +} + +/* + * Hibernation protection. + * The state of the current task is too much unstable during + * suspend/restore to disk. We want to protect against that. 
+ */ +static int +ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state, + void *unused) +{ + switch (state) { + case PM_HIBERNATION_PREPARE: + pause_graph_tracing(); + break; + + case PM_POST_HIBERNATION: + unpause_graph_tracing(); + break; + } + return NOTIFY_DONE; +} + +int register_ftrace_graph(trace_func_graph_ret_t retfunc, + trace_func_graph_ent_t entryfunc) +{ + int ret = 0; + + mutex_lock(&ftrace_lock); + + /* we currently allow only one tracer registered at a time */ + if (ftrace_graph_active) { + ret = -EBUSY; + goto out; + } + + ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; + register_pm_notifier(&ftrace_suspend_notifier); + + ftrace_graph_active++; + ret = start_graph_tracing(); + if (ret) { + ftrace_graph_active--; + goto out; + } + + ftrace_graph_return = retfunc; + ftrace_graph_entry = entryfunc; + + ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET); + +out: + mutex_unlock(&ftrace_lock); + return ret; +} + +void unregister_ftrace_graph(void) +{ + mutex_lock(&ftrace_lock); + + if (unlikely(!ftrace_graph_active)) + goto out; + + ftrace_graph_active--; + ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; + ftrace_graph_entry = ftrace_graph_entry_stub; + ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); + unregister_pm_notifier(&ftrace_suspend_notifier); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); + + out: + mutex_unlock(&ftrace_lock); +} + +static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); + +static void +graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) +{ + atomic_set(&t->tracing_graph_pause, 0); + atomic_set(&t->trace_overrun, 0); + t->ftrace_timestamp = 0; + /* make curr_ret_stack visible before we add the ret_stack */ + smp_wmb(); + t->ret_stack = ret_stack; +} + +/* + * Allocate a return stack for the idle task. May be the first + * time through, or it may be done by CPU hotplug online. 
+ */ +void ftrace_graph_init_idle_task(struct task_struct *t, int cpu) +{ + t->curr_ret_stack = -1; + /* + * The idle task has no parent, it either has its own + * stack or no stack at all. + */ + if (t->ret_stack) + WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu)); + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = per_cpu(idle_ret_stack, cpu); + if (!ret_stack) { + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + per_cpu(idle_ret_stack, cpu) = ret_stack; + } + graph_init_task(t, ret_stack); + } +} + +/* Allocate a return stack for newly created task */ +void ftrace_graph_init_task(struct task_struct *t) +{ + /* Make sure we do not use the parent ret_stack */ + t->ret_stack = NULL; + t->curr_ret_stack = -1; + + if (ftrace_graph_active) { + struct ftrace_ret_stack *ret_stack; + + ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH + * sizeof(struct ftrace_ret_stack), + GFP_KERNEL); + if (!ret_stack) + return; + graph_init_task(t, ret_stack); + } +} + +void ftrace_graph_exit_task(struct task_struct *t) +{ + struct ftrace_ret_stack *ret_stack = t->ret_stack; + + t->ret_stack = NULL; + /* NULL must become visible to IRQs before we free it: */ + barrier(); + + kfree(ret_stack); +} + +void ftrace_graph_stop(void) +{ + ftrace_stop(); +} +#endif diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 00000000..f55fcf61 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven + */ + +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#ifdef EVENT_POWER_TRACING_DEPRECATED +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +#endif +EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); + diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c new file mode 100644 index 00000000..b0c7aa40 --- /dev/null +++ 
b/kernel/trace/ring_buffer.c @@ -0,0 +1,4067 @@ +/* + * Generic ring buffer + * + * Copyright (C) 2008 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "trace.h" + +/* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + + return ret; +} + +/* + * The ring buffer is made up of a list of pages. A separate list of pages is + * allocated for each CPU. A writer may only write to a buffer that is + * associated with the CPU it is currently executing on. A reader may read + * from any per cpu buffer. + * + * The reader is special. For each per cpu buffer, the reader has its own + * reader page. When a reader has read the entire reader page, this reader + * page is swapped with another page in the ring buffer. + * + * Now, as long as the writer is off the reader page, the reader can do what + * ever it wants with that page. The writer will never write to that page + * again (as long as it is out of the ring buffer). + * + * Here's some silly ASCII art. 
+ * + * +------+ + * |reader| RING BUFFER + * |page | + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * | |-->| |-->| | + * +---+ +---+ +---+ + * ^ | + * | | + * +---------------+ + * + * + * +------+ + * |reader| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | |-->| |-->| | + * | +---+ +---+ +---+ + * | | + * | | + * +------------------------------+ + * + * + * +------+ + * |buffer| RING BUFFER + * |page |------------------v + * +------+ +---+ +---+ +---+ + * ^ | | | |-->| | + * | New +---+ +---+ +---+ + * | Reader------^ | + * | page | + * +------------------------------+ + * + * + * After we make this swap, the reader can hand this page off to the splice + * code and be done with it. It can even allocate a new page if it needs to + * and swap that into the ring buffer. + * + * We will be using cmpxchg soon to make all this lockless. + * + */ + +/* + * A fast way to enable or disable all ring buffers is to + * call tracing_on or tracing_off. Turning off the ring buffers + * prevents all ring buffers from being recorded to. + * Turning this switch on, makes it OK to write to the + * ring buffer, if the ring buffer is enabled itself. + * + * There are three layers that must be on in order to write + * to the ring buffer. + * + * 1) This global flag must be set. + * 2) The ring buffer must be enabled for recording. + * 3) The per cpu buffer must be enabled for recording. + * + * In case of an anomaly, this global flag has a bit set that + * will permanently disable all ring buffers. 
+ */ + +/* + * Global flag to disable all recording to ring buffers + * This has two bits: ON, DISABLED + * + * ON DISABLED + * ---- ---------- + * 0 0 : ring buffers are off + * 1 0 : ring buffers are on + * X 1 : ring buffers are permanently disabled + */ + +enum { + RB_BUFFERS_ON_BIT = 0, + RB_BUFFERS_DISABLED_BIT = 1, +}; + +enum { + RB_BUFFERS_ON = 1 << RB_BUFFERS_ON_BIT, + RB_BUFFERS_DISABLED = 1 << RB_BUFFERS_DISABLED_BIT, +}; + +static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON; + +#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data) + +/** + * tracing_on - enable all tracing buffers + * + * This function enables all tracing buffers that may have been + * disabled with tracing_off. + */ +void tracing_on(void) +{ + set_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); +} +EXPORT_SYMBOL_GPL(tracing_on); + +/** + * tracing_off - turn off all tracing buffers + * + * This function stops all tracing buffers from recording data. + * It does not disable any overhead the tracers themselves may + * be causing. This function simply causes all recording to + * the ring buffers to fail. + */ +void tracing_off(void) +{ + clear_bit(RB_BUFFERS_ON_BIT, &ring_buffer_flags); +} +EXPORT_SYMBOL_GPL(tracing_off); + +/** + * tracing_off_permanent - permanently disable ring buffers + * + * This function, once called, will disable all ring buffers + * permanently. 
+ */ +void tracing_off_permanent(void) +{ + set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags); +} + +/** + * tracing_is_on - show state of ring buffers enabled + */ +int tracing_is_on(void) +{ + return ring_buffer_flags == RB_BUFFERS_ON; +} +EXPORT_SYMBOL_GPL(tracing_is_on); + +#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) +#define RB_ALIGNMENT 4U +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) +#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ + +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX + +enum { + RB_LEN_TIME_EXTEND = 8, + RB_LEN_TIME_STAMP = 16, +}; + +#define skip_time_extend(event) \ + ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND)) + +static inline int rb_null_event(struct ring_buffer_event *event) +{ + return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta; +} + +static void rb_event_set_padding(struct ring_buffer_event *event) +{ + /* padding has a NULL time_delta */ + event->type_len = RINGBUF_TYPE_PADDING; + event->time_delta = 0; +} + +static unsigned +rb_event_data_length(struct ring_buffer_event *event) +{ + unsigned length; + + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; + else + length = event->array[0]; + return length + RB_EVNT_HDR_SIZE; +} + +/* + * Return the length of the given event. Will return + * the length of the time extend if the event is a + * time extend. 
+ */ +static inline unsigned +rb_event_length(struct ring_buffer_event *event) +{ + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) + /* undefined */ + return -1; + return event->array[0] + RB_EVNT_HDR_SIZE; + + case RINGBUF_TYPE_TIME_EXTEND: + return RB_LEN_TIME_EXTEND; + + case RINGBUF_TYPE_TIME_STAMP: + return RB_LEN_TIME_STAMP; + + case RINGBUF_TYPE_DATA: + return rb_event_data_length(event); + default: + BUG(); + } + /* not hit */ + return 0; +} + +/* + * Return total length of time extend and data, + * or just the event length for all other events. + */ +static inline unsigned +rb_event_ts_length(struct ring_buffer_event *event) +{ + unsigned len = 0; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) { + /* time extends include the data event after it */ + len = RB_LEN_TIME_EXTEND; + event = skip_time_extend(event); + } + return len + rb_event_length(event); +} + +/** + * ring_buffer_event_length - return the length of the event + * @event: the event to get the length of + * + * Returns the size of the data load of a data event. + * If the event is something other than a data event, it + * returns the size of the event itself. With the exception + * of a TIME EXTEND, where it still returns the size of the + * data load of the data event after it. 
+ */ +unsigned ring_buffer_event_length(struct ring_buffer_event *event) +{ + unsigned length; + + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + + length = rb_event_length(event); + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; +} +EXPORT_SYMBOL_GPL(ring_buffer_event_length); + +/* inline for ring buffer fast paths */ +static void * +rb_event_data(struct ring_buffer_event *event) +{ + if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) + event = skip_time_extend(event); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); + /* If length is in len field, then array[0] has the data */ + if (event->type_len) + return (void *)&event->array[0]; + /* Otherwise length is in array[0] and array[1] has the data */ + return (void *)&event->array[1]; +} + +/** + * ring_buffer_event_data - return the data of the event + * @event: the event to get the data from + */ +void *ring_buffer_event_data(struct ring_buffer_event *event) +{ + return rb_event_data(event); +} +EXPORT_SYMBOL_GPL(ring_buffer_event_data); + +#define for_each_buffer_cpu(buffer, cpu) \ + for_each_cpu(cpu, buffer->cpumask) + +#define TS_SHIFT 27 +#define TS_MASK ((1ULL << TS_SHIFT) - 1) +#define TS_DELTA_TEST (~TS_MASK) + +/* Flag when events were overwritten */ +#define RB_MISSED_EVENTS (1 << 31) +/* Missed count stored at end */ +#define RB_MISSED_STORED (1 << 30) + +struct buffer_data_page { + u64 time_stamp; /* page time stamp */ + local_t commit; /* write committed index */ + unsigned char data[]; /* data of buffer page */ +}; + +/* + * Note, the buffer_page list must be first. The buffer pages + * are allocated in cache lines, which means that each buffer + * page will be at the beginning of a cache line, and thus + * the least significant bits will be zero. 
We use this to + * add flags in the list struct pointers, to make the ring buffer + * lockless. + */ +struct buffer_page { + struct list_head list; /* list of buffer pages (must stay the first member) */ + local_t write; /* index for next write */ + unsigned read; /* index for next read */ + local_t entries; /* entries on this page */ + unsigned long real_end; /* real end of data */ + struct buffer_data_page *page; /* Actual data page */ +}; + +/* + * The buffer page counters, write and entries, must be reset + * atomically when crossing page boundaries. To synchronize this + * update, two counters are inserted into the number. One is + * the actual counter for the write position or count on the page. + * + * The other is a counter of updaters. Before an update happens + * the update partition of the counter is incremented. This will + * allow the updater to update the counter atomically. + * + * The counter is the low 20 bits (RB_WRITE_MASK); the updater count + * starts at bit 20 (RB_WRITE_INTCNT). + */ +#define RB_WRITE_MASK 0xfffff +#define RB_WRITE_INTCNT (1 << 20) + +static void rb_init_page(struct buffer_data_page *bpage) +{ + local_set(&bpage->commit, 0); /* a fresh data page has nothing committed */ +} + +/** + * ring_buffer_page_len - the size of data on the page. + * @page: The page to read + * + * Returns the amount of data on the page, including buffer page header. + */ +size_t ring_buffer_page_len(void *page) +{ + return local_read(&((struct buffer_data_page *)page)->commit) + + BUF_PAGE_HDR_SIZE; +} + +/* + * Release the data page first, then the buffer_page descriptor itself. + * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing + * this issue out. + */ +static void free_buffer_page(struct buffer_page *bpage) +{ + free_page((unsigned long)bpage->page); + kfree(bpage); +} + +/* + * We need to fit the time_stamp delta into 27 bits. 
+ */ +static inline int test_time_stamp(u64 delta) +{ + if (delta & TS_DELTA_TEST) + return 1; /* delta has bits set above TS_MASK, so it does not fit */ + return 0; +} + +#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) + +/* Max payload is BUF_PAGE_SIZE minus an 8-byte header (two u32 words) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; /* overwritten by each trace_seq_printf(); only the last result is returned */ + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\tsigned:%u;\n", + (unsigned int)sizeof(field.time_stamp), + (unsigned int)is_signed_type(u64)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit), + (unsigned int)is_signed_type(long)); + + ret = trace_seq_printf(s, "\tfield: int overwrite;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + 1, /* reported size of the overwrite field, which reuses commit's offset */ + (unsigned int)is_signed_type(long)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\tsigned:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE, + (unsigned int)is_signed_type(char)); + + return ret; +} + +/* + * head_page == tail_page && head == tail then buffer is empty. 
+ */ +struct ring_buffer_per_cpu { + int cpu; + atomic_t record_disabled; + struct ring_buffer *buffer; + spinlock_t reader_lock; /* serialize readers */ + arch_spinlock_t lock; + struct lock_class_key lock_key; + struct list_head *pages; /* entry point into the headless circular page list */ + struct buffer_page *head_page; /* read from head */ + struct buffer_page *tail_page; /* write to tail */ + struct buffer_page *commit_page; /* committed pages */ + struct buffer_page *reader_page; /* page held by the reader, kept off the ring */ + unsigned long lost_events; + unsigned long last_overrun; + local_t commit_overrun; + local_t overrun; + local_t entries; + local_t committing; + local_t commits; + unsigned long read; + u64 write_stamp; + u64 read_stamp; +}; + +struct ring_buffer { + unsigned pages; /* data pages allocated for each cpu buffer */ + unsigned flags; + int cpus; + atomic_t record_disabled; + cpumask_var_t cpumask; + + struct lock_class_key *reader_lock_key; + + struct mutex mutex; + + struct ring_buffer_per_cpu **buffers; + +#ifdef CONFIG_HOTPLUG_CPU + struct notifier_block cpu_notify; +#endif + u64 (*clock)(void); /* time source called by rb_time_stamp() */ +}; + +struct ring_buffer_iter { + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long head; + struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; + u64 read_stamp; +}; + +/* + * b may be either a ring_buffer or a ring_buffer_per_cpu. When cond is + * true, record_disabled is incremented on the owning ring_buffer and + * WARN_ON(1) fires. Evaluates to the truth value of cond. + */ +#define RB_WARN_ON(b, cond) \ + ({ \ + int _____ret = unlikely(cond); \ + if (_____ret) { \ + if (__same_type(*(b), struct ring_buffer_per_cpu)) { \ + struct ring_buffer_per_cpu *__b = \ + (void *)b; \ + atomic_inc(&__b->buffer->record_disabled); \ + } else \ + atomic_inc(&b->record_disabled); \ + WARN_ON(1); \ + } \ + _____ret; \ + }) + +/* Up this if you want to test the TIME_EXTENDS and normalization */ +#define DEBUG_SHIFT 0 + +static inline u64 rb_time_stamp(struct ring_buffer *buffer) +{ + /* shift to debug/test normalization and TIME_EXTENDS */ + return buffer->clock() << DEBUG_SHIFT; +} + +u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) +{ + u64 time; /* note: @cpu is not used by this function */ + + preempt_disable_notrace(); + 
time = rb_time_stamp(buffer); + preempt_enable_no_resched_notrace(); + + return time; +} +EXPORT_SYMBOL_GPL(ring_buffer_time_stamp); + +void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, + int cpu, u64 *ts) +{ + /* Undo the DEBUG_SHIFT applied by rb_time_stamp(); only there to test normalization and deltas */ + *ts >>= DEBUG_SHIFT; +} +EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp); + +/* + * Making the ring buffer lockless makes things tricky. + * Although writes only happen on the CPU that they are on, + * and they only need to worry about interrupts. Reads can + * happen on any CPU. + * + * The reader page is always off the ring buffer, but when the + * reader finishes with a page, it needs to swap its page with + * a new one from the buffer. The reader needs to take from + * the head (writes go to the tail). But if a writer is in overwrite + * mode and wraps, it must push the head page forward. + * + * Here lies the problem. + * + * The reader must be careful to replace only the head page, and + * not another one. As described at the top of the file in the + * ASCII art, the reader sets its old page to point to the next + * page after head. It then sets the page after head to point to + * the old reader page. But if the writer moves the head page + * during this operation, the reader could end up with the tail. + * + * We use cmpxchg to help prevent this race. We also do something + * special with the page before head. We set the LSB to 1. + * + * When the writer must push the page forward, it will clear the + * bit that points to the head page, move the head, and then set + * the bit that points to the new head page. + * + * We also don't want an interrupt coming in and moving the head + * page on another writer. Thus we use the second LSB to catch + * that too. 
Thus: + * + * head->list->prev->next bit 1 bit 0 + * ------- ------- + * Normal page 0 0 + * Points to head page 0 1 + * New head page 1 0 + * + * Note we can not trust the prev pointer of the head page, because: + * + * +----+ +-----+ +-----+ + * | |------>| T |---X--->| N | + * | |<------| | | | + * +----+ +-----+ +-----+ + * ^ ^ | + * | +-----+ | | + * +----------| R |----------+ | + * | |<-----------+ + * +-----+ + * + * Key: ---X--> HEAD flag set in pointer + * T Tail page + * R Reader page + * N Next page + * + * (see __rb_reserve_next() to see where this happens) + * + * What the above shows is that the reader just swapped out + * the reader page with a page in the buffer, but before it + * could make the new header point back to the new page added + * it was preempted by a writer. The writer moved forward onto + * the new page added by the reader and is about to move forward + * again. + * + * You can see, it is legitimate for the previous pointer of + * the head (or any page) not to point back to itself. But only + * temporarily. + */ + +#define RB_PAGE_NORMAL 0UL +#define RB_PAGE_HEAD 1UL +#define RB_PAGE_UPDATE 2UL + + +#define RB_FLAG_MASK 3UL + +/* PAGE_MOVED is not part of the mask; it is only returned when a pointer no longer targets the expected page */ +#define RB_PAGE_MOVED 4UL + +/* + * rb_list_head - strip the flag bits (RB_FLAG_MASK) from a list pointer + */ +static struct list_head *rb_list_head(struct list_head *list) +{ + unsigned long val = (unsigned long)list; + + return (struct list_head *)(val & ~RB_FLAG_MASK); +} + +/* + * rb_is_head_page - test if the given page is the head page + * + * Because the reader may move the head_page pointer, we can + * not trust what the head page is (it may be pointing to + * the reader page). But if the next page is a header page, + * its flags will be non zero. 
+ */ +static inline int +rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *page, struct list_head *list) +{ + unsigned long val; + + val = (unsigned long)list->next; + + if ((val & ~RB_FLAG_MASK) != (unsigned long)&page->list) + return RB_PAGE_MOVED; /* list->next no longer points at page */ + + return val & RB_FLAG_MASK; +} + +/* + * rb_is_reader_page + * + * The unique thing about the reader page, is that, if the + * writer is ever on it, the previous pointer never points + * back to the reader page. + */ +static int rb_is_reader_page(struct buffer_page *page) +{ + struct list_head *list = page->list.prev; + + return rb_list_head(list->next) != &page->list; +} + +/* + * rb_set_list_to_head - flag list->next as pointing to the head page (sets HEAD, clears UPDATE). + */ +static void rb_set_list_to_head(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + unsigned long *ptr; + + ptr = (unsigned long *)&list->next; + *ptr |= RB_PAGE_HEAD; + *ptr &= ~RB_PAGE_UPDATE; +} + +/* + * rb_head_page_activate - sets up head page + */ +static void rb_head_page_activate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + + head = cpu_buffer->head_page; + if (!head) + return; + + /* + * Set the previous list pointer to have the HEAD flag. + */ + rb_set_list_to_head(cpu_buffer, head->list.prev); +} + +static void rb_list_head_clear(struct list_head *list) +{ + unsigned long *ptr = (unsigned long *)&list->next; + + *ptr &= ~RB_FLAG_MASK; /* drop the HEAD/UPDATE bits from list->next */ +} + +/* + * rb_head_page_deactivate - clears head page ptr (for free list) + */ +static void +rb_head_page_deactivate(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *hd; + + /* Go through the whole list and clear any pointers found. 
*/ + rb_list_head_clear(cpu_buffer->pages); + + list_for_each(hd, cpu_buffer->pages) + rb_list_head_clear(hd); +} + +static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag, int new_flag) +{ + struct list_head *list; + unsigned long val = (unsigned long)&head->list; + unsigned long ret; + + list = &prev->list; + + val &= ~RB_FLAG_MASK; + + ret = cmpxchg((unsigned long *)&list->next, + val | old_flag, val | new_flag); + + /* check if the reader took the page (prev->next no longer points at head) */ + if ((ret & ~RB_FLAG_MASK) != val) + return RB_PAGE_MOVED; + + return ret & RB_FLAG_MASK; /* flag bits found in prev->next by the cmpxchg */ +} + +static int rb_head_page_set_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_UPDATE); +} + +static int rb_head_page_set_head(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_HEAD); +} + +static int rb_head_page_set_normal(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *head, + struct buffer_page *prev, + int old_flag) +{ + return rb_head_page_set(cpu_buffer, head, prev, + old_flag, RB_PAGE_NORMAL); +} + +static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page **bpage) +{ + struct list_head *p = rb_list_head((*bpage)->list.next); + + *bpage = list_entry(p, struct buffer_page, list); +} + +static struct buffer_page * +rb_set_head_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *head; + struct buffer_page *page; + struct list_head *list; + int i; + + if (RB_WARN_ON(cpu_buffer, !cpu_buffer->head_page)) + return NULL; + + /* sanity check: the entry before cpu_buffer->pages must link back to it */ + list = cpu_buffer->pages; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev->next) != list)) + return NULL; + + page = head = cpu_buffer->head_page; + 
/* + * It is possible that the writer moves the header behind + * where we started, and we miss in one loop. + * A second loop should grab the header, but we'll do + * three loops just because I'm paranoid. + */ + for (i = 0; i < 3; i++) { + do { + if (rb_is_head_page(cpu_buffer, page, page->list.prev)) { + cpu_buffer->head_page = page; + return page; + } + rb_inc_page(cpu_buffer, &page); + } while (page != head); + } + + RB_WARN_ON(cpu_buffer, 1); + + return NULL; +} + +static int rb_head_page_replace(struct buffer_page *old, + struct buffer_page *new) +{ + unsigned long *ptr = (unsigned long *)&old->list.prev->next; + unsigned long val; + unsigned long ret; + + val = *ptr & ~RB_FLAG_MASK; + val |= RB_PAGE_HEAD; + + ret = cmpxchg(ptr, val, (unsigned long)&new->list); + + return ret == val; /* nonzero only if the predecessor still held the HEAD-flagged pointer */ +} + +/* + * rb_tail_page_update - move the tail page forward + * + * Returns 1 if moved tail page, 0 if someone else did. + */ +static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *tail_page, + struct buffer_page *next_page) +{ + struct buffer_page *old_tail; + unsigned long old_entries; + unsigned long old_write; + int ret = 0; + + /* + * The tail page now needs to be moved forward. + * + * We need to reset the tail page, but without messing + * with possible erasing of data brought in by interrupts + * that have moved the tail page and are currently on it. + * + * We add a counter to the write field to denote this. + */ + old_write = local_add_return(RB_WRITE_INTCNT, &next_page->write); + old_entries = local_add_return(RB_WRITE_INTCNT, &next_page->entries); + + /* + * Just make sure we have seen our old_write and synchronize + * with any interrupts that come in. + */ + barrier(); + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. 
+ */ + if (tail_page == cpu_buffer->tail_page) { + /* Zero the write counter, keeping only the updater count bits */ + unsigned long val = old_write & ~RB_WRITE_MASK; + unsigned long eval = old_entries & ~RB_WRITE_MASK; + + /* + * This will only succeed if an interrupt did + * not come in and change it. In which case, we + * do not want to modify it. + * + * We add (void) to let the compiler know that we do not care + * about the return value of these functions. We use the + * cmpxchg to only update if an interrupt did not already + * do it for us. If the cmpxchg fails, we don't care. + */ + (void)local_cmpxchg(&next_page->write, old_write, val); + (void)local_cmpxchg(&next_page->entries, old_entries, eval); + + /* + * No need to worry about races with clearing out the commit. + * it only can increment when a commit takes place. But that + * only happens in the outer most nested commit. + */ + local_set(&next_page->page->commit, 0); + + old_tail = cmpxchg(&cpu_buffer->tail_page, + tail_page, next_page); + + if (old_tail == tail_page) + ret = 1; + } + + return ret; +} + +static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, + struct buffer_page *bpage) +{ + unsigned long val = (unsigned long)bpage; /* the address itself must leave the flag bits clear */ + + if (RB_WARN_ON(cpu_buffer, val & RB_FLAG_MASK)) + return 1; + + return 0; +} + +/** + * rb_check_list - make sure a pointer to a list has the last bits zero (warns and returns 1 if prev or next carries a flag bit) + */ +static int rb_check_list(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *list) +{ + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->prev) != list->prev)) + return 1; + if (RB_WARN_ON(cpu_buffer, rb_list_head(list->next) != list->next)) + return 1; + return 0; +} + +/** + * check_pages - integrity check of buffer pages + * @cpu_buffer: CPU buffer with pages to test + * + * As a safety measure we check to make sure the data pages have not + * been corrupted. 
+ */ +static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = cpu_buffer->pages; + struct buffer_page *bpage, *tmp; + + rb_head_page_deactivate(cpu_buffer); /* clear the flag bits before comparing raw list pointers */ + + if (RB_WARN_ON(cpu_buffer, head->next->prev != head)) + return -1; + if (RB_WARN_ON(cpu_buffer, head->prev->next != head)) + return -1; + + if (rb_check_list(cpu_buffer, head)) + return -1; + + list_for_each_entry_safe(bpage, tmp, head, list) { + if (RB_WARN_ON(cpu_buffer, + bpage->list.next->prev != &bpage->list)) + return -1; + if (RB_WARN_ON(cpu_buffer, + bpage->list.prev->next != &bpage->list)) + return -1; + if (rb_check_list(cpu_buffer, &bpage->list)) + return -1; + } + + rb_head_page_activate(cpu_buffer); /* restore the HEAD flag; not reached on the error returns above */ + + return 0; +} + +static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + unsigned nr_pages) +{ + struct buffer_page *bpage, *tmp; + unsigned long addr; + LIST_HEAD(pages); + unsigned i; + + WARN_ON(!nr_pages); + + for (i = 0; i < nr_pages; i++) { + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu_buffer->cpu)); + if (!bpage) + goto free_pages; + + rb_check_bpage(cpu_buffer, bpage); + + list_add(&bpage->list, &pages); /* linked before its data page exists so free_pages also releases this descriptor */ + + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + bpage->page = (void *)addr; + rb_init_page(bpage->page); + } + + /* + * The ring buffer page list is a circular list that does not + * start and end with a list head. All page list items point to + * other pages. 
+ */ + cpu_buffer->pages = pages.next; + list_del(&pages); + + rb_check_pages(cpu_buffer); /* return value is ignored here */ + + return 0; + + free_pages: + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + return -ENOMEM; +} + +static struct ring_buffer_per_cpu * +rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *bpage; + unsigned long addr; + int ret; + + cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!cpu_buffer) + return NULL; + + cpu_buffer->cpu = cpu; + cpu_buffer->buffer = buffer; + spin_lock_init(&cpu_buffer->reader_lock); + lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); + cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + + bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!bpage) + goto fail_free_buffer; + + rb_check_bpage(cpu_buffer, bpage); + + cpu_buffer->reader_page = bpage; /* recorded before its data page exists so fail_free_reader can release it */ + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto fail_free_reader; + bpage->page = (void *)addr; + rb_init_page(bpage->page); + + INIT_LIST_HEAD(&cpu_buffer->reader_page->list); + + ret = rb_allocate_pages(cpu_buffer, buffer->pages); + if (ret < 0) + goto fail_free_reader; + + cpu_buffer->head_page + = list_entry(cpu_buffer->pages, struct buffer_page, list); + cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page; + + rb_head_page_activate(cpu_buffer); + + return cpu_buffer; + + fail_free_reader: + free_buffer_page(cpu_buffer->reader_page); + + fail_free_buffer: + kfree(cpu_buffer); + return NULL; +} + +static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct list_head *head = cpu_buffer->pages; + struct buffer_page *bpage, *tmp; + + free_buffer_page(cpu_buffer->reader_page); + + rb_head_page_deactivate(cpu_buffer); + + if (head) { /* NOTE(review): rb_head_page_deactivate() above already dereferenced cpu_buffer->pages, so this NULL check comes late */ + list_for_each_entry_safe(bpage, tmp, 
head, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + bpage = list_entry(head, struct buffer_page, list); /* head is itself embedded in a buffer_page; free it last */ + free_buffer_page(bpage); + } + + kfree(cpu_buffer); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); +#endif + +/** + * __ring_buffer_alloc - allocate a new ring_buffer + * @size: the size in bytes per cpu that is needed. + * @flags: attributes to set for the ring buffer. + * @key: lock class key applied to each per-cpu reader_lock. + * + * Currently the only flag that is available is the RB_FL_OVERWRITE + * flag. This flag means that the buffer will overwrite old data + * when the buffer wraps. If this flag is not set, the buffer will + * drop data when the tail hits the head. + * + * Returns the new buffer, or NULL if any allocation fails. + */ +struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, + struct lock_class_key *key) +{ + struct ring_buffer *buffer; + int bsize; + int cpu; + + /* keep it in its own cache line */ + buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()), + GFP_KERNEL); + if (!buffer) + return NULL; + + if (!alloc_cpumask_var(&buffer->cpumask, GFP_KERNEL)) + goto fail_free_buffer; + + buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + buffer->flags = flags; + buffer->clock = trace_clock_local; + buffer->reader_lock_key = key; + + /* need at least two pages */ + if (buffer->pages < 2) + buffer->pages = 2; + + /* + * In case of non-hotplug cpu, if the ring-buffer is allocated + * in early initcall, it will not be notified of secondary cpus. + * In that off case, we need to allocate for all possible cpus. 
+ */ +#ifdef CONFIG_HOTPLUG_CPU + get_online_cpus(); /* NOTE(review): the put_online_cpus() calls below sit outside this #ifdef */ + cpumask_copy(buffer->cpumask, cpu_online_mask); +#else + cpumask_copy(buffer->cpumask, cpu_possible_mask); +#endif + buffer->cpus = nr_cpu_ids; + + bsize = sizeof(void *) * nr_cpu_ids; + buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()), + GFP_KERNEL); + if (!buffer->buffers) + goto fail_free_cpumask; + + for_each_buffer_cpu(buffer, cpu) { + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) + goto fail_free_buffers; + } + +#ifdef CONFIG_HOTPLUG_CPU + buffer->cpu_notify.notifier_call = rb_cpu_notify; + buffer->cpu_notify.priority = 0; + register_cpu_notifier(&buffer->cpu_notify); +#endif + + put_online_cpus(); + mutex_init(&buffer->mutex); + + return buffer; + + fail_free_buffers: + for_each_buffer_cpu(buffer, cpu) { + if (buffer->buffers[cpu]) + rb_free_cpu_buffer(buffer->buffers[cpu]); + } + kfree(buffer->buffers); + + fail_free_cpumask: + free_cpumask_var(buffer->cpumask); + put_online_cpus(); + + fail_free_buffer: + kfree(buffer); + return NULL; +} +EXPORT_SYMBOL_GPL(__ring_buffer_alloc); + +/** + * ring_buffer_free - free a ring buffer. + * @buffer: the buffer to free. 
+ */ +void +ring_buffer_free(struct ring_buffer *buffer) +{ + int cpu; + + get_online_cpus(); + +#ifdef CONFIG_HOTPLUG_CPU + unregister_cpu_notifier(&buffer->cpu_notify); +#endif + + for_each_buffer_cpu(buffer, cpu) + rb_free_cpu_buffer(buffer->buffers[cpu]); + + put_online_cpus(); + + kfree(buffer->buffers); + free_cpumask_var(buffer->cpumask); + + kfree(buffer); +} +EXPORT_SYMBOL_GPL(ring_buffer_free); + +void ring_buffer_set_clock(struct ring_buffer *buffer, + u64 (*clock)(void)) +{ + buffer->clock = clock; /* callback invoked by rb_time_stamp() */ +} + +static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer); + +static void +rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) +{ + struct buffer_page *bpage; + struct list_head *p; + unsigned i; + + spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); /* clear the pointer flag bits before using the plain list helpers */ + + for (i = 0; i < nr_pages; i++) { + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) + goto out; + p = cpu_buffer->pages->next; /* always the page after cpu_buffer->pages, never that entry itself */ + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) + goto out; + + rb_reset_cpu(cpu_buffer); + rb_check_pages(cpu_buffer); + +out: + spin_unlock_irq(&cpu_buffer->reader_lock); +} + +static void +rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer, + struct list_head *pages, unsigned nr_pages) +{ + struct buffer_page *bpage; + struct list_head *p; + unsigned i; + + spin_lock_irq(&cpu_buffer->reader_lock); + rb_head_page_deactivate(cpu_buffer); /* clear the pointer flag bits before using the plain list helpers */ + + for (i = 0; i < nr_pages; i++) { + if (RB_WARN_ON(cpu_buffer, list_empty(pages))) + goto out; + p = pages->next; + bpage = list_entry(p, struct buffer_page, list); + list_del_init(&bpage->list); + list_add_tail(&bpage->list, cpu_buffer->pages); + } + rb_reset_cpu(cpu_buffer); + rb_check_pages(cpu_buffer); + +out: + spin_unlock_irq(&cpu_buffer->reader_lock); +} + +/** + * ring_buffer_resize - resize the ring buffer + * @buffer: the buffer to 
resize. + * @size: the new size. + * + * Minimum size is 2 * BUF_PAGE_SIZE. + * + * Returns the page-rounded size on success (@size unchanged if @buffer is + * NULL), -ENOMEM if a page allocation fails, and -1 if an internal + * consistency check fails. + */ +int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned nr_pages, rm_pages, new_pages; + struct buffer_page *bpage, *tmp; + unsigned long buffer_size; + unsigned long addr; + LIST_HEAD(pages); + int i, cpu; + + /* + * Always succeed at resizing a non-existent buffer: + */ + if (!buffer) + return size; + + size = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + size *= BUF_PAGE_SIZE; + buffer_size = buffer->pages * BUF_PAGE_SIZE; + + /* we need a minimum of two pages */ + if (size < BUF_PAGE_SIZE * 2) + size = BUF_PAGE_SIZE * 2; + + if (size == buffer_size) + return size; + + atomic_inc(&buffer->record_disabled); + + /* Make sure all writers are done with this buffer. */ + synchronize_sched(); + + mutex_lock(&buffer->mutex); + get_online_cpus(); + + nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE); + + if (size < buffer_size) { + + /* easy case, just free pages */ + if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) + goto out_fail; + + rm_pages = buffer->pages - nr_pages; + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_remove_pages(cpu_buffer, rm_pages); + } + goto out; + } + + /* + * This is a bit more difficult. We only want to add pages + * when we can allocate enough for all CPUs. We do this + * by allocating all the pages and storing them on a local + * link list. If we succeed in our allocation, then we + * add these pages to the cpu_buffers. 
Otherwise we just free + * them all and return -ENOMEM; + */ + if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) + goto out_fail; + + new_pages = nr_pages - buffer->pages; + + for_each_buffer_cpu(buffer, cpu) { + for (i = 0; i < new_pages; i++) { /* NOTE(review): signed i is compared against unsigned new_pages */ + bpage = kzalloc_node(ALIGN(sizeof(*bpage), + cache_line_size()), + GFP_KERNEL, cpu_to_node(cpu)); + if (!bpage) + goto free_pages; + list_add(&bpage->list, &pages); + addr = __get_free_page(GFP_KERNEL); + if (!addr) + goto free_pages; + bpage->page = (void *)addr; + rb_init_page(bpage->page); + } + } + + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + rb_insert_pages(cpu_buffer, &pages, new_pages); + } + + if (RB_WARN_ON(buffer, !list_empty(&pages))) + goto out_fail; + + out: + buffer->pages = nr_pages; + put_online_cpus(); + mutex_unlock(&buffer->mutex); + + atomic_dec(&buffer->record_disabled); + + return size; + + free_pages: + list_for_each_entry_safe(bpage, tmp, &pages, list) { + list_del_init(&bpage->list); + free_buffer_page(bpage); + } + put_online_cpus(); + mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); + return -ENOMEM; + + /* + * Something went totally wrong, and we are too paranoid + * to even clean up the mess. 
+ */ + out_fail: + put_online_cpus(); + mutex_unlock(&buffer->mutex); + atomic_dec(&buffer->record_disabled); + return -1; +} +EXPORT_SYMBOL_GPL(ring_buffer_resize); + +void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) +{ + mutex_lock(&buffer->mutex); /* RB_FL_OVERWRITE is toggled under buffer->mutex */ + if (val) + buffer->flags |= RB_FL_OVERWRITE; + else + buffer->flags &= ~RB_FL_OVERWRITE; + mutex_unlock(&buffer->mutex); +} +EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); + +static inline void * +__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) +{ + return bpage->data + index; +} + +static inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) +{ + return bpage->page->data + index; +} + +static inline struct ring_buffer_event * +rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer) +{ + return __rb_page_index(cpu_buffer->reader_page, + cpu_buffer->reader_page->read); +} + +static inline struct ring_buffer_event * +rb_iter_head_event(struct ring_buffer_iter *iter) +{ + return __rb_page_index(iter->head_page, iter->head); +} + +static inline unsigned long rb_page_write(struct buffer_page *bpage) +{ + return local_read(&bpage->write) & RB_WRITE_MASK; /* mask off the updater count bits */ +} + +static inline unsigned rb_page_commit(struct buffer_page *bpage) +{ + return local_read(&bpage->page->commit); +} + +static inline unsigned long rb_page_entries(struct buffer_page *bpage) +{ + return local_read(&bpage->entries) & RB_WRITE_MASK; /* mask off the updater count bits */ +} + +/* Size is determined by what has been committed */ +static inline unsigned rb_page_size(struct buffer_page *bpage) +{ + return rb_page_commit(bpage); +} + +static inline unsigned +rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer) +{ + return rb_page_commit(cpu_buffer->commit_page); +} + +static inline unsigned +rb_event_index(struct ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + + return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; /* offset of the event inside its page, not counting the page header */ +} + +static inline int +rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, + struct 
ring_buffer_event *event) +{ + unsigned long addr = (unsigned long)event; + unsigned long index; + + index = rb_event_index(event); + addr &= PAGE_MASK; + + return cpu_buffer->commit_page->page == (void *)addr && + rb_commit_index(cpu_buffer) == index; +} + +static void +rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) +{ + unsigned long max_count; + + /* + * We only race with interrupts and NMIs on this CPU. + * If we own the commit event, then we can commit + * all others that interrupted us, since the interruptions + * are in stack format (they finish before they come + * back to us). This allows us to do a simple loop to + * assign the commit to the tail. + */ + again: + max_count = cpu_buffer->buffer->pages * 100; /* loop bound: warn and bail out below instead of spinning forever */ + + while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + if (RB_WARN_ON(cpu_buffer, !(--max_count))) + return; + if (RB_WARN_ON(cpu_buffer, + rb_is_reader_page(cpu_buffer->tail_page))) + return; + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; + /* add barrier to keep gcc from optimizing too much */ + barrier(); + } + while (rb_commit_index(cpu_buffer) != + rb_page_write(cpu_buffer->commit_page)) { + + local_set(&cpu_buffer->commit_page->page->commit, + rb_page_write(cpu_buffer->commit_page)); + RB_WARN_ON(cpu_buffer, + local_read(&cpu_buffer->commit_page->page->commit) & + ~RB_WRITE_MASK); + barrier(); + } + + /* again, keep gcc from optimizing */ + barrier(); + + /* + * If an interrupt came in just after the first while loop + * and pushed the tail page forward, we will be left with + * a dangling commit that will never go forward. 
+ */ + if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + goto again; +} + +static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; + cpu_buffer->reader_page->read = 0; +} + +static void rb_inc_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* + * The iterator could be on the reader page (it starts there). + * But the head could have moved, since the reader was + * found. Check for this case and assign the iterator + * to the head page instead of next. + */ + if (iter->head_page == cpu_buffer->reader_page) + iter->head_page = rb_set_head_page(cpu_buffer); /* NOTE(review): rb_set_head_page() can return NULL; the result is dereferenced below */ + else + rb_inc_page(cpu_buffer, &iter->head_page); + + iter->read_stamp = iter->head_page->page->time_stamp; + iter->head = 0; +} + +/* + * Slow path, do not inline. Marks @event as a TIME_EXTEND carrying @delta + * (low TS_SHIFT bits in time_delta, the rest in array[0]) and returns + * skip_time_extend(event), presumably the slot right after the extend. + */ +static noinline struct ring_buffer_event * +rb_add_time_stamp(struct ring_buffer_event *event, u64 delta) +{ + event->type_len = RINGBUF_TYPE_TIME_EXTEND; + + /* Not the first event on the page? */ + if (rb_event_index(event)) { + event->time_delta = delta & TS_MASK; + event->array[0] = delta >> TS_SHIFT; + } else { + /* first event on the page: store a zero delta */ + event->time_delta = 0; + event->array[0] = 0; + } + + return skip_time_extend(event); +} + +/** + * ring_buffer_update_event - update event type and data + * @event: the even to update + * @type: the type of event + * @length: the size of the event field in the ring buffer + * + * Update the type and data fields of the event. The length + * is the actual size that is written to the ring buffer, + * and with this, we can determine what to place into the + * data field. 
+ */
+static void
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+		struct ring_buffer_event *event, unsigned length,
+		int add_timestamp, u64 delta)
+{
+	/* Only a commit updates the timestamp */
+	if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+		delta = 0;
+
+	/*
+	 * If we need to add a timestamp, then we
+	 * add it to the start of the reserved space.
+	 */
+	if (unlikely(add_timestamp)) {
+		event = rb_add_time_stamp(event, delta);
+		length -= RB_LEN_TIME_EXTEND;
+		delta = 0;
+	}
+
+	event->time_delta = delta;
+	length -= RB_EVNT_HDR_SIZE;
+	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+		event->type_len = 0;	/* length is kept in array[0] instead */
+		event->array[0] = length;
+	} else
+		event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+}
+
+/*
+ * rb_handle_head_page - writer hit the head page
+ * @cpu_buffer: the per CPU buffer being written
+ * @tail_page: the page the writer is moving off
+ * @next_page: the page following @tail_page, which the caller found to
+ *             be the head page
+ *
+ * Moves the head flag from @next_page on to the page after it.  When
+ * this call is the one that switched the flag from HEAD to UPDATE, the
+ * entries of @next_page are added to cpu_buffer->overrun.
+ *
+ * Returns: +1 to retry page
+ *           0 to continue
+ *          -1 on error
+ */
+static int
+rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
+		    struct buffer_page *tail_page,
+		    struct buffer_page *next_page)
+{
+	struct buffer_page *new_head;
+	int entries;
+	int type;
+	int ret;
+
+	entries = rb_page_entries(next_page);
+
+	/*
+	 * The hard part is here. We need to move the head
+	 * forward, and protect against both readers on
+	 * other CPUs and writers coming in via interrupts.
+	 */
+	type = rb_head_page_set_update(cpu_buffer, next_page, tail_page,
+				       RB_PAGE_HEAD);
+
+	/*
+	 * type can be one of four:
+	 *  NORMAL - an interrupt already moved it for us
+	 *  HEAD   - we are the first to get here.
+	 *  UPDATE - we are the interrupt interrupting
+	 *           a current move.
+	 *  MOVED  - a reader on another CPU moved the next
+	 *           pointer to its reader page. Give up
+	 *           and try again.
+	 */
+
+	switch (type) {
+	case RB_PAGE_HEAD:
+		/*
+		 * We changed the head to UPDATE, thus
+		 * it is our responsibility to update
+		 * the counters.
+		 */
+		local_add(entries, &cpu_buffer->overrun);
+
+		/*
+		 * The entries will be zeroed out when we move the
+		 * tail page.
+		 */
+
+		/* still more to do */
+		break;
+
+	case RB_PAGE_UPDATE:
+		/*
+		 * This is an interrupt that interrupted the
+		 * previous update. Still more to do.
+		 */
+		break;
+	case RB_PAGE_NORMAL:
+		/*
+		 * An interrupt came in before the update
+		 * and processed this for us.
+		 * Nothing left to do.
+		 */
+		return 1;
+	case RB_PAGE_MOVED:
+		/*
+		 * The reader is on another CPU and just did
+		 * a swap with our next_page.
+		 * Try again.
+		 */
+		return 1;
+	default:
+		RB_WARN_ON(cpu_buffer, 1); /* WTF??? */
+		return -1;
+	}
+
+	/*
+	 * Now that we are here, the old head pointer is
+	 * set to UPDATE. This will keep the reader from
+	 * swapping the head page with the reader page.
+	 * The reader (on another CPU) will spin till
+	 * we are finished.
+	 *
+	 * We just need to protect against interrupts
+	 * doing the job. We will set the next pointer
+	 * to HEAD. After that, we set the old pointer
+	 * to NORMAL, but only if it was HEAD before.
+	 * otherwise we are an interrupt, and only
+	 * want the outer most commit to reset it.
+	 */
+	new_head = next_page;
+	rb_inc_page(cpu_buffer, &new_head);
+
+	ret = rb_head_page_set_head(cpu_buffer, new_head, next_page,
+				    RB_PAGE_NORMAL);
+
+	/*
+	 * Valid returns are:
+	 *  HEAD   - an interrupt came in and already set it.
+	 *  NORMAL - One of two things:
+	 *            1) We really set it.
+	 *            2) A bunch of interrupts came in and moved
+	 *               the page forward again.
+	 */
+	switch (ret) {
+	case RB_PAGE_HEAD:
+	case RB_PAGE_NORMAL:
+		/* OK */
+		break;
+	default:
+		RB_WARN_ON(cpu_buffer, 1);
+		return -1;
+	}
+
+	/*
+	 * It is possible that an interrupt came in,
+	 * set the head up, then more interrupts came in
+	 * and moved it again. When we get back here,
+	 * the page would have been set to NORMAL but we
+	 * just set it back to HEAD.
+	 *
+	 * How do you detect this? Well, if that happened
+	 * the tail page would have moved.
+	 */
+	if (ret == RB_PAGE_NORMAL) {
+		/*
+		 * If the tail had moved past next, then we need
+		 * to reset the pointer.
+		 */
+		if (cpu_buffer->tail_page != tail_page &&
+		    cpu_buffer->tail_page != next_page)
+			rb_head_page_set_normal(cpu_buffer, new_head,
+						next_page,
+						RB_PAGE_HEAD);
+	}
+
+	/*
+	 * If this was the outer most commit (the one that
+	 * changed the original pointer from HEAD to UPDATE),
+	 * then it is up to us to reset it to NORMAL.
+	 */
+	if (type == RB_PAGE_HEAD) {
+		ret = rb_head_page_set_normal(cpu_buffer, next_page,
+					      tail_page,
+					      RB_PAGE_UPDATE);
+		if (RB_WARN_ON(cpu_buffer,
+			       ret != RB_PAGE_UPDATE))
+			return -1;
+	}
+
+	return 0;
+}
+/*
+ * rb_calculate_event_length - size to reserve for @length bytes of data
+ *
+ * A zero @length is treated as 1.  Room for array[0] is added when the
+ * data is larger than RB_MAX_SMALL_DATA or RB_FORCE_8BYTE_ALIGNMENT is
+ * set, then RB_EVNT_HDR_SIZE is added and the total is rounded up to
+ * RB_ARCH_ALIGNMENT.
+ */
+static unsigned rb_calculate_event_length(unsigned length)
+{
+	struct ring_buffer_event event; /* Used only for sizeof array */
+
+	/* zero length can cause confusions */
+	if (!length)
+		length = 1;
+
+	if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
+		length += sizeof(event.array[0]);
+
+	length += RB_EVNT_HDR_SIZE;
+	length = ALIGN(length, RB_ARCH_ALIGNMENT);
+
+	return length;
+}
+/*
+ * rb_reset_tail - undo a reservation that ran past the end of a page
+ * @cpu_buffer: the per CPU buffer being written
+ * @tail_page: the page the reservation was made on
+ * @tail: write index of @tail_page before @length was added
+ * @length: number of bytes that were added to the write counter
+ *
+ * If @tail is already at or beyond BUF_PAGE_SIZE, @length is simply
+ * subtracted again.  Otherwise real_end is set to @tail and the unused
+ * rest of the page is marked as padding: with rb_event_set_padding()
+ * when less than RB_EVNT_MIN_SIZE is left, or with a padding event
+ * covering the remainder, in which case only the part of @length that
+ * lies beyond BUF_PAGE_SIZE is subtracted.
+ */
+static inline void
+rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
+	      struct buffer_page *tail_page,
+	      unsigned long tail, unsigned long length)
+{
+	struct ring_buffer_event *event;
+
+	/*
+	 * Only the event that crossed the page boundary
+	 * must fill the old tail_page with padding.
+	 */
+	if (tail >= BUF_PAGE_SIZE) {
+		/*
+		 * If the page was filled, then we still need
+		 * to update the real_end. Reset it to zero
+		 * and the reader will ignore it.
+		 */
+		if (tail == BUF_PAGE_SIZE)
+			tail_page->real_end = 0;
+
+		local_sub(length, &tail_page->write);
+		return;
+	}
+
+	event = __rb_page_index(tail_page, tail);
+	kmemcheck_annotate_bitfield(event, bitfield);
+
+	/*
+	 * Save the original length to the meta data.
+	 * This will be used by the reader to add lost event
+	 * counter.
+	 */
+	tail_page->real_end = tail;
+
+	/*
+	 * If this event is bigger than the minimum size, then
+	 * we need to be careful that we don't subtract the
+	 * write counter enough to allow another writer to slip
+	 * in on this page.
+	 * We put in a discarded commit instead, to make sure
+	 * that this space is not used again.
+	 *
+	 * If we are less than the minimum size, we don't need to
+	 * worry about it.
+	 */
+	if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+		/* No room for any events */
+
+		/* Mark the rest of the page with padding */
+		rb_event_set_padding(event);
+
+		/* Set the write back to the previous setting */
+		local_sub(length, &tail_page->write);
+		return;
+	}
+
+	/* Put in a discarded event */
+	event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+	event->type_len = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	event->time_delta = 1;
+
+	/* Set write to end of buffer */
+	length = (tail + length) - BUF_PAGE_SIZE;
+	local_sub(length, &tail_page->write);
+}
+
+/*
+ * rb_move_tail - the reservation ran off the end of @tail_page
+ *
+ * Tries to move the tail on to the next page.  When the next page is
+ * the head page and the commit is not on the reader page, the head is
+ * pushed forward with rb_handle_head_page(), but only if
+ * RB_FL_OVERWRITE is set.  Every path finishes by calling
+ * rb_reset_tail() on @tail_page.
+ *
+ * Returns ERR_PTR(-EAGAIN) when the caller should retry the
+ * reservation, or NULL when the write is dropped.
+ *
+ * This is the slow path, force gcc not to inline it.
+ */
+static noinline struct ring_buffer_event *
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+	     unsigned long length, unsigned long tail,
+	     struct buffer_page *tail_page, u64 ts)
+{
+	struct buffer_page *commit_page = cpu_buffer->commit_page;
+	struct ring_buffer *buffer = cpu_buffer->buffer;
+	struct buffer_page *next_page;
+	int ret;
+
+	next_page = tail_page;
+
+	rb_inc_page(cpu_buffer, &next_page);
+
+	/*
+	 * If for some reason, we had an interrupt storm that made
+	 * it all the way around the buffer, bail, and warn
+	 * about it.
+	 */
+	if (unlikely(next_page == commit_page)) {
+		local_inc(&cpu_buffer->commit_overrun);
+		goto out_reset;
+	}
+
+	/*
+	 * This is where the fun begins!
+	 *
+	 * We are fighting against races between a reader that
+	 * could be on another CPU trying to swap its reader
+	 * page with the buffer head.
+	 *
+	 * We are also fighting against interrupts coming in and
+	 * moving the head or tail on us as well.
+	 *
+	 * If the next page is the head page then we have filled
+	 * the buffer, unless the commit page is still on the
+	 * reader page.
+	 */
+	if (rb_is_head_page(cpu_buffer, next_page, &tail_page->list)) {
+
+		/*
+		 * If the commit is not on the reader page, then
+		 * move the header page.
+		 */
+		if (!rb_is_reader_page(cpu_buffer->commit_page)) {
+			/*
+			 * If we are not in overwrite mode,
+			 * this is easy, just stop here.
+			 */
+			if (!(buffer->flags & RB_FL_OVERWRITE))
+				goto out_reset;
+
+			ret = rb_handle_head_page(cpu_buffer,
+						  tail_page,
+						  next_page);
+			if (ret < 0)
+				goto out_reset;
+			if (ret)
+				goto out_again;
+		} else {
+			/*
+			 * We need to be careful here too. The
+			 * commit page could still be on the reader
+			 * page. We could have a small buffer, and
+			 * have filled up the buffer with events
+			 * from interrupts and such, and wrapped.
+			 *
+			 * Note, if the tail page is also on the
+			 * reader_page, we let it move out.
+			 */
+			if (unlikely((cpu_buffer->commit_page !=
+				      cpu_buffer->tail_page) &&
+				     (cpu_buffer->commit_page ==
+				      cpu_buffer->reader_page))) {
+				local_inc(&cpu_buffer->commit_overrun);
+				goto out_reset;
+			}
+		}
+	}
+
+	ret = rb_tail_page_update(cpu_buffer, tail_page, next_page);
+	if (ret) {
+		/*
+		 * Nested commits always have zero deltas, so
+		 * just reread the time stamp
+		 */
+		ts = rb_time_stamp(buffer);
+		next_page->page->time_stamp = ts;
+	}
+
+ out_again:
+
+	rb_reset_tail(cpu_buffer, tail_page, tail, length);
+
+	/* fail and let the caller try again */
+	return ERR_PTR(-EAGAIN);
+
+ out_reset:
+	/* reset write */
+	rb_reset_tail(cpu_buffer, tail_page, tail, length);
+
+	return NULL;
+}
+/*
+ * __rb_reserve_next - claim @length bytes on the current tail page
+ *
+ * The space is claimed by adding to the page's write counter (plus
+ * RB_LEN_TIME_EXTEND when @add_timestamp is set).  If the new write
+ * index is beyond BUF_PAGE_SIZE the work is handed to rb_move_tail()
+ * and its result is returned.  Otherwise the event is set up with
+ * rb_update_event(), the page's entry count is incremented and, for
+ * the event at index 0 of a page, the page time stamp is set to @ts.
+ */
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+		  unsigned long length, u64 ts,
+		  u64 delta, int add_timestamp)
+{
+	struct buffer_page *tail_page;
+	struct ring_buffer_event *event;
+	unsigned long tail, write;
+
+	/*
+	 * If the time delta since the last event is too big to
+	 * hold in the time field of the event, then we append a
+	 * TIME EXTEND event ahead of the data event.
+	 */
+	if (unlikely(add_timestamp))
+		length += RB_LEN_TIME_EXTEND;
+
+	tail_page = cpu_buffer->tail_page;
+	write = local_add_return(length, &tail_page->write);
+
+	/* set write to only the index of the write */
+	write &= RB_WRITE_MASK;
+	tail = write - length;
+
+	/* See if we shot past the end of this buffer page */
+	if (unlikely(write > BUF_PAGE_SIZE))
+		return rb_move_tail(cpu_buffer, length, tail,
+				    tail_page, ts);
+
+	/* We reserved something on the buffer */
+
+	event = __rb_page_index(tail_page, tail);
+	kmemcheck_annotate_bitfield(event, bitfield);
+	rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
+
+	local_inc(&tail_page->entries);
+
+	/*
+	 * If this is the first commit on the page, then update
+	 * its timestamp.
+	 */
+	if (!tail)
+		tail_page->page->time_stamp = ts;
+
+	return event;
+}
+/*
+ * rb_try_to_discard - try to hand an event's space back to its page
+ *
+ * This is only attempted when @event lies on the tail page and the
+ * page's write index equals the end of the event.  The write counter is
+ * then moved back to the start of the event with a cmpxchg.
+ *
+ * Returns 1 if the space was released, 0 if it could not be.
+ */
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+		  struct ring_buffer_event *event)
+{
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_ts_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+		unsigned long write_mask =
+			local_read(&bpage->write) & ~RB_WRITE_MASK;
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		old_index += write_mask;
+		new_index += write_mask;
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index)
+			return 1;
+	}
+
+	/* could not discard */
+	return 0;
+}
+/*
+ * rb_start_commit - open a commit: bump both committing and commits
+ */
+static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	local_inc(&cpu_buffer->committing);
+	local_inc(&cpu_buffer->commits);
+}
+/*
+ * rb_end_commit - close a commit opened with rb_start_commit()
+ *
+ * Warns and returns if committing is already zero.  When committing is
+ * exactly 1 the commit position is advanced with
+ * rb_set_commit_to_write() before committing is decremented.  If
+ * commits changed while that was going on and committing has reached
+ * zero, committing is raised again and the sequence is repeated.
+ */
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	unsigned long commits;
+
+	if (RB_WARN_ON(cpu_buffer,
+		       !local_read(&cpu_buffer->committing)))
+		return;
+
+ again:
+	commits = local_read(&cpu_buffer->commits);
+	/* synchronize with interrupts */
+	barrier();
+	if (local_read(&cpu_buffer->committing) == 1)
+		rb_set_commit_to_write(cpu_buffer);
+
+	local_dec(&cpu_buffer->committing);
+
+	/* synchronize with interrupts */
+	barrier();
+
+	/*
+	 * Need to account for interrupts coming in between the
+	 * updating of the commit page and the clearing of the
+	 * committing counter.
+	 */
+	if (unlikely(local_read(&cpu_buffer->commits) != commits) &&
+	    !local_read(&cpu_buffer->committing)) {
+		local_inc(&cpu_buffer->committing);
+		goto again;
+	}
+}
+/*
+ * rb_reserve_next_event - reserve room for an event with @length data bytes
+ *
+ * Opens a commit with rb_start_commit(), converts @length into the
+ * full event size and computes the delta between the current time stamp
+ * and write_stamp; a time extend is requested when test_time_stamp()
+ * flags that delta.  The reservation is repeated for as long as
+ * __rb_reserve_next() reports -EAGAIN, with a warning and a bail out
+ * after 1000 rounds.
+ *
+ * Returns the reserved event, leaving the commit open, or NULL with the
+ * commit closed again.
+ */
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer *buffer,
+		      struct ring_buffer_per_cpu *cpu_buffer,
+		      unsigned long length)
+{
+	struct ring_buffer_event *event;
+	u64 ts, delta;
+	int nr_loops = 0;
+	int add_timestamp;
+	u64 diff;
+
+	rb_start_commit(cpu_buffer);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+	/*
+	 * Due to the ability to swap a cpu buffer from a buffer
+	 * it is possible it was swapped before we committed.
+	 * (committing stops a swap). We check for it here and
+	 * if it happened, we have to fail the write.
+	 */
+	barrier();
+	if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+		local_dec(&cpu_buffer->committing);
+		local_dec(&cpu_buffer->commits);
+		return NULL;
+	}
+#endif
+
+	length = rb_calculate_event_length(length);
+ again:
+	add_timestamp = 0;
+	delta = 0;
+
+	/*
+	 * We allow for interrupts to reenter here and do a trace.
+	 * If one does, it will cause this original code to loop
+	 * back here. Even with heavy interrupts happening, this
+	 * should only happen a few times in a row. If this happens
+	 * 1000 times in a row, there must be either an interrupt
+	 * storm or we have something buggy.
+	 * Bail!
+	 */
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
+		goto out_fail;
+
+	ts = rb_time_stamp(cpu_buffer->buffer);
+	diff = ts - cpu_buffer->write_stamp;
+
+	/* make sure this diff is calculated here */
+	barrier();
+
+	/* Did the write stamp get updated already? */
+	if (likely(ts >= cpu_buffer->write_stamp)) {
+		delta = diff;
+		if (unlikely(test_time_stamp(delta))) {
+			int local_clock_stable = 1;
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+			local_clock_stable = sched_clock_stable;
+#endif
+			WARN_ONCE(delta > (1ULL << 59),
+				  KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+				  (unsigned long long)delta,
+				  (unsigned long long)ts,
+				  (unsigned long long)cpu_buffer->write_stamp,
+				  local_clock_stable ? "" :
+				  "If you just came from a suspend/resume,\n"
+				  "please switch to the trace global clock:\n"
+				  " echo global > /sys/kernel/debug/tracing/trace_clock\n");
+			add_timestamp = 1;
+		}
+	}
+
+	event = __rb_reserve_next(cpu_buffer, length, ts,
+				  delta, add_timestamp);
+	if (unlikely(PTR_ERR(event) == -EAGAIN))
+		goto again;
+
+	if (!event)
+		goto out_fail;
+
+	return event;
+
+ out_fail:
+	rb_end_commit(cpu_buffer);
+	return NULL;
+}
+
+#ifdef CONFIG_TRACING
+
+#define TRACE_RECURSIVE_DEPTH 16
+
+/*
+ * trace_recursive_fail - report that the recursion depth limit was hit
+ *
+ * Switches tracing off with tracing_off_permanent(), prints the
+ * recursion depth together with the hardirq, softirq and NMI state
+ * once, and raises a one-time warning.
+ *
+ * Keep this code out of the fast path cache
+ */
+static noinline void trace_recursive_fail(void)
+{
+	/* Disable all tracing before we do anything else */
+	tracing_off_permanent();
+
+	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
+		    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+		    trace_recursion_buffer(),
+		    hardirq_count() >> HARDIRQ_SHIFT,
+		    softirq_count() >> SOFTIRQ_SHIFT,
+		    in_nmi());
+
+	WARN_ON_ONCE(1);
+}
+/*
+ * trace_recursive_lock - count one more level of nesting
+ *
+ * Returns 0 while the depth stays below TRACE_RECURSIVE_DEPTH.
+ * Otherwise calls trace_recursive_fail() and returns -1; the count is
+ * not taken back down on that path.
+ */
+static inline int trace_recursive_lock(void)
+{
+	trace_recursion_inc();
+
+	if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
+		return 0;
+
+	trace_recursive_fail();
+
+	return -1;
+}
+/*
+ * trace_recursive_unlock - drop one level of nesting
+ *
+ * Warns once if the depth is already zero.
+ */
+static inline void trace_recursive_unlock(void)
+{
+	WARN_ON_ONCE(!trace_recursion_buffer());
+
+	trace_recursion_dec();
+}
+
+#else
+
+#define trace_recursive_lock()		(0)
+#define trace_recursive_unlock()	do { } while (0)
+
+#endif
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event;
+	int cpu;
+
+	if (ring_buffer_flags != RB_BUFFERS_ON)
+		return NULL;
+
+	/* If we are tracing schedule, we don't want to recurse */
+	preempt_disable_notrace();
+
+	if (atomic_read(&buffer->record_disabled))
+		goto out_nocheck;
+
+	if (trace_recursive_lock())
+		goto out_nocheck;	/* skips trace_recursive_unlock() */
+
+	cpu = raw_smp_processor_id();
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		goto out;
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	if (atomic_read(&cpu_buffer->record_disabled))
+		goto out;
+
+	if (length > BUF_MAX_DATA_SIZE)
+		goto out;
+
+	event = rb_reserve_next_event(buffer, cpu_buffer, length);
+	if (!event)
+		goto out;
+
+	/* success: preemption stays disabled and the recursion count held */
+	return event;
+
+ out:
+	trace_recursive_unlock();
+
+ out_nocheck:
+	preempt_enable_notrace();
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
+/*
+ * rb_update_write_stamp - fold a committed event into write_stamp
+ *
+ * Does nothing unless rb_event_is_commit() is true for @event.  For an
+ * event at index 0 of its page, write_stamp is set to the commit page's
+ * time stamp.  For a TIME_EXTEND event the full delta (array[0] shifted
+ * by TS_SHIFT, plus time_delta) is added.  Otherwise time_delta is
+ * added.
+ */
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+		      struct ring_buffer_event *event)
+{
+	u64 delta;
+
+	/*
+	 * The event first in the commit queue updates the
+	 * time stamp.
+	 */
+	if (rb_event_is_commit(cpu_buffer, event)) {
+		/*
+		 * A commit event that is first on a page
+		 * updates the write timestamp with the page stamp
+		 */
+		if (!rb_event_index(event))
+			cpu_buffer->write_stamp =
+				cpu_buffer->commit_page->page->time_stamp;
+		else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+			delta = event->array[0];
+			delta <<= TS_SHIFT;
+			delta += event->time_delta;
+			cpu_buffer->write_stamp += delta;
+		} else
+			cpu_buffer->write_stamp += event->time_delta;
+	}
+}
+/*
+ * rb_commit - account for @event and close its commit
+ *
+ * Increments the per CPU entries counter, updates write_stamp through
+ * rb_update_write_stamp() and then calls rb_end_commit().
+ */
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+		      struct ring_buffer_event *event)
+{
+	local_inc(&cpu_buffer->entries);
+	rb_update_write_stamp(cpu_buffer, event);
+	rb_end_commit(cpu_buffer);
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved event
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ * The per CPU buffer is looked up with raw_smp_processor_id(); after the
+ * commit the recursion count is dropped and preemption is re-enabled.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ *
+ * Always returns 0.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+			      struct ring_buffer_event *event)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu = raw_smp_processor_id();
+
+	cpu_buffer = buffer->buffers[cpu];
+
+	rb_commit(cpu_buffer, event);
+
+	trace_recursive_unlock();
+
+	preempt_enable_notrace();
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+/*
+ * rb_event_discard - rewrite @event as a padding event
+ *
+ * If @event is a TIME_EXTEND, the event returned by skip_time_extend()
+ * is the one that gets rewritten.  Its data length (less
+ * RB_EVNT_HDR_SIZE) is saved in array[0], the type becomes
+ * RINGBUF_TYPE_PADDING and a zero time_delta is replaced by 1.
+ */
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+	if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+		event = skip_time_extend(event);
+
+	/* array[0] holds the actual length for the discarded event */
+	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+	event->type_len = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+
+/*
+ * Decrement the entries to the page that an event is on.
+ * The event does not even need to exist, only the pointer
+ * to the page it is on. This may only be called before the commit
+ * takes place.
+ */
+static inline void
+rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
+		   struct ring_buffer_event *event)
+{
+	unsigned long addr = (unsigned long)event;
+	struct buffer_page *bpage = cpu_buffer->commit_page;
+	struct buffer_page *start;
+
+	addr &= PAGE_MASK;	/* address of the page that holds the event */
+
+	/* Do the likely case first */
+	if (likely(bpage->page == (void *)addr)) {
+		local_dec(&bpage->entries);
+		return;
+	}
+
+	/*
+	 * Because the commit page may be on the reader page we
+	 * start with the next page and check the end loop there.
+	 */
+	rb_inc_page(cpu_buffer, &bpage);
+	start = bpage;
+	do {
+		if (bpage->page == (void *)addr) {
+			local_dec(&bpage->entries);
+			return;
+		}
+		rb_inc_page(cpu_buffer, &bpage);
+	} while (bpage != start);
+
+	/* commit not part of this buffer?? */
+	RB_WARN_ON(cpu_buffer, 1);
+}
+
+/**
+ * ring_buffer_discard_commit - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * Sometimes an event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * This function only works if it is called before the item has been
+ * committed. It will try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * Like ring_buffer_unlock_commit, this ends the commit, drops the
+ * recursion count and re-enables preemption.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+				struct ring_buffer_event *event)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	int cpu;
+
+	/* The event is discarded regardless */
+	rb_event_discard(event);
+
+	cpu = smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+
+	/*
+	 * This must only be called if the event has not been
+	 * committed yet. Thus we can assume that preemption
+	 * is still disabled.
+	 */
+	RB_WARN_ON(buffer, !local_read(&cpu_buffer->committing));
+
+	rb_decrement_entry(cpu_buffer, event);
+	if (rb_try_to_discard(cpu_buffer, event))
+		goto out;	/* space was given back, no stamp update */
+
+	/*
+	 * The commit is still visible by the reader, so we
+	 * must still update the timestamp.
+	 */
+	rb_update_write_stamp(cpu_buffer, event);
+ out:
+	rb_end_commit(cpu_buffer);
+
+	trace_recursive_unlock();
+
+	preempt_enable_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */ +int ring_buffer_write(struct ring_buffer *buffer, + unsigned long length, + void *data) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + void *body; + int ret = -EBUSY; + int cpu; + + if (ring_buffer_flags != RB_BUFFERS_ON) + return -EBUSY; + + preempt_disable_notrace(); + + if (atomic_read(&buffer->record_disabled)) + goto out; + + cpu = raw_smp_processor_id(); + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + cpu_buffer = buffer->buffers[cpu]; + + if (atomic_read(&cpu_buffer->record_disabled)) + goto out; + + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(buffer, cpu_buffer, length); + if (!event) + goto out; + + body = rb_event_data(event); + + memcpy(body, data, length); + + rb_commit(cpu_buffer, event); + + ret = 0; + out: + preempt_enable_notrace(); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_write); + +static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = cpu_buffer->reader_page; + struct buffer_page *head = rb_set_head_page(cpu_buffer); + struct buffer_page *commit = cpu_buffer->commit_page; + + /* In case of error, head will be NULL */ + if (unlikely(!head)) + return 1; + + return reader->read == rb_page_commit(reader) && + (commit == reader || + (commit == head && + head->read == rb_page_commit(commit))); +} + +/** + * ring_buffer_record_disable - stop all writes into the buffer + * @buffer: The ring buffer to stop writes to. + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. 
+ */ +void ring_buffer_record_disable(struct ring_buffer *buffer) +{ + atomic_inc(&buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_disable); + +/** + * ring_buffer_record_enable - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * + * Note, multiple disables will need the same number of enables + * to truly enable the writing (much like preempt_disable). + */ +void ring_buffer_record_enable(struct ring_buffer *buffer) +{ + atomic_dec(&buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_enable); + +/** + * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer + * @buffer: The ring buffer to stop writes to. + * @cpu: The CPU buffer to stop + * + * This prevents all writes to the buffer. Any attempt to write + * to the buffer after this will fail and return NULL. + * + * The caller should call synchronize_sched() after this. + */ +void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_inc(&cpu_buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); + +/** + * ring_buffer_record_enable_cpu - enable writes to the buffer + * @buffer: The ring buffer to enable writes + * @cpu: The CPU to enable. + * + * Note, multiple disables will need the same number of enables + * to truly enable the writing (much like preempt_disable). 
+ */ +void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return; + + cpu_buffer = buffer->buffers[cpu]; + atomic_dec(&cpu_buffer->record_disabled); +} +EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); + +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. + */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + +/** + * ring_buffer_entries_cpu - get the number of entries in a cpu buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the entries from. + */ +unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + + return rb_num_of_entries(cpu_buffer); +} +EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); + +/** + * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->overrun); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct 
ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = local_read(&cpu_buffer->commit_overrun); + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + +/** + * ring_buffer_entries - get the number of entries in a buffer + * @buffer: The ring buffer + * + * Returns the total number of entries in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_entries(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long entries = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + entries += rb_num_of_entries(cpu_buffer); + } + + return entries; +} +EXPORT_SYMBOL_GPL(ring_buffer_entries); + +/** + * ring_buffer_overruns - get the number of overruns in buffer + * @buffer: The ring buffer + * + * Returns the total number of overruns in the ring buffer + * (all CPU entries) + */ +unsigned long ring_buffer_overruns(struct ring_buffer *buffer) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long overruns = 0; + int cpu; + + /* if you care about this being correct, lock the buffer */ + for_each_buffer_cpu(buffer, cpu) { + cpu_buffer = buffer->buffers[cpu]; + overruns += local_read(&cpu_buffer->overrun); + } + + return overruns; +} +EXPORT_SYMBOL_GPL(ring_buffer_overruns); + +static void rb_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; + + /* Iterator usage is expected to have record disabled */ + if (list_empty(&cpu_buffer->reader_page->list)) { + iter->head_page = rb_set_head_page(cpu_buffer); + if (unlikely(!iter->head_page)) + return; + iter->head = iter->head_page->read; + } else { + iter->head_page = cpu_buffer->reader_page; + iter->head = cpu_buffer->reader_page->read; + } + if (iter->head) + 
iter->read_stamp = cpu_buffer->read_stamp; + else + iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; +} + +/** + * ring_buffer_iter_reset - reset an iterator + * @iter: The iterator to reset + * + * Resets the iterator, so that it will start from the beginning + * again. + */ +void ring_buffer_iter_reset(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long flags; + + if (!iter) + return; + + cpu_buffer = iter->cpu_buffer; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + rb_iter_reset(iter); + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); + +/** + * ring_buffer_iter_empty - check if an iterator has no more to read + * @iter: The iterator to check + */ +int ring_buffer_iter_empty(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + + cpu_buffer = iter->cpu_buffer; + + return iter->head_page == cpu_buffer->commit_page && + iter->head == rb_commit_index(cpu_buffer); +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); + +static void +rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += event->time_delta; + cpu_buffer->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + cpu_buffer->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static void +rb_update_iter_read_stamp(struct ring_buffer_iter *iter, + struct ring_buffer_event *event) +{ + u64 delta; + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + return; + + case RINGBUF_TYPE_TIME_EXTEND: + delta = event->array[0]; + delta <<= TS_SHIFT; + delta += 
event->time_delta; + iter->read_stamp += delta; + return; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + return; + + case RINGBUF_TYPE_DATA: + iter->read_stamp += event->time_delta; + return; + + default: + BUG(); + } + return; +} + +static struct buffer_page * +rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct buffer_page *reader = NULL; + unsigned long overwrite; + unsigned long flags; + int nr_loops = 0; + int ret; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + again: + /* + * This should normally only loop twice. But because the + * start of the reader inserts an empty page, it causes + * a case where we will loop three times. There should be no + * reason to loop four times (that I know of). + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3)) { + reader = NULL; + goto out; + } + + reader = cpu_buffer->reader_page; + + /* If there's more to read, return this page */ + if (cpu_buffer->reader_page->read < rb_page_size(reader)) + goto out; + + /* Never should we have an index greater than the size */ + if (RB_WARN_ON(cpu_buffer, + cpu_buffer->reader_page->read > rb_page_size(reader))) + goto out; + + /* check if we caught up to the tail */ + reader = NULL; + if (cpu_buffer->commit_page == cpu_buffer->reader_page) + goto out; + + /* + * Reset the reader page to size zero. + */ + local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); + local_set(&cpu_buffer->reader_page->page->commit, 0); + cpu_buffer->reader_page->real_end = 0; + + spin: + /* + * Splice the empty reader page into the list around the head. + */ + reader = rb_set_head_page(cpu_buffer); + cpu_buffer->reader_page->list.next = rb_list_head(reader->list.next); + cpu_buffer->reader_page->list.prev = reader->list.prev; + + /* + * cpu_buffer->pages just needs to point to the buffer, it + * has no specific buffer page to point to. Lets move it out + * of our way so we don't accidentally swap it. 
+ */ + cpu_buffer->pages = reader->list.prev; + + /* The reader page will be pointing to the new head */ + rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); + + /* + * We want to make sure we read the overruns after we set up our + * pointers to the next object. The writer side does a + * cmpxchg to cross pages which acts as the mb on the writer + * side. Note, the reader will constantly fail the swap + * while the writer is updating the pointers, so this + * guarantees that the overwrite recorded here is the one we + * want to compare with the last_overrun. + */ + smp_mb(); + overwrite = local_read(&(cpu_buffer->overrun)); + + /* + * Here's the tricky part. + * + * We need to move the pointer past the header page. + * But we can only do that if a writer is not currently + * moving it. The page before the header page has the + * flag bit '1' set if it is pointing to the page we want. + * but if the writer is in the process of moving it + * than it will be '2' or already moved '0'. + */ + + ret = rb_head_page_replace(reader, cpu_buffer->reader_page); + + /* + * If we did not convert it, then we must try again. + */ + if (!ret) + goto spin; + + /* + * Yeah! We succeeded in replacing the page. + * + * Now make the new head point back to the reader page. 
+ */ + rb_list_head(reader->list.next)->prev = &cpu_buffer->reader_page->list; + rb_inc_page(cpu_buffer, &cpu_buffer->head_page); + + /* Finally update the reader page to the new head */ + cpu_buffer->reader_page = reader; + rb_reset_reader_page(cpu_buffer); + + if (overwrite != cpu_buffer->last_overrun) { + cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; + cpu_buffer->last_overrun = overwrite; + } + + goto again; + + out: + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + return reader; +} + +static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + unsigned length; + + reader = rb_get_reader_page(cpu_buffer); + + /* This function should not be called when buffer is empty */ + if (RB_WARN_ON(cpu_buffer, !reader)) + return; + + event = rb_reader_event(cpu_buffer); + + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + cpu_buffer->read++; + + rb_update_read_stamp(cpu_buffer, event); + + length = rb_event_length(event); + cpu_buffer->reader_page->read += length; +} + +static void rb_advance_iter(struct ring_buffer_iter *iter) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + unsigned length; + + cpu_buffer = iter->cpu_buffer; + + /* + * Check if we are at the end of the buffer. + */ + if (iter->head >= rb_page_size(iter->head_page)) { + /* discarded commits can make the page empty */ + if (iter->head_page == cpu_buffer->commit_page) + return; + rb_inc_iter(iter); + return; + } + + event = rb_iter_head_event(iter); + + length = rb_event_length(event); + + /* + * This should not be called to advance the header if we are + * at the tail of the buffer. 
+ */ + if (RB_WARN_ON(cpu_buffer, + (iter->head_page == cpu_buffer->commit_page) && + (iter->head + length > rb_commit_index(cpu_buffer)))) + return; + + rb_update_iter_read_stamp(iter, event); + + iter->head += length; + + /* check for end of page padding */ + if ((iter->head >= rb_page_size(iter->head_page)) && + (iter->head_page != cpu_buffer->commit_page)) + rb_advance_iter(iter); +} + +static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) +{ + return cpu_buffer->lost_events; +} + +static struct ring_buffer_event * +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, + unsigned long *lost_events) +{ + struct ring_buffer_event *event; + struct buffer_page *reader; + int nr_loops = 0; + + again: + /* + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). + */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + return NULL; + + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + return NULL; + + event = rb_reader_event(cpu_buffer); + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) + RB_WARN_ON(cpu_buffer, 1); + /* + * Because the writer could be discarding every + * event it creates (which would probably be bad) + * if we were to go back to "again" then we may never + * catch up, and will trigger the warn on, or lock + * the box. Return the padding, and we will release + * the current locks, and try again. 
+ */ + return event; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_reader(cpu_buffer); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = cpu_buffer->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(cpu_buffer->buffer, + cpu_buffer->cpu, ts); + } + if (lost_events) + *lost_events = rb_lost_events(cpu_buffer); + return event; + + default: + BUG(); + } + + return NULL; +} +EXPORT_SYMBOL_GPL(ring_buffer_peek); + +static struct ring_buffer_event * +rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) +{ + struct ring_buffer *buffer; + struct ring_buffer_per_cpu *cpu_buffer; + struct ring_buffer_event *event; + int nr_loops = 0; + + cpu_buffer = iter->cpu_buffer; + buffer = cpu_buffer->buffer; + + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + + /* + * We repeat when a time extend is encountered. + * Since the time extend is always attached to a data event, + * we should never loop more than once. + * (We never hit the following condition more than twice). 
+ */ + if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) + return NULL; + + if (rb_per_cpu_empty(cpu_buffer)) + return NULL; + + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + + event = rb_iter_head_event(iter); + + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + if (rb_null_event(event)) { + rb_inc_iter(iter); + goto again; + } + rb_advance_iter(iter); + return event; + + case RINGBUF_TYPE_TIME_EXTEND: + /* Internal data, OK to advance */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_TIME_STAMP: + /* FIXME: not implemented */ + rb_advance_iter(iter); + goto again; + + case RINGBUF_TYPE_DATA: + if (ts) { + *ts = iter->read_stamp + event->time_delta; + ring_buffer_normalize_time_stamp(buffer, + cpu_buffer->cpu, ts); + } + return event; + + default: + BUG(); + } + + return NULL; +} +EXPORT_SYMBOL_GPL(ring_buffer_iter_peek); + +static inline int rb_ok_to_lock(void) +{ + /* + * If an NMI die dumps out the content of the ring buffer + * do not grab locks. We also permanently disable the ring + * buffer too. A one time deal is all you get from reading + * the ring buffer from an NMI. + */ + if (likely(!in_nmi())) + return 1; + + tracing_off_permanent(); + return 0; +} + +/** + * ring_buffer_peek - peek at the next event to be read + * @buffer: The ring buffer to read + * @cpu: The cpu to peak at + * @ts: The timestamp counter of this event. + * @lost_events: a variable to store if events were lost (may be NULL) + * + * This will return the event that will be read next, but does + * not consume the data. 
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
+		 unsigned long *lost_events)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+	struct ring_buffer_event *event;
+	unsigned long flags;
+	int dolock;
+	/*
+	 * cpu_buffer was only loaded above; it is not dereferenced unless
+	 * @cpu passes the cpumask test below.
+	 */
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return NULL;
+	/*
+	 * rb_ok_to_lock() returns 0 when running in NMI context (and turns
+	 * tracing off permanently); reader_lock is then left alone.
+	 */
+	dolock = rb_ok_to_lock();
+ again:
+	local_irq_save(flags);
+	if (dolock)
+		spin_lock(&cpu_buffer->reader_lock);
+	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+	if (event && event->type_len == RINGBUF_TYPE_PADDING)
+		rb_advance_reader(cpu_buffer);	/* step past the padding */
+	if (dolock)
+		spin_unlock(&cpu_buffer->reader_lock);
+	local_irq_restore(flags);
+	/*
+	 * rb_buffer_peek() hands padding back instead of looping on it, so
+	 * the retry happens here, after the lock and irqs were released.
+	 */
+	if (event && event->type_len == RINGBUF_TYPE_PADDING)
+		goto again;
+
+	return event;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ *
+ * Returns NULL when rb_iter_peek() has nothing to hand out. Padding
+ * events are never returned: rb_iter_peek() is called again, with
+ * reader_lock dropped in between, until it yields something else.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	struct ring_buffer_event *event;
+	unsigned long flags;
+
+ again:
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	event = rb_iter_peek(iter, ts);
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+	/* rb_iter_peek() already moved the iterator past a padding event */
+	if (event && event->type_len == RINGBUF_TYPE_PADDING)
+		goto again;
+
+	return event;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ * @cpu: the cpu to read the buffer from
+ * @ts: a variable to store the timestamp (may be NULL)
+ * @lost_events: a variable to store if events were lost (may be NULL)
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
+		    unsigned long *lost_events)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event = NULL;
+	unsigned long flags;
+	int dolock;
+	/* rb_ok_to_lock() is 0 in NMI context; reader_lock is skipped then */
+	dolock = rb_ok_to_lock();
+
+ again:
+	/* might be called in atomic */
+	preempt_disable();
+	/* a cpu this buffer does not track: leave without touching any cpu buffer */
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		goto out;
+
+	cpu_buffer = buffer->buffers[cpu];
+	local_irq_save(flags);
+	if (dolock)
+		spin_lock(&cpu_buffer->reader_lock);
+	/* peek first; the reader is only advanced when an event was found */
+	event = rb_buffer_peek(cpu_buffer, ts, lost_events);
+	if (event) {
+		cpu_buffer->lost_events = 0;	/* reset the pending lost count */
+		rb_advance_reader(cpu_buffer);
+	}
+
+	if (dolock)
+		spin_unlock(&cpu_buffer->reader_lock);
+	local_irq_restore(flags);
+
+ out:
+	preempt_enable();
+	/*
+	 * A padding event was consumed above but is not handed to the
+	 * caller; go around again for the next event.
+	 */
+	if (event && event->type_len == RINGBUF_TYPE_PADDING)
+		goto again;
+
+	return event;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_consume);
+
+/**
+ * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This performs the initial preparations necessary to iterate
+ * through the buffer. Memory is allocated, buffer recording
+ * is disabled, and the iterator pointer is returned to the caller.
+ *
+ * Disabling buffer recordng prevents the reading from being
+ * corrupted. This is not a consuming read, so a producer is not
+ * expected.
+ *
+ * After a sequence of ring_buffer_read_prepare calls, the user is
+ * expected to make at least one call to ring_buffer_prepare_sync.
+ * Afterwards, ring_buffer_read_start is invoked to get things going
+ * for real.
+ *
+ * This overall must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_iter *iter;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return NULL;
+
+	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return NULL;
+
+	cpu_buffer = buffer->buffers[cpu];
+	/*
+	 * Only the cpu_buffer member is filled in here;
+	 * ring_buffer_read_start() runs rb_iter_reset() on the iterator later.
+	 */
+	iter->cpu_buffer = cpu_buffer;
+	/* paired with the atomic_dec() in ring_buffer_read_finish() */
+	atomic_inc(&cpu_buffer->record_disabled);
+
+	return iter;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
+
+/**
+ * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
+ *
+ * All previously invoked ring_buffer_read_prepare calls to prepare
+ * iterators will be synchronized. Afterwards, ring_buffer_read_start
+ * calls on those iterators are allowed.
+ *
+ * The synchronization is a single synchronize_sched() call.
+ */
+void
+ring_buffer_read_prepare_sync(void)
+{
+	synchronize_sched();
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @iter: The iterator returned by ring_buffer_read_prepare
+ *
+ * This finalizes the startup of an iteration through the buffer.
+ * The iterator comes from a call to ring_buffer_read_prepare and
+ * an intervening ring_buffer_read_prepare_sync must have been
+ * performed.
+ *
+ * A NULL @iter (as returned by a failed ring_buffer_read_prepare)
+ * is silently ignored.
+ *
+ * Must be paired with ring_buffer_read_finish.
+ */
+void
+ring_buffer_read_start(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+
+	if (!iter)
+		return;
+
+	cpu_buffer = iter->cpu_buffer;
+	/* rb_iter_reset() runs with reader_lock and cpu_buffer->lock both held */
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+	arch_spin_lock(&cpu_buffer->lock);
+	rb_iter_reset(iter);
+	arch_spin_unlock(&cpu_buffer->lock);
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_start);
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	/*
+	 * @iter was dereferenced above without a NULL check, so callers must
+	 * pass a valid iterator. The atomic_dec() balances the atomic_inc()
+	 * done in ring_buffer_read_prepare().
+	 */
+	atomic_dec(&cpu_buffer->record_disabled);
+	kfree(iter);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_finish);
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ *
+ * Returns the event, or NULL when rb_iter_peek() has nothing to hand out.
+ * Padding events are skipped; reader_lock stays held for the whole call.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+	struct ring_buffer_event *event;
+	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+ again:
+	event = rb_iter_peek(iter, ts);
+	if (!event)
+		goto out;
+	/* padding is never returned; peek again without dropping the lock */
+	if (event->type_len == RINGBUF_TYPE_PADDING)
+		goto again;
+
+	rb_advance_iter(iter);	/* move the iterator past the returned event */
+ out:
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	return event;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read);
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+	return BUF_PAGE_SIZE * buffer->pages;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_size);
+/*
+ * rb_reset_cpu - put one per-cpu buffer back into its empty state
+ * @cpu_buffer: The per-cpu buffer to reset
+ *
+ * The head page is deactivated, head, tail and commit are all pointed at
+ * the page found through cpu_buffer->pages, the reader page is turned into
+ * a stand-alone list, and every counter and time stamp is zeroed before the
+ * head page is activated again.
+ *
+ * The caller in this file, ring_buffer_reset_cpu(), holds both reader_lock
+ * and cpu_buffer->lock around the call.
+ */
+static void
+rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+	rb_head_page_deactivate(cpu_buffer);
+
+	cpu_buffer->head_page
+		= list_entry(cpu_buffer->pages, struct buffer_page, list);
+	local_set(&cpu_buffer->head_page->write, 0);
+	local_set(&cpu_buffer->head_page->entries, 0);
+	local_set(&cpu_buffer->head_page->page->commit, 0);
+
+	cpu_buffer->head_page->read = 0;
+	/* writer and commit positions restart on the head page */
+	cpu_buffer->tail_page = cpu_buffer->head_page;
+	cpu_buffer->commit_page = cpu_buffer->head_page;
+	/* the reader page becomes an empty list of its own */
+	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
+	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->entries, 0);
+	local_set(&cpu_buffer->reader_page->page->commit, 0);
+	cpu_buffer->reader_page->read = 0;
+	/* per-cpu statistics */
+	local_set(&cpu_buffer->commit_overrun, 0);
+	local_set(&cpu_buffer->overrun, 0);
+	local_set(&cpu_buffer->entries, 0);
+	local_set(&cpu_buffer->committing, 0);
+	local_set(&cpu_buffer->commits, 0);
+	cpu_buffer->read = 0;
+
+	cpu_buffer->write_stamp = 0;
+	cpu_buffer->read_stamp = 0;
+
+	cpu_buffer->lost_events = 0;
+	cpu_buffer->last_overrun = 0;
+
+	rb_head_page_activate(cpu_buffer);
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ *
+ * Nothing is done when @cpu is not in the cpumask of @buffer.
+ * record_disabled is raised on the cpu buffer for the duration of
+ * the reset. If a commit is still in flight (cpu_buffer->committing
+ * is non-zero) a warning is raised and the buffer is left untouched.
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+	unsigned long flags;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return;
+
+	atomic_inc(&cpu_buffer->record_disabled);
+
+	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
+
+	if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
+		goto out;
+	/* reader_lock is already held; take cpu_buffer->lock inside it */
+	arch_spin_lock(&cpu_buffer->lock);
+
+	rb_reset_cpu(cpu_buffer);
+
+	arch_spin_unlock(&cpu_buffer->lock);
+
+ out:
+	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	atomic_dec(&cpu_buffer->record_disabled);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
+
+/**
+ * ring_buffer_reset - reset a ring buffer
+ * @buffer: The ring buffer to reset all cpu buffers
+ *
+ * Calls ring_buffer_reset_cpu() for every cpu that
+ * for_each_buffer_cpu() visits.
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+	int cpu;
+
+	for_each_buffer_cpu(buffer, cpu)
+		ring_buffer_reset_cpu(buffer, cpu);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_reset);
+
+/**
+ * ring_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ *
+ * Returns 1 when rb_per_cpu_empty() holds for every per-cpu buffer,
+ * 0 as soon as one of them is found not to be empty.
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+	int dolock;
+	int cpu;
+	int ret;
+	/* rb_ok_to_lock() is 0 in NMI context; reader_lock is skipped then */
+	dolock = rb_ok_to_lock();
+
+	/* yes this is racy, but if you don't like the race, lock the buffer */
+	for_each_buffer_cpu(buffer, cpu) {
+		cpu_buffer = buffer->buffers[cpu];
+		local_irq_save(flags);
+		if (dolock)
+			spin_lock(&cpu_buffer->reader_lock);
+		ret = rb_per_cpu_empty(cpu_buffer);
+		if (dolock)
+			spin_unlock(&cpu_buffer->reader_lock);
+		local_irq_restore(flags);
+		/* one non-empty per-cpu buffer settles the answer */
+		if (!ret)
+			return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_empty);
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long flags;
+	int dolock;
+	int ret;
+	/* a cpu that is not in the cpumask of @buffer is reported as empty */
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 1;
+	/* rb_ok_to_lock() is 0 in NMI context; reader_lock is skipped then */
+	dolock = rb_ok_to_lock();
+
+	cpu_buffer = buffer->buffers[cpu];
+	local_irq_save(flags);
+	if (dolock)
+		spin_lock(&cpu_buffer->reader_lock);
+	ret = rb_per_cpu_empty(cpu_buffer);
+	if (dolock)
+		spin_unlock(&cpu_buffer->reader_lock);
+	local_irq_restore(flags);
+	/* the result of rb_per_cpu_empty() is passed through unchanged */
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+			 struct ring_buffer *buffer_b, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer_a;
+	struct ring_buffer_per_cpu *cpu_buffer_b;
+	int ret = -EINVAL;
+	/*
+	 * Return values:
+	 *   0        the per-cpu buffers of @cpu were exchanged
+	 *   -EINVAL  @cpu is missing from one of the cpumasks, or the two
+	 *            buffers differ in their pages count
+	 *   -EAGAIN  recording is off globally (ring_buffer_flags), on one
+	 *            of the buffers, or on one of the per-cpu buffers
+	 *   -EBUSY   one of the per-cpu buffers has a commit in progress
+	 */
+	if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
+	    !cpumask_test_cpu(cpu, buffer_b->cpumask))
+		goto out;
+
+	/* At least make sure the two buffers are somewhat the same */
+	if (buffer_a->pages != buffer_b->pages)
+		goto out;
+
+	ret = -EAGAIN;
+
+	if (ring_buffer_flags != RB_BUFFERS_ON)
+		goto out;
+
+	if (atomic_read(&buffer_a->record_disabled))
+		goto out;
+
+	if (atomic_read(&buffer_b->record_disabled))
+		goto out;
+
+	cpu_buffer_a = buffer_a->buffers[cpu];
+	cpu_buffer_b = buffer_b->buffers[cpu];
+
+	if (atomic_read(&cpu_buffer_a->record_disabled))
+		goto out;
+
+	if (atomic_read(&cpu_buffer_b->record_disabled))
+		goto out;
+
+	/*
+	 * We can't do a synchronize_sched here because this
+	 * function can be called in atomic context.
+	 * Normally this will be called from the same CPU as cpu.
+	 * If not it's up to the caller to protect this.
+	 */
+	atomic_inc(&cpu_buffer_a->record_disabled);
+	atomic_inc(&cpu_buffer_b->record_disabled);
+
+	ret = -EBUSY;
+	if (local_read(&cpu_buffer_a->committing))
+		goto out_dec;
+	if (local_read(&cpu_buffer_b->committing))
+		goto out_dec;
+	/* exchange the per-cpu slots of the two ring buffers */
+	buffer_a->buffers[cpu] = cpu_buffer_b;
+	buffer_b->buffers[cpu] = cpu_buffer_a;
+	/* and point each per-cpu buffer back at its new owner */
+	cpu_buffer_b->buffer = buffer_a;
+	cpu_buffer_a->buffer = buffer_b;
+
+	ret = 0;
+
+out_dec:
+	atomic_dec(&cpu_buffer_a->record_disabled);
+	atomic_dec(&cpu_buffer_b->record_disabled);
+out:
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
+#endif /* CONFIG_RING_BUFFER_ALLOW_SWAP */
+
+/**
+ * ring_buffer_alloc_read_page - allocate a page to read from buffer
+ * @buffer: the buffer to allocate for.
+ *
+ * This function is used in conjunction with ring_buffer_read_page.
+ * When reading a full page from the ring buffer, these functions
+ * can be used to speed up the process.
The calling function should
+ * allocate a few pages first with this function. Then when it
+ * needs to get pages from the ring buffer, it passes the result
+ * of this function into ring_buffer_read_page, which will swap
+ * the page that was allocated, with the read page of the buffer.
+ *
+ * Returns:
+ *  The page allocated, or NULL on error.
+ */
+void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
+{
+	struct buffer_data_page *bpage;
+	unsigned long addr;
+	/* @buffer is not referenced; one page is requested with GFP_KERNEL */
+	addr = __get_free_page(GFP_KERNEL);
+	if (!addr)
+		return NULL;
+
+	bpage = (void *)addr;
+	/* the page is run through rb_init_page() before it is returned */
+	rb_init_page(bpage);
+
+	return bpage;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
+
+/**
+ * ring_buffer_free_read_page - free an allocated read page
+ * @buffer: the buffer the page was allocated for
+ * @data: the page to free
+ *
+ * Free a page allocated from ring_buffer_alloc_read_page.
+ * @buffer is not referenced; @data is handed straight to free_page().
+ */
+void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
+{
+	free_page((unsigned long)data);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
+
+/**
+ * ring_buffer_read_page - extract a page from the ring buffer
+ * @buffer: buffer to extract from
+ * @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @len: amount to extract
+ * @cpu: the cpu of the buffer to extract
+ * @full: should the extraction only happen when the page is full.
+ *
+ * This function will pull out a page from the ring buffer and consume it.
+ * @data_page must be the address of the variable that was returned
+ * from ring_buffer_alloc_read_page. This is because the page might be used
+ * to swap with a page in the ring buffer.
+ *
+ * for example:
+ *	rpage = ring_buffer_alloc_read_page(buffer);
+ *	if (!rpage)
+ *		return error;
+ *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ *	if (ret >= 0)
+ *		process_page(rpage, ret);
+ *
+ * When @full is set, the function will not return true unless
+ * the writer is off the reader page.
+ * + * Note: it is up to the calling functions to handle sleeps and wakeups. + * The ring buffer can be used anywhere in the kernel and can not + * blindly call wake_up. The layer that uses the ring buffer must be + * responsible for that. + * + * Returns: + * >=0 if data has been transferred, returns the offset of consumed data. + * <0 if no data has been transferred. + */ +int ring_buffer_read_page(struct ring_buffer *buffer, + void **data_page, size_t len, int cpu, int full) +{ + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + struct buffer_page *reader; + unsigned long missed_events; + unsigned long flags; + unsigned int commit; + unsigned int read; + u64 save_timestamp; + int ret = -1; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + goto out; + + /* + * If len is not big enough to hold the page header, then + * we can not copy anything. + */ + if (len <= BUF_PAGE_HDR_SIZE) + goto out; + + len -= BUF_PAGE_HDR_SIZE; + + if (!data_page) + goto out; + + bpage = *data_page; + if (!bpage) + goto out; + + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); + + reader = rb_get_reader_page(cpu_buffer); + if (!reader) + goto out_unlock; + + event = rb_reader_event(cpu_buffer); + + read = reader->read; + commit = rb_page_commit(reader); + + /* Check if any events were dropped */ + missed_events = cpu_buffer->lost_events; + + /* + * If this page has been partially read or + * if len is not big enough to read the rest of the page or + * a writer is still on the page, then + * we must copy the data from the page to the buffer. + * Otherwise, we can simply swap the page with the one passed in. 
+ */ + if (read || (len < (commit - read)) || + cpu_buffer->reader_page == cpu_buffer->commit_page) { + struct buffer_data_page *rpage = cpu_buffer->reader_page->page; + unsigned int rpos = read; + unsigned int pos = 0; + unsigned int size; + + if (full) + goto out_unlock; + + if (len > (commit - read)) + len = (commit - read); + + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + + if (len < size) + goto out_unlock; + + /* save the current timestamp, since the user will need it */ + save_timestamp = cpu_buffer->read_stamp; + + /* Need to copy one event at a time */ + do { + /* We need the size of one event, because + * rb_advance_reader only advances by one event, + * whereas rb_event_ts_length may include the size of + * one or two events. + * We have already ensured there's enough space if this + * is a time extend. */ + size = rb_event_length(event); + memcpy(bpage->data + pos, rpage->data + rpos, size); + + len -= size; + + rb_advance_reader(cpu_buffer); + rpos = reader->read; + pos += size; + + if (rpos >= commit) + break; + + event = rb_reader_event(cpu_buffer); + /* Always keep the time extend and data together */ + size = rb_event_ts_length(event); + } while (len >= size); + + /* update bpage */ + local_set(&bpage->commit, pos); + bpage->time_stamp = save_timestamp; + + /* we copied everything to the beginning */ + read = 0; + } else { + /* update the entry counter */ + cpu_buffer->read += rb_page_entries(reader); + + /* swap the pages */ + rb_init_page(bpage); + bpage = reader->page; + reader->page = *data_page; + local_set(&reader->write, 0); + local_set(&reader->entries, 0); + reader->read = 0; + *data_page = bpage; + + /* + * Use the real_end for the data size, + * This gives us a chance to store the lost events + * on the page. 
+ */ + if (reader->real_end) + local_set(&bpage->commit, reader->real_end); + } + ret = read; + + cpu_buffer->lost_events = 0; + + commit = local_read(&bpage->commit); + /* + * Set a flag in the commit field if we lost events + */ + if (missed_events) { + /* If there is room at the end of the page to save the + * missed events, then record it there. + */ + if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) { + memcpy(&bpage->data[commit], &missed_events, + sizeof(missed_events)); + local_add(RB_MISSED_STORED, &bpage->commit); + commit += sizeof(missed_events); + } + local_add(RB_MISSED_EVENTS, &bpage->commit); + } + + /* + * This page may be off to user land. Zero it out here. + */ + if (commit < BUF_PAGE_SIZE) + memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + + out_unlock: + spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + out: + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_read_page); + +#ifdef CONFIG_TRACING +static ssize_t +rb_simple_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = filp->private_data; + char buf[64]; + int r; + + if (test_bit(RB_BUFFERS_DISABLED_BIT, p)) + r = sprintf(buf, "permanently disabled\n"); + else + r = sprintf(buf, "%d\n", test_bit(RB_BUFFERS_ON_BIT, p)); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +rb_simple_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *p = filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + if (val) + set_bit(RB_BUFFERS_ON_BIT, p); + else + clear_bit(RB_BUFFERS_ON_BIT, p); + + (*ppos)++; + + return cnt; +} + +static const struct file_operations rb_simple_fops = { + .open = tracing_open_generic, + .read = rb_simple_read, + .write = rb_simple_write, 
+ .llseek = default_llseek, +}; + + +static __init int rb_init_debugfs(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + + trace_create_file("tracing_on", 0644, d_tracer, + &ring_buffer_flags, &rb_simple_fops); + + return 0; +} + +fs_initcall(rb_init_debugfs); +#endif + +#ifdef CONFIG_HOTPLUG_CPU +static int rb_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + struct ring_buffer *buffer = + container_of(self, struct ring_buffer, cpu_notify); + long cpu = (long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + if (cpumask_test_cpu(cpu, buffer->cpumask)) + return NOTIFY_OK; + + buffer->buffers[cpu] = + rb_allocate_cpu_buffer(buffer, cpu); + if (!buffer->buffers[cpu]) { + WARN(1, "failed to allocate ring buffer on CPU %ld\n", + cpu); + return NOTIFY_OK; + } + smp_wmb(); + cpumask_set_cpu(cpu, buffer->cpumask); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* + * Do nothing. + * If we were to free the buffer, then the user would + * lose any trace that was in the buffer. 
+ */ + break; + default: + break; + } + return NOTIFY_OK; +} +#endif diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 00000000..302f8a61 --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,488 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 10 +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int write_iteration = 50; +module_param(write_iteration, uint, 0644); +MODULE_PARM_DESC(write_iteration, "# of writes between timestamp readings"); + +static int producer_nice = 19; +static int consumer_nice = 19; + +static int producer_fifo = -1; +static int consumer_fifo = -1; + +module_param(producer_nice, uint, 0644); +MODULE_PARM_DESC(producer_nice, "nice prio for producer"); + +module_param(consumer_nice, uint, 0644); +MODULE_PARM_DESC(consumer_nice, "nice prio for consumer"); + +module_param(producer_fifo, uint, 0644); +MODULE_PARM_DESC(producer_fifo, "fifo prio for producer"); + +module_param(consumer_fifo, uint, 0644); +MODULE_PARM_DESC(consumer_fifo, "fifo prio for consumer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST() \ + do { \ + if (!kill_test) { \ + kill_test = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + EVENT_DROPPED, +}; + +static enum 
event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts, NULL); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer); + if (!bpage) + return EVENT_DROPPED; + + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + /* The commit may have missed event flags set, clear them */ + commit = local_read(&rpage->commit) & 0xfffff; + for (i = 0; i < commit && !kill_test; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + KILL_TEST(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* failed writes may be discarded events */ + if (!event->time_delta) + KILL_TEST(); + inc = event->array[0] + 4; + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + if (!event->array[0]) { + KILL_TEST(); + break; + } + inc = event->array[0] + 4; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + inc = ((event->type_len + 1) * 4); + } + if (kill_test) + break; + + if (inc <= 0) { + KILL_TEST(); + break; + } + } + } + ring_buffer_free_read_page(buffer, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + while (!reader_finish && !kill_test) { + int found; + + do { 
+ int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if (read_events) + stat = read_event(cpu); + else + stat = read_page(cpu); + + if (kill_test) + break; + if (stat == EVENT_FOUND) + found = 1; + } + } while (found && !kill_test); + + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + reader_finish = 0; + complete(&read_done); +} + +static void ring_buffer_producer(void) +{ + struct timeval start_tv; + struct timeval end_tv; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + trace_printk("Starting ring buffer hammer\n"); + do_gettimeofday(&start_tv); + do { + struct ring_buffer_event *event; + int *entry; + int i; + + for (i = 0; i < write_iteration; i++) { + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } + } + do_gettimeofday(&end_tv); + + cnt++; + if (consumer && !(cnt % wakeup_interval)) + wake_up_process(consumer); + +#ifndef CONFIG_PREEMPT + /* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call + * cond_resched and also add any time that was lost by a + * rescedule. + * + * Do a cond resched at the same frequency we would wake up + * the reader. 
+ */ + if (cnt % wakeup_interval) + cond_resched(); +#endif + + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + trace_printk("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + /* finish var visible before waking up the consumer */ + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = end_tv.tv_sec - start_tv.tv_sec; + time *= USEC_PER_SEC; + time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (kill_test) + trace_printk("ERROR!\n"); + + if (!disable_reader) { + if (consumer_fifo < 0) + trace_printk("Running Consumer at nice: %d\n", + consumer_nice); + else + trace_printk("Running Consumer at SCHED_FIFO %d\n", + consumer_fifo); + } + if (producer_fifo < 0) + trace_printk("Running Producer at nice: %d\n", + producer_nice); + else + trace_printk("Running Producer at SCHED_FIFO %d\n", + producer_fifo); + + /* Let the user know that the test is running at low priority */ + if (producer_fifo < 0 && consumer_fifo < 0 && + producer_nice == 19 && consumer_nice == 19) + trace_printk("WARNING!!! This test is running at lowest priority.\n"); + + trace_printk("Time: %lld (usecs)\n", time); + trace_printk("Overruns: %lld\n", overruns); + if (disable_reader) + trace_printk("Read: (reader disabled)\n"); + else + trace_printk("Read: %ld (by %s)\n", read, + read_events ? 
"events" : "pages"); + trace_printk("Entries: %lld\n", entries); + trace_printk("Total: %lld\n", entries + overruns + read); + trace_printk("Missed: %ld\n", missed); + trace_printk("Hit: %ld\n", hit); + + /* Convert time from usecs to millisecs */ + do_div(time, USEC_PER_MSEC); + if (time) + hit /= (long)time; + else + trace_printk("TIME IS ZERO??\n"); + + trace_printk("Entries per millisec: %ld\n", hit); + + if (hit) { + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / hit; + trace_printk("%ld ns per entry\n", avg); + } + + if (missed) { + if (time) + missed /= (long)time; + + trace_printk("Total iterations per millisec: %ld\n", + hit + missed); + + /* it is possible that hit + missed will overflow and be zero */ + if (!(hit + missed)) { + trace_printk("hit + missed overflowed and totalled zero!\n"); + hit--; /* make it non zero */ + } + + /* Caculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / (hit + missed); + trace_printk("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!kthread_should_stop() && !kill_test) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop() || kill_test) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + __set_current_state(TASK_RUNNING); + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + init_completion(&read_start); + + while (!kthread_should_stop() && !kill_test) { + ring_buffer_reset(buffer); + + if (consumer) { + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + + trace_printk("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); 
+		schedule_timeout(HZ * SLEEP_TIME);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+/*
+ * ring_buffer_benchmark_init - allocate the test buffer and start the threads
+ *
+ * Allocates the ring buffer, creates the consumer thread (unless the reader
+ * is disabled) and starts the producer thread.  Each thread is then given
+ * either the SCHED_FIFO priority configured for it or, when that value is
+ * negative, the nice level configured for it.
+ *
+ * Returns 0 on success, -ENOMEM if the buffer cannot be allocated, or the
+ * error carried by the pointer of the kthread that could not be created.
+ * On failure an already created consumer is stopped and the buffer is freed.
+ */
+static int __init ring_buffer_benchmark_init(void)
+{
+	int ret;
+
+	/* make a one meg buffer in overwrite mode */
+	buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+	if (!buffer)
+		return -ENOMEM;
+
+	if (!disable_reader) {
+		consumer = kthread_create(ring_buffer_consumer_thread,
+					  NULL, "rb_consumer");
+		ret = PTR_ERR(consumer);
+		if (IS_ERR(consumer))
+			goto out_fail;
+	}
+
+	producer = kthread_run(ring_buffer_producer_thread,
+			       NULL, "rb_producer");
+	ret = PTR_ERR(producer);
+
+	if (IS_ERR(producer))
+		goto out_kill;
+
+	/*
+	 * Run them as low-prio background tasks by default:
+	 */
+	if (!disable_reader) {
+		if (consumer_fifo >= 0) {
+			struct sched_param param = {
+				.sched_priority = consumer_fifo
+			};
+			sched_setscheduler(consumer, SCHED_FIFO, &param);
+		} else
+			set_user_nice(consumer, consumer_nice);
+	}
+
+	if (producer_fifo >= 0) {
+		/* the producer runs at its own priority, not the consumer's */
+		struct sched_param param = {
+			.sched_priority = producer_fifo
+		};
+		sched_setscheduler(producer, SCHED_FIFO, &param);
+	} else
+		set_user_nice(producer, producer_nice);
+
+	return 0;
+
+ out_kill:
+	if (consumer)
+		kthread_stop(consumer);
+
+ out_fail:
+	ring_buffer_free(buffer);
+	return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+	kthread_stop(producer);
+	if (consumer)
+		kthread_stop(consumer);
+	ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 00000000..0731e81a
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,4669 @@
+/*
+ * ring buffer based function tracer
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt
+ * Copyright (C) 2008 Ingo Molnar
+ *
+ * Originally taken from the RT patch by:
+ *    Arnaldo Carvalho de Melo
+ * + * Based on code from the latency_tracer, that is: + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +/* + * On boot up, the ring buffer is set to the minimum size, so that + * we do not waste memory on systems that are not using tracing. + */ +int ring_buffer_expanded; + +/* + * We need to change this state when a selftest is running. + * A selftest will lurk into the ring-buffer to count the + * entries inserted during the selftest although some concurrent + * insertions into the ring-buffer such as trace_printk could occurred + * at the same time, giving false positive or negative results. + */ +static bool __read_mostly tracing_selftest_running; + +/* + * If a tracer is running, we do not want to run SELFTEST. + */ +bool __read_mostly tracing_selftest_disabled; + +/* For tracers that don't implement custom flags */ +static struct tracer_opt dummy_tracer_opt[] = { + { } +}; + +static struct tracer_flags dummy_tracer_flags = { + .val = 0, + .opts = dummy_tracer_opt +}; + +static int dummy_set_flag(u32 old_flags, u32 bit, int set) +{ + return 0; +} + +/* + * Kill all tracing for good (never come back). + * It is initialized to 1 but will turn to zero if the initialization + * of the tracer is successful. But that is the only place that sets + * this back to zero. 
+ */
+static int tracing_disabled = 1;
+
+DEFINE_PER_CPU(int, ftrace_cpu_disabled);
+
+/* Disable preemption and bump this CPU's ftrace_cpu_disabled counter. */
+static inline void ftrace_disable_cpu(void)
+{
+	preempt_disable();
+	__this_cpu_inc(ftrace_cpu_disabled);
+}
+
+/* Undo ftrace_disable_cpu(): drop the counter, then re-enable preemption. */
+static inline void ftrace_enable_cpu(void)
+{
+	__this_cpu_dec(ftrace_cpu_disabled);
+	preempt_enable();
+}
+
+cpumask_var_t __read_mostly tracing_buffer_mask;
+
+/*
+ * ftrace_dump_on_oops - variable to dump ftrace buffer on oops
+ *
+ * If there is an oops (or kernel panic) and the ftrace_dump_on_oops
+ * is set, then ftrace_dump is called. This will output the contents
+ * of the ftrace buffers to the console.  This is very useful for
+ * capturing traces that lead to crashes and outputting it to a
+ * serial console.
+ *
+ * It is default off, but you can enable it with either specifying
+ * "ftrace_dump_on_oops" in the kernel command line, or setting
+ * /proc/sys/kernel/ftrace_dump_on_oops
+ * Set 1 if you want to dump buffers of all CPUs
+ * Set 2 if you want to dump the buffer of the CPU that triggered oops
+ */
+
+enum ftrace_dump_mode ftrace_dump_on_oops;
+
+static int tracing_set_tracer(const char *buf);
+
+#define MAX_TRACER_SIZE		100
+static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
+static char *default_bootup_tracer;
+
+/*
+ * set_cmdline_ftrace - handle the "ftrace=" boot parameter
+ * @str: tracer name given on the kernel command line
+ *
+ * Saves the name in bootup_tracer_buf and points default_bootup_tracer at
+ * it, and marks the ring buffer as expanded.  Always returns 1.
+ */
+static int __init set_cmdline_ftrace(char *str)
+{
+	/*
+	 * strlcpy() always NUL-terminates; strncpy() would leave the buffer
+	 * unterminated for a name of MAX_TRACER_SIZE characters or more.
+	 */
+	strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+	default_bootup_tracer = bootup_tracer_buf;
+	/* We are using ftrace early, expand it */
+	ring_buffer_expanded = 1;
+	return 1;
+}
+__setup("ftrace=", set_cmdline_ftrace);
+
+/*
+ * set_ftrace_dump_on_oops - handle the "ftrace_dump_on_oops" boot parameter
+ * @str: text following the parameter name
+ *
+ * Without "=value" (or with an empty value) all CPU buffers are dumped;
+ * "=orig_cpu" selects DUMP_ORIG.  Returns 1 when the parameter was
+ * accepted and 0 for any other value.
+ */
+static int __init set_ftrace_dump_on_oops(char *str)
+{
+	if (*str++ != '=' || !*str) {
+		ftrace_dump_on_oops = DUMP_ALL;
+		return 1;
+	}
+
+	if (!strcmp("orig_cpu", str)) {
+		ftrace_dump_on_oops = DUMP_ORIG;
+		return 1;
+	}
+
+	return 0;
+}
+__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
+
+/* Convert nanoseconds to microseconds, rounding to the nearest value. */
+unsigned long long ns2usecs(cycle_t nsec)
+{
+	nsec += 500;
+	do_div(nsec, 1000);
+	return nsec;
+}
+
+/*
+ * The global_trace is the descriptor that holds
the tracing + * buffers for the live tracing. For each CPU, it contains + * a link list of pages that will store trace entries. The + * page descriptor of the pages in the memory is used to hold + * the link list by linking the lru item in the page descriptor + * to each of the pages in the buffer per CPU. + * + * For each active CPU there is a data field that holds the + * pages for the buffer for that CPU. Each CPU has the same number + * of pages allocated for its buffer. + */ +static struct trace_array global_trace; + +static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); + +int filter_current_check_discard(struct ring_buffer *buffer, + struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + return filter_check_discard(call, rec, buffer, event); +} +EXPORT_SYMBOL_GPL(filter_current_check_discard); + +cycle_t ftrace_now(int cpu) +{ + u64 ts; + + /* Early boot up does not have a buffer yet */ + if (!global_trace.buffer) + return trace_clock_local(); + + ts = ring_buffer_time_stamp(global_trace.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + + return ts; +} + +/* + * The max_tr is used to snapshot the global_trace when a maximum + * latency is reached. Some tracers will use this to store a maximum + * trace while it continues examining live traces. + * + * The buffers for the max_tr are set up the same as the global_trace. + * When a snapshot is taken, the link list of the max_tr is swapped + * with the link list of the global_trace and the buffers are reset for + * the global_trace so the tracing can continue. + */ +static struct trace_array max_tr; + +static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); + +/* tracer_enabled is used to toggle activation of a tracer */ +static int tracer_enabled = 1; + +/** + * tracing_is_enabled - return tracer_enabled status + * + * This function is used by other tracers to know the status + * of the tracer_enabled flag. 
Tracers may use this function + * to know if they should enable their features when starting + * up. See irqsoff tracer for an example (start_irqsoff_tracer). + */ +int tracing_is_enabled(void) +{ + return tracer_enabled; +} + +/* + * trace_buf_size is the size in bytes that is allocated + * for a buffer. Note, the number of bytes is always rounded + * to page size. + * + * This number is purposely set to a low number of 16384. + * If the dump on oops happens, it will be much appreciated + * to not have to wait for all that output. Anyway this can be + * boot time and run time configurable. + */ +#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */ + +static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; + +/* trace_types holds a linked list of available tracers. */ +static struct tracer *trace_types __read_mostly; + +/* current_trace points to the tracer that is currently active */ +static struct tracer *current_trace __read_mostly; + +/* + * trace_types_lock is used to protect the trace_types list. + */ +static DEFINE_MUTEX(trace_types_lock); + +/* + * serialize the access of the ring buffer + * + * The ring buffer serializes readers, but that is only low level protection. + * The validity of the events (which are returned by ring_buffer_peek() etc.) + * is not protected by the ring buffer. + * + * The content of events may become garbage if we allow another process to consume + * these events concurrently: + * A) the page of the consumed events may become a normal page + * (not reader page) in ring buffer, and this page will be rewritten + * by the events producer. + * B) The page of the consumed events may become a page for splice_read, + * and this page will be returned to system. + * + * These primitives allow multiple processes to access different cpu ring buffers + * concurrently. + * + * These primitives don't distinguish read-only and read-consume access. + * Multiple read-only accesses are also serialized. 
+ */ + +#ifdef CONFIG_SMP +static DECLARE_RWSEM(all_cpu_access_lock); +static DEFINE_PER_CPU(struct mutex, cpu_access_lock); + +static inline void trace_access_lock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + /* gain it for accessing the whole ring buffer. */ + down_write(&all_cpu_access_lock); + } else { + /* gain it for accessing a cpu ring buffer. */ + + /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + down_read(&all_cpu_access_lock); + + /* Secondly block other access to this @cpu ring buffer. */ + mutex_lock(&per_cpu(cpu_access_lock, cpu)); + } +} + +static inline void trace_access_unlock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + up_write(&all_cpu_access_lock); + } else { + mutex_unlock(&per_cpu(cpu_access_lock, cpu)); + up_read(&all_cpu_access_lock); + } +} + +static inline void trace_access_lock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + mutex_init(&per_cpu(cpu_access_lock, cpu)); +} + +#else + +static DEFINE_MUTEX(access_lock); + +static inline void trace_access_lock(int cpu) +{ + (void)cpu; + mutex_lock(&access_lock); +} + +static inline void trace_access_unlock(int cpu) +{ + (void)cpu; + mutex_unlock(&access_lock); +} + +static inline void trace_access_lock_init(void) +{ +} + +#endif + +/* trace_wait is a waitqueue for tasks blocked on trace_poll */ +static DECLARE_WAIT_QUEUE_HEAD(trace_wait); + +/* trace_flags holds trace_options default values */ +unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; + +static int trace_stop_count; +static DEFINE_SPINLOCK(tracing_start_lock); + +/** + * trace_wake_up - wake up tasks waiting for trace input + * + * Simply wakes up any task that is blocked on the trace_wait + * queue. These is used with trace_poll for tasks polling the trace. 
+ */ +void trace_wake_up(void) +{ + int cpu; + + if (trace_flags & TRACE_ITER_BLOCK) + return; + /* + * The runqueue_is_locked() can fail, but this is the best we + * have for now: + */ + cpu = get_cpu(); + if (!runqueue_is_locked(cpu)) + wake_up(&trace_wait); + put_cpu(); +} + +static int __init set_buf_size(char *str) +{ + unsigned long buf_size; + + if (!str) + return 0; + buf_size = memparse(str, &str); + /* nr_entries can not be zero */ + if (buf_size == 0) + return 0; + trace_buf_size = buf_size; + return 1; +} +__setup("trace_buf_size=", set_buf_size); + +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshhold; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &threshhold); + if (ret < 0) + return 0; + tracing_thresh = threshhold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + +unsigned long nsecs_to_usecs(unsigned long nsecs) +{ + return nsecs / 1000; +} + +/* These must match the bit postions in trace_iterator_flags */ +static const char *trace_options[] = { + "print-parent", + "sym-offset", + "sym-addr", + "verbose", + "raw", + "hex", + "bin", + "block", + "stacktrace", + "trace_printk", + "ftrace_preempt", + "branch", + "annotate", + "userstacktrace", + "sym-userobj", + "printk-msg-only", + "context-info", + "latency-format", + "sleep-time", + "graph-time", + "record-cmd", + "overwrite", + NULL +}; + +static struct { + u64 (*func)(void); + const char *name; +} trace_clocks[] = { + { trace_clock_local, "local" }, + { trace_clock_global, "global" }, +}; + +int trace_clock_id; + +/* + * trace_parser_get_init - gets the buffer for trace parser + */ +int trace_parser_get_init(struct trace_parser *parser, int size) +{ + memset(parser, 0, sizeof(*parser)); + + parser->buffer = kmalloc(size, GFP_KERNEL); + if (!parser->buffer) + return 1; + + parser->size = size; + return 0; +} + +/* + * trace_parser_put - frees the buffer for trace parser + */ +void trace_parser_put(struct trace_parser *parser) 
+{
+	kfree(parser->buffer);
+}
+
+/*
+ * trace_get_user - reads the user input string separated by space
+ * (matched by isspace(ch))
+ *
+ * For each string found the 'struct trace_parser' is updated,
+ * and the function returns.
+ *
+ * A word that is cut off by the end of the write is kept in the parser
+ * (parser->cont is set) and continued by the next call.  One byte of
+ * parser->buffer is always kept free for the terminating NUL, so a word
+ * longer than parser->size - 1 characters is rejected.
+ *
+ * Returns number of bytes read, the error returned by get_user(), or
+ * -EINVAL if the word does not fit into the parser buffer.
+ *
+ * See kernel/trace/trace.h for 'struct trace_parser' details.
+ */
+int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
+	size_t cnt, loff_t *ppos)
+{
+	char ch;
+	size_t read = 0;
+	ssize_t ret;
+
+	if (!*ppos)
+		trace_parser_clear(parser);
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		goto out;
+
+	read++;
+	cnt--;
+
+	/*
+	 * The parser is not finished with the last write,
+	 * continue reading the user input without skipping spaces.
+	 */
+	if (!parser->cont) {
+		/* skip white space */
+		while (cnt && isspace(ch)) {
+			ret = get_user(ch, ubuf++);
+			if (ret)
+				goto out;
+			read++;
+			cnt--;
+		}
+
+		/* only spaces were written */
+		if (isspace(ch)) {
+			*ppos += read;
+			ret = read;
+			goto out;
+		}
+
+		parser->idx = 0;
+	}
+
+	/* read the non-space input */
+	while (cnt && !isspace(ch)) {
+		if (parser->idx < parser->size - 1)
+			parser->buffer[parser->idx++] = ch;
+		else {
+			ret = -EINVAL;
+			goto out;
+		}
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out;
+		read++;
+		cnt--;
+	}
+
+	/* We either got finished input or we have to wait for another call. */
+	if (isspace(ch)) {
+		parser->buffer[parser->idx] = 0;
+		parser->cont = false;
+	} else if (parser->idx < parser->size - 1) {
+		parser->cont = true;
+		parser->buffer[parser->idx++] = ch;
+	} else {
+		/*
+		 * Storing the pending character would use the byte that
+		 * the loop above keeps free for the terminating NUL.
+		 */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	*ppos += read;
+	ret = read;
+
+out:
+	return ret;
+}
+
+/*
+ * trace_seq_to_user - copy the unread part of a trace_seq to user space
+ *
+ * Copies at most @cnt of the bytes between s->readpos and s->len to @ubuf
+ * and advances s->readpos by the number of bytes actually copied.
+ *
+ * Returns the number of bytes copied, 0 if @cnt is 0, -EBUSY if there is
+ * nothing unread, or -EFAULT if copy_to_user() could not copy anything.
+ */
+ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
+{
+	int len;
+	int ret;
+
+	if (!cnt)
+		return 0;
+
+	if (s->len <= s->readpos)
+		return -EBUSY;
+
+	len = s->len - s->readpos;
+	if (cnt > len)
+		cnt = len;
+	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+	if (ret == cnt)
+		return -EFAULT;
+
+	cnt -= ret;
+
+	s->readpos += cnt;
+	return cnt;
+}
+
+/*
+ * trace_seq_to_buffer - copy the unread part of a trace_seq to a buffer
+ *
+ * Copies at most @cnt of the bytes between s->readpos and s->len to @buf
+ * and advances s->readpos.  Returns the number of bytes copied, or -EBUSY
+ * if there is nothing unread.
+ */
+static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
+{
+	int len;
+
+	if (s->len <= s->readpos)
+		return -EBUSY;
+
+	len = s->len - s->readpos;
+	if (cnt > len)
+		cnt = len;
+	/* memcpy() cannot fail, so there is no result to check */
+	memcpy(buf, s->buffer + s->readpos, cnt);
+
+	s->readpos += cnt;
+	return cnt;
+}
+
+/*
+ * ftrace_max_lock is used to protect the swapping of buffers
+ * when taking a max snapshot. The buffers themselves are
+ * protected by per_cpu spinlocks. But the action of the swap
+ * needs its own lock.
+ *
+ * This is defined as a arch_spinlock_t in order to help
+ * with performance when lockdep debugging is enabled.
+ *
+ * It is also used in other places outside the update_max_tr
+ * so it needs to be defined outside of the
+ * CONFIG_TRACER_MAX_TRACE.
+ */
+static arch_spinlock_t ftrace_max_lock =
+	(arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
+
+unsigned long __read_mostly	tracing_thresh;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+unsigned long __read_mostly	tracing_max_latency;
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure.
(this way the maximum trace is permanently saved, + * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + */ +static void +__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct trace_array_cpu *data = tr->data[cpu]; + struct trace_array_cpu *max_data; + + max_tr.cpu = cpu; + max_tr.time_start = data->preempt_timestamp; + + max_data = max_tr.data[cpu]; + max_data->saved_latency = tracing_max_latency; + max_data->critical_start = data->critical_start; + max_data->critical_end = data->critical_end; + + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); + max_data->pid = tsk->pid; + max_data->uid = task_uid(tsk); + max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; + max_data->policy = tsk->policy; + max_data->rt_priority = tsk->rt_priority; + + /* record this tasks comm */ + tracing_record_cmdline(tsk); +} + +/** + * update_max_tr - snapshot all trace buffers from global_trace to max_tr + * @tr: tracer + * @tsk: the task with the latency + * @cpu: The cpu that initiated the trace. + * + * Flip the buffers between the @tr and the max_tr and record information + * about which task was the cause of this latency. + */ +void +update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + struct ring_buffer *buf = tr->buffer; + + if (trace_stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + arch_spin_lock(&ftrace_max_lock); + + tr->buffer = max_tr.buffer; + max_tr.buffer = buf; + + __update_max_tr(tr, tsk, cpu); + arch_spin_unlock(&ftrace_max_lock); +} + +/** + * update_max_tr_single - only copy one trace over, and reset the rest + * @tr - tracer + * @tsk - task with the latency + * @cpu - the cpu of the buffer to copy. + * + * Flip the trace of a single CPU buffer between the @tr and the max_tr. 
+ */ +void +update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) +{ + int ret; + + if (trace_stop_count) + return; + + WARN_ON_ONCE(!irqs_disabled()); + if (!current_trace->use_max_tr) { + WARN_ON_ONCE(1); + return; + } + + arch_spin_lock(&ftrace_max_lock); + + ftrace_disable_cpu(); + + ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + + if (ret == -EBUSY) { + /* + * We failed to swap the buffer due to a commit taking + * place on this CPU. We fail to record, but we reset + * the max trace buffer (no one writes directly to it) + * and flag that it failed. + */ + trace_array_printk(&max_tr, _THIS_IP_, + "Failed to swap buffers due to commit in progress\n"); + } + + ftrace_enable_cpu(); + + WARN_ON_ONCE(ret && ret != -EAGAIN && ret != -EBUSY); + + __update_max_tr(tr, tsk, cpu); + arch_spin_unlock(&ftrace_max_lock); +} +#endif /* CONFIG_TRACER_MAX_TRACE */ + +/** + * register_tracer - register a tracer with the ftrace system. + * @type - the plugin for the tracer + * + * Register a new plugin tracer. 
+ */ +int register_tracer(struct tracer *type) +__releases(kernel_lock) +__acquires(kernel_lock) +{ + struct tracer *t; + int ret = 0; + + if (!type->name) { + pr_info("Tracer must have a name\n"); + return -1; + } + + if (strlen(type->name) >= MAX_TRACER_SIZE) { + pr_info("Tracer has a name longer than %d\n", MAX_TRACER_SIZE); + return -1; + } + + mutex_lock(&trace_types_lock); + + tracing_selftest_running = true; + + for (t = trace_types; t; t = t->next) { + if (strcmp(type->name, t->name) == 0) { + /* already found */ + pr_info("Tracer %s already registered\n", + type->name); + ret = -1; + goto out; + } + } + + if (!type->set_flag) + type->set_flag = &dummy_set_flag; + if (!type->flags) + type->flags = &dummy_tracer_flags; + else + if (!type->flags->opts) + type->flags->opts = dummy_tracer_opt; + if (!type->wait_pipe) + type->wait_pipe = default_wait_pipe; + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + if (type->selftest && !tracing_selftest_disabled) { + struct tracer *saved_tracer = current_trace; + struct trace_array *tr = &global_trace; + + /* + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. 
+ */ + tracing_reset_online_cpus(tr); + + current_trace = type; + + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, trace_buf_size); + + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + current_trace = saved_tracer; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + goto out; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + tracing_reset_online_cpus(tr); + + /* Shrink the max buffer again */ + if (ring_buffer_expanded && type->use_max_tr) + ring_buffer_resize(max_tr.buffer, 1); + + printk(KERN_CONT "PASSED\n"); + } +#endif + + type->next = trace_types; + trace_types = type; + + out: + tracing_selftest_running = false; + mutex_unlock(&trace_types_lock); + + if (ret || !default_bootup_tracer) + goto out_unlock; + + if (strncmp(default_bootup_tracer, type->name, MAX_TRACER_SIZE)) + goto out_unlock; + + printk(KERN_INFO "Starting tracer '%s'\n", type->name); + /* Do we want this tracer to start on bootup? */ + tracing_set_tracer(type->name); + default_bootup_tracer = NULL; + /* disable other selftests, since this will break it. 
*/ + tracing_selftest_disabled = 1; +#ifdef CONFIG_FTRACE_STARTUP_TEST + printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", + type->name); +#endif + + out_unlock: + return ret; +} + +void unregister_tracer(struct tracer *type) +{ + struct tracer **t; + + mutex_lock(&trace_types_lock); + for (t = &trace_types; *t; t = &(*t)->next) { + if (*t == type) + goto found; + } + pr_info("Tracer %s not registered\n", type->name); + goto out; + + found: + *t = (*t)->next; + + if (type == current_trace && tracer_enabled) { + tracer_enabled = 0; + tracing_stop(); + if (current_trace->stop) + current_trace->stop(&global_trace); + current_trace = &nop_trace; + } +out: + mutex_unlock(&trace_types_lock); +} + +static void __tracing_reset(struct ring_buffer *buffer, int cpu) +{ + ftrace_disable_cpu(); + ring_buffer_reset_cpu(buffer, cpu); + ftrace_enable_cpu(); +} + +void tracing_reset(struct trace_array *tr, int cpu) +{ + struct ring_buffer *buffer = tr->buffer; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + __tracing_reset(buffer, cpu); + + ring_buffer_record_enable(buffer); +} + +void tracing_reset_online_cpus(struct trace_array *tr) +{ + struct ring_buffer *buffer = tr->buffer; + int cpu; + + ring_buffer_record_disable(buffer); + + /* Make sure all commits have finished */ + synchronize_sched(); + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + __tracing_reset(buffer, cpu); + + ring_buffer_record_enable(buffer); +} + +void tracing_reset_current(int cpu) +{ + tracing_reset(&global_trace, cpu); +} + +void tracing_reset_current_online_cpus(void) +{ + tracing_reset_online_cpus(&global_trace); +} + +#define SAVED_CMDLINES 128 +#define NO_CMDLINE_MAP UINT_MAX +static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; +static unsigned map_cmdline_to_pid[SAVED_CMDLINES]; +static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN]; +static int cmdline_idx; +static arch_spinlock_t 
trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED; + +/* temporarily disable recording */ +static atomic_t trace_record_cmdline_disabled __read_mostly; + +static void trace_init_cmdlines(void) +{ + memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline)); + memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid)); + cmdline_idx = 0; +} + +int is_tracing_stopped(void) +{ + return trace_stop_count; +} + +/** + * ftrace_off_permanent - disable all ftrace code permanently + * + * This should only be called when a serious anomaly has + * been detected. This will turn off the function tracing, + * ring buffers, and other tracing utilities. It takes no + * locks and can be called from any context. + */ +void ftrace_off_permanent(void) +{ + tracing_disabled = 1; + ftrace_stop(); + tracing_off_permanent(); +} + +/** + * tracing_start - quick start of the tracer + * + * If tracing is enabled but was stopped by tracing_stop, + * this will start the tracer back up. + */ +void tracing_start(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + spin_lock_irqsave(&tracing_start_lock, flags); + if (--trace_stop_count) { + if (trace_stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + trace_stop_count = 0; + } + goto out; + } + + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + arch_spin_unlock(&ftrace_max_lock); + + ftrace_start(); + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + +/** + * tracing_stop - quick stop of the tracer + * + * Light weight way to stop tracing. Use in conjunction with + * tracing_start. 
+ */ +void tracing_stop(void) +{ + struct ring_buffer *buffer; + unsigned long flags; + + ftrace_stop(); + spin_lock_irqsave(&tracing_start_lock, flags); + if (trace_stop_count++) + goto out; + + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); + + buffer = global_trace.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + buffer = max_tr.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + arch_spin_unlock(&ftrace_max_lock); + + out: + spin_unlock_irqrestore(&tracing_start_lock, flags); +} + +void trace_stop_cmdline_recording(void); + +static void trace_save_cmdline(struct task_struct *tsk) +{ + unsigned pid, idx; + + if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT)) + return; + + /* + * It's not the end of the world if we don't get + * the lock, but we also don't want to spin + * nor do we want to disable interrupts, + * so if we miss here, then better luck next time. + */ + if (!arch_spin_trylock(&trace_cmdline_lock)) + return; + + idx = map_pid_to_cmdline[tsk->pid]; + if (idx == NO_CMDLINE_MAP) { + idx = (cmdline_idx + 1) % SAVED_CMDLINES; + + /* + * Check whether the cmdline buffer at idx has a pid + * mapped. We are going to overwrite that entry so we + * need to clear the map_pid_to_cmdline. Otherwise we + * would read the new comm for the old pid. 
+ */ + pid = map_cmdline_to_pid[idx]; + if (pid != NO_CMDLINE_MAP) + map_pid_to_cmdline[pid] = NO_CMDLINE_MAP; + + map_cmdline_to_pid[idx] = tsk->pid; + map_pid_to_cmdline[tsk->pid] = idx; + + cmdline_idx = idx; + } + + memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN); + + arch_spin_unlock(&trace_cmdline_lock); +} + +void trace_find_cmdline(int pid, char comm[]) +{ + unsigned map; + + if (!pid) { + strcpy(comm, ""); + return; + } + + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, ""); + return; + } + + if (pid > PID_MAX_DEFAULT) { + strcpy(comm, "<...>"); + return; + } + + preempt_disable(); + arch_spin_lock(&trace_cmdline_lock); + map = map_pid_to_cmdline[pid]; + if (map != NO_CMDLINE_MAP) + strcpy(comm, saved_cmdlines[map]); + else + strcpy(comm, "<...>"); + + arch_spin_unlock(&trace_cmdline_lock); + preempt_enable(); +} + +void tracing_record_cmdline(struct task_struct *tsk) +{ + if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled || + !tracing_is_on()) + return; + + trace_save_cmdline(tsk); +} + +void +tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, + int pc) +{ + struct task_struct *tsk = current; + + entry->preempt_count = pc & 0xff; + entry->pid = (tsk) ? tsk->pid : 0; + entry->padding = 0; + entry->flags = +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | +#else + TRACE_FLAG_IRQS_NOSUPPORT | +#endif + ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | + ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | + (need_resched() ? 
TRACE_FLAG_NEED_RESCHED : 0); +} +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); + +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, int pc) +{ + struct ring_buffer_event *event; + + event = ring_buffer_lock_reserve(buffer, len); + if (event != NULL) { + struct trace_entry *ent = ring_buffer_event_data(event); + + tracing_generic_entry_update(ent, flags, pc); + ent->type = type; + } + + return event; +} + +static inline void +__trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc, + int wake) +{ + ring_buffer_unlock_commit(buffer, event); + + ftrace_trace_stack(buffer, flags, 6, pc); + ftrace_trace_userstack(buffer, flags, pc); + + if (wake) + trace_wake_up(); +} + +void trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); +} + +struct ring_buffer_event * +trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, + int type, unsigned long len, + unsigned long flags, int pc) +{ + *current_rb = global_trace.buffer; + return trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); + +void trace_current_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(buffer, event, flags, pc, 1); +} +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); + +void trace_nowake_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc) +{ + __trace_buffer_unlock_commit(buffer, event, flags, pc, 0); +} +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); + +void trace_current_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + 
ring_buffer_discard_commit(buffer, event); +} +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); + +void +trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_function; + struct ring_buffer *buffer = tr->buffer; + struct ring_buffer_event *event; + struct ftrace_entry *entry; + + /* If we are reading the ring buffer, don't trace */ + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) + return; + + event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); +} + +void +ftrace(struct trace_array *tr, struct trace_array_cpu *data, + unsigned long ip, unsigned long parent_ip, unsigned long flags, + int pc) +{ + if (likely(!atomic_read(&data->disabled))) + trace_function(tr, ip, parent_ip, flags, pc); +} + +#ifdef CONFIG_STACKTRACE +static void __ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, + int skip, int pc) +{ + struct ftrace_event_call *call = &event_kernel_stack; + struct ring_buffer_event *event; + struct stack_entry *entry; + struct stack_trace trace; + + event = trace_buffer_lock_reserve(buffer, TRACE_STACK, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + memset(&entry->caller, 0, sizeof(entry->caller)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = skip; + trace.entries = entry->caller; + + save_stack_trace(&trace); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); +} + +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc) +{ + if (!(trace_flags & TRACE_ITER_STACKTRACE)) + return; + + __ftrace_trace_stack(buffer, 
flags, skip, pc); +} + +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc) +{ + __ftrace_trace_stack(tr->buffer, flags, skip, pc); +} + +/** + * trace_dump_stack - record a stack back trace in the trace buffer + */ +void trace_dump_stack(void) +{ + unsigned long flags; + + if (tracing_disabled || tracing_selftest_running) + return; + + local_save_flags(flags); + + /* skipping 3 traces, seems to get us at the caller of this function */ + __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count()); +} + +static DEFINE_PER_CPU(int, user_stack_count); + +void +ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_user_stack; + struct ring_buffer_event *event; + struct userstack_entry *entry; + struct stack_trace trace; + + if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) + return; + + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. + */ + if (unlikely(in_nmi())) + return; + + /* + * prevent recursion, since the user stack tracing may + * trigger other kernel events. 
+ */ + preempt_disable(); + if (__this_cpu_read(user_stack_count)) + goto out; + + __this_cpu_inc(user_stack_count); + + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, + sizeof(*entry), flags, pc); + if (!event) + goto out_drop_count; + entry = ring_buffer_event_data(event); + + entry->tgid = current->tgid; + memset(&entry->caller, 0, sizeof(entry->caller)); + + trace.nr_entries = 0; + trace.max_entries = FTRACE_STACK_ENTRIES; + trace.skip = 0; + trace.entries = entry->caller; + + save_stack_trace_user(&trace); + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); + + out_drop_count: + __this_cpu_dec(user_stack_count); + out: + preempt_enable(); +} + +#ifdef UNUSED +static void __trace_userstack(struct trace_array *tr, unsigned long flags) +{ + ftrace_trace_userstack(tr, flags, preempt_count()); +} +#endif /* UNUSED */ + +#endif /* CONFIG_STACKTRACE */ + +/** + * trace_vbprintk - write binary msg to tracing buffer + * + */ +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) +{ + static arch_spinlock_t trace_buf_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + static u32 trace_buf[TRACE_BUF_SIZE]; + + struct ftrace_event_call *call = &event_bprint; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct trace_array *tr = &global_trace; + struct trace_array_cpu *data; + struct bprint_entry *entry; + unsigned long flags; + int disable; + int cpu, len = 0, size, pc; + + if (unlikely(tracing_selftest_running || tracing_disabled)) + return 0; + + /* Don't pollute graph traces with trace_vprintk internals */ + pause_graph_tracing(); + + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) + goto out; + + /* Lockdep uses trace_printk for lock tracing */ + local_irq_save(flags); + arch_spin_lock(&trace_buf_lock); + len = vbin_printf(trace_buf, 
TRACE_BUF_SIZE, fmt, args); + + if (len > TRACE_BUF_SIZE || len < 0) + goto out_unlock; + + size = sizeof(*entry) + sizeof(u32) * len; + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, + flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->fmt = fmt; + + memcpy(entry->buf, trace_buf, sizeof(u32) * len); + if (!filter_check_discard(call, entry, buffer, event)) { + ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); + } + +out_unlock: + arch_spin_unlock(&trace_buf_lock); + local_irq_restore(flags); + +out: + atomic_dec_return(&data->disabled); + preempt_enable_notrace(); + unpause_graph_tracing(); + + return len; +} +EXPORT_SYMBOL_GPL(trace_vbprintk); + +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +/** + * trace_array_vprintk - format a message and record it as a TRACE_PRINT event + * @tr: trace array whose ring buffer receives the event + * @ip: instruction pointer stored in the entry + * @fmt: printf-style format string + * @args: arguments for @fmt + * + * The message is formatted into a static buffer of TRACE_BUF_SIZE bytes + * under trace_buf_lock, so longer messages are truncated. Returns the + * number of characters stored in that buffer (even if no event could be + * reserved), or 0 when tracing is disabled, the selftest is running, or + * the per-cpu disabled counter was already non-zero. + */ +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + static arch_spinlock_t trace_buf_lock = __ARCH_SPIN_LOCK_UNLOCKED; + static char trace_buf[TRACE_BUF_SIZE]; + + struct ftrace_event_call *call = &event_print; + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct trace_array_cpu *data; + int cpu, len = 0, size, pc; + struct print_entry *entry; + unsigned long irq_flags; + int disable; + + if (tracing_disabled || tracing_selftest_running) + return 0; + + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) + goto out; + + pause_graph_tracing(); + raw_local_irq_save(irq_flags); + arch_spin_lock(&trace_buf_lock); + /* + * vscnprintf() returns the number of characters actually stored in + * trace_buf. vsnprintf() would return the untruncated length, and a + * message longer than TRACE_BUF_SIZE would then make the memcpy() + * below read past the end of trace_buf. + */ + len = vscnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args); + + size = sizeof(*entry) + len + 1; + buffer =
tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, + irq_flags, pc); + if (!event) + goto out_unlock; + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, trace_buf, len); + entry->buf[len] = '\0'; + if (!filter_check_discard(call, entry, buffer, event)) { + ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 6, pc); + } + + out_unlock: + arch_spin_unlock(&trace_buf_lock); + raw_local_irq_restore(irq_flags); + unpause_graph_tracing(); + out: + atomic_dec_return(&data->disabled); + preempt_enable_notrace(); + + return len; +} + +int trace_vprintk(unsigned long ip, const char *fmt, va_list args) +{ + return trace_array_vprintk(&global_trace, ip, fmt, args); +} +EXPORT_SYMBOL_GPL(trace_vprintk); + +static void trace_iterator_increment(struct trace_iterator *iter) +{ + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + + iter->idx++; + if (iter->buffer_iter[iter->cpu]) + ring_buffer_read(iter->buffer_iter[iter->cpu], NULL); + + ftrace_enable_cpu(); +} + +static struct trace_entry * +peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, + unsigned long *lost_events) +{ + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; + + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + + if (buf_iter) + event = ring_buffer_iter_peek(buf_iter, ts); + else + event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + lost_events); + + ftrace_enable_cpu(); + + return event ? 
ring_buffer_event_data(event) : NULL; +} + +static struct trace_entry * +__find_next_entry(struct trace_iterator *iter, int *ent_cpu, + unsigned long *missing_events, u64 *ent_ts) +{ + struct ring_buffer *buffer = iter->tr->buffer; + struct trace_entry *ent, *next = NULL; + unsigned long lost_events = 0, next_lost = 0; + int cpu_file = iter->cpu_file; + u64 next_ts = 0, ts; + int next_cpu = -1; + int cpu; + + /* + * If we are in a per_cpu trace file, don't bother by iterating over + * all cpu and peek directly. + */ + if (cpu_file > TRACE_PIPE_ALL_CPU) { + if (ring_buffer_empty_cpu(buffer, cpu_file)) + return NULL; + ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); + if (ent_cpu) + *ent_cpu = cpu_file; + + return ent; + } + + for_each_tracing_cpu(cpu) { + + if (ring_buffer_empty_cpu(buffer, cpu)) + continue; + + ent = peek_next_entry(iter, cpu, &ts, &lost_events); + + /* + * Pick the entry with the smallest timestamp: + */ + if (ent && (!next || ts < next_ts)) { + next = ent; + next_cpu = cpu; + next_ts = ts; + next_lost = lost_events; + } + } + + if (ent_cpu) + *ent_cpu = next_cpu; + + if (ent_ts) + *ent_ts = next_ts; + + if (missing_events) + *missing_events = next_lost; + + return next; +} + +/* Find the next real entry, without updating the iterator itself */ +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts) +{ + return __find_next_entry(iter, ent_cpu, NULL, ent_ts); +} + +/* Find the next real entry, and increment the iterator to the next entry */ +void *trace_find_next_entry_inc(struct trace_iterator *iter) +{ + iter->ent = __find_next_entry(iter, &iter->cpu, + &iter->lost_events, &iter->ts); + + if (iter->ent) + trace_iterator_increment(iter); + + return iter->ent ? 
iter : NULL; +} + +static void trace_consume(struct trace_iterator *iter) +{ + /* Don't allow ftrace to trace into the ring buffers */ + ftrace_disable_cpu(); + ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + &iter->lost_events); + ftrace_enable_cpu(); +} + +static void *s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + int i = (int)*pos; + void *ent; + + WARN_ON_ONCE(iter->leftover); + + (*pos)++; + + /* can't go backwards */ + if (iter->idx > i) + return NULL; + + if (iter->idx < 0) + ent = trace_find_next_entry_inc(iter); + else + ent = iter; + + while (ent && iter->idx < i) + ent = trace_find_next_entry_inc(iter); + + iter->pos = *pos; + + return ent; +} + +void tracing_iter_reset(struct trace_iterator *iter, int cpu) +{ + struct trace_array *tr = iter->tr; + struct ring_buffer_event *event; + struct ring_buffer_iter *buf_iter; + unsigned long entries = 0; + u64 ts; + + tr->data[cpu]->skipped_entries = 0; + + if (!iter->buffer_iter[cpu]) + return; + + buf_iter = iter->buffer_iter[cpu]; + ring_buffer_iter_reset(buf_iter); + + /* + * We could have the case with the max latency tracers + * that a reset never took place on a cpu. This is evident + * by the timestamp being before the start of the buffer. + */ + while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { + if (ts >= iter->tr->time_start) + break; + entries++; + ring_buffer_read(buf_iter, NULL); + } + + tr->data[cpu]->skipped_entries = entries; +} + +/* + * The current tracer is copied to avoid a global locking + * all around. 
+ */ +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct trace_iterator *iter = m->private; + static struct tracer *old_tracer; + int cpu_file = iter->cpu_file; + void *p = NULL; + loff_t l = 0; + int cpu; + + /* copy the tracer to avoid using a global lock all around */ + mutex_lock(&trace_types_lock); + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; + } + mutex_unlock(&trace_types_lock); + + atomic_inc(&trace_record_cmdline_disabled); + + if (*pos != iter->pos) { + iter->ent = NULL; + iter->cpu = 0; + iter->idx = -1; + + ftrace_disable_cpu(); + + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) + tracing_iter_reset(iter, cpu); + } else + tracing_iter_reset(iter, cpu_file); + + ftrace_enable_cpu(); + + iter->leftover = 0; + for (p = iter; p && l < *pos; p = s_next(m, p, &l)) + ; + + } else { + /* + * If we overflowed the seq_file before, then we want + * to just reuse the trace_seq buffer again. 
+ */ + if (iter->leftover) + p = iter; + else { + l = *pos - 1; + p = s_next(m, p, &l); + } + } + + trace_event_read_lock(); + trace_access_lock(cpu_file); + return p; +} + +static void s_stop(struct seq_file *m, void *p) +{ + struct trace_iterator *iter = m->private; + + atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); +} + +static void print_lat_help_header(struct seq_file *m) +{ + seq_puts(m, "# _------=> CPU# \n"); + seq_puts(m, "# / _-----=> irqs-off \n"); + seq_puts(m, "# | / _----=> need-resched \n"); + seq_puts(m, "# || / _---=> hardirq/softirq \n"); + seq_puts(m, "# ||| / _--=> preempt-depth \n"); + seq_puts(m, "# |||| / delay \n"); + seq_puts(m, "# cmd pid ||||| time | caller \n"); + seq_puts(m, "# \\ / ||||| \\ | / \n"); +} + +static void print_func_help_header(struct seq_file *m) +{ + seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); + seq_puts(m, "# | | | | |\n"); +} + + +void +print_trace_header(struct seq_file *m, struct trace_iterator *iter) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_array *tr = iter->tr; + struct trace_array_cpu *data = tr->data[tr->cpu]; + struct tracer *type = current_trace; + unsigned long entries = 0; + unsigned long total = 0; + unsigned long count; + const char *name = "preemption"; + int cpu; + + if (type) + name = type->name; + + + for_each_tracing_cpu(cpu) { + count = ring_buffer_entries_cpu(tr->buffer, cpu); + /* + * If this buffer has skipped entries, then we hold all + * entries for the trace and we need to ignore the + * ones before the time stamp. 
+ */ + if (tr->data[cpu]->skipped_entries) { + count -= tr->data[cpu]->skipped_entries; + /* total is the same as the entries */ + total += count; + } else + total += count + + ring_buffer_overrun_cpu(tr->buffer, cpu); + entries += count; + } + + seq_printf(m, "# %s latency trace v1.1.5 on %s\n", + name, UTS_RELEASE); + seq_puts(m, "# -----------------------------------" + "---------------------------------\n"); + seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |" + " (M:%s VP:%d, KP:%d, SP:%d HP:%d", + nsecs_to_usecs(data->saved_latency), + entries, + total, + tr->cpu, +#if defined(CONFIG_PREEMPT_NONE) + "server", +#elif defined(CONFIG_PREEMPT_VOLUNTARY) + "desktop", +#elif defined(CONFIG_PREEMPT) + "preempt", +#else + "unknown", +#endif + /* These are reserved for later use */ + 0, 0, 0, 0); +#ifdef CONFIG_SMP + seq_printf(m, " #P:%d)\n", num_online_cpus()); +#else + seq_puts(m, ")\n"); +#endif + seq_puts(m, "# -----------------\n"); + seq_printf(m, "# | task: %.16s-%d " + "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n", + data->comm, data->pid, data->uid, data->nice, + data->policy, data->rt_priority); + seq_puts(m, "# -----------------\n"); + + if (data->critical_start) { + seq_puts(m, "# => started at: "); + seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n# => ended at: "); + seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags); + trace_print_seq(m, &iter->seq); + seq_puts(m, "\n#\n"); + } + + seq_puts(m, "#\n"); +} + +static void test_cpu_buff_start(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + + if (!(trace_flags & TRACE_ITER_ANNOTATE)) + return; + + if (!(iter->iter_flags & TRACE_FILE_ANNOTATE)) + return; + + if (cpumask_test_cpu(iter->cpu, iter->started)) + return; + + if (iter->tr->data[iter->cpu]->skipped_entries) + return; + + cpumask_set_cpu(iter->cpu, iter->started); + + /* Don't print started cpu buffer for the first entry of the trace */ + if (iter->idx 
> 1) + trace_seq_printf(s, "##### CPU %u buffer started ####\n", + iter->cpu); +} + +static enum print_line_t print_trace_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + test_cpu_buff_start(iter); + + event = ftrace_find_event(entry->type); + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + if (!trace_print_lat_context(iter)) + goto partial; + } else { + if (!trace_print_context(iter)) + goto partial; + } + } + + if (event) + return event->funcs->trace(iter, sym_flags, event); + + if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t print_raw_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + if (!trace_seq_printf(s, "%d %d %llu ", + entry->pid, iter->cpu, iter->ts)) + goto partial; + } + + event = ftrace_find_event(entry->type); + if (event) + return event->funcs->raw(iter, 0, event); + + if (!trace_seq_printf(s, "%d ?\n", entry->type)) + goto partial; + + return TRACE_TYPE_HANDLED; +partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t print_hex_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + unsigned char newline = '\n'; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_HEX_FIELD_RET(s, entry->pid); + SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); + SEQ_PUT_HEX_FIELD_RET(s, iter->ts); + } + + event = ftrace_find_event(entry->type); + if (event) { + enum print_line_t ret = event->funcs->hex(iter, 0, event); + if (ret != TRACE_TYPE_HANDLED) + 
return ret; + } + + SEQ_PUT_FIELD_RET(s, newline); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t print_bin_fmt(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry; + struct trace_event *event; + + entry = iter->ent; + + if (trace_flags & TRACE_ITER_CONTEXT_INFO) { + SEQ_PUT_FIELD_RET(s, entry->pid); + SEQ_PUT_FIELD_RET(s, iter->cpu); + SEQ_PUT_FIELD_RET(s, iter->ts); + } + + event = ftrace_find_event(entry->type); + return event ? event->funcs->binary(iter, 0, event) : + TRACE_TYPE_HANDLED; +} + +int trace_empty(struct trace_iterator *iter) +{ + int cpu; + + /* If we are looking at one CPU buffer, only check that one */ + if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { + cpu = iter->cpu_file; + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + return 0; + } + return 1; + } + + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) { + if (!ring_buffer_iter_empty(iter->buffer_iter[cpu])) + return 0; + } else { + if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + return 0; + } + } + + return 1; +} + +/* Called with trace_event_read_lock() held. 
*/ +enum print_line_t print_trace_line(struct trace_iterator *iter) +{ + enum print_line_t ret; + + if (iter->lost_events && + !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n", + iter->cpu, iter->lost_events)) + return TRACE_TYPE_PARTIAL_LINE; + + if (iter->trace && iter->trace->print_line) { + ret = iter->trace->print_line(iter); + if (ret != TRACE_TYPE_UNHANDLED) + return ret; + } + + if (iter->ent->type == TRACE_BPRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bprintk_msg_only(iter); + + if (iter->ent->type == TRACE_PRINT && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_printk_msg_only(iter); + + if (trace_flags & TRACE_ITER_BIN) + return print_bin_fmt(iter); + + if (trace_flags & TRACE_ITER_HEX) + return print_hex_fmt(iter); + + if (trace_flags & TRACE_ITER_RAW) + return print_raw_fmt(iter); + + return print_trace_fmt(iter); +} + +void trace_default_header(struct seq_file *m) +{ + struct trace_iterator *iter = m->private; + + if (iter->iter_flags & TRACE_FILE_LAT_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + print_trace_header(m, iter); + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_lat_help_header(m); + } else { + if (!(trace_flags & TRACE_ITER_VERBOSE)) + print_func_help_header(m); + } +} + +static int s_show(struct seq_file *m, void *v) +{ + struct trace_iterator *iter = v; + int ret; + + if (iter->ent == NULL) { + if (iter->tr) { + seq_printf(m, "# tracer: %s\n", iter->trace->name); + seq_puts(m, "#\n"); + } + if (iter->trace && iter->trace->print_header) + iter->trace->print_header(m); + else + trace_default_header(m); + + } else if (iter->leftover) { + /* + * If we filled the seq_file buffer earlier, we + * want to just show it now. 
+ */ + ret = trace_print_seq(m, &iter->seq); + + /* ret should this time be zero, but you never know */ + iter->leftover = ret; + + } else { + print_trace_line(iter); + ret = trace_print_seq(m, &iter->seq); + /* + * If we overflow the seq_file buffer, then it will + * ask us for this data again at start up. + * Use that instead. + * ret is 0 if seq_file write succeeded. + * -1 otherwise. + */ + iter->leftover = ret; + } + + return 0; +} + +static const struct seq_operations tracer_seq_ops = { + .start = s_start, + .next = s_next, + .stop = s_stop, + .show = s_show, +}; + +static struct trace_iterator * +__tracing_open(struct inode *inode, struct file *file) +{ + long cpu_file = (long) inode->i_private; + void *fail_ret = ERR_PTR(-ENOMEM); + struct trace_iterator *iter; + struct seq_file *m; + int cpu, ret; + + if (tracing_disabled) + return ERR_PTR(-ENODEV); + + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return ERR_PTR(-ENOMEM); + + /* + * We make a copy of the current tracer to avoid concurrent + * changes on it while we are reading. + */ + mutex_lock(&trace_types_lock); + iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL); + if (!iter->trace) + goto fail; + + if (current_trace) + *iter->trace = *current_trace; + + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) + goto fail; + + if (current_trace && current_trace->print_max) + iter->tr = &max_tr; + else + iter->tr = &global_trace; + iter->pos = -1; + mutex_init(&iter->mutex); + iter->cpu_file = cpu_file; + + /* Notify the tracer early; before we stop tracing. 
*/ + if (iter->trace && iter->trace->open) + iter->trace->open(iter); + + /* Annotate start of buffers if we had overruns */ + if (ring_buffer_overruns(iter->tr->buffer)) + iter->iter_flags |= TRACE_FILE_ANNOTATE; + + /* stop the trace while dumping */ + tracing_stop(); + + if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { + iter->buffer_iter[cpu] = + ring_buffer_read_prepare(iter->tr->buffer, cpu); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); + } + } else { + cpu = iter->cpu_file; + iter->buffer_iter[cpu] = + ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare_sync(); + ring_buffer_read_start(iter->buffer_iter[cpu]); + tracing_iter_reset(iter, cpu); + } + + ret = seq_open(file, &tracer_seq_ops); + if (ret < 0) { + fail_ret = ERR_PTR(ret); + goto fail_buffer; + } + + m = file->private_data; + m->private = iter; + + mutex_unlock(&trace_types_lock); + + return iter; + + fail_buffer: + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + free_cpumask_var(iter->started); + tracing_start(); + fail: + mutex_unlock(&trace_types_lock); + kfree(iter->trace); + kfree(iter); + + return fail_ret; +} + +int tracing_open_generic(struct inode *inode, struct file *filp) +{ + if (tracing_disabled) + return -ENODEV; + + filp->private_data = inode->i_private; + return 0; +} + +static int tracing_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct trace_iterator *iter; + int cpu; + + if (!(file->f_mode & FMODE_READ)) + return 0; + + iter = m->private; + + mutex_lock(&trace_types_lock); + for_each_tracing_cpu(cpu) { + if (iter->buffer_iter[cpu]) + ring_buffer_read_finish(iter->buffer_iter[cpu]); + } + + if (iter->trace && iter->trace->close) + iter->trace->close(iter); + + /* reenable tracing if it was previously enabled 
*/ + tracing_start(); + mutex_unlock(&trace_types_lock); + + seq_release(inode, file); + mutex_destroy(&iter->mutex); + free_cpumask_var(iter->started); + kfree(iter->trace); + kfree(iter); + return 0; +} + +static int tracing_open(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter; + int ret = 0; + + /* If this file was open for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + long cpu = (long) inode->i_private; + + if (cpu == TRACE_PIPE_ALL_CPU) + tracing_reset_online_cpus(&global_trace); + else + tracing_reset(&global_trace, cpu); + } + + if (file->f_mode & FMODE_READ) { + iter = __tracing_open(inode, file); + if (IS_ERR(iter)) + ret = PTR_ERR(iter); + else if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + } + return ret; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct tracer *t = v; + + (*pos)++; + + if (t) + t = t->next; + + return t; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct tracer *t; + loff_t l = 0; + + mutex_lock(&trace_types_lock); + for (t = trace_types; t && l < *pos; t = t_next(m, t, &l)) + ; + + return t; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&trace_types_lock); +} + +static int t_show(struct seq_file *m, void *v) +{ + struct tracer *t = v; + + if (!t) + return 0; + + seq_printf(m, "%s", t->name); + if (t->next) + seq_putc(m, ' '); + else + seq_putc(m, '\n'); + + return 0; +} + +static const struct seq_operations show_traces_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int show_traces_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + + return seq_open(file, &show_traces_seq_ops); +} + +static ssize_t +tracing_write_stub(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + return count; +} + +static loff_t tracing_seek(struct 
file *file, loff_t offset, int origin) +{ + if (file->f_mode & FMODE_READ) + return seq_lseek(file, offset, origin); + else + return 0; +} + +static const struct file_operations tracing_fops = { + .open = tracing_open, + .read = seq_read, + .write = tracing_write_stub, + .llseek = tracing_seek, + .release = tracing_release, +}; + +static const struct file_operations show_traces_fops = { + .open = show_traces_open, + .read = seq_read, + .release = seq_release, + .llseek = seq_lseek, +}; + +/* + * Only trace on a CPU if the bitmask is set: + */ +static cpumask_var_t tracing_cpumask; + +/* + * The tracer itself will not take this lock, but still we want + * to provide a consistent cpumask to user-space: + */ +static DEFINE_MUTEX(tracing_cpumask_update_lock); + +/* + * Temporary storage for the character representation of the + * CPU bitmask (plus one byte for the newline and one for the + * terminating NUL, so that NR_CPUS == 1 still fits): + */ +static char mask_str[NR_CPUS + 2]; + +/* + * Read handler for tracing_cpumask_fops: formats tracing_cpumask into + * mask_str under tracing_cpumask_update_lock, appends a newline and + * copies the result to user space. Returns the result of + * simple_read_from_buffer(), or -EINVAL when @count leaves no room + * for the newline. + */ +static ssize_t +tracing_cpumask_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + int len; + + mutex_lock(&tracing_cpumask_update_lock); + + /* + * Bound the output by the size of mask_str as well as by the caller's + * read size, keeping one byte free for the newline added below. + */ + len = cpumask_scnprintf(mask_str, + min_t(size_t, count, sizeof(mask_str) - 1), + tracing_cpumask); + if (count - len < 2) { + count = -EINVAL; + goto out_err; + } + len += sprintf(mask_str + len, "\n"); + /* Expose only the formatted string, not the whole static buffer. */ + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); + +out_err: + mutex_unlock(&tracing_cpumask_update_lock); + + return count; +} + +static ssize_t +tracing_cpumask_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + int err, cpu; + cpumask_var_t tracing_cpumask_new; + + if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) + return -ENOMEM; + + err = cpumask_parse_user(ubuf, count, tracing_cpumask_new); + if (err) + goto err_unlock; + + mutex_lock(&tracing_cpumask_update_lock); + + local_irq_disable(); + arch_spin_lock(&ftrace_max_lock); + for_each_tracing_cpu(cpu) { + /* + * Increase/decrease the disabled counter if we are + * about to flip a bit in the cpumask: +
*/ + if (cpumask_test_cpu(cpu, tracing_cpumask) && + !cpumask_test_cpu(cpu, tracing_cpumask_new)) { + atomic_inc(&global_trace.data[cpu]->disabled); + } + if (!cpumask_test_cpu(cpu, tracing_cpumask) && + cpumask_test_cpu(cpu, tracing_cpumask_new)) { + atomic_dec(&global_trace.data[cpu]->disabled); + } + } + arch_spin_unlock(&ftrace_max_lock); + local_irq_enable(); + + cpumask_copy(tracing_cpumask, tracing_cpumask_new); + + mutex_unlock(&tracing_cpumask_update_lock); + free_cpumask_var(tracing_cpumask_new); + + return count; + +err_unlock: + free_cpumask_var(tracing_cpumask_new); + + return err; +} + +static const struct file_operations tracing_cpumask_fops = { + .open = tracing_open_generic, + .read = tracing_cpumask_read, + .write = tracing_cpumask_write, + .llseek = generic_file_llseek, +}; + +static int tracing_trace_options_show(struct seq_file *m, void *v) +{ + struct tracer_opt *trace_opts; + u32 tracer_flags; + int i; + + mutex_lock(&trace_types_lock); + tracer_flags = current_trace->flags->val; + trace_opts = current_trace->flags->opts; + + for (i = 0; trace_options[i]; i++) { + if (trace_flags & (1 << i)) + seq_printf(m, "%s\n", trace_options[i]); + else + seq_printf(m, "no%s\n", trace_options[i]); + } + + for (i = 0; trace_opts[i].name; i++) { + if (tracer_flags & trace_opts[i].bit) + seq_printf(m, "%s\n", trace_opts[i].name); + else + seq_printf(m, "no%s\n", trace_opts[i].name); + } + mutex_unlock(&trace_types_lock); + + return 0; +} + +static int __set_tracer_option(struct tracer *trace, + struct tracer_flags *tracer_flags, + struct tracer_opt *opts, int neg) +{ + int ret; + + ret = trace->set_flag(tracer_flags->val, opts->bit, !neg); + if (ret) + return ret; + + if (neg) + tracer_flags->val &= ~opts->bit; + else + tracer_flags->val |= opts->bit; + return 0; +} + +/* Try to assign a tracer specific option */ +static int set_tracer_option(struct tracer *trace, char *cmp, int neg) +{ + struct tracer_flags *tracer_flags = trace->flags; + struct tracer_opt 
*opts = NULL; + int i; + + for (i = 0; tracer_flags->opts[i].name; i++) { + opts = &tracer_flags->opts[i]; + + if (strcmp(cmp, opts->name) == 0) + return __set_tracer_option(trace, trace->flags, + opts, neg); + } + + return -EINVAL; +} + +static void set_tracer_flags(unsigned int mask, int enabled) +{ + /* do nothing if flag is already set */ + if (!!(trace_flags & mask) == !!enabled) + return; + + if (enabled) + trace_flags |= mask; + else + trace_flags &= ~mask; + + if (mask == TRACE_ITER_RECORD_CMD) + trace_event_enable_cmd_record(enabled); + + if (mask == TRACE_ITER_OVERWRITE) + ring_buffer_change_overwrite(global_trace.buffer, enabled); +} + +static ssize_t +tracing_trace_options_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int neg = 0; + int ret; + int i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + if (strncmp(cmp, "no", 2) == 0) { + neg = 1; + cmp += 2; + } + + for (i = 0; trace_options[i]; i++) { + if (strcmp(cmp, trace_options[i]) == 0) { + set_tracer_flags(1 << i, !neg); + break; + } + } + + /* If no option could be set, test the specific tracer options */ + if (!trace_options[i]) { + mutex_lock(&trace_types_lock); + ret = set_tracer_option(current_trace, cmp, neg); + mutex_unlock(&trace_types_lock); + if (ret) + return ret; + } + + *ppos += cnt; + + return cnt; +} + +static int tracing_trace_options_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_trace_options_show, NULL); +} + +static const struct file_operations tracing_iter_fops = { + .open = tracing_trace_options_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = tracing_trace_options_write, +}; + +static const char readme_msg[] = + "tracing mini-HOWTO:\n\n" + "# mount -t debugfs nodev /sys/kernel/debug\n\n" + "# cat 
/sys/kernel/debug/tracing/available_tracers\n" + "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n" + "# cat /sys/kernel/debug/tracing/current_tracer\n" + "nop\n" + "# echo sched_switch > /sys/kernel/debug/tracing/current_tracer\n" + "# cat /sys/kernel/debug/tracing/current_tracer\n" + "sched_switch\n" + "# cat /sys/kernel/debug/tracing/trace_options\n" + "noprint-parent nosym-offset nosym-addr noverbose\n" + "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" + "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" + "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" + "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" +; + +static ssize_t +tracing_readme_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + return simple_read_from_buffer(ubuf, cnt, ppos, + readme_msg, strlen(readme_msg)); +} + +static const struct file_operations tracing_readme_fops = { + .open = tracing_open_generic, + .read = tracing_readme_read, + .llseek = generic_file_llseek, +}; + +static ssize_t +tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf_comm; + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!buf_comm) { + kfree(file_buf); + return -ENOMEM; + } + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES; i++) { + int r; + + pid = map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + trace_find_cmdline(pid, buf_comm); + r = sprintf(buf, "%d %s\n", pid, buf_comm); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + kfree(buf_comm); + + return len; +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_open_generic, + .read = 
tracing_saved_cmdlines_read, + .llseek = generic_file_llseek, +}; + +static ssize_t +tracing_ctrl_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", tracer_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_ctrl_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&trace_types_lock); + if (tracer_enabled ^ val) { + + /* Only need to warn if this is used to change the state */ + WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on"); + + if (val) { + tracer_enabled = 1; + if (current_trace->start) + current_trace->start(tr); + tracing_start(); + } else { + tracer_enabled = 0; + tracing_stop(); + if (current_trace->stop) + current_trace->stop(tr); + } + } + mutex_unlock(&trace_types_lock); + + *ppos += cnt; + + return cnt; +} + +static ssize_t +tracing_set_trace_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_TRACER_SIZE+2]; + int r; + + mutex_lock(&trace_types_lock); + if (current_trace) + r = sprintf(buf, "%s\n", current_trace->name); + else + r = sprintf(buf, "\n"); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +int tracer_init(struct tracer *t, struct trace_array *tr) +{ + tracing_reset_online_cpus(tr); + return t->init(tr); +} + +static int tracing_resize_ring_buffer(unsigned long size) +{ + int ret; + + /* + * If kernel or user changes the size of the ring buffer + * we use the size that was given, and we can forget about + * expanding it later. 
+ */ + ring_buffer_expanded = 1; + + ret = ring_buffer_resize(global_trace.buffer, size); + if (ret < 0) + return ret; + + if (!current_trace->use_max_tr) + goto out; + + ret = ring_buffer_resize(max_tr.buffer, size); + if (ret < 0) { + int r; + + r = ring_buffer_resize(global_trace.buffer, + global_trace.entries); + if (r < 0) { + /* + * AARGH! We are left with different + * size max buffer!!!! + * The max buffer is our "snapshot" buffer. + * When a tracer needs a snapshot (one of the + * latency tracers), it swaps the max buffer + * with the saved snap shot. We succeeded to + * update the size of the main buffer, but failed to + * update the size of the max buffer. But when we tried + * to reset the main buffer to the original size, we + * failed there too. This is very unlikely to + * happen, but if it does, warn and kill all + * tracing. + */ + WARN_ON(1); + tracing_disabled = 1; + } + return ret; + } + + max_tr.entries = size; + out: + global_trace.entries = size; + + return ret; +} + + +/** + * tracing_update_buffers - used by tracing facility to expand ring buffers + * + * To save on memory when the tracing is never used on a system with it + * configured in. The ring buffers are set to a minimum size. But once + * a user starts to use the tracing facility, then they need to grow + * to their default size. + * + * This function is to be called when a tracer is about to be used. 
+ */ +int tracing_update_buffers(void) +{ + int ret = 0; + + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) + ret = tracing_resize_ring_buffer(trace_buf_size); + mutex_unlock(&trace_types_lock); + + return ret; +} + +struct trace_option_dentry; + +static struct trace_option_dentry * +create_trace_option_files(struct tracer *tracer); + +static void +destroy_trace_option_files(struct trace_option_dentry *topts); + +static int tracing_set_tracer(const char *buf) +{ + static struct trace_option_dentry *topts; + struct trace_array *tr = &global_trace; + struct tracer *t; + int ret = 0; + + mutex_lock(&trace_types_lock); + + if (!ring_buffer_expanded) { + ret = tracing_resize_ring_buffer(trace_buf_size); + if (ret < 0) + goto out; + ret = 0; + } + + for (t = trace_types; t; t = t->next) { + if (strcmp(t->name, buf) == 0) + break; + } + if (!t) { + ret = -EINVAL; + goto out; + } + if (t == current_trace) + goto out; + + trace_branch_disable(); + if (current_trace && current_trace->reset) + current_trace->reset(tr); + if (current_trace && current_trace->use_max_tr) { + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. 
+ */ + ring_buffer_resize(max_tr.buffer, 1); + max_tr.entries = 1; + } + destroy_trace_option_files(topts); + + current_trace = t; + + topts = create_trace_option_files(current_trace); + if (current_trace->use_max_tr) { + ret = ring_buffer_resize(max_tr.buffer, global_trace.entries); + if (ret < 0) + goto out; + max_tr.entries = global_trace.entries; + } + + if (t->init) { + ret = tracer_init(t, tr); + if (ret) + goto out; + } + + trace_branch_enable(tr); + out: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static ssize_t +tracing_set_trace_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[MAX_TRACER_SIZE+1]; + int i; + size_t ret; + int err; + + ret = cnt; + + if (cnt > MAX_TRACER_SIZE) + cnt = MAX_TRACER_SIZE; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + /* strip ending whitespace. */ + for (i = cnt - 1; i > 0 && isspace(buf[i]); i--) + buf[i] = 0; + + err = tracing_set_tracer(buf); + if (err) + return err; + + *ppos += ret; + + return ret; +} + +static ssize_t +tracing_max_lat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", + *ptr == (unsigned long)-1 ? 
-1 : nsecs_to_usecs(*ptr)); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_max_lat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + *ptr = val * 1000; + + return cnt; +} + +static int tracing_open_pipe(struct inode *inode, struct file *filp) +{ + long cpu_file = (long) inode->i_private; + struct trace_iterator *iter; + int ret = 0; + + if (tracing_disabled) + return -ENODEV; + + mutex_lock(&trace_types_lock); + + /* create a buffer to store the information to pass to userspace */ + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + ret = -ENOMEM; + goto out; + } + + /* + * We make a copy of the current tracer to avoid concurrent + * changes on it while we are reading. 
+ */ + iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL); + if (!iter->trace) { + ret = -ENOMEM; + goto fail; + } + if (current_trace) + *iter->trace = *current_trace; + + if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { + ret = -ENOMEM; + goto fail; + } + + /* trace pipe does not show start of buffer */ + cpumask_setall(iter->started); + + if (trace_flags & TRACE_ITER_LATENCY_FMT) + iter->iter_flags |= TRACE_FILE_LAT_FMT; + + iter->cpu_file = cpu_file; + iter->tr = &global_trace; + mutex_init(&iter->mutex); + filp->private_data = iter; + + if (iter->trace->pipe_open) + iter->trace->pipe_open(iter); + + nonseekable_open(inode, filp); +out: + mutex_unlock(&trace_types_lock); + return ret; + +fail: + kfree(iter->trace); + kfree(iter); + mutex_unlock(&trace_types_lock); + return ret; +} + +static int tracing_release_pipe(struct inode *inode, struct file *file) +{ + struct trace_iterator *iter = file->private_data; + + mutex_lock(&trace_types_lock); + + if (iter->trace->pipe_close) + iter->trace->pipe_close(iter); + + mutex_unlock(&trace_types_lock); + + free_cpumask_var(iter->started); + mutex_destroy(&iter->mutex); + kfree(iter->trace); + kfree(iter); + + return 0; +} + +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + if (trace_flags & TRACE_ITER_BLOCK) { + /* + * Always select as readable when in blocking mode + */ + return POLLIN | POLLRDNORM; + } else { + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + poll_wait(filp, &trace_wait, poll_table); + if (!trace_empty(iter)) + return POLLIN | POLLRDNORM; + + return 0; + } +} + + +void default_wait_pipe(struct trace_iterator *iter) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); + + if (trace_empty(iter)) + schedule(); + + finish_wait(&trace_wait, &wait); +} + +/* + * This is a make-shift waitqueue. 
+ * A tracer might use this callback on some rare cases: + * + * 1) the current tracer might hold the runqueue lock when it wakes up + * a reader, hence a deadlock (sched, function, and function graph tracers) + * 2) the function tracers, trace all functions, we don't want + * the overhead of calling wake_up and friends + * (and tracing them too) + * + * Anyway, this is really very primitive wakeup. + */ +void poll_wait_pipe(struct trace_iterator *iter) +{ + set_current_state(TASK_INTERRUPTIBLE); + /* sleep for 100 msecs, and try again. */ + schedule_timeout(HZ / 10); +} + +/* Must be called with trace_types_lock mutex held. */ +static int tracing_wait_pipe(struct file *filp) +{ + struct trace_iterator *iter = filp->private_data; + + while (trace_empty(iter)) { + + if ((filp->f_flags & O_NONBLOCK)) { + return -EAGAIN; + } + + mutex_unlock(&iter->mutex); + + iter->trace->wait_pipe(iter); + + mutex_lock(&iter->mutex); + + if (signal_pending(current)) + return -EINTR; + + /* + * We block until we read something and tracing is disabled. + * We still block if tracing is disabled, but we have never + * read anything. This allows a user to cat this file, and + * then enable tracing. But after we have read something, + * we give an EOF when tracing is again disabled. + * + * iter->pos will be 0 if we haven't read anything. + */ + if (!tracer_enabled && iter->pos) + break; + } + + return 1; +} + +/* + * Consumer reader. 
+ */ +static ssize_t +tracing_read_pipe(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_iterator *iter = filp->private_data; + static struct tracer *old_tracer; + ssize_t sret; + + /* return any leftover data */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (sret != -EBUSY) + return sret; + + trace_seq_init(&iter->seq); + + /* copy the tracer to avoid using a global lock all around */ + mutex_lock(&trace_types_lock); + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; + } + mutex_unlock(&trace_types_lock); + + /* + * Avoid more than one consumer on a single file descriptor + * This is just a matter of traces coherency, the ring buffer itself + * is protected. + */ + mutex_lock(&iter->mutex); + if (iter->trace->read) { + sret = iter->trace->read(iter, filp, ubuf, cnt, ppos); + if (sret) + goto out; + } + +waitagain: + sret = tracing_wait_pipe(filp); + if (sret <= 0) + goto out; + + /* stop when tracing is finished */ + if (trace_empty(iter)) { + sret = 0; + goto out; + } + + if (cnt >= PAGE_SIZE) + cnt = PAGE_SIZE - 1; + + /* reset all but tr, trace, and overruns */ + memset(&iter->seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter->pos = -1; + + trace_event_read_lock(); + trace_access_lock(iter->cpu_file); + while (trace_find_next_entry_inc(iter) != NULL) { + enum print_line_t ret; + int len = iter->seq.len; + + ret = print_trace_line(iter); + if (ret == TRACE_TYPE_PARTIAL_LINE) { + /* don't print partial lines */ + iter->seq.len = len; + break; + } + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); + + if (iter->seq.len >= cnt) + break; + + /* + * Setting the full flag means we reached the trace_seq buffer + * size and we should leave by partial output condition above. + * One of the trace_seq_* functions is not used properly. 
+ */ + WARN_ONCE(iter->seq.full, "full flag set for trace type %d", + iter->ent->type); + } + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); + + /* Now copy what we have to the user */ + sret = trace_seq_to_user(&iter->seq, ubuf, cnt); + if (iter->seq.readpos >= iter->seq.len) + trace_seq_init(&iter->seq); + + /* + * If there was nothing to send to user, in spite of consuming trace + * entries, go back to wait for more entries. + */ + if (sret == -EBUSY) + goto waitagain; + +out: + mutex_unlock(&iter->mutex); + + return sret; +} + +static void tracing_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + __free_page(buf->page); +} + +static void tracing_spd_release_pipe(struct splice_pipe_desc *spd, + unsigned int idx) +{ + __free_page(spd->pages[idx]); +} + +static const struct pipe_buf_operations tracing_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = tracing_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static size_t +tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter) +{ + size_t count; + int ret; + + /* Seq buffer is page-sized, exactly what we need. 
*/ + for (;;) { + count = iter->seq.len; + ret = print_trace_line(iter); + count = iter->seq.len - count; + if (rem < count) { + rem = 0; + iter->seq.len -= count; + break; + } + if (ret == TRACE_TYPE_PARTIAL_LINE) { + iter->seq.len -= count; + break; + } + + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(iter); + rem -= count; + if (!trace_find_next_entry_inc(iter)) { + rem = 0; + iter->ent = NULL; + break; + } + } + + return rem; +} + +static ssize_t tracing_splice_read_pipe(struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct trace_iterator *iter = filp->private_data; + struct splice_pipe_desc spd = { + .pages = pages_def, + .partial = partial_def, + .nr_pages = 0, /* This gets updated below. */ + .flags = flags, + .ops = &tracing_pipe_buf_ops, + .spd_release = tracing_spd_release_pipe, + }; + static struct tracer *old_tracer; + ssize_t ret; + size_t rem; + unsigned int i; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + /* copy the tracer to avoid using a global lock all around */ + mutex_lock(&trace_types_lock); + if (unlikely(old_tracer != current_trace && current_trace)) { + old_tracer = current_trace; + *iter->trace = *current_trace; + } + mutex_unlock(&trace_types_lock); + + mutex_lock(&iter->mutex); + + if (iter->trace->splice_read) { + ret = iter->trace->splice_read(iter, filp, + ppos, pipe, len, flags); + if (ret) + goto out_err; + } + + ret = tracing_wait_pipe(filp); + if (ret <= 0) + goto out_err; + + if (!iter->ent && !trace_find_next_entry_inc(iter)) { + ret = -EFAULT; + goto out_err; + } + + trace_event_read_lock(); + trace_access_lock(iter->cpu_file); + + /* Fill as many pages as possible. 
*/ + for (i = 0, rem = len; i < pipe->buffers && rem; i++) { + spd.pages[i] = alloc_page(GFP_KERNEL); + if (!spd.pages[i]) + break; + + rem = tracing_fill_pipe_page(rem, iter); + + /* Copy the data into the page, so we can start over. */ + ret = trace_seq_to_buffer(&iter->seq, + page_address(spd.pages[i]), + iter->seq.len); + if (ret < 0) { + __free_page(spd.pages[i]); + break; + } + spd.partial[i].offset = 0; + spd.partial[i].len = iter->seq.len; + + trace_seq_init(&iter->seq); + } + + trace_access_unlock(iter->cpu_file); + trace_event_read_unlock(); + mutex_unlock(&iter->mutex); + + spd.nr_pages = i; + + ret = splice_to_pipe(pipe, &spd); +out: + splice_shrink_spd(pipe, &spd); + return ret; + +out_err: + mutex_unlock(&iter->mutex); + goto out; +} + +static ssize_t +tracing_entries_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_array *tr = filp->private_data; + char buf[96]; + int r; + + mutex_lock(&trace_types_lock); + if (!ring_buffer_expanded) + r = sprintf(buf, "%lu (expanded: %lu)\n", + tr->entries >> 10, + trace_buf_size >> 10); + else + r = sprintf(buf, "%lu\n", tr->entries >> 10); + mutex_unlock(&trace_types_lock); + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static ssize_t +tracing_entries_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int ret, cpu; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + /* must have at least 1 entry */ + if (!val) + return -EINVAL; + + mutex_lock(&trace_types_lock); + + tracing_stop(); + + /* disable all cpu buffers */ + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_inc(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_inc(&max_tr.data[cpu]->disabled); + } + + /* value is in KB */ + val <<= 10; + + if (val != 
global_trace.entries) { + ret = tracing_resize_ring_buffer(val); + if (ret < 0) { + cnt = ret; + goto out; + } + } + + *ppos += cnt; + + /* If check pages failed, return ENOMEM */ + if (tracing_disabled) + cnt = -ENOMEM; + out: + for_each_tracing_cpu(cpu) { + if (global_trace.data[cpu]) + atomic_dec(&global_trace.data[cpu]->disabled); + if (max_tr.data[cpu]) + atomic_dec(&max_tr.data[cpu]->disabled); + } + + tracing_start(); + mutex_unlock(&trace_types_lock); + + return cnt; +} + +static int mark_printk(const char *fmt, ...) +{ + int ret; + va_list args; + va_start(args, fmt); + ret = trace_vprintk(0, fmt, args); + va_end(args); + return ret; +} + +static ssize_t +tracing_mark_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char *buf; + size_t written; + + if (tracing_disabled) + return -EINVAL; + + if (cnt > TRACE_BUF_SIZE) + cnt = TRACE_BUF_SIZE; + + buf = kmalloc(cnt + 2, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + kfree(buf); + return -EFAULT; + } + if (buf[cnt-1] != '\n') { + buf[cnt] = '\n'; + buf[cnt+1] = '\0'; + } else + buf[cnt] = '\0'; + + written = mark_printk("%s", buf); + kfree(buf); + *fpos += written; + + /* don't tell userspace we wrote more - it might confuse them */ + if (written > cnt) + written = cnt; + + return written; +} + +static int tracing_clock_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) + seq_printf(m, + "%s%s%s%s", i ? " " : "", + i == trace_clock_id ? "[" : "", trace_clocks[i].name, + i == trace_clock_id ? 
"]" : ""); + seq_putc(m, '\n'); + + return 0; +} + +static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *fpos) +{ + char buf[64]; + const char *clockstr; + int i; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + clockstr = strstrip(buf); + + for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) { + if (strcmp(trace_clocks[i].name, clockstr) == 0) + break; + } + if (i == ARRAY_SIZE(trace_clocks)) + return -EINVAL; + + trace_clock_id = i; + + mutex_lock(&trace_types_lock); + + ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); + if (max_tr.buffer) + ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + + mutex_unlock(&trace_types_lock); + + *fpos += cnt; + + return cnt; +} + +static int tracing_clock_open(struct inode *inode, struct file *file) +{ + if (tracing_disabled) + return -ENODEV; + return single_open(file, tracing_clock_show, NULL); +} + +static const struct file_operations tracing_max_lat_fops = { + .open = tracing_open_generic, + .read = tracing_max_lat_read, + .write = tracing_max_lat_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations tracing_ctrl_fops = { + .open = tracing_open_generic, + .read = tracing_ctrl_read, + .write = tracing_ctrl_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations set_tracer_fops = { + .open = tracing_open_generic, + .read = tracing_set_trace_read, + .write = tracing_set_trace_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations tracing_pipe_fops = { + .open = tracing_open_pipe, + .poll = tracing_poll_pipe, + .read = tracing_read_pipe, + .splice_read = tracing_splice_read_pipe, + .release = tracing_release_pipe, + .llseek = no_llseek, +}; + +static const struct file_operations tracing_entries_fops = { + .open = tracing_open_generic, + .read = tracing_entries_read, + .write = tracing_entries_write, 
+ .llseek = generic_file_llseek, +}; + +static const struct file_operations tracing_mark_fops = { + .open = tracing_open_generic, + .write = tracing_mark_write, + .llseek = generic_file_llseek, +}; + +static const struct file_operations trace_clock_fops = { + .open = tracing_clock_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = tracing_clock_write, +}; + +struct ftrace_buffer_info { + struct trace_array *tr; + void *spare; + int cpu; + unsigned int read; +}; + +static int tracing_buffers_open(struct inode *inode, struct file *filp) +{ + int cpu = (int)(long)inode->i_private; + struct ftrace_buffer_info *info; + + if (tracing_disabled) + return -ENODEV; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return -ENOMEM; + + info->tr = &global_trace; + info->cpu = cpu; + info->spare = NULL; + /* Force reading ring buffer for first read */ + info->read = (unsigned int)-1; + + filp->private_data = info; + + return nonseekable_open(inode, filp); +} + +static ssize_t +tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + struct ftrace_buffer_info *info = filp->private_data; + ssize_t ret; + size_t size; + + if (!count) + return 0; + + if (!info->spare) + info->spare = ring_buffer_alloc_read_page(info->tr->buffer); + if (!info->spare) + return -ENOMEM; + + /* Do we have previous read data to read? 
*/ + if (info->read < PAGE_SIZE) + goto read; + + trace_access_lock(info->cpu); + ret = ring_buffer_read_page(info->tr->buffer, + &info->spare, + count, + info->cpu, 0); + trace_access_unlock(info->cpu); + if (ret < 0) + return 0; + + info->read = 0; + +read: + size = PAGE_SIZE - info->read; + if (size > count) + size = count; + + ret = copy_to_user(ubuf, info->spare + info->read, size); + if (ret == size) + return -EFAULT; + size -= ret; + + *ppos += size; + info->read += size; + + return size; +} + +static int tracing_buffers_release(struct inode *inode, struct file *file) +{ + struct ftrace_buffer_info *info = file->private_data; + + if (info->spare) + ring_buffer_free_read_page(info->tr->buffer, info->spare); + kfree(info); + + return 0; +} + +struct buffer_ref { + struct ring_buffer *buffer; + void *page; + int ref; +}; + +static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + buf->private = 0; +} + +static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + struct buffer_ref *ref = (struct buffer_ref *)buf->private; + + ref->ref++; +} + +/* Pipe buffer operations for a buffer. */ +static const struct pipe_buf_operations buffer_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = buffer_pipe_buf_release, + .steal = buffer_pipe_buf_steal, + .get = buffer_pipe_buf_get, +}; + +/* + * Callback from splice_to_pipe(), if we need to release some pages + * at the end of the spd in case we error'ed out in filling the pipe. 
+ */ +static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) +{ + struct buffer_ref *ref = + (struct buffer_ref *)spd->partial[i].private; + + if (--ref->ref) + return; + + ring_buffer_free_read_page(ref->buffer, ref->page); + kfree(ref); + spd->partial[i].private = 0; +} + +static ssize_t +tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct ftrace_buffer_info *info = file->private_data; + struct partial_page partial_def[PIPE_DEF_BUFFERS]; + struct page *pages_def[PIPE_DEF_BUFFERS]; + struct splice_pipe_desc spd = { + .pages = pages_def, + .partial = partial_def, + .flags = flags, + .ops = &buffer_pipe_buf_ops, + .spd_release = buffer_spd_release, + }; + struct buffer_ref *ref; + int entries, size, i; + size_t ret; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + if (*ppos & (PAGE_SIZE - 1)) { + WARN_ONCE(1, "Ftrace: previous read must page-align\n"); + ret = -EINVAL; + goto out; + } + + if (len & (PAGE_SIZE - 1)) { + WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); + if (len < PAGE_SIZE) { + ret = -EINVAL; + goto out; + } + len &= PAGE_MASK; + } + + trace_access_lock(info->cpu); + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + + for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { + struct page *page; + int r; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + break; + + ref->ref = 1; + ref->buffer = info->tr->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer); + if (!ref->page) { + kfree(ref); + break; + } + + r = ring_buffer_read_page(ref->buffer, &ref->page, + len, info->cpu, 1); + if (r < 0) { + ring_buffer_free_read_page(ref->buffer, + ref->page); + kfree(ref); + break; + } + + /* + * zero out any left over data, this is going to + * user land. 
+ */ + size = ring_buffer_page_len(ref->page); + if (size < PAGE_SIZE) + memset(ref->page + size, 0, PAGE_SIZE - size); + + page = virt_to_page(ref->page); + + spd.pages[i] = page; + spd.partial[i].len = PAGE_SIZE; + spd.partial[i].offset = 0; + spd.partial[i].private = (unsigned long)ref; + spd.nr_pages++; + *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + } + + trace_access_unlock(info->cpu); + spd.nr_pages = i; + + /* did we read anything? */ + if (!spd.nr_pages) { + if (flags & SPLICE_F_NONBLOCK) + ret = -EAGAIN; + else + ret = 0; + /* TODO: block */ + goto out; + } + + ret = splice_to_pipe(pipe, &spd); + splice_shrink_spd(pipe, &spd); +out: + return ret; +} + +static const struct file_operations tracing_buffers_fops = { + .open = tracing_buffers_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, +}; + +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long cpu = (unsigned long)filp->private_data; + struct trace_array *tr = &global_trace; + struct trace_seq *s; + unsigned long cnt; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + kfree(s); + + return count; +} + +static const struct file_operations tracing_stats_fops = { + .open = tracing_open_generic, + .read = tracing_stats_read, + .llseek = generic_file_llseek, +}; + +#ifdef CONFIG_DYNAMIC_FTRACE + +int __weak ftrace_arch_read_dyn_info(char *buf, int size) +{ + return 0; +} + +static 
ssize_t
+tracing_read_dyn_info(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	static char ftrace_dyn_info_buffer[1024];
+	static DEFINE_MUTEX(dyn_info_mutex);
+	unsigned long *p = filp->private_data;
+	char *buf = ftrace_dyn_info_buffer;
+	int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
+	int r;
+
+	/* The mutex serializes use of the single static buffer. */
+	mutex_lock(&dyn_info_mutex);
+	r = sprintf(buf, "%ld ", *p);
+
+	/* Leave one byte free for the trailing newline. */
+	r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+	buf[r++] = '\n';
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+
+	mutex_unlock(&dyn_info_mutex);
+
+	return r;
+}
+
+static const struct file_operations tracing_dyn_info_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_read_dyn_info,
+	.llseek		= generic_file_llseek,
+};
+#endif
+
+static struct dentry *d_tracer;
+
+/*
+ * Return the dentry of the debugfs "tracing" directory, creating it on the
+ * first successful call.  Returns NULL when debugfs is not initialized or
+ * the directory cannot be created; the failure is only logged once.
+ */
+struct dentry *tracing_init_dentry(void)
+{
+	static int once;
+
+	if (d_tracer)
+		return d_tracer;
+
+	if (!debugfs_initialized())
+		return NULL;
+
+	d_tracer = debugfs_create_dir("tracing", NULL);
+
+	if (!d_tracer && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'tracing'\n");
+		return NULL;
+	}
+
+	return d_tracer;
+}
+
+static struct dentry *d_percpu;
+
+/*
+ * Return the dentry of the "per_cpu" directory below the tracing directory,
+ * creating it on the first successful call.  Returns NULL on failure; the
+ * failure is only logged once.
+ */
+struct dentry *tracing_dentry_percpu(void)
+{
+	static int once;
+	struct dentry *d_tracer;
+
+	if (d_percpu)
+		return d_percpu;
+
+	d_tracer = tracing_init_dentry();
+
+	if (!d_tracer)
+		return NULL;
+
+	d_percpu = debugfs_create_dir("per_cpu", d_tracer);
+
+	if (!d_percpu && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'per_cpu'\n");
+		return NULL;
+	}
+
+	return d_percpu;
+}
+
+/*
+ * Create per_cpu/cpu<N>/ and its trace_pipe, trace, trace_pipe_raw and
+ * stats files.  @cpu is stored as the private data of every file.
+ */
+static void tracing_init_debugfs_percpu(long cpu)
+{
+	struct dentry *d_percpu = tracing_dentry_percpu();
+	struct dentry *d_cpu;
+	char cpu_dir[30]; /* 30 characters should be more than enough */
+
+	/*
+	 * Without the per_cpu directory there is nothing to attach to; a
+	 * NULL parent would make debugfs use its root directory instead.
+	 */
+	if (!d_percpu)
+		return;
+
+	snprintf(cpu_dir, sizeof(cpu_dir), "cpu%ld", cpu);
+	d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
+	if (!d_cpu) {
+		pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
+		return;
+	}
+
+	/* per cpu trace_pipe */
+	trace_create_file("trace_pipe", 0444, d_cpu,
+			(void *) cpu, &tracing_pipe_fops);
+
+	/* per cpu trace */
+	trace_create_file("trace", 0644, d_cpu,
+			(void *) cpu, &tracing_fops);
+
+	trace_create_file("trace_pipe_raw", 0444, d_cpu,
+			(void *) cpu, &tracing_buffers_fops);
+
+	trace_create_file("stats", 0444, d_cpu,
+			(void *) cpu, &tracing_stats_fops);
+}
+
+#ifdef CONFIG_FTRACE_SELFTEST
+/* Let selftest have access to static functions in this file */
+#include "trace_selftest.c"
+#endif
+
+/* Ties one tracer option and its flags word to the debugfs file exposing it. */
+struct trace_option_dentry {
+	struct tracer_opt		*opt;
+	struct tracer_flags		*flags;
+	struct dentry			*entry;
+};
+
+/* Read "1\n" or "0\n" depending on whether the option bit is set. */
+static ssize_t
+trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
+			loff_t *ppos)
+{
+	struct trace_option_dentry *topt = filp->private_data;
+	char *buf;
+
+	if (topt->flags->val & topt->opt->bit)
+		buf = "1\n";
+	else
+		buf = "0\n";
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+/*
+ * Accept a decimal 0 or 1 and, when it differs from the current state of
+ * the option bit, update the option under trace_types_lock.  Returns @cnt
+ * on success, -EINVAL for oversized or out-of-range input, -EFAULT when
+ * the user buffer cannot be copied, or the error from the update.
+ */
+static ssize_t
+trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
+			 loff_t *ppos)
+{
+	struct trace_option_dentry *topt = filp->private_data;
+	unsigned long val;
+	char buf[64];
+	int ret;
+
+	/* Keep room for the terminating NUL written below. */
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	if (val != 0 && val != 1)
+		return -EINVAL;
+
+	if (!!(topt->flags->val & topt->opt->bit) != val) {
+		mutex_lock(&trace_types_lock);
+		/*
+		 * NOTE(review): the last argument is !val, presumably a
+		 * "negate" flag - confirm against __set_tracer_option().
+		 */
+		ret = __set_tracer_option(current_trace, topt->flags,
+					  topt->opt, !val);
+		mutex_unlock(&trace_types_lock);
+		if (ret)
+			return ret;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+
+static const struct file_operations trace_options_fops = {
+	.open = tracing_open_generic,
+	.read = trace_options_read,
+	.write = trace_options_write,
+	.llseek	= generic_file_llseek,
+};
+
+static ssize_t
+trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
+			loff_t *ppos)
+{
+	long index = 
(long)filp->private_data; + char *buf; + + if (trace_flags & (1 << index)) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + long index = (long)filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + set_tracer_flags(1 << index, val); + + *ppos += cnt; + + return cnt; +} + +static const struct file_operations trace_options_core_fops = { + .open = tracing_open_generic, + .read = trace_options_core_read, + .write = trace_options_core_write, + .llseek = generic_file_llseek, +}; + +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = debugfs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warning("Could not create debugfs '%s' entry\n", name); + + return ret; +} + + +static struct dentry *trace_options_init_dentry(void) +{ + struct dentry *d_tracer; + static struct dentry *t_options; + + if (t_options) + return t_options; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return NULL; + + t_options = debugfs_create_dir("options", d_tracer); + if (!t_options) { + pr_warning("Could not create debugfs directory 'options'\n"); + return NULL; + } + + return t_options; +} + +static void +create_trace_option_file(struct trace_option_dentry *topt, + struct tracer_flags *flags, + struct tracer_opt *opt) +{ + struct dentry *t_options; + + t_options = trace_options_init_dentry(); + if (!t_options) + return; + + topt->flags = flags; + topt->opt = opt; + + topt->entry = trace_create_file(opt->name, 0644, t_options, topt, 
+				    &trace_options_fops);
+
+}
+
+/*
+ * Allocate one trace_option_dentry per option of @tracer, plus a zeroed
+ * terminator entry, and create the matching files in the options directory.
+ * Returns NULL when @tracer has no options or the allocation fails.
+ */
+static struct trace_option_dentry *
+create_trace_option_files(struct tracer *tracer)
+{
+	struct trace_option_dentry *topts;
+	struct tracer_flags *flags;
+	struct tracer_opt *opts;
+	int cnt;
+
+	if (!tracer)
+		return NULL;
+
+	flags = tracer->flags;
+
+	if (!flags || !flags->opts)
+		return NULL;
+
+	opts = flags->opts;
+
+	/* Count the options; the table ends at the first NULL name. */
+	for (cnt = 0; opts[cnt].name; cnt++)
+		;
+
+	/* One extra zeroed slot terminates the array for the destroy loop. */
+	topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
+	if (!topts)
+		return NULL;
+
+	for (cnt = 0; opts[cnt].name; cnt++)
+		create_trace_option_file(&topts[cnt], flags,
+					 &opts[cnt]);
+
+	return topts;
+}
+
+/*
+ * Remove the debugfs files of an array returned by
+ * create_trace_option_files() and free the array.  NULL is accepted.
+ */
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts)
+{
+	int cnt;
+
+	if (!topts)
+		return;
+
+	for (cnt = 0; topts[cnt].opt; cnt++) {
+		if (topts[cnt].entry)
+			debugfs_remove(topts[cnt].entry);
+	}
+
+	kfree(topts);
+}
+
+/*
+ * Create the options file for one core trace flag; @index is stored as
+ * the private data of the file.
+ */
+static struct dentry *
+create_trace_option_core_file(const char *option, long index)
+{
+	struct dentry *t_options;
+
+	t_options = trace_options_init_dentry();
+	if (!t_options)
+		return NULL;
+
+	return trace_create_file(option, 0644, t_options, (void *)index,
+				    &trace_options_core_fops);
+}
+
+/* Create one options file per entry of the NULL-terminated trace_options. */
+static __init void create_trace_options_dir(void)
+{
+	struct dentry *t_options;
+	int i;
+
+	t_options = trace_options_init_dentry();
+	if (!t_options)
+		return;
+
+	for (i = 0; trace_options[i]; i++)
+		create_trace_option_core_file(trace_options[i], i);
+}
+
+/*
+ * fs_initcall: populate the debugfs tracing directory with the control
+ * files, the options directory and the per-cpu directories.
+ */
+static __init int tracer_init_debugfs(void)
+{
+	struct dentry *d_tracer;
+	int cpu;
+
+	trace_access_lock_init();
+
+	d_tracer = tracing_init_dentry();
+	/*
+	 * No tracing directory: do not hand a NULL parent to the calls
+	 * below, which would target the debugfs root instead.
+	 */
+	if (!d_tracer)
+		return 0;
+
+	trace_create_file("tracing_enabled", 0644, d_tracer,
+			&global_trace, &tracing_ctrl_fops);
+
+	trace_create_file("trace_options", 0644, d_tracer,
+			NULL, &tracing_iter_fops);
+
+	trace_create_file("tracing_cpumask", 0644, d_tracer,
+			NULL, &tracing_cpumask_fops);
+
+	trace_create_file("trace", 0644, d_tracer,
+			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+
+	trace_create_file("available_tracers", 0444, d_tracer,
+			
&global_trace, &show_traces_fops); + + trace_create_file("current_tracer", 0644, d_tracer, + &global_trace, &set_tracer_fops); + +#ifdef CONFIG_TRACER_MAX_TRACE + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, &tracing_max_lat_fops); +#endif + + trace_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); + + trace_create_file("README", 0444, d_tracer, + NULL, &tracing_readme_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + &global_trace, &tracing_entries_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); + + trace_create_file("saved_cmdlines", 0444, d_tracer, + NULL, &tracing_saved_cmdlines_fops); + + trace_create_file("trace_clock", 0644, d_tracer, NULL, + &trace_clock_fops); + +#ifdef CONFIG_DYNAMIC_FTRACE + trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, &tracing_dyn_info_fops); +#endif + + create_trace_options_dir(); + + for_each_tracing_cpu(cpu) + tracing_init_debugfs_percpu(cpu); + + return 0; +} + +static int trace_panic_handler(struct notifier_block *this, + unsigned long event, void *unused) +{ + if (ftrace_dump_on_oops) + ftrace_dump(ftrace_dump_on_oops); + return NOTIFY_OK; +} + +static struct notifier_block trace_panic_notifier = { + .notifier_call = trace_panic_handler, + .next = NULL, + .priority = 150 /* priority: INT_MAX >= x >= 0 */ +}; + +static int trace_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + switch (val) { + case DIE_OOPS: + if (ftrace_dump_on_oops) + ftrace_dump(ftrace_dump_on_oops); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block trace_die_notifier = { + .notifier_call = trace_die_handler, + .priority = 200 +}; + +/* + * printk is set to max of 1024, we really don't need it that big. 
+ * Nothing should be printing 1000 characters anyway. + */ +#define TRACE_MAX_PRINT 1000 + +/* + * Define here KERN_TRACE so that we have one place to modify + * it if we decide to change what log level the ftrace dump + * should be at. + */ +#define KERN_TRACE KERN_EMERG + +void +trace_printk_seq(struct trace_seq *s) +{ + /* Probably should print a warning here. */ + if (s->len >= 1000) + s->len = 1000; + + /* should be zero ended, but we are paranoid. */ + s->buffer[s->len] = 0; + + printk(KERN_TRACE "%s", s->buffer); + + trace_seq_init(s); +} + +void trace_init_global_iter(struct trace_iterator *iter) +{ + iter->tr = &global_trace; + iter->trace = current_trace; + iter->cpu_file = TRACE_PIPE_ALL_CPU; +} + +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +{ + static arch_spinlock_t ftrace_dump_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + unsigned int old_userobj; + static int dump_ran; + unsigned long flags; + int cnt = 0, cpu; + + /* only one dump */ + local_irq_save(flags); + arch_spin_lock(&ftrace_dump_lock); + if (dump_ran) + goto out; + + dump_ran = 1; + + tracing_off(); + + if (disable_tracing) + ftrace_kill(); + + trace_init_global_iter(&iter); + + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + /* Simulate the iterator */ + iter.tr = &global_trace; + iter.trace = current_trace; + + switch (oops_dump_mode) { + case DUMP_ALL: + iter.cpu_file = TRACE_PIPE_ALL_CPU; + break; + case DUMP_ORIG: + iter.cpu_file = raw_smp_processor_id(); + break; + case DUMP_NONE: + goto out_enable; + default: + printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); + iter.cpu_file = TRACE_PIPE_ALL_CPU; + } + + printk(KERN_TRACE "Dumping 
ftrace buffer:\n"); + + /* + * We need to stop all tracing on all CPUS to read the + * the next buffer. This is a bit expensive, but is + * not done often. We fill all what we can read, + * and then release the locks again. + */ + + while (!trace_empty(&iter)) { + + if (!cnt) + printk(KERN_TRACE "---------------------------------\n"); + + cnt++; + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (trace_find_next_entry_inc(&iter) != NULL) { + int ret; + + ret = print_trace_line(&iter); + if (ret != TRACE_TYPE_NO_CONSUME) + trace_consume(&iter); + } + + trace_printk_seq(&iter.seq); + } + + if (!cnt) + printk(KERN_TRACE " (ftrace buffer empty)\n"); + else + printk(KERN_TRACE "---------------------------------\n"); + + out_enable: + /* Re-enable tracing if requested */ + if (!disable_tracing) { + trace_flags |= old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + tracing_on(); + } + + out: + arch_spin_unlock(&ftrace_dump_lock); + local_irq_restore(flags); +} + +/* By default: disable tracing after the dump */ +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) +{ + __ftrace_dump(true, oops_dump_mode); +} + +__init static int tracer_alloc_buffers(void) +{ + int ring_buf_size; + enum ring_buffer_flags rb_flags; + int i; + int ret = -ENOMEM; + + + if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) + goto out; + + if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + goto out_free_buffer_mask; + + /* To save memory, keep the ring buffer size to its minimum */ + if (ring_buffer_expanded) + ring_buf_size = trace_buf_size; + else + ring_buf_size = 1; + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? 
RB_FL_OVERWRITE : 0; + + cpumask_copy(tracing_buffer_mask, cpu_possible_mask); + cpumask_copy(tracing_cpumask, cpu_all_mask); + + /* TODO: make the number of buffers hot pluggable with CPUS */ + global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); + if (!global_trace.buffer) { + printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); + WARN_ON(1); + goto out_free_cpumask; + } + global_trace.entries = ring_buffer_size(global_trace.buffer); + + +#ifdef CONFIG_TRACER_MAX_TRACE + max_tr.buffer = ring_buffer_alloc(1, rb_flags); + if (!max_tr.buffer) { + printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); + WARN_ON(1); + ring_buffer_free(global_trace.buffer); + goto out_free_cpumask; + } + max_tr.entries = 1; +#endif + + /* Allocate the first page for all buffers */ + for_each_tracing_cpu(i) { + global_trace.data[i] = &per_cpu(global_trace_cpu, i); + max_tr.data[i] = &per_cpu(max_tr_data, i); + } + + trace_init_cmdlines(); + + register_tracer(&nop_trace); + current_trace = &nop_trace; + /* All seems OK, enable tracing */ + tracing_disabled = 0; + + atomic_notifier_chain_register(&panic_notifier_list, + &trace_panic_notifier); + + register_die_notifier(&trace_die_notifier); + + return 0; + +out_free_cpumask: + free_cpumask_var(tracing_cpumask); +out_free_buffer_mask: + free_cpumask_var(tracing_buffer_mask); +out: + return ret; +} + +__init static int clear_boot_tracer(void) +{ + /* + * The default tracer at boot buffer is an init section. + * This function is called in lateinit. If we did not + * find the boot tracer, then clear it out, to prevent + * later registration from accessing the buffer that is + * about to be freed. 
+ */ + if (!default_bootup_tracer) + return 0; + + printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n", + default_bootup_tracer); + default_bootup_tracer = NULL; + + return 0; +} + +early_initcall(tracer_alloc_buffers); +fs_initcall(tracer_init_debugfs); +late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h new file mode 100644 index 00000000..f8074072 --- /dev/null +++ b/kernel/trace/trace.h @@ -0,0 +1,803 @@ +#ifndef _LINUX_KERNEL_TRACE_H +#define _LINUX_KERNEL_TRACE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum trace_type { + __TRACE_FIRST_TYPE = 0, + + TRACE_FN, + TRACE_CTX, + TRACE_WAKE, + TRACE_STACK, + TRACE_PRINT, + TRACE_BPRINT, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, + TRACE_BRANCH, + TRACE_GRAPH_RET, + TRACE_GRAPH_ENT, + TRACE_USER_STACK, + TRACE_BLK, + + __TRACE_LAST_TYPE, +}; + + +#undef __field +#define __field(type, item) type item; + +#undef __field_struct +#define __field_struct(type, item) __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ + struct struct_name { \ + struct trace_entry ent; \ + tstruct \ + } + +#undef TP_ARGS +#define TP_ARGS(args...) 
args + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) + +#include "trace_entries.h" + +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */ +struct syscall_trace_enter { + struct trace_entry ent; + int nr; + unsigned long args[]; +}; + +struct syscall_trace_exit { + struct trace_entry ent; + int nr; + long ret; +}; + +struct kprobe_trace_entry_head { + struct trace_entry ent; + unsigned long ip; +}; + +struct kretprobe_trace_entry_head { + struct trace_entry ent; + unsigned long func; + unsigned long ret_ip; +}; + +/* + * trace_flag_type is an enumeration that holds different + * states when a trace occurs. These are: + * IRQS_OFF - interrupts were disabled + * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags + * NEED_RESCHED - reschedule is requested + * HARDIRQ - inside an interrupt handler + * SOFTIRQ - inside a softirq handler + */ +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_IRQS_NOSUPPORT = 0x02, + TRACE_FLAG_NEED_RESCHED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, +}; + +#define TRACE_BUF_SIZE 1024 + +/* + * The CPU trace array - it consists of thousands of trace entries + * plus some other descriptor data: (for example which task started + * the trace, etc.) + */ +struct trace_array_cpu { + atomic_t disabled; + void *buffer_page; /* ring buffer spare */ + + unsigned long saved_latency; + unsigned long critical_start; + unsigned long critical_end; + unsigned long critical_sequence; + unsigned long nice; + unsigned long policy; + unsigned long rt_priority; + unsigned long skipped_entries; + cycle_t preempt_timestamp; + pid_t pid; + uid_t uid; + char comm[TASK_COMM_LEN]; +}; + +/* + * The trace array - an array of per-CPU trace arrays. This is the + * highest level data structure that individual tracers deal with. 
+ * They have on/off state as well: + */ +struct trace_array { + struct ring_buffer *buffer; + unsigned long entries; + int cpu; + cycle_t time_start; + struct task_struct *waiter; + struct trace_array_cpu *data[NR_CPUS]; +}; + +#define FTRACE_CMP_TYPE(var, type) \ + __builtin_types_compatible_p(typeof(var), type *) + +#undef IF_ASSIGN +#define IF_ASSIGN(var, entry, etype, id) \ + if (FTRACE_CMP_TYPE(var, etype)) { \ + var = (typeof(var))(entry); \ + WARN_ON(id && (entry)->type != id); \ + break; \ + } + +/* Will cause compile errors if type is not found. */ +extern void __ftrace_bad_type(void); + +/* + * The trace_assign_type is a verifier that the entry type is + * the same as the type being assigned. To add new types simply + * add a line with the following format: + * + * IF_ASSIGN(var, ent, type, id); + * + * Where "type" is the trace type that includes the trace_entry + * as the "ent" item. And "id" is the trace identifier that is + * used in the trace_type enum. + * + * If the type can have more than one id, then use zero. + */ +#define trace_assign_type(var, ent) \ + do { \ + IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ + IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ + IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ + IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ + IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ + IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ + TRACE_MMIO_RW); \ + IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ + TRACE_MMIO_MAP); \ + IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ + TRACE_GRAPH_ENT); \ + IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ + TRACE_GRAPH_RET); \ + __ftrace_bad_type(); \ + } while (0) + +/* + * An option specific to a tracer. This is a boolean value. 
+ * The bit is the bit index that sets its value on the + * flags value in struct tracer_flags. + */ +struct tracer_opt { + const char *name; /* Will appear on the trace_options file */ + u32 bit; /* Mask assigned in val field in tracer_flags */ +}; + +/* + * The set of specific options for a tracer. Your tracer + * have to set the initial value of the flags val. + */ +struct tracer_flags { + u32 val; + struct tracer_opt *opts; +}; + +/* Makes more easy to define a tracer opt */ +#define TRACER_OPT(s, b) .name = #s, .bit = b + + +/** + * struct tracer - a specific tracer and its callbacks to interact with debugfs + * @name: the name chosen to select it on the available_tracers file + * @init: called when one switches to this tracer (echo name > current_tracer) + * @reset: called when one switches to another tracer + * @start: called when tracing is unpaused (echo 1 > tracing_enabled) + * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @open: called when the trace file is opened + * @pipe_open: called when the trace_pipe file is opened + * @wait_pipe: override how the user waits for traces on trace_pipe + * @close: called when the trace file is released + * @pipe_close: called when the trace_pipe file is released + * @read: override the default read callback on trace_pipe + * @splice_read: override the default splice_read callback on trace_pipe + * @selftest: selftest to run on boot (see trace_selftest.c) + * @print_headers: override the first lines that describe your columns + * @print_line: callback that prints a trace + * @set_flag: signals one of your private flags changed (trace_options file) + * @flags: your private flags + */ +struct tracer { + const char *name; + int (*init)(struct trace_array *tr); + void (*reset)(struct trace_array *tr); + void (*start)(struct trace_array *tr); + void (*stop)(struct trace_array *tr); + void (*open)(struct trace_iterator *iter); + void (*pipe_open)(struct trace_iterator *iter); + void (*wait_pipe)(struct 
trace_iterator *iter); + void (*close)(struct trace_iterator *iter); + void (*pipe_close)(struct trace_iterator *iter); + ssize_t (*read)(struct trace_iterator *iter, + struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos); + ssize_t (*splice_read)(struct trace_iterator *iter, + struct file *filp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags); +#ifdef CONFIG_FTRACE_STARTUP_TEST + int (*selftest)(struct tracer *trace, + struct trace_array *tr); +#endif + void (*print_header)(struct seq_file *m); + enum print_line_t (*print_line)(struct trace_iterator *iter); + /* If you handled the flag setting, return 0 */ + int (*set_flag)(u32 old_flags, u32 bit, int set); + struct tracer *next; + struct tracer_flags *flags; + int print_max; + int use_max_tr; +}; + + +#define TRACE_PIPE_ALL_CPU -1 + +int tracer_init(struct tracer *t, struct trace_array *tr); +int tracing_is_enabled(void); +void trace_wake_up(void); +void tracing_reset(struct trace_array *tr, int cpu); +void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset_current(int cpu); +void tracing_reset_current_online_cpus(void); +int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops); + +struct dentry *tracing_init_dentry(void); + +struct ring_buffer_event; + +struct ring_buffer_event * +trace_buffer_lock_reserve(struct ring_buffer *buffer, + int type, + unsigned long len, + unsigned long flags, + int pc); +void trace_buffer_unlock_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event, + unsigned long flags, int pc); + +struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, + struct trace_array_cpu *data); + +struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, + int *ent_cpu, u64 *ent_ts); + +int trace_empty(struct trace_iterator *iter); + +void 
*trace_find_next_entry_inc(struct trace_iterator *iter); + +void trace_init_global_iter(struct trace_iterator *iter); + +void tracing_iter_reset(struct trace_iterator *iter, int cpu); + +void default_wait_pipe(struct trace_iterator *iter); +void poll_wait_pipe(struct trace_iterator *iter); + +void ftrace(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); +void tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc); + +void tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *cur, + unsigned long flags, int pc); +void trace_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); +void trace_default_header(struct seq_file *m); +void print_trace_header(struct seq_file *m, struct trace_iterator *iter); +int trace_empty(struct trace_iterator *iter); + +void trace_graph_return(struct ftrace_graph_ret *trace); +int trace_graph_entry(struct ftrace_graph_ent *trace); +void set_graph_array(struct trace_array *tr); + +void tracing_start_cmdline_record(void); +void tracing_stop_cmdline_record(void); +void tracing_sched_switch_assign_trace(struct trace_array *tr); +void tracing_stop_sched_switch_record(void); +void tracing_start_sched_switch_record(void); +int register_tracer(struct tracer *type); +void unregister_tracer(struct tracer *type); +int is_tracing_stopped(void); +enum trace_file_type { + TRACE_FILE_LAT_FMT = 1, + TRACE_FILE_ANNOTATE = 2, +}; + +extern cpumask_var_t __read_mostly tracing_buffer_mask; + +#define for_each_tracing_cpu(cpu) \ + for_each_cpu(cpu, tracing_buffer_mask) + +extern unsigned long nsecs_to_usecs(unsigned long nsecs); + +extern unsigned long 
tracing_thresh; + +#ifdef CONFIG_TRACER_MAX_TRACE +extern unsigned long tracing_max_latency; + +void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); +void update_max_tr_single(struct trace_array *tr, + struct task_struct *tsk, int cpu); +#endif /* CONFIG_TRACER_MAX_TRACE */ + +#ifdef CONFIG_STACKTRACE +void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, + int skip, int pc); + +void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, + int pc); + +void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, + int pc); +#else +static inline void ftrace_trace_stack(struct ring_buffer *buffer, + unsigned long flags, int skip, int pc) +{ +} + +static inline void ftrace_trace_userstack(struct ring_buffer *buffer, + unsigned long flags, int pc) +{ +} + +static inline void __trace_stack(struct trace_array *tr, unsigned long flags, + int skip, int pc) +{ +} +#endif /* CONFIG_STACKTRACE */ + +extern cycle_t ftrace_now(int cpu); + +extern void trace_find_cmdline(int pid, char comm[]); + +#ifdef CONFIG_DYNAMIC_FTRACE +extern unsigned long ftrace_update_tot_cnt; +#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func +extern int DYN_FTRACE_TEST_NAME(void); +#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 +extern int DYN_FTRACE_TEST_NAME2(void); +#endif + +extern int ring_buffer_expanded; +extern bool tracing_selftest_disabled; +DECLARE_PER_CPU(int, ftrace_cpu_disabled); + +#ifdef CONFIG_FTRACE_STARTUP_TEST +extern int trace_selftest_startup_function(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_irqsoff(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_preemptoff(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, + struct trace_array *tr); 
+extern int trace_selftest_startup_wakeup(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_nop(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_sched_switch(struct tracer *trace, + struct trace_array *tr); +extern int trace_selftest_startup_branch(struct tracer *trace, + struct trace_array *tr); +#endif /* CONFIG_FTRACE_STARTUP_TEST */ + +extern void *head_page(struct trace_array_cpu *data); +extern unsigned long long ns2usecs(cycle_t nsec); +extern int +trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_vprintk(unsigned long ip, const char *fmt, va_list args); +extern int +trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...); +void trace_printk_seq(struct trace_seq *s); +enum print_line_t print_trace_line(struct trace_iterator *iter); + +extern unsigned long trace_flags; + +extern int trace_clock_id; + +/* Standard output formatting function used for function return traces */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 + +extern enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags); +extern void print_graph_headers_flags(struct seq_file *s, u32 flags); +extern enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); +extern void graph_trace_open(struct trace_iterator *iter); +extern void graph_trace_close(struct trace_iterator *iter); +extern int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, int pc); +extern void __trace_graph_return(struct trace_array *tr, + struct 
ftrace_graph_ret *trace, + unsigned long flags, int pc); + + +#ifdef CONFIG_DYNAMIC_FTRACE +/* TODO: make this variable */ +#define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_filter_enabled; +extern int ftrace_graph_count; +extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; + +static inline int ftrace_graph_addr(unsigned long addr) +{ + int i; + + if (!ftrace_graph_filter_enabled) + return 1; + + for (i = 0; i < ftrace_graph_count; i++) { + if (addr == ftrace_graph_funcs[i]) + return 1; + } + + return 0; +} +#else +static inline int ftrace_graph_addr(unsigned long addr) +{ + return 1; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ +#else /* CONFIG_FUNCTION_GRAPH_TRACER */ +static inline enum print_line_t +print_graph_function_flags(struct trace_iterator *iter, u32 flags) +{ + return TRACE_TYPE_UNHANDLED; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +extern struct list_head ftrace_pids; + +#ifdef CONFIG_FUNCTION_TRACER +static inline int ftrace_trace_task(struct task_struct *task) +{ + if (list_empty(&ftrace_pids)) + return 1; + + return test_tsk_trace_trace(task); +} +#else +static inline int ftrace_trace_task(struct task_struct *task) +{ + return 1; +} +#endif + +/* + * struct trace_parser - servers for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input length + * @size: buffer size + */ +struct trace_parser { + bool cont; + char *buffer; + unsigned idx; + unsigned size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ + return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ + return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ + parser->cont = false; + parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser 
*parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, + size_t cnt, loff_t *ppos); + +/* + * trace_iterator_flags is an enumeration that defines bit + * positions into trace_flags that controls the output. + * + * NOTE: These bits must match the trace_options array in + * trace.c. + */ +enum trace_iterator_flags { + TRACE_ITER_PRINT_PARENT = 0x01, + TRACE_ITER_SYM_OFFSET = 0x02, + TRACE_ITER_SYM_ADDR = 0x04, + TRACE_ITER_VERBOSE = 0x08, + TRACE_ITER_RAW = 0x10, + TRACE_ITER_HEX = 0x20, + TRACE_ITER_BIN = 0x40, + TRACE_ITER_BLOCK = 0x80, + TRACE_ITER_STACKTRACE = 0x100, + TRACE_ITER_PRINTK = 0x200, + TRACE_ITER_PREEMPTONLY = 0x400, + TRACE_ITER_BRANCH = 0x800, + TRACE_ITER_ANNOTATE = 0x1000, + TRACE_ITER_USERSTACKTRACE = 0x2000, + TRACE_ITER_SYM_USEROBJ = 0x4000, + TRACE_ITER_PRINTK_MSGONLY = 0x8000, + TRACE_ITER_CONTEXT_INFO = 0x10000, /* Print pid/cpu/time */ + TRACE_ITER_LATENCY_FMT = 0x20000, + TRACE_ITER_SLEEP_TIME = 0x40000, + TRACE_ITER_GRAPH_TIME = 0x80000, + TRACE_ITER_RECORD_CMD = 0x100000, + TRACE_ITER_OVERWRITE = 0x200000, +}; + +/* + * TRACE_ITER_SYM_MASK masks the options in trace_flags that + * control the output of kernel symbols. 
+ */ +#define TRACE_ITER_SYM_MASK \ + (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) + +extern struct tracer nop_trace; + +#ifdef CONFIG_BRANCH_TRACER +extern int enable_branch_tracing(struct trace_array *tr); +extern void disable_branch_tracing(void); +static inline int trace_branch_enable(struct trace_array *tr) +{ + if (trace_flags & TRACE_ITER_BRANCH) + return enable_branch_tracing(tr); + return 0; +} +static inline void trace_branch_disable(void) +{ + /* due to races, always disable */ + disable_branch_tracing(); +} +#else +static inline int trace_branch_enable(struct trace_array *tr) +{ + return 0; +} +static inline void trace_branch_disable(void) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +/* set ring buffers to default size if not already done so */ +int tracing_update_buffers(void); + +/* trace event type bit fields, not numeric */ +enum { + TRACE_EVENT_TYPE_PRINTF = 1, + TRACE_EVENT_TYPE_RAW = 2, +}; + +struct ftrace_event_field { + struct list_head link; + char *name; + char *type; + int filter_type; + int offset; + int size; + int is_signed; +}; + +struct event_filter { + int n_preds; /* Number assigned */ + int a_preds; /* allocated */ + struct filter_pred *preds; + struct filter_pred *root; + char *filter_string; +}; + +struct event_subsystem { + struct list_head list; + const char *name; + struct dentry *entry; + struct event_filter *filter; + int nr_events; + int ref_count; +}; + +#define FILTER_PRED_INVALID ((unsigned short)-1) +#define FILTER_PRED_IS_RIGHT (1 << 15) +#define FILTER_PRED_FOLD (1 << 15) + +/* + * The max preds is the size of unsigned short with + * two flags at the MSBs. One bit is used for both the IS_RIGHT + * and FOLD flags. The other is reserved. + * + * 2^14 preds is way more than enough. 
+ */ +#define MAX_FILTER_PRED 16384 + +struct filter_pred; +struct regex; + +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); + +typedef int (*regex_match_func)(char *str, struct regex *r, int len); + +enum regex_type { + MATCH_FULL = 0, + MATCH_FRONT_ONLY, + MATCH_MIDDLE_ONLY, + MATCH_END_ONLY, +}; + +struct regex { + char pattern[MAX_FILTER_STR_VAL]; + int len; + int field_len; + regex_match_func match; +}; + +struct filter_pred { + filter_pred_fn_t fn; + u64 val; + struct regex regex; + /* + * Leaf nodes use field_name, ops is used by AND and OR + * nodes. The field_name is always freed when freeing a pred. + * We can overload field_name for ops and have it freed + * as well. + */ + union { + char *field_name; + unsigned short *ops; + }; + int offset; + int not; + int op; + unsigned short index; + unsigned short parent; + unsigned short left; + unsigned short right; +}; + +extern struct list_head ftrace_common_fields; + +extern enum regex_type +filter_parse_regex(char *buff, int len, char **search, int *not); +extern void print_event_filter(struct ftrace_event_call *call, + struct trace_seq *s); +extern int apply_event_filter(struct ftrace_event_call *call, + char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s); +extern int filter_assign_type(const char *type); + +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call); + +static inline int +filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && + !filter_match_preds(call->filter, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; +} + +extern void trace_event_enable_cmd_record(bool enable); + +extern struct mutex event_mutex; +extern struct 
list_head ftrace_events; + +extern const char *__start___trace_bprintk_fmt[]; +extern const char *__stop___trace_bprintk_fmt[]; + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ + extern struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_##call; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#include "trace_entries.h" + +/* Only current can touch trace_recursion */ +#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0) +#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0) + +/* Ring buffer has the 10 LSB bits to count */ +#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) + +/* for function tracing recursion */ +#define TRACE_INTERNAL_BIT (1<<11) +#define TRACE_GLOBAL_BIT (1<<12) + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) + +#endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c new file mode 100644 index 00000000..8d3538b4 --- /dev/null +++ b/kernel/trace/trace_branch.c @@ -0,0 +1,411 @@ +/* + * unlikely profiler + * + * Copyright (C) 2008 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_stat.h" +#include "trace_output.h" + +#ifdef CONFIG_BRANCH_TRACER + +static struct tracer branch_trace; +static int branch_tracing_enabled __read_mostly; +static DEFINE_MUTEX(branch_tracing_mutex); + +static struct trace_array *branch_tracer; + +static void +probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + struct ftrace_event_call *call = 
&event_branch; + struct trace_array *tr = branch_tracer; + struct ring_buffer_event *event; + struct trace_branch *entry; + struct ring_buffer *buffer; + unsigned long flags; + int cpu, pc; + const char *p; + + /* + * I would love to save just the ftrace_likely_data pointer, but + * this code can also be used by modules. Ugly things can happen + * if the module is unloaded, and then we go and read the + * pointer. This is slower, but much safer. + */ + + if (unlikely(!tr)) + return; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + goto out; + + pc = preempt_count(); + buffer = tr->buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, + sizeof(*entry), flags, pc); + if (!event) + goto out; + + entry = ring_buffer_event_data(event); + + /* Strip off the path, only save the file */ + p = f->file + strlen(f->file); + while (p >= f->file && *p != '/') + p--; + p++; + + strncpy(entry->func, f->func, TRACE_FUNC_SIZE); + strncpy(entry->file, p, TRACE_FILE_SIZE); + entry->func[TRACE_FUNC_SIZE] = 0; + entry->file[TRACE_FILE_SIZE] = 0; + entry->line = f->line; + entry->correct = val == expect; + + if (!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); + + out: + atomic_dec(&tr->data[cpu]->disabled); + local_irq_restore(flags); +} + +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ + if (!branch_tracing_enabled) + return; + + probe_likely_condition(f, val, expect); +} + +int enable_branch_tracing(struct trace_array *tr) +{ + mutex_lock(&branch_tracing_mutex); + branch_tracer = tr; + /* + * Must be seen before enabling. 
The reader is a condition + * where we do not need a matching rmb() + */ + smp_wmb(); + branch_tracing_enabled++; + mutex_unlock(&branch_tracing_mutex); + + return 0; +} + +void disable_branch_tracing(void) +{ + mutex_lock(&branch_tracing_mutex); + + if (!branch_tracing_enabled) + goto out_unlock; + + branch_tracing_enabled--; + + out_unlock: + mutex_unlock(&branch_tracing_mutex); +} + +static void start_branch_trace(struct trace_array *tr) +{ + enable_branch_tracing(tr); +} + +static void stop_branch_trace(struct trace_array *tr) +{ + disable_branch_tracing(); +} + +static int branch_trace_init(struct trace_array *tr) +{ + start_branch_trace(tr); + return 0; +} + +static void branch_trace_reset(struct trace_array *tr) +{ + stop_branch_trace(tr); +} + +static enum print_line_t trace_branch_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct trace_branch *field; + + trace_assign_type(field, iter->ent); + + if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n", + field->correct ? 
" ok " : " MISS ", + field->func, + field->file, + field->line)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static void branch_print_header(struct seq_file *s) +{ + seq_puts(s, "# TASK-PID CPU# TIMESTAMP CORRECT" + " FUNC:FILE:LINE\n"); + seq_puts(s, "# | | | | | " + " |\n"); +} + +static struct trace_event_functions trace_branch_funcs = { + .trace = trace_branch_print, +}; + +static struct trace_event trace_branch_event = { + .type = TRACE_BRANCH, + .funcs = &trace_branch_funcs, +}; + +static struct tracer branch_trace __read_mostly = +{ + .name = "branch", + .init = branch_trace_init, + .reset = branch_trace_reset, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_branch, +#endif /* CONFIG_FTRACE_SELFTEST */ + .print_header = branch_print_header, +}; + +__init static int init_branch_tracer(void) +{ + int ret; + + ret = register_ftrace_event(&trace_branch_event); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "branch events\n"); + return 1; + } + return register_tracer(&branch_trace); +} +device_initcall(init_branch_tracer); + +#else +static inline +void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect) +{ +} +#endif /* CONFIG_BRANCH_TRACER */ + +void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect) +{ + /* + * I would love to have a trace point here instead, but the + * trace point code is so inundated with unlikely and likely + * conditions that the recursive nightmare that exists is too + * much to try to get working. At least for now. + */ + trace_likely_condition(f, val, expect); + + /* FIXME: Make this atomic! 
*/ + if (val == expect) + f->correct++; + else + f->incorrect++; +} +EXPORT_SYMBOL(ftrace_likely_update); + +extern unsigned long __start_annotated_branch_profile[]; +extern unsigned long __stop_annotated_branch_profile[]; + +static int annotated_branch_stat_headers(struct seq_file *m) +{ + seq_printf(m, " correct incorrect %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} + +static inline long get_incorrect_percent(struct ftrace_branch_data *p) +{ + long percent; + + if (p->correct) { + percent = p->incorrect * 100; + percent /= p->correct + p->incorrect; + } else + percent = p->incorrect ? 100 : -1; + + return percent; +} + +static int branch_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_branch_data *p = v; + const char *f; + long percent; + + /* Only print the file, not the path */ + f = p->file + strlen(p->file); + while (f >= p->file && *f != '/') + f--; + f++; + + /* + * The miss is overlayed on correct, and hit on incorrect. 
+ */ + percent = get_incorrect_percent(p); + + seq_printf(m, "%8lu %8lu ", p->correct, p->incorrect); + if (percent < 0) + seq_printf(m, " X "); + else + seq_printf(m, "%3ld ", percent); + seq_printf(m, "%-30.30s %-20.20s %d\n", p->func, f, p->line); + return 0; +} + +static void *annotated_branch_stat_start(struct tracer_stat *trace) +{ + return __start_annotated_branch_profile; +} + +static void * +annotated_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; + + ++p; + + if ((void *)p >= (void *)__stop_annotated_branch_profile) + return NULL; + + return p; +} + +static int annotated_branch_stat_cmp(void *p1, void *p2) +{ + struct ftrace_branch_data *a = p1; + struct ftrace_branch_data *b = p2; + + long percent_a, percent_b; + + percent_a = get_incorrect_percent(a); + percent_b = get_incorrect_percent(b); + + if (percent_a < percent_b) + return -1; + if (percent_a > percent_b) + return 1; + + if (a->incorrect < b->incorrect) + return -1; + if (a->incorrect > b->incorrect) + return 1; + + /* + * Since the above shows worse (incorrect) cases + * first, we continue that by showing best (correct) + * cases last. 
+ */ + if (a->correct > b->correct) + return -1; + if (a->correct < b->correct) + return 1; + + return 0; +} + +static struct tracer_stat annotated_branch_stats = { + .name = "branch_annotated", + .stat_start = annotated_branch_stat_start, + .stat_next = annotated_branch_stat_next, + .stat_cmp = annotated_branch_stat_cmp, + .stat_headers = annotated_branch_stat_headers, + .stat_show = branch_stat_show +}; + +__init static int init_annotated_branch_stats(void) +{ + int ret; + + ret = register_stat_tracer(&annotated_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "annotated branches stats\n"); + return 1; + } + return 0; +} +fs_initcall(init_annotated_branch_stats); + +#ifdef CONFIG_PROFILE_ALL_BRANCHES + +extern unsigned long __start_branch_profile[]; +extern unsigned long __stop_branch_profile[]; + +static int all_branch_stat_headers(struct seq_file *m) +{ + seq_printf(m, " miss hit %% "); + seq_printf(m, " Function " + " File Line\n" + " ------- --------- - " + " -------- " + " ---- ----\n"); + return 0; +} + +static void *all_branch_stat_start(struct tracer_stat *trace) +{ + return __start_branch_profile; +} + +static void * +all_branch_stat_next(void *v, int idx) +{ + struct ftrace_branch_data *p = v; + + ++p; + + if ((void *)p >= (void *)__stop_branch_profile) + return NULL; + + return p; +} + +static struct tracer_stat all_branch_stats = { + .name = "branch_all", + .stat_start = all_branch_stat_start, + .stat_next = all_branch_stat_next, + .stat_headers = all_branch_stat_headers, + .stat_show = branch_stat_show +}; + +__init static int all_annotated_branch_stats(void) +{ + int ret; + + ret = register_stat_tracer(&all_branch_stats); + if (!ret) { + printk(KERN_WARNING "Warning: could not register " + "all branches stats\n"); + return 1; + } + return 0; +} +fs_initcall(all_annotated_branch_stats); +#endif /* CONFIG_PROFILE_ALL_BRANCHES */ diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c new file mode 100644 
index 00000000..6302747a --- /dev/null +++ b/kernel/trace/trace_clock.c @@ -0,0 +1,115 @@ +/* + * tracing clocks + * + * Copyright (C) 2009 Red Hat, Inc., Ingo Molnar + * + * Implements 3 trace clock variants, with differing scalability/precision + * tradeoffs: + * + * - local: CPU-local trace clock + * - medium: scalable global clock with some jitter + * - global: globally monotonic, serialized clock + * + * Tracer plugins will chose a default from these clocks. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +/* + * trace_clock_local(): the simplest and least coherent tracing clock. + * + * Useful for tracing that does not cross to other CPUs nor + * does it go through idle events. + */ +u64 notrace trace_clock_local(void) +{ + u64 clock; + + /* + * sched_clock() is an architecture implemented, fast, scalable, + * lockless clock. It is not guaranteed to be coherent across + * CPUs, nor across CPU idle events. + */ + preempt_disable_notrace(); + clock = sched_clock(); + preempt_enable_notrace(); + + return clock; +} + +/* + * trace_clock(): 'between' trace clock. Not completely serialized, + * but not completely incorrect when crossing CPUs either. + * + * This is based on cpu_clock(), which will allow at most ~1 jiffy of + * jitter between CPUs. So it's a pretty scalable clock, but there + * can be offsets in the trace data. + */ +u64 notrace trace_clock(void) +{ + return local_clock(); +} + + +/* + * trace_clock_global(): special globally coherent trace clock + * + * It has higher overhead than the other trace clocks but is still + * an order of magnitude faster than GTOD derived hardware clocks. + * + * Used by plugins that need globally coherent timestamps. + */ + +/* keep prev_time and lock in the same cacheline. 
*/ +static struct { + u64 prev_time; + arch_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = + { + .lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED, + }; + +u64 notrace trace_clock_global(void) +{ + unsigned long flags; + int this_cpu; + u64 now; + + local_irq_save(flags); + + this_cpu = raw_smp_processor_id(); + now = cpu_clock(this_cpu); + /* + * If in an NMI context then dont risk lockups and return the + * cpu_clock() time: + */ + if (unlikely(in_nmi())) + goto out; + + arch_spin_lock(&trace_clock_struct.lock); + + /* + * TODO: if this happens often then maybe we should reset + * my_scd->clock to prev_time+1, to make sure + * we start ticking with the local clock from now on? + */ + if ((s64)(now - trace_clock_struct.prev_time) < 0) + now = trace_clock_struct.prev_time + 1; + + trace_clock_struct.prev_time = now; + + arch_spin_unlock(&trace_clock_struct.lock); + + out: + local_irq_restore(flags); + + return now; +} diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 00000000..1fe81eef --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,284 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + * the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + * this is from the ring buffer. + * + * @structure: the structure layout + * + * - __field( type, item ) + * This is equivalent to declaring + * type item; + * in the structure. 
 *	- __array(	type,	item,	size	)
 *	  This is equivalent to declaring
 *		type	item[size];
 *	  in the structure.
 *
 *   * for structures within structures, the format of the internal
 *	structure is laid out. This allows the internal structure
 *	to be deciphered for the format file. Although these macros
 *	may become out of sync with the internal structure, they
 *	will create a compile error if it happens. Since the
 *	internal structures are just tracing helpers, this is not
 *	an issue.
 *
 *	When an internal structure is used, it should use:
 *
 *	__field_struct(	type,	item	)
 *
 *	instead of __field. This will prevent it from being shown in
 *	the output file. The fields in the structure should use:
 *
 *	__field_desc(	type,	container,	item		)
 *	__array_desc(	type,	container,	item,	len	)
 *
 *	type, item and len are the same as __field and __array, but
 *	container is added. This is the name of the item in
 *	__field_struct that this is describing.
 *
 *
 * @print: the print format shown to users in the format file.
+ */ + +/* + * Function trace entry - function address and parent function address: + */ +FTRACE_ENTRY(function, ftrace_entry, + + TRACE_FN, + + F_STRUCT( + __field( unsigned long, ip ) + __field( unsigned long, parent_ip ) + ), + + F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + + TRACE_GRAPH_ENT, + + F_STRUCT( + __field_struct( struct ftrace_graph_ent, graph_ent ) + __field_desc( unsigned long, graph_ent, func ) + __field_desc( int, graph_ent, depth ) + ), + + F_printk("--> %lx (%d)", __entry->func, __entry->depth) +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + + TRACE_GRAPH_RET, + + F_STRUCT( + __field_struct( struct ftrace_graph_ret, ret ) + __field_desc( unsigned long, ret, func ) + __field_desc( unsigned long long, ret, calltime) + __field_desc( unsigned long long, ret, rettime ) + __field_desc( unsigned long, ret, overrun ) + __field_desc( int, ret, depth ) + ), + + F_printk("<-- %lx (%d) (start: %llx end: %llx) over: %d", + __entry->func, __entry->depth, + __entry->calltime, __entry->rettime, + __entry->depth) +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it. 
+ */ +#define FTRACE_CTX_FIELDS \ + __field( unsigned int, prev_pid ) \ + __field( unsigned int, next_pid ) \ + __field( unsigned int, next_cpu ) \ + __field( unsigned char, prev_prio ) \ + __field( unsigned char, prev_state ) \ + __field( unsigned char, next_prio ) \ + __field( unsigned char, next_state ) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + + TRACE_CTX, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==> %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + * create another structure. + */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + + TRACE_WAKE, + + F_STRUCT( + FTRACE_CTX_FIELDS + ), + + F_printk("%u:%u:%u ==+ %u:%u:%u [%03u]", + __entry->prev_pid, __entry->prev_prio, __entry->prev_state, + __entry->next_pid, __entry->next_prio, __entry->next_state, + __entry->next_cpu + ) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES 8 + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +FTRACE_ENTRY(kernel_stack, stack_entry, + + TRACE_STACK, + + F_STRUCT( + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + + TRACE_USER_STACK, + + F_STRUCT( + __field( unsigned int, tgid ) + __array( unsigned long, caller, FTRACE_STACK_ENTRIES ) + ), + + F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n" + "\t=> (" IP_FMT ")\n\t=> (" IP_FMT 
")\n", + __entry->caller[0], __entry->caller[1], __entry->caller[2], + __entry->caller[3], __entry->caller[4], __entry->caller[5], + __entry->caller[6], __entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + + TRACE_BPRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, fmt ) + __dynamic_array( u32, buf ) + ), + + F_printk("%08lx fmt:%p", + __entry->ip, __entry->fmt) +); + +FTRACE_ENTRY(print, print_entry, + + TRACE_PRINT, + + F_STRUCT( + __field( unsigned long, ip ) + __dynamic_array( char, buf ) + ), + + F_printk("%08lx %s", + __entry->ip, __entry->buf) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + + TRACE_MMIO_RW, + + F_STRUCT( + __field_struct( struct mmiotrace_rw, rw ) + __field_desc( resource_size_t, rw, phys ) + __field_desc( unsigned long, rw, value ) + __field_desc( unsigned long, rw, pc ) + __field_desc( int, rw, map_id ) + __field_desc( unsigned char, rw, opcode ) + __field_desc( unsigned char, rw, width ) + ), + + F_printk("%lx %lx %lx %d %x %x", + (unsigned long)__entry->phys, __entry->value, __entry->pc, + __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + + TRACE_MMIO_MAP, + + F_STRUCT( + __field_struct( struct mmiotrace_map, map ) + __field_desc( resource_size_t, map, phys ) + __field_desc( unsigned long, map, virt ) + __field_desc( unsigned long, map, len ) + __field_desc( int, map, map_id ) + __field_desc( unsigned char, map, opcode ) + ), + + F_printk("%lx %lx %lx %d %x", + (unsigned long)__entry->phys, __entry->virt, __entry->len, + __entry->map_id, __entry->opcode) +); + + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + + TRACE_BRANCH, + + F_STRUCT( + __field( unsigned int, line ) + __array( char, func, TRACE_FUNC_SIZE+1 ) + __array( char, file, TRACE_FILE_SIZE+1 ) + __field( char, correct ) + ), + + F_printk("%u:%s:%s (%u)", + __entry->line, + __entry->func, 
__entry->file, __entry->correct) +); + diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c new file mode 100644 index 00000000..19a359d5 --- /dev/null +++ b/kernel/trace/trace_event_perf.c @@ -0,0 +1,216 @@ +/* + * trace event based perf event profiling/tracing + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * Copyright (C) 2009-2010 Frederic Weisbecker + */ + +#include +#include +#include "trace.h" + +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; + +/* + * Force it to be aligned to unsigned long to avoid misaligned accesses + * suprises + */ +typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) + perf_trace_t; + +/* Count the events in use (per event id, not per instance) */ +static int total_ref_count; + +static int perf_trace_event_perm(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + /* No tracing, just counting, so no obvious leak */ + if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW)) + return 0; + + /* Some events are ok to be traced by non-root users... */ + if (p_event->attach_state == PERF_ATTACH_TASK) { + if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY) + return 0; + } + + /* + * ...otherwise raw tracepoint data can be a severe data leak, + * only allow root to have these. 
+ */ + if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + return 0; +} + +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) +{ + struct hlist_head __percpu *list; + int ret; + int cpu; + + ret = perf_trace_event_perm(tp_event, p_event); + if (ret) + return ret; + + p_event->tp_event = tp_event; + if (tp_event->perf_refcount++ > 0) + return 0; + + ret = -ENOMEM; + + list = alloc_percpu(struct hlist_head); + if (!list) + goto fail; + + for_each_possible_cpu(cpu) + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); + + tp_event->perf_events = list; + + if (!total_ref_count) { + char __percpu *buf; + int i; + + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + buf = (char __percpu *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail; + + perf_trace_buf[i] = buf; + } + } + + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + if (ret) + goto fail; + + total_ref_count++; + return 0; + +fail: + if (!total_ref_count) { + int i; + + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } + + if (!--tp_event->perf_refcount) { + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + } + + return ret; +} + +int perf_trace_init(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event; + int event_id = p_event->attr.config; + int ret = -EINVAL; + + mutex_lock(&event_mutex); + list_for_each_entry(tp_event, &ftrace_events, list) { + if (tp_event->event.type == event_id && + tp_event->class && tp_event->class->reg && + try_module_get(tp_event->mod)) { + ret = perf_trace_event_init(tp_event, p_event); + if (ret) + module_put(tp_event->mod); + break; + } + } + mutex_unlock(&event_mutex); + + return ret; +} + +int perf_trace_add(struct perf_event *p_event, int flags) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; + struct hlist_head *list; + + pcpu_list = 
tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) + return -EINVAL; + + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + + list = this_cpu_ptr(pcpu_list); + hlist_add_head_rcu(&p_event->hlist_entry, list); + + return 0; +} + +void perf_trace_del(struct perf_event *p_event, int flags) +{ + hlist_del_rcu(&p_event->hlist_entry); +} + +void perf_trace_destroy(struct perf_event *p_event) +{ + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; + + mutex_lock(&event_mutex); + if (--tp_event->perf_refcount > 0) + goto out; + + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); + + /* + * Ensure our callback won't be called anymore. The buffers + * will be freed after that. + */ + tracepoint_synchronize_unregister(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } +out: + module_put(tp_event->mod); + mutex_unlock(&event_mutex); +} + +__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, + struct pt_regs *regs, int *rctxp) +{ + struct trace_entry *entry; + unsigned long flags; + char *raw_data; + int pc; + + BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); + + pc = preempt_count(); + + *rctxp = perf_swevent_get_recursion_context(); + if (*rctxp < 0) + return NULL; + + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); + + /* zero the dead bytes from align to not leak stack to user */ + memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); + + entry = (struct trace_entry *)raw_data; + local_save_flags(flags); + tracing_generic_entry_update(entry, flags, pc); + entry->type = type; + + return raw_data; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c new file mode 100644 index 00000000..2d049368 --- /dev/null +++ b/kernel/trace/trace_events.c @@ -0,0 +1,1764 @@ +/* + * 
event tracer + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt + * + * - Added format output of fields of the trace point. + * This was based off of work by Tom Zanussi . + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "trace_output.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM "TRACE_SYSTEM" + +DEFINE_MUTEX(event_mutex); + +DEFINE_MUTEX(event_storage_mutex); +EXPORT_SYMBOL_GPL(event_storage_mutex); + +char event_storage[EVENT_STORAGE_SIZE]; +EXPORT_SYMBOL_GPL(event_storage); + +LIST_HEAD(ftrace_events); +LIST_HEAD(ftrace_common_fields); + +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call) +{ + if (!event_call->class->get_fields) + return &event_call->class->fields; + return event_call->class->get_fields(event_call); +} + +static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, + int is_signed, int filter_type) +{ + struct ftrace_event_field *field; + + field = kzalloc(sizeof(*field), GFP_KERNEL); + if (!field) + goto err; + + field->name = kstrdup(name, GFP_KERNEL); + if (!field->name) + goto err; + + field->type = kstrdup(type, GFP_KERNEL); + if (!field->type) + goto err; + + if (filter_type == FILTER_OTHER) + field->filter_type = filter_assign_type(type); + else + field->filter_type = filter_type; + + field->offset = offset; + field->size = size; + field->is_signed = is_signed; + + list_add(&field->link, head); + + return 0; + +err: + if (field) + kfree(field->name); + kfree(field); + + return -ENOMEM; +} + +int trace_define_field(struct ftrace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, + int filter_type) +{ + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, + is_signed, filter_type); +} +EXPORT_SYMBOL_GPL(trace_define_field); + 
+#define __common_field(type, item) \ + ret = __trace_define_field(&ftrace_common_fields, #type, \ + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +static int trace_define_common_fields(void) +{ + int ret; + struct trace_entry ent; + + __common_field(unsigned short, type); + __common_field(unsigned char, flags); + __common_field(unsigned char, preempt_count); + __common_field(int, pid); + __common_field(int, padding); + + return ret; +} + +void trace_destroy_fields(struct ftrace_event_call *call) +{ + struct ftrace_event_field *field, *next; + struct list_head *head; + + head = trace_get_fields(call); + list_for_each_entry_safe(field, next, head, link) { + list_del(&field->link); + kfree(field->type); + kfree(field->name); + kfree(field); + } +} + +int trace_event_raw_init(struct ftrace_event_call *call) +{ + int id; + + id = register_ftrace_event(&call->event); + if (!id) + return -ENODEV; + + return 0; +} +EXPORT_SYMBOL_GPL(trace_event_raw_init); + +int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return tracepoint_probe_register(call->name, + call->class->probe, + call); + case TRACE_REG_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->probe, + call); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return tracepoint_probe_register(call->name, + call->class->perf_probe, + call); + case TRACE_REG_PERF_UNREGISTER: + tracepoint_probe_unregister(call->name, + call->class->perf_probe, + call); + return 0; +#endif + } + return 0; +} +EXPORT_SYMBOL_GPL(ftrace_event_reg); + +void trace_event_enable_cmd_record(bool enable) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + continue; + + if (enable) { + tracing_start_cmdline_record(); + 
call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } else { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + } + mutex_unlock(&event_mutex); +} + +static int ftrace_event_enable_disable(struct ftrace_event_call *call, + int enable) +{ + int ret = 0; + + switch (enable) { + case 0: + if (call->flags & TRACE_EVENT_FL_ENABLED) { + call->flags &= ~TRACE_EVENT_FL_ENABLED; + if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + tracing_stop_cmdline_record(); + call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + } + call->class->reg(call, TRACE_REG_UNREGISTER); + } + break; + case 1: + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { + if (trace_flags & TRACE_ITER_RECORD_CMD) { + tracing_start_cmdline_record(); + call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + } + ret = call->class->reg(call, TRACE_REG_REGISTER); + if (ret) { + tracing_stop_cmdline_record(); + pr_info("event trace: Could not enable event " + "%s\n", call->name); + break; + } + call->flags |= TRACE_EVENT_FL_ENABLED; + } + break; + } + + return ret; +} + +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + ftrace_event_enable_disable(call, 0); + } + mutex_unlock(&event_mutex); +} + +static void __put_system(struct event_subsystem *system) +{ + struct event_filter *filter = system->filter; + + WARN_ON_ONCE(system->ref_count == 0); + if (--system->ref_count) + return; + + if (filter) { + kfree(filter->filter_string); + kfree(filter); + } + kfree(system->name); + kfree(system); +} + +static void __get_system(struct event_subsystem *system) +{ + WARN_ON_ONCE(system->ref_count == 0); + system->ref_count++; +} + +static void put_system(struct event_subsystem *system) +{ + mutex_lock(&event_mutex); + __put_system(system); + mutex_unlock(&event_mutex); +} + +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. 
+ */ +static int __ftrace_set_clr_event(const char *match, const char *sub, + const char *event, int set) +{ + struct ftrace_event_call *call; + int ret = -EINVAL; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + + if (!call->name || !call->class || !call->class->reg) + continue; + + if (match && + strcmp(match, call->name) != 0 && + strcmp(match, call->class->system) != 0) + continue; + + if (sub && strcmp(sub, call->class->system) != 0) + continue; + + if (event && strcmp(event, call->name) != 0) + continue; + + ftrace_event_enable_disable(call, set); + + ret = 0; + } + mutex_unlock(&event_mutex); + + return ret; +} + +static int ftrace_set_clr_event(char *buf, int set) +{ + char *event = NULL, *sub = NULL, *match; + + /* + * The buf format can be : + * *: means any event by that name. + * : is the same. + * + * :* means all events in that subsystem + * : means the same. + * + * (no ':') means all events in a subsystem with + * the name or any event that matches + */ + + match = strsep(&buf, ":"); + if (buf) { + sub = match; + event = buf; + match = NULL; + + if (!strlen(sub) || strcmp(sub, "*") == 0) + sub = NULL; + if (!strlen(event) || strcmp(event, "*") == 0) + event = NULL; + } + + return __ftrace_set_clr_event(match, sub, event, set); +} + +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. 
+ */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + return __ftrace_set_clr_event(NULL, system, event, set); +} +EXPORT_SYMBOL_GPL(trace_set_clr_event); + +/* 128 should be much more than enough */ +#define EVENT_BUF_SIZE 127 + +static ssize_t +ftrace_event_write(struct file *file, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + struct trace_parser parser; + ssize_t read, ret; + + if (!cnt) + return 0; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1)) + return -ENOMEM; + + read = trace_get_user(&parser, ubuf, cnt, ppos); + + if (read >= 0 && trace_parser_loaded((&parser))) { + int set = 1; + + if (*parser.buffer == '!') + set = 0; + + parser.buffer[parser.idx] = 0; + + ret = ftrace_set_clr_event(parser.buffer + !set, set); + if (ret) + goto out_put; + } + + ret = read; + + out_put: + trace_parser_put(&parser); + + return ret; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = v; + + (*pos)++; + + list_for_each_entry_continue(call, &ftrace_events, list) { + /* + * The ftrace subsystem is for showing formats only. + * They can not be enabled or disabled via the event files. 
+ */ + if (call->class && call->class->reg) + return call; + } + + return NULL; +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_call *call; + loff_t l; + + mutex_lock(&event_mutex); + + call = list_entry(&ftrace_events, struct ftrace_event_call, list); + for (l = 0; l <= *pos; ) { + call = t_next(m, call, &l); + if (!call) + break; + } + return call; +} + +static void * +s_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = v; + + (*pos)++; + + list_for_each_entry_continue(call, &ftrace_events, list) { + if (call->flags & TRACE_EVENT_FL_ENABLED) + return call; + } + + return NULL; +} + +static void *s_start(struct seq_file *m, loff_t *pos) +{ + struct ftrace_event_call *call; + loff_t l; + + mutex_lock(&event_mutex); + + call = list_entry(&ftrace_events, struct ftrace_event_call, list); + for (l = 0; l <= *pos; ) { + call = s_next(m, call, &l); + if (!call) + break; + } + return call; +} + +static int t_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = v; + + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->class->system); + seq_printf(m, "%s\n", call->name); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ + mutex_unlock(&event_mutex); +} + +static int +ftrace_event_seq_open(struct inode *inode, struct file *file) +{ + const struct seq_operations *seq_ops; + + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + ftrace_clear_events(); + + seq_ops = inode->i_private; + return seq_open(file, seq_ops); +} + +static ssize_t +event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char *buf; + + if (call->flags & TRACE_EVENT_FL_ENABLED) + buf = "1\n"; + else + buf = "0\n"; + + return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); +} + +static ssize_t +event_enable_write(struct file *filp, const char __user *ubuf, size_t 
cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char buf[64]; + unsigned long val; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + switch (val) { + case 0: + case 1: + mutex_lock(&event_mutex); + ret = ftrace_event_enable_disable(call, val); + mutex_unlock(&event_mutex); + break; + + default: + return -EINVAL; + } + + *ppos += cnt; + + return ret ? ret : cnt; +} + +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char set_to_char[4] = { '?', '0', '1', 'X' }; + struct event_subsystem *system = filp->private_data; + struct ftrace_event_call *call; + char buf[2]; + int set = 0; + int ret; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->name || !call->class || !call->class->reg) + continue; + + if (system && strcmp(call->class->system, system->name) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); + + /* + * If we have a mixture, no need to look further. 
+ */ + if (set == 3) + break; + } + mutex_unlock(&event_mutex); + + buf[0] = set_to_char[set]; + buf[1] = '\n'; + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + const char *name = NULL; + unsigned long val; + char buf[64]; + ssize_t ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + if (val != 0 && val != 1) + return -EINVAL; + + /* + * Opening of "enable" adds a ref count to system, + * so the name is safe to use. + */ + if (system) + name = system->name; + + ret = __ftrace_set_clr_event(NULL, name, NULL, val); + if (ret) + goto out; + + ret = cnt; + +out: + *ppos += cnt; + + return ret; +} + +enum { + FORMAT_HEADER = 1, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, +}; + +static void *f_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); + + (*pos)++; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + if (unlikely(list_empty(common_head))) + return NULL; + + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; + + case FORMAT_FIELD_SEPERATOR: + if (unlikely(list_empty(head))) + return NULL; + + field = list_entry(head->prev, struct ftrace_event_field, link); + return field; + + case FORMAT_PRINTFMT: + /* all done */ + return NULL; + } + + field = v; + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) + return (void *)FORMAT_PRINTFMT; + + field = 
list_entry(field->link.prev, struct ftrace_event_field, link); + + return field; +} + +static void *f_start(struct seq_file *m, loff_t *pos) +{ + loff_t l = 0; + void *p; + + /* Start by showing the header */ + if (!*pos) + return (void *)FORMAT_HEADER; + + p = (void *)FORMAT_HEADER; + do { + p = f_next(m, p, &l); + } while (p && l < *pos); + + return p; +} + +static int f_show(struct seq_file *m, void *v) +{ + struct ftrace_event_call *call = m->private; + struct ftrace_event_field *field; + const char *array_descriptor; + + switch ((unsigned long)v) { + case FORMAT_HEADER: + seq_printf(m, "name: %s\n", call->name); + seq_printf(m, "ID: %d\n", call->event.type); + seq_printf(m, "format:\n"); + return 0; + + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + + case FORMAT_PRINTFMT: + seq_printf(m, "\nprint fmt: %s\n", + call->print_fmt); + return 0; + } + + field = v; + + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + array_descriptor = strchr(field->type, '['); + + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; + + if (!array_descriptor) + seq_printf(m, "\tfield:%s %s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + else + seq_printf(m, "\tfield:%.*s %s%s;\toffset:%u;\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + + return 0; +} + +static void f_stop(struct seq_file *m, void *p) +{ +} + +static const struct seq_operations trace_format_seq_ops = { + .start = f_start, + .next = f_next, + .stop = f_stop, + .show = f_show, +}; + +static int trace_format_open(struct inode *inode, struct file *file) +{ + struct ftrace_event_call *call = inode->i_private; + struct seq_file *m; + int ret; + + ret = seq_open(file, 
&trace_format_seq_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = call; + + return 0; +} + +static ssize_t +event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + trace_seq_printf(s, "%d\n", call->event.type); + + r = simple_read_from_buffer(ubuf, cnt, ppos, + s->buffer, s->len); + kfree(s); + return r; +} + +static ssize_t +event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + print_event_filter(call, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + +static ssize_t +event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct ftrace_event_call *call = filp->private_data; + char *buf; + int err; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; + } + buf[cnt] = '\0'; + + err = apply_event_filter(call, buf); + free_page((unsigned long) buf); + if (err < 0) + return err; + + *ppos += cnt; + + return cnt; +} + +static LIST_HEAD(event_subsystems); + +static int subsystem_open(struct inode *inode, struct file *filp) +{ + struct event_subsystem *system = NULL; + int ret; + + if (!inode->i_private) + goto skip_search; + + /* Make sure the system still exists */ + mutex_lock(&event_mutex); + list_for_each_entry(system, &event_subsystems, list) { + if (system == inode->i_private) { + /* Don't open systems with no 
events */ + if (!system->nr_events) { + system = NULL; + break; + } + __get_system(system); + break; + } + } + mutex_unlock(&event_mutex); + + if (system != inode->i_private) + return -ENODEV; + + skip_search: + ret = tracing_open_generic(inode, filp); + if (ret < 0 && system) + put_system(system); + + return ret; +} + +static int subsystem_release(struct inode *inode, struct file *file) +{ + struct event_subsystem *system = inode->i_private; + + if (system) + put_system(system); + + return 0; +} + +static ssize_t +subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + print_subsystem_event_filter(system, s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + +static ssize_t +subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + struct event_subsystem *system = filp->private_data; + char *buf; + int err; + + if (cnt >= PAGE_SIZE) + return -EINVAL; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; + } + buf[cnt] = '\0'; + + err = apply_subsystem_event_filter(system, buf); + free_page((unsigned long) buf); + if (err < 0) + return err; + + *ppos += cnt; + + return cnt; +} + +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + +static const struct 
seq_operations show_event_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct seq_operations show_set_event_seq_ops = { + .start = s_start, + .next = s_next, + .show = t_show, + .stop = t_stop, +}; + +static const struct file_operations ftrace_avail_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_set_event_fops = { + .open = ftrace_event_seq_open, + .read = seq_read, + .write = ftrace_event_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_enable_fops = { + .open = tracing_open_generic, + .read = event_enable_read, + .write = event_enable_write, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_event_format_fops = { + .open = trace_format_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const struct file_operations ftrace_event_id_fops = { + .open = tracing_open_generic, + .read = event_id_read, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_event_filter_fops = { + .open = tracing_open_generic, + .read = event_filter_read, + .write = event_filter_write, + .llseek = default_llseek, +}; + +static const struct file_operations ftrace_subsystem_filter_fops = { + .open = subsystem_open, + .read = subsystem_filter_read, + .write = subsystem_filter_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + +static const struct file_operations ftrace_system_enable_fops = { + .open = subsystem_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, + .llseek = default_llseek, +}; + +static struct dentry *event_trace_events_dir(void) +{ + static 
struct dentry *d_tracer; + static struct dentry *d_events; + + if (d_events) + return d_events; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return NULL; + + d_events = debugfs_create_dir("events", d_tracer); + if (!d_events) + pr_warning("Could not create debugfs " + "'events' directory\n"); + + return d_events; +} + +static struct dentry * +event_subsystem_dir(const char *name, struct dentry *d_events) +{ + struct event_subsystem *system; + struct dentry *entry; + + /* First see if we did not already create this dir */ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + system->nr_events++; + return system->entry; + } + } + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) { + pr_warning("No memory to create event subsystem %s\n", + name); + return d_events; + } + + system->entry = debugfs_create_dir(name, d_events); + if (!system->entry) { + pr_warning("Could not create event subsystem %s\n", + name); + kfree(system); + return d_events; + } + + system->nr_events = 1; + system->ref_count = 1; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) { + debugfs_remove(system->entry); + kfree(system); + return d_events; + } + + list_add(&system->list, &event_subsystems); + + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) { + pr_warning("Could not allocate filter for subsystem " + "'%s'\n", name); + return system->entry; + } + + entry = debugfs_create_file("filter", 0644, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) { + kfree(system->filter); + system->filter = NULL; + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + } + + trace_create_file("enable", 0644, system->entry, system, + &ftrace_system_enable_fops); + + return system->entry; +} + +static int +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, + const 
struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) +{ + struct list_head *head; + int ret; + + /* + * If the trace point header did not define TRACE_SYSTEM + * then the system would be called "TRACE_SYSTEM". + */ + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + d_events = event_subsystem_dir(call->class->system, d_events); + + call->dir = debugfs_create_dir(call->name, d_events); + if (!call->dir) { + pr_warning("Could not create debugfs " + "'%s' directory\n", call->name); + return -1; + } + + if (call->class->reg) + trace_create_file("enable", 0644, call->dir, call, + enable); + +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && call->class->reg) + trace_create_file("id", 0444, call->dir, call, + id); +#endif + + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = call->class->define_fields(call); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } + } + trace_create_file("filter", 0644, call->dir, call, + filter); + + trace_create_file("format", 0444, call->dir, call, + format); + + return 0; +} + +static int +__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) +{ + struct dentry *d_events; + int ret; + + /* The linker may leave blanks */ + if (!call->name) + return -EINVAL; + + if (call->class->raw_init) { + ret = call->class->raw_init(call); + if (ret < 0) { + if (ret != -ENOSYS) + pr_warning("Could not initialize trace events/%s\n", + call->name); + return ret; + } + } + + d_events = event_trace_events_dir(); + if (!d_events) + return -ENOENT; + + ret = event_create_dir(call, d_events, id, 
enable, filter, format); + if (!ret) + list_add(&call->list, &ftrace_events); + call->mod = mod; + + return ret; +} + +/* Add an additional event_call dynamically */ +int trace_add_event_call(struct ftrace_event_call *call) +{ + int ret; + mutex_lock(&event_mutex); + ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + mutex_unlock(&event_mutex); + return ret; +} + +static void remove_subsystem_dir(const char *name) +{ + struct event_subsystem *system; + + if (strcmp(name, TRACE_SYSTEM) == 0) + return; + + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) { + if (!--system->nr_events) { + debugfs_remove_recursive(system->entry); + list_del(&system->list); + __put_system(system); + } + break; + } + } +} + +/* + * Must be called under locking both of event_mutex and trace_event_mutex. + */ +static void __trace_remove_event_call(struct ftrace_event_call *call) +{ + ftrace_event_enable_disable(call, 0); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); + remove_subsystem_dir(call->class->system); +} + +/* Remove an event_call */ +void trace_remove_event_call(struct ftrace_event_call *call) +{ + mutex_lock(&event_mutex); + down_write(&trace_event_mutex); + __trace_remove_event_call(call); + up_write(&trace_event_mutex); + mutex_unlock(&event_mutex); +} + +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +#ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. 
+ */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + +static struct ftrace_module_file_ops * +trace_create_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + /* + * This is a bit of a PITA. To allow for correct reference + * counting, modules must "own" their file_operations. + * To do this, we allocate the file operations that will be + * used in the event directory. + */ + + file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); + if (!file_ops) + return NULL; + + file_ops->mod = mod; + + file_ops->id = ftrace_event_id_fops; + file_ops->id.owner = mod; + + file_ops->enable = ftrace_enable_fops; + file_ops->enable.owner = mod; + + file_ops->filter = ftrace_event_filter_fops; + file_ops->filter.owner = mod; + + file_ops->format = ftrace_event_format_fops; + file_ops->format.owner = mod; + + list_add(&file_ops->list, &ftrace_module_file_list); + + return file_ops; +} + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops = NULL; + struct ftrace_event_call **call, **start, **end; + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + if (start == end) + return; + + file_ops = trace_create_file_ops(mod); + if (!file_ops) + return; + + for_each_event(call, start, end) { + __trace_add_event_call(*call, mod, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + struct ftrace_event_call *call, *p; + bool found = false; + + down_write(&trace_event_mutex); + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + found = true; + __trace_remove_event_call(call); + } + } + + /* Now free the file_operations */ + list_for_each_entry(file_ops, 
&ftrace_module_file_list, list) { + if (file_ops->mod == mod) + break; + } + if (&file_ops->list != &ftrace_module_file_list) { + list_del(&file_ops->list); + kfree(file_ops); + } + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events. + */ + if (found) + tracing_reset_current_online_cpus(); + up_write(&trace_event_mutex); +} + +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); + + return 0; +} +#else +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; + +extern struct ftrace_event_call *__start_ftrace_events[]; +extern struct ftrace_event_call *__stop_ftrace_events[]; + +static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; + +static __init int setup_trace_event(char *str) +{ + strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); + ring_buffer_expanded = 1; + tracing_selftest_disabled = 1; + + return 1; +} +__setup("trace_event=", setup_trace_event); + +static __init int event_trace_init(void) +{ + struct ftrace_event_call **call; + struct dentry *d_tracer; + struct dentry *entry; + struct dentry *d_events; + int ret; + char *buf = bootup_event_buf; + char *token; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("available_events", 0444, d_tracer, + (void *)&show_event_seq_ops, + &ftrace_avail_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'available_events' entry\n"); + + entry = debugfs_create_file("set_event", 0644, d_tracer, + (void 
*)&show_set_event_seq_ops, + &ftrace_set_event_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'set_event' entry\n"); + + d_events = event_trace_events_dir(); + if (!d_events) + return 0; + + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + trace_create_file("enable", 0644, d_events, + NULL, &ftrace_system_enable_fops); + + if (trace_define_common_fields()) + pr_warning("tracing: Failed to allocate common fields"); + + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { + __trace_add_event_call(*call, NULL, &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + } + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + ret = ftrace_set_clr_event(token, 1); + if (ret) + pr_warning("Failed to enable trace event: %s\n", token); + } + + ret = register_module_notifier(&trace_module_nb); + if (ret) + pr_warning("Failed to register trace events module notifier\n"); + + return 0; +} +fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + 
while (!kthread_should_stop()) + schedule(); + + return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. + */ +static __init void event_trace_self_tests(void) +{ + struct ftrace_event_call *call; + struct event_subsystem *system; + int ret; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(call, &ftrace_events, list) { + + /* Only test those that have a probe */ + if (!call->class || !call->class->probe) + continue; + +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS + if (call->class->system && + strcmp(call->class->system, "syscalls") == 0) + continue; +#endif + + pr_info("Testing event %s: ", call->name); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. 
+ */ + if (call->flags & TRACE_EVENT_FL_ENABLED) { + pr_warning("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + ftrace_event_enable_disable(call, 1); + event_test_stuff(); + ftrace_event_enable_disable(call, 0); + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(system, &event_subsystems, list) { + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); + if (WARN_ON_ONCE(ret)) + pr_warning("error disabling system %s\n", + system->name); + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling all events\n"); + return; + } + + event_test_stuff(); + + /* reset sysname */ + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); + if (WARN_ON_ONCE(ret)) { + pr_warning("error disabling all events\n"); + return; + } + + pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct ftrace_entry *entry; + unsigned long flags; + long disabled; + int cpu; + int pc; + + pc = preempt_count(); + preempt_disable_notrace(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + local_save_flags(flags); + + 
event = trace_current_buffer_lock_reserve(&buffer, + TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + trace_nowake_buffer_unlock_commit(buffer, event, flags, pc); + + out: + atomic_dec(&per_cpu(ftrace_test_event_disable, cpu)); + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ + int ret; + ret = register_ftrace_function(&trace_ops); + if (WARN_ON(ret < 0)) { + pr_info("Failed to enable function tracer for event tests\n"); + return; + } + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + if (!tracing_selftest_disabled) { + event_trace_self_tests(); + event_trace_self_test_with_function(); + } + + return 0; +} + +late_initcall(event_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c new file mode 100644 index 00000000..bd3c6369 --- /dev/null +++ b/kernel/trace/trace_events_filter.c @@ -0,0 +1,2014 @@ +/* + * trace_events_filter - generic event filtering + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) 2009 Tom Zanussi + */ + +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +enum filter_op_ids +{ + OP_OR, + OP_AND, + OP_GLOB, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_NONE, + OP_OPEN_PAREN, +}; + +struct filter_op { + int id; + char *string; + int precedence; +}; + +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_GLOB, "~", 4 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, +}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; + +struct filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { 
+ char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +struct pred_stack { + struct filter_pred **preds; + int index; +}; + +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +/* Filter predicate for fixed sized arrays of characters */ +static int filter_pred_string(struct filter_pred *pred, void *event) +{ + char *addr = (char *)(event + pred->offset); + int cmp, match; + + cmp = pred->regex.match(addr, &pred->regex, pred->regex.field_len); + + match = cmp ^ pred->not; + + return match; +} + +/* Filter predicate for char * pointers */ +static int filter_pred_pchar(struct filter_pred *pred, void *event) +{ + char **addr = (char **)(event + pred->offset); + int cmp, match; + int len = strlen(*addr) + 1; /* including tailing '\0' */ + + cmp = pred->regex.match(*addr, &pred->regex, len); + + match = cmp ^ pred->not; + + return match; +} + +/* + * 
Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. + */ +static int filter_pred_strloc(struct filter_pred *pred, void *event) +{ + u32 str_item = *(u32 *)(event + pred->offset); + int str_loc = str_item & 0xffff; + int str_len = str_item >> 16; + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = pred->regex.match(addr, &pred->regex, str_len); + + match = cmp ^ pred->not; + + return match; +} + +static int filter_pred_none(struct filter_pred *pred, void *event) +{ + return 0; +} + +/* + * regex_match_foo - Basic regex callbacks + * + * @str: the string to be searched + * @r: the regex structure containing the pattern string + * @len: the length of the string to be searched (including '\0') + * + * Note: + * - @str might not be NULL-terminated if it's of type DYN_STRING + * or STATIC_STRING + */ + +static int regex_match_full(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, len) == 0) + return 1; + return 0; +} + +static int regex_match_front(char *str, struct regex *r, int len) +{ + if (strncmp(str, r->pattern, r->len) == 0) + return 1; + return 0; +} + +static int regex_match_middle(char *str, struct regex *r, int len) +{ + if (strnstr(str, r->pattern, len)) + return 1; + return 0; +} + +static int regex_match_end(char *str, struct regex *r, int len) +{ + int strlen = len - 1; + + if (strlen >= r->len && + memcmp(str + strlen - r->len, r->pattern, r->len) == 0) + return 1; + return 0; +} + +/** + * filter_parse_regex - parse a basic regex + * @buff: the raw regex + * @len: length of the regex + * @search: will point to the beginning of the string to compare + * @not: tell 
whether the match will have to be inverted + * + * This passes in a buffer containing a regex and this function will + * set search to point to the search part of the buffer and + * return the type of search it is (see enum above). + * This does modify buff. + * + * Returns enum type. + * search returns the pointer to use for comparison. + * not returns 1 if buff started with a '!' + * 0 otherwise. + */ +enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) +{ + int type = MATCH_FULL; + int i; + + if (buff[0] == '!') { + *not = 1; + buff++; + len--; + } else + *not = 0; + + *search = buff; + + for (i = 0; i < len; i++) { + if (buff[i] == '*') { + if (!i) { + *search = buff + 1; + type = MATCH_END_ONLY; + } else { + if (type == MATCH_END_ONLY) + type = MATCH_MIDDLE_ONLY; + else + type = MATCH_FRONT_ONLY; + buff[i] = 0; + break; + } + } + } + + return type; +} + +static void filter_build_regex(struct filter_pred *pred) +{ + struct regex *r = &pred->regex; + char *search; + enum regex_type type = MATCH_FULL; + int not = 0; + + if (pred->op == OP_GLOB) { + type = filter_parse_regex(r->pattern, r->len, &search, ¬); + r->len = strlen(search); + memmove(r->pattern, search, r->len+1); + } + + switch (type) { + case MATCH_FULL: + r->match = regex_match_full; + break; + case MATCH_FRONT_ONLY: + r->match = regex_match_front; + break; + case MATCH_MIDDLE_ONLY: + r->match = regex_match_middle; + break; + case MATCH_END_ONLY: + r->match = regex_match_end; + break; + } + + pred->not ^= not; +} + +enum move_type { + MOVE_DOWN, + MOVE_UP_FROM_LEFT, + MOVE_UP_FROM_RIGHT +}; + +static struct filter_pred * +get_pred_parent(struct filter_pred *pred, struct filter_pred *preds, + int index, enum move_type *move) +{ + if (pred->parent & FILTER_PRED_IS_RIGHT) + *move = MOVE_UP_FROM_RIGHT; + else + *move = MOVE_UP_FROM_LEFT; + pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT]; + + return pred; +} + +/* + * A series of AND or ORs where found together. 
Instead of + * climbing up and down the tree branches, an array of the + * ops were made in order of checks. We can just move across + * the array and short circuit if needed. + */ +static int process_ops(struct filter_pred *preds, + struct filter_pred *op, void *rec) +{ + struct filter_pred *pred; + int match = 0; + int type; + int i; + + /* + * Micro-optimization: We set type to true if op + * is an OR and false otherwise (AND). Then we + * just need to test if the match is equal to + * the type, and if it is, we can short circuit the + * rest of the checks: + * + * if ((match && op->op == OP_OR) || + * (!match && op->op == OP_AND)) + * return match; + */ + type = op->op == OP_OR; + + for (i = 0; i < op->val; i++) { + pred = &preds[op->ops[i]]; + match = pred->fn(pred, rec); + if (!!match == type) + return match; + } + return match; +} + +/* return 1 if event matches, 0 otherwise (discard) */ +int filter_match_preds(struct event_filter *filter, void *rec) +{ + int match = -1; + enum move_type move = MOVE_DOWN; + struct filter_pred *preds; + struct filter_pred *pred; + struct filter_pred *root; + int n_preds; + int done = 0; + + /* no filter is considered a match */ + if (!filter) + return 1; + + n_preds = filter->n_preds; + + if (!n_preds) + return 1; + + /* + * n_preds, root and filter->preds are protect with preemption disabled. + */ + preds = rcu_dereference_sched(filter->preds); + root = rcu_dereference_sched(filter->root); + if (!root) + return 1; + + pred = root; + + /* match is currently meaningless */ + match = -1; + + do { + switch (move) { + case MOVE_DOWN: + /* only AND and OR have children */ + if (pred->left != FILTER_PRED_INVALID) { + /* If ops is set, then it was folded. 
*/ + if (!pred->ops) { + /* keep going to down the left side */ + pred = &preds[pred->left]; + continue; + } + /* We can treat folded ops as a leaf node */ + match = process_ops(preds, pred, rec); + } else + match = pred->fn(pred, rec); + /* If this pred is the only pred */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + /* + * Check for short circuits. + * + * Optimization: !!match == (pred->op == OP_OR) + * is the same as: + * if ((match && pred->op == OP_OR) || + * (!match && pred->op == OP_AND)) + */ + if (!!match == (pred->op == OP_OR)) { + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + /* now go down the right side of the tree. */ + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + /* We finished this equation. */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return match; +} +EXPORT_SYMBOL_GPL(filter_match_preds); + +static void parse_error(struct filter_parse_state *ps, int err, int pos) +{ + ps->lasterr = err; + ps->lasterr_pos = pos; +} + +static void remove_filter_string(struct event_filter *filter) +{ + if (!filter) + return; + + kfree(filter->filter_string); + filter->filter_string = NULL; +} + +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; + + return 0; +} + +static int append_filter_string(struct event_filter *filter, + char *string) +{ + int newlen; + char *new_filter_string; + + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; + + strcpy(new_filter_string, 
filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; + + return 0; +} + +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) +{ + int pos = ps->lasterr_pos; + char *buf, *pbuf; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return; + + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; + + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); +} + +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +{ + struct event_filter *filter; + + mutex_lock(&event_mutex); + filter = call->filter; + if (filter && filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); + mutex_unlock(&event_mutex); +} + +void print_subsystem_event_filter(struct event_subsystem *system, + struct trace_seq *s) +{ + struct event_filter *filter; + + mutex_lock(&event_mutex); + filter = system->filter; + if (filter && filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); + mutex_unlock(&event_mutex); +} + +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ + struct ftrace_event_field *field; + + list_for_each_entry(field, head, link) { + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +static struct ftrace_event_field * +find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + +static void filter_free_pred(struct filter_pred *pred) +{ + 
if (!pred) + return; + + kfree(pred->field_name); + kfree(pred); +} + +static void filter_clear_pred(struct filter_pred *pred) +{ + kfree(pred->field_name); + pred->field_name = NULL; + pred->regex.len = 0; +} + +static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) +{ + stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); + if (!stack->preds) + return -ENOMEM; + stack->index = n_preds; + return 0; +} + +static void __free_pred_stack(struct pred_stack *stack) +{ + kfree(stack->preds); + stack->index = 0; +} + +static int __push_pred_stack(struct pred_stack *stack, + struct filter_pred *pred) +{ + int index = stack->index; + + if (WARN_ON(index == 0)) + return -ENOSPC; + + stack->preds[--index] = pred; + stack->index = index; + return 0; +} + +static struct filter_pred * +__pop_pred_stack(struct pred_stack *stack) +{ + struct filter_pred *pred; + int index = stack->index; + + pred = stack->preds[index++]; + if (!pred) + return NULL; + + stack->index = index; + return pred; +} + +static int filter_set_pred(struct event_filter *filter, + int idx, + struct pred_stack *stack, + struct filter_pred *src, + filter_pred_fn_t fn) +{ + struct filter_pred *dest = &filter->preds[idx]; + struct filter_pred *left; + struct filter_pred *right; + + *dest = *src; + if (src->field_name) { + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + } + dest->fn = fn; + dest->index = idx; + + if (dest->op == OP_OR || dest->op == OP_AND) { + right = __pop_pred_stack(stack); + left = __pop_pred_stack(stack); + if (!left || !right) + return -EINVAL; + /* + * If both children can be folded + * and they are the same op as this op or a leaf, + * then this op can be folded. 
+ */ + if (left->index & FILTER_PRED_FOLD && + (left->op == dest->op || + left->left == FILTER_PRED_INVALID) && + right->index & FILTER_PRED_FOLD && + (right->op == dest->op || + right->left == FILTER_PRED_INVALID)) + dest->index |= FILTER_PRED_FOLD; + + dest->left = left->index & ~FILTER_PRED_FOLD; + dest->right = right->index & ~FILTER_PRED_FOLD; + left->parent = dest->index & ~FILTER_PRED_FOLD; + right->parent = dest->index | FILTER_PRED_IS_RIGHT; + } else { + /* + * Make dest->left invalid to be used as a quick + * way to know this is a leaf node. + */ + dest->left = FILTER_PRED_INVALID; + + /* All leafs allow folding the parent ops. */ + dest->index |= FILTER_PRED_FOLD; + } + + return __push_pred_stack(stack, dest); +} + +static void __free_preds(struct event_filter *filter) +{ + int i; + + if (filter->preds) { + for (i = 0; i < filter->a_preds; i++) + kfree(filter->preds[i].field_name); + kfree(filter->preds); + filter->preds = NULL; + } + filter->a_preds = 0; + filter->n_preds = 0; +} + +static void filter_disable(struct ftrace_event_call *call) +{ + call->flags &= ~TRACE_EVENT_FL_FILTERED; +} + +static void __free_filter(struct event_filter *filter) +{ + if (!filter) + return; + + __free_preds(filter); + kfree(filter->filter_string); + kfree(filter); +} + +/* + * Called when destroying the ftrace_event_call. + * The call is being freed, so we do not need to worry about + * the call being currently used. This is for module code removing + * the tracepoints from within it. 
+ */ +void destroy_preds(struct ftrace_event_call *call) +{ + __free_filter(call->filter); + call->filter = NULL; +} + +static struct event_filter *__alloc_filter(void) +{ + struct event_filter *filter; + + filter = kzalloc(sizeof(*filter), GFP_KERNEL); + return filter; +} + +static int __alloc_preds(struct event_filter *filter, int n_preds) +{ + struct filter_pred *pred; + int i; + + if (filter->preds) + __free_preds(filter); + + filter->preds = + kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL); + + if (!filter->preds) + return -ENOMEM; + + filter->a_preds = n_preds; + filter->n_preds = 0; + + for (i = 0; i < n_preds; i++) { + pred = &filter->preds[i]; + pred->fn = filter_pred_none; + } + + return 0; +} + +static void filter_free_subsystem_preds(struct event_subsystem *system) +{ + struct ftrace_event_call *call; + + list_for_each_entry(call, &ftrace_events, list) { + if (strcmp(call->class->system, system->name) != 0) + continue; + + filter_disable(call); + remove_filter_string(call->filter); + } +} + +static void filter_free_subsystem_filters(struct event_subsystem *system) +{ + struct ftrace_event_call *call; + + list_for_each_entry(call, &ftrace_events, list) { + if (strcmp(call->class->system, system->name) != 0) + continue; + __free_filter(call->filter); + call->filter = NULL; + } +} + +static int filter_add_pred_fn(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct event_filter *filter, + struct filter_pred *pred, + struct pred_stack *stack, + filter_pred_fn_t fn) +{ + int idx, err; + + if (WARN_ON(filter->n_preds == filter->a_preds)) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + + idx = filter->n_preds; + filter_clear_pred(&filter->preds[idx]); + err = filter_set_pred(filter, idx, stack, pred, fn); + if (err) + return err; + + filter->n_preds++; + + return 0; +} + +int filter_assign_type(const char *type) +{ + if (strstr(type, "__data_loc") && strstr(type, "char")) + return FILTER_DYN_STRING; + + if 
(strchr(type, '[') && strstr(type, "char")) + return FILTER_STATIC_STRING; + + return FILTER_OTHER; +} + +static bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; +} + +static int is_legal_op(struct ftrace_event_field *field, int op) +{ + if (is_string_field(field) && + (op != OP_EQ && op != OP_NE && op != OP_GLOB)) + return 0; + if (!is_string_field(field) && op == OP_GLOB) + return 0; + + return 1; +} + +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; + break; + case 4: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; + break; + case 2: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; + break; + case 1: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; + break; + } + + return fn; +} + +static int filter_add_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct event_filter *filter, + struct filter_pred *pred, + struct pred_stack *stack, + bool dry_run) +{ + struct ftrace_event_field *field; + filter_pred_fn_t fn; + unsigned long long val; + int ret; + + fn = pred->fn = filter_pred_none; + + if (pred->op == OP_AND) + goto add_pred_fn; + else if (pred->op == OP_OR) + goto add_pred_fn; + + field = find_event_field(call, pred->field_name); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); + return -EINVAL; + } + + pred->offset = field->offset; + + if 
(!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; + } + + if (is_string_field(field)) { + filter_build_regex(pred); + + if (field->filter_type == FILTER_STATIC_STRING) { + fn = filter_pred_string; + pred->regex.field_len = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) + fn = filter_pred_strloc; + else + fn = filter_pred_pchar; + } else { + if (field->is_signed) + ret = strict_strtoll(pred->regex.pattern, 0, &val); + else + ret = strict_strtoull(pred->regex.pattern, 0, &val); + if (ret) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); + return -EINVAL; + } + pred->val = val; + + fn = select_comparison_fn(pred->op, field->size, + field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } + } + + if (pred->op == OP_NE) + pred->not = 1; + +add_pred_fn: + if (!dry_run) + return filter_add_pred_fn(ps, call, filter, pred, stack, fn); + return 0; +} + +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) +{ + memset(ps, '\0', sizeof(*ps)); + + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + + return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ + if (ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + 
return 1; + } + + return 0; +} + +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; + } + } + + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; +} + +static inline void clear_operand_string(struct filter_parse_state *ps) +{ + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} + +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while 
(!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + list_del(&elt->list); + kfree(elt->operand); + kfree(elt); + } +} + +static int filter_parse(struct filter_parse_state *ps) +{ + int in_string = 0; + int op, top_op; + char ch; + + while ((ch = infix_next(ps))) { + if (ch == '"') { + in_string ^= 1; + continue; + } + + if (in_string) + goto parse_operand; + + if (isspace(ch)) + continue; + + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); + continue; + } + + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } + + if (ch == ')') { + if 
(strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + continue; + } +parse_operand: + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; + } + } + + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); + } + + return 0; +} + +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred->field_name) { + kfree(pred); + return NULL; + } + + strcpy(pred->regex.pattern, operand2); + pred->regex.len = strlen(pred->regex.pattern); + + pred->op = op; + + return pred; +} + +static struct filter_pred *create_logical_pred(int op) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->op = op; + + return pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; + continue; + } + n_normal_preds++; + } + + if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); + return -EINVAL; + } + + return 0; +} 
+ +static int count_preds(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + int n_preds = 0; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + n_preds++; + } + + return n_preds; +} + +/* + * The tree is walked at filtering of an event. If the tree is not correctly + * built, it may cause an infinite loop. Check here that the tree does + * indeed terminate. + */ +static int check_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + int max; + + /* + * The max that we can hit a node is three times. + * Once going down, once coming up from left, and + * once coming up from right. This is more than enough + * since leafs are only hit a single time. + */ + max = 3 * filter->n_preds; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + if (WARN_ON(count++ > max)) + return -EINVAL; + + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + /* We are fine. 
*/ + return 0; +} + +static int count_leafs(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int done = 0; + + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + return 1; + count++; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return count; +} + +static int fold_pred(struct filter_pred *preds, struct filter_pred *root) +{ + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int count = 0; + int children; + int done = 0; + + /* No need to keep the fold flag */ + root->index &= ~FILTER_PRED_FOLD; + + /* If the root is a leaf then do nothing */ + if (root->left == FILTER_PRED_INVALID) + return 0; + + /* count the children */ + children = count_leafs(preds, &preds[root->left]); + children += count_leafs(preds, &preds[root->right]); + + root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL); + if (!root->ops) + return -ENOMEM; + + root->val = children; + + pred = root; + do { + switch (move) { + case MOVE_DOWN: + if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + if (WARN_ON(count == children)) + return -EINVAL; + pred->index &= ~FILTER_PRED_FOLD; + root->ops[count++] = pred->index; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + 
done = 1; + } while (!done); + + return 0; +} + +/* + * To optimize the processing of the ops, if we have several "ors" or + * "ands" together, we can put them in an array and process them all + * together speeding up the filter logic. + */ +static int fold_pred_tree(struct event_filter *filter, + struct filter_pred *root) +{ + struct filter_pred *preds; + struct filter_pred *pred; + enum move_type move = MOVE_DOWN; + int done = 0; + int err; + + preds = filter->preds; + if (!preds) + return -EINVAL; + pred = root; + + do { + switch (move) { + case MOVE_DOWN: + if (pred->index & FILTER_PRED_FOLD) { + err = fold_pred(preds, pred); + if (err) + return err; + /* Folded nodes are like leafs */ + } else if (pred->left != FILTER_PRED_INVALID) { + pred = &preds[pred->left]; + continue; + } + + /* A leaf at the root is just a leaf in the tree */ + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + case MOVE_UP_FROM_LEFT: + pred = &preds[pred->right]; + move = MOVE_DOWN; + continue; + case MOVE_UP_FROM_RIGHT: + if (pred == root) + break; + pred = get_pred_parent(pred, preds, + pred->parent, &move); + continue; + } + done = 1; + } while (!done); + + return 0; +} + +static int replace_preds(struct ftrace_event_call *call, + struct event_filter *filter, + struct filter_parse_state *ps, + char *filter_string, + bool dry_run) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct filter_pred *root; + struct postfix_elt *elt; + struct pred_stack stack = { }; /* init to NULL */ + int err; + int n_preds = 0; + + n_preds = count_preds(ps); + if (n_preds >= MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + return -ENOSPC; + } + + err = check_preds(ps); + if (err) + return err; + + if (!dry_run) { + err = __alloc_pred_stack(&stack, n_preds); + if (err) + return err; + err = __alloc_preds(filter, n_preds); + if (err) + goto fail; + } + + n_preds = 0; + list_for_each_entry(elt, 
&ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; + else { + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); + err = -EINVAL; + goto fail; + } + continue; + } + + if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); + err = -ENOSPC; + goto fail; + } + + if (elt->op == OP_AND || elt->op == OP_OR) { + pred = create_logical_pred(elt->op); + goto add_pred; + } + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + err = -EINVAL; + goto fail; + } + + pred = create_pred(elt->op, operand1, operand2); +add_pred: + if (!pred) { + err = -ENOMEM; + goto fail; + } + err = filter_add_pred(ps, call, filter, pred, &stack, dry_run); + filter_free_pred(pred); + if (err) + goto fail; + + operand1 = operand2 = NULL; + } + + if (!dry_run) { + /* We should have one item left on the stack */ + pred = __pop_pred_stack(&stack); + if (!pred) + return -EINVAL; + /* This item is where we start from in matching */ + root = pred; + /* Make sure the stack is empty */ + pred = __pop_pred_stack(&stack); + if (WARN_ON(pred)) { + err = -EINVAL; + filter->root = NULL; + goto fail; + } + err = check_pred_tree(filter, root); + if (err) + goto fail; + + /* Optimize the tree */ + err = fold_pred_tree(filter, root); + if (err) + goto fail; + + /* We don't set root until we know it works */ + barrier(); + filter->root = root; + } + + err = 0; +fail: + __free_pred_stack(&stack); + return err; +} + +struct filter_list { + struct list_head list; + struct event_filter *filter; +}; + +static int replace_system_preds(struct event_subsystem *system, + struct filter_parse_state *ps, + char *filter_string) +{ + struct ftrace_event_call *call; + struct filter_list *filter_item; + struct filter_list *tmp; + LIST_HEAD(filter_list); + bool fail = true; + int err; + + list_for_each_entry(call, &ftrace_events, list) { + + if (strcmp(call->class->system, 
system->name) != 0) + continue; + + /* + * Try to see if the filter can be applied + * (filter arg is ignored on dry_run) + */ + err = replace_preds(call, NULL, ps, filter_string, true); + if (err) + goto fail; + } + + list_for_each_entry(call, &ftrace_events, list) { + struct event_filter *filter; + + if (strcmp(call->class->system, system->name) != 0) + continue; + + filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL); + if (!filter_item) + goto fail_mem; + + list_add_tail(&filter_item->list, &filter_list); + + filter_item->filter = __alloc_filter(); + if (!filter_item->filter) + goto fail_mem; + filter = filter_item->filter; + + /* Can only fail on no memory */ + err = replace_filter_string(filter, filter_string); + if (err) + goto fail_mem; + + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + append_filter_err(ps, filter); + } else + call->flags |= TRACE_EVENT_FL_FILTERED; + /* + * Regardless of if this returned an error, we still + * replace the filter for the call. + */ + filter = call->filter; + rcu_assign_pointer(call->filter, filter_item->filter); + filter_item->filter = filter; + + fail = false; + } + + if (fail) + goto fail; + + /* + * The calls can still be using the old filters. + * Do a synchronize_sched() to ensure all calls are + * done with them before we free them. 
+ */ + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return 0; + fail: + /* No call succeeded */ + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + list_del(&filter_item->list); + kfree(filter_item); + } + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); + return -EINVAL; + fail_mem: + /* If any call succeeded, we still need to sync */ + if (!fail) + synchronize_sched(); + list_for_each_entry_safe(filter_item, tmp, &filter_list, list) { + __free_filter(filter_item->filter); + list_del(&filter_item->list); + kfree(filter_item); + } + return -ENOMEM; +} + +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + struct filter_parse_state *ps; + struct event_filter *filter; + struct event_filter *tmp; + int err = 0; + + mutex_lock(&event_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable(call); + filter = call->filter; + if (!filter) + goto out_unlock; + RCU_INIT_POINTER(call->filter, NULL); + /* Make sure the filter is not being used */ + synchronize_sched(); + __free_filter(filter); + goto out_unlock; + } + + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto out_unlock; + + filter = __alloc_filter(); + if (!filter) { + kfree(ps); + goto out_unlock; + } + + replace_filter_string(filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, filter); + goto out; + } + + err = replace_preds(call, filter, ps, filter_string, false); + if (err) { + filter_disable(call); + append_filter_err(ps, filter); + } else + call->flags |= TRACE_EVENT_FL_FILTERED; +out: + /* + * Always swap the call filter with the new filter + * even if there was an error. 
If there was an error + * in the filter, we disable the filter and show the error + * string + */ + tmp = call->filter; + rcu_assign_pointer(call->filter, filter); + if (tmp) { + /* Make sure the call is done with the filter */ + synchronize_sched(); + __free_filter(tmp); + } + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string) +{ + struct filter_parse_state *ps; + struct event_filter *filter; + int err = 0; + + mutex_lock(&event_mutex); + + /* Make sure the system still has events */ + if (!system->nr_events) { + err = -ENODEV; + goto out_unlock; + } + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(system); + remove_filter_string(system->filter); + filter = system->filter; + system->filter = NULL; + /* Ensure all filters are no longer used */ + synchronize_sched(); + filter_free_subsystem_filters(system); + __free_filter(filter); + goto out_unlock; + } + + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto out_unlock; + + filter = __alloc_filter(); + if (!filter) + goto out; + + replace_filter_string(filter, filter_string); + /* + * No event actually uses the system filter + * we can free it without synchronize_sched(). 
+ */ + __free_filter(system->filter); + system->filter = filter; + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + err = replace_system_preds(system, ps, filter_string); + if (err) + append_filter_err(ps, system->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#ifdef CONFIG_PERF_EVENTS + +void ftrace_profile_free_filter(struct perf_event *event) +{ + struct event_filter *filter = event->filter; + + event->filter = NULL; + __free_filter(filter); +} + +int ftrace_profile_set_filter(struct perf_event *event, int event_id, + char *filter_str) +{ + int err; + struct event_filter *filter; + struct filter_parse_state *ps; + struct ftrace_event_call *call = NULL; + + mutex_lock(&event_mutex); + + list_for_each_entry(call, &ftrace_events, list) { + if (call->event.type == event_id) + break; + } + + err = -EINVAL; + if (&call->list == &ftrace_events) + goto out_unlock; + + err = -EEXIST; + if (event->filter) + goto out_unlock; + + filter = __alloc_filter(); + if (!filter) { + err = PTR_ERR(filter); + goto out_unlock; + } + + err = -ENOMEM; + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + goto free_filter; + + parse_init(ps, filter_ops, filter_str); + err = filter_parse(ps); + if (err) + goto free_ps; + + err = replace_preds(call, filter, ps, filter_str, false); + if (!err) + event->filter = filter; + +free_ps: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + +free_filter: + if (err) + __free_filter(filter); + +out_unlock: + mutex_unlock(&event_mutex); + + return err; +} + +#endif /* CONFIG_PERF_EVENTS */ + diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c new file mode 100644 index 00000000..ad4000c7 --- /dev/null +++ b/kernel/trace/trace_export.c @@ -0,0 +1,173 @@ +/* + * trace_export.c - export basic ftrace utilities to user space + * + * 
Copyright (C) 2009 Steven Rostedt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace_output.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ftrace + +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) + +#undef __field +#define __field(type, item) type item; + +#undef __field_desc +#define __field_desc(type, container, item) type item; + +#undef __array +#define __array(type, item, size) type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size) type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item) type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...) args + +#undef F_printk +#define F_printk(fmt, args...) fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +struct ____ftrace_##name { \ + tstruct \ +}; \ +static void __always_unused ____ftrace_check_##name(void) \ +{ \ + struct ____ftrace_##name *__entry = NULL; \ + \ + /* force compile-time check on F_printk() */ \ + printk(print); \ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print) \ + FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __field_desc +#define __field_desc(type, container, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + do { \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + mutex_lock(&event_storage_mutex); \ + snprintf(event_storage, 
sizeof(event_storage), \ + "%s[%d]", #type, len); \ + ret = trace_define_field(event_call, event_storage, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), \ + is_signed_type(type), FILTER_OTHER); \ + mutex_unlock(&event_storage_mutex); \ + if (ret) \ + return ret; \ + } while (0); + +#undef __array_desc +#define __array_desc(type, container, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), \ + container.item), \ + sizeof(field.container.item), \ + is_signed_type(type), FILTER_OTHER); \ + if (ret) \ + return ret; + +#undef __dynamic_array +#define __dynamic_array(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + 0, is_signed_type(type), FILTER_OTHER);\ + if (ret) \ + return ret; + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ +int \ +ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ +{ \ + struct struct_name field; \ + int ret; \ + \ + tstruct; \ + \ + return ret; \ +} + +#include "trace_entries.h" + +#undef __entry +#define __entry REC + +#undef __field +#define __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, len) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef F_printk +#define F_printk(fmt, args...) 
__stringify(fmt) ", " __stringify(args) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + \ +struct ftrace_event_class event_class_ftrace_##call = { \ + .system = __stringify(TRACE_SYSTEM), \ + .define_fields = ftrace_define_fields_##call, \ + .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ +}; \ + \ +struct ftrace_event_call __used event_##call = { \ + .name = #call, \ + .event.type = etype, \ + .class = &event_class_ftrace_##call, \ + .print_fmt = print, \ +}; \ +struct ftrace_event_call __used \ +__attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; + +#include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c new file mode 100644 index 00000000..c7b0c6a7 --- /dev/null +++ b/kernel/trace/trace_functions.c @@ -0,0 +1,406 @@ +/* + * ring buffer based function tracer + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include + +#include "trace.h" + +/* function tracing enabled */ +static int ftrace_function_enabled; + +static struct trace_array *func_trace; + +static void tracing_start_function_trace(void); +static void tracing_stop_function_trace(void); + +static int function_trace_init(struct trace_array *tr) +{ + func_trace = tr; + tr->cpu = get_cpu(); + put_cpu(); + + tracing_start_cmdline_record(); + tracing_start_function_trace(); + return 0; +} + +static void function_trace_reset(struct trace_array *tr) +{ + tracing_stop_function_trace(); + tracing_stop_cmdline_record(); +} + +static void function_trace_start(struct trace_array *tr) +{ + tracing_reset_online_cpus(tr); +} + +static void +function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct 
trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + pc = preempt_count(); + preempt_disable_notrace(); + local_save_flags(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + trace_function(tr, ip, parent_ip, flags, pc); + + atomic_dec(&data->disabled); + preempt_enable_notrace(); +} + +static void +function_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. + */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, ip, parent_ip, flags, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +static void +function_stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = func_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + int pc; + + if (unlikely(!ftrace_function_enabled)) + return; + + /* + * Need to use raw, since this must be called before the + * recursive protection is performed. 
+ */ + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) { + pc = preempt_count(); + trace_function(tr, ip, parent_ip, flags, pc); + /* + * skip over 5 funcs: + * __ftrace_trace_stack, + * __trace_stack, + * function_stack_trace_call + * ftrace_list_func + * ftrace_call + */ + __trace_stack(tr, flags, 5, pc); + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = function_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, +}; + +static struct ftrace_ops trace_stack_ops __read_mostly = +{ + .func = function_stack_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, +}; + +/* Our two options */ +enum { + TRACE_FUNC_OPT_STACK = 0x1, +}; + +static struct tracer_opt func_opts[] = { +#ifdef CONFIG_STACKTRACE + { TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) }, +#endif + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags func_flags = { + .val = 0, /* By default: all flags disabled */ + .opts = func_opts +}; + +static void tracing_start_function_trace(void) +{ + ftrace_function_enabled = 0; + + if (trace_flags & TRACE_ITER_PREEMPTONLY) + trace_ops.func = function_trace_call_preempt_only; + else + trace_ops.func = function_trace_call; + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + register_ftrace_function(&trace_stack_ops); + else + register_ftrace_function(&trace_ops); + + ftrace_function_enabled = 1; +} + +static void tracing_stop_function_trace(void) +{ + ftrace_function_enabled = 0; + + if (func_flags.val & TRACE_FUNC_OPT_STACK) + unregister_ftrace_function(&trace_stack_ops); + else + unregister_ftrace_function(&trace_ops); +} + +static int func_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_FUNC_OPT_STACK) { + /* do nothing if already set */ + if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) + return 0; + + if (set) { + 
unregister_ftrace_function(&trace_ops); + register_ftrace_function(&trace_stack_ops); + } else { + unregister_ftrace_function(&trace_stack_ops); + register_ftrace_function(&trace_ops); + } + + return 0; + } + + return -EINVAL; +} + +static struct tracer function_trace __read_mostly = +{ + .name = "function", + .init = function_trace_init, + .reset = function_trace_reset, + .start = function_trace_start, + .wait_pipe = poll_wait_pipe, + .flags = &func_flags, + .set_flag = func_set_flag, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function, +#endif +}; + +#ifdef CONFIG_DYNAMIC_FTRACE +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ + long *count = (long *)data; + + if (tracing_is_on()) + return; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + long *count = (long *)data; + + if (!tracing_is_on()) + return; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_off(); +} + +static int +ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data); + +static struct ftrace_probe_ops traceon_probe_ops = { + .func = ftrace_traceon, + .print = ftrace_trace_onoff_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { + .func = ftrace_traceoff, + .print = ftrace_trace_onoff_print, +}; + +static int +ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + long count = (long)data; + + seq_printf(m, "%ps:", (void *)ip); + + if (ops == &traceon_probe_ops) + seq_printf(m, "traceon"); + else + seq_printf(m, "traceoff"); + + if (count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", count); + + return 0; +} + +static int +ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +{ + struct ftrace_probe_ops *ops; + + /* we 
register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = &traceon_probe_ops; + else + ops = &traceoff_probe_ops; + + unregister_ftrace_function_probe_func(glob, ops); + + return 0; +} + +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + if (glob[0] == '!') + return ftrace_trace_onoff_unreg(glob+1, cmd, param); + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = &traceon_probe_ops; + else + ops = &traceoff_probe_ops; + + if (!param) + goto out_reg; + + number = strsep(&param, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = strict_strtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_probe(glob, ops, count); + + return ret < 0 ? 
ret : 0; +} + +static struct ftrace_func_command ftrace_traceon_cmd = { + .name = "traceon", + .func = ftrace_trace_onoff_callback, +}; + +static struct ftrace_func_command ftrace_traceoff_cmd = { + .name = "traceoff", + .func = ftrace_trace_onoff_callback, +}; + +static int __init init_func_cmd_traceon(void) +{ + int ret; + + ret = register_ftrace_command(&ftrace_traceoff_cmd); + if (ret) + return ret; + + ret = register_ftrace_command(&ftrace_traceon_cmd); + if (ret) + unregister_ftrace_command(&ftrace_traceoff_cmd); + return ret; +} +#else +static inline int init_func_cmd_traceon(void) +{ + return 0; +} +#endif /* CONFIG_DYNAMIC_FTRACE */ + +static __init int init_function_trace(void) +{ + init_func_cmd_traceon(); + return register_tracer(&function_trace); +} +device_initcall(init_function_trace); + diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c new file mode 100644 index 00000000..962cdb24 --- /dev/null +++ b/kernel/trace/trace_functions_graph.c @@ -0,0 +1,1479 @@ +/* + * + * Function graph tracer. + * Copyright (c) 2008-2009 Frederic Weisbecker + * Mostly borrowed from function tracer which + * is Copyright (c) Steven Rostedt + * + */ +#include +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + +struct fgraph_cpu_data { + pid_t last_pid; + int depth; + int depth_irq; + int ignore; + unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; +}; + +struct fgraph_data { + struct fgraph_cpu_data __percpu *cpu_data; + + /* Place to preserve last processed entry. 
*/ + struct ftrace_graph_ent_entry ent; + struct ftrace_graph_ret_entry ret; + int failed; + int cpu; +}; + +#define TRACE_GRAPH_INDENT 2 + +/* Flag options */ +#define TRACE_GRAPH_PRINT_OVERRUN 0x1 +#define TRACE_GRAPH_PRINT_CPU 0x2 +#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 +#define TRACE_GRAPH_PRINT_PROC 0x8 +#define TRACE_GRAPH_PRINT_DURATION 0x10 +#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 + +static struct tracer_opt trace_opts[] = { + /* Display overruns? (for self-debug purpose) */ + { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, + /* Display CPU ? */ + { TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) }, + /* Display Overhead ? */ + { TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) }, + /* Display proc name/pid */ + { TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) }, + /* Display duration of execution */ + { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, + /* Display absolute time of an entry */ + { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + /* Don't display overruns and proc by default */ + .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, + .opts = trace_opts +}; + +static struct trace_array *graph_array; + + +/* Add a function return address to the trace stack on thread info.*/ +int +ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, + unsigned long frame_pointer) +{ + unsigned long long calltime; + int index; + + if (!current->ret_stack) + return -EBUSY; + + /* + * We must make sure the ret_stack is tested before we read + * anything else. 
+ */ + smp_rmb(); + + /* The return trace stack is full */ + if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) { + atomic_inc(&current->trace_overrun); + return -EBUSY; + } + + calltime = trace_clock_local(); + + index = ++current->curr_ret_stack; + barrier(); + current->ret_stack[index].ret = ret; + current->ret_stack[index].func = func; + current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; + current->ret_stack[index].fp = frame_pointer; + *depth = index; + + return 0; +} + +/* Retrieve a function return address from the trace stack on thread info. */ +static void +ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, + unsigned long frame_pointer) +{ + int index; + + index = current->curr_ret_stack; + + if (unlikely(index < 0)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic, otherwise we have nowhere to go */ + *ret = (unsigned long)panic; + return; + } + +#ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST + /* + * The arch may choose to record the frame pointer used + * and check it here to make sure that it is what we expect it + * to be. If gcc does not set the place holder of the return + * address in the frame pointer, and does a copy instead, then + * the function graph trace will fail. This test detects this + * case. + * + * Currently, x86_32 with optimize for size (-Os) makes the latest + * gcc do the above. 
+ */ + if (unlikely(current->ret_stack[index].fp != frame_pointer)) { + ftrace_graph_stop(); + WARN(1, "Bad frame pointer: expected %lx, received %lx\n" + " from func %ps return to %lx\n", + current->ret_stack[index].fp, + frame_pointer, + (void *)current->ret_stack[index].func, + current->ret_stack[index].ret); + *ret = (unsigned long)panic; + return; + } +#endif + + *ret = current->ret_stack[index].ret; + trace->func = current->ret_stack[index].func; + trace->calltime = current->ret_stack[index].calltime; + trace->overrun = atomic_read(&current->trace_overrun); + trace->depth = index; +} + +/* + * Send the trace to the ring-buffer. + * @return the original return address. + */ +unsigned long ftrace_return_to_handler(unsigned long frame_pointer) +{ + struct ftrace_graph_ret trace; + unsigned long ret; + + ftrace_pop_return_trace(&trace, &ret, frame_pointer); + trace.rettime = trace_clock_local(); + ftrace_graph_return(&trace); + barrier(); + current->curr_ret_stack--; + + if (unlikely(!ret)) { + ftrace_graph_stop(); + WARN_ON(1); + /* Might as well panic. What else to do? 
*/ + ret = (unsigned long)panic; + } + + return ret; +} + +int __trace_graph_entry(struct trace_array *tr, + struct ftrace_graph_ent *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_entry; + struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->buffer; + struct ftrace_graph_ent_entry *entry; + + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) + return 0; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, + sizeof(*entry), flags, pc); + if (!event) + return 0; + entry = ring_buffer_event_data(event); + entry->graph_ent = *trace; + if (!filter_current_check_discard(buffer, call, entry, event)) + ring_buffer_unlock_commit(buffer, event); + + return 1; +} + +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs) + return 0; + + return in_irq(); +} + +int trace_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int ret; + int cpu; + int pc; + + if (!ftrace_trace_task(current)) + return 0; + + /* trace it when it is-nested-in or is a function enabled. 
*/ + if (!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) + return 0; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); + } else { + ret = 0; + } + + atomic_dec(&data->disabled); + local_irq_restore(flags); + + return ret; +} + +int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +{ + if (tracing_thresh) + return 1; + else + return trace_graph_entry(trace); +} + +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, ip, flags, pc); +} + +void __trace_graph_return(struct trace_array *tr, + struct ftrace_graph_ret *trace, + unsigned long flags, + int pc) +{ + struct ftrace_event_call *call = &event_funcgraph_exit; + struct ring_buffer_event *event; + struct ring_buffer *buffer = tr->buffer; + struct ftrace_graph_ret_entry *entry; + + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) + return; + + event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->ret = *trace; + if (!filter_current_check_discard(buffer, call, entry, event)) + ring_buffer_unlock_commit(buffer, event); +} + +void trace_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = graph_array; + struct trace_array_cpu *data; + unsigned long 
flags; + long disabled; + int cpu; + int pc; + + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (likely(disabled == 1)) { + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); + } + atomic_dec(&data->disabled); + local_irq_restore(flags); +} + +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; + + /* Make graph_array visible before we start tracing */ + + smp_mb(); +} + +void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + +static int graph_trace_init(struct trace_array *tr) +{ + int ret; + + set_graph_array(tr); + if (tracing_thresh) + ret = register_ftrace_graph(&trace_graph_thresh_return, + &trace_graph_thresh_entry); + else + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); + if (ret) + return ret; + tracing_start_cmdline_record(); + + return 0; +} + +static void graph_trace_reset(struct trace_array *tr) +{ + tracing_stop_cmdline_record(); + unregister_ftrace_graph(); +} + +static int max_bytes_for_cpu; + +static enum print_line_t +print_graph_cpu(struct trace_seq *s, int cpu) +{ + int ret; + + /* + * Start with a space character - to make it stand out + * to the right a bit when trace output is pasted into + * email: + */ + ret = trace_seq_printf(s, " %*d) ", max_bytes_for_cpu, cpu); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +#define TRACE_GRAPH_PROCINFO_LENGTH 14 + +static enum print_line_t +print_graph_proc(struct trace_seq *s, pid_t pid) +{ + char comm[TASK_COMM_LEN]; + /* sign + log10(MAX_INT) + '\0' */ + char pid_str[11]; + int spaces = 0; + int ret; + int len; + int i; + + trace_find_cmdline(pid, comm); + comm[7] = '\0'; + sprintf(pid_str, "%d", pid); + + /* 1 stands for the "-" character */ + len = strlen(comm) + strlen(pid_str) + 1; 
+ + if (len < TRACE_GRAPH_PROCINFO_LENGTH) + spaces = TRACE_GRAPH_PROCINFO_LENGTH - len; + + /* First spaces to align center */ + for (i = 0; i < spaces / 2; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "%s-%s", comm, pid_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Last spaces to align center */ + for (i = 0; i < spaces - (spaces / 2); i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + + +static enum print_line_t +print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + if (!trace_seq_putc(s, ' ')) + return 0; + + return trace_print_lat_fmt(s, entry); +} + +/* If the pid changed since the last trace, output this event */ +static enum print_line_t +verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) +{ + pid_t prev_pid; + pid_t *last_pid; + int ret; + + if (!data) + return TRACE_TYPE_HANDLED; + + last_pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + + if (*last_pid == pid) + return TRACE_TYPE_HANDLED; + + prev_pid = *last_pid; + *last_pid = pid; + + if (prev_pid == -1) + return TRACE_TYPE_HANDLED; +/* + * Context-switch trace line: + + ------------------------------------------ + | 1) migration/0--1 => sshd-1755 + ------------------------------------------ + + */ + ret = trace_seq_printf(s, + " ------------------------------------------\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, prev_pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " => "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, + "\n 
------------------------------------------\n\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static struct ftrace_graph_ret_entry * +get_return_for_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *curr) +{ + struct fgraph_data *data = iter->private; + struct ring_buffer_iter *ring_iter = NULL; + struct ring_buffer_event *event; + struct ftrace_graph_ret_entry *next; + + /* + * If the previous output failed to write to the seq buffer, + * then we just reuse the data from before. + */ + if (data && data->failed) { + curr = &data->ent; + next = &data->ret; + } else { + + ring_iter = iter->buffer_iter[iter->cpu]; + + /* First peek to compare current entry and the next one */ + if (ring_iter) + event = ring_buffer_iter_peek(ring_iter, NULL); + else { + /* + * We need to consume the current entry to see + * the next one. + */ + ring_buffer_consume(iter->tr->buffer, iter->cpu, + NULL, NULL); + event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + NULL, NULL); + } + + if (!event) + return NULL; + + next = ring_buffer_event_data(event); + + if (data) { + /* + * Save current and next entries for later reference + * if the output fails. + */ + data->ent = *curr; + /* + * If the next event is not a return type, then + * we only care about what type it is. Otherwise we can + * safely copy the entire event. 
+ */ + if (next->ent.type == TRACE_GRAPH_RET) + data->ret = *next; + else + data->ret.ent.type = next->ent.type; + } + } + + if (next->ent.type != TRACE_GRAPH_RET) + return NULL; + + if (curr->ent.pid != next->ent.pid || + curr->graph_ent.func != next->ret.func) + return NULL; + + /* this is a leaf, now advance the iterator */ + if (ring_iter) + ring_buffer_read(ring_iter, NULL); + + return next; +} + +/* Signal a overhead of time execution to the output */ +static int +print_graph_overhead(unsigned long long duration, struct trace_seq *s, + u32 flags) +{ + /* If duration disappear, we don't need anything */ + if (!(flags & TRACE_GRAPH_PRINT_DURATION)) + return 1; + + /* Non nested entry or return */ + if (duration == -1) + return trace_seq_printf(s, " "); + + if (flags & TRACE_GRAPH_PRINT_OVERHEAD) { + /* Duration exceeded 100 msecs */ + if (duration > 100000ULL) + return trace_seq_printf(s, "! "); + + /* Duration exceeded 10 msecs */ + if (duration > 10000ULL) + return trace_seq_printf(s, "+ "); + } + + return trace_seq_printf(s, " "); +} + +static int print_graph_abs_time(u64 t, struct trace_seq *s) +{ + unsigned long usecs_rem; + + usecs_rem = do_div(t, NSEC_PER_SEC); + usecs_rem /= 1000; + + return trace_seq_printf(s, "%5lu.%06lu | ", + (unsigned long)t, usecs_rem); +} + +static enum print_line_t +print_graph_irq(struct trace_iterator *iter, unsigned long addr, + enum trace_type type, int cpu, pid_t pid, u32 flags) +{ + int ret; + struct trace_seq *s = &iter->seq; + + if (addr < (unsigned long)__irqentry_text_start || + addr >= (unsigned long)__irqentry_text_end) + return TRACE_TYPE_UNHANDLED; + + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (flags & 
TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* No overhead */ + ret = print_graph_overhead(-1, s, flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + if (type == TRACE_GRAPH_ENT) + ret = trace_seq_printf(s, "==========>"); + else + ret = trace_seq_printf(s, "<=========="); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Don't close the duration column if haven't one */ + if (flags & TRACE_GRAPH_PRINT_DURATION) + trace_seq_printf(s, " |"); + ret = trace_seq_printf(s, "\n"); + + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + return TRACE_TYPE_HANDLED; +} + +enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + unsigned long nsecs_rem = do_div(duration, 1000); + /* log10(ULONG_MAX) + '\0' */ + char msecs_str[21]; + char nsecs_str[5]; + int ret, len; + int i; + + sprintf(msecs_str, "%lu", (unsigned long) duration); + + /* Print msecs */ + ret = trace_seq_printf(s, "%s", msecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + len = strlen(msecs_str); + + /* Print nsecs (we don't want to exceed 7 numbers) */ + if (len < 7) { + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); + ret = trace_seq_printf(s, ".%s", nsecs_str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + len += strlen(nsecs_str); + } + + ret = trace_seq_printf(s, " us "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Print remaining spaces to fit the row's width */ + for (i = len; i < 7; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + int ret; + + ret = trace_print_graph_duration(duration, s); + if (ret != TRACE_TYPE_HANDLED) 
+ return ret; + + ret = trace_seq_printf(s, "| "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +/* Case of a leaf function on its call entry */ +static enum print_line_t +print_graph_entry_leaf(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct ftrace_graph_ret_entry *ret_entry, + struct trace_seq *s, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct ftrace_graph_ret *graph_ret; + struct ftrace_graph_ent *call; + unsigned long long duration; + int ret; + int i; + + graph_ret = &ret_entry->ret; + call = &entry->graph_ent; + duration = graph_ret->rettime - graph_ret->calltime; + + if (data) { + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + + /* + * Comments display at + 1 to depth. Since + * this is a leaf function, keep the comments + * equal to this depth. + */ + cpu_data->depth = call->depth - 1; + + /* No need to keep this function around for this depth */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = 0; + } + + /* Overhead */ + ret = print_graph_overhead(duration, s, flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Duration */ + if (flags & TRACE_GRAPH_PRINT_DURATION) { + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "%ps();\n", (void *)call->func); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_entry_nested(struct trace_iterator *iter, + struct ftrace_graph_ent_entry *entry, + struct trace_seq *s, int cpu, u32 flags) +{ + struct ftrace_graph_ent *call = &entry->graph_ent; + struct fgraph_data *data = iter->private; + int ret; + int i; + + if 
(data) { + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + cpu_data->depth = call->depth; + + /* Save this function pointer to see if the exit matches */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = call->func; + } + + /* No overhead */ + ret = print_graph_overhead(-1, s, flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* No time */ + if (flags & TRACE_GRAPH_PRINT_DURATION) { + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Function */ + for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* + * we already consumed the current entry to check the next one + * and see if this is a leaf. + */ + return TRACE_TYPE_NO_CONSUME; +} + +static enum print_line_t +print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, + int type, unsigned long addr, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct trace_entry *ent = iter->ent; + int cpu = iter->cpu; + int ret; + + /* Pid */ + if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + if (type) { + /* Interrupt */ + ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Absolute time */ + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) { + ret = print_graph_abs_time(iter->ts, s); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Cpu */ + if (flags & TRACE_GRAPH_PRINT_CPU) { + ret = print_graph_cpu(s, cpu); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Proc */ + if (flags & TRACE_GRAPH_PRINT_PROC) { + ret = print_graph_proc(s, ent->pid); + if (ret == TRACE_TYPE_PARTIAL_LINE) + 
return TRACE_TYPE_PARTIAL_LINE; + + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Latency format */ + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + ret = print_graph_lat_fmt(s, ent); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + return 0; +} + +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just entered irq code + * + * returns 0 if + * - funcgraph-irqs option is set + * - there is no private iterator data (called as a graph event) + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. + */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-irqs option is set + * - there is no private iterator data (called as a graph event) + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + int *depth_irq; + struct fgraph_data *data = iter->private; + + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) + return 0; + + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + /* + * We are not inside the irq code. 
+ */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. + */ + return 1; +} + +/* + * Print a function-entry event: as a single-line leaf call when + * get_return_for_leaf() finds the matching return right behind it, + * otherwise as the opening line of a nested block. + */ +static enum print_line_t +print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, + struct trace_iterator *iter, u32 flags) +{ + struct fgraph_data *data = iter->private; + struct ftrace_graph_ent *call = &field->graph_ent; + struct ftrace_graph_ret_entry *leaf_ret; + /* Plain local: assigned on both branches below before it is read. */ + enum print_line_t ret; + int cpu = iter->cpu; + + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; + + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) + return TRACE_TYPE_PARTIAL_LINE; + + leaf_ret = get_return_for_leaf(iter, field); + if (leaf_ret) + ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags); + else + ret = print_graph_entry_nested(iter, field, s, cpu, flags); + + if (data) { + /* + * If we failed to write our output, then we need to make + * note of it. Because we already consumed our entry. 
+ */ + if (s->full) { + data->failed = 1; + data->cpu = cpu; + } else + data->failed = 0; + } + + return ret; +} + +static enum print_line_t +print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, + struct trace_entry *ent, struct trace_iterator *iter, + u32 flags) +{ + unsigned long long duration = trace->rettime - trace->calltime; + struct fgraph_data *data = iter->private; + pid_t pid = ent->pid; + int cpu = iter->cpu; + int func_match = 1; + int ret; + int i; + + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + + if (data) { + struct fgraph_cpu_data *cpu_data; + int cpu = iter->cpu; + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + + /* + * Comments display at + 1 to depth. This is the + * return from a function, we now want the comments + * to display at the same level of the bracket. + */ + cpu_data->depth = trace->depth - 1; + + if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (cpu_data->enter_funcs[trace->depth] != trace->func) + func_match = 0; + cpu_data->enter_funcs[trace->depth] = 0; + } + } + + if (print_graph_prologue(iter, s, 0, 0, flags)) + return TRACE_TYPE_PARTIAL_LINE; + + /* Overhead */ + ret = print_graph_overhead(duration, s, flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* Duration */ + if (flags & TRACE_GRAPH_PRINT_DURATION) { + ret = print_graph_duration(duration, s); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Closing brace */ + for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. 
+ */ + if (func_match) { + ret = trace_seq_printf(s, "}\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Overrun */ + if (flags & TRACE_GRAPH_PRINT_OVERRUN) { + ret = trace_seq_printf(s, " (Overruns: %lu)\n", + trace->overrun); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, + cpu, pid, flags); + if (ret == TRACE_TYPE_PARTIAL_LINE) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_comment(struct trace_seq *s, struct trace_entry *ent, + struct trace_iterator *iter, u32 flags) +{ + unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); + struct fgraph_data *data = iter->private; + struct trace_event *event; + int depth = 0; + int ret; + int i; + + if (data) + depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; + + if (print_graph_prologue(iter, s, 0, 0, flags)) + return TRACE_TYPE_PARTIAL_LINE; + + /* No overhead */ + ret = print_graph_overhead(-1, s, flags); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + /* No time */ + if (flags & TRACE_GRAPH_PRINT_DURATION) { + ret = trace_seq_printf(s, " | "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* Indentation */ + if (depth > 0) + for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) { + ret = trace_seq_printf(s, " "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } + + /* The comment */ + ret = trace_seq_printf(s, "/* "); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + switch (iter->ent->type) { + case TRACE_BPRINT: + ret = trace_print_bprintk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + case TRACE_PRINT: + ret = trace_print_printk_msg_only(iter); + if (ret != TRACE_TYPE_HANDLED) + return ret; + break; + default: + event = ftrace_find_event(ent->type); + if (!event) + return TRACE_TYPE_UNHANDLED; + + ret = 
event->funcs->trace(iter, sym_flags, event); + if (ret != TRACE_TYPE_HANDLED) + return ret; + } + + /* Strip ending newline */ + if (s->buffer[s->len - 1] == '\n') { + s->buffer[s->len - 1] = '\0'; + s->len--; + } + + ret = trace_seq_printf(s, " */\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + + +enum print_line_t +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) +{ + struct ftrace_graph_ent_entry *field; + struct fgraph_data *data = iter->private; + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + int cpu = iter->cpu; + int ret; + + if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) { + per_cpu_ptr(data->cpu_data, cpu)->ignore = 0; + return TRACE_TYPE_HANDLED; + } + + /* + * If the last output failed, there's a possibility we need + * to print out the missing entry which would never go out. + */ + if (data && data->failed) { + field = &data->ent; + iter->cpu = data->cpu; + ret = print_graph_entry(field, s, iter, flags); + if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { + per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; + ret = TRACE_TYPE_NO_CONSUME; + } + iter->cpu = cpu; + return ret; + } + + switch (entry->type) { + case TRACE_GRAPH_ENT: { + /* + * print_graph_entry() may consume the current event, + * thus @field may become invalid, so we need to save it. + * sizeof(struct ftrace_graph_ent_entry) is very small, + * it can be safely saved at the stack. 
+ */ + struct ftrace_graph_ent_entry saved; + trace_assign_type(field, entry); + saved = *field; + return print_graph_entry(&saved, s, iter, flags); + } + case TRACE_GRAPH_RET: { + struct ftrace_graph_ret_entry *field; + trace_assign_type(field, entry); + return print_graph_return(&field->ret, s, entry, iter, flags); + } + case TRACE_STACK: + case TRACE_FN: + /* dont trace stack and functions as comments */ + return TRACE_TYPE_UNHANDLED; + + default: + return print_graph_comment(s, entry, iter, flags); + } + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_function(struct trace_iterator *iter) +{ + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); +} + +static enum print_line_t +print_graph_function_event(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return print_graph_function(iter); +} + +static void print_lat_header(struct seq_file *s, u32 flags) +{ + static const char spaces[] = " " /* 16 spaces */ + " " /* 4 spaces */ + " "; /* 17 spaces */ + int size = 0; + + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + size += 16; + if (flags & TRACE_GRAPH_PRINT_CPU) + size += 4; + if (flags & TRACE_GRAPH_PRINT_PROC) + size += 17; + + seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); + seq_printf(s, "#%.*s / _----=> need-resched \n", size, spaces); + seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); + seq_printf(s, "#%.*s|| / _--=> preempt-depth \n", size, spaces); + seq_printf(s, "#%.*s||| / _-=> lock-depth \n", size, spaces); + seq_printf(s, "#%.*s|||| / \n", size, spaces); +} + +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + + if (lat) 
+ print_lat_header(s, flags); + + /* 1st line */ + seq_printf(s, "#"); + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, " TIME "); + if (flags & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, " CPU"); + if (flags & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, " TASK/PID "); + if (lat) + seq_printf(s, "|||||"); + if (flags & TRACE_GRAPH_PRINT_DURATION) + seq_printf(s, " DURATION "); + seq_printf(s, " FUNCTION CALLS\n"); + + /* 2nd line */ + seq_printf(s, "#"); + if (flags & TRACE_GRAPH_PRINT_ABS_TIME) + seq_printf(s, " | "); + if (flags & TRACE_GRAPH_PRINT_CPU) + seq_printf(s, " | "); + if (flags & TRACE_GRAPH_PRINT_PROC) + seq_printf(s, " | | "); + if (lat) + seq_printf(s, "|||||"); + if (flags & TRACE_GRAPH_PRINT_DURATION) + seq_printf(s, " | | "); + seq_printf(s, " | | | |\n"); +} + +void print_graph_headers(struct seq_file *s) +{ + print_graph_headers_flags(s, tracer_flags.val); +} + +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + +void graph_trace_open(struct trace_iterator *iter) +{ + /* pid and depth on the last trace processed */ + struct fgraph_data *data; + int cpu; + + iter->private = NULL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out_err; + + data->cpu_data = alloc_percpu(struct fgraph_cpu_data); + if (!data->cpu_data) + goto out_err_free; + + for_each_possible_cpu(cpu) { + pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); + int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + *pid = -1; + *depth = 0; + *ignore = 0; + *depth_irq = -1; + 
} + + iter->private = data; + + return; + + out_err_free: + kfree(data); + out_err: + pr_warning("function graph tracer: not enough memory\n"); +} + +void graph_trace_close(struct trace_iterator *iter) +{ + struct fgraph_data *data = iter->private; + + if (data) { + free_percpu(data->cpu_data); + kfree(data); + } +} + +static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + return 0; +} + +static struct trace_event_functions graph_functions = { + .trace = print_graph_function_event, +}; + +static struct trace_event graph_trace_entry_event = { + .type = TRACE_GRAPH_ENT, + .funcs = &graph_functions, +}; + +static struct trace_event graph_trace_ret_event = { + .type = TRACE_GRAPH_RET, + .funcs = &graph_functions +}; + +static struct tracer graph_trace __read_mostly = { + .name = "function_graph", + .open = graph_trace_open, + .pipe_open = graph_trace_open, + .close = graph_trace_close, + .pipe_close = graph_trace_close, + .wait_pipe = poll_wait_pipe, + .init = graph_trace_init, + .reset = graph_trace_reset, + .print_line = print_graph_function, + .print_header = print_graph_headers, + .flags = &tracer_flags, + .set_flag = func_graph_set_flag, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_function_graph, +#endif +}; + +static __init int init_graph_trace(void) +{ + max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); + + if (!register_ftrace_event(&graph_trace_entry_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + + if (!register_ftrace_event(&graph_trace_ret_event)) { + pr_warning("Warning: could not register graph trace events\n"); + return 1; + } + + return register_tracer(&graph_trace); +} + +device_initcall(init_graph_trace); diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 00000000..c77424be --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,687 @@ 
+/*
+ * trace irqs off critical timings
+ *
+ * Copyright (C) 2007-2008 Steven Rostedt
+ * Copyright (C) 2008 Ingo Molnar
+ *
+ * From code in the latency_tracer, that is:
+ *
+ *  Copyright (C) 2004-2006 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "trace.h"
+/* Trace array in use and the tracer's global on/off switch. */
+static struct trace_array		*irqsoff_trace __read_mostly;
+static int				tracer_enabled __read_mostly;
+/* Set to 1 while this CPU is inside a timed critical section. */
+static DEFINE_PER_CPU(int, tracing_cpu);
+/* Taken in check_critical_timing() around the max-latency update. */
+static DEFINE_SPINLOCK(max_trace_lock);
+/* Bits of trace_type selecting which critical sections are timed. */
+enum {
+	TRACER_IRQS_OFF		= (1 << 1),
+	TRACER_PREEMPT_OFF	= (1 << 2),
+};
+
+static int trace_type __read_mostly;
+/* TRACE_ITER_LATENCY_FMT bit as it was before this tracer forced it on. */
+static int save_lat_flag;
+
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
+static int start_irqsoff_tracer(struct trace_array *tr, int graph);
+/* Nonzero when preempt-off timing is selected and preempt_count() != 0. */
+#ifdef CONFIG_PREEMPT_TRACER
+static inline int
+preempt_trace(void)
+{
+	return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
+}
+#else
+# define preempt_trace() (0)
+#endif
+/* Nonzero when irqs-off timing is selected and irqs are disabled now. */
+#ifdef CONFIG_IRQSOFF_TRACER
+static inline int
+irq_trace(void)
+{
+	return ((trace_type & TRACER_IRQS_OFF) &&
+		irqs_disabled());
+}
+#else
+# define irq_trace() (0)
+#endif
+/* tracer_flags.val bit tested by is_graph(). */
+#define TRACE_DISPLAY_GRAPH	1
+
+static struct tracer_opt trace_opts[] = {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	/* display latency trace as call graph */
+	{ TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
+#endif
+	{ } /* Empty entry */
+};
+
+static struct tracer_flags tracer_flags = {
+	.val  = 0,
+	.opts = trace_opts,
+};
+
+#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc.
Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesn't
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp	unsigned long max_sequence;
+
+#ifdef CONFIG_FUNCTION_TRACER
+/*
+ * Prologue for the preempt and irqs off function tracers.
+ *
+ * Returns 1 if it is OK to continue, and data->disabled is
+ *            incremented.
+ *         0 if the trace is to be ignored, and data->disabled
+ *            is kept the same.
+ * When 1 is returned, *data and *flags have been filled in for the caller.
+ * Note, this function is also used outside this ifdef but
+ *  inside the #ifdef of the function graph tracer below.
+ *  This is OK, since the function graph tracer is
+ *  dependent on the function tracer.
+ */
+static int func_prolog_dec(struct trace_array *tr,
+			   struct trace_array_cpu **data,
+			   unsigned long *flags)
+{
+	long disabled;
+	int cpu;
+
+	/*
+	 * Does not matter if we preempt. We test the flags
+	 * afterward, to see if irqs are disabled or not.
+	 * If we preempt and get a false positive, the flags
+	 * test will fail.
+	 */
+	cpu = raw_smp_processor_id();
+	if (likely(!per_cpu(tracing_cpu, cpu)))
+		return 0;
+
+	local_save_flags(*flags);
+	/* slight chance to get a false positive on tracing_cpu */
+	if (!irqs_disabled_flags(*flags))
+		return 0;
+
+	*data = tr->data[cpu];
+	disabled = atomic_inc_return(&(*data)->disabled);
+
+	if (likely(disabled == 1))
+		return 1;
+	/* disabled was already raised by someone else: undo our increment */
+	atomic_dec(&(*data)->disabled);
+
+	return 0;
+}
+
+/*
+ * irqsoff uses its own tracer function to keep the overhead down:
+ */
+static void
+irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (!func_prolog_dec(tr, &data, &flags))
+		return;
+
+	trace_function(tr, ip, parent_ip, flags, preempt_count());
+
+	atomic_dec(&data->disabled);
+}
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = irqsoff_tracer_call,
+	.flags = FTRACE_OPS_FL_GLOBAL,
+};
+#endif /* CONFIG_FUNCTION_TRACER */
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+	int cpu;
+	/* only the display-graph option is handled here */
+	if (!(bit & TRACE_DISPLAY_GRAPH))
+		return -EINVAL;
+
+	if (!(is_graph() ^ set))
+		return 0;
+	/* stop in the old mode (!set), clear state, restart in the new mode */
+	stop_irqsoff_tracer(irqsoff_trace, !set);
+
+	for_each_possible_cpu(cpu)
+		per_cpu(tracing_cpu, cpu) = 0;
+
+	tracing_max_latency = 0;
+	tracing_reset_online_cpus(irqsoff_trace);
+
+	return start_irqsoff_tracer(irqsoff_trace, set);
+}
+/* Graph entry hook: records only when func_prolog_dec() allows it. */
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	int ret;
+	int pc;
+
+	if (!func_prolog_dec(tr, &data, &flags))
+		return 0;
+
+	pc = preempt_count();
+	ret = __trace_graph_entry(tr, trace, flags, pc);
+	atomic_dec(&data->disabled);
+
+	return ret;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
+{
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	int pc;
+
+	if
(!func_prolog_dec(tr, &data, &flags))
+		return;
+
+	pc = preempt_count();
+	__trace_graph_return(tr, trace, flags, pc);
+	atomic_dec(&data->disabled);
+}
+/* Iterator open: graph_trace_open() is called only in graph mode. */
+static void irqsoff_trace_open(struct trace_iterator *iter)
+{
+	if (is_graph())
+		graph_trace_open(iter);
+
+}
+/* Iterator close: hand iter->private back to the graph code if it is set. */
+static void irqsoff_trace_close(struct trace_iterator *iter)
+{
+	if (iter->private)
+		graph_trace_close(iter);
+}
+/* Graph print flags passed to the graph output helpers below. */
+#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
+			    TRACE_GRAPH_PRINT_PROC)
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+	/*
+	 * In graph mode call the graph tracer output function,
+	 * otherwise go with the TRACE_FN event handler
+	 */
+	if (is_graph())
+		return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS);
+
+	return TRACE_TYPE_UNHANDLED;
+}
+/* Header: graph header in graph mode, the default header otherwise. */
+static void irqsoff_print_header(struct seq_file *s)
+{
+	if (is_graph())
+		print_graph_headers_flags(s, GRAPH_TRACER_FLAGS);
+	else
+		trace_default_header(s);
+}
+/* Record one function entry in whichever format is currently selected. */
+static void
+__trace_function(struct trace_array *tr,
+		 unsigned long ip, unsigned long parent_ip,
+		 unsigned long flags, int pc)
+{
+	if (is_graph())
+		trace_graph_function(tr, ip, parent_ip, flags, pc);
+	else
+		trace_function(tr, ip, parent_ip, flags, pc);
+}
+
+#else
+#define __trace_function trace_function
+/* Stubs used when CONFIG_FUNCTION_GRAPH_TRACER is not set. */
+static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
+{
+	return -EINVAL;
+}
+
+static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
+{
+	return -1;
+}
+
+static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
+{
+	return TRACE_TYPE_UNHANDLED;
+}
+
+static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
+static void irqsoff_print_header(struct seq_file *s) { }
+static void irqsoff_trace_open(struct trace_iterator *iter) { }
+static void irqsoff_trace_close(struct trace_iterator *iter) { }
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+/*
+ * Should this new latency be reported/recorded?
+ */
+static int report_latency(cycle_t delta)
+{
+	if (tracing_thresh) {
+		if (delta < tracing_thresh)
+			return 0;
+	} else {
+		if (delta <= tracing_max_latency)
+			return 0;
+	}
+	return 1;
+}
+/* Close a timed section and record it as the new maximum if it qualifies. */
+static void
+check_critical_timing(struct trace_array *tr,
+		      struct trace_array_cpu *data,
+		      unsigned long parent_ip,
+		      int cpu)
+{
+	cycle_t T0, T1, delta;
+	unsigned long flags;
+	int pc;
+
+	T0 = data->preempt_timestamp;
+	T1 = ftrace_now(cpu);
+	delta = T1-T0;
+
+	local_save_flags(flags);
+
+	pc = preempt_count();
+
+	if (!report_latency(delta))
+		goto out;
+
+	spin_lock_irqsave(&max_trace_lock, flags);
+
+	/* check if we are still the max latency */
+	if (!report_latency(delta))
+		goto out_unlock;
+
+	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+	/* Skip 5 functions to get to the irq/preempt enable function */
+	__trace_stack(tr, flags, 5, pc);
+	/* max_sequence moved since this section started: do not record it */
+	if (data->critical_sequence != max_sequence)
+		goto out_unlock;
+
+	data->critical_end = parent_ip;
+
+	if (likely(!is_tracing_stopped())) {
+		tracing_max_latency = delta;
+		update_max_tr_single(tr, current, cpu);
+	}
+
+	max_sequence++;
+
+out_unlock:
+	spin_unlock_irqrestore(&max_trace_lock, flags);
+	/* in every case, restart the measurement from this point */
+out:
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = ftrace_now(cpu);
+	__trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
+}
+/* Begin timing a critical section on this CPU unless one is already open. */
+static inline void
+start_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	if (likely(!tracer_enabled))
+		return;
+
+	cpu = raw_smp_processor_id();
+
+	if (per_cpu(tracing_cpu, cpu))
+		return;
+
+	data = tr->data[cpu];
+
+	if (unlikely(!data) || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	data->critical_sequence = max_sequence;
+	data->preempt_timestamp = ftrace_now(cpu);
+	data->critical_start = parent_ip ?
: ip;
+
+	local_save_flags(flags);
+
+	__trace_function(tr, ip, parent_ip, flags, preempt_count());
+	/* from here on this cpu counts as inside a timed critical section */
+	per_cpu(tracing_cpu, cpu) = 1;
+
+	atomic_dec(&data->disabled);
+}
+/* End the timed section on this CPU and evaluate its latency. */
+static inline void
+stop_critical_timing(unsigned long ip, unsigned long parent_ip)
+{
+	int cpu;
+	struct trace_array *tr = irqsoff_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+
+	cpu = raw_smp_processor_id();
+	/* Always clear the tracing cpu on stopping the trace */
+	if (unlikely(per_cpu(tracing_cpu, cpu)))
+		per_cpu(tracing_cpu, cpu) = 0;
+	else
+		return;
+
+	if (!tracer_enabled)
+		return;
+
+	data = tr->data[cpu];
+
+	if (unlikely(!data) ||
+	    !data->critical_start || atomic_read(&data->disabled))
+		return;
+
+	atomic_inc(&data->disabled);
+
+	local_save_flags(flags);
+	__trace_function(tr, ip, parent_ip, flags, preempt_count());
+	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
+	data->critical_start = 0;
+	atomic_dec(&data->disabled);
+}
+
+/* start and stop critical timings, used for stoppage (in idle) */
+void start_critical_timings(void)
+{
+	if (preempt_trace() || irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL_GPL(start_critical_timings);
+
+void stop_critical_timings(void)
+{
+	if (preempt_trace() || irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL_GPL(stop_critical_timings);
+
+#ifdef CONFIG_IRQSOFF_TRACER
+#ifdef CONFIG_PROVE_LOCKING
+void time_hardirqs_on(unsigned long a0, unsigned long a1)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(a0, a1);
+}
+
+void time_hardirqs_off(unsigned long a0, unsigned long a1)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(a0, a1);
+}
+
+#else /* !CONFIG_PROVE_LOCKING */
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+
+/*
+ * We are only interested in hardirq on/off
events:
+ */
+void trace_hardirqs_on(void)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+/* The _caller variants take the parent address from their argument. */
+void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+	if (!preempt_trace() && irq_trace())
+		stop_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+	if (!preempt_trace() && irq_trace())
+		start_critical_timing(CALLER_ADDR0, caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+#endif /* CONFIG_PROVE_LOCKING */
+#endif /*  CONFIG_IRQSOFF_TRACER */
+
+#ifdef CONFIG_PREEMPT_TRACER
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+	if (preempt_trace())
+		stop_critical_timing(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+	if (preempt_trace())
+		start_critical_timing(a0, a1);
+}
+#endif /* CONFIG_PREEMPT_TRACER */
+/* Hook up the function or graph callbacks; returns the register result. */
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
+{
+	int ret = 0;
+
+	if (!graph)
+		ret = register_ftrace_function(&trace_ops);
+	else
+		ret = register_ftrace_graph(&irqsoff_graph_return,
+					    &irqsoff_graph_entry);
+
+	if (!ret && tracing_is_enabled())
+		tracer_enabled = 1;
+	else
+		tracer_enabled = 0;
+
+	return ret;
+}
+/* Disable timing, then unhook the callbacks registered for this mode. */
+static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
+{
+	tracer_enabled = 0;
+
+	if (!graph)
+		unregister_ftrace_function(&trace_ops);
+	else
+		unregister_ftrace_graph();
+}
+/* Common init: force the latency format, reset state, start the hooks. */
+static void __irqsoff_tracer_init(struct trace_array *tr)
+{
+	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
+	trace_flags |= TRACE_ITER_LATENCY_FMT;
+
+	tracing_max_latency = 0;
+	irqsoff_trace = tr;
+	/* make sure that the tracer is visible */
+	smp_wmb();
+	tracing_reset_online_cpus(tr);
+
+	if (start_irqsoff_tracer(tr,
is_graph()))
+		printk(KERN_ERR "failed to start irqsoff tracer\n");
+}
+/* Stop the hooks; clear the latency format unless it was on before init. */
+static void irqsoff_tracer_reset(struct trace_array *tr)
+{
+	stop_irqsoff_tracer(tr, is_graph());
+
+	if (!save_lat_flag)
+		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+}
+/* The start/stop callbacks only flip the tracer_enabled switch. */
+static void irqsoff_tracer_start(struct trace_array *tr)
+{
+	tracer_enabled = 1;
+}
+
+static void irqsoff_tracer_stop(struct trace_array *tr)
+{
+	tracer_enabled = 0;
+}
+/* "irqsoff" tracer: trace_type selects TRACER_IRQS_OFF only. */
+#ifdef CONFIG_IRQSOFF_TRACER
+static int irqsoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_IRQS_OFF;
+
+	__irqsoff_tracer_init(tr);
+	return 0;
+}
+static struct tracer irqsoff_tracer __read_mostly =
+{
+	.name		= "irqsoff",
+	.init		= irqsoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.start		= irqsoff_tracer_start,
+	.stop		= irqsoff_tracer_stop,
+	.print_max	= 1,
+	.print_header   = irqsoff_print_header,
+	.print_line     = irqsoff_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= irqsoff_set_flag,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_irqsoff,
+#endif
+	.open           = irqsoff_trace_open,
+	.close          = irqsoff_trace_close,
+	.use_max_tr	= 1,
+};
+# define register_irqsoff(trace) register_tracer(&trace)
+#else
+# define register_irqsoff(trace) do { } while (0)
+#endif
+/* "preemptoff" tracer: trace_type selects TRACER_PREEMPT_OFF only. */
+#ifdef CONFIG_PREEMPT_TRACER
+static int preemptoff_tracer_init(struct trace_array *tr)
+{
+	trace_type = TRACER_PREEMPT_OFF;
+
+	__irqsoff_tracer_init(tr);
+	return 0;
+}
+
+static struct tracer preemptoff_tracer __read_mostly =
+{
+	.name		= "preemptoff",
+	.init		= preemptoff_tracer_init,
+	.reset		= irqsoff_tracer_reset,
+	.start		= irqsoff_tracer_start,
+	.stop		= irqsoff_tracer_stop,
+	.print_max	= 1,
+	.print_header   = irqsoff_print_header,
+	.print_line     = irqsoff_print_line,
+	.flags		= &tracer_flags,
+	.set_flag	= irqsoff_set_flag,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_preemptoff,
+#endif
+	.open		= irqsoff_trace_open,
+	.close		= irqsoff_trace_close,
+	.use_max_tr	= 1,
+};
+# define register_preemptoff(trace)
register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static int preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); + return 0; +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .start = irqsoff_tracer_start, + .stop = irqsoff_tracer_stop, + .print_max = 1, + .print_header = irqsoff_print_header, + .print_line = irqsoff_print_line, + .flags = &tracer_flags, + .set_flag = irqsoff_set_flag, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_preemptirqsoff, +#endif + .open = irqsoff_trace_open, + .close = irqsoff_trace_close, + .use_max_tr = 1, +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif + +__init static int init_irqsoff_tracer(void) +{ + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c new file mode 100644 index 00000000..3c5c5dfe --- /dev/null +++ b/kernel/trace/trace_kdb.c @@ -0,0 +1,135 @@ +/* + * kdb helper for dumping the ftrace buffer + * + * Copyright (C) 2010 Jason Wessel + * + * ftrace_dump_buf based on ftrace_dump: + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + */ +#include +#include +#include +#include + +#include "trace.h" +#include "trace_output.h" + +static void ftrace_dump_buf(int skip_lines, long cpu_file) +{ + /* use static because iter can be a bit big for the stack */ + static struct trace_iterator iter; + unsigned int old_userobj; + int cnt = 0, cpu; + + trace_init_global_iter(&iter); 
+ + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + old_userobj = trace_flags; + + /* don't look at user memory in panic mode */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + kdb_printf("Dumping ftrace buffer:\n"); + + /* reset all but tr, trace, and overruns */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + iter.iter_flags |= TRACE_FILE_LAT_FMT; + iter.pos = -1; + + if (cpu_file == TRACE_PIPE_ALL_CPU) { + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + } else { + iter.cpu_file = cpu_file; + iter.buffer_iter[cpu_file] = + ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_start(iter.buffer_iter[cpu_file]); + tracing_iter_reset(&iter, cpu_file); + } + if (!trace_empty(&iter)) + trace_find_next_entry_inc(&iter); + while (!trace_empty(&iter)) { + if (!cnt) + kdb_printf("---------------------------------\n"); + cnt++; + + if (trace_find_next_entry_inc(&iter) != NULL && !skip_lines) + print_trace_line(&iter); + if (!skip_lines) + trace_printk_seq(&iter.seq); + else + skip_lines--; + if (KDB_FLAG(CMD_INTERRUPT)) + goto out; + } + + if (!cnt) + kdb_printf(" (ftrace buffer empty)\n"); + else + kdb_printf("---------------------------------\n"); + +out: + trace_flags = old_userobj; + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + for_each_tracing_cpu(cpu) + if (iter.buffer_iter[cpu]) + ring_buffer_read_finish(iter.buffer_iter[cpu]); +} + +/* + * kdb_ftdump - Dump the ftrace log buffer + */ +static int kdb_ftdump(int argc, const char **argv) +{ + int skip_lines = 0; + long cpu_file; + char *cp; + + if (argc > 2) + return KDB_ARGCOUNT; + + if (argc) { + skip_lines = simple_strtol(argv[1], &cp, 0); + if (*cp) + skip_lines = 0; + } + + if (argc == 2) { + cpu_file = simple_strtol(argv[2], &cp, 0); 
+ if (*cp || cpu_file >= NR_CPUS || cpu_file < 0 || + !cpu_online(cpu_file)) + return KDB_BADINT; + } else { + cpu_file = TRACE_PIPE_ALL_CPU; + } + + kdb_trap_printk++; + ftrace_dump_buf(skip_lines, cpu_file); + kdb_trap_printk--; + + return 0; +} + +static __init int kdb_ftrace_register(void) +{ + kdb_register_repeat("ftdump", kdb_ftdump, "[skip_#lines] [cpu]", + "Dump ftrace log", 0, KDB_REPEAT_NONE); + return 0; +} + +late_initcall(kdb_ftrace_register); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 00000000..27d13b36 --- /dev/null +++ b/kernel/trace/trace_kprobe.c @@ -0,0 +1,1951 @@ +/* + * Kprobes-based tracing events + * + * Created by Masami Hiramatsu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define MAX_TRACE_ARGS 128
+#define MAX_ARGSTR_LEN 63
+#define MAX_EVENT_NAME_LEN 64
+#define MAX_STRING_SIZE PATH_MAX
+#define KPROBE_EVENT_SYSTEM "kprobes"
+
+/* Reserved field names */
+#define FIELD_STRING_IP "__probe_ip"
+#define FIELD_STRING_RETIP "__probe_ret_ip"
+#define FIELD_STRING_FUNC "__probe_func"
+
+const char *reserved_field_names[] = {
+	"common_type",
+	"common_flags",
+	"common_preempt_count",
+	"common_pid",
+	"common_tgid",
+	FIELD_STRING_IP,
+	FIELD_STRING_RETIP,
+	FIELD_STRING_FUNC,
+};
+
+/* Printing function type */
+typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *,
+				 void *);
+#define PRINT_TYPE_FUNC_NAME(type)	print_type_##type
+#define PRINT_TYPE_FMT_NAME(type)	print_type_format_##type
+
+/* Printing in basic type function template */
+#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast)			\
+static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s,	\
+						const char *name,	\
+						void *data, void *ent)\
+{									\
+	return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
+}									\
+static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
+/* The cast argument converts the stored value to what the format expects. */
+DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
+DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
+
+/* data_rloc: data relative location,
compatible with u32 (length << 16 | offset) */
+#define make_data_rloc(len, roffs)	\
+	(((u32)(len) << 16) | ((u32)(roffs) & 0xffff))
+#define get_rloc_len(dl)	((u32)(dl) >> 16)
+#define get_rloc_offs(dl)	((u32)(dl) & 0xffff)
+/* Data address: the address of the rloc word itself plus its offset. */
+static inline void *get_rloc_data(u32 *dl)
+{
+	return (u8 *)dl + get_rloc_offs(*dl);
+}
+
+/* For data_loc conversion */
+static inline void *get_loc_data(u32 *dl, void *ent)
+{
+	return (u8 *)ent + get_rloc_offs(*dl);
+}
+
+/*
+ * Convert data_rloc to data_loc:
+ *  data_rloc stores the offset from data_rloc itself, but data_loc
+ *  stores the offset from event entry.
+ */
+#define convert_rloc_to_loc(dl, offs)	((u32)(dl) + (offs))
+
+/* For defining macros, define string/string_size types */
+typedef u32 string;
+typedef u32 string_size;
+
+/* Print type function for string type */
+static __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
+						  const char *name,
+						  void *data, void *ent)
+{
+	int len = *(u32 *)data >> 16;
+	/* a zero length is printed as "(fault)" instead of a string */
+	if (!len)
+		return trace_seq_printf(s, " %s=(fault)", name);
+	else
+		return trace_seq_printf(s, " %s=\"%s\"", name,
+					(const char *)get_loc_data(data, ent));
+}
+static const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
+
+/* Data fetch function type */
+typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
+/* A fetch function together with the data argument it is called with. */
+struct fetch_param {
+	fetch_func_t	fn;
+	void *data;
+};
+/* Invoke a bound fetch function, storing the fetched value at dest. */
+static __kprobes void call_fetch(struct fetch_param *fprm,
+				 struct pt_regs *regs, void *dest)
+{
+	return fprm->fn(regs, fprm->data, dest);
+}
+
+#define FETCH_FUNC_NAME(method, type)	fetch_##method##_##type
+/*
+ * Define macro for basic types - we don't need to define s* types, because
+ * we have to care only about bitwidth at recording time.
+ */
+#define DEFINE_BASIC_FETCH_FUNCS(method) \
+DEFINE_FETCH_##method(u8)		\
+DEFINE_FETCH_##method(u16)		\
+DEFINE_FETCH_##method(u32)		\
+DEFINE_FETCH_##method(u64)
+/* True if fn is non-NULL and is one of this method's fetch functions. */
+#define CHECK_FETCH_FUNCS(method, fn)			\
+	(((FETCH_FUNC_NAME(method, u8) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, u16) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, u32) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, u64) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, string) == fn) ||	\
+	  (FETCH_FUNC_NAME(method, string_size) == fn)) \
+	 && (fn != NULL))
+
+/* Data fetch function templates */
+#define DEFINE_FETCH_reg(type)						\
+static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs,	\
+					void *offset, void *dest)	\
+{									\
+	*(type *)dest = (type)regs_get_register(regs,			\
+				(unsigned int)((unsigned long)offset));	\
+}
+DEFINE_BASIC_FETCH_FUNCS(reg)
+/* No string on the register */
+#define fetch_reg_string NULL
+#define fetch_reg_string_size NULL
+/* Fetch the Nth kernel stack entry; the offset argument carries N. */
+#define DEFINE_FETCH_stack(type)					\
+static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
+					  void *offset, void *dest)	\
+{									\
+	*(type *)dest = (type)regs_get_kernel_stack_nth(regs,		\
+				(unsigned int)((unsigned long)offset));	\
+}
+DEFINE_BASIC_FETCH_FUNCS(stack)
+/* No string on the stack entry */
+#define fetch_stack_string NULL
+#define fetch_stack_string_size NULL
+/* Fetch regs_return_value(regs); the data argument is unused. */
+#define DEFINE_FETCH_retval(type)					\
+static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
+					  void *dummy, void *dest)	\
+{									\
+	*(type *)dest = (type)regs_return_value(regs);			\
+}
+DEFINE_BASIC_FETCH_FUNCS(retval)
+/* No string on the retval */
+#define fetch_retval_string NULL
+#define fetch_retval_string_size NULL
+/* Fetch from a kernel address; a failed probe stores 0 instead. */
+#define DEFINE_FETCH_memory(type)					\
+static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
+					  void *addr, void *dest)	\
+{									\
+	type retval;							\
+	if (probe_kernel_address(addr, retval))				\
+		*(type *)dest = 0;					\
+	else								\
+		*(type *)dest = retval;					\
+}
+DEFINE_BASIC_FETCH_FUNCS(memory)
+/*
+ * Fetch a null-terminated string.
Caller MUST set *(u32 *)dest with max
+ * length and relative data location. On a fault the length is set to 0.
+ */
+static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+						      void *addr, void *dest)
+{
+	long ret;
+	int maxlen = get_rloc_len(*(u32 *)dest);
+	u8 *dst = get_rloc_data(dest);
+	u8 *src = addr;
+	mm_segment_t old_fs = get_fs();
+	if (!maxlen)
+		return;
+	/*
+	 * Try to get string again, since the string can be changed while
+	 * probing.
+	 */
+	set_fs(KERNEL_DS);
+	pagefault_disable();
+	do
+		ret = __copy_from_user_inatomic(dst++, src++, 1);
+	while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
+	dst[-1] = '\0';
+	pagefault_enable();
+	set_fs(old_fs);
+	/* the copy returns the count of bytes NOT copied, never a negative */
+	if (ret) {	/* Failed to fetch string */
+		((u8 *)get_rloc_data(dest))[0] = '\0';
+		*(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
+	} else
+		*(u32 *)dest = make_data_rloc(src - (u8 *)addr,
+					      get_rloc_offs(*(u32 *)dest));
+}
+/* Return the length of string -- including null terminal byte */
+static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+							void *addr, void *dest)
+{
+	int ret, len = 0;
+	u8 c;
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	pagefault_disable();
+	do {
+		ret = __copy_from_user_inatomic(&c, (u8 *)addr + len, 1);
+		len++;
+	} while (c && ret == 0 && len < MAX_STRING_SIZE);
+	pagefault_enable();
+	set_fs(old_fs);
+	/* nonzero ret means the last byte could not be read (fault) */
+	if (ret)	/* Failed to check the length */
+		*(u32 *)dest = 0;
+	else
+		*(u32 *)dest = len;
+}
+
+/* Memory fetching by symbol */
+struct symbol_cache {
+	char *symbol;
+	long offset;
+	unsigned long addr;
+};
+/* Re-resolve the symbol; addr is symbol + offset, or 0 if not found. */
+static unsigned long update_symbol_cache(struct symbol_cache *sc)
+{
+	sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
+	if (sc->addr)
+		sc->addr += sc->offset;
+	return sc->addr;
+}
+
+static void free_symbol_cache(struct symbol_cache *sc)
+{
+	kfree(sc->symbol);
+	kfree(sc);
+}
+
+static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
+{
+	struct symbol_cache *sc;
+
+	if (!sym
|| strlen(sym) == 0)
+		return NULL;
+	sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
+	if (!sc)
+		return NULL;
+
+	sc->symbol = kstrdup(sym, GFP_KERNEL);
+	if (!sc->symbol) {
+		kfree(sc);
+		return NULL;
+	}
+	sc->offset = offset;
+	/* resolve now; the result of the lookup is not checked here */
+	update_symbol_cache(sc);
+	return sc;
+}
+/* Fetch from the cached symbol address; an addr of 0 stores 0. */
+#define DEFINE_FETCH_symbol(type)					\
+static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
+					  void *data, void *dest)	\
+{									\
+	struct symbol_cache *sc = data;					\
+	if (sc->addr)							\
+		fetch_memory_##type(regs, (void *)sc->addr, dest);	\
+	else								\
+		*(type *)dest = 0;					\
+}
+DEFINE_BASIC_FETCH_FUNCS(symbol)
+DEFINE_FETCH_symbol(string)
+DEFINE_FETCH_symbol(string_size)
+
+/* Dereference memory access function */
+struct deref_fetch_param {
+	struct fetch_param orig;
+	long offset;
+};
+/* Fetch an address via orig, add offset, then read memory at the result. */
+#define DEFINE_FETCH_deref(type)					\
+static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
+					    void *data, void *dest)	\
+{									\
+	struct deref_fetch_param *dprm = data;				\
+	unsigned long addr;						\
+	call_fetch(&dprm->orig, regs, &addr);				\
+	if (addr) {							\
+		addr += dprm->offset;					\
+		fetch_memory_##type(regs, (void *)addr, dest);		\
+	} else								\
+		*(type *)dest = 0;					\
+}
+DEFINE_BASIC_FETCH_FUNCS(deref)
+DEFINE_FETCH_deref(string)
+DEFINE_FETCH_deref(string_size)
+/* Free a deref parameter, first releasing what its inner fetch owns. */
+static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
+{
+	if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+		free_deref_fetch_param(data->orig.data);
+	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+		free_symbol_cache(data->orig.data);
+	kfree(data);
+}
+
+/* Bitfield fetch function */
+struct bitfield_fetch_param {
+	struct fetch_param orig;
+	unsigned char hi_shift;
+	unsigned char low_shift;
+};
+/* Fetch via orig, then shift left by hi_shift and right by low_shift. */
+#define DEFINE_FETCH_bitfield(type)					\
+static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
+					    void *data, void *dest)	\
+{									\
+	struct bitfield_fetch_param *bprm = data;			\
+	type buf = 0;							\
+	call_fetch(&bprm->orig, regs, &buf);				\
+	if (buf) {							\
+		buf <<= bprm->hi_shift;					\
+		buf >>= bprm->low_shift;				\
+	}								\
+	*(type *)dest = buf;						\
+}
+DEFINE_BASIC_FETCH_FUNCS(bitfield)
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+static __kprobes void
+free_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+	/*
+	 * Don't check the bitfield itself, because this must be the
+	 * last fetch function.
+	 */
+	if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+		free_deref_fetch_param(data->orig.data);
+	else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+		free_symbol_cache(data->orig.data);
+	kfree(data);
+}
+/* Default (unsigned long) fetch type */
+#define __DEFAULT_FETCH_TYPE(t) u##t
+#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
+#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
+#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
+
+/* Fetch methods: each value indexes the fetch[] array of a fetch type */
+enum {
+	FETCH_MTD_reg = 0,
+	FETCH_MTD_stack,
+	FETCH_MTD_retval,
+	FETCH_MTD_memory,
+	FETCH_MTD_symbol,
+	FETCH_MTD_deref,
+	FETCH_MTD_bitfield,
+	FETCH_MTD_END,
+};
+/* Designated-initializer slot for one method's fetch function. */
+#define ASSIGN_FETCH_FUNC(method, type)	\
+	[FETCH_MTD_##method] = FETCH_FUNC_NAME(method, type)
+
+#define __ASSIGN_FETCH_TYPE(_name, ptype, ftype, _size, sign, _fmttype)	\
+	{.name = _name,				\
+	 .size = _size,					\
+	 .is_signed = sign,				\
+	 .print = PRINT_TYPE_FUNC_NAME(ptype),		\
+	 .fmt = PRINT_TYPE_FMT_NAME(ptype),		\
+	 .fmttype = _fmttype,				\
+	 .fetch = {					\
+ASSIGN_FETCH_FUNC(reg, ftype),				\
+ASSIGN_FETCH_FUNC(stack, ftype),			\
+ASSIGN_FETCH_FUNC(retval, ftype),			\
+ASSIGN_FETCH_FUNC(memory, ftype),			\
+ASSIGN_FETCH_FUNC(symbol, ftype),			\
+ASSIGN_FETCH_FUNC(deref, ftype),			\
+ASSIGN_FETCH_FUNC(bitfield, ftype),			\
+	  }						\
+	}
+
+#define ASSIGN_FETCH_TYPE(ptype, ftype, sign)			\
+	__ASSIGN_FETCH_TYPE(#ptype, ptype, ftype, sizeof(ftype), sign, #ptype)
+/* Fixed fetch_type_table[] indexes of the two special string types. */
+#define FETCH_TYPE_STRING 0
+#define FETCH_TYPE_STRSIZE 1
+
+/* Fetch type information table */
+static const struct fetch_type {
+	const char	*name;		/* Name of type */
+	size_t		size;		/* Byte size of type */
+	int		is_signed;	/*
Signed flag */
+	print_type_func_t	print;	/* Print functions */
+	const char	*fmt;		/* Format string */
+	const char	*fmttype;	/* Name in format file */
+	/* Fetch functions */
+	fetch_func_t	fetch[FETCH_MTD_END];
+} fetch_type_table[] = {
+	/* Special types */
+	[FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string,
+					sizeof(u32), 1, "__data_loc char[]"),
+	[FETCH_TYPE_STRSIZE] = __ASSIGN_FETCH_TYPE("string_size", u32,
+					string_size, sizeof(u32), 0, "u32"),
+	/* Basic types */
+	ASSIGN_FETCH_TYPE(u8,  u8,  0),
+	ASSIGN_FETCH_TYPE(u16, u16, 0),
+	ASSIGN_FETCH_TYPE(u32, u32, 0),
+	ASSIGN_FETCH_TYPE(u64, u64, 0),
+	ASSIGN_FETCH_TYPE(s8,  u8,  1),
+	ASSIGN_FETCH_TYPE(s16, u16, 1),
+	ASSIGN_FETCH_TYPE(s32, u32, 1),
+	ASSIGN_FETCH_TYPE(s64, u64, 1),
+};
+/* Find a fetch type by name (NULL means the default); NULL if unknown. */
+static const struct fetch_type *find_fetch_type(const char *type)
+{
+	int i;
+
+	if (!type)
+		type = DEFAULT_FETCH_TYPE_STR;
+
+	/* Special case: bitfield */
+	if (*type == 'b') {
+		unsigned long bs;
+		type = strchr(type, '/');
+		if (!type)
+			goto fail;
+		type++;
+		if (strict_strtoul(type, 0, &bs))
+			goto fail;
+		switch (bs) {
+		case 8:
+			return find_fetch_type("u8");
+		case 16:
+			return find_fetch_type("u16");
+		case 32:
+			return find_fetch_type("u32");
+		case 64:
+			return find_fetch_type("u64");
+		default:
+			goto fail;
+		}
+	}
+	/* plain type name: linear search of the table */
+	for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
+		if (strcmp(type, fetch_type_table[i].name) == 0)
+			return &fetch_type_table[i];
+fail:
+	return NULL;
+}
+
+/* Special function : only accept unsigned long */
+static __kprobes void fetch_stack_address(struct pt_regs *regs,
+					  void *dummy, void *dest)
+{
+	*(unsigned long *)dest = kernel_stack_pointer(regs);
+}
+/* For the string type, return the string_size fetcher of orig_fn's method. */
+static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
+					    fetch_func_t orig_fn)
+{
+	int i;
+
+	if (type != &fetch_type_table[FETCH_TYPE_STRING])
+		return NULL;	/* Only string type needs size function */
+	for (i = 0; i < FETCH_MTD_END; i++)
+		if (type->fetch[i] == orig_fn)
+			return
fetch_type_table[FETCH_TYPE_STRSIZE].fetch[i];
+
+	WARN_ON(1);	/* This should not happen */
+	return NULL;
+}
+
+/*
+ * Kprobe event core functions
+ */
+
+struct probe_arg {
+	struct fetch_param	fetch;		/* How to fetch the value */
+	struct fetch_param	fetch_size;	/* Size fetcher; fn is NULL unless string */
+	unsigned int		offset;	/* Offset from argument entry */
+	const char		*name;	/* Name of this argument */
+	const char		*comm;	/* Command of this argument */
+	const struct fetch_type	*type;	/* Type of this argument */
+};
+
+/* Flags for trace_probe */
+#define TP_FLAG_TRACE	1
+#define TP_FLAG_PROFILE	2
+
+struct trace_probe {
+	struct list_head	list;	/* Entry in probe_list */
+	struct kretprobe	rp;	/* Use rp.kp for kprobe use */
+	unsigned long		nhit;	/* Hit counter shown by the profile file */
+	unsigned int		flags;	/* For TP_FLAG_* */
+	const char		*symbol;	/* symbol name */
+	struct ftrace_event_class	class;
+	struct ftrace_event_call	call;
+	ssize_t			size;		/* trace entry size */
+	unsigned int		nr_args;	/* Number of used entries in args[] */
+	struct probe_arg	args[];
+};
+
+/* Allocation size of a trace_probe carrying @n arguments */
+#define SIZEOF_TRACE_PROBE(n) \
+	(offsetof(struct trace_probe, args) + \
+	(sizeof(struct probe_arg) * (n)))
+
+/* A probe is a return probe when its kretprobe handler is set */
+static __kprobes int probe_is_return(struct trace_probe *tp)
+{
+	return tp->rp.handler != NULL;
+}
+
+static __kprobes const char *probe_symbol(struct trace_probe *tp)
+{
+	return tp->symbol ? tp->symbol : "unknown";
+}
+
+static int register_probe_event(struct trace_probe *tp);
+static void unregister_probe_event(struct trace_probe *tp);
+
+static DEFINE_MUTEX(probe_lock);
+static LIST_HEAD(probe_list);
+
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
+static int kretprobe_dispatcher(struct kretprobe_instance *ri,
+				struct pt_regs *regs);
+
+/*
+ * Check the name is good for event/group/fields
+ * The first character must be a letter or '_'; the remaining ones may
+ * also be digits. Returns 1 if the name is good, 0 otherwise.
+ */
+static int is_good_name(const char *name)
+{
+	if (!isalpha(*name) && *name != '_')
+		return 0;
+	while (*++name != '\0') {
+		if (!isalpha(*name) && !isdigit(*name) && *name != '_')
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Allocate new trace_probe and initialize it (including kprobes).
+ * Returns the new probe, or ERR_PTR(-ENOMEM) on allocation failure and
+ * ERR_PTR(-EINVAL) when @event or @group is missing or not a good name.
+ */
+static struct trace_probe *alloc_trace_probe(const char *group,
+					     const char *event,
+					     void *addr,
+					     const char *symbol,
+					     unsigned long offs,
+					     int nargs, int is_return)
+{
+	struct trace_probe *tp;
+	int ret = -ENOMEM;
+
+	tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
+	if (!tp)
+		return ERR_PTR(ret);
+
+	/* Probe point: either symbol+offs or a raw address */
+	if (symbol) {
+		tp->symbol = kstrdup(symbol, GFP_KERNEL);
+		if (!tp->symbol)
+			goto error;
+		tp->rp.kp.symbol_name = tp->symbol;
+		tp->rp.kp.offset = offs;
+	} else
+		tp->rp.kp.addr = addr;
+
+	if (is_return)
+		tp->rp.handler = kretprobe_dispatcher;
+	else
+		tp->rp.kp.pre_handler = kprobe_dispatcher;
+
+	if (!event || !is_good_name(event)) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	tp->call.class = &tp->class;
+	tp->call.name = kstrdup(event, GFP_KERNEL);
+	if (!tp->call.name)
+		goto error;
+
+	if (!group || !is_good_name(group)) {
+		ret = -EINVAL;
+		goto error;
+	}
+
+	tp->class.system = kstrdup(group, GFP_KERNEL);
+	if (!tp->class.system)
+		goto error;
+
+	INIT_LIST_HEAD(&tp->list);
+	return tp;
+error:
+	/* tp came from kzalloc(), so strings not yet duplicated are NULL */
+	kfree(tp->call.name);
+	kfree(tp->symbol);
+	kfree(tp);
+	return ERR_PTR(ret);
+}
+/* Free what one argument owns: its fetch data (by fetch method) and strings */
+static void free_probe_arg(struct probe_arg *arg)
+{
+	if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+		free_bitfield_fetch_param(arg->fetch.data);
+	else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+		free_deref_fetch_param(arg->fetch.data);
+	else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
+		free_symbol_cache(arg->fetch.data);
+	kfree(arg->name);
+	kfree(arg->comm);
+}
+/* Free a trace_probe: the first nr_args arguments, its strings, then itself */
+static void free_trace_probe(struct trace_probe *tp)
+{
+	int i;
+
+	for (i = 0; i < tp->nr_args; i++)
+		free_probe_arg(&tp->args[i]);
+
+	kfree(tp->call.class->system);
+	kfree(tp->call.name);
+	kfree(tp->symbol);
+	kfree(tp);
+}
+/* Look up a probe on probe_list by event name and group name */
+static struct trace_probe *find_probe_event(const char *event,
+					    const char *group)
+{
+	struct trace_probe *tp;
+
+	list_for_each_entry(tp, &probe_list, list)
+		if (strcmp(tp->call.name, event) == 0 &&
+		    strcmp(tp->call.class->system, group) == 0)
+			return tp;
+	return NULL;
+}
+
+/*
+ * Unregister a trace_probe and probe_event: call with probe_lock held.
+ * The probe is removed from probe_list but not freed.
+ */
+static void unregister_trace_probe(struct trace_probe *tp)
+{
+	if (probe_is_return(tp))
+		unregister_kretprobe(&tp->rp);
+	else
+		unregister_kprobe(&tp->rp.kp);
+	list_del(&tp->list);
+	unregister_probe_event(tp);
+}
+
+/*
+ * Register a trace_probe and probe_event
+ * Takes probe_lock itself. An existing probe with the same event and
+ * group name is unregistered and freed first. On success the probe is
+ * appended to probe_list; on failure the caller still owns @tp.
+ */
+static int register_trace_probe(struct trace_probe *tp)
+{
+	struct trace_probe *old_tp;
+	int ret;
+
+	mutex_lock(&probe_lock);
+
+	/* register as an event */
+	old_tp = find_probe_event(tp->call.name, tp->call.class->system);
+	if (old_tp) {
+		/* delete old event */
+		unregister_trace_probe(old_tp);
+		free_trace_probe(old_tp);
+	}
+	ret = register_probe_event(tp);
+	if (ret) {
+		pr_warning("Failed to register probe event(%d)\n", ret);
+		goto end;
+	}
+
+	/* The probe is registered with the disabled flag set */
+	tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
+	if (probe_is_return(tp))
+		ret = register_kretprobe(&tp->rp);
+	else
+		ret = register_kprobe(&tp->rp.kp);
+
+	if (ret) {
+		pr_warning("Could not insert probe(%d)\n", ret);
+		if (ret == -EILSEQ) {
+			pr_warning("Probing address(0x%p) is not an "
+				   "instruction boundary.\n",
+				   tp->rp.kp.addr);
+			ret = -EINVAL;
+		}
+		/* Undo the event registration done above */
+		unregister_probe_event(tp);
+	} else
+		list_add_tail(&tp->list, &probe_list);
+end:
+	mutex_unlock(&probe_lock);
+	return ret;
+}
+
+/* Split symbol and offset.
*/
+/*
+ * "SYM+OFFS" is cut at the '+' in place (the '+' becomes '\0') and OFFS
+ * is stored in *offset; a plain "SYM" yields offset 0.
+ * Returns 0 on success, -EINVAL if @offset is NULL, or the error from
+ * strict_strtoul() (in which case @symbol is left untouched).
+ */
+static int split_symbol_offset(char *symbol, unsigned long *offset)
+{
+	char *tmp;
+	int ret;
+
+	if (!offset)
+		return -EINVAL;
+
+	tmp = strchr(symbol, '+');
+	if (tmp) {
+		/* skip sign because strict_strtoul doesn't accept '+' */
+		ret = strict_strtoul(tmp + 1, 0, offset);
+		if (ret)
+			return ret;
+		*tmp = '\0';
+	} else
+		*offset = 0;
+	return 0;
+}
+
+#define PARAM_MAX_ARGS 16
+#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
+
+/*
+ * Parse the '$' variables (@arg points just past the '$'):
+ *  retval : only valid for return probes
+ *  stack  : stack address, only valid with the default fetch type
+ *  stackN : Nth stack entry, N is decimal and at most PARAM_MAX_STACK
+ * Returns 0 and fills @f on success, -EINVAL otherwise.
+ */
+static int parse_probe_vars(char *arg, const struct fetch_type *t,
+			    struct fetch_param *f, int is_return)
+{
+	int ret = 0;
+	unsigned long param;
+
+	if (strcmp(arg, "retval") == 0) {
+		if (is_return)
+			f->fn = t->fetch[FETCH_MTD_retval];
+		else
+			ret = -EINVAL;
+	} else if (strncmp(arg, "stack", 5) == 0) {
+		if (arg[5] == '\0') {
+			if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
+				f->fn = fetch_stack_address;
+			else
+				ret = -EINVAL;
+		} else if (isdigit(arg[5])) {
+			ret = strict_strtoul(arg + 5, 10, &param);
+			if (ret || param > PARAM_MAX_STACK)
+				ret = -EINVAL;
+			else {
+				f->fn = t->fetch[FETCH_MTD_stack];
+				f->data = (void *)param;
+			}
+		} else
+			ret = -EINVAL;
+	} else
+		ret = -EINVAL;
+	return ret;
+}
+
+/*
+ * Recursive argument parser
+ * Dispatches on the first character of @arg: '$' variable, '%' register,
+ * '@' memory address or symbol, '+'/'-' dereference of a nested argument.
+ */
+static int __parse_probe_arg(char *arg, const struct fetch_type *t,
+			     struct fetch_param *f, int is_return)
+{
+	int ret = 0;
+	unsigned long param;
+	long offset;
+	char *tmp;
+
+	switch (arg[0]) {
+	case '$':
+		ret = parse_probe_vars(arg + 1, t, f, is_return);
+		break;
+	case '%':	/* named register */
+		ret = regs_query_register_offset(arg + 1);
+		if (ret >= 0) {
+			f->fn = t->fetch[FETCH_MTD_reg];
+			f->data = (void *)(unsigned long)ret;
+			ret = 0;
+		}
+		break;
+	case '@':	/* memory or symbol */
+		if (isdigit(arg[1])) {
+			ret = strict_strtoul(arg + 1, 0, &param);
+			if (ret)
+				break;
+			f->fn = t->fetch[FETCH_MTD_memory];
+			f->data = (void *)param;
+		} else {
+			/*
+			 * split_symbol_offset() takes unsigned long *; offset
+			 * is a long because the deref case needs a sign.
+			 */
+			ret = split_symbol_offset(arg + 1,
+						  (unsigned long *)&offset);
+			if (ret)
+				break;
+			f->data = alloc_symbol_cache(arg + 1, offset);
+			if
(f->data)
+				f->fn = t->fetch[FETCH_MTD_symbol];
+		}
+		break;
+	case '+':	/* deref memory */
+		arg++;	/* Skip '+', because strict_strtol() rejects it; then fall through to share the '-' handling. */
+	case '-':
+		tmp = strchr(arg, '(');
+		if (!tmp)
+			break;
+		*tmp = '\0';
+		ret = strict_strtol(arg, 0, &offset);
+		if (ret)
+			break;
+		arg = tmp + 1;
+		tmp = strrchr(arg, ')');
+		if (tmp) {
+			struct deref_fetch_param *dprm;
+			const struct fetch_type *t2 = find_fetch_type(NULL);	/* inner argument uses the default type */
+			*tmp = '\0';
+			dprm = kzalloc(sizeof(struct deref_fetch_param),
+				       GFP_KERNEL);
+			if (!dprm)
+				return -ENOMEM;
+			dprm->offset = offset;
+			ret = __parse_probe_arg(arg, t2, &dprm->orig,
+						is_return);
+			if (ret)
+				kfree(dprm);
+			else {
+				f->fn = t->fetch[FETCH_MTD_deref];
+				f->data = (void *)dprm;
+			}
+		}
+		break;
+	}
+	if (!ret && !f->fn) {	/* Parsed, but do not find fetch method */
+		pr_info("%s type has no corresponding fetch method.\n",
+			t->name);
+		ret = -EINVAL;
+	}
+	return ret;
+}
+
+#define BYTES_TO_BITS(nb)	((BITS_PER_LONG * (nb)) / sizeof(long))
+
+/*
+ * Bitfield type needs to be parsed into a fetch function
+ * @bf is the type string; anything not starting with 'b' is not a
+ * bitfield and returns 0 without touching @f. The accepted form is
+ * b<width>@<offset>/... with a non-zero width. The fetch already parsed
+ * into @f is wrapped: it is saved in bprm->orig and @f is switched to
+ * the bitfield fetch function of @t.
+ */
+static int __parse_bitfield_probe_arg(const char *bf,
+				      const struct fetch_type *t,
+				      struct fetch_param *f)
+{
+	struct bitfield_fetch_param *bprm;
+	unsigned long bw, bo;
+	char *tail;
+
+	if (*bf != 'b')
+		return 0;
+
+	bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+	if (!bprm)
+		return -ENOMEM;
+	bprm->orig = *f;
+	f->fn = t->fetch[FETCH_MTD_bitfield];
+	f->data = (void *)bprm;	/* bprm is reachable via @f from here on, so the error returns below do not free it */
+
+	bw = simple_strtoul(bf + 1, &tail, 0);	/* Use simple one */
+	if (bw == 0 || *tail != '@')
+		return -EINVAL;
+
+	bf = tail + 1;
+	bo = simple_strtoul(bf, &tail, 0);
+	if (tail == bf || *tail != '/')
+		return -EINVAL;
+
+	bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
+	bprm->low_shift = bprm->hi_shift + bo;
+	return (BYTES_TO_BITS(t->size) < (bw + bo)) ?
-EINVAL : 0;
+}
+
+/*
+ * String length checking wrapper
+ * Parses one "FETCHARG[:TYPE]" string into @parg: keeps a copy of the
+ * whole text in parg->comm, resolves the type, reserves room for the
+ * value in the trace entry (tp->size) and builds the fetch functions.
+ * Nothing is freed here on failure.
+ */
+static int parse_probe_arg(char *arg, struct trace_probe *tp,
+			   struct probe_arg *parg, int is_return)
+{
+	const char *t;
+	int ret;
+
+	if (strlen(arg) > MAX_ARGSTR_LEN) {
+		pr_info("Argument is too long.: %s\n", arg);
+		return -ENOSPC;
+	}
+	parg->comm = kstrdup(arg, GFP_KERNEL);
+	if (!parg->comm) {
+		pr_info("Failed to allocate memory for command '%s'.\n", arg);
+		return -ENOMEM;
+	}
+	t = strchr(parg->comm, ':');
+	if (t) {
+		arg[t - parg->comm] = '\0';	/* cut ":TYPE" off @arg; t points into the copy */
+		t++;
+	}
+	parg->type = find_fetch_type(t);
+	if (!parg->type) {
+		pr_info("Unsupported type: %s\n", t);
+		return -EINVAL;
+	}
+	parg->offset = tp->size;
+	tp->size += parg->type->size;
+	ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+	if (ret >= 0 && t != NULL)
+		ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
+	if (ret >= 0) {
+		parg->fetch_size.fn = get_fetch_size_function(parg->type,
+							      parg->fetch.fn);
+		parg->fetch_size.data = parg->fetch.data;
+	}
+	return ret;
+}
+
+/* Return 1 if name is reserved or already used by another argument */
+static int conflict_field_name(const char *name,
+			       struct probe_arg *args, int narg)
+{
+	int i;
+	for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
+		if (strcmp(reserved_field_names[i], name) == 0)
+			return 1;
+	for (i = 0; i < narg; i++)
+		if (strcmp(args[i].name, name) == 0)
+			return 1;
+	return 0;
+}
+
+static int create_trace_probe(int argc, char **argv)
+{
+	/*
+	 * Argument syntax:
+	 *  - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
+	 *  - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
+	 *  - Delete a probe: -:[GRP/]EVENT
+	 * Fetch args:
+	 *  $retval	: fetch return value
+	 *  $stack	: fetch stack address
+	 *  $stackN	: fetch Nth of stack (N:0-)
+	 *  @ADDR	: fetch memory at ADDR (ADDR should be in kernel)
+	 *  @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
+	 *  %REG	: fetch register REG
+	 * Dereferencing memory fetch:
+	 *  +|-offs(ARG) : fetch memory at ARG +|- offs address.
+	 * Alias name of args:
+	 *  NAME=FETCHARG : set NAME as alias of FETCHARG.
+	 * Type of args:
+	 *  FETCHARG:TYPE : use TYPE instead of unsigned long.
+	 */
+	struct trace_probe *tp;
+	int i, ret = 0;
+	int is_return = 0, is_delete = 0;
+	char *symbol = NULL, *event = NULL, *group = NULL;
+	char *arg;
+	unsigned long offset = 0;
+	void *addr = NULL;
+	char buf[MAX_EVENT_NAME_LEN];
+
+	/* argc must be >= 1 */
+	if (argv[0][0] == 'p')
+		is_return = 0;
+	else if (argv[0][0] == 'r')
+		is_return = 1;
+	else if (argv[0][0] == '-')
+		is_delete = 1;
+	else {
+		pr_info("Probe definition must be started with 'p', 'r' or"
+			" '-'.\n");
+		return -EINVAL;
+	}
+
+	/* Optional ":[GRP/]EVENT" directly after the command character */
+	if (argv[0][1] == ':') {
+		event = &argv[0][2];
+		if (strchr(event, '/')) {
+			group = event;
+			event = strchr(group, '/') + 1;
+			event[-1] = '\0';	/* split "GRP/EVENT" in place at the '/' */
+			if (strlen(group) == 0) {
+				pr_info("Group name is not specified\n");
+				return -EINVAL;
+			}
+		}
+		if (strlen(event) == 0) {
+			pr_info("Event name is not specified\n");
+			return -EINVAL;
+		}
+	}
+	if (!group)
+		group = KPROBE_EVENT_SYSTEM;
+
+	if (is_delete) {
+		if (!event) {
+			pr_info("Delete command needs an event name.\n");
+			return -EINVAL;
+		}
+		mutex_lock(&probe_lock);
+		tp = find_probe_event(event, group);
+		if (!tp) {
+			mutex_unlock(&probe_lock);
+			pr_info("Event %s/%s doesn't exist.\n", group, event);
+			return -ENOENT;
+		}
+		/* delete an event */
+		unregister_trace_probe(tp);
+		free_trace_probe(tp);
+		mutex_unlock(&probe_lock);
+		return 0;
+	}
+
+	if (argc < 2) {
+		pr_info("Probe point is not specified.\n");
+		return -EINVAL;
+	}
+	/* A probe point starting with a digit is an address, else a symbol */
+	if (isdigit(argv[1][0])) {
+		if (is_return) {
+			pr_info("Return probe point must be a symbol.\n");
+			return -EINVAL;
+		}
+		/* an address specified */
+		ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
+		if (ret) {
+			pr_info("Failed to parse address.\n");
+			return ret;
+		}
+	} else {
+		/* a symbol specified */
+		symbol = argv[1];
+		/* TODO: support .init module functions */
+		ret = split_symbol_offset(symbol, &offset);
+		if (ret)
{
+			pr_info("Failed to parse symbol.\n");
+			return ret;
+		}
+		if (offset && is_return) {
+			pr_info("Return probe must be used without offset.\n");
+			return -EINVAL;
+		}
+	}
+	argc -= 2; argv += 2;	/* what is left of argv are the fetch args */
+
+	/* setup a probe */
+	if (!event) {
+		/* Make a new event name */
+		if (symbol)
+			snprintf(buf, MAX_EVENT_NAME_LEN, "%c_%s_%ld",
+				 is_return ? 'r' : 'p', symbol, offset);
+		else
+			snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
+				 is_return ? 'r' : 'p', addr);
+		event = buf;
+	}
+	tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
+			       is_return);
+	if (IS_ERR(tp)) {
+		pr_info("Failed to allocate trace_probe.(%d)\n",
+			(int)PTR_ERR(tp));
+		return PTR_ERR(tp);
+	}
+
+	/* parse arguments; anything beyond MAX_TRACE_ARGS is not parsed */
+	ret = 0;
+	for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
+		/* Increment count for freeing args in error case */
+		tp->nr_args++;
+
+		/* Parse argument name */
+		arg = strchr(argv[i], '=');
+		if (arg) {
+			*arg++ = '\0';
+			tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
+		} else {
+			arg = argv[i];
+			/* If argument name is omitted, set "argN" */
+			snprintf(buf, MAX_EVENT_NAME_LEN, "arg%d", i + 1);
+			tp->args[i].name = kstrdup(buf, GFP_KERNEL);
+		}
+
+		if (!tp->args[i].name) {
+			pr_info("Failed to allocate argument[%d] name.\n", i);
+			ret = -ENOMEM;
+			goto error;
+		}
+
+		if (!is_good_name(tp->args[i].name)) {
+			pr_info("Invalid argument[%d] name: %s\n",
+				i, tp->args[i].name);
+			ret = -EINVAL;
+			goto error;
+		}
+
+		if (conflict_field_name(tp->args[i].name, tp->args, i)) {
+			pr_info("Argument[%d] name '%s' conflicts with "
+				"another field.\n", i, argv[i]);
+			ret = -EINVAL;
+			goto error;
+		}
+
+		/* Parse fetch argument */
+		ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
+		if (ret) {
+			pr_info("Parse error at argument[%d]. (%d)\n", i, ret);
+			goto error;
+		}
+	}
+
+	ret = register_trace_probe(tp);
+	if (ret)
+		goto error;
+	return 0;
+
+error:
+	/* Frees the nr_args arguments counted so far together with tp */
+	free_trace_probe(tp);
+	return ret;
+}
+/* Unregister and free every probe on probe_list, under probe_lock */
+static void cleanup_all_probes(void)
+{
+	struct trace_probe *tp;
+
+	mutex_lock(&probe_lock);
+	/* TODO: Use batch unregistration */
+	while (!list_empty(&probe_list)) {
+		tp = list_entry(probe_list.next, struct trace_probe, list);
+		unregister_trace_probe(tp);
+		free_trace_probe(tp);
+	}
+	mutex_unlock(&probe_lock);
+}
+
+
+/*
+ * Probes listing interfaces
+ * probe_lock is taken in the start callback and dropped in the stop
+ * callback.
+ */
+static void *probes_seq_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&probe_lock);
+	return seq_list_start(&probe_list, *pos);
+}
+
+static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &probe_list, pos);
+}
+
+static void probes_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&probe_lock);
+}
+/* Print one probe in the same syntax that create_trace_probe() accepts */
+static int probes_seq_show(struct seq_file *m, void *v)
+{
+	struct trace_probe *tp = v;	/* NOTE(review): presumably v is the list entry, which works only while 'list' is the first member - confirm */
+	int i;
+
+	seq_printf(m, "%c", probe_is_return(tp) ?
'r' : 'p'); + seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); + + if (!tp->symbol) + seq_printf(m, " 0x%p", tp->rp.kp.addr); + else if (tp->rp.kp.offset) + seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); + else + seq_printf(m, " %s", probe_symbol(tp)); + + for (i = 0; i < tp->nr_args; i++) + seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm); + seq_printf(m, "\n"); + + return 0; +} + +static const struct seq_operations probes_seq_op = { + .start = probes_seq_start, + .next = probes_seq_next, + .stop = probes_seq_stop, + .show = probes_seq_show +}; + +static int probes_open(struct inode *inode, struct file *file) +{ + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) + cleanup_all_probes(); + + return seq_open(file, &probes_seq_op); +} + +static int command_trace_probe(const char *buf) +{ + char **argv; + int argc = 0, ret = 0; + + argv = argv_split(GFP_KERNEL, buf, &argc); + if (!argv) + return -ENOMEM; + + if (argc) + ret = create_trace_probe(argc, argv); + + argv_free(argv); + return ret; +} + +#define WRITE_BUFSIZE 4096 + +static ssize_t probes_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *kbuf, *tmp; + int ret; + size_t done; + size_t size; + + kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + ret = done = 0; + while (done < count) { + size = count - done; + if (size >= WRITE_BUFSIZE) + size = WRITE_BUFSIZE - 1; + if (copy_from_user(kbuf, buffer + done, size)) { + ret = -EFAULT; + goto out; + } + kbuf[size] = '\0'; + tmp = strchr(kbuf, '\n'); + if (tmp) { + *tmp = '\0'; + size = tmp - kbuf + 1; + } else if (done + size < count) { + pr_warning("Line length is too long: " + "Should be less than %d.", WRITE_BUFSIZE); + ret = -EINVAL; + goto out; + } + done += size; + /* Remove comments */ + tmp = strchr(kbuf, '#'); + if (tmp) + *tmp = '\0'; + + ret = command_trace_probe(kbuf); + if (ret) + goto out; + } + ret = done; +out: + kfree(kbuf); 
+	return ret;
+}
+
+static const struct file_operations kprobe_events_ops = {
+	.owner          = THIS_MODULE,
+	.open           = probes_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+	.write		= probes_write,
+};
+
+/*
+ * Probes profiling interfaces
+ * One line per probe: event name, hit count (nhit) and the kprobe's
+ * nmissed counter.
+ */
+static int probes_profile_seq_show(struct seq_file *m, void *v)
+{
+	struct trace_probe *tp = v;
+
+	seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
+		   tp->rp.kp.nmissed);
+
+	return 0;
+}
+
+static const struct seq_operations profile_seq_op = {
+	.start  = probes_seq_start,
+	.next   = probes_seq_next,
+	.stop   = probes_seq_stop,
+	.show   = probes_profile_seq_show
+};
+
+static int profile_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &profile_seq_op);
+}
+
+static const struct file_operations kprobe_profile_ops = {
+	.owner          = THIS_MODULE,
+	.open           = profile_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+/*
+ * Sum up total data length for dynamic arrays (strings)
+ * Only arguments that have a fetch_size function contribute.
+ */
+static __kprobes int __get_data_size(struct trace_probe *tp,
+				     struct pt_regs *regs)
+{
+	int i, ret = 0;
+	u32 len;
+
+	for (i = 0; i < tp->nr_args; i++)
+		if (unlikely(tp->args[i].fetch_size.fn)) {
+			call_fetch(&tp->args[i].fetch_size, regs, &len);
+			ret += len;
+		}
+
+	return ret;
+}
+
+/*
+ * Store the value of each argument
+ * @data is the argument area of the entry and @ent_size the size of the
+ * entry head before it. Arguments with a fetch_size function store a
+ * u32 data location in their slot; their data goes behind the
+ * fixed-size slots, starting at offset tp->size, using at most @maxlen
+ * bytes in total.
+ */
+static __kprobes void store_trace_args(int ent_size, struct trace_probe *tp,
+				       struct pt_regs *regs,
+				       u8 *data, int maxlen)
+{
+	int i;
+	u32 end = tp->size;	/* Offset where the next dynamic data goes */
+	u32 *dl;	/* Data (relative) location */
+
+	for (i = 0; i < tp->nr_args; i++) {
+		if (unlikely(tp->args[i].fetch_size.fn)) {
+			/*
+			 * First, we set the relative location and
+			 * maximum data length to *dl
+			 */
+			dl = (u32 *)(data + tp->args[i].offset);
+			*dl = make_data_rloc(maxlen, end - tp->args[i].offset);
+			/* Then try to fetch string or dynamic array data */
+			call_fetch(&tp->args[i].fetch, regs, dl);
+			/* Reduce maximum length */
+			end += get_rloc_len(*dl);
+			maxlen -= get_rloc_len(*dl);
+
/* Trick here, convert data_rloc to data_loc */
+			*dl = convert_rloc_to_loc(*dl,
+						 ent_size + tp->args[i].offset);
+		} else
+			/* Just fetching data normally */
+			call_fetch(&tp->args[i].fetch, regs,
+				   data + tp->args[i].offset);
+	}
+}
+
+/*
+ * Kprobe handler
+ * Counts the hit, reserves a ring buffer event large enough for the
+ * entry head, the fixed-size arguments and the dynamic data, fills it
+ * and commits it unless the event filter discards it.
+ */
+static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+	struct kprobe_trace_entry_head *entry;
+	struct ring_buffer_event *event;
+	struct ring_buffer *buffer;
+	int size, dsize, pc;
+	unsigned long irq_flags;
+	struct ftrace_event_call *call = &tp->call;
+
+	tp->nhit++;
+
+	local_save_flags(irq_flags);
+	pc = preempt_count();
+
+	dsize = __get_data_size(tp, regs);
+	size = sizeof(*entry) + tp->size + dsize;
+
+	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+						  size, irq_flags, pc);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->ip = (unsigned long)kp->addr;
+	store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
+
+	if (!filter_current_check_discard(buffer, call, entry, event))
+		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+}
+
+/*
+ * Kretprobe handler
+ * Same as the kprobe handler, but the entry records the probed function
+ * and the return address.
+ */
+static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
+					  struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
+	struct kretprobe_trace_entry_head *entry;
+	struct ring_buffer_event *event;
+	struct ring_buffer *buffer;
+	int size, pc, dsize;
+	unsigned long irq_flags;
+	struct ftrace_event_call *call = &tp->call;
+
+	/* Count the hit like kprobe_trace_func() so the profile file shows it */
+	tp->nhit++;
+
+	local_save_flags(irq_flags);
+	pc = preempt_count();
+
+	dsize = __get_data_size(tp, regs);
+	size = sizeof(*entry) + tp->size + dsize;
+
+	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
+						  size, irq_flags, pc);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->func = (unsigned long)tp->rp.kp.addr;
+	entry->ret_ip = (unsigned long)ri->ret_addr;
+
store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize);
+
+	if (!filter_current_check_discard(buffer, call, entry, event))
+		trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
+}
+
+/*
+ * Event entry printers
+ * Each prints "EVENT: (ADDRESS)" followed by every argument through the
+ * print function of its type, then a newline. If any output call fails
+ * the result is TRACE_TYPE_PARTIAL_LINE, otherwise TRACE_TYPE_HANDLED.
+ */
+enum print_line_t
+print_kprobe_event(struct trace_iterator *iter, int flags,
+		   struct trace_event *event)
+{
+	struct kprobe_trace_entry_head *field;
+	struct trace_seq *s = &iter->seq;
+	struct trace_probe *tp;
+	u8 *data;
+	int i;
+
+	field = (struct kprobe_trace_entry_head *)iter->ent;
+	tp = container_of(event, struct trace_probe, call.event);
+
+	if (!trace_seq_printf(s, "%s: (", tp->call.name))
+		goto partial;
+
+	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
+		goto partial;
+
+	if (!trace_seq_puts(s, ")"))
+		goto partial;
+
+	data = (u8 *)&field[1];	/* arguments start right after the entry head */
+	for (i = 0; i < tp->nr_args; i++)
+		if (!tp->args[i].type->print(s, tp->args[i].name,
+					     data + tp->args[i].offset, field))
+			goto partial;
+
+	if (!trace_seq_puts(s, "\n"))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+/* Return probe variant: the address part is "(RET_IP <- FUNC)" */
+enum print_line_t
+print_kretprobe_event(struct trace_iterator *iter, int flags,
+		      struct trace_event *event)
+{
+	struct kretprobe_trace_entry_head *field;
+	struct trace_seq *s = &iter->seq;
+	struct trace_probe *tp;
+	u8 *data;
+	int i;
+
+	field = (struct kretprobe_trace_entry_head *)iter->ent;
+	tp = container_of(event, struct trace_probe, call.event);
+
+	if (!trace_seq_printf(s, "%s: (", tp->call.name))
+		goto partial;
+
+	if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
+		goto partial;
+
+	if (!trace_seq_puts(s, " <- "))
+		goto partial;
+
+	if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
+		goto partial;
+
+	if (!trace_seq_puts(s, ")"))
+		goto partial;
+
+	data = (u8 *)&field[1];	/* arguments start right after the entry head */
+	for (i = 0; i < tp->nr_args; i++)
+		if (!tp->args[i].type->print(s, tp->args[i].name,
+					     data + tp->args[i].offset, field))
+			goto partial;
+
+	if
(!trace_seq_puts(s, "\n"))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+/* Mark the probe as used for tracing and enable the underlying probe */
+static int probe_event_enable(struct ftrace_event_call *call)
+{
+	struct trace_probe *tp = (struct trace_probe *)call->data;
+
+	tp->flags |= TP_FLAG_TRACE;
+	if (probe_is_return(tp))
+		return enable_kretprobe(&tp->rp);
+	else
+		return enable_kprobe(&tp->rp.kp);
+}
+/* Clear the tracing flag; the probe is disabled only if no user flag is left */
+static void probe_event_disable(struct ftrace_event_call *call)
+{
+	struct trace_probe *tp = (struct trace_probe *)call->data;
+
+	tp->flags &= ~TP_FLAG_TRACE;
+	if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
+		if (probe_is_return(tp))
+			disable_kretprobe(&tp->rp);
+		else
+			disable_kprobe(&tp->rp.kp);
+	}
+}
+
+/*
+ * Define one fixed field of the entry head. Expects 'ret', 'event_call'
+ * and 'field' in the calling function, and returns from that function
+ * on failure.
+ */
+#undef DEFINE_FIELD
+#define DEFINE_FIELD(type, item, name, is_signed) \
+	do { \
+		ret = trace_define_field(event_call, #type, name, \
+					 offsetof(typeof(field), item), \
+					 sizeof(field.item), is_signed, \
+					 FILTER_OTHER); \
+		if (ret) \
+			return ret; \
+	} while (0)
+/* Define the ip field and one field per probe argument */
+static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+	int ret, i;
+	struct kprobe_trace_entry_head field;	/* only used for offsetof/sizeof */
+	struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+	/* Set argument names as fields */
+	for (i = 0; i < tp->nr_args; i++) {
+		ret = trace_define_field(event_call, tp->args[i].type->fmttype,
+					 tp->args[i].name,
+					 sizeof(field) + tp->args[i].offset,
+					 tp->args[i].type->size,
+					 tp->args[i].type->is_signed,
+					 FILTER_OTHER);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+/* Define the func and ret_ip fields and one field per probe argument */
+static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
+{
+	int ret, i;
+	struct kretprobe_trace_entry_head field;	/* only used for offsetof/sizeof */
+	struct trace_probe *tp = (struct trace_probe *)event_call->data;
+
+	DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
+	DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
+	/* Set argument names as fields */
+	for (i = 0; i < tp->nr_args; i++) {
+		ret =
trace_define_field(event_call, tp->args[i].type->fmttype,
+					 tp->args[i].name,
+					 sizeof(field) + tp->args[i].offset,
+					 tp->args[i].type->size,
+					 tp->args[i].type->is_signed,
+					 FILTER_OTHER);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+/*
+ * Build the print_fmt string of a probe: a quoted format made of the
+ * address part plus " NAME=FMT" per argument, followed by the matching
+ * value expressions. Returns the length of the full string, without the
+ * terminating NUL, whatever @len is.
+ */
+static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
+{
+	int i;
+	int pos = 0;
+
+	const char *fmt, *arg;
+
+	if (!probe_is_return(tp)) {
+		fmt = "(%lx)";
+		arg = "REC->" FIELD_STRING_IP;
+	} else {
+		fmt = "(%lx <- %lx)";
+		arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+	}
+
+	/* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
+
+	for (i = 0; i < tp->nr_args; i++) {
+		pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
+				tp->args[i].name, tp->args[i].type->fmt);
+	}
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
+
+	/* String arguments are referenced via __get_str(), others via REC-> */
+	for (i = 0; i < tp->nr_args; i++) {
+		if (strcmp(tp->args[i].type->name, "string") == 0)
+			pos += snprintf(buf + pos, LEN_OR_ZERO,
+					", __get_str(%s)",
+					tp->args[i].name);
+		else
+			pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s",
+					tp->args[i].name);
+	}
+
+#undef LEN_OR_ZERO
+
+	/* return the length of print_fmt */
+	return pos;
+}
+/* Allocate tp->call.print_fmt and fill it; returns 0 or -ENOMEM */
+static int set_print_fmt(struct trace_probe *tp)
+{
+	int len;
+	char *print_fmt;
+
+	/* First: called with 0 length to calculate the needed length */
+	len = __set_print_fmt(tp, NULL, 0);
+	print_fmt = kmalloc(len + 1, GFP_KERNEL);
+	if (!print_fmt)
+		return -ENOMEM;
+
+	/* Second: actually write the @print_fmt */
+	__set_print_fmt(tp, print_fmt, len + 1);
+	tp->call.print_fmt = print_fmt;
+
+	return 0;
+}
+
+#ifdef CONFIG_PERF_EVENTS
+
+/* Kprobe profile handler */
+static __kprobes void kprobe_perf_func(struct kprobe *kp,
+					 struct pt_regs *regs)
+{
+	struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
+	struct ftrace_event_call *call = &tp->call;
+	struct kprobe_trace_entry_head *entry;
+	struct hlist_head *head;
+
int size, __size, dsize; + int rctx; + + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "profile buffer not large enough")) + return; + + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + if (!entry) + return; + + entry->ip = (unsigned long)kp->addr; + memset(&entry[1], 0, dsize); + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); +} + +/* Kretprobe profile handler */ +static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, + struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + struct ftrace_event_call *call = &tp->call; + struct kretprobe_trace_entry_head *entry; + struct hlist_head *head; + int size, __size, dsize; + int rctx; + + dsize = __get_data_size(tp, regs); + __size = sizeof(*entry) + tp->size + dsize; + size = ALIGN(__size + sizeof(u32), sizeof(u64)); + size -= sizeof(u32); + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "profile buffer not large enough")) + return; + + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); + if (!entry) + return; + + entry->func = (unsigned long)tp->rp.kp.addr; + entry->ret_ip = (unsigned long)ri->ret_addr; + store_trace_args(sizeof(*entry), tp, regs, (u8 *)&entry[1], dsize); + + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); +} + +static int probe_perf_enable(struct ftrace_event_call *call) +{ + struct trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags |= TP_FLAG_PROFILE; + + if (probe_is_return(tp)) + return enable_kretprobe(&tp->rp); + else + return enable_kprobe(&tp->rp.kp); +} + +static void probe_perf_disable(struct ftrace_event_call *call) +{ + struct 
trace_probe *tp = (struct trace_probe *)call->data; + + tp->flags &= ~TP_FLAG_PROFILE; + + if (!(tp->flags & TP_FLAG_TRACE)) { /* disarm only if the trace side is not using the probe either */ + if (probe_is_return(tp)) + disable_kretprobe(&tp->rp); + else + disable_kprobe(&tp->rp.kp); + } +} +#endif /* CONFIG_PERF_EVENTS */ + +static __kprobes +int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +{ + switch (type) { /* route each (un)register request to its enable/disable helper */ + case TRACE_REG_REGISTER: + return probe_event_enable(event); + case TRACE_REG_UNREGISTER: + probe_event_disable(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_perf_enable(event); + case TRACE_REG_PERF_UNREGISTER: + probe_perf_disable(event); + return 0; +#endif + } + return 0; /* any other request type is accepted without action */ +} + +static __kprobes +int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); + + if (tp->flags & TP_FLAG_TRACE) /* trace consumer enabled */ + kprobe_trace_func(kp, regs); +#ifdef CONFIG_PERF_EVENTS + if (tp->flags & TP_FLAG_PROFILE) /* perf consumer enabled */ + kprobe_perf_func(kp, regs); +#endif + return 0; /* We don't tweak kernel, so just return 0 */ +} + +static __kprobes +int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) +{ + struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); + + if (tp->flags & TP_FLAG_TRACE) /* trace consumer enabled */ + kretprobe_trace_func(ri, regs); +#ifdef CONFIG_PERF_EVENTS + if (tp->flags & TP_FLAG_PROFILE) /* perf consumer enabled */ + kretprobe_perf_func(ri, regs); +#endif + return 0; /* We don't tweak kernel, so just return 0 */ +} + +static struct trace_event_functions kretprobe_funcs = { + .trace = print_kretprobe_event +}; + +static struct trace_event_functions kprobe_funcs = { + .trace = print_kprobe_event +}; + +static int register_probe_event(struct trace_probe *tp) +{ + struct ftrace_event_call *call = &tp->call; + int ret; + + /* Initialize ftrace_event_call */ + INIT_LIST_HEAD(&call->class->fields); + if (probe_is_return(tp)) { + call->event.funcs = &kretprobe_funcs; + call->class->define_fields = 
kretprobe_event_define_fields; + } else { + call->event.funcs = &kprobe_funcs; + call->class->define_fields = kprobe_event_define_fields; + } + if (set_print_fmt(tp) < 0) + return -ENOMEM; + ret = register_ftrace_event(&call->event); + if (!ret) { + kfree(call->print_fmt); + return -ENODEV; + } + call->flags = 0; + call->class->reg = kprobe_register; + call->data = tp; + ret = trace_add_event_call(call); + if (ret) { + pr_info("Failed to register kprobe event: %s\n", call->name); + kfree(call->print_fmt); + unregister_ftrace_event(&call->event); + } + return ret; +} + +static void unregister_probe_event(struct trace_probe *tp) +{ + /* tp->event is unregistered in trace_remove_event_call() */ + trace_remove_event_call(&tp->call); + kfree(tp->call.print_fmt); +} + +/* Make a debugfs interface for controlling probe points */ +static __init int init_kprobe_trace(void) +{ + struct dentry *d_tracer; + struct dentry *entry; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + NULL, &kprobe_events_ops); + + /* Event list interface */ + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_events' entry\n"); + + /* Profile interface */ + entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + NULL, &kprobe_profile_ops); + + if (!entry) + pr_warning("Could not create debugfs " + "'kprobe_profile' entry\n"); + return 0; +} +fs_initcall(init_kprobe_trace); + + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +/* + * The "__used" keeps gcc from removing the function symbol + * from the kallsyms table. 
+ */ +static __used int kprobe_trace_selftest_target(int a1, int a2, int a3, + int a4, int a5, int a6) +{ + return a1 + a2 + a3 + a4 + a5 + a6; +} + +static __init int kprobe_trace_self_tests_init(void) +{ + int ret, warn = 0; + int (*target)(int, int, int, int, int, int); + struct trace_probe *tp; + + target = kprobe_trace_selftest_target; + + pr_info("Testing kprobe tracing: "); + + ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " + "$stack $stack0 +0($stack)"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on probing function entry.\n"); + warn++; + } else { + /* Enable trace point */ + tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting new probe.\n"); + warn++; + } else + probe_event_enable(&tp->call); + } + + ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " + "$retval"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on probing function return.\n"); + warn++; + } else { + /* Enable trace point */ + tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM); + if (WARN_ON_ONCE(tp == NULL)) { + pr_warning("error on getting new probe.\n"); + warn++; + } else + probe_event_enable(&tp->call); + } + + if (warn) + goto end; + + ret = target(1, 2, 3, 4, 5, 6); + + ret = command_trace_probe("-:testprobe"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on deleting a probe.\n"); + warn++; + } + + ret = command_trace_probe("-:testprobe2"); + if (WARN_ON_ONCE(ret)) { + pr_warning("error on deleting a probe.\n"); + warn++; + } + +end: + cleanup_all_probes(); + if (warn) + pr_cont("NG: Some tests are failed. 
Please check them.\n"); + else + pr_cont("OK\n"); + return 0; +} + +late_initcall(kprobe_trace_self_tests_init); + +#endif diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 00000000..017fa376 --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,374 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen + */ + +#define DEBUG 1 + +#include +#include +#include +#include +#include + +#include + +#include "trace.h" +#include "trace_output.h" + +struct header_iter { + struct pci_dev *dev; +}; + +static struct trace_array *mmio_trace_array; +static bool overrun_detected; +static unsigned long prev_overruns; +static atomic_t dropped_count; + +static void mmio_reset_data(struct trace_array *tr) +{ + overrun_detected = false; + prev_overruns = 0; + + tracing_reset_online_cpus(tr); +} + +static int mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + + mmio_reset_data(tr); + enable_mmiotrace(); + return 0; +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + + disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; +} + +static void mmio_trace_start(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_reset_data(tr); +} + +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? 
+ */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + +static void destroy_header_iter(struct header_iter *hiter) +{ + if (!hiter) + return; + pci_dev_put(hiter->dev); + kfree(hiter); +} + +static void mmio_pipe_open(struct trace_iterator *iter) +{ + struct header_iter *hiter; + struct trace_seq *s = &iter->seq; + + trace_seq_printf(s, "VERSION 20070824\n"); + + hiter = kzalloc(sizeof(*hiter), GFP_KERNEL); + if (!hiter) + return; + + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL); + iter->private = hiter; +} + +/* XXX: This is not called when the pipe is closed! */ +static void mmio_close(struct trace_iterator *iter) +{ + struct header_iter *hiter = iter->private; + destroy_header_iter(hiter); + iter->private = NULL; +} + +static unsigned long count_overruns(struct trace_iterator *iter) +{ + unsigned long cnt = atomic_xchg(&dropped_count, 0); + unsigned long over = ring_buffer_overruns(iter->tr->buffer); + + if (over > prev_overruns) + cnt += over - prev_overruns; + prev_overruns = over; + return cnt; +} + +static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, + char __user *ubuf, size_t cnt, loff_t *ppos) +{ + ssize_t ret; + struct header_iter *hiter = iter->private; + struct trace_seq *s = &iter->seq; + unsigned long n; + + n = count_overruns(iter); + if (n) { + /* XXX: This is later than where events were lost. 
*/ + trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); + if (!overrun_detected) + pr_warning("mmiotrace has lost events.\n"); + overrun_detected = true; + goto print_out; + } + + if (!hiter) + return 0; + + mmio_print_pcidev(s, hiter->dev); + hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev); + + if (!hiter->dev) { + destroy_header_iter(hiter); + iter->private = NULL; + } + +print_out: + ret = trace_seq_to_user(s, ubuf, cnt); + return (ret == -EBUSY) ? 0 : ret; +} + +static enum print_line_t mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_mmiotrace_rw *field; + struct mmiotrace_rw *rw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + int ret = 1; + + trace_assign_type(field, entry); + rw = &field->rw; + + switch (rw->opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + rw->value, rw->pc, 0); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx," + "%02lx 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, 0); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct trace_mmiotrace_map *field; + struct mmiotrace_map *m; + struct trace_seq *s = &iter->seq; + unsigned long long t 
= ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + int ret; + + trace_assign_type(field, entry); + m = &field->map; + + switch (m->opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, + 0UL, 0); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %u.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, 0); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return TRACE_TYPE_HANDLED; + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t mmio_print_mark(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct print_entry *print = (struct print_entry *)entry; + const char *msg = print->buf; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned secs = (unsigned long)t; + int ret; + + /* The trailing newline must be in the message. 
*/ + ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + case TRACE_PRINT: + return mmio_print_mark(iter); + default: + return TRACE_TYPE_HANDLED; /* ignore unknown entries */ + } +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .start = mmio_trace_start, + .pipe_open = mmio_pipe_open, + .close = mmio_close, + .read = mmio_read, + .print_line = mmio_print_line, +}; + +__init static int init_mmio_trace(void) +{ + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +static void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct ftrace_event_call *call = &event_mmiotrace_rw; + struct ring_buffer *buffer = tr->buffer; + struct ring_buffer_event *event; + struct trace_mmiotrace_rw *entry; + int pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_RW, + sizeof(*entry), 0, pc); + if (!event) { + atomic_inc(&dropped_count); + return; + } + entry = ring_buffer_event_data(event); + entry->rw = *rw; + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +void mmio_trace_rw(struct mmiotrace_rw *rw) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = tr->data[smp_processor_id()]; + __trace_mmiotrace_rw(tr, data, rw); +} + +static void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct ftrace_event_call *call = &event_mmiotrace_map; + struct ring_buffer *buffer = tr->buffer; + struct ring_buffer_event *event; + struct 
trace_mmiotrace_map *entry; + int pc = preempt_count(); + + event = trace_buffer_lock_reserve(buffer, TRACE_MMIO_MAP, + sizeof(*entry), 0, pc); + if (!event) { + atomic_inc(&dropped_count); + return; + } + entry = ring_buffer_event_data(event); + entry->map = *map; + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, 0, pc); +} + +void mmio_trace_mapping(struct mmiotrace_map *map) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data; + + preempt_disable(); + data = tr->data[smp_processor_id()]; + __trace_mmiotrace_map(tr, data, map); + preempt_enable(); +} + +int mmio_trace_printk(const char *fmt, va_list args) +{ + return trace_vprintk(0, fmt, args); +} diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c new file mode 100644 index 00000000..394f9441 --- /dev/null +++ b/kernel/trace/trace_nop.c @@ -0,0 +1,101 @@ +/* + * nop tracer + * + * Copyright (C) 2008 Steven Noonan + * + */ + +#include +#include +#include +#include + +#include "trace.h" + +/* Our two options */ +enum { + TRACE_NOP_OPT_ACCEPT = 0x1, + TRACE_NOP_OPT_REFUSE = 0x2 +}; + +/* Options for the tracer (see trace_options file) */ +static struct tracer_opt nop_opts[] = { + /* Option that will be accepted by set_flag callback */ + { TRACER_OPT(test_nop_accept, TRACE_NOP_OPT_ACCEPT) }, + /* Option that will be refused by set_flag callback */ + { TRACER_OPT(test_nop_refuse, TRACE_NOP_OPT_REFUSE) }, + { } /* Always set a last empty entry */ +}; + +static struct tracer_flags nop_flags = { + /* You can check your flags value here when you want. */ + .val = 0, /* By default: all flags disabled */ + .opts = nop_opts +}; + +static struct trace_array *ctx_trace; + +static void start_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! */ +} + +static void stop_nop_trace(struct trace_array *tr) +{ + /* Nothing to do! 
*/ +} + +static int nop_trace_init(struct trace_array *tr) +{ + ctx_trace = tr; + start_nop_trace(tr); + return 0; +} + +static void nop_trace_reset(struct trace_array *tr) +{ + stop_nop_trace(tr); +} + +/* It only serves as a signal handler and a callback to + * accept or refuse the setting of a flag. + * If you don't implement it, then the flag setting will be + * automatically accepted. Returns 0 to accept the change, -EINVAL to refuse it. + */ +static int nop_set_flag(u32 old_flags, u32 bit, int set) +{ + /* + * Note that you don't need to update nop_flags.val yourself. + * The tracing API will do it automatically if you return 0 + */ + if (bit == TRACE_NOP_OPT_ACCEPT) { + printk(KERN_DEBUG "nop_test_accept flag set to %d: we accept." + " Now cat trace_options to see the result\n", + set); + return 0; + } + + if (bit == TRACE_NOP_OPT_REFUSE) { + printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." + " Now cat trace_options to see the result\n", + set); + return -EINVAL; + } + + return 0; /* bits other than the two test options are accepted */ +} + + +struct tracer nop_trace __read_mostly = +{ + .name = "nop", + .init = nop_trace_init, + .reset = nop_trace_reset, + .wait_pipe = poll_wait_pipe, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_nop, +#endif + .flags = &nop_flags, + .set_flag = nop_set_flag +}; + diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c new file mode 100644 index 00000000..e37de492 --- /dev/null +++ b/kernel/trace/trace_output.c @@ -0,0 +1,1308 @@ +/* + * trace_output.c + * + * Copyright (C) 2008 Red Hat Inc, Steven Rostedt + * + */ + +#include +#include +#include + +#include "trace_output.h" + +/* must be a power of 2 */ +#define EVENT_HASHSIZE 128 + +DECLARE_RWSEM(trace_event_mutex); + +static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; + +static int next_event_type = __TRACE_LAST_TYPE + 1; + +int trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? 
PAGE_SIZE - 1 : s->len; + int ret; + + ret = seq_write(m, s->buffer, len); + + /* + * Only reset this buffer if we successfully wrote to the + * seq_file buffer. + */ + if (!ret) + trace_seq_init(s); + + return ret; +} + +enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bprint_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_bprintf(s, field->fmt, field->buf); + if (!ret) /* 0 means the text did not fit in the trace_seq */ + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct print_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_printf(s, "%s", field->buf); + if (!ret) /* 0 means the text did not fit in the trace_seq */ + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +/** + * trace_seq_printf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string + * + * It returns 0 if the output does not fit in the buffer's free + * space, 1 otherwise. + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formatting of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. + */ +int +trace_seq_printf(struct trace_seq *s, const char *fmt, ...) 
+{ + int len = (PAGE_SIZE - 1) - s->len; + va_list ap; + int ret; + + if (s->full || !len) + return 0; + + va_start(ap, fmt); + ret = vsnprintf(s->buffer + s->len, len, fmt, ap); + va_end(ap); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return 1; +} +EXPORT_SYMBOL_GPL(trace_seq_printf); + +/** + * trace_seq_vprintf - sequence printing of trace information + * @s: trace sequence descriptor + * @fmt: printf format string (its arguments are taken from @args) + * + * The tracer may use either sequence operations or its own + * copy to user routines. To simplify formatting of a trace + * trace_seq_printf is used to store strings into a special + * buffer (@s). Then the output may be either used by + * the sequencer or pulled into another buffer. Returns 0 (and marks @s full) if the text does not fit, non-zero otherwise. + */ +int +trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = vsnprintf(s->buffer + s->len, len, fmt, args); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return len; /* non-zero: the free space seen on entry, not the number of bytes written */ +} +EXPORT_SYMBOL_GPL(trace_seq_vprintf); + +int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + int len = (PAGE_SIZE - 1) - s->len; + int ret; + + if (s->full || !len) + return 0; + + ret = bstr_printf(s->buffer + s->len, len, fmt, binary); + + /* If we can't write it all, don't bother writing anything */ + if (ret >= len) { + s->full = 1; + return 0; + } + + s->len += ret; + + return len; /* non-zero: the free space seen on entry, not the number of bytes written */ +} + +/** + * trace_seq_puts - trace sequence printing of simple string + * @s: trace sequence descriptor + * @str: simple string to record + * + * The tracer may use either the sequence operations or its own + * copy to user routines. This function records a simple string + * into a special buffer (@s) for later retrieval by a sequencer + * or other mechanism. 
+ */ +int trace_seq_puts(struct trace_seq *s, const char *str) +{ + int len = strlen(str); + + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + + memcpy(s->buffer + s->len, str, len); + s->len += len; + + return len; +} + +int trace_seq_putc(struct trace_seq *s, unsigned char c) +{ + if (s->full) + return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; + return 0; + } + + s->buffer[s->len++] = c; + + return 1; +} +EXPORT_SYMBOL(trace_seq_putc); + +int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) +{ + if (s->full) + return 0; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return 0; + } + + memcpy(s->buffer + s->len, mem, len); + s->len += len; + + return len; +} + +int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len) +{ + unsigned char hex[HEX_CHARS]; + const unsigned char *data = mem; + int i, j; + + if (s->full) + return 0; + +#ifdef __BIG_ENDIAN + for (i = 0, j = 0; i < len; i++) { +#else + for (i = len-1, j = 0; i >= 0; i--) { +#endif + hex[j++] = hex_asc_hi(data[i]); + hex[j++] = hex_asc_lo(data[i]); + } + hex[j++] = ' '; + + return trace_seq_putmem(s, hex, j); +} + +void *trace_seq_reserve(struct trace_seq *s, size_t len) +{ + void *ret; + + if (s->full) + return NULL; + + if (len > ((PAGE_SIZE - 1) - s->len)) { + s->full = 1; + return NULL; + } + + ret = s->buffer + s->len; + s->len += len; + + return ret; +} + +int trace_seq_path(struct trace_seq *s, struct path *path) +{ + unsigned char *p; + + if (s->full) + return 0; + + if (s->len >= (PAGE_SIZE - 1)) { + s->full = 1; + return 0; + } + + p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len); + if (!IS_ERR(p)) { + p = mangle_path(s->buffer + s->len, p, "\n"); + if (p) { + s->len = p - s->buffer; + return 1; + } + } else { + s->buffer[s->len++] = '?'; + return 1; + } + + s->full = 1; + return 0; +} + +const char * +ftrace_print_flags_seq(struct trace_seq *p, const char *delim, + unsigned 
long flags, + const struct trace_print_flags *flag_array) +{ + unsigned long mask; + const char *str; + const char *ret = p->buffer + p->len; /* where this call's output starts */ + int i; + + for (i = 0; flag_array[i].name && flags; i++) { + + mask = flag_array[i].mask; + if ((flags & mask) != mask) + continue; + + str = flag_array[i].name; + flags &= ~mask; + if (delim && ret != (const char *)(p->buffer + p->len)) /* delimiter only between names emitted by this call */ + trace_seq_puts(p, delim); + trace_seq_puts(p, str); + } + + /* check for left over flags */ + if (flags) { + if (delim && ret != (const char *)(p->buffer + p->len)) + trace_seq_puts(p, delim); + trace_seq_printf(p, "0x%lx", flags); + } + + trace_seq_putc(p, 0); /* NUL-terminate so the returned pointer is a C string */ + + return ret; +} +EXPORT_SYMBOL(ftrace_print_flags_seq); + +const char * +ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val, + const struct trace_print_flags *symbol_array) +{ + int i; + const char *ret = p->buffer + p->len; /* where this call's output starts */ + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (ret == (const char *)(p->buffer + p->len)) /* nothing emitted by this call: fall back to the raw value */ + trace_seq_printf(p, "0x%lx", val); + + trace_seq_putc(p, 0); /* NUL-terminate so the returned pointer is a C string */ + + return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq); + +#if BITS_PER_LONG == 32 +const char * +ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, + const struct trace_print_flags_u64 *symbol_array) +{ + int i; + const char *ret = p->buffer + p->len; /* where this call's output starts */ + + for (i = 0; symbol_array[i].name; i++) { + + if (val != symbol_array[i].mask) + continue; + + trace_seq_puts(p, symbol_array[i].name); + break; + } + + if (ret == (const char *)(p->buffer + p->len)) /* nothing emitted by this call: fall back to the raw value */ + trace_seq_printf(p, "0x%llx", val); + + trace_seq_putc(p, 0); /* NUL-terminate so the returned pointer is a C string */ + + return ret; +} +EXPORT_SYMBOL(ftrace_print_symbols_seq_u64); +#endif + +const char * +ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) +{ + int i; + const char *ret = p->buffer + p->len; + + for (i = 0; i < buf_len; i++) + trace_seq_printf(p, "%s%2.2x", i == 0 ? 
"" : " ", buf[i]); + + trace_seq_putc(p, 0); + + return ret; +} +EXPORT_SYMBOL(ftrace_print_hex_seq); + +#ifdef CONFIG_KRETPROBES +static inline const char *kretprobed(const char *name) +{ + static const char tramp_name[] = "kretprobe_trampoline"; + int size = sizeof(tramp_name); + + if (strncmp(tramp_name, name, size) == 0) + return "[unknown/kretprobe'd]"; + return name; +} +#else +static inline const char *kretprobed(const char *name) +{ + return name; +} +#endif /* CONFIG_KRETPROBES */ + +static int +seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + kallsyms_lookup(address, NULL, NULL, NULL, str); + + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); +#endif + return 1; +} + +static int +seq_print_sym_offset(struct trace_seq *s, const char *fmt, + unsigned long address) +{ +#ifdef CONFIG_KALLSYMS + char str[KSYM_SYMBOL_LEN]; + const char *name; + + sprint_symbol(str, address); + name = kretprobed(str); + + return trace_seq_printf(s, fmt, name); +#endif + return 1; +} + +#ifndef CONFIG_64BIT +# define IP_FMT "%08lx" +#else +# define IP_FMT "%016lx" +#endif + +int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags) +{ + struct file *file = NULL; + unsigned long vmstart = 0; + int ret = 1; + + if (s->full) + return 0; + + if (mm) { + const struct vm_area_struct *vma; + + down_read(&mm->mmap_sem); + vma = find_vma(mm, ip); + if (vma) { + file = vma->vm_file; + vmstart = vma->vm_start; + } + if (file) { + ret = trace_seq_path(s, &file->f_path); + if (ret) + ret = trace_seq_printf(s, "[+0x%lx]", + ip - vmstart); + } + up_read(&mm->mmap_sem); + } + if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file)) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +int +seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, + unsigned long sym_flags) +{ + 
struct mm_struct *mm = NULL; + int ret = 1; + unsigned int i; + + if (trace_flags & TRACE_ITER_SYM_USEROBJ) { + struct task_struct *task; + /* + * we do the lookup on the thread group leader, + * since individual threads might have already quit! + */ + rcu_read_lock(); + task = find_task_by_vpid(entry->tgid); + if (task) + mm = get_task_mm(task); + rcu_read_unlock(); + } + + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + unsigned long ip = entry->caller[i]; + + if (ip == ULONG_MAX || !ret) + break; + if (ret) + ret = trace_seq_puts(s, " => "); + if (!ip) { + if (ret) + ret = trace_seq_puts(s, "??"); + if (ret) + ret = trace_seq_puts(s, "\n"); + continue; + } + if (!ret) + break; + if (ret) + ret = seq_print_user_ip(s, mm, ip, sym_flags); + ret = trace_seq_puts(s, "\n"); + } + + if (mm) + mmput(mm); + return ret; +} + +int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) +{ + int ret; + + if (!ip) + return trace_seq_printf(s, "0"); + + if (sym_flags & TRACE_ITER_SYM_OFFSET) + ret = seq_print_sym_offset(s, "%s", ip); + else + ret = seq_print_sym_short(s, "%s", ip); + + if (!ret) + return 0; + + if (sym_flags & TRACE_ITER_SYM_ADDR) + ret = trace_seq_printf(s, " <" IP_FMT ">", ip); + return ret; +} + +/** + * trace_print_lat_fmt - print the irq, preempt and lockdep fields + * @s: trace seq struct to write to + * @entry: The trace entry field from the ring buffer + * + * Prints the generic fields of irqs off, in hard or softirq, preempt + * count. + */ +int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ + char hardsoft_irq; + char need_resched; + char irqs_off; + int hardirq; + int softirq; + int ret; + + hardirq = entry->flags & TRACE_FLAG_HARDIRQ; + softirq = entry->flags & TRACE_FLAG_SOFTIRQ; + + irqs_off = + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : + '.'; + need_resched = + (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 
'N' : '.'; + hardsoft_irq = + (hardirq && softirq) ? 'H' : + hardirq ? 'h' : + softirq ? 's' : + '.'; + + if (!trace_seq_printf(s, "%c%c%c", + irqs_off, need_resched, hardsoft_irq)) + return 0; + + if (entry->preempt_count) + ret = trace_seq_printf(s, "%x", entry->preempt_count); + else + ret = trace_seq_putc(s, '.'); + + return ret; +} + +static int +lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu) +{ + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + if (!trace_seq_printf(s, "%8.8s-%-5d %3d", + comm, entry->pid, cpu)) + return 0; + + return trace_print_lat_fmt(s, entry); +} + +static unsigned long preempt_mark_thresh = 100; + +static int +lat_print_timestamp(struct trace_seq *s, u64 abs_usecs, + unsigned long rel_usecs) +{ + return trace_seq_printf(s, " %4lldus%c: ", abs_usecs, + rel_usecs > preempt_mark_thresh ? '!' : + rel_usecs > 1 ? '+' : ' '); +} + +int trace_print_context(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + unsigned long long t = ns2usecs(iter->ts); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); + unsigned long secs = (unsigned long)t; + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", + comm, entry->pid, iter->cpu, secs, usec_rem); +} + +int trace_print_lat_context(struct trace_iterator *iter) +{ + u64 next_ts; + int ret; + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent, + *next_entry = trace_find_next_entry(iter, NULL, + &next_ts); + unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); + unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start); + unsigned long rel_usecs; + + if (!next_entry) + next_ts = iter->ts; + rel_usecs = ns2usecs(next_ts - iter->ts); + + if (verbose) { + char comm[TASK_COMM_LEN]; + + trace_find_cmdline(entry->pid, comm); + + ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx 
[%08llx]" + " %ld.%03ldms (+%ld.%03ldms): ", comm, + entry->pid, iter->cpu, entry->flags, + entry->preempt_count, iter->idx, + ns2usecs(iter->ts), + abs_usecs / USEC_PER_MSEC, + abs_usecs % USEC_PER_MSEC, + rel_usecs / USEC_PER_MSEC, + rel_usecs % USEC_PER_MSEC); + } else { + ret = lat_print_generic(s, entry, iter->cpu); + if (ret) + ret = lat_print_timestamp(s, abs_usecs, rel_usecs); + } + + return ret; +} + +static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; + +static int task_state_char(unsigned long state) +{ + int bit = state ? __ffs(state) + 1 : 0; + + return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?'; +} + +/** + * ftrace_find_event - find a registered event + * @type: the type of event to look for + * + * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held. + */ +struct trace_event *ftrace_find_event(int type) +{ + struct trace_event *event; + struct hlist_node *n; + unsigned key; + + key = type & (EVENT_HASHSIZE - 1); + + hlist_for_each_entry(event, n, &event_hash[key], node) { + if (event->type == type) + return event; + } + + return NULL; +} + +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? 
*/ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + +void trace_event_read_lock(void) +{ + down_read(&trace_event_mutex); +} + +void trace_event_read_unlock(void) +{ + up_read(&trace_event_mutex); +} + +/** + * register_ftrace_event - register output for an event type + * @event: the event type to register + * + * Event types are stored in a hash and this hash is used to + * find a way to print an event. If the @event->type is set + * then it will use that type, otherwise it will assign a + * type to use. + * + * If you assign your own type, please make sure it is added + * to the trace_type enum in trace.h, to avoid collisions + * with the dynamic types. + * + * Returns the event type number or zero on error. + */ +int register_ftrace_event(struct trace_event *event) +{ + unsigned key; + int ret = 0; + + down_write(&trace_event_mutex); + + if (WARN_ON(!event)) + goto out; + + if (WARN_ON(!event->funcs)) + goto out; + + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list = NULL; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { + printk(KERN_WARNING "Need to add type to trace.h\n"); + WARN_ON(1); + goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } + + if (event->funcs->trace == NULL) + event->funcs->trace = trace_nop_print; + if (event->funcs->raw == NULL) + event->funcs->raw = trace_nop_print; + if (event->funcs->hex == NULL) + event->funcs->hex = trace_nop_print; + if (event->funcs->binary == NULL) + event->funcs->binary = trace_nop_print; + + key = event->type & (EVENT_HASHSIZE - 1); + + hlist_add_head(&event->node, 
&event_hash[key]); + + ret = event->type; + out: + up_write(&trace_event_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(register_ftrace_event); + +/* + * Used by module code with the trace_event_mutex held for write. + */ +int __unregister_ftrace_event(struct trace_event *event) +{ + hlist_del(&event->node); + list_del(&event->list); + return 0; +} + +/** + * unregister_ftrace_event - remove a no longer used event + * @event: the event to remove + */ +int unregister_ftrace_event(struct trace_event *event) +{ + down_write(&trace_event_mutex); + __unregister_ftrace_event(event); + up_write(&trace_event_mutex); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ftrace_event); + +/* + * Standard events + */ + +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +/* TRACE_FN */ +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) { + if (!trace_seq_printf(s, " <-")) + goto partial; + if (!seq_print_ip_sym(s, + field->parent_ip, + flags)) + goto partial; + } + if (!trace_seq_printf(s, "\n")) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "%lx %lx\n", + field->ip, + field->parent_ip)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, + 
struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_HEX_FIELD_RET(s, field->ip); + SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct ftrace_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD_RET(s, field->ip); + SEQ_PUT_FIELD_RET(s, field->parent_ip); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event_functions trace_fn_funcs = { + .trace = trace_fn_trace, + .raw = trace_fn_raw, + .hex = trace_fn_hex, + .binary = trace_fn_bin, +}; + +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .funcs = &trace_fn_funcs, +}; + +/* TRACE_CTX an TRACE_WAKE */ +static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, + char *delim) +{ + struct ctx_switch_entry *field; + char comm[TASK_COMM_LEN]; + int S, T; + + + trace_assign_type(field, iter->ent); + + T = task_state_char(field->next_state); + S = task_state_char(field->prev_state); + trace_find_cmdline(field->next_pid, comm); + if (!trace_seq_printf(&iter->seq, + " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n", + field->prev_pid, + field->prev_prio, + S, delim, + field->next_cpu, + field->next_pid, + field->next_prio, + T, comm)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_print(iter, "==>"); +} + +static enum print_line_t trace_wake_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + return trace_ctxwake_print(iter, " +"); +} + +static int trace_ctxwake_raw(struct trace_iterator *iter, char S) +{ + struct ctx_switch_entry *field; + int T; + + trace_assign_type(field, iter->ent); + + if (!S) + S = 
task_state_char(field->prev_state); + T = task_state_char(field->next_state); + if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n", + field->prev_pid, + field->prev_prio, + S, + field->next_cpu, + field->next_pid, + field->next_prio, + T)) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_raw(iter, 0); +} + +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_raw(iter, '+'); +} + + +static int trace_ctxwake_hex(struct trace_iterator *iter, char S) +{ + struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; + int T; + + trace_assign_type(field, iter->ent); + + if (!S) + S = task_state_char(field->prev_state); + T = task_state_char(field->next_state); + + SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio); + SEQ_PUT_HEX_FIELD_RET(s, S); + SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu); + SEQ_PUT_HEX_FIELD_RET(s, field->next_pid); + SEQ_PUT_HEX_FIELD_RET(s, field->next_prio); + SEQ_PUT_HEX_FIELD_RET(s, T); + + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_hex(iter, 0); +} + +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + return trace_ctxwake_hex(iter, '+'); +} + +static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct ctx_switch_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + SEQ_PUT_FIELD_RET(s, field->prev_pid); + SEQ_PUT_FIELD_RET(s, field->prev_prio); + SEQ_PUT_FIELD_RET(s, field->prev_state); + SEQ_PUT_FIELD_RET(s, field->next_pid); + SEQ_PUT_FIELD_RET(s, field->next_prio); + 
SEQ_PUT_FIELD_RET(s, field->next_state); + + return TRACE_TYPE_HANDLED; +} + +static struct trace_event_functions trace_ctx_funcs = { + .trace = trace_ctx_print, + .raw = trace_ctx_raw, + .hex = trace_ctx_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .funcs = &trace_ctx_funcs, +}; + +static struct trace_event_functions trace_wake_funcs = { + .trace = trace_wake_print, + .raw = trace_wake_raw, + .hex = trace_wake_hex, + .binary = trace_ctxwake_bin, +}; + +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .funcs = &trace_wake_funcs, +}; + +/* TRACE_STACK */ + +static enum print_line_t trace_stack_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct stack_entry *field; + struct trace_seq *s = &iter->seq; + int i; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_puts(s, "\n")) + goto partial; + for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (!field->caller[i] || (field->caller[i] == ULONG_MAX)) + break; + if (!trace_seq_puts(s, " => ")) + goto partial; + + if (!seq_print_ip_sym(s, field->caller[i], flags)) + goto partial; + if (!trace_seq_puts(s, "\n")) + goto partial; + } + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_stack_funcs = { + .trace = trace_stack_print, +}; + +static struct trace_event trace_stack_event = { + .type = TRACE_STACK, + .funcs = &trace_stack_funcs, +}; + +/* TRACE_USER_STACK */ +static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct userstack_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_puts(s, "\n")) + goto partial; + + if (!seq_print_userip_objs(field, s, flags)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions 
trace_user_stack_funcs = { + .trace = trace_user_stack_print, +}; + +static struct trace_event trace_user_stack_event = { + .type = TRACE_USER_STACK, + .funcs = &trace_user_stack_funcs, +}; + +/* TRACE_BPRINT */ +static enum print_line_t +trace_bprint_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bprint_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bprint_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bprint_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_bprintf(s, field->fmt, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bprint_funcs = { + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; + +static struct trace_event trace_bprint_event = { + .type = TRACE_BPRINT, + .funcs = &trace_bprint_funcs, +}; + +/* TRACE_PRINT */ +static enum print_line_t trace_print_print(struct trace_iterator *iter, + int flags, struct trace_event *event) +{ + struct print_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_printf(s, ": %s", field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + 
struct print_entry *field; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_print_funcs = { + .trace = trace_print_print, + .raw = trace_print_raw, +}; + +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .funcs = &trace_print_funcs, +}; + + +static struct trace_event *events[] __initdata = { + &trace_fn_event, + &trace_ctx_event, + &trace_wake_event, + &trace_stack_event, + &trace_user_stack_event, + &trace_bprint_event, + &trace_print_event, + NULL +}; + +__init static int init_events(void) +{ + struct trace_event *event; + int i, ret; + + for (i = 0; events[i]; i++) { + event = events[i]; + + ret = register_ftrace_event(event); + if (!ret) { + printk(KERN_WARNING "event %d failed to register\n", + event->type); + WARN_ON_ONCE(1); + } + } + + return 0; +} +device_initcall(init_events); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h new file mode 100644 index 00000000..c038eba0 --- /dev/null +++ b/kernel/trace/trace_output.h @@ -0,0 +1,53 @@ +#ifndef __TRACE_EVENTS_H +#define __TRACE_EVENTS_H + +#include +#include "trace.h" + +extern enum print_line_t +trace_print_bprintk_msg_only(struct trace_iterator *iter); +extern enum print_line_t +trace_print_printk_msg_only(struct trace_iterator *iter); + +extern int +seq_print_ip_sym(struct trace_seq *s, unsigned long ip, + unsigned long sym_flags); +extern int seq_print_userip_objs(const struct userstack_entry *entry, + struct trace_seq *s, unsigned long sym_flags); +extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, + unsigned long ip, unsigned long sym_flags); + +extern int trace_print_context(struct trace_iterator *iter); +extern int trace_print_lat_context(struct trace_iterator *iter); + +extern void trace_event_read_lock(void); +extern void 
trace_event_read_unlock(void); +extern struct trace_event *ftrace_find_event(int type); + +extern enum print_line_t trace_nop_print(struct trace_iterator *iter, + int flags, struct trace_event *event); +extern int +trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); + +/* used by module unregistering */ +extern int __unregister_ftrace_event(struct trace_event *event); +extern struct rw_semaphore trace_event_mutex; + +#define MAX_MEMHEX_BYTES 8 +#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) + +#define SEQ_PUT_FIELD_RET(s, x) \ +do { \ + if (!trace_seq_putmem(s, &(x), sizeof(x))) \ + return TRACE_TYPE_PARTIAL_LINE; \ +} while (0) + +#define SEQ_PUT_HEX_FIELD_RET(s, x) \ +do { \ + BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \ + if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ + return TRACE_TYPE_PARTIAL_LINE; \ +} while (0) + +#endif + diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c new file mode 100644 index 00000000..1f06468a --- /dev/null +++ b/kernel/trace/trace_printk.c @@ -0,0 +1,344 @@ +/* + * trace binary printk + * + * Copyright (C) 2008 Lai Jiangshan + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +#ifdef CONFIG_MODULES + +/* + * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt + * which are queued on trace_bprintk_fmt_list. 
+ */ +static LIST_HEAD(trace_bprintk_fmt_list); + +/* serialize accesses to trace_bprintk_fmt_list */ +static DEFINE_MUTEX(btrace_mutex); + +struct trace_bprintk_fmt { + struct list_head list; + const char *fmt; +}; + +static inline struct trace_bprintk_fmt *lookup_format(const char *fmt) +{ + struct trace_bprintk_fmt *pos; + list_for_each_entry(pos, &trace_bprintk_fmt_list, list) { + if (!strcmp(pos->fmt, fmt)) + return pos; + } + return NULL; +} + +static +void hold_module_trace_bprintk_format(const char **start, const char **end) +{ + const char **iter; + char *fmt; + + mutex_lock(&btrace_mutex); + for (iter = start; iter < end; iter++) { + struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter); + if (tb_fmt) { + *iter = tb_fmt->fmt; + continue; + } + + tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); + if (tb_fmt) + fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); + if (tb_fmt && fmt) { + list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); + strcpy(fmt, *iter); + tb_fmt->fmt = fmt; + *iter = tb_fmt->fmt; + } else { + kfree(tb_fmt); + *iter = NULL; + } + } + mutex_unlock(&btrace_mutex); +} + +static int module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + if (mod->num_trace_bprintk_fmt) { + const char **start = mod->trace_bprintk_fmt_start; + const char **end = start + mod->num_trace_bprintk_fmt; + + if (val == MODULE_STATE_COMING) + hold_module_trace_bprintk_format(start, end); + } + return 0; +} + +/* + * The debugfs/tracing/printk_formats file maps the addresses with + * the ASCII formats that are used in the bprintk events in the + * buffer. For userspace tools to be able to decode the events from + * the buffer, they need to be able to map the address with the format. + * + * The addresses of the bprintk formats are in their own section + * __trace_printk_fmt. But for modules we copy them into a link list. 
+ * The code to print the formats and their addresses passes around the + * address of the fmt string. If the fmt address passed into the seq + * functions is within the kernel core __trace_printk_fmt section, then + * it simply uses the next pointer in the list. + * + * When the fmt pointer is outside the kernel core __trace_printk_fmt + * section, then we need to read the link list pointers. The trick is + * we pass the address of the string to the seq function just like + * we do for the kernel core formats. To get back the structure that + * holds the format, we simply use containerof() and then go to the + * next format in the list. + */ +static const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + struct trace_bprintk_fmt *mod_fmt; + + if (list_empty(&trace_bprintk_fmt_list)) + return NULL; + + /* + * v will point to the address of the fmt record from t_next + * v will be NULL from t_start. + * If this is the first pointer or called from start + * then we need to walk the list. + */ + if (!v || start_index == *pos) { + struct trace_bprintk_fmt *p; + + /* search the module list */ + list_for_each_entry(p, &trace_bprintk_fmt_list, list) { + if (start_index == *pos) + return &p->fmt; + start_index++; + } + /* pos > index */ + return NULL; + } + + /* + * v points to the address of the fmt field in the mod list + * structure that holds the module print format. 
+ */ + mod_fmt = container_of(v, typeof(*mod_fmt), fmt); + if (mod_fmt->list.next == &trace_bprintk_fmt_list) + return NULL; + + mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list); + + return &mod_fmt->fmt; +} + +static void format_mod_start(void) +{ + mutex_lock(&btrace_mutex); +} + +static void format_mod_stop(void) +{ + mutex_unlock(&btrace_mutex); +} + +#else /* !CONFIG_MODULES */ +__init static int +module_trace_bprintk_format_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +static inline const char ** +find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos) +{ + return NULL; +} +static inline void format_mod_start(void) { } +static inline void format_mod_stop(void) { } +#endif /* CONFIG_MODULES */ + + +__initdata_or_module static +struct notifier_block module_trace_bprintk_format_nb = { + .notifier_call = module_trace_bprintk_format_notify, +}; + +int __trace_bprintk(unsigned long ip, const char *fmt, ...) + { + int ret; + va_list ap; + + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vbprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_bprintk); + +int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap) + { + if (unlikely(!fmt)) + return 0; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vbprintk(ip, fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vbprintk); + +int __trace_printk(unsigned long ip, const char *fmt, ...) 
+{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_vprintk(ip, fmt, ap); + va_end(ap); + return ret; +} +EXPORT_SYMBOL_GPL(__trace_printk); + +int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap) +{ + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + return trace_vprintk(ip, fmt, ap); +} +EXPORT_SYMBOL_GPL(__ftrace_vprintk); + +static const char **find_next(void *v, loff_t *pos) +{ + const char **fmt = v; + int start_index; + + start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; + + if (*pos < start_index) + return __start___trace_bprintk_fmt + *pos; + + return find_next_mod_format(start_index, v, fmt, pos); +} + +static void * +t_start(struct seq_file *m, loff_t *pos) +{ + format_mod_start(); + return find_next(NULL, pos); +} + +static void *t_next(struct seq_file *m, void * v, loff_t *pos) +{ + (*pos)++; + return find_next(v, pos); +} + +static int t_show(struct seq_file *m, void *v) +{ + const char **fmt = v; + const char *str = *fmt; + int i; + + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); + + /* + * Tabs and new lines need to be converted. 
+ */ + for (i = 0; str[i]; i++) { + switch (str[i]) { + case '\n': + seq_puts(m, "\\n"); + break; + case '\t': + seq_puts(m, "\\t"); + break; + case '\\': + seq_puts(m, "\\"); + break; + case '"': + seq_puts(m, "\\\""); + break; + default: + seq_putc(m, str[i]); + } + } + seq_puts(m, "\"\n"); + + return 0; +} + +static void t_stop(struct seq_file *m, void *p) +{ + format_mod_stop(); +} + +static const struct seq_operations show_format_seq_ops = { + .start = t_start, + .next = t_next, + .show = t_show, + .stop = t_stop, +}; + +static int +ftrace_formats_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &show_format_seq_ops); +} + +static const struct file_operations ftrace_formats_fops = { + .open = ftrace_formats_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static __init int init_trace_printk_function_export(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; + + trace_create_file("printk_formats", 0444, d_tracer, + NULL, &ftrace_formats_fops); + + return 0; +} + +fs_initcall(init_trace_printk_function_export); + +static __init int init_trace_printk(void) +{ + return register_module_notifier(&module_trace_bprintk_format_nb); +} + +early_initcall(init_trace_printk); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c new file mode 100644 index 00000000..7e62c0a1 --- /dev/null +++ b/kernel/trace/trace_sched_switch.c @@ -0,0 +1,249 @@ +/* + * trace context switch + * + * Copyright (C) 2007 Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *ctx_trace; +static int __read_mostly tracer_enabled; +static int sched_ref; +static DEFINE_MUTEX(sched_register_mutex); +static int sched_stopped; + + +void +tracing_sched_switch_trace(struct trace_array *tr, + struct task_struct *prev, + struct task_struct *next, + unsigned long flags, int pc) +{ + 
struct ftrace_event_call *call = &event_context_switch; + struct ring_buffer *buffer = tr->buffer; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + + event = trace_buffer_lock_reserve(buffer, TRACE_CTX, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = prev->pid; + entry->prev_prio = prev->prio; + entry->prev_state = prev->state; + entry->next_pid = next->pid; + entry->next_prio = next->prio; + entry->next_state = next->state; + entry->next_cpu = task_cpu(next); + + if (!filter_check_discard(call, entry, buffer, event)) + trace_buffer_unlock_commit(buffer, event, flags, pc); +} + +static void +probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) +{ + struct trace_array_cpu *data; + unsigned long flags; + int cpu; + int pc; + + if (unlikely(!sched_ref)) + return; + + tracing_record_cmdline(prev); + tracing_record_cmdline(next); + + if (!tracer_enabled || sched_stopped) + return; + + pc = preempt_count(); + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = ctx_trace->data[cpu]; + + if (likely(!atomic_read(&data->disabled))) + tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); + + local_irq_restore(flags); +} + +void +tracing_sched_wakeup_trace(struct trace_array *tr, + struct task_struct *wakee, + struct task_struct *curr, + unsigned long flags, int pc) +{ + struct ftrace_event_call *call = &event_wakeup; + struct ring_buffer_event *event; + struct ctx_switch_entry *entry; + struct ring_buffer *buffer = tr->buffer; + + event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, + sizeof(*entry), flags, pc); + if (!event) + return; + entry = ring_buffer_event_data(event); + entry->prev_pid = curr->pid; + entry->prev_prio = curr->prio; + entry->prev_state = curr->state; + entry->next_pid = wakee->pid; + entry->next_prio = wakee->prio; + entry->next_state = wakee->state; + entry->next_cpu = task_cpu(wakee); + + if 
(!filter_check_discard(call, entry, buffer, event)) + ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(tr->buffer, flags, 6, pc); + ftrace_trace_userstack(tr->buffer, flags, pc); +} + +static void +probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) +{ + struct trace_array_cpu *data; + unsigned long flags; + int cpu, pc; + + if (unlikely(!sched_ref)) + return; + + tracing_record_cmdline(current); + + if (!tracer_enabled || sched_stopped) + return; + + pc = preempt_count(); + local_irq_save(flags); + cpu = raw_smp_processor_id(); + data = ctx_trace->data[cpu]; + + if (likely(!atomic_read(&data->disabled))) + tracing_sched_wakeup_trace(ctx_trace, wakee, current, + flags, pc); + + local_irq_restore(flags); +} + +static int tracing_sched_register(void) +{ + int ret; + + ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup\n"); + return ret; + } + + ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint" + " probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = register_trace_sched_switch(probe_sched_switch, NULL); + if (ret) { + pr_info("sched trace: Couldn't activate tracepoint" + " probe to kernel_sched_switch\n"); + goto fail_deprobe_wake_new; + } + + return ret; +fail_deprobe_wake_new: + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); +fail_deprobe: + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); + return ret; +} + +static void tracing_sched_unregister(void) +{ + unregister_trace_sched_switch(probe_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); +} + +static void tracing_start_sched_switch(void) +{ + mutex_lock(&sched_register_mutex); + if (!(sched_ref++)) + tracing_sched_register(); + 
mutex_unlock(&sched_register_mutex); +} + +static void tracing_stop_sched_switch(void) +{ + mutex_lock(&sched_register_mutex); + if (!(--sched_ref)) + tracing_sched_unregister(); + mutex_unlock(&sched_register_mutex); +} + +void tracing_start_cmdline_record(void) +{ + tracing_start_sched_switch(); +} + +void tracing_stop_cmdline_record(void) +{ + tracing_stop_sched_switch(); +} + +/** + * tracing_start_sched_switch_record - start tracing context switches + * + * Turns on context switch tracing for a tracer: takes a reference on the sched probes, then increments tracer_enabled. Warns and does nothing if no trace array has been assigned. + */ +void tracing_start_sched_switch_record(void) +{ + if (unlikely(!ctx_trace)) { + WARN_ON(1); + return; + } + + tracing_start_sched_switch(); + + mutex_lock(&sched_register_mutex); + tracer_enabled++; + mutex_unlock(&sched_register_mutex); +} + +/** + * tracing_stop_sched_switch_record - stop tracing context switches + * + * Turns off context switch tracing for a tracer: decrements tracer_enabled (warning if it goes negative), then drops the reference on the sched probes. + */ +void tracing_stop_sched_switch_record(void) +{ + mutex_lock(&sched_register_mutex); + tracer_enabled--; + WARN_ON(tracer_enabled < 0); + mutex_unlock(&sched_register_mutex); + + tracing_stop_sched_switch(); +} + +/** + * tracing_sched_switch_assign_trace - assign a trace array for ctx switch + * @tr: trace array pointer to assign + * + * Some tracers might want to record the context switches in their + * trace. This function lets those tracers assign the trace array + * to use. 
+ */ +void tracing_sched_switch_assign_trace(struct trace_array *tr) +{ + ctx_trace = tr; +} + diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c new file mode 100644 index 00000000..f029dd4f --- /dev/null +++ b/kernel/trace/trace_sched_wakeup.c @@ -0,0 +1,626 @@ +/* + * trace task wakeup timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * Based on code from the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *wakeup_trace; +static int __read_mostly tracer_enabled; + +static struct task_struct *wakeup_task; +static int wakeup_cpu; +static int wakeup_current_cpu; +static unsigned wakeup_prio = -1; +static int wakeup_rt; + +static arch_spinlock_t wakeup_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +static void wakeup_reset(struct trace_array *tr); +static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); + +static int save_lat_flag; + +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + +#ifdef CONFIG_FUNCTION_TRACER + +/* + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. 
+ * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. + */ +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) +{ + long disabled; + int cpu; + + if (likely(!wakeup_task)) + return 0; + + *pc = preempt_count(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); + if (unlikely(disabled != 1)) + goto out; + + return 1; + +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} + +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); + local_irq_restore(flags); + + atomic_dec(&data->disabled); + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = wakeup_tracer_call, + .flags = FTRACE_OPS_FL_GLOBAL, +}; +#endif /* CONFIG_FUNCTION_TRACER */ + +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, 
int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc, ret = 0; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return 0; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + atomic_dec(&data->disabled); + preempt_enable_notrace(); + + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + atomic_dec(&data->disabled); + + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + 
else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + +/* + * Should this new latency be reported/recorded? + */ +static int report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) +{ + if (task != wakeup_task) + return; + + wakeup_current_cpu = cpu; +} + +static void notrace +probe_wakeup_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) +{ + struct trace_array_cpu *data; + cycle_t T0, T1, delta; + unsigned long flags; + long disabled; + int cpu; + int pc; + + tracing_record_cmdline(prev); + + if (unlikely(!tracer_enabled)) + return; + + /* + * When we start a new trace, we set wakeup_task to NULL + * and then set tracer_enabled = 1. We want to make sure + * that another CPU does not see the tracer_enabled = 1 + * and the wakeup_task with an older task, that might + * actually be the same as next. 
+ */ + smp_rmb(); + + if (next != wakeup_task) + return; + + pc = preempt_count(); + + /* disable local data, not wakeup_cpu data */ + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + if (likely(disabled != 1)) + goto out; + + local_irq_save(flags); + arch_spin_lock(&wakeup_lock); + + /* We could race with grabbing wakeup_lock */ + if (unlikely(!tracer_enabled || next != wakeup_task)) + goto out_unlock; + + /* The task we are waiting for is waking up */ + data = wakeup_trace->data[wakeup_cpu]; + + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); + + T0 = data->preempt_timestamp; + T1 = ftrace_now(cpu); + delta = T1-T0; + + if (!report_latency(delta)) + goto out_unlock; + + if (likely(!is_tracing_stopped())) { + tracing_max_latency = delta; + update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu); + } + +out_unlock: + __wakeup_reset(wakeup_trace); + arch_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +out: + atomic_dec(&wakeup_trace->data[cpu]->disabled); +} + +static void __wakeup_reset(struct trace_array *tr) +{ + wakeup_cpu = -1; + wakeup_prio = -1; + + if (wakeup_task) + put_task_struct(wakeup_task); + + wakeup_task = NULL; +} + +static void wakeup_reset(struct trace_array *tr) +{ + unsigned long flags; + + tracing_reset_online_cpus(tr); + + local_irq_save(flags); + arch_spin_lock(&wakeup_lock); + __wakeup_reset(tr); + arch_spin_unlock(&wakeup_lock); + local_irq_restore(flags); +} + +static void +probe_wakeup(void *ignore, struct task_struct *p, int success) +{ + struct trace_array_cpu *data; + int cpu = smp_processor_id(); + unsigned long flags; + long disabled; + int pc; + + if (likely(!tracer_enabled)) + return; + + tracing_record_cmdline(p); + tracing_record_cmdline(current); + + if ((wakeup_rt && !rt_task(p)) || + p->prio >= wakeup_prio || + p->prio >= current->prio) + return; + + pc = preempt_count(); + disabled 
= atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + if (unlikely(disabled != 1)) + goto out; + + /* interrupts should be off from try_to_wake_up */ + arch_spin_lock(&wakeup_lock); + + /* check for races. */ + if (!tracer_enabled || p->prio >= wakeup_prio) + goto out_locked; + + /* reset the trace */ + __wakeup_reset(wakeup_trace); + + wakeup_cpu = task_cpu(p); + wakeup_current_cpu = wakeup_cpu; + wakeup_prio = p->prio; + + wakeup_task = p; + get_task_struct(wakeup_task); + + local_save_flags(flags); + + data = wakeup_trace->data[wakeup_cpu]; + data->preempt_timestamp = ftrace_now(cpu); + tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); + + /* + * We must be careful in using CALLER_ADDR2. But since wake_up + * is not called by an assembly function (where as schedule is) + * it should be safe to use it here. + */ + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + +out_locked: + arch_spin_unlock(&wakeup_lock); +out: + atomic_dec(&wakeup_trace->data[cpu]->disabled); +} + +/* Attach the wakeup tracer's four tracepoint probes, reset the trace, then start the function (or graph) tracer. */ +static void start_wakeup_tracer(struct trace_array *tr) +{ + int ret; + + ret = register_trace_sched_wakeup(probe_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint probe to kernel_sched_wakeup\n"); + return; + } + + ret = register_trace_sched_wakeup_new(probe_wakeup, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint probe to kernel_sched_wakeup_new\n"); + goto fail_deprobe; + } + + ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL); + if (ret) { + pr_info("sched trace: Couldn't activate tracepoint probe to kernel_sched_switch\n"); + goto fail_deprobe_wake_new; + } + + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); + if (ret) { + pr_info("wakeup trace: Couldn't activate tracepoint probe to kernel_sched_migrate_task\n"); + goto fail_deprobe_sched_switch; + } + + wakeup_reset(tr); + + /* + * Don't let the tracer_enabled = 1 show up before + * the wakeup_task is 
reset. This may be overkill since + * wakeup_reset does a spin_unlock after setting the + * wakeup_task to NULL, but I want to be safe. + * This is a slow path anyway. + */ + smp_wmb(); + + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); + + return; + /* Error unwind: detach the probes registered so far, newest first. */ +fail_deprobe_sched_switch: + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); +fail_deprobe_wake_new: + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); +fail_deprobe: + unregister_trace_sched_wakeup(probe_wakeup, NULL); +} + +static void stop_wakeup_tracer(struct trace_array *tr) +{ + tracer_enabled = 0; + stop_func_tracer(is_graph()); + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); + unregister_trace_sched_wakeup(probe_wakeup, NULL); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); +} + +static int __wakeup_tracer_init(struct trace_array *tr) +{ + save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; + trace_flags |= TRACE_ITER_LATENCY_FMT; + + tracing_max_latency = 0; + wakeup_trace = tr; + start_wakeup_tracer(tr); + return 0; +} + +static int wakeup_tracer_init(struct trace_array *tr) +{ + wakeup_rt = 0; + return __wakeup_tracer_init(tr); +} + +static int wakeup_rt_tracer_init(struct trace_array *tr) +{ + wakeup_rt = 1; + return __wakeup_tracer_init(tr); +} + +static void wakeup_tracer_reset(struct trace_array *tr) +{ + stop_wakeup_tracer(tr); + /* make sure we put back any tasks we are tracing */ + wakeup_reset(tr); + + if (!save_lat_flag) + trace_flags &= ~TRACE_ITER_LATENCY_FMT; +} + +static void wakeup_tracer_start(struct trace_array *tr) +{ + wakeup_reset(tr); + tracer_enabled = 1; +} + +static void wakeup_tracer_stop(struct trace_array *tr) +{ + tracer_enabled = 0; +} + +static struct tracer wakeup_tracer __read_mostly = +{ + .name = "wakeup", + .init = wakeup_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .print_max = 1, + .print_header = wakeup_print_header, + .print_line = 
wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = 1, +}; + +static struct tracer wakeup_rt_tracer __read_mostly = +{ + .name = "wakeup_rt", + .init = wakeup_rt_tracer_init, + .reset = wakeup_tracer_reset, + .start = wakeup_tracer_start, + .stop = wakeup_tracer_stop, + .wait_pipe = poll_wait_pipe, + .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_wakeup, +#endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, + .use_max_tr = 1, +}; + +__init static int init_wakeup_tracer(void) +{ + int ret; + + ret = register_tracer(&wakeup_tracer); + if (ret) + return ret; + + ret = register_tracer(&wakeup_rt_tracer); + if (ret) + return ret; + + return 0; +} +device_initcall(init_wakeup_tracer); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c new file mode 100644 index 00000000..288541f9 --- /dev/null +++ b/kernel/trace/trace_selftest.c @@ -0,0 +1,931 @@ +/* Include in trace.c */ + +#include +#include +#include +#include + +static inline int trace_valid_entry(struct trace_entry *entry) +{ + switch (entry->type) { + case TRACE_FN: + case TRACE_CTX: + case TRACE_WAKE: + case TRACE_STACK: + case TRACE_PRINT: + case TRACE_BRANCH: + case TRACE_GRAPH_ENT: + case TRACE_GRAPH_RET: + return 1; + } + return 0; +} + +static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +{ + struct ring_buffer_event *event; + struct trace_entry *entry; + unsigned int loops = 0; + + while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { + entry = ring_buffer_event_data(event); + + /* + * The ring buffer is a size of trace_buf_size, if + * we loop more than the size, there's something wrong + * with 
the ring buffer. + */ + if (loops++ > trace_buf_size) { + printk(KERN_CONT ".. bad ring buffer "); + goto failed; + } + if (!trace_valid_entry(entry)) { + printk(KERN_CONT ".. invalid entry %d ", + entry->type); + goto failed; + } + } + return 0; + + failed: + /* disable tracing */ + tracing_disabled = 1; + printk(KERN_CONT ".. corrupted trace buffer .. "); + return -1; +} + +/* + * Test the trace buffer to see if all the elements + * are still sane. + */ +static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +{ + unsigned long flags, cnt = 0; + int cpu, ret = 0; + + /* Don't allow flipping of max traces now */ + local_irq_save(flags); + arch_spin_lock(&ftrace_max_lock); + + cnt = ring_buffer_entries(tr->buffer); + + /* + * The trace_test_buffer_cpu runs a while loop to consume all data. + * If the calling tracer is broken, and is constantly filling + * the buffer, this will run forever, and hard lock the box. + * We disable the ring buffer while we do this test to prevent + * a hard lock up. 
+ */ + tracing_off(); + for_each_possible_cpu(cpu) { + ret = trace_test_buffer_cpu(tr, cpu); + if (ret) + break; + } + tracing_on(); + arch_spin_unlock(&ftrace_max_lock); + local_irq_restore(flags); + + if (count) + *count = cnt; + + return ret; +} + +static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret) +{ + printk(KERN_WARNING "Failed to init %s tracer, init returned %d\n", + trace->name, init_ret); +} +#ifdef CONFIG_FUNCTION_TRACER + +#ifdef CONFIG_DYNAMIC_FTRACE + +static int trace_selftest_test_probe1_cnt; +static void trace_selftest_test_probe1_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe1_cnt++; +} + +static int trace_selftest_test_probe2_cnt; +static void trace_selftest_test_probe2_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe2_cnt++; +} + +static int trace_selftest_test_probe3_cnt; +static void trace_selftest_test_probe3_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_probe3_cnt++; +} + +static int trace_selftest_test_global_cnt; +static void trace_selftest_test_global_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_global_cnt++; +} + +static int trace_selftest_test_dyn_cnt; +static void trace_selftest_test_dyn_func(unsigned long ip, + unsigned long pip) +{ + trace_selftest_test_dyn_cnt++; +} + +static struct ftrace_ops test_probe1 = { + .func = trace_selftest_test_probe1_func, +}; + +static struct ftrace_ops test_probe2 = { + .func = trace_selftest_test_probe2_func, +}; + +static struct ftrace_ops test_probe3 = { + .func = trace_selftest_test_probe3_func, +}; + +static struct ftrace_ops test_global = { + .func = trace_selftest_test_global_func, + .flags = FTRACE_OPS_FL_GLOBAL, +}; + +static void print_counts(void) +{ + printk("(%d %d %d %d %d) ", + trace_selftest_test_probe1_cnt, + trace_selftest_test_probe2_cnt, + trace_selftest_test_probe3_cnt, + trace_selftest_test_global_cnt, + trace_selftest_test_dyn_cnt); +} + +static void 
reset_counts(void) +{ + trace_selftest_test_probe1_cnt = 0; + trace_selftest_test_probe2_cnt = 0; + trace_selftest_test_probe3_cnt = 0; + trace_selftest_test_global_cnt = 0; + trace_selftest_test_dyn_cnt = 0; +} + +static int trace_selftest_ops(int cnt) +{ + int save_ftrace_enabled = ftrace_enabled; + struct ftrace_ops *dyn_ops; + char *func1_name; + char *func2_name; + int len1; + int len2; + int ret = -1; + + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace ops #%d: ", cnt); + + ftrace_enabled = 1; + reset_counts(); + + /* Handle PPC64 '.' name */ + func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2); + len1 = strlen(func1_name); + len2 = strlen(func2_name); + + /* + * Probe 1 will trace function 1. + * Probe 2 will trace function 2. + * Probe 3 will trace functions 1 and 2. + */ + ftrace_set_filter(&test_probe1, func1_name, len1, 1); + ftrace_set_filter(&test_probe2, func2_name, len2, 1); + ftrace_set_filter(&test_probe3, func1_name, len1, 1); + ftrace_set_filter(&test_probe3, func2_name, len2, 0); + + register_ftrace_function(&test_probe1); + register_ftrace_function(&test_probe2); + register_ftrace_function(&test_probe3); + register_ftrace_function(&test_global); + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 0) + goto out; + if (trace_selftest_test_probe3_cnt != 1) + goto out; + if (trace_selftest_test_global_cnt == 0) + goto out; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 1) + goto out; + if (trace_selftest_test_probe2_cnt != 1) + goto out; + if (trace_selftest_test_probe3_cnt != 2) + goto out; + + /* Add a dynamic probe */ + dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL); + if (!dyn_ops) { + printk("MEMORY ERROR "); + goto out; + } + + dyn_ops->func = trace_selftest_test_dyn_func; + + register_ftrace_function(dyn_ops); + + 
trace_selftest_test_global_cnt = 0; + + DYN_FTRACE_TEST_NAME(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 1) + goto out_free; + if (trace_selftest_test_probe3_cnt != 3) + goto out_free; + if (trace_selftest_test_global_cnt == 0) + goto out; + if (trace_selftest_test_dyn_cnt == 0) + goto out_free; + + DYN_FTRACE_TEST_NAME2(); + + print_counts(); + + if (trace_selftest_test_probe1_cnt != 2) + goto out_free; + if (trace_selftest_test_probe2_cnt != 2) + goto out_free; + if (trace_selftest_test_probe3_cnt != 4) + goto out_free; + + ret = 0; + out_free: + unregister_ftrace_function(dyn_ops); + kfree(dyn_ops); + + out: + /* Purposely unregister in the same order */ + unregister_ftrace_function(&test_probe1); + unregister_ftrace_function(&test_probe2); + unregister_ftrace_function(&test_probe3); + unregister_ftrace_function(&test_global); + + /* Make sure everything is off */ + reset_counts(); + DYN_FTRACE_TEST_NAME(); + DYN_FTRACE_TEST_NAME(); + + if (trace_selftest_test_probe1_cnt || + trace_selftest_test_probe2_cnt || + trace_selftest_test_probe3_cnt || + trace_selftest_test_global_cnt || + trace_selftest_test_dyn_cnt) + ret = -1; + + ftrace_enabled = save_ftrace_enabled; + + return ret; +} + +/* Test dynamic code modification and ftrace filters */ +int trace_selftest_startup_dynamic_tracing(struct tracer *trace, + struct trace_array *tr, + int (*func)(void)) +{ + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + unsigned long count; + char *func_name; + int ret; + + /* The ftrace test PASSED */ + printk(KERN_CONT "PASSED\n"); + pr_info("Testing dynamic ftrace: "); + + /* enable tracing, and record the filter function */ + ftrace_enabled = 1; + tracer_enabled = 1; + + /* passed in by parameter to fool gcc from optimizing */ + func(); + + /* + * Some archs *cough*PowerPC*cough* add characters to the + * start of the function names. 
We simply put a '*' to + * accommodate them. + */ + func_name = "*" __stringify(DYN_FTRACE_TEST_NAME); + + /* filter only on our function */ + ftrace_set_global_filter(func_name, strlen(func_name), 1); + + /* enable tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* we should have nothing in the buffer */ + ret = trace_test_buffer(tr, &count); + if (ret) + goto out; + + if (count) { + ret = -1; + printk(KERN_CONT ".. filter did not filter .. "); + goto out; + } + + /* call our function again */ + func(); + + /* sleep again */ + msleep(100); + + /* stop the tracing. */ + tracing_stop(); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + tracing_start(); + + /* we should only have one item */ + if (!ret && count != 1) { + trace->reset(tr); + printk(KERN_CONT ".. filter failed count=%ld ..", count); + ret = -1; + goto out; + } + + /* Test the ops with global tracing running */ + ret = trace_selftest_ops(1); + trace->reset(tr); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* Enable tracing on all functions again */ + ftrace_set_global_filter(NULL, 0, 1); + + /* Test the ops with global tracing off */ + if (!ret) + ret = trace_selftest_ops(2); + + return ret; +} +#else +# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * Simple verification test of ftrace function tracer. + * Enable ftrace, sleep 1/10 second, and then read the trace + * buffer to see if all is in order. 
+ */ +int +trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) +{ + int save_ftrace_enabled = ftrace_enabled; + int save_tracer_enabled = tracer_enabled; + unsigned long count; + int ret; + + /* make sure msleep has been recorded */ + msleep(1); + + /* start the tracing */ + ftrace_enabled = 1; + tracer_enabled = 1; + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + ftrace_enabled = 0; + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + ret = trace_selftest_startup_dynamic_tracing(trace, tr, + DYN_FTRACE_TEST_NAME); + + out: + ftrace_enabled = save_ftrace_enabled; + tracer_enabled = save_tracer_enabled; + + /* kill ftrace totally if we failed */ + if (ret) + ftrace_kill(); + + return ret; +} +#endif /* CONFIG_FUNCTION_TRACER */ + + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + +/* Maximum number of functions to trace before diagnosing a hang */ +#define GRAPH_MAX_FUNC_TEST 100000000 + +static void +__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); +static unsigned int graph_hang_thresh; + +/* Wrap the real function entry probe to avoid possible hanging */ +static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) +{ + /* This is harmlessly racy, we want to approximately detect a hang */ + if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { + ftrace_graph_stop(); + printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); + if (ftrace_dump_on_oops) + __ftrace_dump(false, DUMP_ALL); + return 0; + } + + return trace_graph_entry(trace); +} + +/* + * Pretty much the same than for the function tracer from which the selftest + * has been borrowed. 
+ */ +int +trace_selftest_startup_function_graph(struct tracer *trace, + struct trace_array *tr) +{ + int ret; + unsigned long count; + + /* + * Simulate the init() callback but we attach a watchdog callback + * to detect and recover from possible hangs + */ + tracing_reset_online_cpus(tr); + set_graph_array(tr); + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry_watchdog); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out; + } + tracing_start_cmdline_record(); + + /* Sleep for a 1/10 of a second */ + msleep(100); + + /* Have we just recovered from a hang? */ + if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { + tracing_selftest_disabled = true; + ret = -1; + goto out; + } + + tracing_stop(); + + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + + /* Don't test dynamic tracing, the function tracer already did */ + +out: + /* Stop it if we failed */ + if (ret) + ftrace_graph_stop(); + + return ret; +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + + +#ifdef CONFIG_IRQSOFF_TRACER +int +trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* reset the max latency */ + tracing_max_latency = 0; + /* disable interrupts for a bit */ + local_irq_disable(); + udelay(100); + local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs off latencies. + */ + trace->stop(tr); + /* stop the tracing. 
*/ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +int +trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* reset the max latency */ + tracing_max_latency = 0; + /* disable preemption for a bit */ + preempt_disable(); + udelay(100); + preempt_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max preempt off latencies. + */ + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_PREEMPT_TRACER */ + +#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER) +int +trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + unsigned long count; + int ret; + + /* + * Now that the big kernel lock is no longer preemptable, + * and this is called with the BKL held, it will always + * fail. If preemption is already disabled, simply + * pass the test. When the BKL is removed, or becomes + * preemptible again, we will once again test this, + * so keep it in. + */ + if (preempt_count()) { + printk(KERN_CONT "can not test ... force "); + return 0; + } + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + goto out_no_start; + } + + /* reset the max latency */ + tracing_max_latency = 0; + + /* disable preemption and interrupts for a bit */ + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + /* + * Stop the tracer to avoid a warning subsequent + * to buffer flipping failure because tracing_stop() + * disables the tr and max buffers, making flipping impossible + * in case of parallels max irqs/preempt off latencies. + */ + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + if (ret) + goto out; + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + goto out; + } + + /* do the test by disabling interrupts first this time */ + tracing_max_latency = 0; + tracing_start(); + trace->start(tr); + + preempt_disable(); + local_irq_disable(); + udelay(100); + preempt_enable(); + /* reverse the order of preempt vs irqs */ + local_irq_enable(); + + trace->stop(tr); + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (ret) + goto out; + + ret = trace_test_buffer(&max_tr, &count); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + goto out; + } + +out: + tracing_start(); +out_no_start: + trace->reset(tr); + tracing_max_latency = save_max; + + return ret; +} +#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ + +#ifdef CONFIG_NOP_TRACER +int +trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr) +{ + /* What could possibly go wrong? */ + return 0; +} +#endif + +#ifdef CONFIG_SCHED_TRACER +/* + * Thread body for the wakeup selftest: switch to SCHED_FIFO, signal the + * completion passed in @data, sleep until woken, then idle until + * kthread_should_stop() is true. Always returns 0. + */ +static int trace_wakeup_test_thread(void *data) +{ + /* Make this an RT thread; the priority doesn't need to be too high */ + static const struct sched_param param = { .sched_priority = 5 }; + struct completion *x = data; + + sched_setscheduler(current, SCHED_FIFO, &param); + + /* Make it know we have a new prio */ + complete(x); + + /* now go to sleep and let the test wake us up */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* we are awake, now wait to disappear */ + while (!kthread_should_stop()) { + /* + * This is an RT task, do short sleeps to let + * others run.
+ */ + msleep(100); + } + + return 0; +} + +/* + * Selftest for the wakeup tracer: spawn an RT thread, let it go to sleep, + * wake it while tracing and check that both the trace buffer and the max + * latency buffer are sane and that the latter is not empty. + * Returns 0 on success, non-zero on failure. + */ +int +trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) +{ + unsigned long save_max = tracing_max_latency; + struct task_struct *p; + struct completion isrt; + unsigned long count; + int ret; + + init_completion(&isrt); + + /* create a high prio thread */ + p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test"); + if (IS_ERR(p)) { + printk(KERN_CONT "Failed to create ftrace wakeup test thread "); + return -1; + } + + /* make sure the thread is running at an RT prio */ + wait_for_completion(&isrt); + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + /* + * Don't leak the test thread. Give it time to reach its + * sleep first (the same assumption the wakeup below relies + * on), so that the wakeup done by kthread_stop() is not lost. + */ + msleep(100); + kthread_stop(p); + return ret; + } + + /* reset the max latency */ + tracing_max_latency = 0; + + /* sleep to let the RT thread sleep too */ + msleep(100); + + /* + * Yes this is slightly racy. It is possible that for some + * strange reason the RT thread we created did not + * call schedule for 100ms after doing the completion, + * and we do a wakeup on a task that already is awake. + * But that is extremely unlikely, and the worst thing that + * happens in such a case, is that we disable tracing. + * Honestly, if this race does happen something is horribly + * wrong with the system. + */ + + wake_up_process(p); + + /* give a little time to let the thread wake up */ + msleep(100); + + /* stop the tracing. */ + tracing_stop(); + /* check both trace buffers */ + ret = trace_test_buffer(tr, NULL); + if (!ret) + ret = trace_test_buffer(&max_tr, &count); + + + trace->reset(tr); + tracing_start(); + + tracing_max_latency = save_max; + + /* kill the thread */ + kthread_stop(p); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_SCHED_TRACER */ + +#ifdef CONFIG_CONTEXT_SWITCH_TRACER +int +trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ + +#ifdef CONFIG_BRANCH_TRACER +int +trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) +{ + unsigned long count; + int ret; + + /* start the tracing */ + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* Sleep for a 1/10 of a second */ + msleep(100); + /* stop the tracing. */ + tracing_stop(); + /* check the trace buffer */ + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT ".. 
no entries found .."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_BRANCH_TRACER */ + diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c new file mode 100644 index 00000000..b4c475a0 --- /dev/null +++ b/kernel/trace/trace_selftest_dynamic.c @@ -0,0 +1,13 @@ +#include "trace.h" + +int DYN_FTRACE_TEST_NAME(void) +{ + /* used to call mcount */ + return 0; +} + +int DYN_FTRACE_TEST_NAME2(void) +{ + /* used to call mcount */ + return 0; +} diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c new file mode 100644 index 00000000..b0b53b8e --- /dev/null +++ b/kernel/trace/trace_stack.c @@ -0,0 +1,376 @@ +/* + * Copyright (C) 2008 Steven Rostedt + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "trace.h" + +#define STACK_TRACE_ENTRIES 500 + +static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = + { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; +static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; + +static struct stack_trace max_stack_trace = { + .max_entries = STACK_TRACE_ENTRIES, + .entries = stack_dump_trace, +}; + +static unsigned long max_stack_size; +static arch_spinlock_t max_stack_lock = + (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; + +static int stack_trace_disabled __read_mostly; +static DEFINE_PER_CPU(int, trace_active); +static DEFINE_MUTEX(stack_sysctl_mutex); + +int stack_tracer_enabled; +static int last_stack_tracer_enabled; + +static inline void check_stack(void) +{ + unsigned long this_size, flags; + unsigned long *p, *top, *start; + int i; + + this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); + this_size = THREAD_SIZE - this_size; + + if (this_size <= max_stack_size) + return; + + /* we do not handle interrupt stacks yet */ + if (!object_is_on_stack(&this_size)) + return; + + local_irq_save(flags); + arch_spin_lock(&max_stack_lock); + + /* a race could have already updated it */ + if (this_size 
<= max_stack_size) + goto out; + + max_stack_size = this_size; + + max_stack_trace.nr_entries = 0; + max_stack_trace.skip = 3; + + save_stack_trace(&max_stack_trace); + + /* + * Now find where in the stack these are. + */ + i = 0; + start = &this_size; + top = (unsigned long *) + (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); + + /* + * Loop through all the entries. One of the entries may + * for some reason be missed on the stack, so we may + * have to account for them. If they are all there, this + * loop will only happen once. This code only takes place + * on a new max, so it is far from a fast path. + */ + while (i < max_stack_trace.nr_entries) { + int found = 0; + + stack_dump_index[i] = this_size; + p = start; + + for (; p < top && i < max_stack_trace.nr_entries; p++) { + if (*p == stack_dump_trace[i]) { + this_size = stack_dump_index[i++] = + (top - p) * sizeof(unsigned long); + found = 1; + /* Start the search from here */ + start = p + 1; + } + } + + if (!found) + i++; + } + + out: + arch_spin_unlock(&max_stack_lock); + local_irq_restore(flags); +} + +static void +stack_trace_call(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + + if (unlikely(!ftrace_enabled || stack_trace_disabled)) + return; + + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + /* no atomic needed, we only modify this variable by this cpu */ + if (per_cpu(trace_active, cpu)++ != 0) + goto out; + + check_stack(); + + out: + per_cpu(trace_active, cpu)--; + /* prevent recursion in schedule */ + preempt_enable_notrace(); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = stack_trace_call, + .flags = FTRACE_OPS_FL_GLOBAL, +}; + +static ssize_t +stack_max_size_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long *ptr = filp->private_data; + char buf[64]; + int r; + + r = snprintf(buf, sizeof(buf), "%ld\n", *ptr); + if (r > sizeof(buf)) + r = sizeof(buf); + return simple_read_from_buffer(ubuf, count, 
ppos, buf, r); +} + +static ssize_t +stack_max_size_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *ppos) +{ + long *ptr = filp->private_data; + unsigned long val, flags; + char buf[64]; + int ret; + int cpu; + + if (count >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, count)) + return -EFAULT; + + buf[count] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. + */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + + arch_spin_lock(&max_stack_lock); + *ptr = val; + arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; + local_irq_restore(flags); + + return count; +} + +static const struct file_operations stack_max_size_fops = { + .open = tracing_open_generic, + .read = stack_max_size_read, + .write = stack_max_size_write, + .llseek = default_llseek, +}; + +static void * +__next(struct seq_file *m, loff_t *pos) +{ + long n = *pos - 1; + + if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) + return NULL; + + m->private = (void *)n; + return &m->private; +} + +static void * +t_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return __next(m, pos); +} + +static void *t_start(struct seq_file *m, loff_t *pos) +{ + int cpu; + + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + + arch_spin_lock(&max_stack_lock); + + if (*pos == 0) + return SEQ_START_TOKEN; + + return __next(m, pos); +} + +static void t_stop(struct seq_file *m, void *p) +{ + int cpu; + + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + + local_irq_enable(); +} + +static int trace_lookup_stack(struct seq_file *m, long i) +{ + unsigned long addr = stack_dump_trace[i]; + + return seq_printf(m, "%pS\n", (void 
*)addr); +} + +static void print_disabled(struct seq_file *m) +{ + seq_puts(m, "#\n" + "# Stack tracer disabled\n" + "#\n" + "# To enable the stack tracer, either add 'stacktrace' to the\n" + "# kernel command line\n" + "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n" + "#\n"); +} + +static int t_show(struct seq_file *m, void *v) +{ + long i; + int size; + + if (v == SEQ_START_TOKEN) { + seq_printf(m, " Depth Size Location" + " (%d entries)\n" + " ----- ---- --------\n", + max_stack_trace.nr_entries - 1); + + if (!stack_tracer_enabled && !max_stack_size) + print_disabled(m); + + return 0; + } + + i = *(long *)v; + + if (i >= max_stack_trace.nr_entries || + stack_dump_trace[i] == ULONG_MAX) + return 0; + + if (i+1 == max_stack_trace.nr_entries || + stack_dump_trace[i+1] == ULONG_MAX) + size = stack_dump_index[i]; + else + size = stack_dump_index[i] - stack_dump_index[i+1]; + + seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size); + + trace_lookup_stack(m, i); + + return 0; +} + +static const struct seq_operations stack_trace_seq_ops = { + .start = t_start, + .next = t_next, + .stop = t_stop, + .show = t_show, +}; + +static int stack_trace_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &stack_trace_seq_ops); +} + +static const struct file_operations stack_trace_fops = { + .open = stack_trace_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int +stack_trace_sysctl(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret; + + mutex_lock(&stack_sysctl_mutex); + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (ret || !write || + (last_stack_tracer_enabled == !!stack_tracer_enabled)) + goto out; + + last_stack_tracer_enabled = !!stack_tracer_enabled; + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + else + unregister_ftrace_function(&trace_ops); + + out: + mutex_unlock(&stack_sysctl_mutex); + return ret; +} + 
+static __init int enable_stacktrace(char *str) +{ + stack_tracer_enabled = 1; + last_stack_tracer_enabled = 1; + return 1; +} +__setup("stacktrace", enable_stacktrace); + +static __init int stack_trace_init(void) +{ + struct dentry *d_tracer; + + d_tracer = tracing_init_dentry(); + + trace_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); + + trace_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); + + if (stack_tracer_enabled) + register_ftrace_function(&trace_ops); + + return 0; +} + +device_initcall(stack_trace_init); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c new file mode 100644 index 00000000..96cffb26 --- /dev/null +++ b/kernel/trace/trace_stat.c @@ -0,0 +1,388 @@ +/* + * Infrastructure for statistic tracing (histogram output). + * + * Copyright (C) 2008-2009 Frederic Weisbecker + * + * Based on the code from trace_branch.c which is + * Copyright (C) 2008 Steven Rostedt + * + */ + + +#include +#include +#include +#include +#include "trace_stat.h" +#include "trace.h" + + +/* + * List of stat red-black nodes from a tracer + * We use a such tree to sort quickly the stat + * entries from the tracer. + */ +struct stat_node { + struct rb_node node; + void *stat; +}; + +/* A stat session is the stats output in one file */ +struct stat_session { + struct list_head session_list; + struct tracer_stat *ts; + struct rb_root stat_root; + struct mutex stat_mutex; + struct dentry *file; +}; + +/* All of the sessions currently in use. Each stat file embed one session */ +static LIST_HEAD(all_stat_sessions); +static DEFINE_MUTEX(all_stat_sessions_mutex); + +/* The root directory for all stat files */ +static struct dentry *stat_dir; + +/* + * Iterate through the rbtree using a post order traversal path + * to release the next node. + * It won't necessary release one at each iteration + * but it will at least advance closer to the next one + * to be released. 
+ */ +static struct rb_node *release_next(struct tracer_stat *ts, + struct rb_node *node) +{ + struct stat_node *snode; + struct rb_node *parent = rb_parent(node); + + if (node->rb_left) + return node->rb_left; + else if (node->rb_right) + return node->rb_right; + else { + if (!parent) + ; + else if (parent->rb_left == node) + parent->rb_left = NULL; + else + parent->rb_right = NULL; + + snode = container_of(node, struct stat_node, node); + if (ts->stat_release) + ts->stat_release(snode->stat); + kfree(snode); + + return parent; + } +} + +static void __reset_stat_session(struct stat_session *session) +{ + struct rb_node *node = session->stat_root.rb_node; + + while (node) + node = release_next(session->ts, node); + + session->stat_root = RB_ROOT; +} + +static void reset_stat_session(struct stat_session *session) +{ + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); +} + +static void destroy_session(struct stat_session *session) +{ + debugfs_remove(session->file); + __reset_stat_session(session); + mutex_destroy(&session->stat_mutex); + kfree(session); +} + +typedef int (*cmp_stat_t)(void *, void *); + +static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + struct stat_node *data; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + data->stat = stat; + + /* + * Figure out where to put new node + * This is a descendent sorting + */ + while (*new) { + struct stat_node *this; + int result; + + this = container_of(*new, struct stat_node, node); + result = cmp(data->stat, this->stat); + + parent = *new; + if (result >= 0) + new = &((*new)->rb_left); + else + new = &((*new)->rb_right); + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + return 0; +} + +/* + * For tracers that don't provide a stat_cmp callback. 
+ * This one will force an insertion as right-most node + * in the rbtree. + */ +static int dummy_cmp(void *p1, void *p2) +{ + return -1; +} + +/* + * Initialize the stat rbtree at each trace_stat file opening. + * All of these copies and sorting are required on all opening + * since the stats could have changed between two file sessions. + */ +static int stat_seq_init(struct stat_session *session) +{ + struct tracer_stat *ts = session->ts; + struct rb_root *root = &session->stat_root; + void *stat; + int ret = 0; + int i; + + mutex_lock(&session->stat_mutex); + __reset_stat_session(session); + + if (!ts->stat_cmp) + ts->stat_cmp = dummy_cmp; + + stat = ts->stat_start(ts); + if (!stat) + goto exit; + + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit; + + /* + * Iterate over the tracer stat entries and store them in an rbtree. + */ + for (i = 1; ; i++) { + stat = ts->stat_next(stat, i); + + /* End of insertion */ + if (!stat) + break; + + ret = insert_stat(root, stat, ts->stat_cmp); + if (ret) + goto exit_free_rbtree; + } + +exit: + mutex_unlock(&session->stat_mutex); + return ret; + +exit_free_rbtree: + __reset_stat_session(session); + mutex_unlock(&session->stat_mutex); + return ret; +} + + +static void *stat_seq_start(struct seq_file *s, loff_t *pos) +{ + struct stat_session *session = s->private; + struct rb_node *node; + int n = *pos; + int i; + + /* Prevent from tracer switch or rbtree modification */ + mutex_lock(&session->stat_mutex); + + /* If we are in the beginning of the file, print the headers */ + if (session->ts->stat_headers) { + if (n == 0) + return SEQ_START_TOKEN; + n--; + } + + node = rb_first(&session->stat_root); + for (i = 0; node && i < n; i++) + node = rb_next(node); + + return node; +} + +static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos) +{ + struct stat_session *session = s->private; + struct rb_node *node = p; + + (*pos)++; + + if (p == SEQ_START_TOKEN) + return rb_first(&session->stat_root); + + 
return rb_next(node); +} + +static void stat_seq_stop(struct seq_file *s, void *p) +{ + struct stat_session *session = s->private; + mutex_unlock(&session->stat_mutex); +} + +static int stat_seq_show(struct seq_file *s, void *v) +{ + struct stat_session *session = s->private; + struct stat_node *l = container_of(v, struct stat_node, node); + + if (v == SEQ_START_TOKEN) + return session->ts->stat_headers(s); + + return session->ts->stat_show(s, l->stat); +} + +static const struct seq_operations trace_stat_seq_ops = { + .start = stat_seq_start, + .next = stat_seq_next, + .stop = stat_seq_stop, + .show = stat_seq_show +}; + +/* The session stat is refilled and resorted at each stat file opening */ +static int tracing_stat_open(struct inode *inode, struct file *file) +{ + int ret; + struct seq_file *m; + struct stat_session *session = inode->i_private; + + ret = stat_seq_init(session); + if (ret) + return ret; + + ret = seq_open(file, &trace_stat_seq_ops); + if (ret) { + reset_stat_session(session); + return ret; + } + + m = file->private_data; + m->private = session; + return ret; +} + +/* + * Avoid consuming memory with our now useless rbtree. 
+ */ +static int tracing_stat_release(struct inode *i, struct file *f) +{ + struct stat_session *session = i->i_private; + + reset_stat_session(session); + + return seq_release(i, f); +} + +static const struct file_operations tracing_stat_fops = { + .open = tracing_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = tracing_stat_release +}; + +static int tracing_stat_init(void) +{ + struct dentry *d_tracing; + + d_tracing = tracing_init_dentry(); + + stat_dir = debugfs_create_dir("trace_stat", d_tracing); + if (!stat_dir) + pr_warning("Could not create debugfs " + "'trace_stat' entry\n"); + return 0; +} + +static int init_stat_file(struct stat_session *session) +{ + if (!stat_dir && tracing_stat_init()) + return -ENODEV; + + session->file = debugfs_create_file(session->ts->name, 0644, + stat_dir, + session, &tracing_stat_fops); + if (!session->file) + return -ENOMEM; + return 0; +} + +int register_stat_tracer(struct tracer_stat *trace) +{ + struct stat_session *session, *node; + int ret; + + if (!trace) + return -EINVAL; + + if (!trace->stat_start || !trace->stat_next || !trace->stat_show) + return -EINVAL; + + /* Already registered? 
*/ + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry(node, &all_stat_sessions, session_list) { + if (node->ts == trace) { + ret = -EINVAL; + goto out; + } + } + + /* + * Init the session. all_stat_sessions_mutex stays held until the + * session is on the list, so that two concurrent registrations of + * the same tracer_stat cannot both pass the check above. + */ + session = kzalloc(sizeof(*session), GFP_KERNEL); + if (!session) { + ret = -ENOMEM; + goto out; + } + + session->ts = trace; + INIT_LIST_HEAD(&session->session_list); + mutex_init(&session->stat_mutex); + + ret = init_stat_file(session); + if (ret) { + destroy_session(session); + goto out; + } + + /* Register; ret is 0 here because init_stat_file() succeeded */ + list_add_tail(&session->session_list, &all_stat_sessions); + +out: + mutex_unlock(&all_stat_sessions_mutex); + return ret; +} + +/* Remove the session registered for @trace, if any, and free it. */ +void unregister_stat_tracer(struct tracer_stat *trace) +{ + struct stat_session *node, *tmp; + + mutex_lock(&all_stat_sessions_mutex); + list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) { + if (node->ts == trace) { + list_del(&node->session_list); + destroy_session(node); + break; + } + } + mutex_unlock(&all_stat_sessions_mutex); +} diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h new file mode 100644 index 00000000..8f03914b --- /dev/null +++ b/kernel/trace/trace_stat.h @@ -0,0 +1,33 @@ +#ifndef __TRACE_STAT_H +#define __TRACE_STAT_H + +#include + +/* + * If you want to provide a stat file (one-shot statistics), fill + * an iterator with stat_start/stat_next and a stat_show callback. + * The other callbacks are optional. 
+ */ +struct tracer_stat { + /* The name of your stat file */ + const char *name; + /* Iteration over statistic entries */ + void *(*stat_start)(struct tracer_stat *trace); + void *(*stat_next)(void *prev, int idx); + /* Compare two entries for stats sorting */ + int (*stat_cmp)(void *p1, void *p2); + /* Print a stat entry */ + int (*stat_show)(struct seq_file *s, void *p); + /* Release an entry */ + void (*stat_release)(void *stat); + /* Print the headers of your stat entries */ + int (*stat_headers)(struct seq_file *s); +}; + +/* + * Destroy or create a stat file + */ +extern int register_stat_tracer(struct tracer_stat *trace); +extern void unregister_stat_tracer(struct tracer_stat *trace); + +#endif /* __TRACE_STAT_H */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c new file mode 100644 index 00000000..ee7b5a0b --- /dev/null +++ b/kernel/trace/trace_syscalls.c @@ -0,0 +1,690 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "trace_output.h" +#include "trace.h" + +static DEFINE_MUTEX(syscall_trace_lock); +static int sys_refcount_enter; +static int sys_refcount_exit; +static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); + +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type); +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type); + +static int syscall_enter_define_fields(struct ftrace_event_call *call); +static int syscall_exit_define_fields(struct ftrace_event_call *call); + +static struct list_head * +syscall_get_enter_fields(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->enter_fields; +} + +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct ftrace_event_class 
event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct ftrace_event_class event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .fields = LIST_HEAD_INIT(event_class_syscall_exit.fields), + .raw_init = init_syscall_trace, +}; + +extern struct syscall_metadata *__start_syscalls_metadata[]; +extern struct syscall_metadata *__stop_syscalls_metadata[]; + +static struct syscall_metadata **syscalls_metadata; + +#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, const char *name) +{ + /* + * Only compare after the "sys" prefix. Archs that use + * syscall wrappers may have syscalls symbols aliases prefixed + * with "SyS" instead of "sys", leading to an unwanted + * mismatch. + */ + return !strcmp(sym + 3, name + 3); +} +#endif + +static __init struct syscall_metadata * +find_syscall_meta(unsigned long syscall) +{ + struct syscall_metadata **start; + struct syscall_metadata **stop; + char str[KSYM_SYMBOL_LEN]; + + + start = __start_syscalls_metadata; + stop = __stop_syscalls_metadata; + kallsyms_lookup(syscall, NULL, NULL, NULL, str); + + if (arch_syscall_match_sym_name(str, "sys_ni_syscall")) + return NULL; + + for ( ; start < stop; start++) { + if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name)) + return *start; + } + return NULL; +} + +static struct syscall_metadata *syscall_nr_to_meta(int nr) +{ + if (!syscalls_metadata || nr >= NR_syscalls || nr < 0) + return NULL; + + return syscalls_metadata[nr]; +} + +enum print_line_t +print_syscall_enter(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *ent = iter->ent; + struct syscall_trace_enter *trace; + struct syscall_metadata 
*entry;
+	int i, ret, syscall;
+
+	trace = (typeof(trace))ent;
+	syscall = trace->nr;
+	entry = syscall_nr_to_meta(syscall);
+
+	if (!entry)
+		goto end;
+
+	if (entry->enter_event->event.type != ent->type) {
+		WARN_ON_ONCE(1);
+		goto end;
+	}
+
+	ret = trace_seq_printf(s, "%s(", entry->name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	for (i = 0; i < entry->nb_args; i++) {
+		/* parameter types */
+		if (trace_flags & TRACE_ITER_VERBOSE) {
+			ret = trace_seq_printf(s, "%s ", entry->types[i]);
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+		/* parameter values */
+		ret = trace_seq_printf(s, "%s: %lx%s", entry->args[i],
+				       trace->args[i],
+				       i == entry->nb_args - 1 ? "" : ", ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+	ret = trace_seq_putc(s, ')');
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+end:
+	ret = trace_seq_putc(s, '\n');
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+/*
+ * Format a syscall-exit record as "<name> -> 0x<ret>".
+ *
+ * Emits only a newline (and reports the line as handled) when no metadata
+ * is mapped for the recorded syscall number; returns TRACE_TYPE_UNHANDLED
+ * when the record type does not match the exit event of that syscall.
+ */
+enum print_line_t
+print_syscall_exit(struct trace_iterator *iter, int flags,
+		   struct trace_event *event)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *ent = iter->ent;
+	struct syscall_trace_exit *trace;
+	int syscall;
+	struct syscall_metadata *entry;
+	int ret;
+
+	trace = (typeof(trace))ent;
+	syscall = trace->nr;
+	entry = syscall_nr_to_meta(syscall);
+
+	if (!entry) {
+		/* single character: same helper the enter printer uses */
+		trace_seq_putc(s, '\n');
+		return TRACE_TYPE_HANDLED;
+	}
+
+	if (entry->exit_event->event.type != ent->type) {
+		WARN_ON_ONCE(1);
+		return TRACE_TYPE_UNHANDLED;
+	}
+
+	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+				trace->ret);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+/* Only declared here; SYSCALL_FIELD references it on a size mismatch. */
+extern char *__bad_type_size(void);
+
+/*
+ * Expands to the type-string, name-string, offset, size and signedness
+ * arguments of trace_define_field() for member @name of a local variable
+ * that must be called "trace".
+ */
+#define SYSCALL_FIELD(type, name)					\
+	sizeof(type) != sizeof(trace.name) ?				\
+		__bad_type_size() :					\
+		#type, #name, offsetof(typeof(trace), name),		\
+		sizeof(trace.name), is_signed_type(type)
+
+/*
+ * Write the print_fmt string of a syscall-enter event into @buf.
+ *
+ * With @len == 0 nothing is stored and only the length that would be
+ * needed (excluding the terminating NUL) is computed and returned.
+ */
+static
+int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
+{
+	int i;
+	int pos = 0;
+
+	/* When len=0, we just calculate the needed length */
+#define LEN_OR_ZERO (len ? len - pos : 0)
+
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+	for (i = 0; i < entry->nb_args; i++) {
+		pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s",
+				entry->args[i], sizeof(unsigned long),
+				i == entry->nb_args - 1 ? "" : ", ");
+	}
+	pos += snprintf(buf + pos, LEN_OR_ZERO, "\"");
+
+	for (i = 0; i < entry->nb_args; i++) {
+		pos += snprintf(buf + pos, LEN_OR_ZERO,
+				", ((unsigned long)(REC->%s))", entry->args[i]);
+	}
+
+#undef LEN_OR_ZERO
+
+	/* return the length of print_fmt */
+	return pos;
+}
+
+/*
+ * Attach a print_fmt to @call: a constant string for exit events, a
+ * kmalloc()ed string built from the argument names for enter events.
+ * Returns 0 on success or -ENOMEM.
+ */
+static int set_syscall_print_fmt(struct ftrace_event_call *call)
+{
+	char *print_fmt;
+	int len;
+	struct syscall_metadata *entry = call->data;
+
+	if (entry->enter_event != call) {
+		call->print_fmt = "\"0x%lx\", REC->ret";
+		return 0;
+	}
+
+	/* First: called with 0 length to calculate the needed length */
+	len = __set_enter_print_fmt(entry, NULL, 0);
+
+	print_fmt = kmalloc(len + 1, GFP_KERNEL);
+	if (!print_fmt)
+		return -ENOMEM;
+
+	/* Second: actually write the @print_fmt */
+	__set_enter_print_fmt(entry, print_fmt, len + 1);
+	call->print_fmt = print_fmt;
+
+	return 0;
+}
+
+/* Free the print_fmt of @call; only enter events own an allocated one. */
+static void free_syscall_print_fmt(struct ftrace_event_call *call)
+{
+	struct syscall_metadata *entry = call->data;
+
+	if (entry->enter_event == call)
+		kfree(call->print_fmt);
+}
+
+/*
+ * Define the filter fields of a syscall-enter event: "nr" followed by one
+ * unsigned-long-sized field per syscall argument.
+ * Returns 0 or the first error reported by trace_define_field().
+ */
+static int syscall_enter_define_fields(struct ftrace_event_call *call)
+{
+	struct syscall_trace_enter trace;
+	struct syscall_metadata *meta = call->data;
+	int ret;
+	int i;
+	int offset = offsetof(typeof(trace), args);
+
+	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < meta->nb_args; i++) {
+		ret = trace_define_field(call, meta->types[i],
+					 meta->args[i], offset,
+					 sizeof(unsigned long), 0,
+					 FILTER_OTHER);
+		/* do not let a later success hide an earlier failure */
+		if (ret)
+			break;
+		offset += sizeof(unsigned long);
+	}
+
+	return ret;
+}
+
+/* Define the filter fields of a syscall-exit event: "nr" and "ret". */
+static int syscall_exit_define_fields(struct ftrace_event_call *call)
+{
+	struct syscall_trace_exit trace;
+	int ret;
+
+	ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
+	if (ret)
+		return ret;
+
+	ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
+				 FILTER_OTHER);
+
+	return ret;
+}
+
+/*
+ * sys_enter probe for ftrace: when the syscall's bit is set in
+ * enabled_enter_syscalls, reserve a ring-buffer event and store the
+ * syscall number and its arguments in it.
+ */
+void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+{
+	struct syscall_trace_enter *entry;
+	struct syscall_metadata *sys_data;
+	struct ring_buffer_event *event;
+	struct ring_buffer *buffer;
+	int size;
+	int syscall_nr;
+
+	syscall_nr = syscall_get_nr(current, regs);
+	/* the bitmap only has NR_syscalls bits */
+	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+		return;
+	if (!test_bit(syscall_nr, enabled_enter_syscalls))
+		return;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+
+	event = trace_current_buffer_lock_reserve(&buffer,
+			sys_data->enter_event->event.type, size, 0, 0);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
+
+	if (!filter_current_check_discard(buffer, sys_data->enter_event,
+					  entry, event))
+		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+}
+
+/*
+ * sys_exit probe for ftrace: when the syscall's bit is set in
+ * enabled_exit_syscalls, record the syscall number and return value.
+ */
+void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+{
+	struct syscall_trace_exit *entry;
+	struct syscall_metadata *sys_data;
+	struct ring_buffer_event *event;
+	struct ring_buffer *buffer;
+	int syscall_nr;
+
+	syscall_nr = syscall_get_nr(current, regs);
+	/* the bitmap only has NR_syscalls bits */
+	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+		return;
+	if (!test_bit(syscall_nr, enabled_exit_syscalls))
+		return;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	event = trace_current_buffer_lock_reserve(&buffer,
+			sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->nr = syscall_nr;
+	entry->ret = syscall_get_return_value(current, regs);
+
+	if (!filter_current_check_discard(buffer, sys_data->exit_event,
+					  entry, event))
+		trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+}
+
+/*
+ * Enable ftrace tracing of one syscall's entry. The sys_enter probe is
+ * registered when the first syscall is enabled (sys_refcount_enter == 0).
+ * Returns 0, -ENOSYS for an unmapped syscall, or the registration error.
+ */
+int reg_event_syscall_enter(struct ftrace_event_call *call)
+{
+	int ret = 0;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
+		return -ENOSYS;
+	mutex_lock(&syscall_trace_lock);
+	if (!sys_refcount_enter)
+		ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
+	if (!ret) {
+		set_bit(num, enabled_enter_syscalls);
+		sys_refcount_enter++;
+	}
+	mutex_unlock(&syscall_trace_lock);
+	return ret;
+}
+
+/*
+ * Disable ftrace tracing of one syscall's entry; the sys_enter probe is
+ * unregistered when the refcount drops to zero.
+ */
+void unreg_event_syscall_enter(struct ftrace_event_call *call)
+{
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
+		return;
+	mutex_lock(&syscall_trace_lock);
+	sys_refcount_enter--;
+	clear_bit(num, enabled_enter_syscalls);
+	if (!sys_refcount_enter)
+		unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
+	mutex_unlock(&syscall_trace_lock);
+}
+
+/* Exit-side counterpart of reg_event_syscall_enter(). */
+int reg_event_syscall_exit(struct ftrace_event_call *call)
+{
+	int ret = 0;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
+		return -ENOSYS;
+	mutex_lock(&syscall_trace_lock);
+	if (!sys_refcount_exit)
+		ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
+	if (!ret) {
+		set_bit(num, enabled_exit_syscalls);
+		sys_refcount_exit++;
+	}
+	mutex_unlock(&syscall_trace_lock);
+	return ret;
+}
+
+/* Exit-side counterpart of unreg_event_syscall_enter(). */
+void unreg_event_syscall_exit(struct ftrace_event_call *call)
+{
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
+		return;
+	mutex_lock(&syscall_trace_lock);
+	sys_refcount_exit--;
+	clear_bit(num, enabled_exit_syscalls);
+	if (!sys_refcount_exit)
+		unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
+	mutex_unlock(&syscall_trace_lock);
+}
+
+/*
+ * Initialise one syscall event: reject syscalls whose metadata carries no
+ * valid number, build the print_fmt, then run trace_event_raw_init().
+ * Returns the value of trace_event_raw_init(), -ENOSYS or -ENOMEM.
+ */
+int init_syscall_trace(struct ftrace_event_call *call)
+{
+	int id;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+	if (num < 0 || num >= NR_syscalls) {
+		pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
+				((struct syscall_metadata *)call->data)->name);
+		return -ENOSYS;
+	}
+
+	if (set_syscall_print_fmt(call) < 0)
+		return -ENOMEM;
+
+	id = trace_event_raw_init(call);
+
+	if (id < 0) {
+		free_syscall_print_fmt(call);
+		return id;
+	}
+
+	return id;
+}
+
+/* Weak default: address of syscall @nr taken from sys_call_table. */
+unsigned long __init __weak arch_syscall_addr(int nr)
+{
+	return (unsigned long)sys_call_table[nr];
+}
+
+/*
+ * Build the syscall-number -> metadata table by resolving the address of
+ * every syscall through find_syscall_meta(). Syscalls without metadata
+ * keep a NULL slot.
+ */
+int __init init_ftrace_syscalls(void)
+{
+	struct syscall_metadata *meta;
+	unsigned long addr;
+	int i;
+
+	/* kcalloc() zeroes the table and checks the size multiplication */
+	syscalls_metadata = kcalloc(NR_syscalls, sizeof(*syscalls_metadata),
+				    GFP_KERNEL);
+	if (!syscalls_metadata) {
+		WARN_ON(1);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < NR_syscalls; i++) {
+		addr = arch_syscall_addr(i);
+		meta = find_syscall_meta(addr);
+		if (!meta)
+			continue;
+
+		meta->syscall_nr = i;
+		syscalls_metadata[i] = meta;
+	}
+
+	return 0;
+}
+core_initcall(init_ftrace_syscalls);
+
+#ifdef CONFIG_PERF_EVENTS
+
+static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls);
+static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
+static int sys_perf_refcount_enter;
+static int sys_perf_refcount_exit;
+
+/*
+ * sys_enter probe for perf: when the syscall's bit is set in
+ * enabled_perf_enter_syscalls, copy the syscall number and arguments into
+ * a perf trace buffer and submit it.
+ */
+static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+{
+	struct syscall_metadata *sys_data;
+	struct syscall_trace_enter *rec;
+	struct hlist_head *head;
+	int syscall_nr;
+	int rctx;
+	int size;
+
+	syscall_nr = syscall_get_nr(current, regs);
+	/* test_bit() must not index outside the NR_syscalls-bit bitmap */
+	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+		return;
+	if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
+		return;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	/* get the size after alignment with the u32 buffer size field */
+	size = sizeof(unsigned long) * sys_data->nb_args + sizeof(*rec);
+	size = ALIGN(size + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+		      "perf buffer not large enough"))
+		return;
+
+	rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
+				sys_data->enter_event->event.type, regs, &rctx);
+	if (!rec)
+		return;
+
+	rec->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args,
+			       (unsigned long *)&rec->args);
+
+	head = this_cpu_ptr(sys_data->enter_event->perf_events);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+}
+
+/*
+ * Enable perf tracing of one syscall's entry; registers the sys_enter
+ * probe on first use. Returns 0 or the registration error.
+ */
+int perf_sysenter_enable(struct ftrace_event_call *call)
+{
+	int ret = 0;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+	mutex_lock(&syscall_trace_lock);
+	if (!sys_perf_refcount_enter)
+		ret = register_trace_sys_enter(perf_syscall_enter, NULL);
+	if (ret) {
+		pr_info("event trace: Could not activate syscall entry trace point\n");
+	} else {
+		set_bit(num, enabled_perf_enter_syscalls);
+		sys_perf_refcount_enter++;
+	}
+	mutex_unlock(&syscall_trace_lock);
+	return ret;
+}
+
+/* Undo perf_sysenter_enable(); drops the probe with the last user. */
+void perf_sysenter_disable(struct ftrace_event_call *call)
+{
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+	mutex_lock(&syscall_trace_lock);
+	sys_perf_refcount_enter--;
+	clear_bit(num, enabled_perf_enter_syscalls);
+	if (!sys_perf_refcount_enter)
+		unregister_trace_sys_enter(perf_syscall_enter, NULL);
+	mutex_unlock(&syscall_trace_lock);
+}
+
+/*
+ * sys_exit probe for perf: when the syscall's bit is set in
+ * enabled_perf_exit_syscalls, submit the syscall number and return value.
+ */
+static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+{
+	struct syscall_metadata *sys_data;
+	struct syscall_trace_exit *rec;
+	struct hlist_head *head;
+	int syscall_nr;
+	int rctx;
+	int size;
+
+	syscall_nr = syscall_get_nr(current, regs);
+	/* test_bit() must not index outside the NR_syscalls-bit bitmap */
+	if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
+		return;
+	if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
+		return;
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	/* We can probably do that at build time */
+	size = ALIGN(sizeof(*rec) + sizeof(u32), sizeof(u64));
+	size -= sizeof(u32);
+
+	/*
+	 * Impossible, but be paranoid with the future
+	 * How to put this check outside runtime?
+	 */
+	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
+		"exit event has grown above perf buffer size"))
+		return;
+
+	rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
+				sys_data->exit_event->event.type, regs, &rctx);
+	if (!rec)
+		return;
+
+	rec->nr = syscall_nr;
+	rec->ret = syscall_get_return_value(current, regs);
+
+	head = this_cpu_ptr(sys_data->exit_event->perf_events);
+	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
+}
+
+/* Exit-side counterpart of perf_sysenter_enable(). */
+int perf_sysexit_enable(struct ftrace_event_call *call)
+{
+	int ret = 0;
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+	mutex_lock(&syscall_trace_lock);
+	if (!sys_perf_refcount_exit)
+		ret = register_trace_sys_exit(perf_syscall_exit, NULL);
+	if (ret) {
+		pr_info("event trace: Could not activate syscall exit trace point\n");
+	} else {
+		set_bit(num, enabled_perf_exit_syscalls);
+		sys_perf_refcount_exit++;
+	}
+	mutex_unlock(&syscall_trace_lock);
+	return ret;
+}
+
+/* Exit-side counterpart of perf_sysenter_disable(). */
+void perf_sysexit_disable(struct ftrace_event_call *call)
+{
+	int num;
+
+	num = ((struct syscall_metadata *)call->data)->syscall_nr;
+
+	mutex_lock(&syscall_trace_lock);
+	sys_perf_refcount_exit--;
+	clear_bit(num, enabled_perf_exit_syscalls);
+	if (!sys_perf_refcount_exit)
+		unregister_trace_sys_exit(perf_syscall_exit, NULL);
+	mutex_unlock(&syscall_trace_lock);
+}
+
+#endif /* CONFIG_PERF_EVENTS */
+
+/*
+ * Dispatch a (un)registration request for a syscall-enter event to the
+ * ftrace or perf helper; request types not listed return 0.
+ */
+static int syscall_enter_register(struct ftrace_event_call *event,
+				 enum trace_reg type)
+{
+	switch (type) {
+	case TRACE_REG_REGISTER:
+		return reg_event_syscall_enter(event);
+	case TRACE_REG_UNREGISTER:
+		unreg_event_syscall_enter(event);
+		return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+	case TRACE_REG_PERF_REGISTER:
+		return perf_sysenter_enable(event);
+	case TRACE_REG_PERF_UNREGISTER:
+		perf_sysenter_disable(event);
+		return 0;
+#endif
+	}
+	return 0;
+}
+
+/* Exit-side counterpart of syscall_enter_register(). */
+static int syscall_exit_register(struct ftrace_event_call *event,
+				 enum trace_reg type)
+{
+	switch (type) {
+	case TRACE_REG_REGISTER:
+		return reg_event_syscall_exit(event);
+	case TRACE_REG_UNREGISTER:
+		unreg_event_syscall_exit(event);
+		return 0;
+
+#ifdef CONFIG_PERF_EVENTS
+	case TRACE_REG_PERF_REGISTER:
+		return perf_sysexit_enable(event);
+	case TRACE_REG_PERF_UNREGISTER:
+		perf_sysexit_disable(event);
+		return 0;
+#endif
+	}
+	return 0;
+}
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
new file mode 100644
index 00000000..209b379a
--- /dev/null
+++ b/kernel/trace/trace_workqueue.c
@@ -0,0 +1,300 @@
+/*
+ * Workqueue statistical tracer.
+ *
+ * Copyright (C) 2008 Frederic Weisbecker
+ *
+ */
+
+
+#include <trace/events/workqueue.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <linux/kref.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+
+/* A cpu workqueue thread */
+struct cpu_workqueue_stats {
+	struct list_head            list;
+	struct kref                 kref;
+	int		            cpu;
+	pid_t			    pid;
+/* Can be inserted from interrupt or user context, need to be atomic */
+	atomic_t	            inserted;
+/*
+ *  Don't need to be atomic, works are serialized in a single workqueue thread
+ *  on a single CPU.
+ */
+	unsigned int		    executed;
+};
+
+/* List of workqueue threads on one cpu */
+struct workqueue_global_stats {
+	struct list_head	list;
+	spinlock_t		lock;
+};
+
+/* Don't need a global lock because allocated before the workqueues, and
+ * never freed.
+ */
+static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
+#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
+
+/* kref release callback: free the stats node embedding @kref. */
+static void cpu_workqueue_stat_free(struct kref *kref)
+{
+	kfree(container_of(kref, struct cpu_workqueue_stats, kref));
+}
+
+/*
+ * Insertion of a work: bump the "inserted" counter of the node whose pid
+ * matches @wq_thread, looked up on the first cpu of its allowed mask.
+ */
+static void
+probe_workqueue_insertion(void *ignore,
+			  struct task_struct *wq_thread,
+			  struct work_struct *work)
+{
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
+		if (node->pid == wq_thread->pid) {
+			atomic_inc(&node->inserted);
+			goto found;
+		}
+	}
+	pr_debug("trace_workqueue: entry not found\n");
+found:
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/*
+ * Execution of a work: bump the "executed" counter of the node whose pid
+ * matches @wq_thread.
+ */
+static void
+probe_workqueue_execution(void *ignore,
+			  struct task_struct *wq_thread,
+			  struct work_struct *work)
+{
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
+		if (node->pid == wq_thread->pid) {
+			node->executed++;
+			goto found;
+		}
+	}
+	pr_debug("trace_workqueue: entry not found\n");
+found:
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/*
+ * Creation of a cpu workqueue thread: allocate a stats node for it and
+ * append the node to the list of @cpu.
+ */
+static void probe_workqueue_creation(void *ignore,
+				     struct task_struct *wq_thread, int cpu)
+{
+	struct cpu_workqueue_stats *cws;
+	unsigned long flags;
+
+	/* a negative cpu must not be used to index the per-cpu data */
+	if (WARN_ON(cpu < 0))
+		return;
+
+	/* Workqueues are sometimes created in atomic context */
+	cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
+	if (!cws) {
+		pr_warning("trace_workqueue: not enough memory\n");
+		return;
+	}
+	INIT_LIST_HEAD(&cws->list);
+	kref_init(&cws->kref);
+	cws->cpu = cpu;
+	cws->pid = wq_thread->pid;
+
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+}
+
+/*
+ * Destruction of a cpu workqueue thread: unlink its stats node and drop
+ * the list's reference on it.
+ */
+static void
+probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
+{
+	/* Workqueue only execute on one cpu */
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
+							list) {
+		if (node->pid == wq_thread->pid) {
+			list_del(&node->list);
+			kref_put(&node->kref, cpu_workqueue_stat_free);
+			goto found;
+		}
+	}
+
+	pr_debug("trace_workqueue: don't find workqueue to destroy\n");
+found:
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+}
+
+/*
+ * Return the first stats node of @cpu with an extra reference taken, or
+ * NULL when the list of that cpu is empty.
+ */
+static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
+{
+	unsigned long flags;
+	struct cpu_workqueue_stats *ret = NULL;
+
+
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+
+	if (!list_empty(&workqueue_cpu_stat(cpu)->list)) {
+		ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
+				 struct cpu_workqueue_stats, list);
+		kref_get(&ret->kref);
+	}
+
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+	return ret;
+}
+
+/* stat_start callback: first node of the first possible cpu that has one. */
+static void *workqueue_stat_start(struct tracer_stat *trace)
+{
+	int cpu;
+	void *ret = NULL;
+
+	for_each_possible_cpu(cpu) {
+		ret = workqueue_stat_start_cpu(cpu);
+		if (ret)
+			return ret;
+	}
+	return NULL;
+}
+
+/*
+ * stat_next callback: node following @prev on the same cpu, else the first
+ * node of the next possible cpu that has one, else NULL. The returned node
+ * carries an extra reference.
+ *
+ * NOTE(review): @prev may already have been unlinked by
+ * probe_workqueue_destruction() while only its kref keeps it alive; its
+ * list pointers are then read here after list_del() -- confirm.
+ */
+static void *workqueue_stat_next(void *prev, int idx)
+{
+	struct cpu_workqueue_stats *prev_cws = prev;
+	struct cpu_workqueue_stats *ret;
+	int cpu = prev_cws->cpu;
+	unsigned long flags;
+
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
+		spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+		do {
+			cpu = cpumask_next(cpu, cpu_possible_mask);
+			if (cpu >= nr_cpu_ids)
+				return NULL;
+		} while (!(ret = workqueue_stat_start_cpu(cpu)));
+		return ret;
+	} else {
+		ret = list_entry(prev_cws->list.next,
+				 struct cpu_workqueue_stats, list);
+		kref_get(&ret->kref);
+	}
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
+	return ret;
+}
+
+/*
+ * stat_show callback: print cpu, inserted and executed counters and the
+ * thread name. Nothing is printed when the pid no longer resolves to a
+ * task.
+ */
+static int workqueue_stat_show(struct seq_file *s, void *p)
+{
+	struct cpu_workqueue_stats *cws = p;
+	struct pid *pid;
+	struct task_struct *tsk;
+
+	pid = find_get_pid(cws->pid);
+	if (pid) {
+		tsk = get_pid_task(pid, PIDTYPE_PID);
+		if (tsk) {
+			seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
+				   atomic_read(&cws->inserted), cws->executed,
+				   tsk->comm);
+			put_task_struct(tsk);
+		}
+		put_pid(pid);
+	}
+
+	return 0;
+}
+
+/* stat_release callback: drop the reference taken by start/next. */
+static void workqueue_stat_release(void *stat)
+{
+	struct cpu_workqueue_stats *node = stat;
+
+	kref_put(&node->kref, cpu_workqueue_stat_free);
+}
+
+/* stat_headers callback: column titles; constant text, so seq_puts(). */
+static int workqueue_stat_headers(struct seq_file *s)
+{
+	seq_puts(s, "# CPU  INSERTED  EXECUTED   NAME\n");
+	seq_puts(s, "# |      |         |          |\n");
+	return 0;
+}
+
+struct tracer_stat workqueue_stats __read_mostly = {
+	.name = "workqueues",
+	.stat_start = workqueue_stat_start,
+	.stat_next = workqueue_stat_next,
+	.stat_show = workqueue_stat_show,
+	.stat_release = workqueue_stat_release,
+	.stat_headers = workqueue_stat_headers
+};
+
+
+/* Register the "workqueues" stat tracer; returns 0 or the error code. */
+int __init stat_workqueue_init(void)
+{
+	int ret = register_stat_tracer(&workqueue_stats);
+
+	if (ret) {
+		pr_warning("Unable to register workqueue stat tracer\n");
+		return ret;
+	}
+
+	return 0;
+}
+fs_initcall(stat_workqueue_init);
+
+/*
+ * Workqueues are created very early, just after pre-smp initcalls.
+ * So we must register our tracepoints at this stage.
+ *
+ * Initialises the per-cpu lists and locks, then registers the four
+ * workqueue probes. If one registration fails, the probes registered
+ * before it are unregistered again in reverse order.
+ * Returns 0 on success or the error of the failing registration.
+ */
+int __init trace_workqueue_early_init(void)
+{
+	int ret, cpu;
+
+	for_each_possible_cpu(cpu) {
+		spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
+	}
+
+	ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
+	if (ret)
+		goto out;
+
+	ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
+	if (ret)
+		goto no_insertion;
+
+	ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
+	if (ret)
+		goto no_execution;
+
+	ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
+	if (ret)
+		goto no_creation;
+
+	return 0;
+
+no_creation:
+	unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
+no_execution:
+	unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
+no_insertion:
+	unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
+out:
+	pr_warning("trace_workqueue: unable to trace workqueues\n");
+
+	/* ret is non-zero on every path reaching this label */
+	return ret;
+}
+early_initcall(trace_workqueue_early_init);
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
new file mode 100644
index 00000000..b219f144
--- /dev/null
+++ b/kernel/tracepoint.c
@@ -0,0 +1,640 @@
+/*
+ * Copyright (C) 2008 Mathieu Desnoyers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/types.h>
+#include <linux/jhash.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/tracepoint.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/jump_label.h>
+
+extern struct tracepoint * const __start___tracepoints_ptrs[];
+extern struct tracepoint * const __stop___tracepoints_ptrs[];
+
+/* Set to 1 to enable tracepoint debug output */
+static const int tracepoint_debug;
+
+/*
+ * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
+ * builtin and module tracepoints and the hash table.
+ */
+static DEFINE_MUTEX(tracepoints_mutex);
+
+/*
+ * Tracepoint hash table, containing the active tracepoints.
+ * Protected by tracepoints_mutex.
+ */
+#define TRACEPOINT_HASH_BITS 6
+#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
+static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
+
+/*
+ * Note about RCU :
+ * It is used to delay the free of multiple probes array until a quiescent
+ * state is reached.
+ * Tracepoint entries modifications are protected by the tracepoints_mutex.
+ */
+struct tracepoint_entry {
+	struct hlist_node hlist;
+	struct tracepoint_func *funcs;
+	int refcount;	/* Number of times armed. 0 if disarmed. */
+	char name[0];
+};
+
+/*
+ * Header placed in front of every probes array. The union holds either the
+ * rcu_head used to free the array or the list linkage used while the array
+ * waits on old_probes.
+ */
+struct tp_probes {
+	union {
+		struct rcu_head rcu;
+		struct list_head list;
+	} u;
+	struct tracepoint_func probes[0];
+};
+
+/*
+ * Allocate room for @count probes behind a struct tp_probes header.
+ * Returns a pointer to the probes array (not to the header), or NULL.
+ */
+static inline void *allocate_probes(int count)
+{
+	struct tp_probes *p  = kmalloc(count * sizeof(struct tracepoint_func)
+			+ sizeof(struct tp_probes), GFP_KERNEL);
+	return p == NULL ? NULL : p->probes;
+}
+
+/* RCU callback: free the tp_probes allocation embedding @head. */
+static void rcu_free_old_probes(struct rcu_head *head)
+{
+	kfree(container_of(head, struct tp_probes, u.rcu));
+}
+
+/* Queue a probes array (NULL is accepted) for freeing via call_rcu_sched(). */
+static inline void release_probes(struct tracepoint_func *old)
+{
+	if (old) {
+		struct tp_probes *tp_probes = container_of(old,
+			struct tp_probes, probes[0]);
+		call_rcu_sched(&tp_probes->u.rcu, rcu_free_old_probes);
+	}
+}
+
+/* Print every probe of @entry when tracepoint_debug is set. */
+static void debug_print_probes(struct tracepoint_entry *entry)
+{
+	int i;
+
+	if (!tracepoint_debug || !entry->funcs)
+		return;
+
+	for (i = 0; entry->funcs[i].func; i++)
+		printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
+}
+
+/*
+ * Build a new probes array for @entry holding the old probes plus
+ * (@probe, @data) and install it.
+ *
+ * Returns the previous array (possibly NULL) for the caller to release, or
+ * ERR_PTR(-EINVAL) for a NULL @probe, ERR_PTR(-EEXIST) when the pair is
+ * already registered, ERR_PTR(-ENOMEM) on allocation failure.
+ */
+static struct tracepoint_func *
+tracepoint_entry_add_probe(struct tracepoint_entry *entry,
+			   void *probe, void *data)
+{
+	int nr_probes = 0;
+	struct tracepoint_func *old, *new;
+
+	/* a NULL func is the array terminator and cannot be stored */
+	if (WARN_ON(!probe))
+		return ERR_PTR(-EINVAL);
+
+	debug_print_probes(entry);
+	old = entry->funcs;
+	if (old) {
+		/* (N -> N+1), (N != 0, 1) probes */
+		for (nr_probes = 0; old[nr_probes].func; nr_probes++)
+			if (old[nr_probes].func == probe &&
+			    old[nr_probes].data == data)
+				return ERR_PTR(-EEXIST);
+	}
+	/* + 2 : one for new probe, one for NULL func */
+	new = allocate_probes(nr_probes + 2);
+	if (new == NULL)
+		return ERR_PTR(-ENOMEM);
+	if (old)
+		memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
+	new[nr_probes].func = probe;
+	new[nr_probes].data = data;
+	new[nr_probes + 1].func = NULL;
+	entry->refcount = nr_probes + 1;
+	entry->funcs = new;
+	debug_print_probes(entry);
+	return old;
+}
+
+/*
+ * Remove (@probe, @data) from @entry, or every probe when @probe is NULL.
+ *
+ * Returns the previous array for the caller to release, ERR_PTR(-ENOENT)
+ * when the entry has no probes, or ERR_PTR(-ENOMEM) when the shrunken
+ * array cannot be allocated (the entry is then left unchanged).
+ */
+static void *
+tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
+			      void *probe, void *data)
+{
+	int nr_probes = 0, nr_del = 0, i;
+	struct tracepoint_func *old, *new;
+
+	old = entry->funcs;
+
+	if (!old)
+		return ERR_PTR(-ENOENT);
+
+	debug_print_probes(entry);
+	/* (N -> M), (N > 1, M >= 0) probes */
+	for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+		if (!probe ||
+		    (old[nr_probes].func == probe &&
+		     old[nr_probes].data == data))
+			nr_del++;
+	}
+
+	if (nr_probes - nr_del == 0) {
+		/* N -> 0, (N > 1) */
+		entry->funcs = NULL;
+		entry->refcount = 0;
+		debug_print_probes(entry);
+		return old;
+	} else {
+		int j = 0;
+		/* N -> M, (N > 1, M > 0) */
+		/* + 1 for NULL */
+		new = allocate_probes(nr_probes - nr_del + 1);
+		if (new == NULL)
+			return ERR_PTR(-ENOMEM);
+		for (i = 0; old[i].func; i++)
+			if (probe &&
+			    (old[i].func != probe || old[i].data != data))
+				new[j++] = old[i];
+		new[nr_probes - nr_del].func = NULL;
+		entry->refcount = nr_probes - nr_del;
+		entry->funcs = new;
+	}
+	debug_print_probes(entry);
+	return old;
+}
+
+/*
+ * Get tracepoint if the tracepoint is present in the tracepoint hash table.
+ * Must be called with tracepoints_mutex held.
+ * Returns NULL if not present.
+ */
+static struct tracepoint_entry *get_tracepoint(const char *name)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct tracepoint_entry *e;
+	u32 hash = jhash(name, strlen(name), 0);
+
+	head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (!strcmp(name, e->name))
+			return e;
+	}
+	return NULL;
+}
+
+/*
+ * Add the tracepoint to the tracepoint hash table. Must be called with
+ * tracepoints_mutex held.
+ * Returns the new entry, ERR_PTR(-EEXIST) if the name is already present,
+ * or ERR_PTR(-ENOMEM).
+ */
+static struct tracepoint_entry *add_tracepoint(const char *name)
+{
+	struct hlist_head *head;
+	struct hlist_node *node;
+	struct tracepoint_entry *e;
+	size_t name_len = strlen(name) + 1;
+	u32 hash = jhash(name, name_len-1, 0);
+
+	head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
+	hlist_for_each_entry(e, node, head, hlist) {
+		if (!strcmp(name, e->name)) {
+			printk(KERN_NOTICE
+				"tracepoint %s busy\n", name);
+			return ERR_PTR(-EEXIST);	/* Already there */
+		}
+	}
+	/*
+	 * Using kmalloc here to allocate a variable length element. Could
+	 * cause some memory fragmentation if overused.
+	 */
+	e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
+	if (!e)
+		return ERR_PTR(-ENOMEM);
+	memcpy(&e->name[0], name, name_len);
+	e->funcs = NULL;
+	e->refcount = 0;
+	hlist_add_head(&e->hlist, head);
+	return e;
+}
+
+/*
+ * Remove the tracepoint from the tracepoint hash table. Must be called with
+ * tracepoints_mutex held.
+ */
+static inline void remove_tracepoint(struct tracepoint_entry *e)
+{
+	hlist_del(&e->hlist);
+	kfree(e);
+}
+
+/*
+ * Sets the probe callback corresponding to one tracepoint.
+ */
+static void set_tracepoint(struct tracepoint_entry **entry,
+	struct tracepoint *elem, int active)
+{
+	WARN_ON(strcmp((*entry)->name, elem->name) != 0);
+
+	if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
+		elem->regfunc();
+	else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
+		elem->unregfunc();
+
+	/*
+	 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
+	 * probe callbacks array is consistent before setting a pointer to it.
+	 * This array is referenced by __DO_TRACE from
+	 * include/linux/tracepoint.h. A matching smp_read_barrier_depends()
+	 * is used.
+	 */
+	rcu_assign_pointer(elem->funcs, (*entry)->funcs);
+	if (active && !jump_label_enabled(&elem->key))
+		jump_label_inc(&elem->key);
+	else if (!active && jump_label_enabled(&elem->key))
+		jump_label_dec(&elem->key);
+}
+
+/*
+ * Disable a tracepoint and its probe callback.
+ * Note: only waiting an RCU period after setting elem->call to the empty
+ * function ensures that the original callback is not used anymore. This is
+ * ensured by preempt_disable around the call site.
+ */
+static void disable_tracepoint(struct tracepoint *elem)
+{
+	/* Only an enabled tracepoint needs its unregfunc and key dropped. */
+	if (jump_label_enabled(&elem->key)) {
+		if (elem->unregfunc)
+			elem->unregfunc();
+		jump_label_dec(&elem->key);
+	}
+	rcu_assign_pointer(elem->funcs, NULL);
+}
+
+/**
+ * tracepoint_update_probe_range - Update a probe range
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Walks the tracepoints in [@begin, @end) under tracepoints_mutex. A
+ * tracepoint whose name is in the hash table gets that entry's probes,
+ * active if the entry's refcount is non-zero; any other one is disabled.
+ * A NULL @begin is a no-op.
+ */
+void tracepoint_update_probe_range(struct tracepoint * const *begin,
+				   struct tracepoint * const *end)
+{
+	struct tracepoint * const *cur = begin;
+
+	if (begin == NULL)
+		return;
+
+	mutex_lock(&tracepoints_mutex);
+	while (cur < end) {
+		struct tracepoint *tp = *cur++;
+		struct tracepoint_entry *armed = get_tracepoint(tp->name);
+
+		if (armed == NULL) {
+			disable_tracepoint(tp);
+			continue;
+		}
+		set_tracepoint(&armed, tp, armed->refcount != 0);
+	}
+	mutex_unlock(&tracepoints_mutex);
+}
+
+/*
+ * Re-synchronise every tracepoint with the hash table: first the ones
+ * built into the core kernel, then the ones living in modules.
+ */
+static void tracepoint_update_probes(void)
+{
+	tracepoint_update_probe_range(__start___tracepoints_ptrs,
+		__stop___tracepoints_ptrs);
+	module_update_tracepoints();
+}
+
+/*
+ * Add (@probe, @data) to the entry called @name, creating the entry when
+ * it does not exist yet. If adding fails and the entry has no probe armed,
+ * the entry is removed again.
+ * Returns the previous probes array or an ERR_PTR() value.
+ */
+static struct tracepoint_func *
+tracepoint_add_probe(const char *name, void *probe, void *data)
+{
+	struct tracepoint_func *prev;
+	struct tracepoint_entry *tp_entry = get_tracepoint(name);
+
+	if (tp_entry == NULL) {
+		tp_entry = add_tracepoint(name);
+		if (IS_ERR(tp_entry))
+			return (struct tracepoint_func *)tp_entry;
+	}
+
+	prev = tracepoint_entry_add_probe(tp_entry, probe, data);
+	if (IS_ERR(prev) && tp_entry->refcount == 0)
+		remove_tracepoint(tp_entry);
+
+	return prev;
+}
+
+/**
+ * tracepoint_probe_register -  Connect a probe to a tracepoint
+ * @name: tracepoint name
+ * @probe: probe handler
+ * @data: data pointer registered together with @probe
+ *
+ * Returns 0 if ok, error value on error.
+ * The probe address must at least be aligned on the architecture pointer size.
+ */
+int tracepoint_probe_register(const char *name, void *probe, void *data)
+{
+	struct tracepoint_func *prev;
+
+	mutex_lock(&tracepoints_mutex);
+	prev = tracepoint_add_probe(name, probe, data);
+	mutex_unlock(&tracepoints_mutex);
+
+	if (!IS_ERR(prev)) {
+		/* push the new array to the tracepoints, then free the old one */
+		tracepoint_update_probes();
+		release_probes(prev);
+		return 0;
+	}
+
+	return PTR_ERR(prev);
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register);
+
+/*
+ * Remove (@probe, @data) from the entry called @name and drop the entry
+ * from the hash table once its refcount reaches zero.
+ * Returns the previous probes array, ERR_PTR(-ENOENT) when no such entry
+ * exists, or the error of tracepoint_entry_remove_probe().
+ */
+static struct tracepoint_func *
+tracepoint_remove_probe(const char *name, void *probe, void *data)
+{
+	struct tracepoint_func *prev;
+	struct tracepoint_entry *tp_entry = get_tracepoint(name);
+
+	if (tp_entry == NULL)
+		return ERR_PTR(-ENOENT);
+
+	prev = tracepoint_entry_remove_probe(tp_entry, probe, data);
+	if (!IS_ERR(prev) && tp_entry->refcount == 0)
+		remove_tracepoint(tp_entry);
+
+	return prev;
+}
+
+/**
+ * tracepoint_probe_unregister -  Disconnect a probe from a tracepoint
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ * @data: data pointer that was registered together with @probe
+ *
+ * We do not need to call a synchronize_sched to make sure the probes have
+ * finished running before doing a module unload, because the module unload
+ * itself uses stop_machine(), which ensures that every preempt disabled
+ * section has finished.
+ */
+int tracepoint_probe_unregister(const char *name, void *probe, void *data)
+{
+	struct tracepoint_func *prev;
+
+	mutex_lock(&tracepoints_mutex);
+	prev = tracepoint_remove_probe(name, probe, data);
+	mutex_unlock(&tracepoints_mutex);
+
+	if (!IS_ERR(prev)) {
+		/* push the new array to the tracepoints, then free the old one */
+		tracepoint_update_probes();
+		release_probes(prev);
+		return 0;
+	}
+
+	return PTR_ERR(prev);
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
+
+static LIST_HEAD(old_probes);
+static int need_update;
+
+/*
+ * Flag that an update is pending and, when @old is a real probes array,
+ * park its tp_probes header on the old_probes list.
+ */
+static void tracepoint_add_old_probes(void *old)
+{
+	struct tp_probes *batch;
+
+	need_update = 1;
+	if (!old)
+		return;
+
+	batch = container_of(old, struct tp_probes, probes[0]);
+	list_add(&batch->u.list, &old_probes);
+}
+
+/**
+ * tracepoint_probe_register_noupdate -  register a probe but not connect
+ * @name: tracepoint name
+ * @probe: probe handler
+ * @data: data pointer registered together with @probe
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_register_noupdate(const char *name, void *probe,
+				       void *data)
+{
+	struct tracepoint_func *prev;
+	int err = 0;
+
+	mutex_lock(&tracepoints_mutex);
+	prev = tracepoint_add_probe(name, probe, data);
+	if (IS_ERR(prev))
+		err = PTR_ERR(prev);
+	else
+		tracepoint_add_old_probes(prev);
+	mutex_unlock(&tracepoints_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
+
+/**
+ * tracepoint_probe_unregister_noupdate -  remove a probe but not disconnect
+ * @name: tracepoint name
+ * @probe: probe function pointer
+ * @data: data pointer that was registered together with @probe
+ *
+ * caller must call tracepoint_probe_update_all()
+ */
+int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
+					 void *data)
+{
+	struct tracepoint_func *prev;
+	int err = 0;
+
+	mutex_lock(&tracepoints_mutex);
+	prev = tracepoint_remove_probe(name, probe, data);
+	if (IS_ERR(prev))
+		err = PTR_ERR(prev);
+	else
+		tracepoint_add_old_probes(prev);
+	mutex_unlock(&tracepoints_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_unregister_noupdate);
+
+/**
+ * 
tracepoint_probe_update_all -	update tracepoints
+ *
+ * Does nothing unless a *_noupdate call has set need_update. Otherwise the
+ * parked probes arrays are taken off old_probes under tracepoints_mutex,
+ * all tracepoints are updated, and each taken array is handed to
+ * call_rcu_sched() for freeing.
+ */
+void tracepoint_probe_update_all(void)
+{
+	LIST_HEAD(stale);
+	struct tp_probes *cur, *tmp;
+	int pending;
+
+	mutex_lock(&tracepoints_mutex);
+	pending = need_update;
+	if (pending) {
+		if (!list_empty(&old_probes))
+			list_replace_init(&old_probes, &stale);
+		need_update = 0;
+	}
+	mutex_unlock(&tracepoints_mutex);
+
+	if (!pending)
+		return;
+
+	tracepoint_update_probes();
+	list_for_each_entry_safe(cur, tmp, &stale, u.list) {
+		list_del(&cur->u.list);
+		call_rcu_sched(&cur->u.rcu, rcu_free_old_probes);
+	}
+}
+EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
+
+/**
+ * tracepoint_get_iter_range - Get a next tracepoint iterator given a range.
+ * @tracepoint: current tracepoints (in), next tracepoint (out)
+ * @begin: beginning of the range
+ * @end: end of the range
+ *
+ * Returns whether a next tracepoint has been found (1) or not (0).
+ * Will return the first tracepoint in the range if the input tracepoint is
+ * NULL.
+ */
+int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
+	struct tracepoint * const *begin, struct tracepoint * const *end)
+{
+	struct tracepoint * const *cur = *tracepoint;
+
+	if (cur == NULL) {
+		/* an empty range has no first element to hand out */
+		if (begin == end)
+			return 0;
+		*tracepoint = begin;
+		return 1;
+	}
+
+	return cur >= begin && cur < end;
+}
+EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
+
+/*
+ * Validate the position held in @iter: look in the core kernel range
+ * unless the iterator already points into a module, then fall back to the
+ * module tracepoints. The iterator is reset when nothing is found.
+ */
+static void tracepoint_get_iter(struct tracepoint_iter *iter)
+{
+	int found = 0;
+
+	if (iter->module == NULL)
+		found = tracepoint_get_iter_range(&iter->tracepoint,
+				__start___tracepoints_ptrs,
+				__stop___tracepoints_ptrs);
+	if (!found)
+		found = module_get_iter_tracepoints(iter);
+	if (!found)
+		tracepoint_iter_reset(iter);
+}
+
+/* Position @iter on the first tracepoint reachable from its state. */
+void tracepoint_iter_start(struct tracepoint_iter *iter)
+{
+	tracepoint_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_start);
+
+/* Advance @iter by one tracepoint. */
+void tracepoint_iter_next(struct tracepoint_iter *iter)
+{
+	/*
+	 * The blind increment may leave the current range; let
+	 * tracepoint_get_iter() re-validate the position and move on to
+	 * the tracepoints of following modules if necessary.
+	 */
+	iter->tracepoint++;
+	tracepoint_get_iter(iter);
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_next);
+
+/* Nothing to undo at the end of an iteration. */
+void tracepoint_iter_stop(struct tracepoint_iter *iter)
+{
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
+
+/* Put @iter back into its initial state (no module, no tracepoint). */
+void tracepoint_iter_reset(struct tracepoint_iter *iter)
+{
+	iter->tracepoint = NULL;
+	iter->module = NULL;
+}
+EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
+
+#ifdef CONFIG_MODULES
+
+/*
+ * Module notifier: refresh the probes of a module's tracepoints when the
+ * module is coming or going. Always returns 0.
+ */
+int tracepoint_module_notify(struct notifier_block *self,
+			     unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	if (val == MODULE_STATE_COMING || val == MODULE_STATE_GOING)
+		tracepoint_update_probe_range(mod->tracepoints_ptrs,
+			mod->tracepoints_ptrs + mod->num_tracepoints);
+
+	return 0;
+}
+
+struct notifier_block tracepoint_module_nb = {
+	.notifier_call = tracepoint_module_notify,
+	.priority = 0,
+};
+
+/* Hook tracepoint_module_nb into the module notifier chain. */
+static int init_tracepoints(void)
+{
+	return register_module_notifier(&tracepoint_module_nb);
+}
+__initcall(init_tracepoints);
+
+#endif /* CONFIG_MODULES */
+
+#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
+
+/* NB: reg/unreg are called while guarded with the tracepoints_mutex */
+static int sys_tracepoint_refcount;
+
+/*
+ * Take a reference on syscall tracing. The first reference sets
+ * TIF_SYSCALL_TRACEPOINT on every thread that has an mm.
+ */
+void syscall_regfunc(void)
+{
+	struct task_struct *leader, *task;
+	unsigned long irqflags;
+
+	if (sys_tracepoint_refcount == 0) {
+		read_lock_irqsave(&tasklist_lock, irqflags);
+		do_each_thread(leader, task) {
+			/* Skip kernel threads. */
+			if (task->mm)
+				set_tsk_thread_flag(task, TIF_SYSCALL_TRACEPOINT);
+		} while_each_thread(leader, task);
+		read_unlock_irqrestore(&tasklist_lock, irqflags);
+	}
+	sys_tracepoint_refcount++;
+}
+
+/*
+ * Drop a reference on syscall tracing. The last reference clears
+ * TIF_SYSCALL_TRACEPOINT on every thread.
+ */
+void syscall_unregfunc(void)
+{
+	struct task_struct *leader, *task;
+	unsigned long irqflags;
+
+	if (--sys_tracepoint_refcount != 0)
+		return;
+
+	read_lock_irqsave(&tasklist_lock, irqflags);
+	do_each_thread(leader, task) {
+		clear_tsk_thread_flag(task, TIF_SYSCALL_TRACEPOINT);
+	} while_each_thread(leader, task);
+	read_unlock_irqrestore(&tasklist_lock, irqflags);
+}
+#endif
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
new file mode 100644
index 00000000..24dc60d9
--- /dev/null
+++ b/kernel/tsacct.c
@@ -0,0 +1,154 @@
+/*
+ * tsacct.c - System accounting over taskstats interface
+ *
+ * Copyright (C) Jay Lan,
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * + */ + +#include +#include +#include +#include +#include +#include + +/* + * fill in basic accounting fields + */ +void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) +{ + const struct cred *tcred; + struct timespec uptime, ts; + u64 ac_etime; + + BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); + + /* calculate task elapsed time in timespec */ + do_posix_clock_monotonic_gettime(&uptime); + ts = timespec_sub(uptime, tsk->start_time); + /* rebase elapsed time to usec (should never be negative) */ + ac_etime = timespec_to_ns(&ts); + do_div(ac_etime, NSEC_PER_USEC); + stats->ac_etime = ac_etime; + stats->ac_btime = get_seconds() - ts.tv_sec; + if (thread_group_leader(tsk)) { + stats->ac_exitcode = tsk->exit_code; + if (tsk->flags & PF_FORKNOEXEC) + stats->ac_flag |= AFORK; + } + if (tsk->flags & PF_SUPERPRIV) + stats->ac_flag |= ASU; + if (tsk->flags & PF_DUMPCORE) + stats->ac_flag |= ACORE; + if (tsk->flags & PF_SIGNALED) + stats->ac_flag |= AXSIG; + stats->ac_nice = task_nice(tsk); + stats->ac_sched = tsk->policy; + stats->ac_pid = tsk->pid; + rcu_read_lock(); + tcred = __task_cred(tsk); + stats->ac_uid = tcred->uid; + stats->ac_gid = tcred->gid; + stats->ac_ppid = pid_alive(tsk) ? 
+ rcu_dereference(tsk->real_parent)->tgid : 0; + rcu_read_unlock(); + stats->ac_utime = cputime_to_usecs(tsk->utime); + stats->ac_stime = cputime_to_usecs(tsk->stime); + stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled); + stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled); + stats->ac_minflt = tsk->min_flt; + stats->ac_majflt = tsk->maj_flt; + + strncpy(stats->ac_comm, tsk->comm, sizeof(stats->ac_comm)); +} + + +#ifdef CONFIG_TASK_XACCT + +#define KB 1024 +#define MB (1024*KB) +/* + * fill in extended accounting fields + */ +void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) +{ + struct mm_struct *mm; + + /* convert pages-usec to Mbyte-usec */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + mm = get_task_mm(p); + if (mm) { + /* adjust to KB unit */ + stats->hiwater_rss = get_mm_hiwater_rss(mm) * PAGE_SIZE / KB; + stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; + mmput(mm); + } + stats->read_char = p->ioac.rchar; + stats->write_char = p->ioac.wchar; + stats->read_syscalls = p->ioac.syscr; + stats->write_syscalls = p->ioac.syscw; +#ifdef CONFIG_TASK_IO_ACCOUNTING + stats->read_bytes = p->ioac.read_bytes; + stats->write_bytes = p->ioac.write_bytes; + stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; +#else + stats->read_bytes = 0; + stats->write_bytes = 0; + stats->cancelled_write_bytes = 0; +#endif +} +#undef KB +#undef MB + +/** + * acct_update_integrals - update mm integral fields in task_struct + * @tsk: task_struct for accounting + */ +void acct_update_integrals(struct task_struct *tsk) +{ + if (likely(tsk->mm)) { + cputime_t time, dtime; + struct timeval value; + unsigned long flags; + u64 delta; + + local_irq_save(flags); + time = tsk->stime + tsk->utime; + dtime = cputime_sub(time, tsk->acct_timexpd); + jiffies_to_timeval(cputime_to_jiffies(dtime), &value); + delta = value.tv_sec; + delta = delta * USEC_PER_SEC + value.tv_usec; + + if (delta 
== 0) + goto out; + tsk->acct_timexpd = time; + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; + out: + local_irq_restore(flags); + } +} + +/** + * acct_clear_integrals - clear the mm integral fields in task_struct + * @tsk: task_struct whose accounting fields are cleared + */ +void acct_clear_integrals(struct task_struct *tsk) +{ + tsk->acct_timexpd = 0; + tsk->acct_rss_mem1 = 0; + tsk->acct_vm_mem1 = 0; +} +#endif diff --git a/kernel/uid16.c b/kernel/uid16.c new file mode 100644 index 00000000..51c6e89e --- /dev/null +++ b/kernel/uid16.c @@ -0,0 +1,230 @@ +/* + * Wrapper functions for 16bit uid back compatibility. All nicely tied + * together in the faint hope we can take them out in five years time. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +SYSCALL_DEFINE3(chown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) +{ + long ret = sys_chown(filename, low2highuid(user), low2highgid(group)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(3, ret, filename, user, group); + return ret; +} + +SYSCALL_DEFINE3(lchown16, const char __user *, filename, old_uid_t, user, old_gid_t, group) +{ + long ret = sys_lchown(filename, low2highuid(user), low2highgid(group)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(3, ret, filename, user, group); + return ret; +} + +SYSCALL_DEFINE3(fchown16, unsigned int, fd, old_uid_t, user, old_gid_t, group) +{ + long ret = sys_fchown(fd, low2highuid(user), low2highgid(group)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(3, ret, fd, user, group); + return ret; +} + +SYSCALL_DEFINE2(setregid16, old_gid_t, rgid, old_gid_t, egid) +{ + long ret = sys_setregid(low2highgid(rgid), low2highgid(egid)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(2, ret, rgid, egid); + return ret; +} + +SYSCALL_DEFINE1(setgid16, old_gid_t, gid) +{ + long ret = 
sys_setgid(low2highgid(gid)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(1, ret, gid); + return ret; +} + +SYSCALL_DEFINE2(setreuid16, old_uid_t, ruid, old_uid_t, euid) +{ + long ret = sys_setreuid(low2highuid(ruid), low2highuid(euid)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(2, ret, ruid, euid); + return ret; +} + +SYSCALL_DEFINE1(setuid16, old_uid_t, uid) +{ + long ret = sys_setuid(low2highuid(uid)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(1, ret, uid); + return ret; +} + +SYSCALL_DEFINE3(setresuid16, old_uid_t, ruid, old_uid_t, euid, old_uid_t, suid) +{ + long ret = sys_setresuid(low2highuid(ruid), low2highuid(euid), + low2highuid(suid)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(3, ret, ruid, euid, suid); + return ret; +} + +SYSCALL_DEFINE3(getresuid16, old_uid_t __user *, ruid, old_uid_t __user *, euid, old_uid_t __user *, suid) +{ + const struct cred *cred = current_cred(); + int retval; + + if (!(retval = put_user(high2lowuid(cred->uid), ruid)) && + !(retval = put_user(high2lowuid(cred->euid), euid))) + retval = put_user(high2lowuid(cred->suid), suid); + + return retval; +} + +SYSCALL_DEFINE3(setresgid16, old_gid_t, rgid, old_gid_t, egid, old_gid_t, sgid) +{ + long ret = sys_setresgid(low2highgid(rgid), low2highgid(egid), + low2highgid(sgid)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(3, ret, rgid, egid, sgid); + return ret; +} + + +SYSCALL_DEFINE3(getresgid16, old_gid_t __user *, rgid, old_gid_t __user *, egid, old_gid_t __user *, sgid) +{ + const struct cred *cred = current_cred(); + int retval; + + if (!(retval = put_user(high2lowgid(cred->gid), rgid)) && + !(retval = put_user(high2lowgid(cred->egid), egid))) + retval = put_user(high2lowgid(cred->sgid), sgid); + + return retval; +} + +SYSCALL_DEFINE1(setfsuid16, old_uid_t, uid) +{ + long ret = sys_setfsuid(low2highuid(uid)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(1, ret, uid); + return ret; +} 
+ +SYSCALL_DEFINE1(setfsgid16, old_gid_t, gid) +{ + long ret = sys_setfsgid(low2highgid(gid)); + /* avoid REGPARM breakage on x86: */ + asmlinkage_protect(1, ret, gid); + return ret; +} + +static int groups16_to_user(old_gid_t __user *grouplist, + struct group_info *group_info) +{ + int i; + old_gid_t group; + + for (i = 0; i < group_info->ngroups; i++) { + group = high2lowgid(GROUP_AT(group_info, i)); + if (put_user(group, grouplist+i)) + return -EFAULT; + } + + return 0; +} + +static int groups16_from_user(struct group_info *group_info, + old_gid_t __user *grouplist) +{ + int i; + old_gid_t group; + + for (i = 0; i < group_info->ngroups; i++) { + if (get_user(group, grouplist+i)) + return -EFAULT; + GROUP_AT(group_info, i) = low2highgid(group); + } + + return 0; +} + +SYSCALL_DEFINE2(getgroups16, int, gidsetsize, old_gid_t __user *, grouplist) +{ + const struct cred *cred = current_cred(); + int i; + + if (gidsetsize < 0) + return -EINVAL; + + i = cred->group_info->ngroups; + if (gidsetsize) { + if (i > gidsetsize) { + i = -EINVAL; + goto out; + } + if (groups16_to_user(grouplist, cred->group_info)) { + i = -EFAULT; + goto out; + } + } +out: + return i; +} + +SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) +{ + struct group_info *group_info; + int retval; + + if (!nsown_capable(CAP_SETGID)) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; + + group_info = groups_alloc(gidsetsize); + if (!group_info) + return -ENOMEM; + retval = groups16_from_user(group_info, grouplist); + if (retval) { + put_group_info(group_info); + return retval; + } + + retval = set_current_groups(group_info); + put_group_info(group_info); + + return retval; +} + +SYSCALL_DEFINE0(getuid16) +{ + return high2lowuid(current_uid()); +} + +SYSCALL_DEFINE0(geteuid16) +{ + return high2lowuid(current_euid()); +} + +SYSCALL_DEFINE0(getgid16) +{ + return high2lowgid(current_gid()); +} + +SYSCALL_DEFINE0(getegid16) +{ + return 
high2lowgid(current_egid()); +} diff --git a/kernel/up.c b/kernel/up.c new file mode 100644 index 00000000..1ff27a28 --- /dev/null +++ b/kernel/up.c @@ -0,0 +1,21 @@ +/* + * Uniprocessor-only support functions. The counterpart to kernel/smp.c + */ + +#include +#include +#include +#include + +int smp_call_function_single(int cpu, void (*func) (void *info), void *info, + int wait) +{ + unsigned long flags; + + WARN_ON(cpu != 0); /* uniprocessor: cpu 0 is the only valid target */ + local_irq_save(flags); /* the caller may already run with IRQs off */ + (func)(info); + local_irq_restore(flags); /* restore prior state, never force-enable */ + return 0; +} +EXPORT_SYMBOL(smp_call_function_single); diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c new file mode 100644 index 00000000..92cb706c --- /dev/null +++ b/kernel/user-return-notifier.c @@ -0,0 +1,44 @@ + +#include +#include +#include +#include + +static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); + +/* + * Request a notification when the current cpu returns to userspace. Must be + * called in atomic context. The notifier will also be called in atomic + * context. + */ +void user_return_notifier_register(struct user_return_notifier *urn) +{ + set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); + hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); +} +EXPORT_SYMBOL_GPL(user_return_notifier_register); + +/* + * Removes a registered user return notifier. Must be called from atomic + * context, and from the same cpu registration occurred in. 
+ */ +void user_return_notifier_unregister(struct user_return_notifier *urn) +{ + hlist_del(&urn->link); + if (hlist_empty(&__get_cpu_var(return_notifier_list))) + clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); +} +EXPORT_SYMBOL_GPL(user_return_notifier_unregister); + +/* Calls registered user return notifiers */ +void fire_user_return_notifiers(void) +{ + struct user_return_notifier *urn; + struct hlist_node *tmp1, *tmp2; + struct hlist_head *head; + + head = &get_cpu_var(return_notifier_list); + hlist_for_each_entry_safe(urn, tmp1, tmp2, head, link) + urn->on_user_return(urn); + put_cpu_var(return_notifier_list); +} diff --git a/kernel/user.c b/kernel/user.c new file mode 100644 index 00000000..9e03e9c1 --- /dev/null +++ b/kernel/user.c @@ -0,0 +1,200 @@ +/* + * The "user cache". + * + * (C) Copyright 1991-2000 Linus Torvalds + * + * We have a per-user structure to keep track of how many + * processes, files etc the user has claimed, in order to be + * able to have per-user limits for system resources. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * userns count is 1 for root user, 1 for init_uts_ns, + * and 1 for... ? + */ +struct user_namespace init_user_ns = { + .kref = { + .refcount = ATOMIC_INIT(3), + }, + .creator = &root_user, +}; +EXPORT_SYMBOL_GPL(init_user_ns); + +/* + * UID task count cache, to get fast user lookup in "alloc_uid" + * when changing user ID's (ie setuid() and friends). + */ + +#define UIDHASH_MASK (UIDHASH_SZ - 1) +#define __uidhashfn(uid) (((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK) +#define uidhashentry(ns, uid) ((ns)->uidhash_table + __uidhashfn((uid))) + +static struct kmem_cache *uid_cachep; + +/* + * The uidhash_lock is mostly taken from process context, but it is + * occasionally also taken from softirq/tasklet context, when + * task-structs get RCU-freed. Hence all locking must be softirq-safe. 
+ * But free_uid() is also called with local interrupts disabled, and running + * local_bh_enable() with local interrupts disabled is an error - we'll run + * softirq callbacks, and they can unconditionally enable interrupts, and + * the caller of free_uid() didn't expect that.. + */ +static DEFINE_SPINLOCK(uidhash_lock); + +/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */ +struct user_struct root_user = { + .__count = ATOMIC_INIT(2), + .processes = ATOMIC_INIT(1), + .files = ATOMIC_INIT(0), + .sigpending = ATOMIC_INIT(0), + .locked_shm = 0, + .user_ns = &init_user_ns, +}; + +/* + * These routines must be called with the uidhash spinlock held! + */ +static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent) +{ + hlist_add_head(&up->uidhash_node, hashent); +} + +static void uid_hash_remove(struct user_struct *up) +{ + hlist_del_init(&up->uidhash_node); + put_user_ns(up->user_ns); +} + +static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) +{ + struct user_struct *user; + struct hlist_node *h; + + hlist_for_each_entry(user, h, hashent, uidhash_node) { + if (user->uid == uid) { + atomic_inc(&user->__count); + return user; + } + } + + return NULL; +} + +/* IRQs are disabled and uidhash_lock is held upon function entry. + * IRQ state (as stored in flags) is restored and uidhash_lock released + * upon function exit. + */ +static void free_user(struct user_struct *up, unsigned long flags) + __releases(&uidhash_lock) +{ + uid_hash_remove(up); + spin_unlock_irqrestore(&uidhash_lock, flags); + key_put(up->uid_keyring); + key_put(up->session_keyring); + kmem_cache_free(uid_cachep, up); +} + +/* + * Locate the user_struct for the passed UID. If found, take a ref on it. The + * caller must undo that ref with free_uid(). + * + * If the user_struct could not be found, return NULL. 
+ */ +struct user_struct *find_user(uid_t uid) +{ + struct user_struct *ret; + unsigned long flags; + struct user_namespace *ns = current_user_ns(); + + spin_lock_irqsave(&uidhash_lock, flags); + ret = uid_hash_find(uid, uidhashentry(ns, uid)); + spin_unlock_irqrestore(&uidhash_lock, flags); + return ret; +} + +void free_uid(struct user_struct *up) +{ + unsigned long flags; + + if (!up) + return; + + local_irq_save(flags); + if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) + free_user(up, flags); + else + local_irq_restore(flags); +} + +struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid) +{ + struct hlist_head *hashent = uidhashentry(ns, uid); + struct user_struct *up, *new; + + spin_lock_irq(&uidhash_lock); + up = uid_hash_find(uid, hashent); + spin_unlock_irq(&uidhash_lock); + + if (!up) { + new = kmem_cache_zalloc(uid_cachep, GFP_KERNEL); + if (!new) + goto out_unlock; + + new->uid = uid; + atomic_set(&new->__count, 1); + + new->user_ns = get_user_ns(ns); + + /* + * Before adding this, check whether we raced + * on adding the same user already.. 
+ */ + spin_lock_irq(&uidhash_lock); + up = uid_hash_find(uid, hashent); + if (up) { + put_user_ns(ns); + key_put(new->uid_keyring); + key_put(new->session_keyring); + kmem_cache_free(uid_cachep, new); + } else { + uid_hash_insert(new, hashent); + up = new; + } + spin_unlock_irq(&uidhash_lock); + } + + return up; + +out_unlock: + return NULL; +} + +static int __init uid_cache_init(void) +{ + int n; + + uid_cachep = kmem_cache_create("uid_cache", sizeof(struct user_struct), + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + + for(n = 0; n < UIDHASH_SZ; ++n) + INIT_HLIST_HEAD(init_user_ns.uidhash_table + n); + + /* Insert the root user immediately (init already runs as root) */ + spin_lock_irq(&uidhash_lock); + uid_hash_insert(&root_user, uidhashentry(&init_user_ns, 0)); + spin_unlock_irq(&uidhash_lock); + + return 0; +} + +module_init(uid_cache_init); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c new file mode 100644 index 00000000..9da289c3 --- /dev/null +++ b/kernel/user_namespace.c @@ -0,0 +1,137 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. + */ + +#include +#include +#include +#include +#include +#include + +static struct kmem_cache *user_ns_cachep __read_mostly; + +/* + * Create a new user namespace, deriving the creator from the user in the + * passed credentials, and replacing that user with the new root user for the + * new namespace. + * + * This is called by copy_creds(), which will finish setting the target task's + * credentials. + */ +int create_user_ns(struct cred *new) +{ + struct user_namespace *ns; + struct user_struct *root_user; + int n; + + ns = kmem_cache_alloc(user_ns_cachep, GFP_KERNEL); + if (!ns) + return -ENOMEM; + + kref_init(&ns->kref); + + for (n = 0; n < UIDHASH_SZ; ++n) + INIT_HLIST_HEAD(ns->uidhash_table + n); + + /* Alloc new root user. 
*/ + root_user = alloc_uid(ns, 0); + if (!root_user) { + kmem_cache_free(user_ns_cachep, ns); + return -ENOMEM; + } + + /* set the new root user in the credentials under preparation */ + ns->creator = new->user; + new->user = root_user; + new->uid = new->euid = new->suid = new->fsuid = 0; + new->gid = new->egid = new->sgid = new->fsgid = 0; + put_group_info(new->group_info); + new->group_info = get_group_info(&init_groups); +#ifdef CONFIG_KEYS + key_put(new->request_key_auth); + new->request_key_auth = NULL; +#endif + /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ + + /* root_user holds a reference to ns, our reference can be dropped */ + put_user_ns(ns); + + return 0; +} + +/* + * Deferred destructor for a user namespace. This is required because + * free_user_ns() may be called with uidhash_lock held, but we need to call + * back to free_uid() which will want to take the lock again. + */ +static void free_user_ns_work(struct work_struct *work) +{ + struct user_namespace *ns = + container_of(work, struct user_namespace, destroyer); + free_uid(ns->creator); + kmem_cache_free(user_ns_cachep, ns); +} + +void free_user_ns(struct kref *kref) +{ + struct user_namespace *ns = + container_of(kref, struct user_namespace, kref); + + INIT_WORK(&ns->destroyer, free_user_ns_work); + schedule_work(&ns->destroyer); +} +EXPORT_SYMBOL(free_user_ns); + +uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return uid; + + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? 
+ */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (uid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowuid; +} + +gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return gid; + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (gid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowgid; +} + +static __init int user_namespaces_init(void) +{ + user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + return 0; +} +module_init(user_namespaces_init); diff --git a/kernel/utsname.c b/kernel/utsname.c new file mode 100644 index 00000000..bff131b9 --- /dev/null +++ b/kernel/utsname.c @@ -0,0 +1,120 @@ +/* + * Copyright (C) 2004 IBM Corporation + * + * Author: Serge Hallyn + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static struct uts_namespace *create_uts_ns(void) +{ + struct uts_namespace *uts_ns; + + uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL); + if (uts_ns) + kref_init(&uts_ns->kref); + return uts_ns; +} + +/* + * Clone a new ns copying an original utsname, setting refcount to 1 + * @old_ns: namespace to clone + * Return ERR_PTR(-ENOMEM) on error (failure to kmalloc), new ns otherwise + */ +static struct uts_namespace *clone_uts_ns(struct task_struct *tsk, + struct uts_namespace *old_ns) +{ + struct uts_namespace *ns; + + ns = create_uts_ns(); + if (!ns) + return ERR_PTR(-ENOMEM); + + down_read(&uts_sem); + memcpy(&ns->name, &old_ns->name, sizeof(ns->name)); + ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns); + up_read(&uts_sem); + return ns; +} + +/* + * Copy task tsk's utsname namespace, or clone it if flags + * specifies CLONE_NEWUTS. In latter case, changes to the + * utsname of this process won't be seen by parent, and vice + * versa. 
+ */ +struct uts_namespace *copy_utsname(unsigned long flags, + struct task_struct *tsk) +{ + struct uts_namespace *old_ns = tsk->nsproxy->uts_ns; + struct uts_namespace *new_ns; + + BUG_ON(!old_ns); + get_uts_ns(old_ns); + + if (!(flags & CLONE_NEWUTS)) + return old_ns; + + new_ns = clone_uts_ns(tsk, old_ns); + + put_uts_ns(old_ns); + return new_ns; +} + +void free_uts_ns(struct kref *kref) +{ + struct uts_namespace *ns; + + ns = container_of(kref, struct uts_namespace, kref); + put_user_ns(ns->user_ns); + kfree(ns); +} + +static void *utsns_get(struct task_struct *task) +{ + struct uts_namespace *ns = NULL; + struct nsproxy *nsproxy; + + rcu_read_lock(); + nsproxy = task_nsproxy(task); + if (nsproxy) { + ns = nsproxy->uts_ns; + get_uts_ns(ns); + } + rcu_read_unlock(); + + return ns; +} + +static void utsns_put(void *ns) +{ + put_uts_ns(ns); +} + +static int utsns_install(struct nsproxy *nsproxy, void *ns) +{ + get_uts_ns(ns); + put_uts_ns(nsproxy->uts_ns); + nsproxy->uts_ns = ns; + return 0; +} + +const struct proc_ns_operations utsns_operations = { + .name = "uts", + .type = CLONE_NEWUTS, + .get = utsns_get, + .put = utsns_put, + .install = utsns_install, +}; + diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c new file mode 100644 index 00000000..a2cd77e7 --- /dev/null +++ b/kernel/utsname_sysctl.c @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2007 + * + * Author: Eric Biederman + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation, version 2 of the + * License. 
+ */ + +#include +#include +#include +#include + +static void *get_uts(ctl_table *table, int write) +{ + char *which = table->data; + struct uts_namespace *uts_ns; + + uts_ns = current->nsproxy->uts_ns; + which = (which - (char *)&init_uts_ns) + (char *)uts_ns; + + if (!write) + down_read(&uts_sem); + else + down_write(&uts_sem); + return which; +} + +static void put_uts(ctl_table *table, int write, void *which) +{ + if (!write) + up_read(&uts_sem); + else + up_write(&uts_sem); +} + +#ifdef CONFIG_PROC_SYSCTL +/* + * Special case of dostring for the UTS structure. This has locks + * to observe. Should this be in kernel/sys.c ???? + */ +static int proc_do_uts_string(ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table uts_table; + int r; + memcpy(&uts_table, table, sizeof(uts_table)); + uts_table.data = get_uts(table, write); + r = proc_dostring(&uts_table,write,buffer,lenp, ppos); + put_uts(table, write, uts_table.data); + return r; +} +#else +#define proc_do_uts_string NULL +#endif + +static struct ctl_table uts_kern_table[] = { + { + .procname = "ostype", + .data = init_uts_ns.name.sysname, + .maxlen = sizeof(init_uts_ns.name.sysname), + .mode = 0444, + .proc_handler = proc_do_uts_string, + }, + { + .procname = "osrelease", + .data = init_uts_ns.name.release, + .maxlen = sizeof(init_uts_ns.name.release), + .mode = 0444, + .proc_handler = proc_do_uts_string, + }, + { + .procname = "version", + .data = init_uts_ns.name.version, + .maxlen = sizeof(init_uts_ns.name.version), + .mode = 0444, + .proc_handler = proc_do_uts_string, + }, + { + .procname = "hostname", + .data = init_uts_ns.name.nodename, + .maxlen = sizeof(init_uts_ns.name.nodename), + .mode = 0644, + .proc_handler = proc_do_uts_string, + }, + { + .procname = "domainname", + .data = init_uts_ns.name.domainname, + .maxlen = sizeof(init_uts_ns.name.domainname), + .mode = 0644, + .proc_handler = proc_do_uts_string, + }, + {} +}; + +static struct ctl_table 
uts_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = uts_kern_table, + }, + {} +}; + +static int __init utsname_sysctl_init(void) +{ + register_sysctl_table(uts_root_table); + return 0; +} + +__initcall(utsname_sysctl_init); diff --git a/kernel/wait.c b/kernel/wait.c new file mode 100644 index 00000000..f45ea8d2 --- /dev/null +++ b/kernel/wait.c @@ -0,0 +1,289 @@ +/* + * Generic waiting primitives. + * + * (C) 2004 William Irwin, Oracle + */ +#include +#include +#include +#include +#include +#include + +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) +{ + spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, key); + INIT_LIST_HEAD(&q->task_list); +} + +EXPORT_SYMBOL(__init_waitqueue_head); + +void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue); + +void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(add_wait_queue_exclusive); + +void remove_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + spin_lock_irqsave(&q->lock, flags); + __remove_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(remove_wait_queue); + + +/* + * Note: we use "set_current_state()" _after_ the wait-queue add, + * because we need a memory barrier there on SMP, so that any + * wake-function that tests for the wait-queue being active + * will be guaranteed to see waitqueue addition _or_ subsequent + * tests in this thread will see the wakeup having taken place. 
+ * + * The spin_unlock() itself is semi-permeable and only protects + * one way (it only protects stuff inside the critical region and + * stops them from bleeding out - it would still allow subsequent + * loads to move into the critical region). + */ +void +prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait); + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + set_current_state(state); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_wait_exclusive); + +/** + * finish_wait - clean up after waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + */ +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + /* + * We can check for list emptiness outside the lock + * IFF: + * - we use the "careful" check that verifies both + * the next and prev pointers, so that there cannot + * be any half-pending updates in progress on other + * CPU's that we haven't seen yet (and that might + * still change the stack area). + * and + * - all other users take the lock (ie we can only + * have _one_ other CPU that looks at or modifies + * the list). 
+ */ + if (!list_empty_careful(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_wait); + +/** + * abort_exclusive_wait - abort exclusive waiting in a queue + * @q: waitqueue waited on + * @wait: wait descriptor + * @mode: runstate of the waiter to be woken + * @key: key to identify a wait bit queue or %NULL + * + * Sets current thread back to running state and removes + * the wait descriptor from the given waitqueue if still + * queued. + * + * Wakes up the next waiter if the caller is concurrently + * woken up through the queue. + * + * This prevents waiter starvation where an exclusive waiter + * aborts and is woken up concurrently and no one wakes up + * the next waiter. + */ +void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, + unsigned int mode, void *key) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + spin_lock_irqsave(&q->lock, flags); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); + else if (waitqueue_active(q)) + __wake_up_locked_key(q, mode, key); + spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(abort_exclusive_wait); + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} +EXPORT_SYMBOL(autoremove_wake_function); + +int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) +{ + struct wait_bit_key *key = arg; + struct wait_bit_queue *wait_bit + = container_of(wait, struct wait_bit_queue, wait); + + if (wait_bit->key.flags != key->flags || + wait_bit->key.bit_nr != key->bit_nr || + test_bit(key->bit_nr, key->flags)) + return 0; + else + return autoremove_wake_function(wait, mode, sync, key); +} +EXPORT_SYMBOL(wake_bit_function); + +/* + * To allow interruptible waiting and asynchronous (i.e. 
nonblocking) + * waiting, the actions of __wait_on_bit() and __wait_on_bit_lock() are + * permitted return codes. Nonzero return codes halt waiting and return. + */ +int __sched +__wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + int ret = 0; + + do { + prepare_to_wait(wq, &q->wait, mode); + if (test_bit(q->key.bit_nr, q->key.flags)) + ret = (*action)(q->key.flags); + } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); + finish_wait(wq, &q->wait); + return ret; +} +EXPORT_SYMBOL(__wait_on_bit); + +int __sched out_of_line_wait_on_bit(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit); + +int __sched +__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, + int (*action)(void *), unsigned mode) +{ + do { + int ret; + + prepare_to_wait_exclusive(wq, &q->wait, mode); + if (!test_bit(q->key.bit_nr, q->key.flags)) + continue; + ret = action(q->key.flags); + if (!ret) + continue; + abort_exclusive_wait(wq, &q->wait, mode, &q->key); + return ret; + } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); + finish_wait(wq, &q->wait); + return 0; +} +EXPORT_SYMBOL(__wait_on_bit_lock); + +int __sched out_of_line_wait_on_bit_lock(void *word, int bit, + int (*action)(void *), unsigned mode) +{ + wait_queue_head_t *wq = bit_waitqueue(word, bit); + DEFINE_WAIT_BIT(wait, word, bit); + + return __wait_on_bit_lock(wq, &wait, action, mode); +} +EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); + +void __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) +{ + struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); + if (waitqueue_active(wq)) + __wake_up(wq, TASK_NORMAL, 1, &key); +} +EXPORT_SYMBOL(__wake_up_bit); + +/** + * wake_up_bit - wake up a waiter on a bit + * @word: the word being waited on, a kernel 
virtual address + * @bit: the bit of the word being waited on + * + * There is a standard hashed waitqueue table for generic use. This + * is the part of the hashtable's accessor API that wakes up waiters + * on a bit. For instance, if one were to have waiters on a bitflag, + * one would call wake_up_bit() after clearing the bit. + * + * In order for this to function properly, as it uses waitqueue_active() + * internally, some kind of memory barrier must be done prior to calling + * this. Typically, this will be smp_mb__after_clear_bit(), but in some + * cases where bitflags are manipulated non-atomically under a lock, one + * may need to use a less regular barrier, such as fs/inode.c's smp_mb(), + * because spin_unlock() does not guarantee a memory barrier. + */ +void wake_up_bit(void *word, int bit) +{ + __wake_up_bit(bit_waitqueue(word, bit), word, bit); +} +EXPORT_SYMBOL(wake_up_bit); + +wait_queue_head_t *bit_waitqueue(void *word, int bit) +{ + const int shift = BITS_PER_LONG == 32 ? 5 : 6; + const struct zone *zone = page_zone(virt_to_page(word)); + unsigned long val = (unsigned long)word << shift | bit; + + return &zone->wait_table[hash_long(val, zone->wait_table_bits)]; +} +EXPORT_SYMBOL(bit_waitqueue); diff --git a/kernel/watchdog.c b/kernel/watchdog.c new file mode 100644 index 00000000..3d0c56ad --- /dev/null +++ b/kernel/watchdog.c @@ -0,0 +1,590 @@ +/* + * Detect hard and soft lockups on a system + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * this code detects hard lockups: incidents where, on a CPU, + * the kernel does not respond to anything except NMI. + * + * Note: Most of this code is borrowed heavily from softlockup.c, + * so thanks to Ingo for the initial implementation. + * Some chunks also taken from arch/x86/kernel/apic/nmi.c, thanks + * to those contributors as well. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +int watchdog_enabled = 1; +int __read_mostly watchdog_thresh = 10; + +static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); +static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); +static DEFINE_PER_CPU(bool, softlockup_touch_sync); +static DEFINE_PER_CPU(bool, soft_watchdog_warn); +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static DEFINE_PER_CPU(bool, hard_watchdog_warn); +static DEFINE_PER_CPU(bool, watchdog_nmi_touch); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); +static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); +static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); +#endif + +/* boot commands */ +/* + * Should we panic when a soft-lockup or hard-lockup occurs: + */ +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int hardlockup_panic = + CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; + +static int __init hardlockup_panic_setup(char *str) +{ + if (!strncmp(str, "panic", 5)) + hardlockup_panic = 1; + else if (!strncmp(str, "nopanic", 7)) + hardlockup_panic = 0; + else if (!strncmp(str, "0", 1)) + watchdog_enabled = 0; + return 1; +} +__setup("nmi_watchdog=", hardlockup_panic_setup); +#endif + +unsigned int __read_mostly softlockup_panic = + CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; + +static int __init softlockup_panic_setup(char *str) +{ + softlockup_panic = simple_strtoul(str, NULL, 0); + + return 1; +} +__setup("softlockup_panic=", softlockup_panic_setup); + +static int __init nowatchdog_setup(char *str) +{ + watchdog_enabled = 0; + return 1; +} +__setup("nowatchdog", nowatchdog_setup); + +/* deprecated */ +static int __init nosoftlockup_setup(char *str) +{ + watchdog_enabled = 0; + return 1; +} +__setup("nosoftlockup", nosoftlockup_setup); +/* */ + +/* + * Hard-lockup warnings should be triggered after just a few seconds. 
Soft- + * lockups can have false positives under extreme conditions. So we generally + * want a higher threshold for soft lockups than for hard lockups. So we couple + * the thresholds with a factor: we make the soft threshold twice the amount of + * time the hard threshold is. + */ +static int get_softlockup_thresh(void) +{ + return watchdog_thresh * 2; +} + +/* + * Returns seconds, approximately. We don't need nanosecond + * resolution, and we don't need to waste time with a big divide when + * 2^30ns == 1.074s. + */ +static unsigned long get_timestamp(int this_cpu) +{ + return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ +} + +static unsigned long get_sample_period(void) +{ + /* + * convert watchdog_thresh from seconds to ns + * the divide by 5 is to give hrtimer 5 chances to + * increment before the hardlockup detector generates + * a warning + */ + return get_softlockup_thresh() * (NSEC_PER_SEC / 5); +} + +/* Commands for resetting the watchdog */ +static void __touch_watchdog(void) +{ + int this_cpu = smp_processor_id(); + + __this_cpu_write(watchdog_touch_ts, get_timestamp(this_cpu)); +} + +void touch_softlockup_watchdog(void) +{ + __this_cpu_write(watchdog_touch_ts, 0); +} +EXPORT_SYMBOL(touch_softlockup_watchdog); + +void touch_all_softlockup_watchdogs(void) +{ + int cpu; + + /* + * this is done lockless + * do we care if a 0 races with a timestamp? 
+ * all it means is the softlock check starts one cycle later + */ + for_each_online_cpu(cpu) + per_cpu(watchdog_touch_ts, cpu) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +void touch_nmi_watchdog(void) +{ + if (watchdog_enabled) { + unsigned cpu; + + for_each_present_cpu(cpu) { + if (per_cpu(watchdog_nmi_touch, cpu) != true) + per_cpu(watchdog_nmi_touch, cpu) = true; + } + } + touch_softlockup_watchdog(); +} +EXPORT_SYMBOL(touch_nmi_watchdog); + +#endif + +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlockup_touch_sync) = true; + __raw_get_cpu_var(watchdog_touch_ts) = 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +/* watchdog detector functions */ +static int is_hardlockup(void) +{ + unsigned long hrint = __this_cpu_read(hrtimer_interrupts); + + if (__this_cpu_read(hrtimer_interrupts_saved) == hrint) + return 1; + + __this_cpu_write(hrtimer_interrupts_saved, hrint); + return 0; +} +#endif + +static int is_softlockup(unsigned long touch_ts) +{ + unsigned long now = get_timestamp(smp_processor_id()); + + /* Warn about unreasonable delays: */ + if (time_after(now, touch_ts + get_softlockup_thresh())) + return now - touch_ts; + + return 0; +} + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static struct perf_event_attr wd_hw_attr = { + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .size = sizeof(struct perf_event_attr), + .pinned = 1, + .disabled = 1, +}; + +/* Callback function for perf event subsystem */ +static void watchdog_overflow_callback(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* Ensure the watchdog never gets throttled */ + event->hw.interrupts = 0; + + if (__this_cpu_read(watchdog_nmi_touch) == true) { + __this_cpu_write(watchdog_nmi_touch, false); + return; + } + + /* check for a hardlockup + * This is done by making sure our timer interrupt + * is incrementing. The timer interrupt should have + * fired multiple times before we overflow'd. 
If it hasn't + * then this is a good indication the cpu is stuck + */ + if (is_hardlockup()) { + int this_cpu = smp_processor_id(); + + /* only print hardlockups once */ + if (__this_cpu_read(hard_watchdog_warn) == true) + return; + + if (hardlockup_panic) + panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); + else + WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); + + __this_cpu_write(hard_watchdog_warn, true); + return; + } + + __this_cpu_write(hard_watchdog_warn, false); + return; +} +static void watchdog_interrupt_count(void) +{ + __this_cpu_inc(hrtimer_interrupts); +} +#else +static inline void watchdog_interrupt_count(void) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* watchdog kicker functions */ +static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) +{ + unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); + struct pt_regs *regs = get_irq_regs(); + int duration; + + /* kick the hardlockup detector */ + watchdog_interrupt_count(); + + /* kick the softlockup detector */ + wake_up_process(__this_cpu_read(softlockup_watchdog)); + + /* .. and repeat */ + hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period())); + + if (touch_ts == 0) { + if (unlikely(__this_cpu_read(softlockup_touch_sync))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + __this_cpu_write(softlockup_touch_sync, false); + sched_clock_tick(); + } + __touch_watchdog(); + return HRTIMER_RESTART; + } + + /* check for a softlockup + * This is done by making sure a high priority task is + * being scheduled. The task touches the watchdog to + * indicate it is getting cpu time. If it hasn't then + * this is a good indication some task is hogging the cpu + */ + duration = is_softlockup(touch_ts); + if (unlikely(duration)) { + /* only warn once */ + if (__this_cpu_read(soft_watchdog_warn) == true) + return HRTIMER_RESTART; + + printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %us! 
[%s:%d]\n", + smp_processor_id(), duration, + current->comm, task_pid_nr(current)); + print_modules(); + print_irqtrace_events(current); + if (regs) + show_regs(regs); + else + dump_stack(); + + if (softlockup_panic) + panic("softlockup: hung tasks"); + __this_cpu_write(soft_watchdog_warn, true); + } else + __this_cpu_write(soft_watchdog_warn, false); + + return HRTIMER_RESTART; +} + + +/* + * The watchdog thread - touches the timestamp. + */ +static int watchdog(void *unused) +{ + static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + + sched_setscheduler(current, SCHED_FIFO, &param); + + /* initialize timestamp */ + __touch_watchdog(); + + /* kick off the timer for the hardlockup detector */ + /* done here because hrtimer_start can only pin to smp_processor_id() */ + hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()), + HRTIMER_MODE_REL_PINNED); + + set_current_state(TASK_INTERRUPTIBLE); + /* + * Run briefly once per sample period to reset the softlockup timestamp. + * If this gets delayed for more than 2*watchdog_thresh seconds then the + * debug-printout triggers in watchdog_timer_fn(). + */ + while (!kthread_should_stop()) { + __touch_watchdog(); + schedule(); + + if (kthread_should_stop()) + break; + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + + return 0; +} + + +#ifdef CONFIG_HARDLOCKUP_DETECTOR +static int watchdog_nmi_enable(int cpu) +{ + struct perf_event_attr *wd_attr; + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + /* is it already setup and enabled? 
*/ + if (event && event->state > PERF_EVENT_STATE_OFF) + goto out; + + /* it is setup but not enabled */ + if (event != NULL) + goto out_enable; + + /* Try to register using hardware perf events */ + wd_attr = &wd_hw_attr; + wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh); + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); + if (!IS_ERR(event)) { + printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); + goto out_save; + } + + + /* vary the KERN level based on the returned errno */ + if (PTR_ERR(event) == -EOPNOTSUPP) + printk(KERN_INFO "NMI watchdog disabled (cpu%i): not supported (no LAPIC?)\n", cpu); + else if (PTR_ERR(event) == -ENOENT) + printk(KERN_WARNING "NMI watchdog disabled (cpu%i): hardware events not enabled\n", cpu); + else + printk(KERN_ERR "NMI watchdog disabled (cpu%i): unable to create perf event: %ld\n", cpu, PTR_ERR(event)); + return PTR_ERR(event); + + /* success path */ +out_save: + per_cpu(watchdog_ev, cpu) = event; +out_enable: + perf_event_enable(per_cpu(watchdog_ev, cpu)); +out: + return 0; +} + +static void watchdog_nmi_disable(int cpu) +{ + struct perf_event *event = per_cpu(watchdog_ev, cpu); + + if (event) { + perf_event_disable(event); + per_cpu(watchdog_ev, cpu) = NULL; + + /* should be in cleanup, but blocks oprofile */ + perf_event_release_kernel(event); + } + return; +} +#else +static int watchdog_nmi_enable(int cpu) { return 0; } +static void watchdog_nmi_disable(int cpu) { return; } +#endif /* CONFIG_HARDLOCKUP_DETECTOR */ + +/* prepare/enable/disable routines */ +static void watchdog_prepare_cpu(int cpu) +{ + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + WARN_ON(per_cpu(softlockup_watchdog, cpu)); + hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer->function = watchdog_timer_fn; +} + +static int watchdog_enable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + int err = 0; + + /* enable the 
perf event */ + err = watchdog_nmi_enable(cpu); + + /* Regardless of err above, fall through and start softlockup */ + + /* create the watchdog thread */ + if (!p) { + p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); + if (IS_ERR(p)) { + printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); + if (!err) { + /* if hardlockup hasn't already set this */ + err = PTR_ERR(p); + /* and disable the perf event */ + watchdog_nmi_disable(cpu); + } + goto out; + } + kthread_bind(p, cpu); + per_cpu(watchdog_touch_ts, cpu) = 0; + per_cpu(softlockup_watchdog, cpu) = p; + wake_up_process(p); + } + +out: + return err; +} + +static void watchdog_disable(int cpu) +{ + struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu); + + /* + * cancel the timer first to stop incrementing the stats + * and waking up the kthread + */ + hrtimer_cancel(hrtimer); + + /* disable the perf event */ + watchdog_nmi_disable(cpu); + + /* stop the watchdog thread */ + if (p) { + per_cpu(softlockup_watchdog, cpu) = NULL; + kthread_stop(p); + } +} + +static void watchdog_enable_all_cpus(void) +{ + int cpu; + + watchdog_enabled = 0; + + for_each_online_cpu(cpu) + if (!watchdog_enable(cpu)) + /* if any cpu succeeds, watchdog is considered + enabled for the system */ + watchdog_enabled = 1; + + if (!watchdog_enabled) + printk(KERN_ERR "watchdog: failed to be enabled on some cpus\n"); + +} + +static void watchdog_disable_all_cpus(void) +{ + int cpu; + + for_each_online_cpu(cpu) + watchdog_disable(cpu); + + /* if all watchdogs are disabled, then they are disabled for the system */ + watchdog_enabled = 0; +} + + +/* sysctl functions */ +#ifdef CONFIG_SYSCTL +/* + * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh + */ + +int proc_dowatchdog(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 
+ if (ret || !write) + goto out; + + if (watchdog_enabled && watchdog_thresh) + watchdog_enable_all_cpus(); + else + watchdog_disable_all_cpus(); + +out: + return ret; +} +#endif /* CONFIG_SYSCTL */ + + +/* + * Create/destroy watchdog threads as CPUs come and go: + */ +static int __cpuinit +cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int hotcpu = (unsigned long)hcpu; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + watchdog_prepare_cpu(hotcpu); + break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (watchdog_enabled) + watchdog_enable(hotcpu); + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + watchdog_disable(hotcpu); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: + watchdog_disable(hotcpu); + break; +#endif /* CONFIG_HOTPLUG_CPU */ + } + + /* + * hardlockup and softlockup are not important enough + * to block cpu bring up. Just always succeed and + * rely on printk output to flag problems. + */ + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata cpu_nfb = { + .notifier_call = cpu_callback +}; + +void __init lockup_detector_init(void) +{ + void *cpu = (void *)(long)smp_processor_id(); + int err; + + err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); + WARN_ON(notifier_to_errno(err)); + + cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); + register_cpu_notifier(&cpu_nfb); + + return; +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c new file mode 100644 index 00000000..ee1845b8 --- /dev/null +++ b/kernel/workqueue.c @@ -0,0 +1,3813 @@ +/* + * kernel/workqueue.c - generic async execution with shared worker pool + * + * Copyright (C) 2002 Ingo Molnar + * + * Derived from the taskqueue/keventd code by: + * David Woodhouse + * Andrew Morton + * Kai Petzke + * Theodore Ts'o + * + * Made to use alloc_percpu by Christoph Lameter. 
+ * + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo + * + * This is the generic async execution mechanism. Work items are + * executed in process context. The worker pool is shared and + * automatically managed. There is one worker pool for each CPU and + * one extra for works which are better served by workers which are + * not bound to any specific CPU. + * + * Please read Documentation/workqueue.txt for details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "workqueue_sched.h" + +enum { + /* global_cwq flags */ + GCWQ_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ + GCWQ_MANAGING_WORKERS = 1 << 1, /* managing workers */ + GCWQ_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + GCWQ_FREEZING = 1 << 3, /* freeze in progress */ + GCWQ_HIGHPRI_PENDING = 1 << 4, /* highpri works on queue */ + + /* worker flags */ + WORKER_STARTED = 1 << 0, /* started */ + WORKER_DIE = 1 << 1, /* die die die */ + WORKER_IDLE = 1 << 2, /* is idle */ + WORKER_PREP = 1 << 3, /* preparing to run works */ + WORKER_ROGUE = 1 << 4, /* not bound to any cpu */ + WORKER_REBIND = 1 << 5, /* mom is home, come back */ + WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ + WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_ROGUE | WORKER_REBIND | + WORKER_CPU_INTENSIVE | WORKER_UNBOUND, + + /* gcwq->trustee_state */ + TRUSTEE_START = 0, /* start */ + TRUSTEE_IN_CHARGE = 1, /* trustee in charge of gcwq */ + TRUSTEE_BUTCHER = 2, /* butcher workers */ + TRUSTEE_RELEASE = 3, /* release workers */ + TRUSTEE_DONE = 4, /* trustee is done */ + + BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ + BUSY_WORKER_HASH_SIZE = 1 << BUSY_WORKER_HASH_ORDER, + BUSY_WORKER_HASH_MASK = BUSY_WORKER_HASH_SIZE - 1, + + MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ + 
IDLE_WORKER_TIMEOUT = 300 * HZ, /* keep idle ones for 5 mins */ + + MAYDAY_INITIAL_TIMEOUT = HZ / 100 >= 2 ? HZ / 100 : 2, + /* call for help after 10ms + (min two ticks) */ + MAYDAY_INTERVAL = HZ / 10, /* and then every 100ms */ + CREATE_COOLDOWN = HZ, /* time to breathe after fail */ + TRUSTEE_COOLDOWN = HZ / 10, /* for trustee draining */ + + /* + * Rescue workers are used only in emergencies and shared by + * all cpus. Give -20. + */ + RESCUER_NICE_LEVEL = -20, +}; + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only for + * everyone else. + * + * P: Preemption protected. Disabling preemption is enough and should + * only be modified and accessed from the local cpu. + * + * L: gcwq->lock protected. Access with gcwq->lock held. + * + * X: During normal operation, modification requires gcwq->lock and + * should be done only from local cpu. Either disabling preemption + * on local cpu or grabbing gcwq->lock is enough for read access. + * If GCWQ_DISASSOCIATED is set, it's identical to L. + * + * F: wq->flush_mutex protected. + * + * W: workqueue_lock protected. + */ + +struct global_cwq; + +/* + * The poor guys doing the actual heavy lifting. All on-duty workers + * are either serving the manager role, on idle list or on busy hash. 
+ */ +struct worker { + /* on idle list while idle, on busy hash table while busy */ + union { + struct list_head entry; /* L: while idle */ + struct hlist_node hentry; /* L: while busy */ + }; + + struct work_struct *current_work; /* L: work being processed */ + struct cpu_workqueue_struct *current_cwq; /* L: current_work's cwq */ + struct list_head scheduled; /* L: scheduled works */ + struct task_struct *task; /* I: worker task */ + struct global_cwq *gcwq; /* I: the associated gcwq */ + /* 64 bytes boundary on 64bit, 32 on 32bit */ + unsigned long last_active; /* L: last active timestamp */ + unsigned int flags; /* X: flags */ + int id; /* I: worker id */ + struct work_struct rebind_work; /* L: rebind worker to cpu */ +}; + +/* + * Global per-cpu workqueue. There's one and only one for each cpu + * and all works are queued and processed here regardless of their + * target workqueues. + */ +struct global_cwq { + spinlock_t lock; /* the gcwq lock */ + struct list_head worklist; /* L: list of pending works */ + unsigned int cpu; /* I: the associated cpu */ + unsigned int flags; /* L: GCWQ_* flags */ + + int nr_workers; /* L: total number of workers */ + int nr_idle; /* L: currently idle ones */ + + /* workers are chained either in the idle_list or busy_hash */ + struct list_head idle_list; /* X: list of idle workers */ + struct hlist_head busy_hash[BUSY_WORKER_HASH_SIZE]; + /* L: hash of busy workers */ + + struct timer_list idle_timer; /* L: worker idle timeout */ + struct timer_list mayday_timer; /* L: SOS timer for dworkers */ + + struct ida worker_ida; /* L: for worker IDs */ + + struct task_struct *trustee; /* L: for gcwq shutdown */ + unsigned int trustee_state; /* L: trustee state */ + wait_queue_head_t trustee_wait; /* trustee wait */ + struct worker *first_idle; /* L: first idle worker */ +} ____cacheline_aligned_in_smp; + +/* + * The per-CPU workqueue. 
The lower WORK_STRUCT_FLAG_BITS of + * work_struct->data are used for flags and thus cwqs need to be + * aligned at two's power of the number of flag bits. + */ +struct cpu_workqueue_struct { + struct global_cwq *gcwq; /* I: the associated gcwq */ + struct workqueue_struct *wq; /* I: the owning workqueue */ + int work_color; /* L: current color */ + int flush_color; /* L: flushing color */ + int nr_in_flight[WORK_NR_COLORS]; + /* L: nr of in_flight works */ + int nr_active; /* L: nr of active works */ + int max_active; /* L: max active works */ + struct list_head delayed_works; /* L: delayed works */ +}; + +/* + * Structure used to wait for workqueue flush. + */ +struct wq_flusher { + struct list_head list; /* F: list of flushers */ + int flush_color; /* F: flush color waiting for */ + struct completion done; /* flush completion */ +}; + +/* + * All cpumasks are assumed to be always set on UP and thus can't be + * used to determine whether there's something to be done. + */ +#ifdef CONFIG_SMP +typedef cpumask_var_t mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) \ + cpumask_test_and_set_cpu((cpu), (mask)) +#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) +#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) +#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) +#define free_mayday_mask(mask) free_cpumask_var((mask)) +#else +typedef unsigned long mayday_mask_t; +#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) +#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) +#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) +#define alloc_mayday_mask(maskp, gfp) true +#define free_mayday_mask(mask) do { } while (0) +#endif + +/* + * The externally visible workqueue abstraction is an array of + * per-CPU workqueues: + */ +struct workqueue_struct { + unsigned int flags; /* I: WQ_* flags */ + union { + struct cpu_workqueue_struct __percpu *pcpu; + struct cpu_workqueue_struct 
*single; + unsigned long v; + } cpu_wq; /* I: cwq's */ + struct list_head list; /* W: list of all workqueues */ + + struct mutex flush_mutex; /* protects wq flushing */ + int work_color; /* F: current work color */ + int flush_color; /* F: current flush color */ + atomic_t nr_cwqs_to_flush; /* flush in progress */ + struct wq_flusher *first_flusher; /* F: first flusher */ + struct list_head flusher_queue; /* F: flush waiters */ + struct list_head flusher_overflow; /* F: flush overflow list */ + + mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct worker *rescuer; /* I: rescue worker */ + + int saved_max_active; /* W: saved cwq max_active */ + const char *name; /* I: workqueue name */ +#ifdef CONFIG_LOCKDEP + struct lockdep_map lockdep_map; +#endif +}; + +struct workqueue_struct *system_wq __read_mostly; +struct workqueue_struct *system_long_wq __read_mostly; +struct workqueue_struct *system_nrt_wq __read_mostly; +struct workqueue_struct *system_unbound_wq __read_mostly; +struct workqueue_struct *system_freezable_wq __read_mostly; +struct workqueue_struct *system_nrt_freezable_wq __read_mostly; +EXPORT_SYMBOL_GPL(system_wq); +EXPORT_SYMBOL_GPL(system_long_wq); +EXPORT_SYMBOL_GPL(system_nrt_wq); +EXPORT_SYMBOL_GPL(system_unbound_wq); +EXPORT_SYMBOL_GPL(system_freezable_wq); +EXPORT_SYMBOL_GPL(system_nrt_freezable_wq); + +#define CREATE_TRACE_POINTS +#include + +#define for_each_busy_worker(worker, i, pos, gcwq) \ + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \ + hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry) + +static inline int __next_gcwq_cpu(int cpu, const struct cpumask *mask, + unsigned int sw) +{ + if (cpu < nr_cpu_ids) { + if (sw & 1) { + cpu = cpumask_next(cpu, mask); + if (cpu < nr_cpu_ids) + return cpu; + } + if (sw & 2) + return WORK_CPU_UNBOUND; + } + return WORK_CPU_NONE; +} + +static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, + struct workqueue_struct *wq) +{ + return __next_gcwq_cpu(cpu, mask, 
!(wq->flags & WQ_UNBOUND) ? 1 : 2); +} + +/* + * CPU iterators + * + * An extra gcwq is defined for an invalid cpu number + * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any + * specific CPU. The following iterators are similar to + * for_each_*_cpu() iterators but also considers the unbound gcwq. + * + * for_each_gcwq_cpu() : possible CPUs + WORK_CPU_UNBOUND + * for_each_online_gcwq_cpu() : online CPUs + WORK_CPU_UNBOUND + * for_each_cwq_cpu() : possible CPUs for bound workqueues, + * WORK_CPU_UNBOUND for unbound workqueues + */ +#define for_each_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_possible_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_possible_mask, 3)) + +#define for_each_online_gcwq_cpu(cpu) \ + for ((cpu) = __next_gcwq_cpu(-1, cpu_online_mask, 3); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_gcwq_cpu((cpu), cpu_online_mask, 3)) + +#define for_each_cwq_cpu(cpu, wq) \ + for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, (wq)); \ + (cpu) < WORK_CPU_NONE; \ + (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq))) + +#ifdef CONFIG_DEBUG_OBJECTS_WORK + +static struct debug_obj_descr work_debug_descr; + +static void *work_debug_hint(void *addr) +{ + return ((struct work_struct *) addr)->func; +} + +/* + * fixup_init is called when: + * - an active object is initialized + */ +static int work_fixup_init(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_init(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +/* + * fixup_activate is called when: + * - an active object is activated + * - an unknown object is activated (might be a statically initialized object) + */ +static int work_fixup_activate(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + + case ODEBUG_STATE_NOTAVAILABLE: + /* + * This is not really a fixup. 
The work struct was + * statically initialized. We just make sure that it + * is tracked in the object tracker. + */ + if (test_bit(WORK_STRUCT_STATIC_BIT, work_data_bits(work))) { + debug_object_init(work, &work_debug_descr); + debug_object_activate(work, &work_debug_descr); + return 0; + } + WARN_ON_ONCE(1); + return 0; + + case ODEBUG_STATE_ACTIVE: + WARN_ON(1); + + default: + return 0; + } +} + +/* + * fixup_free is called when: + * - an active object is freed + */ +static int work_fixup_free(void *addr, enum debug_obj_state state) +{ + struct work_struct *work = addr; + + switch (state) { + case ODEBUG_STATE_ACTIVE: + cancel_work_sync(work); + debug_object_free(work, &work_debug_descr); + return 1; + default: + return 0; + } +} + +static struct debug_obj_descr work_debug_descr = { + .name = "work_struct", + .debug_hint = work_debug_hint, + .fixup_init = work_fixup_init, + .fixup_activate = work_fixup_activate, + .fixup_free = work_fixup_free, +}; + +static inline void debug_work_activate(struct work_struct *work) +{ + debug_object_activate(work, &work_debug_descr); +} + +static inline void debug_work_deactivate(struct work_struct *work) +{ + debug_object_deactivate(work, &work_debug_descr); +} + +void __init_work(struct work_struct *work, int onstack) +{ + if (onstack) + debug_object_init_on_stack(work, &work_debug_descr); + else + debug_object_init(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(__init_work); + +void destroy_work_on_stack(struct work_struct *work) +{ + debug_object_free(work, &work_debug_descr); +} +EXPORT_SYMBOL_GPL(destroy_work_on_stack); + +#else +static inline void debug_work_activate(struct work_struct *work) { } +static inline void debug_work_deactivate(struct work_struct *work) { } +#endif + +/* Serializes the accesses to the list of workqueues. */ +static DEFINE_SPINLOCK(workqueue_lock); +static LIST_HEAD(workqueues); +static bool workqueue_freezing; /* W: have wqs started freezing? */ + +/* + * The almighty global cpu workqueues. 
nr_running is the only field + * which is expected to be used frequently by other cpus via + * try_to_wake_up(). Put it in a separate cacheline. + */ +static DEFINE_PER_CPU(struct global_cwq, global_cwq); +static DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, gcwq_nr_running); + +/* + * Global cpu workqueue and nr_running counter for unbound gcwq. The + * gcwq is always online, has GCWQ_DISASSOCIATED set, and all its + * workers have WORKER_UNBOUND set. + */ +static struct global_cwq unbound_global_cwq; +static atomic_t unbound_gcwq_nr_running = ATOMIC_INIT(0); /* always 0 */ + +static int worker_thread(void *__worker); + +static struct global_cwq *get_gcwq(unsigned int cpu) +{ + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(global_cwq, cpu); + else + return &unbound_global_cwq; +} + +static atomic_t *get_gcwq_nr_running(unsigned int cpu) +{ + if (cpu != WORK_CPU_UNBOUND) + return &per_cpu(gcwq_nr_running, cpu); + else + return &unbound_gcwq_nr_running; +} + +static struct cpu_workqueue_struct *get_cwq(unsigned int cpu, + struct workqueue_struct *wq) +{ + if (!(wq->flags & WQ_UNBOUND)) { + if (likely(cpu < nr_cpu_ids)) { +#ifdef CONFIG_SMP + return per_cpu_ptr(wq->cpu_wq.pcpu, cpu); +#else + return wq->cpu_wq.single; +#endif + } + } else if (likely(cpu == WORK_CPU_UNBOUND)) + return wq->cpu_wq.single; + return NULL; +} + +static unsigned int work_color_to_flags(int color) +{ + return color << WORK_STRUCT_COLOR_SHIFT; +} + +static int get_work_color(struct work_struct *work) +{ + return (*work_data_bits(work) >> WORK_STRUCT_COLOR_SHIFT) & + ((1 << WORK_STRUCT_COLOR_BITS) - 1); +} + +static int work_next_color(int color) +{ + return (color + 1) % WORK_NR_COLORS; +} + +/* + * A work's data points to the cwq with WORK_STRUCT_CWQ set while the + * work is on queue. Once execution starts, WORK_STRUCT_CWQ is + * cleared and the work data contains the cpu number it was last on. 
+ * + * set_work_{cwq|cpu}() and clear_work_data() can be used to set the + * cwq, cpu or clear work->data. These functions should only be + * called while the work is owned - ie. while the PENDING bit is set. + * + * get_work_[g]cwq() can be used to obtain the gcwq or cwq + * corresponding to a work. gcwq is available once the work has been + * queued anywhere after initialization. cwq is available only from + * queueing until execution starts. + */ +static inline void set_work_data(struct work_struct *work, unsigned long data, + unsigned long flags) +{ + BUG_ON(!work_pending(work)); + atomic_long_set(&work->data, data | flags | work_static(work)); +} + +static void set_work_cwq(struct work_struct *work, + struct cpu_workqueue_struct *cwq, + unsigned long extra_flags) +{ + set_work_data(work, (unsigned long)cwq, + WORK_STRUCT_PENDING | WORK_STRUCT_CWQ | extra_flags); +} + +static void set_work_cpu(struct work_struct *work, unsigned int cpu) +{ + set_work_data(work, cpu << WORK_STRUCT_FLAG_BITS, WORK_STRUCT_PENDING); +} + +static void clear_work_data(struct work_struct *work) +{ + set_work_data(work, WORK_STRUCT_NO_CPU, 0); +} + +static struct cpu_workqueue_struct *get_work_cwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + + if (data & WORK_STRUCT_CWQ) + return (void *)(data & WORK_STRUCT_WQ_DATA_MASK); + else + return NULL; +} + +static struct global_cwq *get_work_gcwq(struct work_struct *work) +{ + unsigned long data = atomic_long_read(&work->data); + unsigned int cpu; + + if (data & WORK_STRUCT_CWQ) + return ((struct cpu_workqueue_struct *) + (data & WORK_STRUCT_WQ_DATA_MASK))->gcwq; + + cpu = data >> WORK_STRUCT_FLAG_BITS; + if (cpu == WORK_CPU_NONE) + return NULL; + + BUG_ON(cpu >= nr_cpu_ids && cpu != WORK_CPU_UNBOUND); + return get_gcwq(cpu); +} + +/* + * Policy functions. These define the policies on how the global + * worker pool is managed. 
Unless noted otherwise, these functions + * assume that they're being called with gcwq->lock held. + */ + +static bool __need_more_worker(struct global_cwq *gcwq) +{ + return !atomic_read(get_gcwq_nr_running(gcwq->cpu)) || + gcwq->flags & GCWQ_HIGHPRI_PENDING; +} + +/* + * Need to wake up a worker? Called from anything but currently + * running workers. + */ +static bool need_more_worker(struct global_cwq *gcwq) +{ + return !list_empty(&gcwq->worklist) && __need_more_worker(gcwq); +} + +/* Can I start working? Called from busy but !running workers. */ +static bool may_start_working(struct global_cwq *gcwq) +{ + return gcwq->nr_idle; +} + +/* Do I need to keep working? Called from currently running workers. */ +static bool keep_working(struct global_cwq *gcwq) +{ + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + return !list_empty(&gcwq->worklist) && + (atomic_read(nr_running) <= 1 || + gcwq->flags & GCWQ_HIGHPRI_PENDING); +} + +/* Do we need a new worker? Called from manager. */ +static bool need_to_create_worker(struct global_cwq *gcwq) +{ + return need_more_worker(gcwq) && !may_start_working(gcwq); +} + +/* Do I need to be the manager? */ +static bool need_to_manage_workers(struct global_cwq *gcwq) +{ + return need_to_create_worker(gcwq) || gcwq->flags & GCWQ_MANAGE_WORKERS; +} + +/* Do we have too many workers and should some go away? */ +static bool too_many_workers(struct global_cwq *gcwq) +{ + bool managing = gcwq->flags & GCWQ_MANAGING_WORKERS; + int nr_idle = gcwq->nr_idle + managing; /* manager is considered idle */ + int nr_busy = gcwq->nr_workers - nr_idle; + + return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; +} + +/* + * Wake up functions. + */ + +/* Return the first worker. 
Safe with preemption disabled */ +static struct worker *first_worker(struct global_cwq *gcwq) +{ + if (unlikely(list_empty(&gcwq->idle_list))) + return NULL; + + return list_first_entry(&gcwq->idle_list, struct worker, entry); +} + +/** + * wake_up_worker - wake up an idle worker + * @gcwq: gcwq to wake worker for + * + * Wake up the first idle worker of @gcwq. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void wake_up_worker(struct global_cwq *gcwq) +{ + struct worker *worker = first_worker(gcwq); + + if (likely(worker)) + wake_up_process(worker->task); +} + +/** + * wq_worker_waking_up - a worker is waking up + * @task: task waking up + * @cpu: CPU @task is waking up to + * + * This function is called during try_to_wake_up() when a worker is + * being awoken. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +{ + struct worker *worker = kthread_data(task); + + if (!(worker->flags & WORKER_NOT_RUNNING)) + atomic_inc(get_gcwq_nr_running(cpu)); +} + +/** + * wq_worker_sleeping - a worker is going to sleep + * @task: task going to sleep + * @cpu: CPU in question, must be the current CPU number + * + * This function is called during schedule() when a busy worker is + * going to sleep. Worker on the same cpu can be woken up by + * returning pointer to its task. + * + * CONTEXT: + * spin_lock_irq(rq->lock) + * + * RETURNS: + * Worker task on @cpu to wake up, %NULL if none. 
+ */ +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu) +{ + struct worker *worker = kthread_data(task), *to_wakeup = NULL; + struct global_cwq *gcwq = get_gcwq(cpu); + atomic_t *nr_running = get_gcwq_nr_running(cpu); + + if (worker->flags & WORKER_NOT_RUNNING) + return NULL; + + /* this can only happen on the local cpu */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* + * The counterpart of the following dec_and_test, implied mb, + * worklist not empty test sequence is in insert_work(). + * Please read comment there. + * + * NOT_RUNNING is clear. This means that trustee is not in + * charge and we're running on the local cpu w/ rq lock held + * and preemption disabled, which in turn means that none else + * could be manipulating idle_list, so dereferencing idle_list + * without gcwq lock is safe. + */ + if (atomic_dec_and_test(nr_running) && !list_empty(&gcwq->worklist)) + to_wakeup = first_worker(gcwq); + return to_wakeup ? to_wakeup->task : NULL; +} + +/** + * worker_set_flags - set worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to set + * @wakeup: wakeup an idle worker if necessary + * + * Set @flags in @worker->flags and adjust nr_running accordingly. If + * nr_running becomes zero and @wakeup is %true, an idle worker is + * woken up. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_set_flags(struct worker *worker, unsigned int flags, + bool wakeup) +{ + struct global_cwq *gcwq = worker->gcwq; + + WARN_ON_ONCE(worker->task != current); + + /* + * If transitioning into NOT_RUNNING, adjust nr_running and + * wake up an idle worker as necessary if requested by + * @wakeup. 
+ */ + if ((flags & WORKER_NOT_RUNNING) && + !(worker->flags & WORKER_NOT_RUNNING)) { + atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu); + + if (wakeup) { + if (atomic_dec_and_test(nr_running) && + !list_empty(&gcwq->worklist)) + wake_up_worker(gcwq); + } else + atomic_dec(nr_running); + } + + worker->flags |= flags; +} + +/** + * worker_clr_flags - clear worker flags and adjust nr_running accordingly + * @worker: self + * @flags: flags to clear + * + * Clear @flags in @worker->flags and adjust nr_running accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) + */ +static inline void worker_clr_flags(struct worker *worker, unsigned int flags) +{ + struct global_cwq *gcwq = worker->gcwq; + unsigned int oflags = worker->flags; + + WARN_ON_ONCE(worker->task != current); + + worker->flags &= ~flags; + + /* + * If transitioning out of NOT_RUNNING, increment nr_running. Note + * that the nested NOT_RUNNING is not a noop. NOT_RUNNING is mask + * of multiple flags, not a single flag. + */ + if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING)) + if (!(worker->flags & WORKER_NOT_RUNNING)) + atomic_inc(get_gcwq_nr_running(gcwq->cpu)); +} + +/** + * busy_worker_head - return the busy hash head for a work + * @gcwq: gcwq of interest + * @work: work to be hashed + * + * Return hash head of @gcwq for @work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to the hash head. + */ +static struct hlist_head *busy_worker_head(struct global_cwq *gcwq, + struct work_struct *work) +{ + const int base_shift = ilog2(sizeof(struct work_struct)); + unsigned long v = (unsigned long)work; + + /* simple shift and fold hash, do we need something better? 
*/ + v >>= base_shift; + v += v >> BUSY_WORKER_HASH_ORDER; + v &= BUSY_WORKER_HASH_MASK; + + return &gcwq->busy_hash[v]; +} + +/** + * __find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @bwh: hash head as returned by busy_worker_head() + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. @bwh should be + * the hash head obtained by calling busy_worker_head() with the same + * work. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *__find_worker_executing_work(struct global_cwq *gcwq, + struct hlist_head *bwh, + struct work_struct *work) +{ + struct worker *worker; + struct hlist_node *tmp; + + hlist_for_each_entry(worker, tmp, bwh, hentry) + if (worker->current_work == work) + return worker; + return NULL; +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @gcwq: gcwq of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @gcwq. This function is + * identical to __find_worker_executing_work() except that this + * function calculates @bwh itself. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to worker which is executing @work if found, NULL + * otherwise. + */ +static struct worker *find_worker_executing_work(struct global_cwq *gcwq, + struct work_struct *work) +{ + return __find_worker_executing_work(gcwq, busy_worker_head(gcwq, work), + work); +} + +/** + * gcwq_determine_ins_pos - find insertion position + * @gcwq: gcwq of interest + * @cwq: cwq a work is being queued for + * + * A work for @cwq is about to be queued on @gcwq, determine insertion + * position for the work. If @cwq is for HIGHPRI wq, the work is + * queued at the head of the queue but in FIFO order with respect to + * other HIGHPRI works; otherwise, at the end of the queue. 
This + * function also sets GCWQ_HIGHPRI_PENDING flag to hint @gcwq that + * there are HIGHPRI works pending. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + * + * RETURNS: + * Pointer to insertion position. + */ +static inline struct list_head *gcwq_determine_ins_pos(struct global_cwq *gcwq, + struct cpu_workqueue_struct *cwq) +{ + struct work_struct *twork; + + if (likely(!(cwq->wq->flags & WQ_HIGHPRI))) + return &gcwq->worklist; + + list_for_each_entry(twork, &gcwq->worklist, entry) { + struct cpu_workqueue_struct *tcwq = get_work_cwq(twork); + + if (!(tcwq->wq->flags & WQ_HIGHPRI)) + break; + } + + gcwq->flags |= GCWQ_HIGHPRI_PENDING; + return &twork->entry; +} + +/** + * insert_work - insert a work into gcwq + * @cwq: cwq @work belongs to + * @work: work to insert + * @head: insertion point + * @extra_flags: extra WORK_STRUCT_* flags to set + * + * Insert @work which belongs to @cwq into @gcwq after @head. + * @extra_flags is or'd to work_struct flags. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void insert_work(struct cpu_workqueue_struct *cwq, + struct work_struct *work, struct list_head *head, + unsigned int extra_flags) +{ + struct global_cwq *gcwq = cwq->gcwq; + + /* we own @work, set data and link */ + set_work_cwq(work, cwq, extra_flags); + + /* + * Ensure that we get the right work->data if we see the + * result of list_add_tail() below, see try_to_grab_pending(). + */ + smp_wmb(); + + list_add_tail(&work->entry, head); + + /* + * Ensure either wq_worker_sleeping() sees the above + * list_add_tail() or we see zero nr_running to avoid workers + * lying around lazily while there are works to be processed. + */ + smp_mb(); + + if (__need_more_worker(gcwq)) + wake_up_worker(gcwq); +} + +/* + * Test whether @work is being queued from another work executing on the + * same workqueue. This is rather expensive and should only be used from + * cold paths.
+ */ +static bool is_chained_work(struct workqueue_struct *wq) +{ + unsigned long flags; + unsigned int cpu; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + struct hlist_node *pos; + int i; + + spin_lock_irqsave(&gcwq->lock, flags); + for_each_busy_worker(worker, i, pos, gcwq) { + if (worker->task != current) + continue; + spin_unlock_irqrestore(&gcwq->lock, flags); + /* + * I'm @worker, no locking necessary. See if @work + * is headed to the same workqueue. + */ + return worker->current_cwq->wq == wq; + } + spin_unlock_irqrestore(&gcwq->lock, flags); + } + return false; +} + +static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, + struct work_struct *work) +{ + struct global_cwq *gcwq; + struct cpu_workqueue_struct *cwq; + struct list_head *worklist; + unsigned int work_flags; + unsigned long flags; + + debug_work_activate(work); + + /* if dying, only works from the same workqueue are allowed */ + if (unlikely(wq->flags & WQ_DYING) && + WARN_ON_ONCE(!is_chained_work(wq))) + return; + + /* determine gcwq to use */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *last_gcwq; + + if (unlikely(cpu == WORK_CPU_UNBOUND)) + cpu = raw_smp_processor_id(); + + /* + * It's multi cpu. If @wq is non-reentrant and @work + * was previously on a different cpu, it might still + * be running there, in which case the work needs to + * be queued on that cpu to guarantee non-reentrance. + */ + gcwq = get_gcwq(cpu); + if (wq->flags & WQ_NON_REENTRANT && + (last_gcwq = get_work_gcwq(work)) && last_gcwq != gcwq) { + struct worker *worker; + + spin_lock_irqsave(&last_gcwq->lock, flags); + + worker = find_worker_executing_work(last_gcwq, work); + + if (worker && worker->current_cwq->wq == wq) + gcwq = last_gcwq; + else { + /* meh... 
not running there, queue here */ + spin_unlock_irqrestore(&last_gcwq->lock, flags); + spin_lock_irqsave(&gcwq->lock, flags); + } + } else + spin_lock_irqsave(&gcwq->lock, flags); + } else { + gcwq = get_gcwq(WORK_CPU_UNBOUND); + spin_lock_irqsave(&gcwq->lock, flags); + } + + /* gcwq determined, get cwq and queue */ + cwq = get_cwq(gcwq->cpu, wq); + trace_workqueue_queue_work(cpu, cwq, work); + + BUG_ON(!list_empty(&work->entry)); + + cwq->nr_in_flight[cwq->work_color]++; + work_flags = work_color_to_flags(cwq->work_color); + + if (likely(cwq->nr_active < cwq->max_active)) { + trace_workqueue_activate_work(work); + cwq->nr_active++; + worklist = gcwq_determine_ins_pos(gcwq, cwq); + } else { + work_flags |= WORK_STRUCT_DELAYED; + worklist = &cwq->delayed_works; + } + + insert_work(cwq, work, worklist, work_flags); + + spin_unlock_irqrestore(&gcwq->lock, flags); +} + +/** + * queue_work - queue work on a workqueue + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to the CPU on which it was submitted, but if the CPU dies + * it can be processed by another CPU. + */ +int queue_work(struct workqueue_struct *wq, struct work_struct *work) +{ + int ret; + + ret = queue_work_on(get_cpu(), wq, work); + put_cpu(); + + return ret; +} +EXPORT_SYMBOL_GPL(queue_work); + +/** + * queue_work_on - queue work on specific cpu + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @work: work to queue + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + * + * We queue the work to a specific CPU, the caller must ensure it + * can't go away. 
+ */ +int +queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) +{ + int ret = 0; + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + __queue_work(cpu, wq, work); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_work_on); + +static void delayed_work_timer_fn(unsigned long __data) +{ + struct delayed_work *dwork = (struct delayed_work *)__data; + struct cpu_workqueue_struct *cwq = get_work_cwq(&dwork->work); + + __queue_work(smp_processor_id(), cwq->wq, &dwork->work); +} + +/** + * queue_delayed_work - queue work on a workqueue after delay + * @wq: workqueue to use + * @dwork: delayable work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + */ +int queue_delayed_work(struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + if (delay == 0) + return queue_work(wq, &dwork->work); + + return queue_delayed_work_on(-1, wq, dwork, delay); +} +EXPORT_SYMBOL_GPL(queue_delayed_work); + +/** + * queue_delayed_work_on - queue work on specific CPU after delay + * @cpu: CPU number to execute work on + * @wq: workqueue to use + * @dwork: work to queue + * @delay: number of jiffies to wait before queueing + * + * Returns 0 if @work was already on a queue, non-zero otherwise. + */ +int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, unsigned long delay) +{ + int ret = 0; + struct timer_list *timer = &dwork->timer; + struct work_struct *work = &dwork->work; + + if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) { + unsigned int lcpu; + + BUG_ON(timer_pending(timer)); + BUG_ON(!list_empty(&work->entry)); + + timer_stats_timer_set_start_info(&dwork->timer); + + /* + * This stores cwq for the moment, for the timer_fn. + * Note that the work's gcwq is preserved to allow + * reentrance detection for delayed works. 
+ */ + if (!(wq->flags & WQ_UNBOUND)) { + struct global_cwq *gcwq = get_work_gcwq(work); + + if (gcwq && gcwq->cpu != WORK_CPU_UNBOUND) + lcpu = gcwq->cpu; + else + lcpu = raw_smp_processor_id(); + } else + lcpu = WORK_CPU_UNBOUND; + + set_work_cwq(work, get_cwq(lcpu, wq), 0); + + timer->expires = jiffies + delay; + timer->data = (unsigned long)dwork; + timer->function = delayed_work_timer_fn; + + if (unlikely(cpu >= 0)) + add_timer_on(timer, cpu); + else + add_timer(timer); + ret = 1; + } + return ret; +} +EXPORT_SYMBOL_GPL(queue_delayed_work_on); + +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_enter_idle(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(worker->flags & WORKER_IDLE); + BUG_ON(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev)); + + /* can't use worker_set_flags(), also called from start_worker() */ + worker->flags |= WORKER_IDLE; + gcwq->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &gcwq->idle_list); + + if (likely(!(worker->flags & WORKER_ROGUE))) { + if (too_many_workers(gcwq) && !timer_pending(&gcwq->idle_timer)) + mod_timer(&gcwq->idle_timer, + jiffies + IDLE_WORKER_TIMEOUT); + } else + wake_up_all(&gcwq->trustee_wait); + + /* + * Sanity check nr_running. Because trustee releases gcwq->lock + * between setting %WORKER_ROGUE and zapping nr_running, the + * warning may trigger spuriously. Check iff trustee is idle. + */ + WARN_ON_ONCE(gcwq->trustee_state == TRUSTEE_DONE && + gcwq->nr_workers == gcwq->nr_idle && + atomic_read(get_gcwq_nr_running(gcwq->cpu))); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. 
+ * + * LOCKING: + * spin_lock_irq(gcwq->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + + BUG_ON(!(worker->flags & WORKER_IDLE)); + worker_clr_flags(worker, WORKER_IDLE); + gcwq->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock gcwq + * @worker: self + * + * Works which are scheduled while the cpu is online must at least be + * scheduled to a worker which is bound to the cpu so that if they are + * flushed from cpu callbacks while cpu is going down, they are + * guaranteed to execute on the cpu. + * + * This function is to be used by rogue workers and rescuers to bind + * themselves to the target cpu and may race with cpu going down or + * coming online. kthread_bind() can't be used because it may put the + * worker to already dead cpu and set_cpus_allowed_ptr() can't be used + * verbatim as it's best effort and blocking and gcwq may be + * [dis]associated in the meantime. + * + * This function tries set_cpus_allowed() and locks gcwq and verifies + * the binding against GCWQ_DISASSOCIATED which is set during + * CPU_DYING and cleared during CPU_ONLINE, so if the worker enters + * idle state or fetches works without dropping lock, it can guarantee + * the scheduling requirement described in the first paragraph. + * + * CONTEXT: + * Might sleep. Called without any lock but returns with gcwq->lock + * held. + * + * RETURNS: + * %true if the associated gcwq is online (@worker is successfully + * bound), %false if offline. + */ +static bool worker_maybe_bind_and_lock(struct worker *worker) +__acquires(&gcwq->lock) +{ + struct global_cwq *gcwq = worker->gcwq; + struct task_struct *task = worker->task; + + while (true) { + /* + * The following call may fail, succeed or succeed + * without actually migrating the task to the cpu if + * it races with cpu hotunplug operation. Verify + * against GCWQ_DISASSOCIATED. 
+ */ + if (!(gcwq->flags & GCWQ_DISASSOCIATED)) + set_cpus_allowed_ptr(task, get_cpu_mask(gcwq->cpu)); + + spin_lock_irq(&gcwq->lock); + if (gcwq->flags & GCWQ_DISASSOCIATED) + return false; + if (task_cpu(task) == gcwq->cpu && + cpumask_equal(&current->cpus_allowed, + get_cpu_mask(gcwq->cpu))) + return true; + spin_unlock_irq(&gcwq->lock); + + /* + * We've raced with CPU hot[un]plug. Give it a breather + * and retry migration. cond_resched() is required here; + * otherwise, we might deadlock against cpu_stop trying to + * bring down the CPU on non-preemptive kernel. + */ + cpu_relax(); + cond_resched(); + } +} + +/* + * Function for worker->rebind_work used to rebind rogue busy workers + * to the associated cpu which is coming back online. This is + * scheduled by cpu up but can race with other cpu hotplug operations + * and may be executed twice without intervening cpu down. + */ +static void worker_rebind_fn(struct work_struct *work) +{ + struct worker *worker = container_of(work, struct worker, rebind_work); + struct global_cwq *gcwq = worker->gcwq; + + if (worker_maybe_bind_and_lock(worker)) + worker_clr_flags(worker, WORKER_REBIND); + + spin_unlock_irq(&gcwq->lock); +} + +static struct worker *alloc_worker(void) +{ + struct worker *worker; + + worker = kzalloc(sizeof(*worker), GFP_KERNEL); + if (worker) { + INIT_LIST_HEAD(&worker->entry); + INIT_LIST_HEAD(&worker->scheduled); + INIT_WORK(&worker->rebind_work, worker_rebind_fn); + /* on creation a worker is in !idle && prep state */ + worker->flags = WORKER_PREP; + } + return worker; +} + +/** + * create_worker - create a new workqueue worker + * @gcwq: gcwq the new worker will belong to + * @bind: whether to set affinity to @cpu or not + * + * Create a new worker which is bound to @gcwq. The returned worker + * can be started by calling start_worker() or destroyed using + * destroy_worker(). + * + * CONTEXT: + * Might sleep. Does GFP_KERNEL allocations. + * + * RETURNS: + * Pointer to the newly created worker.
+ */ +static struct worker *create_worker(struct global_cwq *gcwq, bool bind) +{ + bool on_unbound_cpu = gcwq->cpu == WORK_CPU_UNBOUND; + struct worker *worker = NULL; + int id = -1; + + spin_lock_irq(&gcwq->lock); + while (ida_get_new(&gcwq->worker_ida, &id)) { + spin_unlock_irq(&gcwq->lock); + if (!ida_pre_get(&gcwq->worker_ida, GFP_KERNEL)) + goto fail; + spin_lock_irq(&gcwq->lock); + } + spin_unlock_irq(&gcwq->lock); + + worker = alloc_worker(); + if (!worker) + goto fail; + + worker->gcwq = gcwq; + worker->id = id; + + if (!on_unbound_cpu) + worker->task = kthread_create_on_node(worker_thread, + worker, + cpu_to_node(gcwq->cpu), + "kworker/%u:%d", gcwq->cpu, id); + else + worker->task = kthread_create(worker_thread, worker, + "kworker/u:%d", id); + if (IS_ERR(worker->task)) + goto fail; + + /* + * A rogue worker will become a regular one if CPU comes + * online later on. Make sure every worker has + * PF_THREAD_BOUND set. + */ + if (bind && !on_unbound_cpu) + kthread_bind(worker->task, gcwq->cpu); + else { + worker->task->flags |= PF_THREAD_BOUND; + if (on_unbound_cpu) + worker->flags |= WORKER_UNBOUND; + } + + return worker; +fail: + if (id >= 0) { + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); + spin_unlock_irq(&gcwq->lock); + } + kfree(worker); + return NULL; +} + +/** + * start_worker - start a newly created worker + * @worker: worker to start + * + * Make the gcwq aware of @worker and start it. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock). + */ +static void start_worker(struct worker *worker) +{ + worker->flags |= WORKER_STARTED; + worker->gcwq->nr_workers++; + worker_enter_idle(worker); + wake_up_process(worker->task); +} + +/** + * destroy_worker - destroy a workqueue worker + * @worker: worker to be destroyed + * + * Destroy @worker and adjust @gcwq stats accordingly. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which is released and regrabbed. 
+ */ +static void destroy_worker(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + int id = worker->id; + + /* sanity check frenzy */ + BUG_ON(worker->current_work); + BUG_ON(!list_empty(&worker->scheduled)); + + if (worker->flags & WORKER_STARTED) + gcwq->nr_workers--; + if (worker->flags & WORKER_IDLE) + gcwq->nr_idle--; + + list_del_init(&worker->entry); + worker->flags |= WORKER_DIE; + + spin_unlock_irq(&gcwq->lock); + + kthread_stop(worker->task); + kfree(worker); + + spin_lock_irq(&gcwq->lock); + ida_remove(&gcwq->worker_ida, id); +} + +static void idle_worker_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + + spin_lock_irq(&gcwq->lock); + + if (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + /* idle_list is kept in LIFO order, check the last one */ + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) + mod_timer(&gcwq->idle_timer, expires); + else { + /* it's been idle for too long, wake up manager */ + gcwq->flags |= GCWQ_MANAGE_WORKERS; + wake_up_worker(gcwq); + } + } + + spin_unlock_irq(&gcwq->lock); +} + +static bool send_mayday(struct work_struct *work) +{ + struct cpu_workqueue_struct *cwq = get_work_cwq(work); + struct workqueue_struct *wq = cwq->wq; + unsigned int cpu; + + if (!(wq->flags & WQ_RESCUER)) + return false; + + /* mayday mayday mayday */ + cpu = cwq->gcwq->cpu; + /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ + if (cpu == WORK_CPU_UNBOUND) + cpu = 0; + if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + wake_up_process(wq->rescuer->task); + return true; +} + +static void gcwq_mayday_timeout(unsigned long __gcwq) +{ + struct global_cwq *gcwq = (void *)__gcwq; + struct work_struct *work; + + spin_lock_irq(&gcwq->lock); + + if (need_to_create_worker(gcwq)) { + /* + * We've been trying to create a new worker but + * haven't been 
successful. We might be hitting an + * allocation deadlock. Send distress signals to + * rescuers. + */ + list_for_each_entry(work, &gcwq->worklist, entry) + send_mayday(work); + } + + spin_unlock_irq(&gcwq->lock); + + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INTERVAL); +} + +/** + * maybe_create_worker - create a new worker if necessary + * @gcwq: gcwq to create a new worker for + * + * Create a new worker for @gcwq if necessary. @gcwq is guaranteed to + * have at least one idle worker on return from this function. If + * creating a new worker takes longer than MAYDAY_INTERVAL, mayday is + * sent to all rescuers with works scheduled on @gcwq to resolve + * possible allocation deadlock. + * + * On return, need_to_create_worker() is guaranteed to be false and + * may_start_working() true. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. Called only from + * manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. 
+ */ +static bool maybe_create_worker(struct global_cwq *gcwq) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) +{ + if (!need_to_create_worker(gcwq)) + return false; +restart: + spin_unlock_irq(&gcwq->lock); + + /* if we don't make progress in MAYDAY_INITIAL_TIMEOUT, call for help */ + mod_timer(&gcwq->mayday_timer, jiffies + MAYDAY_INITIAL_TIMEOUT); + + while (true) { + struct worker *worker; + + worker = create_worker(gcwq, true); + if (worker) { + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + BUG_ON(need_to_create_worker(gcwq)); + return true; + } + + if (!need_to_create_worker(gcwq)) + break; + + __set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(CREATE_COOLDOWN); + + if (!need_to_create_worker(gcwq)) + break; + } + + del_timer_sync(&gcwq->mayday_timer); + spin_lock_irq(&gcwq->lock); + if (need_to_create_worker(gcwq)) + goto restart; + return true; +} + +/** + * maybe_destroy_workers - destroy workers which have been idle for a while + * @gcwq: gcwq to destroy workers for + * + * Destroy @gcwq workers which have been idle for longer than + * IDLE_WORKER_TIMEOUT. + * + * LOCKING: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Called only from manager. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true + * otherwise. + */ +static bool maybe_destroy_workers(struct global_cwq *gcwq) +{ + bool ret = false; + + while (too_many_workers(gcwq)) { + struct worker *worker; + unsigned long expires; + + worker = list_entry(gcwq->idle_list.prev, struct worker, entry); + expires = worker->last_active + IDLE_WORKER_TIMEOUT; + + if (time_before(jiffies, expires)) { + mod_timer(&gcwq->idle_timer, expires); + break; + } + + destroy_worker(worker); + ret = true; + } + + return ret; +} + +/** + * manage_workers - manage worker pool + * @worker: self + * + * Assume the manager role and manage gcwq worker pool @worker belongs + * to.
At any given time, there can be only zero or one manager per + * gcwq. The exclusion is handled automatically by this function. + * + * The caller can safely start processing works on false return. On + * true return, it's guaranteed that need_to_create_worker() is false + * and may_start_working() is true. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. Does GFP_KERNEL allocations. + * + * RETURNS: + * false if no action was taken and gcwq->lock stayed locked, true if + * some action was taken. + */ +static bool manage_workers(struct worker *worker) +{ + struct global_cwq *gcwq = worker->gcwq; + bool ret = false; + + if (gcwq->flags & GCWQ_MANAGING_WORKERS) + return ret; + + gcwq->flags &= ~GCWQ_MANAGE_WORKERS; + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + /* + * Destroy and then create so that may_start_working() is true + * on return. + */ + ret |= maybe_destroy_workers(gcwq); + ret |= maybe_create_worker(gcwq); + + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* + * The trustee might be waiting to take over the manager + * position, tell it we're done. + */ + if (unlikely(gcwq->trustee)) + wake_up_all(&gcwq->trustee_wait); + + return ret; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out parameter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * spin_lock_irq(gcwq->lock).
+ */
+static void move_linked_works(struct work_struct *work, struct list_head *head,
+			      struct work_struct **nextp)
+{
+	struct work_struct *n;
+
+	/*
+	 * Linked worklist will always end before the end of the list,
+	 * use NULL for list head.  The walk stops after moving the
+	 * first work which doesn't have WORK_STRUCT_LINKED set.
+	 */
+	list_for_each_entry_safe_from(work, n, NULL, entry) {
+		list_move_tail(&work->entry, head);
+		if (!(*work_data_bits(work) & WORK_STRUCT_LINKED))
+			break;
+	}
+
+	/*
+	 * If we're already inside safe list traversal and have moved
+	 * multiple works to the scheduled queue, the next position
+	 * needs to be updated.
+	 */
+	if (nextp)
+		*nextp = n;
+}
+/**
+ * cwq_activate_first_delayed - activate the first delayed work of a cwq
+ * @cwq: cwq of interest
+ *
+ * Move the first entry of @cwq->delayed_works, along with the works
+ * linked to it, to the position chosen by gcwq_determine_ins_pos(),
+ * clear its WORK_STRUCT_DELAYED bit and account it in @cwq->nr_active.
+ * The list is not checked for emptiness here.
+ */
+static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
+{
+	struct work_struct *work = list_first_entry(&cwq->delayed_works,
+						    struct work_struct, entry);
+	struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
+
+	trace_workqueue_activate_work(work);
+	move_linked_works(work, pos, NULL);
+	__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
+	cwq->nr_active++;
+}
+
+/**
+ * cwq_dec_nr_in_flight - decrement cwq's nr_in_flight
+ * @cwq: cwq of interest
+ * @color: color of work which left the queue
+ * @delayed: %true if the work was a delayed one; @cwq->nr_active is
+ *	     left untouched and no delayed work is activated in that case
+ *
+ * A work either has completed or is removed from pending queue,
+ * decrement nr_in_flight of its cwq and handle workqueue flushing.
+ * Works with WORK_NO_COLOR are ignored altogether.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void cwq_dec_nr_in_flight(struct cpu_workqueue_struct *cwq, int color,
+				 bool delayed)
+{
+	/* ignore uncolored works */
+	if (color == WORK_NO_COLOR)
+		return;
+
+	cwq->nr_in_flight[color]--;
+
+	if (!delayed) {
+		cwq->nr_active--;
+		if (!list_empty(&cwq->delayed_works)) {
+			/* one down, submit a delayed one */
+			if (cwq->nr_active < cwq->max_active)
+				cwq_activate_first_delayed(cwq);
+		}
+	}
+
+	/* is flush in progress and are we at the flushing tip? */
+	if (likely(cwq->flush_color != color))
+		return;
+
+	/* are there still in-flight works? */
+	if (cwq->nr_in_flight[color])
+		return;
+
+	/* this cwq is done, clear flush_color */
+	cwq->flush_color = -1;
+
+	/*
+	 * If this was the last cwq, wake up the first flusher.  It
+	 * will handle the rest.
+	 */
+	if (atomic_dec_and_test(&cwq->wq->nr_cwqs_to_flush))
+		complete(&cwq->wq->first_flusher->done);
+}
+
+/**
+ * process_one_work - process single work
+ * @worker: self
+ * @work: work to process
+ *
+ * Process @work.  This function contains all the logic necessary to
+ * process a single work including synchronization against and
+ * interaction with other workers on the same cpu, queueing and
+ * flushing.  As long as context requirement is met, any worker can
+ * call this function to process a work.
+ *
+ * If another worker of the same gcwq is found to be executing @work
+ * already, @work and its linked works are moved to that worker's
+ * scheduled list and nothing is executed here.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which is released and regrabbed.
+ */
+static void process_one_work(struct worker *worker, struct work_struct *work)
+__releases(&gcwq->lock)
+__acquires(&gcwq->lock)
+{
+	struct cpu_workqueue_struct *cwq = get_work_cwq(work);
+	struct global_cwq *gcwq = cwq->gcwq;
+	struct hlist_head *bwh = busy_worker_head(gcwq, work);
+	bool cpu_intensive = cwq->wq->flags & WQ_CPU_INTENSIVE;
+	work_func_t f = work->func;	/* cached, @work isn't touched after f() */
+	int work_color;
+	struct worker *collision;
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * It is permissible to free the struct work_struct from
+	 * inside the function that is called from it, this we need to
+	 * take into account for lockdep too.  To avoid bogus "held
+	 * lock freed" warnings as well as problems when looking into
+	 * work->lockdep_map, make a copy and use that here.
+	 */
+	struct lockdep_map lockdep_map = work->lockdep_map;
+#endif
+	/*
+	 * A single work shouldn't be executed concurrently by
+	 * multiple workers on a single cpu.  Check whether anyone is
+	 * already processing the work.  If so, defer the work to the
+	 * currently executing one.
+	 */
+	collision = __find_worker_executing_work(gcwq, bwh, work);
+	if (unlikely(collision)) {
+		move_linked_works(work, &collision->scheduled, NULL);
+		return;
+	}
+
+	/* claim and process */
+	debug_work_deactivate(work);
+	hlist_add_head(&worker->hentry, bwh);
+	worker->current_work = work;
+	worker->current_cwq = cwq;
+	work_color = get_work_color(work);	/* saved for cwq_dec_nr_in_flight() */
+
+	/* record the current cpu number in the work data and dequeue */
+	set_work_cpu(work, gcwq->cpu);
+	list_del_init(&work->entry);
+
+	/*
+	 * If HIGHPRI_PENDING, check the next work, and, if HIGHPRI,
+	 * wake up another worker; otherwise, clear HIGHPRI_PENDING.
+	 */
+	if (unlikely(gcwq->flags & GCWQ_HIGHPRI_PENDING)) {
+		struct work_struct *nwork = list_first_entry(&gcwq->worklist,
+						struct work_struct, entry);
+
+		if (!list_empty(&gcwq->worklist) &&
+		    get_work_cwq(nwork)->wq->flags & WQ_HIGHPRI)
+			wake_up_worker(gcwq);
+		else
+			gcwq->flags &= ~GCWQ_HIGHPRI_PENDING;
+	}
+
+	/*
+	 * CPU intensive works don't participate in concurrency
+	 * management.  They're the scheduler's responsibility.
+	 */
+	if (unlikely(cpu_intensive))
+		worker_set_flags(worker, WORKER_CPU_INTENSIVE, true);
+
+	spin_unlock_irq(&gcwq->lock);
+
+	work_clear_pending(work);
+	lock_map_acquire_read(&cwq->wq->lockdep_map);
+	lock_map_acquire(&lockdep_map);
+	trace_workqueue_execute_start(work);
+	f(work);
+	/*
+	 * While we must be careful to not use "work" after this, the trace
+	 * point will only record its address.
+	 */
+	trace_workqueue_execute_end(work);
+	lock_map_release(&lockdep_map);
+	lock_map_release(&cwq->wq->lockdep_map);
+
+	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
+		printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
+		       "%s/0x%08x/%d\n",
+		       current->comm, preempt_count(), task_pid_nr(current));
+		printk(KERN_ERR " last function: ");
+		print_symbol("%s\n", (unsigned long)f);
+		debug_show_held_locks(current);
+		dump_stack();
+	}
+
+	spin_lock_irq(&gcwq->lock);
+
+	/* clear cpu intensive status */
+	if (unlikely(cpu_intensive))
+		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
+
+	/* we're done with it, release */
+	hlist_del_init(&worker->hentry);
+	worker->current_work = NULL;
+	worker->current_cwq = NULL;
+	cwq_dec_nr_in_flight(cwq, work_color, false);
+}
+
+/**
+ * process_scheduled_works - process scheduled works
+ * @worker: self
+ *
+ * Process all scheduled works.  Please note that the scheduled list
+ * may change while processing a work, so this function repeatedly
+ * fetches a work from the top and executes it.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock) which may be released and regrabbed
+ * multiple times.
+ */
+static void process_scheduled_works(struct worker *worker)
+{
+	while (!list_empty(&worker->scheduled)) {
+		struct work_struct *work = list_first_entry(&worker->scheduled,
+						struct work_struct, entry);
+		process_one_work(worker, work);
+	}
+}
+
+/**
+ * worker_thread - the worker thread function
+ * @__worker: self
+ *
+ * The gcwq worker thread function.  There's a single dynamic pool of
+ * these per each cpu.  These workers process all works regardless of
+ * their specific target workqueue.  The only exception is works which
+ * belong to workqueues with a rescuer which will be explained in
+ * rescuer_thread().
+ */
+static int worker_thread(void *__worker)
+{
+	struct worker *worker = __worker;
+	struct global_cwq *gcwq = worker->gcwq;
+
+	/* tell the scheduler that this is a workqueue worker */
+	worker->task->flags |= PF_WQ_WORKER;
+woke_up:
+	spin_lock_irq(&gcwq->lock);
+
+	/*
+	 * DIE can be set only while we're idle, checking here is enough.
+	 * This is the only place where the thread function returns.
+	 */
+	if (worker->flags & WORKER_DIE) {
+		spin_unlock_irq(&gcwq->lock);
+		worker->task->flags &= ~PF_WQ_WORKER;
+		return 0;
+	}
+
+	worker_leave_idle(worker);
+recheck:
+	/* no more worker necessary? */
+	if (!need_more_worker(gcwq))
+		goto sleep;
+
+	/* do we need to manage?  if managing took action, start over */
+	if (unlikely(!may_start_working(gcwq)) && manage_workers(worker))
+		goto recheck;
+
+	/*
+	 * ->scheduled list can only be filled while a worker is
+	 * preparing to process a work or actually processing it.
+	 * Make sure nobody diddled with it while I was sleeping.
+	 */
+	BUG_ON(!list_empty(&worker->scheduled));
+
+	/*
+	 * When control reaches this point, we're guaranteed to have
+	 * at least one idle worker or that someone else has already
+	 * assumed the manager role.
+	 */
+	worker_clr_flags(worker, WORKER_PREP);
+
+	do {
+		struct work_struct *work =
+			list_first_entry(&gcwq->worklist,
+					 struct work_struct, entry);
+
+		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
+			/* optimization path, not strictly necessary */
+			process_one_work(worker, work);
+			if (unlikely(!list_empty(&worker->scheduled)))
+				process_scheduled_works(worker);
+		} else {
+			move_linked_works(work, &worker->scheduled, NULL);
+			process_scheduled_works(worker);
+		}
+	} while (keep_working(gcwq));
+
+	worker_set_flags(worker, WORKER_PREP, false);
+sleep:
+	if (unlikely(need_to_manage_workers(gcwq)) && manage_workers(worker))
+		goto recheck;
+
+	/*
+	 * gcwq->lock is held and there's no work to process and no
+	 * need to manage, sleep.  Workers are woken up only while
+	 * holding gcwq->lock or from local cpu, so setting the
+	 * current state before releasing gcwq->lock is enough to
+	 * prevent losing any event.
+	 */
+	worker_enter_idle(worker);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&gcwq->lock);
+	schedule();
+	goto woke_up;
+}
+
+/**
+ * rescuer_thread - the rescuer thread function
+ * @__wq: the associated workqueue
+ *
+ * Workqueue rescuer thread function.  There's one rescuer for each
+ * workqueue which has WQ_RESCUER set.
+ *
+ * Regular work processing on a gcwq may block trying to create a new
+ * worker which uses GFP_KERNEL allocation which has slight chance of
+ * developing into deadlock if some works currently on the same queue
+ * need to be processed to satisfy the GFP_KERNEL allocation.  This is
+ * the problem rescuer solves.
+ *
+ * When such condition is possible, the gcwq summons rescuers of all
+ * workqueues which have works queued on the gcwq and let them process
+ * those works so that forward progress can be guaranteed.
+ *
+ * This should happen rarely.
+ *
+ * RETURNS:
+ * 0, once kthread_should_stop() is observed at the top of the loop.
+ */
+static int rescuer_thread(void *__wq)
+{
+	struct workqueue_struct *wq = __wq;
+	struct worker *rescuer = wq->rescuer;
+	struct list_head *scheduled = &rescuer->scheduled;
+	bool is_unbound = wq->flags & WQ_UNBOUND;
+	unsigned int cpu;
+
+	set_user_nice(current, RESCUER_NICE_LEVEL);
+repeat:
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	if (kthread_should_stop())
+		return 0;
+
+	/*
+	 * See whether any cpu is asking for help.  Unbounded
+	 * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
+	 */
+	for_each_mayday_cpu(cpu, wq->mayday_mask) {
+		unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
+		struct cpu_workqueue_struct *cwq = get_cwq(tcpu, wq);
+		struct global_cwq *gcwq = cwq->gcwq;
+		struct work_struct *work, *n;
+
+		__set_current_state(TASK_RUNNING);
+		mayday_clear_cpu(cpu, wq->mayday_mask);
+
+		/* migrate to the target cpu if possible */
+		rescuer->gcwq = gcwq;
+		worker_maybe_bind_and_lock(rescuer);
+
+		/*
+		 * Slurp in all works issued via this workqueue and
+		 * process'em.  Only works whose cwq is @cwq are taken;
+		 * @n is passed along so that the safe walk continues
+		 * after the works which were moved away.
+		 */
+		BUG_ON(!list_empty(&rescuer->scheduled));
+		list_for_each_entry_safe(work, n, &gcwq->worklist, entry)
+			if (get_work_cwq(work) == cwq)
+				move_linked_works(work, scheduled, &n);
+
+		process_scheduled_works(rescuer);
+
+		/*
+		 * Leave this gcwq.  If keep_working() is %true, notify a
+		 * regular worker; otherwise, we end up with 0 concurrency
+		 * and stalling the execution.
+		 */
+		if (keep_working(gcwq))
+			wake_up_worker(gcwq);
+
+		spin_unlock_irq(&gcwq->lock);
+	}
+
+	schedule();
+	goto repeat;
+}
+/*
+ * A barrier work paired with a completion.  wq_barrier_func() completes
+ * @done when the barrier work gets executed.
+ */
+struct wq_barrier {
+	struct work_struct	work;	/* the barrier work itself */
+	struct completion	done;	/* what the flusher waits on */
+};
+/* work function of a wq_barrier: complete the embedding barrier */
+static void wq_barrier_func(struct work_struct *work)
+{
+	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
+	complete(&barr->done);
+}
+
+/**
+ * insert_wq_barrier - insert a barrier work
+ * @cwq: cwq to insert barrier into
+ * @barr: wq_barrier to insert
+ * @target: target work to attach @barr to
+ * @worker: worker currently executing @target, NULL if @target is not executing
+ *
+ * @barr is linked to @target such that @barr is completed only after
+ * @target finishes execution.  Please note that the ordering
+ * guarantee is observed only with respect to @target and on the local
+ * cpu.
+ *
+ * Currently, a queued barrier can't be canceled.  This is because
+ * try_to_grab_pending() can't determine whether the work to be
+ * grabbed is at the head of the queue and thus can't clear LINKED
+ * flag of the previous work while there must be a valid next work
+ * after a work with LINKED flag set.
+ *
+ * Note that when @worker is non-NULL, @target may be modified
+ * underneath us, so we can't reliably determine cwq from @target.
+ *
+ * CONTEXT:
+ * spin_lock_irq(gcwq->lock).
+ */
+static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
+			      struct wq_barrier *barr,
+			      struct work_struct *target, struct worker *worker)
+{
+	struct list_head *head;
+	unsigned int linked = 0;
+
+	/*
+	 * debugobject calls are safe here even with gcwq->lock locked
+	 * as we know for sure that this will not trigger any of the
+	 * checks and call back into the fixup functions where we
+	 * might deadlock.
+	 *
+	 * The barrier work is marked PENDING by hand as it's inserted
+	 * with insert_work() directly below.
+	 */
+	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
+	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
+	init_completion(&barr->done);
+
+	/*
+	 * If @target is currently being executed, schedule the
+	 * barrier to the worker; otherwise, put it after @target.
+	 */
+	if (worker)
+		head = worker->scheduled.next;
+	else {
+		unsigned long *bits = work_data_bits(target);
+
+		head = target->entry.next;
+		/*
+		 * There can already be other linked works, inherit and
+		 * set: @barr takes over @target's old LINKED state and
+		 * @target itself becomes LINKED to @barr.
+		 */
+		linked = *bits & WORK_STRUCT_LINKED;
+		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
+	}
+
+	debug_work_activate(&barr->work);
+	insert_work(cwq, &barr->work, head,
+		    work_color_to_flags(WORK_NO_COLOR) | linked);
+}
+
+/**
+ * flush_workqueue_prep_cwqs - prepare cwqs for workqueue flushing
+ * @wq: workqueue being flushed
+ * @flush_color: new flush color, < 0 for no-op
+ * @work_color: new work color, < 0 for no-op
+ *
+ * Prepare cwqs for workqueue flushing.
+ *
+ * If @flush_color is non-negative, flush_color on all cwqs should be
+ * -1.  If no cwq has in-flight commands at the specified color, all
+ * cwq->flush_color's stay at -1 and %false is returned.  If any cwq
+ * has in flight commands, its cwq->flush_color is set to
+ * @flush_color, @wq->nr_cwqs_to_flush is updated accordingly, cwq
+ * wakeup logic is armed and %true is returned.
+ *
+ * The caller should have initialized @wq->first_flusher prior to
+ * calling this function with non-negative @flush_color.  If
+ * @flush_color is negative, no flush color update is done and %false
+ * is returned.
+ *
+ * If @work_color is non-negative, all cwqs should have the same
+ * work_color which is previous to @work_color and all will be
+ * advanced to @work_color.
+ *
+ * CONTEXT:
+ * mutex_lock(wq->flush_mutex).
+ *
+ * RETURNS:
+ * %true if @flush_color >= 0 and there's something to flush.  %false
+ * otherwise.
+ */
+static bool flush_workqueue_prep_cwqs(struct workqueue_struct *wq,
+				      int flush_color, int work_color)
+{
+	bool wait = false;
+	unsigned int cpu;
+
+	if (flush_color >= 0) {
+		BUG_ON(atomic_read(&wq->nr_cwqs_to_flush));
+		atomic_set(&wq->nr_cwqs_to_flush, 1);	/* bias, dropped after the loop */
+	}
+
+	for_each_cwq_cpu(cpu, wq) {
+		struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
+		struct global_cwq *gcwq = cwq->gcwq;
+
+		spin_lock_irq(&gcwq->lock);
+
+		if (flush_color >= 0) {
+			BUG_ON(cwq->flush_color != -1);
+
+			if (cwq->nr_in_flight[flush_color]) {
+				cwq->flush_color = flush_color;
+				atomic_inc(&wq->nr_cwqs_to_flush);	/* one more cwq to wait for */
+				wait = true;
+			}
+		}
+
+		if (work_color >= 0) {
+			BUG_ON(work_color != work_next_color(cwq->work_color));
+			cwq->work_color = work_color;
+		}
+
+		spin_unlock_irq(&gcwq->lock);
+	}
+
+	if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_cwqs_to_flush))
+		complete(&wq->first_flusher->done);	/* bias was the last count left */
+
+	return wait;
+}
+
+/**
+ * flush_workqueue - ensure that any scheduled work has run to completion.
+ * @wq: workqueue to flush
+ *
+ * Forces execution of the workqueue and blocks until its completion.
+ * This is typically used in driver shutdown handlers.
+ *
+ * We sleep until all works which were queued on entry have been handled,
+ * but we are not livelocked by new incoming ones.
+ */
+void flush_workqueue(struct workqueue_struct *wq)
+{
+	struct wq_flusher this_flusher = {
+		.list = LIST_HEAD_INIT(this_flusher.list),
+		.flush_color = -1,
+		.done = COMPLETION_INITIALIZER_ONSTACK(this_flusher.done),
+	};
+	int next_color;
+
+	lock_map_acquire(&wq->lockdep_map);	/* NOTE(review): presumably a lockdep-only annotation */
+	lock_map_release(&wq->lockdep_map);
+
+	mutex_lock(&wq->flush_mutex);
+
+	/*
+	 * Start-to-wait phase
+	 */
+	next_color = work_next_color(wq->work_color);
+
+	if (next_color != wq->flush_color) {
+		/*
+		 * Color space is not full.  The current work_color
+		 * becomes our flush_color and work_color is advanced
+		 * by one.
+		 */
+		BUG_ON(!list_empty(&wq->flusher_overflow));
+		this_flusher.flush_color = wq->work_color;
+		wq->work_color = next_color;
+
+		if (!wq->first_flusher) {
+			/* no flush in progress, become the first flusher */
+			BUG_ON(wq->flush_color != this_flusher.flush_color);
+
+			wq->first_flusher = &this_flusher;
+
+			if (!flush_workqueue_prep_cwqs(wq, wq->flush_color,
+						       wq->work_color)) {
+				/* nothing to flush, done */
+				wq->flush_color = next_color;
+				wq->first_flusher = NULL;
+				goto out_unlock;
+			}
+		} else {
+			/*
+			 * wait in queue; only the work color is
+			 * advanced on the cwqs (flush color -1)
+			 */
+			BUG_ON(wq->flush_color == this_flusher.flush_color);
+			list_add_tail(&this_flusher.list, &wq->flusher_queue);
+			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+		}
+	} else {
+		/*
+		 * Oops, color space is full, wait on overflow queue.
+		 * The next flush completion will assign us
+		 * flush_color and transfer to flusher_queue.
+		 */
+		list_add_tail(&this_flusher.list, &wq->flusher_overflow);
+	}
+
+	mutex_unlock(&wq->flush_mutex);
+
+	wait_for_completion(&this_flusher.done);
+
+	/*
+	 * Wake-up-and-cascade phase
+	 *
+	 * First flushers are responsible for cascading flushes and
+	 * handling overflow.  Non-first flushers can simply return.
+	 */
+	if (wq->first_flusher != &this_flusher)
+		return;
+
+	mutex_lock(&wq->flush_mutex);
+
+	/* we might have raced, check again with mutex held */
+	if (wq->first_flusher != &this_flusher)
+		goto out_unlock;
+
+	wq->first_flusher = NULL;
+
+	BUG_ON(!list_empty(&this_flusher.list));
+	BUG_ON(wq->flush_color != this_flusher.flush_color);
+
+	while (true) {
+		struct wq_flusher *next, *tmp;
+
+		/* complete all the flushers sharing the current flush color */
+		list_for_each_entry_safe(next, tmp, &wq->flusher_queue, list) {
+			if (next->flush_color != wq->flush_color)
+				break;
+			list_del_init(&next->list);
+			complete(&next->done);
+		}
+
+		BUG_ON(!list_empty(&wq->flusher_overflow) &&
+		       wq->flush_color != work_next_color(wq->work_color));
+
+		/* this flush_color is finished, advance by one */
+		wq->flush_color = work_next_color(wq->flush_color);
+
+		/* one color has been freed, handle overflow queue */
+		if (!list_empty(&wq->flusher_overflow)) {
+			/*
+			 * Assign the same color to all overflowed
+			 * flushers, advance work_color and append to
+			 * flusher_queue.  This is the start-to-wait
+			 * phase for these overflowed flushers.
+			 */
+			list_for_each_entry(tmp, &wq->flusher_overflow, list)
+				tmp->flush_color = wq->work_color;
+
+			wq->work_color = work_next_color(wq->work_color);
+
+			list_splice_tail_init(&wq->flusher_overflow,
+					      &wq->flusher_queue);
+			flush_workqueue_prep_cwqs(wq, -1, wq->work_color);
+		}
+
+		if (list_empty(&wq->flusher_queue)) {
+			BUG_ON(wq->flush_color != wq->work_color);
+			break;
+		}
+
+		/*
+		 * Need to flush more colors.  Make the next flusher
+		 * the new first flusher and arm cwqs.
+		 */
+		BUG_ON(wq->flush_color == wq->work_color);
+		BUG_ON(wq->flush_color != next->flush_color);
+
+		list_del_init(&next->list);
+		wq->first_flusher = next;
+
+		if (flush_workqueue_prep_cwqs(wq, wq->flush_color, -1))
+			break;
+
+		/*
+		 * Meh... this color is already done, clear first
+		 * flusher and repeat cascading.
+		 */
+		wq->first_flusher = NULL;
+	}
+
+out_unlock:
+	mutex_unlock(&wq->flush_mutex);
+}
+EXPORT_SYMBOL_GPL(flush_workqueue);
+/**
+ * start_flush_work - queue a barrier to wait for a work
+ * @work: the work to flush
+ * @barr: barrier to insert, initialized by insert_wq_barrier()
+ * @wait_executing: if @work isn't queued, still insert @barr when a
+ *		    worker of its gcwq is executing it
+ *
+ * RETURNS:
+ * %true if @barr was inserted and the caller should wait on
+ * @barr->done, %false if there is nothing to wait for.
+ */
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
+			     bool wait_executing)
+{
+	struct worker *worker = NULL;
+	struct global_cwq *gcwq;
+	struct cpu_workqueue_struct *cwq;
+
+	might_sleep();
+	gcwq = get_work_gcwq(work);
+	if (!gcwq)
+		return false;
+
+	spin_lock_irq(&gcwq->lock);
+	if (!list_empty(&work->entry)) {
+		/*
+		 * See the comment near try_to_grab_pending()->smp_rmb().
+		 * If it was re-queued to a different gcwq under us, we
+		 * are not going to wait.
+		 */
+		smp_rmb();
+		cwq = get_work_cwq(work);
+		if (unlikely(!cwq || gcwq != cwq->gcwq))
+			goto already_gone;
+	} else if (wait_executing) {
+		worker = find_worker_executing_work(gcwq, work);
+		if (!worker)
+			goto already_gone;
+		cwq = worker->current_cwq;
+	} else
+		goto already_gone;
+
+	insert_wq_barrier(cwq, barr, work, worker);
+	spin_unlock_irq(&gcwq->lock);
+
+	/*
+	 * If @max_active is 1 or rescuer is in use, flushing another work
+	 * item on the same workqueue may lead to deadlock.  Make sure the
+	 * flusher is not running on the same workqueue by verifying write
+	 * access.
+	 */
+	if (cwq->wq->saved_max_active == 1 || cwq->wq->flags & WQ_RESCUER)
+		lock_map_acquire(&cwq->wq->lockdep_map);
+	else
+		lock_map_acquire_read(&cwq->wq->lockdep_map);
+	lock_map_release(&cwq->wq->lockdep_map);
+
+	return true;
+already_gone:
+	spin_unlock_irq(&gcwq->lock);
+	return false;
+}
+
+/**
+ * flush_work - wait for a work to finish executing the last queueing instance
+ * @work: the work to flush
+ *
+ * Wait until @work has finished execution.  This function considers
+ * only the last queueing instance of @work.  If @work has been
+ * enqueued across different CPUs on a non-reentrant workqueue or on
+ * multiple workqueues, @work might still be executing on return on
+ * some of the CPUs from earlier queueing.
+ *
+ * If @work was queued only on a non-reentrant, ordered or unbound
+ * workqueue, @work is guaranteed to be idle on return if it hasn't
+ * been requeued since flush started.
+ *
+ * RETURNS:
+ * %true if flush_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_work(struct work_struct *work)
+{
+	struct wq_barrier barr;
+
+	if (start_flush_work(work, &barr, true)) {
+		wait_for_completion(&barr.done);
+		destroy_work_on_stack(&barr.work);	/* barr.work was set up with INIT_WORK_ONSTACK() */
+		return true;
+	} else
+		return false;
+}
+EXPORT_SYMBOL_GPL(flush_work);
+/**
+ * wait_on_cpu_work - wait for a work being executed on one gcwq
+ * @gcwq: gcwq to check
+ * @work: the work to wait for
+ *
+ * If a worker of @gcwq is executing @work, insert an on-stack barrier
+ * for that worker under gcwq->lock and wait for it to complete.
+ *
+ * RETURNS:
+ * %true if such a worker was found and waited for, %false otherwise.
+ */
+static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
+{
+	struct wq_barrier barr;
+	struct worker *worker;
+
+	spin_lock_irq(&gcwq->lock);
+
+	worker = find_worker_executing_work(gcwq, work);
+	if (unlikely(worker))
+		insert_wq_barrier(worker->current_cwq, &barr, work, worker);
+
+	spin_unlock_irq(&gcwq->lock);
+
+	if (unlikely(worker)) {
+		wait_for_completion(&barr.done);
+		destroy_work_on_stack(&barr.work);
+		return true;
+	} else
+		return false;
+}
+/**
+ * wait_on_work - wait for a work being executed on any gcwq
+ * @work: the work to wait for
+ *
+ * Call wait_on_cpu_work() for every gcwq visited by
+ * for_each_gcwq_cpu().  May sleep.
+ *
+ * RETURNS:
+ * %true if any of the wait_on_cpu_work() calls waited, %false
+ * otherwise.
+ */
+static bool wait_on_work(struct work_struct *work)
+{
+	bool ret = false;
+	int cpu;
+
+	might_sleep();
+
+	lock_map_acquire(&work->lockdep_map);	/* NOTE(review): presumably a lockdep-only annotation */
+	lock_map_release(&work->lockdep_map);
+
+	for_each_gcwq_cpu(cpu)
+		ret |= wait_on_cpu_work(get_gcwq(cpu), work);
+	return ret;
+}
+
+/**
+ * flush_work_sync - wait until a work has finished execution
+ * @work: the work to flush
+ *
+ * Wait until @work has finished execution.  On return, it's
+ * guaranteed that all queueing instances of @work which happened
+ * before this function is called are finished.  In other words, if
+ * @work hasn't been requeued since this function was called, @work is
+ * guaranteed to be idle on return.
+ *
+ * RETURNS:
+ * %true if flush_work_sync() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_work_sync(struct work_struct *work)
+{
+	struct wq_barrier barr;
+	bool pending, waited;
+
+	/* we'll wait for executions separately, queue barr only if pending */
+	pending = start_flush_work(work, &barr, false);
+
+	/* wait for executions to finish */
+	waited = wait_on_work(work);
+
+	/* wait for the pending one */
+	if (pending) {
+		wait_for_completion(&barr.done);
+		destroy_work_on_stack(&barr.work);
+	}
+
+	return pending || waited;
+}
+EXPORT_SYMBOL_GPL(flush_work_sync);
+
+/*
+ * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
+ * so this work can't be re-armed in any way.
+ *
+ * Return values:
+ *	 0 - PENDING was clear and has been set by us, @work was idle
+ *	 1 - @work was queued and has been removed from its gcwq's list
+ *	-1 - PENDING is set but @work couldn't be dequeued (no gcwq
+ *	     recorded, not on a list, or it moved to another gcwq)
+ */
+static int try_to_grab_pending(struct work_struct *work)
+{
+	struct global_cwq *gcwq;
+	int ret = -1;
+
+	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)))
+		return 0;
+
+	/*
+	 * The queueing is in progress, or it is already queued. Try to
+	 * steal it from ->worklist without clearing WORK_STRUCT_PENDING.
+	 */
+	gcwq = get_work_gcwq(work);
+	if (!gcwq)
+		return ret;
+
+	spin_lock_irq(&gcwq->lock);
+	if (!list_empty(&work->entry)) {
+		/*
+		 * This work is queued, but perhaps we locked the wrong gcwq.
+		 * In that case we must see the new value after rmb(), see
+		 * insert_work()->wmb().
+		 */
+		smp_rmb();
+		if (gcwq == get_work_gcwq(work)) {
+			debug_work_deactivate(work);
+			list_del_init(&work->entry);
+			cwq_dec_nr_in_flight(get_work_cwq(work),
+				get_work_color(work),
+				*work_data_bits(work) & WORK_STRUCT_DELAYED);
+			ret = 1;
+		}
+	}
+	spin_unlock_irq(&gcwq->lock);
+
+	return ret;
+}
+/**
+ * __cancel_work_timer - common part of cancel_work_sync() and
+ *			 cancel_delayed_work_sync()
+ * @work: the work to cancel
+ * @timer: timer to delete first, %NULL if there is none
+ *
+ * Repeat until del_timer() succeeds or try_to_grab_pending() returns
+ * a non-negative value, calling wait_on_work() on every iteration,
+ * then clear the work data.
+ *
+ * RETURNS:
+ * %true if the timer was deleted or @work was taken off its queue,
+ * %false if @work was found idle.
+ */
+static bool __cancel_work_timer(struct work_struct *work,
+				struct timer_list* timer)
+{
+	int ret;
+
+	do {
+		ret = (timer && likely(del_timer(timer)));
+		if (!ret)
+			ret = try_to_grab_pending(work);
+		wait_on_work(work);
+	} while (unlikely(ret < 0));
+
+	clear_work_data(work);
+	return ret;
+}
+
+/**
+ * cancel_work_sync - cancel a work and wait for it to finish
+ * @work: the work to cancel
+ *
+ * Cancel @work and wait for its execution to finish.  This function
+ * can be used even if the work re-queues itself or migrates to
+ * another workqueue.  On return from this function, @work is
+ * guaranteed to be not pending or executing on any CPU.
+ *
+ * cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's.  Use cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the workqueue on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * RETURNS:
+ * %true if @work was pending, %false otherwise.
+ */
+bool cancel_work_sync(struct work_struct *work)
+{
+	return __cancel_work_timer(work, NULL);
+}
+EXPORT_SYMBOL_GPL(cancel_work_sync);
+
+/**
+ * flush_delayed_work - wait for a dwork to finish executing the last queueing
+ * @dwork: the delayed work to flush
+ *
+ * Delayed timer is cancelled and the pending work is queued for
+ * immediate execution.  Like flush_work(), this function only
+ * considers the last queueing instance of @dwork.
+ *
+ * RETURNS:
+ * %true if flush_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_delayed_work(struct delayed_work *dwork)
+{
+	if (del_timer_sync(&dwork->timer))	/* timer was still pending: queue the work now */
+		__queue_work(raw_smp_processor_id(),
+			     get_work_cwq(&dwork->work)->wq, &dwork->work);
+	return flush_work(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work);
+
+/**
+ * flush_delayed_work_sync - wait for a dwork to finish
+ * @dwork: the delayed work to flush
+ *
+ * Delayed timer is cancelled and the pending work is queued for
+ * execution immediately.  Other than timer handling, its behavior
+ * is identical to flush_work_sync().
+ *
+ * RETURNS:
+ * %true if flush_work_sync() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_delayed_work_sync(struct delayed_work *dwork)
+{
+	if (del_timer_sync(&dwork->timer))	/* timer was still pending: queue the work now */
+		__queue_work(raw_smp_processor_id(),
+			     get_work_cwq(&dwork->work)->wq, &dwork->work);
+	return flush_work_sync(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work_sync);
+
+/**
+ * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
+ * @dwork: the delayed work to cancel
+ *
+ * This is cancel_work_sync() for delayed works: @dwork's timer is
+ * handed to __cancel_work_timer() along with its work.
+ *
+ * RETURNS:
+ * %true if @dwork was pending, %false otherwise.
+ */
+bool cancel_delayed_work_sync(struct delayed_work *dwork)
+{
+	return __cancel_work_timer(&dwork->work, &dwork->timer);
+}
+EXPORT_SYMBOL(cancel_delayed_work_sync);
+
+/**
+ * schedule_work - put work task in global workqueue
+ * @work: job to be done
+ *
+ * Returns zero if @work was already on the kernel-global workqueue and
+ * non-zero otherwise.
+ *
+ * This puts a job in the kernel-global workqueue if it was not already
+ * queued and leaves it in the same position on the kernel-global
+ * workqueue otherwise.
+ */
+int schedule_work(struct work_struct *work)
+{
+	return queue_work(system_wq, work);
+}
+EXPORT_SYMBOL(schedule_work);
+
+/**
+ * schedule_work_on - put work task on a specific cpu
+ * @cpu: cpu to put the work task on
+ * @work: job to be done
+ *
+ * This puts a job on a specific cpu, using the kernel-global
+ * workqueue (system_wq).
+ *
+ * Returns whatever queue_work_on() returns.
+ */
+int schedule_work_on(int cpu, struct work_struct *work)
+{
+	return queue_work_on(cpu, system_wq, work);
+}
+EXPORT_SYMBOL(schedule_work_on);
+
+/**
+ * schedule_delayed_work - put work task in global workqueue after delay
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait or 0 for immediate execution
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue.
+ *
+ * Returns whatever queue_delayed_work() returns.
+ */
+int schedule_delayed_work(struct delayed_work *dwork,
+					unsigned long delay)
+{
+	return queue_delayed_work(system_wq, dwork, delay);
+}
+EXPORT_SYMBOL(schedule_delayed_work);
+
+/**
+ * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
+ * @cpu: cpu to use
+ * @dwork: job to be done
+ * @delay: number of jiffies to wait
+ *
+ * After waiting for a given time this puts a job in the kernel-global
+ * workqueue on the specified CPU.
+ *
+ * Returns whatever queue_delayed_work_on() returns.
+ */
+int schedule_delayed_work_on(int cpu,
+			struct delayed_work *dwork, unsigned long delay)
+{
+	return queue_delayed_work_on(cpu, system_wq, dwork, delay);
+}
+EXPORT_SYMBOL(schedule_delayed_work_on);
+
+/**
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
+ * @func: the function to call
+ *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
+ * schedule_on_each_cpu() is very slow.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int schedule_on_each_cpu(work_func_t func)
+{
+	int cpu;
+	struct work_struct __percpu *works;
+
+	works = alloc_percpu(struct work_struct);	/* one work item per cpu */
+	if (!works)
+		return -ENOMEM;	/* the only failure reported by this function */
+
+	get_online_cpus();	/* paired with put_online_cpus() below */
+
+	for_each_online_cpu(cpu) {
+		struct work_struct *work = per_cpu_ptr(works, cpu);
+
+		INIT_WORK(work, func);
+		schedule_work_on(cpu, work);
+	}
+
+	for_each_online_cpu(cpu)
+		flush_work(per_cpu_ptr(works, cpu));	/* wait for each cpu's work in turn */
+
+	put_online_cpus();
+	free_percpu(works);
+	return 0;
+}
+
+/**
+ * flush_scheduled_work - ensure that any scheduled work has run to completion.
+ *
+ * Forces execution of the kernel-global workqueue and blocks until its
+ * completion.  This is flush_workqueue() applied to system_wq.
+ *
+ * Think twice before calling this function!  It's very easy to get into
+ * trouble if you don't take great care.  Either of the following situations
+ * will lead to deadlock:
+ *
+ *	One of the work items currently on the workqueue needs to acquire
+ *	a lock held by your code or its caller.
+ *
+ *	Your code is running in the context of a work routine.
+ *
+ * They will be detected by lockdep when they occur, but the first might not
+ * occur very often.  It depends on what work items are on the workqueue and
+ * what locks they need, which you have no control over.
+ *
+ * In most situations flushing the entire workqueue is overkill; you merely
+ * need to know that a particular work item isn't queued and isn't running.
+ * In such cases you should use cancel_delayed_work_sync() or
+ * cancel_work_sync() instead.
+ */
+void flush_scheduled_work(void)
+{
+	flush_workqueue(system_wq);
+}
+EXPORT_SYMBOL(flush_scheduled_work);
+
+/**
+ * execute_in_process_context - reliably execute the routine with user context
+ * @fn:		the function to execute
+ * @ew:		guaranteed storage for the execute work structure (must
+ *		be available when the work executes)
+ *
+ * Executes the function immediately if process context is available,
+ * otherwise schedules the function for delayed execution.
+ * + * Returns: 0 - function was executed + * 1 - function was scheduled for execution + */ +int execute_in_process_context(work_func_t fn, struct execute_work *ew) +{ + if (!in_interrupt()) { + fn(&ew->work); + return 0; + } + + INIT_WORK(&ew->work, fn); + schedule_work(&ew->work); + + return 1; +} +EXPORT_SYMBOL_GPL(execute_in_process_context); + +int keventd_up(void) +{ + return system_wq != NULL; /* non-zero once system_wq has been allocated */ +} + +static int alloc_cwqs(struct workqueue_struct *wq) +{ + /* + * cwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. + * Make sure that the alignment isn't lower than that of + * unsigned long long. + */ + const size_t size = sizeof(struct cpu_workqueue_struct); + const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, + __alignof__(unsigned long long)); +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif + + if (percpu) + wq->cpu_wq.pcpu = __alloc_percpu(size, align); + else { + void *ptr; + + /* + * Allocate enough room to align cwq and put an extra + * pointer at the end pointing back to the originally + * allocated pointer which will be used for free. + */ + ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); + if (ptr) { + wq->cpu_wq.single = PTR_ALIGN(ptr, align); + *(void **)(wq->cpu_wq.single + 1) = ptr; + } + } + + /* just in case, make sure it's actually aligned */ + BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align)); + return wq->cpu_wq.v ? 0 : -ENOMEM; +} + +static void free_cwqs(struct workqueue_struct *wq) +{ +#ifdef CONFIG_SMP + bool percpu = !(wq->flags & WQ_UNBOUND); +#else + bool percpu = false; +#endif + + if (percpu) + free_percpu(wq->cpu_wq.pcpu); + else if (wq->cpu_wq.single) { + /* the pointer to free is stored right after the cwq */ + kfree(*(void **)(wq->cpu_wq.single + 1)); + } +} + +static int wq_clamp_max_active(int max_active, unsigned int flags, + const char *name) +{ + int lim = flags & WQ_UNBOUND ? 
WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; + + if (max_active < 1 || max_active > lim) + printk(KERN_WARNING "workqueue: max_active %d requested for %s " + "is out of range, clamping between %d and %d\n", + max_active, name, 1, lim); + + return clamp_val(max_active, 1, lim); /* result is always within [1, lim] */ +} + +struct workqueue_struct *__alloc_workqueue_key(const char *name, + unsigned int flags, + int max_active, + struct lock_class_key *key, + const char *lock_name) +{ + struct workqueue_struct *wq; + unsigned int cpu; + + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. + */ + if (flags & WQ_MEM_RECLAIM) + flags |= WQ_RESCUER; + + /* + * Unbound workqueues aren't concurrency managed and should be + * dispatched to workers immediately. + */ + if (flags & WQ_UNBOUND) + flags |= WQ_HIGHPRI; + + max_active = max_active ?: WQ_DFL_ACTIVE; + max_active = wq_clamp_max_active(max_active, flags, name); + + wq = kzalloc(sizeof(*wq), GFP_KERNEL); + if (!wq) + goto err; + + wq->flags = flags; + wq->saved_max_active = max_active; + mutex_init(&wq->flush_mutex); + atomic_set(&wq->nr_cwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->flusher_queue); + INIT_LIST_HEAD(&wq->flusher_overflow); + + wq->name = name; + lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); + INIT_LIST_HEAD(&wq->list); + + if (alloc_cwqs(wq) < 0) + goto err; + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + struct global_cwq *gcwq = get_gcwq(cpu); + + BUG_ON((unsigned long)cwq & WORK_STRUCT_FLAG_MASK); + cwq->gcwq = gcwq; + cwq->wq = wq; + cwq->flush_color = -1; + cwq->max_active = max_active; + INIT_LIST_HEAD(&cwq->delayed_works); + } + + if (flags & WQ_RESCUER) { + struct worker *rescuer; + + if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) + goto err; + + wq->rescuer = rescuer = alloc_worker(); + if (!rescuer) + goto err; + + rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + if (IS_ERR(rescuer->task)) + goto err; + + 
rescuer->task->flags |= PF_THREAD_BOUND; + wake_up_process(rescuer->task); + } + + /* + * workqueue_lock protects global freeze state and workqueues + * list. Grab it, set max_active accordingly and add the new + * workqueue to workqueues list. + */ + spin_lock(&workqueue_lock); + + if (workqueue_freezing && wq->flags & WQ_FREEZABLE) + for_each_cwq_cpu(cpu, wq) + get_cwq(cpu, wq)->max_active = 0; + + list_add(&wq->list, &workqueues); + + spin_unlock(&workqueue_lock); + + return wq; +err: + if (wq) { /* wq is NULL only when the initial kzalloc() failed */ + free_cwqs(wq); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + kfree(wq); + } + return NULL; +} +EXPORT_SYMBOL_GPL(__alloc_workqueue_key); + +/** + * destroy_workqueue - safely terminate a workqueue + * @wq: target workqueue + * + * Safely destroy a workqueue. All work currently pending will be done first. + */ +void destroy_workqueue(struct workqueue_struct *wq) +{ + unsigned int flush_cnt = 0; + unsigned int cpu; + + /* + * Mark @wq dying and drain all pending works. Once WQ_DYING is + * set, only chain queueing is allowed. IOW, only currently + * pending or running work items on @wq can queue further work + * items on it. @wq is flushed repeatedly until it becomes empty. + * The number of flushes is determined by the depth of chaining and + * should be relatively small. Whine if it takes too long. + */ + wq->flags |= WQ_DYING; +reflush: + flush_workqueue(wq); + + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + bool drained; + + spin_lock_irq(&cwq->gcwq->lock); + drained = !cwq->nr_active && list_empty(&cwq->delayed_works); + spin_unlock_irq(&cwq->gcwq->lock); + + if (drained) + continue; + + if (++flush_cnt == 10 || + (flush_cnt % 100 == 0 && flush_cnt <= 1000)) + printk(KERN_WARNING "workqueue %s: flush on " + "destruction isn't complete after %u tries\n", + wq->name, flush_cnt); + goto reflush; + } + + /* + * wq list is used to freeze wq, remove from list after + * flushing is complete in case freeze races us. 
+ */ + spin_lock(&workqueue_lock); + list_del(&wq->list); + spin_unlock(&workqueue_lock); + + /* sanity check: nothing may be in flight, active or delayed any more */ + for_each_cwq_cpu(cpu, wq) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) + BUG_ON(cwq->nr_in_flight[i]); + BUG_ON(cwq->nr_active); + BUG_ON(!list_empty(&cwq->delayed_works)); + } + + if (wq->flags & WQ_RESCUER) { + kthread_stop(wq->rescuer->task); + free_mayday_mask(wq->mayday_mask); + kfree(wq->rescuer); + } + + free_cwqs(wq); + kfree(wq); +} +EXPORT_SYMBOL_GPL(destroy_workqueue); + +/** + * workqueue_set_max_active - adjust max_active of a workqueue + * @wq: target workqueue + * @max_active: new max_active value. + * + * Set max_active of @wq to @max_active, clamped to the valid range. + * + * CONTEXT: + * Don't call from IRQ context. + */ +void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) +{ + unsigned int cpu; + + max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); + + spin_lock(&workqueue_lock); + + wq->saved_max_active = max_active; + + for_each_cwq_cpu(cpu, wq) { + struct global_cwq *gcwq = get_gcwq(cpu); + + spin_lock_irq(&gcwq->lock); + + if (!(wq->flags & WQ_FREEZABLE) || + !(gcwq->flags & GCWQ_FREEZING)) + get_cwq(gcwq->cpu, wq)->max_active = max_active; + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); + +/** + * workqueue_congested - test whether a workqueue is congested + * @cpu: CPU in question + * @wq: target workqueue + * + * Test whether @wq's cpu workqueue for @cpu is congested. There is + * no synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * + * RETURNS: + * %true if congested, %false otherwise. 
+ */ +bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +{ + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + return !list_empty(&cwq->delayed_works); /* congested == has delayed works */ +} +EXPORT_SYMBOL_GPL(workqueue_congested); + +/** + * work_cpu - return the last known associated cpu for @work + * @work: the work of interest + * + * RETURNS: + * CPU number if @work was ever queued. WORK_CPU_NONE otherwise. + */ +unsigned int work_cpu(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + + return gcwq ? gcwq->cpu : WORK_CPU_NONE; +} +EXPORT_SYMBOL_GPL(work_cpu); + +/** + * work_busy - test whether a work is currently pending or running + * @work: the work to be tested + * + * Test whether @work is currently pending or running. There is no + * synchronization around this function and the test result is + * unreliable and only useful as advisory hints or for debugging. + * Especially for reentrant wqs, the pending state might hide the + * running state. + * + * RETURNS: + * OR'd bitmask of WORK_BUSY_* bits. + */ +unsigned int work_busy(struct work_struct *work) +{ + struct global_cwq *gcwq = get_work_gcwq(work); + unsigned long flags; + unsigned int ret = 0; + + if (!gcwq) + return 0; /* no gcwq associated: no WORK_BUSY_* bit set */ + + spin_lock_irqsave(&gcwq->lock, flags); + + if (work_pending(work)) + ret |= WORK_BUSY_PENDING; + if (find_worker_executing_work(gcwq, work)) + ret |= WORK_BUSY_RUNNING; + + spin_unlock_irqrestore(&gcwq->lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(work_busy); + +/* + * CPU hotplug. + * + * There are two challenges in supporting CPU hotplug. Firstly, there + * are a lot of assumptions on strong associations among work, cwq and + * gcwq which make migrating pending and scheduled works very + * difficult to implement without impacting hot paths. Secondly, + * gcwqs serve a mix of short, long and very long running works making + * blocked draining impractical. 
+ * + * This is solved by allowing a gcwq to be detached from CPU, running + * it with unbound (rogue) workers and allowing it to be reattached + * later if the cpu comes back online. A separate thread is created + * to govern a gcwq in such state and is called the trustee of the + * gcwq. + * + * Trustee states and their descriptions. + * + * START Command state used on startup. On CPU_DOWN_PREPARE, a + * new trustee is started with this state. + * + * IN_CHARGE Once started, trustee will enter this state after + * assuming the manager role and making all existing + * workers rogue. DOWN_PREPARE waits for trustee to + * enter this state. After reaching IN_CHARGE, trustee + * tries to execute the pending worklist until it's empty + * and the state is set to BUTCHER, or the state is set + * to RELEASE. + * + * BUTCHER Command state which is set by the cpu callback after + * the cpu has gone down. Once this state is set trustee + * knows that there will be no new works on the worklist + * and once the worklist is empty it can proceed to + * killing idle workers. + * + * RELEASE Command state which is set by the cpu callback if the + * cpu down has been canceled or it has come online + * again. After recognizing this state, trustee stops + * trying to drain or butcher and clears ROGUE, rebinds + * all remaining workers back to the cpu and releases + * manager role. + * + * DONE Trustee will enter this state after BUTCHER or RELEASE + * is complete. + * + * trustee CPU draining + * took over down complete + * START -----------> IN_CHARGE -----------> BUTCHER -----------> DONE + * | | ^ + * | CPU is back online v return workers | + * ----------------> RELEASE -------------- + */ + +/** + * trustee_wait_event_timeout - timed event wait for trustee + * @cond: condition to wait for + * @timeout: timeout in jiffies + * + * wait_event_timeout() for trustee to use. Handles locking and + * checks for RELEASE request. 
+ * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * Positive indicating left time if @cond is satisfied, 0 if timed + * out, -1 if canceled. + */ +#define trustee_wait_event_timeout(cond, timeout) ({ \ + long __ret = (timeout); \ + while (!((cond) || (gcwq->trustee_state == TRUSTEE_RELEASE)) && \ + __ret) { \ + spin_unlock_irq(&gcwq->lock); \ + __wait_event_timeout(gcwq->trustee_wait, (cond) || \ + (gcwq->trustee_state == TRUSTEE_RELEASE), \ + __ret); \ + spin_lock_irq(&gcwq->lock); \ + } \ + gcwq->trustee_state == TRUSTEE_RELEASE ? -1 : (__ret); \ +}) + +/** + * trustee_wait_event - event wait for trustee + * @cond: condition to wait for + * + * wait_event() for trustee to use. Automatically handles locking and + * checks for RELEASE request. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by trustee. + * + * RETURNS: + * 0 if @cond is satisfied, -1 if canceled. + */ +#define trustee_wait_event(cond) ({ \ + long __ret1; \ + __ret1 = trustee_wait_event_timeout(cond, MAX_SCHEDULE_TIMEOUT);\ + __ret1 < 0 ? -1 : 0; \ +}) + +static int __cpuinit trustee_thread(void *__gcwq) +{ + struct global_cwq *gcwq = __gcwq; + struct worker *worker; + struct work_struct *work; + struct hlist_node *pos; + long rc; + int i; + + BUG_ON(gcwq->cpu != smp_processor_id()); + + spin_lock_irq(&gcwq->lock); + /* + * Claim the manager position and make all workers rogue. + * Trustee must be bound to the target cpu and can't be + * cancelled. 
+ */ + BUG_ON(gcwq->cpu != smp_processor_id()); + rc = trustee_wait_event(!(gcwq->flags & GCWQ_MANAGING_WORKERS)); + BUG_ON(rc < 0); + + gcwq->flags |= GCWQ_MANAGING_WORKERS; + + list_for_each_entry(worker, &gcwq->idle_list, entry) + worker->flags |= WORKER_ROGUE; + + for_each_busy_worker(worker, i, pos, gcwq) + worker->flags |= WORKER_ROGUE; + + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the rogue flag. This is + * necessary as scheduler callbacks may be invoked from other + * cpus. + */ + spin_unlock_irq(&gcwq->lock); + schedule(); + spin_lock_irq(&gcwq->lock); + + /* + * Sched callbacks are disabled now. Zap nr_running. After + * this, nr_running stays zero and need_more_worker() and + * keep_working() are always true as long as the worklist is + * not empty. + */ + atomic_set(get_gcwq_nr_running(gcwq->cpu), 0); + + spin_unlock_irq(&gcwq->lock); + del_timer_sync(&gcwq->idle_timer); + spin_lock_irq(&gcwq->lock); + + /* + * We're now in charge. Notify and proceed to drain. We need + * to keep the gcwq running during the whole CPU down + * procedure as other cpu hotunplug callbacks may need to + * flush currently running tasks. + */ + gcwq->trustee_state = TRUSTEE_IN_CHARGE; + wake_up_all(&gcwq->trustee_wait); + + /* + * The original cpu is in the process of dying and may go away + * anytime now. When that happens, we and all workers would + * be migrated to other cpus. Try draining any left work. We + * want to get it over with ASAP - spam rescuers, wake up as + * many idlers as necessary and create new ones till the + * worklist is empty. Note that if the gcwq is frozen, there + * may be frozen works in freezable cwqs. Don't declare + * completion while frozen. 
+ */ + while (gcwq->nr_workers != gcwq->nr_idle || + gcwq->flags & GCWQ_FREEZING || + gcwq->trustee_state == TRUSTEE_IN_CHARGE) { + int nr_works = 0; + + list_for_each_entry(work, &gcwq->worklist, entry) { + send_mayday(work); + nr_works++; + } + + list_for_each_entry(worker, &gcwq->idle_list, entry) { + if (!nr_works--) + break; + wake_up_process(worker->task); + } + + if (need_to_create_worker(gcwq)) { + spin_unlock_irq(&gcwq->lock); + worker = create_worker(gcwq, false); + spin_lock_irq(&gcwq->lock); + if (worker) { + worker->flags |= WORKER_ROGUE; + start_worker(worker); + } + } + + /* give a breather */ + if (trustee_wait_event_timeout(false, TRUSTEE_COOLDOWN) < 0) + break; + } + + /* + * Either all works have been scheduled and cpu is down, or + * cpu down has already been canceled. Wait for and butcher + * all workers till we're canceled. + */ + do { + rc = trustee_wait_event(!list_empty(&gcwq->idle_list)); + while (!list_empty(&gcwq->idle_list)) + destroy_worker(list_first_entry(&gcwq->idle_list, + struct worker, entry)); + } while (gcwq->nr_workers && rc >= 0); + + /* + * At this point, either draining has completed and no worker + * is left, or cpu down has been canceled or the cpu is being + * brought back up. There shouldn't be any idle one left. + * Tell the remaining busy ones to rebind once they finish the + * currently scheduled works by scheduling the rebind_work. + */ + WARN_ON(!list_empty(&gcwq->idle_list)); + + for_each_busy_worker(worker, i, pos, gcwq) { + struct work_struct *rebind_work = &worker->rebind_work; + + /* + * Rebind_work may race with future cpu hotplug + * operations. Use a separate flag to mark that + * rebinding is scheduled. 
+ */ + worker->flags |= WORKER_REBIND; + worker->flags &= ~WORKER_ROGUE; + + /* queue rebind_work, wq doesn't matter, use the default one */ + if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, + work_data_bits(rebind_work))) + continue; + + debug_work_activate(rebind_work); + insert_work(get_cwq(gcwq->cpu, system_wq), rebind_work, + worker->scheduled.next, + work_color_to_flags(WORK_NO_COLOR)); + } + + /* relinquish manager role */ + gcwq->flags &= ~GCWQ_MANAGING_WORKERS; + + /* notify completion */ + gcwq->trustee = NULL; + gcwq->trustee_state = TRUSTEE_DONE; + wake_up_all(&gcwq->trustee_wait); + spin_unlock_irq(&gcwq->lock); + return 0; +} + +/** + * wait_trustee_state - wait for trustee to enter the specified state + * @gcwq: gcwq the trustee of interest belongs to + * @state: target state to wait for + * + * Wait for the trustee to reach @state. DONE also ends the wait. + * + * CONTEXT: + * spin_lock_irq(gcwq->lock) which may be released and regrabbed + * multiple times. To be used by cpu_callback. 
+ */ +static void __cpuinit wait_trustee_state(struct global_cwq *gcwq, int state) +__releases(&gcwq->lock) +__acquires(&gcwq->lock) +{ + if (!(gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE)) { + spin_unlock_irq(&gcwq->lock); + __wait_event(gcwq->trustee_wait, + gcwq->trustee_state == state || + gcwq->trustee_state == TRUSTEE_DONE); + spin_lock_irq(&gcwq->lock); + } +} + +static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct global_cwq *gcwq = get_gcwq(cpu); + struct task_struct *new_trustee = NULL; + struct worker *uninitialized_var(new_worker); + unsigned long flags; + + action &= ~CPU_TASKS_FROZEN; /* frozen and non-frozen variants are handled alike */ + + switch (action) { + case CPU_DOWN_PREPARE: + new_trustee = kthread_create(trustee_thread, gcwq, + "workqueue_trustee/%d", cpu); + if (IS_ERR(new_trustee)) + return notifier_from_errno(PTR_ERR(new_trustee)); + kthread_bind(new_trustee, cpu); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + new_worker = create_worker(gcwq, false); + if (!new_worker) { + if (new_trustee) + kthread_stop(new_trustee); + return NOTIFY_BAD; + } + } + + /* some are called w/ irq disabled, don't disturb irq status */ + spin_lock_irqsave(&gcwq->lock, flags); + + switch (action) { + case CPU_DOWN_PREPARE: + /* initialize trustee and tell it to acquire the gcwq */ + BUG_ON(gcwq->trustee || gcwq->trustee_state != TRUSTEE_DONE); + gcwq->trustee = new_trustee; + gcwq->trustee_state = TRUSTEE_START; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_IN_CHARGE); + /* fall through */ + case CPU_UP_PREPARE: + BUG_ON(gcwq->first_idle); + gcwq->first_idle = new_worker; + break; + + case CPU_DYING: + /* + * Before this, the trustee and all workers except for + * the ones which are still executing works from + * before the last CPU down must be on the cpu. After + * this, they'll all be diasporas. 
+ */ + gcwq->flags |= GCWQ_DISASSOCIATED; + break; + + case CPU_POST_DEAD: + gcwq->trustee_state = TRUSTEE_BUTCHER; + /* fall through */ + case CPU_UP_CANCELED: + destroy_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + + case CPU_DOWN_FAILED: + case CPU_ONLINE: + gcwq->flags &= ~GCWQ_DISASSOCIATED; + if (gcwq->trustee_state != TRUSTEE_DONE) { + gcwq->trustee_state = TRUSTEE_RELEASE; + wake_up_process(gcwq->trustee); + wait_trustee_state(gcwq, TRUSTEE_DONE); + } + + /* + * Trustee is done and there might be no worker left. + * Put the first_idle in and request a real manager to + * take a look. + */ + spin_unlock_irq(&gcwq->lock); + kthread_bind(gcwq->first_idle->task, cpu); + spin_lock_irq(&gcwq->lock); + gcwq->flags |= GCWQ_MANAGE_WORKERS; + start_worker(gcwq->first_idle); + gcwq->first_idle = NULL; + break; + } + + spin_unlock_irqrestore(&gcwq->lock, flags); + + return notifier_from_errno(0); +} + +#ifdef CONFIG_SMP + +struct work_for_cpu { + struct completion completion; /* completed by do_work_for_cpu() */ + long (*fn)(void *); + void *arg; + long ret; /* value returned by fn(arg) */ +}; + +static int do_work_for_cpu(void *_wfc) +{ + struct work_for_cpu *wfc = _wfc; + wfc->ret = wfc->fn(wfc->arg); + complete(&wfc->completion); + return 0; +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * This will return the value @fn returns. + * It is up to the caller to ensure that the cpu doesn't go offline. + * The caller must not hold any locks which would prevent @fn from completing. 
+ */ +long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +{ + struct task_struct *sub_thread; + struct work_for_cpu wfc = { + .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion), + .fn = fn, + .arg = arg, + }; + + sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu"); + if (IS_ERR(sub_thread)) + return PTR_ERR(sub_thread); + kthread_bind(sub_thread, cpu); + wake_up_process(sub_thread); + wait_for_completion(&wfc.completion); /* wfc lives on this stack until fn is done */ + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all freezable + * workqueues will queue new works to their delayed_works list instead of + * gcwq->worklist. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void freeze_workqueues_begin(void) +{ + unsigned int cpu; + + spin_lock(&workqueue_lock); + + BUG_ON(workqueue_freezing); + workqueue_freezing = true; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(gcwq->flags & GCWQ_FREEZING); + gcwq->flags |= GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (cwq && wq->flags & WQ_FREEZABLE) + cwq->max_active = 0; + } + + spin_unlock_irq(&gcwq->lock); + } + + spin_unlock(&workqueue_lock); +} + +/** + * freeze_workqueues_busy - are freezable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases workqueue_lock. + * + * RETURNS: + * %true if some freezable workqueues are still busy. %false if freezing + * is complete. 
+ */ +bool freeze_workqueues_busy(void) +{ + unsigned int cpu; + bool busy = false; + + spin_lock(&workqueue_lock); + + BUG_ON(!workqueue_freezing); + + for_each_gcwq_cpu(cpu) { + struct workqueue_struct *wq; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. + */ + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZABLE)) + continue; + + BUG_ON(cwq->nr_active < 0); + if (cwq->nr_active) { + busy = true; + goto out_unlock; + } + } + } +out_unlock: + spin_unlock(&workqueue_lock); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all delayed works + * collected while frozen are moved to their respective gcwq worklists. + * + * CONTEXT: + * Grabs and releases workqueue_lock and gcwq->lock's. + */ +void thaw_workqueues(void) +{ + unsigned int cpu; + + spin_lock(&workqueue_lock); + + if (!workqueue_freezing) + goto out_unlock; + + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct workqueue_struct *wq; + + spin_lock_irq(&gcwq->lock); + + BUG_ON(!(gcwq->flags & GCWQ_FREEZING)); + gcwq->flags &= ~GCWQ_FREEZING; + + list_for_each_entry(wq, &workqueues, list) { + struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); + + if (!cwq || !(wq->flags & WQ_FREEZABLE)) + continue; + + /* restore max_active and repopulate worklist */ + cwq->max_active = wq->saved_max_active; + + while (!list_empty(&cwq->delayed_works) && + cwq->nr_active < cwq->max_active) + cwq_activate_first_delayed(cwq); + } + + wake_up_worker(gcwq); + + spin_unlock_irq(&gcwq->lock); + } + + workqueue_freezing = false; +out_unlock: + spin_unlock(&workqueue_lock); +} +#endif /* CONFIG_FREEZER */ + +static int __init init_workqueues(void) +{ + unsigned int cpu; + int i; + + cpu_notifier(workqueue_cpu_callback, CPU_PRI_WORKQUEUE); + + /* initialize gcwqs */ + for_each_gcwq_cpu(cpu) { + struct global_cwq *gcwq = 
get_gcwq(cpu); + + spin_lock_init(&gcwq->lock); + INIT_LIST_HEAD(&gcwq->worklist); + gcwq->cpu = cpu; + gcwq->flags |= GCWQ_DISASSOCIATED; /* cleared below for online cpus */ + + INIT_LIST_HEAD(&gcwq->idle_list); + for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) + INIT_HLIST_HEAD(&gcwq->busy_hash[i]); + + init_timer_deferrable(&gcwq->idle_timer); + gcwq->idle_timer.function = idle_worker_timeout; + gcwq->idle_timer.data = (unsigned long)gcwq; + + setup_timer(&gcwq->mayday_timer, gcwq_mayday_timeout, + (unsigned long)gcwq); + + ida_init(&gcwq->worker_ida); + + gcwq->trustee_state = TRUSTEE_DONE; + init_waitqueue_head(&gcwq->trustee_wait); + } + + /* create and start the initial worker of every online gcwq */ + for_each_online_gcwq_cpu(cpu) { + struct global_cwq *gcwq = get_gcwq(cpu); + struct worker *worker; + + if (cpu != WORK_CPU_UNBOUND) + gcwq->flags &= ~GCWQ_DISASSOCIATED; + worker = create_worker(gcwq, true); + BUG_ON(!worker); + spin_lock_irq(&gcwq->lock); + start_worker(worker); + spin_unlock_irq(&gcwq->lock); + } + + system_wq = alloc_workqueue("events", 0, 0); + system_long_wq = alloc_workqueue("events_long", 0, 0); + system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0); + system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND, + WQ_UNBOUND_MAX_ACTIVE); + system_freezable_wq = alloc_workqueue("events_freezable", + WQ_FREEZABLE, 0); + system_nrt_freezable_wq = alloc_workqueue("events_nrt_freezable", + WQ_NON_REENTRANT | WQ_FREEZABLE, 0); + BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq || + !system_unbound_wq || !system_freezable_wq || + !system_nrt_freezable_wq); + return 0; +} +early_initcall(init_workqueues); diff --git a/kernel/workqueue_sched.h b/kernel/workqueue_sched.h new file mode 100644 index 00000000..2d10fc98 --- /dev/null +++ b/kernel/workqueue_sched.h @@ -0,0 +1,9 @@ +/* + * kernel/workqueue_sched.h + * + * Scheduler hooks for concurrency managed workqueue. Only to be + * included from sched.c and workqueue.c. 
+ */ +void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, + unsigned int cpu); -- cgit v1.2.3