diff options
author | Hamish Guthrie <hcg@openwrt.org> | 2008-11-14 16:21:23 +0000 |
---|---|---|
committer | Hamish Guthrie <hcg@openwrt.org> | 2008-11-14 16:21:23 +0000 |
commit | dd3377e19f2fa79a10aa3f5cac77de19b1c5cbb4 (patch) | |
tree | 6b962fa2c1c36da4c7b78910c9e8aaf02ce84a5f /target/linux/ps3/patches-2.6.27/001-perfmon-2.6.27.patch | |
parent | bedfcc443aa24b71d8b2b52cbad223b4382de2ce (diff) | |
download | upstream-dd3377e19f2fa79a10aa3f5cac77de19b1c5cbb4.tar.gz upstream-dd3377e19f2fa79a10aa3f5cac77de19b1c5cbb4.tar.bz2 upstream-dd3377e19f2fa79a10aa3f5cac77de19b1c5cbb4.zip |
Added support for 2.6.27 kernel
git-svn-id: svn://svn.openwrt.org/openwrt/trunk@13202 3c298f89-4303-0410-b956-a3cf2f4a3e73
Diffstat (limited to 'target/linux/ps3/patches-2.6.27/001-perfmon-2.6.27.patch')
-rw-r--r-- | target/linux/ps3/patches-2.6.27/001-perfmon-2.6.27.patch | 31652 |
1 files changed, 31652 insertions, 0 deletions
diff --git a/target/linux/ps3/patches-2.6.27/001-perfmon-2.6.27.patch b/target/linux/ps3/patches-2.6.27/001-perfmon-2.6.27.patch new file mode 100644 index 0000000000..958416aab6 --- /dev/null +++ b/target/linux/ps3/patches-2.6.27/001-perfmon-2.6.27.patch @@ -0,0 +1,31652 @@ +diff --git a/Documentation/ABI/testing/sysfs-perfmon b/Documentation/ABI/testing/sysfs-perfmon +new file mode 100644 +index 0000000..bde434c +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-perfmon +@@ -0,0 +1,87 @@ ++What: /sys/kernel/perfmon ++Date: Nov 2007 ++KernelVersion: 2.6.24 ++Contact: eranian@gmail.com ++ ++Description: provide the configuration interface for the perfmon2 subsystems. ++ The tree contains information about the detected hardware, current ++ state of the subsystem as well as some configuration parameters. ++ ++ The tree consists of the following entries: ++ ++ /sys/kernel/perfmon/debug (read-write): ++ ++ Enable perfmon2 debugging output via klogd. Debug messages produced during ++ PMU interrupt handling are not controlled by this entry. The traces a rate-limited ++ to avoid flooding of the console. It is possible to change the throttling ++ via /proc/sys/kernel/printk_ratelimit. The value is interpreted as a bitmask. ++ Each bit enables a particular type of debug messages. Refer to the file ++ include/linux/perfmon_kern.h for more information ++ ++ /sys/kernel/perfmon/pmc_max_fast_arg (read-only): ++ ++ Number of perfmon2 syscall arguments copied directly onto the ++ stack (copy_from_user) for pfm_write_pmcs(). Copying to the stack avoids ++ having to allocate a buffer. The unit is the number of pfarg_pmc_t ++ structures. ++ ++ /sys/kernel/perfmon/pmd_max_fast_arg (read-only): ++ ++ Number of perfmon2 syscall arguments copied directly onto the ++ stack (copy_from_user) for pfm_write_pmds()/pfm_read_pmds(). Copying ++ to the stack avoids having to allocate a buffer. The unit is the number ++ of pfarg_pmd_t structures. 
++ ++ ++ /sys/kernel/perfmon/reset_stats (write-only): ++ ++ Reset the statistics collected by perfmon2. Stats are available ++ per-cpu via debugfs. ++ ++ /sys/kernel/perfmon/smpl_buffer_mem_cur (read-only): ++ ++ Reports the amount of memory currently dedicated to sampling ++ buffers by the kernel. The unit is byte. ++ ++ /sys/kernel/perfmon/smpl_buffer_mem_max (read-write): ++ ++ Maximum amount of kernel memory usable for sampling buffers. -1 means ++ everything that is available. Unit is byte. ++ ++ /sys/kernel/perfmon/smpl_buffer_mem_cur (read-only): ++ ++ Current utilization of kernel memory in bytes. ++ ++ /sys/kernel/perfmon/sys_group (read-write): ++ ++ Users group allowed to create a system-wide perfmon2 context (session). ++ -1 means any group. This control will be kept until we find a package ++ able to control capabilities via PAM. ++ ++ /sys/kernel/perfmon/task_group (read-write): ++ ++ Users group allowed to create a per-thread context (session). ++ -1 means any group. This control will be kept until we find a ++ package able to control capabilities via PAM. ++ ++ /sys/kernel/perfmon/sys_sessions_count (read-only): ++ ++ Number of system-wide contexts currently attached to CPUs. ++ ++ /sys/kernel/perfmon/task_sessions_count (read-only): ++ ++ Number of per-thread contexts currently attached to threads. ++ ++ /sys/kernel/perfmon/version (read-only): ++ ++ Perfmon2 interface revision number. ++ ++ /sys/kernel/perfmon/arg_mem_max(read-write): ++ ++ Maximum size of vector arguments expressed in bytes. Can be modified ++ ++ /sys/kernel/perfmon/mode(read-write): ++ ++ Bitmask to enable/disable certain perfmon2 features. 
++ Currently defined: ++ - bit 0: if set, then reserved bitfield are ignored on PMC writes +diff --git a/Documentation/ABI/testing/sysfs-perfmon-fmt b/Documentation/ABI/testing/sysfs-perfmon-fmt +new file mode 100644 +index 0000000..1b45270 +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-perfmon-fmt +@@ -0,0 +1,18 @@ ++What: /sys/kernel/perfmon/formats ++Date: 2007 ++KernelVersion: 2.6.24 ++Contact: eranian@gmail.com ++ ++Description: provide description of available perfmon2 custom sampling buffer formats ++ which are implemented as independent kernel modules. Each formats gets ++ a subdir which a few entries. ++ ++ The name of the subdir is the name of the sampling format. The same name ++ must be passed to pfm_create_context() to use the format. ++ ++ Each subdir XX contains the following entries: ++ ++ /sys/kernel/perfmon/formats/XX/version (read-only): ++ ++ Version number of the format in clear text and null terminated. ++ +diff --git a/Documentation/ABI/testing/sysfs-perfmon-pmu b/Documentation/ABI/testing/sysfs-perfmon-pmu +new file mode 100644 +index 0000000..a1afc7e +--- /dev/null ++++ b/Documentation/ABI/testing/sysfs-perfmon-pmu +@@ -0,0 +1,46 @@ ++What: /sys/kernel/perfmon/pmu ++Date: Nov 2007 ++KernelVersion: 2.6.24 ++Contact: eranian@gmail.com ++ ++Description: provide information about the currently loaded PMU description module. ++ The module contains the mapping of the actual performance counter registers ++ onto the logical PMU exposed by perfmon. There is at most one PMU description ++ module loaded at any time. ++ ++ The sysfs PMU tree provides a description of the mapping for each register. ++ There is one subdir per config and data registers along an entry for the ++ name of the PMU model. ++ ++ The model entry is as follows: ++ ++ /sys/kernel/perfmon/pmu_desc/model (read-only): ++ ++ Name of the PMU model is clear text and zero terminated. 
++ ++ Then each logical PMU register, XX, gets a subtree with the following entries: ++ ++ /sys/kernel/perfmon/pmu_desc/pm*XX/addr (read-only): ++ ++ The physical address or index of the actual underlying hardware register. ++ On Itanium, it corresponds to the index. But on X86 processor, this is ++ the actual MSR address. ++ ++ /sys/kernel/perfmon/pmu_desc/pm*XX/dfl_val (read-only): ++ ++ The default value of the register in hexadecimal. ++ ++ /sys/kernel/perfmon/pmu_desc/pm*XX/name (read-only): ++ ++ The name of the hardware register. ++ ++ /sys/kernel/perfmon/pmu_desc/pm*XX/rsvd_msk (read-only): ++ ++ The bitmask of reserved bits, i.e., bits which cannot be changed by ++ applications. When a bit is set, it means the corresponding bit in the ++ actual register is reserved. ++ ++ /sys/kernel/perfmon/pmu_desc/pm*XX/width (read-only): ++ ++ the width in bits of the registers. This field is only relevant for counter ++ registers. +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index 1150444..2652b6c 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -1643,6 +1643,9 @@ and is between 256 and 4096 characters. It is defined in the file + Format: { 0 | 1 } + See arch/parisc/kernel/pdc_chassis.c + ++ perfmon_debug [PERFMON] Enables Perfmon debug messages. Needed ++ to see traces of the early startup phase. ++ + pf. [PARIDE] + See Documentation/paride.txt. + +diff --git a/Documentation/perfmon2-debugfs.txt b/Documentation/perfmon2-debugfs.txt +new file mode 100644 +index 0000000..b30cae8 +--- /dev/null ++++ b/Documentation/perfmon2-debugfs.txt +@@ -0,0 +1,126 @@ ++ The perfmon2 debug and statistics interface ++ ------------------------------------------ ++ Stephane Eranian ++ <eranian@gmail.com> ++ ++The perfmon2 interface exports a set of statistics which are used to tune and ++debug the implementation.
 The data is composed of a set of very simple metrics ++mostly aggregated counts and durations. They instrument key points in the ++perfmon2 code, such as context switch and interrupt handling. ++ ++The data is accessible via the debug filesystem (debugfs). Thus you need to ++have the filesystem support enabled in your kernel. Furthermore, since 2.6.25, ++the perfmon2 statistics interface is an optional component. It needs to be ++explicitly enabled in the kernel config file (CONFIG_PERFMON_DEBUG_FS). ++ ++To access the data, the debugfs filesystem must be mounted. Supposing the mount ++point is /debugfs, you would need to do: ++ $ mount -t debugfs none /debugfs ++ ++The data is located under the perfmon subdirectory and is organized per CPU. ++For each CPU, the same set of metrics is available, one metric per file in ++clear ASCII text. ++ ++The metrics are as follows: ++ ++ ctxswin_count (read-only): ++ ++ Number of PMU context switch in. ++ ++ ctxswin_ns (read-only): ++ ++ Number of nanoseconds spent in the PMU context switch in ++ routine. Dividing this number by the value of ctxswin_count, ++ yields average cost of the PMU context switch in. ++ ++ ctxswout_count (read-only): ++ ++ Number of PMU context switch out. ++ ++ ctxswout_ns (read-only): ++ ++ Number of nanoseconds spent in the PMU context switch out ++ routine. Dividing this number by the value of ctxswout_count, ++ yields average cost of the PMU context switch out. ++ ++ fmt_handler_calls (read-only): ++ ++ Number of calls to the sampling format routine that handles ++ PMU interrupts, i.e., typically the routine that records a ++ sample. ++ ++ fmt_handler_ns (read-only): ++ ++ Number of nanoseconds spent in the routine that handles PMU ++ interrupts in the sampling format. Dividing this number by ++ the number of calls provided by fmt_handler_calls, yields ++ average time spent in this routine. ++ ++ ovfl_intr_all_count (read-only): ++ ++ Number of PMU interrupts received by the kernel. 
++ ++ ++ ovfl_intr_nmi_count (read-only): ++ ++ Number of Non-Maskable Interrupts (NMI) received by the kernel ++ for perfmon. This is relevant only on X86 hardware. ++ ++ ovfl_intr_ns (read-only): ++ ++ Number of nanoseconds spent in the perfmon2 PMU interrupt ++ handler routine. Dividing this number by ovfl_intr_all_count ++ yields the average time to handle one PMU interrupt. ++ ++ ovfl_intr_regular_count (read-only): ++ ++ Number of PMU interrupts which are actually processed by ++ the perfmon interrupt handler. There may be spurious or replay ++ interrupts. ++ ++ ovfl_intr_replay_count (read-only): ++ ++ Number of PMU interrupts which were replayed on context switch ++ in or on event set switching. Interrupts get replayed when they ++ were in flight at the time monitoring had to be stopped. ++ ++ ovfl_intr_spurious_count (read-only): ++ ++ Number of PMU interrupts which were dropped because there was ++ no active context (session). ++ ++ ovfl_notify_count (read-only): ++ ++ Number of user level notifications sent. Notifications are ++ appended as messages to the context queue. Notifications may ++ be sent on PMU interrupts. ++ ++ pfm_restart_count (read-only): ++ ++ Number of times pfm_restart() is called. ++ ++ reset_pmds_count (read-only): ++ ++ Number of times pfm_reset_pmds() is called. ++ ++ set_switch_count (read-only): ++ ++ Number of event set switches. ++ ++ set_switch_ns (read-only): ++ ++ Number of nanoseconds spent in the set switching routine. ++ Dividing this number by set_switch_count yields the average ++ cost of switching sets. ++ ++ handle_timeout_count (read-only): ++ ++ Number of times the pfm_handle_timeout() routine is called. ++ It is used for timeout-based set switching. ++ ++ handle_work_count (read-only): ++ ++ Number of times pfm_handle_work() is called. The routine ++ handles asynchronous perfmon2 work for per-thread contexts ++ (sessions). 
++ +diff --git a/Documentation/perfmon2.txt b/Documentation/perfmon2.txt +new file mode 100644 +index 0000000..4a8fada +--- /dev/null ++++ b/Documentation/perfmon2.txt +@@ -0,0 +1,213 @@ ++ The perfmon2 hardware monitoring interface ++ ------------------------------------------ ++ Stephane Eranian ++ <eranian@gmail.com> ++ ++I/ Introduction ++ ++ The perfmon2 interface provides access to the hardware performance counters of ++ major processors. Nowadays, all processors implement some flavors of performance ++ counters which capture micro-architectural level information such as the number ++ of elapsed cycles, number of cache misses, and so on. ++ ++ The interface is implemented as a set of new system calls and a set of config files ++ in /sys. ++ ++ It is possible to monitor a single thread or a CPU. In either mode, applications ++ can count or collect samples. System-wide monitoring is supported by running a ++ monitoring session on each CPU. The interface supports event-based sampling where the ++ sampling period is expressed as the number of occurrences of event, instead of just a ++ timeout. This approach provides a much better granularity and flexibility. ++ ++ For performance reasons, it is possible to use a kernel-level sampling buffer to minimize ++ the overhead incurred by sampling. The format of the buffer, i.e., what is recorded, how ++ it is recorded, and how it is exported to user-land is controlled by a kernel module called ++ a custom sampling format. The current implementation comes with a default format but ++ it is possible to create additional formats. There is an in-kernel registration ++ interface for formats. Each format is identified by a simple string which a tool ++ can pass when a monitoring session is created. ++ ++ The interface also provides support for event set and multiplexing to work around ++ hardware limitations in the number of available counters or in how events can be ++ combined.
 Each set defines as many counters as the hardware can support. The kernel ++ then multiplexes the sets. The interface supports time-base switching but also ++ overflow based switching, i.e., after n overflows of designated counters. ++ ++ Applications never manipulate the actual performance counter registers. Instead they see ++ a logical Performance Monitoring Unit (PMU) composed of a set of config registers (PMC) ++ and a set of data registers (PMD). Note that PMD are not necessarily counters, they ++ can be buffers. The logical PMU is then mapped onto the actual PMU using a mapping ++ table which is implemented as a kernel module. The mapping is chosen once for each ++ new processor. It is visible in /sys/kernel/perfmon/pmu_desc. The kernel module ++ is automatically loaded on first use. ++ ++ A monitoring session, or context, is uniquely identified by a file descriptor ++ obtained when the context is created. File sharing semantics apply to access ++ the context inside a process. A context is never inherited across fork. The file ++ descriptor can be used to receive counter overflow notifications or when the ++ sampling buffer is full. It is possible to use poll/select on the descriptor ++ to wait for notifications from multiple contexts. Similarly, the descriptor ++ supports asynchronous notification via SIGIO. ++ ++ Counters are always exported as being 64-bit wide regardless of what the underlying ++ hardware implements. ++ ++II/ Kernel compilation ++ ++ To enable perfmon2, you need to enable CONFIG_PERFMON ++ ++III/ OProfile interactions ++ ++ The set of features offered by perfmon2 is rich enough to support migrating ++ Oprofile on top of it. That means that PMU programming and low-level interrupt ++ handling could be done by perfmon2. The Oprofile sampling buffer management code ++ in the kernel as well as how samples are exported to users could remain through ++ the use of a custom sampling buffer format. This is how Oprofile works on Itanium. 
++ ++ The current interactions with Oprofile are: ++ - on X86: Both subsystems can be compiled into the same kernel. There is enforced ++ mutual exclusion between the two subsystems. When there is an Oprofile ++ session, no perfmon2 session can exist and vice-versa. Perfmon2 session ++ encapsulates both per-thread and system-wide sessions here. ++ ++ - On IA-64: Oprofile works on top of perfmon2. Oprofile being a system-wide monitoring ++ tool, the regular per-thread vs. system-wide session restrictions apply. ++ ++ - on PPC: no integration yet. You need to enable/disable one of the two subsystems ++ - on MIPS: no integration yet. You need to enable/disable one of the two subsystems ++ ++IV/ User tools ++ ++ We have released a simple monitoring tool to demonstrate the features of the ++ interface. The tool is called pfmon and it comes with a simple helper library ++ called libpfm. The library comes with a set of examples to show how to use the ++ kernel perfmon2 interface. Visit http://perfmon2.sf.net for details. ++ ++ There may be other tools available for perfmon2. ++ ++V/ How to program? ++ ++ The best way to learn how to program perfmon2, is to take a look at the source ++ code for the examples in libpfm. The source code is available from: ++ http://perfmon2.sf.net ++ ++VI/ System calls overview ++ ++ The interface is implemented by the following system calls: ++ ++ * int pfm_create_context(pfarg_ctx_t *ctx, char *fmt, void *arg, size_t arg_size) ++ ++ This function creates a perfmon2 context. The type of context is per-thread by ++ default unless PFM_FL_SYSTEM_WIDE is passed in ctx. The sampling format name ++ is passed in fmt. Arguments to the format are passed in arg which is of size ++ arg_size. Upon successful return, the file descriptor identifying the context ++ is returned. ++ ++ * int pfm_write_pmds(int fd, pfarg_pmd_t *pmds, int n) ++ ++ This function is used to program the PMD registers. It is possible to pass ++ vectors of PMDs. 
++ ++ * int pfm_write_pmcs(int fd, pfarg_pmc_t *pmcs, int n) ++ ++ This function is used to program the PMC registers. It is possible to pass ++ vectors of PMCs. ++ ++ * int pfm_read_pmds(int fd, pfarg_pmd_t *pmds, int n) ++ ++ This function is used to read the PMD registers. It is possible to pass ++ vectors of PMDs. ++ ++ * int pfm_load_context(int fd, pfarg_load_t *load) ++ ++ This function is used to attach the context to a thread or CPU. ++ Thread means kernel-visible thread (NPTL). The thread identification ++ as obtained by gettid must be passed to load->load_target. ++ ++ To operate on another thread (not self), it is mandatory that the thread ++ be stopped via ptrace(). ++ ++ To attach to a CPU, the CPU number must be specified in load->load_target ++ AND the call must be issued on that CPU. To monitor a CPU, a thread MUST ++ be pinned on that CPU. ++ ++ Until the context is attached, the actual counters are not accessed. ++ ++ * int pfm_unload_context(int fd) ++ ++ The context is detached from the thread or CPU it was attached to. ++ As a consequence monitoring is stopped. ++ ++ When monitoring another thread, the thread MUST be stopped via ptrace() ++ for this function to succeed. ++ ++ * int pfm_start(int fd, pfarg_start_t *st) ++ ++ Start monitoring. The context must be attached for this function to succeed. ++ Optionally, it is possible to specify the event set on which to start using the ++ st argument, otherwise just pass NULL. ++ ++ When monitoring another thread, the thread MUST be stopped via ptrace() ++ for this function to succeed. ++ ++ * int pfm_stop(int fd) ++ ++ Stop monitoring. The context must be attached for this function to succeed. ++ ++ When monitoring another thread, the thread MUST be stopped via ptrace() ++ for this function to succeed. ++ ++ ++ * int pfm_create_evtsets(int fd, pfarg_setdesc_t *sets, int n) ++ ++ This function is used to create or change event sets. By default set 0 exists. 
++ It is possible to create/change multiple sets in one call. ++ ++ The context must be detached for this call to succeed. ++ ++ Sets are identified by a 16-bit integer. They are sorted based on this ++ set and switching occurs in a round-robin fashion. ++ ++ * int pfm_delete_evtsets(int fd, pfarg_setdesc_t *sets, int n) ++ ++ Delete event sets. The context must be detached for this call to succeed. ++ ++ ++ * int pfm_getinfo_evtsets(int fd, pfarg_setinfo_t *sets, int n) ++ ++ Retrieve information about event sets. In particular it is possible ++ to get the number of activation of a set. It is possible to retrieve ++ information about multiple sets in one call. ++ ++ ++ * int pfm_restart(int fd) ++ ++ Indicate to the kernel that the application is done processing an overflow ++ notification. A consequence of this call could be that monitoring resumes. ++ ++ * int read(fd, pfm_msg_t *msg, sizeof(pfm_msg_t)) ++ ++ the regular read() system call can be used with the context file descriptor to ++ receive overflow notification messages. Non-blocking read() is supported. ++ ++ Each message carry information about the overflow such as which counter overflowed ++ and where the program was (interrupted instruction pointer). ++ ++ * int close(int fd) ++ ++ To destroy a context, the regular close() system call is used. ++ ++ ++VII/ /sys interface overview ++ ++ Refer to Documentation/ABI/testing/sysfs-perfmon-* for a detailed description ++ of the sysfs interface of perfmon2. ++ ++VIII/ debugfs interface overview ++ ++ Refer to Documentation/perfmon2-debugfs.txt for a detailed description of the ++ debug and statistics interface of perfmon2. 
++ ++IX/ Documentation ++ ++ Visit http://perfmon2.sf.net +diff --git a/MAINTAINERS b/MAINTAINERS +index 8dae455..fb38c2a 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -3239,6 +3239,14 @@ M: balbir@linux.vnet.ibm.com + L: linux-kernel@vger.kernel.org + S: Maintained + ++PERFMON SUBSYSTEM ++P: Stephane Eranian ++M: eranian@gmail.com ++L: perfmon2-devel@lists.sf.net ++W: http://perfmon2.sf.net ++T: git kernel.org:/pub/scm/linux/kernel/git/eranian/linux-2.6 ++S: Maintained ++ + PERSONALITY HANDLING + P: Christoph Hellwig + M: hch@infradead.org +diff --git a/Makefile b/Makefile +index 16e3fbb..7bb1320 100644 +--- a/Makefile ++++ b/Makefile +@@ -620,6 +620,7 @@ export mod_strip_cmd + + ifeq ($(KBUILD_EXTMOD),) + core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ ++core-$(CONFIG_PERFMON) += perfmon/ + + vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ + $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ +diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig +index 48e496f..1d79b01 100644 +--- a/arch/ia64/Kconfig ++++ b/arch/ia64/Kconfig +@@ -470,14 +470,6 @@ config COMPAT_FOR_U64_ALIGNMENT + config IA64_MCA_RECOVERY + tristate "MCA recovery from errors other than TLB." + +-config PERFMON +- bool "Performance monitor support" +- help +- Selects whether support for the IA-64 performance monitor hardware +- is included in the kernel. This makes some kernel data-structures a +- little bigger and slows down execution a bit, but it is generally +- a good idea to turn this on. If you're unsure, say Y. 
+- + config IA64_PALINFO + tristate "/proc/pal support" + help +@@ -549,6 +541,8 @@ source "drivers/firmware/Kconfig" + + source "fs/Kconfig.binfmt" + ++source "arch/ia64/perfmon/Kconfig" ++ + endmenu + + menu "Power management and ACPI" +diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile +index 905d25b..9aa622d 100644 +--- a/arch/ia64/Makefile ++++ b/arch/ia64/Makefile +@@ -57,6 +57,7 @@ core-$(CONFIG_IA64_GENERIC) += arch/ia64/dig/ + core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/ + core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/ + core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ ++core-$(CONFIG_PERFMON) += arch/ia64/perfmon/ + core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/ + core-$(CONFIG_KVM) += arch/ia64/kvm/ + +diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig +index 9f48397..ff9572a 100644 +--- a/arch/ia64/configs/generic_defconfig ++++ b/arch/ia64/configs/generic_defconfig +@@ -209,7 +209,6 @@ CONFIG_IA32_SUPPORT=y + CONFIG_COMPAT=y + CONFIG_COMPAT_FOR_U64_ALIGNMENT=y + CONFIG_IA64_MCA_RECOVERY=y +-CONFIG_PERFMON=y + CONFIG_IA64_PALINFO=y + # CONFIG_IA64_MC_ERR_INJECT is not set + CONFIG_SGI_SN=y +@@ -234,6 +233,16 @@ CONFIG_BINFMT_ELF=y + CONFIG_BINFMT_MISC=m + + # ++# Hardware Performance Monitoring support ++# ++CONFIG_PERFMON=y ++CONFIG_IA64_PERFMON_COMPAT=y ++CONFIG_IA64_PERFMON_GENERIC=m ++CONFIG_IA64_PERFMON_ITANIUM=y ++CONFIG_IA64_PERFMON_MCKINLEY=y ++CONFIG_IA64_PERFMON_MONTECITO=y ++ ++# + # Power management and ACPI + # + CONFIG_PM=y +diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild +index ccbe8ae..cf64b3b 100644 +--- a/arch/ia64/include/asm/Kbuild ++++ b/arch/ia64/include/asm/Kbuild +@@ -5,10 +5,12 @@ header-y += fpu.h + header-y += fpswa.h + header-y += ia64regs.h + header-y += intel_intrin.h +-header-y += perfmon_default_smpl.h + header-y += ptrace_offsets.h + header-y += rse.h + header-y += ucontext.h ++header-y += perfmon.h ++header-y += perfmon_compat.h ++header-y += 
perfmon_default_smpl.h + + unifdef-y += gcc_intrin.h + unifdef-y += intrinsics.h +diff --git a/arch/ia64/include/asm/hw_irq.h b/arch/ia64/include/asm/hw_irq.h +index 5c99cbc..4a45cb0 100644 +--- a/arch/ia64/include/asm/hw_irq.h ++++ b/arch/ia64/include/asm/hw_irq.h +@@ -67,9 +67,9 @@ extern int ia64_last_device_vector; + #define IA64_NUM_DEVICE_VECTORS (IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1) + + #define IA64_MCA_RENDEZ_VECTOR 0xe8 /* MCA rendez interrupt */ +-#define IA64_PERFMON_VECTOR 0xee /* performance monitor interrupt vector */ + #define IA64_TIMER_VECTOR 0xef /* use highest-prio group 15 interrupt for timer */ + #define IA64_MCA_WAKEUP_VECTOR 0xf0 /* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */ ++#define IA64_PERFMON_VECTOR 0xf1 /* performance monitor interrupt vector */ + #define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP flush local TLB */ + #define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */ + #define IA64_IPI_VECTOR 0xfe /* inter-processor interrupt vector */ +diff --git a/arch/ia64/include/asm/perfmon.h b/arch/ia64/include/asm/perfmon.h +index 7f3333d..150c4b4 100644 +--- a/arch/ia64/include/asm/perfmon.h ++++ b/arch/ia64/include/asm/perfmon.h +@@ -1,279 +1,59 @@ + /* +- * Copyright (C) 2001-2003 Hewlett-Packard Co +- * Stephane Eranian <eranian@hpl.hp.com> +- */ +- +-#ifndef _ASM_IA64_PERFMON_H +-#define _ASM_IA64_PERFMON_H +- +-/* +- * perfmon comamnds supported on all CPU models +- */ +-#define PFM_WRITE_PMCS 0x01 +-#define PFM_WRITE_PMDS 0x02 +-#define PFM_READ_PMDS 0x03 +-#define PFM_STOP 0x04 +-#define PFM_START 0x05 +-#define PFM_ENABLE 0x06 /* obsolete */ +-#define PFM_DISABLE 0x07 /* obsolete */ +-#define PFM_CREATE_CONTEXT 0x08 +-#define PFM_DESTROY_CONTEXT 0x09 /* obsolete use close() */ +-#define PFM_RESTART 0x0a +-#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */ +-#define PFM_GET_FEATURES 0x0c +-#define PFM_DEBUG 0x0d +-#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */ +-#define PFM_GET_PMC_RESET_VAL 0x0f +-#define 
PFM_LOAD_CONTEXT 0x10 +-#define PFM_UNLOAD_CONTEXT 0x11 +- +-/* +- * PMU model specific commands (may not be supported on all PMU models) +- */ +-#define PFM_WRITE_IBRS 0x20 +-#define PFM_WRITE_DBRS 0x21 +- +-/* +- * context flags +- */ +-#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user level notifications */ +-#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ +-#define PFM_FL_OVFL_NO_MSG 0x80 /* do not post overflow/end messages for notification */ +- +-/* +- * event set flags +- */ +-#define PFM_SETFL_EXCL_IDLE 0x01 /* exclude idle task (syswide only) XXX: DO NOT USE YET */ +- +-/* +- * PMC flags +- */ +-#define PFM_REGFL_OVFL_NOTIFY 0x1 /* send notification on overflow */ +-#define PFM_REGFL_RANDOM 0x2 /* randomize sampling interval */ +- +-/* +- * PMD/PMC/IBR/DBR return flags (ignored on input) ++ * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * +- * Those flags are used on output and must be checked in case EAGAIN is returned +- * by any of the calls using a pfarg_reg_t or pfarg_dbreg_t structure. 
+- */ +-#define PFM_REG_RETFL_NOTAVAIL (1UL<<31) /* set if register is implemented but not available */ +-#define PFM_REG_RETFL_EINVAL (1UL<<30) /* set if register entry is invalid */ +-#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|PFM_REG_RETFL_EINVAL) +- +-#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0) +- +-typedef unsigned char pfm_uuid_t[16]; /* custom sampling buffer identifier type */ +- +-/* +- * Request structure used to define a context +- */ +-typedef struct { +- pfm_uuid_t ctx_smpl_buf_id; /* which buffer format to use (if needed) */ +- unsigned long ctx_flags; /* noblock/block */ +- unsigned short ctx_nextra_sets; /* number of extra event sets (you always get 1) */ +- unsigned short ctx_reserved1; /* for future use */ +- int ctx_fd; /* return arg: unique identification for context */ +- void *ctx_smpl_vaddr; /* return arg: virtual address of sampling buffer, is used */ +- unsigned long ctx_reserved2[11];/* for future use */ +-} pfarg_context_t; +- +-/* +- * Request structure used to write/read a PMC or PMD +- */ +-typedef struct { +- unsigned int reg_num; /* which register */ +- unsigned short reg_set; /* event set for this register */ +- unsigned short reg_reserved1; /* for future use */ +- +- unsigned long reg_value; /* initial pmc/pmd value */ +- unsigned long reg_flags; /* input: pmc/pmd flags, return: reg error */ +- +- unsigned long reg_long_reset; /* reset after buffer overflow notification */ +- unsigned long reg_short_reset; /* reset after counter overflow */ +- +- unsigned long reg_reset_pmds[4]; /* which other counters to reset on overflow */ +- unsigned long reg_random_seed; /* seed value when randomization is used */ +- unsigned long reg_random_mask; /* bitmask used to limit random value */ +- unsigned long reg_last_reset_val;/* return: PMD last reset value */ +- +- unsigned long reg_smpl_pmds[4]; /* which pmds are accessed when PMC overflows */ +- unsigned long reg_smpl_eventid; /* opaque sampling event 
identifier */ +- +- unsigned long reg_reserved2[3]; /* for future use */ +-} pfarg_reg_t; +- +-typedef struct { +- unsigned int dbreg_num; /* which debug register */ +- unsigned short dbreg_set; /* event set for this register */ +- unsigned short dbreg_reserved1; /* for future use */ +- unsigned long dbreg_value; /* value for debug register */ +- unsigned long dbreg_flags; /* return: dbreg error */ +- unsigned long dbreg_reserved2[1]; /* for future use */ +-} pfarg_dbreg_t; +- +-typedef struct { +- unsigned int ft_version; /* perfmon: major [16-31], minor [0-15] */ +- unsigned int ft_reserved; /* reserved for future use */ +- unsigned long reserved[4]; /* for future use */ +-} pfarg_features_t; +- +-typedef struct { +- pid_t load_pid; /* process to load the context into */ +- unsigned short load_set; /* first event set to load */ +- unsigned short load_reserved1; /* for future use */ +- unsigned long load_reserved2[3]; /* for future use */ +-} pfarg_load_t; +- +-typedef struct { +- int msg_type; /* generic message header */ +- int msg_ctx_fd; /* generic message header */ +- unsigned long msg_ovfl_pmds[4]; /* which PMDs overflowed */ +- unsigned short msg_active_set; /* active set at the time of overflow */ +- unsigned short msg_reserved1; /* for future use */ +- unsigned int msg_reserved2; /* for future use */ +- unsigned long msg_tstamp; /* for perf tuning/debug */ +-} pfm_ovfl_msg_t; +- +-typedef struct { +- int msg_type; /* generic message header */ +- int msg_ctx_fd; /* generic message header */ +- unsigned long msg_tstamp; /* for perf tuning */ +-} pfm_end_msg_t; +- +-typedef struct { +- int msg_type; /* type of the message */ +- int msg_ctx_fd; /* unique identifier for the context */ +- unsigned long msg_tstamp; /* for perf tuning */ +-} pfm_gen_msg_t; +- +-#define PFM_MSG_OVFL 1 /* an overflow happened */ +-#define PFM_MSG_END 2 /* task to which context was attached ended */ +- +-typedef union { +- pfm_ovfl_msg_t pfm_ovfl_msg; +- pfm_end_msg_t pfm_end_msg; 
+- pfm_gen_msg_t pfm_gen_msg; +-} pfm_msg_t; +- +-/* +- * Define the version numbers for both perfmon as a whole and the sampling buffer format. ++ * This file contains Itanium Processor Family specific definitions ++ * for the perfmon interface. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA + */ +-#define PFM_VERSION_MAJ 2U +-#define PFM_VERSION_MIN 0U +-#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|(PFM_VERSION_MIN & 0xffff)) +-#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) +-#define PFM_VERSION_MINOR(x) ((x) & 0xffff) +- ++#ifndef _ASM_IA64_PERFMON_H_ ++#define _ASM_IA64_PERFMON_H_ + + /* +- * miscellaneous architected definitions ++ * arch-specific user visible interface definitions + */ +-#define PMU_FIRST_COUNTER 4 /* first counting monitor (PMC/PMD) */ +-#define PMU_MAX_PMCS 256 /* maximum architected number of PMC registers */ +-#define PMU_MAX_PMDS 256 /* maximum architected number of PMD registers */ +- +-#ifdef __KERNEL__ +- +-extern long perfmonctl(int fd, int cmd, void *arg, int narg); +- +-typedef struct { +- void (*handler)(int irq, void *arg, struct pt_regs *regs); +-} pfm_intr_handler_desc_t; +- +-extern void pfm_save_regs (struct task_struct *); +-extern void pfm_load_regs (struct task_struct *); + +-extern void pfm_exit_thread(struct task_struct *); +-extern int pfm_use_debug_registers(struct task_struct *); 
+-extern int pfm_release_debug_registers(struct task_struct *); +-extern void pfm_syst_wide_update_task(struct task_struct *, unsigned long info, int is_ctxswin); +-extern void pfm_inherit(struct task_struct *task, struct pt_regs *regs); +-extern void pfm_init_percpu(void); +-extern void pfm_handle_work(void); +-extern int pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *h); +-extern int pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *h); ++#define PFM_ARCH_MAX_PMCS (256+64) ++#define PFM_ARCH_MAX_PMDS (256+64) + +- +- +-/* +- * Reset PMD register flags +- */ +-#define PFM_PMD_SHORT_RESET 0 +-#define PFM_PMD_LONG_RESET 1 +- +-typedef union { +- unsigned int val; +- struct { +- unsigned int notify_user:1; /* notify user program of overflow */ +- unsigned int reset_ovfl_pmds:1; /* reset overflowed PMDs */ +- unsigned int block_task:1; /* block monitored task on kernel exit */ +- unsigned int mask_monitoring:1; /* mask monitors via PMCx.plm */ +- unsigned int reserved:28; /* for future use */ +- } bits; +-} pfm_ovfl_ctrl_t; +- +-typedef struct { +- unsigned char ovfl_pmd; /* index of overflowed PMD */ +- unsigned char ovfl_notify; /* =1 if monitor requested overflow notification */ +- unsigned short active_set; /* event set active at the time of the overflow */ +- pfm_ovfl_ctrl_t ovfl_ctrl; /* return: perfmon controls to set by handler */ +- +- unsigned long pmd_last_reset; /* last reset value of of the PMD */ +- unsigned long smpl_pmds[4]; /* bitmask of other PMD of interest on overflow */ +- unsigned long smpl_pmds_values[PMU_MAX_PMDS]; /* values for the other PMDs of interest */ +- unsigned long pmd_value; /* current 64-bit value of the PMD */ +- unsigned long pmd_eventid; /* eventid associated with PMD */ +-} pfm_ovfl_arg_t; +- +- +-typedef struct { +- char *fmt_name; +- pfm_uuid_t fmt_uuid; +- size_t fmt_arg_size; +- unsigned long fmt_flags; +- +- int (*fmt_validate)(struct task_struct *task, unsigned int flags, int cpu, void *arg); +- int 
(*fmt_getsize)(struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size); +- int (*fmt_init)(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *arg); +- int (*fmt_handler)(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp); +- int (*fmt_restart)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); +- int (*fmt_restart_active)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); +- int (*fmt_exit)(struct task_struct *task, void *buf, struct pt_regs *regs); +- +- struct list_head fmt_list; +-} pfm_buffer_fmt_t; +- +-extern int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt); +-extern int pfm_unregister_buffer_fmt(pfm_uuid_t uuid); ++#define PFM_ARCH_PMD_STK_ARG 8 ++#define PFM_ARCH_PMC_STK_ARG 8 + + /* +- * perfmon interface exported to modules ++ * Itanium specific context flags ++ * ++ * bits[00-15]: generic flags (see asm/perfmon.h) ++ * bits[16-31]: arch-specific flags + */ +-extern int pfm_mod_read_pmds(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs); +-extern int pfm_mod_write_pmcs(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs); +-extern int pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs); +-extern int pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs); ++#define PFM_ITA_FL_INSECURE 0x10000 /* clear psr.sp on non system, non self */ + + /* +- * describe the content of the local_cpu_date->pfm_syst_info field ++ * Itanium specific public event set flags (set_flags) ++ * ++ * event set flags layout: ++ * bits[00-15] : generic flags ++ * bits[16-31] : arch-specific flags + */ +-#define PFM_CPUINFO_SYST_WIDE 0x1 /* if set a system wide session exists */ +-#define PFM_CPUINFO_DCR_PP 0x2 /* if set the system wide session has started */ +-#define PFM_CPUINFO_EXCL_IDLE 
0x4 /* the system wide session excludes the idle task */ ++#define PFM_ITA_SETFL_EXCL_INTR 0x10000 /* exclude interrupt execution */ ++#define PFM_ITA_SETFL_INTR_ONLY 0x20000 /* include only interrupt execution */ ++#define PFM_ITA_SETFL_IDLE_EXCL 0x40000 /* stop monitoring in idle loop */ + + /* +- * sysctl control structure. visible to sampling formats ++ * compatibility for version v2.0 of the interface + */ +-typedef struct { +- int debug; /* turn on/off debugging via syslog */ +- int debug_ovfl; /* turn on/off debug printk in overflow handler */ +- int fastctxsw; /* turn on/off fast (unsecure) ctxsw */ +- int expert_mode; /* turn on/off value checking */ +-} pfm_sysctl_t; +-extern pfm_sysctl_t pfm_sysctl; +- +- +-#endif /* __KERNEL__ */ ++#include <asm/perfmon_compat.h> + +-#endif /* _ASM_IA64_PERFMON_H */ ++#endif /* _ASM_IA64_PERFMON_H_ */ +diff --git a/arch/ia64/include/asm/perfmon_compat.h b/arch/ia64/include/asm/perfmon_compat.h +new file mode 100644 +index 0000000..5c14514 +--- /dev/null ++++ b/arch/ia64/include/asm/perfmon_compat.h +@@ -0,0 +1,167 @@ ++/* ++ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This header file contains perfmon interface definition ++ * that are now obsolete and should be dropped in favor ++ * of their equivalent functions as explained below. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++ ++#ifndef _ASM_IA64_PERFMON_COMPAT_H_ ++#define _ASM_IA64_PERFMON_COMPAT_H_ ++ ++/* ++ * custom sampling buffer identifier type ++ */ ++typedef __u8 pfm_uuid_t[16]; ++ ++/* ++ * obsolete perfmon commands. Supported only on IA-64 for ++ * backward compatibility reasons with perfmon v2.0. ++ */ ++#define PFM_WRITE_PMCS 0x01 /* use pfm_write_pmcs */ ++#define PFM_WRITE_PMDS 0x02 /* use pfm_write_pmds */ ++#define PFM_READ_PMDS 0x03 /* use pfm_read_pmds */ ++#define PFM_STOP 0x04 /* use pfm_stop */ ++#define PFM_START 0x05 /* use pfm_start */ ++#define PFM_ENABLE 0x06 /* obsolete */ ++#define PFM_DISABLE 0x07 /* obsolete */ ++#define PFM_CREATE_CONTEXT 0x08 /* use pfm_create_context */ ++#define PFM_DESTROY_CONTEXT 0x09 /* use close() */ ++#define PFM_RESTART 0x0a /* use pfm_restart */ ++#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */ ++#define PFM_GET_FEATURES 0x0c /* use /proc/sys/perfmon */ ++#define PFM_DEBUG 0x0d /* /proc/sys/kernel/perfmon/debug */ ++#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */ ++#define PFM_GET_PMC_RESET_VAL 0x0f /* use /proc/perfmon_map */ ++#define PFM_LOAD_CONTEXT 0x10 /* use pfm_load_context */ ++#define PFM_UNLOAD_CONTEXT 0x11 /* use pfm_unload_context */ ++ ++/* ++ * PMU model specific commands (may not be supported on all PMU models) ++ */ ++#define PFM_WRITE_IBRS 0x20 /* obsolete: use PFM_WRITE_PMCS[256-263]*/ ++#define PFM_WRITE_DBRS 0x21 /* obsolete: use PFM_WRITE_PMCS[264-271]*/ ++ ++/* ++ * argument to PFM_CREATE_CONTEXT ++ */ ++struct pfarg_context { ++ pfm_uuid_t ctx_smpl_buf_id; /* buffer format to use */ ++ unsigned long ctx_flags; /* noblock/block */ ++ unsigned int ctx_reserved1; /* for future use */ ++ int ctx_fd; /* return: fildesc */ ++ void *ctx_smpl_vaddr; /* return: vaddr of buffer */ ++
unsigned long ctx_reserved3[11];/* for future use */ ++}; ++ ++/* ++ * argument structure for PFM_WRITE_PMCS/PFM_WRITE_PMDS/PFM_READ_PMDS ++ */ ++struct pfarg_reg { ++ unsigned int reg_num; /* which register */ ++ unsigned short reg_set; /* event set for this register */ ++ unsigned short reg_reserved1; /* for future use */ ++ ++ unsigned long reg_value; /* initial pmc/pmd value */ ++ unsigned long reg_flags; /* input: flags, ret: error */ ++ ++ unsigned long reg_long_reset; /* reset value after notification */ ++ unsigned long reg_short_reset; /* reset after counter overflow */ ++ ++ unsigned long reg_reset_pmds[4]; /* registers to reset on overflow */ ++ unsigned long reg_random_seed; /* seed for randomization */ ++ unsigned long reg_random_mask; /* random range limit */ ++ unsigned long reg_last_reset_val;/* return: PMD last reset value */ ++ ++ unsigned long reg_smpl_pmds[4]; /* pmds to be saved on overflow */ ++ unsigned long reg_smpl_eventid; /* opaque sampling event id */ ++ unsigned long reg_ovfl_switch_cnt;/* #overflows to switch */ ++ ++ unsigned long reg_reserved2[2]; /* for future use */ ++}; ++ ++/* ++ * argument to PFM_WRITE_IBRS/PFM_WRITE_DBRS ++ */ ++struct pfarg_dbreg { ++ unsigned int dbreg_num; /* which debug register */ ++ unsigned short dbreg_set; /* event set */ ++ unsigned short dbreg_reserved1; /* for future use */ ++ unsigned long dbreg_value; /* value for debug register */ ++ unsigned long dbreg_flags; /* return: dbreg error */ ++ unsigned long dbreg_reserved2[1]; /* for future use */ ++}; ++ ++/* ++ * argument to PFM_GET_FEATURES ++ */ ++struct pfarg_features { ++ unsigned int ft_version; /* major [16-31], minor [0-15] */ ++ unsigned int ft_reserved; /* reserved for future use */ ++ unsigned long reserved[4]; /* for future use */ ++}; ++ ++typedef struct { ++ int msg_type; /* generic message header */ ++ int msg_ctx_fd; /* generic message header */ ++ unsigned long msg_ovfl_pmds[4]; /* which PMDs overflowed */ ++ unsigned short
msg_active_set; /* active set on overflow */ ++ unsigned short msg_reserved1; /* for future use */ ++ unsigned int msg_reserved2; /* for future use */ ++ unsigned long msg_tstamp; /* for perf tuning/debug */ ++} pfm_ovfl_msg_t; ++ ++typedef struct { ++ int msg_type; /* generic message header */ ++ int msg_ctx_fd; /* generic message header */ ++ unsigned long msg_tstamp; /* for perf tuning */ ++} pfm_end_msg_t; ++ ++typedef struct { ++ int msg_type; /* type of the message */ ++ int msg_ctx_fd; /* context file descriptor */ ++ unsigned long msg_tstamp; /* for perf tuning */ ++} pfm_gen_msg_t; ++ ++typedef union { ++ int type; ++ pfm_ovfl_msg_t pfm_ovfl_msg; ++ pfm_end_msg_t pfm_end_msg; ++ pfm_gen_msg_t pfm_gen_msg; ++} pfm_msg_t; ++ ++/* ++ * PMD/PMC return flags in case of error (ignored on input) ++ * ++ * reg_flags layout: ++ * bit 00-15 : generic flags ++ * bits[16-23] : arch-specific flags (see asm/perfmon.h) ++ * bit 24-31 : error codes ++ * ++ * Those flags are used on output and must be checked in case EINVAL is ++ * returned by a command accepting a vector of values and each has a flag ++ * field, such as pfarg_reg or pfarg_dbreg ++ */ ++#define PFM_REG_RETFL_NOTAVAIL (1<<31) /* not implemented or inaccessible */ ++#define PFM_REG_RETFL_EINVAL (1<<30) /* entry is invalid */ ++#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|\ ++ PFM_REG_RETFL_EINVAL) ++ ++#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0) ++ ++#endif /* _ASM_IA64_PERFMON_COMPAT_H_ */ +diff --git a/arch/ia64/include/asm/perfmon_default_smpl.h b/arch/ia64/include/asm/perfmon_default_smpl.h +index 48822c0..8234f32 100644 +--- a/arch/ia64/include/asm/perfmon_default_smpl.h ++++ b/arch/ia64/include/asm/perfmon_default_smpl.h +@@ -1,83 +1,106 @@ + /* +- * Copyright (C) 2002-2003 Hewlett-Packard Co +- * Stephane Eranian <eranian@hpl.hp.com> ++ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P.
++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> + * +- * This file implements the default sampling buffer format +- * for Linux/ia64 perfmon subsystem. ++ * This file implements the old default sampling buffer format ++ * for the perfmon2 subsystem. For IA-64 only. ++ * ++ * It requires the use of the perfmon_compat.h header. It is recommended ++ * that applications be ported to the new format instead. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA + */ +-#ifndef __PERFMON_DEFAULT_SMPL_H__ +-#define __PERFMON_DEFAULT_SMPL_H__ 1 ++#ifndef __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ ++#define __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ 1 ++ ++#ifndef __ia64__ ++#error "this file must be used for compatibility reasons only on IA-64" ++#endif + + #define PFM_DEFAULT_SMPL_UUID { \ +- 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82, 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} ++ 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\ ++ 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} + + /* + * format specific parameters (passed at context creation) + */ +-typedef struct { ++struct pfm_default_smpl_arg { + unsigned long buf_size; /* size of the buffer in bytes */ + unsigned int flags; /* buffer specific flags */ + unsigned int res1; /* for future use */ + unsigned long reserved[2]; /* for future use */ +-} pfm_default_smpl_arg_t; ++}; + + /* + * combined 
context+format specific structure. Can be passed +- * to PFM_CONTEXT_CREATE ++ * to PFM_CONTEXT_CREATE (not PFM_CONTEXT_CREATE2) + */ +-typedef struct { +- pfarg_context_t ctx_arg; +- pfm_default_smpl_arg_t buf_arg; +-} pfm_default_smpl_ctx_arg_t; ++struct pfm_default_smpl_ctx_arg { ++ struct pfarg_context ctx_arg; ++ struct pfm_default_smpl_arg buf_arg; ++}; + + /* + * This header is at the beginning of the sampling buffer returned to the user. + * It is directly followed by the first record. + */ +-typedef struct { +- unsigned long hdr_count; /* how many valid entries */ +- unsigned long hdr_cur_offs; /* current offset from top of buffer */ +- unsigned long hdr_reserved2; /* reserved for future use */ ++struct pfm_default_smpl_hdr { ++ u64 hdr_count; /* how many valid entries */ ++ u64 hdr_cur_offs; /* current offset from top of buffer */ ++ u64 dr_reserved2; /* reserved for future use */ + +- unsigned long hdr_overflows; /* how many times the buffer overflowed */ +- unsigned long hdr_buf_size; /* how many bytes in the buffer */ ++ u64 hdr_overflows; /* how many times the buffer overflowed */ ++ u64 hdr_buf_size; /* how many bytes in the buffer */ + +- unsigned int hdr_version; /* contains perfmon version (smpl format diffs) */ +- unsigned int hdr_reserved1; /* for future use */ +- unsigned long hdr_reserved[10]; /* for future use */ +-} pfm_default_smpl_hdr_t; ++ u32 hdr_version; /* smpl format version*/ ++ u32 hdr_reserved1; /* for future use */ ++ u64 hdr_reserved[10]; /* for future use */ ++}; + + /* + * Entry header in the sampling buffer. The header is directly followed +- * with the values of the PMD registers of interest saved in increasing +- * index order: PMD4, PMD5, and so on. How many PMDs are present depends ++ * with the values of the PMD registers of interest saved in increasing ++ * index order: PMD4, PMD5, and so on. How many PMDs are present depends + * on how the session was programmed. 
+ * + * In the case where multiple counters overflow at the same time, multiple + * entries are written consecutively. + * +- * last_reset_value member indicates the initial value of the overflowed PMD. ++ * last_reset_value member indicates the initial value of the overflowed PMD. + */ +-typedef struct { +- int pid; /* thread id (for NPTL, this is gettid()) */ +- unsigned char reserved1[3]; /* reserved for future use */ +- unsigned char ovfl_pmd; /* index of overflowed PMD */ +- +- unsigned long last_reset_val; /* initial value of overflowed PMD */ +- unsigned long ip; /* where did the overflow interrupt happened */ +- unsigned long tstamp; /* ar.itc when entering perfmon intr. handler */ +- +- unsigned short cpu; /* cpu on which the overfow occured */ +- unsigned short set; /* event set active when overflow ocurred */ +- int tgid; /* thread group id (for NPTL, this is getpid()) */ +-} pfm_default_smpl_entry_t; ++struct pfm_default_smpl_entry { ++ pid_t pid; /* thread id (for NPTL, this is gettid()) */ ++ uint8_t reserved1[3]; /* for future use */ ++ uint8_t ovfl_pmd; /* overflow pmd for this sample */ ++ u64 last_reset_val; /* initial value of overflowed PMD */ ++ unsigned long ip; /* where the overflow interrupt happened */ ++ u64 tstamp; /* overflow timestamp */ ++ u16 cpu; /* cpu on which the overflow occurred */ ++ u16 set; /* event set active when overflow occurred */ ++ pid_t tgid; /* thread group id (for NPTL, this is getpid()) */ ++}; + +-#define PFM_DEFAULT_MAX_PMDS 64 /* how many pmds supported by data structures (sizeof(unsigned long) */ +-#define PFM_DEFAULT_MAX_ENTRY_SIZE (sizeof(pfm_default_smpl_entry_t)+(sizeof(unsigned long)*PFM_DEFAULT_MAX_PMDS)) +-#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE (sizeof(pfm_default_smpl_hdr_t)+PFM_DEFAULT_MAX_ENTRY_SIZE) ++#define PFM_DEFAULT_MAX_PMDS 64 /* #pmds supported */ ++#define PFM_DEFAULT_MAX_ENTRY_SIZE (sizeof(struct pfm_default_smpl_entry)+\ ++ (sizeof(u64)*PFM_DEFAULT_MAX_PMDS)) ++#define
PFM_DEFAULT_SMPL_MIN_BUF_SIZE (sizeof(struct pfm_default_smpl_hdr)+\ ++ PFM_DEFAULT_MAX_ENTRY_SIZE) + + #define PFM_DEFAULT_SMPL_VERSION_MAJ 2U +-#define PFM_DEFAULT_SMPL_VERSION_MIN 0U +-#define PFM_DEFAULT_SMPL_VERSION (((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|(PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff)) ++#define PFM_DEFAULT_SMPL_VERSION_MIN 1U ++#define PFM_DEFAULT_SMPL_VERSION (((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|\ ++ (PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff)) + +-#endif /* __PERFMON_DEFAULT_SMPL_H__ */ ++#endif /* __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ */ +diff --git a/arch/ia64/include/asm/perfmon_kern.h b/arch/ia64/include/asm/perfmon_kern.h +new file mode 100644 +index 0000000..fb40459 +--- /dev/null ++++ b/arch/ia64/include/asm/perfmon_kern.h +@@ -0,0 +1,356 @@ ++/* ++ * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file contains Itanium Processor Family specific definitions ++ * for the perfmon interface. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef _ASM_IA64_PERFMON_KERN_H_ ++#define _ASM_IA64_PERFMON_KERN_H_ ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_PERFMON ++#include <asm/unistd.h> ++#include <asm/hw_irq.h> ++ ++/* ++ * describe the content of the pfm_syst_info field ++ * layout: ++ * bits[00-15] : generic flags ++ * bits[16-31] : arch-specific flags ++ */ ++#define PFM_ITA_CPUINFO_IDLE_EXCL 0x10000 /* stop monitoring in idle loop */ ++ ++/* ++ * For some CPUs, the upper bits of a counter must be set in order for the ++ * overflow interrupt to happen. On overflow, the counter has wrapped around, ++ * and the upper bits are cleared. This function may be used to set them back. ++ */ ++static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, ++ unsigned int cnum) ++{} ++ ++/* ++ * called from __pfm_interrupt_handler(). ctx is not NULL. ++ * ctx is locked. PMU interrupt is masked. ++ * ++ * must stop all monitoring to ensure handler has consistent view. ++ * must collect overflowed PMDs bitmask into povfls_pmds and ++ * npend_ovfls. If no interrupt detected then npend_ovfls ++ * must be set to zero. 
++ */ ++static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ u64 tmp; ++ ++ /* ++ * do not overwrite existing value, must ++ * process those first (coming from context switch replay) ++ */ ++ if (set->npend_ovfls) ++ return; ++ ++ ia64_srlz_d(); ++ ++ tmp = ia64_get_pmc(0) & ~0xf; ++ ++ set->povfl_pmds[0] = tmp; ++ ++ set->npend_ovfls = ia64_popcnt(tmp); ++} ++ ++static inline int pfm_arch_init_pmu_config(void) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_resend_irq(struct pfm_context *ctx) ++{ ++ ia64_resend_irq(IA64_PERFMON_VECTOR); ++} ++ ++static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{} ++ ++static inline void pfm_arch_serialize(void) ++{ ++ ia64_srlz_d(); ++} ++ ++static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) ++{ ++ PFM_DBG_ovfl("state=%d", ctx->state); ++ ia64_set_pmc(0, 0); ++ /* no serialization */ ++} ++ ++static inline void pfm_arch_write_pmc(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ if (cnum < 256) { ++ ia64_set_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); ++ } else if (cnum < 264) { ++ ia64_set_ibr(cnum-256, value); ++ ia64_dv_serialize_instruction(); ++ } else { ++ ia64_set_dbr(cnum-264, value); ++ ia64_dv_serialize_instruction(); ++ } ++} ++ ++/* ++ * On IA-64, for per-thread context which have the ITA_FL_INSECURE ++ * flag, it is possible to start/stop monitoring directly from user level ++ * without calling pfm_start()/pfm_stop(). This allows very lightweight ++ * control yet the kernel sometimes needs to know if monitoring is actually ++ * on or off. ++ * ++ * Tracking of this information is normally done by pfm_start/pfm_stop ++ * in flags.started. Here we need to compensate by checking actual ++ * psr bit.
++ */ ++static inline int pfm_arch_is_active(struct pfm_context *ctx) ++{ ++ return ctx->flags.started ++ || ia64_getreg(_IA64_REG_PSR) & (IA64_PSR_UP|IA64_PSR_PP); ++} ++ ++static inline void pfm_arch_write_pmd(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ /* ++ * for a counting PMD, overflow bit must be cleared ++ */ ++ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) ++ value &= pfm_pmu_conf->ovfl_mask; ++ ++ /* ++ * for counters, write to upper bits are ignored, no need to mask ++ */ ++ ia64_set_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); ++} ++ ++static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) ++{ ++ return ia64_get_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr); ++} ++ ++static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) ++{ ++ return ia64_get_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr); ++} ++ ++static inline void pfm_arch_ctxswout_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{ ++ struct pt_regs *regs; ++ ++ regs = task_pt_regs(task); ++ ia64_psr(regs)->pp = 0; ++} ++ ++static inline void pfm_arch_ctxswin_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{ ++ struct pt_regs *regs; ++ ++ if (!(ctx->active_set->flags & PFM_ITA_SETFL_INTR_ONLY)) { ++ regs = task_pt_regs(task); ++ ia64_psr(regs)->pp = 1; ++ } ++} ++ ++/* ++ * On IA-64, the PMDs are NOT saved by pfm_arch_freeze_pmu() ++ * when entering the PMU interrupt handler, thus, we need ++ * to save them in pfm_switch_sets_from_intr() ++ */ ++static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_save_pmds(ctx, set); ++} ++ ++int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags); ++ ++static inline void pfm_arch_context_free(struct pfm_context *ctx) ++{} ++ ++int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_ctxswin_thread(struct task_struct *task, ++ struct pfm_context *ctx); ++ 
++void pfm_arch_unload_context(struct pfm_context *ctx); ++int pfm_arch_load_context(struct pfm_context *ctx); ++int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags); ++ ++void pfm_arch_mask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++void pfm_arch_unmask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); ++ ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); ++ ++int pfm_arch_init(void); ++void pfm_arch_init_percpu(void); ++char *pfm_arch_get_pmu_module_name(void); ++ ++int __pfm_use_dbregs(struct task_struct *task); ++int __pfm_release_dbregs(struct task_struct *task); ++int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++void pfm_arch_show_session(struct seq_file *m); ++ ++static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_pmu_release(void) ++{} ++ ++/* not necessary on IA-64 */ ++static inline void pfm_cacheflush(void *addr, unsigned int len) ++{} ++ ++/* ++ * miscellaneous architected definitions ++ */ ++#define PFM_ITA_FCNTR 4 /* first counting monitor (PMC/PMD) */ ++ ++/* ++ * private event set flags (set_priv_flags) ++ */ ++#define PFM_ITA_SETFL_USE_DBR 0x1000000 /* set uses debug registers */ ++ ++ ++/* ++ * Itanium-specific data structures ++ */ ++struct pfm_ia64_context_flags { ++ unsigned int use_dbr:1; /* use range restrictions (debug registers) */ ++ unsigned int insecure:1; /* insecure monitoring for non-self session */ ++ unsigned int reserved:30;/* for future use */ ++}; ++ ++struct pfm_arch_context { ++ struct pfm_ia64_context_flags flags; /* arch specific ctx flags */ ++ u64 ctx_saved_psr_up;/* storage for psr_up */ ++#ifdef 
CONFIG_IA64_PERFMON_COMPAT ++ void *ctx_smpl_vaddr; /* vaddr of user mapping */ ++#endif ++}; ++ ++#ifdef CONFIG_IA64_PERFMON_COMPAT ++ssize_t pfm_arch_compat_read(struct pfm_context *ctx, ++ char __user *buf, ++ int non_block, ++ size_t size); ++int pfm_ia64_compat_init(void); ++int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx, ++ size_t rsize, struct file *filp); ++#else ++static inline ssize_t pfm_arch_compat_read(struct pfm_context *ctx, ++ char __user *buf, ++ int non_block, ++ size_t size) ++{ ++ return -EINVAL; ++} ++ ++static inline int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx, ++ size_t rsize, struct file *filp) ++{ ++ return -EINVAL; ++} ++#endif ++ ++static inline void pfm_arch_arm_handle_work(struct task_struct *task) ++{ ++ /* ++ * On IA-64, we ran out of bits in the bottom 7 bits of the ++ * threadinfo bitmask. Thus we used a 2-stage approach by piggybacking ++ * on NOTIFY_RESUME and then in do_notify_resume() we demultiplex and ++ * call pfm_handle_work() if needed ++ */ ++ set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); ++} ++ ++static inline void pfm_arch_disarm_handle_work(struct task_struct *task) ++{ ++ /* ++ * we cannot just clear TIF_NOTIFY_RESUME because other TIF flags are ++ * piggybacked onto it: TIF_PERFMON_WORK, TIF_RESTORE_RSE ++ * ++ * The tsk_clear_notify_resume() checks if any of those are set before ++ * clearing the bit ++ */ ++ tsk_clear_notify_resume(task); ++} ++ ++static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) ++{ ++ return 0; ++} ++ ++extern struct pfm_ia64_pmu_info *pfm_ia64_pmu_info; ++ ++#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) ++ ++/* ++ * IA-64 does not need extra alignment requirements for the sampling buffer ++ */ ++#define PFM_ARCH_SMPL_ALIGN_SIZE 0 ++ ++ ++static inline void pfm_release_dbregs(struct task_struct *task) ++{ ++ if (task->thread.flags & IA64_THREAD_DBG_VALID) ++ __pfm_release_dbregs(task); ++} ++ ++#define pfm_use_dbregs(_t) __pfm_use_dbregs(_t) ++
++static inline int pfm_arch_get_base_syscall(void) ++{ ++ return __NR_pfm_create_context; ++} ++ ++struct pfm_arch_pmu_info { ++ unsigned long mask_pmcs[PFM_PMC_BV]; /* modify on when masking */ ++}; ++ ++DECLARE_PER_CPU(u32, pfm_syst_info); ++#else /* !CONFIG_PERFMON */ ++/* ++ * perfmon ia64-specific hooks ++ */ ++#define pfm_release_dbregs(_t) do { } while (0) ++#define pfm_use_dbregs(_t) (0) ++ ++#endif /* CONFIG_PERFMON */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _ASM_IA64_PERFMON_KERN_H_ */ +diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h +index f88fa05..9d6af9c 100644 +--- a/arch/ia64/include/asm/processor.h ++++ b/arch/ia64/include/asm/processor.h +@@ -42,7 +42,6 @@ + + #define IA64_THREAD_FPH_VALID (__IA64_UL(1) << 0) /* floating-point high state valid? */ + #define IA64_THREAD_DBG_VALID (__IA64_UL(1) << 1) /* debug registers valid? */ +-#define IA64_THREAD_PM_VALID (__IA64_UL(1) << 2) /* performance registers valid? */ + #define IA64_THREAD_UAC_NOPRINT (__IA64_UL(1) << 3) /* don't log unaligned accesses */ + #define IA64_THREAD_UAC_SIGBUS (__IA64_UL(1) << 4) /* generate SIGBUS on unaligned acc. 
*/ + #define IA64_THREAD_MIGRATION (__IA64_UL(1) << 5) /* require migration +@@ -321,14 +320,6 @@ struct thread_struct { + #else + # define INIT_THREAD_IA32 + #endif /* CONFIG_IA32_SUPPORT */ +-#ifdef CONFIG_PERFMON +- void *pfm_context; /* pointer to detailed PMU context */ +- unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */ +-# define INIT_THREAD_PM .pfm_context = NULL, \ +- .pfm_needs_checking = 0UL, +-#else +-# define INIT_THREAD_PM +-#endif + __u64 dbr[IA64_NUM_DBG_REGS]; + __u64 ibr[IA64_NUM_DBG_REGS]; + struct ia64_fpreg fph[96]; /* saved/loaded on demand */ +@@ -343,7 +334,6 @@ struct thread_struct { + .task_size = DEFAULT_TASK_SIZE, \ + .last_fph_cpu = -1, \ + INIT_THREAD_IA32 \ +- INIT_THREAD_PM \ + .dbr = {0, }, \ + .ibr = {0, }, \ + .fph = {{{{0}}}, } \ +diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h +index 927a381..ab5aeea 100644 +--- a/arch/ia64/include/asm/system.h ++++ b/arch/ia64/include/asm/system.h +@@ -217,6 +217,7 @@ struct task_struct; + extern void ia64_save_extra (struct task_struct *task); + extern void ia64_load_extra (struct task_struct *task); + ++ + #ifdef CONFIG_VIRT_CPU_ACCOUNTING + extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next); + # define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n) +@@ -224,16 +225,9 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct + # define IA64_ACCOUNT_ON_SWITCH(p,n) + #endif + +-#ifdef CONFIG_PERFMON +- DECLARE_PER_CPU(unsigned long, pfm_syst_info); +-# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) +-#else +-# define PERFMON_IS_SYSWIDE() (0) +-#endif +- +-#define IA64_HAS_EXTRA_STATE(t) \ +- ((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID) \ +- || IS_IA32_PROCESS(task_pt_regs(t)) || PERFMON_IS_SYSWIDE()) ++#define IA64_HAS_EXTRA_STATE(t) \ ++ (((t)->thread.flags & IA64_THREAD_DBG_VALID) \ ++ || 
IS_IA32_PROCESS(task_pt_regs(t))) + + #define __switch_to(prev,next,last) do { \ + IA64_ACCOUNT_ON_SWITCH(prev, next); \ +@@ -241,6 +235,10 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct + ia64_save_extra(prev); \ + if (IA64_HAS_EXTRA_STATE(next)) \ + ia64_load_extra(next); \ ++ if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \ ++ pfm_ctxsw_out(prev, next); \ ++ if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ ++ pfm_ctxsw_in(prev, next); \ + ia64_psr(task_pt_regs(next))->dfh = !ia64_is_local_fpu_owner(next); \ + (last) = ia64_switch_to((next)); \ + } while (0) +diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h +index 7c60fcd..3355332 100644 +--- a/arch/ia64/include/asm/thread_info.h ++++ b/arch/ia64/include/asm/thread_info.h +@@ -110,6 +110,8 @@ extern void tsk_clear_notify_resume(struct task_struct *tsk); + #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ + #define TIF_FREEZE 20 /* is freezing for suspend */ + #define TIF_RESTORE_RSE 21 /* user RBS is newer than kernel RBS */ ++#define TIF_PERFMON_CTXSW 22 /* perfmon needs ctxsw calls */ ++#define TIF_PERFMON_WORK 23 /* work for pfm_handle_work() */ + + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) +@@ -123,6 +125,8 @@ extern void tsk_clear_notify_resume(struct task_struct *tsk); + #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) + #define _TIF_FREEZE (1 << TIF_FREEZE) + #define _TIF_RESTORE_RSE (1 << TIF_RESTORE_RSE) ++#define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW) ++#define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK) + + /* "work to do on user-return" bits */ + #define TIF_ALLWORK_MASK (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\ +diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h +index d535833..29a43bc 100644 +--- a/arch/ia64/include/asm/unistd.h ++++ b/arch/ia64/include/asm/unistd.h +@@ -308,11 +308,23 @@ + 
#define __NR_dup3 1316 + #define __NR_pipe2 1317 + #define __NR_inotify_init1 1318 ++#define __NR_pfm_create_context 1319 ++#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) ++#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) ++#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) ++#define __NR_pfm_load_context (__NR_pfm_create_context+4) ++#define __NR_pfm_start (__NR_pfm_create_context+5) ++#define __NR_pfm_stop (__NR_pfm_create_context+6) ++#define __NR_pfm_restart (__NR_pfm_create_context+7) ++#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) ++#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) ++#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) ++#define __NR_pfm_unload_context (__NR_pfm_create_context+11) + + #ifdef __KERNEL__ + + +-#define NR_syscalls 295 /* length of syscall table */ ++#define NR_syscalls 307 /* length of syscall table */ + + /* + * The following defines stop scripts/checksyscalls.sh from complaining about +diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile +index 87fea11..b5ac54c 100644 +--- a/arch/ia64/kernel/Makefile ++++ b/arch/ia64/kernel/Makefile +@@ -5,7 +5,7 @@ + extra-y := head.o init_task.o vmlinux.lds + + obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ +- irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ ++ irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o ptrace.o sal.o \ + salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ + unwind.o mca.o mca_asm.o topology.o + +@@ -23,7 +23,6 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o + obj-$(CONFIG_MODULES) += module.o + obj-$(CONFIG_SMP) += smp.o smpboot.o + obj-$(CONFIG_NUMA) += numa.o +-obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o + obj-$(CONFIG_IA64_CYCLONE) += cyclone.o + obj-$(CONFIG_CPU_FREQ) += cpufreq/ + obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o +diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S +index 
0dd6c14..f1c3e41 100644 +--- a/arch/ia64/kernel/entry.S ++++ b/arch/ia64/kernel/entry.S +@@ -1697,6 +1697,18 @@ sys_call_table: + data8 sys_dup3 + data8 sys_pipe2 + data8 sys_inotify_init1 ++ data8 sys_pfm_create_context ++ data8 sys_pfm_write_pmcs // 1320 ++ data8 sys_pfm_write_pmds ++ data8 sys_pfm_read_pmds ++ data8 sys_pfm_load_context ++ data8 sys_pfm_start ++ data8 sys_pfm_stop // 1325 ++ data8 sys_pfm_restart ++ data8 sys_pfm_create_evtsets ++ data8 sys_pfm_getinfo_evtsets ++ data8 sys_pfm_delete_evtsets ++ data8 sys_pfm_unload_context // 1330 + + .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls + #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ +diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c +index 28d3d48..ede8024 100644 +--- a/arch/ia64/kernel/irq_ia64.c ++++ b/arch/ia64/kernel/irq_ia64.c +@@ -40,10 +40,6 @@ + #include <asm/system.h> + #include <asm/tlbflush.h> + +-#ifdef CONFIG_PERFMON +-# include <asm/perfmon.h> +-#endif +- + #define IRQ_DEBUG 0 + + #define IRQ_VECTOR_UNASSIGNED (0) +@@ -660,9 +656,6 @@ init_IRQ (void) + } + #endif + #endif +-#ifdef CONFIG_PERFMON +- pfm_init_percpu(); +-#endif + platform_irq_init(); + } + +diff --git a/arch/ia64/kernel/perfmon_default_smpl.c b/arch/ia64/kernel/perfmon_default_smpl.c +deleted file mode 100644 +index 5f637bb..0000000 +--- a/arch/ia64/kernel/perfmon_default_smpl.c ++++ /dev/null +@@ -1,296 +0,0 @@ +-/* +- * Copyright (C) 2002-2003 Hewlett-Packard Co +- * Stephane Eranian <eranian@hpl.hp.com> +- * +- * This file implements the default sampling buffer format +- * for the Linux/ia64 perfmon-2 subsystem. 
+- */ +-#include <linux/kernel.h> +-#include <linux/types.h> +-#include <linux/module.h> +-#include <linux/init.h> +-#include <asm/delay.h> +-#include <linux/smp.h> +- +-#include <asm/perfmon.h> +-#include <asm/perfmon_default_smpl.h> +- +-MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); +-MODULE_DESCRIPTION("perfmon default sampling format"); +-MODULE_LICENSE("GPL"); +- +-#define DEFAULT_DEBUG 1 +- +-#ifdef DEFAULT_DEBUG +-#define DPRINT(a) \ +- do { \ +- if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \ +- } while (0) +- +-#define DPRINT_ovfl(a) \ +- do { \ +- if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \ +- } while (0) +- +-#else +-#define DPRINT(a) +-#define DPRINT_ovfl(a) +-#endif +- +-static int +-default_validate(struct task_struct *task, unsigned int flags, int cpu, void *data) +-{ +- pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t*)data; +- int ret = 0; +- +- if (data == NULL) { +- DPRINT(("[%d] no argument passed\n", task_pid_nr(task))); +- return -EINVAL; +- } +- +- DPRINT(("[%d] validate flags=0x%x CPU%d\n", task_pid_nr(task), flags, cpu)); +- +- /* +- * must hold at least the buffer header + one minimally sized entry +- */ +- if (arg->buf_size < PFM_DEFAULT_SMPL_MIN_BUF_SIZE) return -EINVAL; +- +- DPRINT(("buf_size=%lu\n", arg->buf_size)); +- +- return ret; +-} +- +-static int +-default_get_size(struct task_struct *task, unsigned int flags, int cpu, void *data, unsigned long *size) +-{ +- pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; +- +- /* +- * size has been validated in default_validate +- */ +- *size = arg->buf_size; +- +- return 0; +-} +- +-static int +-default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *data) +-{ +- pfm_default_smpl_hdr_t *hdr; +- pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; +- +- hdr = 
(pfm_default_smpl_hdr_t *)buf; +- +- hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; +- hdr->hdr_buf_size = arg->buf_size; +- hdr->hdr_cur_offs = sizeof(*hdr); +- hdr->hdr_overflows = 0UL; +- hdr->hdr_count = 0UL; +- +- DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n", +- task_pid_nr(task), +- buf, +- hdr->hdr_buf_size, +- sizeof(*hdr), +- hdr->hdr_version, +- hdr->hdr_cur_offs)); +- +- return 0; +-} +- +-static int +-default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp) +-{ +- pfm_default_smpl_hdr_t *hdr; +- pfm_default_smpl_entry_t *ent; +- void *cur, *last; +- unsigned long *e, entry_size; +- unsigned int npmds, i; +- unsigned char ovfl_pmd; +- unsigned char ovfl_notify; +- +- if (unlikely(buf == NULL || arg == NULL|| regs == NULL || task == NULL)) { +- DPRINT(("[%d] invalid arguments buf=%p arg=%p\n", task->pid, buf, arg)); +- return -EINVAL; +- } +- +- hdr = (pfm_default_smpl_hdr_t *)buf; +- cur = buf+hdr->hdr_cur_offs; +- last = buf+hdr->hdr_buf_size; +- ovfl_pmd = arg->ovfl_pmd; +- ovfl_notify = arg->ovfl_notify; +- +- /* +- * precheck for sanity +- */ +- if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; +- +- npmds = hweight64(arg->smpl_pmds[0]); +- +- ent = (pfm_default_smpl_entry_t *)cur; +- +- prefetch(arg->smpl_pmds_values); +- +- entry_size = sizeof(*ent) + (npmds << 3); +- +- /* position for first pmd */ +- e = (unsigned long *)(ent+1); +- +- hdr->hdr_count++; +- +- DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n", +- task->pid, +- hdr->hdr_count, +- cur, last, +- last-cur, +- ovfl_pmd, +- ovfl_notify, npmds)); +- +- /* +- * current = task running at the time of the overflow. +- * +- * per-task mode: +- * - this is ususally the task being monitored. 
+- * Under certain conditions, it might be a different task +- * +- * system-wide: +- * - this is not necessarily the task controlling the session +- */ +- ent->pid = current->pid; +- ent->ovfl_pmd = ovfl_pmd; +- ent->last_reset_val = arg->pmd_last_reset; //pmd[0].reg_last_reset_val; +- +- /* +- * where did the fault happen (includes slot number) +- */ +- ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3); +- +- ent->tstamp = stamp; +- ent->cpu = smp_processor_id(); +- ent->set = arg->active_set; +- ent->tgid = current->tgid; +- +- /* +- * selectively store PMDs in increasing index number +- */ +- if (npmds) { +- unsigned long *val = arg->smpl_pmds_values; +- for(i=0; i < npmds; i++) { +- *e++ = *val++; +- } +- } +- +- /* +- * update position for next entry +- */ +- hdr->hdr_cur_offs += entry_size; +- cur += entry_size; +- +- /* +- * post check to avoid losing the last sample +- */ +- if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; +- +- /* +- * keep same ovfl_pmds, ovfl_notify +- */ +- arg->ovfl_ctrl.bits.notify_user = 0; +- arg->ovfl_ctrl.bits.block_task = 0; +- arg->ovfl_ctrl.bits.mask_monitoring = 0; +- arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */ +- +- return 0; +-full: +- DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n", last-cur, hdr->hdr_count, ovfl_notify)); +- +- /* +- * increment number of buffer overflow. +- * important to detect duplicate set of samples. 
+- */ +- hdr->hdr_overflows++; +- +- /* +- * if no notification requested, then we saturate the buffer +- */ +- if (ovfl_notify == 0) { +- arg->ovfl_ctrl.bits.notify_user = 0; +- arg->ovfl_ctrl.bits.block_task = 0; +- arg->ovfl_ctrl.bits.mask_monitoring = 1; +- arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; +- } else { +- arg->ovfl_ctrl.bits.notify_user = 1; +- arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */ +- arg->ovfl_ctrl.bits.mask_monitoring = 1; +- arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */ +- } +- return -1; /* we are full, sorry */ +-} +- +-static int +-default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) +-{ +- pfm_default_smpl_hdr_t *hdr; +- +- hdr = (pfm_default_smpl_hdr_t *)buf; +- +- hdr->hdr_count = 0UL; +- hdr->hdr_cur_offs = sizeof(*hdr); +- +- ctrl->bits.mask_monitoring = 0; +- ctrl->bits.reset_ovfl_pmds = 1; /* uses long-reset values */ +- +- return 0; +-} +- +-static int +-default_exit(struct task_struct *task, void *buf, struct pt_regs *regs) +-{ +- DPRINT(("[%d] exit(%p)\n", task_pid_nr(task), buf)); +- return 0; +-} +- +-static pfm_buffer_fmt_t default_fmt={ +- .fmt_name = "default_format", +- .fmt_uuid = PFM_DEFAULT_SMPL_UUID, +- .fmt_arg_size = sizeof(pfm_default_smpl_arg_t), +- .fmt_validate = default_validate, +- .fmt_getsize = default_get_size, +- .fmt_init = default_init, +- .fmt_handler = default_handler, +- .fmt_restart = default_restart, +- .fmt_restart_active = default_restart, +- .fmt_exit = default_exit, +-}; +- +-static int __init +-pfm_default_smpl_init_module(void) +-{ +- int ret; +- +- ret = pfm_register_buffer_fmt(&default_fmt); +- if (ret == 0) { +- printk("perfmon_default_smpl: %s v%u.%u registered\n", +- default_fmt.fmt_name, +- PFM_DEFAULT_SMPL_VERSION_MAJ, +- PFM_DEFAULT_SMPL_VERSION_MIN); +- } else { +- printk("perfmon_default_smpl: %s cannot register ret=%d\n", +- default_fmt.fmt_name, +- ret); +- } +- +- return ret; +-} +- +-static void 
__exit +-pfm_default_smpl_cleanup_module(void) +-{ +- int ret; +- ret = pfm_unregister_buffer_fmt(default_fmt.fmt_uuid); +- +- printk("perfmon_default_smpl: unregister %s=%d\n", default_fmt.fmt_name, ret); +-} +- +-module_init(pfm_default_smpl_init_module); +-module_exit(pfm_default_smpl_cleanup_module); +- +diff --git a/arch/ia64/kernel/perfmon_generic.h b/arch/ia64/kernel/perfmon_generic.h +deleted file mode 100644 +index 6748947..0000000 +--- a/arch/ia64/kernel/perfmon_generic.h ++++ /dev/null +@@ -1,45 +0,0 @@ +-/* +- * This file contains the generic PMU register description tables +- * and pmc checker used by perfmon.c. +- * +- * Copyright (C) 2002-2003 Hewlett Packard Co +- * Stephane Eranian <eranian@hpl.hp.com> +- */ +- +-static pfm_reg_desc_t pfm_gen_pmc_desc[PMU_MAX_PMCS]={ +-/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +-}; +- +-static pfm_reg_desc_t pfm_gen_pmd_desc[PMU_MAX_PMDS]={ +-/* pmd0 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, +-/* pmd1 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, +-/* pmd2 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, 
{0,}}, +-/* pmd3 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, +-/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, +-/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, +-/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, +-/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, +- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +-}; +- +-/* +- * impl_pmcs, impl_pmds are computed at runtime to minimize errors! +- */ +-static pmu_config_t pmu_conf_gen={ +- .pmu_name = "Generic", +- .pmu_family = 0xff, /* any */ +- .ovfl_val = (1UL << 32) - 1, +- .num_ibrs = 0, /* does not use */ +- .num_dbrs = 0, /* does not use */ +- .pmd_desc = pfm_gen_pmd_desc, +- .pmc_desc = pfm_gen_pmc_desc +-}; +- +diff --git a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h +deleted file mode 100644 +index d1d508a..0000000 +--- a/arch/ia64/kernel/perfmon_itanium.h ++++ /dev/null +@@ -1,115 +0,0 @@ +-/* +- * This file contains the Itanium PMU register description tables +- * and pmc checker used by perfmon.c. 
+- * +- * Copyright (C) 2002-2003 Hewlett Packard Co +- * Stephane Eranian <eranian@hpl.hp.com> +- */ +-static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); +- +-static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={ +-/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc8 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc9 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc13 */ { PFM_REG_CONFIG , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +- { PFM_REG_END , 0, 0x0UL, 
-1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +-}; +- +-static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={ +-/* pmd0 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, +-/* pmd1 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, +-/* pmd2 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +-/* pmd3 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +-/* pmd4 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, +-/* pmd5 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, +-/* pmd6 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, +-/* pmd7 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, +-/* pmd8 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd9 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd10 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd11 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd12 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd13 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 
0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd14 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd15 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd16 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd17 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +- { PFM_REG_END , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +-}; +- +-static int +-pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) +-{ +- int ret; +- int is_loaded; +- +- /* sanitfy check */ +- if (ctx == NULL) return -EINVAL; +- +- is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; +- +- /* +- * we must clear the (instruction) debug registers if pmc13.ta bit is cleared +- * before they are written (fl_using_dbreg==0) to avoid picking up stale information. +- */ +- if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) { +- +- DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val)); +- +- /* don't mix debug with perfmon */ +- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; +- +- /* +- * a count of 0 will mark the debug registers as in use and also +- * ensure that they are properly cleared. +- */ +- ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs); +- if (ret) return ret; +- } +- +- /* +- * we must clear the (data) debug registers if pmc11.pt bit is cleared +- * before they are written (fl_using_dbreg==0) to avoid picking up stale information. 
+- */ +- if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) { +- +- DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val)); +- +- /* don't mix debug with perfmon */ +- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; +- +- /* +- * a count of 0 will mark the debug registers as in use and also +- * ensure that they are properly cleared. +- */ +- ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs); +- if (ret) return ret; +- } +- return 0; +-} +- +-/* +- * impl_pmcs, impl_pmds are computed at runtime to minimize errors! +- */ +-static pmu_config_t pmu_conf_ita={ +- .pmu_name = "Itanium", +- .pmu_family = 0x7, +- .ovfl_val = (1UL << 32) - 1, +- .pmd_desc = pfm_ita_pmd_desc, +- .pmc_desc = pfm_ita_pmc_desc, +- .num_ibrs = 8, +- .num_dbrs = 8, +- .use_rr_dbregs = 1, /* debug register are use for range retrictions */ +-}; +- +- +diff --git a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h +deleted file mode 100644 +index c4bec7a..0000000 +--- a/arch/ia64/kernel/perfmon_mckinley.h ++++ /dev/null +@@ -1,187 +0,0 @@ +-/* +- * This file contains the McKinley PMU register description tables +- * and pmc checker used by perfmon.c. 
+- * +- * Copyright (C) 2002-2003 Hewlett Packard Co +- * Stephane Eranian <eranian@hpl.hp.com> +- */ +-static int pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); +- +-static pfm_reg_desc_t pfm_mck_pmc_desc[PMU_MAX_PMCS]={ +-/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0000000000800000UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc8 */ { PFM_REG_CONFIG , 0, 0xffffffff3fffffffUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc9 */ { PFM_REG_CONFIG , 0, 0xffffffff3ffffffcUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc10 */ { PFM_REG_MONITOR , 4, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0UL, 0x30f01cf, NULL, pfm_mck_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, 
{0UL,0UL, 0UL, 0UL}}, +-/* pmc13 */ { PFM_REG_CONFIG , 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc14 */ { PFM_REG_CONFIG , 0, 0x0db60db60db60db6UL, 0x2492UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +-/* pmc15 */ { PFM_REG_CONFIG , 0, 0x00000000fffffff0UL, 0xfUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, +- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +-}; +- +-static pfm_reg_desc_t pfm_mck_pmd_desc[PMU_MAX_PMDS]={ +-/* pmd0 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, +-/* pmd1 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, +-/* pmd2 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +-/* pmd3 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +-/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, +-/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, +-/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, +-/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, +-/* pmd8 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd9 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd10 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 
0UL}}, +-/* pmd11 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd12 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd13 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd14 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd15 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd16 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, +-/* pmd17 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, +- { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ +-}; +- +-/* +- * PMC reserved fields must have their power-up values preserved +- */ +-static int +-pfm_mck_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) +-{ +- unsigned long tmp1, tmp2, ival = *val; +- +- /* remove reserved areas from user value */ +- tmp1 = ival & PMC_RSVD_MASK(cnum); +- +- /* get reserved fields values */ +- tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); +- +- *val = tmp1 | tmp2; +- +- DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", +- cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); +- return 0; +-} +- +-/* +- * task can be NULL if the context is unloaded +- */ +-static int +-pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, 
unsigned long *val, struct pt_regs *regs) +-{ +- int ret = 0, check_case1 = 0; +- unsigned long val8 = 0, val14 = 0, val13 = 0; +- int is_loaded; +- +- /* first preserve the reserved fields */ +- pfm_mck_reserved(cnum, val, regs); +- +- /* sanitfy check */ +- if (ctx == NULL) return -EINVAL; +- +- is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; +- +- /* +- * we must clear the debug registers if pmc13 has a value which enable +- * memory pipeline event constraints. In this case we need to clear the +- * the debug registers if they have not yet been accessed. This is required +- * to avoid picking stale state. +- * PMC13 is "active" if: +- * one of the pmc13.cfg_dbrpXX field is different from 0x3 +- * AND +- * at the corresponding pmc13.ena_dbrpXX is set. +- */ +- DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, *val, ctx->ctx_fl_using_dbreg, is_loaded)); +- +- if (cnum == 13 && is_loaded +- && (*val & 0x1e00000000000UL) && (*val & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { +- +- DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val)); +- +- /* don't mix debug with perfmon */ +- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; +- +- /* +- * a count of 0 will mark the debug registers as in use and also +- * ensure that they are properly cleared. +- */ +- ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); +- if (ret) return ret; +- } +- /* +- * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled +- * before they are (fl_using_dbreg==0) to avoid picking up stale information. 
+- */ +- if (cnum == 14 && is_loaded && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) { +- +- DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val)); +- +- /* don't mix debug with perfmon */ +- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; +- +- /* +- * a count of 0 will mark the debug registers as in use and also +- * ensure that they are properly cleared. +- */ +- ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); +- if (ret) return ret; +- +- } +- +- switch(cnum) { +- case 4: *val |= 1UL << 23; /* force power enable bit */ +- break; +- case 8: val8 = *val; +- val13 = ctx->ctx_pmcs[13]; +- val14 = ctx->ctx_pmcs[14]; +- check_case1 = 1; +- break; +- case 13: val8 = ctx->ctx_pmcs[8]; +- val13 = *val; +- val14 = ctx->ctx_pmcs[14]; +- check_case1 = 1; +- break; +- case 14: val8 = ctx->ctx_pmcs[8]; +- val13 = ctx->ctx_pmcs[13]; +- val14 = *val; +- check_case1 = 1; +- break; +- } +- /* check illegal configuration which can produce inconsistencies in tagging +- * i-side events in L1D and L2 caches +- */ +- if (check_case1) { +- ret = ((val13 >> 45) & 0xf) == 0 +- && ((val8 & 0x1) == 0) +- && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) +- ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); +- +- if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n")); +- } +- +- return ret ? -EINVAL : 0; +-} +- +-/* +- * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
+- */ +-static pmu_config_t pmu_conf_mck={ +- .pmu_name = "Itanium 2", +- .pmu_family = 0x1f, +- .flags = PFM_PMU_IRQ_RESEND, +- .ovfl_val = (1UL << 47) - 1, +- .pmd_desc = pfm_mck_pmd_desc, +- .pmc_desc = pfm_mck_pmc_desc, +- .num_ibrs = 8, +- .num_dbrs = 8, +- .use_rr_dbregs = 1 /* debug register are use for range restrictions */ +-}; +- +- +diff --git a/arch/ia64/kernel/perfmon_montecito.h b/arch/ia64/kernel/perfmon_montecito.h +deleted file mode 100644 +index 7f8da4c..0000000 +--- a/arch/ia64/kernel/perfmon_montecito.h ++++ /dev/null +@@ -1,269 +0,0 @@ +-/* +- * This file contains the Montecito PMU register description tables +- * and pmc checker used by perfmon.c. +- * +- * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. +- * Contributed by Stephane Eranian <eranian@hpl.hp.com> +- */ +-static int pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); +- +-#define RDEP_MONT_ETB (RDEP(38)|RDEP(39)|RDEP(48)|RDEP(49)|RDEP(50)|RDEP(51)|RDEP(52)|RDEP(53)|RDEP(54)|\ +- RDEP(55)|RDEP(56)|RDEP(57)|RDEP(58)|RDEP(59)|RDEP(60)|RDEP(61)|RDEP(62)|RDEP(63)) +-#define RDEP_MONT_DEAR (RDEP(32)|RDEP(33)|RDEP(36)) +-#define RDEP_MONT_IEAR (RDEP(34)|RDEP(35)) +- +-static pfm_reg_desc_t pfm_mont_pmc_desc[PMU_MAX_PMCS]={ +-/* pmc0 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc4 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(4),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc5 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(5),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc6 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(6),0, 0, 0}, {0,0, 0, 
0}}, +-/* pmc7 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(7),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc8 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(8),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc9 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(9),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc10 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(10),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc11 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(11),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc12 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(12),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc13 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(13),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc14 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(14),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc15 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(15),0, 0, 0}, {0,0, 0, 0}}, +-/* pmc16 */ { PFM_REG_NOTIMPL, }, +-/* pmc17 */ { PFM_REG_NOTIMPL, }, +-/* pmc18 */ { PFM_REG_NOTIMPL, }, +-/* pmc19 */ { PFM_REG_NOTIMPL, }, +-/* pmc20 */ { PFM_REG_NOTIMPL, }, +-/* pmc21 */ { PFM_REG_NOTIMPL, }, +-/* pmc22 */ { PFM_REG_NOTIMPL, }, +-/* pmc23 */ { PFM_REG_NOTIMPL, }, +-/* pmc24 */ { PFM_REG_NOTIMPL, }, +-/* pmc25 */ { PFM_REG_NOTIMPL, }, +-/* pmc26 */ { PFM_REG_NOTIMPL, }, +-/* pmc27 */ { PFM_REG_NOTIMPL, }, +-/* pmc28 */ { PFM_REG_NOTIMPL, }, +-/* pmc29 */ { PFM_REG_NOTIMPL, }, +-/* pmc30 */ { PFM_REG_NOTIMPL, }, +-/* pmc31 */ { PFM_REG_NOTIMPL, }, +-/* pmc32 */ { PFM_REG_CONFIG, 0, 0x30f01ffffffffffUL, 0x30f01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc33 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc34 */ { PFM_REG_CONFIG, 0, 0xf01ffffffffffUL, 0xf01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 
0, 0}, {0,0, 0, 0}}, +-/* pmc35 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc36 */ { PFM_REG_CONFIG, 0, 0xfffffff0, 0xf, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc37 */ { PFM_REG_MONITOR, 4, 0x0, 0x3fff, NULL, pfm_mont_pmc_check, {RDEP_MONT_IEAR, 0, 0, 0}, {0, 0, 0, 0}}, +-/* pmc38 */ { PFM_REG_CONFIG, 0, 0xdb6, 0x2492, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc39 */ { PFM_REG_MONITOR, 6, 0x0, 0xffcf, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc40 */ { PFM_REG_MONITOR, 6, 0x2000000, 0xf01cf, NULL, pfm_mont_pmc_check, {RDEP_MONT_DEAR,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc41 */ { PFM_REG_CONFIG, 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, +-/* pmc42 */ { PFM_REG_MONITOR, 6, 0x0, 0x7ff4f, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, +- { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ +-}; +- +-static pfm_reg_desc_t pfm_mont_pmd_desc[PMU_MAX_PMDS]={ +-/* pmd0 */ { PFM_REG_NOTIMPL, }, +-/* pmd1 */ { PFM_REG_NOTIMPL, }, +-/* pmd2 */ { PFM_REG_NOTIMPL, }, +-/* pmd3 */ { PFM_REG_NOTIMPL, }, +-/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(4),0, 0, 0}}, +-/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(5),0, 0, 0}}, +-/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(6),0, 0, 0}}, +-/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(7),0, 0, 0}}, +-/* pmd8 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(8),0, 0, 0}}, +-/* pmd9 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(9),0, 0, 0}}, +-/* pmd10 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(10),0, 0, 0}}, +-/* pmd11 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(11),0, 0, 0}}, +-/* pmd12 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, 
{0,0, 0, 0}, {RDEP(12),0, 0, 0}}, +-/* pmd13 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(13),0, 0, 0}}, +-/* pmd14 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(14),0, 0, 0}}, +-/* pmd15 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(15),0, 0, 0}}, +-/* pmd16 */ { PFM_REG_NOTIMPL, }, +-/* pmd17 */ { PFM_REG_NOTIMPL, }, +-/* pmd18 */ { PFM_REG_NOTIMPL, }, +-/* pmd19 */ { PFM_REG_NOTIMPL, }, +-/* pmd20 */ { PFM_REG_NOTIMPL, }, +-/* pmd21 */ { PFM_REG_NOTIMPL, }, +-/* pmd22 */ { PFM_REG_NOTIMPL, }, +-/* pmd23 */ { PFM_REG_NOTIMPL, }, +-/* pmd24 */ { PFM_REG_NOTIMPL, }, +-/* pmd25 */ { PFM_REG_NOTIMPL, }, +-/* pmd26 */ { PFM_REG_NOTIMPL, }, +-/* pmd27 */ { PFM_REG_NOTIMPL, }, +-/* pmd28 */ { PFM_REG_NOTIMPL, }, +-/* pmd29 */ { PFM_REG_NOTIMPL, }, +-/* pmd30 */ { PFM_REG_NOTIMPL, }, +-/* pmd31 */ { PFM_REG_NOTIMPL, }, +-/* pmd32 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(33)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, +-/* pmd33 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, +-/* pmd34 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(35),0, 0, 0}, {RDEP(37),0, 0, 0}}, +-/* pmd35 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(34),0, 0, 0}, {RDEP(37),0, 0, 0}}, +-/* pmd36 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(33),0, 0, 0}, {RDEP(40),0, 0, 0}}, +-/* pmd37 */ { PFM_REG_NOTIMPL, }, +-/* pmd38 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd39 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd40 */ { PFM_REG_NOTIMPL, }, +-/* pmd41 */ { PFM_REG_NOTIMPL, }, +-/* pmd42 */ { PFM_REG_NOTIMPL, }, +-/* pmd43 */ { PFM_REG_NOTIMPL, }, +-/* pmd44 */ { PFM_REG_NOTIMPL, }, +-/* pmd45 */ { PFM_REG_NOTIMPL, }, +-/* pmd46 */ { PFM_REG_NOTIMPL, }, +-/* pmd47 */ { PFM_REG_NOTIMPL, }, +-/* pmd48 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 
0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd49 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd50 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd51 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd52 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd53 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd54 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd55 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd56 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd57 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd58 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd59 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd60 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd61 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd62 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +-/* pmd63 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, +- { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ +-}; +- +-/* +- * PMC reserved fields must have their power-up values preserved +- */ +-static int +-pfm_mont_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) +-{ +- unsigned long tmp1, tmp2, ival = *val; +- +- /* remove reserved areas from user value */ +- tmp1 = ival & PMC_RSVD_MASK(cnum); +- +- /* get reserved fields values */ +- tmp2 = 
PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); +- +- *val = tmp1 | tmp2; +- +- DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", +- cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); +- return 0; +-} +- +-/* +- * task can be NULL if the context is unloaded +- */ +-static int +-pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) +-{ +- int ret = 0; +- unsigned long val32 = 0, val38 = 0, val41 = 0; +- unsigned long tmpval; +- int check_case1 = 0; +- int is_loaded; +- +- /* first preserve the reserved fields */ +- pfm_mont_reserved(cnum, val, regs); +- +- tmpval = *val; +- +- /* sanity check */ +- if (ctx == NULL) return -EINVAL; +- +- is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; +- +- /* +- * we must clear the debug registers if pmc41 has a value which enable +- * memory pipeline event constraints. In this case we need to clear the +- * the debug registers if they have not yet been accessed. This is required +- * to avoid picking stale state. +- * PMC41 is "active" if: +- * one of the pmc41.cfg_dtagXX field is different from 0x3 +- * AND +- * at the corresponding pmc41.en_dbrpXX is set. 
+- * AND +- * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) +- */ +- DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, tmpval, ctx->ctx_fl_using_dbreg, is_loaded)); +- +- if (cnum == 41 && is_loaded +- && (tmpval & 0x1e00000000000UL) && (tmpval & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { +- +- DPRINT(("pmc[%d]=0x%lx has active pmc41 settings, clearing dbr\n", cnum, tmpval)); +- +- /* don't mix debug with perfmon */ +- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; +- +- /* +- * a count of 0 will mark the debug registers if: +- * AND +- */ +- ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); +- if (ret) return ret; +- } +- /* +- * we must clear the (instruction) debug registers if: +- * pmc38.ig_ibrpX is 0 (enabled) +- * AND +- * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) +- */ +- if (cnum == 38 && is_loaded && ((tmpval & 0x492UL) != 0x492UL) && ctx->ctx_fl_using_dbreg == 0) { +- +- DPRINT(("pmc38=0x%lx has active pmc38 settings, clearing ibr\n", tmpval)); +- +- /* don't mix debug with perfmon */ +- if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; +- +- /* +- * a count of 0 will mark the debug registers as in use and also +- * ensure that they are properly cleared. 
+- */ +- ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); +- if (ret) return ret; +- +- } +- switch(cnum) { +- case 32: val32 = *val; +- val38 = ctx->ctx_pmcs[38]; +- val41 = ctx->ctx_pmcs[41]; +- check_case1 = 1; +- break; +- case 38: val38 = *val; +- val32 = ctx->ctx_pmcs[32]; +- val41 = ctx->ctx_pmcs[41]; +- check_case1 = 1; +- break; +- case 41: val41 = *val; +- val32 = ctx->ctx_pmcs[32]; +- val38 = ctx->ctx_pmcs[38]; +- check_case1 = 1; +- break; +- } +- /* check illegal configuration which can produce inconsistencies in tagging +- * i-side events in L1D and L2 caches +- */ +- if (check_case1) { +- ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) +- && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) +- || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); +- if (ret) { +- DPRINT(("invalid config pmc38=0x%lx pmc41=0x%lx pmc32=0x%lx\n", val38, val41, val32)); +- return -EINVAL; +- } +- } +- *val = tmpval; +- return 0; +-} +- +-/* +- * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
+- */ +-static pmu_config_t pmu_conf_mont={ +- .pmu_name = "Montecito", +- .pmu_family = 0x20, +- .flags = PFM_PMU_IRQ_RESEND, +- .ovfl_val = (1UL << 47) - 1, +- .pmd_desc = pfm_mont_pmd_desc, +- .pmc_desc = pfm_mont_pmc_desc, +- .num_ibrs = 8, +- .num_dbrs = 8, +- .use_rr_dbregs = 1 /* debug register are use for range retrictions */ +-}; +diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c +index 3ab8373..a7dfb39 100644 +--- a/arch/ia64/kernel/process.c ++++ b/arch/ia64/kernel/process.c +@@ -28,6 +28,7 @@ + #include <linux/delay.h> + #include <linux/kdebug.h> + #include <linux/utsname.h> ++#include <linux/perfmon_kern.h> + + #include <asm/cpu.h> + #include <asm/delay.h> +@@ -45,10 +46,6 @@ + + #include "entry.h" + +-#ifdef CONFIG_PERFMON +-# include <asm/perfmon.h> +-#endif +- + #include "sigframe.h" + + void (*ia64_mark_idle)(int); +@@ -162,10 +159,8 @@ show_regs (struct pt_regs *regs) + + void tsk_clear_notify_resume(struct task_struct *tsk) + { +-#ifdef CONFIG_PERFMON +- if (tsk->thread.pfm_needs_checking) ++ if (test_ti_thread_flag(task_thread_info(tsk), TIF_PERFMON_WORK)) + return; +-#endif + if (test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_RSE)) + return; + clear_ti_thread_flag(task_thread_info(tsk), TIF_NOTIFY_RESUME); +@@ -188,14 +183,9 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) + return; + } + +-#ifdef CONFIG_PERFMON +- if (current->thread.pfm_needs_checking) +- /* +- * Note: pfm_handle_work() allow us to call it with interrupts +- * disabled, and may enable interrupts within the function. +- */ +- pfm_handle_work(); +-#endif ++ /* process perfmon asynchronous work (e.g. 
block thread or reset) */ ++ if (test_thread_flag(TIF_PERFMON_WORK)) ++ pfm_handle_work(task_pt_regs(current)); + + /* deal with pending signal delivery */ + if (test_thread_flag(TIF_SIGPENDING)) { +@@ -212,22 +202,15 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) + local_irq_disable(); /* force interrupt disable */ + } + +-static int pal_halt = 1; + static int can_do_pal_halt = 1; + + static int __init nohalt_setup(char * str) + { +- pal_halt = can_do_pal_halt = 0; ++ can_do_pal_halt = 0; + return 1; + } + __setup("nohalt", nohalt_setup); + +-void +-update_pal_halt_status(int status) +-{ +- can_do_pal_halt = pal_halt && status; +-} +- + /* + * We use this if we don't have any better idle routine.. + */ +@@ -236,6 +219,22 @@ default_idle (void) + { + local_irq_enable(); + while (!need_resched()) { ++#ifdef CONFIG_PERFMON ++ u64 psr = 0; ++ /* ++ * If requested, we stop the PMU to avoid ++ * measuring across the core idle loop. ++ * ++ * dcr.pp is not modified on purpose ++ * it is used when coming out of ++ * safe_halt() via interrupt ++ */ ++ if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) { ++ psr = ia64_getreg(_IA64_REG_PSR); ++ if (psr & IA64_PSR_PP) ++ ia64_rsm(IA64_PSR_PP); ++ } ++#endif + if (can_do_pal_halt) { + local_irq_disable(); + if (!need_resched()) { +@@ -244,6 +243,12 @@ default_idle (void) + local_irq_enable(); + } else + cpu_relax(); ++#ifdef CONFIG_PERFMON ++ if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) { ++ if (psr & IA64_PSR_PP) ++ ia64_ssm(IA64_PSR_PP); ++ } ++#endif + } + } + +@@ -344,22 +349,9 @@ cpu_idle (void) + void + ia64_save_extra (struct task_struct *task) + { +-#ifdef CONFIG_PERFMON +- unsigned long info; +-#endif +- + if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) + ia64_save_debug_regs(&task->thread.dbr[0]); + +-#ifdef CONFIG_PERFMON +- if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) +- pfm_save_regs(task); +- +- info = 
__get_cpu_var(pfm_syst_info); +- if (info & PFM_CPUINFO_SYST_WIDE) +- pfm_syst_wide_update_task(task, info, 0); +-#endif +- + #ifdef CONFIG_IA32_SUPPORT + if (IS_IA32_PROCESS(task_pt_regs(task))) + ia32_save_state(task); +@@ -369,22 +361,9 @@ ia64_save_extra (struct task_struct *task) + void + ia64_load_extra (struct task_struct *task) + { +-#ifdef CONFIG_PERFMON +- unsigned long info; +-#endif +- + if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) + ia64_load_debug_regs(&task->thread.dbr[0]); + +-#ifdef CONFIG_PERFMON +- if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) +- pfm_load_regs(task); +- +- info = __get_cpu_var(pfm_syst_info); +- if (info & PFM_CPUINFO_SYST_WIDE) +- pfm_syst_wide_update_task(task, info, 1); +-#endif +- + #ifdef CONFIG_IA32_SUPPORT + if (IS_IA32_PROCESS(task_pt_regs(task))) + ia32_load_state(task); +@@ -510,8 +489,7 @@ copy_thread (int nr, unsigned long clone_flags, + * call behavior where scratch registers are preserved across + * system calls (unless used by the system call itself). 
+ */ +-# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ +- | IA64_THREAD_PM_VALID) ++# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID) + # define THREAD_FLAGS_TO_SET 0 + p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) + | THREAD_FLAGS_TO_SET); +@@ -533,10 +511,8 @@ copy_thread (int nr, unsigned long clone_flags, + } + #endif + +-#ifdef CONFIG_PERFMON +- if (current->thread.pfm_context) +- pfm_inherit(p, child_ptregs); +-#endif ++ pfm_copy_thread(p); ++ + return retval; + } + +@@ -745,15 +721,13 @@ exit_thread (void) + { + + ia64_drop_fpu(current); +-#ifdef CONFIG_PERFMON +- /* if needed, stop monitoring and flush state to perfmon context */ +- if (current->thread.pfm_context) +- pfm_exit_thread(current); ++ ++ /* if needed, stop monitoring and flush state to perfmon context */ ++ pfm_exit_thread(); + + /* free debug register resources */ +- if (current->thread.flags & IA64_THREAD_DBG_VALID) +- pfm_release_debug_registers(current); +-#endif ++ pfm_release_dbregs(current); ++ + if (IS_IA32_PROCESS(task_pt_regs(current))) + ia32_drop_ia64_partial_page_list(current); + } +diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c +index 2a9943b..bb1ca1e 100644 +--- a/arch/ia64/kernel/ptrace.c ++++ b/arch/ia64/kernel/ptrace.c +@@ -20,6 +20,7 @@ + #include <linux/security.h> + #include <linux/audit.h> + #include <linux/signal.h> ++#include <linux/perfmon_kern.h> + #include <linux/regset.h> + #include <linux/elf.h> + +@@ -30,9 +31,6 @@ + #include <asm/system.h> + #include <asm/uaccess.h> + #include <asm/unwind.h> +-#ifdef CONFIG_PERFMON +-#include <asm/perfmon.h> +-#endif + + #include "entry.h" + +@@ -2124,7 +2122,6 @@ access_uarea(struct task_struct *child, unsigned long addr, + "address 0x%lx\n", addr); + return -1; + } +-#ifdef CONFIG_PERFMON + /* + * Check if debug registers are used by perfmon. 
This + * test must be done once we know that we can do the +@@ -2142,9 +2139,8 @@ access_uarea(struct task_struct *child, unsigned long addr, + * IA64_THREAD_DBG_VALID. The registers are restored + * by the PMU context switch code. + */ +- if (pfm_use_debug_registers(child)) ++ if (pfm_use_dbregs(child)) + return -1; +-#endif + + if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { + child->thread.flags |= IA64_THREAD_DBG_VALID; +diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c +index de636b2..677fa68 100644 +--- a/arch/ia64/kernel/setup.c ++++ b/arch/ia64/kernel/setup.c +@@ -45,6 +45,7 @@ + #include <linux/cpufreq.h> + #include <linux/kexec.h> + #include <linux/crash_dump.h> ++#include <linux/perfmon_kern.h> + + #include <asm/ia32.h> + #include <asm/machvec.h> +@@ -1051,6 +1052,8 @@ cpu_init (void) + } + platform_cpu_init(); + pm_idle = default_idle; ++ ++ pfm_init_percpu(); + } + + void __init +diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c +index d8f05e5..3d7a739 100644 +--- a/arch/ia64/kernel/smpboot.c ++++ b/arch/ia64/kernel/smpboot.c +@@ -39,6 +39,7 @@ + #include <linux/efi.h> + #include <linux/percpu.h> + #include <linux/bitops.h> ++#include <linux/perfmon_kern.h> + + #include <asm/atomic.h> + #include <asm/cache.h> +@@ -381,10 +382,6 @@ smp_callin (void) + extern void ia64_init_itm(void); + extern volatile int time_keeper_id; + +-#ifdef CONFIG_PERFMON +- extern void pfm_init_percpu(void); +-#endif +- + cpuid = smp_processor_id(); + phys_id = hard_smp_processor_id(); + itc_master = time_keeper_id; +@@ -410,10 +407,6 @@ smp_callin (void) + + ia64_mca_cmc_vector_setup(); /* Setup vector on AP */ + +-#ifdef CONFIG_PERFMON +- pfm_init_percpu(); +-#endif +- + local_irq_enable(); + + if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { +@@ -751,6 +744,7 @@ int __cpu_disable(void) + cpu_clear(cpu, cpu_online_map); + local_flush_tlb_all(); + cpu_clear(cpu, cpu_callin_map); ++ pfm_cpu_disable(); + return 0; + } 
+ +diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c +index bcbb6d8..a0ed33a 100644 +--- a/arch/ia64/kernel/sys_ia64.c ++++ b/arch/ia64/kernel/sys_ia64.c +@@ -284,3 +284,11 @@ sys_pciconfig_write (unsigned long bus, unsigned long dfn, unsigned long off, un + } + + #endif /* CONFIG_PCI */ ++ ++#ifndef CONFIG_IA64_PERFMON_COMPAT ++asmlinkage long ++sys_perfmonctl (int fd, int cmd, void __user *arg, int count) ++{ ++ return -ENOSYS; ++} ++#endif +diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile +index 98771e2..077fd09 100644 +--- a/arch/ia64/lib/Makefile ++++ b/arch/ia64/lib/Makefile +@@ -13,7 +13,6 @@ lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ + + obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o + obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o +-lib-$(CONFIG_PERFMON) += carta_random.o + + AFLAGS___divdi3.o = + AFLAGS___udivdi3.o = -DUNSIGNED +diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c +index 125a602..892de6a 100644 +--- a/arch/ia64/oprofile/init.c ++++ b/arch/ia64/oprofile/init.c +@@ -12,8 +12,8 @@ + #include <linux/init.h> + #include <linux/errno.h> + +-extern int perfmon_init(struct oprofile_operations * ops); +-extern void perfmon_exit(void); ++extern int op_perfmon_init(struct oprofile_operations * ops); ++extern void op_perfmon_exit(void); + extern void ia64_backtrace(struct pt_regs * const regs, unsigned int depth); + + int __init oprofile_arch_init(struct oprofile_operations * ops) +@@ -22,7 +22,7 @@ int __init oprofile_arch_init(struct oprofile_operations * ops) + + #ifdef CONFIG_PERFMON + /* perfmon_init() can fail, but we have no way to report it */ +- ret = perfmon_init(ops); ++ ret = op_perfmon_init(ops); + #endif + ops->backtrace = ia64_backtrace; + +@@ -33,6 +33,6 @@ int __init oprofile_arch_init(struct oprofile_operations * ops) + void oprofile_arch_exit(void) + { + #ifdef CONFIG_PERFMON +- perfmon_exit(); ++ op_perfmon_exit(); + #endif + } +diff --git 
a/arch/ia64/oprofile/perfmon.c b/arch/ia64/oprofile/perfmon.c +index bc41dd3..6fa9d17 100644 +--- a/arch/ia64/oprofile/perfmon.c ++++ b/arch/ia64/oprofile/perfmon.c +@@ -10,25 +10,30 @@ + #include <linux/kernel.h> + #include <linux/oprofile.h> + #include <linux/sched.h> +-#include <asm/perfmon.h> ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> + #include <asm/ptrace.h> + #include <asm/errno.h> + + static int allow_ints; + + static int +-perfmon_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, +- struct pt_regs *regs, unsigned long stamp) ++perfmon_handler(struct pfm_context *ctx, ++ unsigned long ip, u64 stamp, void *data) + { +- int event = arg->pmd_eventid; ++ struct pt_regs *regs; ++ struct pfm_ovfl_arg *arg; ++ ++ regs = data; ++ arg = &ctx->ovfl_arg; + +- arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; ++ arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + /* the owner of the oprofile event buffer may have exited + * without perfmon being shutdown (e.g. SIGSEGV) + */ + if (allow_ints) +- oprofile_add_sample(regs, event); ++ oprofile_add_sample(regs, arg->pmd_eventid); + return 0; + } + +@@ -45,17 +50,13 @@ static void perfmon_stop(void) + allow_ints = 0; + } + +- +-#define OPROFILE_FMT_UUID { \ +- 0x77, 0x7a, 0x6e, 0x61, 0x20, 0x65, 0x73, 0x69, 0x74, 0x6e, 0x72, 0x20, 0x61, 0x65, 0x0a, 0x6c } +- +-static pfm_buffer_fmt_t oprofile_fmt = { +- .fmt_name = "oprofile_format", +- .fmt_uuid = OPROFILE_FMT_UUID, +- .fmt_handler = perfmon_handler, ++static struct pfm_smpl_fmt oprofile_fmt = { ++ .fmt_name = "OProfile", ++ .fmt_handler = perfmon_handler, ++ .fmt_flags = PFM_FMT_BUILTIN_FLAG, ++ .owner = THIS_MODULE + }; + +- + static char * get_cpu_type(void) + { + __u8 family = local_cpu_data->family; +@@ -75,9 +76,9 @@ static char * get_cpu_type(void) + + static int using_perfmon; + +-int perfmon_init(struct oprofile_operations * ops) ++int __init op_perfmon_init(struct oprofile_operations * ops) + { +- int ret = pfm_register_buffer_fmt(&oprofile_fmt); 
++ int ret = pfm_fmt_register(&oprofile_fmt); + if (ret) + return -ENODEV; + +@@ -90,10 +91,10 @@ int perfmon_init(struct oprofile_operations * ops) + } + + +-void perfmon_exit(void) ++void __exit op_perfmon_exit(void) + { + if (!using_perfmon) + return; + +- pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid); ++ pfm_fmt_unregister(&oprofile_fmt); + } +diff --git a/arch/ia64/perfmon/Kconfig b/arch/ia64/perfmon/Kconfig +new file mode 100644 +index 0000000..99c68bd +--- /dev/null ++++ b/arch/ia64/perfmon/Kconfig +@@ -0,0 +1,67 @@ ++menu "Hardware Performance Monitoring support" ++config PERFMON ++ bool "Perfmon2 performance monitoring interface" ++ default n ++ help ++ Enables the perfmon2 interface to access the hardware ++ performance counters. See <http://perfmon2.sf.net/> for ++ more details. ++ ++config PERFMON_DEBUG ++ bool "Perfmon debugging" ++ default n ++ depends on PERFMON ++ help ++ Enables perfmon debugging support ++ ++config PERFMON_DEBUG_FS ++ bool "Enable perfmon statistics reporting via debugfs" ++ default y ++ depends on PERFMON && DEBUG_FS ++ help ++ Enable collection and reporting of perfmon timing statistics under ++ debugfs. This is used for debugging and performance analysis of the ++ subsystem. The debugfs filesystem must be mounted. ++ ++config IA64_PERFMON_COMPAT ++ bool "Enable old perfmon-2 compatbility mode" ++ default n ++ depends on PERFMON ++ help ++ Enable this option to allow performance tools which used the old ++ perfmon-2 interface to continue to work. Old tools are those using ++ the obsolete commands and arguments. Check your programs and look ++ in include/asm-ia64/perfmon_compat.h for more information. ++ ++config IA64_PERFMON_GENERIC ++ tristate "Generic IA-64 PMU support" ++ depends on PERFMON ++ default n ++ help ++ Enables generic IA-64 PMU support. ++ The generic PMU is defined by the IA-64 architecture document. ++ This option should only be necessary when running with a PMU that ++ is not yet explicitly supported. 
Even then, there is no guarantee ++ that this support will work. ++ ++config IA64_PERFMON_ITANIUM ++ tristate "Itanium (Merced) Performance Monitoring support" ++ depends on PERFMON ++ default n ++ help ++ Enables Itanium (Merced) PMU support. ++ ++config IA64_PERFMON_MCKINLEY ++ tristate "Itanium 2 (McKinley) Performance Monitoring support" ++ depends on PERFMON ++ default n ++ help ++ Enables Itanium 2 (McKinley, Madison, Deerfield) PMU support. ++ ++config IA64_PERFMON_MONTECITO ++ tristate "Itanium 2 9000 (Montecito) Performance Monitoring support" ++ depends on PERFMON ++ default n ++ help ++ Enables support for Itanium 2 9000 (Montecito) PMU. ++endmenu +diff --git a/arch/ia64/perfmon/Makefile b/arch/ia64/perfmon/Makefile +new file mode 100644 +index 0000000..c9cdf9f +--- /dev/null ++++ b/arch/ia64/perfmon/Makefile +@@ -0,0 +1,11 @@ ++# ++# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++# Contributed by Stephane Eranian <eranian@hpl.hp.com> ++# ++obj-$(CONFIG_PERFMON) += perfmon.o ++obj-$(CONFIG_IA64_PERFMON_COMPAT) += perfmon_default_smpl.o \ ++ perfmon_compat.o ++obj-$(CONFIG_IA64_PERFMON_GENERIC) += perfmon_generic.o ++obj-$(CONFIG_IA64_PERFMON_ITANIUM) += perfmon_itanium.o ++obj-$(CONFIG_IA64_PERFMON_MCKINLEY) += perfmon_mckinley.o ++obj-$(CONFIG_IA64_PERFMON_MONTECITO) += perfmon_montecito.o +diff --git a/arch/ia64/perfmon/perfmon.c b/arch/ia64/perfmon/perfmon.c +new file mode 100644 +index 0000000..3f59410 +--- /dev/null ++++ b/arch/ia64/perfmon/perfmon.c +@@ -0,0 +1,946 @@ ++/* ++ * This file implements the IA-64 specific ++ * support for the perfmon2 interface ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++ ++struct pfm_arch_session { ++ u32 pfs_sys_use_dbr; /* syswide session uses dbr */ ++ u32 pfs_ptrace_use_dbr; /* a thread uses dbr via ptrace()*/ ++}; ++ ++DEFINE_PER_CPU(u32, pfm_syst_info); ++ ++static struct pfm_arch_session pfm_arch_sessions; ++static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_arch_sessions_lock); ++ ++static inline void pfm_clear_psr_pp(void) ++{ ++ ia64_rsm(IA64_PSR_PP); ++} ++ ++static inline void pfm_set_psr_pp(void) ++{ ++ ia64_ssm(IA64_PSR_PP); ++} ++ ++static inline void pfm_clear_psr_up(void) ++{ ++ ia64_rsm(IA64_PSR_UP); ++} ++ ++static inline void pfm_set_psr_up(void) ++{ ++ ia64_ssm(IA64_PSR_UP); ++} ++ ++static inline void pfm_set_psr_l(u64 val) ++{ ++ ia64_setreg(_IA64_REG_PSR_L, val); ++} ++ ++static inline void pfm_restore_ibrs(u64 *ibrs, unsigned int nibrs) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < nibrs; i++) { ++ ia64_set_ibr(i, ibrs[i]); ++ ia64_dv_serialize_instruction(); ++ } ++ ia64_srlz_i(); ++} ++ ++static inline void pfm_restore_dbrs(u64 *dbrs, unsigned int ndbrs) ++{ ++ unsigned int i; ++ ++ for (i = 0; i < ndbrs; i++) { ++ ia64_set_dbr(i, dbrs[i]); ++ ia64_dv_serialize_data(); ++ } ++ ia64_srlz_d(); ++} ++ ++irqreturn_t pmu_interrupt_handler(int irq, void *arg) ++{ ++ struct pt_regs *regs; ++ regs = get_irq_regs(); ++ irq_enter(); ++ pfm_interrupt_handler(instruction_pointer(regs), regs); ++ irq_exit(); ++ return IRQ_HANDLED; ++} ++static struct irqaction 
perfmon_irqaction = { ++ .handler = pmu_interrupt_handler, ++ .flags = IRQF_DISABLED, /* means keep interrupts masked */ ++ .name = "perfmon" ++}; ++ ++void pfm_arch_quiesce_pmu_percpu(void) ++{ ++ u64 dcr; ++ /* ++ * make sure no measurement is active ++ * (may inherit programmed PMCs from EFI). ++ */ ++ pfm_clear_psr_pp(); ++ pfm_clear_psr_up(); ++ ++ /* ++ * ensure dcr.pp is cleared ++ */ ++ dcr = ia64_getreg(_IA64_REG_CR_DCR); ++ ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); ++ ++ /* ++ * we run with the PMU not frozen at all times ++ */ ++ ia64_set_pmc(0, 0); ++ ia64_srlz_d(); ++} ++ ++void pfm_arch_init_percpu(void) ++{ ++ pfm_arch_quiesce_pmu_percpu(); ++ /* ++ * program PMU interrupt vector ++ */ ++ ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); ++ ia64_srlz_d(); ++} ++ ++int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) ++{ ++ struct pfm_arch_context *ctx_arch; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ ctx_arch->flags.use_dbr = 0; ++ ctx_arch->flags.insecure = (ctx_flags & PFM_ITA_FL_INSECURE) ? 1: 0; ++ ++ PFM_DBG("insecure=%d", ctx_arch->flags.insecure); ++ ++ return 0; ++} ++ ++/* ++ * Called from pfm_ctxsw(). Task is guaranteed to be current. ++ * Context is locked. Interrupts are masked. Monitoring may be active. ++ * PMU access is guaranteed. PMC and PMD registers are live in PMU. 
++ * ++ * Return: ++ * non-zero : did not save PMDs (as part of stopping the PMU) ++ * 0 : saved PMDs (no need to save them in caller) ++ */ ++int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_event_set *set; ++ u64 psr, tmp; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ set = ctx->active_set; ++ ++ /* ++ * save current PSR: needed because we modify it ++ */ ++ ia64_srlz_d(); ++ psr = ia64_getreg(_IA64_REG_PSR); ++ ++ /* ++ * stop monitoring: ++ * This is the last instruction which may generate an overflow ++ * ++ * we do not clear ipsr.up ++ */ ++ pfm_clear_psr_up(); ++ ia64_srlz_d(); ++ ++ /* ++ * extract overflow status bits ++ */ ++ tmp = ia64_get_pmc(0) & ~0xf; ++ ++ /* ++ * keep a copy of psr.up (for reload) ++ */ ++ ctx_arch->ctx_saved_psr_up = psr & IA64_PSR_UP; ++ ++ /* ++ * save overflow status bits ++ */ ++ set->povfl_pmds[0] = tmp; ++ ++ /* ++ * record how many pending overflows ++ * XXX: assume identity mapping for counters ++ */ ++ set->npend_ovfls = ia64_popcnt(tmp); ++ ++ /* ++ * make sure the PMU is unfrozen for the next task ++ */ ++ if (set->npend_ovfls) { ++ ia64_set_pmc(0, 0); ++ ia64_srlz_d(); ++ } ++ return 1; ++} ++ ++/* ++ * Called from pfm_ctxsw(). Task is guaranteed to be current. ++ * set cannot be NULL. Context is locked. Interrupts are masked. ++ * Caller has already restored all PMD and PMC registers. 
++ * ++ * must reactivate monitoring ++ */ ++void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ /* ++ * when monitoring is not explicitly started ++ * then psr_up = 0, in which case we do not ++ * need to restore ++ */ ++ if (likely(ctx_arch->ctx_saved_psr_up)) { ++ pfm_set_psr_up(); ++ ia64_srlz_d(); ++ } ++} ++ ++int pfm_arch_reserve_session(struct pfm_context *ctx, u32 cpu) ++{ ++ struct pfm_arch_context *ctx_arch; ++ int is_system; ++ int ret = 0; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ is_system = ctx->flags.system; ++ ++ spin_lock(&pfm_arch_sessions_lock); ++ ++ if (is_system && ctx_arch->flags.use_dbr) { ++ PFM_DBG("syswide context uses dbregs"); ++ ++ if (pfm_arch_sessions.pfs_ptrace_use_dbr) { ++ PFM_DBG("cannot reserve syswide context: " ++ "dbregs in use by ptrace"); ++ ret = -EBUSY; ++ } else { ++ pfm_arch_sessions.pfs_sys_use_dbr++; ++ } ++ } ++ spin_unlock(&pfm_arch_sessions_lock); ++ ++ return ret; ++} ++ ++void pfm_arch_release_session(struct pfm_context *ctx, u32 cpu) ++{ ++ struct pfm_arch_context *ctx_arch; ++ int is_system; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ is_system = ctx->flags.system; ++ ++ spin_lock(&pfm_arch_sessions_lock); ++ ++ if (is_system && ctx_arch->flags.use_dbr) ++ pfm_arch_sessions.pfs_sys_use_dbr--; ++ spin_unlock(&pfm_arch_sessions_lock); ++} ++ ++/* ++ * function called from pfm_load_context_*(). Task is not guaranteed to be ++ * current task. If not then other task is guaranteed stopped and off any CPU. ++ * context is locked and interrupts are masked. ++ * ++ * On PFM_LOAD_CONTEXT, the interface guarantees monitoring is stopped. 
++ * ++ * For system-wide task is NULL ++ */ ++int pfm_arch_load_context(struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pt_regs *regs; ++ int ret = 0; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ /* ++ * cannot load a context which is using range restrictions, ++ * into a thread that is being debugged. ++ * ++ * if one set out of several is using the debug registers, then ++ * we assume the context as whole is using them. ++ */ ++ if (ctx_arch->flags.use_dbr) { ++ if (ctx->flags.system) { ++ spin_lock(&pfm_arch_sessions_lock); ++ ++ if (pfm_arch_sessions.pfs_ptrace_use_dbr) { ++ PFM_DBG("cannot reserve syswide context: " ++ "dbregs in use by ptrace"); ++ ret = -EBUSY; ++ } else { ++ pfm_arch_sessions.pfs_sys_use_dbr++; ++ PFM_DBG("pfs_sys_use_dbr=%u", ++ pfm_arch_sessions.pfs_sys_use_dbr); ++ } ++ spin_unlock(&pfm_arch_sessions_lock); ++ ++ } else if (ctx->task->thread.flags & IA64_THREAD_DBG_VALID) { ++ PFM_DBG("load_pid [%d] thread is debugged, cannot " ++ "use range restrictions", ctx->task->pid); ++ ret = -EBUSY; ++ } ++ if (ret) ++ return ret; ++ } ++ ++ /* ++ * We need to intervene on context switch to toggle the ++ * psr.pp bit in system-wide. 
As such, we set the TIF ++ * flag so that pfm_arch_ctxswout_sys() and the ++ * pfm_arch_ctxswin_sys() functions get called ++ * from pfm_ctxsw_sys(); ++ */ ++ if (ctx->flags.system) { ++ set_thread_flag(TIF_PERFMON_CTXSW); ++ PFM_DBG("[%d] set TIF", current->pid); ++ return 0; ++ } ++ ++ regs = task_pt_regs(ctx->task); ++ ++ /* ++ * self-monitoring systematically allows user level control ++ */ ++ if (ctx->task != current) { ++ /* ++ * when not current, task is stopped, so this is safe ++ */ ++ ctx_arch->ctx_saved_psr_up = 0; ++ ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; ++ } else ++ ctx_arch->flags.insecure = 1; ++ ++ /* ++ * allow user level control (start/stop/read pmd) if: ++ * - self-monitoring ++ * - requested at context creation (PFM_IA64_FL_INSECURE) ++ * ++ * There is not security hole with PFM_IA64_FL_INSECURE because ++ * when not self-monitored, the caller must have permissions to ++ * attached to the task. ++ */ ++ if (ctx_arch->flags.insecure) { ++ ia64_psr(regs)->sp = 0; ++ PFM_DBG("clearing psr.sp for [%d]", ctx->task->pid); ++ } ++ return 0; ++} ++ ++int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) ++{ ++#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH) ++#define PFM_ITA_SETFL_BOTH_INTR (PFM_ITA_SETFL_INTR_ONLY|\ ++ PFM_ITA_SETFL_EXCL_INTR) ++ ++/* exclude return value field */ ++#define PFM_SETFL_ALL_MASK (PFM_ITA_SETFL_BOTH_INTR \ ++ | PFM_SETFL_BOTH_SWITCH \ ++ | PFM_ITA_SETFL_IDLE_EXCL) ++ ++ if ((flags & ~PFM_SETFL_ALL_MASK)) { ++ PFM_DBG("invalid flags=0x%x", flags); ++ return -EINVAL; ++ } ++ ++ if ((flags & PFM_ITA_SETFL_BOTH_INTR) == PFM_ITA_SETFL_BOTH_INTR) { ++ PFM_DBG("both excl intr and ontr only are set"); ++ return -EINVAL; ++ } ++ ++ if ((flags & PFM_ITA_SETFL_IDLE_EXCL) && !ctx->flags.system) { ++ PFM_DBG("idle exclude flag only for system-wide context"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/* ++ * function called from pfm_unload_context_*(). Context is locked. 
++ * interrupts are masked. task is not guaranteed to be current task. ++ * Access to PMU is not guaranteed. ++ * ++ * function must do whatever arch-specific action is required on unload ++ * of a context. ++ * ++ * called for both system-wide and per-thread. task is NULL for ssytem-wide ++ */ ++void pfm_arch_unload_context(struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pt_regs *regs; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ if (ctx->flags.system) { ++ /* ++ * disable context switch hook ++ */ ++ clear_thread_flag(TIF_PERFMON_CTXSW); ++ ++ if (ctx_arch->flags.use_dbr) { ++ spin_lock(&pfm_arch_sessions_lock); ++ pfm_arch_sessions.pfs_sys_use_dbr--; ++ PFM_DBG("sys_use_dbr=%u", pfm_arch_sessions.pfs_sys_use_dbr); ++ spin_unlock(&pfm_arch_sessions_lock); ++ } ++ } else { ++ regs = task_pt_regs(ctx->task); ++ ++ /* ++ * cancel user level control for per-task context ++ */ ++ ia64_psr(regs)->sp = 1; ++ PFM_DBG("setting psr.sp for [%d]", ctx->task->pid); ++ } ++} ++ ++/* ++ * mask monitoring by setting the privilege level to 0 ++ * we cannot use psr.pp/psr.up for this, it is controlled by ++ * the user ++ */ ++void pfm_arch_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ unsigned long mask; ++ unsigned int i; ++ ++ arch_info = pfm_pmu_info(); ++ /* ++ * as an optimization we look at the first 64 PMC ++ * registers only starting at PMC4. ++ */ ++ mask = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR; ++ for (i = PFM_ITA_FCNTR; mask; i++, mask >>= 1) { ++ if (likely(mask & 0x1)) ++ ia64_set_pmc(i, set->pmcs[i] & ~0xfUL); ++ } ++ /* ++ * make changes visisble ++ */ ++ ia64_srlz_d(); ++} ++ ++/* ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() ++ * context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. 
++ * ++ * function must restore all PMD registers from set. ++ */ ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_context *ctx_arch; ++ unsigned long *mask; ++ u16 i, num; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ if (ctx_arch->flags.insecure) { ++ num = ctx->regs.num_rw_pmd; ++ mask = ctx->regs.rw_pmds; ++ } else { ++ num = set->nused_pmds; ++ mask = set->used_pmds; ++ } ++ /* ++ * must restore all implemented read-write PMDS to avoid leaking ++ * information especially when PFM_IA64_FL_INSECURE is set. ++ * ++ * XXX: should check PFM_IA64_FL_INSECURE==0 and use used_pmd instead ++ */ ++ for (i = 0; num; i++) { ++ if (likely(test_bit(i, mask))) { ++ pfm_arch_write_pmd(ctx, i, set->pmds[i].value); ++ num--; ++ } ++ } ++ ia64_srlz_d(); ++} ++ ++/* ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() ++ * context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ * ++ * function must restore all PMC registers from set if needed ++ */ ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ u64 mask2 = 0, val, plm; ++ unsigned long impl_mask, mask_pmcs; ++ unsigned int i; ++ ++ arch_info = pfm_pmu_info(); ++ /* ++ * as an optimization we only look at the first 64 ++ * PMC registers. In fact, we should never scan the ++ * entire impl_pmcs because ibr/dbr are implemented ++ * separately. ++ * ++ * always skip PMC0-PMC3. PMC0 taken care of when saving ++ * state. PMC1-PMC3 not used until we get counters in ++ * the 60 and above index range. ++ */ ++ impl_mask = ctx->regs.pmcs[0] >> PFM_ITA_FCNTR; ++ mask_pmcs = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR; ++ plm = ctx->state == PFM_CTX_MASKED ? 
~0xf : ~0x0; ++ ++ for (i = PFM_ITA_FCNTR; ++ impl_mask; ++ i++, impl_mask >>= 1, mask_pmcs >>= 1) { ++ if (likely(impl_mask & 0x1)) { ++ mask2 = mask_pmcs & 0x1 ? plm : ~0; ++ val = set->pmcs[i] & mask2; ++ ia64_set_pmc(i, val); ++ PFM_DBG_ovfl("pmc%u=0x%lx", i, val); ++ } ++ } ++ /* ++ * restore DBR/IBR ++ */ ++ if (set->priv_flags & PFM_ITA_SETFL_USE_DBR) { ++ pfm_restore_ibrs(set->pmcs+256, 8); ++ pfm_restore_dbrs(set->pmcs+264, 8); ++ } ++ ia64_srlz_d(); ++} ++ ++void pfm_arch_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ u64 psr; ++ int is_system; ++ ++ is_system = ctx->flags.system; ++ ++ psr = ia64_getreg(_IA64_REG_PSR); ++ ++ /* ++ * monitoring is masked via the PMC.plm ++ * ++ * As we restore their value, we do not want each counter to ++ * restart right away. We stop monitoring using the PSR, ++ * restore the PMC (and PMD) and then re-establish the psr ++ * as it was. Note that there can be no pending overflow at ++ * this point, because monitoring is still MASKED. ++ * ++ * Because interrupts are masked we can avoid changing ++ * DCR.pp. ++ */ ++ if (is_system) ++ pfm_clear_psr_pp(); ++ else ++ pfm_clear_psr_up(); ++ ++ ia64_srlz_d(); ++ ++ pfm_arch_restore_pmcs(ctx, set); ++ ++ /* ++ * restore psr ++ * ++ * monitoring may start right now but interrupts ++ * are still masked ++ */ ++ pfm_set_psr_l(psr); ++ ia64_srlz_d(); ++} ++ ++/* ++ * Called from pfm_stop() ++ * ++ * For per-thread: ++ * task is not necessarily current. If not current task, then ++ * task is guaranteed stopped and off any cpu. Access to PMU ++ * is not guaranteed. Interrupts are masked. Context is locked. ++ * Set is the active set. ++ * ++ * must disable active monitoring. 
ctx cannot be NULL ++ */ ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pt_regs *regs; ++ u64 dcr, psr; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ regs = task_pt_regs(task); ++ ++ if (!ctx->flags.system) { ++ /* ++ * in ZOMBIE state we always have task == current due to ++ * pfm_exit_thread() ++ */ ++ ia64_psr(regs)->up = 0; ++ ctx_arch->ctx_saved_psr_up = 0; ++ ++ /* ++ * in case of ZOMBIE state, there is no unload to clear ++ * insecure monitoring, so we do it in stop instead. ++ */ ++ if (ctx->state == PFM_CTX_ZOMBIE) ++ ia64_psr(regs)->sp = 1; ++ ++ if (task == current) { ++ pfm_clear_psr_up(); ++ ia64_srlz_d(); ++ } ++ } else if (ctx->flags.started) { /* do not stop twice */ ++ dcr = ia64_getreg(_IA64_REG_CR_DCR); ++ psr = ia64_getreg(_IA64_REG_PSR); ++ ++ ia64_psr(regs)->pp = 0; ++ ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); ++ pfm_clear_psr_pp(); ++ ia64_srlz_d(); ++ ++ if (ctx->active_set->flags & PFM_ITA_SETFL_IDLE_EXCL) { ++ PFM_DBG("disabling idle exclude"); ++ __get_cpu_var(pfm_syst_info) &= ~PFM_ITA_CPUINFO_IDLE_EXCL; ++ } ++ } ++} ++ ++/* ++ * called from pfm_start() ++ * ++ * Interrupts are masked. Context is locked. Set is the active set. ++ * ++ * For per-thread: ++ * Task is not necessarily current. If not current task, then task ++ * is guaranteed stopped and off any cpu. No access to PMU is task ++ * is not current. ++ * ++ * For system-wide: ++ * task is always current ++ * ++ * must enable active monitoring. 
++ */ ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pt_regs *regs; ++ u64 dcr, dcr_pp, psr_pp; ++ u32 flags; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ regs = task_pt_regs(task); ++ flags = ctx->active_set->flags; ++ ++ /* ++ * per-thread mode ++ */ ++ if (!ctx->flags.system) { ++ ++ ia64_psr(regs)->up = 1; ++ ++ if (task == current) { ++ pfm_set_psr_up(); ++ ia64_srlz_d(); ++ } else { ++ /* ++ * activate monitoring at next ctxswin ++ */ ++ ctx_arch->ctx_saved_psr_up = IA64_PSR_UP; ++ } ++ return; ++ } ++ ++ /* ++ * system-wide mode ++ */ ++ dcr = ia64_getreg(_IA64_REG_CR_DCR); ++ if (flags & PFM_ITA_SETFL_INTR_ONLY) { ++ dcr_pp = 1; ++ psr_pp = 0; ++ } else if (flags & PFM_ITA_SETFL_EXCL_INTR) { ++ dcr_pp = 0; ++ psr_pp = 1; ++ } else { ++ dcr_pp = psr_pp = 1; ++ } ++ PFM_DBG("dcr_pp=%lu psr_pp=%lu", dcr_pp, psr_pp); ++ ++ /* ++ * update dcr_pp and psr_pp ++ */ ++ if (dcr_pp) ++ ia64_setreg(_IA64_REG_CR_DCR, dcr | IA64_DCR_PP); ++ else ++ ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); ++ ++ if (psr_pp) { ++ pfm_set_psr_pp(); ++ ia64_psr(regs)->pp = 1; ++ } else { ++ pfm_clear_psr_pp(); ++ ia64_psr(regs)->pp = 0; ++ } ++ ia64_srlz_d(); ++ ++ if (ctx->active_set->flags & PFM_ITA_SETFL_IDLE_EXCL) { ++ PFM_DBG("enable idle exclude"); ++ __get_cpu_var(pfm_syst_info) |= PFM_ITA_CPUINFO_IDLE_EXCL; ++ } ++} ++ ++/* ++ * Only call this function when a process is trying to ++ * write the debug registers (reading is always allowed) ++ * called from arch/ia64/kernel/ptrace.c:access_uarea() ++ */ ++int __pfm_use_dbregs(struct task_struct *task) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_context *ctx; ++ unsigned long flags; ++ int ret = 0; ++ ++ PFM_DBG("called for [%d]", task->pid); ++ ++ ctx = task->pfm_context; ++ ++ /* ++ * do it only once ++ */ ++ if (task->thread.flags & IA64_THREAD_DBG_VALID) { ++ PFM_DBG("IA64_THREAD_DBG_VALID already set"); ++ return 0; ++ } ++ if (ctx) { ++ 
spin_lock_irqsave(&ctx->lock, flags); ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ if (ctx_arch->flags.use_dbr == 1) { ++ PFM_DBG("PMU using dbregs already, no ptrace access"); ++ ret = -1; ++ } ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ if (ret) ++ return ret; ++ } ++ ++ spin_lock(&pfm_arch_sessions_lock); ++ ++ /* ++ * We cannot allow setting breakpoints when system wide monitoring ++ * sessions are using the debug registers. ++ */ ++ if (!pfm_arch_sessions.pfs_sys_use_dbr) ++ pfm_arch_sessions.pfs_ptrace_use_dbr++; ++ else ++ ret = -1; ++ ++ PFM_DBG("ptrace_use_dbr=%u sys_use_dbr=%u by [%d] ret = %d", ++ pfm_arch_sessions.pfs_ptrace_use_dbr, ++ pfm_arch_sessions.pfs_sys_use_dbr, ++ task->pid, ret); ++ ++ spin_unlock(&pfm_arch_sessions_lock); ++ if (ret) ++ return ret; ++#ifndef CONFIG_SMP ++ /* ++ * in UP, we need to check whether the current ++ * owner of the PMU is not using the debug registers ++ * for monitoring. Because we are using a lazy ++ * save on ctxswout, we must force a save in this ++ * case because the debug registers are being ++ * modified by another task. We save the current ++ * PMD registers, and clear ownership. In ctxswin, ++ * full state will be reloaded. ++ * ++ * Note: we overwrite task. ++ */ ++ task = __get_cpu_var(pmu_owner); ++ ctx = __get_cpu_var(pmu_ctx); ++ ++ if (task == NULL) ++ return 0; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ if (ctx_arch->flags.use_dbr) ++ pfm_save_pmds_release(ctx); ++#endif ++ return 0; ++} ++ ++/* ++ * This function is called for every task that exits with the ++ * IA64_THREAD_DBG_VALID set. This indicates a task which was ++ * able to use the debug registers for debugging purposes via ++ * ptrace(). 
Therefore we know it was not using them for ++ * perfmormance monitoring, so we only decrement the number ++ * of "ptraced" debug register users to keep the count up to date ++ */ ++int __pfm_release_dbregs(struct task_struct *task) ++{ ++ int ret; ++ ++ spin_lock(&pfm_arch_sessions_lock); ++ ++ if (pfm_arch_sessions.pfs_ptrace_use_dbr == 0) { ++ PFM_ERR("invalid release for [%d] ptrace_use_dbr=0", task->pid); ++ ret = -1; ++ } else { ++ pfm_arch_sessions.pfs_ptrace_use_dbr--; ++ ret = 0; ++ } ++ spin_unlock(&pfm_arch_sessions_lock); ++ ++ return ret; ++} ++ ++int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct task_struct *task; ++ struct thread_struct *thread; ++ int ret = 0, state; ++ int i, can_access_pmu = 0; ++ int is_loaded, is_system; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ state = ctx->state; ++ task = ctx->task; ++ is_loaded = state == PFM_CTX_LOADED || state == PFM_CTX_MASKED; ++ is_system = ctx->flags.system; ++ can_access_pmu = __get_cpu_var(pmu_owner) == task || is_system; ++ ++ if (is_loaded == 0) ++ goto done; ++ ++ if (is_system == 0) { ++ thread = &(task->thread); ++ ++ /* ++ * cannot use debug registers for monitoring if they are ++ * already used for debugging ++ */ ++ if (thread->flags & IA64_THREAD_DBG_VALID) { ++ PFM_DBG("debug registers already in use for [%d]", ++ task->pid); ++ return -EBUSY; ++ } ++ } ++ ++ /* ++ * check for debug registers in system wide mode ++ */ ++ spin_lock(&pfm_arch_sessions_lock); ++ ++ if (is_system) { ++ if (pfm_arch_sessions.pfs_ptrace_use_dbr) ++ ret = -EBUSY; ++ else ++ pfm_arch_sessions.pfs_sys_use_dbr++; ++ } ++ ++ spin_unlock(&pfm_arch_sessions_lock); ++ ++ if (ret != 0) ++ return ret; ++ ++ /* ++ * clear hardware registers to make sure we don't ++ * pick up stale state.
++ */ ++ if (can_access_pmu) { ++ PFM_DBG("clearing ibrs, dbrs"); ++ for (i = 0; i < 8; i++) { ++ ia64_set_ibr(i, 0); ++ ia64_dv_serialize_instruction(); ++ } ++ ia64_srlz_i(); ++ for (i = 0; i < 8; i++) { ++ ia64_set_dbr(i, 0); ++ ia64_dv_serialize_data(); ++ } ++ ia64_srlz_d(); ++ } ++done: ++ /* ++ * debug registers are now in use ++ */ ++ ctx_arch->flags.use_dbr = 1; ++ set->priv_flags |= PFM_ITA_SETFL_USE_DBR; ++ PFM_DBG("set%u use_dbr=1", set->id); ++ return 0; ++} ++EXPORT_SYMBOL(pfm_ia64_mark_dbregs_used); ++ ++char *pfm_arch_get_pmu_module_name(void) ++{ ++ switch (local_cpu_data->family) { ++ case 0x07: ++ return "perfmon_itanium"; ++ case 0x1f: ++ return "perfmon_mckinley"; ++ case 0x20: ++ return "perfmon_montecito"; ++ default: ++ return "perfmon_generic"; ++ } ++ return NULL; ++} ++ ++/* ++ * global arch-specific initialization, called only once ++ */ ++int __init pfm_arch_init(void) ++{ ++ int ret; ++ ++ spin_lock_init(&pfm_arch_sessions_lock); ++ ++#ifdef CONFIG_IA64_PERFMON_COMPAT ++ ret = pfm_ia64_compat_init(); ++ if (ret) ++ return ret; ++#endif ++ register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); ++ ++ ++ return 0; ++} +diff --git a/arch/ia64/perfmon/perfmon_compat.c b/arch/ia64/perfmon/perfmon_compat.c +new file mode 100644 +index 0000000..2fd3d3c +--- /dev/null ++++ b/arch/ia64/perfmon/perfmon_compat.c +@@ -0,0 +1,1210 @@ ++/* ++ * This file implements the IA-64 specific ++ * support for the perfmon2 interface ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation.
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/interrupt.h> ++#include <linux/module.h> ++#include <linux/file.h> ++#include <linux/fdtable.h> ++#include <linux/seq_file.h> ++#include <linux/vmalloc.h> ++#include <linux/proc_fs.h> ++#include <linux/perfmon_kern.h> ++#include <linux/uaccess.h> ++ ++asmlinkage long sys_pfm_stop(int fd); ++asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *st); ++asmlinkage long sys_pfm_unload_context(int fd); ++asmlinkage long sys_pfm_restart(int fd); ++asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ld); ++ ++ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what); ++ ++extern ssize_t __pfm_read(struct pfm_context *ctx, ++ union pfarg_msg *msg_buf, ++ int non_block); ++/* ++ * function providing some help for backward compatiblity with old IA-64 ++ * applications. In the old model, certain attributes of a counter were ++ * passed via the PMC, now they are passed via the PMD. 
++ */ ++static int pfm_compat_update_pmd(struct pfm_context *ctx, u16 set_id, u16 cnum, ++ u32 rflags, ++ unsigned long *smpl_pmds, ++ unsigned long *reset_pmds, ++ u64 eventid) ++{ ++ struct pfm_event_set *set; ++ int is_counting; ++ unsigned long *impl_pmds; ++ u32 flags = 0; ++ u16 max_pmd; ++ ++ impl_pmds = ctx->regs.pmds; ++ max_pmd = ctx->regs.max_pmd; ++ ++ /* ++ * given that we do not maintain PMC ->PMD dependencies ++ * we cannot figure out what to do in case PMCxx != PMDxx ++ */ ++ if (cnum > max_pmd) ++ return 0; ++ ++ /* ++ * assumes PMCxx controls PMDxx which is always true for counters ++ * on Itanium PMUs. ++ */ ++ is_counting = pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64; ++ set = pfm_find_set(ctx, set_id, 0); ++ ++ /* ++ * for v2.0, we only allowed counting PMD to generate ++ * user-level notifications. Same thing with randomization. ++ */ ++ if (is_counting) { ++ if (rflags & PFM_REGFL_OVFL_NOTIFY) ++ flags |= PFM_REGFL_OVFL_NOTIFY; ++ if (rflags & PFM_REGFL_RANDOM) ++ flags |= PFM_REGFL_RANDOM; ++ /* ++ * verify validity of smpl_pmds ++ */ ++ if (unlikely(bitmap_subset(smpl_pmds, ++ impl_pmds, max_pmd) == 0)) { ++ PFM_DBG("invalid smpl_pmds=0x%llx for pmd%u", ++ (unsigned long long)smpl_pmds[0], cnum); ++ return -EINVAL; ++ } ++ /* ++ * verify validity of reset_pmds ++ */ ++ if (unlikely(bitmap_subset(reset_pmds, ++ impl_pmds, max_pmd) == 0)) { ++ PFM_DBG("invalid reset_pmds=0x%lx for pmd%u", ++ reset_pmds[0], cnum); ++ return -EINVAL; ++ } ++ /* ++ * ensures that a PFM_READ_PMDS succeeds with a ++ * corresponding PFM_WRITE_PMDS ++ */ ++ __set_bit(cnum, set->used_pmds); ++ ++ } else if (rflags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) { ++ PFM_DBG("cannot set ovfl_notify or random on pmd%u", cnum); ++ return -EINVAL; ++ } ++ ++ set->pmds[cnum].flags = flags; ++ ++ if (is_counting) { ++ bitmap_copy(set->pmds[cnum].reset_pmds, ++ reset_pmds, ++ max_pmd); ++ ++ bitmap_copy(set->pmds[cnum].smpl_pmds, ++ smpl_pmds, ++ max_pmd); ++ ++ 
set->pmds[cnum].eventid = eventid; ++ ++ /* ++ * update ovfl_notify ++ */ ++ if (rflags & PFM_REGFL_OVFL_NOTIFY) ++ __set_bit(cnum, set->ovfl_notify); ++ else ++ __clear_bit(cnum, set->ovfl_notify); ++ ++ } ++ PFM_DBG("pmd%u flags=0x%x eventid=0x%lx r_pmds=0x%lx s_pmds=0x%lx", ++ cnum, flags, ++ eventid, ++ reset_pmds[0], ++ smpl_pmds[0]); ++ ++ return 0; ++} ++ ++ ++int __pfm_write_ibrs_old(struct pfm_context *ctx, void *arg, int count) ++{ ++ struct pfarg_dbreg *req = arg; ++ struct pfarg_pmc pmc; ++ int i, ret = 0; ++ ++ memset(&pmc, 0, sizeof(pmc)); ++ ++ for (i = 0; i < count; i++, req++) { ++ pmc.reg_num = 256+req->dbreg_num; ++ pmc.reg_value = req->dbreg_value; ++ pmc.reg_flags = 0; ++ pmc.reg_set = req->dbreg_set; ++ ++ ret = __pfm_write_pmcs(ctx, &pmc, 1); ++ ++ req->dbreg_flags &= ~PFM_REG_RETFL_MASK; ++ req->dbreg_flags |= pmc.reg_flags; ++ ++ if (ret) ++ return ret; ++ } ++ return 0; ++} ++ ++static long pfm_write_ibrs_old(int fd, void __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct file *filp; ++ struct pfarg_dbreg *req = NULL; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret, fput_needed; ++ ++ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*req); ++ ++ filp = fget_light(fd, &fput_needed); ++ if (unlikely(filp == NULL)) { ++ PFM_DBG("invalid fd %d", fd); ++ return -EBADF; ++ } ++ ++ ctx = filp->private_data; ++ ret = -EBADF; ++ ++ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { ++ PFM_DBG("fd %d not related to perfmon", fd); ++ goto error; ++ } ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (ret == 0) ++ ret = __pfm_write_ibrs_old(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, 
resume); ++ ++ if (copy_to_user(ureq, req, sz)) ++ ret = -EFAULT; ++ ++ kfree(fptr); ++error: ++ fput_light(filp, fput_needed); ++ return ret; ++} ++ ++int __pfm_write_dbrs_old(struct pfm_context *ctx, void *arg, int count) ++{ ++ struct pfarg_dbreg *req = arg; ++ struct pfarg_pmc pmc; ++ int i, ret = 0; ++ ++ memset(&pmc, 0, sizeof(pmc)); ++ ++ for (i = 0; i < count; i++, req++) { ++ pmc.reg_num = 264+req->dbreg_num; ++ pmc.reg_value = req->dbreg_value; ++ pmc.reg_flags = 0; ++ pmc.reg_set = req->dbreg_set; ++ ++ ret = __pfm_write_pmcs(ctx, &pmc, 1); ++ ++ req->dbreg_flags &= ~PFM_REG_RETFL_MASK; ++ req->dbreg_flags |= pmc.reg_flags; ++ if (ret) ++ return ret; ++ } ++ return 0; ++} ++ ++static long pfm_write_dbrs_old(int fd, void __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct file *filp; ++ struct pfarg_dbreg *req = NULL; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret, fput_needed; ++ ++ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*req); ++ ++ filp = fget_light(fd, &fput_needed); ++ if (unlikely(filp == NULL)) { ++ PFM_DBG("invalid fd %d", fd); ++ return -EBADF; ++ } ++ ++ ctx = filp->private_data; ++ ret = -EBADF; ++ ++ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { ++ PFM_DBG("fd %d not related to perfmon", fd); ++ goto error; ++ } ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (ret == 0) ++ ret = __pfm_write_dbrs_old(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ if (copy_to_user(ureq, req, sz)) ++ ret = -EFAULT; ++ ++ kfree(fptr); ++error: ++ fput_light(filp, fput_needed); ++ return ret; ++} ++ ++int __pfm_write_pmcs_old(struct pfm_context *ctx, struct pfarg_reg 
*req_old, ++ int count) ++{ ++ struct pfarg_pmc req; ++ unsigned int i; ++ int ret, error_code; ++ ++ memset(&req, 0, sizeof(req)); ++ ++ for (i = 0; i < count; i++, req_old++) { ++ req.reg_num = req_old->reg_num; ++ req.reg_set = req_old->reg_set; ++ req.reg_flags = 0; ++ req.reg_value = req_old->reg_value; ++ ++ ret = __pfm_write_pmcs(ctx, (void *)&req, 1); ++ req_old->reg_flags &= ~PFM_REG_RETFL_MASK; ++ req_old->reg_flags |= req.reg_flags; ++ ++ if (ret) ++ return ret; ++ ++ ret = pfm_compat_update_pmd(ctx, req_old->reg_set, ++ req_old->reg_num, ++ (u32)req_old->reg_flags, ++ req_old->reg_smpl_pmds, ++ req_old->reg_reset_pmds, ++ req_old->reg_smpl_eventid); ++ ++ error_code = ret ? PFM_REG_RETFL_EINVAL : 0; ++ req_old->reg_flags &= ~PFM_REG_RETFL_MASK; ++ req_old->reg_flags |= error_code; ++ ++ if (ret) ++ return ret; ++ } ++ return 0; ++} ++ ++static long pfm_write_pmcs_old(int fd, void __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct file *filp; ++ struct pfarg_reg *req = NULL; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret, fput_needed; ++ ++ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*req); ++ ++ filp = fget_light(fd, &fput_needed); ++ if (unlikely(filp == NULL)) { ++ PFM_DBG("invalid fd %d", fd); ++ return -EBADF; ++ } ++ ++ ctx = filp->private_data; ++ ret = -EBADF; ++ ++ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { ++ PFM_DBG("fd %d not related to perfmon", fd); ++ goto error; ++ } ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (ret == 0) ++ ret = __pfm_write_pmcs_old(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ if (copy_to_user(ureq, req, sz)) ++ ret = 
-EFAULT; ++ ++ kfree(fptr); ++ ++error: ++ fput_light(filp, fput_needed); ++ return ret; ++} ++ ++int __pfm_write_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old, ++ int count) ++{ ++ struct pfarg_pmd req; ++ int i, ret; ++ ++ memset(&req, 0, sizeof(req)); ++ ++ for (i = 0; i < count; i++, req_old++) { ++ req.reg_num = req_old->reg_num; ++ req.reg_set = req_old->reg_set; ++ req.reg_value = req_old->reg_value; ++ /* flags passed with pmcs in v2.0 */ ++ ++ req.reg_long_reset = req_old->reg_long_reset; ++ req.reg_short_reset = req_old->reg_short_reset; ++ req.reg_random_mask = req_old->reg_random_mask; ++ /* ++ * reg_random_seed is ignored since v2.3 ++ */ ++ ++ /* ++ * skip last_reset_val not used for writing ++ * skip smpl_pmds, reset_pmds, eventid, ovfl_swtch_cnt ++ * as set in pfm_write_pmcs_old. ++ * ++ * ovfl_switch_cnt ignored, not implemented in v2.0 ++ */ ++ ret = __pfm_write_pmds(ctx, (void *)&req, 1, 1); ++ ++ req_old->reg_flags &= ~PFM_REG_RETFL_MASK; ++ req_old->reg_flags |= req.reg_flags; ++ ++ if (ret) ++ return ret; ++ } ++ return 0; ++} ++ ++static long pfm_write_pmds_old(int fd, void __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct file *filp; ++ struct pfarg_reg *req = NULL; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret, fput_needed; ++ ++ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*req); ++ ++ filp = fget_light(fd, &fput_needed); ++ if (unlikely(filp == NULL)) { ++ PFM_DBG("invalid fd %d", fd); ++ return -EBADF; ++ } ++ ++ ctx = filp->private_data; ++ ret = -EBADF; ++ ++ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { ++ PFM_DBG("fd %d not related to perfmon", fd); ++ goto error; ++ } ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, 
&resume); ++ if (ret == 0) ++ ret = __pfm_write_pmds_old(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (copy_to_user(ureq, req, sz)) ++ ret = -EFAULT; ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ kfree(fptr); ++error: ++ fput_light(filp, fput_needed); ++ return ret; ++} ++ ++int __pfm_read_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old, ++ int count) ++{ ++ struct pfarg_pmd req; ++ int i, ret; ++ ++ memset(&req, 0, sizeof(req)); ++ ++ for (i = 0; i < count; i++, req_old++) { ++ req.reg_num = req_old->reg_num; ++ req.reg_set = req_old->reg_set; ++ ++ /* skip value not used for reading */ ++ req.reg_flags = req_old->reg_flags; ++ ++ /* skip short/long_reset not used for reading */ ++ /* skip last_reset_val not used for reading */ ++ /* skip ovfl_switch_cnt not used for reading */ ++ ++ ret = __pfm_read_pmds(ctx, (void *)&req, 1); ++ ++ req_old->reg_flags &= ~PFM_REG_RETFL_MASK; ++ req_old->reg_flags |= req.reg_flags; ++ if (ret) ++ return ret; ++ ++ /* update fields */ ++ req_old->reg_value = req.reg_value; ++ ++ req_old->reg_last_reset_val = req.reg_last_reset_val; ++ req_old->reg_ovfl_switch_cnt = req.reg_ovfl_switch_cnt; ++ } ++ return 0; ++} ++ ++static long pfm_read_pmds_old(int fd, void __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct file *filp; ++ struct pfarg_reg *req = NULL; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret, fput_needed; ++ ++ if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*req); ++ ++ filp = fget_light(fd, &fput_needed); ++ if (unlikely(filp == NULL)) { ++ PFM_DBG("invalid fd %d", fd); ++ return -EBADF; ++ } ++ ++ ctx = filp->private_data; ++ ret = -EBADF; ++ ++ if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { ++ PFM_DBG("fd %d not related to perfmon", fd); ++ goto error; ++ } ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); ++ if (ret) ++ goto 
error; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (ret == 0) ++ ret = __pfm_read_pmds_old(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ if (copy_to_user(ureq, req, sz)) ++ ret = -EFAULT; ++ ++ kfree(fptr); ++error: ++ fput_light(filp, fput_needed); ++ return ret; ++} ++ ++/* ++ * OBSOLETE: use /proc/perfmon_map instead ++ */ ++static long pfm_get_default_pmcs_old(int fd, void __user *ureq, int count) ++{ ++ struct pfarg_reg *req = NULL; ++ void *fptr; ++ size_t sz; ++ int ret, i; ++ unsigned int cnum; ++ ++ if (count < 1) ++ return -EINVAL; ++ ++ /* ++ * ensure the pfm_pmu_conf does not disappear while ++ * we use it ++ */ ++ ret = pfm_pmu_conf_get(1); ++ if (ret) ++ return ret; ++ ++ sz = count*sizeof(*ureq); ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ ++ for (i = 0; i < count; i++, req++) { ++ cnum = req->reg_num; ++ ++ if (i >= PFM_MAX_PMCS || ++ (pfm_pmu_conf->pmc_desc[cnum].type & PFM_REG_I) == 0) { ++ req->reg_flags = PFM_REG_RETFL_EINVAL; ++ break; ++ } ++ req->reg_value = pfm_pmu_conf->pmc_desc[cnum].dfl_val; ++ req->reg_flags = 0; ++ ++ PFM_DBG("pmc[%u]=0x%lx", cnum, req->reg_value); ++ } ++ ++ if (copy_to_user(ureq, req, sz)) ++ ret = -EFAULT; ++ ++ kfree(fptr); ++error: ++ pfm_pmu_conf_put(); ++ ++ return ret; ++} ++ ++/* ++ * allocate a sampling buffer and remaps it into the user address space of ++ * the task. 
This is only in compatibility mode ++ * ++ * function called ONLY on current task ++ */ ++int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx, size_t rsize, ++ struct file *filp) ++{ ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma = NULL; ++ struct pfm_arch_context *ctx_arch; ++ size_t size; ++ int ret; ++ extern struct vm_operations_struct pfm_buf_map_vm_ops; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ /* ++ * allocate buffer + map desc ++ */ ++ ret = pfm_smpl_buf_alloc(ctx, rsize); ++ if (ret) ++ return ret; ++ ++ size = ctx->smpl_size; ++ ++ ++ /* allocate vma */ ++ vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); ++ if (!vma) { ++ PFM_DBG("Cannot allocate vma"); ++ goto error_kmem; ++ } ++ memset(vma, 0, sizeof(*vma)); ++ ++ /* ++ * partially initialize the vma for the sampling buffer ++ */ ++ vma->vm_mm = mm; ++ vma->vm_flags = VM_READ | VM_MAYREAD | VM_RESERVED; ++ vma->vm_page_prot = PAGE_READONLY; ++ vma->vm_ops = &pfm_buf_map_vm_ops; ++ vma->vm_file = filp; ++ vma->vm_private_data = ctx; ++ vma->vm_pgoff = 0; ++ ++ /* ++ * simulate effect of mmap() ++ */ ++ get_file(filp); ++ ++ /* ++ * Let's do the difficult operations next. ++ * ++ * now we atomically find some area in the address space and ++ * remap the buffer into it. 
++ */ ++ down_write(¤t->mm->mmap_sem); ++ ++ /* find some free area in address space, must have mmap sem held */ ++ vma->vm_start = get_unmapped_area(NULL, 0, size, 0, ++ MAP_PRIVATE|MAP_ANONYMOUS); ++ if (vma->vm_start == 0) { ++ PFM_DBG("cannot find unmapped area of size %zu", size); ++ up_write(¤t->mm->mmap_sem); ++ goto error; ++ } ++ vma->vm_end = vma->vm_start + size; ++ ++ PFM_DBG("aligned_size=%zu mapped @0x%lx", size, vma->vm_start); ++ /* ++ * now insert the vma in the vm list for the process, must be ++ * done with mmap lock held ++ */ ++ insert_vm_struct(mm, vma); ++ ++ mm->total_vm += size >> PAGE_SHIFT; ++ ++ up_write(¤t->mm->mmap_sem); ++ ++ /* ++ * IMPORTANT: we do not issue the fput() ++ * because we want to increase the ref count ++ * on the descriptor to simulate what mmap() ++ * would do ++ */ ++ ++ /* ++ * used to propagate vaddr to syscall stub ++ */ ++ ctx_arch->ctx_smpl_vaddr = (void *)vma->vm_start; ++ ++ return 0; ++error: ++ kmem_cache_free(vm_area_cachep, vma); ++error_kmem: ++ pfm_smpl_buf_space_release(ctx, ctx->smpl_size); ++ vfree(ctx->smpl_addr); ++ return -ENOMEM; ++} ++ ++#define PFM_DEFAULT_SMPL_UUID { \ ++ 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\ ++ 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} ++ ++static pfm_uuid_t old_default_uuid = PFM_DEFAULT_SMPL_UUID; ++static pfm_uuid_t null_uuid; ++ ++/* ++ * function invoked in case, pfm_context_create fails ++ * at the last operation, copy_to_user. It needs to ++ * undo memory allocations and free the file descriptor ++ */ ++static void pfm_undo_create_context_fd(int fd, struct pfm_context *ctx) ++{ ++ struct files_struct *files = current->files; ++ struct file *file; ++ int fput_needed; ++ ++ file = fget_light(fd, &fput_needed); ++ /* ++ * there is no fd_uninstall(), so we do it ++ * here. put_unused_fd() does not remove the ++ * effect of fd_install(). 
++ */ ++ ++ spin_lock(&files->file_lock); ++ files->fd_array[fd] = NULL; ++ spin_unlock(&files->file_lock); ++ ++ fput_light(file, fput_needed); ++ ++ /* ++ * decrement ref count and kill file ++ */ ++ put_filp(file); ++ ++ put_unused_fd(fd); ++ ++ pfm_free_context(ctx); ++} ++ ++static int pfm_get_smpl_arg_old(pfm_uuid_t uuid, void __user *fmt_uarg, ++ size_t usize, void **arg, ++ struct pfm_smpl_fmt **fmt) ++{ ++ struct pfm_smpl_fmt *f; ++ void *addr = NULL; ++ size_t sz; ++ int ret; ++ ++ if (!memcmp(uuid, null_uuid, sizeof(pfm_uuid_t))) ++ return 0; ++ ++ if (memcmp(uuid, old_default_uuid, sizeof(pfm_uuid_t))) { ++ PFM_DBG("compatibility mode supports only default sampling format"); ++ return -EINVAL; ++ } ++ /* ++ * find fmt and increase refcount ++ */ ++ f = pfm_smpl_fmt_get("default-old"); ++ if (f == NULL) { ++ PFM_DBG("default-old buffer format not found"); ++ return -EINVAL; ++ } ++ ++ /* ++ * expected format argument size ++ */ ++ sz = f->fmt_arg_size; ++ ++ /* ++ * check user size matches expected size ++ * usize = -1 is for IA-64 backward compatibility ++ */ ++ ret = -EINVAL; ++ if (sz != usize && usize != -1) { ++ PFM_DBG("invalid arg size %zu, format expects %zu", ++ usize, sz); ++ goto error; ++ } ++ ++ ret = -ENOMEM; ++ addr = kmalloc(sz, GFP_KERNEL); ++ if (addr == NULL) ++ goto error; ++ ++ ret = -EFAULT; ++ if (copy_from_user(addr, fmt_uarg, sz)) ++ goto error; ++ ++ *arg = addr; ++ *fmt = f; ++ return 0; ++ ++error: ++ kfree(addr); ++ pfm_smpl_fmt_put(f); ++ return ret; ++} ++ ++static long pfm_create_context_old(int fd, void __user *ureq, int count) ++{ ++ struct pfm_context *new_ctx; ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_smpl_fmt *fmt = NULL; ++ struct pfarg_context req_old; ++ void __user *usmpl_arg; ++ void *smpl_arg = NULL; ++ struct pfarg_ctx req; ++ int ret; ++ ++ if (count != 1) ++ return -EINVAL; ++ ++ if (copy_from_user(&req_old, ureq, sizeof(req_old))) ++ return -EFAULT; ++ ++ memset(&req, 0, sizeof(req)); ++ ++ /* ++ * 
sampling format args are following pfarg_context ++ */ ++ usmpl_arg = ureq+sizeof(req_old); ++ ++ ret = pfm_get_smpl_arg_old(req_old.ctx_smpl_buf_id, usmpl_arg, -1, ++ &smpl_arg, &fmt); ++ if (ret) ++ return ret; ++ ++ req.ctx_flags = req_old.ctx_flags; ++ ++ /* ++ * returns file descriptor if >=0, or error code */ ++ ret = __pfm_create_context(&req, fmt, smpl_arg, PFM_COMPAT, &new_ctx); ++ if (ret >= 0) { ++ ctx_arch = pfm_ctx_arch(new_ctx); ++ req_old.ctx_fd = ret; ++ req_old.ctx_smpl_vaddr = ctx_arch->ctx_smpl_vaddr; ++ } ++ ++ if (copy_to_user(ureq, &req_old, sizeof(req_old))) { ++ pfm_undo_create_context_fd(req_old.ctx_fd, new_ctx); ++ ret = -EFAULT; ++ } ++ ++ kfree(smpl_arg); ++ ++ return ret; ++} ++ ++/* ++ * obsolete call: use /proc/perfmon ++ */ ++static long pfm_get_features_old(int fd, void __user *arg, int count) ++{ ++ struct pfarg_features req; ++ int ret = 0; ++ ++ if (count != 1) ++ return -EINVAL; ++ ++ memset(&req, 0, sizeof(req)); ++ ++ req.ft_version = PFM_VERSION; ++ ++ if (copy_to_user(arg, &req, sizeof(req))) ++ ret = -EFAULT; ++ ++ return ret; ++} ++ ++static long pfm_debug_old(int fd, void __user *arg, int count) ++{ ++ int m; ++ ++ if (count != 1) ++ return -EINVAL; ++ ++ if (get_user(m, (int __user *)arg)) ++ return -EFAULT; ++ ++ ++ pfm_controls.debug = m == 0 ? 0 : 1; ++ ++ PFM_INFO("debugging %s (timing reset)", ++ pfm_controls.debug ? 
"on" : "off"); ++ ++ if (m == 0) ++ for_each_online_cpu(m) { ++ memset(&per_cpu(pfm_stats, m), 0, ++ sizeof(struct pfm_stats)); ++ } ++ return 0; ++} ++ ++static long pfm_unload_context_old(int fd, void __user *arg, int count) ++{ ++ if (count) ++ return -EINVAL; ++ ++ return sys_pfm_unload_context(fd); ++} ++ ++static long pfm_restart_old(int fd, void __user *arg, int count) ++{ ++ if (count) ++ return -EINVAL; ++ ++ return sys_pfm_restart(fd); ++} ++ ++static long pfm_stop_old(int fd, void __user *arg, int count) ++{ ++ if (count) ++ return -EINVAL; ++ ++ return sys_pfm_stop(fd); ++} ++ ++static long pfm_start_old(int fd, void __user *arg, int count) ++{ ++ if (count > 1) ++ return -EINVAL; ++ ++ return sys_pfm_start(fd, arg); ++} ++ ++static long pfm_load_context_old(int fd, void __user *ureq, int count) ++{ ++ if (count != 1) ++ return -EINVAL; ++ ++ return sys_pfm_load_context(fd, ureq); ++} ++ ++/* ++ * perfmon command descriptions ++ */ ++struct pfm_cmd_desc { ++ long (*cmd_func)(int fd, void __user *arg, int count); ++}; ++ ++/* ++ * functions MUST be listed in the increasing order of ++ * their index (see permfon.h) ++ */ ++#define PFM_CMD(name) \ ++ { .cmd_func = name, \ ++ } ++#define PFM_CMD_NONE \ ++ { .cmd_func = NULL \ ++ } ++ ++static struct pfm_cmd_desc pfm_cmd_tab[] = { ++/* 0 */PFM_CMD_NONE, ++/* 1 */PFM_CMD(pfm_write_pmcs_old), ++/* 2 */PFM_CMD(pfm_write_pmds_old), ++/* 3 */PFM_CMD(pfm_read_pmds_old), ++/* 4 */PFM_CMD(pfm_stop_old), ++/* 5 */PFM_CMD(pfm_start_old), ++/* 6 */PFM_CMD_NONE, ++/* 7 */PFM_CMD_NONE, ++/* 8 */PFM_CMD(pfm_create_context_old), ++/* 9 */PFM_CMD_NONE, ++/* 10 */PFM_CMD(pfm_restart_old), ++/* 11 */PFM_CMD_NONE, ++/* 12 */PFM_CMD(pfm_get_features_old), ++/* 13 */PFM_CMD(pfm_debug_old), ++/* 14 */PFM_CMD_NONE, ++/* 15 */PFM_CMD(pfm_get_default_pmcs_old), ++/* 16 */PFM_CMD(pfm_load_context_old), ++/* 17 */PFM_CMD(pfm_unload_context_old), ++/* 18 */PFM_CMD_NONE, ++/* 19 */PFM_CMD_NONE, ++/* 20 */PFM_CMD_NONE, ++/* 21 
*/PFM_CMD_NONE, ++/* 22 */PFM_CMD_NONE, ++/* 23 */PFM_CMD_NONE, ++/* 24 */PFM_CMD_NONE, ++/* 25 */PFM_CMD_NONE, ++/* 26 */PFM_CMD_NONE, ++/* 27 */PFM_CMD_NONE, ++/* 28 */PFM_CMD_NONE, ++/* 29 */PFM_CMD_NONE, ++/* 30 */PFM_CMD_NONE, ++/* 31 */PFM_CMD_NONE, ++/* 32 */PFM_CMD(pfm_write_ibrs_old), ++/* 33 */PFM_CMD(pfm_write_dbrs_old), ++}; ++#define PFM_CMD_COUNT ARRAY_SIZE(pfm_cmd_tab) ++ ++/* ++ * system-call entry point (must return long) ++ */ ++asmlinkage long sys_perfmonctl(int fd, int cmd, void __user *arg, int count) ++{ ++ if (perfmon_disabled) ++ return -ENOSYS; ++ ++ if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT ++ || pfm_cmd_tab[cmd].cmd_func == NULL)) { ++ PFM_DBG("invalid cmd=%d", cmd); ++ return -EINVAL; ++ } ++ return (long)pfm_cmd_tab[cmd].cmd_func(fd, arg, count); ++} ++ ++/* ++ * Called from pfm_read() for a perfmon v2.0 context. ++ * ++ * compatibility mode pfm_read() routine. We need a separate ++ * routine because the definition of the message has changed. ++ * The pfm_msg and pfarg_msg structures are different. ++ * ++ * return: sizeof(pfm_msg_t) on success, -errno otherwise ++ */ ++ssize_t pfm_arch_compat_read(struct pfm_context *ctx, ++ char __user *buf, ++ int non_block, ++ size_t size) ++{ ++ union pfarg_msg msg_buf; ++ pfm_msg_t old_msg_buf; ++ pfm_ovfl_msg_t *o_msg; ++ struct pfarg_ovfl_msg *n_msg; ++ int ret; ++ ++ PFM_DBG("msg=%p size=%zu", buf, size); ++ ++ /* ++ * cannot extract partial messages. ++ * check even when there is no message ++ * ++ * cannot extract more than one message per call. Bytes ++ * above sizeof(msg) are ignored. 
++ */ ++ if (size < sizeof(old_msg_buf)) { ++ PFM_DBG("message is too small size=%zu must be >=%zu)", ++ size, ++ sizeof(old_msg_buf)); ++ return -EINVAL; ++ } ++ ++ ret = __pfm_read(ctx, &msg_buf, non_block); ++ if (ret < 1) ++ return ret; ++ ++ /* ++ * force return value to old message size ++ */ ++ ret = sizeof(old_msg_buf); ++ ++ o_msg = &old_msg_buf.pfm_ovfl_msg; ++ n_msg = &msg_buf.pfm_ovfl_msg; ++ ++ switch (msg_buf.type) { ++ case PFM_MSG_OVFL: ++ o_msg->msg_type = PFM_MSG_OVFL; ++ o_msg->msg_ctx_fd = 0; ++ o_msg->msg_active_set = n_msg->msg_active_set; ++ o_msg->msg_tstamp = 0; ++ ++ o_msg->msg_ovfl_pmds[0] = n_msg->msg_ovfl_pmds[0]; ++ o_msg->msg_ovfl_pmds[1] = n_msg->msg_ovfl_pmds[1]; ++ o_msg->msg_ovfl_pmds[2] = n_msg->msg_ovfl_pmds[2]; ++ o_msg->msg_ovfl_pmds[3] = n_msg->msg_ovfl_pmds[3]; ++ break; ++ case PFM_MSG_END: ++ o_msg->msg_type = PFM_MSG_END; ++ o_msg->msg_ctx_fd = 0; ++ o_msg->msg_tstamp = 0; ++ break; ++ default: ++ PFM_DBG("unknown msg type=%d", msg_buf.type); ++ } ++ if (copy_to_user(buf, &old_msg_buf, sizeof(old_msg_buf))) ++ ret = -EFAULT; ++ PFM_DBG_ovfl("ret=%d", ret); ++ return ret; ++} ++ ++/* ++ * legacy /proc/perfmon simplified interface (we only maintain the ++ * global information (no more per-cpu stats, use ++ * /sys/devices/system/cpu/cpuXX/perfmon ++ */ ++static struct proc_dir_entry *perfmon_proc; ++ ++static void *pfm_proc_start(struct seq_file *m, loff_t *pos) ++{ ++ if (*pos == 0) ++ return (void *)1; ++ ++ return NULL; ++} ++ ++static void *pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ ++*pos; ++ return pfm_proc_start(m, pos); ++} ++ ++static void pfm_proc_stop(struct seq_file *m, void *v) ++{ ++} ++ ++/* ++ * this is a simplified version of the legacy /proc/perfmon. 
++ * We have retained ONLY the key information that tools are actually ++ * using ++ */ ++static void pfm_proc_show_header(struct seq_file *m) ++{ ++ char buf[128]; ++ ++ pfm_sysfs_res_show(buf, sizeof(buf), 3); ++ ++ seq_printf(m, "perfmon version : %u.%u\n", ++ PFM_VERSION_MAJ, PFM_VERSION_MIN); ++ ++ seq_printf(m, "model : %s", buf); ++} ++ ++static int pfm_proc_show(struct seq_file *m, void *v) ++{ ++ pfm_proc_show_header(m); ++ return 0; ++} ++ ++struct seq_operations pfm_proc_seq_ops = { ++ .start = pfm_proc_start, ++ .next = pfm_proc_next, ++ .stop = pfm_proc_stop, ++ .show = pfm_proc_show ++}; ++ ++static int pfm_proc_open(struct inode *inode, struct file *file) ++{ ++ return seq_open(file, &pfm_proc_seq_ops); ++} ++ ++ ++static struct file_operations pfm_proc_fops = { ++ .open = pfm_proc_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ ++/* ++ * called from pfm_arch_init(), global initialization, called once ++ */ ++int __init pfm_ia64_compat_init(void) ++{ ++ /* ++ * create /proc/perfmon ++ */ ++ perfmon_proc = create_proc_entry("perfmon", S_IRUGO, NULL); ++ if (perfmon_proc == NULL) { ++ PFM_ERR("cannot create /proc entry, perfmon disabled"); ++ return -1; ++ } ++ perfmon_proc->proc_fops = &pfm_proc_fops; ++ return 0; ++} +diff --git a/arch/ia64/perfmon/perfmon_default_smpl.c b/arch/ia64/perfmon/perfmon_default_smpl.c +new file mode 100644 +index 0000000..b408a13 +--- /dev/null ++++ b/arch/ia64/perfmon/perfmon_default_smpl.c +@@ -0,0 +1,273 @@ ++/* ++ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file implements the old default sampling buffer format ++ * for the Linux/ia64 perfmon-2 subsystem. This is for backward ++ * compatibility only. 
use the new default format in perfmon/ ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/types.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/delay.h> ++#include <linux/smp.h> ++#include <linux/sysctl.h> ++ ++#ifdef MODULE ++#define FMT_FLAGS 0 ++#else ++#define FMT_FLAGS PFM_FMTFL_IS_BUILTIN ++#endif ++ ++#include <linux/perfmon_kern.h> ++#include <asm/perfmon_default_smpl.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("perfmon old default sampling format"); ++MODULE_LICENSE("GPL"); ++ ++static int pfm_default_fmt_validate(u32 flags, u16 npmds, void *data) ++{ ++ struct pfm_default_smpl_arg *arg = data; ++ size_t min_buf_size; ++ ++ if (data == NULL) { ++ PFM_DBG("no argument passed"); ++ return -EINVAL; ++ } ++ ++ /* ++ * compute min buf size. 
All PMD are manipulated as 64bit entities ++ */ ++ min_buf_size = sizeof(struct pfm_default_smpl_hdr) ++ + (sizeof(struct pfm_default_smpl_entry) + (npmds*sizeof(u64))); ++ ++ PFM_DBG("validate flags=0x%x npmds=%u min_buf_size=%lu " ++ "buf_size=%lu CPU%d", flags, npmds, min_buf_size, ++ arg->buf_size, smp_processor_id()); ++ ++ /* ++ * must hold at least the buffer header + one minimally sized entry ++ */ ++ if (arg->buf_size < min_buf_size) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int pfm_default_fmt_get_size(unsigned int flags, void *data, ++ size_t *size) ++{ ++ struct pfm_default_smpl_arg *arg = data; ++ ++ /* ++ * size has been validated in default_validate ++ */ ++ *size = arg->buf_size; ++ ++ return 0; ++} ++ ++static int pfm_default_fmt_init(struct pfm_context *ctx, void *buf, ++ u32 flags, u16 npmds, void *data) ++{ ++ struct pfm_default_smpl_hdr *hdr; ++ struct pfm_default_smpl_arg *arg = data; ++ ++ hdr = buf; ++ ++ hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; ++ hdr->hdr_buf_size = arg->buf_size; ++ hdr->hdr_cur_offs = sizeof(*hdr); ++ hdr->hdr_overflows = 0; ++ hdr->hdr_count = 0; ++ ++ PFM_DBG("buffer=%p buf_size=%lu hdr_size=%lu " ++ "hdr_version=%u cur_offs=%lu", ++ buf, ++ hdr->hdr_buf_size, ++ sizeof(*hdr), ++ hdr->hdr_version, ++ hdr->hdr_cur_offs); ++ ++ return 0; ++} ++ ++static int pfm_default_fmt_handler(struct pfm_context *ctx, ++ unsigned long ip, u64 tstamp, void *data) ++{ ++ struct pfm_default_smpl_hdr *hdr; ++ struct pfm_default_smpl_entry *ent; ++ void *cur, *last, *buf; ++ u64 *e; ++ size_t entry_size; ++ u16 npmds, i, ovfl_pmd; ++ struct pfm_ovfl_arg *arg; ++ ++ hdr = ctx->smpl_addr; ++ arg = &ctx->ovfl_arg; ++ ++ buf = hdr; ++ cur = buf+hdr->hdr_cur_offs; ++ last = buf+hdr->hdr_buf_size; ++ ovfl_pmd = arg->ovfl_pmd; ++ ++ /* ++ * precheck for sanity ++ */ ++ if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) ++ goto full; ++ ++ npmds = arg->num_smpl_pmds; ++ ++ ent = cur; ++ ++ prefetch(arg->smpl_pmds_values); ++ ++ 
entry_size = sizeof(*ent) + (npmds << 3); ++ ++ /* position for first pmd */ ++ e = (unsigned long *)(ent+1); ++ ++ hdr->hdr_count++; ++ ++ PFM_DBG_ovfl("count=%lu cur=%p last=%p free_bytes=%lu " ++ "ovfl_pmd=%d npmds=%u", ++ hdr->hdr_count, ++ cur, last, ++ last-cur, ++ ovfl_pmd, ++ npmds); ++ ++ /* ++ * current = task running at the time of the overflow. ++ * ++ * per-task mode: ++ * - this is ususally the task being monitored. ++ * Under certain conditions, it might be a different task ++ * ++ * system-wide: ++ * - this is not necessarily the task controlling the session ++ */ ++ ent->pid = current->pid; ++ ent->ovfl_pmd = ovfl_pmd; ++ ent->last_reset_val = arg->pmd_last_reset; ++ ++ /* ++ * where did the fault happen (includes slot number) ++ */ ++ ent->ip = ip; ++ ++ ent->tstamp = tstamp; ++ ent->cpu = smp_processor_id(); ++ ent->set = arg->active_set; ++ ent->tgid = current->tgid; ++ ++ /* ++ * selectively store PMDs in increasing index number ++ */ ++ if (npmds) { ++ u64 *val = arg->smpl_pmds_values; ++ for (i = 0; i < npmds; i++) ++ *e++ = *val++; ++ } ++ ++ /* ++ * update position for next entry ++ */ ++ hdr->hdr_cur_offs += entry_size; ++ cur += entry_size; ++ ++ /* ++ * post check to avoid losing the last sample ++ */ ++ if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) ++ goto full; ++ ++ /* ++ * reset before returning from interrupt handler ++ */ ++ arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; ++ return 0; ++full: ++ PFM_DBG_ovfl("smpl buffer full free=%lu, count=%lu", ++ last-cur, hdr->hdr_count); ++ ++ /* ++ * increment number of buffer overflow. ++ * important to detect duplicate set of samples. ++ */ ++ hdr->hdr_overflows++; ++ ++ /* ++ * request notification and masking of monitoring. 
++ * Notification is still subject to the overflowed ++ */ ++ arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK; ++ ++ return -ENOBUFS; /* we are full, sorry */ ++} ++ ++static int pfm_default_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf) ++{ ++ struct pfm_default_smpl_hdr *hdr; ++ ++ hdr = buf; ++ ++ hdr->hdr_count = 0; ++ hdr->hdr_cur_offs = sizeof(*hdr); ++ ++ *ovfl_ctrl = PFM_OVFL_CTRL_RESET; ++ ++ return 0; ++} ++ ++static int pfm_default_fmt_exit(void *buf) ++{ ++ return 0; ++} ++ ++static struct pfm_smpl_fmt default_fmt = { ++ .fmt_name = "default-old", ++ .fmt_version = 0x10000, ++ .fmt_arg_size = sizeof(struct pfm_default_smpl_arg), ++ .fmt_validate = pfm_default_fmt_validate, ++ .fmt_getsize = pfm_default_fmt_get_size, ++ .fmt_init = pfm_default_fmt_init, ++ .fmt_handler = pfm_default_fmt_handler, ++ .fmt_restart = pfm_default_fmt_restart, ++ .fmt_exit = pfm_default_fmt_exit, ++ .fmt_flags = FMT_FLAGS, ++ .owner = THIS_MODULE ++}; ++ ++static int pfm_default_fmt_init_module(void) ++{ ++ int ret; ++ ++ return pfm_fmt_register(&default_fmt); ++ return ret; ++} ++ ++static void pfm_default_fmt_cleanup_module(void) ++{ ++ pfm_fmt_unregister(&default_fmt); ++} ++ ++module_init(pfm_default_fmt_init_module); ++module_exit(pfm_default_fmt_cleanup_module); +diff --git a/arch/ia64/perfmon/perfmon_generic.c b/arch/ia64/perfmon/perfmon_generic.c +new file mode 100644 +index 0000000..47b1870 +--- /dev/null ++++ b/arch/ia64/perfmon/perfmon_generic.c +@@ -0,0 +1,148 @@ ++/* ++ * This file contains the generic PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. ++ * contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++#include <asm/pal.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("Generic IA-64 PMU description tables"); ++MODULE_LICENSE("GPL"); ++ ++#define RDEP(x) (1UL << (x)) ++ ++#define PFM_IA64GEN_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)) ++#define PFM_IA64GEN_RSVD (0xffffffffffff0080UL) ++#define PFM_IA64GEN_NO64 (1UL<<5) ++ ++/* forward declaration */ ++static struct pfm_pmu_config pfm_ia64gen_pmu_conf; ++ ++static struct pfm_arch_pmu_info pfm_ia64gen_pmu_info = { ++ .mask_pmcs = {PFM_IA64GEN_MASK_PMCS,}, ++}; ++ ++static struct pfm_regmap_desc pfm_ia64gen_pmc_desc[] = { ++/* pmc0 */ PMX_NA, ++/* pmc1 */ PMX_NA, ++/* pmc2 */ PMX_NA, ++/* pmc3 */ PMX_NA, ++/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 4), ++/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 5), ++/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 6), ++/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 7) ++}; ++#define PFM_IA64GEN_NUM_PMCS ARRAY_SIZE(pfm_ia64gen_pmc_desc) ++ ++static struct pfm_regmap_desc pfm_ia64gen_pmd_desc[] = { ++/* pmd0 */ PMX_NA, ++/* pmd1 */ PMX_NA, ++/* pmd2 */ PMX_NA, ++/* pmd3 */ PMX_NA, ++/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4), ++/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5), ++/* pmd6 */ PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6), ++/* pmd7 */ PMD_DP(PFM_REG_C, 
"PMD7", 7, 1ull << 7) ++}; ++#define PFM_IA64GEN_NUM_PMDS ARRAY_SIZE(pfm_ia64gen_pmd_desc) ++ ++static int pfm_ia64gen_pmc_check(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req) ++{ ++#define PFM_IA64GEN_PMC_PM_POS6 (1UL<<6) ++ u64 tmpval; ++ int is_system; ++ ++ is_system = ctx->flags.system; ++ tmpval = req->reg_value; ++ ++ switch (req->reg_num) { ++ case 4: ++ case 5: ++ case 6: ++ case 7: ++ /* set pmc.oi for 64-bit emulation */ ++ tmpval |= 1UL << 5; ++ ++ if (is_system) ++ tmpval |= PFM_IA64GEN_PMC_PM_POS6; ++ else ++ tmpval &= ~PFM_IA64GEN_PMC_PM_POS6; ++ break; ++ ++ } ++ req->reg_value = tmpval; ++ ++ return 0; ++} ++ ++/* ++ * matches anything ++ */ ++static int pfm_ia64gen_probe_pmu(void) ++{ ++ u64 pm_buffer[16]; ++ pal_perf_mon_info_u_t pm_info; ++ ++ /* ++ * call PAL_PERFMON_INFO to retrieve counter width which ++ * is implementation specific ++ */ ++ if (ia64_pal_perf_mon_info(pm_buffer, &pm_info)) ++ return -1; ++ ++ pfm_ia64gen_pmu_conf.counter_width = pm_info.pal_perf_mon_info_s.width; ++ ++ return 0; ++} ++ ++/* ++ * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
++ */ ++static struct pfm_pmu_config pfm_ia64gen_pmu_conf = { ++ .pmu_name = "Generic IA-64", ++ .counter_width = 0, /* computed from PAL_PERFMON_INFO */ ++ .pmd_desc = pfm_ia64gen_pmd_desc, ++ .pmc_desc = pfm_ia64gen_pmc_desc, ++ .probe_pmu = pfm_ia64gen_probe_pmu, ++ .num_pmc_entries = PFM_IA64GEN_NUM_PMCS, ++ .num_pmd_entries = PFM_IA64GEN_NUM_PMDS, ++ .pmc_write_check = pfm_ia64gen_pmc_check, ++ .version = "1.0", ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmu_info = &pfm_ia64gen_pmu_info ++ /* no read/write checkers */ ++}; ++ ++static int __init pfm_gen_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_ia64gen_pmu_conf); ++} ++ ++static void __exit pfm_gen_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_ia64gen_pmu_conf); ++} ++ ++module_init(pfm_gen_pmu_init_module); ++module_exit(pfm_gen_pmu_cleanup_module); +diff --git a/arch/ia64/perfmon/perfmon_itanium.c b/arch/ia64/perfmon/perfmon_itanium.c +new file mode 100644 +index 0000000..094b31b +--- /dev/null ++++ b/arch/ia64/perfmon/perfmon_itanium.c +@@ -0,0 +1,232 @@ ++/* ++ * This file contains the Itanium PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("Itanium (Merced) PMU description tables"); ++MODULE_LICENSE("GPL"); ++ ++#define RDEP(x) (1ULL << (x)) ++ ++#define PFM_ITA_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\ ++ RDEP(12)) ++ ++#define PFM_ITA_NO64 (1ULL<<5) ++ ++static struct pfm_arch_pmu_info pfm_ita_pmu_info = { ++ .mask_pmcs = {PFM_ITA_MASK_PMCS,}, ++}; ++/* reserved bits are 1 in the mask */ ++#define PFM_ITA_RSVD 0xfffffffffc8000a0UL ++/* ++ * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using ++ * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information ++ * but this is fine because they are handled separately in the IA-64 specific ++ * code. 
++ */ ++static struct pfm_regmap_desc pfm_ita_pmc_desc[] = { ++/* pmc0 */ PMX_NA, ++/* pmc1 */ PMX_NA, ++/* pmc2 */ PMX_NA, ++/* pmc3 */ PMX_NA, ++/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 4), ++/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 5), ++/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 6), ++/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 7), ++/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 8), ++/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 9), ++/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xfffffffff3f0ff30UL, 0, 10), ++/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x10000000UL, 0xffffffffecf0ff30UL, 0, 11), ++/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0030UL, 0, 12), ++/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x3ffff00000001UL, 0xfffffffffffffffeUL, 0, 13), ++/* pmc14 */ PMX_NA, ++/* pmc15 */ PMX_NA, ++/* pmc16 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc24 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc32 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc40 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, 
PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0), ++/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1), ++/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2), ++/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3), ++/* pmc260 */ PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4), ++/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5), ++/* pmc262 */ PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6), ++/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7), ++/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0), ++/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 
0x0, 0xc000000000000000UL, 0, 1), ++/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2), ++/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3), ++/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4), ++/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5), ++/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6), ++/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7) ++}; ++#define PFM_ITA_NUM_PMCS ARRAY_SIZE(pfm_ita_pmc_desc) ++ ++static struct pfm_regmap_desc pfm_ita_pmd_desc[] = { ++/* pmd0 */ PMD_DP(PFM_REG_I , "PMD0", 0, 1ull << 10), ++/* pmd1 */ PMD_DP(PFM_REG_I , "PMD1", 1, 1ull << 10), ++/* pmd2 */ PMD_DP(PFM_REG_I , "PMD2", 2, 1ull << 11), ++/* pmd3 */ PMD_DP(PFM_REG_I , "PMD3", 3, 1ull << 11), ++/* pmd4 */ PMD_DP(PFM_REG_C , "PMD4", 4, 1ull << 4), ++/* pmd5 */ PMD_DP(PFM_REG_C , "PMD5", 5, 1ull << 5), ++/* pmd6 */ PMD_DP(PFM_REG_C , "PMD6", 6, 1ull << 6), ++/* pmd7 */ PMD_DP(PFM_REG_C , "PMD7", 7, 1ull << 7), ++/* pmd8 */ PMD_DP(PFM_REG_I , "PMD8", 8, 1ull << 12), ++/* pmd9 */ PMD_DP(PFM_REG_I , "PMD9", 9, 1ull << 12), ++/* pmd10 */ PMD_DP(PFM_REG_I , "PMD10", 10, 1ull << 12), ++/* pmd11 */ PMD_DP(PFM_REG_I , "PMD11", 11, 1ull << 12), ++/* pmd12 */ PMD_DP(PFM_REG_I , "PMD12", 12, 1ull << 12), ++/* pmd13 */ PMD_DP(PFM_REG_I , "PMD13", 13, 1ull << 12), ++/* pmd14 */ PMD_DP(PFM_REG_I , "PMD14", 14, 1ull << 12), ++/* pmd15 */ PMD_DP(PFM_REG_I , "PMD15", 15, 1ull << 12), ++/* pmd16 */ PMD_DP(PFM_REG_I , "PMD16", 16, 1ull << 12), ++/* pmd17 */ PMD_DP(PFM_REG_I , "PMD17", 17, 1ull << 11) ++}; ++#define PFM_ITA_NUM_PMDS ARRAY_SIZE(pfm_ita_pmd_desc) ++/* pmc write checker: adjusts bit 6 of req->reg_value, marks dbregs used if needed */ ++static int pfm_ita_pmc_check(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req) ++{ ++#define PFM_ITA_PMC_PM_POS6 (1UL<<6) ++ struct pfm_arch_context *ctx_arch; ++ u64 tmpval; ++ u16 cnum; ++ int ret = 0, is_system; ++ ++ tmpval = req->reg_value; ++ cnum = req->reg_num; ++ ctx_arch = pfm_ctx_arch(ctx); ++ is_system = 
ctx->flags.system; ++ /* bit 6 of these pmcs is set when ctx->flags.system is set, cleared otherwise */ ++ switch (cnum) { ++ case 4: ++ case 5: ++ case 6: ++ case 7: ++ case 10: ++ case 11: ++ case 12: ++ if (is_system) ++ tmpval |= PFM_ITA_PMC_PM_POS6; ++ else ++ tmpval &= ~PFM_ITA_PMC_PM_POS6; ++ break; ++ } ++ ++ /* ++ * we must clear the (instruction) debug registers if pmc13.ta bit is ++ * cleared before they are written (flags.use_dbr == 0) to avoid ++ * picking up stale information. ++ */ ++ if (cnum == 13 && ((tmpval & 0x1) == 0) ++ && ctx_arch->flags.use_dbr == 0) { ++ PFM_DBG("pmc13 has pmc13.ta cleared, clearing ibr"); ++ ret = pfm_ia64_mark_dbregs_used(ctx, set); ++ if (ret) ++ return ret; ++ } ++ ++ /* ++ * we must clear the (data) debug registers if pmc11.pt bit is cleared ++ * before they are written (flags.use_dbr == 0) to avoid picking up ++ * stale information. ++ */ ++ if (cnum == 11 && ((tmpval >> 28) & 0x1) == 0 ++ && ctx_arch->flags.use_dbr == 0) { ++ PFM_DBG("pmc11 has pmc11.pt cleared, clearing dbr"); ++ ret = pfm_ia64_mark_dbregs_used(ctx, set); ++ if (ret) ++ return ret; ++ } ++ ++ req->reg_value = tmpval; ++ ++ return 0; ++} ++/* probe: returns 0 for cpu family 0x7 unless the platform is "hpsim", else -1 */ ++static int pfm_ita_probe_pmu(void) ++{ ++ return local_cpu_data->family == 0x7 && !ia64_platform_is("hpsim") ++ ? 0 : -1; ++} ++ ++/* ++ * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
++ */ ++static struct pfm_pmu_config pfm_ita_pmu_conf = { ++ .pmu_name = "Itanium", ++ .counter_width = 32, ++ .pmd_desc = pfm_ita_pmd_desc, ++ .pmc_desc = pfm_ita_pmc_desc, ++ .pmc_write_check = pfm_ita_pmc_check, ++ .num_pmc_entries = PFM_ITA_NUM_PMCS, ++ .num_pmd_entries = PFM_ITA_NUM_PMDS, ++ .probe_pmu = pfm_ita_probe_pmu, ++ .version = "1.0", ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmu_info = &pfm_ita_pmu_info ++}; ++/* module_init hook: registers the Itanium PMU description */ ++static int __init pfm_ita_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_ita_pmu_conf); ++} ++/* module_exit hook: unregisters the Itanium PMU description */ ++static void __exit pfm_ita_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_ita_pmu_conf); ++} ++ ++module_init(pfm_ita_pmu_init_module); ++module_exit(pfm_ita_pmu_cleanup_module); ++ +diff --git a/arch/ia64/perfmon/perfmon_mckinley.c b/arch/ia64/perfmon/perfmon_mckinley.c +new file mode 100644 +index 0000000..dc59092 +--- /dev/null ++++ b/arch/ia64/perfmon/perfmon_mckinley.c +@@ -0,0 +1,290 @@ ++/* ++ * This file contains the McKinley PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("Itanium 2 (McKinley) PMU description tables"); ++MODULE_LICENSE("GPL"); ++/* RDEP(x): unsigned long mask with only bit x set */ ++#define RDEP(x) (1UL << (x)) ++/* bits for PMC4-PMC7 and PMC10-PMC12; installed as pfm_mck_pmu_info.mask_pmcs */ ++#define PFM_MCK_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\ ++ RDEP(12)) ++ ++#define PFM_MCK_NO64 (1UL<<5) ++ ++static struct pfm_arch_pmu_info pfm_mck_pmu_info = { ++ .mask_pmcs = {PFM_MCK_MASK_PMCS,}, ++}; ++ ++/* reserved bits are 1 in the mask */ ++#define PFM_ITA2_RSVD 0xfffffffffc8000a0UL ++ ++/* ++ * For debug registers, writing xBR(y) means we also use xBR(y+1). Hence using ++ * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information ++ * but this is fine because they are handled separately in the IA-64 specific ++ * code. 
++ */ ++static struct pfm_regmap_desc pfm_mck_pmc_desc[] = { ++/* pmc0 */ PMX_NA, ++/* pmc1 */ PMX_NA, ++/* pmc2 */ PMX_NA, ++/* pmc3 */ PMX_NA, ++/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x800020UL, 0xfffffffffc8000a0, PFM_MCK_NO64, 4), ++/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 5), ++/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 6), ++/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 7), ++/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xffffffff3fffffffUL, 0xc0000004UL, 0, 8), ++/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xffffffff3ffffffcUL, 0xc0000004UL, 0, 9), ++/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xffffffffffff0000UL, 0, 10), ++/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x0, 0xfffffffffcf0fe30UL, 0, 11), ++/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0000UL, 0, 12), ++/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x2078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 13), ++/* pmc14 */ PMC_D(PFM_REG_W , "PMC14", 0x0db60db60db60db6UL, 0xffffffffffffdb6dUL, 0, 14), ++/* pmc15 */ PMC_D(PFM_REG_W , "PMC15", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 15), ++/* pmc16 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc24 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc32 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc40 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, 
PMX_NA, ++/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0), ++/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1), ++/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2), ++/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3), ++/* pmc260 */ PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4), ++/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5), ++/* pmc262 */ PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6), ++/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 
0x0, 0x8000000000000000UL, 0, 7), ++/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0), ++/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1), ++/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2), ++/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3), ++/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4), ++/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5), ++/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6), ++/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7) ++}; ++#define PFM_MCK_NUM_PMCS ARRAY_SIZE(pfm_mck_pmc_desc) ++ ++static struct pfm_regmap_desc pfm_mck_pmd_desc[] = { ++/* pmd0 */ PMD_DP(PFM_REG_I, "PMD0", 0, 1ull << 10), ++/* pmd1 */ PMD_DP(PFM_REG_I, "PMD1", 1, 1ull << 10), ++/* pmd2 */ PMD_DP(PFM_REG_I, "PMD2", 2, 1ull << 11), ++/* pmd3 */ PMD_DP(PFM_REG_I, "PMD3", 3, 1ull << 11), ++/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4), ++/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5), ++/* pmd6 */ PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6), ++/* pmd7 */ PMD_DP(PFM_REG_C, "PMD7", 7, 1ull << 7), ++/* pmd8 */ PMD_DP(PFM_REG_I, "PMD8", 8, 1ull << 12), ++/* pmd9 */ PMD_DP(PFM_REG_I, "PMD9", 9, 1ull << 12), ++/* pmd10 */ PMD_DP(PFM_REG_I, "PMD10", 10, 1ull << 12), ++/* pmd11 */ PMD_DP(PFM_REG_I, "PMD11", 11, 1ull << 12), ++/* pmd12 */ PMD_DP(PFM_REG_I, "PMD12", 12, 1ull << 12), ++/* pmd13 */ PMD_DP(PFM_REG_I, "PMD13", 13, 1ull << 12), ++/* pmd14 */ PMD_DP(PFM_REG_I, "PMD14", 14, 1ull << 12), ++/* pmd15 */ PMD_DP(PFM_REG_I, "PMD15", 15, 1ull << 12), ++/* pmd16 */ PMD_DP(PFM_REG_I, "PMD16", 16, 1ull << 12), ++/* pmd17 */ PMD_DP(PFM_REG_I, "PMD17", 17, 1ull << 11) ++}; ++#define PFM_MCK_NUM_PMDS ARRAY_SIZE(pfm_mck_pmd_desc) ++/* pmc write checker: PM bit fix-up, pmc8/13/14 cross-check, dbregs marked used if needed */ ++static int pfm_mck_pmc_check(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req) ++{ ++ struct pfm_arch_context *ctx_arch; ++ u64 val8 = 0, val14 = 0, val13 = 0; ++ u64 tmpval; ++ u16 cnum; ++ int ret = 0, check_case1 = 
0; ++ int is_system; ++ ++ tmpval = req->reg_value; ++ cnum = req->reg_num; ++ ctx_arch = pfm_ctx_arch(ctx); ++ is_system = ctx->flags.system; ++ ++#define PFM_MCK_PMC_PM_POS6 (1UL<<6) ++#define PFM_MCK_PMC_PM_POS4 (1UL<<4) ++ /* set/clear the PM_POS bit from is_system; pmc8/13/14 feed the cross-check below */ ++ switch (cnum) { ++ case 4: ++ case 5: ++ case 6: ++ case 7: ++ case 11: ++ case 12: ++ if (is_system) ++ tmpval |= PFM_MCK_PMC_PM_POS6; ++ else ++ tmpval &= ~PFM_MCK_PMC_PM_POS6; ++ break; ++ ++ case 8: ++ val8 = tmpval; ++ val13 = set->pmcs[13]; ++ val14 = set->pmcs[14]; ++ check_case1 = 1; ++ break; ++ ++ case 10: ++ if (is_system) ++ tmpval |= PFM_MCK_PMC_PM_POS4; ++ else ++ tmpval &= ~PFM_MCK_PMC_PM_POS4; ++ break; ++ ++ case 13: ++ val8 = set->pmcs[8]; ++ val13 = tmpval; ++ val14 = set->pmcs[14]; ++ check_case1 = 1; ++ break; ++ ++ case 14: ++ val8 = set->pmcs[8]; ++ val13 = set->pmcs[13]; ++ val14 = tmpval; ++ check_case1 = 1; ++ break; ++ } ++ ++ /* ++ * check illegal configuration which can produce inconsistencies ++ * in tagging i-side events in L1D and L2 caches ++ */ ++ if (check_case1) { ++ ret = (((val13 >> 45) & 0xf) == 0 && ((val8 & 0x1) == 0)) ++ && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) ++ || (((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); ++ ++ if (ret) { ++ PFM_DBG("perfmon: invalid config pmc8=0x%lx " ++ "pmc13=0x%lx pmc14=0x%lx", ++ val8, val13, val14); ++ return -EINVAL; ++ } ++ } ++ ++ /* ++ * check if configuration implicitly activates the use of ++ * the debug registers. If true, then we ensure that this is ++ * possible and that we do not pick up stale values in the HW ++ * registers. ++ * ++ * We postpone the checks of pmc13 and pmc14 to avoid side effects ++ * in case of errors ++ */ ++ ++ /* ++ * pmc13 is "active" if: ++ * one of the pmc13.cfg_dbrpXX fields is different from 0x3 ++ * AND ++ * the corresponding pmc13.ena_dbrpXX is set. 
++ */ ++ if (cnum == 13 && (tmpval & 0x1e00000000000UL) ++ && (tmpval & 0x18181818UL) != 0x18181818UL ++ && ctx_arch->flags.use_dbr == 0) { ++ PFM_DBG("pmc13=0x%lx active", tmpval); ++ ret = pfm_ia64_mark_dbregs_used(ctx, set); ++ if (ret) ++ return ret; ++ } ++ ++ /* ++ * if any pmc14.ibrpX bit is enabled we must clear the ibrs ++ */ ++ if (cnum == 14 && ((tmpval & 0x2222UL) != 0x2222UL) ++ && ctx_arch->flags.use_dbr == 0) { ++ PFM_DBG("pmc14=0x%lx active", tmpval); ++ ret = pfm_ia64_mark_dbregs_used(ctx, set); ++ if (ret) ++ return ret; ++ } ++ ++ req->reg_value = tmpval; ++ ++ return 0; ++} ++/* probe: returns 0 only when the cpu family is 0x1f, else -1 */ ++static int pfm_mck_probe_pmu(void) ++{ ++ return local_cpu_data->family == 0x1f ? 0 : -1; ++} ++ ++/* ++ * impl_pmcs, impl_pmds are computed at runtime to minimize errors! ++ */ ++static struct pfm_pmu_config pfm_mck_pmu_conf = { ++ .pmu_name = "Itanium 2", ++ .counter_width = 47, ++ .pmd_desc = pfm_mck_pmd_desc, ++ .pmc_desc = pfm_mck_pmc_desc, ++ .pmc_write_check = pfm_mck_pmc_check, ++ .num_pmc_entries = PFM_MCK_NUM_PMCS, ++ .num_pmd_entries = PFM_MCK_NUM_PMDS, ++ .probe_pmu = pfm_mck_probe_pmu, ++ .version = "1.0", ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmu_info = &pfm_mck_pmu_info, ++}; ++/* module_init hook: registers the Itanium 2 PMU description */ ++static int __init pfm_mck_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_mck_pmu_conf); ++} ++/* module_exit hook: unregisters the Itanium 2 PMU description */ ++static void __exit pfm_mck_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_mck_pmu_conf); ++} ++ ++module_init(pfm_mck_pmu_init_module); ++module_exit(pfm_mck_pmu_cleanup_module); +diff --git a/arch/ia64/perfmon/perfmon_montecito.c b/arch/ia64/perfmon/perfmon_montecito.c +new file mode 100644 +index 0000000..3f76f73 +--- /dev/null ++++ b/arch/ia64/perfmon/perfmon_montecito.c +@@ -0,0 +1,412 @@ ++/* ++ * This file contains the Montecito PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/smp.h> ++#include <linux/perfmon_kern.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("Dual-Core Itanium 2 (Montecito) PMU description table"); ++MODULE_LICENSE("GPL"); ++/* RDEP(x): unsigned long mask with only bit x set */ ++#define RDEP(x) (1UL << (x)) ++/* bits for PMC4-PMC15, PMC37, PMC39, PMC40, PMC42; used as pfm_mont_pmu_info.mask_pmcs */ ++#define PFM_MONT_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|\ ++ RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|\ ++ RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|\ ++ RDEP(37)|RDEP(39)|RDEP(40)|RDEP(42)) ++ ++#define PFM_MONT_NO64 (1UL<<5) ++ ++static struct pfm_arch_pmu_info pfm_mont_pmu_info = { ++ .mask_pmcs = {PFM_MONT_MASK_PMCS,}, ++}; ++ ++#define PFM_MONT_RSVD 0xffffffff838000a0UL ++/* ++ * ++ * For debug registers, writing xBR(y) means we also use xBR(y+1). Hence using ++ * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information ++ * but this is fine because they are handled separately in the IA-64 specific ++ * code. 
++ * ++ * For PMC4-PMC15, PMC40: we force pmc.ism=2 (IA-64 mode only) ++ */ ++static struct pfm_regmap_desc pfm_mont_pmc_desc[] = { ++/* pmc0 */ PMX_NA, ++/* pmc1 */ PMX_NA, ++/* pmc2 */ PMX_NA, ++/* pmc3 */ PMX_NA, ++/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 4), ++/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 5), ++/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 6), ++/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 7), ++/* pmc8 */ PMC_D(PFM_REG_W64, "PMC8" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 8), ++/* pmc9 */ PMC_D(PFM_REG_W64, "PMC9" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 9), ++/* pmc10 */ PMC_D(PFM_REG_W64, "PMC10", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 10), ++/* pmc11 */ PMC_D(PFM_REG_W64, "PMC11", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 11), ++/* pmc12 */ PMC_D(PFM_REG_W64, "PMC12", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 12), ++/* pmc13 */ PMC_D(PFM_REG_W64, "PMC13", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 13), ++/* pmc14 */ PMC_D(PFM_REG_W64, "PMC14", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 14), ++/* pmc15 */ PMC_D(PFM_REG_W64, "PMC15", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 15), ++/* pmc16 */ PMX_NA, ++/* pmc17 */ PMX_NA, ++/* pmc18 */ PMX_NA, ++/* pmc19 */ PMX_NA, ++/* pmc20 */ PMX_NA, ++/* pmc21 */ PMX_NA, ++/* pmc22 */ PMX_NA, ++/* pmc23 */ PMX_NA, ++/* pmc24 */ PMX_NA, ++/* pmc25 */ PMX_NA, ++/* pmc26 */ PMX_NA, ++/* pmc27 */ PMX_NA, ++/* pmc28 */ PMX_NA, ++/* pmc29 */ PMX_NA, ++/* pmc30 */ PMX_NA, ++/* pmc31 */ PMX_NA, ++/* pmc32 */ PMC_D(PFM_REG_W , "PMC32", 0x30f01ffffffffffUL, 0xfcf0fe0000000000UL, 0, 32), ++/* pmc33 */ PMC_D(PFM_REG_W , "PMC33", 0x0, 0xfffffe0000000000UL, 0, 33), ++/* pmc34 */ PMC_D(PFM_REG_W , "PMC34", 0xf01ffffffffffUL, 0xfff0fe0000000000UL, 0, 34), ++/* pmc35 */ PMC_D(PFM_REG_W , "PMC35", 0x0, 0x1ffffffffffUL, 0, 35), ++/* pmc36 */ PMC_D(PFM_REG_W , 
"PMC36", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 36), ++/* pmc37 */ PMC_D(PFM_REG_W , "PMC37", 0x0, 0xffffffffffffc000UL, 0, 37), ++/* pmc38 */ PMC_D(PFM_REG_W , "PMC38", 0xdb6UL, 0xffffffffffffdb6dUL, 0, 38), ++/* pmc39 */ PMC_D(PFM_REG_W , "PMC39", 0x0, 0xffffffffffff0030UL, 0, 39), ++/* pmc40 */ PMC_D(PFM_REG_W , "PMC40", 0x2000000UL, 0xfffffffffff0fe30UL, 0, 40), ++/* pmc41 */ PMC_D(PFM_REG_W , "PMC41", 0x00002078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 41), ++/* pmc42 */ PMC_D(PFM_REG_W , "PMC42", 0x0, 0xfff800b0UL, 0, 42), ++/* pmc43 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, 
++/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc256 */ PMC_D(PFM_REG_W, "IBR0", 0x0, 0, 0, 0), ++/* pmc257 */ PMC_D(PFM_REG_W, "IBR1", 0x0, 0x8000000000000000UL, 0, 1), ++/* pmc258 */ PMC_D(PFM_REG_W, "IBR2", 0x0, 0, 0, 2), ++/* pmc259 */ PMC_D(PFM_REG_W, "IBR3", 0x0, 0x8000000000000000UL, 0, 3), ++/* pmc260 */ PMC_D(PFM_REG_W, "IBR4", 0x0, 0, 0, 4), ++/* pmc261 */ PMC_D(PFM_REG_W, "IBR5", 0x0, 0x8000000000000000UL, 0, 5), ++/* pmc262 */ PMC_D(PFM_REG_W, "IBR6", 0x0, 0, 0, 6), ++/* pmc263 */ PMC_D(PFM_REG_W, "IBR7", 0x0, 0x8000000000000000UL, 0, 7), ++/* pmc264 */ PMC_D(PFM_REG_W, "DBR0", 0x0, 0, 0, 0), ++/* pmc265 */ PMC_D(PFM_REG_W, "DBR1", 0x0, 0xc000000000000000UL, 0, 1), ++/* pmc266 */ PMC_D(PFM_REG_W, "DBR2", 0x0, 0, 0, 2), ++/* pmc267 */ PMC_D(PFM_REG_W, "DBR3", 0x0, 0xc000000000000000UL, 0, 3), ++/* pmc268 */ PMC_D(PFM_REG_W, "DBR4", 0x0, 0, 0, 4), ++/* pmc269 */ PMC_D(PFM_REG_W, "DBR5", 0x0, 0xc000000000000000UL, 0, 5), ++/* pmc270 */ PMC_D(PFM_REG_W, "DBR6", 0x0, 0, 0, 6), ++/* pmc271 */ PMC_D(PFM_REG_W, "DBR7", 0x0, 0xc000000000000000UL, 0, 7) ++}; ++#define PFM_MONT_NUM_PMCS ARRAY_SIZE(pfm_mont_pmc_desc) ++ ++static struct pfm_regmap_desc pfm_mont_pmd_desc[] = { ++/* pmd0 */ PMX_NA, ++/* pmd1 */ PMX_NA, ++/* pmd2 */ PMX_NA, ++/* pmd3 */ PMX_NA, ++/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4), ++/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5), ++/* pmd6 */ 
PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6), ++/* pmd7 */ PMD_DP(PFM_REG_C, "PMD7", 7, 1ull << 7), ++/* pmd8 */ PMD_DP(PFM_REG_C, "PMD8", 8, 1ull << 8), ++/* pmd9 */ PMD_DP(PFM_REG_C, "PMD9", 9, 1ull << 9), ++/* pmd10 */ PMD_DP(PFM_REG_C, "PMD10", 10, 1ull << 10), ++/* pmd11 */ PMD_DP(PFM_REG_C, "PMD11", 11, 1ull << 11), ++/* pmd12 */ PMD_DP(PFM_REG_C, "PMD12", 12, 1ull << 12), ++/* pmd13 */ PMD_DP(PFM_REG_C, "PMD13", 13, 1ull << 13), ++/* pmd14 */ PMD_DP(PFM_REG_C, "PMD14", 14, 1ull << 14), ++/* pmd15 */ PMD_DP(PFM_REG_C, "PMD15", 15, 1ull << 15), ++/* pmd16 */ PMX_NA, ++/* pmd17 */ PMX_NA, ++/* pmd18 */ PMX_NA, ++/* pmd19 */ PMX_NA, ++/* pmd20 */ PMX_NA, ++/* pmd21 */ PMX_NA, ++/* pmd22 */ PMX_NA, ++/* pmd23 */ PMX_NA, ++/* pmd24 */ PMX_NA, ++/* pmd25 */ PMX_NA, ++/* pmd26 */ PMX_NA, ++/* pmd27 */ PMX_NA, ++/* pmd28 */ PMX_NA, ++/* pmd29 */ PMX_NA, ++/* pmd30 */ PMX_NA, ++/* pmd31 */ PMX_NA, ++/* pmd32 */ PMD_DP(PFM_REG_I, "PMD32", 32, 1ull << 40), ++/* pmd33 */ PMD_DP(PFM_REG_I, "PMD33", 33, 1ull << 40), ++/* pmd34 */ PMD_DP(PFM_REG_I, "PMD34", 34, 1ull << 37), ++/* pmd35 */ PMD_DP(PFM_REG_I, "PMD35", 35, 1ull << 37), ++/* pmd36 */ PMD_DP(PFM_REG_I, "PMD36", 36, 1ull << 40), ++/* pmd37 */ PMX_NA, ++/* pmd38 */ PMD_DP(PFM_REG_I, "PMD38", 38, (1ull<<39)|(1ull<<42)), ++/* pmd39 */ PMD_DP(PFM_REG_I, "PMD39", 39, (1ull<<39)|(1ull<<42)), ++/* pmd40 */ PMX_NA, ++/* pmd41 */ PMX_NA, ++/* pmd42 */ PMX_NA, ++/* pmd43 */ PMX_NA, ++/* pmd44 */ PMX_NA, ++/* pmd45 */ PMX_NA, ++/* pmd46 */ PMX_NA, ++/* pmd47 */ PMX_NA, ++/* pmd48 */ PMD_DP(PFM_REG_I, "PMD48", 48, (1ull<<39)|(1ull<<42)), ++/* pmd49 */ PMD_DP(PFM_REG_I, "PMD49", 49, (1ull<<39)|(1ull<<42)), ++/* pmd50 */ PMD_DP(PFM_REG_I, "PMD50", 50, (1ull<<39)|(1ull<<42)), ++/* pmd51 */ PMD_DP(PFM_REG_I, "PMD51", 51, (1ull<<39)|(1ull<<42)), ++/* pmd52 */ PMD_DP(PFM_REG_I, "PMD52", 52, (1ull<<39)|(1ull<<42)), ++/* pmd53 */ PMD_DP(PFM_REG_I, "PMD53", 53, (1ull<<39)|(1ull<<42)), ++/* pmd54 */ PMD_DP(PFM_REG_I, "PMD54", 54, 
(1ull<<39)|(1ull<<42)), ++/* pmd55 */ PMD_DP(PFM_REG_I, "PMD55", 55, (1ull<<39)|(1ull<<42)), ++/* pmd56 */ PMD_DP(PFM_REG_I, "PMD56", 56, (1ull<<39)|(1ull<<42)), ++/* pmd57 */ PMD_DP(PFM_REG_I, "PMD57", 57, (1ull<<39)|(1ull<<42)), ++/* pmd58 */ PMD_DP(PFM_REG_I, "PMD58", 58, (1ull<<39)|(1ull<<42)), ++/* pmd59 */ PMD_DP(PFM_REG_I, "PMD59", 59, (1ull<<39)|(1ull<<42)), ++/* pmd60 */ PMD_DP(PFM_REG_I, "PMD60", 60, (1ull<<39)|(1ull<<42)), ++/* pmd61 */ PMD_DP(PFM_REG_I, "PMD61", 61, (1ull<<39)|(1ull<<42)), ++/* pmd62 */ PMD_DP(PFM_REG_I, "PMD62", 62, (1ull<<39)|(1ull<<42)), ++/* pmd63 */ PMD_DP(PFM_REG_I, "PMD63", 63, (1ull<<39)|(1ull<<42)) ++}; ++#define PFM_MONT_NUM_PMDS ARRAY_SIZE(pfm_mont_pmd_desc) ++/* set to 1 by pfm_handle_errata(); enables the errata 120/121 checks below */ ++static int pfm_mont_has_ht; ++/* pmc write checker: errata checks, PM bit fix-up, pmc32/38/41 cross-check, dbregs use */ ++static int pfm_mont_pmc_check(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req) ++{ ++ struct pfm_arch_context *ctx_arch; ++ u64 val32 = 0, val38 = 0, val41 = 0; ++ u64 tmpval; ++ u16 cnum; ++ int ret = 0, check_case1 = 0; ++ int is_system; ++ ++ tmpval = req->reg_value; ++ cnum = req->reg_num; ++ ctx_arch = pfm_ctx_arch(ctx); ++ is_system = ctx->flags.system; ++ ++#define PFM_MONT_PMC_PM_POS6 (1UL<<6) ++#define PFM_MONT_PMC_PM_POS4 (1UL<<4) ++ /* set/clear the PM_POS bit from is_system; pmc32/38/41 feed the cross-check below */ ++ switch (cnum) { ++ case 4: ++ case 5: ++ case 6: ++ case 7: ++ case 8: ++ case 9: ++ if (is_system) ++ tmpval |= PFM_MONT_PMC_PM_POS6; ++ else ++ tmpval &= ~PFM_MONT_PMC_PM_POS6; ++ break; ++ case 10: ++ case 11: ++ case 12: ++ case 13: ++ case 14: ++ case 15: ++ if ((req->reg_flags & PFM_REGFL_NO_EMUL64) == 0) { ++ if (pfm_mont_has_ht) { ++ PFM_INFO("perfmon: Errata 121 PMD10/PMD15 cannot be used to overflow" ++ " when threads are on"); ++ return -EINVAL; ++ } ++ } ++ if (is_system) ++ tmpval |= PFM_MONT_PMC_PM_POS6; ++ else ++ tmpval &= ~PFM_MONT_PMC_PM_POS6; ++ break; ++ case 39: ++ case 40: ++ case 42: ++ if (pfm_mont_has_ht && ((req->reg_value >> 8) & 0x7) == 4) { ++ PFM_INFO("perfmon: Errata 120: IP-EAR not available when threads are on"); ++ return 
-EINVAL; ++ } ++ if (is_system) ++ tmpval |= PFM_MONT_PMC_PM_POS6; ++ else ++ tmpval &= ~PFM_MONT_PMC_PM_POS6; ++ break; ++ ++ case 32: ++ val32 = tmpval; ++ val38 = set->pmcs[38]; ++ val41 = set->pmcs[41]; ++ check_case1 = 1; ++ break; ++ ++ case 37: ++ if (is_system) ++ tmpval |= PFM_MONT_PMC_PM_POS4; ++ else ++ tmpval &= ~PFM_MONT_PMC_PM_POS4; ++ break; ++ ++ case 38: ++ val38 = tmpval; ++ val32 = set->pmcs[32]; ++ val41 = set->pmcs[41]; ++ check_case1 = 1; ++ break; ++ case 41: ++ val41 = tmpval; ++ val32 = set->pmcs[32]; ++ val38 = set->pmcs[38]; ++ check_case1 = 1; ++ break; ++ } ++ ++ if (check_case1) { ++ ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) ++ && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) ++ || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); ++ if (ret) { ++ PFM_DBG("perfmon: invalid config pmc38=0x%lx " ++ "pmc41=0x%lx pmc32=0x%lx", ++ val38, val41, val32); ++ return -EINVAL; ++ } ++ } ++ ++ /* ++ * check if configuration implicitly activates the use of the ++ * debug registers. If true, then we ensure that this is possible ++ * and that we do not pick up stale values in the HW registers. ++ */ ++ ++ /* ++ * ++ * pmc41 is "active" if: ++ * one of the pmc41.cfgdtagXX fields is different from 0x3 ++ * AND ++ * the corresponding pmc41.en_dbrpXX is set. 
++ * AND ++ * flags.use_dbr == 0 (dbr not yet used) ++ */ ++ if (cnum == 41 ++ && (tmpval & 0x1e00000000000) ++ && (tmpval & 0x18181818) != 0x18181818 ++ && ctx_arch->flags.use_dbr == 0) { ++ PFM_DBG("pmc41=0x%lx active, clearing dbr", tmpval); ++ ret = pfm_ia64_mark_dbregs_used(ctx, set); ++ if (ret) ++ return ret; ++ } ++ /* ++ * we must clear the (instruction) debug registers if: ++ * pmc38.ig_ibrpX is 0 (enabled) ++ * and ++ * flags.use_dbr == 0 (dbr not yet used) ++ */ ++ if (cnum == 38 && ((tmpval & 0x492) != 0x492) ++ && ctx_arch->flags.use_dbr == 0) { ++ PFM_DBG("pmc38=0x%lx active pmc38, clearing ibr", tmpval); ++ ret = pfm_ia64_mark_dbregs_used(ctx, set); ++ if (ret) ++ return ret; ++ ++ } ++ req->reg_value = tmpval; ++ return 0; ++} ++/* sets pfm_mont_has_ht so pfm_mont_pmc_check() applies the errata 120/121 checks */ ++static void pfm_handle_errata(void) ++{ ++ pfm_mont_has_ht = 1; ++ ++ PFM_INFO("activating workaround for errata 120 " ++ "(Disable IP-EAR when threads are on)"); ++ ++ PFM_INFO("activating workaround for Errata 121 " ++ "(PMC10-PMC15 cannot be used to overflow" ++ " when threads are on)"); ++} ++static int pfm_mont_probe_pmu(void) ++{ ++ if (local_cpu_data->family != 0x20) ++ return -1; ++ ++ /* ++ * the 2 errata must be activated when ++ * threads are/can be enabled ++ */ ++ if (is_multithreading_enabled()) ++ pfm_handle_errata(); ++ ++ return 0; ++} ++ ++/* ++ * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
++ */ ++static struct pfm_pmu_config pfm_mont_pmu_conf = { ++ .pmu_name = "Montecito", ++ .counter_width = 47, ++ .pmd_desc = pfm_mont_pmd_desc, ++ .pmc_desc = pfm_mont_pmc_desc, ++ .num_pmc_entries = PFM_MONT_NUM_PMCS, ++ .num_pmd_entries = PFM_MONT_NUM_PMDS, ++ .pmc_write_check = pfm_mont_pmc_check, ++ .probe_pmu = pfm_mont_probe_pmu, ++ .version = "1.0", ++ .pmu_info = &pfm_mont_pmu_info, ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE ++}; ++ ++static int __init pfm_mont_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_mont_pmu_conf); ++} ++ ++static void __exit pfm_mont_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_mont_pmu_conf); ++} ++ ++module_init(pfm_mont_pmu_init_module); ++module_exit(pfm_mont_pmu_cleanup_module); +diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig +index 1e06d23..b87f445 100644 +--- a/arch/mips/Kconfig ++++ b/arch/mips/Kconfig +@@ -1857,6 +1857,8 @@ config SECCOMP + + If unsure, say Y. Only embedded should say N here. + ++source "arch/mips/perfmon/Kconfig" ++ + endmenu + + config RWSEM_GENERIC_SPINLOCK +diff --git a/arch/mips/Makefile b/arch/mips/Makefile +index 9aab51c..712acf7 100644 +--- a/arch/mips/Makefile ++++ b/arch/mips/Makefile +@@ -154,6 +154,12 @@ endif + endif + + # ++# Perfmon support ++# ++ ++core-$(CONFIG_PERFMON) += arch/mips/perfmon/ ++ ++# + # Firmware support + # + libs-$(CONFIG_ARC) += arch/mips/fw/arc/ +diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c +index 22fc19b..4467361 100644 +--- a/arch/mips/kernel/process.c ++++ b/arch/mips/kernel/process.c +@@ -27,6 +27,7 @@ + #include <linux/completion.h> + #include <linux/kallsyms.h> + #include <linux/random.h> ++#include <linux/perfmon_kern.h> + + #include <asm/asm.h> + #include <asm/bootinfo.h> +@@ -94,6 +95,7 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) + + void exit_thread(void) + { ++ pfm_exit_thread(); + } + + void flush_thread(void) +@@ -162,6 +164,8 @@ int copy_thread(int nr, 
unsigned long clone_flags, unsigned long usp, + if (clone_flags & CLONE_SETTLS) + ti->tp_value = regs->regs[7]; + ++ pfm_copy_thread(p); ++ + return 0; + } + +diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S +index 5e75a31..e96ddd6 100644 +--- a/arch/mips/kernel/scall32-o32.S ++++ b/arch/mips/kernel/scall32-o32.S +@@ -653,6 +653,18 @@ einval: li v0, -EINVAL + sys sys_dup3 3 + sys sys_pipe2 2 + sys sys_inotify_init1 1 ++ sys sys_pfm_create_context 4 /* 4330 */ ++ sys sys_pfm_write_pmcs 3 ++ sys sys_pfm_write_pmds 4 ++ sys sys_pfm_read_pmds 3 ++ sys sys_pfm_load_context 2 ++ sys sys_pfm_start 2 /* 4335 */ ++ sys sys_pfm_stop 1 ++ sys sys_pfm_restart 1 ++ sys sys_pfm_create_evtsets 3 ++ sys sys_pfm_getinfo_evtsets 3 ++ sys sys_pfm_delete_evtsets 3 /* 4340 */ ++ sys sys_pfm_unload_context 1 + .endm + + /* We pre-compute the number of _instruction_ bytes needed to +diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S +index 3d58204..adb2ba9 100644 +--- a/arch/mips/kernel/scall64-64.S ++++ b/arch/mips/kernel/scall64-64.S +@@ -487,4 +487,16 @@ sys_call_table: + PTR sys_dup3 + PTR sys_pipe2 + PTR sys_inotify_init1 ++ PTR sys_pfm_create_context ++ PTR sys_pfm_write_pmcs /* 5290 */ ++ PTR sys_pfm_write_pmds ++ PTR sys_pfm_read_pmds ++ PTR sys_pfm_load_context ++ PTR sys_pfm_start ++ PTR sys_pfm_stop /* 5295 */ ++ PTR sys_pfm_restart ++ PTR sys_pfm_create_evtsets ++ PTR sys_pfm_getinfo_evtsets ++ PTR sys_pfm_delete_evtsets ++ PTR sys_pfm_unload_context /* 5300 */ + .size sys_call_table,.-sys_call_table +diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S +index da7f1b6..6d12095 100644 +--- a/arch/mips/kernel/scall64-n32.S ++++ b/arch/mips/kernel/scall64-n32.S +@@ -400,12 +400,12 @@ EXPORT(sysn32_call_table) + PTR sys_ioprio_set + PTR sys_ioprio_get + PTR compat_sys_utimensat +- PTR compat_sys_signalfd /* 5280 */ ++ PTR compat_sys_signalfd /* 6280 */ + PTR sys_ni_syscall + PTR sys_eventfd + PTR 
sys_fallocate + PTR sys_timerfd_create +- PTR sys_timerfd_gettime /* 5285 */ ++ PTR sys_timerfd_gettime /* 6285 */ + PTR sys_timerfd_settime + PTR sys_signalfd4 + PTR sys_eventfd2 +@@ -413,4 +413,16 @@ EXPORT(sysn32_call_table) + PTR sys_dup3 /* 5290 */ + PTR sys_pipe2 + PTR sys_inotify_init1 ++ PTR sys_pfm_create_context ++ PTR sys_pfm_write_pmcs ++ PTR sys_pfm_write_pmds /* 6295 */ ++ PTR sys_pfm_read_pmds ++ PTR sys_pfm_load_context ++ PTR sys_pfm_start ++ PTR sys_pfm_stop ++ PTR sys_pfm_restart /* 6300 */ ++ PTR sys_pfm_create_evtsets ++ PTR sys_pfm_getinfo_evtsets ++ PTR sys_pfm_delete_evtsets ++ PTR sys_pfm_unload_context + .size sysn32_call_table,.-sysn32_call_table +diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S +index d7cd1aa..e77f55a 100644 +--- a/arch/mips/kernel/scall64-o32.S ++++ b/arch/mips/kernel/scall64-o32.S +@@ -535,4 +535,16 @@ sys_call_table: + PTR sys_dup3 + PTR sys_pipe2 + PTR sys_inotify_init1 ++ PTR sys_pfm_create_context /* 4330 */ ++ PTR sys_pfm_write_pmcs ++ PTR sys_pfm_write_pmds ++ PTR sys_pfm_read_pmds ++ PTR sys_pfm_load_context ++ PTR sys_pfm_start /* 4335 */ ++ PTR sys_pfm_stop ++ PTR sys_pfm_restart ++ PTR sys_pfm_create_evtsets ++ PTR sys_pfm_getinfo_evtsets ++ PTR sys_pfm_delete_evtsets /* 4340 */ ++ PTR sys_pfm_unload_context + .size sys_call_table,.-sys_call_table +diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c +index a4e106c..6a7e60c 100644 +--- a/arch/mips/kernel/signal.c ++++ b/arch/mips/kernel/signal.c +@@ -20,6 +20,7 @@ + #include <linux/unistd.h> + #include <linux/compiler.h> + #include <linux/uaccess.h> ++#include <linux/perfmon_kern.h> + + #include <asm/abi.h> + #include <asm/asm.h> +@@ -694,8 +695,11 @@ static void do_signal(struct pt_regs *regs) + * - triggered by the TIF_WORK_MASK flags + */ + asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, +- __u32 thread_info_flags) ++ __u32 thread_info_flags) + { ++ if (thread_info_flags & _TIF_PERFMON_WORK) ++ 
pfm_handle_work(regs); ++ + /* deal with pending signal delivery */ + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) + do_signal(regs); +diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c +index 1f467d5..163dfe4 100644 +--- a/arch/mips/kernel/time.c ++++ b/arch/mips/kernel/time.c +@@ -49,10 +49,11 @@ int update_persistent_clock(struct timespec now) + return rtc_mips_set_mmss(now.tv_sec); + } + +-static int null_perf_irq(void) ++int null_perf_irq(void) + { + return 0; + } ++EXPORT_SYMBOL(null_perf_irq); + + int (*perf_irq)(void) = null_perf_irq; + +diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c +index b602ac6..9cbd75f 100644 +--- a/arch/mips/kernel/traps.c ++++ b/arch/mips/kernel/traps.c +@@ -92,17 +92,15 @@ static void show_raw_backtrace(unsigned long reg29) + #ifdef CONFIG_KALLSYMS + printk("\n"); + #endif +- while (!kstack_end(sp)) { +- unsigned long __user *p = +- (unsigned long __user *)(unsigned long)sp++; +- if (__get_user(addr, p)) { +- printk(" (Bad stack address)"); +- break; ++#define IS_KVA01(a) ((((unsigned long)a) & 0xc0000000) == 0x80000000) ++ if (IS_KVA01(sp)) { ++ while (!kstack_end(sp)) { ++ addr = *sp++; ++ if (__kernel_text_address(addr)) ++ print_ip_sym(addr); + } +- if (__kernel_text_address(addr)) +- print_ip_sym(addr); ++ printk("\n"); + } +- printk("\n"); + } + + #ifdef CONFIG_KALLSYMS +diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c +index 0b97d47..d8f36b5 100644 +--- a/arch/mips/mti-malta/malta-time.c ++++ b/arch/mips/mti-malta/malta-time.c +@@ -27,6 +27,7 @@ + #include <linux/time.h> + #include <linux/timex.h> + #include <linux/mc146818rtc.h> ++#include <linux/perfmon_kern.h> + + #include <asm/mipsregs.h> + #include <asm/mipsmtregs.h> +diff --git a/arch/mips/perfmon/Kconfig b/arch/mips/perfmon/Kconfig +new file mode 100644 +index 0000000..b426eea +--- /dev/null ++++ b/arch/mips/perfmon/Kconfig +@@ -0,0 +1,61 @@ ++menu "Hardware Performance Monitoring 
support" ++config PERFMON ++ bool "Perfmon2 performance monitoring interface" ++ default n ++ help ++ Enables the perfmon2 interface to access the hardware ++ performance counters. See <http://perfmon2.sf.net/> for ++ more details. ++ ++config PERFMON_DEBUG ++ bool "Perfmon debugging" ++ default n ++ depends on PERFMON ++ help ++ Enables perfmon debugging support ++ ++config PERFMON_DEBUG_FS ++ bool "Enable perfmon statistics reporting via debugfs" ++ default y ++ depends on PERFMON && DEBUG_FS ++ help ++ Enable collection and reporting of perfmon timing statistics under ++ debugfs. This is used for debugging and performance analysis of the ++ subsystem. The debugfs filesystem must be mounted. ++ ++config PERFMON_FLUSH ++ bool "Flush sampling buffer when modified" ++ depends on PERFMON ++ default n ++ help ++ On some MIPS models, cache aliasing may cause invalid ++ data to be read from the perfmon sampling buffer. Use this option ++ to flush the buffer when it is modified to ensure valid data is ++ visible at the user level. ++ ++config PERFMON_ALIGN ++ bool "Align sampling buffer to avoid cache aliasing" ++ depends on PERFMON ++ default n ++ help ++ On some MIPS models, cache aliasing may cause invalid ++ data to be read from the perfmon sampling buffer. By forcing a bigger ++ page alignment (4-page), one can guarantee the buffer virtual address ++ will conflict in the cache with the user level mapping of the buffer ++ thereby ensuring a consistent view by user programs. 
++ ++config PERFMON_DEBUG ++ bool "Perfmon debugging" ++ depends on PERFMON ++ default n ++ depends on PERFMON ++ help ++ Enables perfmon debugging support ++ ++config PERFMON_MIPS64 ++ tristate "Support for MIPS64 hardware performance counters" ++ depends on PERFMON ++ default n ++ help ++ Enables support for the MIPS64 hardware performance counters ++endmenu +diff --git a/arch/mips/perfmon/Makefile b/arch/mips/perfmon/Makefile +new file mode 100644 +index 0000000..153b83f +--- /dev/null ++++ b/arch/mips/perfmon/Makefile +@@ -0,0 +1,2 @@ ++obj-$(CONFIG_PERFMON) += perfmon.o ++obj-$(CONFIG_PERFMON_MIPS64) += perfmon_mips64.o +diff --git a/arch/mips/perfmon/perfmon.c b/arch/mips/perfmon/perfmon.c +new file mode 100644 +index 0000000..6615a77 +--- /dev/null ++++ b/arch/mips/perfmon/perfmon.c +@@ -0,0 +1,313 @@ ++/* ++ * This file implements the MIPS64 specific ++ * support for the perfmon2 interface ++ * ++ * Copyright (c) 2005 Philip J. Mucci ++ * ++ * based on versions for other architectures: ++ * Copyright (c) 2005 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/interrupt.h> ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++ ++/* ++ * collect pending overflowed PMDs. 
Called from pfm_ctxsw() ++ * and from PMU interrupt handler. Must fill in set->povfl_pmds[] ++ * and set->npend_ovfls. Interrupts are masked ++ */ ++static void __pfm_get_ovfl_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ u64 new_val, wmask; ++ u64 *used_mask, *intr_pmds; ++ u64 mask[PFM_PMD_BV]; ++ unsigned int i, max; ++ ++ max = ctx->regs.max_intr_pmd; ++ intr_pmds = ctx->regs.intr_pmds; ++ used_mask = set->used_pmds; ++ ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ ++ bitmap_and(cast_ulp(mask), ++ cast_ulp(intr_pmds), ++ cast_ulp(used_mask), ++ max); ++ ++ /* ++ * check all PMD that can generate interrupts ++ * (that includes counters) ++ */ ++ for (i = 0; i < max; i++) { ++ if (test_bit(i, mask)) { ++ new_val = pfm_arch_read_pmd(ctx, i); ++ ++ PFM_DBG_ovfl("pmd%u new_val=0x%llx bit=%d\n", ++ i, (unsigned long long)new_val, ++ (new_val&wmask) ? 1 : 0); ++ ++ if (new_val & wmask) { ++ __set_bit(i, set->povfl_pmds); ++ set->npend_ovfls++; ++ } ++ } ++ } ++} ++ ++static void pfm_stop_active(struct task_struct *task, struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i, max; ++ ++ max = ctx->regs.max_pmc; ++ ++ /* ++ * clear enable bits, assume all pmcs are enable pmcs ++ */ ++ for (i = 0; i < max; i++) { ++ if (test_bit(i, set->used_pmcs)) ++ pfm_arch_write_pmc(ctx, i, 0); ++ } ++ ++ if (set->npend_ovfls) ++ return; ++ ++ __pfm_get_ovfl_pmds(ctx, set); ++} ++ ++/* ++ * Called from pfm_ctxsw(). Task is guaranteed to be current. ++ * Context is locked. Interrupts are masked. Monitoring is active. ++ * PMU access is guaranteed. PMC and PMD registers are live in PMU. ++ * ++ * for per-thread: ++ * must stop monitoring for the task ++ * ++ * Return: ++ * non-zero : did not save PMDs (as part of stopping the PMU) ++ * 0 : saved PMDs (no need to save them in caller) ++ */ ++int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) ++{ ++ /* ++ * disable lazy restore of PMC registers. 
++ */ ++ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; ++ ++ /* ++ * if masked, monitoring is stopped, thus there is no ++ * need to stop the PMU again and there is no need to ++ * check for pending overflows. This is not just an ++ * optimization, this is also for correctness as you ++ * may end up detecting overflows twice. ++ */ ++ if (ctx->state == PFM_CTX_MASKED) ++ return 1; ++ ++ pfm_stop_active(task, ctx, ctx->active_set); ++ ++ return 1; ++} ++ ++/* ++ * Called from pfm_stop() and pfm_ctxsw() ++ * Interrupts are masked. Context is locked. Set is the active set. ++ * ++ * For per-thread: ++ * task is not necessarily current. If not current task, then ++ * task is guaranteed stopped and off any cpu. Access to PMU ++ * is not guaranteed. Interrupts are masked. Context is locked. ++ * Set is the active set. ++ * ++ * For system-wide: ++ * task is current ++ * ++ * must disable active monitoring. ctx cannot be NULL ++ */ ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) ++{ ++ /* ++ * no need to go through stop_save() ++ * if we are already stopped ++ */ ++ if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED) ++ return; ++ ++ /* ++ * stop live registers and collect pending overflow ++ */ ++ if (task == current) ++ pfm_stop_active(task, ctx, ctx->active_set); ++} ++ ++/* ++ * called from pfm_start() or pfm_ctxsw() when idle task and ++ * EXCL_IDLE is on. ++ * ++ * Interrupts are masked. Context is locked. Set is the active set. ++ * ++ * For per-trhead: ++ * Task is not necessarily current. If not current task, then task ++ * is guaranteed stopped and off any cpu. Access to PMU is not guaranteed. ++ * ++ * For system-wide: ++ * task is always current ++ * ++ * must enable active monitoring. 
++ */ ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_event_set *set; ++ unsigned int i, max_pmc; ++ ++ if (task != current) ++ return; ++ ++ set = ctx->active_set; ++ max_pmc = ctx->regs.max_pmc; ++ ++ for (i = 0; i < max_pmc; i++) { ++ if (test_bit(i, set->used_pmcs)) ++ pfm_arch_write_pmc(ctx, i, set->pmcs[i]); ++ } ++} ++ ++/* ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() ++ * context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ * ++ * function must restore all PMD registers from set. ++ */ ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ u64 ovfl_mask, val; ++ u64 *impl_pmds; ++ unsigned int i; ++ unsigned int max_pmd; ++ ++ max_pmd = ctx->regs.max_pmd; ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ impl_pmds = ctx->regs.pmds; ++ ++ /* ++ * must restore all pmds to avoid leaking ++ * information to user. ++ */ ++ for (i = 0; i < max_pmd; i++) { ++ ++ if (test_bit(i, impl_pmds) == 0) ++ continue; ++ ++ val = set->pmds[i].value; ++ ++ /* ++ * set upper bits for counter to ensure ++ * overflow will trigger ++ */ ++ val &= ovfl_mask; ++ ++ pfm_arch_write_pmd(ctx, i, val); ++ } ++} ++ ++/* ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(). ++ * Context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ * ++ * function must restore all PMC registers from set, if needed. 
++ */ ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ u64 *impl_pmcs; ++ unsigned int i, max_pmc; ++ ++ max_pmc = ctx->regs.max_pmc; ++ impl_pmcs = ctx->regs.pmcs; ++ ++ /* ++ * - by default no PMCS measures anything ++ * - on ctxswout, all used PMCs are disabled (cccr enable bit cleared) ++ * hence when masked we do not need to restore anything ++ */ ++ if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) ++ return; ++ ++ /* ++ * restore all pmcs ++ */ ++ for (i = 0; i < max_pmc; i++) ++ if (test_bit(i, impl_pmcs)) ++ pfm_arch_write_pmc(ctx, i, set->pmcs[i]); ++} ++ ++char *pfm_arch_get_pmu_module_name(void) ++{ ++ switch (cpu_data->cputype) { ++#ifndef CONFIG_SMP ++ case CPU_34K: ++#if defined(CPU_74K) ++ case CPU_74K: ++#endif ++#endif ++ case CPU_SB1: ++ case CPU_SB1A: ++ case CPU_R12000: ++ case CPU_25KF: ++ case CPU_24K: ++ case CPU_20KC: ++ case CPU_5KC: ++ return "perfmon_mips64"; ++ default: ++ return NULL; ++ } ++ return NULL; ++} ++ ++int perfmon_perf_irq(void) ++{ ++ /* BLATANTLY STOLEN FROM OPROFILE, then modified */ ++ struct pt_regs *regs; ++ unsigned int counters = pfm_pmu_conf->regs_all.max_pmc; ++ unsigned int control; ++ unsigned int counter; ++ ++ regs = get_irq_regs(); ++ switch (counters) { ++#define HANDLE_COUNTER(n) \ ++ case n + 1: \ ++ control = read_c0_perfctrl ## n(); \ ++ counter = read_c0_perfcntr ## n(); \ ++ if ((control & MIPS64_PMC_INT_ENABLE_MASK) && \ ++ (counter & MIPS64_PMD_INTERRUPT)) { \ ++ pfm_interrupt_handler(instruction_pointer(regs),\ ++ regs); \ ++ return(1); \ ++ } ++ HANDLE_COUNTER(3) ++ HANDLE_COUNTER(2) ++ HANDLE_COUNTER(1) ++ HANDLE_COUNTER(0) ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL(perfmon_perf_irq); +diff --git a/arch/mips/perfmon/perfmon_mips64.c b/arch/mips/perfmon/perfmon_mips64.c +new file mode 100644 +index 0000000..78cb43d +--- /dev/null ++++ b/arch/mips/perfmon/perfmon_mips64.c +@@ -0,0 +1,218 @@ ++/* ++ * This file contains the MIPS64 and decendent PMU 
register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright (c) 2005 Philip Mucci ++ * ++ * Based on perfmon_p6.c: ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++ ++MODULE_AUTHOR("Philip Mucci <mucci@cs.utk.edu>"); ++MODULE_DESCRIPTION("MIPS64 PMU description tables"); ++MODULE_LICENSE("GPL"); ++ ++/* ++ * reserved: ++ * - bit 63-9 ++ * RSVD: reserved bits must be 1 ++ */ ++#define PFM_MIPS64_PMC_RSVD 0xfffffffffffff810ULL ++#define PFM_MIPS64_PMC_VAL (1ULL<<4) ++ ++extern int null_perf_irq(void); ++extern int (*perf_irq)(void); ++extern int perfmon_perf_irq(void); ++ ++static struct pfm_arch_pmu_info pfm_mips64_pmu_info; ++ ++static struct pfm_regmap_desc pfm_mips64_pmc_desc[] = { ++/* pmc0 */ PMC_D(PFM_REG_I64, "CP0_25_0", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 0), ++/* pmc1 */ PMC_D(PFM_REG_I64, "CP0_25_1", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 1), ++/* pmc2 */ PMC_D(PFM_REG_I64, "CP0_25_2", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 2), ++/* pmc3 */ PMC_D(PFM_REG_I64, "CP0_25_3", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 3) ++}; ++#define 
PFM_MIPS64_NUM_PMCS ARRAY_SIZE(pfm_mips64_pmc_desc) ++ ++static struct pfm_regmap_desc pfm_mips64_pmd_desc[] = { ++/* pmd0 */ PMD_D(PFM_REG_C, "CP0_25_0", 0), ++/* pmd1 */ PMD_D(PFM_REG_C, "CP0_25_1", 1), ++/* pmd2 */ PMD_D(PFM_REG_C, "CP0_25_2", 2), ++/* pmd3 */ PMD_D(PFM_REG_C, "CP0_25_3", 3) ++}; ++#define PFM_MIPS64_NUM_PMDS ARRAY_SIZE(pfm_mips64_pmd_desc) ++ ++static int pfm_mips64_probe_pmu(void) ++{ ++ struct cpuinfo_mips *c = &current_cpu_data; ++ ++ switch (c->cputype) { ++#ifndef CONFIG_SMP ++ case CPU_34K: ++#if defined(CPU_74K) ++ case CPU_74K: ++#endif ++#endif ++ case CPU_SB1: ++ case CPU_SB1A: ++ case CPU_R12000: ++ case CPU_25KF: ++ case CPU_24K: ++ case CPU_20KC: ++ case CPU_5KC: ++ return 0; ++ break; ++ default: ++ PFM_INFO("Unknown cputype 0x%x", c->cputype); ++ } ++ return -1; ++} ++ ++/* ++ * impl_pmcs, impl_pmds are computed at runtime to minimize errors! ++ */ ++static struct pfm_pmu_config pfm_mips64_pmu_conf = { ++ .pmu_name = "MIPS", /* placeholder */ ++ .counter_width = 31, ++ .pmd_desc = pfm_mips64_pmd_desc, ++ .pmc_desc = pfm_mips64_pmc_desc, ++ .num_pmc_entries = PFM_MIPS64_NUM_PMCS, ++ .num_pmd_entries = PFM_MIPS64_NUM_PMDS, ++ .probe_pmu = pfm_mips64_probe_pmu, ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmu_info = &pfm_mips64_pmu_info ++}; ++ ++static inline int n_counters(void) ++{ ++ if (!(read_c0_config1() & MIPS64_CONFIG_PMC_MASK)) ++ return 0; ++ if (!(read_c0_perfctrl0() & MIPS64_PMC_CTR_MASK)) ++ return 1; ++ if (!(read_c0_perfctrl1() & MIPS64_PMC_CTR_MASK)) ++ return 2; ++ if (!(read_c0_perfctrl2() & MIPS64_PMC_CTR_MASK)) ++ return 3; ++ return 4; ++} ++ ++static int __init pfm_mips64_pmu_init_module(void) ++{ ++ struct cpuinfo_mips *c = &current_cpu_data; ++ int i, ret, num; ++ u64 temp_mask; ++ ++ switch (c->cputype) { ++ case CPU_5KC: ++ pfm_mips64_pmu_conf.pmu_name = "MIPS5KC"; ++ break; ++ case CPU_R12000: ++ pfm_mips64_pmu_conf.pmu_name = "MIPSR12000"; ++ break; ++ case CPU_20KC: ++ pfm_mips64_pmu_conf.pmu_name 
= "MIPS20KC"; ++ break; ++ case CPU_24K: ++ pfm_mips64_pmu_conf.pmu_name = "MIPS24K"; ++ break; ++ case CPU_25KF: ++ pfm_mips64_pmu_conf.pmu_name = "MIPS25KF"; ++ break; ++ case CPU_SB1: ++ pfm_mips64_pmu_conf.pmu_name = "SB1"; ++ break; ++ case CPU_SB1A: ++ pfm_mips64_pmu_conf.pmu_name = "SB1A"; ++ break; ++#ifndef CONFIG_SMP ++ case CPU_34K: ++ pfm_mips64_pmu_conf.pmu_name = "MIPS34K"; ++ break; ++#if defined(CPU_74K) ++ case CPU_74K: ++ pfm_mips64_pmu_conf.pmu_name = "MIPS74K"; ++ break; ++#endif ++#endif ++ default: ++ PFM_INFO("Unknown cputype 0x%x", c->cputype); ++ return -1; ++ } ++ ++ /* The R14k and older performance counters have to */ ++ /* be hard-coded, as there is no support for auto-detection */ ++ if ((c->cputype == CPU_R12000) || (c->cputype == CPU_R14000)) ++ num = 4; ++ else if (c->cputype == CPU_R10000) ++ num = 2; ++ else ++ num = n_counters(); ++ ++ if (num == 0) { ++ PFM_INFO("cputype 0x%x has no counters", c->cputype); ++ return -1; ++ } ++ /* mark remaining counters unavailable */ ++ for (i = num; i < PFM_MIPS64_NUM_PMCS; i++) ++ pfm_mips64_pmc_desc[i].type = PFM_REG_NA; ++ ++ for (i = num; i < PFM_MIPS64_NUM_PMDS; i++) ++ pfm_mips64_pmd_desc[i].type = PFM_REG_NA; ++ ++ /* set the PMC_RSVD mask */ ++ switch (c->cputype) { ++ case CPU_5KC: ++ case CPU_R10000: ++ case CPU_20KC: ++ /* 4-bits for event */ ++ temp_mask = 0xfffffffffffffe10ULL; ++ break; ++ case CPU_R12000: ++ case CPU_R14000: ++ /* 5-bits for event */ ++ temp_mask = 0xfffffffffffffc10ULL; ++ break; ++ default: ++ /* 6-bits for event */ ++ temp_mask = 0xfffffffffffff810ULL; ++ } ++ for (i = 0; i < PFM_MIPS64_NUM_PMCS; i++) ++ pfm_mips64_pmc_desc[i].rsvd_msk = temp_mask; ++ ++ pfm_mips64_pmu_conf.num_pmc_entries = num; ++ pfm_mips64_pmu_conf.num_pmd_entries = num; ++ ++ pfm_mips64_pmu_info.pmu_style = c->cputype; ++ ++ ret = pfm_pmu_register(&pfm_mips64_pmu_conf); ++ if (ret == 0) ++ perf_irq = perfmon_perf_irq; ++ return ret; ++} ++ ++static void __exit 
pfm_mips64_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_mips64_pmu_conf); ++ perf_irq = null_perf_irq; ++} ++ ++module_init(pfm_mips64_pmu_init_module); ++module_exit(pfm_mips64_pmu_cleanup_module); +diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig +index 587da5e..a411389 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -230,6 +230,8 @@ source "init/Kconfig" + source "arch/powerpc/sysdev/Kconfig" + source "arch/powerpc/platforms/Kconfig" + ++source "arch/powerpc/perfmon/Kconfig" ++ + menu "Kernel options" + + config HIGHMEM +diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile +index c6be19e..7ea20cb 100644 +--- a/arch/powerpc/Makefile ++++ b/arch/powerpc/Makefile +@@ -146,6 +146,7 @@ core-y += arch/powerpc/kernel/ \ + arch/powerpc/platforms/ + core-$(CONFIG_MATH_EMULATION) += arch/powerpc/math-emu/ + core-$(CONFIG_XMON) += arch/powerpc/xmon/ ++core-$(CONFIG_PERFMON) += arch/powerpc/perfmon/ + core-$(CONFIG_KVM) += arch/powerpc/kvm/ + + drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/ +diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild +index 5ab7d7f..88cb533 100644 +--- a/arch/powerpc/include/asm/Kbuild ++++ b/arch/powerpc/include/asm/Kbuild +@@ -21,6 +21,7 @@ header-y += resource.h + header-y += sigcontext.h + header-y += statfs.h + header-y += ps3fb.h ++header-y += perfmon.h + + unifdef-y += bootx.h + unifdef-y += byteorder.h +diff --git a/arch/powerpc/include/asm/cell-pmu.h b/arch/powerpc/include/asm/cell-pmu.h +index 8066eed..981db26 100644 +--- a/arch/powerpc/include/asm/cell-pmu.h ++++ b/arch/powerpc/include/asm/cell-pmu.h +@@ -61,6 +61,11 @@ + + /* Macros for the pm_status register. 
*/ + #define CBE_PM_CTR_OVERFLOW_INTR(ctr) (1 << (31 - ((ctr) & 7))) ++#define CBE_PM_OVERFLOW_CTRS(pm_status) (((pm_status) >> 24) & 0xff) ++#define CBE_PM_ALL_OVERFLOW_INTR 0xff000000 ++#define CBE_PM_INTERVAL_INTR 0x00800000 ++#define CBE_PM_TRACE_BUFFER_FULL_INTR 0x00400000 ++#define CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR 0x00200000 + + enum pm_reg_name { + group_control, +diff --git a/arch/powerpc/include/asm/cell-regs.h b/arch/powerpc/include/asm/cell-regs.h +index fd6fd00..580786d 100644 +--- a/arch/powerpc/include/asm/cell-regs.h ++++ b/arch/powerpc/include/asm/cell-regs.h +@@ -117,8 +117,9 @@ struct cbe_pmd_regs { + u8 pad_0x0c1c_0x0c20 [4]; /* 0x0c1c */ + #define CBE_PMD_FIR_MODE_M8 0x00800 + u64 fir_enable_mask; /* 0x0c20 */ +- +- u8 pad_0x0c28_0x0ca8 [0x0ca8 - 0x0c28]; /* 0x0c28 */ ++ u8 pad_0x0c28_0x0c98 [0x0c98 - 0x0c28]; /* 0x0c28 */ ++ u64 on_ramp_trace; /* 0x0c98 */ ++ u64 pad_0x0ca0; /* 0x0ca0 */ + u64 ras_esc_0; /* 0x0ca8 */ + u8 pad_0x0cb0_0x1000 [0x1000 - 0x0cb0]; /* 0x0cb0 */ + }; +@@ -218,7 +219,11 @@ extern struct cbe_iic_regs __iomem *cbe_get_cpu_iic_regs(int cpu); + + + struct cbe_mic_tm_regs { +- u8 pad_0x0000_0x0040[0x0040 - 0x0000]; /* 0x0000 */ ++ u8 pad_0x0000_0x0010[0x0010 - 0x0000]; /* 0x0000 */ ++ ++ u64 MBL_debug; /* 0x0010 */ ++ ++ u8 pad_0x0018_0x0040[0x0040 - 0x0018]; /* 0x0018 */ + + u64 mic_ctl_cnfg2; /* 0x0040 */ + #define CBE_MIC_ENABLE_AUX_TRC 0x8000000000000000LL +@@ -303,6 +308,25 @@ struct cbe_mic_tm_regs { + extern struct cbe_mic_tm_regs __iomem *cbe_get_mic_tm_regs(struct device_node *np); + extern struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu); + ++/* ++ * ++ * PPE Privileged MMIO Registers definition. 
(offset 0x500000 - 0x500fff) ++ * ++ */ ++struct cbe_ppe_priv_regs { ++ u8 pad_0x0000_0x0858[0x0858 - 0x0000]; /* 0x0000 */ ++ ++ u64 L2_debug1; /* 0x0858 */ ++ ++ u8 pad_0x0860_0x0958[0x0958 - 0x0860]; /* 0x0860 */ ++ ++ u64 ciu_dr1; /* 0x0958 */ ++ ++ u8 pad_0x0960_0x1000[0x1000 - 0x0960]; /* 0x0960 */ ++}; ++ ++extern struct cbe_ppe_priv_regs __iomem *cbe_get_cpu_ppe_priv_regs(int cpu); ++ + /* some utility functions to deal with SMT */ + extern u32 cbe_get_hw_thread_id(int cpu); + extern u32 cbe_cpu_to_node(int cpu); +diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h +index 6493a39..ba9ead4 100644 +--- a/arch/powerpc/include/asm/paca.h ++++ b/arch/powerpc/include/asm/paca.h +@@ -97,6 +97,10 @@ struct paca_struct { + u8 soft_enabled; /* irq soft-enable flag */ + u8 hard_enabled; /* set if irqs are enabled in MSR */ + u8 io_sync; /* writel() needs spin_unlock sync */ ++#ifdef CONFIG_PERFMON ++ u8 pmu_except_pending; /* PMU exception occurred while soft ++ * disabled */ ++#endif + + /* Stuff for accurate time accounting */ + u64 user_time; /* accumulated usermode TB ticks */ +diff --git a/arch/powerpc/include/asm/perfmon.h b/arch/powerpc/include/asm/perfmon.h +new file mode 100644 +index 0000000..da0ae3b +--- /dev/null ++++ b/arch/powerpc/include/asm/perfmon.h +@@ -0,0 +1,33 @@ ++/* ++ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file contains powerpc specific definitions for the perfmon ++ * interface. ++ * ++ * This file MUST never be included directly. Use linux/perfmon.h. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef _ASM_POWERPC_PERFMON_H_ ++#define _ASM_POWERPC_PERFMON_H_ ++ ++/* ++ * arch-specific user visible interface definitions ++ */ ++#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */ ++#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */ ++ ++#endif /* _ASM_POWERPC_PERFMON_H_ */ +diff --git a/arch/powerpc/include/asm/perfmon_kern.h b/arch/powerpc/include/asm/perfmon_kern.h +new file mode 100644 +index 0000000..65ec984 +--- /dev/null ++++ b/arch/powerpc/include/asm/perfmon_kern.h +@@ -0,0 +1,390 @@ ++/* ++ * Copyright (c) 2005 David Gibson, IBM Corporation. ++ * ++ * Based on other versions: ++ * Copyright (c) 2005 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file contains powerpc specific definitions for the perfmon ++ * interface. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef _ASM_POWERPC_PERFMON_KERN_H_ ++#define _ASM_POWERPC_PERFMON_KERN_H_ ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_PERFMON ++ ++#include <asm/pmc.h> ++#include <asm/unistd.h> ++ ++#define HID0_PMC5_6_GR_MODE (1UL << (63 - 40)) ++ ++enum powerpc_pmu_type { ++ PFM_POWERPC_PMU_NONE, ++ PFM_POWERPC_PMU_604, ++ PFM_POWERPC_PMU_604e, ++ PFM_POWERPC_PMU_750, /* XXX: Minor event set diffs between IBM and Moto. */ ++ PFM_POWERPC_PMU_7400, ++ PFM_POWERPC_PMU_7450, ++ PFM_POWERPC_PMU_POWER4, ++ PFM_POWERPC_PMU_POWER5, ++ PFM_POWERPC_PMU_POWER5p, ++ PFM_POWERPC_PMU_POWER6, ++ PFM_POWERPC_PMU_CELL, ++}; ++ ++struct pfm_arch_pmu_info { ++ enum powerpc_pmu_type pmu_style; ++ ++ void (*write_pmc)(unsigned int cnum, u64 value); ++ void (*write_pmd)(unsigned int cnum, u64 value); ++ ++ u64 (*read_pmd)(unsigned int cnum); ++ ++ void (*enable_counters)(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ void (*disable_counters)(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++ void (*irq_handler)(struct pt_regs *regs, struct pfm_context *ctx); ++ void (*get_ovfl_pmds)(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++ /* The following routines are optional. 
*/ ++ void (*restore_pmcs)(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ void (*restore_pmds)(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++ int (*ctxswout_thread)(struct task_struct *task, ++ struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ void (*ctxswin_thread)(struct task_struct *task, ++ struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ int (*load_context)(struct pfm_context *ctx); ++ void (*unload_context)(struct pfm_context *ctx); ++ int (*acquire_pmu)(u64 *unavail_pmcs, u64 *unavail_pmds); ++ void (*release_pmu)(void); ++ void *platform_info; ++ void (*resend_irq)(struct pfm_context *ctx); ++}; ++ ++#ifdef CONFIG_PPC32 ++#define PFM_ARCH_PMD_STK_ARG 6 /* conservative value */ ++#define PFM_ARCH_PMC_STK_ARG 6 /* conservative value */ ++#else ++#define PFM_ARCH_PMD_STK_ARG 8 /* conservative value */ ++#define PFM_ARCH_PMC_STK_ARG 8 /* conservative value */ ++#endif ++ ++static inline void pfm_arch_resend_irq(struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ arch_info->resend_irq(ctx); ++} ++ ++static inline void pfm_arch_serialize(void) ++{} ++ ++static inline void pfm_arch_write_pmc(struct pfm_context *ctx, ++ unsigned int cnum, ++ u64 value) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ ++ /* ++ * we only write to the actual register when monitoring is ++ * active (pfm_start was issued) ++ */ ++ if (ctx && ctx->flags.started == 0) ++ return; ++ ++ BUG_ON(!arch_info->write_pmc); ++ ++ arch_info->write_pmc(cnum, value); ++} ++ ++static inline void pfm_arch_write_pmd(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ ++ value &= pfm_pmu_conf->ovfl_mask; ++ ++ BUG_ON(!arch_info->write_pmd); ++ ++ arch_info->write_pmd(cnum, value); ++} ++ ++static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) ++{ ++ struct 
pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ ++ BUG_ON(!arch_info->read_pmd); ++ ++ return arch_info->read_pmd(cnum); ++} ++ ++/* ++ * For some CPUs, the upper bits of a counter must be set in order for the ++ * overflow interrupt to happen. On overflow, the counter has wrapped around, ++ * and the upper bits are cleared. This function may be used to set them back. ++ */ ++static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, ++ unsigned int cnum) ++{ ++ u64 val = pfm_arch_read_pmd(ctx, cnum); ++ ++ /* This masks out overflow bit 31 */ ++ pfm_arch_write_pmd(ctx, cnum, val); ++} ++ ++/* ++ * At certain points, perfmon needs to know if monitoring has been ++ * explicitly started/stopped by user via pfm_start/pfm_stop. The ++ * information is tracked in flags.started. However on certain ++ * architectures, it may be possible to start/stop directly from ++ * user level with a single assembly instruction bypassing ++ * the kernel. This function must be used to determine by ++ * arch-specific means if monitoring is actually started/stopped. 
++ */ ++static inline int pfm_arch_is_active(struct pfm_context *ctx) ++{ ++ return ctx->flags.started; ++} ++ ++static inline void pfm_arch_ctxswout_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{} ++ ++static inline void pfm_arch_ctxswin_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{} ++ ++void pfm_arch_init_percpu(void); ++int pfm_arch_is_monitoring_active(struct pfm_context *ctx); ++int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); ++void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, struct pfm_event_set *set); ++int pfm_arch_get_ovfl_pmds(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++char *pfm_arch_get_pmu_module_name(void); ++/* ++ * called from __pfm_interrupt_handler(). ctx is not NULL. ++ * ctx is locked. PMU interrupt is masked. ++ * ++ * must stop all monitoring to ensure handler has consistent view. ++ * must collect overflowed PMDs bitmask into povfls_pmds and ++ * npend_ovfls. If no interrupt detected then npend_ovfls ++ * must be set to zero. 
++ */ ++static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ pfm_arch_stop(current, ctx); ++} ++ ++void powerpc_irq_handler(struct pt_regs *regs); ++ ++/* ++ * unfreeze PMU from pfm_do_interrupt_handler() ++ * ctx may be NULL for spurious ++ */ ++static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ if (!ctx) ++ return; ++ ++ PFM_DBG_ovfl("state=%d", ctx->state); ++ ++ ctx->flags.started = 1; ++ ++ if (ctx->state == PFM_CTX_MASKED) ++ return; ++ ++ arch_info = pfm_pmu_info(); ++ BUG_ON(!arch_info->enable_counters); ++ arch_info->enable_counters(ctx, ctx->active_set); ++} ++ ++/* ++ * PowerPC does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus ++ * this routine needs to do it when switching sets on overflow ++ */ ++static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_save_pmds(ctx, set); ++} ++ ++/* ++ * this function is called from the PMU interrupt handler ONLY. ++ * On PPC, the PMU is frozen via arch_stop, masking would be implemented ++ * via arch-stop as well. Given that the PMU is already stopped when ++ * entering the interrupt handler, we do not need to stop it again, so ++ * this function is a nop. ++ */ ++static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{} ++ ++/* ++ * Simply need to start the context in order to unmask. 
++ */ ++static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_arch_start(current, ctx); ++} ++ ++ ++static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) ++{ ++ return 0; ++} ++ ++static inline int pfm_arch_context_create(struct pfm_context *ctx, ++ u32 ctx_flags) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_context_free(struct pfm_context *ctx) ++{} ++ ++/* not necessary on PowerPC */ ++static inline void pfm_cacheflush(void *addr, unsigned int len) ++{} ++ ++/* ++ * function called from pfm_setfl_sane(). Context is locked ++ * and interrupts are masked. ++ * The value of flags is the value of ctx_flags as passed by ++ * user. ++ * ++ * function must check arch-specific set flags. ++ * Return: ++ * 0 when flags are valid ++ * non-zero on error ++ */ ++static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) ++{ ++ return 0; ++} ++ ++static inline int pfm_arch_init(void) ++{ ++ return 0; ++} ++ ++static inline int pfm_arch_load_context(struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ int rc = 0; ++ ++ arch_info = pfm_pmu_info(); ++ if (arch_info->load_context) ++ rc = arch_info->load_context(ctx); ++ ++ return rc; ++} ++ ++static inline void pfm_arch_unload_context(struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ if (arch_info->unload_context) ++ arch_info->unload_context(ctx); ++} ++ ++static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ int rc = 0; ++ ++ arch_info = pfm_pmu_info(); ++ if (arch_info->acquire_pmu) { ++ rc = arch_info->acquire_pmu(unavail_pmcs, unavail_pmds); ++ if (rc) ++ return rc; ++ } ++ ++ return reserve_pmc_hardware(powerpc_irq_handler); ++} ++ ++static inline void pfm_arch_pmu_release(void) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ if (arch_info->release_pmu) ++ 
arch_info->release_pmu(); ++ ++ release_pmc_hardware(); ++} ++ ++static inline void pfm_arch_arm_handle_work(struct task_struct *task) ++{} ++ ++static inline void pfm_arch_disarm_handle_work(struct task_struct *task) ++{} ++ ++static inline int pfm_arch_get_base_syscall(void) ++{ ++ return __NR_pfm_create_context; ++} ++ ++struct pfm_arch_context { ++ /* Cell: Most recent value of the pm_status ++ * register read by the interrupt handler. ++ * ++ * Interrupt handler sets last_read_updated if it ++ * just read and updated last_read_pm_status ++ */ ++ u32 last_read_pm_status; ++ u32 last_read_updated; ++ u64 powergs_pmc5, powergs_pmc6; ++ u64 delta_tb, delta_tb_start; ++ u64 delta_purr, delta_purr_start; ++}; ++ ++#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) ++/* ++ * PowerPC does not need extra alignment requirements for the sampling buffer ++ */ ++#define PFM_ARCH_SMPL_ALIGN_SIZE 0 ++ ++#endif /* CONFIG_PERFMON */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _ASM_POWERPC_PERFMON_KERN_H_ */ +diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h +index c6d1ab6..a9f3ad0 100644 +--- a/arch/powerpc/include/asm/reg.h ++++ b/arch/powerpc/include/asm/reg.h +@@ -698,6 +698,7 @@ + #define PV_POWER5 0x003A + #define PV_POWER5p 0x003B + #define PV_970FX 0x003C ++#define PV_POWER6 0x003E + #define PV_630 0x0040 + #define PV_630p 0x0041 + #define PV_970MP 0x0044 +diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h +index f6cc7a4..0164841 100644 +--- a/arch/powerpc/include/asm/systbl.h ++++ b/arch/powerpc/include/asm/systbl.h +@@ -322,3 +322,15 @@ SYSCALL_SPU(epoll_create1) + SYSCALL_SPU(dup3) + SYSCALL_SPU(pipe2) + SYSCALL(inotify_init1) ++SYSCALL(pfm_create_context) ++SYSCALL(pfm_write_pmcs) ++SYSCALL(pfm_write_pmds) ++SYSCALL(pfm_read_pmds) ++SYSCALL(pfm_load_context) ++SYSCALL(pfm_start) ++SYSCALL(pfm_stop) ++SYSCALL(pfm_restart) ++SYSCALL(pfm_create_evtsets) ++SYSCALL(pfm_getinfo_evtsets) 
++SYSCALL(pfm_delete_evtsets) ++SYSCALL(pfm_unload_context) +diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h +index 9665a26..6cda9f9 100644 +--- a/arch/powerpc/include/asm/thread_info.h ++++ b/arch/powerpc/include/asm/thread_info.h +@@ -130,10 +130,12 @@ static inline struct thread_info *current_thread_info(void) + #define _TIF_FREEZE (1<<TIF_FREEZE) + #define _TIF_RUNLATCH (1<<TIF_RUNLATCH) + #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING) ++#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK) ++#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW) + #define _TIF_SYSCALL_T_OR_A (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP) + + #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ +- _TIF_NOTIFY_RESUME) ++ _TIF_NOTIFY_RESUME | _TIF_PERFMON_WORK) + #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) + + /* Bits in local_flags */ +diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h +index e07d0c7..6226cba 100644 +--- a/arch/powerpc/include/asm/unistd.h ++++ b/arch/powerpc/include/asm/unistd.h +@@ -341,10 +341,22 @@ + #define __NR_dup3 316 + #define __NR_pipe2 317 + #define __NR_inotify_init1 318 ++#define __NR_pfm_create_context 319 ++#define __NR_pfm_write_pmcs 320 ++#define __NR_pfm_write_pmds 321 ++#define __NR_pfm_read_pmds 322 ++#define __NR_pfm_load_context 323 ++#define __NR_pfm_start 324 ++#define __NR_pfm_stop 325 ++#define __NR_pfm_restart 326 ++#define __NR_pfm_create_evtsets 327 ++#define __NR_pfm_getinfo_evtsets 328 ++#define __NR_pfm_delete_evtsets 329 ++#define __NR_pfm_unload_context 330 + + #ifdef __KERNEL__ + +-#define __NR_syscalls 319 ++#define __NR_syscalls 331 + + #define __NR__exit __NR_exit + #define NR_syscalls __NR_syscalls +diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S +index 1cbbf70..198645f 100644 +--- a/arch/powerpc/kernel/entry_32.S ++++ b/arch/powerpc/kernel/entry_32.S +@@ -39,7 +39,7 @@ + * MSR_KERNEL is > 
0x10000 on 4xx/Book-E since it include MSR_CE. + */ + #if MSR_KERNEL >= 0x10000 +-#define LOAD_MSR_KERNEL(r, x) lis r,(x)@h; ori r,r,(x)@l ++#define LOAD_MSR_KERNEL(r, x) lis r,(x)@ha; ori r,r,(x)@l + #else + #define LOAD_MSR_KERNEL(r, x) li r,(x) + #endif +diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S +index 2d802e9..77a090d 100644 +--- a/arch/powerpc/kernel/entry_64.S ++++ b/arch/powerpc/kernel/entry_64.S +@@ -643,6 +643,10 @@ user_work: + b .ret_from_except_lite + + 1: bl .save_nvgprs ++#ifdef CONFIG_PERFMON ++ addi r3,r1,STACK_FRAME_OVERHEAD ++ bl .pfm_handle_work ++#endif /* CONFIG_PERFMON */ + addi r3,r1,STACK_FRAME_OVERHEAD + bl .do_signal + b .ret_from_except +diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c +index d972dec..b255fba 100644 +--- a/arch/powerpc/kernel/irq.c ++++ b/arch/powerpc/kernel/irq.c +@@ -104,6 +104,24 @@ static inline notrace void set_soft_enabled(unsigned long enable) + : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); + } + ++#ifdef CONFIG_PERFMON ++static inline unsigned long get_pmu_except_pending(void) ++{ ++ unsigned long pending; ++ ++ __asm__ __volatile__("lbz %0,%1(13)" ++ : "=r" (pending) : "i" (offsetof(struct paca_struct, pmu_except_pending))); ++ ++ return pending; ++} ++ ++static inline void set_pmu_except_pending(unsigned long pending) ++{ ++ __asm__ __volatile__("stb %0,%1(13)" ++ : : "r" (pending), "i" (offsetof(struct paca_struct, pmu_except_pending))); ++} ++#endif /* CONFIG_PERFMON */ ++ + notrace void raw_local_irq_restore(unsigned long en) + { + /* +@@ -162,6 +180,19 @@ notrace void raw_local_irq_restore(unsigned long en) + lv1_get_version_info(&tmp); + } + ++#ifdef CONFIG_PERFMON ++ /* ++ * If a PMU exception occurred while interrupts were soft disabled, ++ * force a PMU exception. 
++ */ ++ if (get_pmu_except_pending()) { ++ set_pmu_except_pending(0); ++ /* Make sure we trigger the edge detection circuitry */ ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO); ++ } ++#endif /* CONFIG_PERFMON */ ++ + __hard_irq_enable(); + } + EXPORT_SYMBOL(raw_local_irq_restore); +diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c +index 957bded..32dbc8e 100644 +--- a/arch/powerpc/kernel/process.c ++++ b/arch/powerpc/kernel/process.c +@@ -33,6 +33,7 @@ + #include <linux/mqueue.h> + #include <linux/hardirq.h> + #include <linux/utsname.h> ++#include <linux/perfmon_kern.h> + + #include <asm/pgtable.h> + #include <asm/uaccess.h> +@@ -393,9 +394,14 @@ struct task_struct *__switch_to(struct task_struct *prev, + new_thread->start_tb = current_tb; + } + #endif +- + local_irq_save(flags); + ++ if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) ++ pfm_ctxsw_out(prev, new); ++ ++ if (test_tsk_thread_flag(new, TIF_PERFMON_CTXSW)) ++ pfm_ctxsw_in(prev, new); ++ + account_system_vtime(current); + account_process_vtime(current); + calculate_steal_time(); +@@ -544,6 +550,7 @@ void show_regs(struct pt_regs * regs) + void exit_thread(void) + { + discard_lazy_cpu_state(); ++ pfm_exit_thread(); + } + + void flush_thread(void) +@@ -669,6 +676,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, + #else + kregs->nip = (unsigned long)ret_from_fork; + #endif ++ pfm_copy_thread(p); + + return 0; + } +diff --git a/arch/powerpc/perfmon/Kconfig b/arch/powerpc/perfmon/Kconfig +new file mode 100644 +index 0000000..3f4bbf2 +--- /dev/null ++++ b/arch/powerpc/perfmon/Kconfig +@@ -0,0 +1,67 @@ ++menu "Hardware Performance Monitoring support" ++config PERFMON ++ bool "Perfmon2 performance monitoring interface" ++ default n ++ help ++ Enables the perfmon2 interface to access the hardware ++ performance counters. See <http://perfmon2.sf.net/> for ++ more details. 
++ ++config PERFMON_DEBUG ++ bool "Perfmon debugging" ++ default n ++ depends on PERFMON ++ help ++ Enables perfmon debugging support ++ ++config PERFMON_DEBUG_FS ++ bool "Enable perfmon statistics reporting via debugfs" ++ default y ++ depends on PERFMON && DEBUG_FS ++ help ++ Enable collection and reporting of perfmon timing statistics under ++ debugfs. This is used for debugging and performance analysis of the ++ subsystem. The debugfs filesystem must be mounted. ++ ++config PERFMON_POWER4 ++ tristate "Support for Power4 hardware performance counters" ++ depends on PERFMON && PPC64 ++ default n ++ help ++ Enables support for the Power 4 hardware performance counters ++ If unsure, say M. ++ ++config PERFMON_POWER5 ++ tristate "Support for Power5 hardware performance counters" ++ depends on PERFMON && PPC64 ++ default n ++ help ++ Enables support for the Power 5 hardware performance counters ++ If unsure, say M. ++ ++config PERFMON_POWER6 ++ tristate "Support for Power6 hardware performance counters" ++ depends on PERFMON && PPC64 ++ default n ++ help ++ Enables support for the Power 6 hardware performance counters ++ If unsure, say M. ++ ++config PERFMON_PPC32 ++ tristate "Support for PPC32 hardware performance counters" ++ depends on PERFMON && PPC32 ++ default n ++ help ++ Enables support for the PPC32 hardware performance counters ++ If unsure, say M. ++ ++config PERFMON_CELL ++ tristate "Support for Cell hardware performance counters" ++ depends on PERFMON && PPC_CELL ++ select PS3_LPM if PPC_PS3 ++ default n ++ help ++ Enables support for the Cell hardware performance counters. ++ If unsure, say M. 
++ ++endmenu +diff --git a/arch/powerpc/perfmon/Makefile b/arch/powerpc/perfmon/Makefile +new file mode 100644 +index 0000000..300661f +--- /dev/null ++++ b/arch/powerpc/perfmon/Makefile +@@ -0,0 +1,6 @@ ++obj-$(CONFIG_PERFMON) += perfmon.o ++obj-$(CONFIG_PERFMON_POWER4) += perfmon_power4.o ++obj-$(CONFIG_PERFMON_POWER5) += perfmon_power5.o ++obj-$(CONFIG_PERFMON_POWER6) += perfmon_power6.o ++obj-$(CONFIG_PERFMON_PPC32) += perfmon_ppc32.o ++obj-$(CONFIG_PERFMON_CELL) += perfmon_cell.o +diff --git a/arch/powerpc/perfmon/perfmon.c b/arch/powerpc/perfmon/perfmon.c +new file mode 100644 +index 0000000..51a8b6a +--- /dev/null ++++ b/arch/powerpc/perfmon/perfmon.c +@@ -0,0 +1,334 @@ ++/* ++ * This file implements the powerpc specific ++ * support for the perfmon2 interface ++ * ++ * Copyright (c) 2005 David Gibson, IBM Corporation. ++ * ++ * based on versions for other architectures: ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/interrupt.h> ++#include <linux/perfmon_kern.h> ++ ++static void pfm_stop_active(struct task_struct *task, ++ struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ BUG_ON(!arch_info->disable_counters || !arch_info->get_ovfl_pmds); ++ ++ arch_info->disable_counters(ctx, set); ++ ++ if (set->npend_ovfls) ++ return; ++ ++ arch_info->get_ovfl_pmds(ctx, set); ++} ++ ++/* ++ * Called from pfm_save_pmds(). Interrupts are masked. Registers are ++ * already saved away. ++ */ ++void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ int i, num; ++ u64 *used_pmds, *intr_pmds; ++ ++ num = set->nused_pmds; ++ used_pmds = set->used_pmds; ++ intr_pmds = ctx->regs.intr_pmds; ++ ++ for (i = 0; num; i++) ++ if (likely(test_bit(i, used_pmds))) { ++ if (likely(test_bit(i, intr_pmds))) ++ pfm_write_pmd(ctx, i, 0); ++ num--; ++ } ++} ++ ++/* ++ * Called from pfm_ctxsw(). Task is guaranteed to be current. ++ * Context is locked. Interrupts are masked. Monitoring is active. ++ * PMU access is guaranteed. PMC and PMD registers are live in PMU. ++ * ++ * for per-thread: ++ * must stop monitoring for the task ++ * Return: ++ * non-zero : did not save PMDs (as part of stopping the PMU) ++ * 0 : saved PMDs (no need to save them in caller) ++ */ ++int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ /* ++ * disable lazy restore of the PMC/PMD registers. 
++ */ ++ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; ++ ++ if (ctx->state == PFM_CTX_MASKED) ++ return 1; ++ ++ pfm_stop_active(task, ctx, ctx->active_set); ++ ++ if (arch_info->ctxswout_thread) ++ arch_info->ctxswout_thread(task, ctx, ctx->active_set); ++ ++ return pfm_arch_is_active(ctx); ++} ++ ++/* ++ * Called from pfm_ctxsw ++ */ ++void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ if (ctx->state != PFM_CTX_MASKED && ctx->flags.started == 1) { ++ BUG_ON(!arch_info->enable_counters); ++ arch_info->enable_counters(ctx, ctx->active_set); ++ } ++ ++ if (arch_info->ctxswin_thread) ++ arch_info->ctxswin_thread(task, ctx, ctx->active_set); ++} ++ ++/* ++ * Called from pfm_stop() and idle notifier ++ * ++ * Interrupts are masked. Context is locked. Set is the active set. ++ * ++ * For per-thread: ++ * task is not necessarily current. If not current task, then ++ * task is guaranteed stopped and off any cpu. Access to PMU ++ * is not guaranteed. Interrupts are masked. Context is locked. ++ * Set is the active set. ++ * ++ * For system-wide: ++ * task is current ++ * ++ * must disable active monitoring. ctx cannot be NULL ++ */ ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) ++{ ++ /* ++ * no need to go through stop_save() ++ * if we are already stopped ++ */ ++ if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED) ++ return; ++ ++ /* ++ * stop live registers and collect pending overflow ++ */ ++ if (task == current) ++ pfm_stop_active(task, ctx, ctx->active_set); ++} ++ ++/* ++ * Enable active monitoring. Called from pfm_start() and ++ * pfm_arch_unmask_monitoring(). ++ * ++ * Interrupts are masked. Context is locked. Set is the active set. ++ * ++ * For per-thread: ++ * Task is not necessarily current. If not current task, then task ++ * is guaranteed stopped and off any cpu. No access to PMU if task ++ * is not current. 
++ * ++ * For system-wide: ++ * Task is always current ++ */ ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ ++ arch_info = pfm_pmu_info(); ++ if (task != current) ++ return; ++ ++ BUG_ON(!arch_info->enable_counters); ++ ++ arch_info->enable_counters(ctx, ctx->active_set); ++} ++ ++/* ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() ++ * context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ * ++ * function must restore all PMD registers from set. ++ */ ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ u64 *used_pmds; ++ u16 i, num; ++ ++ arch_info = pfm_pmu_info(); ++ ++ /* The model-specific module can override the default ++ * restore-PMD method. ++ */ ++ if (arch_info->restore_pmds) ++ return arch_info->restore_pmds(ctx, set); ++ ++ num = set->nused_pmds; ++ used_pmds = set->used_pmds; ++ ++ for (i = 0; num; i++) { ++ if (likely(test_bit(i, used_pmds))) { ++ pfm_write_pmd(ctx, i, set->pmds[i].value); ++ num--; ++ } ++ } ++} ++ ++/* ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() ++ * context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ * ++ * function must restore all PMC registers from set, if needed. ++ */ ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ u64 *impl_pmcs; ++ unsigned int i, max_pmc, reg; ++ ++ arch_info = pfm_pmu_info(); ++ /* The model-specific module can override the default ++ * restore-PMC method. 
++ */ ++ if (arch_info->restore_pmcs) ++ return arch_info->restore_pmcs(ctx, set); ++ ++ /* The "common" powerpc models enable the counters simply by writing ++ * all the control registers. Therefore, if we're masked or stopped we ++ * don't need to bother restoring the PMCs now. ++ */ ++ if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) ++ return; ++ ++ max_pmc = ctx->regs.max_pmc; ++ impl_pmcs = ctx->regs.pmcs; ++ ++ /* ++ * Restore all pmcs in reverse order to ensure the counters aren't ++ * enabled before their event selectors are set correctly. ++ */ ++ reg = max_pmc - 1; ++ for (i = 0; i < max_pmc; i++) { ++ if (test_bit(reg, impl_pmcs)) ++ pfm_arch_write_pmc(ctx, reg, set->pmcs[reg]); ++ reg--; ++ } ++} ++ ++char *pfm_arch_get_pmu_module_name(void) ++{ ++ unsigned int pvr = mfspr(SPRN_PVR); ++ ++ switch (PVR_VER(pvr)) { ++ case 0x0004: /* 604 */ ++ case 0x0009: /* 604e */ ++ case 0x000A: /* 604ev */ ++ case 0x0008: /* 750/740 */ ++ case 0x7000: /* 750FX */ ++ case 0x7001: ++ case 0x7002: /* 750GX */ ++ case 0x000C: /* 7400 */ ++ case 0x800C: /* 7410 */ ++ case 0x8000: /* 7451/7441 */ ++ case 0x8001: /* 7455/7445 */ ++ case 0x8002: /* 7457/7447 */ ++ case 0x8003: /* 7447A */ ++ case 0x8004: /* 7448 */ ++ return("perfmon_ppc32"); ++ case PV_POWER4: ++ case PV_POWER4p: ++ return "perfmon_power4"; ++ case PV_POWER5: ++ return "perfmon_power5"; ++ case PV_POWER5p: ++ if (PVR_REV(pvr) < 0x300) ++ /* PMU behaves like POWER5 */ ++ return "perfmon_power5"; ++ else ++ /* PMU behaves like POWER6 */ ++ return "perfmon_power6"; ++ case PV_POWER6: ++ return "perfmon_power6"; ++ case PV_970: ++ case PV_970FX: ++ case PV_970MP: ++ return "perfmon_ppc970"; ++ case PV_BE: ++ return "perfmon_cell"; ++ } ++ return NULL; ++} ++ ++void pfm_arch_init_percpu(void) ++{ ++#ifdef CONFIG_PPC64 ++ extern void ppc64_enable_pmcs(void); ++ ppc64_enable_pmcs(); ++#endif ++} ++ ++/** ++ * powerpc_irq_handler ++ * ++ * Get the perfmon context that belongs to the current CPU, and 
call the ++ * model-specific interrupt handler. ++ **/ ++void powerpc_irq_handler(struct pt_regs *regs) ++{ ++ struct pfm_arch_pmu_info *arch_info; ++ struct pfm_context *ctx; ++ ++ if (! regs->softe) { ++ /* ++ * We got a PMU interrupt while interrupts were soft ++ * disabled. Disable hardware interrupts by clearing ++ * MSR_EE and also clear PMAO because we will need to set ++ * that again later when interrupts are re-enabled and ++ * raw_local_irq_restore() sees that the pmu_except_pending ++ * flag is set. ++ */ ++ regs->msr &= ~MSR_EE; ++ get_paca()->pmu_except_pending = 1; ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); ++ return; ++ } ++ ++ arch_info = pfm_pmu_info(); ++ if (arch_info->irq_handler) { ++ ctx = __get_cpu_var(pmu_ctx); ++ if (likely(ctx)) ++ arch_info->irq_handler(regs, ctx); ++ } ++} +diff --git a/arch/powerpc/perfmon/perfmon_cell.c b/arch/powerpc/perfmon/perfmon_cell.c +new file mode 100644 +index 0000000..e1ae12c +--- /dev/null ++++ b/arch/powerpc/perfmon/perfmon_cell.c +@@ -0,0 +1,1449 @@ ++/* ++ * This file contains the Cell PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright IBM Corporation 2007 ++ * (C) Copyright 2007 TOSHIBA CORPORATION ++ * ++ * Based on other Perfmon2 PMU modules. ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++#include <linux/io.h> ++#include <asm/cell-pmu.h> ++#include <asm/cell-regs.h> ++#include <asm/machdep.h> ++#include <asm/rtas.h> ++#include <asm/ps3.h> ++#include <asm/spu.h> ++ ++MODULE_AUTHOR("Kevin Corry <kevcorry@us.ibm.com>, " ++ "Carl Love <carll@us.ibm.com>"); ++MODULE_DESCRIPTION("Cell PMU description table"); ++MODULE_LICENSE("GPL"); ++ ++struct pfm_cell_platform_pmu_info { ++ u32 (*read_ctr)(u32 cpu, u32 ctr); ++ void (*write_ctr)(u32 cpu, u32 ctr, u32 val); ++ void (*write_pm07_control)(u32 cpu, u32 ctr, u32 val); ++ void (*write_pm)(u32 cpu, enum pm_reg_name reg, u32 val); ++ void (*enable_pm)(u32 cpu); ++ void (*disable_pm)(u32 cpu); ++ void (*enable_pm_interrupts)(u32 cpu, u32 thread, u32 mask); ++ u32 (*get_and_clear_pm_interrupts)(u32 cpu); ++ u32 (*get_hw_thread_id)(int cpu); ++ struct cbe_ppe_priv_regs __iomem *(*get_cpu_ppe_priv_regs)(int cpu); ++ struct cbe_pmd_regs __iomem *(*get_cpu_pmd_regs)(int cpu); ++ struct cbe_mic_tm_regs __iomem *(*get_cpu_mic_tm_regs)(int cpu); ++ int (*rtas_token)(const char *service); ++ int (*rtas_call)(int token, int param1, int param2, int *param3, ...); ++}; ++ ++/* ++ * Mapping from Perfmon logical control registers to Cell hardware registers. ++ */ ++static struct pfm_regmap_desc pfm_cell_pmc_desc[] = { ++ /* Per-counter control registers. 
*/ ++ PMC_D(PFM_REG_I, "pm0_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm1_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm2_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm3_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm4_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm5_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm6_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm7_control", 0, 0, 0, 0), ++ ++ /* Per-counter RTAS arguments. Each of these registers has three fields. ++ * bits 63-48: debug-bus word ++ * bits 47-32: sub-unit ++ * bits 31-0 : full signal number ++ * (MSB = 63, LSB = 0) ++ */ ++ PMC_D(PFM_REG_I, "pm0_event", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm1_event", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm2_event", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm3_event", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm4_event", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm5_event", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm6_event", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm7_event", 0, 0, 0, 0), ++ ++ /* Global control registers. Same order as enum pm_reg_name. */ ++ PMC_D(PFM_REG_I, "group_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "debug_bus_control", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "trace_address", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "ext_trace_timer", 0, 0, 0, 0), ++ PMC_D(PFM_REG_I, "pm_status", 0, 0, 0, 0), ++ /* set the interrupt overflow bit for the four 32 bit counters ++ * that are currently supported. Will need to fix when 32 and 16 ++ * bit counters are supported. ++ */ ++ PMC_D(PFM_REG_I, "pm_control", 0xF0000000, 0xF0000000, 0, 0), ++ PMC_D(PFM_REG_I, "pm_interval", 0, 0, 0, 0), /* FIX: Does user-space also need read access to this one? 
*/ ++ PMC_D(PFM_REG_I, "pm_start_stop", 0, 0, 0, 0), ++}; ++#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_cell_pmc_desc) ++ ++#define CELL_PMC_GROUP_CONTROL 16 ++#define CELL_PMC_PM_STATUS 20 ++#define CELL_PMC_PM_CONTROL 21 ++#define CELL_PMC_PM_CONTROL_CNTR_MASK 0x01E00000UL ++#define CELL_PMC_PM_CONTROL_CNTR_16 0x01E00000UL ++ ++/* ++ * Mapping from Perfmon logical data counters to Cell hardware counters. ++ */ ++static struct pfm_regmap_desc pfm_cell_pmd_desc[] = { ++ PMD_D(PFM_REG_C, "pm0", 0), ++ PMD_D(PFM_REG_C, "pm1", 0), ++ PMD_D(PFM_REG_C, "pm2", 0), ++ PMD_D(PFM_REG_C, "pm3", 0), ++ PMD_D(PFM_REG_C, "pm4", 0), ++ PMD_D(PFM_REG_C, "pm5", 0), ++ PMD_D(PFM_REG_C, "pm6", 0), ++ PMD_D(PFM_REG_C, "pm7", 0), ++}; ++#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_cell_pmd_desc) ++ ++#define PFM_EVENT_PMC_BUS_WORD(x) (((x) >> 48) & 0x00ff) ++#define PFM_EVENT_PMC_FULL_SIGNAL_NUMBER(x) ((x) & 0xffffffff) ++#define PFM_EVENT_PMC_SIGNAL_GROUP(x) (((x) & 0xffffffff) / 100) ++#define PFM_PM_CTR_INPUT_MUX_BIT(pm07_control) (((pm07_control) >> 26) & 0x1f) ++#define PFM_PM_CTR_INPUT_MUX_GROUP_INDEX(pm07_control) ((pm07_control) >> 31) ++#define PFM_GROUP_CONTROL_GROUP0_WORD(grp_ctrl) ((grp_ctrl) >> 30) ++#define PFM_GROUP_CONTROL_GROUP1_WORD(grp_ctrl) (((grp_ctrl) >> 28) & 0x3) ++#define PFM_NUM_OF_GROUPS 2 ++#define PFM_PPU_IU1_THREAD1_BASE_BIT 19 ++#define PFM_PPU_XU_THREAD1_BASE_BIT 16 ++#define PFM_COUNTER_CTRL_PMC_PPU_TH0 0x100000000ULL ++#define PFM_COUNTER_CTRL_PMC_PPU_TH1 0x200000000ULL ++ ++/* ++ * Debug-bus signal handling. ++ * ++ * Some Cell systems have firmware that can handle the debug-bus signal ++ * routing. For systems without this firmware, we have a minimal in-kernel ++ * implementation as well. ++ */ ++ ++/* The firmware only sees physical CPUs, so divide by 2 if SMT is on. 
 */
#ifdef CONFIG_SCHED_SMT
#define RTAS_CPU(cpu)		((cpu) / 2)
#else
#define RTAS_CPU(cpu)		(cpu)
#endif
/* Field extractors for a 64-bit pmX_event value:
 *   bits 63-48 bus word, bits 47-32 sub-unit, bits 31-0 signal number.
 * The signal group is the signal number divided by 100.
 */
#define RTAS_BUS_WORD(x)	(u16)(((x) >> 48) & 0x0000ffff)
#define RTAS_SUB_UNIT(x)	(u16)(((x) >> 32) & 0x0000ffff)
#define RTAS_SIGNAL_NUMBER(x)	(s32)( (x)        & 0xffffffff)
#define RTAS_SIGNAL_GROUP(x)	(RTAS_SIGNAL_NUMBER(x) / 100)

/* Sub-function argument passed to the RTAS call. */
#define subfunc_RESET		1
#define subfunc_ACTIVATE	2

/* Pass-through argument passed to the RTAS call. */
#define passthru_ENABLE		1
#define passthru_DISABLE	2

/**
 * struct cell_rtas_arg
 *
 * @cpu: Processor to modify. Linux numbers CPUs based on SMT IDs, but the
 *       firmware only sees the physical CPUs. So this value should be the
 *       SMT ID (from smp_processor_id() or get_cpu()) divided by 2.
 * @sub_unit: Hardware subunit this applies to (if applicable).
 * @signal_group: Signal group to enable/disable on the trace bus.
 * @bus_word: For signal groups that propagate via the trace bus, this trace
 *            bus word will be used. This is a mask of (1 << TraceBusWord).
 *            For other signal groups, this specifies the trigger or event bus.
 * @bit: Trigger/Event bit, if applicable for the signal group.
 *
 * An array of these structures is passed to rtas_call() to set up the
 * signals on the debug bus.
 **/
struct cell_rtas_arg {
	u16 cpu;
	u16 sub_unit;
	s16 signal_group;
	u8 bus_word;
	u8 bit;
};

/**
 * rtas_reset_signals
 *
 * Use the firmware RTAS call to disable signal pass-thru and to reset the
 * debug-bus signals.
++ **/ ++static int rtas_reset_signals(u32 cpu) ++{ ++ struct cell_rtas_arg signal; ++ u64 real_addr = virt_to_phys(&signal); ++ int rc; ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ memset(&signal, 0, sizeof(signal)); ++ signal.cpu = RTAS_CPU(cpu); ++ rc = info->rtas_call(info->rtas_token("ibm,cbe-perftools"), ++ 5, 1, NULL, ++ subfunc_RESET, ++ passthru_DISABLE, ++ real_addr >> 32, ++ real_addr & 0xffffffff, ++ sizeof(signal)); ++ ++ return rc; ++} ++ ++/** ++ * rtas_activate_signals ++ * ++ * Use the firmware RTAS call to enable signal pass-thru and to activate the ++ * desired signal groups on the debug-bus. ++ **/ ++static int rtas_activate_signals(struct cell_rtas_arg *signals, ++ int num_signals) ++{ ++ u64 real_addr = virt_to_phys(signals); ++ int rc; ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ rc = info->rtas_call(info->rtas_token("ibm,cbe-perftools"), ++ 5, 1, NULL, ++ subfunc_ACTIVATE, ++ passthru_ENABLE, ++ real_addr >> 32, ++ real_addr & 0xffffffff, ++ num_signals * sizeof(*signals)); ++ ++ return rc; ++} ++ ++#define HID1_RESET_MASK (~0x00000001ffffffffUL) ++#define PPU_IU1_WORD0_HID1_EN_MASK (~0x00000001f0c0802cUL) ++#define PPU_IU1_WORD0_HID1_EN_WORD ( 0x00000001f0400000UL) ++#define PPU_IU1_WORD1_HID1_EN_MASK (~0x000000010fc08023UL) ++#define PPU_IU1_WORD1_HID1_EN_WORD ( 0x000000010f400001UL) ++#define PPU_XU_WORD0_HID1_EN_MASK (~0x00000001f038402cUL) ++#define PPU_XU_WORD0_HID1_EN_WORD ( 0x00000001f0080008UL) ++#define PPU_XU_WORD1_HID1_EN_MASK (~0x000000010f074023UL) ++#define PPU_XU_WORD1_HID1_EN_WORD ( 0x000000010f030002UL) ++ ++/* The bus_word field in the cell_rtas_arg structure is a bit-mask ++ * indicating which debug-bus word(s) to use. 
++ */ ++enum { ++ BUS_WORD_0 = 1, ++ BUS_WORD_1 = 2, ++ BUS_WORD_2 = 4, ++ BUS_WORD_3 = 8, ++}; ++ ++/* Definitions of the signal-groups that the built-in signal-activation ++ * code can handle. ++ */ ++enum { ++ SIG_GROUP_NONE = 0, ++ ++ /* 2.x PowerPC Processor Unit (PPU) Signal Groups */ ++ SIG_GROUP_PPU_BASE = 20, ++ SIG_GROUP_PPU_IU1 = 21, ++ SIG_GROUP_PPU_XU = 22, ++ ++ /* 3.x PowerPC Storage Subsystem (PPSS) Signal Groups */ ++ SIG_GROUP_PPSS_BASE = 30, ++ ++ /* 4.x Synergistic Processor Unit (SPU) Signal Groups */ ++ SIG_GROUP_SPU_BASE = 40, ++ ++ /* 5.x Memory Flow Controller (MFC) Signal Groups */ ++ SIG_GROUP_MFC_BASE = 50, ++ ++ /* 6.x Element )nterconnect Bus (EIB) Signal Groups */ ++ SIG_GROUP_EIB_BASE = 60, ++ ++ /* 7.x Memory Interface Controller (MIC) Signal Groups */ ++ SIG_GROUP_MIC_BASE = 70, ++ ++ /* 8.x Cell Broadband Engine Interface (BEI) Signal Groups */ ++ SIG_GROUP_BEI_BASE = 80, ++}; ++ ++/** ++ * rmw_spr ++ * ++ * Read-modify-write for a special-purpose-register. ++ **/ ++#define rmw_spr(spr_id, a_mask, o_mask) \ ++ do { \ ++ u64 value = mfspr(spr_id); \ ++ value &= (u64)(a_mask); \ ++ value |= (u64)(o_mask); \ ++ mtspr((spr_id), value); \ ++ } while (0) ++ ++/** ++ * rmw_mmio_reg64 ++ * ++ * Read-modify-write for a 64-bit MMIO register. ++ **/ ++#define rmw_mmio_reg64(mem, a_mask, o_mask) \ ++ do { \ ++ u64 value = in_be64(&(mem)); \ ++ value &= (u64)(a_mask); \ ++ value |= (u64)(o_mask); \ ++ out_be64(&(mem), value); \ ++ } while (0) ++ ++/** ++ * rmwb_mmio_reg64 ++ * ++ * Set or unset a specified bit within a 64-bit MMIO register. ++ **/ ++#define rmwb_mmio_reg64(mem, bit_num, set_bit) \ ++ rmw_mmio_reg64((mem), ~(1UL << (63 - (bit_num))), \ ++ ((set_bit) << (63 - (bit_num)))) ++ ++/** ++ * passthru ++ * ++ * Enable or disable passthru mode in all the Cell signal islands. 
++ **/ ++static int passthru(u32 cpu, u64 enable) ++{ ++ struct cbe_ppe_priv_regs __iomem *ppe_priv_regs; ++ struct cbe_pmd_regs __iomem *pmd_regs; ++ struct cbe_mic_tm_regs __iomem *mic_tm_regs; ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ ppe_priv_regs = info->get_cpu_ppe_priv_regs(cpu); ++ pmd_regs = info->get_cpu_pmd_regs(cpu); ++ mic_tm_regs = info->get_cpu_mic_tm_regs(cpu); ++ ++ if (!ppe_priv_regs || !pmd_regs || !mic_tm_regs) { ++ PFM_ERR("Error getting Cell PPE, PMD, and MIC " ++ "register maps: 0x%p, 0x%p, 0x%p", ++ ppe_priv_regs, pmd_regs, mic_tm_regs); ++ return -EINVAL; ++ } ++ ++ rmwb_mmio_reg64(ppe_priv_regs->L2_debug1, 61, enable); ++ rmwb_mmio_reg64(ppe_priv_regs->ciu_dr1, 5, enable); ++ rmwb_mmio_reg64(pmd_regs->on_ramp_trace, 39, enable); ++ rmwb_mmio_reg64(mic_tm_regs->MBL_debug, 20, enable); ++ ++ return 0; ++} ++ ++#define passthru_enable(cpu) passthru(cpu, 1) ++#define passthru_disable(cpu) passthru(cpu, 0) ++ ++static inline void reset_signal_registers(u32 cpu) ++{ ++ rmw_spr(SPRN_HID1, HID1_RESET_MASK, 0); ++} ++ ++/** ++ * celleb_reset_signals ++ * ++ * Non-rtas version of resetting the debug-bus signals. ++ **/ ++static int celleb_reset_signals(u32 cpu) ++{ ++ int rc; ++ rc = passthru_disable(cpu); ++ if (!rc) ++ reset_signal_registers(cpu); ++ return rc; ++} ++ ++/** ++ * ppu_selection ++ * ++ * Write the HID1 register to connect the specified PPU signal-group to the ++ * debug-bus. 
++ **/ ++static int ppu_selection(struct cell_rtas_arg *signal) ++{ ++ u64 hid1_enable_word = 0; ++ u64 hid1_enable_mask = 0; ++ ++ switch (signal->signal_group) { ++ ++ case SIG_GROUP_PPU_IU1: /* 2.1 PPU Instruction Unit - Group 1 */ ++ switch (signal->bus_word) { ++ case BUS_WORD_0: ++ hid1_enable_mask = PPU_IU1_WORD0_HID1_EN_MASK; ++ hid1_enable_word = PPU_IU1_WORD0_HID1_EN_WORD; ++ break; ++ case BUS_WORD_1: ++ hid1_enable_mask = PPU_IU1_WORD1_HID1_EN_MASK; ++ hid1_enable_word = PPU_IU1_WORD1_HID1_EN_WORD; ++ break; ++ default: ++ PFM_ERR("Invalid bus-word (0x%x) for signal-group %d.", ++ signal->bus_word, signal->signal_group); ++ return -EINVAL; ++ } ++ break; ++ ++ case SIG_GROUP_PPU_XU: /* 2.2 PPU Execution Unit */ ++ switch (signal->bus_word) { ++ case BUS_WORD_0: ++ hid1_enable_mask = PPU_XU_WORD0_HID1_EN_MASK; ++ hid1_enable_word = PPU_XU_WORD0_HID1_EN_WORD; ++ break; ++ case BUS_WORD_1: ++ hid1_enable_mask = PPU_XU_WORD1_HID1_EN_MASK; ++ hid1_enable_word = PPU_XU_WORD1_HID1_EN_WORD; ++ break; ++ default: ++ PFM_ERR("Invalid bus-word (0x%x) for signal-group %d.", ++ signal->bus_word, signal->signal_group); ++ return -EINVAL; ++ } ++ break; ++ ++ default: ++ PFM_ERR("Signal-group %d not implemented.", ++ signal->signal_group); ++ return -EINVAL; ++ } ++ ++ rmw_spr(SPRN_HID1, hid1_enable_mask, hid1_enable_word); ++ ++ return 0; ++} ++ ++/** ++ * celleb_activate_signals ++ * ++ * Non-rtas version of activating the debug-bus signals. 
++ **/ ++static int celleb_activate_signals(struct cell_rtas_arg *signals, ++ int num_signals) ++{ ++ int i, rc = -EINVAL; ++ ++ for (i = 0; i < num_signals; i++) { ++ switch (signals[i].signal_group) { ++ ++ /* 2.x PowerPC Processor Unit (PPU) Signal Selection */ ++ case SIG_GROUP_PPU_IU1: ++ case SIG_GROUP_PPU_XU: ++ rc = ppu_selection(signals + i); ++ if (rc) ++ return rc; ++ break; ++ ++ default: ++ PFM_ERR("Signal-group %d not implemented.", ++ signals[i].signal_group); ++ return -EINVAL; ++ } ++ } ++ ++ if (0 < i) ++ rc = passthru_enable(signals[0].cpu); ++ ++ return rc; ++} ++ ++/** ++ * ps3_reset_signals ++ * ++ * ps3 version of resetting the debug-bus signals. ++ **/ ++static int ps3_reset_signals(u32 cpu) ++{ ++#ifdef CONFIG_PPC_PS3 ++ return ps3_set_signal(0, 0, 0, 0); ++#else ++ return 0; ++#endif ++} ++ ++/** ++ * ps3_activate_signals ++ * ++ * ps3 version of activating the debug-bus signals. ++ **/ ++static int ps3_activate_signals(struct cell_rtas_arg *signals, ++ int num_signals) ++{ ++#ifdef CONFIG_PPC_PS3 ++ int i; ++ ++ for (i = 0; i < num_signals; i++) ++ ps3_set_signal(signals[i].signal_group, signals[i].bit, ++ signals[i].sub_unit, signals[i].bus_word); ++#endif ++ return 0; ++} ++ ++ ++/** ++ * reset_signals ++ * ++ * Call to the firmware (if available) to reset the debug-bus signals. ++ * Otherwise call the built-in version. ++ **/ ++int reset_signals(u32 cpu) ++{ ++ int rc; ++ ++ if (machine_is(celleb)) ++ rc = celleb_reset_signals(cpu); ++ else if (machine_is(ps3)) ++ rc = ps3_reset_signals(cpu); ++ else ++ rc = rtas_reset_signals(cpu); ++ ++ return rc; ++} ++ ++/** ++ * activate_signals ++ * ++ * Call to the firmware (if available) to activate the debug-bus signals. ++ * Otherwise call the built-in version. 
++ **/ ++int activate_signals(struct cell_rtas_arg *signals, int num_signals) ++{ ++ int rc; ++ ++ if (machine_is(celleb)) ++ rc = celleb_activate_signals(signals, num_signals); ++ else if (machine_is(ps3)) ++ rc = ps3_activate_signals(signals, num_signals); ++ else ++ rc = rtas_activate_signals(signals, num_signals); ++ ++ return rc; ++} ++ ++/** ++ * pfm_cell_pmc_check ++ * ++ * Verify that we are going to write a valid value to the specified PMC. ++ **/ ++int pfm_cell_pmc_check(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req) ++{ ++ u16 cnum, reg_num = req->reg_num; ++ s16 signal_group = RTAS_SIGNAL_GROUP(req->reg_value); ++ u8 bus_word = RTAS_BUS_WORD(req->reg_value); ++ ++ if (reg_num < NR_CTRS || reg_num >= (NR_CTRS * 2)) ++ return -EINVAL; ++ ++ switch (signal_group) { ++ case SIG_GROUP_PPU_IU1: ++ case SIG_GROUP_PPU_XU: ++ if ((bus_word != 0) && (bus_word != 1)) { ++ PFM_ERR("Invalid bus word (%d) for signal-group %d", ++ bus_word, signal_group); ++ return -EINVAL; ++ } ++ break; ++ default: ++ PFM_ERR("Signal-group %d not implemented.", signal_group); ++ return -EINVAL; ++ } ++ ++ for (cnum = NR_CTRS; cnum < (NR_CTRS * 2); cnum++) { ++ if (test_bit(cnum, cast_ulp(set->used_pmcs)) && ++ bus_word == RTAS_BUS_WORD(set->pmcs[cnum]) && ++ signal_group != RTAS_SIGNAL_GROUP(set->pmcs[cnum])) { ++ PFM_ERR("Impossible signal-group combination: " ++ "(%u,%u,%d) (%u,%u,%d)", ++ reg_num, bus_word, signal_group, cnum, ++ RTAS_BUS_WORD(set->pmcs[cnum]), ++ RTAS_SIGNAL_GROUP(set->pmcs[cnum])); ++ return -EBUSY; ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * write_pm07_event ++ * ++ * Pull out the RTAS arguments from the 64-bit register value and make the ++ * RTAS activate-signals call. 
++ **/ ++static void write_pm07_event(int cpu, unsigned int ctr, u64 value) ++{ ++ struct cell_rtas_arg signal; ++ s32 signal_number; ++ int rc; ++ ++ signal_number = RTAS_SIGNAL_NUMBER(value); ++ if (!signal_number) { ++ /* Don't include counters that are counting cycles. */ ++ return; ++ } ++ ++ signal.cpu = RTAS_CPU(cpu); ++ signal.bus_word = 1 << RTAS_BUS_WORD(value); ++ signal.sub_unit = RTAS_SUB_UNIT(value); ++ signal.signal_group = signal_number / 100; ++ signal.bit = abs(signal_number) % 100; ++ ++ rc = activate_signals(&signal, 1); ++ if (rc) { ++ PFM_WARN("%s(%d, %u, %lu): Error calling " ++ "activate_signals(): %d\n", __func__, ++ cpu, ctr, (unsigned long)value, rc); ++ /* FIX: Could we change this routine to return an error? */ ++ } ++} ++ ++/** ++ * pfm_cell_probe_pmu ++ * ++ * Simply check the processor version register to see if we're currently ++ * on a Cell system. ++ **/ ++static int pfm_cell_probe_pmu(void) ++{ ++ unsigned long pvr = mfspr(SPRN_PVR); ++ ++ if (PVR_VER(pvr) != PV_BE) ++ return -1; ++ ++ return 0; ++} ++ ++/** ++ * pfm_cell_write_pmc ++ **/ ++static void pfm_cell_write_pmc(unsigned int cnum, u64 value) ++{ ++ int cpu = smp_processor_id(); ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ if (cnum < NR_CTRS) { ++ info->write_pm07_control(cpu, cnum, value); ++ ++ } else if (cnum < NR_CTRS * 2) { ++ write_pm07_event(cpu, cnum - NR_CTRS, value); ++ ++ } else if (cnum == CELL_PMC_PM_STATUS) { ++ /* The pm_status register must be treated separately from ++ * the other "global" PMCs. This call will ensure that ++ * the interrupts are routed to the correct CPU, as well ++ * as writing the desired value to the pm_status register. 
++ */ ++ info->enable_pm_interrupts(cpu, info->get_hw_thread_id(cpu), ++ value); ++ ++ } else if (cnum < PFM_PM_NUM_PMCS) { ++ info->write_pm(cpu, cnum - (NR_CTRS * 2), value); ++ } ++} ++ ++/** ++ * pfm_cell_write_pmd ++ **/ ++static void pfm_cell_write_pmd(unsigned int cnum, u64 value) ++{ ++ int cpu = smp_processor_id(); ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ if (cnum < NR_CTRS) ++ info->write_ctr(cpu, cnum, value); ++} ++ ++/** ++ * pfm_cell_read_pmd ++ **/ ++static u64 pfm_cell_read_pmd(unsigned int cnum) ++{ ++ int cpu = smp_processor_id(); ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ if (cnum < NR_CTRS) ++ return info->read_ctr(cpu, cnum); ++ ++ return -EINVAL; ++} ++ ++/** ++ * pfm_cell_enable_counters ++ * ++ * Just need to turn on the global disable bit in pm_control. ++ **/ ++static void pfm_cell_enable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ info->enable_pm(smp_processor_id()); ++} ++ ++/** ++ * pfm_cell_disable_counters ++ * ++ * Just need to turn off the global disable bit in pm_control. ++ **/ ++static void pfm_cell_disable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ info->disable_pm(smp_processor_id()); ++ if (machine_is(ps3)) ++ reset_signals(smp_processor_id()); ++} ++ ++/* ++ * Return the thread id of the specified ppu signal. 
++ */ ++static inline u32 get_target_ppu_thread_id(u32 group, u32 bit) ++{ ++ if ((group == SIG_GROUP_PPU_IU1 && ++ bit < PFM_PPU_IU1_THREAD1_BASE_BIT) || ++ (group == SIG_GROUP_PPU_XU && ++ bit < PFM_PPU_XU_THREAD1_BASE_BIT)) ++ return 0; ++ else ++ return 1; ++} ++ ++/* ++ * Return whether the specified counter is for PPU signal group. ++ */ ++static inline int is_counter_for_ppu_sig_grp(u32 counter_control, u32 sig_grp) ++{ ++ if (!(counter_control & CBE_PM_CTR_INPUT_CONTROL) && ++ (counter_control & CBE_PM_CTR_ENABLE) && ++ ((sig_grp == SIG_GROUP_PPU_IU1) || (sig_grp == SIG_GROUP_PPU_XU))) ++ return 1; ++ else ++ return 0; ++} ++ ++/* ++ * Search ppu signal groups. ++ */ ++static int get_ppu_signal_groups(struct pfm_event_set *set, ++ u32 *ppu_sig_grp0, u32 *ppu_sig_grp1) ++{ ++ u64 pm_event, *used_pmcs = set->used_pmcs; ++ int i, j; ++ u32 grp0_wd, grp1_wd, wd, sig_grp; ++ ++ *ppu_sig_grp0 = 0; ++ *ppu_sig_grp1 = 0; ++ grp0_wd = PFM_GROUP_CONTROL_GROUP0_WORD( ++ set->pmcs[CELL_PMC_GROUP_CONTROL]); ++ grp1_wd = PFM_GROUP_CONTROL_GROUP1_WORD( ++ set->pmcs[CELL_PMC_GROUP_CONTROL]); ++ ++ for (i = 0, j = 0; (i < NR_CTRS) && (j < PFM_NUM_OF_GROUPS); i++) { ++ if (test_bit(i + NR_CTRS, used_pmcs)) { ++ pm_event = set->pmcs[i + NR_CTRS]; ++ wd = PFM_EVENT_PMC_BUS_WORD(pm_event); ++ sig_grp = PFM_EVENT_PMC_SIGNAL_GROUP(pm_event); ++ if ((sig_grp == SIG_GROUP_PPU_IU1) || ++ (sig_grp == SIG_GROUP_PPU_XU)) { ++ ++ if (wd == grp0_wd && *ppu_sig_grp0 == 0) { ++ *ppu_sig_grp0 = sig_grp; ++ j++; ++ } else if (wd == grp1_wd && ++ *ppu_sig_grp1 == 0) { ++ *ppu_sig_grp1 = sig_grp; ++ j++; ++ } ++ } ++ } ++ } ++ return j; ++} ++ ++/** ++ * pfm_cell_restore_pmcs ++ * ++ * Write all control register values that are saved in the specified event ++ * set. We could use the pfm_arch_write_pmc() function to restore each PMC ++ * individually (as is done in other architectures), but that results in ++ * multiple RTAS calls. 
As an optimization, we will setup the RTAS argument ++ * array so we can do all event-control registers in one RTAS call. ++ * ++ * In per-thread mode, ++ * The counter enable bit of the pmX_control PMC is enabled while the target ++ * task runs on the target HW thread. ++ **/ ++void pfm_cell_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ u64 ctr_ctrl; ++ u64 *used_pmcs = set->used_pmcs; ++ int i; ++ int cpu = smp_processor_id(); ++ u32 current_th_id; ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ for (i = 0; i < NR_CTRS; i++) { ++ ctr_ctrl = set->pmcs[i]; ++ ++ if (ctr_ctrl & PFM_COUNTER_CTRL_PMC_PPU_TH0) { ++ current_th_id = info->get_hw_thread_id(cpu); ++ ++ /* ++ * Set the counter enable bit down if the current ++ * HW thread is NOT 0 ++ **/ ++ if (current_th_id) ++ ctr_ctrl = ctr_ctrl & ~CBE_PM_CTR_ENABLE; ++ ++ } else if (ctr_ctrl & PFM_COUNTER_CTRL_PMC_PPU_TH1) { ++ current_th_id = info->get_hw_thread_id(cpu); ++ ++ /* ++ * Set the counter enable bit down if the current ++ * HW thread is 0 ++ **/ ++ if (!current_th_id) ++ ctr_ctrl = ctr_ctrl & ~CBE_PM_CTR_ENABLE; ++ } ++ ++ /* Write the per-counter control register. If the PMC is not ++ * in use, then it will simply clear the register, which will ++ * disable the associated counter. ++ */ ++ info->write_pm07_control(cpu, i, ctr_ctrl); ++ ++ if (test_bit(i + NR_CTRS, used_pmcs)) ++ write_pm07_event(cpu, 0, set->pmcs[i + NR_CTRS]); ++ } ++ ++ /* Write all the global PMCs. Need to call pfm_cell_write_pmc() ++ * instead of cbe_write_pm() due to special handling for the ++ * pm_status register. ++ */ ++ for (i *= 2; i < PFM_PM_NUM_PMCS; i++) ++ pfm_cell_write_pmc(i, set->pmcs[i]); ++} ++ ++/** ++ * pfm_cell_restore_pmds ++ * ++ * Write to pm_control register before writing to counter registers ++ * so that we can decide the counter width berfore writing to the couters. 
++ **/ ++void pfm_cell_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ u64 *used_pmds; ++ unsigned int i, max_pmd; ++ int cpu = smp_processor_id(); ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ /* ++ * Write pm_control register value ++ */ ++ info->write_pm(cpu, pm_control, ++ set->pmcs[CELL_PMC_PM_CONTROL] & ++ ~CBE_PM_ENABLE_PERF_MON); ++ PFM_DBG("restore pm_control(0x%lx) before restoring pmds", ++ set->pmcs[CELL_PMC_PM_CONTROL]); ++ ++ max_pmd = ctx->regs.max_pmd; ++ used_pmds = set->used_pmds; ++ ++ for (i = 0; i < max_pmd; i++) ++ if (test_bit(i, used_pmds) && ++ !(pfm_pmu_conf->pmd_desc[i].type & PFM_REG_RO)) ++ pfm_cell_write_pmd(i, set->pmds[i].value); ++} ++ ++/** ++ * pfm_cell_get_cntr_width ++ * ++ * This function check the 16bit counter field in pm_control pmc. ++ * ++ * Return value ++ * 16 : all counters are 16bit width. ++ * 32 : all counters are 32bit width. ++ * 0 : several counter width exists. ++ **/ ++static int pfm_cell_get_cntr_width(struct pfm_context *ctx, ++ struct pfm_event_set *s) ++{ ++ int width = 0; ++ int tmp = 0; ++ u64 cntr_field; ++ ++ if (ctx->flags.switch_ovfl || ctx->flags.switch_time) { ++ list_for_each_entry(s, &ctx->set_list, list) { ++ cntr_field = s->pmcs[CELL_PMC_PM_CONTROL] & ++ CELL_PMC_PM_CONTROL_CNTR_MASK; ++ ++ if (cntr_field == CELL_PMC_PM_CONTROL_CNTR_16) ++ tmp = 16; ++ else if (cntr_field == 0x0) ++ tmp = 32; ++ else ++ return 0; ++ ++ if (tmp != width && width != 0) ++ return 0; ++ ++ width = tmp; ++ } ++ } else { ++ cntr_field = s->pmcs[CELL_PMC_PM_CONTROL] & ++ CELL_PMC_PM_CONTROL_CNTR_MASK; ++ ++ if (cntr_field == CELL_PMC_PM_CONTROL_CNTR_16) ++ width = 16; ++ else if (cntr_field == 0x0) ++ width = 32; ++ else ++ width = 0; ++ } ++ return width; ++} ++ ++/** ++ * pfm_cell_check_cntr_ovfl_mask ++ * ++ * Return value ++ * 1 : cntr_ovfl interrupt is used. ++ * 0 : cntr_ovfl interrupt is not used. 
++ **/ ++static int pfm_cell_check_cntr_ovfl(struct pfm_context *ctx, ++ struct pfm_event_set *s) ++{ ++ if (ctx->flags.switch_ovfl || ctx->flags.switch_time) { ++ list_for_each_entry(s, &ctx->set_list, list) { ++ if (CBE_PM_OVERFLOW_CTRS(s->pmcs[CELL_PMC_PM_STATUS])) ++ return 1; ++ } ++ } else { ++ if (CBE_PM_OVERFLOW_CTRS(s->pmcs[CELL_PMC_PM_STATUS])) ++ return 1; ++ } ++ return 0; ++} ++ ++#ifdef CONFIG_PPC_PS3 ++/** ++ * update_sub_unit_field ++ * ++ **/ ++static inline u64 update_sub_unit_field(u64 pm_event, u64 spe_id) ++{ ++ return ((pm_event & 0xFFFF0000FFFFFFFF) | (spe_id << 32)); ++} ++ ++/** ++ * pfm_get_spe_id ++ * ++ **/ ++static u64 pfm_get_spe_id(void *arg) ++{ ++ struct spu *spu = arg; ++ u64 spe_id; ++ ++ if (machine_is(ps3)) ++ spe_id = ps3_get_spe_id(arg); ++ else ++ spe_id = spu->spe_id; ++ ++ return spe_id; ++} ++ ++/** ++ * pfm_spu_number_to_id ++ * ++ **/ ++static int pfm_spu_number_to_id(int number, u64 *spe_id) ++{ ++ struct spu *spu; ++ int i; ++ ++ for (i = 0; i < MAX_NUMNODES; i++) { ++ if (cbe_spu_info[i].n_spus == 0) ++ continue; ++ ++ list_for_each_entry(spu, &cbe_spu_info[i].spus, cbe_list) ++ if (spu->number == number) { ++ *spe_id = pfm_get_spe_id(spu); ++ return 0; ++ } ++ } ++ return -ENODEV; ++} ++ ++/** ++ * pfm_update_pmX_event_subunit_field ++ * ++ * In system wide mode, ++ * This function updates the subunit field of SPE pmX_event. 
++ **/ ++static int pfm_update_pmX_event_subunit_field(struct pfm_context *ctx) ++{ ++ struct pfm_event_set *set; ++ int i, last_pmc, ret; ++ u64 signal_group, spe_id; ++ int sub_unit; ++ u64 *used_pmcs; ++ ++ last_pmc = NR_CTRS + 8; ++ ret = 0; ++ list_for_each_entry(set, &ctx->set_list, list) { ++ ++ used_pmcs = set->used_pmcs; ++ for (i = NR_CTRS; i < last_pmc; i++) { ++ if (!test_bit(i, used_pmcs)) ++ continue; ++ ++ signal_group = PFM_EVENT_PMC_SIGNAL_GROUP(set->pmcs[i]); ++ ++ /* ++ * If the target event is a SPE signal group event, ++ * The sub_unit field in pmX_event pmc is changed to the ++ * specified spe_id. ++ */ ++ if (SIG_GROUP_SPU_BASE < signal_group && ++ signal_group < SIG_GROUP_EIB_BASE) { ++ sub_unit = RTAS_SUB_UNIT(set->pmcs[i]); ++ ++ ret = pfm_spu_number_to_id(sub_unit, &spe_id); ++ if (ret) ++ return ret; ++ ++ set->pmcs[i] = update_sub_unit_field( ++ set->pmcs[i], spe_id); ++ } ++ } ++ } ++ return 0; ++} ++#endif ++ ++/** ++ * pfm_cell_load_context ++ * ++ * In per-thread mode, ++ * The pmX_control PMCs which are used for PPU IU/XU event are marked with ++ * the thread id(PFM_COUNTER_CTRL_PMC_PPU_TH0/TH1). ++ **/ ++static int pfm_cell_load_context(struct pfm_context *ctx) ++{ ++ int i; ++ u32 ppu_sig_grp[PFM_NUM_OF_GROUPS] = {SIG_GROUP_NONE, SIG_GROUP_NONE}; ++ u32 bit; ++ int index; ++ u32 target_th_id; ++ int ppu_sig_num = 0; ++ struct pfm_event_set *s; ++ int cntr_width = 32; ++ int ret = 0; ++ ++ if (pfm_cell_check_cntr_ovfl(ctx, ctx->active_set)) { ++ cntr_width = pfm_cell_get_cntr_width(ctx, ctx->active_set); ++ ++ /* ++ * Counter overflow interrupt works with only 32bit counter, ++ * because perfmon core uses pfm_cell_pmu_conf.counter_width ++ * to deal with the counter overflow. we can't change the ++ * counter width here. 
++ */ ++ if (cntr_width != 32) ++ return -EINVAL; ++ } ++ ++ if (ctx->flags.system) { ++#ifdef CONFIG_PPC_PS3 ++ if (machine_is(ps3)) ++ ret = pfm_update_pmX_event_subunit_field(ctx); ++#endif ++ return ret; ++ } ++ ++ list_for_each_entry(s, &ctx->set_list, list) { ++ ppu_sig_num = get_ppu_signal_groups(s, &ppu_sig_grp[0], ++ &ppu_sig_grp[1]); ++ ++ for (i = 0; i < NR_CTRS; i++) { ++ index = PFM_PM_CTR_INPUT_MUX_GROUP_INDEX(s->pmcs[i]); ++ if (ppu_sig_num && ++ (ppu_sig_grp[index] != SIG_GROUP_NONE) && ++ is_counter_for_ppu_sig_grp(s->pmcs[i], ++ ppu_sig_grp[index])) { ++ ++ bit = PFM_PM_CTR_INPUT_MUX_BIT(s->pmcs[i]); ++ target_th_id = get_target_ppu_thread_id( ++ ppu_sig_grp[index], bit); ++ if (!target_th_id) ++ s->pmcs[i] |= ++ PFM_COUNTER_CTRL_PMC_PPU_TH0; ++ else ++ s->pmcs[i] |= ++ PFM_COUNTER_CTRL_PMC_PPU_TH1; ++ PFM_DBG("set:%d mark ctr:%d target_thread:%d", ++ s->id, i, target_th_id); ++ } ++ } ++ } ++ ++ return ret; ++} ++ ++/** ++ * pfm_cell_unload_context ++ * ++ * For system-wide contexts and self-monitored contexts, make the RTAS call ++ * to reset the debug-bus signals. ++ * ++ * For non-self-monitored contexts, the monitored thread will already have ++ * been taken off the CPU and we don't need to do anything additional. ++ **/ ++static void pfm_cell_unload_context(struct pfm_context *ctx) ++{ ++ if (ctx->task == current || ctx->flags.system) ++ reset_signals(smp_processor_id()); ++} ++ ++/** ++ * pfm_cell_ctxswout_thread ++ * ++ * When a monitored thread is switched out (self-monitored or externally ++ * monitored) we need to reset the debug-bus signals so the next context that ++ * gets switched in can start from a clean set of signals. 
++ **/ ++int pfm_cell_ctxswout_thread(struct task_struct *task, ++ struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ reset_signals(smp_processor_id()); ++ return 0; ++} ++ ++/** ++ * pfm_cell_get_ovfl_pmds ++ * ++ * Determine which counters in this set have overflowed and fill in the ++ * set->povfl_pmds mask and set->npend_ovfls count. On Cell, the pm_status ++ * register contains a bit for each counter to indicate overflow. However, ++ * those 8 bits are in the reverse order than what Perfmon2 is expecting, ++ * so we need to reverse the order of the overflow bits. ++ **/ ++static void pfm_cell_get_ovfl_pmds(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); ++ u32 pm_status, ovfl_ctrs; ++ u64 povfl_pmds = 0; ++ int i; ++ struct pfm_cell_platform_pmu_info *info = ++ ((struct pfm_arch_pmu_info *) ++ (pfm_pmu_conf->pmu_info))->platform_info; ++ ++ if (!ctx_arch->last_read_updated) ++ /* This routine was not called via the interrupt handler. ++ * Need to start by getting interrupts and updating ++ * last_read_pm_status. ++ */ ++ ctx_arch->last_read_pm_status = ++ info->get_and_clear_pm_interrupts(smp_processor_id()); ++ ++ /* Reset the flag that the interrupt handler last read pm_status. */ ++ ctx_arch->last_read_updated = 0; ++ ++ pm_status = ctx_arch->last_read_pm_status & ++ set->pmcs[CELL_PMC_PM_STATUS]; ++ ovfl_ctrs = CBE_PM_OVERFLOW_CTRS(pm_status); ++ ++ /* Reverse the order of the bits in ovfl_ctrs ++ * and store the result in povfl_pmds. ++ */ ++ for (i = 0; i < PFM_PM_NUM_PMDS; i++) { ++ povfl_pmds = (povfl_pmds << 1) | (ovfl_ctrs & 1); ++ ovfl_ctrs >>= 1; ++ } ++ ++ /* Mask povfl_pmds with set->used_pmds to get set->povfl_pmds. ++ * Count the bits set in set->povfl_pmds to get set->npend_ovfls. 
++ */ ++ bitmap_and(set->povfl_pmds, &povfl_pmds, ++ set->used_pmds, PFM_PM_NUM_PMDS); ++ set->npend_ovfls = bitmap_weight(set->povfl_pmds, PFM_PM_NUM_PMDS); ++} ++ ++/** ++ * pfm_cell_acquire_pmu ++ * ++ * acquire PMU resource. ++ * This acquisition is done when the first context is created. ++ **/ ++int pfm_cell_acquire_pmu(u64 *unavail_pmcs, u64 *unavail_pmds) ++{ ++#ifdef CONFIG_PPC_PS3 ++ int ret; ++ ++ if (machine_is(ps3)) { ++ PFM_DBG(""); ++ ret = ps3_lpm_open(PS3_LPM_TB_TYPE_INTERNAL, NULL, 0); ++ if (ret) { ++ PFM_ERR("Can't create PS3 lpm. error:%d", ret); ++ return -EFAULT; ++ } ++ } ++#endif ++ return 0; ++} ++ ++/** ++ * pfm_cell_release_pmu ++ * ++ * release PMU resource. ++ * actual release happens when last context is destroyed ++ **/ ++void pfm_cell_release_pmu(void) ++{ ++#ifdef CONFIG_PPC_PS3 ++ if (machine_is(ps3)) { ++ if (ps3_lpm_close()) ++ PFM_ERR("Can't delete PS3 lpm."); ++ } ++#endif ++} ++ ++/** ++ * handle_trace_buffer_interrupts ++ * ++ * This routine is for processing just the interval timer and trace buffer ++ * overflow interrupts. Performance counter interrupts are handled by the ++ * perf_irq_handler() routine, which reads and saves the pm_status register. ++ * This routine should not read the actual pm_status register, but rather ++ * the value passed in. ++ **/ ++static void handle_trace_buffer_interrupts(unsigned long iip, ++ struct pt_regs *regs, ++ struct pfm_context *ctx, ++ u32 pm_status) ++{ ++ /* FIX: Currently ignoring trace-buffer interrupts. */ ++ return; ++} ++ ++/** ++ * pfm_cell_irq_handler ++ * ++ * Handler for all Cell performance-monitor interrupts. 
 *
 * Sequence: stop the performance monitor, read (and thereby clear) the
 * pending interrupt bits, dispatch counter-overflow and trace/interval
 * interrupts, re-arm the interrupts requested by the active set, and
 * finally restart the performance monitor.
 **/
static void pfm_cell_irq_handler(struct pt_regs *regs, struct pfm_context *ctx)
{
	struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx);
	u32 last_read_pm_status;
	int cpu = smp_processor_id();
	struct pfm_cell_platform_pmu_info *info =
		((struct pfm_arch_pmu_info *)
		 (pfm_pmu_conf->pmu_info))->platform_info;

	/* Need to disable and reenable the performance counters to get the
	 * desired behavior from the hardware. This is specific to the Cell
	 * PMU hardware.
	 */
	info->disable_pm(cpu);

	/* Read the pm_status register to get the interrupt bits. If a
	 * performance counter overflow interrupt occurred, call the core
	 * perfmon interrupt handler to service the counter overflow. If the
	 * interrupt was for the interval timer or the trace_buffer,
	 * call the interval timer and trace buffer interrupt handler.
	 *
	 * The value read from the pm_status register is stored in the
	 * pfm_arch_context structure for use by other routines. Note that
	 * reading the pm_status register resets the interrupt flags to zero.
	 * Hence, it is important that the register is only read in one place.
	 *
	 * The pm_status reg interrupt reg format is:
	 * [pmd0:pmd1:pmd2:pmd3:pmd4:pmd5:pmd6:pmd7:intt:tbf:tbu:]
	 * - pmd0 to pmd7 are the perf counter overflow interrupts.
	 * - intt is the interval timer overflowed interrupt.
	 * - tbf is the trace buffer full interrupt.
	 * - tbu is the trace buffer underflow interrupt.
	 * - The pmd0 bit is the MSB of the 32 bit register.
	 */
	ctx_arch->last_read_pm_status = last_read_pm_status =
			info->get_and_clear_pm_interrupts(cpu);

	/* Set flag for pfm_cell_get_ovfl_pmds() routine so it knows
	 * last_read_pm_status was updated by the interrupt handler.
	 */
	ctx_arch->last_read_updated = 1;

	if (last_read_pm_status & CBE_PM_ALL_OVERFLOW_INTR)
		/* At least one counter overflowed. */
		pfm_interrupt_handler(instruction_pointer(regs), regs);

	if (last_read_pm_status & (CBE_PM_INTERVAL_INTR |
				   CBE_PM_TRACE_BUFFER_FULL_INTR |
				   CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR))
		/* Trace buffer or interval timer overflow. */
		handle_trace_buffer_interrupts(instruction_pointer(regs),
					       regs, ctx, last_read_pm_status);

	/* The interrupt settings is the value written to the pm_status
	 * register. It is saved in the context when the register is
	 * written.
	 */
	info->enable_pm_interrupts(cpu, info->get_hw_thread_id(cpu),
				   ctx->active_set->pmcs[CELL_PMC_PM_STATUS]);

	/* The writes to the various performance counters only write to a
	 * latch. The new values (interrupt setting bits, reset counter value
	 * etc.) are not copied to the actual registers until the performance
	 * monitor is enabled. In order to get this to work as desired, the
	 * performance monitor needs to be disabled while writing to the
	 * latches. This is a HW design issue.
	 */
	info->enable_pm(cpu);
}


/*
 * Platform hooks for PS3. The table is only populated when the kernel is
 * built with CONFIG_PPC_PS3; the MMIO register-map and RTAS hooks are NULL
 * on this platform.
 */
static struct pfm_cell_platform_pmu_info ps3_platform_pmu_info = {
#ifdef CONFIG_PPC_PS3
	.read_ctr                    = ps3_read_ctr,
	.write_ctr                   = ps3_write_ctr,
	.write_pm07_control          = ps3_write_pm07_control,
	.write_pm                    = ps3_write_pm,
	.enable_pm                   = ps3_enable_pm,
	.disable_pm                  = ps3_disable_pm,
	.enable_pm_interrupts        = ps3_enable_pm_interrupts,
	.get_and_clear_pm_interrupts = ps3_get_and_clear_pm_interrupts,
	.get_hw_thread_id            = ps3_get_hw_thread_id,
	.get_cpu_ppe_priv_regs       = NULL,
	.get_cpu_pmd_regs            = NULL,
	.get_cpu_mic_tm_regs         = NULL,
	.rtas_token                  = NULL,
	.rtas_call                   = NULL,
#endif
};

/*
 * Platform hooks for native Cell systems. The table is only populated when
 * the kernel is built with CONFIG_PPC_CELL_NATIVE.
 */
static struct pfm_cell_platform_pmu_info native_platform_pmu_info = {
#ifdef CONFIG_PPC_CELL_NATIVE
	.read_ctr                    = cbe_read_ctr,
	.write_ctr                   = cbe_write_ctr,
	.write_pm07_control          = cbe_write_pm07_control,
	.write_pm                    = cbe_write_pm,
	.enable_pm                   = cbe_enable_pm,
	.disable_pm                  = cbe_disable_pm,
	.enable_pm_interrupts        = cbe_enable_pm_interrupts,
	.get_and_clear_pm_interrupts = cbe_get_and_clear_pm_interrupts,
	.get_hw_thread_id            = cbe_get_hw_thread_id,
	.get_cpu_ppe_priv_regs       = cbe_get_cpu_ppe_priv_regs,
	.get_cpu_pmd_regs            = cbe_get_cpu_pmd_regs,
	.get_cpu_mic_tm_regs         = cbe_get_cpu_mic_tm_regs,
	.rtas_token                  = rtas_token,
	.rtas_call                   = rtas_call,
#endif
};

/*
 * Architecture-level callbacks implemented by this file. platform_info is
 * not set in this initializer.
 */
static struct pfm_arch_pmu_info pfm_cell_pmu_info = {
	.pmu_style        = PFM_POWERPC_PMU_CELL,
	.acquire_pmu      = pfm_cell_acquire_pmu,
	.release_pmu      = pfm_cell_release_pmu,
	.write_pmc        = pfm_cell_write_pmc,
	.write_pmd        = pfm_cell_write_pmd,
	.read_pmd         = pfm_cell_read_pmd,
	.enable_counters  = pfm_cell_enable_counters,
	.disable_counters = pfm_cell_disable_counters,
	.irq_handler      = pfm_cell_irq_handler,
	.get_ovfl_pmds    = pfm_cell_get_ovfl_pmds,
	.restore_pmcs     = pfm_cell_restore_pmcs,
	.restore_pmds     = pfm_cell_restore_pmds,
	.ctxswout_thread  = pfm_cell_ctxswout_thread,
	.load_context     = pfm_cell_load_context,
	.unload_context   = pfm_cell_unload_context,
};

/*
 * PMU description for the Cell PMU: register tables, counter width and the
 * probe routine.
 */
static struct pfm_pmu_config pfm_cell_pmu_conf = {
	.pmu_name = "Cell",
	.version = "0.1",
	.counter_width = 32,
	.pmd_desc = pfm_cell_pmd_desc,
	.pmc_desc = pfm_cell_pmc_desc,
	.num_pmc_entries = PFM_PM_NUM_PMCS,
	.num_pmd_entries = PFM_PM_NUM_PMDS,
	.probe_pmu  = pfm_cell_probe_pmu,
	.pmu_info = &pfm_cell_pmu_info,
	.flags = PFM_PMU_BUILTIN_FLAG,
	.owner = THIS_MODULE,
};

/**
 * pfm_cell_platform_probe
 *
 * If we're on a system without the firmware rtas call available, set up the
 * PMC write-checker for all the pmX_event control registers.
++ **/ ++static void pfm_cell_platform_probe(void) ++{ ++ if (machine_is(celleb)) { ++ int cnum; ++ pfm_cell_pmu_conf.pmc_write_check = pfm_cell_pmc_check; ++ for (cnum = NR_CTRS; cnum < (NR_CTRS * 2); cnum++) ++ pfm_cell_pmc_desc[cnum].type |= PFM_REG_WC; ++ } ++ ++ if (machine_is(ps3)) ++ pfm_cell_pmu_info.platform_info = &ps3_platform_pmu_info; ++ else ++ pfm_cell_pmu_info.platform_info = &native_platform_pmu_info; ++} ++ ++static int __init pfm_cell_pmu_init_module(void) ++{ ++ pfm_cell_platform_probe(); ++ return pfm_pmu_register(&pfm_cell_pmu_conf); ++} ++ ++static void __exit pfm_cell_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_cell_pmu_conf); ++} ++ ++module_init(pfm_cell_pmu_init_module); ++module_exit(pfm_cell_pmu_cleanup_module); +diff --git a/arch/powerpc/perfmon/perfmon_power4.c b/arch/powerpc/perfmon/perfmon_power4.c +new file mode 100644 +index 0000000..eba9e8c +--- /dev/null ++++ b/arch/powerpc/perfmon/perfmon_power4.c +@@ -0,0 +1,309 @@ ++/* ++ * This file contains the POWER4 PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright (c) 2007, IBM Corporation. ++ * ++ * Based on a simple modification of perfmon_power5.c for POWER4 by ++ * Corey Ashford <cjashfor@us.ibm.com>. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++ ++MODULE_AUTHOR("Corey Ashford <cjashfor@us.ibm.com>"); ++MODULE_DESCRIPTION("POWER4 PMU description table"); ++MODULE_LICENSE("GPL"); ++ ++static struct pfm_regmap_desc pfm_power4_pmc_desc[] = { ++/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0), ++/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1), ++/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA) ++}; ++#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power4_pmc_desc) ++ ++/* The TB and PURR registers are read-only. Also, note that the TB register ++ * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers. ++ * For Perfmon2's purposes, we'll treat it as a single 64-bit register. ++ */ ++static struct pfm_regmap_desc pfm_power4_pmd_desc[] = { ++/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL), ++/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), ++/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), ++/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), ++/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), ++/* pmd5 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), ++/* pmd6 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), ++/* pmd7 */ PMD_D(PFM_REG_C, "PMC7", SPRN_PMC7), ++/* pmd8 */ PMD_D(PFM_REG_C, "PMC8", SPRN_PMC8) ++}; ++#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power4_pmd_desc) ++ ++static int pfm_power4_probe_pmu(void) ++{ ++ unsigned long pvr = mfspr(SPRN_PVR); ++ int ver = PVR_VER(pvr); ++ ++ if ((ver == PV_POWER4) || (ver == PV_POWER4p)) ++ return 0; ++ ++ return -1; ++} ++ ++static void pfm_power4_write_pmc(unsigned int cnum, u64 value) ++{ ++ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { ++ case SPRN_MMCR0: ++ mtspr(SPRN_MMCR0, value); ++ break; ++ case SPRN_MMCR1: ++ mtspr(SPRN_MMCR1, value); ++ 
break; ++ case SPRN_MMCRA: ++ mtspr(SPRN_MMCRA, value); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void pfm_power4_write_pmd(unsigned int cnum, u64 value) ++{ ++ u64 ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC1: ++ mtspr(SPRN_PMC1, value & ovfl_mask); ++ break; ++ case SPRN_PMC2: ++ mtspr(SPRN_PMC2, value & ovfl_mask); ++ break; ++ case SPRN_PMC3: ++ mtspr(SPRN_PMC3, value & ovfl_mask); ++ break; ++ case SPRN_PMC4: ++ mtspr(SPRN_PMC4, value & ovfl_mask); ++ break; ++ case SPRN_PMC5: ++ mtspr(SPRN_PMC5, value & ovfl_mask); ++ break; ++ case SPRN_PMC6: ++ mtspr(SPRN_PMC6, value & ovfl_mask); ++ break; ++ case SPRN_PMC7: ++ mtspr(SPRN_PMC7, value & ovfl_mask); ++ break; ++ case SPRN_PMC8: ++ mtspr(SPRN_PMC8, value & ovfl_mask); ++ break; ++ case SPRN_TBRL: ++ case SPRN_PURR: ++ /* Ignore writes to read-only registers. */ ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 pfm_power4_read_pmd(unsigned int cnum) ++{ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC1: ++ return mfspr(SPRN_PMC1); ++ case SPRN_PMC2: ++ return mfspr(SPRN_PMC2); ++ case SPRN_PMC3: ++ return mfspr(SPRN_PMC3); ++ case SPRN_PMC4: ++ return mfspr(SPRN_PMC4); ++ case SPRN_PMC5: ++ return mfspr(SPRN_PMC5); ++ case SPRN_PMC6: ++ return mfspr(SPRN_PMC6); ++ case SPRN_PMC7: ++ return mfspr(SPRN_PMC7); ++ case SPRN_PMC8: ++ return mfspr(SPRN_PMC8); ++ case SPRN_TBRL: ++ return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); ++ case SPRN_PURR: ++ if (cpu_has_feature(CPU_FTR_PURR)) ++ return mfspr(SPRN_PURR); ++ else ++ return 0; ++ default: ++ BUG(); ++ } ++} ++ ++/* forward decl */ ++static void pfm_power4_disable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++/** ++ * pfm_power4_enable_counters ++ * ++ **/ ++static void pfm_power4_enable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i, max_pmc; ++ ++ /* Make sure the counters are disabled 
before touching the other ++ control registers */ ++ pfm_power4_disable_counters(ctx, set); ++ ++ max_pmc = ctx->regs.max_pmc; ++ ++ /* Write MMCR0 last, and a fairly easy way to do this is to write ++ the registers in the reverse order */ ++ for (i = max_pmc; i != 0; i--) ++ if (test_bit(i - 1, set->used_pmcs)) ++ pfm_power4_write_pmc(i - 1, set->pmcs[i - 1]); ++} ++ ++/** ++ * pfm_power4_disable_counters ++ * ++ **/ ++static void pfm_power4_disable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ /* Set the Freeze Counters bit */ ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); ++ asm volatile ("sync"); ++} ++ ++/** ++ * pfm_power4_get_ovfl_pmds ++ * ++ * Determine which counters in this set have overflowed and fill in the ++ * set->povfl_pmds mask and set->npend_ovfls count. ++ **/ ++static void pfm_power4_get_ovfl_pmds(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i; ++ unsigned int max_pmd = ctx->regs.max_intr_pmd; ++ u64 *used_pmds = set->used_pmds; ++ u64 *cntr_pmds = ctx->regs.cnt_pmds; ++ u64 width_mask = 1 << pfm_pmu_conf->counter_width; ++ u64 new_val, mask[PFM_PMD_BV]; ++ ++ bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), ++ cast_ulp(used_pmds), max_pmd); ++ ++ for (i = 0; i < max_pmd; i++) { ++ if (test_bit(i, mask)) { ++ new_val = pfm_power4_read_pmd(i); ++ if (new_val & width_mask) { ++ set_bit(i, set->povfl_pmds); ++ set->npend_ovfls++; ++ } ++ } ++ } ++} ++ ++static void pfm_power4_irq_handler(struct pt_regs *regs, ++ struct pfm_context *ctx) ++{ ++ u32 mmcr0; ++ ++ /* Disable the counters (set the freeze bit) to not polute ++ * the counts. ++ */ ++ mmcr0 = mfspr(SPRN_MMCR0); ++ mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC)); ++ ++ /* Set the PMM bit (see comment below). */ ++ mtmsrd(mfmsr() | MSR_PMM); ++ ++ pfm_interrupt_handler(instruction_pointer(regs), regs); ++ ++ mmcr0 = mfspr(SPRN_MMCR0); ++ ++ /* ++ * Reset the perfmon trigger if ++ * not in masking mode. 
++ */ ++ if (ctx->state != PFM_CTX_MASKED) ++ mmcr0 |= MMCR0_PMXE; ++ ++ /* ++ * We must clear the PMAO bit on some (GQ) chips. Just do it ++ * all the time. ++ */ ++ mmcr0 &= ~MMCR0_PMAO; ++ ++ /* ++ * Now clear the freeze bit, counting will not start until we ++ * rfid from this exception, because only at that point will ++ * the PMM bit be cleared. ++ */ ++ mmcr0 &= ~MMCR0_FC; ++ mtspr(SPRN_MMCR0, mmcr0); ++} ++ ++static void pfm_power4_resend_irq(struct pfm_context *ctx) ++{ ++ /* ++ * Assert the PMAO bit to cause a PMU interrupt. Make sure we ++ * trigger the edge detection circuitry for PMAO ++ */ ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO); ++} ++ ++struct pfm_arch_pmu_info pfm_power4_pmu_info = { ++ .pmu_style = PFM_POWERPC_PMU_POWER4, ++ .write_pmc = pfm_power4_write_pmc, ++ .write_pmd = pfm_power4_write_pmd, ++ .read_pmd = pfm_power4_read_pmd, ++ .irq_handler = pfm_power4_irq_handler, ++ .get_ovfl_pmds = pfm_power4_get_ovfl_pmds, ++ .enable_counters = pfm_power4_enable_counters, ++ .disable_counters = pfm_power4_disable_counters, ++ .resend_irq = pfm_power4_resend_irq ++}; ++ ++/* ++ * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
++ */ ++static struct pfm_pmu_config pfm_power4_pmu_conf = { ++ .pmu_name = "POWER4", ++ .counter_width = 31, ++ .pmd_desc = pfm_power4_pmd_desc, ++ .pmc_desc = pfm_power4_pmc_desc, ++ .num_pmc_entries = PFM_PM_NUM_PMCS, ++ .num_pmd_entries = PFM_PM_NUM_PMDS, ++ .probe_pmu = pfm_power4_probe_pmu, ++ .pmu_info = &pfm_power4_pmu_info, ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE ++}; ++ ++static int __init pfm_power4_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_power4_pmu_conf); ++} ++ ++static void __exit pfm_power4_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_power4_pmu_conf); ++} ++ ++module_init(pfm_power4_pmu_init_module); ++module_exit(pfm_power4_pmu_cleanup_module); +diff --git a/arch/powerpc/perfmon/perfmon_power5.c b/arch/powerpc/perfmon/perfmon_power5.c +new file mode 100644 +index 0000000..f4bb1ac +--- /dev/null ++++ b/arch/powerpc/perfmon/perfmon_power5.c +@@ -0,0 +1,326 @@ ++/* ++ * This file contains the POWER5 PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright (c) 2005 David Gibson, IBM Corporation. ++ * ++ * Based on perfmon_p6.c: ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++ ++MODULE_AUTHOR("David Gibson <dwg@au1.ibm.com>"); ++MODULE_DESCRIPTION("POWER5 PMU description table"); ++MODULE_LICENSE("GPL"); ++ ++static struct pfm_regmap_desc pfm_power5_pmc_desc[] = { ++/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0), ++/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1), ++/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA) ++}; ++#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power5_pmc_desc) ++ ++/* The TB and PURR registers are read-only. Also, note that the TB register ++ * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers. ++ * For Perfmon2's purposes, we'll treat it as a single 64-bit register. ++ */ ++static struct pfm_regmap_desc pfm_power5_pmd_desc[] = { ++/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL), ++/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), ++/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), ++/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), ++/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), ++/* pmd5 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), ++/* pmd6 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), ++/* purr */ PMD_D((PFM_REG_I|PFM_REG_RO), "PURR", SPRN_PURR), ++}; ++#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power5_pmd_desc) ++ ++/* forward decl */ ++static void pfm_power5_disable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++static int pfm_power5_probe_pmu(void) ++{ ++ unsigned long pvr = mfspr(SPRN_PVR); ++ ++ switch (PVR_VER(pvr)) { ++ case PV_POWER5: ++ return 0; ++ case PV_POWER5p: ++ return (PVR_REV(pvr) < 0x300) ? 
0 : -1; ++ default: ++ return -1; ++ } ++} ++ ++static void pfm_power5_write_pmc(unsigned int cnum, u64 value) ++{ ++ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { ++ case SPRN_MMCR0: ++ mtspr(SPRN_MMCR0, value); ++ break; ++ case SPRN_MMCR1: ++ mtspr(SPRN_MMCR1, value); ++ break; ++ case SPRN_MMCRA: ++ mtspr(SPRN_MMCRA, value); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void pfm_power5_write_pmd(unsigned int cnum, u64 value) ++{ ++ u64 ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC1: ++ mtspr(SPRN_PMC1, value & ovfl_mask); ++ break; ++ case SPRN_PMC2: ++ mtspr(SPRN_PMC2, value & ovfl_mask); ++ break; ++ case SPRN_PMC3: ++ mtspr(SPRN_PMC3, value & ovfl_mask); ++ break; ++ case SPRN_PMC4: ++ mtspr(SPRN_PMC4, value & ovfl_mask); ++ break; ++ case SPRN_PMC5: ++ mtspr(SPRN_PMC5, value & ovfl_mask); ++ break; ++ case SPRN_PMC6: ++ mtspr(SPRN_PMC6, value & ovfl_mask); ++ break; ++ case SPRN_TBRL: ++ case SPRN_PURR: ++ /* Ignore writes to read-only registers. 
*/ ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 pfm_power5_read_pmd(unsigned int cnum) ++{ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC1: ++ return mfspr(SPRN_PMC1); ++ case SPRN_PMC2: ++ return mfspr(SPRN_PMC2); ++ case SPRN_PMC3: ++ return mfspr(SPRN_PMC3); ++ case SPRN_PMC4: ++ return mfspr(SPRN_PMC4); ++ case SPRN_PMC5: ++ return mfspr(SPRN_PMC5); ++ case SPRN_PMC6: ++ return mfspr(SPRN_PMC6); ++ case SPRN_TBRL: ++ return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); ++ case SPRN_PURR: ++ if (cpu_has_feature(CPU_FTR_PURR)) ++ return mfspr(SPRN_PURR); ++ else ++ return 0; ++ default: ++ BUG(); ++ } ++} ++ ++/** ++ * pfm_power5_enable_counters ++ * ++ **/ ++static void pfm_power5_enable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i, max_pmc; ++ ++ /* ++ * Make sure the counters are disabled before touching the ++ * other control registers ++ */ ++ pfm_power5_disable_counters(ctx, set); ++ ++ max_pmc = ctx->regs.max_pmc; ++ ++ /* ++ * Write MMCR0 last, and a fairly easy way to do ++ * this is to write the registers in the reverse ++ * order ++ */ ++ for (i = max_pmc; i != 0; i--) ++ if (test_bit(i - 1, set->used_pmcs)) ++ pfm_power5_write_pmc(i - 1, set->pmcs[i - 1]); ++} ++ ++/** ++ * pfm_power5_disable_counters ++ * ++ * Just need to zero all the control registers. ++ **/ ++static void pfm_power5_disable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ /* Set the Freeze Counters bit */ ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); ++ asm volatile ("sync"); ++} ++ ++/** ++ * pfm_power5_get_ovfl_pmds ++ * ++ * Determine which counters in this set have overflowed and fill in the ++ * set->povfl_pmds mask and set->npend_ovfls count. 
++ **/ ++static void pfm_power5_get_ovfl_pmds(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i; ++ unsigned int max = ctx->regs.max_intr_pmd; ++ u64 *used_pmds = set->used_pmds; ++ u64 *intr_pmds = ctx->regs.intr_pmds; ++ u64 width_mask = 1 << pfm_pmu_conf->counter_width; ++ u64 new_val, mask[PFM_PMD_BV]; ++ ++ bitmap_and(cast_ulp(mask), cast_ulp(intr_pmds), ++ cast_ulp(used_pmds), max); ++ /* ++ * If either PMC5 or PMC6 are not being used, just zero out the unused ++ * ones so that they won't interrupt again for another 2^31 counts. ++ * Note that if no other counters overflowed, set->npend_ovfls will ++ * be zero upon returning from this call (i.e. a spurious ++ * interrupt), but that should be ok. ++ * ++ * If neither PMC5 nor PMC6 are used, the counters should be frozen ++ * via MMCR0_FC5_6 and zeroed out. ++ * ++ * If both PMC5 and PMC6 are used, they can be handled correctly by ++ * the loop that follows. ++ */ ++ ++ if (!test_bit(5, cast_ulp(used_pmds))) ++ mtspr(SPRN_PMC5, 0); ++ if (!test_bit(6, cast_ulp(used_pmds))) ++ mtspr(SPRN_PMC6, 0); ++ ++ for (i = 0; i < max; i++) { ++ if (test_bit(i, mask)) { ++ new_val = pfm_power5_read_pmd(i); ++ if (new_val & width_mask) { ++ set_bit(i, set->povfl_pmds); ++ set->npend_ovfls++; ++ } ++ } ++ } ++} ++ ++static void pfm_power5_irq_handler(struct pt_regs *regs, ++ struct pfm_context *ctx) ++{ ++ u32 mmcr0; ++ ++ /* Disable the counters (set the freeze bit) to not polute ++ * the counts. ++ */ ++ mmcr0 = mfspr(SPRN_MMCR0); ++ mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC)); ++ ++ /* Set the PMM bit (see comment below). */ ++ mtmsrd(mfmsr() | MSR_PMM); ++ ++ pfm_interrupt_handler(instruction_pointer(regs), regs); ++ ++ mmcr0 = mfspr(SPRN_MMCR0); ++ ++ /* ++ * Reset the perfmon trigger if ++ * not in masking mode. ++ */ ++ if (ctx->state != PFM_CTX_MASKED) ++ mmcr0 |= MMCR0_PMXE; ++ ++ /* ++ * We must clear the PMAO bit on some (GQ) chips. Just do it ++ * all the time. 
++ */ ++ mmcr0 &= ~MMCR0_PMAO; ++ ++ /* ++ * Now clear the freeze bit, counting will not start until we ++ * rfid from this exception, because only at that point will ++ * the PMM bit be cleared. ++ */ ++ mmcr0 &= ~MMCR0_FC; ++ mtspr(SPRN_MMCR0, mmcr0); ++} ++ ++static void pfm_power5_resend_irq(struct pfm_context *ctx) ++{ ++ /* ++ * Assert the PMAO bit to cause a PMU interrupt. Make sure we ++ * trigger the edge detection circuitry for PMAO ++ */ ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO); ++} ++ ++struct pfm_arch_pmu_info pfm_power5_pmu_info = { ++ .pmu_style = PFM_POWERPC_PMU_POWER5, ++ .write_pmc = pfm_power5_write_pmc, ++ .write_pmd = pfm_power5_write_pmd, ++ .read_pmd = pfm_power5_read_pmd, ++ .irq_handler = pfm_power5_irq_handler, ++ .get_ovfl_pmds = pfm_power5_get_ovfl_pmds, ++ .enable_counters = pfm_power5_enable_counters, ++ .disable_counters = pfm_power5_disable_counters, ++ .resend_irq = pfm_power5_resend_irq ++}; ++ ++/* ++ * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
++ */ ++static struct pfm_pmu_config pfm_power5_pmu_conf = { ++ .pmu_name = "POWER5", ++ .counter_width = 31, ++ .pmd_desc = pfm_power5_pmd_desc, ++ .pmc_desc = pfm_power5_pmc_desc, ++ .num_pmc_entries = PFM_PM_NUM_PMCS, ++ .num_pmd_entries = PFM_PM_NUM_PMDS, ++ .probe_pmu = pfm_power5_probe_pmu, ++ .pmu_info = &pfm_power5_pmu_info, ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE ++}; ++ ++static int __init pfm_power5_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_power5_pmu_conf); ++} ++ ++static void __exit pfm_power5_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_power5_pmu_conf); ++} ++ ++module_init(pfm_power5_pmu_init_module); ++module_exit(pfm_power5_pmu_cleanup_module); +diff --git a/arch/powerpc/perfmon/perfmon_power6.c b/arch/powerpc/perfmon/perfmon_power6.c +new file mode 100644 +index 0000000..7882feb +--- /dev/null ++++ b/arch/powerpc/perfmon/perfmon_power6.c +@@ -0,0 +1,520 @@ ++/* ++ * This file contains the POWER6 PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Copyright (c) 2007, IBM Corporation ++ * ++ * Based on perfmon_power5.c, and written by Carl Love <carll@us.ibm.com> ++ * and Kevin Corry <kevcorry@us.ibm.com>. Some fixes and refinement by ++ * Corey Ashford <cjashfor@us.ibm.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++ ++MODULE_AUTHOR("Corey Ashford <cjashfor@us.ibm.com>"); ++MODULE_DESCRIPTION("POWER6 PMU description table"); ++MODULE_LICENSE("GPL"); ++ ++static struct pfm_regmap_desc pfm_power6_pmc_desc[] = { ++/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0), ++/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1), ++/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA) ++}; ++#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power6_pmc_desc) ++#define PFM_DELTA_TB 10000 /* Not a real registers */ ++#define PFM_DELTA_PURR 10001 ++ ++/* ++ * counters wrap to zero at transition from 2^32-1 to 2^32. Note: ++ * interrupt generated at transition from 2^31-1 to 2^31 ++ */ ++#define OVERFLOW_VALUE 0x100000000UL ++ ++/* The TB and PURR registers are read-only. Also, note that the TB register ++ * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers. ++ * For Perfmon2's purposes, we'll treat it as a single 64-bit register. ++ */ ++static struct pfm_regmap_desc pfm_power6_pmd_desc[] = { ++ /* On POWER 6 PMC5 and PMC6 are not writable, they do not ++ * generate interrupts, and do not qualify their counts ++ * based on problem mode, supervisor mode or hypervisor mode. ++ * These two counters are implemented as virtual counters ++ * to make the appear to work like the other counters. A ++ * kernel timer is used sample the real PMC5 and PMC6 and ++ * update the virtual counters. 
++ */ ++/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL), ++/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), ++/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), ++/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), ++/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), ++/* pmd5 */ PMD_D((PFM_REG_I|PFM_REG_V), "PMC5", SPRN_PMC5), ++/* pmd6 */ PMD_D((PFM_REG_I|PFM_REG_V), "PMC6", SPRN_PMC6), ++/* purr */ PMD_D((PFM_REG_I|PFM_REG_RO), "PURR", SPRN_PURR), ++/* delta purr */ PMD_D((PFM_REG_I|PFM_REG_V), "DELTA_TB", PFM_DELTA_TB), ++/* delta tb */ PMD_D((PFM_REG_I|PFM_REG_V), "DELTA_PURR", PFM_DELTA_PURR), ++}; ++ ++#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power6_pmd_desc) ++ ++u32 pmc5_start_save[NR_CPUS]; ++u32 pmc6_start_save[NR_CPUS]; ++ ++static struct timer_list pmc5_6_update[NR_CPUS]; ++u64 enable_cntrs_cnt; ++u64 disable_cntrs_cnt; ++u64 call_delta; ++u64 pm5_6_interrupt; ++u64 pm1_4_interrupt; ++/* need ctx_arch for kernel timer. Can't get it in context of the kernel ++ * timer. ++ */ ++struct pfm_arch_context *pmc5_6_ctx_arch[NR_CPUS]; ++long int update_time; ++ ++static void delta(int cpu_num, struct pfm_arch_context *ctx_arch) ++{ ++ u32 tmp5, tmp6; ++ ++ call_delta++; ++ ++ tmp5 = (u32) mfspr(SPRN_PMC5); ++ tmp6 = (u32) mfspr(SPRN_PMC6); ++ ++ /* ++ * The following difference calculation relies on 32-bit modular ++ * arithmetic for the deltas to come out correct (especially in the ++ * presence of a 32-bit counter wrap). 
++ */ ++ ctx_arch->powergs_pmc5 += (u64)(tmp5 - pmc5_start_save[cpu_num]); ++ ctx_arch->powergs_pmc6 += (u64)(tmp6 - pmc6_start_save[cpu_num]); ++ ++ pmc5_start_save[cpu_num] = tmp5; ++ pmc6_start_save[cpu_num] = tmp6; ++ ++ return; ++} ++ ++ ++static void pmc5_6_updater(unsigned long cpu_num) ++{ ++ /* update the virtual pmd 5 and pmd 6 counters */ ++ ++ delta(cpu_num, pmc5_6_ctx_arch[cpu_num]); ++ mod_timer(&pmc5_6_update[cpu_num], jiffies + update_time); ++} ++ ++ ++static int pfm_power6_probe_pmu(void) ++{ ++ unsigned long pvr = mfspr(SPRN_PVR); ++ ++ switch (PVR_VER(pvr)) { ++ case PV_POWER6: ++ return 0; ++ case PV_POWER5p: ++ /* If this is a POWER5+ and the revision is less than 0x300, ++ don't treat it as a POWER6. */ ++ return (PVR_REV(pvr) < 0x300) ? -1 : 0; ++ default: ++ return -1; ++ } ++} ++ ++static void pfm_power6_write_pmc(unsigned int cnum, u64 value) ++{ ++ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { ++ case SPRN_MMCR0: ++ mtspr(SPRN_MMCR0, value); ++ break; ++ case SPRN_MMCR1: ++ mtspr(SPRN_MMCR1, value); ++ break; ++ case SPRN_MMCRA: ++ mtspr(SPRN_MMCRA, value); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void pfm_power6_write_pmd(unsigned int cnum, u64 value) ++{ ++ /* On POWER 6 PMC5 and PMC6 are implemented as ++ * virtual counters. See comment in pfm_power6_pmd_desc ++ * definition. ++ */ ++ u64 ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC1: ++ mtspr(SPRN_PMC1, value & ovfl_mask); ++ break; ++ case SPRN_PMC2: ++ mtspr(SPRN_PMC2, value & ovfl_mask); ++ break; ++ case SPRN_PMC3: ++ mtspr(SPRN_PMC3, value & ovfl_mask); ++ break; ++ case SPRN_PMC4: ++ mtspr(SPRN_PMC4, value & ovfl_mask); ++ break; ++ case SPRN_TBRL: ++ case SPRN_PURR: ++ /* Ignore writes to read-only registers. 
*/ ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 pfm_power6_sread(struct pfm_context *ctx, unsigned int cnum) ++{ ++ struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); ++ int cpu_num = smp_processor_id(); ++ ++ /* On POWER 6 PMC5 and PMC6 are implemented as ++ * virtual counters. See comment in pfm_power6_pmd_desc ++ * definition. ++ */ ++ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC5: ++ return ctx_arch->powergs_pmc5 + (u64)((u32)mfspr(SPRN_PMC5) - pmc5_start_save[cpu_num]); ++ break; ++ ++ case SPRN_PMC6: ++ return ctx_arch->powergs_pmc6 + (u64)((u32)mfspr(SPRN_PMC6) - pmc6_start_save[cpu_num]); ++ break; ++ ++ case PFM_DELTA_TB: ++ return ctx_arch->delta_tb ++ + (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL)) ++ - ctx_arch->delta_tb_start; ++ break; ++ ++ case PFM_DELTA_PURR: ++ return ctx_arch->delta_purr ++ + mfspr(SPRN_PURR) ++ - ctx_arch->delta_purr_start; ++ break; ++ ++ default: ++ BUG(); ++ } ++} ++ ++void pfm_power6_swrite(struct pfm_context *ctx, unsigned int cnum, ++ u64 val) ++{ ++ struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); ++ int cpu_num = smp_processor_id(); ++ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC5: ++ pmc5_start_save[cpu_num] = mfspr(SPRN_PMC5); ++ ctx_arch->powergs_pmc5 = val; ++ break; ++ ++ case SPRN_PMC6: ++ pmc6_start_save[cpu_num] = mfspr(SPRN_PMC6); ++ ctx_arch->powergs_pmc6 = val; ++ break; ++ ++ case PFM_DELTA_TB: ++ ctx_arch->delta_tb_start = ++ (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL)); ++ ctx_arch->delta_tb = val; ++ break; ++ ++ case PFM_DELTA_PURR: ++ ctx_arch->delta_purr_start = mfspr(SPRN_PURR); ++ ctx_arch->delta_purr = val; ++ break; ++ ++ default: ++ BUG(); ++ } ++} ++ ++static u64 pfm_power6_read_pmd(unsigned int cnum) ++{ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC1: ++ return mfspr(SPRN_PMC1); ++ case SPRN_PMC2: ++ return mfspr(SPRN_PMC2); ++ case SPRN_PMC3: ++ return mfspr(SPRN_PMC3); ++ case SPRN_PMC4: ++ 
return mfspr(SPRN_PMC4); ++ case SPRN_TBRL: ++ return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); ++ case SPRN_PURR: ++ if (cpu_has_feature(CPU_FTR_PURR)) ++ return mfspr(SPRN_PURR); ++ else ++ return 0; ++ default: ++ BUG(); ++ } ++} ++ ++ ++/** ++ * pfm_power6_enable_counters ++ * ++ **/ ++static void pfm_power6_enable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ ++ unsigned int i, max_pmc; ++ int cpu_num = smp_processor_id(); ++ struct pfm_arch_context *ctx_arch; ++ ++ enable_cntrs_cnt++; ++ ++ /* need the ctx passed down to the routine */ ++ ctx_arch = pfm_ctx_arch(ctx); ++ max_pmc = ctx->regs.max_pmc; ++ ++ /* Write MMCR0 last, and a fairly easy way to do this is to write ++ the registers in the reverse order */ ++ for (i = max_pmc; i != 0; i--) ++ if (test_bit(i - 1, set->used_pmcs)) ++ pfm_power6_write_pmc(i - 1, set->pmcs[i - 1]); ++ ++ /* save current free running HW event count */ ++ pmc5_start_save[cpu_num] = mfspr(SPRN_PMC5); ++ pmc6_start_save[cpu_num] = mfspr(SPRN_PMC6); ++ ++ ctx_arch->delta_purr_start = mfspr(SPRN_PURR); ++ ++ if (cpu_has_feature(CPU_FTR_PURR)) ++ ctx_arch->delta_tb_start = ++ ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); ++ else ++ ctx_arch->delta_tb_start = 0; ++ ++ /* Start kernel timer for this cpu to periodically update ++ * the virtual counters. ++ */ ++ init_timer(&pmc5_6_update[cpu_num]); ++ pmc5_6_update[cpu_num].function = pmc5_6_updater; ++ pmc5_6_update[cpu_num].data = (unsigned long) cpu_num; ++ pmc5_6_update[cpu_num].expires = jiffies + update_time; ++ /* context for this timer, timer will be removed if context ++ * is switched because the counters will be stopped first. ++ * NEEDS WORK, I think this is all ok, a little concerned about a ++ * race between the kernel timer going off right as the counters ++ * are being stopped and the context switching. Need to think ++ * about this. 
++ */ ++ pmc5_6_ctx_arch[cpu_num] = ctx_arch; ++ add_timer(&pmc5_6_update[cpu_num]); ++} ++ ++/** ++ * pfm_power6_disable_counters ++ * ++ **/ ++static void pfm_power6_disable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ struct pfm_arch_context *ctx_arch; ++ int cpu_num = smp_processor_id(); ++ ++ disable_cntrs_cnt++; ++ ++ /* Set the Freeze Counters bit */ ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); ++ asm volatile ("sync"); ++ ++ /* delete kernel update timer */ ++ del_timer_sync(&pmc5_6_update[cpu_num]); ++ ++ /* Update the virtual pmd 5 and 6 counters from the free running ++ * HW counters ++ */ ++ ctx_arch = pfm_ctx_arch(ctx); ++ delta(cpu_num, ctx_arch); ++ ++ ctx_arch->delta_tb += ++ (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL)) ++ - ctx_arch->delta_tb_start; ++ ++ ctx_arch->delta_purr += mfspr(SPRN_PURR) ++ - ctx_arch->delta_purr_start; ++} ++ ++/** ++ * pfm_power6_get_ovfl_pmds ++ * ++ * Determine which counters in this set have overflowed and fill in the ++ * set->povfl_pmds mask and set->npend_ovfls count. 
++ **/ ++static void pfm_power6_get_ovfl_pmds(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i; ++ unsigned int first_intr_pmd = ctx->regs.first_intr_pmd; ++ unsigned int max_intr_pmd = ctx->regs.max_intr_pmd; ++ u64 *used_pmds = set->used_pmds; ++ u64 *cntr_pmds = ctx->regs.cnt_pmds; ++ u64 width_mask = 1ULL << pfm_pmu_conf->counter_width; /* 64-bit shift: bit 31 must not sign-extend */ ++ u64 new_val, mask[PFM_PMD_BV]; ++ ++ bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), cast_ulp(used_pmds), max_intr_pmd); ++ ++ /* max_intr_pmd is actually the last interrupting pmd register + 1 */ ++ for (i = first_intr_pmd; i < max_intr_pmd; i++) { ++ if (test_bit(i, mask)) { ++ new_val = pfm_power6_read_pmd(i); ++ if (new_val & width_mask) { ++ set_bit(i, set->povfl_pmds); ++ set->npend_ovfls++; ++ } ++ } ++ } ++} ++ ++static void pfm_power6_irq_handler(struct pt_regs *regs, ++ struct pfm_context *ctx) ++{ ++ u32 mmcr0; ++ u64 mmcra; ++ ++ /* Disable the counters (set the freeze bit) to not pollute ++ * the counts. ++ */ ++ mmcr0 = mfspr(SPRN_MMCR0); ++ mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC)); ++ mmcra = mfspr(SPRN_MMCRA); ++ ++ /* Set the PMM bit (see comment below). */ ++ mtmsrd(mfmsr() | MSR_PMM); ++ ++ pm1_4_interrupt++; ++ ++ pfm_interrupt_handler(instruction_pointer(regs), regs); ++ ++ mmcr0 = mfspr(SPRN_MMCR0); ++ ++ /* ++ * Reset the perfmon trigger if ++ * not in masking mode. ++ */ ++ if (ctx->state != PFM_CTX_MASKED) ++ mmcr0 |= MMCR0_PMXE; ++ ++ /* ++ * Clear the PMU Alert Occurred bit ++ */ ++ mmcr0 &= ~MMCR0_PMAO; ++ ++ /* Clear the appropriate bits in the MMCRA. */ ++ mmcra &= ~(POWER6_MMCRA_THRM | POWER6_MMCRA_OTHER); ++ mtspr(SPRN_MMCRA, mmcra); ++ ++ /* ++ * Now clear the freeze bit, counting will not start until we ++ * rfid from this exception, because only at that point will ++ * the PMM bit be cleared.
++ */ ++ mmcr0 &= ~MMCR0_FC; ++ mtspr(SPRN_MMCR0, mmcr0); ++} ++ ++static void pfm_power6_resend_irq(struct pfm_context *ctx) ++{ ++ /* ++ * Assert the PMAO bit to cause a PMU interrupt. Make sure we ++ * trigger the edge detection circuitry for PMAO ++ */ ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); ++ mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO); ++} ++ ++struct pfm_arch_pmu_info pfm_power6_pmu_info = { ++ .pmu_style = PFM_POWERPC_PMU_POWER6, ++ .write_pmc = pfm_power6_write_pmc, ++ .write_pmd = pfm_power6_write_pmd, ++ .read_pmd = pfm_power6_read_pmd, ++ .irq_handler = pfm_power6_irq_handler, ++ .get_ovfl_pmds = pfm_power6_get_ovfl_pmds, ++ .enable_counters = pfm_power6_enable_counters, ++ .disable_counters = pfm_power6_disable_counters, ++ .resend_irq = pfm_power6_resend_irq ++}; ++ ++/* ++ * impl_pmcs, impl_pmds are computed at runtime to minimize errors! ++ */ ++static struct pfm_pmu_config pfm_power6_pmu_conf = { ++ .pmu_name = "POWER6", ++ .counter_width = 31, ++ .pmd_desc = pfm_power6_pmd_desc, ++ .pmc_desc = pfm_power6_pmc_desc, ++ .num_pmc_entries = PFM_PM_NUM_PMCS, ++ .num_pmd_entries = PFM_PM_NUM_PMDS, ++ .probe_pmu = pfm_power6_probe_pmu, ++ .pmu_info = &pfm_power6_pmu_info, ++ .pmd_sread = pfm_power6_sread, ++ .pmd_swrite = pfm_power6_swrite, ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE ++}; ++ ++static int __init pfm_power6_pmu_init_module(void) ++{ ++ int ret; ++ disable_cntrs_cnt = 0; ++ enable_cntrs_cnt = 0; ++ call_delta = 0; ++ pm5_6_interrupt = 0; ++ pm1_4_interrupt = 0; ++ ++ /* calculate the time for updating counters 5 and 6 */ ++ ++ /* ++ * MAX_EVENT_RATE assumes a max instruction issue rate of 2 ++ * instructions per clock cycle. Experience shows that this factor ++ * of 2 is more than adequate. 
++ */ ++ ++# define MAX_EVENT_RATE (ppc_proc_freq * 2) ++ ++ /* ++ * Calculate the time, in jiffies, it takes for event counter 5 or ++ * 6 to completely wrap when counting at the max event rate, and ++ * then figure on sampling at twice that rate. ++ */ ++ update_time = (((unsigned long)HZ * OVERFLOW_VALUE) ++ / ((unsigned long)MAX_EVENT_RATE)) / 2; ++ ++ ret = pfm_pmu_register(&pfm_power6_pmu_conf); ++ return ret; ++} ++ ++static void __exit pfm_power6_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_power6_pmu_conf); ++} ++ ++module_init(pfm_power6_pmu_init_module); ++module_exit(pfm_power6_pmu_cleanup_module); +diff --git a/arch/powerpc/perfmon/perfmon_ppc32.c b/arch/powerpc/perfmon/perfmon_ppc32.c +new file mode 100644 +index 0000000..76f0b84 +--- /dev/null ++++ b/arch/powerpc/perfmon/perfmon_ppc32.c +@@ -0,0 +1,340 @@ ++/* ++ * This file contains the PPC32 PMU register description tables ++ * and pmc checker used by perfmon.c. ++ * ++ * Philip Mucci, mucci@cs.utk.edu ++ * ++ * Based on code from: ++ * Copyright (c) 2005 David Gibson, IBM Corporation. ++ * ++ * Based on perfmon_p6.c: ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++#include <asm/reg.h> ++ ++MODULE_AUTHOR("Philip Mucci <mucci@cs.utk.edu>"); ++MODULE_DESCRIPTION("PPC32 PMU description table"); ++MODULE_LICENSE("GPL"); ++ ++static struct pfm_pmu_config pfm_ppc32_pmu_conf; ++ ++static struct pfm_regmap_desc pfm_ppc32_pmc_desc[] = { ++/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", 0x0, 0, 0, SPRN_MMCR0), ++/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0x0, 0, 0, SPRN_MMCR1), ++/* mmcr2 */ PMC_D(PFM_REG_I, "MMCR2", 0x0, 0, 0, SPRN_MMCR2), ++}; ++#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_ppc32_pmc_desc) ++ ++static struct pfm_regmap_desc pfm_ppc32_pmd_desc[] = { ++/* pmd0 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), ++/* pmd1 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), ++/* pmd2 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), ++/* pmd3 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), ++/* pmd4 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), ++/* pmd5 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), ++}; ++#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_ppc32_pmd_desc) ++ ++static void perfmon_perf_irq(struct pt_regs *regs) ++{ ++ u32 mmcr0; ++ ++ /* BLATANTLY STOLEN FROM OPROFILE, then modified */ ++ ++ /* set the PMM bit (see comment below) */ ++ mtmsr(mfmsr() | MSR_PMM); ++ ++ pfm_interrupt_handler(instruction_pointer(regs), regs); ++ ++ /* The freeze bit was set by the interrupt. ++ * Clear the freeze bit, and reenable the interrupt. ++ * The counters won't actually start until the rfi clears ++ * the PMM bit. ++ */ ++ ++ /* Unfreezes the counters on this CPU, enables the interrupt, ++ * enables the counters to trigger the interrupt, and sets the ++ * counters to only count when the mark bit is not set. 
++ */ ++ mmcr0 = mfspr(SPRN_MMCR0); ++ ++ mmcr0 &= ~(MMCR0_FC | MMCR0_FCM0); ++ mmcr0 |= (MMCR0_FCECE | MMCR0_PMC1CE | MMCR0_PMCnCE | MMCR0_PMXE); ++ ++ mtspr(SPRN_MMCR0, mmcr0); ++} ++ ++static int pfm_ppc32_probe_pmu(void) ++{ ++ enum ppc32_pmu_type pm_type; ++ int nmmcr = 0, npmds = 0, intsok = 0, i; ++ unsigned int pvr; ++ char *str; ++ ++ pvr = mfspr(SPRN_PVR); ++ ++ switch (PVR_VER(pvr)) { ++ case 0x0004: /* 604 */ ++ str = "PPC604"; ++ pm_type = PFM_POWERPC_PMU_604; ++ nmmcr = 1; ++ npmds = 2; ++ break; ++ case 0x0009: /* 604e; */ ++ case 0x000A: /* 604ev */ ++ str = "PPC604e"; ++ pm_type = PFM_POWERPC_PMU_604e; ++ nmmcr = 2; ++ npmds = 4; ++ break; ++ case 0x0008: /* 750/740 */ ++ str = "PPC750"; ++ pm_type = PFM_POWERPC_PMU_750; ++ nmmcr = 2; ++ npmds = 4; ++ break; ++ case 0x7000: /* 750FX */ ++ case 0x7001: ++ str = "PPC750"; ++ pm_type = PFM_POWERPC_PMU_750; ++ nmmcr = 2; ++ npmds = 4; ++ if ((pvr & 0xFF0F) >= 0x0203) ++ intsok = 1; ++ break; ++ case 0x7002: /* 750GX */ ++ str = "PPC750"; ++ pm_type = PFM_POWERPC_PMU_750; ++ nmmcr = 2; npmds = 4; ++ intsok = 1; ++ break; /* 750GX must not fall into the 7400 setup */ ++ case 0x000C: /* 7400 */ ++ str = "PPC7400"; ++ pm_type = PFM_POWERPC_PMU_7400; ++ nmmcr = 3; ++ npmds = 4; ++ break; ++ case 0x800C: /* 7410 */ ++ str = "PPC7410"; ++ pm_type = PFM_POWERPC_PMU_7400; ++ nmmcr = 3; ++ npmds = 4; ++ if ((pvr & 0xFFFF) >= 0x01103) ++ intsok = 1; ++ break; ++ case 0x8000: /* 7451/7441 */ ++ case 0x8001: /* 7455/7445 */ ++ case 0x8002: /* 7457/7447 */ ++ case 0x8003: /* 7447A */ ++ case 0x8004: /* 7448 */ ++ str = "PPC7450"; ++ pm_type = PFM_POWERPC_PMU_7450; ++ nmmcr = 3; npmds = 6; ++ intsok = 1; ++ break; ++ default: ++ PFM_INFO("Unknown PVR_VER(0x%x)\n", PVR_VER(pvr)); ++ return -1; ++ } ++ ++ /* ++ * deconfigure unimplemented registers ++ */ ++ for (i = npmds; i < PFM_PM_NUM_PMDS; i++) ++ pfm_ppc32_pmd_desc[i].type = PFM_REG_NA; ++ ++ for (i = nmmcr; i < PFM_PM_NUM_PMCS; i++) ++ pfm_ppc32_pmc_desc[i].type = PFM_REG_NA; ++ ++ /* ++ * update PMU description
structure ++ */ ++ pfm_ppc32_pmu_conf.pmu_name = str; ++ pfm_ppc32_pmu_info.pmu_style = pm_type; ++ pfm_ppc32_pmu_conf.num_pmc_entries = nmmcr; ++ pfm_ppc32_pmu_conf.num_pmd_entries = npmds; ++ ++ if (intsok == 0) ++ PFM_INFO("Interrupts unlikely to work\n"); ++ ++ return reserve_pmc_hardware(perfmon_perf_irq); ++} ++ ++static void pfm_ppc32_write_pmc(unsigned int cnum, u64 value) ++{ ++ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { ++ case SPRN_MMCR0: ++ mtspr(SPRN_MMCR0, value); ++ break; ++ case SPRN_MMCR1: ++ mtspr(SPRN_MMCR1, value); ++ break; ++ case SPRN_MMCR2: ++ mtspr(SPRN_MMCR2, value); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static void pfm_ppc32_write_pmd(unsigned int cnum, u64 value) ++{ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC1: ++ mtspr(SPRN_PMC1, value); ++ break; ++ case SPRN_PMC2: ++ mtspr(SPRN_PMC2, value); ++ break; ++ case SPRN_PMC3: ++ mtspr(SPRN_PMC3, value); ++ break; ++ case SPRN_PMC4: ++ mtspr(SPRN_PMC4, value); ++ break; ++ case SPRN_PMC5: ++ mtspr(SPRN_PMC5, value); ++ break; ++ case SPRN_PMC6: ++ mtspr(SPRN_PMC6, value); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static u64 pfm_ppc32_read_pmd(unsigned int cnum) ++{ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case SPRN_PMC1: ++ return mfspr(SPRN_PMC1); ++ case SPRN_PMC2: ++ return mfspr(SPRN_PMC2); ++ case SPRN_PMC3: ++ return mfspr(SPRN_PMC3); ++ case SPRN_PMC4: ++ return mfspr(SPRN_PMC4); ++ case SPRN_PMC5: ++ return mfspr(SPRN_PMC5); ++ case SPRN_PMC6: ++ return mfspr(SPRN_PMC6); ++ default: ++ BUG(); ++ } ++} ++ ++/** ++ * pfm_ppc32_enable_counters ++ * ++ * Just need to load the current values into the control registers. 
++ **/ ++static void pfm_ppc32_enable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i, max_pmc; ++ ++ max_pmc = pfm_pmu_conf->regs.max_pmc; ++ ++ for (i = 0; i < max_pmc; i++) ++ if (test_bit(i, set->used_pmcs)) ++ pfm_ppc32_write_pmc(i, set->pmcs[i]); ++} ++ ++/** ++ * pfm_ppc32_disable_counters ++ * ++ * Just need to zero all the control registers. ++ **/ ++static void pfm_ppc32_disable_counters(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i, max; ++ ++ max = pfm_pmu_conf->regs.max_pmc; ++ ++ for (i = 0; i < max; i++) ++ if (test_bit(i, set->used_pmcs)) ++ pfm_ppc32_write_pmc(i, 0); /* first argument is the PMC index, not the context */ ++} ++ ++/** ++ * pfm_ppc32_get_ovfl_pmds ++ * ++ * Determine which counters in this set have overflowed and fill in the ++ * set->povfl_pmds mask and set->npend_ovfls count. ++ **/ ++static void pfm_ppc32_get_ovfl_pmds(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i; ++ unsigned int max_pmd = pfm_pmu_conf->regs.max_cnt_pmd; ++ u64 *used_pmds = set->used_pmds; ++ u64 *cntr_pmds = pfm_pmu_conf->regs.cnt_pmds; ++ u64 width_mask = 1ULL << pfm_pmu_conf->counter_width; /* 64-bit shift: bit 31 must not sign-extend */ ++ u64 new_val, mask[PFM_PMD_BV]; ++ ++ bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), ++ cast_ulp(used_pmds), max_pmd); ++ ++ for (i = 0; i < max_pmd; i++) { ++ if (test_bit(i, mask)) { ++ new_val = pfm_ppc32_read_pmd(i); ++ if (new_val & width_mask) { ++ set_bit(i, set->povfl_pmds); ++ set->npend_ovfls++; ++ } ++ } ++ } ++} ++ ++struct pfm_arch_pmu_info pfm_ppc32_pmu_info = { ++ .pmu_style = PFM_POWERPC_PMU_NONE, ++ .write_pmc = pfm_ppc32_write_pmc, ++ .write_pmd = pfm_ppc32_write_pmd, ++ .read_pmd = pfm_ppc32_read_pmd, ++ .get_ovfl_pmds = pfm_ppc32_get_ovfl_pmds, ++ .enable_counters = pfm_ppc32_enable_counters, ++ .disable_counters = pfm_ppc32_disable_counters, ++}; ++ ++static struct pfm_pmu_config pfm_ppc32_pmu_conf = { ++ .counter_width = 31, ++ .pmd_desc = pfm_ppc32_pmd_desc, ++ .pmc_desc = pfm_ppc32_pmc_desc, ++
.probe_pmu = pfm_ppc32_probe_pmu, ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .version = "0.1", ++ .arch_info = &pfm_ppc32_pmu_info, ++}; ++ ++static int __init pfm_ppc32_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_ppc32_pmu_conf); ++} ++ ++static void __exit pfm_ppc32_pmu_cleanup_module(void) ++{ ++ release_pmc_hardware(); ++ pfm_pmu_unregister(&pfm_ppc32_pmu_conf); ++} ++ ++module_init(pfm_ppc32_pmu_init_module); ++module_exit(pfm_ppc32_pmu_cleanup_module); +diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c +index dbc338f..e24320e 100644 +--- a/arch/powerpc/platforms/cell/cbe_regs.c ++++ b/arch/powerpc/platforms/cell/cbe_regs.c +@@ -33,6 +33,7 @@ static struct cbe_regs_map + struct cbe_iic_regs __iomem *iic_regs; + struct cbe_mic_tm_regs __iomem *mic_tm_regs; + struct cbe_pmd_shadow_regs pmd_shadow_regs; ++ struct cbe_ppe_priv_regs __iomem *ppe_priv_regs; + } cbe_regs_maps[MAX_CBE]; + static int cbe_regs_map_count; + +@@ -145,6 +146,23 @@ struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu) + } + EXPORT_SYMBOL_GPL(cbe_get_cpu_mic_tm_regs); + ++struct cbe_ppe_priv_regs __iomem *cbe_get_ppe_priv_regs(struct device_node *np) ++{ ++ struct cbe_regs_map *map = cbe_find_map(np); ++ if (map == NULL) ++ return NULL; ++ return map->ppe_priv_regs; ++} ++ ++struct cbe_ppe_priv_regs __iomem *cbe_get_cpu_ppe_priv_regs(int cpu) ++{ ++ struct cbe_regs_map *map = cbe_thread_map[cpu].regs; ++ if (map == NULL) ++ return NULL; ++ return map->ppe_priv_regs; ++} ++EXPORT_SYMBOL_GPL(cbe_get_cpu_ppe_priv_regs); ++ + u32 cbe_get_hw_thread_id(int cpu) + { + return cbe_thread_map[cpu].thread_id; +@@ -206,6 +224,11 @@ void __init cbe_fill_regs_map(struct cbe_regs_map *map) + for_each_node_by_type(np, "mic-tm") + if (of_get_parent(np) == be) + map->mic_tm_regs = of_iomap(np, 0); ++ ++ for_each_node_by_type(np, "ppe-mmio") ++ if (of_get_parent(np) == be) ++ map->ppe_priv_regs = of_iomap(np, 0); ++ + } else 
{ + struct device_node *cpu; + /* That hack must die die die ! */ +@@ -227,6 +250,10 @@ void __init cbe_fill_regs_map(struct cbe_regs_map *map) + prop = of_get_property(cpu, "mic-tm", NULL); + if (prop != NULL) + map->mic_tm_regs = ioremap(prop->address, prop->len); ++ ++ prop = of_get_property(cpu, "ppe-mmio", NULL); ++ if (prop != NULL) ++ map->ppe_priv_regs = ioremap(prop->address, prop->len); + } + } + +diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h +index 109ae24..bafe5a6 100644 +--- a/arch/sparc/include/asm/hypervisor.h ++++ b/arch/sparc/include/asm/hypervisor.h +@@ -2713,6 +2713,30 @@ extern unsigned long sun4v_ldc_revoke(unsigned long channel, + */ + #define HV_FAST_SET_PERFREG 0x101 + ++#define HV_N2_PERF_SPARC_CTL 0x0 ++#define HV_N2_PERF_DRAM_CTL0 0x1 ++#define HV_N2_PERF_DRAM_CNT0 0x2 ++#define HV_N2_PERF_DRAM_CTL1 0x3 ++#define HV_N2_PERF_DRAM_CNT1 0x4 ++#define HV_N2_PERF_DRAM_CTL2 0x5 ++#define HV_N2_PERF_DRAM_CNT2 0x6 ++#define HV_N2_PERF_DRAM_CTL3 0x7 ++#define HV_N2_PERF_DRAM_CNT3 0x8 ++ ++#define HV_FAST_N2_GET_PERFREG 0x104 ++#define HV_FAST_N2_SET_PERFREG 0x105 ++ ++#ifndef __ASSEMBLY__ ++extern unsigned long sun4v_niagara_getperf(unsigned long reg, ++ unsigned long *val); ++extern unsigned long sun4v_niagara_setperf(unsigned long reg, ++ unsigned long val); ++extern unsigned long sun4v_niagara2_getperf(unsigned long reg, ++ unsigned long *val); ++extern unsigned long sun4v_niagara2_setperf(unsigned long reg, ++ unsigned long val); ++#endif ++ + /* MMU statistics services. 
+ * + * The hypervisor maintains MMU statistics and privileged code provides +diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h +index e3dd930..6cf3aec 100644 +--- a/arch/sparc/include/asm/irq_64.h ++++ b/arch/sparc/include/asm/irq_64.h +@@ -67,6 +67,9 @@ extern void virt_irq_free(unsigned int virt_irq); + extern void __init init_IRQ(void); + extern void fixup_irqs(void); + ++extern int register_perfctr_intr(void (*handler)(struct pt_regs *)); ++extern void release_perfctr_intr(void (*handler)(struct pt_regs *)); ++ + static inline void set_softint(unsigned long bits) + { + __asm__ __volatile__("wr %0, 0x0, %%set_softint" +diff --git a/arch/sparc/include/asm/perfmon.h b/arch/sparc/include/asm/perfmon.h +new file mode 100644 +index 0000000..f20cbfa +--- /dev/null ++++ b/arch/sparc/include/asm/perfmon.h +@@ -0,0 +1,11 @@ ++#ifndef _SPARC64_PERFMON_H_ ++#define _SPARC64_PERFMON_H_ ++ ++/* ++ * arch-specific user visible interface definitions ++ */ ++ ++#define PFM_ARCH_MAX_PMCS 2 ++#define PFM_ARCH_MAX_PMDS 3 ++ ++#endif /* _SPARC64_PERFMON_H_ */ +diff --git a/arch/sparc/include/asm/perfmon_kern.h b/arch/sparc/include/asm/perfmon_kern.h +new file mode 100644 +index 0000000..033eff5 +--- /dev/null ++++ b/arch/sparc/include/asm/perfmon_kern.h +@@ -0,0 +1,286 @@ ++#ifndef _SPARC64_PERFMON_KERN_H_ ++#define _SPARC64_PERFMON_KERN_H_ ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_PERFMON ++ ++#include <linux/irq.h> ++#include <asm/system.h> ++ ++#define PFM_ARCH_PMD_STK_ARG 2 ++#define PFM_ARCH_PMC_STK_ARG 1 ++ ++struct pfm_arch_pmu_info { ++ u32 pmu_style; ++}; ++ ++static inline void pfm_arch_resend_irq(struct pfm_context *ctx) ++{ ++} ++ ++static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{} ++ ++static inline void pfm_arch_serialize(void) ++{ ++} ++ ++/* ++ * SPARC does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus ++ * this routine needs to do it when switching sets on overflow 
++ */ ++static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_save_pmds(ctx, set); ++} ++ ++extern void pfm_arch_write_pmc(struct pfm_context *ctx, ++ unsigned int cnum, u64 value); ++extern u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum); ++ ++static inline void pfm_arch_write_pmd(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ u64 pic; ++ ++ value &= pfm_pmu_conf->ovfl_mask; ++ ++ read_pic(pic); ++ ++ switch (cnum) { ++ case 0: ++ pic = (pic & 0xffffffff00000000UL) | ++ (value & 0xffffffffUL); ++ break; ++ case 1: ++ pic = (pic & 0xffffffffUL) | ++ (value << 32UL); ++ break; ++ default: ++ BUG(); ++ } ++ ++ write_pic(pic); ++} ++ ++static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, ++ unsigned int cnum) ++{ ++ u64 pic; ++ ++ read_pic(pic); ++ ++ switch (cnum) { ++ case 0: ++ return pic & 0xffffffffUL; ++ case 1: ++ return pic >> 32UL; ++ default: ++ BUG(); ++ return 0; ++ } ++} ++ ++/* ++ * For some CPUs, the upper bits of a counter must be set in order for the ++ * overflow interrupt to happen. On overflow, the counter has wrapped around, ++ * and the upper bits are cleared. This function may be used to set them back. ++ */ ++static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, ++ unsigned int cnum) ++{ ++ u64 val = pfm_arch_read_pmd(ctx, cnum); ++ ++ /* This masks out overflow bit 31 */ ++ pfm_arch_write_pmd(ctx, cnum, val); ++} ++ ++/* ++ * At certain points, perfmon needs to know if monitoring has been ++ * explicitely started/stopped by user via pfm_start/pfm_stop. The ++ * information is tracked in ctx.flags.started. However on certain ++ * architectures, it may be possible to start/stop directly from ++ * user level with a single assembly instruction bypassing ++ * the kernel. This function must be used to determine by ++ * an arch-specific mean if monitoring is actually started/stopped. 
++ */ ++static inline int pfm_arch_is_active(struct pfm_context *ctx) ++{ ++ return ctx->flags.started; ++} ++ ++static inline void pfm_arch_ctxswout_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{ ++} ++ ++static inline void pfm_arch_ctxswin_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{ ++} ++ ++static inline void pfm_arch_ctxswin_thread(struct task_struct *task, ++ struct pfm_context *ctx) ++{ ++} ++ ++int pfm_arch_is_monitoring_active(struct pfm_context *ctx); ++int pfm_arch_ctxswout_thread(struct task_struct *task, ++ struct pfm_context *ctx); ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); ++char *pfm_arch_get_pmu_module_name(void); ++ ++static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_arch_stop(current, ctx); ++ /* ++ * we mark monitoring as stopped to avoid ++ * certain side effects especially in ++ * pfm_switch_sets_from_intr() on ++ * pfm_arch_restore_pmcs() ++ */ ++ ctx->flags.started = 0; ++} ++ ++/* ++ * unfreeze PMU from pfm_do_interrupt_handler() ++ * ctx may be NULL for spurious ++ */ ++static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) ++{ ++ if (!ctx) ++ return; ++ ++ PFM_DBG_ovfl("state=%d", ctx->state); ++ ++ ctx->flags.started = 1; ++ ++ if (ctx->state == PFM_CTX_MASKED) ++ return; ++ ++ pfm_arch_restore_pmcs(ctx, ctx->active_set); ++} ++ ++/* ++ * this function is called from the PMU interrupt handler ONLY. ++ * On SPARC, the PMU is frozen via arch_stop, masking would be implemented ++ * via arch-stop as well. Given that the PMU is already stopped when ++ * entering the interrupt handler, we do not need to stop it again, so ++ * this function is a nop. 
++ */ ++static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++} ++ ++/* ++ * on SPARC masking/unmasking uses the start/stop mechanism, so we simply ++ * need to start here. ++ */ ++static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_arch_start(current, ctx); ++} ++ ++static inline void pfm_arch_pmu_config_remove(void) ++{ ++} ++ ++static inline int pfm_arch_context_create(struct pfm_context *ctx, ++ u32 ctx_flags) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_context_free(struct pfm_context *ctx) ++{ ++} ++ ++/* ++ * function called from pfm_setfl_sane(). Context is locked ++ * and interrupts are masked. ++ * The value of flags is the value of ctx_flags as passed by ++ * user. ++ * ++ * function must check arch-specific set flags. ++ * Return: ++ * 1 when flags are valid ++ * 0 on error ++ */ ++static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) ++{ ++ return 0; ++} ++ ++static inline int pfm_arch_init(void) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_init_percpu(void) ++{ ++} ++ ++static inline int pfm_arch_load_context(struct pfm_context *ctx) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_unload_context(struct pfm_context *ctx) ++{} ++ ++extern void perfmon_interrupt(struct pt_regs *); ++ ++static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) ++{ ++ return register_perfctr_intr(perfmon_interrupt); ++} ++ ++static inline void pfm_arch_pmu_release(void) ++{ ++ release_perfctr_intr(perfmon_interrupt); ++} ++ ++static inline void pfm_arch_arm_handle_work(struct task_struct *task) ++{} ++ ++static inline void pfm_arch_disarm_handle_work(struct task_struct *task) ++{} ++ ++static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) ++{ ++ return 0; ++} ++ ++static inline int pfm_arch_get_base_syscall(void) ++{ ++ return __NR_pfm_create_context; ++} ++ ++struct pfm_arch_context {
++ /* empty */ ++}; ++ ++#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) ++/* ++ * SPARC needs extra alignment for the sampling buffer ++ */ ++#define PFM_ARCH_SMPL_ALIGN_SIZE (16 * 1024) ++ ++static inline void pfm_cacheflush(void *addr, unsigned int len) ++{ ++} ++ ++#endif /* CONFIG_PERFMON */ ++ ++#endif /* __KERNEL__ */ ++ ++#endif /* _SPARC64_PERFMON_KERN_H_ */ +diff --git a/arch/sparc/include/asm/system_64.h b/arch/sparc/include/asm/system_64.h +index db9e742..2a9ddb9 100644 +--- a/arch/sparc/include/asm/system_64.h ++++ b/arch/sparc/include/asm/system_64.h +@@ -30,6 +30,9 @@ enum sparc_cpu { + #define ARCH_SUN4C_SUN4 0 + #define ARCH_SUN4 0 + ++extern char *sparc_cpu_type; ++extern char *sparc_fpu_type; ++extern char *sparc_pmu_type; + extern char reboot_command[]; + + /* These are here in an effort to more fully work around Spitfire Errata +@@ -104,15 +107,13 @@ do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ + #define write_pcr(__p) __asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (__p)) + #define read_pic(__p) __asm__ __volatile__("rd %%pic, %0" : "=r" (__p)) + +-/* Blackbird errata workaround. See commentary in +- * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt() +- * for more information. +- */ +-#define reset_pic() \ +- __asm__ __volatile__("ba,pt %xcc, 99f\n\t" \ ++/* Blackbird errata workaround. */ ++#define write_pic(val) \ ++ __asm__ __volatile__("ba,pt %%xcc, 99f\n\t" \ + ".align 64\n" \ +- "99:wr %g0, 0x0, %pic\n\t" \ +- "rd %pic, %g0") ++ "99:wr %0, 0x0, %%pic\n\t" \ ++ "rd %%pic, %%g0" : : "r" (val)) ++#define reset_pic() write_pic(0) + + #ifndef __ASSEMBLY__ + +@@ -145,14 +146,10 @@ do { \ + * and 2 stores in this critical code path. 
-DaveM + */ + #define switch_to(prev, next, last) \ +-do { if (test_thread_flag(TIF_PERFCTR)) { \ +- unsigned long __tmp; \ +- read_pcr(__tmp); \ +- current_thread_info()->pcr_reg = __tmp; \ +- read_pic(__tmp); \ +- current_thread_info()->kernel_cntd0 += (unsigned int)(__tmp);\ +- current_thread_info()->kernel_cntd1 += ((__tmp) >> 32); \ +- } \ ++do { if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \ ++ pfm_ctxsw_out(prev, next); \ ++ if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ ++ pfm_ctxsw_in(prev, next); \ + flush_tlb_pending(); \ + save_and_clear_fpu(); \ + /* If you are tempted to conditionalize the following */ \ +@@ -197,11 +194,6 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \ + "l1", "l2", "l3", "l4", "l5", "l6", "l7", \ + "i0", "i1", "i2", "i3", "i4", "i5", \ + "o0", "o1", "o2", "o3", "o4", "o5", "o7"); \ +- /* If you fuck with this, update ret_from_syscall code too. */ \ +- if (test_thread_flag(TIF_PERFCTR)) { \ +- write_pcr(current_thread_info()->pcr_reg); \ +- reset_pic(); \ +- } \ + } while(0) + + static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val) +diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h +index c0a737d..53857f7 100644 +--- a/arch/sparc/include/asm/thread_info_64.h ++++ b/arch/sparc/include/asm/thread_info_64.h +@@ -58,11 +58,6 @@ struct thread_info { + unsigned long gsr[7]; + unsigned long xfsr[7]; + +- __u64 __user *user_cntd0; +- __u64 __user *user_cntd1; +- __u64 kernel_cntd0, kernel_cntd1; +- __u64 pcr_reg; +- + struct restart_block restart_block; + + struct pt_regs *kern_una_regs; +@@ -96,15 +91,10 @@ struct thread_info { + #define TI_RWIN_SPTRS 0x000003c8 + #define TI_GSR 0x00000400 + #define TI_XFSR 0x00000438 +-#define TI_USER_CNTD0 0x00000470 +-#define TI_USER_CNTD1 0x00000478 +-#define TI_KERN_CNTD0 0x00000480 +-#define TI_KERN_CNTD1 0x00000488 +-#define TI_PCR 0x00000490 +-#define TI_RESTART_BLOCK 0x00000498 +-#define TI_KUNA_REGS 0x000004c0 
+-#define TI_KUNA_INSN 0x000004c8 +-#define TI_FPREGS 0x00000500 ++#define TI_RESTART_BLOCK 0x00000470 ++#define TI_KUNA_REGS 0x00000498 ++#define TI_KUNA_INSN 0x000004a0 ++#define TI_FPREGS 0x000004c0 + + /* We embed this in the uppermost byte of thread_info->flags */ + #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */ +@@ -222,11 +212,11 @@ register struct thread_info *current_thread_info_reg asm("g6"); + #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ + #define TIF_SIGPENDING 2 /* signal pending */ + #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ +-#define TIF_PERFCTR 4 /* performance counters active */ ++/* Bit 4 is available */ + #define TIF_UNALIGNED 5 /* allowed to do unaligned accesses */ + /* flag bit 6 is available */ + #define TIF_32BIT 7 /* 32-bit binary */ +-/* flag bit 8 is available */ ++#define TIF_PERFMON_WORK 8 /* work for pfm_handle_work() */ + #define TIF_SECCOMP 9 /* secure computing */ + #define TIF_SYSCALL_AUDIT 10 /* syscall auditing active */ + /* flag bit 11 is available */ +@@ -237,22 +227,24 @@ register struct thread_info *current_thread_info_reg asm("g6"); + #define TIF_ABI_PENDING 12 + #define TIF_MEMDIE 13 + #define TIF_POLLING_NRFLAG 14 ++#define TIF_PERFMON_CTXSW 15 /* perfmon needs ctxsw calls */ + + #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) + #define _TIF_NOTIFY_RESUME (1<<TIF_NOTIFY_RESUME) + #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) + #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) +-#define _TIF_PERFCTR (1<<TIF_PERFCTR) + #define _TIF_UNALIGNED (1<<TIF_UNALIGNED) + #define _TIF_32BIT (1<<TIF_32BIT) ++#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK) + #define _TIF_SECCOMP (1<<TIF_SECCOMP) + #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT) + #define _TIF_ABI_PENDING (1<<TIF_ABI_PENDING) + #define _TIF_POLLING_NRFLAG (1<<TIF_POLLING_NRFLAG) ++#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW) + + #define _TIF_USER_WORK_MASK ((0xff << TI_FLAG_WSAVED_SHIFT) | \ + 
_TIF_DO_NOTIFY_RESUME_MASK | \ +- _TIF_NEED_RESCHED | _TIF_PERFCTR) ++ _TIF_NEED_RESCHED) + #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING) + + /* +diff --git a/arch/sparc/include/asm/unistd_32.h b/arch/sparc/include/asm/unistd_32.h +index 648643a..efe4d86 100644 +--- a/arch/sparc/include/asm/unistd_32.h ++++ b/arch/sparc/include/asm/unistd_32.h +@@ -338,8 +338,20 @@ + #define __NR_dup3 320 + #define __NR_pipe2 321 + #define __NR_inotify_init1 322 ++#define __NR_pfm_create_context 323 ++#define __NR_pfm_write_pmcs 324 ++#define __NR_pfm_write_pmds 325 ++#define __NR_pfm_read_pmds 326 ++#define __NR_pfm_load_context 327 ++#define __NR_pfm_start 328 ++#define __NR_pfm_stop 329 ++#define __NR_pfm_restart 330 ++#define __NR_pfm_create_evtsets 331 ++#define __NR_pfm_getinfo_evtsets 332 ++#define __NR_pfm_delete_evtsets 333 ++#define __NR_pfm_unload_context 334 + +-#define NR_SYSCALLS 323 ++#define NR_SYSCALLS 335 + + /* Sparc 32-bit only has the "setresuid32", "getresuid32" variants, + * it never had the plain ones and there is no value to adding those +diff --git a/arch/sparc/include/asm/unistd_64.h b/arch/sparc/include/asm/unistd_64.h +index c5cc0e0..cbbb0b5 100644 +--- a/arch/sparc/include/asm/unistd_64.h ++++ b/arch/sparc/include/asm/unistd_64.h +@@ -340,8 +340,20 @@ + #define __NR_dup3 320 + #define __NR_pipe2 321 + #define __NR_inotify_init1 322 ++#define __NR_pfm_create_context 323 ++#define __NR_pfm_write_pmcs 324 ++#define __NR_pfm_write_pmds 325 ++#define __NR_pfm_read_pmds 326 ++#define __NR_pfm_load_context 327 ++#define __NR_pfm_start 328 ++#define __NR_pfm_stop 329 ++#define __NR_pfm_restart 330 ++#define __NR_pfm_create_evtsets 331 ++#define __NR_pfm_getinfo_evtsets 332 ++#define __NR_pfm_delete_evtsets 333 ++#define __NR_pfm_unload_context 334 + +-#define NR_SYSCALLS 323 ++#define NR_SYSCALLS 335 + + #ifdef __KERNEL__ + #define __ARCH_WANT_IPC_PARSE_VERSION +diff --git a/arch/sparc/kernel/systbls.S b/arch/sparc/kernel/systbls.S
+index e1b9233..727e4e7 100644 +--- a/arch/sparc/kernel/systbls.S ++++ b/arch/sparc/kernel/systbls.S +@@ -81,4 +81,6 @@ sys_call_table: + /*305*/ .long sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait + /*310*/ .long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate + /*315*/ .long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 +-/*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1 ++/*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_pfm_create_context, sys_pfm_write_pmcs ++/*325*/ .long sys_pfm_write_pmds, sys_pfm_read_pmds, sys_pfm_load_context, sys_pfm_start, sys_pfm_stop ++/*330*/ .long sys_pfm_restart, sys_pfm_create_evtsets, sys_pfm_getinfo_evtsets, sys_pfm_delete_evtsets, sys_pfm_unload_context +diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig +index 36b4b7a..5555d1e 100644 +--- a/arch/sparc64/Kconfig ++++ b/arch/sparc64/Kconfig +@@ -401,6 +401,8 @@ source "drivers/sbus/char/Kconfig" + + source "fs/Kconfig" + ++source "arch/sparc64/perfmon/Kconfig" ++ + source "arch/sparc64/Kconfig.debug" + + source "security/Kconfig" +diff --git a/arch/sparc64/Makefile b/arch/sparc64/Makefile +index b785a39..646731c 100644 +--- a/arch/sparc64/Makefile ++++ b/arch/sparc64/Makefile +@@ -32,6 +32,8 @@ core-y += arch/sparc64/math-emu/ + libs-y += arch/sparc64/prom/ arch/sparc64/lib/ + drivers-$(CONFIG_OPROFILE) += arch/sparc64/oprofile/ + ++core-$(CONFIG_PERFMON) += arch/sparc64/perfmon/ ++ + boot := arch/sparc64/boot + + image tftpboot.img vmlinux.aout: vmlinux +diff --git a/arch/sparc64/kernel/cpu.c b/arch/sparc64/kernel/cpu.c +index 0097c08..f839f84 100644 +--- a/arch/sparc64/kernel/cpu.c ++++ b/arch/sparc64/kernel/cpu.c +@@ -20,16 +20,17 @@ + DEFINE_PER_CPU(cpuinfo_sparc, __cpu_data) = { 0 }; + + struct cpu_iu_info { +- short manuf; +- short impl; +- char* cpu_name; /* should be enough I hope... 
*/ ++ short manuf; ++ short impl; ++ char *cpu_name; ++ char *pmu_name; + }; + + struct cpu_fp_info { +- short manuf; +- short impl; +- char fpu_vers; +- char* fp_name; ++ short manuf; ++ short impl; ++ char fpu_vers; ++ char* fp_name; + }; + + static struct cpu_fp_info linux_sparc_fpu[] = { +@@ -49,23 +50,24 @@ static struct cpu_fp_info linux_sparc_fpu[] = { + #define NSPARCFPU ARRAY_SIZE(linux_sparc_fpu) + + static struct cpu_iu_info linux_sparc_chips[] = { +- { 0x17, 0x10, "TI UltraSparc I (SpitFire)"}, +- { 0x22, 0x10, "TI UltraSparc I (SpitFire)"}, +- { 0x17, 0x11, "TI UltraSparc II (BlackBird)"}, +- { 0x17, 0x12, "TI UltraSparc IIi (Sabre)"}, +- { 0x17, 0x13, "TI UltraSparc IIe (Hummingbird)"}, +- { 0x3e, 0x14, "TI UltraSparc III (Cheetah)"}, +- { 0x3e, 0x15, "TI UltraSparc III+ (Cheetah+)"}, +- { 0x3e, 0x16, "TI UltraSparc IIIi (Jalapeno)"}, +- { 0x3e, 0x18, "TI UltraSparc IV (Jaguar)"}, +- { 0x3e, 0x19, "TI UltraSparc IV+ (Panther)"}, +- { 0x3e, 0x22, "TI UltraSparc IIIi+ (Serrano)"}, +-}; ++ { 0x17, 0x10, "TI UltraSparc I (SpitFire)", "ultra12"}, ++ { 0x22, 0x10, "TI UltraSparc I (SpitFire)", "ultra12"}, ++ { 0x17, 0x11, "TI UltraSparc II (BlackBird)", "ultra12"}, ++ { 0x17, 0x12, "TI UltraSparc IIi (Sabre)", "ultra12"}, ++ { 0x17, 0x13, "TI UltraSparc IIe (Hummingbird)", "ultra12"}, ++ { 0x3e, 0x14, "TI UltraSparc III (Cheetah)", "ultra3"}, ++ { 0x3e, 0x15, "TI UltraSparc III+ (Cheetah+)", "ultra3+"}, ++ { 0x3e, 0x16, "TI UltraSparc IIIi (Jalapeno)", "ultra3i"}, ++ { 0x3e, 0x18, "TI UltraSparc IV (Jaguar)", "ultra4"}, ++ { 0x3e, 0x19, "TI UltraSparc IV+ (Panther)", "ultra4+"}, ++ { 0x3e, 0x22, "TI UltraSparc IIIi+ (Serrano)", "ultra3+"}, ++ }; + + #define NSPARCCHIPS ARRAY_SIZE(linux_sparc_chips) + + char *sparc_cpu_type; + char *sparc_fpu_type; ++char *sparc_pmu_type; + + static void __init sun4v_cpu_probe(void) + { +@@ -73,11 +75,13 @@ static void __init sun4v_cpu_probe(void) + case SUN4V_CHIP_NIAGARA1: + sparc_cpu_type = "UltraSparc T1 (Niagara)"; + 
sparc_fpu_type = "UltraSparc T1 integrated FPU"; ++ sparc_pmu_type = "niagara"; + break; + + case SUN4V_CHIP_NIAGARA2: + sparc_cpu_type = "UltraSparc T2 (Niagara2)"; + sparc_fpu_type = "UltraSparc T2 integrated FPU"; ++ sparc_pmu_type = "niagara2"; + break; + + default: +@@ -85,6 +89,7 @@ static void __init sun4v_cpu_probe(void) + prom_cpu_compatible); + sparc_cpu_type = "Unknown SUN4V CPU"; + sparc_fpu_type = "Unknown SUN4V FPU"; ++ sparc_pmu_type = "Unknown SUN4V PMU"; + break; + } + } +@@ -117,6 +122,8 @@ retry: + if (linux_sparc_chips[i].impl == impl) { + sparc_cpu_type = + linux_sparc_chips[i].cpu_name; ++ sparc_pmu_type = ++ linux_sparc_chips[i].pmu_name; + break; + } + } +@@ -134,7 +141,8 @@ retry: + printk("DEBUG: manuf[%lx] impl[%lx]\n", + manuf, impl); + } + sparc_cpu_type = "Unknown CPU"; ++ sparc_pmu_type = "Unknown PMU"; + } + + for (i = 0; i < NSPARCFPU; i++) { +diff --git a/arch/sparc64/kernel/hvcalls.S b/arch/sparc64/kernel/hvcalls.S +index a2810f3..b9f508c 100644 +--- a/arch/sparc64/kernel/hvcalls.S ++++ b/arch/sparc64/kernel/hvcalls.S +@@ -884,3 +884,44 @@ sun4v_mmu_demap_all: + retl + nop + .size sun4v_mmu_demap_all, .-sun4v_mmu_demap_all ++ ++ .globl sun4v_niagara_getperf ++ .type sun4v_niagara_getperf,#function ++sun4v_niagara_getperf: ++ mov %o0, %o4 ++ mov HV_FAST_GET_PERFREG, %o5 ++ ta HV_FAST_TRAP ++ stx %o1, [%o4] ++ retl ++ nop ++ .size sun4v_niagara_getperf, .-sun4v_niagara_getperf ++ ++ .globl sun4v_niagara_setperf ++ .type sun4v_niagara_setperf,#function ++sun4v_niagara_setperf: ++ mov HV_FAST_SET_PERFREG, %o5 ++ ta HV_FAST_TRAP ++ retl ++ nop ++ .size sun4v_niagara_setperf, .-sun4v_niagara_setperf ++ ++ .globl sun4v_niagara2_getperf ++ .type sun4v_niagara2_getperf,#function ++sun4v_niagara2_getperf: ++ mov %o0, %o4 ++ mov HV_FAST_N2_GET_PERFREG, %o5 ++ ta HV_FAST_TRAP ++ stx %o1, [%o4] ++ retl ++ nop ++ .size sun4v_niagara2_getperf, .-sun4v_niagara2_getperf ++ ++ .globl sun4v_niagara2_setperf ++ .type sun4v_niagara2_setperf,#function 
++sun4v_niagara2_setperf: ++ mov HV_FAST_N2_SET_PERFREG, %o5 ++ ta HV_FAST_TRAP ++ retl ++ nop ++ .size sun4v_niagara2_setperf, .-sun4v_niagara2_setperf ++ +diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c +index 7495bc7..e2bcca5 100644 +--- a/arch/sparc64/kernel/irq.c ++++ b/arch/sparc64/kernel/irq.c +@@ -749,6 +749,20 @@ void handler_irq(int irq, struct pt_regs *regs) + irq_exit(); + set_irq_regs(old_regs); + } ++static void unhandled_perf_irq(struct pt_regs *regs) ++{ ++ unsigned long pcr, pic; ++ ++ read_pcr(pcr); ++ read_pic(pic); ++ ++ write_pcr(0); ++ ++ printk(KERN_EMERG "CPU %d: Got unexpected perf counter IRQ.\n", ++ smp_processor_id()); ++ printk(KERN_EMERG "CPU %d: PCR[%016lx] PIC[%016lx]\n", ++ smp_processor_id(), pcr, pic); ++} + + void do_softirq(void) + { +@@ -776,6 +790,55 @@ void do_softirq(void) + local_irq_restore(flags); + } + ++/* Almost a direct copy of the powerpc PMC code. */ ++static DEFINE_SPINLOCK(perf_irq_lock); ++static void *perf_irq_owner_caller; /* mostly for debugging */ ++static void (*perf_irq)(struct pt_regs *regs) = unhandled_perf_irq; ++ ++/* Invoked from level 15 PIL handler in trap table. 
*/ ++void perfctr_irq(int irq, struct pt_regs *regs) ++{ ++ clear_softint(1 << irq); ++ perf_irq(regs); ++} ++ ++int register_perfctr_intr(void (*handler)(struct pt_regs *)) ++{ ++ int ret; ++ ++ if (!handler) ++ return -EINVAL; ++ ++ spin_lock(&perf_irq_lock); ++ if (perf_irq != unhandled_perf_irq) { ++ printk(KERN_WARNING "register_perfctr_intr: " ++ "perf IRQ busy (reserved by caller %p)\n", ++ perf_irq_owner_caller); ++ ret = -EBUSY; ++ goto out; ++ } ++ ++ perf_irq_owner_caller = __builtin_return_address(0); ++ perf_irq = handler; ++ ++ ret = 0; ++out: ++ spin_unlock(&perf_irq_lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(register_perfctr_intr); ++ ++void release_perfctr_intr(void (*handler)(struct pt_regs *)) ++{ ++ spin_lock(&perf_irq_lock); ++ perf_irq_owner_caller = NULL; ++ perf_irq = unhandled_perf_irq; ++ spin_unlock(&perf_irq_lock); ++} ++EXPORT_SYMBOL_GPL(release_perfctr_intr); ++ ++ + #ifdef CONFIG_HOTPLUG_CPU + void fixup_irqs(void) + { +diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c +index 15f4178..7282d21 100644 +--- a/arch/sparc64/kernel/process.c ++++ b/arch/sparc64/kernel/process.c +@@ -30,6 +30,7 @@ + #include <linux/cpu.h> + #include <linux/elfcore.h> + #include <linux/sysrq.h> ++#include <linux/perfmon_kern.h> + + #include <asm/oplib.h> + #include <asm/uaccess.h> +@@ -385,11 +386,7 @@ void exit_thread(void) + t->utraps[0]--; + } + +- if (test_and_clear_thread_flag(TIF_PERFCTR)) { +- t->user_cntd0 = t->user_cntd1 = NULL; +- t->pcr_reg = 0; +- write_pcr(0); +- } ++ pfm_exit_thread(); + } + + void flush_thread(void) +@@ -411,13 +408,6 @@ void flush_thread(void) + + set_thread_wsaved(0); + +- /* Turn off performance counters if on. */ +- if (test_and_clear_thread_flag(TIF_PERFCTR)) { +- t->user_cntd0 = t->user_cntd1 = NULL; +- t->pcr_reg = 0; +- write_pcr(0); +- } +- + /* Clear FPU register state. 
*/ + t->fpsaved[0] = 0; + +@@ -631,16 +621,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, + t->kregs->u_regs[UREG_FP] = + ((unsigned long) child_sf) - STACK_BIAS; + +- /* Special case, if we are spawning a kernel thread from +- * a userspace task (usermode helper, NFS or similar), we +- * must disable performance counters in the child because +- * the address space and protection realm are changing. +- */ +- if (t->flags & _TIF_PERFCTR) { +- t->user_cntd0 = t->user_cntd1 = NULL; +- t->pcr_reg = 0; +- t->flags &= ~_TIF_PERFCTR; +- } + t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT); + t->kregs->u_regs[UREG_G6] = (unsigned long) t; + t->kregs->u_regs[UREG_G4] = (unsigned long) t->task; +@@ -673,6 +653,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, + if (clone_flags & CLONE_SETTLS) + t->kregs->u_regs[UREG_G7] = regs->u_regs[UREG_I3]; + ++ pfm_copy_thread(p); ++ + return 0; + } + +diff --git a/arch/sparc64/kernel/rtrap.S b/arch/sparc64/kernel/rtrap.S +index 97a993c..c2af29d 100644 +--- a/arch/sparc64/kernel/rtrap.S ++++ b/arch/sparc64/kernel/rtrap.S +@@ -65,55 +65,14 @@ __handle_user_windows: + ba,pt %xcc, __handle_user_windows_continue + + andn %l1, %l4, %l1 +-__handle_perfctrs: +- call update_perfctrs +- wrpr %g0, RTRAP_PSTATE, %pstate +- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate +- ldub [%g6 + TI_WSAVED], %o2 +- brz,pt %o2, 1f +- nop +- /* Redo userwin+sched+sig checks */ +- call fault_in_user_windows +- +- wrpr %g0, RTRAP_PSTATE, %pstate +- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate +- ldx [%g6 + TI_FLAGS], %l0 +- andcc %l0, _TIF_NEED_RESCHED, %g0 +- be,pt %xcc, 1f +- +- nop +- call schedule +- wrpr %g0, RTRAP_PSTATE, %pstate +- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate +- ldx [%g6 + TI_FLAGS], %l0 +-1: andcc %l0, _TIF_DO_NOTIFY_RESUME_MASK, %g0 +- +- be,pt %xcc, __handle_perfctrs_continue +- sethi %hi(TSTATE_PEF), %o0 +- mov %l5, %o1 +- add %sp, PTREGS_OFF, %o0 +- mov %l0, %o2 +- call do_notify_resume +- +- wrpr 
%g0, RTRAP_PSTATE, %pstate +- wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate +- /* Signal delivery can modify pt_regs tstate, so we must +- * reload it. +- */ +- ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1 +- sethi %hi(0xf << 20), %l4 +- and %l1, %l4, %l4 +- andn %l1, %l4, %l1 +- ba,pt %xcc, __handle_perfctrs_continue +- +- sethi %hi(TSTATE_PEF), %o0 + __handle_userfpu: + rd %fprs, %l5 + andcc %l5, FPRS_FEF, %g0 + sethi %hi(TSTATE_PEF), %o0 + be,a,pn %icc, __handle_userfpu_continue + andn %l1, %o0, %l1 +- ba,a,pt %xcc, __handle_userfpu_continue ++ ba,pt %xcc, __handle_userfpu_continue ++ nop + + __handle_signal: + mov %l5, %o1 +@@ -202,12 +161,8 @@ __handle_signal_continue: + brnz,pn %o2, __handle_user_windows + nop + __handle_user_windows_continue: +- ldx [%g6 + TI_FLAGS], %l5 +- andcc %l5, _TIF_PERFCTR, %g0 + sethi %hi(TSTATE_PEF), %o0 +- bne,pn %xcc, __handle_perfctrs +-__handle_perfctrs_continue: +- andcc %l1, %o0, %g0 ++ andcc %l1, %o0, %g0 + + /* This fpdepth clear is necessary for non-syscall rtraps only */ + user_nowork: +diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c +index c8b03a4..248aa1f 100644 +--- a/arch/sparc64/kernel/setup.c ++++ b/arch/sparc64/kernel/setup.c +@@ -352,6 +352,7 @@ static int show_cpuinfo(struct seq_file *m, void *__unused) + seq_printf(m, + "cpu\t\t: %s\n" + "fpu\t\t: %s\n" ++ "pmu\t\t: %s\n" + "prom\t\t: %s\n" + "type\t\t: %s\n" + "ncpus probed\t: %d\n" +@@ -364,6 +365,7 @@ static int show_cpuinfo(struct seq_file *m, void *__unused) + , + sparc_cpu_type, + sparc_fpu_type, ++ sparc_pmu_type, + prom_version, + ((tlb_type == hypervisor) ? 
+ "sun4v" : +diff --git a/arch/sparc64/kernel/signal.c b/arch/sparc64/kernel/signal.c +index ec82d76..cea1082 100644 +--- a/arch/sparc64/kernel/signal.c ++++ b/arch/sparc64/kernel/signal.c +@@ -23,6 +23,7 @@ + #include <linux/tty.h> + #include <linux/binfmts.h> + #include <linux/bitops.h> ++#include <linux/perfmon_kern.h> + + #include <asm/uaccess.h> + #include <asm/ptrace.h> +@@ -608,6 +609,9 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) + + void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags) + { ++ if (thread_info_flags & _TIF_PERFMON_WORK) ++ pfm_handle_work(regs); ++ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs, orig_i0); + if (thread_info_flags & _TIF_NOTIFY_RESUME) { +diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c +index 39749e3..384004b 100644 +--- a/arch/sparc64/kernel/sys_sparc.c ++++ b/arch/sparc64/kernel/sys_sparc.c +@@ -26,7 +26,6 @@ + + #include <asm/uaccess.h> + #include <asm/utrap.h> +-#include <asm/perfctr.h> + #include <asm/unistd.h> + + #include "entry.h" +@@ -791,106 +790,10 @@ asmlinkage long sys_rt_sigaction(int sig, + return ret; + } + +-/* Invoked by rtrap code to update performance counters in +- * user space. 
+- */ +-asmlinkage void update_perfctrs(void) +-{ +- unsigned long pic, tmp; +- +- read_pic(pic); +- tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic); +- __put_user(tmp, current_thread_info()->user_cntd0); +- tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32)); +- __put_user(tmp, current_thread_info()->user_cntd1); +- reset_pic(); +-} +- + asmlinkage long sys_perfctr(int opcode, unsigned long arg0, unsigned long arg1, unsigned long arg2) + { +- int err = 0; +- +- switch(opcode) { +- case PERFCTR_ON: +- current_thread_info()->pcr_reg = arg2; +- current_thread_info()->user_cntd0 = (u64 __user *) arg0; +- current_thread_info()->user_cntd1 = (u64 __user *) arg1; +- current_thread_info()->kernel_cntd0 = +- current_thread_info()->kernel_cntd1 = 0; +- write_pcr(arg2); +- reset_pic(); +- set_thread_flag(TIF_PERFCTR); +- break; +- +- case PERFCTR_OFF: +- err = -EINVAL; +- if (test_thread_flag(TIF_PERFCTR)) { +- current_thread_info()->user_cntd0 = +- current_thread_info()->user_cntd1 = NULL; +- current_thread_info()->pcr_reg = 0; +- write_pcr(0); +- clear_thread_flag(TIF_PERFCTR); +- err = 0; +- } +- break; +- +- case PERFCTR_READ: { +- unsigned long pic, tmp; +- +- if (!test_thread_flag(TIF_PERFCTR)) { +- err = -EINVAL; +- break; +- } +- read_pic(pic); +- tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic); +- err |= __put_user(tmp, current_thread_info()->user_cntd0); +- tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32)); +- err |= __put_user(tmp, current_thread_info()->user_cntd1); +- reset_pic(); +- break; +- } +- +- case PERFCTR_CLRPIC: +- if (!test_thread_flag(TIF_PERFCTR)) { +- err = -EINVAL; +- break; +- } +- current_thread_info()->kernel_cntd0 = +- current_thread_info()->kernel_cntd1 = 0; +- reset_pic(); +- break; +- +- case PERFCTR_SETPCR: { +- u64 __user *user_pcr = (u64 __user *)arg0; +- +- if (!test_thread_flag(TIF_PERFCTR)) { +- err = -EINVAL; +- break; +- } +- err |= __get_user(current_thread_info()->pcr_reg, 
user_pcr); +- write_pcr(current_thread_info()->pcr_reg); +- current_thread_info()->kernel_cntd0 = +- current_thread_info()->kernel_cntd1 = 0; +- reset_pic(); +- break; +- } +- +- case PERFCTR_GETPCR: { +- u64 __user *user_pcr = (u64 __user *)arg0; +- +- if (!test_thread_flag(TIF_PERFCTR)) { +- err = -EINVAL; +- break; +- } +- err |= __put_user(current_thread_info()->pcr_reg, user_pcr); +- break; +- } +- +- default: +- err = -EINVAL; +- break; +- }; +- return err; ++ /* Superseded by perfmon2 */ ++ return -ENOSYS; + } + + /* +diff --git a/arch/sparc64/kernel/syscalls.S b/arch/sparc64/kernel/syscalls.S +index a2f2427..b20bf1e 100644 +--- a/arch/sparc64/kernel/syscalls.S ++++ b/arch/sparc64/kernel/syscalls.S +@@ -117,26 +117,9 @@ ret_from_syscall: + stb %g0, [%g6 + TI_NEW_CHILD] + ldx [%g6 + TI_FLAGS], %l0 + call schedule_tail +- mov %g7, %o0 +- andcc %l0, _TIF_PERFCTR, %g0 +- be,pt %icc, 1f +- nop +- ldx [%g6 + TI_PCR], %o7 +- wr %g0, %o7, %pcr +- +- /* Blackbird errata workaround. See commentary in +- * smp.c:smp_percpu_timer_interrupt() for more +- * information. 
+- */ +- ba,pt %xcc, 99f +- nop +- +- .align 64 +-99: wr %g0, %g0, %pic +- rd %pic, %g0 +- +-1: ba,pt %xcc, ret_sys_call +- ldx [%sp + PTREGS_OFF + PT_V9_I0], %o0 ++ mov %g7, %o0 ++ ba,pt %xcc, ret_sys_call ++ ldx [%sp + PTREGS_OFF + PT_V9_I0], %o0 + + .globl sparc_exit + .type sparc_exit,#function +diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S +index 0fdbf3b..1a1a296 100644 +--- a/arch/sparc64/kernel/systbls.S ++++ b/arch/sparc64/kernel/systbls.S +@@ -82,7 +82,9 @@ sys_call_table32: + .word compat_sys_set_mempolicy, compat_sys_kexec_load, compat_sys_move_pages, sys_getcpu, compat_sys_epoll_pwait + /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate + .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1 +-/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1 ++/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_pfm_create_context, sys_pfm_write_pmcs ++ .word sys_pfm_write_pmds, sys_pfm_read_pmds, sys_pfm_load_context, sys_pfm_start, sys_pfm_stop ++/*330*/ .word sys_pfm_restart, sys_pfm_create_evtsets, sys_pfm_getinfo_evtsets, sys_pfm_delete_evtsets, sys_pfm_unload_context + + #endif /* CONFIG_COMPAT */ + +@@ -156,4 +158,6 @@ sys_call_table: + .word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait + /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate + .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 +-/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1 ++/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_pfm_create_context, sys_pfm_write_pmcs ++ .word sys_pfm_write_pmds, sys_pfm_read_pmds, sys_pfm_load_context, sys_pfm_start, sys_pfm_stop ++/*330*/ .word sys_pfm_restart, sys_pfm_create_evtsets, sys_pfm_getinfo_evtsets, sys_pfm_delete_evtsets, sys_pfm_unload_context +diff --git 
a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c +index c824df1..be45d09 100644 +--- a/arch/sparc64/kernel/traps.c ++++ b/arch/sparc64/kernel/traps.c +@@ -2470,86 +2470,90 @@ extern void tsb_config_offsets_are_bolixed_dave(void); + /* Only invoked on boot processor. */ + void __init trap_init(void) + { +- /* Compile time sanity check. */ +- if (TI_TASK != offsetof(struct thread_info, task) || +- TI_FLAGS != offsetof(struct thread_info, flags) || +- TI_CPU != offsetof(struct thread_info, cpu) || +- TI_FPSAVED != offsetof(struct thread_info, fpsaved) || +- TI_KSP != offsetof(struct thread_info, ksp) || +- TI_FAULT_ADDR != offsetof(struct thread_info, fault_address) || +- TI_KREGS != offsetof(struct thread_info, kregs) || +- TI_UTRAPS != offsetof(struct thread_info, utraps) || +- TI_EXEC_DOMAIN != offsetof(struct thread_info, exec_domain) || +- TI_REG_WINDOW != offsetof(struct thread_info, reg_window) || +- TI_RWIN_SPTRS != offsetof(struct thread_info, rwbuf_stkptrs) || +- TI_GSR != offsetof(struct thread_info, gsr) || +- TI_XFSR != offsetof(struct thread_info, xfsr) || +- TI_USER_CNTD0 != offsetof(struct thread_info, user_cntd0) || +- TI_USER_CNTD1 != offsetof(struct thread_info, user_cntd1) || +- TI_KERN_CNTD0 != offsetof(struct thread_info, kernel_cntd0) || +- TI_KERN_CNTD1 != offsetof(struct thread_info, kernel_cntd1) || +- TI_PCR != offsetof(struct thread_info, pcr_reg) || +- TI_PRE_COUNT != offsetof(struct thread_info, preempt_count) || +- TI_NEW_CHILD != offsetof(struct thread_info, new_child) || +- TI_SYS_NOERROR != offsetof(struct thread_info, syscall_noerror) || +- TI_RESTART_BLOCK != offsetof(struct thread_info, restart_block) || +- TI_KUNA_REGS != offsetof(struct thread_info, kern_una_regs) || +- TI_KUNA_INSN != offsetof(struct thread_info, kern_una_insn) || +- TI_FPREGS != offsetof(struct thread_info, fpregs) || +- (TI_FPREGS & (64 - 1))) +- thread_info_offsets_are_bolixed_dave(); +- +- if (TRAP_PER_CPU_THREAD != offsetof(struct trap_per_cpu, 
thread) || +- (TRAP_PER_CPU_PGD_PADDR != +- offsetof(struct trap_per_cpu, pgd_paddr)) || +- (TRAP_PER_CPU_CPU_MONDO_PA != +- offsetof(struct trap_per_cpu, cpu_mondo_pa)) || +- (TRAP_PER_CPU_DEV_MONDO_PA != +- offsetof(struct trap_per_cpu, dev_mondo_pa)) || +- (TRAP_PER_CPU_RESUM_MONDO_PA != +- offsetof(struct trap_per_cpu, resum_mondo_pa)) || +- (TRAP_PER_CPU_RESUM_KBUF_PA != +- offsetof(struct trap_per_cpu, resum_kernel_buf_pa)) || +- (TRAP_PER_CPU_NONRESUM_MONDO_PA != +- offsetof(struct trap_per_cpu, nonresum_mondo_pa)) || +- (TRAP_PER_CPU_NONRESUM_KBUF_PA != +- offsetof(struct trap_per_cpu, nonresum_kernel_buf_pa)) || +- (TRAP_PER_CPU_FAULT_INFO != +- offsetof(struct trap_per_cpu, fault_info)) || +- (TRAP_PER_CPU_CPU_MONDO_BLOCK_PA != +- offsetof(struct trap_per_cpu, cpu_mondo_block_pa)) || +- (TRAP_PER_CPU_CPU_LIST_PA != +- offsetof(struct trap_per_cpu, cpu_list_pa)) || +- (TRAP_PER_CPU_TSB_HUGE != +- offsetof(struct trap_per_cpu, tsb_huge)) || +- (TRAP_PER_CPU_TSB_HUGE_TEMP != +- offsetof(struct trap_per_cpu, tsb_huge_temp)) || +- (TRAP_PER_CPU_IRQ_WORKLIST_PA != +- offsetof(struct trap_per_cpu, irq_worklist_pa)) || +- (TRAP_PER_CPU_CPU_MONDO_QMASK != +- offsetof(struct trap_per_cpu, cpu_mondo_qmask)) || +- (TRAP_PER_CPU_DEV_MONDO_QMASK != +- offsetof(struct trap_per_cpu, dev_mondo_qmask)) || +- (TRAP_PER_CPU_RESUM_QMASK != +- offsetof(struct trap_per_cpu, resum_qmask)) || +- (TRAP_PER_CPU_NONRESUM_QMASK != +- offsetof(struct trap_per_cpu, nonresum_qmask))) +- trap_per_cpu_offsets_are_bolixed_dave(); +- +- if ((TSB_CONFIG_TSB != +- offsetof(struct tsb_config, tsb)) || +- (TSB_CONFIG_RSS_LIMIT != +- offsetof(struct tsb_config, tsb_rss_limit)) || +- (TSB_CONFIG_NENTRIES != +- offsetof(struct tsb_config, tsb_nentries)) || +- (TSB_CONFIG_REG_VAL != +- offsetof(struct tsb_config, tsb_reg_val)) || +- (TSB_CONFIG_MAP_VADDR != +- offsetof(struct tsb_config, tsb_map_vaddr)) || +- (TSB_CONFIG_MAP_PTE != +- offsetof(struct tsb_config, tsb_map_pte))) +- 
tsb_config_offsets_are_bolixed_dave(); +- ++ BUILD_BUG_ON(TI_TASK != offsetof(struct thread_info, task)); ++ BUILD_BUG_ON(TI_FLAGS != offsetof(struct thread_info, flags)); ++ BUILD_BUG_ON(TI_CPU != offsetof(struct thread_info, cpu)); ++ BUILD_BUG_ON(TI_FPSAVED != offsetof(struct thread_info, fpsaved)); ++ BUILD_BUG_ON(TI_KSP != offsetof(struct thread_info, ksp)); ++ BUILD_BUG_ON(TI_FAULT_ADDR != ++ offsetof(struct thread_info, fault_address)); ++ BUILD_BUG_ON(TI_KREGS != offsetof(struct thread_info, kregs)); ++ BUILD_BUG_ON(TI_UTRAPS != offsetof(struct thread_info, utraps)); ++ BUILD_BUG_ON(TI_EXEC_DOMAIN != ++ offsetof(struct thread_info, exec_domain)); ++ BUILD_BUG_ON(TI_REG_WINDOW != ++ offsetof(struct thread_info, reg_window)); ++ BUILD_BUG_ON(TI_RWIN_SPTRS != ++ offsetof(struct thread_info, rwbuf_stkptrs)); ++ BUILD_BUG_ON(TI_GSR != offsetof(struct thread_info, gsr)); ++ BUILD_BUG_ON(TI_XFSR != offsetof(struct thread_info, xfsr)); ++ BUILD_BUG_ON(TI_PRE_COUNT != ++ offsetof(struct thread_info, preempt_count)); ++ BUILD_BUG_ON(TI_NEW_CHILD != ++ offsetof(struct thread_info, new_child)); ++ BUILD_BUG_ON(TI_SYS_NOERROR != ++ offsetof(struct thread_info, syscall_noerror)); ++ BUILD_BUG_ON(TI_RESTART_BLOCK != ++ offsetof(struct thread_info, restart_block)); ++ BUILD_BUG_ON(TI_KUNA_REGS != ++ offsetof(struct thread_info, kern_una_regs)); ++ BUILD_BUG_ON(TI_KUNA_INSN != ++ offsetof(struct thread_info, kern_una_insn)); ++ BUILD_BUG_ON(TI_FPREGS != offsetof(struct thread_info, fpregs)); ++ BUILD_BUG_ON((TI_FPREGS & (64 - 1))); ++ ++ BUILD_BUG_ON(TRAP_PER_CPU_THREAD != ++ offsetof(struct trap_per_cpu, thread)); ++ BUILD_BUG_ON(TRAP_PER_CPU_PGD_PADDR != ++ offsetof(struct trap_per_cpu, pgd_paddr)); ++ BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_PA != ++ offsetof(struct trap_per_cpu, cpu_mondo_pa)); ++ BUILD_BUG_ON(TRAP_PER_CPU_DEV_MONDO_PA != ++ offsetof(struct trap_per_cpu, dev_mondo_pa)); ++ BUILD_BUG_ON(TRAP_PER_CPU_RESUM_MONDO_PA != ++ offsetof(struct trap_per_cpu, 
resum_mondo_pa)); ++ BUILD_BUG_ON(TRAP_PER_CPU_RESUM_KBUF_PA != ++ offsetof(struct trap_per_cpu, resum_kernel_buf_pa)); ++ BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_MONDO_PA != ++ offsetof(struct trap_per_cpu, nonresum_mondo_pa)); ++ BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_KBUF_PA != ++ offsetof(struct trap_per_cpu, nonresum_kernel_buf_pa)); ++ BUILD_BUG_ON(TRAP_PER_CPU_FAULT_INFO != ++ offsetof(struct trap_per_cpu, fault_info)); ++ BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_BLOCK_PA != ++ offsetof(struct trap_per_cpu, cpu_mondo_block_pa)); ++ BUILD_BUG_ON(TRAP_PER_CPU_CPU_LIST_PA != ++ offsetof(struct trap_per_cpu, cpu_list_pa)); ++ BUILD_BUG_ON(TRAP_PER_CPU_TSB_HUGE != ++ offsetof(struct trap_per_cpu, tsb_huge)); ++ BUILD_BUG_ON(TRAP_PER_CPU_TSB_HUGE_TEMP != ++ offsetof(struct trap_per_cpu, tsb_huge_temp)); ++#if 0 ++ BUILD_BUG_ON(TRAP_PER_CPU_IRQ_WORKLIST != ++ offsetof(struct trap_per_cpu, irq_worklist)); ++#endif ++ BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_QMASK != ++ offsetof(struct trap_per_cpu, cpu_mondo_qmask)); ++ BUILD_BUG_ON(TRAP_PER_CPU_DEV_MONDO_QMASK != ++ offsetof(struct trap_per_cpu, dev_mondo_qmask)); ++ BUILD_BUG_ON(TRAP_PER_CPU_RESUM_QMASK != ++ offsetof(struct trap_per_cpu, resum_qmask)); ++ BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_QMASK != ++ offsetof(struct trap_per_cpu, nonresum_qmask)); ++ ++ BUILD_BUG_ON(TSB_CONFIG_TSB != ++ offsetof(struct tsb_config, tsb)); ++ BUILD_BUG_ON(TSB_CONFIG_RSS_LIMIT != ++ offsetof(struct tsb_config, tsb_rss_limit)); ++ BUILD_BUG_ON(TSB_CONFIG_NENTRIES != ++ offsetof(struct tsb_config, tsb_nentries)); ++ BUILD_BUG_ON(TSB_CONFIG_REG_VAL != ++ offsetof(struct tsb_config, tsb_reg_val)); ++ BUILD_BUG_ON(TSB_CONFIG_MAP_VADDR != ++ offsetof(struct tsb_config, tsb_map_vaddr)); ++ BUILD_BUG_ON(TSB_CONFIG_MAP_PTE != ++ offsetof(struct tsb_config, tsb_map_pte)); ++ + /* Attach to the address space of init_task. On SMP we + * do this in smp.c:smp_callin for other cpus. 
+ */ +diff --git a/arch/sparc64/kernel/ttable.S b/arch/sparc64/kernel/ttable.S +index 1ade3d6..2a31ffa 100644 +--- a/arch/sparc64/kernel/ttable.S ++++ b/arch/sparc64/kernel/ttable.S +@@ -66,7 +66,7 @@ tl0_irq6: BTRAP(0x46) + tl0_irq7: BTRAP(0x47) BTRAP(0x48) BTRAP(0x49) + tl0_irq10: BTRAP(0x4a) BTRAP(0x4b) BTRAP(0x4c) BTRAP(0x4d) + tl0_irq14: TRAP_IRQ(timer_interrupt, 14) +-tl0_irq15: TRAP_IRQ(handler_irq, 15) ++tl0_irq15: TRAP_IRQ(perfctr_irq, 15) + tl0_resv050: BTRAP(0x50) BTRAP(0x51) BTRAP(0x52) BTRAP(0x53) BTRAP(0x54) BTRAP(0x55) + tl0_resv056: BTRAP(0x56) BTRAP(0x57) BTRAP(0x58) BTRAP(0x59) BTRAP(0x5a) BTRAP(0x5b) + tl0_resv05c: BTRAP(0x5c) BTRAP(0x5d) BTRAP(0x5e) BTRAP(0x5f) +diff --git a/arch/sparc64/perfmon/Kconfig b/arch/sparc64/perfmon/Kconfig +new file mode 100644 +index 0000000..4672024 +--- /dev/null ++++ b/arch/sparc64/perfmon/Kconfig +@@ -0,0 +1,26 @@ ++menu "Hardware Performance Monitoring support" ++config PERFMON ++ bool "Perfmon2 performance monitoring interface" ++ default n ++ help ++ Enables the perfmon2 interface to access the hardware ++ performance counters. See <http://perfmon2.sf.net/> for ++ more details. ++ ++config PERFMON_DEBUG ++ bool "Perfmon debugging" ++ depends on PERFMON ++ default n ++ help ++ Enables perfmon debugging support ++ ++config PERFMON_DEBUG_FS ++ bool "Enable perfmon statistics reporting via debugfs" ++ default y ++ depends on PERFMON && DEBUG_FS ++ help ++ Enable collection and reporting of perfmon timing statistics under ++ debugfs. This is used for debugging and performance analysis of the ++ subsystem. The debugfs filesystem must be mounted. 
++ ++endmenu +diff --git a/arch/sparc64/perfmon/Makefile b/arch/sparc64/perfmon/Makefile +new file mode 100644 +index 0000000..ad2d907 +--- /dev/null ++++ b/arch/sparc64/perfmon/Makefile +@@ -0,0 +1 @@ ++obj-$(CONFIG_PERFMON) += perfmon.o +diff --git a/arch/sparc64/perfmon/perfmon.c b/arch/sparc64/perfmon/perfmon.c +new file mode 100644 +index 0000000..9e29833 +--- /dev/null ++++ b/arch/sparc64/perfmon/perfmon.c +@@ -0,0 +1,422 @@ ++/* perfmon.c: sparc64 perfmon support ++ * ++ * Copyright (C) 2007 David S. Miller (davem@davemloft.net) ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/irq.h> ++#include <linux/perfmon_kern.h> ++ ++#include <asm/system.h> ++#include <asm/spitfire.h> ++#include <asm/hypervisor.h> ++ ++struct pcr_ops { ++ void (*write)(u64); ++ u64 (*read)(void); ++}; ++ ++static void direct_write_pcr(u64 val) ++{ ++ write_pcr(val); ++} ++ ++static u64 direct_read_pcr(void) ++{ ++ u64 pcr; ++ ++ read_pcr(pcr); ++ ++ return pcr; ++} ++ ++static struct pcr_ops direct_pcr_ops = { ++ .write = direct_write_pcr, ++ .read = direct_read_pcr, ++}; ++ ++/* Using the hypervisor call is needed so that we can set the ++ * hypervisor trace bit correctly, which is hyperprivileged. 
++ */ ++static void n2_write_pcr(u64 val) ++{ ++ unsigned long ret; ++ ++ ret = sun4v_niagara2_setperf(HV_N2_PERF_SPARC_CTL, val); ++ if (ret != HV_EOK) ++ write_pcr(val); ++} ++ ++static u64 n2_read_pcr(void) ++{ ++ u64 pcr; ++ ++ read_pcr(pcr); ++ ++ return pcr; ++} ++ ++static struct pcr_ops n2_pcr_ops = { ++ .write = n2_write_pcr, ++ .read = n2_read_pcr, ++}; ++ ++static struct pcr_ops *pcr_ops; ++ ++void pfm_arch_write_pmc(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ /* ++ * we only write to the actual register when monitoring is ++ * active (pfm_start was issued) ++ */ ++ if (ctx && ctx->flags.started == 0) ++ return; ++ ++ pcr_ops->write(value); ++} ++ ++u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) ++{ ++ return pcr_ops->read(); ++} ++ ++/* ++ * collect pending overflowed PMDs. Called from pfm_ctxsw() ++ * and from PMU interrupt handler. Must fill in set->povfl_pmds[] ++ * and set->npend_ovfls. Interrupts are masked ++ */ ++static void __pfm_get_ovfl_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ unsigned int max = ctx->regs.max_intr_pmd; ++ u64 wmask = 1ULL << pfm_pmu_conf->counter_width; ++ u64 *intr_pmds = ctx->regs.intr_pmds; ++ u64 *used_mask = set->used_pmds; ++ u64 mask[PFM_PMD_BV]; ++ unsigned int i; ++ ++ bitmap_and(cast_ulp(mask), ++ cast_ulp(intr_pmds), ++ cast_ulp(used_mask), ++ max); ++ ++ /* ++ * check all PMD that can generate interrupts ++ * (that includes counters) ++ */ ++ for (i = 0; i < max; i++) { ++ if (test_bit(i, mask)) { ++ u64 new_val = pfm_arch_read_pmd(ctx, i); ++ ++ PFM_DBG_ovfl("pmd%u new_val=0x%llx bit=%d\n", ++ i, (unsigned long long)new_val, ++ (new_val&wmask) ? 
1 : 0); ++ ++ if (new_val & wmask) { ++ __set_bit(i, set->povfl_pmds); ++ set->npend_ovfls++; ++ } ++ } ++ } ++} ++ ++static void pfm_stop_active(struct task_struct *task, struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ unsigned int i, max = ctx->regs.max_pmc; ++ ++ /* ++ * clear enable bits, assume all pmcs are enable pmcs ++ */ ++ for (i = 0; i < max; i++) { ++ if (test_bit(i, set->used_pmcs)) ++ pfm_arch_write_pmc(ctx, i, 0); ++ } ++ ++ if (set->npend_ovfls) ++ return; ++ ++ __pfm_get_ovfl_pmds(ctx, set); ++} ++ ++/* ++ * Called from pfm_ctxsw(). Task is guaranteed to be current. ++ * Context is locked. Interrupts are masked. Monitoring is active. ++ * PMU access is guaranteed. PMC and PMD registers are live in PMU. ++ * ++ * for per-thread: ++ * must stop monitoring for the task ++ * ++ * Return: ++ * non-zero : did not save PMDs (as part of stopping the PMU) ++ * 0 : saved PMDs (no need to save them in caller) ++ */ ++int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) ++{ ++ /* ++ * disable lazy restore of PMC registers. ++ */ ++ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; ++ ++ pfm_stop_active(task, ctx, ctx->active_set); ++ ++ return 1; ++} ++ ++/* ++ * Called from pfm_stop() and idle notifier ++ * ++ * Interrupts are masked. Context is locked. Set is the active set. ++ * ++ * For per-thread: ++ * task is not necessarily current. If not current task, then ++ * task is guaranteed stopped and off any cpu. Access to PMU ++ * is not guaranteed. Interrupts are masked. Context is locked. ++ * Set is the active set. ++ * ++ * For system-wide: ++ * task is current ++ * ++ * must disable active monitoring. 
ctx cannot be NULL ++ */ ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) ++{ ++ /* ++ * no need to go through pfm_stop_active() ++ * if we are already stopped ++ */ ++ if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED) ++ return; ++ ++ /* ++ * stop live registers and collect pending overflow ++ */ ++ if (task == current) ++ pfm_stop_active(task, ctx, ctx->active_set); ++} ++ ++/* ++ * Enable active monitoring. Called from pfm_start() and ++ * pfm_arch_unmask_monitoring(). ++ * ++ * Interrupts are masked. Context is locked. Set is the active set. ++ * ++ * For per-thread: ++ * Task is not necessarily current. If not current task, then task ++ * is guaranteed stopped and off any cpu. Access to PMU is not guaranteed. ++ * ++ * For system-wide: ++ * task is always current ++ * ++ * must enable active monitoring. ++ */ ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_event_set *set; ++ unsigned int max_pmc = ctx->regs.max_pmc; ++ unsigned int i; ++ ++ if (task != current) ++ return; ++ ++ set = ctx->active_set; ++ for (i = 0; i < max_pmc; i++) { ++ if (test_bit(i, set->used_pmcs)) ++ pfm_arch_write_pmc(ctx, i, set->pmcs[i]); ++ } ++} ++ ++/* ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(). ++ * Context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ * ++ * function must restore all PMD registers from set. ++ */ ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ unsigned int max_pmd = ctx->regs.max_pmd; ++ u64 ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ u64 *impl_pmds = ctx->regs.pmds; ++ unsigned int i; ++ ++ /* ++ * must restore all pmds to avoid leaking ++ * information to user. 
++ */ ++ for (i = 0; i < max_pmd; i++) { ++ u64 val; ++ ++ if (test_bit(i, impl_pmds) == 0) ++ continue; ++ ++ val = set->pmds[i].value; ++ ++ /* ++ * set upper bits for counter to ensure ++ * overflow will trigger ++ */ ++ val &= ovfl_mask; ++ ++ pfm_arch_write_pmd(ctx, i, val); ++ } ++} ++ ++/* ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(). ++ * Context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ * ++ * function must restore all PMC registers from set, if needed. ++ */ ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ unsigned int max_pmc = ctx->regs.max_pmc; ++ u64 *impl_pmcs = ctx->regs.pmcs; ++ unsigned int i; ++ ++ /* If we're masked or stopped we don't need to bother restoring ++ * the PMCs now. ++ */ ++ if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) ++ return; ++ ++ /* ++ * restore all pmcs ++ */ ++ for (i = 0; i < max_pmc; i++) ++ if (test_bit(i, impl_pmcs)) ++ pfm_arch_write_pmc(ctx, i, set->pmcs[i]); ++} ++ ++char *pfm_arch_get_pmu_module_name(void) ++{ ++ return NULL; ++} ++ ++void perfmon_interrupt(struct pt_regs *regs) ++{ ++ pfm_interrupt_handler(instruction_pointer(regs), regs); ++} ++ ++static struct pfm_regmap_desc pfm_sparc64_pmc_desc[] = { ++ PMC_D(PFM_REG_I, "PCR", 0, 0, 0, 0), ++}; ++ ++static struct pfm_regmap_desc pfm_sparc64_pmd_desc[] = { ++ PMD_D(PFM_REG_C, "PIC0", 0), ++ PMD_D(PFM_REG_C, "PIC1", 0), ++}; ++ ++static int pfm_sparc64_probe(void) ++{ ++ return 0; ++} ++ ++static struct pfm_pmu_config pmu_sparc64_pmu_conf = { ++ .counter_width = 31, ++ .pmd_desc = pfm_sparc64_pmd_desc, ++ .num_pmd_entries = 2, ++ .pmc_desc = pfm_sparc64_pmc_desc, ++ .num_pmc_entries = 1, ++ .probe_pmu = pfm_sparc64_probe, ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++}; ++ ++static unsigned long perf_hsvc_group; ++static unsigned long perf_hsvc_major; ++static unsigned long 
perf_hsvc_minor; ++ ++static int __init register_perf_hsvc(void) ++{ ++ if (tlb_type == hypervisor) { ++ switch (sun4v_chip_type) { ++ case SUN4V_CHIP_NIAGARA1: ++ perf_hsvc_group = HV_GRP_NIAG_PERF; ++ break; ++ ++ case SUN4V_CHIP_NIAGARA2: ++ perf_hsvc_group = HV_GRP_N2_CPU; ++ break; ++ ++ default: ++ return -ENODEV; ++ } ++ ++ ++ perf_hsvc_major = 1; ++ perf_hsvc_minor = 0; ++ if (sun4v_hvapi_register(perf_hsvc_group, ++ perf_hsvc_major, ++ &perf_hsvc_minor)) { ++ printk("perfmon: Could not register N2 hvapi.\n"); ++ return -ENODEV; ++ } ++ } ++ return 0; ++} ++ ++static void unregister_perf_hsvc(void) ++{ ++ if (tlb_type != hypervisor) ++ return; ++ sun4v_hvapi_unregister(perf_hsvc_group); ++} ++ ++static int __init pfm_sparc64_pmu_init(void) ++{ ++ u64 mask; ++ int err; ++ ++ err = register_perf_hsvc(); ++ if (err) ++ return err; ++ ++ if (tlb_type == hypervisor && ++ sun4v_chip_type == SUN4V_CHIP_NIAGARA2) ++ pcr_ops = &n2_pcr_ops; ++ else ++ pcr_ops = &direct_pcr_ops; ++ ++ if (!strcmp(sparc_pmu_type, "ultra12")) ++ mask = (0xf << 11) | (0xf << 4) | 0x7; ++ else if (!strcmp(sparc_pmu_type, "ultra3") || ++ !strcmp(sparc_pmu_type, "ultra3i") || ++ !strcmp(sparc_pmu_type, "ultra3+") || ++ !strcmp(sparc_pmu_type, "ultra4+")) ++ mask = (0x3f << 11) | (0x3f << 4) | 0x7; ++ else if (!strcmp(sparc_pmu_type, "niagara2")) ++ mask = ((1UL << 63) | (1UL << 62) | ++ (1UL << 31) | (0xfUL << 27) | (0xffUL << 19) | ++ (1UL << 18) | (0xfUL << 14) | (0xff << 6) | ++ (0x3UL << 4) | 0x7UL); ++ else if (!strcmp(sparc_pmu_type, "niagara")) ++ mask = ((1UL << 9) | (1UL << 8) | ++ (0x7UL << 4) | 0x7UL); ++ else { ++ err = -ENODEV; ++ goto out_err; ++ } ++ ++ pmu_sparc64_pmu_conf.pmu_name = sparc_pmu_type; ++ pfm_sparc64_pmc_desc[0].rsvd_msk = ~mask; ++ ++ return pfm_pmu_register(&pmu_sparc64_pmu_conf); ++ ++out_err: ++ unregister_perf_hsvc(); ++ return err; ++} ++ ++static void __exit pfm_sparc64_pmu_exit(void) ++{ ++ unregister_perf_hsvc(); ++ return 
pfm_pmu_unregister(&pmu_sparc64_pmu_conf); ++} ++ ++module_init(pfm_sparc64_pmu_init); ++module_exit(pfm_sparc64_pmu_exit); +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index ed92864..3a2b544 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1378,6 +1378,8 @@ config COMPAT_VDSO + + If unsure, say Y. + ++source "arch/x86/perfmon/Kconfig" ++ + endmenu + + config ARCH_ENABLE_MEMORY_HOTPLUG +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index f5631da..c868ad6 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -150,6 +150,8 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ + core-y += arch/x86/kernel/ + core-y += arch/x86/mm/ + ++core-$(CONFIG_PERFMON) += arch/x86/perfmon/ ++ + # Remaining sub architecture files + core-y += $(mcore-y) + +diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S +index ffc1bb4..58e00cb 100644 +--- a/arch/x86/ia32/ia32entry.S ++++ b/arch/x86/ia32/ia32entry.S +@@ -832,4 +832,16 @@ ia32_sys_call_table: + .quad sys_dup3 /* 330 */ + .quad sys_pipe2 + .quad sys_inotify_init1 ++ .quad sys_pfm_create_context ++ .quad sys_pfm_write_pmcs ++ .quad sys_pfm_write_pmds /* 335 */ ++ .quad sys_pfm_read_pmds ++ .quad sys_pfm_load_context ++ .quad sys_pfm_start ++ .quad sys_pfm_stop ++ .quad sys_pfm_restart /* 340 */ ++ .quad sys_pfm_create_evtsets ++ .quad sys_pfm_getinfo_evtsets ++ .quad sys_pfm_delete_evtsets ++ .quad sys_pfm_unload_context + ia32_syscall_end: +diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c +index f88bd0d..53fe335 100644 +--- a/arch/x86/kernel/apic_32.c ++++ b/arch/x86/kernel/apic_32.c +@@ -28,6 +28,7 @@ + #include <linux/acpi_pmtmr.h> + #include <linux/module.h> + #include <linux/dmi.h> ++#include <linux/perfmon_kern.h> + + #include <asm/atomic.h> + #include <asm/smp.h> +@@ -669,6 +670,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; + } 
++EXPORT_SYMBOL(setup_APIC_eilvt_ibs); + + /* + * Local APIC start and shutdown +@@ -1367,6 +1369,9 @@ void __init apic_intr_init(void) + #ifdef CONFIG_X86_MCE_P4THERMAL + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); + #endif ++#ifdef CONFIG_PERFMON ++ set_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt); ++#endif + } + + /** +diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c +index 446c062..574cd3b 100644 +--- a/arch/x86/kernel/apic_64.c ++++ b/arch/x86/kernel/apic_64.c +@@ -228,6 +228,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; + } ++EXPORT_SYMBOL(setup_APIC_eilvt_ibs); + + /* + * Program the next event, relative to now +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 4e456bd..5b6d6ca 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -5,6 +5,7 @@ + #include <linux/module.h> + #include <linux/percpu.h> + #include <linux/bootmem.h> ++#include <linux/perfmon_kern.h> + #include <asm/processor.h> + #include <asm/i387.h> + #include <asm/msr.h> +@@ -726,6 +727,8 @@ void __cpuinit cpu_init(void) + current_thread_info()->status = 0; + clear_used_math(); + mxcsr_feature_mask_init(); ++ ++ pfm_init_percpu(); + } + + #ifdef CONFIG_HOTPLUG_CPU +diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S +index 109792b..0b6e34c 100644 +--- a/arch/x86/kernel/entry_32.S ++++ b/arch/x86/kernel/entry_32.S +@@ -513,7 +513,7 @@ ENDPROC(system_call) + ALIGN + RING0_PTREGS_FRAME # can't unwind into user space anyway + work_pending: +- testb $_TIF_NEED_RESCHED, %cl ++ testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx + jz work_notifysig + work_resched: + call schedule +diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S +index 89434d4..34e44f5 100644 +--- a/arch/x86/kernel/entry_64.S ++++ b/arch/x86/kernel/entry_64.S +@@ -888,7 +888,13 @@ 
END(error_interrupt) + ENTRY(spurious_interrupt) + apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt + END(spurious_interrupt) +- ++ ++#ifdef CONFIG_PERFMON ++ENTRY(pmu_interrupt) ++ apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt ++END(pmu_interrupt) ++#endif ++ + /* + * Exception entry points. + */ +diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c +index 1f26fd9..83f6bc1 100644 +--- a/arch/x86/kernel/irqinit_64.c ++++ b/arch/x86/kernel/irqinit_64.c +@@ -11,6 +11,7 @@ + #include <linux/kernel_stat.h> + #include <linux/sysdev.h> + #include <linux/bitops.h> ++#include <linux/perfmon_kern.h> + + #include <asm/acpi.h> + #include <asm/atomic.h> +@@ -217,6 +218,10 @@ void __init native_init_IRQ(void) + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + ++#ifdef CONFIG_PERFMON ++ alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt); ++#endif ++ + if (!acpi_ioapic) + setup_irq(2, &irq2); + } +diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c +index 31f40b2..ed27150 100644 +--- a/arch/x86/kernel/process_32.c ++++ b/arch/x86/kernel/process_32.c +@@ -36,6 +36,7 @@ + #include <linux/personality.h> + #include <linux/tick.h> + #include <linux/percpu.h> ++#include <linux/perfmon_kern.h> + #include <linux/prctl.h> + + #include <asm/uaccess.h> +@@ -277,6 +278,7 @@ void exit_thread(void) + tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; + put_cpu(); + } ++ pfm_exit_thread(); + } + + void flush_thread(void) +@@ -334,6 +336,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, + + savesegment(gs, p->thread.gs); + ++ pfm_copy_thread(p); ++ + tsk = current; + if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, +@@ -448,6 +452,9 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + prev = &prev_p->thread; + next = &next_p->thread; + ++ if 
(test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) ++ pfm_ctxsw_out(prev_p, next_p); ++ + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS +@@ -460,6 +467,9 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + if (next->debugctlmsr != debugctl) + update_debugctlmsr(next->debugctlmsr); + ++ if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW)) ++ pfm_ctxsw_in(prev_p, next_p); ++ + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + set_debugreg(next->debugreg0, 0); + set_debugreg(next->debugreg1, 1); +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index e12e0e4..97d49ce 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -36,6 +36,7 @@ + #include <linux/kprobes.h> + #include <linux/kdebug.h> + #include <linux/tick.h> ++#include <linux/perfmon_kern.h> + #include <linux/prctl.h> + + #include <asm/uaccess.h> +@@ -240,6 +241,7 @@ void exit_thread(void) + t->io_bitmap_max = 0; + put_cpu(); + } ++ pfm_exit_thread(); + } + + void flush_thread(void) +@@ -344,6 +346,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, + savesegment(es, p->thread.es); + savesegment(ds, p->thread.ds); + ++ pfm_copy_thread(p); ++ + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { + p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) { +@@ -472,6 +476,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, + prev = &prev_p->thread, + next = &next_p->thread; + ++ if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) ++ pfm_ctxsw_out(prev_p, next_p); ++ + debugctl = prev->debugctlmsr; + if (next->ds_area_msr != prev->ds_area_msr) { + /* we clear debugctl to make sure DS +@@ -484,6 +491,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, + if (next->debugctlmsr != debugctl) + update_debugctlmsr(next->debugctlmsr); + ++ if (test_tsk_thread_flag(next_p, 
TIF_PERFMON_CTXSW)) ++ pfm_ctxsw_in(prev_p, next_p); ++ + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { + loaddebug(next, 0); + loaddebug(next, 1); +diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c +index 6fb5bcd..53e6665 100644 +--- a/arch/x86/kernel/signal_32.c ++++ b/arch/x86/kernel/signal_32.c +@@ -18,6 +18,7 @@ + #include <linux/sched.h> + #include <linux/wait.h> + #include <linux/elf.h> ++#include <linux/perfmon_kern.h> + #include <linux/smp.h> + #include <linux/mm.h> + +@@ -657,6 +658,10 @@ static void do_signal(struct pt_regs *regs) + void + do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) + { ++ /* process perfmon asynchronous work (e.g. block thread or reset) */ ++ if (thread_info_flags & _TIF_PERFMON_WORK) ++ pfm_handle_work(regs); ++ + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs); +diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c +index ca316b5..6e9fa74 100644 +--- a/arch/x86/kernel/signal_64.c ++++ b/arch/x86/kernel/signal_64.c +@@ -19,6 +19,7 @@ + #include <linux/stddef.h> + #include <linux/personality.h> + #include <linux/compiler.h> ++#include <linux/perfmon_kern.h> + #include <asm/processor.h> + #include <asm/ucontext.h> + #include <asm/uaccess.h> +@@ -549,12 +550,17 @@ static void do_signal(struct pt_regs *regs) + void do_notify_resume(struct pt_regs *regs, void *unused, + __u32 thread_info_flags) + { ++ + #ifdef CONFIG_X86_MCE + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); + #endif /* CONFIG_X86_MCE */ + ++ /* process perfmon asynchronous work (e.g. 
block thread or reset) */ ++ if (thread_info_flags & _TIF_PERFMON_WORK) ++ pfm_handle_work(regs); ++ + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs); +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 7985c5b..9ddf6db 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -42,6 +42,7 @@ + #include <linux/init.h> + #include <linux/smp.h> + #include <linux/module.h> ++#include <linux/perfmon_kern.h> + #include <linux/sched.h> + #include <linux/percpu.h> + #include <linux/bootmem.h> +@@ -1382,6 +1383,7 @@ int __cpu_disable(void) + remove_cpu_from_maps(cpu); + unlock_vector_lock(); + fixup_irqs(cpu_online_map); ++ pfm_cpu_disable(); + return 0; + } + +diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S +index d44395f..e1384a9 100644 +--- a/arch/x86/kernel/syscall_table_32.S ++++ b/arch/x86/kernel/syscall_table_32.S +@@ -332,3 +332,15 @@ ENTRY(sys_call_table) + .long sys_dup3 /* 330 */ + .long sys_pipe2 + .long sys_inotify_init1 ++ .long sys_pfm_create_context ++ .long sys_pfm_write_pmcs ++ .long sys_pfm_write_pmds /* 335 */ ++ .long sys_pfm_read_pmds ++ .long sys_pfm_load_context ++ .long sys_pfm_start ++ .long sys_pfm_stop ++ .long sys_pfm_restart /* 340 */ ++ .long sys_pfm_create_evtsets ++ .long sys_pfm_getinfo_evtsets ++ .long sys_pfm_delete_evtsets ++ .long sys_pfm_unload_context +diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c +index 8a5f161..10faef5 100644 +--- a/arch/x86/oprofile/nmi_int.c ++++ b/arch/x86/oprofile/nmi_int.c +@@ -16,6 +16,7 @@ + #include <linux/moduleparam.h> + #include <linux/kdebug.h> + #include <linux/cpu.h> ++#include <linux/perfmon_kern.h> + #include <asm/nmi.h> + #include <asm/msr.h> + #include <asm/apic.h> +@@ -217,12 +218,18 @@ static int nmi_setup(void) + int err = 0; + int cpu; + +- if (!allocate_msrs()) ++ if (pfm_session_allcpus_acquire()) ++ return -EBUSY; ++ ++ if 
(!allocate_msrs()) { ++ pfm_session_allcpus_release(); + return -ENOMEM; ++ } + + err = register_die_notifier(&profile_exceptions_nb); + if (err) { + free_msrs(); ++ pfm_session_allcpus_release(); + return err; + } + +@@ -304,6 +311,7 @@ static void nmi_shutdown(void) + model->shutdown(msrs); + free_msrs(); + put_cpu_var(cpu_msrs); ++ pfm_session_allcpus_release(); + } + + static void nmi_cpu_start(void *dummy) +diff --git a/arch/x86/perfmon/Kconfig b/arch/x86/perfmon/Kconfig +new file mode 100644 +index 0000000..08842e6 +--- /dev/null ++++ b/arch/x86/perfmon/Kconfig +@@ -0,0 +1,89 @@ ++menu "Hardware Performance Monitoring support" ++config PERFMON ++ bool "Perfmon2 performance monitoring interface" ++ select X86_LOCAL_APIC ++ default n ++ help ++ Enables the perfmon2 interface to access the hardware ++ performance counters. See <http://perfmon2.sf.net/> for ++ more details. ++ ++config PERFMON_DEBUG ++ bool "Perfmon debugging" ++ default n ++ depends on PERFMON ++ help ++ Enables perfmon debugging support ++ ++config PERFMON_DEBUG_FS ++ bool "Enable perfmon statistics reporting via debugfs" ++ default y ++ depends on PERFMON && DEBUG_FS ++ help ++ Enable collection and reporting of perfmon timing statistics under ++ debugfs. This is used for debugging and performance analysis of the ++ subsystem.The debugfs filesystem must be mounted. ++ ++config X86_PERFMON_P6 ++ tristate "Support for Intel P6/Pentium M processor hardware performance counters" ++ depends on PERFMON && X86_32 ++ default n ++ help ++ Enables support for Intel P6-style hardware performance counters. ++ To be used for with Intel Pentium III, PentiumPro, Pentium M processors. ++ ++config X86_PERFMON_P4 ++ tristate "Support for Intel Pentium 4/Xeon hardware performance counters" ++ depends on PERFMON ++ default n ++ help ++ Enables support for Intel Pentium 4/Xeon (Netburst) hardware performance ++ counters. 
++ ++config X86_PERFMON_PEBS_P4 ++ tristate "Support for Intel Netburst Precise Event-Based Sampling (PEBS)" ++ depends on PERFMON && X86_PERFMON_P4 ++ default n ++ help ++ Enables support for Precise Event-Based Sampling (PEBS) on the Intel ++ Netburst processors such as Pentium 4, Xeon which support it. ++ ++config X86_PERFMON_CORE ++ tristate "Support for Intel Core-based performance counters" ++ depends on PERFMON ++ default n ++ help ++ Enables support for Intel Core-based performance counters. Enable ++ this option to support Intel Core 2 processors. ++ ++config X86_PERFMON_PEBS_CORE ++ tristate "Support for Intel Core Precise Event-Based Sampling (PEBS)" ++ depends on PERFMON && X86_PERFMON_CORE ++ default n ++ help ++ Enables support for Precise Event-Based Sampling (PEBS) on the Intel ++ Core processors. ++ ++config X86_PERFMON_INTEL_ATOM ++ tristate "Support for Intel Atom processor" ++ depends on PERFMON ++ default n ++ help ++ Enables support for Intel Atom processors. ++ ++config X86_PERFMON_INTEL_ARCH ++ tristate "Support for Intel architectural perfmon v1/v2" ++ depends on PERFMON ++ default n ++ help ++ Enables support for Intel architectural performance counters. ++ This feature was introduced with Intel Core Solo/Core Duo processors. ++ ++config X86_PERFMON_AMD64 ++ tristate "Support AMD Athlon64/Opteron64 hardware performance counters" ++ depends on PERFMON ++ default n ++ help ++ Enables support for Athlon64/Opteron64 hardware performance counters. ++ Support for family 6, 15 and 16(10H) processors. ++endmenu +diff --git a/arch/x86/perfmon/Makefile b/arch/x86/perfmon/Makefile +new file mode 100644 +index 0000000..1cbed3e +--- /dev/null ++++ b/arch/x86/perfmon/Makefile +@@ -0,0 +1,13 @@ ++# ++# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. 
++# Contributed by Stephane Eranian <eranian@hpl.hp.com> ++# ++obj-$(CONFIG_PERFMON) += perfmon.o ++obj-$(CONFIG_X86_PERFMON_P6) += perfmon_p6.o ++obj-$(CONFIG_X86_PERFMON_P4) += perfmon_p4.o ++obj-$(CONFIG_X86_PERFMON_CORE) += perfmon_intel_core.o ++obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o ++obj-$(CONFIG_X86_PERFMON_PEBS_P4) += perfmon_pebs_p4_smpl.o ++obj-$(CONFIG_X86_PERFMON_PEBS_CORE) += perfmon_pebs_core_smpl.o ++obj-$(CONFIG_X86_PERFMON_AMD64) += perfmon_amd64.o ++obj-$(CONFIG_X86_PERFMON_INTEL_ATOM) += perfmon_intel_atom.o +diff --git a/arch/x86/perfmon/perfmon.c b/arch/x86/perfmon/perfmon.c +new file mode 100644 +index 0000000..e727fed +--- /dev/null ++++ b/arch/x86/perfmon/perfmon.c +@@ -0,0 +1,761 @@ ++/* ++ * This file implements the X86 specific support for the perfmon2 interface ++ * ++ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * Copyright (c) 2007 Advanced Micro Devices, Inc. ++ * Contributed by Robert Richter <robert.richter@amd.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/interrupt.h> ++#include <linux/perfmon_kern.h> ++#include <linux/kprobes.h> ++#include <linux/kdebug.h> ++#include <linux/nmi.h> ++ ++#include <asm/apic.h> ++ ++DEFINE_PER_CPU(unsigned long, real_iip); ++DEFINE_PER_CPU(int, pfm_using_nmi); ++DEFINE_PER_CPU(unsigned long, saved_lvtpc); ++ ++/** ++ * pfm_arch_ctxswin_thread - thread context switch in ++ * @task: task switched in ++ * @ctx: context for the task ++ * ++ * Called from pfm_ctxsw(). Task is guaranteed to be current. ++ * set cannot be NULL. Context is locked. Interrupts are masked. ++ * ++ * Caller has already restored all PMD and PMC registers, if ++ * necessary (i.e., lazy restore scheme). ++ * ++ * On x86, the only common code just needs to unsecure RDPMC if necessary ++ * ++ * On model-specific features, e.g., PEBS, IBS, are taken care of in the ++ * corresponding PMU description module ++ */ ++void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ /* ++ * restore saved real iip ++ */ ++ if (ctx->active_set->npend_ovfls) ++ __get_cpu_var(real_iip) = ctx_arch->saved_real_iip; ++ ++ /* ++ * enable RDPMC on this CPU ++ */ ++ if (ctx_arch->flags.insecure) ++ set_in_cr4(X86_CR4_PCE); ++} ++ ++/** ++ * pfm_arch_ctxswout_thread - context switch out thread ++ * @task: task switched out ++ * @ctx : context switched out ++ * ++ * Called from pfm_ctxsw(). Task is guaranteed to be current. ++ * Context is locked. Interrupts are masked. Monitoring may be active. ++ * PMU access is guaranteed. PMC and PMD registers are live in PMU. 
++ * ++ * Return: ++ * non-zero : did not save PMDs (as part of stopping the PMU) ++ * 0 : saved PMDs (no need to save them in caller) ++ */ ++int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * disable lazy restore of PMCS on ctxswin because ++ * we modify some of them. ++ */ ++ ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; ++ ++ if (ctx->active_set->npend_ovfls) ++ ctx_arch->saved_real_iip = __get_cpu_var(real_iip); ++ ++ /* ++ * disable RDPMC on this CPU ++ */ ++ if (ctx_arch->flags.insecure) ++ clear_in_cr4(X86_CR4_PCE); ++ ++ if (ctx->state == PFM_CTX_MASKED) ++ return 1; ++ ++ return pmu_info->stop_save(ctx, ctx->active_set); ++} ++ ++/** ++ * pfm_arch_stop - deactivate monitoring ++ * @task: task to stop ++ * @ctx: context to stop ++ * ++ * Called from pfm_stop() ++ * Interrupts are masked. Context is locked. Set is the active set. ++ * ++ * For per-thread: ++ * task is not necessarily current. If not current task, then ++ * task is guaranteed stopped and off any cpu. Access to PMU ++ * is not guaranteed. ++ * ++ * For system-wide: ++ * task is current ++ * ++ * must disable active monitoring. ctx cannot be NULL ++ */ ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * no need to go through stop_save() ++ * if we are already stopped ++ */ ++ if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED) ++ return; ++ ++ if (task != current) ++ return; ++ ++ pmu_info->stop_save(ctx, ctx->active_set); ++} ++ ++ ++/** ++ * pfm_arch_start - activate monitoring ++ * @task: task to start ++ * @ctx: context to stop ++ * ++ * Interrupts are masked. Context is locked. ++ * ++ * For per-thread: ++ * Task is not necessarily current. 
If not current task, then task ++ * is guaranteed stopped and off any cpu. No access to PMU is task ++ * is not current. ++ * ++ * For system-wide: ++ * task is always current ++ */ ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) ++{ ++ struct pfm_event_set *set; ++ ++ set = ctx->active_set; ++ ++ if (task != current) ++ return; ++ ++ /* ++ * cannot restore PMC if no access to PMU. Will be done ++ * when the thread is switched back in ++ */ ++ ++ pfm_arch_restore_pmcs(ctx, set); ++} ++ ++/** ++ * pfm_arch_restore_pmds - reload PMD registers ++ * @ctx: context to restore from ++ * @set: current event set ++ * ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw() ++ * ++ * Context is locked. Interrupts are masked. Set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ */ ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ u16 i, num; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ num = set->nused_pmds; ++ ++ /* ++ * model-specific override ++ */ ++ if (pmu_info->restore_pmds) { ++ pmu_info->restore_pmds(ctx, set); ++ return; ++ } ++ ++ /* ++ * we can restore only the PMD we use because: ++ * ++ * - can only read with pfm_read_pmds() the registers ++ * declared used via pfm_write_pmds(), smpl_pmds, reset_pmds ++ * ++ * - if cr4.pce=1, only counters are exposed to user. 
RDPMC ++ * does not work with other types of PMU registers.Thus, no ++ * address is ever exposed by counters ++ * ++ * - there is never a dependency between one pmd register and ++ * another ++ */ ++ for (i = 0; num; i++) { ++ if (likely(test_bit(i, cast_ulp(set->used_pmds)))) { ++ pfm_write_pmd(ctx, i, set->pmds[i].value); ++ num--; ++ } ++ } ++} ++ ++/** ++ * pfm_arch_restore_pmcs - reload PMC registers ++ * @ctx: context to restore from ++ * @set: current event set ++ * ++ * function called from pfm_switch_sets(), pfm_context_load_thread(), ++ * pfm_context_load_sys(), pfm_ctxsw(). ++ * ++ * Context is locked. Interrupts are masked. set cannot be NULL. ++ * Access to the PMU is guaranteed. ++ * ++ * function must restore all PMC registers from set ++ */ ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ u64 *mask; ++ u16 i, num; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * we need to restore PMCs only when: ++ * - context is not masked ++ * - monitoring activated ++ * ++ * Masking monitoring after an overflow does not change the ++ * value of flags.started ++ */ ++ if (ctx->state == PFM_CTX_MASKED || !ctx->flags.started) ++ return; ++ ++ /* ++ * model-specific override ++ */ ++ if (pmu_info->restore_pmcs) { ++ pmu_info->restore_pmcs(ctx, set); ++ return; ++ } ++ /* ++ * restore all pmcs ++ * ++ * It is not possible to restore only the pmcs we used because ++ * certain PMU models (e.g. Pentium 4) have dependencies. Thus ++ * we do not want one application using stale PMC coming from ++ * another one. ++ * ++ * On PMU models where there is no dependencies between pmc, then ++ * it is possible to optimize by only restoring the registers that ++ * are used, and this can be done with the models-specific override ++ * for this function. 
++ * ++ * The default code takes the safest approach, i.e., assume the worse ++ */ ++ mask = ctx->regs.pmcs; ++ num = ctx->regs.num_pmcs; ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(mask))) { ++ pfm_arch_write_pmc(ctx, i, set->pmcs[i]); ++ num--; ++ } ++ } ++} ++ ++/** ++ * smp_pmu_interrupt - lowest level PMU interrupt handler for X86 ++ * @regs: machine state ++ * ++ * The PMU interrupt is handled through an interrupt gate, therefore ++ * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts. ++ * ++ * The perfmon interrupt handler MUST run with interrupts disabled due ++ * to possible race with other, higher priority interrupts, such as timer ++ * or IPI function calls. ++ * ++ * See description in IA-32 architecture manual, Vol 3 section 5.8.1 ++ */ ++void smp_pmu_interrupt(struct pt_regs *regs) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ struct pfm_context *ctx; ++ unsigned long iip; ++ int using_nmi; ++ ++ using_nmi = __get_cpu_var(pfm_using_nmi); ++ ++ ack_APIC_irq(); ++ ++ irq_enter(); ++ ++ /* ++ * when using NMI, pfm_handle_nmi() gets called ++ * first. It stops monitoring and record the ++ * iip into real_iip, then it repost the interrupt ++ * using the lower priority vector LOCAL_PERFMON_VECTOR ++ * ++ * On some processors, e.g., P4, it may be that some ++ * state is already recorded from pfm_handle_nmi() ++ * and it only needs to be copied back into the normal ++ * fields so it can be used transparently by higher level ++ * code. ++ */ ++ if (using_nmi) { ++ ctx = __get_cpu_var(pmu_ctx); ++ pmu_info = pfm_pmu_info(); ++ iip = __get_cpu_var(real_iip); ++ if (ctx && pmu_info->nmi_copy_state) ++ pmu_info->nmi_copy_state(ctx); ++ } else ++ iip = instruction_pointer(regs); ++ ++ pfm_interrupt_handler(iip, regs); ++ ++ /* ++ * On Intel P6, Pentium M, P4, Intel Core: ++ * - it is necessary to clear the MASK field for the LVTPC ++ * vector. Otherwise interrupts remain masked. 
See ++ * section 8.5.1 ++ * AMD X86-64: ++ * - the documentation does not stipulate the behavior. ++ * To be safe, we also rewrite the vector to clear the ++ * mask field ++ */ ++ if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL) ++ apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR); ++ ++ irq_exit(); ++} ++ ++/** ++ * pfm_handle_nmi - PMU NMI handler notifier callback ++ * @nb ; notifier block ++ * @val: type of die notifier ++ * @data: die notifier-specific data ++ * ++ * called from notify_die() notifier from an trap handler path. We only ++ * care about NMI related callbacks, and ignore everything else. ++ * ++ * Cannot grab any locks, include the perfmon context lock ++ * ++ * Must detect if NMI interrupt comes from perfmon, and if so it must ++ * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt ++ * handler needs to grab the context lock, thus is cannot be run directly ++ * from the NMI interrupt call path. ++ */ ++static int __kprobes pfm_handle_nmi(struct notifier_block *nb, ++ unsigned long val, ++ void *data) ++{ ++ struct die_args *args = data; ++ struct pfm_context *ctx; ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ /* ++ * only NMI related calls ++ */ ++ if (val != DIE_NMI_IPI) ++ return NOTIFY_DONE; ++ ++ /* ++ * perfmon not using NMI ++ */ ++ if (!__get_cpu_var(pfm_using_nmi)) ++ return NOTIFY_DONE; ++ ++ /* ++ * No context ++ */ ++ ctx = __get_cpu_var(pmu_ctx); ++ if (!ctx) { ++ PFM_DBG_ovfl("no ctx"); ++ return NOTIFY_DONE; ++ } ++ ++ /* ++ * Detect if we have overflows, i.e., NMI interrupt ++ * caused by PMU ++ */ ++ pmu_info = pfm_pmu_conf->pmu_info; ++ if (!pmu_info->has_ovfls(ctx)) { ++ PFM_DBG_ovfl("no ovfl"); ++ return NOTIFY_DONE; ++ } ++ ++ /* ++ * we stop the PMU to avoid further overflow before this ++ * one is treated by lower priority interrupt handler ++ */ ++ pmu_info->quiesce(); ++ ++ /* ++ * record actual instruction pointer ++ */ ++ __get_cpu_var(real_iip) = instruction_pointer(args->regs); ++ ++ 
/* ++ * post lower priority interrupt (LOCAL_PERFMON_VECTOR) ++ */ ++ pfm_arch_resend_irq(ctx); ++ ++ pfm_stats_inc(ovfl_intr_nmi_count); ++ ++ /* ++ * we need to rewrite the APIC vector on Intel ++ */ ++ if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL) ++ apic_write(APIC_LVTPC, APIC_DM_NMI); ++ ++ /* ++ * the notification was for us ++ */ ++ return NOTIFY_STOP; ++} ++ ++static struct notifier_block pfm_nmi_nb = { ++ .notifier_call = pfm_handle_nmi ++}; ++ ++/** ++ * pfm_arch_get_pmu_module_name - get PMU description module name for autoload ++ * ++ * called from pfm_pmu_request_module ++ */ ++char *pfm_arch_get_pmu_module_name(void) ++{ ++ switch (current_cpu_data.x86) { ++ case 6: ++ switch (current_cpu_data.x86_model) { ++ case 3: /* Pentium II */ ++ case 7 ... 11: ++ case 13: ++ return "perfmon_p6"; ++ case 15: /* Merom */ ++ case 23: /* Penryn */ ++ return "perfmon_intel_core"; ++ case 28: /* Atom/Silverthorne */ ++ return "perfmon_intel_atom"; ++ case 29: /* Dunnington */ ++ return "perfmon_intel_core"; ++ default: ++ goto try_arch; ++ } ++ case 15: ++ case 16: ++ /* All Opteron processors */ ++ if (current_cpu_data.x86_vendor == X86_VENDOR_AMD) ++ return "perfmon_amd64"; ++ ++ switch (current_cpu_data.x86_model) { ++ case 0 ... 6: ++ return "perfmon_p4"; ++ } ++ /* FALL THROUGH */ ++ default: ++try_arch: ++ if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) ++ return "perfmon_intel_arch"; ++ return NULL; ++ } ++ return NULL; ++} ++ ++/** ++ * pfm_arch_resend_irq - post perfmon interrupt on regular vector ++ * ++ * called from pfm_ctxswin_thread() and pfm_handle_nmi() ++ */ ++void pfm_arch_resend_irq(struct pfm_context *ctx) ++{ ++ unsigned long val, dest; ++ /* ++ * we cannot use hw_resend_irq() because it goes to ++ * the I/O APIC. We need to go to the Local APIC. ++ * ++ * The "int vec" is not the right solution either ++ * because it triggers a software intr. We need ++ * to regenerate the interrupt and have it pended ++ * until we unmask interrupts. 
++ * ++ * Instead we send ourself an IPI on the perfmon ++ * vector. ++ */ ++ val = APIC_DEST_SELF|APIC_INT_ASSERT| ++ APIC_DM_FIXED|LOCAL_PERFMON_VECTOR; ++ ++ dest = apic_read(APIC_ID); ++ apic_write(APIC_ICR2, dest); ++ apic_write(APIC_ICR, val); ++} ++ ++/** ++ * pfm_arch_pmu_acquire_percpu - setup APIC per CPU ++ * @data: contains pmu flags ++ */ ++static void pfm_arch_pmu_acquire_percpu(void *data) ++{ ++ ++ struct pfm_arch_pmu_info *pmu_info; ++ unsigned int tmp, vec; ++ unsigned long flags = (unsigned long)data; ++ unsigned long lvtpc; ++ ++ pmu_info = pfm_pmu_conf->pmu_info; ++ ++ /* ++ * we only reprogram the LVTPC vector if we have detected ++ * no sharing, otherwise it means the APIC is already programmed ++ * and we use whatever vector (likely NMI) is there ++ */ ++ if (!(flags & PFM_X86_FL_SHARING)) { ++ if (flags & PFM_X86_FL_USE_NMI) ++ vec = APIC_DM_NMI; ++ else ++ vec = LOCAL_PERFMON_VECTOR; ++ ++ tmp = apic_read(APIC_LVTERR); ++ apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED); ++ apic_write(APIC_LVTPC, vec); ++ apic_write(APIC_LVTERR, tmp); ++ } ++ lvtpc = (unsigned long)apic_read(APIC_LVTPC); ++ ++ __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI; ++ ++ PFM_DBG("LTVPC=0x%lx using_nmi=%d", lvtpc, __get_cpu_var(pfm_using_nmi)); ++ ++ /* ++ * invoke model specific acquire routine. 
May be used for ++ * model-specific initializations ++ */ ++ if (pmu_info->acquire_pmu_percpu) ++ pmu_info->acquire_pmu_percpu(); ++} ++ ++/** ++ * pfm_arch_pmu_acquire - acquire PMU resource from system ++ * @unavail_pmcs : bitmask to use to set unavailable pmcs ++ * @unavail_pmds : bitmask to use to set unavailable pmds ++ * ++ * interrupts are not masked ++ * ++ * Grab PMU registers from lower level MSR allocator ++ * ++ * Program the APIC according the possible interrupt vector ++ * either LOCAL_PERFMON_VECTOR or NMI ++ */ ++int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ struct pfm_regmap_desc *d; ++ u16 i, nlost; ++ ++ pmu_info = pfm_pmu_conf->pmu_info; ++ pmu_info->flags &= ~PFM_X86_FL_SHARING; ++ ++ nlost = 0; ++ ++ d = pfm_pmu_conf->pmc_desc; ++ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { ++ if (!(d->type & PFM_REG_I)) ++ continue; ++ ++ if (d->type & PFM_REG_V) ++ continue; ++ /* ++ * reserve register with lower-level allocator ++ */ ++ if (!reserve_evntsel_nmi(d->hw_addr)) { ++ PFM_DBG("pmc%d(%s) already used", i, d->desc); ++ __set_bit(i, cast_ulp(unavail_pmcs)); ++ nlost++; ++ continue; ++ } ++ } ++ PFM_DBG("nlost=%d info_flags=0x%x\n", nlost, pmu_info->flags); ++ /* ++ * some PMU models (e.g., P6) do not support sharing ++ * so check if we found less than the expected number of PMC registers ++ */ ++ if (nlost) { ++ if (pmu_info->flags & PFM_X86_FL_NO_SHARING) { ++ PFM_INFO("PMU already used by another subsystem, " ++ "PMU does not support sharing, " ++ "try disabling Oprofile or " ++ "reboot with nmi_watchdog=0"); ++ goto undo; ++ } ++ pmu_info->flags |= PFM_X86_FL_SHARING; ++ } ++ ++ d = pfm_pmu_conf->pmd_desc; ++ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) { ++ if (!(d->type & PFM_REG_I)) ++ continue; ++ ++ if (d->type & PFM_REG_V) ++ continue; ++ ++ if (!reserve_perfctr_nmi(d->hw_addr)) { ++ PFM_DBG("pmd%d(%s) already used", i, d->desc); ++ __set_bit(i, 
cast_ulp(unavail_pmds)); ++ } ++ } ++ /* ++ * program APIC on each CPU ++ */ ++ on_each_cpu(pfm_arch_pmu_acquire_percpu, ++ (void *)(unsigned long)pmu_info->flags , 1); ++ ++ return 0; ++undo: ++ /* ++ * must undo reservation of pmcs in case of error ++ */ ++ d = pfm_pmu_conf->pmc_desc; ++ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { ++ if (!(d->type & (PFM_REG_I|PFM_REG_V))) ++ continue; ++ if (!test_bit(i, cast_ulp(unavail_pmcs))) ++ release_evntsel_nmi(d->hw_addr); ++ } ++ return -EBUSY; ++} ++/** ++ * pfm-arch_pmu_release_percpu - clear NMI state for one CPU ++ * ++ */ ++static void pfm_arch_pmu_release_percpu(void *data) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ pmu_info = pfm_pmu_conf->pmu_info; ++ ++ __get_cpu_var(pfm_using_nmi) = 0; ++ ++ /* ++ * invoke model specific release routine. ++ * May be used to undo certain initializations ++ * or free some model-specific ressources. ++ */ ++ if (pmu_info->release_pmu_percpu) ++ pmu_info->release_pmu_percpu(); ++} ++ ++/** ++ * pfm_arch_pmu_release - release PMU resource to system ++ * ++ * called from pfm_pmu_release() ++ * interrupts are not masked ++ * ++ * On x86, we return the PMU registers to the MSR allocator ++ */ ++void pfm_arch_pmu_release(void) ++{ ++ struct pfm_regmap_desc *d; ++ u16 i, n; ++ ++ d = pfm_pmu_conf->pmc_desc; ++ n = pfm_pmu_conf->regs_all.num_pmcs; ++ for (i = 0; n; i++, d++) { ++ if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ continue; ++ release_evntsel_nmi(d->hw_addr); ++ n--; ++ PFM_DBG("pmc%u released", i); ++ } ++ d = pfm_pmu_conf->pmd_desc; ++ n = pfm_pmu_conf->regs_all.num_pmds; ++ for (i = 0; n; i++, d++) { ++ if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmds))) ++ continue; ++ release_perfctr_nmi(d->hw_addr); ++ n--; ++ PFM_DBG("pmd%u released", i); ++ } ++ ++ /* clear NMI variable if used */ ++ if (__get_cpu_var(pfm_using_nmi)) ++ on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1); ++} ++ ++/** ++ * pfm_arch_pmu_config_init - validate PMU 
description structure ++ * @cfg: PMU description structure ++ * ++ * return: ++ * 0 if valid ++ * errno otherwise ++ * ++ * called from pfm_pmu_register() ++ */ ++int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ pmu_info = pfm_pmu_info(); ++ if (!pmu_info) { ++ PFM_DBG("%s missing pmu_info", cfg->pmu_name); ++ return -EINVAL; ++ } ++ if (!pmu_info->has_ovfls) { ++ PFM_DBG("%s missing has_ovfls callback", cfg->pmu_name); ++ return -EINVAL; ++ } ++ if (!pmu_info->quiesce) { ++ PFM_DBG("%s missing quiesce callback", cfg->pmu_name); ++ return -EINVAL; ++ } ++ if (!pmu_info->stop_save) { ++ PFM_DBG("%s missing stop_save callback", cfg->pmu_name); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/** ++ * pfm_arch_init - one time global arch-specific initialization ++ * ++ * called from pfm_init() ++ */ ++int __init pfm_arch_init(void) ++{ ++ /* ++ * we need to register our NMI handler when the kernels boots ++ * to avoid a deadlock condition with the NMI watchdog or Oprofile ++ * if we were to try and register/unregister on-demand. ++ */ ++ register_die_notifier(&pfm_nmi_nb); ++ return 0; ++} +diff --git a/arch/x86/perfmon/perfmon_amd64.c b/arch/x86/perfmon/perfmon_amd64.c +new file mode 100644 +index 0000000..f9b5f9c +--- /dev/null ++++ b/arch/x86/perfmon/perfmon_amd64.c +@@ -0,0 +1,754 @@ ++/* ++ * This file contains the PMU description for the Athlon64 and Opteron64 ++ * processors. It supports 32 and 64-bit modes. ++ * ++ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * Copyright (c) 2007 Advanced Micro Devices, Inc. ++ * Contributed by Robert Richter <robert.richter@amd.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/vmalloc.h> ++#include <linux/topology.h> ++#include <linux/kprobes.h> ++#include <linux/pci.h> ++#include <linux/perfmon_kern.h> ++#include <asm/hw_irq.h> ++#include <asm/apic.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_AUTHOR("Robert Richter <robert.richter@amd.com>"); ++MODULE_DESCRIPTION("AMD64 PMU description table"); ++MODULE_LICENSE("GPL"); ++ ++#define PCI_DEVICE_ID_AMD_10H_NB_MISC 0x1203 ++ ++static int force_nmi; ++MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); ++module_param(force_nmi, bool, 0600); ++ ++#define HAS_IBS 0x01 /* has IBS support */ ++ ++static u8 ibs_eilvt_off, ibs_status; /* AMD: extended interrupt LVT offset */ ++ ++static void pfm_amd64_restore_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++static void __kprobes pfm_amd64_quiesce(void); ++static int pfm_amd64_has_ovfls(struct pfm_context *ctx); ++static int pfm_amd64_stop_save(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++#define IBSFETCHCTL_PMC 4 /* pmc4 */ ++#define IBSFETCHCTL_PMD 4 /* pmd4 */ ++#define IBSOPSCTL_PMC 5 /* pmc5 */ ++#define IBSOPSCTL_PMD 7 /* pmd7 */ ++ ++static u64 enable_mask[PFM_MAX_PMCS]; ++static u16 max_enable; ++ ++static struct pfm_arch_pmu_info pfm_amd64_pmu_info = { ++ .stop_save = pfm_amd64_stop_save, ++ .has_ovfls = pfm_amd64_has_ovfls, ++ .quiesce = pfm_amd64_quiesce, ++ .restore_pmcs = pfm_amd64_restore_pmcs ++}; ++ ++#define PFM_AMD64_IBSFETCHVAL 
(1ULL<<49) /* valid fetch sample */ ++#define PFM_AMD64_IBSFETCHEN (1ULL<<48) /* fetch sampling enabled */ ++#define PFM_AMD64_IBSOPVAL (1ULL<<18) /* valid execution sample */ ++#define PFM_AMD64_IBSOPEN (1ULL<<17) /* execution sampling enabled */ ++ ++/* ++ * force Local APIC interrupt on overflow ++ */ ++#define PFM_K8_VAL (1ULL<<20) ++#define PFM_K8_NO64 (1ULL<<20) ++ ++/* ++ * reserved bits must be 1 ++ * ++ * for family 15: ++ * - upper 32 bits are reserved ++ * - bit 20, bit 21 ++ * ++ * for family 16: ++ * - bits 36-39 are reserved ++ * - bits 42-63 are reserved ++ * - bit 20, bit 21 ++ * ++ * for IBS registers: ++ * IBSFETCHCTL: all bits are reserved except bits 57, 48, 15:0 ++ * IBSOPSCTL : all bits are reserved except bits 17, 15:0 ++ */ ++#define PFM_K8_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21)) ++#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21)) ++#define PFM_AMD64_IBSFETCHCTL_RSVD (~((1ULL<<48)|(1ULL<<57)|0xffffULL)) ++#define PFM_AMD64_IBSOPCTL_RSVD (~((1ULL<<17)|0xffffULL)) ++ ++static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = { ++/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0), ++/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1), ++/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2), ++/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3), ++/* pmc4 */ PMC_D(PFM_REG_I, "IBSFETCHCTL", 0, PFM_AMD64_IBSFETCHCTL_RSVD, 0, MSR_AMD64_IBSFETCHCTL), ++/* pmc5 */ PMC_D(PFM_REG_I, "IBSOPCTL", 0, PFM_AMD64_IBSOPCTL_RSVD, 0, MSR_AMD64_IBSOPCTL), ++}; ++#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc) ++ ++#define PFM_REG_IBS (PFM_REG_I|PFM_REG_INTR) ++ ++/* ++ * AMD64 counters are 48 bits, upper bits are reserved ++ */ ++#define PFM_AMD64_CTR_RSVD (~((1ULL<<48)-1)) ++ ++#define PFM_AMD_D(n) \ ++ { .type = PFM_REG_C, \ ++ .desc = 
"PERFCTR"#n, \ ++ .hw_addr = MSR_K7_PERFCTR0+n, \ ++ .rsvd_msk = PFM_AMD64_CTR_RSVD, \ ++ .dep_pmcs[0] = 1ULL << n \ ++ } ++ ++#define PFM_AMD_IBSO(t, s, a) \ ++ { .type = t, \ ++ .desc = s, \ ++ .hw_addr = a, \ ++ .rsvd_msk = 0, \ ++ .dep_pmcs[0] = 1ULL << 5 \ ++ } ++ ++#define PFM_AMD_IBSF(t, s, a) \ ++ { .type = t, \ ++ .desc = s, \ ++ .hw_addr = a, \ ++ .rsvd_msk = 0, \ ++ .dep_pmcs[0] = 1ULL << 6 \ ++ } ++ ++static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = { ++/* pmd0 */ PFM_AMD_D(0), ++/* pmd1 */ PFM_AMD_D(1), ++/* pmd2 */ PFM_AMD_D(2), ++/* pmd3 */ PFM_AMD_D(3), ++/* pmd4 */ PFM_AMD_IBSF(PFM_REG_IBS, "IBSFETCHCTL", MSR_AMD64_IBSFETCHCTL), ++/* pmd5 */ PFM_AMD_IBSF(PFM_REG_IRO, "IBSFETCHLINAD", MSR_AMD64_IBSFETCHLINAD), ++/* pmd6 */ PFM_AMD_IBSF(PFM_REG_IRO, "IBSFETCHPHYSAD", MSR_AMD64_IBSFETCHPHYSAD), ++/* pmd7 */ PFM_AMD_IBSO(PFM_REG_IBS, "IBSOPCTL", MSR_AMD64_IBSOPCTL), ++/* pmd8 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPRIP", MSR_AMD64_IBSOPRIP), ++/* pmd9 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA", MSR_AMD64_IBSOPDATA), ++/* pmd10 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA2", MSR_AMD64_IBSOPDATA2), ++/* pmd11 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA3", MSR_AMD64_IBSOPDATA3), ++/* pmd12 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSDCLINAD", MSR_AMD64_IBSDCLINAD), ++/* pmd13 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSDCPHYSAD", MSR_AMD64_IBSDCPHYSAD), ++}; ++#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc) ++ ++static struct pfm_context **pfm_nb_sys_owners; ++static struct pfm_context *pfm_nb_task_owner; ++ ++static struct pfm_pmu_config pfm_amd64_pmu_conf; ++ ++#define is_ibs_pmc(x) (x == 4 || x == 5) ++ ++static void pfm_amd64_setup_eilvt_per_cpu(void *info) ++{ ++ u8 lvt_off; ++ ++ /* program the IBS vector to the perfmon vector */ ++ lvt_off = setup_APIC_eilvt_ibs(LOCAL_PERFMON_VECTOR, ++ APIC_EILVT_MSG_FIX, 0); ++ PFM_DBG("APIC_EILVT%d set to 0x%x", lvt_off, LOCAL_PERFMON_VECTOR); ++ ibs_eilvt_off = lvt_off; ++} ++ ++static int pfm_amd64_setup_eilvt(void) ++{ ++#define 
IBSCTL_LVTOFFSETVAL (1 << 8) ++#define IBSCTL 0x1cc ++ struct pci_dev *cpu_cfg; ++ int nodes; ++ u32 value = 0; ++ ++ /* per CPU setup */ ++ on_each_cpu(pfm_amd64_setup_eilvt_per_cpu, NULL, 1); ++ ++ nodes = 0; ++ cpu_cfg = NULL; ++ do { ++ cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, ++ PCI_DEVICE_ID_AMD_10H_NB_MISC, ++ cpu_cfg); ++ if (!cpu_cfg) ++ break; ++ ++nodes; ++ pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off ++ | IBSCTL_LVTOFFSETVAL); ++ pci_read_config_dword(cpu_cfg, IBSCTL, &value); ++ if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { ++ PFM_DBG("Failed to setup IBS LVT offset, " ++ "IBSCTL = 0x%08x", value); ++ return 1; ++ } ++ } while (1); ++ ++ if (!nodes) { ++ PFM_DBG("No CPU node configured for IBS"); ++ return 1; ++ } ++ ++#ifdef CONFIG_NUMA ++ /* Sanity check */ ++ /* Works only for 64bit with proper numa implementation. */ ++ if (nodes != num_possible_nodes()) { ++ PFM_DBG("Failed to setup CPU node(s) for IBS, " ++ "found: %d, expected %d", ++ nodes, num_possible_nodes()); ++ return 1; ++ } ++#endif ++ return 0; ++} ++ ++/* ++ * There can only be one user per socket for the Northbridge (NB) events, ++ * so we enforce mutual exclusion as follows: ++ * - per-thread : only one context machine-wide can use NB events ++ * - system-wide: only one context per processor socket ++ * ++ * Exclusion is enforced at: ++ * - pfm_load_context() ++ * - pfm_write_pmcs() for attached contexts ++ * ++ * Exclusion is released at: ++ * - pfm_unload_context() or any calls that implicitely uses it ++ * ++ * return: ++ * 0 : successfully acquire NB access ++ * < 0: errno, failed to acquire NB access ++ */ ++static int pfm_amd64_acquire_nb(struct pfm_context *ctx) ++{ ++ struct pfm_context **entry, *old; ++ int proc_id; ++ ++#ifdef CONFIG_SMP ++ proc_id = cpu_data(smp_processor_id()).phys_proc_id; ++#else ++ proc_id = 0; ++#endif ++ ++ if (ctx->flags.system) ++ entry = &pfm_nb_sys_owners[proc_id]; ++ else ++ entry = &pfm_nb_task_owner; ++ ++ old = 
cmpxchg(entry, NULL, ctx); ++ if (!old) { ++ if (ctx->flags.system) ++ PFM_DBG("acquired Northbridge event access on socket %u", proc_id); ++ else ++ PFM_DBG("acquired Northbridge event access globally"); ++ } else if (old != ctx) { ++ if (ctx->flags.system) ++ PFM_DBG("NorthBridge event conflict on socket %u", proc_id); ++ else ++ PFM_DBG("global NorthBridge event conflict"); ++ return -EBUSY; ++ } ++ return 0; ++} ++ ++/* ++ * invoked from pfm_write_pmcs() when pfm_nb_sys_owners is not NULL,i.e., ++ * when we have detected a multi-core processor. ++ * ++ * context is locked, interrupts are masked ++ */ ++static int pfm_amd64_pmc_write_check(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req) ++{ ++ unsigned int event; ++ ++ /* ++ * delay checking NB event until we load the context ++ */ ++ if (ctx->state == PFM_CTX_UNLOADED) ++ return 0; ++ ++ /* ++ * check event is NB event ++ */ ++ event = (unsigned int)(req->reg_value & 0xff); ++ if (event < 0xee) ++ return 0; ++ ++ return pfm_amd64_acquire_nb(ctx); ++} ++ ++/* ++ * invoked on pfm_load_context(). 
++ * context is locked, interrupts are masked ++ */ ++static int pfm_amd64_load_context(struct pfm_context *ctx) ++{ ++ struct pfm_event_set *set; ++ unsigned int i, n; ++ ++ /* ++ * scan all sets for NB events ++ */ ++ list_for_each_entry(set, &ctx->set_list, list) { ++ n = set->nused_pmcs; ++ for (i = 0; n; i++) { ++ if (!test_bit(i, cast_ulp(set->used_pmcs))) ++ continue; ++ ++ if (!is_ibs_pmc(i) && (set->pmcs[i] & 0xff) >= 0xee) ++ goto found; ++ n--; ++ } ++ } ++ return 0; ++found: ++ return pfm_amd64_acquire_nb(ctx); ++} ++ ++/* ++ * invoked on pfm_unload_context() ++ */ ++static void pfm_amd64_unload_context(struct pfm_context *ctx) ++{ ++ struct pfm_context **entry, *old; ++ int proc_id; ++ ++#ifdef CONFIG_SMP ++ proc_id = cpu_data(smp_processor_id()).phys_proc_id; ++#else ++ proc_id = 0; ++#endif ++ ++ /* ++ * unload always happens on the monitored CPU in system-wide ++ */ ++ if (ctx->flags.system) ++ entry = &pfm_nb_sys_owners[proc_id]; ++ else ++ entry = &pfm_nb_task_owner; ++ ++ old = cmpxchg(entry, ctx, NULL); ++ if (old == ctx) { ++ if (ctx->flags.system) ++ PFM_DBG("released NorthBridge on socket %u", proc_id); ++ else ++ PFM_DBG("released NorthBridge events globally"); ++ } ++} ++ ++/* ++ * detect if we need to activate NorthBridge event access control ++ */ ++static int pfm_amd64_setup_nb_event_control(void) ++{ ++ unsigned int c, n = 0; ++ unsigned int max_phys = 0; ++ ++#ifdef CONFIG_SMP ++ for_each_possible_cpu(c) { ++ if (cpu_data(c).phys_proc_id > max_phys) ++ max_phys = cpu_data(c).phys_proc_id; ++ } ++#else ++ max_phys = 0; ++#endif ++ if (max_phys > 255) { ++ PFM_INFO("socket id %d is too big to handle", max_phys); ++ return -ENOMEM; ++ } ++ ++ n = max_phys + 1; ++ if (n < 2) ++ return 0; ++ ++ pfm_nb_sys_owners = vmalloc(n * sizeof(*pfm_nb_sys_owners)); ++ if (!pfm_nb_sys_owners) ++ return -ENOMEM; ++ ++ memset(pfm_nb_sys_owners, 0, n * sizeof(*pfm_nb_sys_owners)); ++ pfm_nb_task_owner = NULL; ++ ++ /* ++ * activate write-checker for PMC 
registers ++ */ ++ for (c = 0; c < PFM_AMD_NUM_PMCS; c++) { ++ if (!is_ibs_pmc(c)) ++ pfm_amd64_pmc_desc[c].type |= PFM_REG_WC; ++ } ++ ++ pfm_amd64_pmu_info.load_context = pfm_amd64_load_context; ++ pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context; ++ ++ pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check; ++ ++ PFM_INFO("NorthBridge event access control enabled"); ++ ++ return 0; ++} ++ ++/* ++ * disable registers which are not available on ++ * the host (applies to IBS registers) ++ */ ++static void pfm_amd64_check_registers(void) ++{ ++ u16 i; ++ ++ PFM_DBG("has_ibs=%d", !!(ibs_status & HAS_IBS)); ++ ++ __set_bit(0, cast_ulp(enable_mask)); ++ __set_bit(1, cast_ulp(enable_mask)); ++ __set_bit(2, cast_ulp(enable_mask)); ++ __set_bit(3, cast_ulp(enable_mask)); ++ max_enable = 3+1; ++ ++ ++ /* ++ * remove IBS registers if feature not present ++ */ ++ if (!(ibs_status & HAS_IBS)) { ++ pfm_amd64_pmc_desc[4].type = PFM_REG_NA; ++ pfm_amd64_pmc_desc[5].type = PFM_REG_NA; ++ for (i = 4; i < 14; i++) ++ pfm_amd64_pmd_desc[i].type = PFM_REG_NA; ++ } else { ++ __set_bit(16, cast_ulp(enable_mask)); ++ __set_bit(17, cast_ulp(enable_mask)); ++ max_enable = 17 + 1; ++ } ++ ++ /* ++ * adjust reserved bit fields for family 16 ++ */ ++ if (current_cpu_data.x86 == 16) { ++ for (i = 0; i < PFM_AMD_NUM_PMCS; i++) ++ if (pfm_amd64_pmc_desc[i].rsvd_msk == PFM_K8_RSVD) ++ pfm_amd64_pmc_desc[i].rsvd_msk = PFM_16_RSVD; ++ } ++} ++ ++static int pfm_amd64_probe_pmu(void) ++{ ++ u64 val = 0; ++ if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) { ++ PFM_INFO("not an AMD processor"); ++ return -1; ++ } ++ ++ switch (current_cpu_data.x86) { ++ case 16: ++ case 15: ++ case 6: ++ break; ++ default: ++ PFM_INFO("unsupported family=%d", current_cpu_data.x86); ++ return -1; ++ } ++ ++ /* check for IBS */ ++ if (cpu_has(¤t_cpu_data, X86_FEATURE_IBS)) { ++ ibs_status |= HAS_IBS; ++ rdmsrl(MSR_AMD64_IBSCTL, val); ++ } ++ ++ PFM_INFO("found family=%d IBSCTL=0x%llx", 
current_cpu_data.x86, (unsigned long long)val); ++ ++ /* ++ * check for local APIC (required) ++ */ ++ if (!cpu_has_apic) { ++ PFM_INFO("no local APIC, unsupported"); ++ return -1; ++ } ++ ++ if (current_cpu_data.x86_max_cores > 1 ++ && pfm_amd64_setup_nb_event_control()) ++ return -1; ++ ++ if (force_nmi) ++ pfm_amd64_pmu_info.flags |= PFM_X86_FL_USE_NMI; ++ ++ if (ibs_status & HAS_IBS) { ++ /* Setup extended interrupt */ ++ if (pfm_amd64_setup_eilvt()) { ++ PFM_INFO("Failed to initialize extended interrupts " ++ "for IBS"); ++ ibs_status &= ~HAS_IBS; ++ PFM_INFO("Unable to use IBS"); ++ } else { ++ PFM_INFO("IBS supported"); ++ } ++ } ++ ++ pfm_amd64_check_registers(); ++ ++ return 0; ++} ++ ++/* ++ * detect is counters have overflowed. ++ * return: ++ * 0 : no overflow ++ * 1 : at least one overflow ++ */ ++static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx) ++{ ++ struct pfm_regmap_desc *xrd; ++ u64 *cnt_mask; ++ u64 wmask, val; ++ u16 i, num; ++ ++ /* ++ * Check for IBS events ++ */ ++ if (ibs_status & HAS_IBS) { ++ rdmsrl(MSR_AMD64_IBSFETCHCTL, val); ++ if (val & PFM_AMD64_IBSFETCHVAL) ++ return 1; ++ rdmsrl(MSR_AMD64_IBSOPCTL, val); ++ if (val & PFM_AMD64_IBSOPVAL) ++ return 1; ++ } ++ /* ++ * Check regular counters ++ */ ++ cnt_mask = ctx->regs.cnt_pmds; ++ num = ctx->regs.num_counters; ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ xrd = pfm_amd64_pmd_desc; ++ ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(cnt_mask))) { ++ rdmsrl(xrd[i].hw_addr, val); ++ if (!(val & wmask)) ++ return 1; ++ num--; ++ } ++ } ++ return 0; ++} ++ ++/* ++ * Must check for IBS event BEFORE stop_save_p6 because ++ * stopping monitoring does destroy IBS state information ++ * in IBSFETCHCTL/IBSOPCTL because they are tagged as enable ++ * registers. 
++ */ ++static int pfm_amd64_stop_save(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ u64 used_mask[PFM_PMC_BV]; ++ u64 *cnt_pmds; ++ u64 val, wmask, ovfl_mask; ++ u32 i, count, use_ibs; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * IBS used if: ++ * - on family 10h processor with IBS ++ * - at least one of the IBS PMD registers is used ++ */ ++ use_ibs = (ibs_status & HAS_IBS) ++ && (test_bit(IBSFETCHCTL_PMD, cast_ulp(set->used_pmds)) ++ || test_bit(IBSOPSCTL_PMD, cast_ulp(set->used_pmds))); ++ ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ ++ bitmap_and(cast_ulp(used_mask), ++ cast_ulp(set->used_pmcs), ++ cast_ulp(enable_mask), ++ max_enable); ++ ++ count = bitmap_weight(cast_ulp(used_mask), max_enable); ++ ++ /* ++ * stop monitoring ++ * Unfortunately, this is very expensive! ++ * wrmsrl() is serializing. ++ * ++ * With IBS, we need to do read-modify-write to preserve the content ++ * for OpsCTL and FetchCTL because they are also used as PMDs and saved ++ * below ++ */ ++ if (use_ibs) { ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(used_mask))) { ++ if (i == IBSFETCHCTL_PMC) { ++ rdmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val); ++ val &= ~PFM_AMD64_IBSFETCHEN; ++ } else if (i == IBSOPSCTL_PMC) { ++ rdmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val); ++ val &= ~PFM_AMD64_IBSOPEN; ++ } else ++ val = 0; ++ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val); ++ count--; ++ } ++ } ++ } else { ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(used_mask))) { ++ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); ++ count--; ++ } ++ } ++ } ++ ++ /* ++ * if we already having a pending overflow condition, we simply ++ * return to take care of this first. 
++ */ ++ if (set->npend_ovfls) ++ return 1; ++ ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ cnt_pmds = ctx->regs.cnt_pmds; ++ ++ /* ++ * check for pending overflows and save PMDs (combo) ++ * we employ used_pmds because we also need to save ++ * and not just check for pending interrupts. ++ * ++ * Must check for counting PMDs because of virtual PMDs and IBS ++ */ ++ count = set->nused_pmds; ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(set->used_pmds))) { ++ val = pfm_arch_read_pmd(ctx, i); ++ if (likely(test_bit(i, cast_ulp(cnt_pmds)))) { ++ if (!(val & wmask)) { ++ __set_bit(i, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ } ++ val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask); ++ } ++ set->pmds[i].value = val; ++ count--; ++ } ++ } ++ ++ /* ++ * check if IBS contains valid data, and mark the corresponding ++ * PMD has overflowed ++ */ ++ if (use_ibs) { ++ if (set->pmds[IBSFETCHCTL_PMD].value & PFM_AMD64_IBSFETCHVAL) { ++ __set_bit(IBSFETCHCTL_PMD, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ } ++ if (set->pmds[IBSOPSCTL_PMD].value & PFM_AMD64_IBSOPVAL) { ++ __set_bit(IBSOPSCTL_PMD, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ } ++ } ++ /* 0 means: no need to save PMDs at upper level */ ++ return 0; ++} ++ ++/** ++ * pfm_amd64_quiesce_pmu -- stop monitoring without grabbing any lock ++ * ++ * called from NMI interrupt handler to immediately stop monitoring ++ * cannot grab any lock, including perfmon related locks ++ */ ++static void __kprobes pfm_amd64_quiesce(void) ++{ ++ /* ++ * quiesce PMU by clearing available registers that have ++ * the start/stop capability ++ */ ++ if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_K7_EVNTSEL0, 0); ++ if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_K7_EVNTSEL0+1, 0); ++ if (test_bit(2, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_K7_EVNTSEL0+2, 0); ++ if (test_bit(3, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ 
wrmsrl(MSR_K7_EVNTSEL0+3, 0); ++ ++ if (test_bit(4, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); ++ if (test_bit(5, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_AMD64_IBSOPCTL, 0); ++} ++ ++/** ++ * pfm_amd64_restore_pmcs - reload PMC registers ++ * @ctx: context to restore from ++ * @set: current event set ++ * ++ * optimized version of pfm_arch_restore_pmcs(). On AMD64, we can ++ * afford to only restore the pmcs registers we use, because they are ++ * all independent from each other. ++ */ ++static void pfm_amd64_restore_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ u64 *mask; ++ u16 i, num; ++ ++ mask = set->used_pmcs; ++ num = set->nused_pmcs; ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(mask))) { ++ wrmsrl(pfm_amd64_pmc_desc[i].hw_addr, set->pmcs[i]); ++ num--; ++ } ++ } ++} ++ ++static struct pfm_pmu_config pfm_amd64_pmu_conf = { ++ .pmu_name = "AMD64", ++ .counter_width = 47, ++ .pmd_desc = pfm_amd64_pmd_desc, ++ .pmc_desc = pfm_amd64_pmc_desc, ++ .num_pmc_entries = PFM_AMD_NUM_PMCS, ++ .num_pmd_entries = PFM_AMD_NUM_PMDS, ++ .probe_pmu = pfm_amd64_probe_pmu, ++ .version = "1.2", ++ .pmu_info = &pfm_amd64_pmu_info, ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init pfm_amd64_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_amd64_pmu_conf); ++} ++ ++static void __exit pfm_amd64_pmu_cleanup_module(void) ++{ ++ if (pfm_nb_sys_owners) ++ vfree(pfm_nb_sys_owners); ++ ++ pfm_pmu_unregister(&pfm_amd64_pmu_conf); ++} ++ ++module_init(pfm_amd64_pmu_init_module); ++module_exit(pfm_amd64_pmu_cleanup_module); +diff --git a/arch/x86/perfmon/perfmon_intel_arch.c b/arch/x86/perfmon/perfmon_intel_arch.c +new file mode 100644 +index 0000000..e27a732 +--- /dev/null ++++ b/arch/x86/perfmon/perfmon_intel_arch.c +@@ -0,0 +1,610 @@ ++/* ++ * This file contains the Intel architectural perfmon v1, v2, v3 ++ * description tables. 
++ * ++ * Architectural perfmon was introduced with Intel Core Solo/Duo ++ * processors. ++ * ++ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/kprobes.h> ++#include <linux/perfmon_kern.h> ++#include <linux/nmi.h> ++#include <asm/msr.h> ++#include <asm/apic.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("Intel architectural perfmon v1"); ++MODULE_LICENSE("GPL"); ++ ++static int force, force_nmi; ++MODULE_PARM_DESC(force, "bool: force module to load succesfully"); ++MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); ++module_param(force, bool, 0600); ++module_param(force_nmi, bool, 0600); ++ ++static u64 enable_mask[PFM_MAX_PMCS]; ++static u16 max_enable; ++ ++/* ++ * - upper 32 bits are reserved ++ * - INT: APIC enable bit is reserved (forced to 1) ++ * - bit 21 is reserved ++ * ++ * RSVD: reserved bits are 1 ++ */ ++#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \ ++ | (1ULL<<20) \ ++ | (1ULL<<21)) ++ ++/* ++ * force Local APIC interrupt on overflow ++ * disable with NO_EMUL64 ++ */ ++#define PFM_IA_PMC_VAL (1ULL<<20) ++#define PFM_IA_NO64 (1ULL<<20) ++ ++/* ++ * architectuture specifies that: ++ * IA32_PMCx MSR 
: starts at 0x0c1 & occupy a contiguous block of MSR ++ * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR ++ * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupy a contiguous block of MSR ++ */ ++#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0 ++#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0 ++#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0 ++ ++/* ++ * layout of EAX for CPUID.0xa leaf function ++ */ ++struct pmu_eax { ++ unsigned int version:8; /* architectural perfmon version */ ++ unsigned int num_cnt:8; /* number of generic counters */ ++ unsigned int cnt_width:8; /* width of generic counters */ ++ unsigned int ebx_length:8; /* number of architected events */ ++}; ++ ++/* ++ * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected ++ */ ++struct pmu_edx { ++ unsigned int num_cnt:5; /* number of fixed counters */ ++ unsigned int cnt_width:8; /* width of fixed counters */ ++ unsigned int reserved:19; ++}; ++ ++static void pfm_intel_arch_restore_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++static int pfm_intel_arch_stop_save(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx); ++static void __kprobes pfm_intel_arch_quiesce(void); ++ ++/* ++ * physical addresses of MSR controlling the perfevtsel and counter registers ++ */ ++struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = { ++ .stop_save = pfm_intel_arch_stop_save, ++ .has_ovfls = pfm_intel_arch_has_ovfls, ++ .quiesce = pfm_intel_arch_quiesce, ++ .restore_pmcs = pfm_intel_arch_restore_pmcs ++}; ++ ++#define PFM_IA_C(n) { \ ++ .type = PFM_REG_I64, \ ++ .desc = "PERFEVTSEL"#n, \ ++ .dfl_val = PFM_IA_PMC_VAL, \ ++ .rsvd_msk = PFM_IA_PMC_RSVD, \ ++ .no_emul64_msk = PFM_IA_NO64, \ ++ .hw_addr = MSR_GEN_SEL_BASE+(n) \ ++ } ++ ++#define PFM_IA_D(n) \ ++ { .type = PFM_REG_C, \ ++ .desc = "PMC"#n, \ ++ .hw_addr = MSR_P6_PERFCTR0+n, \ ++ .dep_pmcs[0] = 1ULL << n \ ++ } ++ ++#define PFM_IA_FD(n) \ ++ { .type 
= PFM_REG_C, \ ++ .desc = "FIXED_CTR"#n, \ ++ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\ ++ .dep_pmcs[0] = 1ULL << 16 \ ++ } ++ ++static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = { ++/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3), ++/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7), ++/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11), ++/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15), ++ ++/* pmc16 */ { .type = PFM_REG_I, ++ .desc = "FIXED_CTRL", ++ .dfl_val = 0x8888888888888888ULL, /* force PMI */ ++ .rsvd_msk = 0, /* set dynamically */ ++ .no_emul64_msk = 0, ++ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL ++ }, ++}; ++#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc) ++ ++static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = { ++/* pmd0 */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3), ++/* pmd4 */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7), ++/* pmd8 */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11), ++/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15), ++ ++/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3), ++/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7), ++/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11), ++/* pmd28 */ PFM_IA_FD(12), PFM_IA_FD(13), PFM_IA_FD(14), PFM_IA_FD(15) ++}; ++#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc) ++ ++#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */ ++#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */ ++#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */ ++ ++static struct pfm_pmu_config pfm_intel_arch_pmu_conf; ++ ++static void pfm_intel_arch_check_errata(void) ++{ ++ /* ++ * Core Duo errata AE49 (no fix). 
Both counters share a single ++ * enable bit in PERFEVTSEL0 ++ */ ++ if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14) ++ pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING; ++} ++ ++static inline void set_enable_mask(unsigned int i) ++{ ++ __set_bit(i, cast_ulp(enable_mask)); ++ ++ /* max_enable = highest + 1 */ ++ if ((i+1) > max_enable) ++ max_enable = i+ 1; ++} ++ ++static void pfm_intel_arch_setup_generic(unsigned int version, ++ unsigned int width, ++ unsigned int count) ++{ ++ u64 rsvd; ++ unsigned int i; ++ ++ /* ++ * first we handle the generic counters: ++ * ++ * - ensure HW does not have more registers than hardcoded in the tables ++ * - adjust rsvd_msk to actual counter width ++ * - initialize enable_mask (list of PMC with start/stop capability) ++ * - mark unused hardcoded generic counters as unimplemented ++ */ ++ ++ /* ++ * clamp to the number of generic counters hardcoded in the tables ++ */ ++ if (count > PFM_IA_MAX_CNT) { ++ printk(KERN_INFO "perfmon: Limiting number of generic counters" ++ " to %u, HW supports %u\n", ++ PFM_IA_MAX_CNT, count); ++ count = PFM_IA_MAX_CNT; ++ } ++ ++ /* ++ * adjust rsvd_msk for generic counters based on actual width ++ * initialize enable_mask (1 per pmd) ++ */ ++ rsvd = ~((1ULL << width)-1); ++ for (i = 0; i < count; i++) { ++ pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd; ++ set_enable_mask(i); ++ } ++ ++ /* ++ * handle version 3 new anythread bit (21) ++ */ ++ if (version == 3) { ++ for (i = 0; i < count; i++) ++ pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21); ++ } ++ ++ ++ /* ++ * mark unused generic counters as not available ++ */ ++ for (i = count ; i < PFM_IA_MAX_CNT; i++) { ++ pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA; ++ pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA; ++ } ++} ++ ++static void pfm_intel_arch_setup_fixed(unsigned int version, ++ unsigned int width, ++ unsigned int count) ++{ ++ u64 rsvd, dfl; ++ unsigned int i; ++ ++ /* ++ * handle the fixed counters (if any): ++ * ++ * - 
ensure HW does not have more registers than hardcoded in the tables ++ * - adjust rsvd_msk to actual counter width ++ * - initialize enable_mask (list of PMC with start/stop capability) ++ * - mark unused hardcoded fixed counters as unimplemented ++ */ ++ if (count > PFM_IA_MAX_FCNT) { ++ printk(KERN_INFO "perfmon: Limiting number of fixed counters" ++ " to %u, HW supports %u\n", ++ PFM_IA_MAX_FCNT, count); ++ count = PFM_IA_MAX_FCNT; ++ } ++ /* ++ * adjust rsvd_msk for fixed counters based on actual width ++ */ ++ rsvd = ~((1ULL << width)-1); ++ for (i = 0; i < count; i++) ++ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd; ++ ++ /* ++ * handle version 3 new anythread bit (bit 2) ++ */ ++ if (version == 3) ++ rsvd = 1ULL << 3; ++ else ++ rsvd = 3ULL << 2; ++ ++ pfm_intel_arch_pmc_desc[16].rsvd_msk = 0; ++ for (i = 0; i < count; i++) ++ pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2); ++ ++ /* ++ * mark unused fixed counters as unimplemented ++ * ++ * update the rsvd_msk, dfl_val in FIXED_CTRL: ++ * - rsvd_msk: set all 4 bits ++ * - dfl_val : clear all 4 bits ++ */ ++ dfl = pfm_intel_arch_pmc_desc[16].dfl_val; ++ rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk; ++ ++ for (i = count ; i < PFM_IA_MAX_FCNT; i++) { ++ pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA; ++ rsvd |= 0xfULL << (i<<2); ++ dfl &= ~(0xfULL << (i<<2)); ++ } ++ ++ /* ++ * FIXED_CTR_CTRL unavailable when no fixed counters are defined ++ */ ++ if (!count) { ++ pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA; ++ } else { ++ /* update rsvd_mask and dfl_val */ ++ pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd; ++ pfm_intel_arch_pmc_desc[16].dfl_val = dfl; ++ set_enable_mask(16); ++ } ++} ++ ++static int pfm_intel_arch_probe_pmu(void) ++{ ++ union { ++ unsigned int val; ++ struct pmu_eax eax; ++ struct pmu_edx edx; ++ } eax, edx; ++ unsigned int ebx, ecx; ++ unsigned int width = 0; ++ ++ edx.val = 0; ++ ++ if (!(cpu_has_arch_perfmon || force)) { ++ PFM_INFO("no support for Intel 
architectural PMU"); ++ return -1; ++ } ++ ++ if (!cpu_has_apic) { ++ PFM_INFO("no Local APIC, try rebooting with lapic option"); ++ return -1; ++ } ++ ++ /* cpuid() call protected by cpu_has_arch_perfmon */ ++ cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val); ++ ++ /* ++ * reject processors supported by perfmon_intel_core ++ * ++ * We need to do this explicitely to avoid depending ++ * on the link order in case, the modules are compiled as ++ * builtin. ++ * ++ * non Intel processors are rejected by cpu_has_arch_perfmon ++ */ ++ if (current_cpu_data.x86 == 6 && !force) { ++ switch (current_cpu_data.x86_model) { ++ case 15: /* Merom: use perfmon_intel_core */ ++ case 23: /* Penryn: use perfmon_intel_core */ ++ return -1; ++ default: ++ break; ++ } ++ } ++ ++ /* ++ * some 6/15 models have buggy BIOS ++ */ ++ if (eax.eax.version == 0 ++ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { ++ PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters"); ++ eax.eax.version = 2; ++ eax.eax.num_cnt = 2; ++ eax.eax.cnt_width = 40; ++ } ++ ++ /* ++ * Intel Atom processors have a buggy firmware which does not report ++ * the correct number of fixed counters ++ */ ++ if (eax.eax.version == 3 && edx.edx.num_cnt < 3 ++ && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) { ++ PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters"); ++ edx.edx.num_cnt = 3; ++ } ++ ++ /* ++ * some v2 BIOSes are incomplete ++ */ ++ if (eax.eax.version == 2 && !edx.edx.num_cnt) { ++ PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters"); ++ edx.edx.num_cnt = 3; ++ edx.edx.cnt_width = 40; ++ } ++ ++ /* ++ * no fixed counters on earlier versions ++ */ ++ if (eax.eax.version < 2) { ++ edx.val = 0; ++ } else { ++ /* ++ * use the min value of both widths until we support ++ * variable width counters ++ */ ++ width = eax.eax.cnt_width < edx.edx.cnt_width ? 
++ eax.eax.cnt_width : edx.edx.cnt_width; ++ } ++ ++ PFM_INFO("detected architecural perfmon v%d", eax.eax.version); ++ PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d", ++ eax.eax.num_cnt, ++ eax.eax.cnt_width, ++ edx.edx.num_cnt, ++ edx.edx.cnt_width); ++ ++ ++ pfm_intel_arch_setup_generic(eax.eax.version, ++ width, ++ eax.eax.num_cnt); ++ ++ pfm_intel_arch_setup_fixed(eax.eax.version, ++ width, ++ edx.edx.num_cnt); ++ ++ if (force_nmi) ++ pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_USE_NMI; ++ ++ pfm_intel_arch_check_errata(); ++ ++ return 0; ++} ++ ++/** ++ * pfm_intel_arch_has_ovfls - check for pending overflow condition ++ * @ctx: context to work on ++ * ++ * detect if counters have overflowed. ++ * return: ++ * 0 : no overflow ++ * 1 : at least one overflow ++ */ ++static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx) ++{ ++ u64 *cnt_mask; ++ u64 wmask, val; ++ u16 i, num; ++ ++ cnt_mask = ctx->regs.cnt_pmds; ++ num = ctx->regs.num_counters; ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ ++ /* ++ * we can leverage the fact that we know the mapping ++ * to hardcode the MSR address and avoid accessing ++ * more cachelines ++ * ++ * We need to check cnt_mask because not all registers ++ * may be available. ++ */ ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(cnt_mask))) { ++ rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val); ++ if (!(val & wmask)) ++ return 1; ++ num--; ++ } ++ } ++ return 0; ++} ++ ++static int pfm_intel_arch_stop_save(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ u64 used_mask[PFM_PMC_BV]; ++ u64 *cnt_pmds; ++ u64 val, wmask, ovfl_mask; ++ u32 i, count; ++ ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ ++ bitmap_and(cast_ulp(used_mask), ++ cast_ulp(set->used_pmcs), ++ cast_ulp(enable_mask), ++ max_enable); ++ ++ count = bitmap_weight(cast_ulp(used_mask), max_enable); ++ ++ /* ++ * stop monitoring ++ * Unfortunately, this is very expensive! ++ * wrmsrl() is serializing. 
++ */ ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(used_mask))) { ++ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); ++ count--; ++ } ++ } ++ ++ /* ++ * if we already having a pending overflow condition, we simply ++ * return to take care of this first. ++ */ ++ if (set->npend_ovfls) ++ return 1; ++ ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ cnt_pmds = ctx->regs.cnt_pmds; ++ ++ /* ++ * check for pending overflows and save PMDs (combo) ++ * we employ used_pmds because we also need to save ++ * and not just check for pending interrupts. ++ * ++ * Must check for counting PMDs because of virtual PMDs ++ */ ++ count = set->nused_pmds; ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(set->used_pmds))) { ++ val = pfm_arch_read_pmd(ctx, i); ++ if (likely(test_bit(i, cast_ulp(cnt_pmds)))) { ++ if (!(val & wmask)) { ++ __set_bit(i, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ } ++ val = (set->pmds[i].value & ~ovfl_mask) ++ | (val & ovfl_mask); ++ } ++ set->pmds[i].value = val; ++ count--; ++ } ++ } ++ /* 0 means: no need to save PMDs at upper level */ ++ return 0; ++} ++ ++/** ++ * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock ++ * ++ * called from NMI interrupt handler to immediately stop monitoring ++ * cannot grab any lock, including perfmon related locks ++ */ ++static void __kprobes pfm_intel_arch_quiesce(void) ++{ ++ u16 i; ++ ++ /* ++ * PMC16 is the fixed control control register so it has a ++ * distinct MSR address ++ * ++ * We do not use the hw_addr field in the table to avoid touching ++ * too many cachelines ++ */ ++ for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) { ++ if (test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) { ++ if (i == 16) ++ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); ++ else ++ wrmsrl(MSR_P6_EVNTSEL0+i, 0); ++ } ++ } ++} ++ ++/** ++ * pfm_intel_arch_restore_pmcs - reload PMC registers ++ * @ctx: context to restore from ++ * @set: current event set ++ * ++ * optimized version of 
pfm_arch_restore_pmcs(). On architectural perfmon, ++ * we can afford to only restore the pmcs registers we use, because they ++ * are all independent from each other. ++ */ ++static void pfm_intel_arch_restore_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ u64 *mask; ++ u16 i, num; ++ ++ mask = set->used_pmcs; ++ num = set->nused_pmcs; ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(mask))) { ++ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, set->pmcs[i]); ++ num--; ++ } ++ } ++} ++/* ++ * Counters may have model-specific width. Yet the documentation says ++ * that only the lower 32 bits can be written to due to the specification ++ * of wrmsr. bits [32-(w-1)] are sign extensions of bit 31. Bits [w-63] must ++ * not be set (see rsvd_msk for PMDs). As such the effective width of a ++ * counter is 31 bits only regardless of what CPUID.0xa returns. ++ * ++ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18 ++ */ ++static struct pfm_pmu_config pfm_intel_arch_pmu_conf = { ++ .pmu_name = "Intel architectural", ++ .pmd_desc = pfm_intel_arch_pmd_desc, ++ .counter_width = 31, ++ .num_pmc_entries = PFM_IA_MAX_PMCS, ++ .num_pmd_entries = PFM_IA_MAX_PMDS, ++ .pmc_desc = pfm_intel_arch_pmc_desc, ++ .probe_pmu = pfm_intel_arch_probe_pmu, ++ .version = "1.0", ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmu_info = &pfm_intel_arch_pmu_info ++}; ++ ++static int __init pfm_intel_arch_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_intel_arch_pmu_conf); ++} ++ ++static void __exit pfm_intel_arch_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_intel_arch_pmu_conf); ++} ++ ++module_init(pfm_intel_arch_pmu_init_module); ++module_exit(pfm_intel_arch_pmu_cleanup_module); +diff --git a/arch/x86/perfmon/perfmon_intel_atom.c b/arch/x86/perfmon/perfmon_intel_atom.c +new file mode 100644 +index 0000000..9b94863 +--- /dev/null ++++ b/arch/x86/perfmon/perfmon_intel_atom.c +@@ -0,0 +1,541 @@ ++/* ++ * perfmon 
support for Intel Atom (architectural perfmon v3 + PEBS) ++ * ++ * Copyright (c) 2008 Google,Inc ++ * Contributed by Stephane Eranian <eranian@gmail.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/kprobes.h> ++#include <linux/perfmon_kern.h> ++#include <asm/msr.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@gmail.com>"); ++MODULE_DESCRIPTION("Intel Atom"); ++MODULE_LICENSE("GPL"); ++ ++static int force, force_nmi; ++MODULE_PARM_DESC(force, "bool: force module to load succesfully"); ++MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); ++module_param(force, bool, 0600); ++module_param(force_nmi, bool, 0600); ++ ++/* ++ * - upper 32 bits are reserved ++ * - INT: APIC enable bit is reserved (forced to 1) ++ * ++ * RSVD: reserved bits are 1 ++ */ ++#define PFM_ATOM_PMC_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20)) ++ ++/* ++ * force Local APIC interrupt on overflow ++ * disable with NO_EMUL64 ++ */ ++#define PFM_ATOM_PMC_VAL (1ULL<<20) ++#define PFM_ATOM_NO64 (1ULL<<20) ++ ++/* ++ * Atom counters are 40-bits. 40-bits can be read but ony 31 can be written ++ * to due to a limitation of wrmsr. Bits [[63-32] are sign extensions of bit 31. 
++ * Bits [63-40] must not be set ++ * ++ * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18 ++ */ ++#define PFM_ATOM_PMD_WIDTH 31 ++#define PFM_ATOM_PMD_RSVD ~((1ULL << 40)-1) ++ ++static void pfm_intel_atom_acquire_pmu_percpu(void); ++static void pfm_intel_atom_release_pmu_percpu(void); ++static void pfm_intel_atom_restore_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++static int pfm_intel_atom_stop_save(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++static int pfm_intel_atom_has_ovfls(struct pfm_context *ctx); ++static void __kprobes pfm_intel_atom_quiesce(void); ++ ++struct pfm_arch_pmu_info pfm_intel_atom_pmu_info = { ++ .stop_save = pfm_intel_atom_stop_save, ++ .has_ovfls = pfm_intel_atom_has_ovfls, ++ .quiesce = pfm_intel_atom_quiesce, ++ .restore_pmcs = pfm_intel_atom_restore_pmcs, ++ .acquire_pmu_percpu = pfm_intel_atom_acquire_pmu_percpu, ++ .release_pmu_percpu = pfm_intel_atom_release_pmu_percpu ++ ++}; ++ ++#define PFM_ATOM_C(n) { \ ++ .type = PFM_REG_I64, \ ++ .desc = "PERFEVTSEL"#n, \ ++ .dfl_val = PFM_ATOM_PMC_VAL, \ ++ .rsvd_msk = PFM_ATOM_PMC_RSVD, \ ++ .no_emul64_msk = PFM_ATOM_NO64, \ ++ .hw_addr = MSR_P6_EVNTSEL0 + (n) \ ++ } ++ ++ ++static struct pfm_regmap_desc pfm_intel_atom_pmc_desc[] = { ++/* pmc0 */ PFM_ATOM_C(0), ++/* pmc1 */ PFM_ATOM_C(1), ++/* pmc2 */ PMX_NA, PMX_NA, ++/* pmc4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc16 */ { .type = PFM_REG_I, ++ .desc = "FIXED_CTRL", ++ .dfl_val = 0x0000000000000888ULL, /* force PMI */ ++ .rsvd_msk = 0xfffffffffffffcccULL, /* 3 fixed counters defined */ ++ .no_emul64_msk = 0, ++ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL ++ }, ++/* pmc17 */{ .type = PFM_REG_W, ++ .desc = "PEBS_ENABLE", ++ .dfl_val = 0, ++ .rsvd_msk = 0xfffffffffffffffeULL, ++ .no_emul64_msk = 0, ++ .hw_addr = MSR_IA32_PEBS_ENABLE ++ } ++}; ++#define PFM_ATOM_MAX_PMCS 
ARRAY_SIZE(pfm_intel_atom_pmc_desc) ++ ++#define PFM_ATOM_D(n) \ ++ { .type = PFM_REG_C, \ ++ .desc = "PMC"#n, \ ++ .rsvd_msk = PFM_ATOM_PMD_RSVD, \ ++ .hw_addr = MSR_P6_PERFCTR0+n, \ ++ .dep_pmcs[0] = 1ULL << n \ ++ } ++ ++#define PFM_ATOM_FD(n) \ ++ { .type = PFM_REG_C, \ ++ .desc = "FIXED_CTR"#n, \ ++ .rsvd_msk = PFM_ATOM_PMD_RSVD, \ ++ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\ ++ .dep_pmcs[0] = 1ULL << 16 \ ++ } ++ ++static struct pfm_regmap_desc pfm_intel_atom_pmd_desc[] = { ++/* pmd0 */ PFM_ATOM_D(0), ++/* pmd1 */ PFM_ATOM_D(1), ++/* pmd2 */ PMX_NA, ++/* pmd3 */ PMX_NA, ++/* pmd4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmd8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmd12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmd16 */ PFM_ATOM_FD(0), ++/* pmd17 */ PFM_ATOM_FD(1), ++/* pmd18 */ PFM_ATOM_FD(2) ++}; ++#define PFM_ATOM_MAX_PMDS ARRAY_SIZE(pfm_intel_atom_pmd_desc) ++ ++static struct pfm_pmu_config pfm_intel_atom_pmu_conf; ++ ++static int pfm_intel_atom_probe_pmu(void) ++{ ++ if (force) ++ goto doit; ++ ++ if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) ++ return -1; ++ ++ if (current_cpu_data.x86 != 6) ++ return -1; ++ ++ if (current_cpu_data.x86_model != 28) ++ return -1; ++doit: ++ /* ++ * having APIC is mandatory, so disregard force option ++ */ ++ if (!cpu_has_apic) { ++ PFM_INFO("no Local APIC, try rebooting with lapic option"); ++ return -1; ++ } ++ ++ PFM_INFO("detected Intel Atom PMU"); ++ ++ if (force_nmi) ++ pfm_intel_atom_pmu_info.flags |= PFM_X86_FL_USE_NMI; ++ ++ return 0; ++} ++ ++/** ++ * pfm_intel_atom_has_ovfls - check for pending overflow condition ++ * @ctx: context to work on ++ * ++ * detect if counters have overflowed. 
++ * return: ++ * 0 : no overflow ++ * 1 : at least one overflow ++ */ ++static int __kprobes pfm_intel_atom_has_ovfls(struct pfm_context *ctx) ++{ ++ struct pfm_regmap_desc *d; ++ u64 ovf; ++ ++ d = pfm_pmu_conf->pmd_desc; ++ /* ++ * read global overflow status register ++ * if sharing PMU, then not all bit are ours so must ++ * check only the ones we actually use ++ */ ++ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf); ++ ++ /* ++ * for pmd0, we also check PEBS overflow on bit 62 ++ */ ++ if ((d[0].type & PFM_REG_I) && (ovf & ((1ull << 62) | 1ull))) ++ return 1; ++ ++ if ((d[1].type & PFM_REG_I) && (ovf & 2ull)) ++ return 1; ++ ++ if ((d[16].type & PFM_REG_I) && (ovf & (1ull << 32))) ++ return 1; ++ ++ if ((d[17].type & PFM_REG_I) && (ovf & (2ull << 32))) ++ return 1; ++ ++ if ((d[18].type & PFM_REG_I) && (ovf & (4ull << 32))) ++ return 1; ++ ++ return 0; ++} ++ ++/** ++ * pfm_intel_atom_stop_save - stop monitoring, collect pending overflow, save pmds ++ * @ctx: context to work on ++ * @set: active set ++ * ++ * return: ++ * 1: caller needs to save pmds ++ * 0: caller does not need to save pmds, they have been saved by this call ++ */ ++static int pfm_intel_atom_stop_save(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++#define PFM_ATOM_WMASK (1ULL << 31) ++#define PFM_ATOM_OMASK ((1ULL << 31)-1) ++ u64 clear_ovf = 0; ++ u64 ovf, ovf2, val; ++ ++ /* ++ * read global overflow status register ++ * if sharing PMU, then not all bit are ours so must ++ * check only the ones we actually use. ++ * ++ * XXX: Atom seems to have a bug with the stickyness of ++ * GLOBAL_STATUS. If we read GLOBAL_STATUS after we ++ * clear the generic counters, then their bits in ++ * GLOBAL_STATUS are cleared. This should not be the ++ * case accoding to architected PMU. To workaround ++ * the problem, we read GLOBAL_STATUS BEFORE we stop ++ * all monitoring. 
++ */ ++ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf); ++ ++ /* ++ * stop monitoring ++ */ ++ if (test_bit(0, cast_ulp(set->used_pmcs))) ++ wrmsrl(MSR_P6_EVNTSEL0, 0); ++ ++ if (test_bit(1, cast_ulp(set->used_pmcs))) ++ wrmsrl(MSR_P6_EVNTSEL1, 0); ++ ++ if (test_bit(16, cast_ulp(set->used_pmcs))) ++ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); ++ ++ if (test_bit(17, cast_ulp(set->used_pmcs))) ++ wrmsrl(MSR_IA32_PEBS_ENABLE, 0); ++ ++ /* ++ * XXX: related to bug mentioned above ++ * ++ * read GLOBAL_STATUS again to avoid race condition ++ * with overflows happening after first read and ++ * before stop. That avoids missing overflows on ++ * the fixed counters and PEBS ++ */ ++ rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf2); ++ ovf |= ovf2; ++ ++ /* ++ * if we already have a pending overflow condition, we simply ++ * return to take care of it first. ++ */ ++ if (set->npend_ovfls) ++ return 1; ++ ++ /* ++ * check PMD 0,1,16,17,18 for overflow and save their value ++ */ ++ if (test_bit(0, cast_ulp(set->used_pmds))) { ++ rdmsrl(MSR_P6_PERFCTR0, val); ++ if (ovf & ((1ull<<62)|1ull)) { ++ __set_bit(0, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ clear_ovf = (1ull << 62) | 1ull; ++ } ++ val = (set->pmds[0].value & ~PFM_ATOM_OMASK) ++ | (val & PFM_ATOM_OMASK); ++ set->pmds[0].value = val; ++ } ++ ++ if (test_bit(1, cast_ulp(set->used_pmds))) { ++ rdmsrl(MSR_P6_PERFCTR1, val); ++ if (ovf & 2ull) { ++ __set_bit(1, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ clear_ovf |= 2ull; ++ } ++ val = (set->pmds[1].value & ~PFM_ATOM_OMASK) ++ | (val & PFM_ATOM_OMASK); ++ set->pmds[1].value = val; ++ } ++ ++ if (test_bit(16, cast_ulp(set->used_pmds))) { ++ rdmsrl(MSR_CORE_PERF_FIXED_CTR0, val); ++ if (ovf & (1ull << 32)) { ++ __set_bit(16, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ clear_ovf |= 1ull << 32; ++ } ++ val = (set->pmds[16].value & ~PFM_ATOM_OMASK) ++ | (val & PFM_ATOM_OMASK); ++ set->pmds[16].value = val; ++ } ++ ++ if (test_bit(17, cast_ulp(set->used_pmds))) 
{ ++ rdmsrl(MSR_CORE_PERF_FIXED_CTR0+1, val); ++ if (ovf & (2ull << 32)) { ++ __set_bit(17, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ clear_ovf |= 2ull << 32; ++ } ++ val = (set->pmds[17].value & ~PFM_ATOM_OMASK) ++ | (val & PFM_ATOM_OMASK); ++ set->pmds[17].value = val; ++ } ++ ++ if (test_bit(18, cast_ulp(set->used_pmds))) { ++ rdmsrl(MSR_CORE_PERF_FIXED_CTR0+2, val); ++ if (ovf & (4ull << 32)) { ++ __set_bit(18, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ clear_ovf |= 4ull << 32; ++ } ++ val = (set->pmds[18].value & ~PFM_ATOM_OMASK) ++ | (val & PFM_ATOM_OMASK); ++ set->pmds[18].value = val; ++ } ++ ++ if (clear_ovf) ++ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, clear_ovf); ++ ++ /* 0 means: no need to save PMDs at upper level */ ++ return 0; ++} ++ ++/** ++ * pfm_intel_atom_quiesce - stop monitoring without grabbing any lock ++ * ++ * called from NMI interrupt handler to immediately stop monitoring ++ * cannot grab any lock, including perfmon related locks ++ */ ++static void __kprobes pfm_intel_atom_quiesce(void) ++{ ++ /* ++ * quiesce PMU by clearing available registers that have ++ * the start/stop capability ++ */ ++ if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_P6_EVNTSEL0, 0); ++ ++ if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_P6_EVNTSEL1, 0); ++ ++ if (test_bit(16, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); ++ ++ if (test_bit(17, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_IA32_PEBS_ENABLE, 0); ++} ++ ++/** ++ * pfm_intel_atom_restore_pmcs - reload PMC registers ++ * @ctx: context to restore from ++ * @set: current event set ++ * ++ * restores pmcs and also PEBS Data Save area pointer ++ */ ++static void pfm_intel_atom_restore_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ struct pfm_arch_context *ctx_arch; ++ u64 clear_ovf = 0; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ /* ++ * must restore DS pointer before restoring PMCs 
++ * as this can potentially reactivate monitoring ++ */ ++ if (ctx_arch->flags.use_ds) ++ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area); ++ ++ if (test_bit(0, cast_ulp(set->used_pmcs))) { ++ wrmsrl(MSR_P6_EVNTSEL0, set->pmcs[0]); ++ clear_ovf = 1ull; ++ } ++ ++ if (test_bit(1, cast_ulp(set->used_pmcs))) { ++ wrmsrl(MSR_P6_EVNTSEL1, set->pmcs[1]); ++ clear_ovf |= 2ull; ++ } ++ ++ if (test_bit(16, cast_ulp(set->used_pmcs))) { ++ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, set->pmcs[16]); ++ clear_ovf |= 7ull << 32; ++ } ++ ++ if (test_bit(17, cast_ulp(set->used_pmcs))) { ++ wrmsrl(MSR_IA32_PEBS_ENABLE, set->pmcs[17]); ++ clear_ovf |= 1ull << 62; ++ } ++ ++ if (clear_ovf) ++ wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, clear_ovf); ++} ++ ++static int pfm_intel_atom_pmc17_check(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req) ++{ ++ struct pfm_arch_context *ctx_arch; ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ /* ++ * if user activates PEBS_ENABLE, then we need to have a valid ++ * DS Area setup. This only happens when the PEBS sampling format is ++ * used in which case PFM_X86_USE_PEBS is set. We must reject all other ++ * requests. ++ * ++ * Otherwise we may pickup stale MSR_IA32_DS_AREA values. It appears ++ * that a value of 0 for this MSR does crash the system with ++ * PEBS_ENABLE=1. ++ */ ++ if (!ctx_arch->flags.use_pebs && req->reg_value) { ++ PFM_DBG("pmc17 useable only with a PEBS sampling format"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++DEFINE_PER_CPU(u64, saved_global_ctrl); ++ ++/** ++ * pfm_intel_atom_acquire_pmu_percpu - acquire PMU resource per CPU ++ * ++ * For Atom, it is necessary to enable all available ++ * registers. 
The firmware rightfully has the fixed counters ++ * disabled for backward compatibility with architectural perfmon ++ * v1 ++ * ++ * This function is invoked on each online CPU ++ */ ++static void pfm_intel_atom_acquire_pmu_percpu(void) ++{ ++ struct pfm_regmap_desc *d; ++ u64 mask = 0; ++ unsigned int i; ++ ++ /* ++ * build bitmask of registers that are available to ++ * us. In some cases, there may be fewer registers than ++ * what Atom supports due to sharing with other kernel ++ * subsystems, such as NMI ++ */ ++ d = pfm_pmu_conf->pmd_desc; ++ for (i=0; i < 16; i++) { ++ if ((d[i].type & PFM_REG_I) == 0) ++ continue; ++ mask |= 1ull << i; ++ } ++ for (i=16; i < PFM_ATOM_MAX_PMDS; i++) { ++ if ((d[i].type & PFM_REG_I) == 0) ++ continue; ++ mask |= 1ull << (32+i-16); ++ } ++ ++ /* ++ * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL ++ */ ++ rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl)); ++ ++ PFM_DBG("global=0x%llx set to 0x%llx", ++ __get_cpu_var(saved_global_ctrl), ++ mask); ++ ++ /* ++ * enable all registers ++ * ++ * No need to quiesce PMU. 
If there is a overflow, it will be ++ * treated as spurious by the handler ++ */ ++ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask); ++} ++ ++/** ++ * pfm_intel_atom_release_pmu_percpu - release PMU resource per CPU ++ * ++ * For Atom, we restore MSR_CORE_PERF_GLOBAL_CTRL to its orginal value ++ */ ++static void pfm_intel_atom_release_pmu_percpu(void) ++{ ++ PFM_DBG("global_ctrl restored to 0x%llx\n", ++ __get_cpu_var(saved_global_ctrl)); ++ ++ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl)); ++} ++ ++static struct pfm_pmu_config pfm_intel_atom_pmu_conf = { ++ .pmu_name = "Intel Atom", ++ .pmd_desc = pfm_intel_atom_pmd_desc, ++ .counter_width = PFM_ATOM_PMD_WIDTH, ++ .num_pmc_entries = PFM_ATOM_MAX_PMCS, ++ .num_pmd_entries = PFM_ATOM_MAX_PMDS, ++ .pmc_desc = pfm_intel_atom_pmc_desc, ++ .probe_pmu = pfm_intel_atom_probe_pmu, ++ .version = "1.0", ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmc_write_check = pfm_intel_atom_pmc17_check, ++ .pmu_info = &pfm_intel_atom_pmu_info ++}; ++ ++static int __init pfm_intel_atom_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_intel_atom_pmu_conf); ++} ++ ++static void __exit pfm_intel_atom_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_intel_atom_pmu_conf); ++} ++ ++module_init(pfm_intel_atom_pmu_init_module); ++module_exit(pfm_intel_atom_pmu_cleanup_module); +diff --git a/arch/x86/perfmon/perfmon_intel_core.c b/arch/x86/perfmon/perfmon_intel_core.c +new file mode 100644 +index 0000000..fddc436 +--- /dev/null ++++ b/arch/x86/perfmon/perfmon_intel_core.c +@@ -0,0 +1,449 @@ ++/* ++ * This file contains the Intel Core PMU registers description tables. ++ * Intel Core-based processors support architectural perfmon v2 + PEBS ++ * ++ * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. 
++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ */ ++#include <linux/module.h> ++#include <linux/kprobes.h> ++#include <linux/perfmon_kern.h> ++#include <linux/nmi.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("Intel Core"); ++MODULE_LICENSE("GPL"); ++ ++static int force_nmi; ++MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); ++module_param(force_nmi, bool, 0600); ++ ++/* ++ * - upper 32 bits are reserved ++ * - INT: APIC enable bit is reserved (forced to 1) ++ * - bit 21 is reserved ++ * ++ * RSVD: reserved bits must be 1 ++ */ ++#define PFM_CORE_PMC_RSVD ((~((1ULL<<32)-1)) \ ++ | (1ULL<<20) \ ++ | (1ULL<<21)) ++ ++/* ++ * Core counters are 40-bits ++ */ ++#define PFM_CORE_CTR_RSVD (~((1ULL<<40)-1)) ++ ++/* ++ * force Local APIC interrupt on overflow ++ * disable with NO_EMUL64 ++ */ ++#define PFM_CORE_PMC_VAL (1ULL<<20) ++#define PFM_CORE_NO64 (1ULL<<20) ++ ++#define PFM_CORE_NA { .reg_type = PFM_REGT_NA} ++ ++#define PFM_CORE_CA(m, c, t) \ ++ { \ ++ .addrs[0] = m, \ ++ .ctr = c, \ ++ .reg_type = t \ ++ } ++ ++struct pfm_ds_area_intel_core { ++ u64 bts_buf_base; ++ u64 bts_index; ++ u64 bts_abs_max; ++ u64 bts_intr_thres; ++ u64 pebs_buf_base; ++ u64 pebs_index; ++ u64 pebs_abs_max; ++ u64 pebs_intr_thres; ++ u64 pebs_cnt_reset; ++}; ++ ++static void pfm_core_restore_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++static int pfm_core_has_ovfls(struct pfm_context *ctx); ++static int pfm_core_stop_save(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++static void __kprobes pfm_core_quiesce(void); ++ ++static u64 enable_mask[PFM_MAX_PMCS]; ++static u16 max_enable; ++ ++struct pfm_arch_pmu_info pfm_core_pmu_info = { ++ .stop_save = pfm_core_stop_save, ++ .has_ovfls = pfm_core_has_ovfls, ++ .quiesce = pfm_core_quiesce, ++ .restore_pmcs = pfm_core_restore_pmcs ++}; ++ ++static struct pfm_regmap_desc pfm_core_pmc_desc[] = { ++/* pmc0 */ { ++ .type = PFM_REG_I64, ++ .desc = 
"PERFEVTSEL0", ++ .dfl_val = PFM_CORE_PMC_VAL, ++ .rsvd_msk = PFM_CORE_PMC_RSVD, ++ .no_emul64_msk = PFM_CORE_NO64, ++ .hw_addr = MSR_P6_EVNTSEL0 ++ }, ++/* pmc1 */ { ++ .type = PFM_REG_I64, ++ .desc = "PERFEVTSEL1", ++ .dfl_val = PFM_CORE_PMC_VAL, ++ .rsvd_msk = PFM_CORE_PMC_RSVD, ++ .no_emul64_msk = PFM_CORE_NO64, ++ .hw_addr = MSR_P6_EVNTSEL1 ++ }, ++/* pmc2 */ PMX_NA, PMX_NA, ++/* pmc4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmc16 */ { .type = PFM_REG_I, ++ .desc = "FIXED_CTRL", ++ .dfl_val = 0x888ULL, ++ .rsvd_msk = 0xfffffffffffffcccULL, ++ .no_emul64_msk = 0, ++ .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL ++ }, ++/* pmc17 */ { .type = PFM_REG_W, ++ .desc = "PEBS_ENABLE", ++ .dfl_val = 0, ++ .rsvd_msk = 0xfffffffffffffffeULL, ++ .no_emul64_msk = 0, ++ .hw_addr = MSR_IA32_PEBS_ENABLE ++ } ++}; ++ ++#define PFM_CORE_D(n) \ ++ { .type = PFM_REG_C, \ ++ .desc = "PMC"#n, \ ++ .rsvd_msk = PFM_CORE_CTR_RSVD, \ ++ .hw_addr = MSR_P6_PERFCTR0+n, \ ++ .dep_pmcs[0] = 1ULL << n \ ++ } ++ ++#define PFM_CORE_FD(n) \ ++ { .type = PFM_REG_C, \ ++ .desc = "FIXED_CTR"#n, \ ++ .rsvd_msk = PFM_CORE_CTR_RSVD, \ ++ .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\ ++ .dep_pmcs[0] = 1ULL << 16 \ ++ } ++ ++static struct pfm_regmap_desc pfm_core_pmd_desc[] = { ++/* pmd0 */ PFM_CORE_D(0), ++/* pmd1 */ PFM_CORE_D(1), ++/* pmd2 */ PMX_NA, PMX_NA, ++/* pmd4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmd8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmd12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, ++/* pmd16 */ PFM_CORE_FD(0), ++/* pmd17 */ PFM_CORE_FD(1), ++/* pmd18 */ PFM_CORE_FD(2) ++}; ++#define PFM_CORE_NUM_PMCS ARRAY_SIZE(pfm_core_pmc_desc) ++#define PFM_CORE_NUM_PMDS ARRAY_SIZE(pfm_core_pmd_desc) ++ ++static struct pfm_pmu_config pfm_core_pmu_conf; ++ ++static int pfm_core_probe_pmu(void) ++{ ++ /* ++ * Check for Intel Core processor explicitely ++ * Checking for cpu_has_perfmon is not enough as this ++ * matches intel 
Core Duo/Core Solo but none supports ++ * PEBS. ++ * ++ * Intel Core = arch perfmon v2 + PEBS ++ */ ++ if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) { ++ PFM_INFO("not an Intel processor"); ++ return -1; ++ } ++ ++ if (current_cpu_data.x86 != 6) ++ return -1; ++ ++ switch (current_cpu_data.x86_model) { ++ case 15: /* Merom */ ++ break; ++ case 23: /* Penryn */ ++ break; ++ case 29: /* Dunnington */ ++ break; ++ default: ++ return -1; ++ } ++ ++ if (!cpu_has_apic) { ++ PFM_INFO("no Local APIC, unsupported"); ++ return -1; ++ } ++ ++ PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", ++ nmi_watchdog, atomic_read(&nmi_active), force_nmi); ++ ++ /* ++ * Intel Core processors implement DS and PEBS, no need to check ++ */ ++ if (cpu_has_pebs) ++ PFM_INFO("PEBS supported, enabled"); ++ ++ /* ++ * initialize bitmask of register with enable capability, i.e., ++ * startstop. This is used to restrict the number of registers to ++ * touch on start/stop ++ * max_enable: number of bits to scan in enable_mask = highest + 1 ++ * ++ * may be adjusted in pfm_arch_pmu_acquire() ++ */ ++ __set_bit(0, cast_ulp(enable_mask)); ++ __set_bit(1, cast_ulp(enable_mask)); ++ __set_bit(16, cast_ulp(enable_mask)); ++ __set_bit(17, cast_ulp(enable_mask)); ++ max_enable = 17+1; ++ ++ if (force_nmi) ++ pfm_core_pmu_info.flags |= PFM_X86_FL_USE_NMI; ++ ++ return 0; ++} ++ ++static int pfm_core_pmc17_check(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req) ++{ ++ struct pfm_arch_context *ctx_arch; ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ /* ++ * if user activates PEBS_ENABLE, then we need to have a valid ++ * DS Area setup. This only happens when the PEBS sampling format is ++ * used in which case PFM_X86_USE_PEBS is set. We must reject all other ++ * requests. ++ * ++ * Otherwise we may pickup stale MSR_IA32_DS_AREA values. It appears ++ * that a value of 0 for this MSR does crash the system with ++ * PEBS_ENABLE=1.
++ */ ++ if (!ctx_arch->flags.use_pebs && req->reg_value) { ++ PFM_DBG("pmc17 useable only with a PEBS sampling format"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/* ++ * detect if counters have overflowed. ++ * return: ++ * 0 : no overflow ++ * 1 : at least one overflow ++ * ++ * used by Intel Core-based processors ++ */ ++static int __kprobes pfm_core_has_ovfls(struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ u64 *cnt_mask; ++ u64 wmask, val; ++ u16 i, num; ++ ++ pmu_info = &pfm_core_pmu_info; ++ cnt_mask = ctx->regs.cnt_pmds; ++ num = ctx->regs.num_counters; ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(cnt_mask))) { ++ rdmsrl(pfm_core_pmd_desc[i].hw_addr, val); ++ if (!(val & wmask)) ++ return 1; ++ num--; ++ } ++ } ++ return 0; ++} ++ ++static int pfm_core_stop_save(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_ds_area_intel_core *ds = NULL; ++ u64 used_mask[PFM_PMC_BV]; ++ u64 *cnt_mask; ++ u64 val, wmask, ovfl_mask; ++ u16 count, has_ovfl; ++ u16 i, pebs_idx = ~0; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ ++ /* ++ * used enable pmc bitmask ++ */ ++ bitmap_and(cast_ulp(used_mask), ++ cast_ulp(set->used_pmcs), ++ cast_ulp(enable_mask), ++ max_enable); ++ ++ count = bitmap_weight(cast_ulp(used_mask), max_enable); ++ /* ++ * stop monitoring ++ * Unfortunately, this is very expensive! ++ * wrmsrl() is serializing. ++ */ ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(used_mask))) { ++ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); ++ count--; ++ } ++ } ++ /* ++ * if we already have a pending overflow condition, we simply ++ * return to take care of this first.
++ */ ++ if (set->npend_ovfls) ++ return 1; ++ ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ cnt_mask = ctx->regs.cnt_pmds; ++ ++ if (ctx_arch->flags.use_pebs) { ++ ds = ctx_arch->ds_area; ++ pebs_idx = 0; /* PMC0/PMD0 */ ++ PFM_DBG("ds=%p pebs_idx=0x%llx thres=0x%llx", ++ ds, ++ (unsigned long long)ds->pebs_index, ++ (unsigned long long)ds->pebs_intr_thres); ++ } ++ ++ /* ++ * Check for pending overflows and save PMDs (combo) ++ * We employ used_pmds and not intr_pmds because we must ++ * also saved on PMD registers. ++ * Must check for counting PMDs because of virtual PMDs ++ * ++ * XXX: should use the ovf_status register instead, yet ++ * we would have to check if NMI is used and fallback ++ * to individual pmd inspection. ++ */ ++ count = set->nused_pmds; ++ ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(set->used_pmds))) { ++ val = pfm_arch_read_pmd(ctx, i); ++ if (likely(test_bit(i, cast_ulp(cnt_mask)))) { ++ if (i == pebs_idx) ++ has_ovfl = (ds->pebs_index >= ++ ds->pebs_intr_thres); ++ else ++ has_ovfl = !(val & wmask); ++ if (has_ovfl) { ++ __set_bit(i, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ } ++ val = (set->pmds[i].value & ~ovfl_mask) ++ | (val & ovfl_mask); ++ } ++ set->pmds[i].value = val; ++ count--; ++ } ++ } ++ /* 0 means: no need to save PMDs at upper level */ ++ return 0; ++} ++ ++/** ++ * pfm_core_quiesce - stop monitoring without grabbing any lock ++ * ++ * called from NMI interrupt handler to immediately stop monitoring ++ * cannot grab any lock, including perfmon related locks ++ */ ++static void __kprobes pfm_core_quiesce(void) ++{ ++ /* ++ * quiesce PMU by clearing available registers that have ++ * the start/stop capability ++ */ ++ if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_P6_EVNTSEL0, 0); ++ if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_P6_EVNTSEL1, 0); ++ if (test_bit(16, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); ++ if 
(test_bit(17, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_IA32_PEBS_ENABLE, 0); ++} ++/** ++ * pfm_core_restore_pmcs - reload PMC registers ++ * @ctx: context to restore from ++ * @set: current event set ++ * ++ * optimized version of pfm_arch_restore_pmcs(). On Core, we can ++ * afford to only restore the pmcs registers we use, because they are ++ * all independent from each other. ++ */ ++static void pfm_core_restore_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ struct pfm_arch_context *ctx_arch; ++ u64 *mask; ++ u16 i, num; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ /* ++ * must restore DS pointer before restoring PMCs ++ * as this can potentially reactivate monitoring ++ */ ++ if (ctx_arch->flags.use_ds) ++ wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area); ++ ++ mask = set->used_pmcs; ++ num = set->nused_pmcs; ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(mask))) { ++ wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, set->pmcs[i]); ++ num--; ++ } ++ } ++} ++ ++/* ++ * Counters may have model-specific width which can be probed using ++ * the CPUID.0xa leaf. Yet, the documentation says: " ++ * In the initial implementation, only the read bit width is reported ++ * by CPUID, write operations are limited to the low 32 bits. ++ * Bits [w-32] are sign extensions of bit 31. As such the effective width ++ * of a counter is 31 bits only. 
++ */ ++static struct pfm_pmu_config pfm_core_pmu_conf = { ++ .pmu_name = "Intel Core", ++ .pmd_desc = pfm_core_pmd_desc, ++ .counter_width = 31, ++ .num_pmc_entries = PFM_CORE_NUM_PMCS, ++ .num_pmd_entries = PFM_CORE_NUM_PMDS, ++ .pmc_desc = pfm_core_pmc_desc, ++ .probe_pmu = pfm_core_probe_pmu, ++ .version = "1.2", ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmu_info = &pfm_core_pmu_info, ++ .pmc_write_check = pfm_core_pmc17_check ++}; ++ ++static int __init pfm_core_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_core_pmu_conf); ++} ++ ++static void __exit pfm_core_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_core_pmu_conf); ++} ++ ++module_init(pfm_core_pmu_init_module); ++module_exit(pfm_core_pmu_cleanup_module); +diff --git a/arch/x86/perfmon/perfmon_p4.c b/arch/x86/perfmon/perfmon_p4.c +new file mode 100644 +index 0000000..1ffcf3c +--- /dev/null ++++ b/arch/x86/perfmon/perfmon_p4.c +@@ -0,0 +1,913 @@ ++/* ++ * This file contains the P4/Xeon PMU register description tables ++ * for both 32 and 64 bit modes. ++ * ++ * Copyright (c) 2005 Intel Corporation ++ * Contributed by Bryan Wilkerson <bryan.p.wilkerson@intel.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++#include <linux/kprobes.h> ++#include <linux/nmi.h> ++#include <asm/msr.h> ++#include <asm/apic.h> ++ ++MODULE_AUTHOR("Bryan Wilkerson <bryan.p.wilkerson@intel.com>"); ++MODULE_DESCRIPTION("P4/Xeon/EM64T PMU description table"); ++MODULE_LICENSE("GPL"); ++ ++static int force; ++MODULE_PARM_DESC(force, "bool: force module to load succesfully"); ++module_param(force, bool, 0600); ++ ++static int force_nmi; ++MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); ++module_param(force_nmi, bool, 0600); ++ ++/* ++ * For extended register information in addition to address that is used ++ * at runtime to figure out the mapping of reg addresses to logical procs ++ * and association of registers to hardware specific features ++ */ ++struct pfm_p4_regmap { ++ /* ++ * one each for the logical CPUs. Index 0 corresponds to T0 and ++ * index 1 corresponds to T1. Index 1 can be zero if no T1 ++ * complement reg exists. ++ */ ++ unsigned long addrs[2]; /* 2 = number of threads */ ++ unsigned int ctr; /* for CCCR/PERFEVTSEL, associated counter */ ++ unsigned int reg_type; ++}; ++ ++/* ++ * bitmask for pfm_p4_regmap.reg_type ++ */ ++#define PFM_REGT_NA 0x0000 /* not available */ ++#define PFM_REGT_EN 0x0001 /* has enable bit (cleared on ctxsw) */ ++#define PFM_REGT_ESCR 0x0002 /* P4: ESCR */ ++#define PFM_REGT_CCCR 0x0004 /* P4: CCCR */ ++#define PFM_REGT_PEBS 0x0010 /* PEBS related */ ++#define PFM_REGT_NOHT 0x0020 /* unavailable with HT */ ++#define PFM_REGT_CTR 0x0040 /* counter */ ++ ++/* ++ * architecture specific context extension. 
++ * located at: (struct pfm_arch_context *)(ctx+1) ++ */ ++struct pfm_arch_p4_context { ++ u32 npend_ovfls; /* P4 NMI #pending ovfls */ ++ u32 reserved; ++ u64 povfl_pmds[PFM_PMD_BV]; /* P4 NMI overflowed counters */ ++ u64 saved_cccrs[PFM_MAX_PMCS]; ++}; ++ ++/* ++ * ESCR reserved bitmask: ++ * - bits 31 - 63 reserved ++ * - T1_OS and T1_USR bits are reserved - set depending on logical proc ++ * user mode application should use T0_OS and T0_USR to indicate ++ * RSVD: reserved bits must be 1 ++ */ ++#define PFM_ESCR_RSVD (~0x000000007ffffffcULL) ++ ++/* ++ * CCCR default value: ++ * - OVF_PMI_T0=1 (bit 26) ++ * - OVF_PMI_T1=0 (bit 27) (set if necessary in pfm_write_reg()) ++ * - bits 16-17 are set to 3, all other bits are zero ++ * ++ * OVF_PMI is forced to zero if PFM_REGFL_NO_EMUL64 is set on CCCR ++ */ ++#define PFM_CCCR_DFL ((1ULL<<26) | (3ULL<<16)) ++ ++/* ++ * CCCR reserved fields: ++ * - bits 0-11, 25-29, 31-63 ++ * - OVF_PMI (26-27), override with REGFL_NO_EMUL64 ++ * ++ * RSVD: reserved bits must be 1 ++ */ ++#define PFM_CCCR_RSVD (~((0xfull<<12) \ ++ | (0x7full<<18) \ ++ | (0x1ull<<30))) ++ ++#define PFM_P4_NO64 (3ULL<<26) /* use 3 even in non HT mode */ ++ ++#define PEBS_PMD 8 /* thread0: IQ_CTR4, thread1: IQ_CTR5 */ ++ ++/* ++ * With HyperThreading enabled: ++ * ++ * The ESCRs and CCCRs are divided in half with the top half ++ * belonging to logical processor 0 and the bottom half going to ++ * logical processor 1. Thus only half of the PMU resources are ++ * accessible to applications. ++ * ++ * PEBS is not available due to the fact that: ++ * - MSR_PEBS_MATRIX_VERT is shared between the threads ++ * - IA32_PEBS_ENABLE is shared between the threads ++ * ++ * With HyperThreading disabled: ++ * ++ * The full set of PMU resources is exposed to applications. ++ * ++ * The mapping is chosen such that PMCxx -> MSR is the same ++ * in HT and non HT mode, if register is present in HT mode.
++ * ++ */ ++#define PFM_REGT_NHTESCR (PFM_REGT_ESCR|PFM_REGT_NOHT) ++#define PFM_REGT_NHTCCCR (PFM_REGT_CCCR|PFM_REGT_NOHT|PFM_REGT_EN) ++#define PFM_REGT_NHTPEBS (PFM_REGT_PEBS|PFM_REGT_NOHT|PFM_REGT_EN) ++#define PFM_REGT_NHTCTR (PFM_REGT_CTR|PFM_REGT_NOHT) ++#define PFM_REGT_ENAC (PFM_REGT_CCCR|PFM_REGT_EN) ++ ++static void pfm_p4_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value); ++static void pfm_p4_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value); ++static u64 pfm_p4_read_pmd(struct pfm_context *ctx, unsigned int cnum); ++static u64 pfm_p4_read_pmc(struct pfm_context *ctx, unsigned int cnum); ++static int pfm_p4_create_context(struct pfm_context *ctx, u32 ctx_flags); ++static void pfm_p4_free_context(struct pfm_context *ctx); ++static int pfm_p4_has_ovfls(struct pfm_context *ctx); ++static int pfm_p4_stop_save(struct pfm_context *ctx, struct pfm_event_set *set); ++static void pfm_p4_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); ++static void pfm_p4_nmi_copy_state(struct pfm_context *ctx); ++static void __kprobes pfm_p4_quiesce(void); ++ ++static u64 enable_mask[PFM_MAX_PMCS]; ++static u16 max_enable; ++ ++static struct pfm_p4_regmap pmc_addrs[PFM_MAX_PMCS] = { ++ /*pmc 0 */ {{MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1}, 0, PFM_REGT_ESCR}, /* BPU_ESCR0,1 */ ++ /*pmc 1 */ {{MSR_P4_IS_ESCR0, MSR_P4_IS_ESCR1}, 0, PFM_REGT_ESCR}, /* IS_ESCR0,1 */ ++ /*pmc 2 */ {{MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1}, 0, PFM_REGT_ESCR}, /* MOB_ESCR0,1 */ ++ /*pmc 3 */ {{MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1}, 0, PFM_REGT_ESCR}, /* ITLB_ESCR0,1 */ ++ /*pmc 4 */ {{MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1}, 0, PFM_REGT_ESCR}, /* PMH_ESCR0,1 */ ++ /*pmc 5 */ {{MSR_P4_IX_ESCR0, MSR_P4_IX_ESCR1}, 0, PFM_REGT_ESCR}, /* IX_ESCR0,1 */ ++ /*pmc 6 */ {{MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1}, 0, PFM_REGT_ESCR}, /* FSB_ESCR0,1 */ ++ /*pmc 7 */ {{MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1}, 0, PFM_REGT_ESCR}, /* BSU_ESCR0,1 */ ++ /*pmc 8 */ {{MSR_P4_MS_ESCR0, 
MSR_P4_MS_ESCR1}, 0, PFM_REGT_ESCR}, /* MS_ESCR0,1 */ ++ /*pmc 9 */ {{MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1}, 0, PFM_REGT_ESCR}, /* TC_ESCR0,1 */ ++ /*pmc 10*/ {{MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1}, 0, PFM_REGT_ESCR}, /* TBPU_ESCR0,1 */ ++ /*pmc 11*/ {{MSR_P4_FLAME_ESCR0, MSR_P4_FLAME_ESCR1}, 0, PFM_REGT_ESCR}, /* FLAME_ESCR0,1 */ ++ /*pmc 12*/ {{MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1}, 0, PFM_REGT_ESCR}, /* FIRM_ESCR0,1 */ ++ /*pmc 13*/ {{MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1}, 0, PFM_REGT_ESCR}, /* SAAT_ESCR0,1 */ ++ /*pmc 14*/ {{MSR_P4_U2L_ESCR0, MSR_P4_U2L_ESCR1}, 0, PFM_REGT_ESCR}, /* U2L_ESCR0,1 */ ++ /*pmc 15*/ {{MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1}, 0, PFM_REGT_ESCR}, /* DAC_ESCR0,1 */ ++ /*pmc 16*/ {{MSR_P4_IQ_ESCR0, MSR_P4_IQ_ESCR1}, 0, PFM_REGT_ESCR}, /* IQ_ESCR0,1 (only model 1 and 2) */ ++ /*pmc 17*/ {{MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1}, 0, PFM_REGT_ESCR}, /* ALF_ESCR0,1 */ ++ /*pmc 18*/ {{MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1}, 0, PFM_REGT_ESCR}, /* RAT_ESCR0,1 */ ++ /*pmc 19*/ {{MSR_P4_SSU_ESCR0, 0}, 0, PFM_REGT_ESCR}, /* SSU_ESCR0 */ ++ /*pmc 20*/ {{MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1}, 0, PFM_REGT_ESCR}, /* CRU_ESCR0,1 */ ++ /*pmc 21*/ {{MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3}, 0, PFM_REGT_ESCR}, /* CRU_ESCR2,3 */ ++ /*pmc 22*/ {{MSR_P4_CRU_ESCR4, MSR_P4_CRU_ESCR5}, 0, PFM_REGT_ESCR}, /* CRU_ESCR4,5 */ ++ ++ /*pmc 23*/ {{MSR_P4_BPU_CCCR0, MSR_P4_BPU_CCCR2}, 0, PFM_REGT_ENAC}, /* BPU_CCCR0,2 */ ++ /*pmc 24*/ {{MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3}, 1, PFM_REGT_ENAC}, /* BPU_CCCR1,3 */ ++ /*pmc 25*/ {{MSR_P4_MS_CCCR0, MSR_P4_MS_CCCR2}, 2, PFM_REGT_ENAC}, /* MS_CCCR0,2 */ ++ /*pmc 26*/ {{MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3}, 3, PFM_REGT_ENAC}, /* MS_CCCR1,3 */ ++ /*pmc 27*/ {{MSR_P4_FLAME_CCCR0, MSR_P4_FLAME_CCCR2}, 4, PFM_REGT_ENAC}, /* FLAME_CCCR0,2 */ ++ /*pmc 28*/ {{MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3}, 5, PFM_REGT_ENAC}, /* FLAME_CCCR1,3 */ ++ /*pmc 29*/ {{MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR2}, 6, PFM_REGT_ENAC}, /* IQ_CCCR0,2 */ ++ /*pmc 30*/ {{MSR_P4_IQ_CCCR1, 
MSR_P4_IQ_CCCR3}, 7, PFM_REGT_ENAC}, /* IQ_CCCR1,3 */ ++ /*pmc 31*/ {{MSR_P4_IQ_CCCR4, MSR_P4_IQ_CCCR5}, 8, PFM_REGT_ENAC}, /* IQ_CCCR4,5 */ ++ /* non HT extensions */ ++ /*pmc 32*/ {{MSR_P4_BPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BPU_ESCR1 */ ++ /*pmc 33*/ {{MSR_P4_IS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IS_ESCR1 */ ++ /*pmc 34*/ {{MSR_P4_MOB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MOB_ESCR1 */ ++ /*pmc 35*/ {{MSR_P4_ITLB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ITLB_ESCR1 */ ++ /*pmc 36*/ {{MSR_P4_PMH_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* PMH_ESCR1 */ ++ /*pmc 37*/ {{MSR_P4_IX_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IX_ESCR1 */ ++ /*pmc 38*/ {{MSR_P4_FSB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FSB_ESCR1 */ ++ /*pmc 39*/ {{MSR_P4_BSU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BSU_ESCR1 */ ++ /*pmc 40*/ {{MSR_P4_MS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MS_ESCR1 */ ++ /*pmc 41*/ {{MSR_P4_TC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TC_ESCR1 */ ++ /*pmc 42*/ {{MSR_P4_TBPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TBPU_ESCR1 */ ++ /*pmc 43*/ {{MSR_P4_FLAME_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FLAME_ESCR1 */ ++ /*pmc 44*/ {{MSR_P4_FIRM_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FIRM_ESCR1 */ ++ /*pmc 45*/ {{MSR_P4_SAAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* SAAT_ESCR1 */ ++ /*pmc 46*/ {{MSR_P4_U2L_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* U2L_ESCR1 */ ++ /*pmc 47*/ {{MSR_P4_DAC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* DAC_ESCR1 */ ++ /*pmc 48*/ {{MSR_P4_IQ_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IQ_ESCR1 (only model 1 and 2) */ ++ /*pmc 49*/ {{MSR_P4_ALF_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ALF_ESCR1 */ ++ /*pmc 50*/ {{MSR_P4_RAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* RAT_ESCR1 */ ++ /*pmc 51*/ {{MSR_P4_CRU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR1 */ ++ /*pmc 52*/ {{MSR_P4_CRU_ESCR3, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR3 */ ++ /*pmc 53*/ {{MSR_P4_CRU_ESCR5, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR5 */ ++ /*pmc 54*/ {{MSR_P4_BPU_CCCR2, 0}, 9, PFM_REGT_NHTCCCR}, /* BPU_CCCR2 */ ++ /*pmc 55*/ {{MSR_P4_BPU_CCCR3, 0}, 10, PFM_REGT_NHTCCCR},
/* BPU_CCCR3 */ ++ /*pmc 56*/ {{MSR_P4_MS_CCCR2, 0}, 11, PFM_REGT_NHTCCCR}, /* MS_CCCR2 */ ++ /*pmc 57*/ {{MSR_P4_MS_CCCR3, 0}, 12, PFM_REGT_NHTCCCR}, /* MS_CCCR3 */ ++ /*pmc 58*/ {{MSR_P4_FLAME_CCCR2, 0}, 13, PFM_REGT_NHTCCCR}, /* FLAME_CCCR2 */ ++ /*pmc 59*/ {{MSR_P4_FLAME_CCCR3, 0}, 14, PFM_REGT_NHTCCCR}, /* FLAME_CCCR3 */ ++ /*pmc 60*/ {{MSR_P4_IQ_CCCR2, 0}, 15, PFM_REGT_NHTCCCR}, /* IQ_CCCR2 */ ++ /*pmc 61*/ {{MSR_P4_IQ_CCCR3, 0}, 16, PFM_REGT_NHTCCCR}, /* IQ_CCCR3 */ ++ /*pmc 62*/ {{MSR_P4_IQ_CCCR5, 0}, 17, PFM_REGT_NHTCCCR}, /* IQ_CCCR5 */ ++ /*pmc 63*/ {{0x3f2, 0}, 0, PFM_REGT_NHTPEBS},/* PEBS_MATRIX_VERT */ ++ /*pmc 64*/ {{0x3f1, 0}, 0, PFM_REGT_NHTPEBS} /* PEBS_ENABLE */ ++}; ++ ++static struct pfm_p4_regmap pmd_addrs[PFM_MAX_PMDS] = { ++ /*pmd 0 */ {{MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_PERFCTR2}, 0, PFM_REGT_CTR}, /* BPU_CTR0,2 */ ++ /*pmd 1 */ {{MSR_P4_BPU_PERFCTR1, MSR_P4_BPU_PERFCTR3}, 0, PFM_REGT_CTR}, /* BPU_CTR1,3 */ ++ /*pmd 2 */ {{MSR_P4_MS_PERFCTR0, MSR_P4_MS_PERFCTR2}, 0, PFM_REGT_CTR}, /* MS_CTR0,2 */ ++ /*pmd 3 */ {{MSR_P4_MS_PERFCTR1, MSR_P4_MS_PERFCTR3}, 0, PFM_REGT_CTR}, /* MS_CTR1,3 */ ++ /*pmd 4 */ {{MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_PERFCTR2}, 0, PFM_REGT_CTR}, /* FLAME_CTR0,2 */ ++ /*pmd 5 */ {{MSR_P4_FLAME_PERFCTR1, MSR_P4_FLAME_PERFCTR3}, 0, PFM_REGT_CTR}, /* FLAME_CTR1,3 */ ++ /*pmd 6 */ {{MSR_P4_IQ_PERFCTR0, MSR_P4_IQ_PERFCTR2}, 0, PFM_REGT_CTR}, /* IQ_CTR0,2 */ ++ /*pmd 7 */ {{MSR_P4_IQ_PERFCTR1, MSR_P4_IQ_PERFCTR3}, 0, PFM_REGT_CTR}, /* IQ_CTR1,3 */ ++ /*pmd 8 */ {{MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_PERFCTR5}, 0, PFM_REGT_CTR}, /* IQ_CTR4,5 */ ++ /* ++ * non HT extensions ++ */ ++ /*pmd 9 */ {{MSR_P4_BPU_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR2 */ ++ /*pmd 10*/ {{MSR_P4_BPU_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR3 */ ++ /*pmd 11*/ {{MSR_P4_MS_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR2 */ ++ /*pmd 12*/ {{MSR_P4_MS_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR3 */ ++ /*pmd 13*/ {{MSR_P4_FLAME_PERFCTR2, 0}, 0,
PFM_REGT_NHTCTR}, /* FLAME_CTR2 */ ++ /*pmd 14*/ {{MSR_P4_FLAME_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR3 */ ++ /*pmd 15*/ {{MSR_P4_IQ_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR2 */ ++ /*pmd 16*/ {{MSR_P4_IQ_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR3 */ ++ /*pmd 17*/ {{MSR_P4_IQ_PERFCTR5, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR5 */ ++}; ++ ++static struct pfm_arch_pmu_info pfm_p4_pmu_info = { ++ .write_pmc = pfm_p4_write_pmc, ++ .write_pmd = pfm_p4_write_pmd, ++ .read_pmc = pfm_p4_read_pmc, ++ .read_pmd = pfm_p4_read_pmd, ++ .create_context = pfm_p4_create_context, ++ .free_context = pfm_p4_free_context, ++ .has_ovfls = pfm_p4_has_ovfls, ++ .stop_save = pfm_p4_stop_save, ++ .restore_pmcs = pfm_p4_restore_pmcs, ++ .nmi_copy_state = pfm_p4_nmi_copy_state, ++ .quiesce = pfm_p4_quiesce ++}; ++ ++static struct pfm_regmap_desc pfm_p4_pmc_desc[] = { ++/* pmc0 */ PMC_D(PFM_REG_I, "BPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR0), ++/* pmc1 */ PMC_D(PFM_REG_I, "IS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IS_ESCR0), ++/* pmc2 */ PMC_D(PFM_REG_I, "MOB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR0), ++/* pmc3 */ PMC_D(PFM_REG_I, "ITLB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR0), ++/* pmc4 */ PMC_D(PFM_REG_I, "PMH_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR0), ++/* pmc5 */ PMC_D(PFM_REG_I, "IX_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR0), ++/* pmc6 */ PMC_D(PFM_REG_I, "FSB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR0), ++/* pmc7 */ PMC_D(PFM_REG_I, "BSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR0), ++/* pmc8 */ PMC_D(PFM_REG_I, "MS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR0), ++/* pmc9 */ PMC_D(PFM_REG_I, "TC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR0), ++/* pmc10 */ PMC_D(PFM_REG_I, "TBPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR0), ++/* pmc11 */ PMC_D(PFM_REG_I, "FLAME_ESCR0", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR0), ++/* pmc12 */ PMC_D(PFM_REG_I, "FIRM_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR0), ++/* pmc13 */
PMC_D(PFM_REG_I, "SAAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR0), ++/* pmc14 */ PMC_D(PFM_REG_I, "U2L_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR0), ++/* pmc15 */ PMC_D(PFM_REG_I, "DAC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR0), ++/* pmc16 */ PMC_D(PFM_REG_I, "IQ_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0), /* only model 1 and 2*/ ++/* pmc17 */ PMC_D(PFM_REG_I, "ALF_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR0), ++/* pmc18 */ PMC_D(PFM_REG_I, "RAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR0), ++/* pmc19 */ PMC_D(PFM_REG_I, "SSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SSU_ESCR0), ++/* pmc20 */ PMC_D(PFM_REG_I, "CRU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR0), ++/* pmc21 */ PMC_D(PFM_REG_I, "CRU_ESCR2" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR2), ++/* pmc22 */ PMC_D(PFM_REG_I, "CRU_ESCR4" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR4), ++/* pmc23 */ PMC_D(PFM_REG_I64, "BPU_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR0), ++/* pmc24 */ PMC_D(PFM_REG_I64, "BPU_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR1), ++/* pmc25 */ PMC_D(PFM_REG_I64, "MS_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR0), ++/* pmc26 */ PMC_D(PFM_REG_I64, "MS_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR1), ++/* pmc27 */ PMC_D(PFM_REG_I64, "FLAME_CCCR0", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR0), ++/* pmc28 */ PMC_D(PFM_REG_I64, "FLAME_CCCR1", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR1), ++/* pmc29 */ PMC_D(PFM_REG_I64, "IQ_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR0), ++/* pmc30 */ PMC_D(PFM_REG_I64, "IQ_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR1), ++/* pmc31 */ PMC_D(PFM_REG_I64, "IQ_CCCR4" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR4), ++ /* No HT extension */ ++/* pmc32 */ PMC_D(PFM_REG_I, "BPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR1), ++/* pmc33 */ PMC_D(PFM_REG_I, 
"IS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IS_ESCR1), ++/* pmc34 */ PMC_D(PFM_REG_I, "MOB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR1), ++/* pmc35 */ PMC_D(PFM_REG_I, "ITLB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR1), ++/* pmc36 */ PMC_D(PFM_REG_I, "PMH_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR1), ++/* pmc37 */ PMC_D(PFM_REG_I, "IX_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR1), ++/* pmc38 */ PMC_D(PFM_REG_I, "FSB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR1), ++/* pmc39 */ PMC_D(PFM_REG_I, "BSU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR1), ++/* pmc40 */ PMC_D(PFM_REG_I, "MS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR1), ++/* pmc41 */ PMC_D(PFM_REG_I, "TC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR1), ++/* pmc42 */ PMC_D(PFM_REG_I, "TBPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR1), ++/* pmc43 */ PMC_D(PFM_REG_I, "FLAME_ESCR1", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR1), ++/* pmc44 */ PMC_D(PFM_REG_I, "FIRM_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR1), ++/* pmc45 */ PMC_D(PFM_REG_I, "SAAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR1), ++/* pmc46 */ PMC_D(PFM_REG_I, "U2L_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR1), ++/* pmc47 */ PMC_D(PFM_REG_I, "DAC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR1), ++/* pmc48 */ PMC_D(PFM_REG_I, "IQ_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR1), /* only model 1 and 2 */ ++/* pmc49 */ PMC_D(PFM_REG_I, "ALF_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR1), ++/* pmc50 */ PMC_D(PFM_REG_I, "RAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR1), ++/* pmc51 */ PMC_D(PFM_REG_I, "CRU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR1), ++/* pmc52 */ PMC_D(PFM_REG_I, "CRU_ESCR3" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR3), ++/* pmc53 */ PMC_D(PFM_REG_I, "CRU_ESCR5" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR5), ++/* pmc54 */ PMC_D(PFM_REG_I64, "BPU_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR2), ++/* pmc55 */ PMC_D(PFM_REG_I64, "BPU_CCCR3" , PFM_CCCR_DFL, 
PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR3), ++/* pmc56 */ PMC_D(PFM_REG_I64, "MS_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR2), ++/* pmc57 */ PMC_D(PFM_REG_I64, "MS_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR3), ++/* pmc58 */ PMC_D(PFM_REG_I64, "FLAME_CCCR2", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR2), ++/* pmc59 */ PMC_D(PFM_REG_I64, "FLAME_CCCR3", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR3), ++/* pmc60 */ PMC_D(PFM_REG_I64, "IQ_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR2), ++/* pmc61 */ PMC_D(PFM_REG_I64, "IQ_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR3), ++/* pmc62 */ PMC_D(PFM_REG_I64, "IQ_CCCR5" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR5), ++/* pmc63 */ PMC_D(PFM_REG_I, "PEBS_MATRIX_VERT", 0, 0xffffffffffffffecULL, 0, 0x3f2), ++/* pmc64 */ PMC_D(PFM_REG_I, "PEBS_ENABLE", 0, 0xfffffffff8ffe000ULL, 0, 0x3f1) ++}; ++#define PFM_P4_NUM_PMCS ARRAY_SIZE(pfm_p4_pmc_desc) ++ ++/* ++ * See section 15.10.6.6 for details about the IQ block ++ */ ++static struct pfm_regmap_desc pfm_p4_pmd_desc[] = { ++/* pmd0 */ PMD_D(PFM_REG_C, "BPU_CTR0", MSR_P4_BPU_PERFCTR0), ++/* pmd1 */ PMD_D(PFM_REG_C, "BPU_CTR1", MSR_P4_BPU_PERFCTR1), ++/* pmd2 */ PMD_D(PFM_REG_C, "MS_CTR0", MSR_P4_MS_PERFCTR0), ++/* pmd3 */ PMD_D(PFM_REG_C, "MS_CTR1", MSR_P4_MS_PERFCTR1), ++/* pmd4 */ PMD_D(PFM_REG_C, "FLAME_CTR0", MSR_P4_FLAME_PERFCTR0), ++/* pmd5 */ PMD_D(PFM_REG_C, "FLAME_CTR1", MSR_P4_FLAME_PERFCTR1), ++/* pmd6 */ PMD_D(PFM_REG_C, "IQ_CTR0", MSR_P4_IQ_PERFCTR0), ++/* pmd7 */ PMD_D(PFM_REG_C, "IQ_CTR1", MSR_P4_IQ_PERFCTR1), ++/* pmd8 */ PMD_D(PFM_REG_C, "IQ_CTR4", MSR_P4_IQ_PERFCTR4), ++ /* no HT extension */ ++/* pmd9 */ PMD_D(PFM_REG_C, "BPU_CTR2", MSR_P4_BPU_PERFCTR2), ++/* pmd10 */ PMD_D(PFM_REG_C, "BPU_CTR3", MSR_P4_BPU_PERFCTR3), ++/* pmd11 */ PMD_D(PFM_REG_C, "MS_CTR2", MSR_P4_MS_PERFCTR2), ++/* pmd12 */ PMD_D(PFM_REG_C, "MS_CTR3", 
MSR_P4_MS_PERFCTR3), ++/* pmd13 */ PMD_D(PFM_REG_C, "FLAME_CTR2", MSR_P4_FLAME_PERFCTR2), ++/* pmd14 */ PMD_D(PFM_REG_C, "FLAME_CTR3", MSR_P4_FLAME_PERFCTR3), ++/* pmd15 */ PMD_D(PFM_REG_C, "IQ_CTR2", MSR_P4_IQ_PERFCTR2), ++/* pmd16 */ PMD_D(PFM_REG_C, "IQ_CTR3", MSR_P4_IQ_PERFCTR3), ++/* pmd17 */ PMD_D(PFM_REG_C, "IQ_CTR5", MSR_P4_IQ_PERFCTR5) ++}; ++#define PFM_P4_NUM_PMDS ARRAY_SIZE(pfm_p4_pmd_desc) ++ ++/* ++ * Due to hotplug CPU support, threads may not necessarily ++ * be activated at the time the module is inserted. We need ++ * to check whether they could be activated by looking at ++ * the present CPU (present != online). ++ */ ++static int pfm_p4_probe_pmu(void) ++{ ++ unsigned int i; ++ int ht_enabled; ++ ++ /* ++ * only works on Intel processors ++ */ ++ if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) { ++ PFM_INFO("not running on Intel processor"); ++ return -1; ++ } ++ ++ if (current_cpu_data.x86 != 15) { ++ PFM_INFO("unsupported family=%d", current_cpu_data.x86); ++ return -1; ++ } ++ ++ switch (current_cpu_data.x86_model) { ++ case 0 ... 2: ++ break; ++ case 3 ... 6: ++ /* ++ * IQ_ESCR0, IQ_ESCR1 only present on model 1, 2 ++ */ ++ pfm_p4_pmc_desc[16].type = PFM_REG_NA; ++ pfm_p4_pmc_desc[48].type = PFM_REG_NA; ++ break; ++ default: ++ /* ++ * do not know if they all work the same, so reject ++ * for now ++ */ ++ if (!force) { ++ PFM_INFO("unsupported model %d", ++ current_cpu_data.x86_model); ++ return -1; ++ } ++ } ++ ++ /* ++ * check for local APIC (required) ++ */ ++ if (!cpu_has_apic) { ++ PFM_INFO("no local APIC, unsupported"); ++ return -1; ++ } ++#ifdef CONFIG_SMP ++ ht_enabled = (cpus_weight(__get_cpu_var(cpu_core_map)) ++ / current_cpu_data.x86_max_cores) > 1; ++#else ++ ht_enabled = 0; ++#endif ++ if (cpu_has_ht) { ++ ++ PFM_INFO("HyperThreading supported, status %s", ++ ht_enabled ? 
"on": "off"); ++ /* ++ * disable registers not supporting HT ++ */ ++ if (ht_enabled) { ++ PFM_INFO("disabling half the registers for HT"); ++ for (i = 0; i < PFM_P4_NUM_PMCS; i++) { ++ if (pmc_addrs[(i)].reg_type & PFM_REGT_NOHT) ++ pfm_p4_pmc_desc[i].type = PFM_REG_NA; ++ } ++ for (i = 0; i < PFM_P4_NUM_PMDS; i++) { ++ if (pmd_addrs[(i)].reg_type & PFM_REGT_NOHT) ++ pfm_p4_pmd_desc[i].type = PFM_REG_NA; ++ } ++ } ++ } ++ ++ if (cpu_has_ds) { ++ PFM_INFO("Data Save Area (DS) supported"); ++ ++ if (cpu_has_pebs) { ++ /* ++ * PEBS does not work with HyperThreading enabled ++ */ ++ if (ht_enabled) ++ PFM_INFO("PEBS supported, status off (because of HT)"); ++ else ++ PFM_INFO("PEBS supported, status on"); ++ } ++ } ++ ++ /* ++ * build enable mask ++ */ ++ for (i = 0; i < PFM_P4_NUM_PMCS; i++) { ++ if (pmc_addrs[(i)].reg_type & PFM_REGT_EN) { ++ __set_bit(i, cast_ulp(enable_mask)); ++ max_enable = i + 1; ++ } ++ } ++ ++ if (force_nmi) ++ pfm_p4_pmu_info.flags |= PFM_X86_FL_USE_NMI; ++ return 0; ++} ++static inline int get_smt_id(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu = smp_processor_id(); ++ return (cpu != first_cpu(__get_cpu_var(cpu_sibling_map))); ++#else ++ return 0; ++#endif ++} ++ ++static void __pfm_write_reg_p4(const struct pfm_p4_regmap *xreg, u64 val) ++{ ++ u64 pmi; ++ int smt_id; ++ ++ smt_id = get_smt_id(); ++ /* ++ * HT is only supported by P4-style PMU ++ * ++ * Adjust for T1 if necessary: ++ * ++ * - move the T0_OS/T0_USR bits into T1 slots ++ * - move the OVF_PMI_T0 bits into T1 slot ++ * ++ * The P4/EM64T T1 is cleared by description table. ++ * User only works with T0. 
++ */ ++ if (smt_id) { ++ if (xreg->reg_type & PFM_REGT_ESCR) { ++ ++ /* copy T0_USR & T0_OS to T1 */ ++ val |= ((val & 0xc) >> 2); ++ ++ /* clear bits T0_USR & T0_OS */ ++ val &= ~0xc; ++ ++ } else if (xreg->reg_type & PFM_REGT_CCCR) { ++ pmi = (val >> 26) & 0x1; ++ if (pmi) { ++ val &= ~(1ULL<<26); ++ val |= 1ULL<<27; ++ } ++ } ++ } ++ if (xreg->addrs[smt_id]) ++ wrmsrl(xreg->addrs[smt_id], val); ++} ++ ++void __pfm_read_reg_p4(const struct pfm_p4_regmap *xreg, u64 *val) ++{ ++ int smt_id; ++ ++ smt_id = get_smt_id(); ++ ++ if (likely(xreg->addrs[smt_id])) { ++ rdmsrl(xreg->addrs[smt_id], *val); ++ /* ++ * HT is only supported by P4-style PMU ++ * ++ * move the Tx_OS and Tx_USR bits into ++ * T0 slots setting the T1 slots to zero ++ */ ++ if (xreg->reg_type & PFM_REGT_ESCR) { ++ if (smt_id) ++ *val |= (((*val) & 0x3) << 2); ++ ++ /* ++ * zero out bits that are reserved (including T1_OS and ++ * T1_USR): PFM_ESCR_RSVD has reserved bits set, so invert it ++ */ ++ *val &= ~(PFM_ESCR_RSVD); ++ } ++ } else { ++ *val = 0; ++ } ++} ++static void pfm_p4_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value) ++{ ++ __pfm_write_reg_p4(&pmc_addrs[cnum], value); ++} ++ ++static void pfm_p4_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value) ++{ ++ __pfm_write_reg_p4(&pmd_addrs[cnum], value); ++} ++ ++static u64 pfm_p4_read_pmd(struct pfm_context *ctx, unsigned int cnum) ++{ ++ u64 tmp; ++ __pfm_read_reg_p4(&pmd_addrs[cnum], &tmp); ++ return tmp; ++} ++ ++static u64 pfm_p4_read_pmc(struct pfm_context *ctx, unsigned int cnum) ++{ ++ u64 tmp; ++ __pfm_read_reg_p4(&pmc_addrs[cnum], &tmp); ++ return tmp; ++} ++ ++struct pfm_ds_area_p4 { ++ unsigned long bts_buf_base; ++ unsigned long bts_index; ++ unsigned long bts_abs_max; ++ unsigned long bts_intr_thres; ++ unsigned long pebs_buf_base; ++ unsigned long pebs_index; ++ unsigned long pebs_abs_max; ++ unsigned long pebs_intr_thres; ++ u64 pebs_cnt_reset; ++}; ++ ++ ++static int pfm_p4_stop_save(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct
pfm_arch_pmu_info *pmu_info; ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_ds_area_p4 *ds = NULL; ++ u64 used_mask[PFM_PMC_BV]; ++ u16 i, j, count, pebs_idx = ~0; ++ u16 max_pmc; ++ u64 cccr, ctr1, ctr2, ovfl_mask; ++ ++ pmu_info = &pfm_p4_pmu_info; ++ ctx_arch = pfm_ctx_arch(ctx); ++ max_pmc = ctx->regs.max_pmc; ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ ++ /* ++ * build used enable PMC bitmask ++ * if user did not set any CCCR, then mask is ++ * empty and there is nothing to do because nothing ++ * was started ++ */ ++ bitmap_and(cast_ulp(used_mask), ++ cast_ulp(set->used_pmcs), ++ cast_ulp(enable_mask), ++ max_enable); ++ ++ count = bitmap_weight(cast_ulp(used_mask), max_enable); ++ ++ PFM_DBG_ovfl("npend=%u ena_mask=0x%llx u_pmcs=0x%llx count=%u num=%u", ++ set->npend_ovfls, ++ (unsigned long long)enable_mask[0], ++ (unsigned long long)set->used_pmcs[0], ++ count, max_enable); ++ ++ /* ++ * ensures we do not destroy pending overflow ++ * information. If pended interrupts are already ++ * known, then we just stop monitoring. ++ */ ++ if (set->npend_ovfls) { ++ /* ++ * clear enable bit ++ * unfortunately, this is very expensive! ++ */ ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(used_mask))) { ++ __pfm_write_reg_p4(pmc_addrs+i, 0); ++ count--; ++ } ++ } ++ /* need save PMDs at upper level */ ++ return 1; ++ } ++ ++ if (ctx_arch->flags.use_pebs) { ++ ds = ctx_arch->ds_area; ++ pebs_idx = PEBS_PMD; ++ PFM_DBG("ds=%p pebs_idx=0x%llx thres=0x%llx", ++ ds, ++ (unsigned long long)ds->pebs_index, ++ (unsigned long long)ds->pebs_intr_thres); ++ } ++ ++ /* ++ * stop monitoring AND collect pending overflow information AND ++ * save pmds. ++ * ++ * We need to access the CCCR twice, once to get overflow info ++ * and a second to stop monitoring (which destroys the OVF flag) ++ * Similarly, we need to read the counter twice to check whether ++ * it did overflow between the CCR read and the CCCR write. 
++ */ ++ for (i = 0; count; i++) { ++ if (i != pebs_idx && test_bit(i, cast_ulp(used_mask))) { ++ /* ++ * controlled counter ++ */ ++ j = pmc_addrs[i].ctr; ++ ++ /* read CCCR (PMC) value */ ++ __pfm_read_reg_p4(pmc_addrs+i, &cccr); ++ ++ /* read counter (PMD) controlled by PMC */ ++ __pfm_read_reg_p4(pmd_addrs+j, &ctr1); ++ ++ /* clear CCCR value: stop counter but destroy OVF */ ++ __pfm_write_reg_p4(pmc_addrs+i, 0); ++ ++ /* read counter controlled by CCCR again */ ++ __pfm_read_reg_p4(pmd_addrs+j, &ctr2); ++ ++ /* ++ * there is an overflow if either: ++ * - CCCR.ovf is set (and we just cleared it) ++ * - ctr2 < ctr1 ++ * in that case we set the bit corresponding to the ++ * overflowed PMD in povfl_pmds. ++ */ ++ if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) { ++ __set_bit(j, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ } ++ ctr2 = (set->pmds[j].value & ~ovfl_mask) | (ctr2 & ovfl_mask); ++ set->pmds[j].value = ctr2; ++ count--; ++ } ++ } ++ /* ++ * check for PEBS buffer full and set the corresponding PMD overflow ++ */ ++ if (ctx_arch->flags.use_pebs) { ++ PFM_DBG("ds=%p pebs_idx=0x%lx thres=0x%lx", ds, ds->pebs_index, ds->pebs_intr_thres); ++ if (ds->pebs_index >= ds->pebs_intr_thres ++ && test_bit(PEBS_PMD, cast_ulp(set->used_pmds))) { ++ __set_bit(PEBS_PMD, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ } ++ } ++ /* 0 means: no need to save the PMD at higher level */ ++ return 0; ++} ++ ++static int pfm_p4_create_context(struct pfm_context *ctx, u32 ctx_flags) ++{ ++ struct pfm_arch_context *ctx_arch; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ ctx_arch->data = kzalloc(sizeof(struct pfm_arch_p4_context), GFP_KERNEL); ++ if (!ctx_arch->data) ++ return -ENOMEM; ++ ++ return 0; ++} ++ ++static void pfm_p4_free_context(struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ /* ++ * we do not check if P4, because it would be NULL and ++ * kfree can deal with NULL ++ */ ++ kfree(ctx_arch->data); ++} ++ ++/* 
++ * detect if counters have overflowed.
++ *
++ * For every enabled CCCR that is part of ctx->regs.pmcs, this routine reads
++ * the CCCR and the counter it controls, clears the CCCR (which stops the
++ * counter but also destroys the CCCR.OVF flag) and reads the counter again.
++ * Overflow state is accumulated in the per-context P4 area (povfl_pmds,
++ * npend_ovfls) and the original CCCR values are kept in saved_cccrs[] so
++ * that monitoring can be resumed when no overflow is found.
++ *
++ * return:
++ * 	0 : no overflow (monitoring has been resumed)
++ * 	1 : at least one overflow (monitoring is left stopped)
++ *
++ * used by Intel P4
++ */
++static int __kprobes pfm_p4_has_ovfls(struct pfm_context *ctx)
++{
++	struct pfm_p4_regmap *xrc, *xrd;
++	struct pfm_arch_context *ctx_arch;
++	struct pfm_arch_p4_context *p4;
++	u64 ena_mask[PFM_PMC_BV];
++	u64 cccr, ctr1, ctr2;
++	int n, i, j;
++
++	ctx_arch = pfm_ctx_arch(ctx);
++	xrc = pmc_addrs;
++	xrd = pmd_addrs;
++	p4 = ctx_arch->data;
++
++	/*
++	 * restrict the scan to registers that are both available to this
++	 * context and have the start/stop (enable) capability
++	 */
++	bitmap_and(cast_ulp(ena_mask),
++		   cast_ulp(ctx->regs.pmcs),
++		   cast_ulp(enable_mask),
++		   max_enable);
++
++	n = bitmap_weight(cast_ulp(ena_mask), max_enable);
++
++	for (i = 0; n; i++) {
++		if (!test_bit(i, cast_ulp(ena_mask)))
++			continue;
++		/*
++		 * controlled counter
++		 */
++		j = xrc[i].ctr;
++
++		/* read CCCR (PMC) value */
++		__pfm_read_reg_p4(xrc+i, &cccr);
++
++		/* read counter (PMD) controlled by PMC */
++		__pfm_read_reg_p4(xrd+j, &ctr1);
++
++		/* clear CCCR value: stop counter but destroy OVF */
++		__pfm_write_reg_p4(xrc+i, 0);
++
++		/* read counter controlled by CCCR again */
++		__pfm_read_reg_p4(xrd+j, &ctr2);
++
++		/*
++		 * there is an overflow if either:
++		 * 	- CCCR.ovf is set (and we just cleared it)
++		 * 	- ctr2 < ctr1
++		 * in that case we set the bit corresponding to the
++		 * overflowed PMD in povfl_pmds.
++		 *
++		 * NOTE(review): this function only ever sets bits in
++		 * p4->povfl_pmds; confirm the bitmap is cleared elsewhere
++		 * once the state has been consumed.
++		 */
++		if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) {
++			__set_bit(j, cast_ulp(p4->povfl_pmds));
++			p4->npend_ovfls++;
++		}
++		p4->saved_cccrs[i] = cccr;
++		n--;
++	}
++	/*
++	 * if there was no overflow, then it means the NMI was not really
++	 * for us, so we have to resume monitoring.
++	 *
++	 * The scan above has counted n down to zero, so it cannot be used
++	 * to drive this loop: walk every enable-capable register instead
++	 * and write back the CCCR value saved before it was cleared.
++	 */
++	if (unlikely(!p4->npend_ovfls)) {
++		for (i = 0; i < max_enable; i++) {
++			if (!test_bit(i, cast_ulp(ena_mask)))
++				continue;
++			__pfm_write_reg_p4(xrc+i, p4->saved_cccrs[i]);
++		}
++		return 0;
++	}
++	/* at least one overflow is pending: report it to the caller */
++	return 1;
++}
++
++/*
++ * restore the PMC registers of an event set
++ * @ctx: context to work on
++ * @set: event set whose pmcs[] values are written to the hardware
++ *
++ * Reloads the DS area pointer first (when the context uses it), then
++ * writes every PMC marked in ctx->regs.pmcs.
++ */
++void pfm_p4_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set)
++{
++	struct pfm_arch_pmu_info *pmu_info;
++	struct pfm_arch_context *ctx_arch;
++	u64 *mask;
++	u16 i, num;
++
++	ctx_arch = pfm_ctx_arch(ctx);
++	/* assigned but not otherwise used in this function */
++	pmu_info = pfm_pmu_info();
++
++	/*
++	 * must restore DS pointer before restoring PMCs
++	 * as this can potentially reactivate monitoring
++	 */
++	if (ctx_arch->flags.use_ds)
++		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area);
++
++	/*
++	 * must restore everything because there are some dependencies
++	 * (e.g., ESCR and CCCR)
++	 */
++	num = ctx->regs.num_pmcs;
++	mask = ctx->regs.pmcs;
++	for (i = 0; num; i++) {
++		if (test_bit(i, cast_ulp(mask))) {
++			pfm_arch_write_pmc(ctx, i, set->pmcs[i]);
++			num--;
++		}
++	}
++}
++
++/*
++ * invoked only when NMI is used. Called from the LOCAL_PERFMON_VECTOR
++ * handler to copy P4 overflow state captured when the NMI triggered.
++ * Given that on P4, stopping monitoring destroys the overflow information
++ * we save it in pfm_p4_has_ovfls() where monitoring is also stopped.
++ *
++ * Here we propagate the overflow state to current active set. The
++ * freeze_pmu() call will not overwrite this state because npend_ovfls
++ * is non-zero.
++ */ ++static void pfm_p4_nmi_copy_state(struct pfm_context *ctx) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_event_set *set; ++ struct pfm_arch_p4_context *p4; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ p4 = ctx_arch->data; ++ set = ctx->active_set; ++ ++ if (p4->npend_ovfls) { ++ set->npend_ovfls = p4->npend_ovfls; ++ ++ bitmap_copy(cast_ulp(set->povfl_pmds), ++ cast_ulp(p4->povfl_pmds), ++ ctx->regs.max_pmd); ++ ++ p4->npend_ovfls = 0; ++ } ++} ++ ++/** ++ * pfm_p4_quiesce - stop monitoring without grabbing any lock ++ * ++ * called from NMI interrupt handler to immediately stop monitoring ++ * cannot grab any lock, including perfmon related locks ++ */ ++static void __kprobes pfm_p4_quiesce(void) ++{ ++ u16 i; ++ /* ++ * quiesce PMU by clearing available registers that have ++ * the start/stop capability ++ */ ++ for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) { ++ if (test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs)) ++ && test_bit(i, cast_ulp(enable_mask))) ++ __pfm_write_reg_p4(pmc_addrs+i, 0); ++ } ++} ++ ++ ++static struct pfm_pmu_config pfm_p4_pmu_conf = { ++ .pmu_name = "Intel P4", ++ .counter_width = 40, ++ .pmd_desc = pfm_p4_pmd_desc, ++ .pmc_desc = pfm_p4_pmc_desc, ++ .num_pmc_entries = PFM_P4_NUM_PMCS, ++ .num_pmd_entries = PFM_P4_NUM_PMDS, ++ .probe_pmu = pfm_p4_probe_pmu, ++ .version = "1.0", ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmu_info = &pfm_p4_pmu_info ++}; ++ ++static int __init pfm_p4_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_p4_pmu_conf); ++} ++ ++static void __exit pfm_p4_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_p4_pmu_conf); ++} ++ ++module_init(pfm_p4_pmu_init_module); ++module_exit(pfm_p4_pmu_cleanup_module); +diff --git a/arch/x86/perfmon/perfmon_p6.c b/arch/x86/perfmon/perfmon_p6.c +new file mode 100644 +index 0000000..47c0a46 +--- /dev/null ++++ b/arch/x86/perfmon/perfmon_p6.c +@@ -0,0 +1,310 @@ ++/* ++ * This file contains the P6 family processor PMU register 
description tables ++ * ++ * This module supports original P6 processors ++ * (Pentium II, Pentium Pro, Pentium III) and Pentium M. ++ * ++ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/kprobes.h> ++#include <linux/perfmon_kern.h> ++#include <linux/nmi.h> ++#include <asm/msr.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("P6 PMU description table"); ++MODULE_LICENSE("GPL"); ++ ++static int force_nmi; ++MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); ++module_param(force_nmi, bool, 0600); ++ ++/* ++ * - upper 32 bits are reserved ++ * - INT: APIC enable bit is reserved (forced to 1) ++ * - bit 21 is reserved ++ * - bit 22 is reserved on PEREVNTSEL1 ++ * ++ * RSVD: reserved bits are 1 ++ */ ++#define PFM_P6_PMC0_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21)) ++#define PFM_P6_PMC1_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (3ULL<<21)) ++ ++/* ++ * force Local APIC interrupt on overflow ++ * disable with NO_EMUL64 ++ */ ++#define PFM_P6_PMC_VAL (1ULL<<20) ++#define PFM_P6_NO64 (1ULL<<20) ++ ++ ++static void __kprobes pfm_p6_quiesce(void); ++static int pfm_p6_has_ovfls(struct pfm_context *ctx); ++static int 
pfm_p6_stop_save(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ ++static u64 enable_mask[PFM_MAX_PMCS]; ++static u16 max_enable; ++ ++/* ++ * PFM_X86_FL_NO_SHARING: because of the single enable bit on MSR_P6_EVNTSEL0 ++ * the PMU cannot be shared with NMI watchdog or Oprofile ++ */ ++struct pfm_arch_pmu_info pfm_p6_pmu_info = { ++ .stop_save = pfm_p6_stop_save, ++ .has_ovfls = pfm_p6_has_ovfls, ++ .quiesce = pfm_p6_quiesce, ++ .flags = PFM_X86_FL_NO_SHARING, ++}; ++ ++static struct pfm_regmap_desc pfm_p6_pmc_desc[] = { ++/* pmc0 */ PMC_D(PFM_REG_I64, "PERFEVTSEL0", PFM_P6_PMC_VAL, PFM_P6_PMC0_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL0), ++/* pmc1 */ PMC_D(PFM_REG_I64, "PERFEVTSEL1", PFM_P6_PMC_VAL, PFM_P6_PMC1_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL1) ++}; ++#define PFM_P6_NUM_PMCS ARRAY_SIZE(pfm_p6_pmc_desc) ++ ++#define PFM_P6_D(n) \ ++ { .type = PFM_REG_C, \ ++ .desc = "PERFCTR"#n, \ ++ .hw_addr = MSR_P6_PERFCTR0+n, \ ++ .rsvd_msk = 0, \ ++ .dep_pmcs[0] = 1ULL << n \ ++ } ++ ++static struct pfm_regmap_desc pfm_p6_pmd_desc[] = { ++/* pmd0 */ PFM_P6_D(0), ++/* pmd1 */ PFM_P6_D(1) ++}; ++#define PFM_P6_NUM_PMDS ARRAY_SIZE(pfm_p6_pmd_desc) ++ ++static int pfm_p6_probe_pmu(void) ++{ ++ int high, low; ++ ++ if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) { ++ PFM_INFO("not an Intel processor"); ++ return -1; ++ } ++ ++ /* ++ * check for P6 processor family ++ */ ++ if (current_cpu_data.x86 != 6) { ++ PFM_INFO("unsupported family=%d", current_cpu_data.x86); ++ return -1; ++ } ++ ++ switch (current_cpu_data.x86_model) { ++ case 1: /* Pentium Pro */ ++ case 3: ++ case 5: /* Pentium II Deschutes */ ++ case 7 ... 
11: ++ break; ++ case 13: ++ /* for Pentium M, we need to check if PMU exist */ ++ rdmsr(MSR_IA32_MISC_ENABLE, low, high); ++ if (low & (1U << 7)) ++ break; ++ default: ++ PFM_INFO("unsupported CPU model %d", ++ current_cpu_data.x86_model); ++ return -1; ++ ++ } ++ ++ if (!cpu_has_apic) { ++ PFM_INFO("no Local APIC, try rebooting with lapic"); ++ return -1; ++ } ++ __set_bit(0, cast_ulp(enable_mask)); ++ __set_bit(1, cast_ulp(enable_mask)); ++ max_enable = 1 + 1; ++ /* ++ * force NMI interrupt? ++ */ ++ if (force_nmi) ++ pfm_p6_pmu_info.flags |= PFM_X86_FL_USE_NMI; ++ ++ return 0; ++} ++ ++/** ++ * pfm_p6_has_ovfls - check for pending overflow condition ++ * @ctx: context to work on ++ * ++ * detect if counters have overflowed. ++ * return: ++ * 0 : no overflow ++ * 1 : at least one overflow ++ */ ++static int __kprobes pfm_p6_has_ovfls(struct pfm_context *ctx) ++{ ++ u64 *cnt_mask; ++ u64 wmask, val; ++ u16 i, num; ++ ++ cnt_mask = ctx->regs.cnt_pmds; ++ num = ctx->regs.num_counters; ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ ++ /* ++ * we can leverage the fact that we know the mapping ++ * to hardcode the MSR address and avoid accessing ++ * more cachelines ++ * ++ * We need to check cnt_mask because not all registers ++ * may be available. 
++ */ ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(cnt_mask))) { ++ rdmsrl(MSR_P6_PERFCTR0+i, val); ++ if (!(val & wmask)) ++ return 1; ++ num--; ++ } ++ } ++ return 0; ++} ++ ++/** ++ * pfm_p6_stop_save -- stop monitoring and save PMD values ++ * @ctx: context to work on ++ * @set: current event set ++ * ++ * return value: ++ * 0 - no need to save PMDs in caller ++ * 1 - need to save PMDs in caller ++ */ ++static int pfm_p6_stop_save(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ u64 used_mask[PFM_PMC_BV]; ++ u64 *cnt_pmds; ++ u64 val, wmask, ovfl_mask; ++ u32 i, count; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ wmask = 1ULL << pfm_pmu_conf->counter_width; ++ bitmap_and(cast_ulp(used_mask), ++ cast_ulp(set->used_pmcs), ++ cast_ulp(enable_mask), ++ max_enable); ++ ++ count = bitmap_weight(cast_ulp(used_mask), ctx->regs.max_pmc); ++ ++ /* ++ * stop monitoring ++ * Unfortunately, this is very expensive! ++ * wrmsrl() is serializing. ++ */ ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(used_mask))) { ++ wrmsrl(MSR_P6_EVNTSEL0+i, 0); ++ count--; ++ } ++ } ++ ++ /* ++ * if we already having a pending overflow condition, we simply ++ * return to take care of this first. ++ */ ++ if (set->npend_ovfls) ++ return 1; ++ ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ cnt_pmds = ctx->regs.cnt_pmds; ++ ++ /* ++ * check for pending overflows and save PMDs (combo) ++ * we employ used_pmds because we also need to save ++ * and not just check for pending interrupts. 
++ * ++ * Must check for counting PMDs because of virtual PMDs ++ */ ++ count = set->nused_pmds; ++ for (i = 0; count; i++) { ++ if (test_bit(i, cast_ulp(set->used_pmds))) { ++ val = pfm_arch_read_pmd(ctx, i); ++ if (likely(test_bit(i, cast_ulp(cnt_pmds)))) { ++ if (!(val & wmask)) { ++ __set_bit(i, cast_ulp(set->povfl_pmds)); ++ set->npend_ovfls++; ++ } ++ val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask); ++ } ++ set->pmds[i].value = val; ++ count--; ++ } ++ } ++ /* 0 means: no need to save PMDs at upper level */ ++ return 0; ++} ++ ++/** ++ * pfm_p6_quiesce_pmu -- stop monitoring without grabbing any lock ++ * ++ * called from NMI interrupt handler to immediately stop monitoring ++ * cannot grab any lock, including perfmon related locks ++ */ ++static void __kprobes pfm_p6_quiesce(void) ++{ ++ /* ++ * quiesce PMU by clearing available registers that have ++ * the start/stop capability ++ * ++ * P6 processors only have enable bit on PERFEVTSEL0 ++ */ ++ if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) ++ wrmsrl(MSR_P6_EVNTSEL0, 0); ++} ++ ++/* ++ * Counters have 40 bits implemented. However they are designed such ++ * that bits [32-39] are sign extensions of bit 31. As such the ++ * effective width of a counter for P6-like PMU is 31 bits only. 
++ * ++ * See IA-32 Intel Architecture Software developer manual Vol 3B ++ */ ++static struct pfm_pmu_config pfm_p6_pmu_conf = { ++ .pmu_name = "Intel P6 processor Family", ++ .counter_width = 31, ++ .pmd_desc = pfm_p6_pmd_desc, ++ .pmc_desc = pfm_p6_pmc_desc, ++ .num_pmc_entries = PFM_P6_NUM_PMCS, ++ .num_pmd_entries = PFM_P6_NUM_PMDS, ++ .probe_pmu = pfm_p6_probe_pmu, ++ .version = "1.0", ++ .flags = PFM_PMU_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++ .pmu_info = &pfm_p6_pmu_info ++}; ++ ++static int __init pfm_p6_pmu_init_module(void) ++{ ++ return pfm_pmu_register(&pfm_p6_pmu_conf); ++} ++ ++static void __exit pfm_p6_pmu_cleanup_module(void) ++{ ++ pfm_pmu_unregister(&pfm_p6_pmu_conf); ++} ++ ++module_init(pfm_p6_pmu_init_module); ++module_exit(pfm_p6_pmu_cleanup_module); +diff --git a/arch/x86/perfmon/perfmon_pebs_core_smpl.c b/arch/x86/perfmon/perfmon_pebs_core_smpl.c +new file mode 100644 +index 0000000..eeb9174 +--- /dev/null ++++ b/arch/x86/perfmon/perfmon_pebs_core_smpl.c +@@ -0,0 +1,256 @@ ++/* ++ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file implements the Precise Event Based Sampling (PEBS) ++ * sampling format for Intel Core and Atom processors. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/types.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/smp.h> ++#include <linux/perfmon_kern.h> ++ ++#include <asm/msr.h> ++#include <asm/perfmon_pebs_core_smpl.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("Intel Core Precise Event-Based Sampling (PEBS)"); ++MODULE_LICENSE("GPL"); ++ ++#define ALIGN_PEBS(a, order) \ ++ ((a)+(1UL<<(order))-1) & ~((1UL<<(order))-1) ++ ++#define PEBS_PADDING_ORDER 8 /* log2(256) padding for PEBS alignment constraint */ ++ ++static int pfm_pebs_core_fmt_validate(u32 flags, u16 npmds, void *data) ++{ ++ struct pfm_pebs_core_smpl_arg *arg = data; ++ size_t min_buf_size; ++ ++ /* ++ * need to define at least the size of the buffer ++ */ ++ if (data == NULL) { ++ PFM_DBG("no argument passed"); ++ return -EINVAL; ++ } ++ ++ /* ++ * compute min buf size. npmds is the maximum number ++ * of implemented PMD registers. 
++ */ ++ min_buf_size = sizeof(struct pfm_pebs_core_smpl_hdr) ++ + sizeof(struct pfm_pebs_core_smpl_entry) ++ + (1UL<<PEBS_PADDING_ORDER); /* padding for alignment */ ++ ++ PFM_DBG("validate flags=0x%x min_buf_size=%zu buf_size=%zu", ++ flags, ++ min_buf_size, ++ arg->buf_size); ++ ++ /* ++ * must hold at least the buffer header + one minimally sized entry ++ */ ++ if (arg->buf_size < min_buf_size) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int pfm_pebs_core_fmt_get_size(unsigned int flags, void *data, size_t *size) ++{ ++ struct pfm_pebs_core_smpl_arg *arg = data; ++ ++ /* ++ * size has been validated in pfm_pebs_core_fmt_validate() ++ */ ++ *size = arg->buf_size + (1UL<<PEBS_PADDING_ORDER); ++ ++ return 0; ++} ++ ++static int pfm_pebs_core_fmt_init(struct pfm_context *ctx, void *buf, ++ u32 flags, u16 npmds, void *data) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_pebs_core_smpl_hdr *hdr; ++ struct pfm_pebs_core_smpl_arg *arg = data; ++ u64 pebs_start, pebs_end; ++ struct pfm_ds_area_core *ds; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ hdr = buf; ++ ds = &hdr->ds; ++ ++ /* ++ * align PEBS buffer base ++ */ ++ pebs_start = ALIGN_PEBS((unsigned long)(hdr+1), PEBS_PADDING_ORDER); ++ pebs_end = pebs_start + arg->buf_size + 1; ++ ++ hdr->version = PFM_PEBS_CORE_SMPL_VERSION; ++ hdr->buf_size = arg->buf_size; ++ hdr->overflows = 0; ++ ++ /* ++ * express PEBS buffer base as offset from the end of the header ++ */ ++ hdr->start_offs = pebs_start - (unsigned long)(hdr+1); ++ ++ /* ++ * PEBS buffer boundaries ++ */ ++ ds->pebs_buf_base = pebs_start; ++ ds->pebs_abs_max = pebs_end; ++ ++ /* ++ * PEBS starting position ++ */ ++ ds->pebs_index = pebs_start; ++ ++ /* ++ * PEBS interrupt threshold ++ */ ++ ds->pebs_intr_thres = pebs_start ++ + arg->intr_thres ++ * sizeof(struct pfm_pebs_core_smpl_entry); ++ ++ /* ++ * save counter reset value for PEBS counter ++ */ ++ ds->pebs_cnt_reset = arg->cnt_reset; ++ ++ /* ++ * keep track of DS AREA ++ */ ++ 
ctx_arch->ds_area = ds; ++ ctx_arch->flags.use_ds = 1; ++ ctx_arch->flags.use_pebs = 1; ++ ++ PFM_DBG("buffer=%p buf_size=%llu offs=%llu pebs_start=0x%llx " ++ "pebs_end=0x%llx ds=%p pebs_thres=0x%llx cnt_reset=0x%llx", ++ buf, ++ (unsigned long long)hdr->buf_size, ++ (unsigned long long)hdr->start_offs, ++ (unsigned long long)pebs_start, ++ (unsigned long long)pebs_end, ++ ds, ++ (unsigned long long)ds->pebs_intr_thres, ++ (unsigned long long)ds->pebs_cnt_reset); ++ ++ return 0; ++} ++ ++static int pfm_pebs_core_fmt_handler(struct pfm_context *ctx, ++ unsigned long ip, u64 tstamp, void *data) ++{ ++ struct pfm_pebs_core_smpl_hdr *hdr; ++ struct pfm_ovfl_arg *arg; ++ ++ hdr = ctx->smpl_addr; ++ arg = &ctx->ovfl_arg; ++ ++ PFM_DBG_ovfl("buffer full"); ++ /* ++ * increment number of buffer overflows. ++ * important to detect duplicate set of samples. ++ */ ++ hdr->overflows++; ++ ++ /* ++ * request notification and masking of monitoring. ++ * Notification is still subject to the overflowed ++ * register having the FL_NOTIFY flag set. 
++ */ ++ arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK; ++ ++ return -ENOBUFS; /* we are full, sorry */ ++} ++ ++static int pfm_pebs_core_fmt_restart(int is_active, u32 *ovfl_ctrl, ++ void *buf) ++{ ++ struct pfm_pebs_core_smpl_hdr *hdr = buf; ++ ++ /* ++ * reset index to base of buffer ++ */ ++ hdr->ds.pebs_index = hdr->ds.pebs_buf_base; ++ ++ *ovfl_ctrl = PFM_OVFL_CTRL_RESET; ++ ++ return 0; ++} ++ ++static int pfm_pebs_core_fmt_exit(void *buf) ++{ ++ return 0; ++} ++ ++static struct pfm_smpl_fmt pebs_core_fmt = { ++ .fmt_name = PFM_PEBS_CORE_SMPL_NAME, ++ .fmt_version = 0x1, ++ .fmt_arg_size = sizeof(struct pfm_pebs_core_smpl_arg), ++ .fmt_validate = pfm_pebs_core_fmt_validate, ++ .fmt_getsize = pfm_pebs_core_fmt_get_size, ++ .fmt_init = pfm_pebs_core_fmt_init, ++ .fmt_handler = pfm_pebs_core_fmt_handler, ++ .fmt_restart = pfm_pebs_core_fmt_restart, ++ .fmt_exit = pfm_pebs_core_fmt_exit, ++ .fmt_flags = PFM_FMT_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init pfm_pebs_core_fmt_init_module(void) ++{ ++ if (!cpu_has_pebs) { ++ PFM_INFO("processor does not have PEBS support"); ++ return -1; ++ } ++ /* ++ * cpu_has_pebs is not enough to identify Intel Core PEBS ++ * which is different fro Pentium 4 PEBS. 
Therefore we do ++ * a more detailed check here ++ */ ++ if (current_cpu_data.x86 != 6) { ++ PFM_INFO("not a supported Intel processor"); ++ return -1; ++ } ++ ++ switch (current_cpu_data.x86_model) { ++ case 15: /* Merom */ ++ case 23: /* Penryn */ ++ case 28: /* Atom (Silverthorne) */ ++ case 29: /* Dunnington */ ++ break; ++ default: ++ PFM_INFO("not a supported Intel processor"); ++ return -1; ++ } ++ return pfm_fmt_register(&pebs_core_fmt); ++} ++ ++static void __exit pfm_pebs_core_fmt_cleanup_module(void) ++{ ++ pfm_fmt_unregister(&pebs_core_fmt); ++} ++ ++module_init(pfm_pebs_core_fmt_init_module); ++module_exit(pfm_pebs_core_fmt_cleanup_module); +diff --git a/arch/x86/perfmon/perfmon_pebs_p4_smpl.c b/arch/x86/perfmon/perfmon_pebs_p4_smpl.c +new file mode 100644 +index 0000000..f4e9fd2 +--- /dev/null ++++ b/arch/x86/perfmon/perfmon_pebs_p4_smpl.c +@@ -0,0 +1,253 @@ ++/* ++ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file implements the Precise Event Based Sampling (PEBS) ++ * sampling format. It supports the following processors: ++ * - 32-bit Pentium 4 or other Netburst-based processors ++ * - 64-bit Pentium 4 or other Netburst-based processors ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/types.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/smp.h> ++#include <linux/perfmon_kern.h> ++ ++#include <asm/msr.h> ++#include <asm/perfmon_pebs_p4_smpl.h> ++ ++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>"); ++MODULE_DESCRIPTION("Intel P4 Precise Event-Based Sampling (PEBS)"); ++MODULE_LICENSE("GPL"); ++ ++#define ALIGN_PEBS(a, order) \ ++ ((a)+(1UL<<(order))-1) & ~((1UL<<(order))-1) ++ ++#define PEBS_PADDING_ORDER 8 /* log2(256) padding for PEBS alignment constraint */ ++ ++static int pfm_pebs_p4_fmt_validate(u32 flags, u16 npmds, void *data) ++{ ++ struct pfm_pebs_p4_smpl_arg *arg = data; ++ size_t min_buf_size; ++ ++ /* ++ * need to define at least the size of the buffer ++ */ ++ if (data == NULL) { ++ PFM_DBG("no argument passed"); ++ return -EINVAL; ++ } ++ ++ /* ++ * compute min buf size. npmds is the maximum number ++ * of implemented PMD registers. 
++ */ ++ min_buf_size = sizeof(struct pfm_pebs_p4_smpl_hdr) ++ + sizeof(struct pfm_pebs_p4_smpl_entry) ++ + (1UL<<PEBS_PADDING_ORDER); /* padding for alignment */ ++ ++ PFM_DBG("validate flags=0x%x min_buf_size=%zu buf_size=%zu", ++ flags, ++ min_buf_size, ++ arg->buf_size); ++ ++ /* ++ * must hold at least the buffer header + one minimally sized entry ++ */ ++ if (arg->buf_size < min_buf_size) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++static int pfm_pebs_p4_fmt_get_size(unsigned int flags, void *data, size_t *size) ++{ ++ struct pfm_pebs_p4_smpl_arg *arg = data; ++ ++ /* ++ * size has been validated in pfm_pebs_p4_fmt_validate() ++ */ ++ *size = arg->buf_size + (1UL<<PEBS_PADDING_ORDER); ++ ++ return 0; ++} ++ ++static int pfm_pebs_p4_fmt_init(struct pfm_context *ctx, void *buf, ++ u32 flags, u16 npmds, void *data) ++{ ++ struct pfm_arch_context *ctx_arch; ++ struct pfm_pebs_p4_smpl_hdr *hdr; ++ struct pfm_pebs_p4_smpl_arg *arg = data; ++ unsigned long pebs_start, pebs_end; ++ struct pfm_ds_area_p4 *ds; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ ++ hdr = buf; ++ ds = &hdr->ds; ++ ++ /* ++ * align PEBS buffer base ++ */ ++ pebs_start = ALIGN_PEBS((unsigned long)(hdr+1), PEBS_PADDING_ORDER); ++ pebs_end = pebs_start + arg->buf_size + 1; ++ ++ hdr->version = PFM_PEBS_P4_SMPL_VERSION; ++ hdr->buf_size = arg->buf_size; ++ hdr->overflows = 0; ++ ++ /* ++ * express PEBS buffer base as offset from the end of the header ++ */ ++ hdr->start_offs = pebs_start - (unsigned long)(hdr+1); ++ ++ /* ++ * PEBS buffer boundaries ++ */ ++ ds->pebs_buf_base = pebs_start; ++ ds->pebs_abs_max = pebs_end; ++ ++ /* ++ * PEBS starting position ++ */ ++ ds->pebs_index = pebs_start; ++ ++ /* ++ * PEBS interrupt threshold ++ */ ++ ds->pebs_intr_thres = pebs_start ++ + arg->intr_thres * sizeof(struct pfm_pebs_p4_smpl_entry); ++ ++ /* ++ * save counter reset value for PEBS counter ++ */ ++ ds->pebs_cnt_reset = arg->cnt_reset; ++ ++ /* ++ * keep track of DS AREA ++ */ ++ ctx_arch->ds_area = ds; ++ 
ctx_arch->flags.use_pebs = 1; ++ ctx_arch->flags.use_ds = 1; ++ ++ PFM_DBG("buffer=%p buf_size=%llu offs=%llu pebs_start=0x%lx " ++ "pebs_end=0x%lx ds=%p pebs_thres=0x%lx cnt_reset=0x%llx", ++ buf, ++ (unsigned long long)hdr->buf_size, ++ (unsigned long long)hdr->start_offs, ++ pebs_start, ++ pebs_end, ++ ds, ++ ds->pebs_intr_thres, ++ (unsigned long long)ds->pebs_cnt_reset); ++ ++ return 0; ++} ++ ++static int pfm_pebs_p4_fmt_handler(struct pfm_context *ctx, ++ unsigned long ip, u64 tstamp, void *data) ++{ ++ struct pfm_pebs_p4_smpl_hdr *hdr; ++ struct pfm_ovfl_arg *arg; ++ ++ hdr = ctx->smpl_addr; ++ arg = &ctx->ovfl_arg; ++ ++ PFM_DBG_ovfl("buffer full"); ++ /* ++ * increment number of buffer overflows. ++ * important to detect duplicate set of samples. ++ */ ++ hdr->overflows++; ++ ++ /* ++ * request notification and masking of monitoring. ++ * Notification is still subject to the overflowed ++ * register having the FL_NOTIFY flag set. ++ */ ++ arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK; ++ ++ return -ENOBUFS; /* we are full, sorry */ ++} ++ ++static int pfm_pebs_p4_fmt_restart(int is_active, u32 *ovfl_ctrl, ++ void *buf) ++{ ++ struct pfm_pebs_p4_smpl_hdr *hdr = buf; ++ ++ /* ++ * reset index to base of buffer ++ */ ++ hdr->ds.pebs_index = hdr->ds.pebs_buf_base; ++ ++ *ovfl_ctrl = PFM_OVFL_CTRL_RESET; ++ ++ return 0; ++} ++ ++static int pfm_pebs_p4_fmt_exit(void *buf) ++{ ++ return 0; ++} ++ ++static struct pfm_smpl_fmt pebs_p4_fmt = { ++ .fmt_name = PFM_PEBS_P4_SMPL_NAME, ++ .fmt_version = 0x1, ++ .fmt_arg_size = sizeof(struct pfm_pebs_p4_smpl_arg), ++ .fmt_validate = pfm_pebs_p4_fmt_validate, ++ .fmt_getsize = pfm_pebs_p4_fmt_get_size, ++ .fmt_init = pfm_pebs_p4_fmt_init, ++ .fmt_handler = pfm_pebs_p4_fmt_handler, ++ .fmt_restart = pfm_pebs_p4_fmt_restart, ++ .fmt_exit = pfm_pebs_p4_fmt_exit, ++ .fmt_flags = PFM_FMT_BUILTIN_FLAG, ++ .owner = THIS_MODULE, ++}; ++ ++static int __init pfm_pebs_p4_fmt_init_module(void) ++{ ++ int ht_enabled; ++ 
++ if (!cpu_has_pebs) { ++ PFM_INFO("processor does not have PEBS support"); ++ return -1; ++ } ++ if (current_cpu_data.x86 != 15) { ++ PFM_INFO("not an Intel Pentium 4"); ++ return -1; ++ } ++#ifdef CONFIG_SMP ++ ht_enabled = (cpus_weight(__get_cpu_var(cpu_core_map)) ++ / current_cpu_data.x86_max_cores) > 1; ++#else ++ ht_enabled = 0; ++#endif ++ if (ht_enabled) { ++ PFM_INFO("PEBS not available because HyperThreading is on"); ++ return -1; ++ } ++ return pfm_fmt_register(&pebs_p4_fmt); ++} ++ ++static void __exit pfm_pebs_p4_fmt_cleanup_module(void) ++{ ++ pfm_fmt_unregister(&pebs_p4_fmt); ++} ++ ++module_init(pfm_pebs_p4_fmt_init_module); ++module_exit(pfm_pebs_p4_fmt_cleanup_module); +diff --git a/include/asm-mips/Kbuild b/include/asm-mips/Kbuild +index 7897f05..7ed16fc 100644 +--- a/include/asm-mips/Kbuild ++++ b/include/asm-mips/Kbuild +@@ -1,3 +1,4 @@ + include include/asm-generic/Kbuild.asm + + header-y += cachectl.h sgidefs.h sysmips.h ++header-y += perfmon.h +diff --git a/include/asm-mips/perfmon.h b/include/asm-mips/perfmon.h +new file mode 100644 +index 0000000..7915c17 +--- /dev/null ++++ b/include/asm-mips/perfmon.h +@@ -0,0 +1,34 @@ ++/* ++ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file contains mips64 specific definitions for the perfmon ++ * interface. ++ * ++ * This file MUST never be included directly. Use linux/perfmon.h. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef _ASM_MIPS64_PERFMON_H_ ++#define _ASM_MIPS64_PERFMON_H_ ++ ++/* ++ * arch-specific user visible interface definitions ++ */ ++ ++#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */ ++#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */ ++ ++#endif /* _ASM_MIPS64_PERFMON_H_ */ +diff --git a/include/asm-mips/perfmon_kern.h b/include/asm-mips/perfmon_kern.h +new file mode 100644 +index 0000000..7d213df +--- /dev/null ++++ b/include/asm-mips/perfmon_kern.h +@@ -0,0 +1,412 @@ ++/* ++ * Copyright (c) 2005 Philip Mucci. ++ * ++ * Based on other versions: ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file contains mips64 specific definitions for the perfmon ++ * interface. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef _ASM_MIPS64_PERFMON_KERN_H_ ++#define _ASM_MIPS64_PERFMON_KERN_H_ ++ ++#ifdef __KERNEL__ ++ ++#ifdef CONFIG_PERFMON ++#include <linux/unistd.h> ++#include <asm/cacheflush.h> ++ ++#define PFM_ARCH_PMD_STK_ARG 2 ++#define PFM_ARCH_PMC_STK_ARG 2 ++ ++struct pfm_arch_pmu_info { ++ u32 pmu_style; ++}; ++ ++#define MIPS64_CONFIG_PMC_MASK (1 << 4) ++#define MIPS64_PMC_INT_ENABLE_MASK (1 << 4) ++#define MIPS64_PMC_CNT_ENABLE_MASK (0xf) ++#define MIPS64_PMC_EVT_MASK (0x7 << 6) ++#define MIPS64_PMC_CTR_MASK (1 << 31) ++#define MIPS64_PMD_INTERRUPT (1 << 31) ++ ++/* Coprocessor register 25 contains the PMU interface. */ ++/* Sel 0 is control for counter 0 */ ++/* Sel 1 is count for counter 0. */ ++/* Sel 2 is control for counter 1. */ ++/* Sel 3 is count for counter 1. */ ++ ++/* ++ ++31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ++M 0--------------------------------------------------------------0 Event-- IE U S K EXL ++ ++M 31 If this bit is one, another pair of Performance Control ++and Counter registers is implemented at a MTC0 ++ ++Event 8:5 Counter event enabled for this counter. Possible events ++are listed in Table 6-30. R/W Undefined ++ ++IE 4 Counter Interrupt Enable. This bit masks bit 31 of the ++associated count register from the interrupt exception ++request output. R/W 0 ++ ++U 3 Count in User Mode. When this bit is set, the specified ++event is counted in User Mode. R/W Undefined ++ ++S 2 Count in Supervisor Mode. When this bit is set, the ++specified event is counted in Supervisor Mode. R/W Undefined ++ ++K 1 Count in Kernel Mode. When this bit is set, count the ++event in Kernel Mode when EXL and ERL both are 0. R/W Undefined ++ ++EXL 0 Count when EXL. 
When this bit is set, count the event ++when EXL = 1 and ERL = 0. R/W Undefined ++*/ ++ ++static inline void pfm_arch_resend_irq(struct pfm_context *ctx) ++{} ++ ++static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{} ++ ++static inline void pfm_arch_serialize(void) ++{} ++ ++ ++/* ++ * MIPS does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus ++ * this routine needs to do it when switching sets on overflow ++ */ ++static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_save_pmds(ctx, set); ++} ++ ++static inline void pfm_arch_write_pmc(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ /* ++ * we only write to the actual register when monitoring is ++ * active (pfm_start was issued) ++ */ ++ if (ctx && (ctx->flags.started == 0)) ++ return; ++ ++ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { ++ case 0: ++ write_c0_perfctrl0(value); ++ break; ++ case 1: ++ write_c0_perfctrl1(value); ++ break; ++ case 2: ++ write_c0_perfctrl2(value); ++ break; ++ case 3: ++ write_c0_perfctrl3(value); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline void pfm_arch_write_pmd(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ value &= pfm_pmu_conf->ovfl_mask; ++ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case 0: ++ write_c0_perfcntr0(value); ++ break; ++ case 1: ++ write_c0_perfcntr1(value); ++ break; ++ case 2: ++ write_c0_perfcntr2(value); ++ break; ++ case 3: ++ write_c0_perfcntr3(value); ++ break; ++ default: ++ BUG(); ++ } ++} ++ ++static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) ++{ ++ switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { ++ case 0: ++ return read_c0_perfcntr0(); ++ break; ++ case 1: ++ return read_c0_perfcntr1(); ++ break; ++ case 2: ++ return read_c0_perfcntr2(); ++ break; ++ case 3: ++ return read_c0_perfcntr3(); ++ break; ++ default: ++ BUG(); ++ return 0; ++ } ++} 
++ ++static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) ++{ ++ switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { ++ case 0: ++ return read_c0_perfctrl0(); ++ break; ++ case 1: ++ return read_c0_perfctrl1(); ++ break; ++ case 2: ++ return read_c0_perfctrl2(); ++ break; ++ case 3: ++ return read_c0_perfctrl3(); ++ break; ++ default: ++ BUG(); ++ return 0; ++ } ++} ++ ++/* ++ * For some CPUs, the upper bits of a counter must be set in order for the ++ * overflow interrupt to happen. On overflow, the counter has wrapped around, ++ * and the upper bits are cleared. This function may be used to set them back. ++ */ ++static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, ++ unsigned int cnum) ++{ ++ u64 val; ++ val = pfm_arch_read_pmd(ctx, cnum); ++ /* pfm_arch_write_pmd() applies ovfl_mask; this masks out overflow bit 31 */ ++ pfm_arch_write_pmd(ctx, cnum, val); ++} ++ ++/* ++ * At certain points, perfmon needs to know if monitoring has been ++ * explicitly started/stopped by user via pfm_start/pfm_stop. The ++ * information is tracked in ctx->flags.started. However on certain ++ * architectures, it may be possible to start/stop directly from ++ * user level with a single assembly instruction bypassing ++ * the kernel. This function must be used to determine by ++ * arch-specific means whether monitoring is actually started/stopped.
++ */ ++static inline int pfm_arch_is_active(struct pfm_context *ctx) ++{ ++ return ctx->flags.started; ++} ++ ++static inline void pfm_arch_ctxswout_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{} ++ ++static inline void pfm_arch_ctxswin_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{} ++ ++static inline void pfm_arch_ctxswin_thread(struct task_struct *task, ++ struct pfm_context *ctx) ++{} ++int pfm_arch_ctxswout_thread(struct task_struct *task, ++ struct pfm_context *ctx); ++ ++int pfm_arch_is_monitoring_active(struct pfm_context *ctx); ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); ++char *pfm_arch_get_pmu_module_name(void); ++ ++static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_arch_stop(current, ctx); ++ /* ++ * we mark monitoring as stopped to avoid ++ * certain side effects especially in ++ * pfm_switch_sets_from_intr() on ++ * pfm_arch_restore_pmcs() ++ */ ++ ctx->flags.started = 0; ++} ++ ++/* ++ * unfreeze PMU from pfm_do_interrupt_handler() ++ * ctx may be NULL for spurious interrupts ++ */ ++static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) ++{ ++ if (!ctx) ++ return; ++ ++ PFM_DBG_ovfl("state=%d", ctx->state); ++ ++ ctx->flags.started = 1; ++ ++ if (ctx->state == PFM_CTX_MASKED) ++ return; ++ ++ pfm_arch_restore_pmcs(ctx, ctx->active_set); ++} ++ ++/* ++ * this function is called from the PMU interrupt handler ONLY. ++ * On MIPS, the PMU is frozen via arch_stop, masking would be implemented ++ * via arch-stop as well. Given that the PMU is already stopped when ++ * entering the interrupt handler, we do not need to stop it again, so ++ * this function is a nop.
++ */ ++static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{} ++ ++/* ++ * on MIPS masking/unmasking uses the start/stop mechanism, so we simply ++ * need to start here. ++ */ ++static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_arch_start(current, ctx); ++} ++ ++static inline int pfm_arch_context_create(struct pfm_context *ctx, ++ u32 ctx_flags) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_context_free(struct pfm_context *ctx) ++{} ++ ++ ++ ++ ++ ++/* ++ * function called from pfm_setfl_sane(). Context is locked ++ * and interrupts are masked. ++ * The value of flags is the value of ctx_flags as passed by ++ * user. ++ * ++ * function must check arch-specific set flags. ++ * Return: ++ * 0 when flags are valid ++ * 1 on error ++ */ ++static inline int ++pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) ++{ ++ return 0; ++} ++ ++static inline int pfm_arch_init(void) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_init_percpu(void) ++{} ++ ++static inline int pfm_arch_load_context(struct pfm_context *ctx) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_unload_context(struct pfm_context *ctx) ++{} ++ ++static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) ++{ ++ return 0; ++} ++ ++static inline void pfm_arch_pmu_release(void) ++{} ++ ++#ifdef CONFIG_PERFMON_FLUSH ++/* ++ * due to cache aliasing problem on MIPS, it is necessary to flush ++ * pages out of the cache when they are modified.
++ */ ++static inline void pfm_cacheflush(void *addr, unsigned int len) ++{ ++ unsigned long start, end; ++ ++ start = (unsigned long)addr & PAGE_MASK; ++ end = ((unsigned long)addr + len + PAGE_SIZE - 1) & PAGE_MASK; ++ ++ while (start < end) { ++ flush_data_cache_page(start); ++ start += PAGE_SIZE; ++ } ++} ++#else ++static inline void pfm_cacheflush(void *addr, unsigned int len) ++{} ++#endif ++ ++static inline void pfm_arch_arm_handle_work(struct task_struct *task) ++{} ++ ++static inline void pfm_arch_disarm_handle_work(struct task_struct *task) ++{} ++ ++static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) ++{ ++ return 0; ++} ++ ++static inline int pfm_arch_get_base_syscall(void) ++{ ++ if (test_thread_flag(TIF_32BIT_ADDR)) { ++ if (test_thread_flag(TIF_32BIT_REGS)) ++ return __NR_O32_Linux+330; ++ return __NR_N32_Linux+293; ++ } ++ return __NR_64_Linux+289; ++} ++ ++struct pfm_arch_context { ++ /* empty */ ++}; ++ ++#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) ++/* ++ * MIPS may need extra alignment requirements for the sampling buffer ++ */ ++#ifdef CONFIG_PERFMON_SMPL_ALIGN ++#define PFM_ARCH_SMPL_ALIGN_SIZE 0x4000 ++#else ++#define PFM_ARCH_SMPL_ALIGN_SIZE 0 ++#endif ++ ++#endif /* CONFIG_PERFMON */ ++ ++#endif /* __KERNEL__ */ ++#endif /* _ASM_MIPS64_PERFMON_KERN_H_ */ +diff --git a/include/asm-mips/system.h b/include/asm-mips/system.h +index a944eda..470cdfc 100644 +--- a/include/asm-mips/system.h ++++ b/include/asm-mips/system.h +@@ -67,6 +67,10 @@ do { \ + __mips_mt_fpaff_switch_to(prev); \ + if (cpu_has_dsp) \ + __save_dsp(prev); \ ++ if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \ ++ pfm_ctxsw_out(prev, next); \ ++ if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ ++ pfm_ctxsw_in(prev, next); \ + (last) = resume(prev, next, task_thread_info(next)); \ + } while (0) + +diff --git a/include/asm-mips/thread_info.h b/include/asm-mips/thread_info.h +index bb30606..34fd6aa 100644 +--- a/include/asm-mips/thread_info.h 
++++ b/include/asm-mips/thread_info.h +@@ -114,6 +114,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); + #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ + #define TIF_SYSCALL_AUDIT 3 /* syscall auditing active */ + #define TIF_SECCOMP 4 /* secure computing */ ++#define TIF_PERFMON_WORK 5 /* work for pfm_handle_work() */ + #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ + #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ + #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ +@@ -124,6 +125,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); + #define TIF_32BIT_REGS 22 /* also implies 16/32 fprs */ + #define TIF_32BIT_ADDR 23 /* 32-bit address space (o32/n32) */ + #define TIF_FPUBOUND 24 /* thread bound to FPU-full CPU set */ ++#define TIF_PERFMON_CTXSW 25 /* perfmon needs ctxsw calls */ + #define TIF_SYSCALL_TRACE 31 /* syscall trace active */ + + #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) +@@ -140,6 +142,8 @@ register struct thread_info *__current_thread_info __asm__("$28"); + #define _TIF_32BIT_REGS (1<<TIF_32BIT_REGS) + #define _TIF_32BIT_ADDR (1<<TIF_32BIT_ADDR) + #define _TIF_FPUBOUND (1<<TIF_FPUBOUND) ++#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK) ++#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW) + + /* work to do on interrupt/exception return */ + #define _TIF_WORK_MASK (0x0000ffef & ~_TIF_SECCOMP) +diff --git a/include/asm-mips/unistd.h b/include/asm-mips/unistd.h +index a73e153..200f654 100644 +--- a/include/asm-mips/unistd.h ++++ b/include/asm-mips/unistd.h +@@ -350,11 +350,23 @@ + #define __NR_dup3 (__NR_Linux + 327) + #define __NR_pipe2 (__NR_Linux + 328) + #define __NR_inotify_init1 (__NR_Linux + 329) ++#define __NR_pfm_create_context (__NR_Linux + 330) ++#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) ++#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) ++#define __NR_pfm_read_pmds 
(__NR_pfm_create_context+3) ++#define __NR_pfm_load_context (__NR_pfm_create_context+4) ++#define __NR_pfm_start (__NR_pfm_create_context+5) ++#define __NR_pfm_stop (__NR_pfm_create_context+6) ++#define __NR_pfm_restart (__NR_pfm_create_context+7) ++#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) ++#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) ++#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) ++#define __NR_pfm_unload_context (__NR_pfm_create_context+11) + + /* + * Offset of the last Linux o32 flavoured syscall + */ +-#define __NR_Linux_syscalls 329 ++#define __NR_Linux_syscalls 341 + + #endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */ + +@@ -656,16 +668,28 @@ + #define __NR_dup3 (__NR_Linux + 286) + #define __NR_pipe2 (__NR_Linux + 287) + #define __NR_inotify_init1 (__NR_Linux + 288) ++#define __NR_pfm_create_context (__NR_Linux + 289) ++#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) ++#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) ++#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) ++#define __NR_pfm_load_context (__NR_pfm_create_context+4) ++#define __NR_pfm_start (__NR_pfm_create_context+5) ++#define __NR_pfm_stop (__NR_pfm_create_context+6) ++#define __NR_pfm_restart (__NR_pfm_create_context+7) ++#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) ++#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) ++#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) ++#define __NR_pfm_unload_context (__NR_pfm_create_context+11) + + /* + * Offset of the last Linux 64-bit flavoured syscall + */ +-#define __NR_Linux_syscalls 288 ++#define __NR_Linux_syscalls 300 + + #endif /* _MIPS_SIM == _MIPS_SIM_ABI64 */ + + #define __NR_64_Linux 5000 +-#define __NR_64_Linux_syscalls 288 ++#define __NR_64_Linux_syscalls 300 + + #if _MIPS_SIM == _MIPS_SIM_NABI32 + +@@ -966,16 +990,28 @@ + #define __NR_dup3 (__NR_Linux + 290) + #define __NR_pipe2 (__NR_Linux + 291) + #define __NR_inotify_init1 (__NR_Linux + 292) 
++#define __NR_pfm_create_context (__NR_Linux + 293) ++#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) ++#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) ++#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) ++#define __NR_pfm_load_context (__NR_pfm_create_context+4) ++#define __NR_pfm_start (__NR_pfm_create_context+5) ++#define __NR_pfm_stop (__NR_pfm_create_context+6) ++#define __NR_pfm_restart (__NR_pfm_create_context+7) ++#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) ++#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) ++#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) ++#define __NR_pfm_unload_context (__NR_pfm_create_context+11) + + /* + * Offset of the last N32 flavoured syscall + */ +-#define __NR_Linux_syscalls 292 ++#define __NR_Linux_syscalls 304 + + #endif /* _MIPS_SIM == _MIPS_SIM_NABI32 */ + + #define __NR_N32_Linux 6000 +-#define __NR_N32_Linux_syscalls 292 ++#define __NR_N32_Linux_syscalls 304 + + #ifdef __KERNEL__ + +diff --git a/include/asm-x86/Kbuild b/include/asm-x86/Kbuild +index 4a8e80c..d7d819e 100644 +--- a/include/asm-x86/Kbuild ++++ b/include/asm-x86/Kbuild +@@ -9,6 +9,7 @@ header-y += prctl.h + header-y += ptrace-abi.h + header-y += sigcontext32.h + header-y += ucontext.h ++header-y += perfmon.h + header-y += processor-flags.h + + unifdef-y += e820.h +diff --git a/include/asm-x86/ia32_unistd.h b/include/asm-x86/ia32_unistd.h +index 61cea9e..275e015 100644 +--- a/include/asm-x86/ia32_unistd.h ++++ b/include/asm-x86/ia32_unistd.h +@@ -8,11 +8,12 @@ + * the number. This should be otherwise in sync with asm-x86/unistd_32.h. 
-AK + */ + +-#define __NR_ia32_restart_syscall 0 +-#define __NR_ia32_exit 1 +-#define __NR_ia32_read 3 +-#define __NR_ia32_write 4 +-#define __NR_ia32_sigreturn 119 +-#define __NR_ia32_rt_sigreturn 173 ++#define __NR_ia32_restart_syscall 0 ++#define __NR_ia32_exit 1 ++#define __NR_ia32_read 3 ++#define __NR_ia32_write 4 ++#define __NR_ia32_sigreturn 119 ++#define __NR_ia32_rt_sigreturn 173 ++#define __NR_ia32_pfm_create_context 333 + + #endif /* _ASM_X86_64_IA32_UNISTD_H_ */ +diff --git a/include/asm-x86/irq_vectors.h b/include/asm-x86/irq_vectors.h +index a48c7f2..892fe8f 100644 +--- a/include/asm-x86/irq_vectors.h ++++ b/include/asm-x86/irq_vectors.h +@@ -92,6 +92,11 @@ + #define LOCAL_TIMER_VECTOR 0xef + + /* ++ * Perfmon PMU interrupt vector ++ */ ++#define LOCAL_PERFMON_VECTOR 0xee ++ ++/* + * First APIC vector available to drivers: (vectors 0x30-0xee) we + * start at 0x31(0x41) to spread out vectors evenly between priority + * levels. (0x80 is the syscall vector) +diff --git a/include/asm-x86/mach-default/entry_arch.h b/include/asm-x86/mach-default/entry_arch.h +index 9283b60..ac31c2d 100644 +--- a/include/asm-x86/mach-default/entry_arch.h ++++ b/include/asm-x86/mach-default/entry_arch.h +@@ -32,4 +32,8 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) + BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) + #endif + ++#ifdef CONFIG_PERFMON ++BUILD_INTERRUPT(pmu_interrupt,LOCAL_PERFMON_VECTOR) ++#endif ++ + #endif +diff --git a/include/asm-x86/perfmon.h b/include/asm-x86/perfmon.h +new file mode 100644 +index 0000000..906f4b2 +--- /dev/null ++++ b/include/asm-x86/perfmon.h +@@ -0,0 +1,34 @@ ++/* ++ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file contains i386/x86_64 specific definitions for the perfmon ++ * interface. ++ * ++ * This file MUST never be included directly. Use linux/perfmon.h. 
++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef _ASM_X86_PERFMON__H_ ++#define _ASM_X86_PERFMON__H_ ++ ++/* ++ * arch-specific user visible interface definitions ++ */ ++ ++#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */ ++#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */ ++ ++#endif /* _ASM_X86_PERFMON_H_ */ +diff --git a/include/asm-x86/perfmon_kern.h b/include/asm-x86/perfmon_kern.h +new file mode 100644 +index 0000000..0e5d3a5 +--- /dev/null ++++ b/include/asm-x86/perfmon_kern.h +@@ -0,0 +1,548 @@ ++/* ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * Copyright (c) 2007 Advanced Micro Devices, Inc. ++ * Contributed by Robert Richter <robert.richter@amd.com> ++ * ++ * This file contains X86 Processor Family specific definitions ++ * for the perfmon interface. This covers P6, Pentium M, P4/Xeon ++ * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef _ASM_X86_PERFMON_KERN_H_ ++#define _ASM_X86_PERFMON_KERN_H_ ++ ++#ifdef CONFIG_PERFMON ++#include <linux/unistd.h> ++#ifdef CONFIG_4KSTACKS ++#define PFM_ARCH_PMD_STK_ARG 2 ++#define PFM_ARCH_PMC_STK_ARG 2 ++#else ++#define PFM_ARCH_PMD_STK_ARG 4 /* about 700 bytes of stack space */ ++#define PFM_ARCH_PMC_STK_ARG 4 /* about 200 bytes of stack space */ ++#endif ++ ++struct pfm_arch_pmu_info { ++ u32 flags; /* PMU feature flags */ ++ /* ++ * mandatory model-specific callbacks ++ */ ++ int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set); ++ int (*has_ovfls)(struct pfm_context *ctx); ++ void (*quiesce)(void); ++ ++ /* ++ * optional model-specific callbacks ++ */ ++ void (*acquire_pmu_percpu)(void); ++ void (*release_pmu_percpu)(void); ++ int (*create_context)(struct pfm_context *ctx, u32 ctx_flags); ++ void (*free_context)(struct pfm_context *ctx); ++ int (*load_context)(struct pfm_context *ctx); ++ void (*unload_context)(struct pfm_context *ctx); ++ void (*write_pmc)(struct pfm_context *ctx, unsigned int cnum, u64 value); ++ void (*write_pmd)(struct pfm_context *ctx, unsigned int cnum, u64 value); ++ u64 (*read_pmd)(struct pfm_context *ctx, unsigned int cnum); ++ u64 (*read_pmc)(struct pfm_context *ctx, unsigned int cnum); ++ void (*nmi_copy_state)(struct pfm_context *ctx); ++ void (*restore_pmcs)(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++ void (*restore_pmds)(struct pfm_context *ctx, ++ struct pfm_event_set *set); ++}; ++ ++/* ++ * PMU feature 
flags ++ */ ++#define PFM_X86_FL_USE_NMI 0x01 /* user asking for NMI */ ++#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */ ++#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */ ++ ++struct pfm_x86_ctx_flags { ++ unsigned int insecure:1; /* rdpmc per-thread self-monitoring */ ++ unsigned int use_pebs:1; /* PEBS used */ ++ unsigned int use_ds:1; /* DS used */ ++ unsigned int reserved:29; /* for future use */ ++}; ++ ++struct pfm_arch_context { ++ u64 saved_real_iip; /* instr pointer of last NMI intr */ ++ struct pfm_x86_ctx_flags flags; /* flags */ ++ void *ds_area; /* address of DS area (to go away) */ ++ void *data; /* model-specific data */ ++}; ++ ++/* ++ * functions implemented as inline on x86 ++ */ ++ ++/** ++ * pfm_arch_write_pmc - write a single PMC register ++ * @ctx: context to work on ++ * @cnum: PMC index ++ * @value: PMC 64-bit value ++ * ++ * in certain situations, ctx may be NULL ++ */ ++static inline void pfm_arch_write_pmc(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * we only write to the actual register when monitoring is ++ * active (pfm_start was issued) ++ */ ++ if (ctx && ctx->flags.started == 0) ++ return; ++ ++ /* ++ * model-specific override, if any ++ */ ++ if (pmu_info->write_pmc) { ++ pmu_info->write_pmc(ctx, cnum, value); ++ return; ++ } ++ ++ PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)", ++ pfm_pmu_conf->pmc_desc[cnum].hw_addr, ++ (unsigned long long) value); ++ ++ wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); ++} ++ ++/** ++ * pfm_arch_write_pmd - write a single PMD register ++ * @ctx: context to work on ++ * @cnum: PMD index ++ * @value: PMD 64-bit value ++ */ ++static inline void pfm_arch_write_pmd(struct pfm_context *ctx, ++ unsigned int cnum, u64 value) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * to make sure the counter overflows, we set the ++ 
* upper bits. we also clear any other unimplemented ++ * bits as this may cause crash on some processors. ++ */ ++ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) ++ value = (value | ~pfm_pmu_conf->ovfl_mask) ++ & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk; ++ ++ PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)", ++ pfm_pmu_conf->pmd_desc[cnum].hw_addr, ++ (unsigned long long) value); ++ ++ /* ++ * model-specific override, if any ++ */ ++ if (pmu_info->write_pmd) { ++ pmu_info->write_pmd(ctx, cnum, value); ++ return; ++ } ++ ++ wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); ++} ++ ++/** ++ * pfm_arch_read_pmd - read a single PMD register ++ * @ctx: context to work on ++ * @cnum: PMD index ++ * ++ * return value is register 64-bit value ++ */ ++static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ u64 tmp; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * model-specific override, if any ++ */ ++ if (pmu_info->read_pmd) ++ tmp = pmu_info->read_pmd(ctx, cnum); ++ else ++ rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp); ++ ++ PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx", ++ pfm_pmu_conf->pmd_desc[cnum].hw_addr, ++ (unsigned long long) tmp); ++ return tmp; ++} ++ ++/** ++ * pfm_arch_read_pmc - read a single PMC register ++ * @ctx: context to work on ++ * @cnum: PMC index ++ * ++ * return value is register 64-bit value ++ */ ++static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ u64 tmp; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * model-specific override, if any ++ */ ++ if (pmu_info->read_pmc) ++ tmp = pmu_info->read_pmc(ctx, cnum); ++ else ++ rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp); ++ ++ PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx", ++ pfm_pmu_conf->pmc_desc[cnum].hw_addr, ++ (unsigned long long) tmp); ++ return tmp; ++} ++ ++/** ++ * pfm_arch_is_active - return non-zero is monitoring has been 
started ++ * @ctx: context to check ++ * ++ * At certain points, perfmon needs to know if monitoring has been ++ * explicitly started. ++ * ++ * On x86, there is not other way but to use pfm_start/pfm_stop ++ * to activate monitoring, thus we can simply check flags.started ++ */ ++static inline int pfm_arch_is_active(struct pfm_context *ctx) ++{ ++ return ctx->flags.started; ++} ++ ++ ++/** ++ * pfm_arch_unload_context - detach context from thread or CPU ++ * @ctx: context to detach ++ * ++ * in system-wide ctx->task is NULL, otherwise it points to the ++ * attached thread ++ */ ++static inline void pfm_arch_unload_context(struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ struct pfm_arch_context *ctx_arch; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ pmu_info = pfm_pmu_info(); ++ ++ if (ctx_arch->flags.insecure) { ++ PFM_DBG("clear cr4.pce"); ++ clear_in_cr4(X86_CR4_PCE); ++ } ++ ++ if (pmu_info->unload_context) ++ pmu_info->unload_context(ctx); ++} ++ ++/** ++ * pfm_arch_load_context - attach context to thread or CPU ++ * @ctx: context to attach ++ */ ++static inline int pfm_arch_load_context(struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ struct pfm_arch_context *ctx_arch; ++ int ret = 0; ++ ++ ctx_arch = pfm_ctx_arch(ctx); ++ pmu_info = pfm_pmu_info(); ++ ++ /* ++ * RDPMC authorized in system-wide and ++ * per-thread self-monitoring. ++ * ++ * RDPMC only gives access to counts. ++ * ++ * The context-switch routine code does not restore ++ * all the PMD registers (optimization), thus there ++ * is a possible leak of counts there in per-thread ++ * mode. 
++ */ ++ if (ctx->task == current || ctx->flags.system) { ++ PFM_DBG("set cr4.pce"); ++ set_in_cr4(X86_CR4_PCE); ++ ctx_arch->flags.insecure = 1; ++ } ++ ++ if (pmu_info->load_context) ++ ret = pmu_info->load_context(ctx); ++ ++ return ret; ++} ++ ++void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); ++void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); ++ ++/** ++ * pfm_arch_unmask_monitoring - unmask monitoring ++ * @ctx: context to mask ++ * @set: current event set ++ * ++ * masking is slightly different from stopping in that, it does not undo ++ * the pfm_start() issued by user. This is used in conjunction with ++ * sampling. Masking means stop monitoring, but do not authorize user ++ * to issue pfm_start/stop during that time. Unmasking is achieved via ++ * pfm_restart() and also may also depend on the sampling format used. ++ * ++ * on x86 masking/unmasking use the start/stop mechanism, except ++ * that flags.started is not modified. ++ */ ++static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ pfm_arch_start(current, ctx); ++} ++ ++/** ++ * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt ++ * @ctx: current context ++ * @set: current event set ++ * ++ * called from __pfm_interrupt_handler(). ++ * ctx is not NULL. ctx is locked. interrupts are masked ++ * ++ * The following actions must take place: ++ * - stop all monitoring to ensure handler has consistent view. ++ * - collect overflowed PMDs bitmask into povfls_pmds and ++ * npend_ovfls. If no interrupt detected then npend_ovfls ++ * must be set to zero. 
++ */ ++static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ /* ++ * on X86, freezing is equivalent to stopping ++ */ ++ pfm_arch_stop(current, ctx); ++ ++ /* ++ * we mark monitoring as stopped to avoid ++ * certain side effects especially in ++ * pfm_switch_sets_from_intr() and ++ * pfm_arch_restore_pmcs() ++ */ ++ ctx->flags.started = 0; ++} ++ ++/** ++ * pfm_arch_intr_unfreeze_pmu - conditionally reactive monitoring ++ * @ctx: current context ++ * ++ * current context may be not when dealing when spurious interrupts ++ * ++ * Must re-activate monitoring if context is not MASKED. ++ * interrupts are masked. ++ */ ++static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) ++{ ++ if (ctx == NULL) ++ return; ++ ++ PFM_DBG_ovfl("state=%d", ctx->state); ++ ++ /* ++ * restore flags.started which is cleared in ++ * pfm_arch_intr_freeze_pmu() ++ */ ++ ctx->flags.started = 1; ++ ++ if (ctx->state == PFM_CTX_MASKED) ++ return; ++ ++ pfm_arch_restore_pmcs(ctx, ctx->active_set); ++} ++ ++/** ++ * pfm_arch_setfl_sane - check arch/model specific event set flags ++ * @ctx: context to work on ++ * @flags: event set flags as passed by user ++ * ++ * called from pfm_setfl_sane(). Context is locked. Interrupts are masked. ++ * ++ * Return: ++ * 0 when flags are valid ++ * 1 on error ++ */ ++static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) ++{ ++ return 0; ++} ++ ++/** ++ * pfm_arch_ovfl_reset_pmd - reset pmd on overflow ++ * @ctx: current context ++ * @cnum: PMD index ++ * ++ * On some CPUs, the upper bits of a counter must be set in order for the ++ * overflow interrupt to happen. On overflow, the counter has wrapped around, ++ * and the upper bits are cleared. This function may be used to set them back. ++ * ++ * For x86, the current version loses whatever is remaining in the counter, ++ * which is usually has a small count. 
In order not to lose this count, ++ * we do a read-modify-write to set the upper bits while preserving the ++ * low-order bits. This is slow but works. ++ */ ++static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum) ++{ ++ u64 val; ++ val = pfm_arch_read_pmd(ctx, cnum); ++ pfm_arch_write_pmd(ctx, cnum, val); ++} ++ ++/** ++ * pfm_arch_context_create - create context ++ * @ctx: newly created context ++ * @ctx_flags: context flags as passed by user ++ * ++ * called from __pfm_create_context() ++ */ ++static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ if (pmu_info->create_context) ++ return pmu_info->create_context(ctx, ctx_flags); ++ ++ return 0; ++} ++ ++/** ++ * pfm_arch_context_free - free context ++ * @ctx: context to free ++ */ ++static inline void pfm_arch_context_free(struct pfm_context *ctx) ++{ ++ struct pfm_arch_pmu_info *pmu_info; ++ ++ pmu_info = pfm_pmu_info(); ++ ++ if (pmu_info->free_context) ++ pmu_info->free_context(ctx); ++} ++ ++/* ++ * pfm_arch_clear_pmd_ovfl_cond - alter the pmds in such a way that they ++ * will not cause interrupts when unused. 
++ * ++ * This is a nop on x86 ++ */ ++static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{} ++ ++/* ++ * functions implemented in arch/x86/perfmon/perfmon.c ++ */ ++int pfm_arch_init(void); ++void pfm_arch_resend_irq(struct pfm_context *ctx); ++ ++int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx); ++void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx); ++ ++void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); ++int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg); ++void pfm_arch_pmu_config_remove(void); ++char *pfm_arch_get_pmu_module_name(void); ++int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds); ++void pfm_arch_pmu_release(void); ++ ++/* ++ * pfm_arch_serialize - make PMU modifications visible to subsequent instructions ++ * ++ * This is a nop on x86 ++ */ ++static inline void pfm_arch_serialize(void) ++{} ++ ++/* ++ * on x86, the PMDs are already saved by pfm_arch_freeze_pmu() ++ * when entering the PMU interrupt handler, thus, we do not need ++ * to save them again in pfm_switch_sets_from_intr() ++ */ ++static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{} ++ ++ ++static inline void pfm_arch_ctxswout_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{} ++ ++static inline void pfm_arch_ctxswin_sys(struct task_struct *task, ++ struct pfm_context *ctx) ++{} ++ ++static inline void pfm_arch_init_percpu(void) ++{} ++ ++static inline void pfm_cacheflush(void *addr, unsigned int len) ++{} ++ ++/* ++ * this function is called from the PMU interrupt handler ONLY. ++ * On x86, the PMU is frozen via arch_stop, masking would be implemented ++ * via arch-stop as well. Given that the PMU is already stopped when ++ * entering the interrupt handler, we do not need to stop it again, so ++ * this function is a nop. 
++ */ ++static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{} ++ ++ ++static inline void pfm_arch_arm_handle_work(struct task_struct *task) ++{} ++ ++static inline void pfm_arch_disarm_handle_work(struct task_struct *task) ++{} ++ ++static inline int pfm_arch_get_base_syscall(void) ++{ ++#ifdef __x86_64__ ++ /* 32-bit syscall definition coming from ia32_unistd.h */ ++ if (test_thread_flag(TIF_IA32)) ++ return __NR_ia32_pfm_create_context; ++#endif ++ return __NR_pfm_create_context; ++} ++ ++#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) ++/* ++ * x86 does not need extra alignment requirements for the sampling buffer ++ */ ++#define PFM_ARCH_SMPL_ALIGN_SIZE 0 ++ ++asmlinkage void pmu_interrupt(void); ++ ++#endif /* CONFIG_PERFMON */ ++ ++#endif /* _ASM_X86_PERFMON_KERN_H_ */ +diff --git a/include/asm-x86/perfmon_pebs_core_smpl.h b/include/asm-x86/perfmon_pebs_core_smpl.h +new file mode 100644 +index 0000000..4a12e0d +--- /dev/null ++++ b/include/asm-x86/perfmon_pebs_core_smpl.h +@@ -0,0 +1,164 @@ ++/* ++ * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ * ++ * This file implements the sampling format to support Intel ++ * Precise Event Based Sampling (PEBS) feature of Intel Core ++ * processors, such as Intel Core 2. ++ * ++ * What is PEBS? ++ * ------------ ++ * This is a hardware feature to enhance sampling by providing ++ * better precision as to where a sample is taken. This avoids the ++ * typical skew in the instruction one can observe with any ++ * interrupt-based sampling technique. ++ * ++ * PEBS also lowers sampling overhead significantly by having the ++ * processor store samples instead of the OS. PMU interrupt are only ++ * generated after multiple samples are written. ++ * ++ * Another benefit of PEBS is that samples can be captured inside ++ * critical sections where interrupts are masked. ++ * ++ * How does it work? ++ * PEBS effectively implements a Hw buffer. The Os must pass a region ++ * of memory where samples are to be stored. The region can have any ++ * size. The OS must also specify the sampling period to reload. The PMU ++ * will interrupt when it reaches the end of the buffer or a specified ++ * threshold location inside the memory region. ++ * ++ * The description of the buffer is stored in the Data Save Area (DS). ++ * The samples are stored sequentially in the buffer. The format of the ++ * buffer is fixed and specified in the PEBS documentation. The sample ++ * format does not change between 32-bit and 64-bit modes unlike on the ++ * Pentium 4 version of PEBS. ++ * ++ * PEBS does not work when HyperThreading is enabled due to certain MSR ++ * being shared being to two threads. ++ * ++ * What does the format do? ++ * It provides access to the PEBS feature for both 32-bit and 64-bit ++ * processors that support it. 
++ * ++ * The same code and data structures are used for both 32-bit and 64-bit ++ * modes. A single format name is used for both modes. In 32-bit mode, ++ * some of the extended registers are written to zero in each sample. ++ * ++ * It is important to realize that the format provides a zero-copy ++ * environment for the samples, i.e., the OS never touches the ++ * samples. Whatever the processor writes is directly accessible to ++ * the user. ++ * ++ * Parameters to the buffer can be passed via pfm_create_context() in ++ * the pfm_pebs_core_smpl_arg structure. ++ */ ++#ifndef __PERFMON_PEBS_CORE_SMPL_H__ ++#define __PERFMON_PEBS_CORE_SMPL_H__ 1 ++ ++/* ++ * The 32-bit and 64-bit formats are identical, thus we use only ++ * one name for the format. ++ */ ++#define PFM_PEBS_CORE_SMPL_NAME "pebs_core" ++ ++/* ++ * format specific parameters (passed at context creation) ++ * ++ * intr_thres: index from start of buffer of entry where the ++ * PMU interrupt must be triggered. It must be several samples ++ * short of the end of the buffer. ++ */ ++struct pfm_pebs_core_smpl_arg { ++ u64 cnt_reset; /* counter reset value */ ++ size_t buf_size; /* size of the PEBS buffer in bytes */ ++ size_t intr_thres;/* index of PEBS interrupt threshold entry */ ++ u64 reserved[6]; /* for future use */ ++}; ++ ++/* ++ * Data Save Area (32 and 64-bit mode) ++ * ++ * The DS area is exposed to the user. To determine the number ++ * of samples available in PEBS, it is necessary to subtract ++ * pebs_buf_base from pebs_index. ++ * ++ * Layout of the structure is mandated by hardware and specified ++ * in the Intel documentation. ++ */ ++struct pfm_ds_area_core { ++ u64 bts_buf_base; ++ u64 bts_index; ++ u64 bts_abs_max; ++ u64 bts_intr_thres; ++ u64 pebs_buf_base; ++ u64 pebs_index; ++ u64 pebs_abs_max; ++ u64 pebs_intr_thres; ++ u64 pebs_cnt_reset; ++}; ++ ++/* ++ * This header is at the beginning of the sampling buffer returned to the user. 
++ * ++ * Because of PEBS alignement constraints, the actual PEBS buffer area does ++ * not necessarily begin right after the header. The hdr_start_offs must be ++ * used to compute the first byte of the buffer. The offset is defined as ++ * the number of bytes between the end of the header and the beginning of ++ * the buffer. As such the formula is: ++ * actual_buffer = (unsigned long)(hdr+1)+hdr->hdr_start_offs ++ */ ++struct pfm_pebs_core_smpl_hdr { ++ u64 overflows; /* #overflows for buffer */ ++ size_t buf_size; /* bytes in the buffer */ ++ size_t start_offs; /* actual buffer start offset */ ++ u32 version; /* smpl format version */ ++ u32 reserved1; /* for future use */ ++ u64 reserved2[5]; /* for future use */ ++ struct pfm_ds_area_core ds; /* data save area */ ++}; ++ ++/* ++ * Sample format as mandated by Intel documentation. ++ * The same format is used in both 32 and 64 bit modes. ++ */ ++struct pfm_pebs_core_smpl_entry { ++ u64 eflags; ++ u64 ip; ++ u64 eax; ++ u64 ebx; ++ u64 ecx; ++ u64 edx; ++ u64 esi; ++ u64 edi; ++ u64 ebp; ++ u64 esp; ++ u64 r8; /* 0 in 32-bit mode */ ++ u64 r9; /* 0 in 32-bit mode */ ++ u64 r10; /* 0 in 32-bit mode */ ++ u64 r11; /* 0 in 32-bit mode */ ++ u64 r12; /* 0 in 32-bit mode */ ++ u64 r13; /* 0 in 32-bit mode */ ++ u64 r14; /* 0 in 32-bit mode */ ++ u64 r15; /* 0 in 32-bit mode */ ++}; ++ ++#define PFM_PEBS_CORE_SMPL_VERSION_MAJ 1U ++#define PFM_PEBS_CORE_SMPL_VERSION_MIN 0U ++#define PFM_PEBS_CORE_SMPL_VERSION (((PFM_PEBS_CORE_SMPL_VERSION_MAJ&0xffff)<<16)|\ ++ (PFM_PEBS_CORE_SMPL_VERSION_MIN & 0xffff)) ++ ++#endif /* __PERFMON_PEBS_CORE_SMPL_H__ */ +diff --git a/include/asm-x86/perfmon_pebs_p4_smpl.h b/include/asm-x86/perfmon_pebs_p4_smpl.h +new file mode 100644 +index 0000000..26b51b4 +--- /dev/null ++++ b/include/asm-x86/perfmon_pebs_p4_smpl.h +@@ -0,0 +1,193 @@ ++/* ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ * ++ * This file implements the sampling format to support Intel ++ * Precise Event Based Sampling (PEBS) feature of Pentium 4 ++ * and other Netburst-based processors. Not to be used for ++ * Intel Core-based processors. ++ * ++ * What is PEBS? ++ * ------------ ++ * This is a hardware feature to enhance sampling by providing ++ * better precision as to where a sample is taken. This avoids the ++ * typical skew in the instruction one can observe with any ++ * interrupt-based sampling technique. ++ * ++ * PEBS also lowers sampling overhead significantly by having the ++ * processor store samples instead of the OS. PMU interrupt are only ++ * generated after multiple samples are written. ++ * ++ * Another benefit of PEBS is that samples can be captured inside ++ * critical sections where interrupts are masked. ++ * ++ * How does it work? ++ * PEBS effectively implements a Hw buffer. The Os must pass a region ++ * of memory where samples are to be stored. The region can have any ++ * size. The OS must also specify the sampling period to reload. The PMU ++ * will interrupt when it reaches the end of the buffer or a specified ++ * threshold location inside the memory region. 
++ * ++ * The description of the buffer is stored in the Data Save Area (DS). ++ * The samples are stored sequentially in the buffer. The format of the ++ * buffer is fixed and specified in the PEBS documentation. The sample ++ * format changes between 32-bit and 64-bit modes due to extended register ++ * file. ++ * ++ * PEBS does not work when HyperThreading is enabled due to certain MSR ++ * being shared being to two threads. ++ * ++ * What does the format do? ++ * It provides access to the PEBS feature for both 32-bit and 64-bit ++ * processors that support it. ++ * ++ * The same code is used for both 32-bit and 64-bit modes, but different ++ * format names are used because the two modes are not compatible due to ++ * data model and register file differences. Similarly the public data ++ * structures describing the samples are different. ++ * ++ * It is important to realize that the format provides a zero-copy environment ++ * for the samples, i.e,, the OS never touches the samples. Whatever the ++ * processor write is directly accessible to the user. ++ * ++ * Parameters to the buffer can be passed via pfm_create_context() in ++ * the pfm_pebs_smpl_arg structure. ++ * ++ * It is not possible to mix a 32-bit PEBS application on top of a 64-bit ++ * host kernel. ++ */ ++#ifndef __PERFMON_PEBS_P4_SMPL_H__ ++#define __PERFMON_PEBS_P4_SMPL_H__ 1 ++ ++#ifdef __i386__ ++/* ++ * The 32-bit and 64-bit formats are not compatible, thus we have ++ * two different identifications so that 32-bit programs running on ++ * 64-bit OS will fail to use the 64-bit PEBS support. ++ */ ++#define PFM_PEBS_P4_SMPL_NAME "pebs32_p4" ++#else ++#define PFM_PEBS_P4_SMPL_NAME "pebs64_p4" ++#endif ++ ++/* ++ * format specific parameters (passed at context creation) ++ * ++ * intr_thres: index from start of buffer of entry where the ++ * PMU interrupt must be triggered. It must be several samples ++ * short of the end of the buffer. 
++ */ ++struct pfm_pebs_p4_smpl_arg { ++ u64 cnt_reset; /* counter reset value */ ++ size_t buf_size; /* size of the PEBS buffer in bytes */ ++ size_t intr_thres;/* index of PEBS interrupt threshold entry */ ++ u64 reserved[6]; /* for future use */ ++}; ++ ++/* ++ * Data Save Area (32 and 64-bit mode) ++ * ++ * The DS area must be exposed to the user because this is the only ++ * way to report on the number of valid entries recorded by the CPU. ++ * This is required when the buffer is not full, i.e., there was no ++ * PMU interrupt. ++ * ++ * Layout of the structure is mandated by hardware and specified in ++ * the Intel documentation. ++ */ ++struct pfm_ds_area_p4 { ++ unsigned long bts_buf_base; ++ unsigned long bts_index; ++ unsigned long bts_abs_max; ++ unsigned long bts_intr_thres; ++ unsigned long pebs_buf_base; ++ unsigned long pebs_index; ++ unsigned long pebs_abs_max; ++ unsigned long pebs_intr_thres; ++ u64 pebs_cnt_reset; ++}; ++ ++/* ++ * This header is at the beginning of the sampling buffer returned to the user. ++ * ++ * Because of PEBS alignment constraints, the actual PEBS buffer area does ++ * not necessarily begin right after the header. The start_offs field must be ++ * used to compute the first byte of the buffer. The offset is defined as ++ * the number of bytes between the end of the header and the beginning of ++ * the buffer. As such the formula is: ++ * actual_buffer = (unsigned long)(hdr+1)+hdr->start_offs ++ */ ++struct pfm_pebs_p4_smpl_hdr { ++ u64 overflows; /* #overflows for buffer */ ++ size_t buf_size; /* bytes in the buffer */ ++ size_t start_offs; /* actual buffer start offset */ ++ u32 version; /* smpl format version */ ++ u32 reserved1; /* for future use */ ++ u64 reserved2[5]; /* for future use */ ++ struct pfm_ds_area_p4 ds; /* data save area */ ++}; ++ ++/* ++ * 64-bit PEBS record format is described in ++ * http://www.intel.com/technology/64bitextensions/30083502.pdf ++ * ++ * The format does not peek at samples. 
The sample structure is only ++ * used to ensure that the buffer is large enough to accomodate one ++ * sample. ++ */ ++#ifdef __i386__ ++struct pfm_pebs_p4_smpl_entry { ++ u32 eflags; ++ u32 ip; ++ u32 eax; ++ u32 ebx; ++ u32 ecx; ++ u32 edx; ++ u32 esi; ++ u32 edi; ++ u32 ebp; ++ u32 esp; ++}; ++#else ++struct pfm_pebs_p4_smpl_entry { ++ u64 eflags; ++ u64 ip; ++ u64 eax; ++ u64 ebx; ++ u64 ecx; ++ u64 edx; ++ u64 esi; ++ u64 edi; ++ u64 ebp; ++ u64 esp; ++ u64 r8; ++ u64 r9; ++ u64 r10; ++ u64 r11; ++ u64 r12; ++ u64 r13; ++ u64 r14; ++ u64 r15; ++}; ++#endif ++ ++#define PFM_PEBS_P4_SMPL_VERSION_MAJ 1U ++#define PFM_PEBS_P4_SMPL_VERSION_MIN 0U ++#define PFM_PEBS_P4_SMPL_VERSION (((PFM_PEBS_P4_SMPL_VERSION_MAJ&0xffff)<<16)|\ ++ (PFM_PEBS_P4_SMPL_VERSION_MIN & 0xffff)) ++ ++#endif /* __PERFMON_PEBS_P4_SMPL_H__ */ +diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h +index da0a675..b3a6ae9 100644 +--- a/include/asm-x86/thread_info.h ++++ b/include/asm-x86/thread_info.h +@@ -71,6 +71,7 @@ struct thread_info { + * Warning: layout of LSW is hardcoded in entry.S + */ + #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ ++#define TIF_PERFMON_WORK 1 /* work for pfm_handle_work() */ + #define TIF_SIGPENDING 2 /* signal pending */ + #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ + #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +@@ -91,6 +92,7 @@ struct thread_info { + #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ + #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ + #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ ++#define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */ + + #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) +@@ -112,6 +114,8 @@ struct thread_info { + #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) + #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) + #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) 
++#define _TIF_PERFMON_WORK (1<<TIF_PERFMON_WORK) ++#define _TIF_PERFMON_CTXSW (1<<TIF_PERFMON_CTXSW) + + /* work to do in syscall_trace_enter() */ + #define _TIF_WORK_SYSCALL_ENTRY \ +@@ -133,12 +137,12 @@ struct thread_info { + + /* Only used for 64 bit */ + #define _TIF_DO_NOTIFY_MASK \ +- (_TIF_SIGPENDING|_TIF_MCE_NOTIFY) ++ (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERFMON_WORK) + + /* flags to check in __switch_to() */ + #define _TIF_WORK_CTXSW \ + (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \ +- _TIF_NOTSC) ++ _TIF_NOTSC|_TIF_PERFMON_CTXSW) + + #define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW + #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) +diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h +index d739467..5d8cca1 100644 +--- a/include/asm-x86/unistd_32.h ++++ b/include/asm-x86/unistd_32.h +@@ -338,9 +338,23 @@ + #define __NR_dup3 330 + #define __NR_pipe2 331 + #define __NR_inotify_init1 332 ++#define __NR_pfm_create_context 333 ++#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) ++#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) ++#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) ++#define __NR_pfm_load_context (__NR_pfm_create_context+4) ++#define __NR_pfm_start (__NR_pfm_create_context+5) ++#define __NR_pfm_stop (__NR_pfm_create_context+6) ++#define __NR_pfm_restart (__NR_pfm_create_context+7) ++#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) ++#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) ++#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) ++#define __NR_pfm_unload_context (__NR_pfm_create_context+11) + + #ifdef __KERNEL__ + ++#define NR_syscalls 345 ++ + #define __ARCH_WANT_IPC_PARSE_VERSION + #define __ARCH_WANT_OLD_READDIR + #define __ARCH_WANT_OLD_STAT +diff --git a/include/asm-x86/unistd_64.h b/include/asm-x86/unistd_64.h +index 3a341d7..75dac98 100644 +--- a/include/asm-x86/unistd_64.h ++++ b/include/asm-x86/unistd_64.h +@@ -653,7 +653,30 @@ 
__SYSCALL(__NR_dup3, sys_dup3) + __SYSCALL(__NR_pipe2, sys_pipe2) + #define __NR_inotify_init1 294 + __SYSCALL(__NR_inotify_init1, sys_inotify_init1) +- ++#define __NR_pfm_create_context 295 ++__SYSCALL(__NR_pfm_create_context, sys_pfm_create_context) ++#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) ++__SYSCALL(__NR_pfm_write_pmcs, sys_pfm_write_pmcs) ++#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) ++__SYSCALL(__NR_pfm_write_pmds, sys_pfm_write_pmds) ++#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) ++ __SYSCALL(__NR_pfm_read_pmds, sys_pfm_read_pmds) ++#define __NR_pfm_load_context (__NR_pfm_create_context+4) ++__SYSCALL(__NR_pfm_load_context, sys_pfm_load_context) ++#define __NR_pfm_start (__NR_pfm_create_context+5) ++__SYSCALL(__NR_pfm_start, sys_pfm_start) ++#define __NR_pfm_stop (__NR_pfm_create_context+6) ++__SYSCALL(__NR_pfm_stop, sys_pfm_stop) ++#define __NR_pfm_restart (__NR_pfm_create_context+7) ++__SYSCALL(__NR_pfm_restart, sys_pfm_restart) ++#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) ++__SYSCALL(__NR_pfm_create_evtsets, sys_pfm_create_evtsets) ++#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) ++__SYSCALL(__NR_pfm_getinfo_evtsets, sys_pfm_getinfo_evtsets) ++#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) ++__SYSCALL(__NR_pfm_delete_evtsets, sys_pfm_delete_evtsets) ++#define __NR_pfm_unload_context (__NR_pfm_create_context+11) ++__SYSCALL(__NR_pfm_unload_context, sys_pfm_unload_context) + + #ifndef __NO_STUBS + #define __ARCH_WANT_OLD_READDIR +diff --git a/include/linux/Kbuild b/include/linux/Kbuild +index b68ec09..d37036a 100644 +--- a/include/linux/Kbuild ++++ b/include/linux/Kbuild +@@ -162,6 +162,8 @@ header-y += video_decoder.h + header-y += video_encoder.h + header-y += videotext.h + header-y += x25.h ++header-y += perfmon.h ++header-y += perfmon_dfl_smpl.h + + unifdef-y += acct.h + unifdef-y += adb.h +diff --git a/include/linux/perfmon.h b/include/linux/perfmon.h +new file mode 
100644 +index 0000000..5d9b977 +--- /dev/null ++++ b/include/linux/perfmon.h +@@ -0,0 +1,213 @@ ++/* ++ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++ ++#ifndef __LINUX_PERFMON_H__ ++#define __LINUX_PERFMON_H__ ++ ++/* ++ * This file contains all the user visible generic definitions for the ++ * interface. Model-specific user-visible definitions are located in ++ * the asm/perfmon.h file. ++ */ ++ ++/* ++ * include arch-specific user interface definitions ++ */ ++#include <asm/perfmon.h> ++ ++/* ++ * defined by each arch ++ */ ++#define PFM_MAX_PMCS PFM_ARCH_MAX_PMCS ++#define PFM_MAX_PMDS PFM_ARCH_MAX_PMDS ++ ++/* ++ * number of elements for each type of bitvector ++ * all bitvectors use u64 fixed size type on all architectures. 
++ */ ++#define PFM_BVSIZE(x) (((x)+(sizeof(__u64)<<3)-1) / (sizeof(__u64)<<3)) ++#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS) ++#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS) ++ ++/* ++ * register flags layout: ++ * bit[00-15] : generic flags ++ * bit[16-31] : arch-specific flags ++ * ++ * PFM_REGFL_NO_EMUL64: must be set on the PMC controlling the PMD ++ */ ++#define PFM_REGFL_OVFL_NOTIFY 0x1 /* PMD: send notification on event */ ++#define PFM_REGFL_RANDOM 0x2 /* PMD: randomize value after event */ ++#define PFM_REGFL_NO_EMUL64 0x4 /* PMC: no 64-bit emulation */ ++ ++/* ++ * event set flags layout: ++ * bits[00-15] : generic flags ++ * bits[16-31] : arch-specific flags (see asm/perfmon.h) ++ */ ++#define PFM_SETFL_OVFL_SWITCH 0x01 /* enable switch on overflow */ ++#define PFM_SETFL_TIME_SWITCH 0x02 /* enable switch on timeout */ ++ ++/* ++ * argument to pfm_create_context() system call ++ * structure shared with user level ++ */ ++struct pfarg_ctx { ++ __u32 ctx_flags; /* noblock/block/syswide */ ++ __u32 ctx_reserved1; /* for future use */ ++ __u64 ctx_reserved2[7]; /* for future use */ ++}; ++ ++/* ++ * context flags layout: ++ * bits[00-15]: generic flags ++ * bits[16-31]: arch-specific flags (see perfmon_const.h) ++ */ ++#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user notifications */ ++#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ ++#define PFM_FL_OVFL_NO_MSG 0x80 /* no overflow msgs */ ++ ++/* ++ * argument to pfm_write_pmcs() system call. ++ * structure shared with user level ++ */ ++struct pfarg_pmc { ++ __u16 reg_num; /* which register */ ++ __u16 reg_set; /* event set for this register */ ++ __u32 reg_flags; /* REGFL flags */ ++ __u64 reg_value; /* pmc value */ ++ __u64 reg_reserved2[4]; /* for future use */ ++}; ++ ++/* ++ * argument to pfm_write_pmds() and pfm_read_pmds() system calls. 
++ * structure shared with user level ++ */ ++struct pfarg_pmd { ++ __u16 reg_num; /* which register */ ++ __u16 reg_set; /* event set for this register */ ++ __u32 reg_flags; /* REGFL flags */ ++ __u64 reg_value; /* initial pmc/pmd value */ ++ __u64 reg_long_reset; /* value to reload after notification */ ++ __u64 reg_short_reset; /* reset after counter overflow */ ++ __u64 reg_last_reset_val; /* return: PMD last reset value */ ++ __u64 reg_ovfl_switch_cnt; /* #overflows before switch */ ++ __u64 reg_reset_pmds[PFM_PMD_BV]; /* reset on overflow */ ++ __u64 reg_smpl_pmds[PFM_PMD_BV]; /* record in sample */ ++ __u64 reg_smpl_eventid; /* opaque event identifier */ ++ __u64 reg_random_mask; /* bitmask used to limit random value */ ++ __u32 reg_random_seed; /* seed for randomization (OBSOLETE) */ ++ __u32 reg_reserved2[7]; /* for future use */ ++}; ++ ++/* ++ * optional argument to pfm_start() system call. Pass NULL if not needed. ++ * structure shared with user level ++ */ ++struct pfarg_start { ++ __u16 start_set; /* event set to start with */ ++ __u16 start_reserved1; /* for future use */ ++ __u32 start_reserved2; /* for future use */ ++ __u64 reserved3[3]; /* for future use */ ++}; ++ ++/* ++ * argument to pfm_load_context() system call. ++ * structure shared with user level ++ */ ++struct pfarg_load { ++ __u32 load_pid; /* thread or CPU to attach to */ ++ __u16 load_set; /* set to load first */ ++ __u16 load_reserved1; /* for future use */ ++ __u64 load_reserved2[3]; /* for future use */ ++}; ++ ++/* ++ * argument to pfm_create_evtsets() and pfm_delete_evtsets() system calls. ++ * structure shared with user level. ++ */ ++struct pfarg_setdesc { ++ __u16 set_id; /* which set */ ++ __u16 set_reserved1; /* for future use */ ++ __u32 set_flags; /* SETFL flags */ ++ __u64 set_timeout; /* switch timeout in nsecs */ ++ __u64 reserved[6]; /* for future use */ ++}; ++ ++/* ++ * argument to pfm_getinfo_evtsets() system call. 
++ * structure shared with user level ++ */ ++struct pfarg_setinfo { ++ __u16 set_id; /* which set */ ++ __u16 set_reserved1; /* for future use */ ++ __u32 set_flags; /* out: SETFL flags */ ++ __u64 set_ovfl_pmds[PFM_PMD_BV]; /* out: last ovfl PMDs */ ++ __u64 set_runs; /* out: #times the set was active */ ++ __u64 set_timeout; /* out: eff/leftover timeout (nsecs) */ ++ __u64 set_act_duration; /* out: time set was active in nsecs */ ++ __u64 set_avail_pmcs[PFM_PMC_BV];/* out: available PMCs */ ++ __u64 set_avail_pmds[PFM_PMD_BV];/* out: available PMDs */ ++ __u64 set_reserved3[6]; /* for future use */ ++}; ++ ++/* ++ * default value for the user and group security parameters in ++ * /proc/sys/kernel/perfmon/sys_group ++ * /proc/sys/kernel/perfmon/task_group ++ */ ++#define PFM_GROUP_PERM_ANY -1 /* any user/group */ ++ ++/* ++ * overflow notification message. ++ * structure shared with user level ++ */ ++struct pfarg_ovfl_msg { ++ __u32 msg_type; /* message type: PFM_MSG_OVFL */ ++ __u32 msg_ovfl_pid; /* process id */ ++ __u16 msg_active_set; /* active set at overflow */ ++ __u16 msg_ovfl_cpu; /* cpu of PMU interrupt */ ++ __u32 msg_ovfl_tid; /* thread id */ ++ __u64 msg_ovfl_ip; /* IP on PMU intr */ ++ __u64 msg_ovfl_pmds[PFM_PMD_BV];/* overflowed PMDs */ ++}; ++ ++#define PFM_MSG_OVFL 1 /* an overflow happened */ ++#define PFM_MSG_END 2 /* task to which context was attached ended */ ++ ++/* ++ * generic notification message (union). 
++ * union shared with user level ++ */ ++union pfarg_msg { ++ __u32 type; ++ struct pfarg_ovfl_msg pfm_ovfl_msg; ++}; ++ ++/* ++ * perfmon version number ++ */ ++#define PFM_VERSION_MAJ 2U ++#define PFM_VERSION_MIN 82U ++#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\ ++ (PFM_VERSION_MIN & 0xffff)) ++#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) ++#define PFM_VERSION_MINOR(x) ((x) & 0xffff) ++ ++#endif /* __LINUX_PERFMON_H__ */ +diff --git a/include/linux/perfmon_dfl_smpl.h b/include/linux/perfmon_dfl_smpl.h +new file mode 100644 +index 0000000..e0817a8 +--- /dev/null ++++ b/include/linux/perfmon_dfl_smpl.h +@@ -0,0 +1,78 @@ ++/* ++ * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This file implements the new dfl sampling buffer format ++ * for perfmon2 subsystem. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef __PERFMON_DFL_SMPL_H__ ++#define __PERFMON_DFL_SMPL_H__ 1 ++ ++/* ++ * format specific parameters (passed at context creation) ++ */ ++struct pfm_dfl_smpl_arg { ++ __u64 buf_size; /* size of the buffer in bytes */ ++ __u32 buf_flags; /* buffer specific flags */ ++ __u32 reserved1; /* for future use */ ++ __u64 reserved[6]; /* for future use */ ++}; ++ ++/* ++ * This header is at the beginning of the sampling buffer returned to the user. ++ * It is directly followed by the first record. ++ */ ++struct pfm_dfl_smpl_hdr { ++ __u64 hdr_count; /* how many valid entries */ ++ __u64 hdr_cur_offs; /* current offset from top of buffer */ ++ __u64 hdr_overflows; /* #overflows for buffer */ ++ __u64 hdr_buf_size; /* bytes in the buffer */ ++ __u64 hdr_min_buf_space;/* minimal buffer size (internal use) */ ++ __u32 hdr_version; /* smpl format version */ ++ __u32 hdr_buf_flags; /* copy of buf_flags */ ++ __u64 hdr_reserved[10]; /* for future use */ ++}; ++ ++/* ++ * Entry header in the sampling buffer. The header is directly followed ++ * with the values of the PMD registers of interest saved in increasing ++ * index order: PMD4, PMD5, and so on. How many PMDs are present depends ++ * on how the session was programmed. ++ * ++ * In the case where multiple counters overflow at the same time, multiple ++ * entries are written consecutively. ++ * ++ * last_reset_value member indicates the initial value of the overflowed PMD. 
++ */ ++struct pfm_dfl_smpl_entry { ++ __u32 pid; /* thread id (for NPTL, this is gettid()) */ ++ __u16 ovfl_pmd; /* index of overflowed PMD for this sample */ ++ __u16 reserved; /* for future use */ ++ __u64 last_reset_val; /* initial value of overflowed PMD */ ++ __u64 ip; /* where the overflow intr happened */ ++ __u64 tstamp; /* overflow timestamp */ ++ __u16 cpu; /* cpu on which the overflow occurred */ ++ __u16 set; /* event set active when overflow occurred */ ++ __u32 tgid; /* thread group id (getpid() for NPTL) */ ++}; ++ ++#define PFM_DFL_SMPL_VERSION_MAJ 1U ++#define PFM_DFL_SMPL_VERSION_MIN 0U ++#define PFM_DFL_SMPL_VERSION (((PFM_DFL_SMPL_VERSION_MAJ&0xffff)<<16)|\ ++ (PFM_DFL_SMPL_VERSION_MIN & 0xffff)) ++ ++#endif /* __PERFMON_DFL_SMPL_H__ */ +diff --git a/include/linux/perfmon_fmt.h b/include/linux/perfmon_fmt.h +new file mode 100644 +index 0000000..82a6a90 +--- /dev/null ++++ b/include/linux/perfmon_fmt.h +@@ -0,0 +1,74 @@ ++/* ++ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * Interface for custom sampling buffer format modules ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef __PERFMON_FMT_H__ ++#define __PERFMON_FMT_H__ 1 ++ ++#include <linux/kobject.h> ++ ++typedef int (*fmt_validate_t)(u32 flags, u16 npmds, void *arg); ++typedef int (*fmt_getsize_t)(u32 flags, void *arg, size_t *size); ++typedef int (*fmt_init_t)(struct pfm_context *ctx, void *buf, u32 flags, ++ u16 npmds, void *arg); ++typedef int (*fmt_restart_t)(int is_active, u32 *ovfl_ctrl, void *buf); ++typedef int (*fmt_exit_t)(void *buf); ++typedef int (*fmt_handler_t)(struct pfm_context *ctx, ++ unsigned long ip, u64 stamp, void *data); ++ ++struct pfm_smpl_fmt { ++ char *fmt_name; /* name of the format (required) */ ++ size_t fmt_arg_size; /* size of fmt args for ctx create */ ++ u32 fmt_flags; /* format specific flags */ ++ u32 fmt_version; /* format version number */ ++ ++ fmt_validate_t fmt_validate; /* validate context flags */ ++ fmt_getsize_t fmt_getsize; /* get size for sampling buffer */ ++ fmt_init_t fmt_init; /* initialize buffer area */ ++ fmt_handler_t fmt_handler; /* overflow handler (required) */ ++ fmt_restart_t fmt_restart; /* restart after notification */ ++ fmt_exit_t fmt_exit; /* context termination */ ++ ++ struct list_head fmt_list; /* internal use only */ ++ ++ struct kobject kobj; /* sysfs internal use only */ ++ struct module *owner; /* pointer to module owner */ ++ u32 fmt_qdepth; /* Max notify queue depth (required) */ ++}; ++#define to_smpl_fmt(n) container_of(n, struct pfm_smpl_fmt, kobj) ++ ++#define PFM_FMTFL_IS_BUILTIN 0x1 /* fmt is compiled in */ ++/* ++ * we need to know whether the format is builtin or compiled ++ * as a module ++ */ ++#ifdef MODULE ++#define PFM_FMT_BUILTIN_FLAG 0 /* built as a module */ ++#else ++#define PFM_FMT_BUILTIN_FLAG PFM_FMTFL_IS_BUILTIN /* compiled into the kernel */ ++#endif ++ 
++int pfm_fmt_register(struct pfm_smpl_fmt *fmt); ++int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt); ++void pfm_sysfs_builtin_fmt_add(void); ++ ++int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt); ++void pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt); ++ ++#endif /* __PERFMON_FMT_H__ */ +diff --git a/include/linux/perfmon_kern.h b/include/linux/perfmon_kern.h +new file mode 100644 +index 0000000..6c3b527 +--- /dev/null ++++ b/include/linux/perfmon_kern.h +@@ -0,0 +1,551 @@ ++/* ++ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++ ++#ifndef __LINUX_PERFMON_KERN_H__ ++#define __LINUX_PERFMON_KERN_H__ ++/* ++ * This file contains all the definitions of data structures, variables, macros ++ * that are to be shared between generic code and arch-specific code ++ * ++ * For generic only definitions, use perfmon/perfmon_priv.h ++ */ ++#ifdef CONFIG_PERFMON ++ ++#include <linux/file.h> ++#include <linux/sched.h> ++#include <linux/perfmon.h> ++ ++/* ++ * system administrator configuration controls available via ++ * the /sys/kernel/perfmon interface ++ */ ++struct pfm_controls { ++ u32 debug; /* debugging control bitmask */ ++ gid_t sys_group; /* gid to create a syswide context */ ++ gid_t task_group; /* gid to create a per-task context */ ++ u32 flags; /* control flags (see below) */ ++ size_t arg_mem_max; /* maximum vector argument size */ ++ size_t smpl_buffer_mem_max; /* max buf mem, -1 for infinity */ ++}; ++extern struct pfm_controls pfm_controls; ++ ++/* ++ * control flags ++ */ ++#define PFM_CTRL_FL_RW_EXPERT 0x1 /* bypass reserved fields on read/write */ ++ ++/* ++ * software PMD ++ */ ++struct pfm_pmd { ++ u64 value; /* 64-bit value */ ++ u64 lval; /* last reset value */ ++ u64 ovflsw_thres; /* #ovfls left before switch */ ++ u64 long_reset; /* long reset value on overflow */ ++ u64 short_reset; /* short reset value on overflow */ ++ u64 reset_pmds[PFM_PMD_BV]; /* pmds to reset on overflow */ ++ u64 smpl_pmds[PFM_PMD_BV]; /* pmds to record on overflow */ ++ u64 mask; /* range mask for random value */ ++ u64 ovflsw_ref_thres; /* #ovfls before next set */ ++ u64 eventid; /* opaque event identifier */ ++ u32 flags; /* notify/do not notify */ ++}; ++ ++/* ++ * event_set: encapsulates the full PMU state ++ */ ++struct pfm_event_set { ++ struct list_head list; /* ordered 
chain of sets */ ++ u16 id; /* set identification */ ++ u16 nused_pmds; /* max number of used PMDs */ ++ u16 nused_pmcs; /* max number of used PMCs */ ++ u16 pad1; /* padding */ ++ u32 flags; /* public flags */ ++ u32 priv_flags; /* private flags (see below) */ ++ u64 runs; /* # of activations */ ++ u32 npend_ovfls; /* number of pending PMD overflows */ ++ u32 pad2; /* padding */ ++ u64 used_pmds[PFM_PMD_BV]; /* used PMDs */ ++ u64 povfl_pmds[PFM_PMD_BV]; /* pending overflowed PMDs */ ++ u64 ovfl_pmds[PFM_PMD_BV]; /* last overflowed PMDs */ ++ u64 reset_pmds[PFM_PMD_BV]; /* PMDs to reset after overflow */ ++ u64 ovfl_notify[PFM_PMD_BV]; /* notify on overflow */ ++ u64 used_pmcs[PFM_PMC_BV]; /* used PMCs */ ++ u64 pmcs[PFM_MAX_PMCS]; /* PMC values */ ++ ++ struct pfm_pmd pmds[PFM_MAX_PMDS]; ++ ++ ktime_t hrtimer_exp; /* switch timeout reference */ ++ ktime_t hrtimer_rem; /* per-thread remainder timeout */ ++ ++ u64 duration_start; /* start time in ns */ ++ u64 duration; /* total active ns */ ++}; ++ ++/* ++ * common private event set flags (priv_flags) ++ * ++ * upper 16 bits: for arch-specific use ++ * lower 16 bits: for common use ++ */ ++#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */ ++#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */ ++#define PFM_SETFL_PRIV_SWITCH 0x4 /* must switch set on restart */ ++#define PFM_SETFL_PRIV_MOD_BOTH (PFM_SETFL_PRIV_MOD_PMDS \ ++ | PFM_SETFL_PRIV_MOD_PMCS) ++ ++/* ++ * context flags ++ */ ++struct pfm_context_flags { ++ unsigned int block:1; /* task blocks on user notifications */ ++ unsigned int system:1; /* do system wide monitoring */ ++ unsigned int no_msg:1; /* no message sent on overflow */ ++ unsigned int switch_ovfl:1; /* switch set on counter ovfl */ ++ unsigned int switch_time:1; /* switch set on timeout */ ++ unsigned int started:1; /* pfm_start() issued */ ++ unsigned int work_type:2; /* type of work for pfm_handle_work */ ++ unsigned int mmap_nlock:1; /* no lock in 
pfm_release_buf_space */ ++ unsigned int ia64_v20_compat:1; /* context is IA-64 v2.0 mode */ ++ unsigned int can_restart:8; /* allowed to issue a PFM_RESTART */ ++ unsigned int reset_count:8; /* number of pending resets */ ++ unsigned int is_self:1; /* per-thread and self-monitoring */ ++ unsigned int reserved:5; /* for future use */ ++}; ++ ++/* ++ * values for work_type (TIF_PERFMON_WORK must be set) ++ */ ++#define PFM_WORK_NONE 0 /* nothing to do */ ++#define PFM_WORK_RESET 1 /* reset overflowed counters */ ++#define PFM_WORK_BLOCK 2 /* block current thread */ ++#define PFM_WORK_ZOMBIE 3 /* cleanup zombie context */ ++ ++/* ++ * overflow description argument passed to sampling format ++ */ ++struct pfm_ovfl_arg { ++ u16 ovfl_pmd; /* index of overflowed PMD */ ++ u16 active_set; /* set active at the time of the overflow */ ++ u32 ovfl_ctrl; /* control flags */ ++ u64 pmd_last_reset; /* last reset value of overflowed PMD */ ++ u64 smpl_pmds_values[PFM_MAX_PMDS]; /* values of other PMDs */ ++ u64 pmd_eventid; /* eventid associated with PMD */ ++ u16 num_smpl_pmds; /* number of PMDS in smpl_pmds_values */ ++}; ++/* ++ * depth of message queue ++ * ++ * Depth cannot be bigger than 255 (see reset_count) ++ */ ++#define PFM_MSGS_ORDER 3 /* log2(number of messages) */ ++#define PFM_MSGS_COUNT (1<<PFM_MSGS_ORDER) /* number of messages */ ++#define PFM_MSGQ_MASK (PFM_MSGS_COUNT-1) ++ ++/* ++ * perfmon context state ++ */ ++#define PFM_CTX_UNLOADED 1 /* context is not loaded onto any task */ ++#define PFM_CTX_LOADED 2 /* context is loaded onto a task */ ++#define PFM_CTX_MASKED 3 /* context is loaded, monitoring is masked */ ++#define PFM_CTX_ZOMBIE 4 /* context lost owner but still attached */ ++ ++/* ++ * registers description ++ */ ++struct pfm_regdesc { ++ u64 pmcs[PFM_PMC_BV]; /* available PMC */ ++ u64 pmds[PFM_PMD_BV]; /* available PMD */ ++ u64 rw_pmds[PFM_PMD_BV]; /* available RW PMD */ ++ u64 intr_pmds[PFM_PMD_BV]; /* PMD generating intr */ ++ u64 
cnt_pmds[PFM_PMD_BV]; /* PMD counters */ ++ u16 max_pmc; /* highest+1 avail PMC */ ++ u16 max_pmd; /* highest+1 avail PMD */ ++ u16 max_rw_pmd; /* highest+1 avail RW PMD */ ++ u16 first_intr_pmd; /* first intr PMD */ ++ u16 max_intr_pmd; /* highest+1 intr PMD */ ++ u16 num_rw_pmd; /* number of avail RW PMD */ ++ u16 num_pmcs; /* number of logical PMCS */ ++ u16 num_pmds; /* number of logical PMDS */ ++ u16 num_counters; /* number of counting PMD */ ++}; ++ ++/* ++ * context: contains all the state of a session ++ */ ++struct pfm_context { ++ spinlock_t lock; /* context protection */ ++ ++ struct pfm_context_flags flags; ++ u32 state; /* current state */ ++ struct task_struct *task; /* attached task */ ++ ++ struct completion restart_complete;/* block on notification */ ++ u64 last_act; /* last activation */ ++ u32 last_cpu; /* last CPU used (SMP only) */ ++ u32 cpu; /* cpu bound to context */ ++ ++ struct pfm_smpl_fmt *smpl_fmt; /* sampling format callbacks */ ++ void *smpl_addr; /* user smpl buffer base */ ++ size_t smpl_size; /* user smpl buffer size */ ++ void *smpl_real_addr;/* actual smpl buffer base */ ++ size_t smpl_real_size; /* actual smpl buffer size */ ++ ++ wait_queue_head_t msgq_wait; /* pfm_read() wait queue */ ++ ++ union pfarg_msg msgq[PFM_MSGS_COUNT]; ++ int msgq_head; ++ int msgq_tail; ++ ++ struct fasync_struct *async_queue; /* async notification */ ++ ++ struct pfm_event_set *active_set; /* active set */ ++ struct list_head set_list; /* ordered list of sets */ ++ ++ struct pfm_regdesc regs; /* registers available to context */ ++ ++ /* ++ * save stack space by allocating temporary variables for ++ * pfm_overflow_handler() in pfm_context ++ */ ++ struct pfm_ovfl_arg ovfl_arg; ++ u64 tmp_ovfl_notify[PFM_PMD_BV]; ++}; ++ ++/* ++ * ovfl_ctrl bitmask (used by interrupt handler) ++ */ ++#define PFM_OVFL_CTRL_NOTIFY 0x1 /* notify user */ ++#define PFM_OVFL_CTRL_RESET 0x2 /* reset overflowed pmds */ ++#define PFM_OVFL_CTRL_MASK 0x4 /* mask monitoring */ 
++#define PFM_OVFL_CTRL_SWITCH 0x8 /* switch sets */ ++ ++/* ++ * logging ++ */ ++#define PFM_ERR(f, x...) printk(KERN_ERR "perfmon: " f "\n", ## x) ++#define PFM_WARN(f, x...) printk(KERN_WARNING "perfmon: " f "\n", ## x) ++#define PFM_LOG(f, x...) printk(KERN_NOTICE "perfmon: " f "\n", ## x) ++#define PFM_INFO(f, x...) printk(KERN_INFO "perfmon: " f "\n", ## x) ++ ++/* ++ * debugging ++ * ++ * Printk rate limiting is enforced to avoid getting flooded with too many ++ * error messages on the console (which could render the machine unresponsive). ++ * To get full debug output (turn off ratelimit): ++ * $ echo 0 >/proc/sys/kernel/printk_ratelimit ++ * ++ * debug is a bitmask where bits are defined as follows: ++ * bit 0: enable non-interrupt code debug messages ++ * bit 1: enable interrupt code debug messages ++ */ ++#ifdef CONFIG_PERFMON_DEBUG ++#define _PFM_DBG(lm, f, x...) \ ++ do { \ ++ if (unlikely((pfm_controls.debug & lm) && printk_ratelimit())) { \ ++ preempt_disable(); \ ++ printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \ ++ __func__, __LINE__, \ ++ smp_processor_id(), current->pid , ## x); \ ++ preempt_enable(); \ ++ } \ ++ } while (0) ++ ++#define PFM_DBG(f, x...) _PFM_DBG(0x1, f, ##x) ++#define PFM_DBG_ovfl(f, x...) _PFM_DBG(0x2, f, ## x) ++#else ++#define PFM_DBG(f, x...) do {} while (0) ++#define PFM_DBG_ovfl(f, x...) 
do {} while (0) ++#endif ++ ++extern struct pfm_pmu_config *pfm_pmu_conf; ++extern int perfmon_disabled; ++ ++static inline struct pfm_arch_context *pfm_ctx_arch(struct pfm_context *c) ++{ ++ return (struct pfm_arch_context *)(c+1); ++} ++ ++int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr, ++ void **req, void **to_free); ++ ++int pfm_get_smpl_arg(char __user *fmt_uname, void __user *uaddr, size_t usize, ++ void **arg, struct pfm_smpl_fmt **fmt); ++ ++int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, ++ int count); ++int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count, ++ int compat); ++int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count); ++ ++int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *req, ++ struct task_struct *task); ++int __pfm_unload_context(struct pfm_context *ctx, int *can_release); ++ ++int __pfm_stop(struct pfm_context *ctx, int *release_info); ++int __pfm_restart(struct pfm_context *ctx, int *unblock); ++int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start); ++ ++void pfm_free_context(struct pfm_context *ctx); ++ ++void pfm_smpl_buf_space_release(struct pfm_context *ctx, size_t size); ++ ++int pfm_check_task_state(struct pfm_context *ctx, int check_mask, ++ unsigned long *flags, void **resume); ++/* ++ * check_mask bitmask values for pfm_check_task_state() ++ */ ++#define PFM_CMD_STOPPED 0x01 /* command needs thread stopped */ ++#define PFM_CMD_UNLOADED 0x02 /* command needs ctx unloaded */ ++#define PFM_CMD_UNLOAD 0x04 /* command is unload */ ++ ++int __pfm_create_context(struct pfarg_ctx *req, ++ struct pfm_smpl_fmt *fmt, ++ void *fmt_arg, ++ int mode, ++ struct pfm_context **new_ctx); ++ ++struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, ++ int alloc); ++ ++int pfm_pmu_conf_get(int autoload); ++void pfm_pmu_conf_put(void); ++ ++int pfm_session_allcpus_acquire(void); ++void 
pfm_session_allcpus_release(void); ++ ++int pfm_smpl_buf_alloc(struct pfm_context *ctx, size_t rsize); ++void pfm_smpl_buf_free(struct pfm_context *ctx); ++ ++struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name); ++void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt); ++ ++void pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs); ++ ++void pfm_resume_task(struct task_struct *t, void *data); ++ ++#include <linux/perfmon_pmu.h> ++#include <linux/perfmon_fmt.h> ++ ++extern const struct file_operations pfm_file_ops; ++/* ++ * upper limit for count in calls that take vector arguments. This is used ++ * to prevent for multiplication overflow when we compute actual storage size ++ */ ++#define PFM_MAX_ARG_COUNT(m) (INT_MAX/sizeof(*(m))) ++ ++#define cast_ulp(_x) ((unsigned long *)_x) ++ ++#define PFM_NORMAL 0 ++#define PFM_COMPAT 1 ++ ++void __pfm_exit_thread(void); ++void pfm_ctxsw_in(struct task_struct *prev, struct task_struct *next); ++void pfm_ctxsw_out(struct task_struct *prev, struct task_struct *next); ++void pfm_handle_work(struct pt_regs *regs); ++void __pfm_init_percpu(void *dummy); ++void pfm_save_pmds(struct pfm_context *ctx, struct pfm_event_set *set); ++ ++static inline void pfm_exit_thread(void) ++{ ++ if (current->pfm_context) ++ __pfm_exit_thread(); ++} ++ ++/* ++ * include arch-specific kernel level definitions ++ */ ++#include <asm/perfmon_kern.h> ++ ++static inline void pfm_copy_thread(struct task_struct *task) ++{ ++ /* ++ * context or perfmon TIF state is NEVER inherited ++ * in child task. Holds for per-thread and system-wide ++ */ ++ task->pfm_context = NULL; ++ clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); ++ clear_tsk_thread_flag(task, TIF_PERFMON_WORK); ++ pfm_arch_disarm_handle_work(task); ++} ++ ++ ++/* ++ * read a single PMD register. ++ * ++ * virtual PMD registers have special handler. 
++ * Depends on definitions in asm/perfmon_kern.h ++ */ ++static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum) ++{ ++ if (unlikely(pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V)) ++ return pfm_pmu_conf->pmd_sread(ctx, cnum); ++ ++ return pfm_arch_read_pmd(ctx, cnum); ++} ++/* ++ * write a single PMD register. ++ * ++ * virtual PMD registers have special handler. ++ * Depends on definitions in asm/perfmon_kern.h ++ */ ++static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum, ++ u64 value) ++{ ++ /* ++ * PMD writes are ignored for read-only registers ++ */ ++ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO) ++ return; ++ ++ if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V) { ++ pfm_pmu_conf->pmd_swrite(ctx, cnum, value); ++ return; ++ } ++ /* ++ * clear unimplemented bits ++ */ ++ value &= ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk; ++ ++ pfm_arch_write_pmd(ctx, cnum, value); ++} ++ ++void __pfm_init_percpu(void *dummy); ++ ++static inline void pfm_init_percpu(void) ++{ ++ __pfm_init_percpu(NULL); ++} ++ ++/* ++ * pfm statistics are available via debugfs ++ * and perfmon subdir. 
++ * ++ * When adding/removing new stats, make sure you also ++ * update the name table in perfmon_debugfs.c ++ */ ++enum pfm_stats_names { ++ PFM_ST_ovfl_intr_all_count = 0, ++ PFM_ST_ovfl_intr_ns, ++ PFM_ST_ovfl_intr_spurious_count, ++ PFM_ST_ovfl_intr_replay_count, ++ PFM_ST_ovfl_intr_regular_count, ++ PFM_ST_handle_work_count, ++ PFM_ST_ovfl_notify_count, ++ PFM_ST_reset_pmds_count, ++ PFM_ST_pfm_restart_count, ++ PFM_ST_fmt_handler_calls, ++ PFM_ST_fmt_handler_ns, ++ PFM_ST_set_switch_count, ++ PFM_ST_set_switch_ns, ++ PFM_ST_set_switch_exp, ++ PFM_ST_ctxswin_count, ++ PFM_ST_ctxswin_ns, ++ PFM_ST_handle_timeout_count, ++ PFM_ST_ovfl_intr_nmi_count, ++ PFM_ST_ctxswout_count, ++ PFM_ST_ctxswout_ns, ++ PFM_ST_LAST /* last entry marked */ ++}; ++#define PFM_NUM_STATS PFM_ST_LAST ++ ++struct pfm_stats { ++ u64 v[PFM_NUM_STATS]; ++ struct dentry *dirs[PFM_NUM_STATS]; ++ struct dentry *cpu_dir; ++ char cpu_name[8]; ++}; ++ ++#ifdef CONFIG_PERFMON_DEBUG_FS ++#define pfm_stats_get(x) __get_cpu_var(pfm_stats).v[PFM_ST_##x] ++#define pfm_stats_inc(x) __get_cpu_var(pfm_stats).v[PFM_ST_##x]++ ++#define pfm_stats_add(x, y) __get_cpu_var(pfm_stats).v[PFM_ST_##x] += (y) ++void pfm_reset_stats(int cpu); ++#else ++#define pfm_stats_get(x) ++#define pfm_stats_inc(x) ++#define pfm_stats_add(x, y) ++static inline void pfm_reset_stats(int cpu) ++{} ++#endif ++ ++ ++ ++DECLARE_PER_CPU(struct pfm_context *, pmu_ctx); ++DECLARE_PER_CPU(struct pfm_stats, pfm_stats); ++DECLARE_PER_CPU(struct task_struct *, pmu_owner); ++ ++void pfm_cpu_disable(void); ++ ++ ++/* ++ * max vector argument elements for local storage (no kmalloc/kfree) ++ * The PFM_ARCH_PM*_ARG should be defined in perfmon_kern.h. 
++ * If not, default (conservative) values are used ++ */ ++#ifndef PFM_ARCH_PMC_STK_ARG ++#define PFM_ARCH_PMC_STK_ARG 1 ++#endif ++ ++#ifndef PFM_ARCH_PMD_STK_ARG ++#define PFM_ARCH_PMD_STK_ARG 1 ++#endif ++ ++#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG ++#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG ++ ++#else /* !CONFIG_PERFMON */ ++ ++ ++/* ++ * perfmon hooks are nops when CONFIG_PERFMON is undefined ++ */ ++static inline void pfm_cpu_disable(void) ++{} ++ ++static inline void pfm_exit_thread(void) ++{} ++ ++static inline void pfm_handle_work(struct pt_regs *regs) ++{} ++ ++static inline void pfm_copy_thread(struct task_struct *t) ++{} ++ ++static inline void pfm_ctxsw_in(struct task_struct *p, struct task_struct *n) ++{} ++ ++static inline void pfm_ctxsw_out(struct task_struct *p, struct task_struct *n) ++{} ++ ++static inline void pfm_session_allcpus_release(void) ++{} ++ ++static inline int pfm_session_allcpus_acquire(void) ++{ ++ return 0; ++} ++ ++static inline void pfm_init_percpu(void) ++{} ++ ++#endif /* CONFIG_PERFMON */ ++ ++#endif /* __LINUX_PERFMON_KERN_H__ */ +diff --git a/include/linux/perfmon_pmu.h b/include/linux/perfmon_pmu.h +new file mode 100644 +index 0000000..3f5f9e8 +--- /dev/null ++++ b/include/linux/perfmon_pmu.h +@@ -0,0 +1,192 @@ ++/* ++ * Copyright (c) 2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * Interface for PMU description modules ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#ifndef __PERFMON_PMU_H__ ++#define __PERFMON_PMU_H__ 1 ++ ++/* ++ * generic information about a PMC or PMD register ++ * ++ * Dependency bitmasks: ++ * They are used to allow lazy save/restore in the context switch ++ * code and to avoid picking up stale configuration from a previous ++ * thread. Using the bitmask, the generic read/write routines can ++ * ensure that all registers needed to support the measurement are ++ * restored properly on context switch in. ++ */ ++struct pfm_regmap_desc { ++ u16 type; /* role of the register */ ++ u16 reserved1; /* for future use */ ++ u32 reserved2; /* for future use */ ++ u64 dfl_val; /* power-on default value (quiescent) */ ++ u64 rsvd_msk; /* reserved bits: 1 means reserved */ ++ u64 no_emul64_msk; /* bits to clear for PFM_REGFL_NO_EMUL64 */ ++ unsigned long hw_addr; /* HW register address or index */ ++ struct kobject kobj; /* for internal use only */ ++ char *desc; /* HW register description string */ ++ u64 dep_pmcs[PFM_PMC_BV];/* depending PMC registers */ ++}; ++#define to_reg(n) container_of(n, struct pfm_regmap_desc, kobj) ++ ++/* ++ * pfm_regmap_desc helper macros ++ */ ++#define PMC_D(t, d, v, r, n, h) \ ++ { .type = t, \ ++ .desc = d, \ ++ .dfl_val = v, \ ++ .rsvd_msk = r, \ ++ .no_emul64_msk = n, \ ++ .hw_addr = h \ ++ } ++ ++#define PMD_D(t, d, h) \ ++ { .type = t, \ ++ .desc = d, \ ++ .rsvd_msk = 0, \ ++ .no_emul64_msk = 0, \ ++ .hw_addr = h \ ++ } ++ ++#define PMD_DR(t, d, h, r) \ ++ { .type = t, \ ++ .desc = d, \ ++ .rsvd_msk = r, \ ++ .no_emul64_msk = 0, \ ++ .hw_addr = h \ ++ } ++ ++#define PMX_NA \ ++ { .type = PFM_REG_NA } ++ ++#define PMD_DP(t, d, h, p) \ ++ { .type = t, \ ++ .desc = d, \ ++ .rsvd_msk = 0, \ ++ .no_emul64_msk = 0, \ ++ .dep_pmcs[0] = p, \ ++ .hw_addr = h \ ++ } ++ 
++/* ++ * type of a PMU register (16-bit bitmask) for use with pfm_reg_desc.type ++ */ ++#define PFM_REG_NA 0x00 /* not avail. (not impl.,no access) must be 0 */ ++#define PFM_REG_I 0x01 /* PMC/PMD: implemented */ ++#define PFM_REG_WC 0x02 /* PMC: has write_checker */ ++#define PFM_REG_C64 0x04 /* PMD: 64-bit virtualization */ ++#define PFM_REG_RO 0x08 /* PMD: read-only (writes ignored) */ ++#define PFM_REG_V 0x10 /* PMD: virtual reg */ ++#define PFM_REG_INTR 0x20 /* PMD: register can generate interrupt */ ++#define PFM_REG_SYS 0x40 /* PMC/PMD: register is for system-wide only */ ++#define PFM_REG_THR 0x80 /* PMC/PMD: register is for per-thread only */ ++#define PFM_REG_NO64 0x100 /* PMC: supports PFM_REGFL_NO_EMUL64 */ ++ ++/* ++ * define some shortcuts for common types ++ */ ++#define PFM_REG_W (PFM_REG_WC|PFM_REG_I) ++#define PFM_REG_W64 (PFM_REG_WC|PFM_REG_NO64|PFM_REG_I) ++#define PFM_REG_C (PFM_REG_C64|PFM_REG_INTR|PFM_REG_I) ++#define PFM_REG_I64 (PFM_REG_NO64|PFM_REG_I) ++#define PFM_REG_IRO (PFM_REG_I|PFM_REG_RO) ++ ++typedef int (*pfm_pmc_check_t)(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmc *req); ++ ++typedef int (*pfm_pmd_check_t)(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_pmd *req); ++ ++ ++typedef u64 (*pfm_sread_t)(struct pfm_context *ctx, unsigned int cnum); ++typedef void (*pfm_swrite_t)(struct pfm_context *ctx, unsigned int cnum, u64 val); ++ ++/* ++ * structure used by pmu description modules ++ * ++ * probe_pmu() routine return value: ++ * - 1 means recognized PMU ++ * - 0 means not recognized PMU ++ */ ++struct pfm_pmu_config { ++ char *pmu_name; /* PMU family name */ ++ char *version; /* config module version */ ++ ++ int counter_width; /* width of hardware counter */ ++ ++ struct pfm_regmap_desc *pmc_desc; /* PMC register descriptions */ ++ struct pfm_regmap_desc *pmd_desc; /* PMD register descriptions */ ++ ++ pfm_pmc_check_t pmc_write_check;/* write checker (optional) */ ++ 
pfm_pmd_check_t pmd_write_check;/* write checker (optional) */ ++ pfm_pmd_check_t pmd_read_check; /* read checker (optional) */ ++ ++ pfm_sread_t pmd_sread; /* virtual pmd read */ ++ pfm_swrite_t pmd_swrite; /* virtual pmd write */ ++ ++ int (*probe_pmu)(void);/* probe PMU routine */ ++ ++ u16 num_pmc_entries;/* #entries in pmc_desc */ ++ u16 num_pmd_entries;/* #entries in pmd_desc */ ++ ++ void *pmu_info; /* model-specific infos */ ++ u32 flags; /* set of flags */ ++ ++ struct module *owner; /* pointer to module struct */ ++ ++ /* ++ * fields computed internally, do not set in module ++ */ ++ struct pfm_regdesc regs_all; /* regs available to all */ ++ struct pfm_regdesc regs_thr; /* regs avail per-thread */ ++ struct pfm_regdesc regs_sys; /* regs avail system-wide */ ++ ++ u64 ovfl_mask; /* overflow mask */ ++}; ++ ++static inline void *pfm_pmu_info(void) ++{ ++ return pfm_pmu_conf->pmu_info; ++} ++ ++/* ++ * pfm_pmu_config flags ++ */ ++#define PFM_PMUFL_IS_BUILTIN 0x1 /* pmu config is compiled in */ ++ ++/* ++ * we need to know whether the PMU description is builtin or compiled ++ * as a module ++ */ ++#ifdef MODULE ++#define PFM_PMU_BUILTIN_FLAG 0 /* built as a module */ ++#else ++#define PFM_PMU_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* compiled into the kernel */ ++#endif ++ ++int pfm_pmu_register(struct pfm_pmu_config *cfg); ++void pfm_pmu_unregister(struct pfm_pmu_config *cfg); ++ ++int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu); ++int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu); ++ ++#endif /* __PERFMON_PMU_H__ */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 3d9120c..8fb3b55 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -96,6 +96,7 @@ struct exec_domain; + struct futex_pi_state; + struct robust_list_head; + struct bio; ++struct pfm_context; + + /* + * List of flags we want to share for kernel threads, +@@ -1301,6 +1302,9 @@ struct task_struct { + int latency_record_count; + struct latency_record 
latency_record[LT_SAVECOUNT]; + #endif ++#ifdef CONFIG_PERFMON ++ struct pfm_context *pfm_context; ++#endif + }; + + /* +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index d6ff145..e308523 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -29,6 +29,13 @@ struct msqid_ds; + struct new_utsname; + struct nfsctl_arg; + struct __old_kernel_stat; ++struct pfarg_ctx; ++struct pfarg_pmc; ++struct pfarg_pmd; ++struct pfarg_start; ++struct pfarg_load; ++struct pfarg_setinfo; ++struct pfarg_setdesc; + struct pollfd; + struct rlimit; + struct rusage; +@@ -625,4 +632,27 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); + + int kernel_execve(const char *filename, char *const argv[], char *const envp[]); + ++asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq, ++ void __user *uarg, size_t smpl_size); ++asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, ++ int count); ++asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, ++ int count); ++asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, ++ int count); ++asmlinkage long sys_pfm_restart(int fd); ++asmlinkage long sys_pfm_stop(int fd); ++asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq); ++asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq); ++asmlinkage long sys_pfm_unload_context(int fd); ++asmlinkage long sys_pfm_delete_evtsets(int fd, ++ struct pfarg_setinfo __user *ureq, ++ int count); ++asmlinkage long sys_pfm_create_evtsets(int fd, ++ struct pfarg_setdesc __user *ureq, ++ int count); ++asmlinkage long sys_pfm_getinfo_evtsets(int fd, ++ struct pfarg_setinfo __user *ureq, ++ int count); ++ + #endif +diff --git a/kernel/sched.c b/kernel/sched.c +index ad1962d..1bc8fcf 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -71,6 +71,7 @@ + #include <linux/debugfs.h> + #include <linux/ctype.h> + #include <linux/ftrace.h> 
++#include <linux/perfmon_kern.h> + + #include <asm/tlb.h> + #include <asm/irq_regs.h> +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index 08d6e1b..61f4155 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -126,6 +126,19 @@ cond_syscall(sys_vm86); + cond_syscall(compat_sys_ipc); + cond_syscall(compat_sys_sysctl); + ++cond_syscall(sys_pfm_create_context); ++cond_syscall(sys_pfm_write_pmcs); ++cond_syscall(sys_pfm_write_pmds); ++cond_syscall(sys_pfm_read_pmds); ++cond_syscall(sys_pfm_restart); ++cond_syscall(sys_pfm_start); ++cond_syscall(sys_pfm_stop); ++cond_syscall(sys_pfm_load_context); ++cond_syscall(sys_pfm_unload_context); ++cond_syscall(sys_pfm_create_evtsets); ++cond_syscall(sys_pfm_delete_evtsets); ++cond_syscall(sys_pfm_getinfo_evtsets); ++ + /* arch-specific weak syscall entries */ + cond_syscall(sys_pciconfig_read); + cond_syscall(sys_pciconfig_write); +diff --git a/perfmon/Makefile b/perfmon/Makefile +new file mode 100644 +index 0000000..32ff037 +--- /dev/null ++++ b/perfmon/Makefile +@@ -0,0 +1,12 @@ ++# ++# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. ++# Contributed by Stephane Eranian <eranian@hpl.hp.com> ++# ++obj-y = perfmon_init.o perfmon_rw.o perfmon_res.o \ ++ perfmon_pmu.o perfmon_sysfs.o perfmon_syscalls.o \ ++ perfmon_file.o perfmon_ctxsw.o perfmon_intr.o \ ++ perfmon_dfl_smpl.o perfmon_sets.o perfmon_hotplug.o \ ++ perfmon_msg.o perfmon_smpl.o perfmon_attach.o \ ++ perfmon_activate.o perfmon_ctx.o perfmon_fmt.o ++ ++obj-$(CONFIG_PERFMON_DEBUG_FS) += perfmon_debugfs.o +diff --git a/perfmon/perfmon_activate.c b/perfmon/perfmon_activate.c +new file mode 100644 +index 0000000..d9f501d +--- /dev/null ++++ b/perfmon/perfmon_activate.c +@@ -0,0 +1,265 @@ ++/* ++ * perfmon_activate.c: perfmon2 start/stop functions ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. 
++ * ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++/** ++ * __pfm_start - activate monitoring ++ * @ctx: context to operate on ++ * @start: pfarg_start as passed by user ++ * ++ * When operating in per-thread mode and not self-monitoring, the monitored ++ * thread must be stopped. Activation will be effective next time the thread ++ * is context switched in. ++ * ++ * The pfarg_start argument is optional and may be used to designate ++ * the initial event set to activate. When not provided, the last active ++ * set is used. For the first activation, set0 is used when start is NULL. 
++ * ++ * On some architectures, e.g., IA-64, it may be possible to start monitoring ++ * without calling this function under certain conditions (per-thread and self ++ * monitoring). In this case, either set0 or the last active set is used. ++ * ++ * the context is locked and interrupts are disabled. ++ */ ++int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start) ++{ ++ struct task_struct *task, *owner_task; ++ struct pfm_event_set *new_set, *old_set; ++ int is_self; ++ ++ task = ctx->task; ++ ++ /* ++ * UNLOADED: error ++ * LOADED : normal start, nop if started unless set is different ++ * MASKED : nop or change set when unmasking ++ * ZOMBIE : cannot happen ++ */ ++ if (ctx->state == PFM_CTX_UNLOADED) ++ return -EINVAL; ++ ++ old_set = new_set = ctx->active_set; ++ ++ /* ++ * always the case for system-wide ++ */ ++ if (task == NULL) ++ task = current; ++ ++ is_self = task == current; ++ ++ /* ++ * argument is provided? ++ */ ++ if (start) { ++ /* ++ * find the set to load first ++ */ ++ new_set = pfm_find_set(ctx, start->start_set, 0); ++ if (new_set == NULL) { ++ PFM_DBG("event set%u does not exist", ++ start->start_set); ++ return -EINVAL; ++ } ++ } ++ ++ PFM_DBG("cur_set=%u req_set=%u", old_set->id, new_set->id); ++ ++ /* ++ * if we need to change the active set we need ++ * to check if we can access the PMU ++ */ ++ if (new_set != old_set) { ++ ++ owner_task = __get_cpu_var(pmu_owner); ++ /* ++ * system-wide: must run on the right CPU ++ * per-thread : must be the owner of the PMU context ++ * ++ * pfm_switch_sets() returns with monitoring stopped ++ */ ++ if (is_self) { ++ pfm_switch_sets(ctx, new_set, PFM_PMD_RESET_LONG, 1); ++ } else { ++ /* ++ * In a UP kernel, the PMU may contain the state ++ * of the task we want to operate on, yet the task ++ * may be switched out (lazy save). We need to save ++ * current state (old_set), switch active_set and ++ * mark it for reload. 
++ */ ++ if (owner_task == task) ++ pfm_save_pmds(ctx, old_set); ++ ctx->active_set = new_set; ++ new_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; ++ } ++ } ++ ++ /* ++ * mark as started ++ * must be done before calling pfm_arch_start() ++ */ ++ ctx->flags.started = 1; ++ ++ pfm_arch_start(task, ctx); ++ ++ /* ++ * we check whether we had a pending ovfl before restarting. ++ * If so we need to regenerate the interrupt to make sure we ++ * keep recorded samples. For non-self monitoring this check ++ * is done in the pfm_ctxswin_thread() routine. ++ * ++ * we check new_set/old_set because pfm_switch_sets() already ++ * takes care of replaying the pending interrupts ++ */ ++ if (is_self && new_set != old_set && new_set->npend_ovfls) { ++ pfm_arch_resend_irq(ctx); ++ pfm_stats_inc(ovfl_intr_replay_count); ++ } ++ ++ /* ++ * always start with full timeout ++ */ ++ new_set->hrtimer_rem = new_set->hrtimer_exp; ++ ++ /* ++ * activate timeout for system-wide, self-monitoring ++ * Always start with full timeout ++ * Timeout is at least one tick away, so no risk of ++ * having hrtimer_start() trying to wakeup softirqd ++ * and thus causing troubles. This cannot happen anyway ++ * because cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ ++ */ ++ if (is_self && new_set->flags & PFM_SETFL_TIME_SWITCH) { ++ hrtimer_start(&__get_cpu_var(pfm_hrtimer), ++ new_set->hrtimer_rem, ++ HRTIMER_MODE_REL); ++ ++ PFM_DBG("set%u started timeout=%lld", ++ new_set->id, ++ (unsigned long long)new_set->hrtimer_rem.tv64); ++ } ++ ++ /* ++ * we restart total duration even if context was ++ * already started. In that case, counts are simply ++ * reset. ++ * ++ * For per-thread, if not self-monitoring, the statement ++ * below will have no effect because thread is stopped. ++ * The field is reset of ctxsw in. 
++ */ ++ new_set->duration_start = sched_clock(); ++ ++ return 0; ++} ++ ++/** ++ * __pfm_stop - stop monitoring ++ * @ctx: context to operate on ++ * @release_info: infos for caller (see below) ++ * ++ * When operating in per-thread* mode and when not self-monitoring, ++ * the monitored thread must be stopped. ++ * ++ * the context is locked and interrupts are disabled. ++ * ++ * release_info value upon return: ++ * - bit 0 : unused ++ * - bit 1 : when set, must cancel hrtimer ++ */ ++int __pfm_stop(struct pfm_context *ctx, int *release_info) ++{ ++ struct pfm_event_set *set; ++ struct task_struct *task; ++ u64 now; ++ int state; ++ ++ *release_info = 0; ++ ++ now = sched_clock(); ++ state = ctx->state; ++ set = ctx->active_set; ++ ++ /* ++ * context must be attached (zombie cannot happen) ++ */ ++ if (state == PFM_CTX_UNLOADED) ++ return -EINVAL; ++ ++ task = ctx->task; ++ ++ PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d", ++ task ? task->pid : -1, ++ state, ++ !task); ++ ++ /* ++ * this happens for system-wide context ++ */ ++ if (task == NULL) ++ task = current; ++ ++ /* ++ * compute elapsed time ++ * ++ * unless masked, compute elapsed duration, stop timeout ++ */ ++ if (task == current && state == PFM_CTX_LOADED) { ++ /* ++ * timeout cancel must be deferred until context is ++ * unlocked to avoid race with pfm_handle_switch_timeout() ++ */ ++ if (set->flags & PFM_SETFL_TIME_SWITCH) ++ *release_info |= 0x2; ++ ++ set->duration += now - set->duration_start; ++ } ++ ++ pfm_arch_stop(task, ctx); ++ ++ ctx->flags.started = 0; ++ /* ++ * starting now, in-flight PMU interrupt for this context ++ * are treated as spurious ++ */ ++ return 0; ++} +diff --git a/perfmon/perfmon_attach.c b/perfmon/perfmon_attach.c +new file mode 100644 +index 0000000..bbd1d1e +--- /dev/null ++++ b/perfmon/perfmon_attach.c +@@ -0,0 +1,474 @@ ++/* ++ * perfmon_attach.c: perfmon2 load/unload functions ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the 
hardware performance counters ++ * of the host processor. ++ * ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/fs.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++/** ++ * __pfm_load_context_sys - attach context to a CPU in system-wide mode ++ * @ctx: context to operate on ++ * @set_id: set to activate first ++ * @cpu: CPU to monitor ++ * ++ * The cpu specified in the pfarg_load.load_pid argument must be the current ++ * CPU. ++ * ++ * The function must be called with the context locked and interrupts disabled. 
++ */ ++static int pfm_load_ctx_sys(struct pfm_context *ctx, u16 set_id, u32 cpu) ++{ ++ struct pfm_event_set *set; ++ int mycpu; ++ int ret; ++ ++ mycpu = smp_processor_id(); ++ ++ /* ++ * system-wide: check we are running on the desired CPU ++ */ ++ if (cpu != mycpu) { ++ PFM_DBG("wrong CPU: asking %u but on %u", cpu, mycpu); ++ return -EINVAL; ++ } ++ ++ /* ++ * initialize sets ++ */ ++ set = pfm_prepare_sets(ctx, set_id); ++ if (!set) { ++ PFM_DBG("event set%u does not exist", set_id); ++ return -EINVAL; ++ } ++ ++ PFM_DBG("set=%u set_flags=0x%x", set->id, set->flags); ++ ++ ctx->cpu = mycpu; ++ ctx->task = NULL; ++ ctx->active_set = set; ++ ++ /* ++ * perform any architecture specific actions ++ */ ++ ret = pfm_arch_load_context(ctx); ++ if (ret) ++ goto error_noload; ++ ++ /* ++ * now reserve the session, before we can proceed with ++ * actually accessing the PMU hardware ++ */ ++ ret = pfm_session_acquire(1, mycpu); ++ if (ret) ++ goto error; ++ ++ ++ /* ++ * caller must be on monitored CPU to access PMU, thus this is ++ * a form of self-monitoring ++ */ ++ ctx->flags.is_self = 1; ++ ++ set->runs++; ++ ++ /* ++ * load PMD from set ++ * load PMC from set ++ */ ++ pfm_arch_restore_pmds(ctx, set); ++ pfm_arch_restore_pmcs(ctx, set); ++ ++ /* ++ * set new ownership ++ */ ++ pfm_set_pmu_owner(NULL, ctx); ++ ++ /* ++ * reset pending work ++ */ ++ ctx->flags.work_type = PFM_WORK_NONE; ++ ctx->flags.reset_count = 0; ++ ++ /* ++ * reset message queue ++ */ ++ ctx->msgq_head = ctx->msgq_tail = 0; ++ ++ ctx->state = PFM_CTX_LOADED; ++ ++ return 0; ++error: ++ pfm_arch_unload_context(ctx); ++error_noload: ++ return ret; ++} ++ ++/** ++ * __pfm_load_context_thread - attach context to a thread ++ * @ctx: context to operate on ++ * @set_id: first set ++ * @task: threadf to attach to ++ * ++ * The function must be called with the context locked and interrupts disabled. 
++ */ ++static int pfm_load_ctx_thread(struct pfm_context *ctx, u16 set_id, ++ struct task_struct *task) ++{ ++ struct pfm_event_set *set; ++ struct pfm_context *old; ++ int ret; ++ ++ PFM_DBG("load_pid=%d set=%u", task->pid, set_id); ++ /* ++ * per-thread: ++ * - task to attach to is checked in sys_pfm_load_context() to avoid ++ * locking issues. if found, and not self, task refcount was ++ * incremented. ++ */ ++ old = cmpxchg(&task->pfm_context, NULL, ctx); ++ if (old) { ++ PFM_DBG("load_pid=%d has a context " ++ "old=%p new=%p cur=%p", ++ task->pid, ++ old, ++ ctx, ++ task->pfm_context); ++ return -EEXIST; ++ } ++ ++ /* ++ * initialize sets ++ */ ++ set = pfm_prepare_sets(ctx, set_id); ++ if (!set) { ++ PFM_DBG("event set%u does not exist", set_id); ++ return -EINVAL; ++ } ++ ++ ++ ctx->task = task; ++ ctx->cpu = -1; ++ ctx->active_set = set; ++ ++ /* ++ * perform any architecture specific actions ++ */ ++ ret = pfm_arch_load_context(ctx); ++ if (ret) ++ goto error_noload; ++ ++ /* ++ * now reserve the session, before we can proceed with ++ * actually accessing the PMU hardware ++ */ ++ ret = pfm_session_acquire(0, -1); ++ if (ret) ++ goto error; ++ ++ ++ set->runs++; ++ if (ctx->task != current) { ++ ++ ctx->flags.is_self = 0; ++ ++ /* force a full reload */ ++ ctx->last_act = PFM_INVALID_ACTIVATION; ++ ctx->last_cpu = -1; ++ set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; ++ ++ } else { ++ pfm_check_save_prev_ctx(); ++ ++ ctx->last_cpu = smp_processor_id(); ++ __get_cpu_var(pmu_activation_number)++; ++ ctx->last_act = __get_cpu_var(pmu_activation_number); ++ ++ ctx->flags.is_self = 1; ++ ++ /* ++ * load PMD from set ++ * load PMC from set ++ */ ++ pfm_arch_restore_pmds(ctx, set); ++ pfm_arch_restore_pmcs(ctx, set); ++ ++ /* ++ * set new ownership ++ */ ++ pfm_set_pmu_owner(ctx->task, ctx); ++ } ++ set_tsk_thread_flag(task, TIF_PERFMON_CTXSW); ++ ++ /* ++ * reset pending work ++ */ ++ ctx->flags.work_type = PFM_WORK_NONE; ++ ctx->flags.reset_count = 0; ++ ++ /* ++ 
* reset message queue ++ */ ++ ctx->msgq_head = ctx->msgq_tail = 0; ++ ++ ctx->state = PFM_CTX_LOADED; ++ ++ return 0; ++ ++error: ++ pfm_arch_unload_context(ctx); ++ ctx->task = NULL; ++error_noload: ++ /* ++ * detach context ++ */ ++ task->pfm_context = NULL; ++ return ret; ++} ++ ++/** ++ * __pfm_load_context - attach context to a CPU or thread ++ * @ctx: context to operate on ++ * @load: pfarg_load as passed by user ++ * @task: thread to attach to, NULL for system-wide ++ */ ++int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *load, ++ struct task_struct *task) ++{ ++ if (ctx->flags.system) ++ return pfm_load_ctx_sys(ctx, load->load_set, load->load_pid); ++ return pfm_load_ctx_thread(ctx, load->load_set, task); ++} ++ ++/** ++ * pfm_update_ovfl_pmds - account for pending ovfls on PMDs ++ * @ctx: context to operate on ++ * ++ * This function is always called after pfm_stop has been issued ++ */ ++static void pfm_update_ovfl_pmds(struct pfm_context *ctx) ++{ ++ struct pfm_event_set *set; ++ u64 *cnt_pmds; ++ u64 ovfl_mask; ++ u16 num_ovfls, i, first; ++ ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ first = ctx->regs.first_intr_pmd; ++ cnt_pmds = ctx->regs.cnt_pmds; ++ ++ /* ++ * look for pending interrupts and adjust PMD values accordingly ++ */ ++ list_for_each_entry(set, &ctx->set_list, list) { ++ ++ if (!set->npend_ovfls) ++ continue; ++ ++ num_ovfls = set->npend_ovfls; ++ PFM_DBG("set%u nintrs=%u", set->id, num_ovfls); ++ ++ for (i = first; num_ovfls; i++) { ++ if (test_bit(i, cast_ulp(set->povfl_pmds))) { ++ /* only correct value for counters */ ++ if (test_bit(i, cast_ulp(cnt_pmds))) ++ set->pmds[i].value += 1 + ovfl_mask; ++ num_ovfls--; ++ } ++ PFM_DBG("pmd%u set=%u val=0x%llx", ++ i, ++ set->id, ++ (unsigned long long)set->pmds[i].value); ++ } ++ /* ++ * we need to clear to prevent a pfm_getinfo_evtsets() from ++ * returning stale data even after the context is unloaded ++ */ ++ set->npend_ovfls = 0; ++ bitmap_zero(cast_ulp(set->povfl_pmds), 
ctx->regs.max_intr_pmd); ++ } ++} ++ ++ ++/** ++ * __pfm_unload_context - detach context from CPU or thread ++ * @ctx: context to operate on ++ * @release_info: pointer to return info (see below) ++ * ++ * The function must be called with the context locked and interrupts disabled. ++ * ++ * release_info value upon return: ++ * - bit 0: when set, must free context ++ * - bit 1: when set, must cancel hrtimer ++ */ ++int __pfm_unload_context(struct pfm_context *ctx, int *release_info) ++{ ++ struct task_struct *task; ++ int ret; ++ ++ PFM_DBG("ctx_state=%d task [%d]", ++ ctx->state, ++ ctx->task ? ctx->task->pid : -1); ++ ++ *release_info = 0; ++ ++ /* ++ * unload only when necessary ++ */ ++ if (ctx->state == PFM_CTX_UNLOADED) ++ return 0; ++ ++ task = ctx->task; ++ ++ /* ++ * stop monitoring ++ */ ++ ret = __pfm_stop(ctx, release_info); ++ if (ret) ++ return ret; ++ ++ ctx->state = PFM_CTX_UNLOADED; ++ ctx->flags.can_restart = 0; ++ ++ /* ++ * save active set ++ * UP: ++ * if not current task and due to lazy, state may ++ * still be live ++ * for system-wide, guaranteed to run on correct CPU ++ */ ++ if (__get_cpu_var(pmu_ctx) == ctx) { ++ /* ++ * pending overflows have been saved by pfm_stop() ++ */ ++ pfm_save_pmds(ctx, ctx->active_set); ++ pfm_set_pmu_owner(NULL, NULL); ++ PFM_DBG("released ownership"); ++ } ++ ++ /* ++ * account for pending overflows ++ */ ++ pfm_update_ovfl_pmds(ctx); ++ ++ /* ++ * arch-specific unload operations ++ */ ++ pfm_arch_unload_context(ctx); ++ ++ /* ++ * per-thread: disconnect from monitored task ++ */ ++ if (task) { ++ task->pfm_context = NULL; ++ ctx->task = NULL; ++ clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); ++ clear_tsk_thread_flag(task, TIF_PERFMON_WORK); ++ pfm_arch_disarm_handle_work(task); ++ } ++ /* ++ * session can be freed, must have interrupts enabled ++ * thus we release in the caller. Bit 0 signals to the ++ * caller that the session can be released. 
++ */ ++ *release_info |= 0x1; ++ ++ return 0; ++} ++ ++/** ++ * __pfm_exit_thread - detach and free context on thread exit ++ */ ++void __pfm_exit_thread(void) ++{ ++ struct pfm_context *ctx; ++ unsigned long flags; ++ int free_ok = 0, release_info = 0; ++ int ret; ++ ++ ctx = current->pfm_context; ++ ++ BUG_ON(ctx->flags.system); ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ PFM_DBG("state=%d is_self=%d", ctx->state, ctx->flags.is_self); ++ ++ /* ++ * __pfm_unload_context() cannot fail ++ * in the context states we are interested in ++ */ ++ switch (ctx->state) { ++ case PFM_CTX_LOADED: ++ case PFM_CTX_MASKED: ++ __pfm_unload_context(ctx, &release_info); ++ /* ++ * end notification only sent for non ++ * self-monitoring context ++ */ ++ if (!ctx->flags.is_self) ++ pfm_end_notify(ctx); ++ break; ++ case PFM_CTX_ZOMBIE: ++ __pfm_unload_context(ctx, &release_info); ++ free_ok = 1; ++ break; ++ default: ++ BUG_ON(ctx->state != PFM_CTX_LOADED); ++ break; ++ } ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ /* ++ * cancel timer now that context is unlocked ++ */ ++ if (release_info & 0x2) { ++ ret = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer)); ++ PFM_DBG("timeout cancel=%d", ret); ++ } ++ ++ if (release_info & 0x1) ++ pfm_session_release(0, 0); ++ ++ /* ++ * All memory free operations (especially for vmalloc'ed memory) ++ * MUST be done with interrupts ENABLED. ++ */ ++ if (free_ok) ++ pfm_free_context(ctx); ++} +diff --git a/perfmon/perfmon_ctx.c b/perfmon/perfmon_ctx.c +new file mode 100644 +index 0000000..afe6078 +--- /dev/null ++++ b/perfmon/perfmon_ctx.c +@@ -0,0 +1,314 @@ ++/* ++ * perfmon_ctx.c: perfmon2 context functions ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. 
++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/fs.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++/* ++ * context memory pool pointer ++ */ ++static struct kmem_cache *pfm_ctx_cachep; ++ ++/** ++ * pfm_free_context - de-allocate context and associated resources ++ * @ctx: context to free ++ */ ++void pfm_free_context(struct pfm_context *ctx) ++{ ++ pfm_arch_context_free(ctx); ++ ++ pfm_free_sets(ctx); ++ ++ pfm_smpl_buf_free(ctx); ++ ++ PFM_DBG("free ctx @0x%p", ctx); ++ kmem_cache_free(pfm_ctx_cachep, ctx); ++ /* ++ * decrease refcount on: ++ * - PMU description table ++ * - sampling format ++ */ ++ pfm_pmu_conf_put(); ++ pfm_pmu_release(); ++} ++ ++/** ++ * pfm_ctx_flags_sane - check if context flags passed by user are okay ++ * @ctx_flags: flags passed user on pfm_create_context ++ * ++ 
++ * return: ++ * 0 if successful ++ * <0 and error code otherwise ++ */ ++static inline int pfm_ctx_flags_sane(u32 ctx_flags) ++{ ++ if (ctx_flags & PFM_FL_SYSTEM_WIDE) { ++ if (ctx_flags & PFM_FL_NOTIFY_BLOCK) { ++ PFM_DBG("cannot use blocking mode in syswide mode"); ++ return -EINVAL; ++ } ++ } ++ return 0; ++} ++ ++/** ++ * pfm_ctx_permissions - check authorization to create new context ++ * @ctx_flags: context flags passed by user ++ * ++ * check for permissions to create a context. ++ * ++ * A sysadmin may decide to restrict creation of per-thread ++ * and/or system-wide context to a group of users using the ++ * group id via /sys/kernel/perfmon/task_group and ++ * /sys/kernel/perfmon/sys_group. ++ * ++ * Once we identify a user level package which can be used ++ * to grant/revoke Linux capabilities at login via PAM, we will ++ * be able to use capabilities. We would also need to increase ++ * the size of cap_t to support more than 32 capabilities (it ++ * is currently defined as u32 and 32 capabilities are already ++ * defined). ++ */ ++static inline int pfm_ctx_permissions(u32 ctx_flags) ++{ ++ if ((ctx_flags & PFM_FL_SYSTEM_WIDE) ++ && pfm_controls.sys_group != PFM_GROUP_PERM_ANY ++ && !in_group_p(pfm_controls.sys_group)) { ++ PFM_DBG("user group not allowed to create a syswide ctx"); ++ return -EPERM; ++ } else if (pfm_controls.task_group != PFM_GROUP_PERM_ANY ++ && !in_group_p(pfm_controls.task_group)) { ++ PFM_DBG("user group not allowed to create a task context"); ++ return -EPERM; ++ } ++ return 0; ++} ++ ++/** ++ * __pfm_create_context - allocate and initialize a perfmon context ++ * @req : pfarg_ctx from user ++ * @fmt : pointer sampling format, NULL if not used ++ * @fmt_arg: pointer to argument to sampling format, NULL if not used ++ * @mode: PFM_NORMAL or PFM_COMPAT(IA-64 v2.0 compatibility) ++ * @ctx : address of new context upon succesful return, undefined otherwise ++ * ++ * function used to allocate a new context. 
A context is allocated along ++ * with the default event set. If a sampling format is used, the buffer ++ * may be allocated and initialized. ++ * ++ * The file descriptor identifying the context is allocated and returned ++ * to caller. ++ * ++ * This function operates with no locks and interrupts are enabled. ++ * return: ++ * >=0: the file descriptor to identify the context ++ * <0 : the error code ++ */ ++int __pfm_create_context(struct pfarg_ctx *req, ++ struct pfm_smpl_fmt *fmt, ++ void *fmt_arg, ++ int mode, ++ struct pfm_context **new_ctx) ++{ ++ struct pfm_context *ctx; ++ struct file *filp = NULL; ++ u32 ctx_flags; ++ int fd = 0, ret; ++ ++ ctx_flags = req->ctx_flags; ++ ++ /* Increase refcount on PMU description */ ++ ret = pfm_pmu_conf_get(1); ++ if (ret < 0) ++ goto error_conf; ++ ++ ret = pfm_ctx_flags_sane(ctx_flags); ++ if (ret < 0) ++ goto error_alloc; ++ ++ ret = pfm_ctx_permissions(ctx_flags); ++ if (ret < 0) ++ goto error_alloc; ++ ++ /* ++ * we can use GFP_KERNEL and potentially sleep because we do ++ * not hold any lock at this point. ++ */ ++ might_sleep(); ++ ret = -ENOMEM; ++ ctx = kmem_cache_zalloc(pfm_ctx_cachep, GFP_KERNEL); ++ if (!ctx) ++ goto error_alloc; ++ ++ PFM_DBG("alloc ctx @0x%p", ctx); ++ ++ INIT_LIST_HEAD(&ctx->set_list); ++ spin_lock_init(&ctx->lock); ++ init_completion(&ctx->restart_complete); ++ init_waitqueue_head(&ctx->msgq_wait); ++ ++ /* ++ * context is unloaded ++ */ ++ ctx->state = PFM_CTX_UNLOADED; ++ ++ /* ++ * initialization of context's flags ++ * must be done before pfm_find_set() ++ */ ++ ctx->flags.block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; ++ ctx->flags.system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; ++ ctx->flags.no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; ++ ctx->flags.ia64_v20_compat = mode == PFM_COMPAT ? 
1 : 0; ++ ++ ret = pfm_pmu_acquire(ctx); ++ if (ret) ++ goto error_file; ++ /* ++ * check if PMU is usable ++ */ ++ if (!(ctx->regs.num_pmcs && ctx->regs.num_pmcs)) { ++ PFM_DBG("no usable PMU registers"); ++ ret = -EBUSY; ++ goto error_file; ++ } ++ ++ /* ++ * link to format, must be done first for correct ++ * error handling in pfm_context_free() ++ */ ++ ctx->smpl_fmt = fmt; ++ ++ ret = -ENFILE; ++ fd = pfm_alloc_fd(&filp); ++ if (fd < 0) ++ goto error_file; ++ ++ /* ++ * initialize arch-specific section ++ * must be done before fmt_init() ++ */ ++ ret = pfm_arch_context_create(ctx, ctx_flags); ++ if (ret) ++ goto error_set; ++ ++ ret = -ENOMEM; ++ ++ /* ++ * add initial set ++ */ ++ if (pfm_create_initial_set(ctx)) ++ goto error_set; ++ ++ /* ++ * does the user want to sample? ++ * must be done after pfm_pmu_acquire() because ++ * needs ctx->regs ++ */ ++ if (fmt) { ++ ret = pfm_setup_smpl_fmt(ctx, ctx_flags, fmt_arg, filp); ++ if (ret) ++ goto error_set; ++ } ++ ++ filp->private_data = ctx; ++ ++ ctx->last_act = PFM_INVALID_ACTIVATION; ++ ctx->last_cpu = -1; ++ ++ /* ++ * initialize notification message queue ++ */ ++ ctx->msgq_head = ctx->msgq_tail = 0; ++ ++ PFM_DBG("flags=0x%x system=%d notify_block=%d no_msg=%d" ++ " use_fmt=%d ctx_fd=%d mode=%d", ++ ctx_flags, ++ ctx->flags.system, ++ ctx->flags.block, ++ ctx->flags.no_msg, ++ !!fmt, ++ fd, mode); ++ ++ if (new_ctx) ++ *new_ctx = ctx; ++ ++ /* ++ * we defer the fd_install until we are certain the call succeeded ++ * to ensure we do not have to undo its effect. Neither put_filp() ++ * nor put_unused_fd() undoes the effect of fd_install(). 
++ */ ++ fd_install(fd, filp); ++ ++ return fd; ++ ++error_set: ++ put_filp(filp); ++ put_unused_fd(fd); ++error_file: ++ /* ++ * calls the right *_put() functions ++ * calls pfm_release_pmu() ++ */ ++ pfm_free_context(ctx); ++ return ret; ++error_alloc: ++ pfm_pmu_conf_put(); ++error_conf: ++ pfm_smpl_fmt_put(fmt); ++ return ret; ++} ++ ++/** ++ * pfm_init_ctx -- initialize context SLAB ++ * ++ * called from pfm_init ++ */ ++int __init pfm_init_ctx(void) ++{ ++ pfm_ctx_cachep = kmem_cache_create("pfm_context", ++ sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE, ++ SLAB_HWCACHE_ALIGN, 0, NULL); ++ if (!pfm_ctx_cachep) { ++ PFM_ERR("cannot initialize context slab"); ++ return -ENOMEM; ++ } ++ return 0; ++} +diff --git a/perfmon/perfmon_ctxsw.c b/perfmon/perfmon_ctxsw.c +new file mode 100644 +index 0000000..9a28d13 +--- /dev/null ++++ b/perfmon/perfmon_ctxsw.c +@@ -0,0 +1,342 @@ ++/* ++ * perfmon_cxtsw.c: perfmon2 context switch code ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++void pfm_save_pmds(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ u64 val, ovfl_mask; ++ u64 *used_pmds, *cnt_pmds; ++ u16 i, num; ++ ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ num = set->nused_pmds; ++ cnt_pmds = ctx->regs.cnt_pmds; ++ used_pmds = set->used_pmds; ++ ++ /* ++ * save HW PMD, for counters, reconstruct 64-bit value ++ */ ++ for (i = 0; num; i++) { ++ if (test_bit(i, cast_ulp(used_pmds))) { ++ val = pfm_read_pmd(ctx, i); ++ if (likely(test_bit(i, cast_ulp(cnt_pmds)))) ++ val = (set->pmds[i].value & ~ovfl_mask) | ++ (val & ovfl_mask); ++ set->pmds[i].value = val; ++ num--; ++ } ++ } ++ pfm_arch_clear_pmd_ovfl_cond(ctx, set); ++} ++ ++/* ++ * interrupts are disabled (no preemption) ++ */ ++void __pfm_ctxswin_thread(struct task_struct *task, ++ struct pfm_context *ctx, u64 now) ++{ ++ u64 cur_act; ++ struct pfm_event_set *set; ++ int reload_pmcs, reload_pmds; ++ int mycpu, is_active; ++ ++ mycpu = smp_processor_id(); ++ ++ cur_act = __get_cpu_var(pmu_activation_number); ++ /* ++ * we need to lock context because it could be accessed ++ * from another CPU. Normally the schedule() functions ++ * has masked interrupts which should be enough to ++ * protect against PMU interrupts. 
++ */ ++ spin_lock(&ctx->lock); ++ ++ is_active = pfm_arch_is_active(ctx); ++ ++ set = ctx->active_set; ++ ++ /* ++ * in case of zombie, we do not complete ctxswin of the ++ * PMU, and we force a call to pfm_handle_work() to finish ++ * cleanup, i.e., free context + smpl_buff. The reason for ++ * deferring to pfm_handle_work() is that it is not possible ++ * to vfree() with interrupts disabled. ++ */ ++ if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) { ++ pfm_post_work(task, ctx, PFM_WORK_ZOMBIE); ++ goto done; ++ } ++ ++ /* ++ * if we were the last user of the PMU on that CPU, ++ * then nothing to do except restore psr ++ */ ++ if (ctx->last_cpu == mycpu && ctx->last_act == cur_act) { ++ /* ++ * check for forced reload conditions ++ */ ++ reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS; ++ reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS; ++ } else { ++#ifndef CONFIG_SMP ++ pfm_check_save_prev_ctx(); ++#endif ++ reload_pmcs = 1; ++ reload_pmds = 1; ++ } ++ /* consumed */ ++ set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; ++ ++ if (reload_pmds) ++ pfm_arch_restore_pmds(ctx, set); ++ ++ /* ++ * need to check if had in-flight interrupt in ++ * pfm_ctxswout_thread(). If at least one bit set, then we must replay ++ * the interrupt to avoid losing some important performance data. ++ * ++ * npend_ovfls is cleared in interrupt handler ++ */ ++ if (set->npend_ovfls) { ++ pfm_arch_resend_irq(ctx); ++ pfm_stats_inc(ovfl_intr_replay_count); ++ } ++ ++ if (reload_pmcs) ++ pfm_arch_restore_pmcs(ctx, set); ++ ++ /* ++ * record current activation for this context ++ */ ++ __get_cpu_var(pmu_activation_number)++; ++ ctx->last_cpu = mycpu; ++ ctx->last_act = __get_cpu_var(pmu_activation_number); ++ ++ /* ++ * establish new ownership. ++ */ ++ pfm_set_pmu_owner(task, ctx); ++ ++ pfm_arch_ctxswin_thread(task, ctx); ++ /* ++ * set->duration does not count when context in MASKED state. 
++ * set->duration_start is reset in unmask_monitoring() ++ */ ++ set->duration_start = now; ++ ++ /* ++ * re-arm switch timeout, if necessary ++ * Timeout is active only if monitoring is active, ++ * i.e., LOADED + started ++ * ++ * We reload the remainder timeout or the full timeout. ++ * Remainder is recorded on context switch out or in ++ * pfm_load_context() ++ */ ++ if (ctx->state == PFM_CTX_LOADED ++ && (set->flags & PFM_SETFL_TIME_SWITCH) && is_active) { ++ pfm_restart_timer(ctx, set); ++ /* careful here as pfm_restart_timer may switch sets */ ++ } ++done: ++ spin_unlock(&ctx->lock); ++} ++ ++/* ++ * interrupts are masked, runqueue lock is held. ++ * ++ * In UP. we simply stop monitoring and leave the state ++ * in place, i.e., lazy save ++ */ ++void __pfm_ctxswout_thread(struct task_struct *task, ++ struct pfm_context *ctx, u64 now) ++{ ++ struct pfm_event_set *set; ++ int need_save_pmds, is_active; ++ ++ /* ++ * we need to lock context because it could be accessed ++ * from another CPU. Normally the schedule() functions ++ * has masked interrupts which should be enough to ++ * protect against PMU interrupts. ++ */ ++ ++ spin_lock(&ctx->lock); ++ ++ is_active = pfm_arch_is_active(ctx); ++ set = ctx->active_set; ++ ++ /* ++ * stop monitoring and ++ * collect pending overflow information ++ * needed on ctxswin. We cannot afford to lose ++ * a PMU interrupt. 
++ */ ++ need_save_pmds = pfm_arch_ctxswout_thread(task, ctx); ++ ++ if (ctx->state == PFM_CTX_LOADED) { ++ /* ++ * accumulate only when set is actively monitoring, ++ */ ++ set->duration += now - set->duration_start; ++ ++ /* ++ * record remaining timeout ++ * reload in pfm_ctxsw_in() ++ */ ++ if (is_active && (set->flags & PFM_SETFL_TIME_SWITCH)) { ++ struct hrtimer *h = NULL; ++ h = &__get_cpu_var(pfm_hrtimer); ++ hrtimer_cancel(h); ++ set->hrtimer_rem = hrtimer_get_remaining(h); ++ PFM_DBG_ovfl("hrtimer=%lld", ++ (long long)set->hrtimer_rem.tv64); ++ } ++ } ++ ++#ifdef CONFIG_SMP ++ /* ++ * in SMP, release ownership of this PMU. ++ * PMU interrupts are masked, so nothing ++ * can happen. ++ */ ++ pfm_set_pmu_owner(NULL, NULL); ++ ++ /* ++ * On some architectures, it is necessary to read the ++ * PMD registers to check for pending overflow in ++ * pfm_arch_ctxswout_thread(). In that case, saving of ++ * the PMDs may be done there and not here. ++ */ ++ if (need_save_pmds) ++ pfm_save_pmds(ctx, set); ++#endif ++ spin_unlock(&ctx->lock); ++} ++ ++/* ++ * ++ */ ++static void __pfm_ctxswout_sys(struct task_struct *prev, ++ struct task_struct *next) ++{ ++ struct pfm_context *ctx; ++ ++ ctx = __get_cpu_var(pmu_ctx); ++ BUG_ON(!ctx); ++ ++ /* ++ * propagate TIF_PERFMON_CTXSW to ensure that: ++ * - previous task has TIF_PERFMON_CTXSW cleared, in case it is ++ * scheduled onto another CPU where there is syswide monitoring ++ * - next task has TIF_PERFMON_CTXSW set to ensure it will come back ++ * here when context switched out ++ */ ++ clear_tsk_thread_flag(prev, TIF_PERFMON_CTXSW); ++ set_tsk_thread_flag(next, TIF_PERFMON_CTXSW); ++ ++ /* ++ * nothing to do until actually started ++ * XXX: assumes no mean to start from user level ++ */ ++ if (!ctx->flags.started) ++ return; ++ ++ pfm_arch_ctxswout_sys(prev, ctx); ++} ++ ++/* ++ * ++ */ ++static void __pfm_ctxswin_sys(struct task_struct *prev, ++ struct task_struct *next) ++{ ++ struct pfm_context *ctx; ++ ++ ctx = 
__get_cpu_var(pmu_ctx); ++ BUG_ON(!ctx); ++ ++ /* ++ * nothing to do until actually started ++ * XXX: assumes no mean to start from user level ++ */ ++ if (!ctx->flags.started) ++ return; ++ ++ pfm_arch_ctxswin_sys(next, ctx); ++} ++ ++void pfm_ctxsw_out(struct task_struct *prev, ++ struct task_struct *next) ++{ ++ struct pfm_context *ctxp; ++ u64 now; ++ ++ now = sched_clock(); ++ ++ ctxp = prev->pfm_context; ++ ++ if (ctxp) ++ __pfm_ctxswout_thread(prev, ctxp, now); ++ else ++ __pfm_ctxswout_sys(prev, next); ++ ++ pfm_stats_inc(ctxswout_count); ++ pfm_stats_add(ctxswout_ns, sched_clock() - now); ++} ++ ++void pfm_ctxsw_in(struct task_struct *prev, ++ struct task_struct *next) ++{ ++ struct pfm_context *ctxn; ++ u64 now; ++ ++ now = sched_clock(); ++ ++ ctxn = next->pfm_context; ++ ++ if (ctxn) ++ __pfm_ctxswin_thread(next, ctxn, now); ++ else ++ __pfm_ctxswin_sys(prev, next); ++ ++ pfm_stats_inc(ctxswin_count); ++ pfm_stats_add(ctxswin_ns, sched_clock() - now); ++} +diff --git a/perfmon/perfmon_debugfs.c b/perfmon/perfmon_debugfs.c +new file mode 100644 +index 0000000..e4d2fad +--- /dev/null ++++ b/perfmon/perfmon_debugfs.c +@@ -0,0 +1,168 @@ ++/* ++ * perfmon_debugfs.c: perfmon2 statistics interface to debugfs ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. 
++ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
++ *
++ * More information about perfmon available at:
++ * 	http://perfmon2.sf.net
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of version 2 of the GNU General Public
++ * License as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
++ * 02111-1307 USA
++ */
++#include <linux/kernel.h>
++#include <linux/debugfs.h>
++#include <linux/perfmon_kern.h>
++
++/*
++ * to make the statistics visible to user space:
++ * $ mount -t debugfs none /mnt
++ * $ cd /mnt/perfmon
++ * then choose a CPU subdir
++ */
++DECLARE_PER_CPU(struct pfm_stats, pfm_stats);
++
++/* top-level "perfmon" debugfs directory, parent of all cpuN subdirs */
++static struct dentry *pfm_debugfs_dir;
++
++/*
++ * zero all statistics counters (st->v[]) of the given CPU.
++ *
++ * Interrupts are masked on the calling CPU while the array is cleared.
++ * NOTE(review): this does not exclude updates made by @cpu itself when
++ * the caller runs on a different CPU -- confirm callers tolerate that.
++ */
++void pfm_reset_stats(int cpu)
++{
++	struct pfm_stats *st;
++	unsigned long flags;
++
++	st = &per_cpu(pfm_stats, cpu);
++
++	local_irq_save(flags);
++	memset(st->v, 0, sizeof(st->v));
++	local_irq_restore(flags);
++}
++
++/*
++ * debugfs file names, one per statistic. The order must match the
++ * layout of the pfm_stats value array; pfm_debugfs_add_cpu() checks
++ * that the number of names equals PFM_NUM_STATS.
++ */
++static const char *pfm_stats_strs[] = {
++	"ovfl_intr_all_count",
++	"ovfl_intr_ns",
++	"ovfl_intr_spurious_count",
++	"ovfl_intr_replay_count",
++	"ovfl_intr_regular_count",
++	"handle_work_count",
++	"ovfl_notify_count",
++	"reset_pmds_count",
++	"pfm_restart_count",
++	"fmt_handler_calls",
++	"fmt_handler_ns",
++	"set_switch_count",
++	"set_switch_ns",
++	"set_switch_exp",
++	"ctxswin_count",
++	"ctxswin_ns",
++	"handle_timeout_count",
++	"ovfl_intr_nmi_count",
++	"ctxswout_count",
++	"ctxswout_ns",
++};
++#define PFM_NUM_STRS ARRAY_SIZE(pfm_stats_strs)
++
++/*
++ * remove the debugfs files and the cpuN directory of the given CPU.
++ *
++ * Every dentry pointer is reset to NULL after removal, so calling this
++ * function again for the same CPU is harmless.
++ */
++void pfm_debugfs_del_cpu(int cpu)
++{
++	struct pfm_stats *st;
++	int i;
++
++	st = &per_cpu(pfm_stats, cpu);
++
++	for (i = 0; i < PFM_NUM_STATS; i++) {
++		if (st->dirs[i])
++			debugfs_remove(st->dirs[i]);
++		st->dirs[i] = NULL;
++	}
++	if (st->cpu_dir)
++		debugfs_remove(st->cpu_dir);
++	st->cpu_dir = NULL;
++}
++
++/*
++ * create the "cpu<N>" directory below the perfmon debugfs directory and
++ * populate it with one read-only u64 file per statistic.
++ *
++ * return:
++ * 	 0: success, the statistics of the CPU have been reset
++ * 	-1: failure, everything created by this call has been removed
++ */
++int pfm_debugfs_add_cpu(int cpu)
++{
++	struct pfm_stats *st;
++	int i;
++
++	/*
++	 * sanity check between stats names and the number
++	 * of entries in the pfm_stats value array.
++	 */
++	if (PFM_NUM_STRS != PFM_NUM_STATS) {
++		PFM_ERR("PFM_NUM_STRS != PFM_NUM_STATS error");
++		return -1;
++	}
++
++	st = &per_cpu(pfm_stats, cpu);
++	sprintf(st->cpu_name, "cpu%d", cpu);
++
++	st->cpu_dir = debugfs_create_dir(st->cpu_name, pfm_debugfs_dir);
++	if (!st->cpu_dir)
++		return -1;
++
++	for (i = 0; i < PFM_NUM_STATS; i++) {
++		st->dirs[i] = debugfs_create_u64(pfm_stats_strs[i],
++						 S_IRUGO,
++						 st->cpu_dir,
++						 &st->v[i]);
++		if (!st->dirs[i])
++			goto error;
++	}
++	pfm_reset_stats(cpu);
++	return 0;
++error:
++	/*
++	 * entry i is the one that failed, so only entries [0, i) exist.
++	 * The pointers must be cleared as well: pfm_debugfs_del_cpu()
++	 * relies on them to decide what is left to remove and would
++	 * otherwise remove the same dentries a second time.
++	 */
++	while (--i >= 0) {
++		debugfs_remove(st->dirs[i]);
++		st->dirs[i] = NULL;
++	}
++	debugfs_remove(st->cpu_dir);
++	st->cpu_dir = NULL;
++	return -1;
++}
++
++/*
++ * called once from pfm_init()
++ *
++ * create the "perfmon" directory at the root of debugfs and one
++ * subdirectory per online CPU.
++ *
++ * return:
++ * 	 0: success
++ * 	-1: failure, nothing is left behind in debugfs
++ */
++int __init pfm_init_debugfs(void)
++{
++	int cpu1, cpu2, ret;
++
++	pfm_debugfs_dir = debugfs_create_dir("perfmon", NULL);
++	if (!pfm_debugfs_dir)
++		return -1;
++
++	for_each_online_cpu(cpu1) {
++		ret = pfm_debugfs_add_cpu(cpu1);
++		if (ret)
++			goto error;
++	}
++	return 0;
++error:
++	/*
++	 * undo the CPUs registered before the failing one; the failing
++	 * CPU has already been cleaned up by pfm_debugfs_add_cpu()
++	 */
++	for_each_online_cpu(cpu2) {
++		if (cpu2 == cpu1)
++			break;
++		pfm_debugfs_del_cpu(cpu2);
++	}
++	/*
++	 * the directory is empty now, drop it too so that a failed
++	 * initialization does not leave a dangling "perfmon" entry
++	 */
++	debugfs_remove(pfm_debugfs_dir);
++	pfm_debugfs_dir = NULL;
++	return -1;
++}
+diff --git a/perfmon/perfmon_dfl_smpl.c b/perfmon/perfmon_dfl_smpl.c
+new file mode 100644
+index 0000000..8c83489
+--- /dev/null
++++ b/perfmon/perfmon_dfl_smpl.c
+@@ -0,0 +1,298 @@
++/*
++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
++ * Contributed by Stephane Eranian <eranian@hpl.hp.com>
++ *
++ * This file implements the new default sampling buffer format
++ * for the perfmon2 subsystem.
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of version 2 of the GNU General Public
++ * License as published by the Free Software Foundation.
++ *
++ * This program is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ * General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
++ * 02111-1307 USA
++ */
++#include <linux/kernel.h>
++#include <linux/types.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/smp.h>
++
++#include <linux/perfmon_kern.h>
++#include <linux/perfmon_dfl_smpl.h>
++
++MODULE_AUTHOR("Stephane Eranian <eranian@hpl.hp.com>");
++MODULE_DESCRIPTION("new perfmon default sampling format");
++MODULE_LICENSE("GPL");
++
++/*
++ * check the format argument passed at context creation
++ *
++ * @ctx_flags: context flags (only used for the debug trace)
++ * @npmds: maximum number of implemented PMD registers
++ * @data: struct pfm_dfl_smpl_arg supplied by the caller
++ *
++ * return:
++ * 	0: argument is acceptable
++ * 	-EINVAL: no argument, or buffer too small for the header plus
++ * 		 one entry carrying npmds values
++ * 	-E2BIG: requested buffer size cannot be represented in a size_t
++ */
++static int pfm_dfl_fmt_validate(u32 ctx_flags, u16 npmds, void *data)
++{
++	struct pfm_dfl_smpl_arg *arg = data;
++	u64 min_buf_size;
++
++	if (data == NULL) {
++		PFM_DBG("no argument passed");
++		return -EINVAL;
++	}
++
++	/*
++	 * buf_size is narrowed to size_t in pfm_dfl_fmt_get_size().
++	 * Reject a value which does not survive that conversion, i.e.,
++	 * when size_t is smaller than the type of buf_size. When both
++	 * have the same width the test is always false.
++	 */
++	if ((u64)(size_t)arg->buf_size != arg->buf_size)
++		return -E2BIG;
++
++	/*
++	 * compute min buf size. npmds is the maximum number
++	 * of implemented PMD registers.
++	 */
++	min_buf_size = sizeof(struct pfm_dfl_smpl_hdr)
++		+ (sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64)));
++
++	PFM_DBG("validate ctx_flags=0x%x flags=0x%x npmds=%u "
++		"min_buf_size=%llu buf_size=%llu\n",
++		ctx_flags,
++		arg->buf_flags,
++		npmds,
++		(unsigned long long)min_buf_size,
++		(unsigned long long)arg->buf_size);
++
++	/*
++	 * must hold at least the buffer header + one minimally sized entry
++	 */
++	if (arg->buf_size < min_buf_size)
++		return -EINVAL;
++
++	return 0;
++}
++
++/*
++ * report the size of the sampling buffer to allocate
++ *
++ * @flags: unused
++ * @data: struct pfm_dfl_smpl_arg, already checked by the validate callback
++ * @size: where the size in bytes is returned
++ */
++static int pfm_dfl_fmt_get_size(u32 flags, void *data, size_t *size)
++{
++	struct pfm_dfl_smpl_arg *arg = data;
++
++	/*
++	 * size has been validated in pfm_dfl_fmt_validate(),
++	 * we can never lose bits from buf_size.
++	 */
++	*size = (size_t)arg->buf_size;
++
++	return 0;
++}
++
++/*
++ * initialize the header located at the start of the sampling buffer
++ *
++ * @ctx: unused
++ * @buf: start of the sampling buffer
++ * @ctx_flags: unused
++ * @npmds: maximum number of PMD values a single entry may carry
++ * @data: struct pfm_dfl_smpl_arg supplied by the caller
++ */
++static int pfm_dfl_fmt_init(struct pfm_context *ctx, void *buf, u32 ctx_flags,
++			    u16 npmds, void *data)
++{
++	struct pfm_dfl_smpl_hdr *hdr;
++	struct pfm_dfl_smpl_arg *arg = data;
++
++	hdr = buf;
++
++	hdr->hdr_version = PFM_DFL_SMPL_VERSION;
++	hdr->hdr_buf_size = arg->buf_size;
++	hdr->hdr_buf_flags = arg->buf_flags;
++	hdr->hdr_cur_offs = sizeof(*hdr);
++	hdr->hdr_overflows = 0;
++	hdr->hdr_count = 0;
++	/* worst-case entry size, used by the handler to detect "full" */
++	hdr->hdr_min_buf_space = sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64));
++	/*
++	 * due to cache aliasing, it may be necessary to flush the cache
++	 * on certain architectures (e.g., MIPS)
++	 */
++	pfm_cacheflush(hdr, sizeof(*hdr));
++
++	PFM_DBG("buffer=%p buf_size=%llu hdr_size=%zu hdr_version=%u.%u "
++		"min_space=%llu npmds=%u",
++		buf,
++		(unsigned long long)hdr->hdr_buf_size,
++		sizeof(*hdr),
++		PFM_VERSION_MAJOR(hdr->hdr_version),
++		PFM_VERSION_MINOR(hdr->hdr_version),
++		(unsigned long long)hdr->hdr_min_buf_space,
++		npmds);
++
++	return 0;
++}
++
++/*
++ * called from pfm_overflow_handler() to record a new sample
++ *
++ * context is locked, interrupts are disabled (no preemption)
++ *
++ * return:
++ * 	0: sample recorded, room left for at least one more entry;
++ * 	   ovfl_ctrl asks for a reset of the overflowed counter
++ * 	-ENOBUFS: buffer is full (before or after recording the sample);
++ * 		  ovfl_ctrl asks for notification and masking
++ */
++static int pfm_dfl_fmt_handler(struct pfm_context *ctx,
++			       unsigned long ip, u64 tstamp, void *data)
++{
++	struct pfm_dfl_smpl_hdr *hdr;
++	struct pfm_dfl_smpl_entry *ent;
++	struct pfm_ovfl_arg *arg;
++	void *cur, *last;
++	u64 *e;
++	size_t entry_size, min_size;
++	u16 npmds, i;
++	u16 ovfl_pmd;
++	void *buf;
++
++	hdr = ctx->smpl_addr;
++	arg = &ctx->ovfl_arg;
++
++	buf = hdr;
++	cur = buf+hdr->hdr_cur_offs;
++	last = buf+hdr->hdr_buf_size;
++	ovfl_pmd = arg->ovfl_pmd;
++	min_size = hdr->hdr_min_buf_space;
++
++	/*
++	 * precheck for sanity
++	 */
++	if ((last - cur) < min_size)
++		goto full;
++
++	npmds = arg->num_smpl_pmds;
++
++	ent = (struct pfm_dfl_smpl_entry *)cur;
++
++	/* fixed part followed by npmds 64-bit values */
++	entry_size = sizeof(*ent) + (npmds << 3);
++
++	/* position for first pmd */
++	e = (u64 *)(ent+1);
++
++	hdr->hdr_count++;
++
++	/* cast: %zu expects a size_t, last-cur is a pointer difference */
++	PFM_DBG_ovfl("count=%llu cur=%p last=%p free_bytes=%zu ovfl_pmd=%d "
++		     "npmds=%u",
++		     (unsigned long long)hdr->hdr_count,
++		     cur, last,
++		     (size_t)(last-cur),
++		     ovfl_pmd,
++		     npmds);
++
++	/*
++	 * current = task running at the time of the overflow.
++	 *
++	 * per-task mode:
++	 * - this is usually the task being monitored.
++	 * Under certain conditions, it might be a different task
++	 *
++	 * system-wide:
++	 * - this is not necessarily the task controlling the session
++	 */
++	ent->pid = current->pid;
++	ent->ovfl_pmd = ovfl_pmd;
++	ent->last_reset_val = arg->pmd_last_reset;
++
++	/*
++	 * where did the fault happen (includes slot number)
++	 */
++	ent->ip = ip;
++
++	ent->tstamp = tstamp;
++	ent->cpu = smp_processor_id();
++	ent->set = arg->active_set;
++	ent->tgid = current->tgid;
++
++	/*
++	 * selectively store PMDs in increasing index number
++	 */
++	if (npmds) {
++		u64 *val = arg->smpl_pmds_values;
++		for (i = 0; i < npmds; i++)
++			*e++ = *val++;
++	}
++
++	/*
++	 * update position for next entry
++	 */
++	hdr->hdr_cur_offs += entry_size;
++	cur += entry_size;
++
++	pfm_cacheflush(hdr, sizeof(*hdr));
++	pfm_cacheflush(ent, entry_size);
++
++	/*
++	 * post check to avoid losing the last sample
++	 */
++	if ((last - cur) < min_size)
++		goto full;
++
++	/* reset before returning from interrupt handler */
++	arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET;
++
++	return 0;
++full:
++	PFM_DBG_ovfl("sampling buffer full free=%zu, count=%llu",
++		     (size_t)(last-cur),
++		     (unsigned long long)hdr->hdr_count);
++
++	/*
++	 * increment number of buffer overflows.
++	 * important to detect duplicate set of samples.
++	 */
++	hdr->hdr_overflows++;
++
++	/*
++	 * request notification and masking of monitoring.
++	 * Notification is still subject to the overflowed
++	 * register having the FL_NOTIFY flag set.
++	 */
++	arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK;
++
++	return -ENOBUFS; /* we are full, sorry */
++}
++
++/*
++ * rewind the buffer to empty: the sample count is cleared and the
++ * write position is put back right after the header. hdr_overflows is
++ * left untouched. The caller is told to reset the overflowed counters.
++ */
++static int pfm_dfl_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf)
++{
++	struct pfm_dfl_smpl_hdr *hdr;
++
++	hdr = buf;
++
++	hdr->hdr_count = 0;
++	hdr->hdr_cur_offs = sizeof(*hdr);
++
++	pfm_cacheflush(hdr, sizeof(*hdr));
++
++	*ovfl_ctrl = PFM_OVFL_CTRL_RESET;
++
++	return 0;
++}
++
++/*
++ * nothing to release for this format
++ */
++static int pfm_dfl_fmt_exit(void *buf)
++{
++	return 0;
++}
++
++/*
++ * description of the default format handed to pfm_fmt_register()
++ */
++static struct pfm_smpl_fmt dfl_fmt = {
++	.fmt_name = "default",
++	.fmt_version = 0x10000,
++	.fmt_arg_size = sizeof(struct pfm_dfl_smpl_arg),
++	.fmt_validate = pfm_dfl_fmt_validate,
++	.fmt_getsize = pfm_dfl_fmt_get_size,
++	.fmt_init = pfm_dfl_fmt_init,
++	.fmt_handler = pfm_dfl_fmt_handler,
++	.fmt_restart = pfm_dfl_fmt_restart,
++	.fmt_exit = pfm_dfl_fmt_exit,
++	.fmt_flags = PFM_FMT_BUILTIN_FLAG,
++	.owner = THIS_MODULE
++};
++
++/* register the default format, returns what pfm_fmt_register() returns */
++static int pfm_dfl_fmt_init_module(void)
++{
++	return pfm_fmt_register(&dfl_fmt);
++}
++
++/* unregister the default format */
++static void pfm_dfl_fmt_cleanup_module(void)
++{
++	pfm_fmt_unregister(&dfl_fmt);
++}
++
++module_init(pfm_dfl_fmt_init_module);
++module_exit(pfm_dfl_fmt_cleanup_module);
+diff --git a/perfmon/perfmon_file.c b/perfmon/perfmon_file.c
+new file mode 100644
+index 0000000..1cde81b
+--- /dev/null
++++ b/perfmon/perfmon_file.c
+@@ -0,0 +1,751 @@
++/*
++ * perfmon_file.c: perfmon2 file input/output functions
++ *
++ * This file implements the perfmon2 interface which
++ * provides access to the hardware performance counters
++ * of the host processor.
++ *
++ * The initial version of perfmon.c was written by
++ * Ganesh Venkitachalam, IBM Corp.
++ *
++ * Then it was modified for perfmon-1.x by Stephane Eranian and
++ * David Mosberger, Hewlett Packard Co.
++ *
++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
++ * by Stephane Eranian, Hewlett Packard Co.
++ *
++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/file.h> ++#include <linux/poll.h> ++#include <linux/vfs.h> ++#include <linux/pagemap.h> ++#include <linux/mount.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */ ++ ++struct pfm_controls pfm_controls = { ++ .sys_group = PFM_GROUP_PERM_ANY, ++ .task_group = PFM_GROUP_PERM_ANY, ++ .arg_mem_max = PAGE_SIZE, ++ .smpl_buffer_mem_max = ~0, ++}; ++EXPORT_SYMBOL(pfm_controls); ++ ++static int __init enable_debug(char *str) ++{ ++ pfm_controls.debug = 1; ++ PFM_INFO("debug output enabled\n"); ++ return 1; ++} ++__setup("perfmon_debug", enable_debug); ++ ++static int pfmfs_delete_dentry(struct dentry *dentry) ++{ ++ return 1; ++} ++ ++static struct dentry_operations pfmfs_dentry_operations = { ++ .d_delete = pfmfs_delete_dentry, ++}; ++ ++int pfm_buf_map_pagefault(struct vm_area_struct *vma, struct vm_fault *vmf) ++{ ++ void *kaddr; ++ unsigned long address; ++ struct pfm_context *ctx; ++ size_t size; ++ ++ address = (unsigned long)vmf->virtual_address; ++ 
++ ctx = vma->vm_private_data; ++ if (ctx == NULL) { ++ PFM_DBG("no ctx"); ++ return VM_FAULT_SIGBUS; ++ } ++ /* ++ * size available to user (maybe different from real_smpl_size ++ */ ++ size = ctx->smpl_size; ++ ++ if ((address < vma->vm_start) || ++ (address >= (vma->vm_start + size))) ++ return VM_FAULT_SIGBUS; ++ ++ kaddr = ctx->smpl_addr + (address - vma->vm_start); ++ ++ vmf->page = vmalloc_to_page(kaddr); ++ get_page(vmf->page); ++ ++ PFM_DBG("[%d] start=%p ref_count=%d", ++ current->pid, ++ kaddr, page_count(vmf->page)); ++ ++ return 0; ++} ++ ++/* ++ * we need to determine whther or not we are closing the last reference ++ * to the file and thus are going to end up in pfm_close() which eventually ++ * calls pfm_release_buf_space(). In that function, we update the accouting ++ * for locked_vm given that we are actually freeing the sampling buffer. The ++ * issue is that there are multiple paths leading to pfm_release_buf_space(), ++ * from exit(), munmap(), close(). The path coming from munmap() is problematic ++ * becuse do_munmap() grabs mmap_sem in write-mode which is also what ++ * pfm_release_buf_space does. To avoid deadlock, we need to determine where ++ * we are calling from and skip the locking. The vm_ops->close() callback ++ * is invoked for each remove_vma() independently of the number of references ++ * left on the file descriptor, therefore simple reference counter does not ++ * work. We need to determine if this is the last call, and then set a flag ++ * to skip the locking. 
++ */ ++static void pfm_buf_map_close(struct vm_area_struct *vma) ++{ ++ struct file *file; ++ struct pfm_context *ctx; ++ ++ file = vma->vm_file; ++ ctx = vma->vm_private_data; ++ ++ /* ++ * if file is going to close, then pfm_close() will ++ * be called, do not lock in pfm_release_buf ++ */ ++ if (atomic_read(&file->f_count) == 1) ++ ctx->flags.mmap_nlock = 1; ++} ++ ++/* ++ * we do not have a close callback because, the locked ++ * memory accounting must be done when the actual buffer ++ * is freed. Munmap does not free the page backing the vma ++ * because they may still be in use by the PMU interrupt handler. ++ */ ++struct vm_operations_struct pfm_buf_map_vm_ops = { ++ .fault = pfm_buf_map_pagefault, ++ .close = pfm_buf_map_close ++}; ++ ++static int pfm_mmap_buffer(struct pfm_context *ctx, struct vm_area_struct *vma, ++ size_t size) ++{ ++ if (ctx->smpl_addr == NULL) { ++ PFM_DBG("no sampling buffer to map"); ++ return -EINVAL; ++ } ++ ++ if (size > ctx->smpl_size) { ++ PFM_DBG("mmap size=%zu >= actual buf size=%zu", ++ size, ++ ctx->smpl_size); ++ return -EINVAL; ++ } ++ ++ vma->vm_ops = &pfm_buf_map_vm_ops; ++ vma->vm_private_data = ctx; ++ ++ return 0; ++} ++ ++static int pfm_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ size_t size; ++ struct pfm_context *ctx; ++ unsigned long flags; ++ int ret; ++ ++ PFM_DBG("pfm_file_ops"); ++ ++ ctx = file->private_data; ++ size = (vma->vm_end - vma->vm_start); ++ ++ if (ctx == NULL) ++ return -EINVAL; ++ ++ ret = -EINVAL; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ if (vma->vm_flags & VM_WRITE) { ++ PFM_DBG("cannot map buffer for writing"); ++ goto done; ++ } ++ ++ PFM_DBG("vm_pgoff=%lu size=%zu vm_start=0x%lx", ++ vma->vm_pgoff, ++ size, ++ vma->vm_start); ++ ++ ret = pfm_mmap_buffer(ctx, vma, size); ++ if (ret == 0) ++ vma->vm_flags |= VM_RESERVED; ++ ++ PFM_DBG("ret=%d vma_flags=0x%lx vma_start=0x%lx vma_size=%lu", ++ ret, ++ vma->vm_flags, ++ vma->vm_start, ++ vma->vm_end-vma->vm_start); 
++done: ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ return ret; ++} ++ ++/* ++ * Extract one message from queue. ++ * ++ * return: ++ * -EAGAIN: when non-blocking and nothing is* in the queue. ++ * -ERESTARTSYS: when blocking and signal is pending ++ * Otherwise returns size of message (sizeof(pfarg_msg)) ++ */ ++ssize_t __pfm_read(struct pfm_context *ctx, union pfarg_msg *msg_buf, int non_block) ++{ ++ ssize_t ret = 0; ++ unsigned long flags; ++ DECLARE_WAITQUEUE(wait, current); ++ ++ /* ++ * we must masks interrupts to avoid a race condition ++ * with the PMU interrupt handler. ++ */ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ while (pfm_msgq_is_empty(ctx)) { ++ ++ /* ++ * handle non-blocking reads ++ * return -EAGAIN ++ */ ++ ret = -EAGAIN; ++ if (non_block) ++ break; ++ ++ add_wait_queue(&ctx->msgq_wait, &wait); ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ schedule(); ++ ++ /* ++ * during this window, another thread may call ++ * pfm_read() and steal our message ++ */ ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ remove_wait_queue(&ctx->msgq_wait, &wait); ++ set_current_state(TASK_RUNNING); ++ ++ /* ++ * check for pending signals ++ * return -ERESTARTSYS ++ */ ++ ret = -ERESTARTSYS; ++ if (signal_pending(current)) ++ break; ++ ++ /* ++ * we may have a message ++ */ ++ ret = 0; ++ } ++ ++ /* ++ * extract message ++ */ ++ if (ret == 0) { ++ /* ++ * copy the oldest message into msg_buf. ++ * We cannot directly call copy_to_user() ++ * because interrupts masked. 
This is done ++ * in the caller ++ */ ++ pfm_get_next_msg(ctx, msg_buf); ++ ++ ret = sizeof(*msg_buf); ++ ++ PFM_DBG("extracted type=%d", msg_buf->type); ++ } ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ PFM_DBG("blocking=%d ret=%zd", non_block, ret); ++ ++ return ret; ++} ++ ++static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size, ++ loff_t *ppos) ++{ ++ struct pfm_context *ctx; ++ union pfarg_msg msg_buf; ++ int non_block, ret; ++ ++ PFM_DBG_ovfl("buf=%p size=%zu", buf, size); ++ ++ ctx = filp->private_data; ++ if (ctx == NULL) { ++ PFM_ERR("no ctx for pfm_read"); ++ return -EINVAL; ++ } ++ ++ non_block = filp->f_flags & O_NONBLOCK; ++ ++#ifdef CONFIG_IA64_PERFMON_COMPAT ++ /* ++ * detect IA-64 v2.0 context read (message size is different) ++ * nops on all other architectures ++ */ ++ if (unlikely(ctx->flags.ia64_v20_compat)) ++ return pfm_arch_compat_read(ctx, buf, non_block, size); ++#endif ++ /* ++ * cannot extract partial messages. ++ * check even when there is no message ++ * ++ * cannot extract more than one message per call. Bytes ++ * above sizeof(msg) are ignored. 
++ */ ++ if (size < sizeof(msg_buf)) { ++ PFM_DBG("message is too small size=%zu must be >=%zu)", ++ size, ++ sizeof(msg_buf)); ++ return -EINVAL; ++ } ++ ++ ret = __pfm_read(ctx, &msg_buf, non_block); ++ if (ret > 0) { ++ if (copy_to_user(buf, &msg_buf, sizeof(msg_buf))) ++ ret = -EFAULT; ++ } ++ PFM_DBG_ovfl("ret=%d", ret); ++ return ret; ++} ++ ++static ssize_t pfm_write(struct file *file, const char __user *ubuf, ++ size_t size, loff_t *ppos) ++{ ++ PFM_DBG("pfm_write called"); ++ return -EINVAL; ++} ++ ++static unsigned int pfm_poll(struct file *filp, poll_table *wait) ++{ ++ struct pfm_context *ctx; ++ unsigned long flags; ++ unsigned int mask = 0; ++ ++ PFM_DBG("pfm_file_ops"); ++ ++ if (filp->f_op != &pfm_file_ops) { ++ PFM_ERR("pfm_poll bad magic"); ++ return 0; ++ } ++ ++ ctx = filp->private_data; ++ if (ctx == NULL) { ++ PFM_ERR("pfm_poll no ctx"); ++ return 0; ++ } ++ ++ PFM_DBG("before poll_wait"); ++ ++ poll_wait(filp, &ctx->msgq_wait, wait); ++ ++ /* ++ * pfm_msgq_is_empty() is non-atomic ++ * ++ * filp is protected by fget() at upper level ++ * context cannot be closed by another thread. ++ * ++ * There may be a race with a PMU interrupt adding ++ * messages to the queue. But we are interested in ++ * queue not empty, so adding more messages should ++ * not really be a problem. ++ * ++ * There may be a race with another thread issuing ++ * a read() and stealing messages from the queue thus ++ * may return the wrong answer. 
This could potentially ++ * lead to a blocking read, because nothing is ++ * available in the queue ++ */ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ if (!pfm_msgq_is_empty(ctx)) ++ mask = POLLIN | POLLRDNORM; ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ PFM_DBG("after poll_wait mask=0x%x", mask); ++ ++ return mask; ++} ++ ++static int pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, ++ unsigned long arg) ++{ ++ PFM_DBG("pfm_ioctl called"); ++ return -EINVAL; ++} ++ ++/* ++ * interrupt cannot be masked when entering this function ++ */ ++static inline int __pfm_fasync(int fd, struct file *filp, ++ struct pfm_context *ctx, int on) ++{ ++ int ret; ++ ++ PFM_DBG("in fd=%d on=%d async_q=%p", ++ fd, ++ on, ++ ctx->async_queue); ++ ++ ret = fasync_helper(fd, filp, on, &ctx->async_queue); ++ ++ PFM_DBG("out fd=%d on=%d async_q=%p ret=%d", ++ fd, ++ on, ++ ctx->async_queue, ret); ++ ++ return ret; ++} ++ ++static int pfm_fasync(int fd, struct file *filp, int on) ++{ ++ struct pfm_context *ctx; ++ int ret; ++ ++ PFM_DBG("pfm_file_ops"); ++ ++ ctx = filp->private_data; ++ if (ctx == NULL) { ++ PFM_ERR("pfm_fasync no ctx"); ++ return -EBADF; ++ } ++ ++ /* ++ * we cannot mask interrupts during this call because this may ++ * may go to sleep if memory is not readily avalaible. ++ * ++ * We are protected from the context disappearing by the ++ * get_fd()/put_fd() done in caller. Serialization of this function ++ * is ensured by caller. ++ */ ++ ret = __pfm_fasync(fd, filp, ctx, on); ++ ++ PFM_DBG("pfm_fasync called on fd=%d on=%d async_queue=%p ret=%d", ++ fd, ++ on, ++ ctx->async_queue, ret); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++static void __pfm_close_remote_cpu(void *info) ++{ ++ struct pfm_context *ctx = info; ++ int can_release; ++ ++ BUG_ON(ctx != __get_cpu_var(pmu_ctx)); ++ ++ /* ++ * we are in IPI interrupt handler which has always higher ++ * priority than PMU interrupt, therefore we do not need to ++ * mask interrupts. 
context locking is not needed because we ++ * are in close(), no more user references. ++ * ++ * can_release is ignored, release done on calling CPU ++ */ ++ __pfm_unload_context(ctx, &can_release); ++ ++ /* ++ * we cannot free context here because we are in_interrupt(). ++ * we free on the calling CPU ++ */ ++} ++ ++static int pfm_close_remote_cpu(u32 cpu, struct pfm_context *ctx) ++{ ++ BUG_ON(irqs_disabled()); ++ return smp_call_function_single(cpu, __pfm_close_remote_cpu, ctx, 1); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * called either on explicit close() or from exit_files(). ++ * Only the LAST user of the file gets to this point, i.e., it is ++ * called only ONCE. ++ * ++ * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero ++ * (fput()),i.e, last task to access the file. Nobody else can access the ++ * file at this point. ++ * ++ * When called from exit_files(), the VMA has been freed because exit_mm() ++ * is executed before exit_files(). ++ * ++ * When called from exit_files(), the current task is not yet ZOMBIE but we ++ * flush the PMU state to the context. ++ */ ++int __pfm_close(struct pfm_context *ctx, struct file *filp) ++{ ++ unsigned long flags; ++ int state; ++ int can_free = 1, can_unload = 1; ++ int is_system, can_release = 0; ++ u32 cpu; ++ ++ /* ++ * no risk of ctx of filp disappearing so we can operate outside ++ * of spin_lock(). 
fasync_helper() runs with interrupts masked, ++ * thus there is no risk with the PMU interrupt handler ++ * ++ * In case of zombie, we will not have the async struct anymore ++ * thus kill_fasync() will not do anything ++ * ++ * fd is not used when removing the entry so we pass -1 ++ */ ++ if (filp->f_flags & FASYNC) ++ __pfm_fasync (-1, filp, ctx, 0); ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ state = ctx->state; ++ is_system = ctx->flags.system; ++ cpu = ctx->cpu; ++ ++ PFM_DBG("state=%d", state); ++ ++ /* ++ * check if unload is needed ++ */ ++ if (state == PFM_CTX_UNLOADED) ++ goto doit; ++ ++#ifdef CONFIG_SMP ++ /* ++ * we need to release the resource on the ORIGINAL cpu. ++ * we need to release the context lock to avoid deadlocks ++ * on the original CPU, especially in the context switch ++ * routines. It is safe to unlock because we are in close(), ++ * in other words, there is no more access from user level. ++ * we can also unmask interrupts on this CPU because the ++ * context is running on the original CPU. Context will be ++ * unloaded and the session will be released on the original ++ * CPU. Upon return, the caller is guaranteed that the context ++ * is gone from original CPU. ++ */ ++ if (is_system && cpu != smp_processor_id()) { ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ pfm_close_remote_cpu(cpu, ctx); ++ can_release = 1; ++ goto free_it; ++ } ++ ++ if (!is_system && ctx->task != current) { ++ /* ++ * switch context to zombie state ++ */ ++ ctx->state = PFM_CTX_ZOMBIE; ++ ++ PFM_DBG("zombie ctx for [%d]", ctx->task->pid); ++ /* ++ * must check if other thread is using block overflow ++ * notification mode. If so make sure it will not block ++ * because there will not be any pfm_restart() issued. 
++ * When the thread notices the ZOMBIE state, it will clean ++ * up what is left of the context ++ */ ++ if (state == PFM_CTX_MASKED && ctx->flags.block) { ++ /* ++ * force task to wake up from MASKED state ++ */ ++ PFM_DBG("waking up [%d]", ctx->task->pid); ++ ++ complete(&ctx->restart_complete); ++ } ++ /* ++ * PMU session will be release by monitored task when it notices ++ * ZOMBIE state as part of pfm_unload_context() ++ */ ++ can_unload = can_free = 0; ++ } ++#endif ++ if (can_unload) ++ __pfm_unload_context(ctx, &can_release); ++doit: ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++#ifdef CONFIG_SMP ++free_it: ++#endif ++ if (can_release) ++ pfm_session_release(is_system, cpu); ++ ++ if (can_free) ++ pfm_free_context(ctx); ++ ++ return 0; ++} ++ ++static int pfm_close(struct inode *inode, struct file *filp) ++{ ++ struct pfm_context *ctx; ++ ++ PFM_DBG("called filp=%p", filp); ++ ++ ctx = filp->private_data; ++ if (ctx == NULL) { ++ PFM_ERR("no ctx"); ++ return -EBADF; ++ } ++ return __pfm_close(ctx, filp); ++} ++ ++static int pfm_no_open(struct inode *irrelevant, struct file *dontcare) ++{ ++ PFM_DBG("pfm_file_ops"); ++ ++ return -ENXIO; ++} ++ ++ ++const struct file_operations pfm_file_ops = { ++ .llseek = no_llseek, ++ .read = pfm_read, ++ .write = pfm_write, ++ .poll = pfm_poll, ++ .ioctl = pfm_ioctl, ++ .open = pfm_no_open, /* special open to disallow open via /proc */ ++ .fasync = pfm_fasync, ++ .release = pfm_close, ++ .mmap = pfm_mmap ++}; ++ ++static int pfmfs_get_sb(struct file_system_type *fs_type, ++ int flags, const char *dev_name, ++ void *data, struct vfsmount *mnt) ++{ ++ return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); ++} ++ ++static struct file_system_type pfm_fs_type = { ++ .name = "pfmfs", ++ .get_sb = pfmfs_get_sb, ++ .kill_sb = kill_anon_super, ++}; ++ ++/* ++ * pfmfs should _never_ be mounted by userland - too much of security hassle, ++ * no real gain from having the whole whorehouse mounted. 
So we don't need ++ * any operations on the root directory. However, we need a non-trivial ++ * d_name - pfm: will go nicely and kill the special-casing in procfs. ++ */ ++static struct vfsmount *pfmfs_mnt; ++ ++int __init pfm_init_fs(void) ++{ ++ int err = register_filesystem(&pfm_fs_type); ++ if (!err) { ++ pfmfs_mnt = kern_mount(&pfm_fs_type); ++ err = PTR_ERR(pfmfs_mnt); ++ if (IS_ERR(pfmfs_mnt)) ++ unregister_filesystem(&pfm_fs_type); ++ else ++ err = 0; ++ } ++ return err; ++} ++ ++int pfm_alloc_fd(struct file **cfile) ++{ ++ int fd, ret = 0; ++ struct file *file = NULL; ++ struct inode * inode; ++ char name[32]; ++ struct qstr this; ++ ++ fd = get_unused_fd(); ++ if (fd < 0) ++ return -ENFILE; ++ ++ ret = -ENFILE; ++ ++ file = get_empty_filp(); ++ if (!file) ++ goto out; ++ ++ /* ++ * allocate a new inode ++ */ ++ inode = new_inode(pfmfs_mnt->mnt_sb); ++ if (!inode) ++ goto out; ++ ++ PFM_DBG("new inode ino=%ld @%p", inode->i_ino, inode); ++ ++ inode->i_sb = pfmfs_mnt->mnt_sb; ++ inode->i_mode = S_IFCHR|S_IRUGO; ++ inode->i_uid = current->fsuid; ++ inode->i_gid = current->fsgid; ++ ++ sprintf(name, "[%lu]", inode->i_ino); ++ this.name = name; ++ this.hash = inode->i_ino; ++ this.len = strlen(name); ++ ++ ret = -ENOMEM; ++ ++ /* ++ * allocate a new dcache entry ++ */ ++ file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); ++ if (!file->f_dentry) ++ goto out; ++ ++ file->f_dentry->d_op = &pfmfs_dentry_operations; ++ ++ d_add(file->f_dentry, inode); ++ file->f_vfsmnt = mntget(pfmfs_mnt); ++ file->f_mapping = inode->i_mapping; ++ ++ file->f_op = &pfm_file_ops; ++ file->f_mode = FMODE_READ; ++ file->f_flags = O_RDONLY; ++ file->f_pos = 0; ++ ++ *cfile = file; ++ ++ return fd; ++out: ++ if (file) ++ put_filp(file); ++ put_unused_fd(fd); ++ return ret; ++} +diff --git a/perfmon/perfmon_fmt.c b/perfmon/perfmon_fmt.c +new file mode 100644 +index 0000000..27c4340 +--- /dev/null ++++ b/perfmon/perfmon_fmt.c +@@ -0,0 +1,219 @@ ++/* ++ * perfmon_fmt.c: perfmon2 
sampling buffer format management ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_smpl_fmt_lock); ++static LIST_HEAD(pfm_smpl_fmt_list); ++ ++static inline int fmt_is_mod(struct pfm_smpl_fmt *f) ++{ ++ return !(f->fmt_flags & PFM_FMTFL_IS_BUILTIN); ++} ++ ++static struct pfm_smpl_fmt *pfm_find_fmt(char *name) ++{ ++ struct pfm_smpl_fmt *entry; ++ ++ list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) { ++ if (!strcmp(entry->fmt_name, name)) ++ return entry; ++ } ++ return NULL; ++} ++/* ++ * find a buffer format based on its name ++ */ ++struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name) ++{ ++ struct pfm_smpl_fmt *fmt; ++ ++ spin_lock(&pfm_smpl_fmt_lock); ++ ++ fmt = pfm_find_fmt(name); ++ ++ /* ++ * increase module refcount ++ */ ++ if (fmt && fmt_is_mod(fmt) && !try_module_get(fmt->owner)) ++ fmt = NULL; ++ ++ spin_unlock(&pfm_smpl_fmt_lock); ++ ++ return fmt; ++} ++ ++void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt) ++{ ++ if (fmt == NULL || !fmt_is_mod(fmt)) ++ return; ++ BUG_ON(fmt->owner == NULL); ++ ++ spin_lock(&pfm_smpl_fmt_lock); ++ module_put(fmt->owner); ++ spin_unlock(&pfm_smpl_fmt_lock); ++} ++ ++int pfm_fmt_register(struct pfm_smpl_fmt *fmt) ++{ ++ int ret = 0; ++ ++ if (perfmon_disabled) { ++ PFM_INFO("perfmon disabled, cannot add sampling format"); ++ return -ENOSYS; ++ } ++ ++ /* some sanity checks */ ++ if (fmt == NULL) { ++ PFM_INFO("perfmon: NULL format for register"); ++ return -EINVAL; ++ } ++ ++ if (fmt->fmt_name == NULL) { ++ PFM_INFO("perfmon: format has no name"); ++ return -EINVAL; ++ } ++ ++ if (fmt->fmt_qdepth > PFM_MSGS_COUNT) { ++ PFM_INFO("perfmon: format %s requires %u msg queue depth (max %d)", ++ fmt->fmt_name, ++ fmt->fmt_qdepth, ++ 
PFM_MSGS_COUNT); ++ return -EINVAL; ++ } ++ ++ /* ++ * fmt is missing the initialization of .owner = THIS_MODULE ++ * this is only valid when format is compiled as a module ++ */ ++ if (fmt->owner == NULL && fmt_is_mod(fmt)) { ++ PFM_INFO("format %s has no module owner", fmt->fmt_name); ++ return -EINVAL; ++ } ++ /* ++ * we need at least a handler ++ */ ++ if (fmt->fmt_handler == NULL) { ++ PFM_INFO("format %s has no handler", fmt->fmt_name); ++ return -EINVAL; ++ } ++ ++ /* ++ * format argument size cannot be bigger than PAGE_SIZE ++ */ ++ if (fmt->fmt_arg_size > PAGE_SIZE) { ++ PFM_INFO("format %s arguments too big", fmt->fmt_name); ++ return -EINVAL; ++ } ++ ++ spin_lock(&pfm_smpl_fmt_lock); ++ ++ /* ++ * because of sysfs, we cannot have two formats with the same name ++ */ ++ if (pfm_find_fmt(fmt->fmt_name)) { ++ PFM_INFO("format %s already registered", fmt->fmt_name); ++ ret = -EBUSY; ++ goto out; ++ } ++ ++ ret = pfm_sysfs_add_fmt(fmt); ++ if (ret) { ++ PFM_INFO("sysfs cannot add format entry for %s", fmt->fmt_name); ++ goto out; ++ } ++ ++ list_add(&fmt->fmt_list, &pfm_smpl_fmt_list); ++ ++ PFM_INFO("added sampling format %s", fmt->fmt_name); ++out: ++ spin_unlock(&pfm_smpl_fmt_lock); ++ ++ return ret; ++} ++EXPORT_SYMBOL(pfm_fmt_register); ++ ++int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt) ++{ ++ struct pfm_smpl_fmt *fmt2; ++ int ret = 0; ++ ++ if (!fmt || !fmt->fmt_name) { ++ PFM_DBG("invalid fmt"); ++ return -EINVAL; ++ } ++ ++ spin_lock(&pfm_smpl_fmt_lock); ++ ++ fmt2 = pfm_find_fmt(fmt->fmt_name); /* look the name up in pfm_smpl_fmt_list */ ++ if (fmt2 != fmt) { /* name unknown, or owned by another object */ ++ PFM_INFO("unregister failed, format not registered"); ++ ret = -EINVAL; ++ goto out; ++ } ++ list_del_init(&fmt->fmt_list); ++ ++ pfm_sysfs_remove_fmt(fmt); ++ ++ PFM_INFO("removed sampling format: %s", fmt->fmt_name); ++ ++out: ++ spin_unlock(&pfm_smpl_fmt_lock); ++ return ret; ++ ++} ++EXPORT_SYMBOL(pfm_fmt_unregister); ++ ++/* ++ * we defer adding the builtin formats to /sys/kernel/perfmon/formats ++ * until after the pfm sysfs
subsystem is initialized. This function ++ * is called from pfm_init_sysfs() ++ */ ++void __init pfm_sysfs_builtin_fmt_add(void) ++{ ++ struct pfm_smpl_fmt *entry; ++ ++ /* ++ * locking not needed, kernel not fully booted ++ * when called ++ */ ++ list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) { ++ pfm_sysfs_add_fmt(entry); ++ } ++} +diff --git a/perfmon/perfmon_hotplug.c b/perfmon/perfmon_hotplug.c +new file mode 100644 +index 0000000..eaaba81 +--- /dev/null ++++ b/perfmon/perfmon_hotplug.c +@@ -0,0 +1,151 @@ ++/* ++ * perfmon_hotplug.c: handling of CPU hotplug ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/perfmon_kern.h> ++#include <linux/cpu.h> ++#include "perfmon_priv.h" ++ ++#ifndef CONFIG_HOTPLUG_CPU ++void pfm_cpu_disable(void) ++{} ++ ++int __init pfm_init_hotplug(void) ++{ ++ return 0; ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++/* ++ * CPU hotplug event notification callback ++ * ++ * We use the callback to manage the debugfs interface (pfm_debugfs_*_cpu). ++ * Note that the actual shutdown of monitoring on the CPU ++ * is done in pfm_cpu_disable(), see comments there for more ++ * information. ++ */ ++static int pfm_cpu_notify(struct notifier_block *nfb, ++ unsigned long action, void *hcpu) ++{ ++ unsigned int cpu = (unsigned long)hcpu; ++ int ret = NOTIFY_OK; ++ ++ pfm_pmu_conf_get(0); ++ ++ switch (action) { ++ case CPU_ONLINE: ++ pfm_debugfs_add_cpu(cpu); ++ PFM_INFO("CPU%d is online", cpu); ++ break; ++ case CPU_UP_PREPARE: ++ PFM_INFO("CPU%d prepare online", cpu); ++ break; ++ case CPU_UP_CANCELED: ++ pfm_debugfs_del_cpu(cpu); ++ PFM_INFO("CPU%d is up canceled", cpu); ++ break; ++ case CPU_DOWN_PREPARE: ++ PFM_INFO("CPU%d prepare offline", cpu); ++ break; ++ case CPU_DOWN_FAILED: ++ PFM_INFO("CPU%d is down failed", cpu); ++ break; ++ case CPU_DEAD: ++ pfm_debugfs_del_cpu(cpu); ++ PFM_INFO("CPU%d is offline", cpu); ++ break; ++ } ++ pfm_pmu_conf_put(); ++ return ret; ++} ++ ++/* ++ * called from cpu_disable() to detach the perfmon context ++ * from the CPU going down.
++ * ++ * We cannot use the cpu hotplug notifier because we MUST run ++ * on the CPU that is going down to save the PMU state ++ */ ++void pfm_cpu_disable(void) ++{ ++ struct pfm_context *ctx; ++ unsigned long flags; ++ int is_system, release_info = 0; ++ u32 cpu; ++ int r; ++ ++ ctx = __get_cpu_var(pmu_ctx); ++ if (ctx == NULL) ++ return; ++ ++ is_system = ctx->flags.system; ++ cpu = ctx->cpu; ++ ++ /* ++ * context is LOADED or MASKED ++ * ++ * we unload from CPU. That stops monitoring and does ++ * all the bookeeping of saving values and updating duration ++ */ ++ spin_lock_irqsave(&ctx->lock, flags); ++ if (is_system) ++ __pfm_unload_context(ctx, &release_info); ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ /* ++ * cancel timer ++ */ ++ if (release_info & 0x2) { ++ r = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer)); ++ PFM_DBG("timeout cancel=%d", r); ++ } ++ ++ if (release_info & 0x1) ++ pfm_session_release(is_system, cpu); ++} ++ ++static struct notifier_block pfm_cpu_notifier = { ++ .notifier_call = pfm_cpu_notify ++}; ++ ++int __init pfm_init_hotplug(void) ++{ ++ int ret = 0; ++ /* ++ * register CPU hotplug event notifier ++ */ ++ ret = register_cpu_notifier(&pfm_cpu_notifier); ++ if (!ret) ++ PFM_LOG("CPU hotplug support enabled"); ++ return ret; ++} ++#endif /* CONFIG_HOTPLUG_CPU */ +diff --git a/perfmon/perfmon_init.c b/perfmon/perfmon_init.c +new file mode 100644 +index 0000000..bbb6e4d +--- /dev/null ++++ b/perfmon/perfmon_init.c +@@ -0,0 +1,131 @@ ++/* ++ * perfmon.c: perfmon2 global initialization functions ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. 
++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++/* ++ * external variables ++ */ ++DEFINE_PER_CPU(struct task_struct *, pmu_owner); ++DEFINE_PER_CPU(struct pfm_context *, pmu_ctx); ++DEFINE_PER_CPU(u64, pmu_activation_number); ++DEFINE_PER_CPU(struct pfm_stats, pfm_stats); ++DEFINE_PER_CPU(struct hrtimer, pfm_hrtimer); ++ ++ ++int perfmon_disabled; /* >0 if perfmon is disabled */ ++ ++/* ++ * called from cpu_init() and pfm_pmu_register() ++ */ ++void __pfm_init_percpu(void *dummy) ++{ ++ struct hrtimer *h; ++ ++ h = &__get_cpu_var(pfm_hrtimer); ++ ++ pfm_arch_init_percpu(); ++ ++ /* ++ * initialize per-cpu high res timer ++ */ ++ hrtimer_init(h, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++#ifdef CONFIG_HIGH_RES_TIMERS ++ /* ++ * avoid potential deadlock on the runqueue lock ++ * during context switch when multiplexing. 
Situation ++ * arises on architectures which run switch_to() with ++ * the runqueue lock held, e.g., x86. On others, e.g., ++ * IA-64, the problem does not exist. ++ * Setting the callback mode to HRTIMER_CB_IRQSAFE_UNLOCKED ++ * such that the callback routine is only called on hardirq ++ * context not on softirq, thus the context switch will not ++ * end up trying to wakeup the softirqd ++ */ ++ h->cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; ++#endif ++ h->function = pfm_handle_switch_timeout; ++} ++ ++/* ++ * global initialization routine, executed only once ++ */ ++int __init pfm_init(void) ++{ ++ PFM_LOG("version %u.%u", PFM_VERSION_MAJ, PFM_VERSION_MIN); ++ ++ if (pfm_init_ctx()) ++ goto error_disable; ++ ++ ++ if (pfm_init_sets()) ++ goto error_disable; ++ ++ if (pfm_init_fs()) ++ goto error_disable; ++ ++ if (pfm_init_sysfs()) ++ goto error_disable; ++ ++ /* not critical, so no error checking */ ++ pfm_init_debugfs(); ++ ++ /* ++ * one time, arch-specific global initialization ++ */ ++ if (pfm_arch_init()) ++ goto error_disable; ++ ++ if (pfm_init_hotplug()) ++ goto error_disable; ++ return 0; ++ ++error_disable: ++ PFM_ERR("perfmon is disabled due to initialization error"); ++ perfmon_disabled = 1; ++ return -1; ++} ++ ++/* ++ * must use subsys_initcall() to ensure that the perfmon2 core ++ * is initialized before any PMU description module when they are ++ * compiled in. ++ */ ++subsys_initcall(pfm_init); +diff --git a/perfmon/perfmon_intr.c b/perfmon/perfmon_intr.c +new file mode 100644 +index 0000000..c5e3cda +--- /dev/null ++++ b/perfmon/perfmon_intr.c +@@ -0,0 +1,648 @@ ++/* ++ * perfmon_intr.c: perfmon2 interrupt handling ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp.
++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++/** ++ * pfm_intr_process_64bit_ovfls - handle 64-bit counter emulation ++ * @ctx: context to operate on ++ * @set: set to operate on ++ * ++ * The function returns the number of 64-bit overflows detected. 
++ * ++ * 64-bit software pmds are updated for overflowed pmd registers ++ * the set->reset_pmds is updated to the list of pmds to reset ++ * ++ * In any case, set->npend_ovfls is cleared ++ */ ++static u16 pfm_intr_process_64bit_ovfls(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ u32 *ovfl_ctrl) ++{ ++ u16 i, num_ovfls, max_pmd, max_intr; ++ u16 num_64b_ovfls, has_ovfl_sw, must_switch; ++ u64 ovfl_thres, old_val, new_val, ovfl_mask; ++ ++ num_64b_ovfls = must_switch = 0; ++ ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ max_pmd = ctx->regs.max_pmd; ++ max_intr = ctx->regs.max_intr_pmd; ++ ++ num_ovfls = set->npend_ovfls; ++ has_ovfl_sw = set->flags & PFM_SETFL_OVFL_SWITCH; ++ ++ bitmap_zero(cast_ulp(set->reset_pmds), max_pmd); ++ ++ for (i = ctx->regs.first_intr_pmd; num_ovfls; i++) { ++ /* ++ * skip pmd which did not overflow ++ */ ++ if (!test_bit(i, cast_ulp(set->povfl_pmds))) ++ continue; ++ ++ num_ovfls--; ++ ++ /* ++ * Update software value for counters ONLY ++ * ++ * Note that the pmd is not necessarily 0 at this point as ++ * qualified events may have happened before the PMU was ++ * frozen. The residual count is not taken into consideration ++ * here but will be with any read of the pmd ++ */ ++ ovfl_thres = set->pmds[i].ovflsw_thres; ++ ++ if (likely(test_bit(i, cast_ulp(ctx->regs.cnt_pmds)))) { ++ old_val = new_val = set->pmds[i].value; ++ new_val += 1 + ovfl_mask; ++ set->pmds[i].value = new_val; ++ } else { ++ /* ++ * for non counters which interrupt, e.g., AMD IBS, ++ * we consider this equivalent to a 64-bit counter ++ * overflow. 
++ */ ++ old_val = 1; new_val = 0; ++ } ++ ++ /* ++ * check for 64-bit overflow condition ++ */ ++ if (likely(old_val > new_val)) { ++ num_64b_ovfls++; ++ if (has_ovfl_sw && ovfl_thres > 0) { ++ if (ovfl_thres == 1) ++ must_switch = 1; ++ set->pmds[i].ovflsw_thres = ovfl_thres - 1; ++ } ++ ++ /* ++ * what to reset because of this overflow ++ * - the overflowed register ++ * - its reset_pmds ++ */ ++ __set_bit(i, cast_ulp(set->reset_pmds)); ++ ++ bitmap_or(cast_ulp(set->reset_pmds), ++ cast_ulp(set->reset_pmds), ++ cast_ulp(set->pmds[i].reset_pmds), ++ max_pmd); ++ } else { ++ /* ++ * only keep track of 64-bit overflows or ++ * assimilated ++ */ ++ __clear_bit(i, cast_ulp(set->povfl_pmds)); ++ ++ /* ++ * on some PMU, it may be necessary to re-arm the PMD ++ */ ++ pfm_arch_ovfl_reset_pmd(ctx, i); ++ } ++ ++ PFM_DBG_ovfl("ovfl=%s pmd%u new=0x%llx old=0x%llx " ++ "hw_pmd=0x%llx o_pmds=0x%llx must_switch=%u " ++ "o_thres=%llu o_thres_ref=%llu", ++ old_val > new_val ? "64-bit" : "HW", ++ i, ++ (unsigned long long)new_val, ++ (unsigned long long)old_val, ++ (unsigned long long)pfm_read_pmd(ctx, i), ++ (unsigned long long)set->povfl_pmds[0], ++ must_switch, ++ (unsigned long long)set->pmds[i].ovflsw_thres, ++ (unsigned long long)set->pmds[i].ovflsw_ref_thres); ++ } ++ /* ++ * update public bitmask of 64-bit overflowed pmds ++ */ ++ if (num_64b_ovfls) ++ bitmap_copy(cast_ulp(set->ovfl_pmds), cast_ulp(set->povfl_pmds), ++ max_intr); ++ ++ if (must_switch) ++ *ovfl_ctrl |= PFM_OVFL_CTRL_SWITCH; ++ ++ /* ++ * mark the overflows as consumed ++ */ ++ set->npend_ovfls = 0; ++ bitmap_zero(cast_ulp(set->povfl_pmds), max_intr); ++ ++ return num_64b_ovfls; ++} ++ ++/** ++ * pfm_intr_get_smpl_pmds_values - copy 64-bit pmd values for sampling format ++ * @ctx: context to work on ++ * @set: current event set ++ * @arg: overflow arg to be passed to format ++ * @smpl_pmds: list of PMDs of interest for the overflowed register ++ * ++ * build an array of 64-bit PMD values based on
smpl_pmds. Values are ++ * stored in increasing order of the PMD indexes ++ */ ++static void pfm_intr_get_smpl_pmds_values(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfm_ovfl_arg *arg, ++ u64 *smpl_pmds) ++{ ++ u16 j, k, max_pmd; ++ u64 new_val, ovfl_mask; ++ u64 *cnt_pmds; ++ ++ cnt_pmds = ctx->regs.cnt_pmds; ++ max_pmd = ctx->regs.max_pmd; ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ ++ for (j = k = 0; j < max_pmd; j++) { ++ ++ if (!test_bit(j, cast_ulp(smpl_pmds))) ++ continue; ++ ++ new_val = pfm_read_pmd(ctx, j); ++ ++ /* for counters, build 64-bit value */ ++ if (test_bit(j, cast_ulp(cnt_pmds))) ++ new_val = (set->pmds[j].value & ~ovfl_mask) ++ | (new_val & ovfl_mask); ++ ++ arg->smpl_pmds_values[k++] = new_val; ++ ++ PFM_DBG_ovfl("s_pmd_val[%u]=pmd%u=0x%llx", k, j, ++ (unsigned long long)new_val); ++ } ++ arg->num_smpl_pmds = k; ++} ++ ++/** ++ * pfm_intr_process_smpl_fmt -- handle sampling format callback ++ * @ctx: context to work on ++ * @set: current event set ++ * @ip: interrupted instruction pointer ++ * @now: timestamp ++ * @num_ovfls: number of 64-bit overflows ++ * @ovfl_ctrl: set of controls for interrupt handler tail processing ++ * @regs: register state ++ * ++ * Prepare argument (ovfl_arg) to be passed to sampling format callback, then ++ * invoke the callback (fmt_handler) ++ */ ++static int pfm_intr_process_smpl_fmt(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ unsigned long ip, ++ u64 now, ++ u64 num_ovfls, ++ u32 *ovfl_ctrl, ++ struct pt_regs *regs) ++{ ++ struct pfm_ovfl_arg *ovfl_arg; ++ u64 start_cycles, end_cycles; ++ u16 i, max_pmd; ++ int ret = 0; ++ ++ ovfl_arg = &ctx->ovfl_arg; ++ ++ ovfl_arg->active_set = set->id; ++ max_pmd = ctx->regs.max_pmd; ++ ++ /* ++ * first_intr_pmd: first PMD which can generate PMU interrupts ++ */ ++ for (i = ctx->regs.first_intr_pmd; num_ovfls; i++) { ++ /* ++ * skip pmd which did not have 64-bit overflows ++ */ ++ if (!test_bit(i, cast_ulp(set->ovfl_pmds))) ++ continue; ++ 
++ num_ovfls--; ++ ++ /* ++ * prepare argument to fmt_handler ++ */ ++ ovfl_arg->ovfl_pmd = i; ++ ovfl_arg->ovfl_ctrl = 0; ++ ++ ovfl_arg->pmd_last_reset = set->pmds[i].lval; ++ ovfl_arg->pmd_eventid = set->pmds[i].eventid; ++ ovfl_arg->num_smpl_pmds = 0; ++ ++ /* ++ * copy values of pmds of interest, if any ++ * Sampling format may use them ++ * We do not initialize the unused smpl_pmds_values ++ */ ++ if (!bitmap_empty(cast_ulp(set->pmds[i].smpl_pmds), max_pmd)) ++ pfm_intr_get_smpl_pmds_values(ctx, set, ovfl_arg, ++ set->pmds[i].smpl_pmds); ++ ++ pfm_stats_inc(fmt_handler_calls); ++ ++ /* ++ * call format record (handler) routine ++ */ ++ start_cycles = sched_clock(); ++ ret = (*ctx->smpl_fmt->fmt_handler)(ctx, ip, now, regs); ++ end_cycles = sched_clock(); ++ ++ /* ++ * The reset_pmds mask is constructed automatically ++ * on overflow. When the actual reset takes place ++ * depends on the masking, switch and notification ++ * status. It may be deferred until pfm_restart(). ++ */ ++ *ovfl_ctrl |= ovfl_arg->ovfl_ctrl; ++ ++ pfm_stats_add(fmt_handler_ns, end_cycles - start_cycles); ++ } ++ /* ++ * when the format cannot handle the rest of the overflow, we abort ++ */ ++ if (ret) ++ PFM_DBG_ovfl("handler aborted at PMD%u ret=%d", i, ret); ++ return ret; ++} ++/** ++ * pfm_overflow_handler - main overflow processing routine. ++ * @ctx: context to work on (always current context) ++ * @set: current event set ++ * @ip: interrupt instruction pointer ++ * @regs: machine state ++ * ++ * set->num_ovfl_pmds is 0 when returning from this function even though ++ * set->ovfl_pmds[] may have bits set. When leaving set->num_ovfl_pmds ++ * must never be used to determine if there was a pending overflow. 
++ */ ++static void pfm_overflow_handler(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ unsigned long ip, ++ struct pt_regs *regs) ++{ ++ struct pfm_event_set *set_orig; ++ u64 now; ++ u32 ovfl_ctrl; ++ u16 max_intr, max_pmd; ++ u16 num_ovfls; ++ int ret, has_notify; ++ ++ /* ++ * take timestamp ++ */ ++ now = sched_clock(); ++ ++ max_pmd = ctx->regs.max_pmd; ++ max_intr = ctx->regs.max_intr_pmd; ++ ++ set_orig = set; ++ ovfl_ctrl = 0; ++ ++ /* ++ * skip ZOMBIE case ++ */ ++ if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) ++ goto stop_monitoring; ++ ++ PFM_DBG_ovfl("intr_pmds=0x%llx npend=%u ip=%p, blocking=%d " ++ "u_pmds=0x%llx use_fmt=%u", ++ (unsigned long long)set->povfl_pmds[0], ++ set->npend_ovfls, ++ (void *)ip, ++ ctx->flags.block, ++ (unsigned long long)set->used_pmds[0], ++ !!ctx->smpl_fmt); ++ ++ /* ++ * return number of 64-bit overflows ++ */ ++ num_ovfls = pfm_intr_process_64bit_ovfls(ctx, set, &ovfl_ctrl); ++ ++ /* ++ * there were no 64-bit overflows ++ * nothing else to do ++ */ ++ if (!num_ovfls) ++ return; ++ ++ /* ++ * tmp_ovfl_notify = ovfl_pmds & ovfl_notify ++ * with: ++ * - ovfl_pmds: last 64-bit overflowed pmds ++ * - ovfl_notify: notify on overflow registers ++ */ ++ bitmap_and(cast_ulp(ctx->tmp_ovfl_notify), ++ cast_ulp(set->ovfl_pmds), ++ cast_ulp(set->ovfl_notify), ++ max_intr); ++ ++ has_notify = !bitmap_empty(cast_ulp(ctx->tmp_ovfl_notify), max_intr); ++ ++ /* ++ * check for sampling format and invoke fmt_handler ++ */ ++ if (likely(ctx->smpl_fmt)) { ++ pfm_intr_process_smpl_fmt(ctx, set, ip, now, num_ovfls, ++ &ovfl_ctrl, regs); ++ } else { ++ /* ++ * When no sampling format is used, the default ++ * is: ++ * - mask monitoring if not switching ++ * - notify user if requested ++ * ++ * If notification is not requested, monitoring is masked ++ * and overflowed registers are not reset (saturation). ++ * This mimics the behavior of the default sampling format. 
++ */ ++ ovfl_ctrl |= PFM_OVFL_CTRL_NOTIFY; ++ if (has_notify || !(ovfl_ctrl & PFM_OVFL_CTRL_SWITCH)) ++ ovfl_ctrl |= PFM_OVFL_CTRL_MASK; ++ } ++ ++ PFM_DBG_ovfl("set%u o_notify=0x%llx o_pmds=0x%llx " ++ "r_pmds=0x%llx ovfl_ctrl=0x%x", ++ set->id, ++ (unsigned long long)ctx->tmp_ovfl_notify[0], ++ (unsigned long long)set->ovfl_pmds[0], ++ (unsigned long long)set->reset_pmds[0], ++ ovfl_ctrl); ++ ++ /* ++ * execute the various controls ++ * ORDER MATTERS ++ */ ++ ++ ++ /* ++ * mask monitoring ++ */ ++ if (ovfl_ctrl & PFM_OVFL_CTRL_MASK) { ++ pfm_mask_monitoring(ctx, set); ++ /* ++ * when masking, reset is deferred until ++ * pfm_restart() ++ */ ++ ovfl_ctrl &= ~PFM_OVFL_CTRL_RESET; ++ ++ /* ++ * when masking, switching is deferred until ++ * pfm_restart and we need to remember it ++ */ ++ if (ovfl_ctrl & PFM_OVFL_CTRL_SWITCH) { ++ set->priv_flags |= PFM_SETFL_PRIV_SWITCH; ++ ovfl_ctrl &= ~PFM_OVFL_CTRL_SWITCH; ++ } ++ } ++ ++ /* ++ * switch event set ++ */ ++ if (ovfl_ctrl & PFM_OVFL_CTRL_SWITCH) { ++ pfm_switch_sets_from_intr(ctx); ++ /* update view of active set */ ++ set = ctx->active_set; ++ } ++ /* ++ * send overflow notification ++ * ++ * only necessary if at least one overflowed ++ * register had the notify flag set ++ */ ++ if (has_notify && (ovfl_ctrl & PFM_OVFL_CTRL_NOTIFY)) { ++ /* ++ * block on notify, not on masking ++ */ ++ if (ctx->flags.block) ++ pfm_post_work(current, ctx, PFM_WORK_BLOCK); ++ ++ /* ++ * send notification and passed original set id ++ * if error, queue full, for instance, then default ++ * to masking monitoring, i.e., saturate ++ */ ++ ret = pfm_ovfl_notify(ctx, set_orig, ip); ++ if (unlikely(ret)) { ++ if (ctx->state == PFM_CTX_LOADED) { ++ pfm_mask_monitoring(ctx, set); ++ ovfl_ctrl &= ~PFM_OVFL_CTRL_RESET; ++ } ++ } else { ++ ctx->flags.can_restart++; ++ PFM_DBG_ovfl("can_restart=%u", ctx->flags.can_restart); ++ } ++ } ++ ++ /* ++ * reset overflowed registers ++ */ ++ if (ovfl_ctrl & PFM_OVFL_CTRL_RESET) { ++ u16 nn; ++ nn = 
bitmap_weight(cast_ulp(set->reset_pmds), max_pmd); ++ if (nn) ++ pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_SHORT); ++ } ++ return; ++ ++stop_monitoring: ++ /* ++ * Does not happen for a system-wide context nor for a ++ * self-monitored context. We cannot attach to kernel-only ++ * thread, thus it is safe to set TIF bits, i.e., the thread ++ * will eventually leave the kernel or die and either we will ++ * catch the context and clean it up in pfm_handler_work() or ++ * pfm_exit_thread(). ++ * ++ * Mask until we get to pfm_handle_work() ++ */ ++ pfm_mask_monitoring(ctx, set); ++ ++ PFM_DBG_ovfl("ctx is zombie, converted to spurious"); ++ pfm_post_work(current, ctx, PFM_WORK_ZOMBIE); ++} ++ ++/** ++ * __pfm_interrupt_handler - 1st level interrupt handler ++ * @ip: interrupted instruction pointer ++ * @regs: machine state ++ * ++ * Function is static because we use a wrapper to easily capture timing infos. ++ * ++ * ++ * Context locking necessary to avoid concurrent accesses from other CPUs ++ * - For per-thread, we must prevent pfm_restart() which works when ++ * context is LOADED or MASKED ++ */ ++static void __pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs) ++{ ++ struct task_struct *task; ++ struct pfm_context *ctx; ++ struct pfm_event_set *set; ++ ++ ++ task = __get_cpu_var(pmu_owner); ++ ctx = __get_cpu_var(pmu_ctx); ++ ++ /* ++ * verify if there is a context on this CPU ++ */ ++ if (unlikely(ctx == NULL)) { ++ PFM_DBG_ovfl("no ctx"); ++ goto spurious; ++ } ++ ++ /* ++ * we need to lock context because it could be accessed ++ * from another CPU. Depending on the priority level of ++ * the PMU interrupt or the arch, it may be necessary to ++ * mask interrupts altogether to avoid race condition with ++ * the timer interrupt in case of time-based set switching, ++ * for instance. ++ */ ++ spin_lock(&ctx->lock); ++ ++ set = ctx->active_set; ++ ++ /* ++ * For SMP per-thread, it is not possible to have ++ * owner != NULL && task != current.
++ * ++ * For UP per-thread, because of lazy save, it ++ * is possible to receive an interrupt in another task ++ * which is not using the PMU. This means ++ * that the interrupt was in-flight at the ++ * time of pfm_ctxswout_thread(). In that ++ * case, it will be replayed when the task ++ * is scheduled again. Hence we convert to spurious. ++ * ++ * The basic rule is that an overflow is always ++ * processed in the context of the task that ++ * generated it for all per-thread contexts. ++ * ++ * for system-wide, task is always NULL ++ */ ++#ifndef CONFIG_SMP ++ if (unlikely((task && current->pfm_context != ctx))) { ++ PFM_DBG_ovfl("spurious: not owned by current task"); ++ goto spurious; ++ } ++#endif ++ if (unlikely(ctx->state == PFM_CTX_MASKED)) { ++ PFM_DBG_ovfl("spurious: monitoring masked"); ++ goto spurious; ++ } ++ ++ /* ++ * check that monitoring is active, otherwise convert ++ * to spurious ++ */ ++ if (unlikely(!pfm_arch_is_active(ctx))) { ++ PFM_DBG_ovfl("spurious: monitoring non active"); ++ goto spurious; ++ } ++ ++ /* ++ * freeze PMU and collect overflowed PMD registers ++ * into set->povfl_pmds.
Number of overflowed PMDs ++ * reported in set->npend_ovfls ++ */ ++ pfm_arch_intr_freeze_pmu(ctx, set); ++ ++ /* ++ * no overflow detected, interrupt may have come ++ * from the previous thread running on this CPU ++ */ ++ if (unlikely(!set->npend_ovfls)) { ++ PFM_DBG_ovfl("no npend_ovfls"); ++ goto spurious; ++ } ++ ++ pfm_stats_inc(ovfl_intr_regular_count); ++ ++ /* ++ * invoke actual handler ++ */ ++ pfm_overflow_handler(ctx, set, ip, regs); ++ ++ /* ++ * unfreeze PMU, monitoring may not actually be restarted ++ * if context is MASKED ++ */ ++ pfm_arch_intr_unfreeze_pmu(ctx); ++ ++ spin_unlock(&ctx->lock); ++ ++ return; ++ ++spurious: ++ /* ctx may be NULL */ ++ pfm_arch_intr_unfreeze_pmu(ctx); ++ if (ctx) ++ spin_unlock(&ctx->lock); ++ ++ pfm_stats_inc(ovfl_intr_spurious_count); ++} ++ ++ ++/** ++ * pfm_interrupt_handler - 1st level interrupt handler ++ * @ip: interrupt instruction pointer ++ * @regs: machine state ++ * ++ * Function called from the low-level assembly code or arch-specific perfmon ++ * code. Simple wrapper used for timing purpose. Actual work done in ++ * __pfm_interrupt_handler() ++ */ ++void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs) ++{ ++ u64 start; ++ ++ pfm_stats_inc(ovfl_intr_all_count); ++ ++ BUG_ON(!irqs_disabled()); ++ ++ start = sched_clock(); ++ ++ __pfm_interrupt_handler(ip, regs); ++ ++ pfm_stats_add(ovfl_intr_ns, sched_clock() - start); ++} ++EXPORT_SYMBOL(pfm_interrupt_handler); ++ +diff --git a/perfmon/perfmon_msg.c b/perfmon/perfmon_msg.c +new file mode 100644 +index 0000000..b8a1e4c +--- /dev/null ++++ b/perfmon/perfmon_msg.c +@@ -0,0 +1,229 @@ ++/* ++ * perfmon_msg.c: perfmon2 notification message queue management ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp.
++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/poll.h> ++#include <linux/perfmon_kern.h> ++ ++/** ++ * pfm_get_new_msg - get a new message slot from the queue ++ * @ctx: context to operate on ++ * ++ * if the queue is full, NULL is returned ++ */ ++static union pfarg_msg *pfm_get_new_msg(struct pfm_context *ctx) ++{ ++ int next; ++ ++ next = ctx->msgq_head & PFM_MSGQ_MASK; ++ ++ if ((ctx->msgq_head - ctx->msgq_tail) == PFM_MSGS_COUNT) ++ return NULL; ++ ++ /* ++ * move to next possible slot ++ */ ++ ctx->msgq_head++; ++ ++ PFM_DBG_ovfl("head=%d tail=%d msg=%d", ++ ctx->msgq_head & PFM_MSGQ_MASK, ++ ctx->msgq_tail & PFM_MSGQ_MASK, ++ next); ++ ++ return ctx->msgq+next; ++} ++ ++/** ++ * pfm_notify_user - wakeup any thread waiting on msg queue, post SIGIO ++ * @ctx: context to operate on ++ * ++ * message is already
enqueued ++ */ ++static void pfm_notify_user(struct pfm_context *ctx) ++{ ++ if (ctx->state == PFM_CTX_ZOMBIE) { ++ PFM_DBG("no notification, context is zombie"); ++ return; ++ } ++ ++ PFM_DBG_ovfl("waking up"); ++ ++ wake_up_interruptible(&ctx->msgq_wait); ++ ++ /* ++ * it is safe to call kill_fasync() from an interrupt ++ * handler. kill_fasync() grabs two RW locks (fasync_lock, ++ * tasklist_lock) in read mode. There is conflict only in ++ * case the PMU interrupt occurs during a write mode critical ++ * section. This cannot happen because for both locks, the ++ * write mode is always using interrupt masking (write_lock_irq). ++ */ ++ kill_fasync(&ctx->async_queue, SIGIO, POLL_IN); ++} ++ ++/** ++ * pfm_ovfl_notify - send overflow notification ++ * @ctx: context to operate on ++ * @set: which set the overflow comes from ++ * @ip: overflow interrupt instruction address (IIP) ++ * ++ * Appends an overflow notification message to context queue. ++ * calls pfm_notify_user() to wakeup any threads and/or send a signal ++ * ++ * Context is locked and interrupts are disabled (no preemption). ++ */ ++int pfm_ovfl_notify(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ unsigned long ip) ++{ ++ union pfarg_msg *msg = NULL; ++ u64 *ovfl_pmds; ++ ++ if (!ctx->flags.no_msg) { ++ msg = pfm_get_new_msg(ctx); ++ if (msg == NULL) { ++ /* ++ * when message queue fills up it is because the user ++ * did not extract the message, yet issued ++ * pfm_restart(). At this point, we stop sending ++ * notification, thus the user will not be able to get ++ * new samples when using the default format.
++ */ ++ PFM_DBG_ovfl("no more notification msgs"); ++ return -1; ++ } ++ ++ msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL; ++ msg->pfm_ovfl_msg.msg_ovfl_pid = current->pid; ++ msg->pfm_ovfl_msg.msg_active_set = set->id; ++ ++ ovfl_pmds = msg->pfm_ovfl_msg.msg_ovfl_pmds; ++ ++ /* ++ * copy bitmask of all pmd that interrupted last ++ */ ++ bitmap_copy(cast_ulp(ovfl_pmds), cast_ulp(set->ovfl_pmds), ++ ctx->regs.max_intr_pmd); ++ /* NOTE(review): pid field is set from current->pid, tid field from current->tgid; confirm these are not swapped */ ++ msg->pfm_ovfl_msg.msg_ovfl_cpu = smp_processor_id(); ++ msg->pfm_ovfl_msg.msg_ovfl_tid = current->tgid; ++ msg->pfm_ovfl_msg.msg_ovfl_ip = ip; ++ ++ pfm_stats_inc(ovfl_notify_count); ++ } ++ ++ PFM_DBG_ovfl("ip=0x%lx o_pmds=0x%llx", ++ ip, ++ (unsigned long long)set->ovfl_pmds[0]); ++ ++ pfm_notify_user(ctx); ++ return 0; ++} ++ ++/** ++ * pfm_end_notify - notify of thread termination ++ * @ctx: context to operate on ++ * ++ * In per-thread mode, when not self-monitoring, perfmon ++ * sends a 'end' notification message when the monitored ++ * thread where the context is attached is exiting. ++ * ++ * This helper message alleviates the need to track the activity ++ * of the thread/process when it is not directly related, i.e., ++ * was attached. In other words, no need to keep the thread ++ * ptraced. ++ * ++ * The context must be locked and interrupts disabled.
++ */ ++int pfm_end_notify(struct pfm_context *ctx) ++{ ++ union pfarg_msg *msg; ++ ++ msg = pfm_get_new_msg(ctx); ++ if (msg == NULL) { ++ PFM_ERR("%s no more msgs", __func__); ++ return -1; ++ } ++ /* zero the whole slot first: no stale data leaks */ ++ memset(msg, 0, sizeof(*msg)); ++ ++ msg->type = PFM_MSG_END; ++ ++ PFM_DBG("end msg: msg=%p no_msg=%d", ++ msg, ++ ctx->flags.no_msg); ++ ++ pfm_notify_user(ctx); ++ return 0; ++} ++ ++/** ++ * pfm_get_next_msg - copy the oldest message from the queue and move tail ++ * @ctx: context to use ++ * @m: where to copy the message into ++ * ++ * The tail of the queue is moved as a consequence of this call. No emptiness check is done here. ++ */ ++void pfm_get_next_msg(struct pfm_context *ctx, union pfarg_msg *m) ++{ ++ union pfarg_msg *next; ++ ++ PFM_DBG_ovfl("in head=%d tail=%d", ++ ctx->msgq_head & PFM_MSGQ_MASK, ++ ctx->msgq_tail & PFM_MSGQ_MASK); ++ ++ /* ++ * get oldest message ++ */ ++ next = ctx->msgq + (ctx->msgq_tail & PFM_MSGQ_MASK); ++ ++ /* ++ * move tail forward ++ */ ++ ctx->msgq_tail++; ++ ++ /* ++ * copy message, we cannot simply point to it ++ * as it may be re-used before we copy it out ++ */ ++ *m = *next; ++ ++ PFM_DBG_ovfl("out head=%d tail=%d type=%d", ++ ctx->msgq_head & PFM_MSGQ_MASK, ++ ctx->msgq_tail & PFM_MSGQ_MASK, ++ m->type); ++} +diff --git a/perfmon/perfmon_pmu.c b/perfmon/perfmon_pmu.c +new file mode 100644 +index 0000000..df7a9c9 +--- /dev/null ++++ b/perfmon/perfmon_pmu.c +@@ -0,0 +1,590 @@ ++/* ++ * perfmon_pmu.c: perfmon2 PMU configuration management ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co.
++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++#ifndef CONFIG_MODULE_UNLOAD ++#define module_refcount(n) 1 ++#endif ++ ++static __cacheline_aligned_in_smp int request_mod_in_progress; ++static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_conf_lock); ++ ++static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_acq_lock); ++static u32 pfm_pmu_acquired; ++ ++/* ++ * perfmon core must access PMU information ONLY through pfm_pmu_conf ++ * if pfm_pmu_conf is NULL, then no description is registered ++ */ ++struct pfm_pmu_config *pfm_pmu_conf; ++EXPORT_SYMBOL(pfm_pmu_conf); ++ ++static inline int pmu_is_module(struct pfm_pmu_config *c) ++{ ++ return !(c->flags & PFM_PMUFL_IS_BUILTIN); ++} ++/** ++ * pfm_pmu_regdesc_init -- initialize regdesc structure from PMU table ++ * @regs: the regdesc structure to initialize ++ * @excl_type: the register type(s) to exclude from this regdesc ++ * @unavail_pmcs: unavailable PMC registers ++ * @unavail_pmds: unavailable PMD registers ++ * ++ * Return:
++ * 0 success ++ * errno in case of error ++ */ ++static int pfm_pmu_regdesc_init(struct pfm_regdesc *regs, int excl_type, ++ u64 *unavail_pmcs, u64 *unavail_pmds) ++{ ++ struct pfm_regmap_desc *d; ++ u16 n, n2, n_counters, i; ++ int first_intr_pmd = -1, max1, max2, max3; ++ ++ /* ++ * compute the number of implemented PMC from the ++ * description table ++ */ ++ n = 0; ++ max1 = max2 = -1; ++ d = pfm_pmu_conf->pmc_desc; ++ for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { ++ if (!(d->type & PFM_REG_I)) ++ continue; ++ ++ if (test_bit(i, cast_ulp(unavail_pmcs))) ++ continue; ++ ++ if (d->type & excl_type) ++ continue; ++ ++ __set_bit(i, cast_ulp(regs->pmcs)); ++ ++ max1 = i; ++ n++; ++ } ++ ++ if (!n) { ++ PFM_INFO("%s PMU description has no PMC registers", ++ pfm_pmu_conf->pmu_name); ++ return -EINVAL; ++ } ++ ++ regs->max_pmc = max1 + 1; ++ regs->num_pmcs = n; ++ ++ n = n_counters = n2 = 0; ++ max1 = max2 = max3 = -1; ++ d = pfm_pmu_conf->pmd_desc; ++ for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) { ++ if (!(d->type & PFM_REG_I)) ++ continue; ++ ++ if (test_bit(i, cast_ulp(unavail_pmds))) ++ continue; ++ ++ if (d->type & excl_type) ++ continue; ++ ++ __set_bit(i, cast_ulp(regs->pmds)); ++ max1 = i; ++ n++; ++ ++ /* ++ * read-write registers ++ */ ++ if (!(d->type & PFM_REG_RO)) { ++ __set_bit(i, cast_ulp(regs->rw_pmds)); ++ max3 = i; ++ n2++; ++ } ++ ++ /* ++ * counter registers ++ */ ++ if (d->type & PFM_REG_C64) { ++ __set_bit(i, cast_ulp(regs->cnt_pmds)); ++ n_counters++; ++ } ++ ++ /* ++ * PMD with intr capabilities ++ */ ++ if (d->type & PFM_REG_INTR) { ++ __set_bit(i, cast_ulp(regs->intr_pmds)); ++ if (first_intr_pmd == -1) ++ first_intr_pmd = i; ++ max2 = i; ++ } ++ } ++ ++ if (!n) { ++ PFM_INFO("%s PMU description has no PMD registers", ++ pfm_pmu_conf->pmu_name); ++ return -EINVAL; ++ } ++ ++ regs->max_pmd = max1 + 1; ++ regs->first_intr_pmd = first_intr_pmd; ++ regs->max_intr_pmd = max2 + 1; ++ ++ regs->num_counters = n_counters; ++ 
regs->num_pmds = n; ++ regs->max_rw_pmd = max3 + 1; ++ regs->num_rw_pmd = n2; ++ ++ return 0; ++} ++ ++/** ++ * pfm_pmu_regdesc_init_all -- initialize all regdesc structures ++ * @una_pmcs : unavailable PMC registers ++ * @una_pmds : unavailable PMD registers ++ * ++ * Return: ++ * 0 sucess ++ * errno if error ++ * ++ * We maintain 3 regdesc: ++ * regs_all: all available registers ++ * regs_sys: registers available to system-wide contexts only ++ * regs_thr: registers available to per-thread contexts only ++ */ ++static int pfm_pmu_regdesc_init_all(u64 *una_pmcs, u64 *una_pmds) ++{ ++ int ret; ++ ++ memset(&pfm_pmu_conf->regs_all, 0, sizeof(struct pfm_regdesc)); ++ memset(&pfm_pmu_conf->regs_thr, 0, sizeof(struct pfm_regdesc)); ++ memset(&pfm_pmu_conf->regs_sys, 0, sizeof(struct pfm_regdesc)); ++ ++ ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_all, ++ 0, ++ una_pmcs, una_pmds); ++ if (ret) ++ return ret; ++ ++ PFM_DBG("regs_all.pmcs=0x%llx", ++ (unsigned long long)pfm_pmu_conf->regs_all.pmcs[0]); ++ ++ ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_thr, ++ PFM_REG_SYS, ++ una_pmcs, una_pmds); ++ if (ret) ++ return ret; ++ PFM_DBG("regs.thr.pmcs=0x%llx", ++ (unsigned long long)pfm_pmu_conf->regs_thr.pmcs[0]); ++ ++ ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_sys, ++ PFM_REG_THR, ++ una_pmcs, una_pmds); ++ ++ PFM_DBG("regs_sys.pmcs=0x%llx", ++ (unsigned long long)pfm_pmu_conf->regs_sys.pmcs[0]); ++ ++ return ret; ++} ++ ++int pfm_pmu_register(struct pfm_pmu_config *cfg) ++{ ++ u16 i, nspec, nspec_ro, num_pmcs, num_pmds, num_wc = 0; ++ int type, ret = -EBUSY; ++ ++ if (perfmon_disabled) { ++ PFM_INFO("perfmon disabled, cannot add PMU description"); ++ return -ENOSYS; ++ } ++ ++ nspec = nspec_ro = num_pmds = num_pmcs = 0; ++ ++ /* some sanity checks */ ++ if (cfg == NULL || cfg->pmu_name == NULL) { ++ PFM_INFO("PMU config descriptor is invalid"); ++ return -EINVAL; ++ } ++ ++ /* must have a probe */ ++ if (cfg->probe_pmu == NULL) { ++ PFM_INFO("PMU config has no 
probe routine"); ++ return -EINVAL; ++ } ++ ++ /* ++ * execute probe routine before anything else as it ++ * may update configuration tables ++ */ ++ if ((*cfg->probe_pmu)() == -1) { ++ PFM_INFO("%s PMU detection failed", cfg->pmu_name); ++ return -EINVAL; ++ } ++ ++ if (!(cfg->flags & PFM_PMUFL_IS_BUILTIN) && cfg->owner == NULL) { ++ PFM_INFO("PMU config %s is missing owner", cfg->pmu_name); ++ return -EINVAL; ++ } ++ ++ if (!cfg->num_pmd_entries) { ++ PFM_INFO("%s needs to define num_pmd_entries", cfg->pmu_name); ++ return -EINVAL; ++ } ++ ++ if (!cfg->num_pmc_entries) { ++ PFM_INFO("%s needs to define num_pmc_entries", cfg->pmu_name); ++ return -EINVAL; ++ } ++ ++ if (!cfg->counter_width) { ++ PFM_INFO("PMU config %s, zero width counters", cfg->pmu_name); ++ return -EINVAL; ++ } ++ ++ /* ++ * REG_RO, REG_V not supported on PMC registers ++ */ ++ for (i = 0; i < cfg->num_pmc_entries; i++) { ++ ++ type = cfg->pmc_desc[i].type; ++ ++ if (type & PFM_REG_I) ++ num_pmcs++; ++ ++ if (type & PFM_REG_WC) ++ num_wc++; ++ ++ if (type & PFM_REG_V) { ++ PFM_INFO("PFM_REG_V is not supported on " ++ "PMCs (PMC%d)", i); ++ return -EINVAL; ++ } ++ if (type & PFM_REG_RO) { ++ PFM_INFO("PFM_REG_RO meaningless on " ++ "PMCs (PMC%u)", i); ++ return -EINVAL; ++ } ++ } ++ ++ if (num_wc && cfg->pmc_write_check == NULL) { ++ PFM_INFO("some PMCs have write-checker but no callback provided\n"); ++ return -EINVAL; ++ } ++ ++ /* ++ * check virtual PMD registers ++ */ ++ num_wc = 0; ++ for (i = 0; i < cfg->num_pmd_entries; i++) { ++ ++ type = cfg->pmd_desc[i].type; ++ ++ if (type & PFM_REG_I) ++ num_pmds++; ++ ++ if (type & PFM_REG_V) { ++ nspec++; ++ if (type & PFM_REG_RO) ++ nspec_ro++; ++ } ++ ++ if (type & PFM_REG_WC) ++ num_wc++; ++ } ++ ++ if (num_wc && cfg->pmd_write_check == NULL) { ++ PFM_INFO("PMD have write-checker but no callback provided\n"); ++ return -EINVAL; ++ } ++ ++ if (nspec && cfg->pmd_sread == NULL) { ++ PFM_INFO("PMU config is missing pmd_sread()"); ++ return -EINVAL; 
++ } ++ ++ nspec = nspec - nspec_ro; ++ if (nspec && cfg->pmd_swrite == NULL) { ++ PFM_INFO("PMU config is missing pmd_swrite()"); ++ return -EINVAL; ++ } ++ ++ if (num_pmcs >= PFM_MAX_PMCS) { ++ PFM_INFO("%s PMCS registers exceed name space [0-%u]", ++ cfg->pmu_name, ++ PFM_MAX_PMCS); ++ return -EINVAL; ++ } ++ if (num_pmds >= PFM_MAX_PMDS) { ++ PFM_INFO("%s PMDS registers exceed name space [0-%u]", ++ cfg->pmu_name, ++ PFM_MAX_PMDS); ++ return -EINVAL; ++ } ++ spin_lock(&pfm_pmu_conf_lock); ++ ++ if (pfm_pmu_conf) ++ goto unlock; ++ ++ if (!cfg->version) ++ cfg->version = "0.0"; ++ ++ pfm_pmu_conf = cfg; ++ pfm_pmu_conf->ovfl_mask = (1ULL << cfg->counter_width) - 1; ++ ++ ret = pfm_arch_pmu_config_init(cfg); ++ if (ret) ++ goto unlock; ++ ++ ret = pfm_sysfs_add_pmu(pfm_pmu_conf); ++ if (ret) ++ pfm_pmu_conf = NULL; ++ ++unlock: ++ spin_unlock(&pfm_pmu_conf_lock); ++ ++ if (ret) { ++ PFM_INFO("register %s PMU error %d", cfg->pmu_name, ret); ++ } else { ++ PFM_INFO("%s PMU installed", cfg->pmu_name); ++ /* ++ * (re)initialize PMU on each PMU now that we have a description ++ */ ++ on_each_cpu(__pfm_init_percpu, cfg, 0); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(pfm_pmu_register); ++ ++/* ++ * remove PMU description. Caller must pass address of current ++ * configuration. This is mostly for sanity checking as only ++ * one config can exist at any time. ++ * ++ * We are using the module refcount mechanism to protect against ++ * removal while the configuration is being used. As long as there is ++ * one context, a PMU configuration cannot be removed. The protection is ++ * managed in module logic. 
++ */ ++void pfm_pmu_unregister(struct pfm_pmu_config *cfg) ++{ ++ if (!cfg || !pfm_pmu_conf) ++ return; ++ ++ spin_lock(&pfm_pmu_conf_lock); ++ ++ BUG_ON(module_refcount(pfm_pmu_conf->owner)); ++ ++ if (cfg->owner == pfm_pmu_conf->owner) { ++ pfm_sysfs_remove_pmu(pfm_pmu_conf); ++ pfm_pmu_conf = NULL; ++ } ++ ++ spin_unlock(&pfm_pmu_conf_lock); ++} ++EXPORT_SYMBOL(pfm_pmu_unregister); ++/* load the PMU description module named by the arch layer; -ENOSYS if the arch names none */ ++static int pfm_pmu_request_module(void) ++{ ++ char *mod_name; ++ int ret; ++ ++ mod_name = pfm_arch_get_pmu_module_name(); ++ if (mod_name == NULL) ++ return -ENOSYS; ++ ++ ret = request_module(mod_name); ++ ++ PFM_DBG("mod=%s ret=%d\n", mod_name, ret); ++ return ret; ++} ++ ++/* ++ * autoload: ++ * 0 : do not try to autoload the PMU description module ++ * not 0 : try to autoload the PMU description module ++ */ ++int pfm_pmu_conf_get(int autoload) ++{ ++ int ret; ++ ++ spin_lock(&pfm_pmu_conf_lock); ++ ++ if (request_mod_in_progress) { ++ ret = -ENOSYS; ++ goto skip; ++ } ++ ++ if (autoload && pfm_pmu_conf == NULL) { ++ ++ request_mod_in_progress = 1; ++ ++ spin_unlock(&pfm_pmu_conf_lock); ++ ++ pfm_pmu_request_module(); ++ ++ spin_lock(&pfm_pmu_conf_lock); ++ ++ request_mod_in_progress = 0; ++ ++ /* ++ * request_module() may succeed but the module ++ * may not have registered properly so we need ++ * to check ++ */ ++ } ++ ++ ret = pfm_pmu_conf == NULL ? -ENOSYS : 0; ++ if (!ret && pmu_is_module(pfm_pmu_conf) ++ && !try_module_get(pfm_pmu_conf->owner)) ++ ret = -ENOSYS; ++ ++skip: ++ spin_unlock(&pfm_pmu_conf_lock); ++ ++ return ret; ++} ++/* drop the module reference taken in pfm_pmu_conf_get(); nothing to do for a builtin description */ ++void pfm_pmu_conf_put(void) ++{ ++ if (pfm_pmu_conf == NULL || !pmu_is_module(pfm_pmu_conf)) ++ return; ++ ++ spin_lock(&pfm_pmu_conf_lock); ++ module_put(pfm_pmu_conf->owner); ++ spin_unlock(&pfm_pmu_conf_lock); ++} ++ ++ ++/* ++ * acquire PMU resource from lower-level PMU register allocator ++ * (currently perfctr-watchdog.c) ++ * ++ * acquisition is done when the first context is created (and not ++ * when it is loaded).
We grab all that is defined in the description ++ * module and then we make adjustments at the arch-specific level. ++ * ++ * The PMU resource is released when the last perfmon context is ++ * destroyed. ++ * ++ * interrupts are not masked ++ */ ++int pfm_pmu_acquire(struct pfm_context *ctx) ++{ ++ u64 unavail_pmcs[PFM_PMC_BV]; ++ u64 unavail_pmds[PFM_PMD_BV]; ++ int ret = 0; ++ ++ spin_lock(&pfm_pmu_acq_lock); ++ ++ PFM_DBG("pmu_acquired=%u", pfm_pmu_acquired); ++ ++ pfm_pmu_acquired++; ++ ++ /* ++ * we need to initialize regdesc each time we re-acquire ++ * the PMU for the first time as there may have been changes ++ * in the list of available registers, e.g., NMI may have ++ * been disabled. Checking on PMU module insert is not ++ * enough ++ */ ++ if (pfm_pmu_acquired == 1) { ++ memset(unavail_pmcs, 0, sizeof(unavail_pmcs)); ++ memset(unavail_pmds, 0, sizeof(unavail_pmds)); ++ ++ ret = pfm_arch_pmu_acquire(unavail_pmcs, unavail_pmds); ++ if (ret) { ++ pfm_pmu_acquired--; ++ } else { ++ pfm_pmu_regdesc_init_all(unavail_pmcs, unavail_pmds); ++ /* NOTE(review): return value of pfm_pmu_regdesc_init_all() is ignored here - confirm this is intended */ ++ /* available PMU resources */ ++ PFM_DBG("PMU acquired: %u PMCs, %u PMDs, %u counters", ++ pfm_pmu_conf->regs_all.num_pmcs, ++ pfm_pmu_conf->regs_all.num_pmds, ++ pfm_pmu_conf->regs_all.num_counters); ++ } ++ } ++ spin_unlock(&pfm_pmu_acq_lock); ++ ++ /* ++ * copy the regdesc that corresponds to the context ++ * we copy and not just point because it helps with ++ * memory locality. the regdesc structure is accessed ++ * very frequently in performance critical code such ++ * as context switch and interrupt handling. By using ++ * a local copy, we increase memory footprint, but ++ * increase chance to have local memory access, ++ * especially for system-wide contexts.
++ */ ++ if (ctx->flags.system) ++ ctx->regs = pfm_pmu_conf->regs_sys; ++ else ++ ctx->regs = pfm_pmu_conf->regs_thr; ++ ++ return ret; ++} ++ ++/* ++ * release the PMU resource ++ * ++ * actual release happens when last context is destroyed ++ * ++ * interrupts are not masked ++ */ ++void pfm_pmu_release(void) ++{ ++ BUG_ON(irqs_disabled()); ++ ++ /* ++ * we need to use a spinlock because release takes some time ++ * and we may have a race with pfm_pmu_acquire() ++ */ ++ spin_lock(&pfm_pmu_acq_lock); ++ ++ PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired); ++ ++ /* ++ * we decouple test and decrement because if we had errors ++ * in pfm_pmu_acquire(), we still come here on pfm_context_free() ++ * but with pfm_pmu_acquired == 0 ++ */ ++ if (pfm_pmu_acquired > 0 && --pfm_pmu_acquired == 0) { ++ pfm_arch_pmu_release(); ++ PFM_DBG("PMU released"); ++ } ++ spin_unlock(&pfm_pmu_acq_lock); ++} +diff --git a/perfmon/perfmon_priv.h b/perfmon/perfmon_priv.h +new file mode 100644 +index 0000000..5b485de +--- /dev/null ++++ b/perfmon/perfmon_priv.h +@@ -0,0 +1,182 @@ ++/* ++ * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details.
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++ ++#ifndef __PERFMON_PRIV_H__ ++#define __PERFMON_PRIV_H__ ++/* ++ * This file contains all the definitions of data structures, variables, macros ++ * that are to private to the generic code, i.e., not shared with any code that ++ * lives under arch/ or include/asm-XX ++ * ++ * For shared definitions, use include/linux/perfmon_kern.h ++ */ ++ ++#ifdef CONFIG_PERFMON ++ ++/* ++ * type of PMD reset for pfm_reset_pmds() or pfm_switch_sets*() ++ */ ++#define PFM_PMD_RESET_SHORT 1 /* use short reset value */ ++#define PFM_PMD_RESET_LONG 2 /* use long reset value */ ++ ++/* ++ * context lazy save/restore activation count ++ */ ++#define PFM_INVALID_ACTIVATION ((u64)~0) ++ ++DECLARE_PER_CPU(u64, pmu_activation_number); ++DECLARE_PER_CPU(struct hrtimer, pfm_hrtimer); ++ ++static inline void pfm_set_pmu_owner(struct task_struct *task, ++ struct pfm_context *ctx) ++{ ++ __get_cpu_var(pmu_owner) = task; ++ __get_cpu_var(pmu_ctx) = ctx; ++} ++ ++static inline int pfm_msgq_is_empty(struct pfm_context *ctx) ++{ ++ return ctx->msgq_head == ctx->msgq_tail; ++} ++ ++void pfm_get_next_msg(struct pfm_context *ctx, union pfarg_msg *m); ++int pfm_end_notify(struct pfm_context *ctx); ++int pfm_ovfl_notify(struct pfm_context *ctx, struct pfm_event_set *set, ++ unsigned long ip); ++ ++int pfm_alloc_fd(struct file **cfile); ++ ++int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count); ++int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req, ++ int count); ++int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req, ++ int count); ++ ++ ++int pfm_init_ctx(void); ++ ++int pfm_pmu_acquire(struct pfm_context *ctx); ++void pfm_pmu_release(void); ++ ++int pfm_session_acquire(int is_system, u32 cpu); ++void 
pfm_session_release(int is_system, u32 cpu); ++ ++int pfm_smpl_buf_space_acquire(struct pfm_context *ctx, size_t size); ++int pfm_smpl_buf_load_context(struct pfm_context *ctx); ++void pfm_smpl_buf_unload_context(struct pfm_context *ctx); ++ ++int pfm_init_sysfs(void); ++ ++#ifdef CONFIG_PERFMON_DEBUG_FS ++int pfm_init_debugfs(void); ++int pfm_debugfs_add_cpu(int mycpu); ++void pfm_debugfs_del_cpu(int mycpu); ++#else ++static inline int pfm_init_debugfs(void) ++{ ++ return 0; ++} ++static inline int pfm_debugfs_add_cpu(int mycpu) ++{ ++ return 0; ++} ++ ++static inline void pfm_debugfs_del_cpu(int mycpu) ++{} ++#endif ++ ++ ++void pfm_reset_pmds(struct pfm_context *ctx, struct pfm_event_set *set, ++ int num_pmds, ++ int reset_mode); ++ ++struct pfm_event_set *pfm_prepare_sets(struct pfm_context *ctx, u16 load_set); ++int pfm_init_sets(void); ++ ++ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what); ++ ++void pfm_free_sets(struct pfm_context *ctx); ++int pfm_create_initial_set(struct pfm_context *ctx); ++void pfm_switch_sets_from_intr(struct pfm_context *ctx); ++void pfm_restart_timer(struct pfm_context *ctx, struct pfm_event_set *set); ++enum hrtimer_restart pfm_handle_switch_timeout(struct hrtimer *t); ++ ++enum hrtimer_restart pfm_switch_sets(struct pfm_context *ctx, ++ struct pfm_event_set *new_set, ++ int reset_mode, ++ int no_restart); ++ ++/** ++ * pfm_save_prev_ctx - check if previous context exists and save state ++ * ++ * called from pfm_load_ctx_thread() and __pfm_ctxsin_thread() to ++ * check if previous context exists. If so saved its PMU state. This is used ++ * only for UP kernels. ++ * ++ * PMU ownership is not cleared because the function is always called while ++ * trying to install a new owner. 
++ */ ++static inline void pfm_check_save_prev_ctx(void) ++{ ++#ifndef CONFIG_SMP ++ struct pfm_event_set *set; ++ struct pfm_context *ctxp; ++ ++ ctxp = __get_cpu_var(pmu_ctx); ++ if (!ctxp) ++ return; ++ /* ++ * in UP per-thread, due to lazy save ++ * there could be a context from another ++ * task. We need to push it first before ++ * installing our new state ++ */ ++ set = ctxp->active_set; ++ pfm_save_pmds(ctxp, set); ++ /* ++ * do not clear ownership because we rewrite ++ * right away ++ */ ++#endif ++} ++ ++ ++int pfm_init_fs(void); ++ ++int pfm_init_hotplug(void); ++ ++void pfm_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set); ++void pfm_resume_after_ovfl(struct pfm_context *ctx); ++int pfm_setup_smpl_fmt(struct pfm_context *ctx, u32 ctx_flags, void *fmt_arg, ++ struct file *filp); ++/* record the deferred work type in the context and flag the task with TIF_PERFMON_WORK */ ++static inline void pfm_post_work(struct task_struct *task, ++ struct pfm_context *ctx, int type) ++{ ++ ctx->flags.work_type = type; ++ set_tsk_thread_flag(task, TIF_PERFMON_WORK); ++ pfm_arch_arm_handle_work(task); ++} ++ ++#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG ++#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG ++ ++#endif /* CONFIG_PERFMON */ ++ ++#endif /* __PERFMON_PRIV_H__ */ +diff --git a/perfmon/perfmon_res.c b/perfmon/perfmon_res.c +new file mode 100644 +index 0000000..7b0382b +--- /dev/null ++++ b/perfmon/perfmon_res.c +@@ -0,0 +1,450 @@ ++/* ++ * perfmon_res.c: perfmon2 resource allocations ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++/* ++ * global information about all sessions ++ * mostly used to synchronize between system wide and per-process ++ */ ++struct pfm_resources { ++ size_t smpl_buf_mem_cur;/* current smpl buf mem usage */ ++ cpumask_t sys_cpumask; /* bitmask of used cpus */ ++ u32 thread_sessions; /* #num loaded per-thread sessions */ ++}; ++ ++static struct pfm_resources pfm_res; ++ ++static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_res_lock); ++ ++/** ++ * pfm_smpl_buf_space_acquire - check memory resource usage for sampling buffer ++ * @ctx: context of interest ++ * @size: size fo requested buffer ++ * ++ * sampling buffer allocated by perfmon must be ++ * checked against max locked memory usage thresholds ++ * for security reasons. ++ * ++ * The first level check is against the system wide limit ++ * as indicated by the system administrator in /sys/kernel/perfmon ++ * ++ * The second level check is on a per-process basis using ++ * RLIMIT_MEMLOCK limit. 
++ * ++ * Operating on the current task only. Returns 0 on success or -ENOMEM. ++ */ ++int pfm_smpl_buf_space_acquire(struct pfm_context *ctx, size_t size) ++{ ++ struct mm_struct *mm; ++ unsigned long locked; ++ unsigned long buf_mem, buf_mem_max; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pfm_res_lock, flags); ++ ++ /* ++ * check against global buffer limit (buf_mem < size: sum wrapped) ++ */ ++ buf_mem_max = pfm_controls.smpl_buffer_mem_max; ++ buf_mem = pfm_res.smpl_buf_mem_cur + size; ++ ++ if (buf_mem >= size && buf_mem <= buf_mem_max) { ++ pfm_res.smpl_buf_mem_cur = buf_mem; ++ ++ PFM_DBG("buf_mem_max=%lu current_buf_mem=%lu", ++ buf_mem_max, ++ buf_mem); ++ } ++ ++ spin_unlock_irqrestore(&pfm_res_lock, flags); ++ ++ if (buf_mem < size || buf_mem > buf_mem_max) { ++ PFM_DBG("smpl buffer memory threshold reached"); ++ return -ENOMEM; ++ } ++ ++ /* ++ * check against per-process RLIMIT_MEMLOCK (fail if the task has no mm) ++ */ ++ mm = get_task_mm(current); ++ if (!mm) ++ goto unres; ++ down_write(&mm->mmap_sem); ++ locked = mm->locked_vm << PAGE_SHIFT; ++ locked += size; ++ ++ if (locked < size || ++ locked > current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) { ++ PFM_DBG("RLIMIT_MEMLOCK reached ask_locked=%lu rlim_cur=%lu", ++ locked, ++ current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur); ++ ++ up_write(&mm->mmap_sem); ++ mmput(mm); ++ goto unres; ++ } ++ ++ mm->locked_vm = locked >> PAGE_SHIFT; ++ ++ up_write(&mm->mmap_sem); ++ ++ mmput(mm); ++ ++ return 0; ++ ++unres: ++ /* ++ * remove global buffer memory allocation ++ */ ++ spin_lock_irqsave(&pfm_res_lock, flags); ++ ++ pfm_res.smpl_buf_mem_cur -= size; ++ ++ spin_unlock_irqrestore(&pfm_res_lock, flags); ++ ++ return -ENOMEM; ++} ++/** ++ * pfm_smpl_buf_space_release - release resource usage for sampling buffer ++ * @ctx: perfmon context of interest ++ * ++ * There exist multiple paths leading to this function. We need to ++ * be very careful with locking on the mmap_sem as it may already be ++ * held by the time we come here.
++ * The following paths exist: ++ * ++ * exit path: ++ * sys_exit_group ++ * do_group_exit ++ * do_exit ++ * exit_mm ++ * mmput ++ * exit_mmap ++ * remove_vma ++ * fput ++ * __fput ++ * pfm_close ++ * __pfm_close ++ * pfm_context_free ++ * pfm_release_buf_space ++ * munmap path: ++ * sys_munmap ++ * do_munmap ++ * remove_vma ++ * fput ++ * __fput ++ * pfm_close ++ * __pfm_close ++ * pfm_context_free ++ * pfm_release_buf_space ++ * ++ * close path: ++ * sys_close ++ * filp_close ++ * fput ++ * __fput ++ * pfm_close ++ * __pfm_close ++ * pfm_context_free ++ * pfm_release_buf_space ++ * ++ * The issue is that on the munmap() path, the mmap_sem is already held ++ * in write-mode by the time we come here. To avoid the deadlock, we need ++ * to know where we are coming from and skip down_write(). If is fairly ++ * difficult to know this because of the lack of good hooks and ++ * the fact that, there may not have been any mmap() of the sampling buffer ++ * (i.e. create_context() followed by close() or exit()). ++ * ++ * We use a set flag ctx->flags.mmap_nlock which is toggled in the vm_ops ++ * callback in remove_vma() which is called systematically for the call, so ++ * on all but the pure close() path. The exit path does not already hold ++ * the lock but this is exit so there is no task->mm by the time we come here. ++ * ++ * The mmap_nlock is set only when unmapping and this is the LAST reference ++ * to the file (i.e., close() followed by munmap()). 
++ */ ++void pfm_smpl_buf_space_release(struct pfm_context *ctx, size_t size) ++{ ++ unsigned long flags; ++ struct mm_struct *mm; ++ ++ mm = get_task_mm(current); ++ if (mm) { ++ if (ctx->flags.mmap_nlock == 0) { ++ PFM_DBG("doing down_write"); ++ down_write(&mm->mmap_sem); ++ } ++ ++ mm->locked_vm -= size >> PAGE_SHIFT; ++ ++ PFM_DBG("size=%zu locked_vm=%lu", size, mm->locked_vm); ++ ++ if (ctx->flags.mmap_nlock == 0) ++ up_write(&mm->mmap_sem); ++ ++ mmput(mm); ++ } ++ ++ spin_lock_irqsave(&pfm_res_lock, flags); ++ ++ pfm_res.smpl_buf_mem_cur -= size; ++ ++ spin_unlock_irqrestore(&pfm_res_lock, flags); ++} ++ ++/** ++ * pfm_session_acquire - reserve a per-thread or per-cpu session ++ * @is_system: true if per-cpu session ++ * @cpu: cpu number for per-cpu session ++ * ++ * return: ++ * 0 : success ++ * -EBUSY: if conflicting session exist ++ */ ++int pfm_session_acquire(int is_system, u32 cpu) ++{ ++ unsigned long flags; ++ u32 nsys_cpus; ++ int ret = 0; ++ ++ /* ++ * validy checks on cpu_mask have been done upstream ++ */ ++ spin_lock_irqsave(&pfm_res_lock, flags); ++ ++ nsys_cpus = cpus_weight(pfm_res.sys_cpumask); ++ ++ PFM_DBG("in sys=%u task=%u is_sys=%d cpu=%u", ++ nsys_cpus, ++ pfm_res.thread_sessions, ++ is_system, ++ cpu); ++ ++ if (is_system) { ++ /* ++ * cannot mix system wide and per-task sessions ++ */ ++ if (pfm_res.thread_sessions > 0) { ++ PFM_DBG("%u conflicting thread_sessions", ++ pfm_res.thread_sessions); ++ ret = -EBUSY; ++ goto abort; ++ } ++ ++ if (cpu_isset(cpu, pfm_res.sys_cpumask)) { ++ PFM_DBG("conflicting session on CPU%u", cpu); ++ ret = -EBUSY; ++ goto abort; ++ } ++ ++ PFM_DBG("reserved session on CPU%u", cpu); ++ ++ cpu_set(cpu, pfm_res.sys_cpumask); ++ nsys_cpus++; ++ } else { ++ if (nsys_cpus) { ++ ret = -EBUSY; ++ goto abort; ++ } ++ pfm_res.thread_sessions++; ++ } ++ ++ PFM_DBG("out sys=%u task=%u is_sys=%d cpu=%u", ++ nsys_cpus, ++ pfm_res.thread_sessions, ++ is_system, ++ cpu); ++ ++abort: ++ 
spin_unlock_irqrestore(&pfm_res_lock, flags); ++ ++ return ret; ++} ++ ++/** ++ * pfm_session_release - release a per-cpu or per-thread session ++ * @is_system: true if per-cpu session ++ * @cpu: cpu number for per-cpu session ++ * ++ * called from __pfm_unload_context() ++ */ ++void pfm_session_release(int is_system, u32 cpu) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pfm_res_lock, flags); ++ ++ PFM_DBG("in sys_sessions=%u thread_sessions=%u syswide=%d cpu=%u", ++ cpus_weight(pfm_res.sys_cpumask), ++ pfm_res.thread_sessions, ++ is_system, cpu); ++ ++ if (is_system) ++ cpu_clear(cpu, pfm_res.sys_cpumask); ++ else ++ pfm_res.thread_sessions--; ++ ++ PFM_DBG("out sys_sessions=%u thread_sessions=%u syswide=%d cpu=%u", ++ cpus_weight(pfm_res.sys_cpumask), ++ pfm_res.thread_sessions, ++ is_system, cpu); ++ ++ spin_unlock_irqrestore(&pfm_res_lock, flags); ++} ++ ++/** ++ * pfm_session_allcpus_acquire - acquire per-cpu sessions on all available cpus ++ * ++ * currently used by Oprofile on X86 ++ */ ++int pfm_session_allcpus_acquire(void) ++{ ++ unsigned long flags; ++ u32 nsys_cpus, cpu; ++ int ret = -EBUSY; ++ ++ spin_lock_irqsave(&pfm_res_lock, flags); ++ ++ nsys_cpus = cpus_weight(pfm_res.sys_cpumask); ++ ++ PFM_DBG("in sys=%u task=%u", ++ nsys_cpus, ++ pfm_res.thread_sessions); ++ ++ if (nsys_cpus) { ++ PFM_DBG("already some system-wide sessions"); ++ goto abort; ++ } ++ ++ /* ++ * cannot mix system wide and per-task sessions ++ */ ++ if (pfm_res.thread_sessions) { ++ PFM_DBG("%u conflicting thread_sessions", ++ pfm_res.thread_sessions); ++ goto abort; ++ } ++ ++ for_each_online_cpu(cpu) { ++ cpu_set(cpu, pfm_res.sys_cpumask); ++ nsys_cpus++; ++ } ++ ++ PFM_DBG("out sys=%u task=%u", ++ nsys_cpus, ++ pfm_res.thread_sessions); ++ ++ ret = 0; ++abort: ++ spin_unlock_irqrestore(&pfm_res_lock, flags); ++ ++ return ret; ++} ++EXPORT_SYMBOL(pfm_session_allcpus_acquire); ++ ++/** ++ * pfm_session_allcpus_release - relase per-cpu sessions on all cpus ++ * ++ * 
currently used by Oprofile code ++ */ ++void pfm_session_allcpus_release(void) ++{ ++ unsigned long flags; ++ u32 nsys_cpus, cpu; ++ ++ spin_lock_irqsave(&pfm_res_lock, flags); ++ ++ nsys_cpus = cpus_weight(pfm_res.sys_cpumask); ++ ++ PFM_DBG("in sys=%u task=%u", ++ nsys_cpus, ++ pfm_res.thread_sessions); ++ ++ /* ++ * XXX: could use __cpus_clear() with nbits ++ */ ++ for_each_online_cpu(cpu) { ++ cpu_clear(cpu, pfm_res.sys_cpumask); ++ nsys_cpus--; ++ } ++ ++ PFM_DBG("out sys=%u task=%u", ++ nsys_cpus, ++ pfm_res.thread_sessions); ++ ++ spin_unlock_irqrestore(&pfm_res_lock, flags); ++} ++EXPORT_SYMBOL(pfm_session_allcpus_release); ++ ++/** ++ * pfm_sysfs_res_show - return current resource usage for sysfs ++ * @buf: buffer to hold string in return ++ * @sz: size of buf ++ * @what: what to produce ++ * what=0 : thread_sessions ++ * what=1 : cpus_weight(sys_cpumask) ++ * what=2 : smpl_buf_mem_cur ++ * what=3 : pmu model name ++ * any other value yields an empty string ++ * called from perfmon_sysfs.c ++ * return number of bytes written into buf (up to sz) ++ */ ++ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pfm_res_lock, flags); ++ ++ switch (what) { ++ case 0: snprintf(buf, sz, "%u\n", pfm_res.thread_sessions); ++ break; ++ case 1: snprintf(buf, sz, "%d\n", cpus_weight(pfm_res.sys_cpumask)); ++ break; ++ case 2: snprintf(buf, sz, "%zu\n", pfm_res.smpl_buf_mem_cur); ++ break; ++ case 3: snprintf(buf, sz, "%s\n", pfm_pmu_conf ? pfm_pmu_conf->pmu_name ++ : "unknown"); ++ break; ++ default: snprintf(buf, sz, "%s", ""); /* unknown selector */ ++ } ++ spin_unlock_irqrestore(&pfm_res_lock, flags); ++ return strlen(buf); ++} +diff --git a/perfmon/perfmon_rw.c b/perfmon/perfmon_rw.c +new file mode 100644 +index 0000000..3168eb7 +--- /dev/null ++++ b/perfmon/perfmon_rw.c +@@ -0,0 +1,733 @@ ++/* ++ * perfmon.c: perfmon2 PMC/PMD read/write system calls ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor.
++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net/ ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/kernel.h> ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++#define PFM_REGFL_PMC_ALL (PFM_REGFL_NO_EMUL64) ++#define PFM_REGFL_PMD_ALL (PFM_REGFL_RANDOM|PFM_REGFL_OVFL_NOTIFY) ++ ++/** ++ * update_used_reg -- update used_pmcs for a single PMD ++ * @set: set to update ++ * @cnum: new PMD to add ++ * ++ * ORs dep_pmcs of PMD cnum into set->used_pmcs (ctx->regs.max_pmc bits) ++ */ ++static inline void update_used_reg(struct pfm_context *ctx, ++ struct pfm_event_set *set, u16 cnum) ++{ ++ bitmap_or(cast_ulp(set->used_pmcs), ++ cast_ulp(set->used_pmcs), ++ cast_ulp(pfm_pmu_conf->pmd_desc[cnum].dep_pmcs), ++ ctx->regs.max_pmc); ++} ++ ++/** ++ * update_used_pmcs -- update used_pmcs bitmask ++ * @set: event set to update ++ * @bv: bitmask to inspect for new PMD registers ++ * ++ * This function updates the used_pmcs bitmask for ++ * the set using bv, a bitmask of pmds. For each pmd in bv, ++ * its depending pmcs are added to used_pmcs. ++ */ ++static void update_used_pmcs(struct pfm_context *ctx, ++ struct pfm_event_set *set, unsigned long *bv) ++{ ++ u16 max_pmd; ++ int n, p, q; ++ ++ max_pmd = ctx->regs.max_pmd; ++ ++ n = bitmap_weight(bv, max_pmd); /* number of set bits left to visit */ ++ for(p = 0; n; n--, p = q+1) { ++ q = find_next_bit(bv, max_pmd, p); ++ update_used_reg(ctx, set, q); ++ } ++} ++ ++/** ++ * update_changes -- update nused_pmcs, nused_pmds, write newly touched pmcs ++ * @ctx: context to use ++ * @set: event set to use ++ * @old_used_pmcs: former used_pmc bitmask ++ * return: 0 or the error code from __pfm_write_pmcs() ++ * ++ * This function updates nused_pmcs and nused_pmds after the last modification ++ * to an event set.
When new pmcs are used, then they must be initialized such ++ * that we do not pick up stale values from another session. ++ */ ++static inline int update_changes(struct pfm_context *ctx, struct pfm_event_set *set, ++ unsigned long *old_used_pmcs) ++{ ++ struct pfarg_pmc req; ++ u16 max_pmc, max_pmd; ++ int n, p, q, ret = 0; ++ ++ max_pmd = ctx->regs.max_pmd; ++ max_pmc = ctx->regs.max_pmc; ++ ++ /* ++ * update used counts ++ */ ++ set->nused_pmds = bitmap_weight(cast_ulp(set->used_pmds), max_pmd); ++ set->nused_pmcs = bitmap_weight(cast_ulp(set->used_pmcs), max_pmc); ++ ++ PFM_DBG("set%u u_pmds=0x%llx nu_pmds=%u u_pmcs=0x%llx nu_pmcs=%u", ++ set->id, ++ (unsigned long long)set->used_pmds[0], ++ set->nused_pmds, ++ (unsigned long long)set->used_pmcs[0], ++ set->nused_pmcs); ++ ++ memset(&req, 0, sizeof(req)); ++ ++ n = bitmap_weight(cast_ulp(set->used_pmcs), max_pmc); ++ for(p = 0; n; n--, p = q+1) { ++ q = find_next_bit(cast_ulp(set->used_pmcs), max_pmc, p); ++ ++ if (test_bit(q, cast_ulp(old_used_pmcs))) ++ continue; ++ ++ req.reg_num = q; ++ req.reg_value = set->pmcs[q]; ++ ++ ret = __pfm_write_pmcs(ctx, &req, 1); ++ if (ret) ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * handle_smpl_bv - checks sampling bitmasks for new PMDs ++ * @ctx: context to use ++ * @set: set to use ++ * @bv: sampling bitmask ++ * ++ * scans the smpl bitmask looking for new PMDs (not yet used), if found ++ * invoke pfm_write_pmds() on them to get them initialized and marked used ++ */ ++static int handle_smpl_bv(struct pfm_context *ctx, struct pfm_event_set *set, ++ unsigned long *bv) ++{ ++ struct pfarg_pmd req; ++ int p, q, n, ret = 0; ++ u16 max_pmd; ++ ++ memset(&req, 0, sizeof(req)); ++ ++ max_pmd = ctx->regs.max_pmd; ++ ++ n = bitmap_weight(cast_ulp(bv), max_pmd); ++ ++ for(p = 0; n; n--, p = q+1) { ++ q = find_next_bit(cast_ulp(bv), max_pmd, p); ++ ++ if (test_bit(q, cast_ulp(set->used_pmds))) ++ continue; ++ ++ req.reg_num = q; ++ req.reg_value = 0; ++ ++ ret = 
__pfm_write_pmds(ctx, &req, 1, 0); ++ if (ret) ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * is_invalid -- check if register index is within limits ++ * @cnum: register index ++ * @impl: bitmask of implemented registers ++ * @max: highest implemented registers + 1 ++ * ++ * return: ++ * 0 is register index is valid ++ * 1 if invalid ++ */ ++static inline int is_invalid(u16 cnum, unsigned long *impl, u16 max) ++{ ++ return cnum >= max || !test_bit(cnum, impl); ++} ++ ++/** ++ * __pfm_write_pmds - modified data registers ++ * @ctx: context to operate on ++ * @req: pfarg_pmd_t request from user ++ * @count: number of element in the pfarg_pmd_t vector ++ * @compat: used only on IA-64 to maintain backward compatibility with v2.0 ++ * ++ * The function succeeds whether the context is attached or not. ++ * When attached to another thread, that thread must be stopped. ++ * ++ * The context is locked and interrupts are disabled. ++ */ ++int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count, ++ int compat) ++{ ++ struct pfm_event_set *set, *active_set; ++ u64 old_used_pmcs[PFM_PMC_BV]; ++ unsigned long *smpl_pmds, *reset_pmds, *impl_pmds, *impl_rw_pmds; ++ u32 req_flags, flags; ++ u16 cnum, pmd_type, max_pmd; ++ u16 set_id; ++ int i, can_access_pmu; ++ int ret; ++ pfm_pmd_check_t wr_func; ++ ++ active_set = ctx->active_set; ++ max_pmd = ctx->regs.max_pmd; ++ impl_pmds = cast_ulp(ctx->regs.pmds); ++ impl_rw_pmds = cast_ulp(ctx->regs.rw_pmds); ++ wr_func = pfm_pmu_conf->pmd_write_check; ++ set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); ++ ++ can_access_pmu = 0; ++ ++ /* ++ * we cannot access the actual PMD registers when monitoring is masked ++ */ ++ if (unlikely(ctx->state == PFM_CTX_LOADED)) ++ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task ++ || ctx->flags.system; ++ ++ bitmap_copy(cast_ulp(old_used_pmcs), ++ cast_ulp(set->used_pmcs), ++ ctx->regs.max_pmc); ++ ++ ret = -EINVAL; ++ for (i = 0; i < count; i++, req++) { 
++ ++ cnum = req->reg_num; ++ set_id = req->reg_set; ++ req_flags = req->reg_flags; ++ smpl_pmds = cast_ulp(req->reg_smpl_pmds); ++ reset_pmds = cast_ulp(req->reg_reset_pmds); ++ flags = 0; ++ ++ /* ++ * cannot write to unexisting ++ * writes to read-only register are ignored ++ */ ++ if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) { ++ PFM_DBG("pmd%u is not available", cnum); ++ goto error; ++ } ++ ++ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type; ++ ++ /* ++ * ensure only valid flags are set ++ */ ++ if (req_flags & ~(PFM_REGFL_PMD_ALL)) { ++ PFM_DBG("pmd%u: invalid flags=0x%x", ++ cnum, req_flags); ++ goto error; ++ } ++ ++ /* ++ * OVFL_NOTIFY is valid for all types of PMD. ++ * non counting PMD may trigger PMU interrupt ++ * and thus may trigger recording of a sample. ++ * This is true with IBS on AMD family 16. ++ */ ++ if (req_flags & PFM_REGFL_OVFL_NOTIFY) ++ flags |= PFM_REGFL_OVFL_NOTIFY; ++ ++ /* ++ * We allow randomization to non counting PMD ++ */ ++ if (req_flags & PFM_REGFL_RANDOM) ++ flags |= PFM_REGFL_RANDOM; ++ ++ /* ++ * verify validity of smpl_pmds ++ */ ++ if (unlikely(!bitmap_subset(smpl_pmds, impl_pmds, PFM_MAX_PMDS))) { ++ PFM_DBG("invalid smpl_pmds=0x%llx for pmd%u", ++ (unsigned long long)req->reg_smpl_pmds[0], ++ cnum); ++ goto error; ++ } ++ ++ /* ++ * verify validity of reset_pmds ++ * check against impl_rw_pmds because it is not ++ * possible to reset read-only PMDs ++ */ ++ if (unlikely(!bitmap_subset(reset_pmds, impl_rw_pmds, PFM_MAX_PMDS))) { ++ PFM_DBG("invalid reset_pmds=0x%llx for pmd%u", ++ (unsigned long long)req->reg_reset_pmds[0], ++ cnum); ++ goto error; ++ } ++ ++ /* ++ * locate event set ++ */ ++ if (set_id != set->id) { ++ /* update number of used register for previous set */ ++ if (i) { ++ ret = update_changes(ctx, set, cast_ulp(old_used_pmcs)); ++ if (ret) ++ goto error; ++ } ++ ++ set = pfm_find_set(ctx, set_id, 0); ++ if (set == NULL) { ++ PFM_DBG("event set%u does not exist", ++ set_id); ++ goto error; ++ } ++ 
bitmap_copy(cast_ulp(old_used_pmcs), ++ cast_ulp(set->used_pmcs), ++ ctx->regs.max_pmc); ++ } ++ ++ /* ++ * execute write checker, if any ++ */ ++ if (unlikely(wr_func && (pmd_type & PFM_REG_WC))) { ++ ret = (*wr_func)(ctx, set, req); ++ if (ret) ++ goto error; ++ ++ } ++ ++ ++ /* ++ * now commit changes to software state ++ */ ++ ++ if (unlikely(compat)) ++ goto skip_set; ++ ++ if (bitmap_weight(smpl_pmds, max_pmd)) { ++ ret = handle_smpl_bv(ctx, set, smpl_pmds); ++ if (ret) ++ goto error; ++ update_used_pmcs(ctx, set, cast_ulp(smpl_pmds)); ++ } ++ ++ bitmap_copy(cast_ulp(set->pmds[cnum].smpl_pmds), ++ smpl_pmds, ++ max_pmd); ++ ++ ++ if (bitmap_weight(reset_pmds, max_pmd)) { ++ ret = handle_smpl_bv(ctx, set, reset_pmds); ++ if (ret) ++ goto error; ++ update_used_pmcs(ctx, set, cast_ulp(reset_pmds)); ++ } ++ ++ bitmap_copy(cast_ulp(set->pmds[cnum].reset_pmds), ++ reset_pmds, ++ max_pmd); ++ ++ set->pmds[cnum].flags = flags; ++ ++ __set_bit(cnum, cast_ulp(set->used_pmds)); ++ update_used_reg(ctx, set, cnum); ++ ++ /* ++ * we reprogram the PMD hence, we clear any pending ++ * ovfl. 
Does affect ovfl switch on restart but new ++ * value has already been established here ++ */ ++ if (test_bit(cnum, cast_ulp(set->povfl_pmds))) { ++ set->npend_ovfls--; ++ __clear_bit(cnum, cast_ulp(set->povfl_pmds)); ++ } ++ __clear_bit(cnum, cast_ulp(set->ovfl_pmds)); ++ ++ /* ++ * update ovfl_notify ++ */ ++ if (flags & PFM_REGFL_OVFL_NOTIFY) ++ __set_bit(cnum, cast_ulp(set->ovfl_notify)); ++ else ++ __clear_bit(cnum, cast_ulp(set->ovfl_notify)); ++ ++ /* ++ * establish new switch count ++ */ ++ set->pmds[cnum].ovflsw_thres = req->reg_ovfl_switch_cnt; ++ set->pmds[cnum].ovflsw_ref_thres = req->reg_ovfl_switch_cnt; ++skip_set: ++ ++ /* ++ * set last value to new value for all types of PMD ++ */ ++ set->pmds[cnum].lval = req->reg_value; ++ set->pmds[cnum].value = req->reg_value; ++ ++ /* ++ * update reset values (not just for counters) ++ */ ++ set->pmds[cnum].long_reset = req->reg_long_reset; ++ set->pmds[cnum].short_reset = req->reg_short_reset; ++ ++ /* ++ * update randomization mask ++ */ ++ set->pmds[cnum].mask = req->reg_random_mask; ++ ++ set->pmds[cnum].eventid = req->reg_smpl_eventid; ++ ++ if (set == active_set) { ++ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS; ++ if (can_access_pmu) ++ pfm_write_pmd(ctx, cnum, req->reg_value); ++ } ++ ++ ++ PFM_DBG("set%u pmd%u=0x%llx flags=0x%x a_pmu=%d " ++ "ctx_pmd=0x%llx s_reset=0x%llx " ++ "l_reset=0x%llx s_pmds=0x%llx " ++ "r_pmds=0x%llx o_pmds=0x%llx " ++ "o_thres=%llu compat=%d eventid=%llx", ++ set->id, ++ cnum, ++ (unsigned long long)req->reg_value, ++ set->pmds[cnum].flags, ++ can_access_pmu, ++ (unsigned long long)set->pmds[cnum].value, ++ (unsigned long long)set->pmds[cnum].short_reset, ++ (unsigned long long)set->pmds[cnum].long_reset, ++ (unsigned long long)set->pmds[cnum].smpl_pmds[0], ++ (unsigned long long)set->pmds[cnum].reset_pmds[0], ++ (unsigned long long)set->ovfl_pmds[0], ++ (unsigned long long)set->pmds[cnum].ovflsw_thres, ++ compat, ++ (unsigned long long)set->pmds[cnum].eventid); ++ } ++ ret = 
0; ++ ++error: ++ update_changes(ctx, set, cast_ulp(old_used_pmcs)); ++ ++ /* ++ * make changes visible ++ */ ++ if (can_access_pmu) ++ pfm_arch_serialize(); ++ ++ return ret; ++} ++ ++/** ++ * __pfm_write_pmcs - modified config registers ++ * @ctx: context to operate on ++ * @req: pfarg_pmc_t request from user ++ * @count: number of element in the pfarg_pmc_t vector ++ * ++ * ++ * The function succeeds whether the context is * attached or not. ++ * When attached to another thread, that thread must be stopped. ++ * ++ * The context is locked and interrupts are disabled. ++ */ ++int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, int count) ++{ ++ struct pfm_event_set *set, *active_set; ++ u64 value, dfl_val, rsvd_msk; ++ unsigned long *impl_pmcs; ++ int i, can_access_pmu; ++ int ret; ++ u16 set_id; ++ u16 cnum, pmc_type, max_pmc; ++ u32 flags, expert; ++ pfm_pmc_check_t wr_func; ++ ++ active_set = ctx->active_set; ++ ++ wr_func = pfm_pmu_conf->pmc_write_check; ++ max_pmc = ctx->regs.max_pmc; ++ impl_pmcs = cast_ulp(ctx->regs.pmcs); ++ set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); ++ ++ expert = pfm_controls.flags & PFM_CTRL_FL_RW_EXPERT; ++ ++ can_access_pmu = 0; ++ ++ /* ++ * we cannot access the actual PMC registers when monitoring is masked ++ */ ++ if (unlikely(ctx->state == PFM_CTX_LOADED)) ++ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task ++ || ctx->flags.system; ++ ++ ret = -EINVAL; ++ ++ for (i = 0; i < count; i++, req++) { ++ ++ cnum = req->reg_num; ++ set_id = req->reg_set; ++ value = req->reg_value; ++ flags = req->reg_flags; ++ ++ /* ++ * no access to unavailable PMC register ++ */ ++ if (unlikely(is_invalid(cnum, impl_pmcs, max_pmc))) { ++ PFM_DBG("pmc%u is not available", cnum); ++ goto error; ++ } ++ ++ pmc_type = pfm_pmu_conf->pmc_desc[cnum].type; ++ dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val; ++ rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk; ++ ++ /* ++ * ensure only valid flags are set ++ */ ++ 
if (flags & ~PFM_REGFL_PMC_ALL) { ++ PFM_DBG("pmc%u: invalid flags=0x%x", cnum, flags); ++ goto error; ++ } ++ ++ /* ++ * locate event set ++ */ ++ if (set_id != set->id) { ++ set = pfm_find_set(ctx, set_id, 0); ++ if (set == NULL) { ++ PFM_DBG("event set%u does not exist", ++ set_id); ++ goto error; ++ } ++ } ++ ++ /* ++ * set reserved bits to default values ++ * (reserved bits must be 1 in rsvd_msk) ++ * ++ * bypass via /sys/kernel/perfmon/mode = 1 ++ */ ++ if (likely(!expert)) ++ value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk); ++ ++ if (flags & PFM_REGFL_NO_EMUL64) { ++ if (!(pmc_type & PFM_REG_NO64)) { ++ PFM_DBG("pmc%u no support for " ++ "PFM_REGFL_NO_EMUL64", cnum); ++ goto error; ++ } ++ value &= ~pfm_pmu_conf->pmc_desc[cnum].no_emul64_msk; ++ } ++ ++ /* ++ * execute write checker, if any ++ */ ++ if (likely(wr_func && (pmc_type & PFM_REG_WC))) { ++ req->reg_value = value; ++ ret = (*wr_func)(ctx, set, req); ++ if (ret) ++ goto error; ++ value = req->reg_value; ++ } ++ ++ /* ++ * Now we commit the changes ++ */ ++ ++ /* ++ * mark PMC register as used ++ * We do not track associated PMC register based on ++ * the fact that they will likely need to be written ++ * in order to become useful at which point the statement ++ * below will catch that. ++ * ++ * The used_pmcs bitmask is only useful on architectures where ++ * the PMC needs to be modified for particular bits, especially ++ * on overflow or to stop/start. 
++ */ ++ if (!test_bit(cnum, cast_ulp(set->used_pmcs))) { ++ __set_bit(cnum, cast_ulp(set->used_pmcs)); ++ set->nused_pmcs++; ++ } ++ ++ set->pmcs[cnum] = value; ++ ++ if (set == active_set) { ++ set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; ++ if (can_access_pmu) ++ pfm_arch_write_pmc(ctx, cnum, value); ++ } ++ ++ PFM_DBG("set%u pmc%u=0x%llx a_pmu=%d " ++ "u_pmcs=0x%llx nu_pmcs=%u", ++ set->id, ++ cnum, ++ (unsigned long long)value, ++ can_access_pmu, ++ (unsigned long long)set->used_pmcs[0], ++ set->nused_pmcs); ++ } ++ ret = 0; ++error: ++ /* ++ * make sure the changes are visible ++ */ ++ if (can_access_pmu) ++ pfm_arch_serialize(); ++ ++ return ret; ++} ++ ++/** ++ * __pfm_read_pmds - read data registers ++ * @ctx: context to operate on ++ * @req: pfarg_pmd_t request from user ++ * @count: number of element in the pfarg_pmd_t vector ++ * ++ * ++ * The function succeeds whether the context is attached or not. ++ * When attached to another thread, that thread must be stopped. ++ * ++ * The context is locked and interrupts are disabled. ++ */ ++int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count) ++{ ++ u64 val = 0, lval, ovfl_mask, hw_val; ++ u64 sw_cnt; ++ unsigned long *impl_pmds; ++ struct pfm_event_set *set, *active_set; ++ int i, ret, can_access_pmu = 0; ++ u16 cnum, pmd_type, set_id, max_pmd; ++ ++ ovfl_mask = pfm_pmu_conf->ovfl_mask; ++ impl_pmds = cast_ulp(ctx->regs.pmds); ++ max_pmd = ctx->regs.max_pmd; ++ active_set = ctx->active_set; ++ set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); ++ ++ if (likely(ctx->state == PFM_CTX_LOADED)) { ++ can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task ++ || ctx->flags.system; ++ ++ if (can_access_pmu) ++ pfm_arch_serialize(); ++ } ++ ++ /* ++ * on both UP and SMP, we can only read the PMD from the hardware ++ * register when the task is the owner of the local PMU. 
++ */ ++ ret = -EINVAL; ++ for (i = 0; i < count; i++, req++) { ++ ++ cnum = req->reg_num; ++ set_id = req->reg_set; ++ ++ if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) { ++ PFM_DBG("pmd%u is not implemented/unaccessible", cnum); ++ goto error; ++ } ++ ++ pmd_type = pfm_pmu_conf->pmd_desc[cnum].type; ++ ++ /* ++ * locate event set ++ */ ++ if (set_id != set->id) { ++ set = pfm_find_set(ctx, set_id, 0); ++ if (set == NULL) { ++ PFM_DBG("event set%u does not exist", ++ set_id); ++ goto error; ++ } ++ } ++ /* ++ * it is not possible to read a PMD which was not requested: ++ * - explicitly written via pfm_write_pmds() ++ * - provided as a reg_smpl_pmds[] to another PMD during ++ * pfm_write_pmds() ++ * ++ * This is motivated by security and for optimization purposes: ++ * - on context switch restore, we can restore only what ++ * we use (except when regs directly readable at user ++ * level, e.g., IA-64 self-monitoring, I386 RDPMC). ++ * - do not need to maintain PMC -> PMD dependencies ++ */ ++ if (unlikely(!test_bit(cnum, cast_ulp(set->used_pmds)))) { ++ PFM_DBG("pmd%u cannot read, because not used", cnum); ++ goto error; ++ } ++ ++ val = set->pmds[cnum].value; ++ lval = set->pmds[cnum].lval; ++ ++ /* ++ * extract remaining ovfl to switch ++ */ ++ sw_cnt = set->pmds[cnum].ovflsw_thres; ++ ++ /* ++ * If the task is not the current one, then we check if the ++ * PMU state is still in the local live register due to lazy ++ * ctxsw. If true, then we read directly from the registers. 
++ */ ++ if (set == active_set && can_access_pmu) { ++ hw_val = pfm_read_pmd(ctx, cnum); ++ if (pmd_type & PFM_REG_C64) ++ val = (val & ~ovfl_mask) | (hw_val & ovfl_mask); ++ else ++ val = hw_val; ++ } ++ ++ PFM_DBG("set%u pmd%u=0x%llx sw_thr=%llu lval=0x%llx", ++ set->id, ++ cnum, ++ (unsigned long long)val, ++ (unsigned long long)sw_cnt, ++ (unsigned long long)lval); ++ ++ req->reg_value = val; ++ req->reg_last_reset_val = lval; ++ req->reg_ovfl_switch_cnt = sw_cnt; ++ } ++ ret = 0; ++error: ++ return ret; ++} +diff --git a/perfmon/perfmon_sets.c b/perfmon/perfmon_sets.c +new file mode 100644 +index 0000000..24534cb +--- /dev/null ++++ b/perfmon/perfmon_sets.c +@@ -0,0 +1,873 @@ ++/* ++ * perfmon_sets.c: perfmon2 event sets and multiplexing functions ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with this program; if not, write to the Free Software
++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
++ * 02111-1307 USA
++ */
++#include <linux/kernel.h>
++#include <linux/perfmon_kern.h>
++#include "perfmon_priv.h"
++
++static struct kmem_cache *pfm_set_cachep;
++
++/**
++ * pfm_reload_switch_thresholds - reload overflow-based switch thresholds per set
++ * @ctx: context owning the set; its regs.first_intr_pmd and
++ *       regs.max_intr_pmd bound the PMD indexes that are scanned
++ * @set: the set for which to reload thresholds
++ *
++ * For every PMD index in that range which is marked in set->used_pmds,
++ * the working threshold (ovflsw_thres) is reloaded from the reference
++ * threshold (ovflsw_ref_thres).
++ */
++static void pfm_reload_switch_thresholds(struct pfm_context *ctx,
++					 struct pfm_event_set *set)
++{
++	u64 *used_pmds;
++	u16 i, max, first;
++
++	used_pmds = set->used_pmds;
++	first = ctx->regs.first_intr_pmd;
++	max = ctx->regs.max_intr_pmd;
++
++	for (i = first; i < max; i++) {
++		if (test_bit(i, cast_ulp(used_pmds))) {
++			set->pmds[i].ovflsw_thres = set->pmds[i].ovflsw_ref_thres;
++
++			PFM_DBG("set%u pmd%u ovflsw_thres=%llu",
++				set->id,
++				i,
++				(unsigned long long)set->pmds[i].ovflsw_thres);
++		}
++	}
++}
++
++/**
++ * pfm_prepare_sets - initialize sets on pfm_load_context
++ * @ctx : context to operate on
++ * @load_set: set to activate first
++ *
++ * connect all sets, reset internal fields
++ *
++ * Returns the set whose id is @load_set, or NULL when the context has
++ * no such set (in which case no set is modified).
++ */
++struct pfm_event_set *pfm_prepare_sets(struct pfm_context *ctx, u16 load_set)
++{
++	struct pfm_event_set *set, *p;
++	u16 max;
++
++	/*
++	 * locate first set to activate (lookup only, never allocates)
++	 */
++	set = pfm_find_set(ctx, load_set, 0);
++	if (!set)
++		return NULL;
++
++	if (set->flags & PFM_SETFL_OVFL_SWITCH)
++		pfm_reload_switch_thresholds(ctx, set);
++
++	max = ctx->regs.max_intr_pmd;
++
++	list_for_each_entry(p, &ctx->set_list, list) {
++		/*
++		 * cleanup bitvectors
++		 */
++		bitmap_zero(cast_ulp(p->ovfl_pmds), max);
++		bitmap_zero(cast_ulp(p->povfl_pmds), max);
++
++		p->npend_ovfls = 0;
++
++		/*
++		 * we cannot just use plain clear because of arch-specific flags
++		 */
++		p->priv_flags &= ~(PFM_SETFL_PRIV_MOD_BOTH|PFM_SETFL_PRIV_SWITCH);
++		/*
++		 * neither duration nor runs are reset because typically loading/unloading
++		 * does not mean counts are reset. To reset, the set must be modified
++		 */
++	}
++	return set;
++}
++
++/*
++ * called by hrtimer_interrupt()
++ *
++ * This is the only function where we come with
++ * cpu_base->lock held before ctx->lock
++ *
++ * interrupts are disabled
++ *
++ * The hrtimer argument is not used; the context is taken from the
++ * per-cpu pmu_ctx variable. Returns HRTIMER_NORESTART when there is
++ * no context on this CPU, when the context is not in the LOADED state
++ * or when monitoring is not active; otherwise returns whatever
++ * pfm_switch_sets() returns.
++ */
++enum hrtimer_restart pfm_handle_switch_timeout(struct hrtimer *t)
++{
++	struct pfm_event_set *set;
++	struct pfm_context *ctx;
++	unsigned long flags;
++	enum hrtimer_restart ret = HRTIMER_NORESTART;
++
++	/*
++	 * protect against a race with unload: the per-cpu context
++	 * pointer may already have been cleared
++	 */
++	ctx = __get_cpu_var(pmu_ctx);
++	if (!ctx)
++		return HRTIMER_NORESTART;
++
++	spin_lock_irqsave(&ctx->lock, flags);
++
++	set = ctx->active_set; /* NOTE(review): not read again in this function */
++
++	/*
++	 * switching occurs only when context is attached
++	 */
++	if (ctx->state != PFM_CTX_LOADED)
++		goto done;
++	/*
++	 * timer does not run while monitoring is inactive (not started)
++	 */
++	if (!pfm_arch_is_active(ctx))
++		goto done;
++
++	pfm_stats_inc(handle_timeout_count);
++
++	ret = pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_SHORT, 0);
++done:
++	spin_unlock_irqrestore(&ctx->lock, flags);
++	return ret;
++}
++
++/*
++ * pfm_switch_sets - switch the context from its active set to another set
++ *
++ * always operating on the current task
++ * interrupts are masked
++ *
++ * input:
++ * - ctx: context to operate on
++ * - new_set: new set to switch to, if NULL follow normal chain
++ * - reset_mode: reset mode handed to pfm_reset_pmds() for the PMDs
++ *   marked in the new set's reset_pmds bitmap
++ * - no_restart: when non-zero, monitoring is not restarted and no
++ *   timeout is installed
++ *
++ * return:
++ * - HRTIMER_RESTART only when monitoring was active, no_restart is
++ *   zero and the new set uses PFM_SETFL_TIME_SWITCH (the per-cpu
++ *   hrtimer has then been forwarded by the new set's period);
++ *   HRTIMER_NORESTART otherwise
++ */
++enum hrtimer_restart pfm_switch_sets(struct pfm_context *ctx,
++				     struct pfm_event_set *new_set,
++				     int reset_mode,
++				     int no_restart)
++{
++	struct pfm_event_set *set;
++	u64 now, end;
++	u32 new_flags;
++	int is_system, is_active, nn;
++	enum hrtimer_restart ret = HRTIMER_NORESTART;
++
++	now = sched_clock();
++	set = ctx->active_set;
++	is_active = pfm_arch_is_active(ctx);
++
++	/*
++	 * if no set is explicitly requested, take the set that
++	 * follows the active one in ctx->set_list
++	 */
++	if (!new_set) {
++		/*
++		 * we use round-robin unless the caller specified
++		 * a particular set to go to. When the successor is the
++		 * list head itself, wrap around to the first set.
++		 */
++		new_set = list_first_entry(&set->list, struct pfm_event_set, list);
++		if (&new_set->list == &ctx->set_list)
++			new_set = list_first_entry(&ctx->set_list, struct pfm_event_set, list);
++	}
++
++	PFM_DBG_ovfl("state=%d act=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u "
++		  "next_runs=%llu new_npend=%d reset_mode=%d reset_pmds=%llx",
++		  ctx->state,
++		  is_active,
++		  set->id,
++		  (unsigned long long)set->runs,
++		  set->npend_ovfls,
++		  new_set->id,
++		  (unsigned long long)new_set->runs,
++		  new_set->npend_ovfls,
++		  reset_mode,
++		  (unsigned long long)new_set->reset_pmds[0]);
++
++	is_system = ctx->flags.system; /* NOTE(review): assigned but not read in this function */
++	new_flags = new_set->flags;
++
++	/*
++	 * nothing more to do: switching to the set which is already
++	 * active skips the save/restore of the PMU state
++	 */
++	if (new_set == set)
++		goto skip_same_set;
++
++	if (is_active) {
++		pfm_arch_stop(current, ctx);
++		pfm_save_pmds(ctx, set);
++		/*
++		 * compute elapsed ns for active set
++		 */
++		set->duration += now - set->duration_start;
++	}
++
++	pfm_arch_restore_pmds(ctx, new_set);
++	/*
++	 * if masked, we must restore the pmcs such that they
++	 * do not capture anything.
++	 */
++	pfm_arch_restore_pmcs(ctx, new_set);
++
++	if (new_set->npend_ovfls) {
++		pfm_arch_resend_irq(ctx);
++		pfm_stats_inc(ovfl_intr_replay_count);
++	}
++
++	new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
++
++skip_same_set:
++	new_set->runs++;
++	/*
++	 * reset switch threshold
++	 */
++	if (new_flags & PFM_SETFL_OVFL_SWITCH)
++		pfm_reload_switch_thresholds(ctx, new_set);
++
++	/*
++	 * reset overflowed PMD registers in new set
++	 */
++	nn = bitmap_weight(cast_ulp(new_set->reset_pmds), ctx->regs.max_pmd);
++	if (nn)
++		pfm_reset_pmds(ctx, new_set, nn, reset_mode);
++
++
++	/*
++	 * This is needed when coming from pfm_start()
++	 *
++	 * When switching to the same set, there is no
++	 * need to restart
++	 */
++	if (no_restart)
++		goto skip_restart;
++
++	if (is_active) {
++		/*
++		 * do not need to restart when same set
++		 */
++		if (new_set != set) {
++			ctx->active_set = new_set;
++			new_set->duration_start = now;
++			pfm_arch_start(current, ctx);
++		}
++		/*
++		 * install new timeout if necessary. The timer itself is
++		 * not started here: it is only forwarded, and the
++		 * HRTIMER_RESTART return value is left to the caller
++		 */
++		if (new_flags & PFM_SETFL_TIME_SWITCH) {
++			struct hrtimer *h;
++			h = &__get_cpu_var(pfm_hrtimer);
++			hrtimer_forward(h, h->base->get_time(), new_set->hrtimer_exp);
++			new_set->hrtimer_rem = new_set->hrtimer_exp;
++			ret = HRTIMER_RESTART;
++		}
++	}
++
++skip_restart:
++	ctx->active_set = new_set;
++
++	end = sched_clock();
++
++	pfm_stats_inc(set_switch_count);
++	pfm_stats_add(set_switch_ns, end - now);
++
++	return ret;
++}
++
++/*
++ * called from __pfm_overflow_handler() to switch event sets.
++ * monitoring is stopped, task is current, interrupts are masked.
++ * compared to pfm_switch_sets(), this version is simplified because
++ * it knows about the call path. There is no need to stop monitoring
++ * because it is already frozen by PMU handler.
++ */ ++void pfm_switch_sets_from_intr(struct pfm_context *ctx) ++{ ++ struct pfm_event_set *set, *new_set; ++ u64 now, end; ++ u32 new_flags; ++ int is_system, n; ++ ++ now = sched_clock(); ++ set = ctx->active_set; ++ new_set = list_first_entry(&set->list, struct pfm_event_set, list); ++ if (&new_set->list == &ctx->set_list) ++ new_set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); ++ ++ PFM_DBG_ovfl("state=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u " ++ "next_runs=%llu new_npend=%d new_r_pmds=%llx", ++ ctx->state, ++ set->id, ++ (unsigned long long)set->runs, ++ set->npend_ovfls, ++ new_set->id, ++ (unsigned long long)new_set->runs, ++ new_set->npend_ovfls, ++ (unsigned long long)new_set->reset_pmds[0]); ++ ++ is_system = ctx->flags.system; ++ new_flags = new_set->flags; ++ ++ /* ++ * nothing more to do ++ */ ++ if (new_set == set) ++ goto skip_same_set; ++ ++ /* ++ * switch on intr only when set has OVFL_SWITCH ++ */ ++ BUG_ON(set->flags & PFM_SETFL_TIME_SWITCH); ++ ++ /* ++ * when called from PMU intr handler, monitoring ++ * is already stopped ++ * ++ * save current PMD registers, we use a special ++ * form for performance reason. On some architectures, ++ * such as x86, the pmds are already saved when entering ++ * the PMU interrupt handler via pfm-arch_intr_freeze() ++ * so we don't need to save them again. On the contrary, ++ * on IA-64, they are not saved by freeze, thus we have to ++ * to it here. ++ */ ++ pfm_arch_save_pmds_from_intr(ctx, set); ++ ++ /* ++ * compute elapsed ns for active set ++ */ ++ set->duration += now - set->duration_start; ++ ++ pfm_arch_restore_pmds(ctx, new_set); ++ ++ /* ++ * must not be restored active as we are still executing in the ++ * PMU interrupt handler. activation is deferred to unfreeze PMU ++ */ ++ pfm_arch_restore_pmcs(ctx, new_set); ++ ++ /* ++ * check for pending interrupt on incoming set. 
++ * interrupts are masked so handler call deferred ++ */ ++ if (new_set->npend_ovfls) { ++ pfm_arch_resend_irq(ctx); ++ pfm_stats_inc(ovfl_intr_replay_count); ++ } ++ /* ++ * no need to restore anything, that is already done ++ */ ++ new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; ++ /* ++ * reset duration counter ++ */ ++ new_set->duration_start = now; ++ ++skip_same_set: ++ new_set->runs++; ++ ++ /* ++ * reset switch threshold ++ */ ++ if (new_flags & PFM_SETFL_OVFL_SWITCH) ++ pfm_reload_switch_thresholds(ctx, new_set); ++ ++ /* ++ * reset overflowed PMD registers ++ */ ++ n = bitmap_weight(cast_ulp(new_set->reset_pmds), ctx->regs.max_pmd); ++ if (n) ++ pfm_reset_pmds(ctx, new_set, n, PFM_PMD_RESET_SHORT); ++ ++ /* ++ * XXX: isactive? ++ * ++ * Came here following a interrupt which triggered a switch, i.e., ++ * previous set was using OVFL_SWITCH, thus we just need to arm ++ * check if the next set is using timeout, and if so arm the timer. ++ * ++ * Timeout is always at least one tick away. No risk of having to ++ * invoke the timeout handler right now. In any case, cb_mode is ++ * set to HRTIMER_CB_IRQSAFE_NO_SOFTIRQ such that hrtimer_start ++ * will not try to wakeup the softirqd which could cause a locking ++ * problem. 
++ */ ++ if (new_flags & PFM_SETFL_TIME_SWITCH) { ++ hrtimer_start(&__get_cpu_var(pfm_hrtimer), set->hrtimer_exp, HRTIMER_MODE_REL); ++ PFM_DBG("armed new timeout for set%u", new_set->id); ++ } ++ ++ ctx->active_set = new_set; ++ ++ end = sched_clock(); ++ ++ pfm_stats_inc(set_switch_count); ++ pfm_stats_add(set_switch_ns, end - now); ++} ++ ++ ++static int pfm_setfl_sane(struct pfm_context *ctx, u32 flags) ++{ ++#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH) ++ int ret; ++ ++ ret = pfm_arch_setfl_sane(ctx, flags); ++ if (ret) ++ return ret; ++ ++ if ((flags & PFM_SETFL_BOTH_SWITCH) == PFM_SETFL_BOTH_SWITCH) { ++ PFM_DBG("both switch ovfl and switch time are set"); ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++/* ++ * it is never possible to change the identification of an existing set ++ */ ++static int pfm_change_evtset(struct pfm_context *ctx, ++ struct pfm_event_set *set, ++ struct pfarg_setdesc *req) ++{ ++ struct timeval tv; ++ struct timespec ts; ++ ktime_t kt; ++ long d, res_ns; ++ s32 rem; ++ u32 flags; ++ int ret; ++ u16 set_id; ++ ++ BUG_ON(ctx->state == PFM_CTX_LOADED); ++ ++ set_id = req->set_id; ++ flags = req->set_flags; ++ ++ ret = pfm_setfl_sane(ctx, flags); ++ if (ret) { ++ PFM_DBG("invalid flags 0x%x set %u", flags, set_id); ++ return -EINVAL; ++ } ++ ++ /* ++ * compute timeout value ++ */ ++ if (flags & PFM_SETFL_TIME_SWITCH) { ++ /* ++ * timeout value of zero is illegal ++ */ ++ if (req->set_timeout == 0) { ++ PFM_DBG("invalid timeout 0"); ++ return -EINVAL; ++ } ++ ++ hrtimer_get_res(CLOCK_MONOTONIC, &ts); ++ res_ns = (long)ktime_to_ns(timespec_to_ktime(ts)); ++ ++ /* ++ * round-up to multiple of clock resolution ++ * timeout = ((req->set_timeout+res_ns-1)/res_ns)*res_ns; ++ * ++ * u64 division missing on 32-bit arch, so use div_s64_rem ++ */ ++ d = div_s64_rem(req->set_timeout, res_ns, &rem); ++ ++ PFM_DBG("set%u flags=0x%x req_timeout=%lluns " ++ "HZ=%u TICK_NSEC=%lu clock_res=%ldns rem=%dns", ++ set_id, ++ 
flags, ++ (unsigned long long)req->set_timeout, ++ HZ, TICK_NSEC, ++ res_ns, ++ rem); ++ ++ /* ++ * Only accept timeout, we can actually achieve. ++ * users can invoke clock_getres(CLOCK_MONOTONIC) ++ * to figure out resolution and adjust timeout ++ */ ++ if (rem) { ++ PFM_DBG("set%u invalid timeout=%llu", ++ set_id, ++ (unsigned long long)req->set_timeout); ++ return -EINVAL; ++ } ++ ++ tv = ns_to_timeval(req->set_timeout); ++ kt = timeval_to_ktime(tv); ++ set->hrtimer_exp = kt; ++ } else { ++ set->hrtimer_exp = ktime_set(0, 0); ++ } ++ ++ /* ++ * commit changes ++ */ ++ set->id = set_id; ++ set->flags = flags; ++ set->priv_flags = 0; ++ ++ /* ++ * activation and duration counters are reset as ++ * most likely major things will change in the set ++ */ ++ set->runs = 0; ++ set->duration = 0; ++ ++ return 0; ++} ++ ++/* ++ * this function does not modify the next field ++ */ ++static void pfm_initialize_set(struct pfm_context *ctx, ++ struct pfm_event_set *set) ++{ ++ u64 *impl_pmcs; ++ u16 i, max_pmc; ++ ++ max_pmc = ctx->regs.max_pmc; ++ impl_pmcs = ctx->regs.pmcs; ++ ++ /* ++ * install default values for all PMC registers ++ */ ++ for (i = 0; i < max_pmc; i++) { ++ if (test_bit(i, cast_ulp(impl_pmcs))) { ++ set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val; ++ PFM_DBG("set%u pmc%u=0x%llx", ++ set->id, ++ i, ++ (unsigned long long)set->pmcs[i]); ++ } ++ } ++ ++ /* ++ * PMD registers are set to 0 when the event set is allocated, ++ * hence we do not need to explicitly initialize them. ++ * ++ * For virtual PMD registers (i.e., those tied to a SW resource) ++ * their value becomes meaningful once the context is attached. ++ */ ++} ++ ++/* ++ * look for an event set using its identification. If the set does not ++ * exist: ++ * - if alloc == 0 then return error ++ * - if alloc == 1 then allocate set ++ * ++ * alloc is one ONLY when coming from pfm_create_evtsets() which can only ++ * be called when the context is detached, i.e. monitoring is stopped. 
++ */
++struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, int alloc)
++{
++	struct pfm_event_set *set = NULL, *prev, *new_set;
++
++	PFM_DBG("looking for set=%u", set_id);
++
++	prev = NULL;
++	list_for_each_entry(set, &ctx->set_list, list) {
++		if (set->id == set_id)
++			return set;
++		if (set->id > set_id) /* list is walked in increasing id order: insertion point found */
++			break;
++		prev = set;
++	}
++
++	if (!alloc)
++		return NULL;
++
++	/*
++	 * we are holding the context spinlock and interrupts
++	 * are unmasked. We must use GFP_ATOMIC as we cannot
++	 * sleep while holding a spin lock.
++	 *
++	 * The set is zero-filled by the allocator. NULL is returned
++	 * on allocation failure, i.e. the same value as "not found".
++	 */
++	new_set = kmem_cache_zalloc(pfm_set_cachep, GFP_ATOMIC);
++	if (!new_set)
++		return NULL;
++
++	new_set->id = set_id;
++
++	INIT_LIST_HEAD(&new_set->list);
++
++	if (prev == NULL) { /* no smaller id exists: new set becomes the first set */
++		list_add(&(new_set->list), &ctx->set_list);
++	} else {
++		PFM_DBG("add after set=%u", prev->id);
++		list_add(&(new_set->list), &prev->list);
++	}
++	return new_set;
++}
++
++/**
++ * pfm_create_initial_set - create initial set from __pfm_create_context
++ * @ctx: context to attach the set to
++ *
++ * Makes sure set 0 exists (allocating it if needed) and installs the
++ * default PMC values into the first set of the list.
++ *
++ * Returns 0 on success, -ENOMEM when pfm_find_set() returns NULL.
++ */
++int pfm_create_initial_set(struct pfm_context *ctx)
++{
++	struct pfm_event_set *set;
++
++	/*
++	 * create initial set0
++	 */
++	if (!pfm_find_set(ctx, 0, 1))
++		return -ENOMEM;
++
++	set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); /* set 0 has the smallest possible id, hence it is first */
++
++	pfm_initialize_set(ctx, set);
++
++	return 0;
++}
++
++/*
++ * context is unloaded for this command.
Interrupts are enabled ++ */ ++int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req, ++ int count) ++{ ++ struct pfm_event_set *set; ++ u16 set_id; ++ int i, ret; ++ ++ for (i = 0; i < count; i++, req++) { ++ set_id = req->set_id; ++ ++ PFM_DBG("set_id=%u", set_id); ++ ++ set = pfm_find_set(ctx, set_id, 1); ++ if (set == NULL) ++ goto error_mem; ++ ++ ret = pfm_change_evtset(ctx, set, req); ++ if (ret) ++ goto error_params; ++ ++ pfm_initialize_set(ctx, set); ++ } ++ return 0; ++error_mem: ++ PFM_DBG("cannot allocate set %u", set_id); ++ return -ENOMEM; ++error_params: ++ return ret; ++} ++ ++int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req, ++ int count) ++{ ++ struct pfm_event_set *set; ++ int i, is_system, is_loaded, is_self, ret; ++ u16 set_id; ++ u64 end; ++ ++ end = sched_clock(); ++ ++ is_system = ctx->flags.system; ++ is_loaded = ctx->state == PFM_CTX_LOADED; ++ is_self = ctx->task == current || is_system; ++ ++ ret = -EINVAL; ++ for (i = 0; i < count; i++, req++) { ++ ++ set_id = req->set_id; ++ ++ list_for_each_entry(set, &ctx->set_list, list) { ++ if (set->id == set_id) ++ goto found; ++ if (set->id > set_id) ++ goto error; ++ } ++found: ++ req->set_flags = set->flags; ++ ++ /* ++ * compute leftover timeout ++ * ++ * lockdep may complain about lock inversion ++ * because of get_remaining() however, this ++ * applies to self-montoring only, thus the ++ * thread cannot be in the timeout handler ++ * and here at the same time given that we ++ * run with interrupts disabled ++ */ ++ if (is_loaded && is_self) { ++ struct hrtimer *h; ++ h = &__get_cpu_var(pfm_hrtimer); ++ req->set_timeout = ktime_to_ns(hrtimer_get_remaining(h)); ++ } else { ++ /* ++ * hrtimer_rem zero when not using ++ * timeout-based switching ++ */ ++ req->set_timeout = ktime_to_ns(set->hrtimer_rem); ++ } ++ ++ req->set_runs = set->runs; ++ req->set_act_duration = set->duration; ++ ++ /* ++ * adjust for active set if needed ++ */ ++ if 
(is_system && is_loaded && ctx->flags.started ++ && set == ctx->active_set) ++ req->set_act_duration += end - set->duration_start; ++ ++ /* ++ * copy the list of pmds which last overflowed ++ */ ++ bitmap_copy(cast_ulp(req->set_ovfl_pmds), ++ cast_ulp(set->ovfl_pmds), ++ PFM_MAX_PMDS); ++ ++ /* ++ * copy bitmask of available PMU registers ++ * ++ * must copy over the entire vector to avoid ++ * returning bogus upper bits pass by user ++ */ ++ bitmap_copy(cast_ulp(req->set_avail_pmcs), ++ cast_ulp(ctx->regs.pmcs), ++ PFM_MAX_PMCS); ++ ++ bitmap_copy(cast_ulp(req->set_avail_pmds), ++ cast_ulp(ctx->regs.pmds), ++ PFM_MAX_PMDS); ++ ++ PFM_DBG("set%u flags=0x%x eff_usec=%llu runs=%llu " ++ "a_pmcs=0x%llx a_pmds=0x%llx", ++ set_id, ++ set->flags, ++ (unsigned long long)req->set_timeout, ++ (unsigned long long)set->runs, ++ (unsigned long long)ctx->regs.pmcs[0], ++ (unsigned long long)ctx->regs.pmds[0]); ++ } ++ ret = 0; ++error: ++ return ret; ++} ++ ++/* ++ * context is unloaded for this command. Interrupts are enabled ++ */ ++int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count) ++{ ++ struct pfarg_setdesc *req = arg; ++ struct pfm_event_set *set; ++ u16 set_id; ++ int i, ret; ++ ++ ret = -EINVAL; ++ for (i = 0; i < count; i++, req++) { ++ set_id = req->set_id; ++ ++ list_for_each_entry(set, &ctx->set_list, list) { ++ if (set->id == set_id) ++ goto found; ++ if (set->id > set_id) ++ goto error; ++ } ++ goto error; ++found: ++ /* ++ * clear active set if necessary. 
++ * will be updated when context is loaded ++ */ ++ if (set == ctx->active_set) ++ ctx->active_set = NULL; ++ ++ list_del(&set->list); ++ ++ kmem_cache_free(pfm_set_cachep, set); ++ ++ PFM_DBG("set%u deleted", set_id); ++ } ++ ret = 0; ++error: ++ return ret; ++} ++ ++/* ++ * called from pfm_context_free() to free all sets ++ */ ++void pfm_free_sets(struct pfm_context *ctx) ++{ ++ struct pfm_event_set *set, *tmp; ++ ++ list_for_each_entry_safe(set, tmp, &ctx->set_list, list) { ++ list_del(&set->list); ++ kmem_cache_free(pfm_set_cachep, set); ++ } ++} ++ ++/** ++ * pfm_restart_timer - restart hrtimer taking care of expired timeout ++ * @ctx : context to work with ++ * @set : current active set ++ * ++ * Must be called on the processor on which the timer is to be armed. ++ * Assumes context is locked and interrupts are masked ++ * ++ * Upon return the active set for the context may have changed ++ */ ++void pfm_restart_timer(struct pfm_context *ctx, struct pfm_event_set *set) ++{ ++ struct hrtimer *h; ++ enum hrtimer_restart ret; ++ ++ h = &__get_cpu_var(pfm_hrtimer); ++ ++ PFM_DBG_ovfl("hrtimer=%lld", (long long)ktime_to_ns(set->hrtimer_rem)); ++ ++ if (ktime_to_ns(set->hrtimer_rem) > 0) { ++ hrtimer_start(h, set->hrtimer_rem, HRTIMER_MODE_REL); ++ } else { ++ /* ++ * timer was not re-armed because it has already expired ++ * timer was not enqueued, we need to switch set now ++ */ ++ pfm_stats_inc(set_switch_exp); ++ ++ ret = pfm_switch_sets(ctx, NULL, 1, 0); ++ set = ctx->active_set; ++ if (ret == HRTIMER_RESTART) ++ hrtimer_start(h, set->hrtimer_rem, HRTIMER_MODE_REL); ++ } ++} ++ ++int __init pfm_init_sets(void) ++{ ++ pfm_set_cachep = kmem_cache_create("pfm_event_set", ++ sizeof(struct pfm_event_set), ++ SLAB_HWCACHE_ALIGN, 0, NULL); ++ if (!pfm_set_cachep) { ++ PFM_ERR("cannot initialize event set slab"); ++ return -ENOMEM; ++ } ++ return 0; ++} +diff --git a/perfmon/perfmon_smpl.c b/perfmon/perfmon_smpl.c +new file mode 100644 +index 0000000..e31fb15 +--- 
/dev/null ++++ b/perfmon/perfmon_smpl.c +@@ -0,0 +1,865 @@ ++/* ++ * perfmon_smpl.c: perfmon2 sampling management ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/module.h> ++#include <linux/kernel.h> ++#include <linux/vmalloc.h> ++#include <linux/fs.h> ++#include <linux/mm.h> ++#include <linux/random.h> ++#include <linux/uaccess.h> ++#include <linux/perfmon_kern.h> ++ ++#include "perfmon_priv.h" ++ ++/** ++ * pfm_smpl_buf_alloc - allocate memory for sampling buffer ++ * @ctx: context to operate on ++ * @rsize: requested size ++ * ++ * called from pfm_smpl_buffer_alloc_old() (IA64-COMPAT) ++ * and pfm_setup_smpl_fmt() ++ * ++ * interrupts are enabled, context is not locked. ++ * ++ * function is not static because it is called from the IA-64 ++ * compatibility module (perfmon_compat.c) ++ */ ++int pfm_smpl_buf_alloc(struct pfm_context *ctx, size_t rsize) ++{ ++#if PFM_ARCH_SMPL_ALIGN_SIZE > 0 ++#define PFM_ALIGN_SMPL(a, f) (void *)((((unsigned long)(a))+(f-1)) & ~(f-1)) ++#else ++#define PFM_ALIGN_SMPL(a, f) (a) ++#endif ++ void *addr, *real_addr; ++ size_t size, real_size; ++ int ret; ++ ++ might_sleep(); ++ ++ /* ++ * align page boundary ++ */ ++ size = PAGE_ALIGN(rsize); ++ ++ /* ++ * On some arch, it may be necessary to get an alignment greater ++ * than page size to avoid certain cache effects (e.g., MIPS). ++ * This is the reason for PFM_ARCH_SMPL_ALIGN_SIZE. ++ */ ++ real_size = size + PFM_ARCH_SMPL_ALIGN_SIZE; ++ ++ PFM_DBG("req_size=%zu size=%zu real_size=%zu", ++ rsize, ++ size, ++ real_size); ++ ++ ret = pfm_smpl_buf_space_acquire(ctx, real_size); ++ if (ret) ++ return ret; ++ ++ /* ++ * vmalloc can sleep. 
we do not hold ++ * any spinlock and interrupts are enabled ++ */ ++ real_addr = addr = vmalloc(real_size); ++ if (!real_addr) { ++ PFM_DBG("cannot allocate sampling buffer"); ++ goto unres; ++ } ++ ++ /* ++ * align the useable sampling buffer address to the arch requirement ++ * This is a nop on most architectures ++ */ ++ addr = PFM_ALIGN_SMPL(real_addr, PFM_ARCH_SMPL_ALIGN_SIZE); ++ ++ memset(addr, 0, real_size); ++ ++ /* ++ * due to cache aliasing, it may be necessary to flush the pages ++ * on certain architectures (e.g., MIPS) ++ */ ++ pfm_cacheflush(addr, real_size); ++ ++ /* ++ * what needs to be freed ++ */ ++ ctx->smpl_real_addr = real_addr; ++ ctx->smpl_real_size = real_size; ++ ++ /* ++ * what is actually available to user ++ */ ++ ctx->smpl_addr = addr; ++ ctx->smpl_size = size; ++ ++ PFM_DBG("addr=%p real_addr=%p", addr, real_addr); ++ ++ return 0; ++unres: ++ /* ++ * smpl_addr is NULL, no double freeing possible in pfm_context_free() ++ */ ++ pfm_smpl_buf_space_release(ctx, real_size); ++ ++ return -ENOMEM; ++} ++ ++/** ++ * pfm_smpl_buf_free - free resources associated with sampling ++ * @ctx: context to operate on ++ */ ++void pfm_smpl_buf_free(struct pfm_context *ctx) ++{ ++ struct pfm_smpl_fmt *fmt; ++ ++ fmt = ctx->smpl_fmt; ++ ++ /* ++ * some formats may not use a buffer, yet they may ++ * need to be called on exit ++ */ ++ if (fmt) { ++ if (fmt->fmt_exit) ++ (*fmt->fmt_exit)(ctx->smpl_addr); ++ /* ++ * decrease refcount of sampling format ++ */ ++ pfm_smpl_fmt_put(fmt); ++ } ++ ++ if (ctx->smpl_addr) { ++ pfm_smpl_buf_space_release(ctx, ctx->smpl_real_size); ++ ++ PFM_DBG("free buffer real_addr=0x%p real_size=%zu", ++ ctx->smpl_real_addr, ++ ctx->smpl_real_size); ++ ++ vfree(ctx->smpl_real_addr); ++ } ++} ++ ++/** ++ * pfm_setup_smpl_fmt - initialization of sampling format and buffer ++ * @ctx: context to operate on ++ * @fmt_arg: smapling format arguments ++ * @ctx_flags: context flags as passed by user ++ * @filp: file descriptor associated 
with context
++ *
++ * called from __pfm_create_context()
++ */
++int pfm_setup_smpl_fmt(struct pfm_context *ctx, u32 ctx_flags, void *fmt_arg,
++		       struct file *filp)
++{
++	struct pfm_smpl_fmt *fmt;
++	size_t size = 0;
++	int ret = 0;
++
++	fmt = ctx->smpl_fmt; /* NOTE(review): dereferenced below without a NULL check - confirm callers always set it */
++
++	/*
++	 * validate parameters
++	 */
++	if (fmt->fmt_validate) {
++		ret = (*fmt->fmt_validate)(ctx_flags,
++					   ctx->regs.num_pmds,
++					   fmt_arg);
++		PFM_DBG("validate(0x%x,%p)=%d", ctx_flags, fmt_arg, ret);
++		if (ret)
++			goto error;
++	}
++
++	/*
++	 * check if buffer format needs buffer allocation.
++	 * size stays 0 when the format has no fmt_getsize callback,
++	 * in which case no buffer is allocated
++	 */
++	size = 0;
++	if (fmt->fmt_getsize) {
++		ret = (*fmt->fmt_getsize)(ctx_flags, fmt_arg, &size);
++		if (ret) {
++			PFM_DBG("cannot get size ret=%d", ret);
++			goto error;
++		}
++	}
++
++	/*
++	 * allocate buffer
++	 * v20_compat is for IA-64 backward compatibility with perfmon v2.0
++	 */
++	if (size) {
++#ifdef CONFIG_IA64_PERFMON_COMPAT
++		/*
++		 * backward compatibility with perfmon v2.0 on IA-64
++		 */
++		if (ctx->flags.ia64_v20_compat)
++			ret = pfm_smpl_buf_alloc_compat(ctx, size, filp);
++		else
++#endif
++			ret = pfm_smpl_buf_alloc(ctx, size);
++
++		if (ret)
++			goto error;
++
++	}
++
++	if (fmt->fmt_init) {
++		ret = (*fmt->fmt_init)(ctx, ctx->smpl_addr, ctx_flags,
++				       ctx->regs.num_pmds,
++				       fmt_arg);
++	}
++	/*
++	 * if there was an error, the buffer/resource will be freed
++	 * via pfm_context_free(); nothing is released here
++	 */
++error:
++	return ret;
++}
++/*
++ * pfm_mask_monitoring - save the PMDs of the set, mask monitoring and
++ * move the context to the MASKED state. The time spent in the set so
++ * far is added to set->duration.
++ */
++void pfm_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set)
++{
++	u64 now;
++
++	now = sched_clock();
++
++	/*
++	 * we save the PMD values such that we can read them while
++	 * MASKED without having the thread stopped
++	 * because monitoring is stopped
++	 *
++	 * pfm_save_pmds() could be avoided if we knew
++	 * that pfm_arch_intr_freeze() had saved them already
++	 */
++	pfm_save_pmds(ctx, set);
++	pfm_arch_mask_monitoring(ctx, set);
++	/*
++	 * accumulate the set duration up to this point
++	 */
++	set->duration += now - set->duration_start;
++
++	ctx->state = PFM_CTX_MASKED;
++
++	/*
++	 * need to stop timer and remember remaining time
++	 * will be reloaded in pfm_unmask_monitoring
++	 * hrtimer is cancelled in the tail of the interrupt
++	 * handler once the context is unlocked
++	 *
++	 * NOTE(review): the statement above does not match the code
++	 * below, which cancels the per-cpu timer right here and reads
++	 * the remaining time after the cancel - confirm which of the
++	 * two is intended
++	 */
++	if (set->flags & PFM_SETFL_TIME_SWITCH) {
++		struct hrtimer *h = &__get_cpu_var(pfm_hrtimer);
++		hrtimer_cancel(h);
++		set->hrtimer_rem = hrtimer_get_remaining(h);
++	}
++	PFM_DBG_ovfl("can_restart=%u", ctx->flags.can_restart);
++}
++
++/**
++ * pfm_unmask_monitoring - unmask monitoring
++ * @ctx: context to work with
++ * @set: current active set
++ *
++ * interrupts are masked when entering this function.
++ * context must be in MASKED state when calling; the function returns
++ * without doing anything for any other state.
++ *
++ * Upon return, the active set may have changed when using timeout
++ * based switching.
++ */
++static void pfm_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set)
++{
++	if (ctx->state != PFM_CTX_MASKED)
++		return;
++
++	PFM_DBG_ovfl("unmasking monitoring");
++
++	/*
++	 * must be done before calling
++	 * pfm_arch_unmask_monitoring()
++	 */
++	ctx->state = PFM_CTX_LOADED;
++
++	/*
++	 * we need to restore the PMDs because they
++	 * may have been modified by user while MASKED in
++	 * which case the actual registers have not yet
++	 * been updated
++	 */
++	pfm_arch_restore_pmds(ctx, set);
++
++	/*
++	 * call arch specific handler
++	 */
++	pfm_arch_unmask_monitoring(ctx, set);
++
++	/*
++	 * clear force reload flag. May have been set
++	 * in pfm_write_pmcs or pfm_write_pmds
++	 */
++	set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH;
++
++	/*
++	 * reset set duration timer
++	 */
++	set->duration_start = sched_clock();
++
++	/*
++	 * restart hrtimer if needed
++	 */
++	if (set->flags & PFM_SETFL_TIME_SWITCH) {
++		pfm_restart_timer(ctx, set);
++		/* careful here as pfm_restart_timer may switch sets */
++	}
++}
++/*
++ * pfm_reset_pmds - reload the PMDs marked in set->reset_pmds
++ *
++ * num_pmds is the number of bits set in set->reset_pmds (callers
++ * compute it with bitmap_weight()); the scan stops once that many
++ * marked registers have been processed. reset_mode selects the
++ * long_reset value for PFM_PMD_RESET_LONG and the short_reset value
++ * for any other mode. The hardware register is written only when the
++ * context is not MASKED.
++ */
++void pfm_reset_pmds(struct pfm_context *ctx,
++		    struct pfm_event_set *set,
++		    int num_pmds,
++		    int reset_mode)
++{
++	u64 val, mask, new_seed;
++	struct pfm_pmd *reg;
++	unsigned int i, not_masked;
++
++	not_masked = ctx->state != PFM_CTX_MASKED;
++
++	PFM_DBG_ovfl("%s r_pmds=0x%llx not_masked=%d",
++		reset_mode == PFM_PMD_RESET_LONG ? "long" : "short",
++		(unsigned long long)set->reset_pmds[0],
++		not_masked);
++
++	pfm_stats_inc(reset_pmds_count);
++
++	for (i = 0; num_pmds; i++) {
++		if (test_bit(i, cast_ulp(set->reset_pmds))) {
++			num_pmds--;
++
++			reg = set->pmds + i;
++
++			val = reset_mode == PFM_PMD_RESET_LONG ?
++			       reg->long_reset : reg->short_reset;
++
++			if (reg->flags & PFM_REGFL_RANDOM) {
++				mask = reg->mask;
++				new_seed = random32();
++
++				/* construct a full 64-bit random value: */
++				if ((unlikely(mask >> 32) != 0))
++					new_seed |= (u64)random32() << 32;
++
++				/* counter values are negative numbers! */
++				val -= (new_seed & mask);
++			}
++
++			set->pmds[i].value = val;
++			reg->lval = val;
++
++			/*
++			 * not all PMD to reset are necessarily
++			 * counters
++			 */
++			if (not_masked)
++				pfm_write_pmd(ctx, i, val);
++
++			PFM_DBG_ovfl("set%u pmd%u sval=0x%llx",
++					set->id,
++					i,
++					(unsigned long long)val);
++		}
++	}
++
++	/*
++	 * done with reset: i is one past the last marked register
++	 * that was processed, so clearing bits [0, i) clears every
++	 * bit handled above
++	 */
++	bitmap_zero(cast_ulp(set->reset_pmds), i);
++
++	/*
++	 * make changes visible
++	 */
++	if (not_masked)
++		pfm_arch_serialize();
++}
++
++/*
++ * called from pfm_handle_work() and __pfm_restart()
++ * for system-wide and per-thread context to resume
++ * monitoring after a user level notification.
++ *
++ * In both cases, the context is locked and interrupts
++ * are disabled.
++ */
++void pfm_resume_after_ovfl(struct pfm_context *ctx)
++{
++	struct pfm_smpl_fmt *fmt;
++	u32 rst_ctrl;
++	struct pfm_event_set *set;
++	u64 *reset_pmds;
++	void *hdr;
++	int state, ret;
++
++	hdr = ctx->smpl_addr;
++	fmt = ctx->smpl_fmt;
++	state = ctx->state;
++	set = ctx->active_set;
++	ret = 0;
++
++	if (hdr) {
++		rst_ctrl = 0; /* with a buffer, the restart callback decides what to reset or mask */
++		prefetch(hdr);
++	} else {
++		rst_ctrl = PFM_OVFL_CTRL_RESET; /* no buffer: default to resetting the PMDs */
++	}
++
++	/*
++	 * if using a sampling buffer format and it has a restart callback,
++	 * then invoke it. hdr may be NULL, if the format does not use a
++	 * perfmon buffer
++	 *
++	 * the callback may update rst_ctrl; a non-zero return value
++	 * skips all the resume work below
++	 */
++	if (fmt && fmt->fmt_restart)
++		ret = (*fmt->fmt_restart)(state == PFM_CTX_LOADED, &rst_ctrl,
++					  hdr);
++
++	reset_pmds = set->reset_pmds;
++
++	PFM_DBG("fmt_restart=%d reset_count=%d set=%u r_pmds=0x%llx switch=%d "
++		"ctx_state=%d",
++		ret,
++		ctx->flags.reset_count,
++		set->id,
++		(unsigned long long)reset_pmds[0],
++		(set->priv_flags & PFM_SETFL_PRIV_SWITCH),
++		state);
++
++	if (!ret) {
++		/*
++		 * switch set if needed. The switch resets the PMDs of
++		 * the new set itself (long reset), hence the explicit
++		 * reset below is only done when no switch is pending
++		 */
++		if (set->priv_flags & PFM_SETFL_PRIV_SWITCH) {
++			set->priv_flags &= ~PFM_SETFL_PRIV_SWITCH;
++			pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_LONG, 0);
++			set = ctx->active_set;
++		} else if (rst_ctrl & PFM_OVFL_CTRL_RESET) {
++			int nn;
++			nn = bitmap_weight(cast_ulp(set->reset_pmds),
++					   ctx->regs.max_pmd);
++			if (nn)
++				pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_LONG);
++		}
++
++		if (!(rst_ctrl & PFM_OVFL_CTRL_MASK))
++			pfm_unmask_monitoring(ctx, set);
++		else
++			PFM_DBG("stopping monitoring?");
++		ctx->state = PFM_CTX_LOADED; /* NOTE(review): also executed when PFM_OVFL_CTRL_MASK is set and nothing was unmasked - confirm this is intended */
++	}
++}
++
++/*
++ * This function is called when we need to perform asynchronous
++ * work on a context. This function is called ONLY when about to
++ * return to user mode (very much like with signal handling).
++ *
++ * There are several reasons why we come here:
++ *
++ * - per-thread mode, not self-monitoring, to reset the counters
++ * after a pfm_restart()
++ *
++ * - we are zombie and we need to cleanup our state
++ *
++ * - we need to block after an overflow notification
++ * on a context with the PFM_OVFL_NOTIFY_BLOCK flag
++ *
++ * This function is never called for a system-wide context.
++ *
++ * pfm_handle_work() can be called with interrupts enabled
++ * (TIF_NEED_RESCHED) or disabled. The down_interruptible
++ * call may sleep, therefore we must re-enable interrupts
++ * to avoid deadlocks. It is safe to do so because this function
++ * is called ONLY when returning to user level, in which case
++ * there is no risk of kernel stack overflow due to deep
++ * interrupt nesting.
++ */
++void pfm_handle_work(struct pt_regs *regs)
++{
++	struct pfm_context *ctx;
++	unsigned long flags, dummy_flags;
++	int type, ret, info;
++
++#ifdef CONFIG_PPC
++	/*
++	 * This is just a temporary fix. Obviously we'd like to fix the powerpc
++	 * code to make that check before calling __pfm_handle_work() to
++	 * prevent the function call overhead, but the call is made from
++	 * assembly code, so it will take a little while to figure out how to
++	 * perform the check correctly.
++	 */
++	if (!test_thread_flag(TIF_PERFMON_WORK))
++		return;
++#endif
++
++	if (!user_mode(regs))
++		return;
++
++	clear_thread_flag(TIF_PERFMON_WORK);
++
++	pfm_stats_inc(handle_work_count);
++
++	ctx = current->pfm_context;
++	if (ctx == NULL) {
++		PFM_DBG("[%d] has no ctx", current->pid);
++		return;
++	}
++
++	BUG_ON(ctx->flags.system);
++
++	spin_lock_irqsave(&ctx->lock, flags);
++
++	type = ctx->flags.work_type; /* consume the pending work request */
++	ctx->flags.work_type = PFM_WORK_NONE;
++
++	PFM_DBG("work_type=%d reset_count=%d",
++		type,
++		ctx->flags.reset_count);
++
++	switch (type) {
++	case PFM_WORK_ZOMBIE:
++		goto do_zombie;
++	case PFM_WORK_RESET:
++		/* simply reset, no blocking */
++		goto skip_blocking;
++	case PFM_WORK_NONE:
++		PFM_DBG("unexpected PFM_WORK_NONE");
++		goto nothing_todo;
++	case PFM_WORK_BLOCK:
++		break;
++	default:
++		PFM_DBG("unkown type=%d", type);
++		goto nothing_todo;
++	}
++
++	/*
++	 * PFM_WORK_BLOCK only from here on.
++	 *
++	 * restore interrupt mask to what it was on entry.
++	 * Could be enabled/disabled.
++	 */
++	spin_unlock_irqrestore(&ctx->lock, flags);
++
++	/*
++	 * force interrupt enable because the wait below may sleep
++	 */
++	local_irq_enable();
++
++	PFM_DBG("before block sleeping");
++
++	/*
++	 * may go through without blocking on SMP systems
++	 * if restart has been received already by the time we wait
++	 * on the completion
++	 */
++	ret = wait_for_completion_interruptible(&ctx->restart_complete);
++
++	PFM_DBG("after block sleeping ret=%d", ret);
++
++	/*
++	 * lock context and mask interrupts again
++	 * We save flags into a dummy because we may have
++	 * altered interrupts mask compared to entry in this
++	 * function. The flags saved on entry are the ones
++	 * restored on every exit path below.
++	 */
++	spin_lock_irqsave(&ctx->lock, dummy_flags);
++
++	if (ctx->state == PFM_CTX_ZOMBIE)
++		goto do_zombie;
++
++	/*
++	 * in case the wait was interrupted we don't restart anything
++	 */
++	if (ret < 0)
++		goto nothing_todo;
++
++skip_blocking:
++	/*
++	 * iterate over the number of pending resets
++	 * There are certain situations where there may be
++	 * multiple notifications sent before a pfm_restart().
++	 * As such, it may be that multiple pfm_restart() are
++	 * issued before the monitored thread gets to
++	 * pfm_handle_work(). To avoid losing restarts, pfm_restart()
++	 * increments a counter (reset_count). Here, we take this
++	 * into account by potentially calling pfm_resume_after_ovfl()
++	 * multiple times. It is up to the sampling format to take the
++	 * appropriate actions.
++	 */
++	while (ctx->flags.reset_count) {
++		pfm_resume_after_ovfl(ctx);
++		/* careful as active set may have changed */
++		ctx->flags.reset_count--;
++	}
++
++nothing_todo:
++	/*
++	 * restore flags as they were upon entry
++	 */
++	spin_unlock_irqrestore(&ctx->lock, flags);
++	return;
++
++do_zombie:
++	PFM_DBG("context is zombie, bailing out");
++
++	__pfm_unload_context(ctx, &info);
++
++	/*
++	 * keep the spinlock check happy: plain unlock, the interrupt
++	 * state is handled explicitly below
++	 */
++	spin_unlock(&ctx->lock);
++
++	/*
++	 * enable interrupt for vfree()
++	 */
++	local_irq_enable();
++
++	/*
++	 * cancel timer now that context is unlocked
++	 * (bit 0x2 of info, as filled in by __pfm_unload_context(),
++	 * requests the cancellation)
++	 */
++	if (info & 0x2) {
++		ret = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));
++		PFM_DBG("timeout cancel=%d", ret);
++	}
++
++	/*
++	 * actual context free
++	 */
++	pfm_free_context(ctx);
++
++	/*
++	 * restore interrupts as they were upon entry
++	 */
++	local_irq_restore(flags);
++
++	/* always true */
++	if (info & 0x1)
++		pfm_session_release(0, 0);
++}
++
++/**
++ * __pfm_restart - resume monitoring after user-level notification
++ * @ctx: context to operate on
++ * @info: return information used to free resource once unlocked
++ *
++ * function called from sys_pfm_restart().
It is used when overflow ++ * notification is requested. For each notification received, the user ++ * must call pfm_restart() to indicate to the kernel that it is done ++ * processing the notification. ++ * ++ * When the caller is doing user level sampling, this function resets ++ * the overflowed counters and resumes monitoring which is normally stopped ++ * during notification (always the consequence of a counter overflow). ++ * ++ * When using a sampling format, the format restart() callback is invoked, ++ * overflowed PMDS may be reset based upon decision from sampling format. ++ * ++ * When operating in per-thread mode, and when not self-monitoring, the ++ * monitored thread DOES NOT need to be stopped, unlike for many other calls. ++ * ++ * This means that the effect of the restart may not necessarily be observed ++ * right when returning from the call. For instance, counters may not already ++ * be reset in the other thread. ++ * ++ * When operating in system-wide, the caller must be running on the monitored ++ * CPU. ++ * ++ * The context is locked and interrupts are disabled. 
++ * ++ * info value upon return: ++ * - bit 0: when set, mudt issue complete() on restart semaphore ++ */ ++int __pfm_restart(struct pfm_context *ctx, int *info) ++{ ++ int state; ++ ++ state = ctx->state; ++ ++ PFM_DBG("state=%d can_restart=%d reset_count=%d", ++ state, ++ ctx->flags.can_restart, ++ ctx->flags.reset_count); ++ ++ *info = 0; ++ ++ switch (state) { ++ case PFM_CTX_MASKED: ++ break; ++ case PFM_CTX_LOADED: ++ if (ctx->smpl_addr && ctx->smpl_fmt->fmt_restart) ++ break; ++ default: ++ PFM_DBG("invalid state=%d", state); ++ return -EBUSY; ++ } ++ ++ /* ++ * first check if allowed to restart, i.e., notifications received ++ */ ++ if (!ctx->flags.can_restart) { ++ PFM_DBG("no restart can_restart=0"); ++ return -EBUSY; ++ } ++ ++ pfm_stats_inc(pfm_restart_count); ++ ++ /* ++ * at this point, the context is either LOADED or MASKED ++ */ ++ ctx->flags.can_restart--; ++ ++ /* ++ * handle self-monitoring case and system-wide ++ */ ++ if (ctx->task == current || ctx->flags.system) { ++ pfm_resume_after_ovfl(ctx); ++ return 0; ++ } ++ ++ /* ++ * restart another task ++ */ ++ ++ /* ++ * if blocking, then post the semaphore if PFM_CTX_MASKED, i.e. ++ * the task is blocked or on its way to block. That's the normal ++ * restart path. If the monitoring is not masked, then the task ++ * can be actively monitoring and we cannot directly intervene. ++ * Therefore we use the trap mechanism to catch the task and ++ * force it to reset the buffer/reset PMDs. ++ * ++ * if non-blocking, then we ensure that the task will go into ++ * pfm_handle_work() before returning to user mode. ++ * ++ * We cannot explicitly reset another task, it MUST always ++ * be done by the task itself. This works for system wide because ++ * the tool that is controlling the session is logically doing ++ * "self-monitoring". 
++ */ ++ if (ctx->flags.block && state == PFM_CTX_MASKED) { ++ PFM_DBG("unblocking [%d]", ctx->task->pid); ++ /* ++ * It is not possible to call complete() with the context locked ++ * otherwise we have a potential deadlock with the PMU context ++ * switch code due to a lock inversion between task_rq_lock() ++ * and the context lock. ++ * Instead we mark whether or not we need to issue the complete ++ * and we invoke the function once the context lock is released ++ * in sys_pfm_restart() ++ */ ++ *info = 1; ++ } else { ++ PFM_DBG("[%d] armed exit trap", ctx->task->pid); ++ pfm_post_work(ctx->task, ctx, PFM_WORK_RESET); ++ } ++ ctx->flags.reset_count++; ++ return 0; ++} ++ ++/** ++ * pfm_get_smpl_arg -- copy user arguments to pfm_create_context() related to sampling format ++ * @name: format name as passed by user ++ * @fmt_arg: format optional argument as passed by user ++ * @uszie: size of structure pass in fmt_arg ++ * @arg: kernel copy of fmt_arg ++ * @fmt: pointer to sampling format upon success ++ * ++ * arg is kmalloc'ed, thus it needs a kfree by caller ++ */ ++int pfm_get_smpl_arg(char __user *fmt_uname, void __user *fmt_uarg, size_t usize, void **arg, ++ struct pfm_smpl_fmt **fmt) ++{ ++ struct pfm_smpl_fmt *f; ++ char *fmt_name; ++ void *addr = NULL; ++ size_t sz; ++ int ret; ++ ++ fmt_name = getname(fmt_uname); ++ if (!fmt_name) { ++ PFM_DBG("getname failed"); ++ return -ENOMEM; ++ } ++ ++ /* ++ * find fmt and increase refcount ++ */ ++ f = pfm_smpl_fmt_get(fmt_name); ++ ++ putname(fmt_name); ++ ++ if (f == NULL) { ++ PFM_DBG("buffer format not found"); ++ return -EINVAL; ++ } ++ ++ /* ++ * expected format argument size ++ */ ++ sz = f->fmt_arg_size; ++ ++ /* ++ * check user size matches expected size ++ * usize = -1 is for IA-64 backward compatibility ++ */ ++ ret = -EINVAL; ++ if (sz != usize && usize != -1) { ++ PFM_DBG("invalid arg size %zu, format expects %zu", ++ usize, sz); ++ goto error; ++ } ++ ++ if (sz) { ++ ret = -ENOMEM; ++ addr = 
kmalloc(sz, GFP_KERNEL); ++ if (addr == NULL) ++ goto error; ++ ++ ret = -EFAULT; ++ if (copy_from_user(addr, fmt_uarg, sz)) ++ goto error; ++ } ++ *arg = addr; ++ *fmt = f; ++ return 0; ++ ++error: ++ kfree(addr); ++ pfm_smpl_fmt_put(f); ++ return ret; ++} +diff --git a/perfmon/perfmon_syscalls.c b/perfmon/perfmon_syscalls.c +new file mode 100644 +index 0000000..8777b58 +--- /dev/null ++++ b/perfmon/perfmon_syscalls.c +@@ -0,0 +1,1060 @@ ++/* ++ * perfmon_syscalls.c: perfmon2 system call interface ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
 * 02111-1307 USA
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/ptrace.h>
#include <linux/perfmon_kern.h>
#include <linux/uaccess.h>
#include "perfmon_priv.h"

/*
 * Context locking rules:
 * ---------------------
 * 	- any thread with access to the file descriptor of a context can
 * 	  potentially issue perfmon calls
 *
 * 	- calls must be serialized to guarantee correctness
 *
 * 	- as soon as a context is attached to a thread or CPU, it may be
 * 	  actively monitoring. On some architectures, such as IA-64, this
 * 	  is true even though the pfm_start() call has not been made. This
 * 	  comes from the fact that on some architectures, it is possible to
 * 	  start/stop monitoring from userland.
 *
 *	- If monitoring is active, then there can be PMU interrupts. Because
 * 	  context accesses must be serialized, the perfmon system calls
 * 	  must mask interrupts as soon as the context is attached.
 *
 *	- perfmon system calls that operate with the context unloaded cannot
 *	  assume it is actually unloaded when they are called. They first need
 *	  to check and for that they need interrupts masked. Then, if the
 *	  context is actually unloaded, they can unmask interrupts.
 *
 *	- interrupt masking holds true for other internal perfmon functions as
 *	  well, except for the PMU interrupt handler because those interrupts
 *	  cannot be nested.
 *
 * 	- we mask ALL interrupts instead of just the PMU interrupt because we
 * 	  also need to protect against timer interrupts which could trigger
 * 	  a set switch.
++ */ ++#ifdef CONFIG_UTRACE ++#include <linux/utrace.h> ++ ++static u32 ++stopper_quiesce(struct utrace_attached_engine *engine, struct task_struct *tsk) ++{ ++ PFM_DBG("quiesced [%d]", tsk->pid); ++ complete(engine->data); ++ return UTRACE_ACTION_RESUME; ++} ++ ++void ++pfm_resume_task(struct task_struct *t, void *data) ++{ ++ PFM_DBG("utrace detach [%d]", t->pid); ++ (void) utrace_detach(t, data); ++} ++ ++static const struct utrace_engine_ops utrace_ops = ++{ ++ .report_quiesce = stopper_quiesce, ++}; ++ ++static int pfm_wait_task_stopped(struct task_struct *task, void **data) ++{ ++ DECLARE_COMPLETION_ONSTACK(done); ++ struct utrace_attached_engine *eng; ++ int ret; ++ ++ eng = utrace_attach(task, UTRACE_ATTACH_CREATE, &utrace_ops, &done); ++ if (IS_ERR(eng)) ++ return PTR_ERR(eng); ++ ++ ret = utrace_set_flags(task, eng, ++ UTRACE_ACTION_QUIESCE | UTRACE_EVENT(QUIESCE)); ++ PFM_DBG("wait quiesce [%d]", task->pid); ++ if (!ret) ++ ret = wait_for_completion_interruptible(&done); ++ ++ if (ret) ++ (void) utrace_detach(task, eng); ++ else ++ *data = eng; ++ return 0; ++} ++#else /* !CONFIG_UTRACE */ ++static int pfm_wait_task_stopped(struct task_struct *task, void **data) ++{ ++ int ret; ++ ++ *data = NULL; ++ ++ /* ++ * returns 0 if cannot attach ++ */ ++ ret = ptrace_may_access(task, PTRACE_MODE_ATTACH); ++ PFM_DBG("may_attach=%d", ret); ++ if (!ret) ++ return -EPERM; ++ ++ ret = ptrace_check_attach(task, 0); ++ PFM_DBG("check_attach=%d", ret); ++ return ret; ++} ++void pfm_resume_task(struct task_struct *t, void *data) ++{} ++#endif ++ ++struct pfm_syscall_cookie { ++ struct file *filp; ++ int fput_needed; ++}; ++ ++/* ++ * cannot attach if : ++ * - kernel task ++ * - task not owned by caller (checked by ptrace_may_attach()) ++ * - task is dead or zombie ++ * - cannot use blocking notification when self-monitoring ++ */ ++static int pfm_task_incompatible(struct pfm_context *ctx, ++ struct task_struct *task) ++{ ++ /* ++ * cannot attach to a kernel thread ++ */ 
++ if (!task->mm) { ++ PFM_DBG("cannot attach to kernel thread [%d]", task->pid); ++ return -EPERM; ++ } ++ ++ /* ++ * cannot use block on notification when ++ * self-monitoring. ++ */ ++ if (ctx->flags.block && task == current) { ++ PFM_DBG("cannot use block on notification when self-monitoring" ++ "[%d]", task->pid); ++ return -EINVAL; ++ } ++ /* ++ * cannot attach to a zombie task ++ */ ++ if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) { ++ PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid); ++ return -EBUSY; ++ } ++ return 0; ++} ++ ++/** ++ * pfm_get_task -- check permission and acquire task to monitor ++ * @ctx: perfmon context ++ * @pid: identification of the task to check ++ * @task: upon return, a pointer to the task to monitor ++ * ++ * This function is used in per-thread mode only AND when not ++ * self-monitoring. It finds the task to monitor and checks ++ * that the caller has permissions to attach. It also checks ++ * that the task is stopped via ptrace so that we can safely ++ * modify its state. ++ * ++ * task refcount is incremented when succesful. ++ */ ++static int pfm_get_task(struct pfm_context *ctx, pid_t pid, ++ struct task_struct **task, void **data) ++{ ++ struct task_struct *p; ++ int ret = 0, ret1 = 0; ++ ++ *data = NULL; ++ ++ /* ++ * When attaching to another thread we must ensure ++ * that the thread is actually stopped. ++ * ++ * As a consequence, only the ptracing parent can actually ++ * attach a context to a thread. Obviously, this constraint ++ * does not exist for self-monitoring threads. ++ * ++ * We use ptrace_may_attach() to check for permission. 
++ */ ++ read_lock(&tasklist_lock); ++ ++ p = find_task_by_vpid(pid); ++ if (p) ++ get_task_struct(p); ++ ++ read_unlock(&tasklist_lock); ++ ++ if (!p) { ++ PFM_DBG("task not found %d", pid); ++ return -ESRCH; ++ } ++ ++ ret = pfm_task_incompatible(ctx, p); ++ if (ret) ++ goto error; ++ ++ ret = pfm_wait_task_stopped(p, data); ++ if (ret) ++ goto error; ++ ++ *task = p; ++ ++ return 0; ++error: ++ if (!(ret1 || ret)) ++ ret = -EPERM; ++ ++ put_task_struct(p); ++ ++ return ret; ++} ++ ++/* ++ * context must be locked when calling this function ++ */ ++int pfm_check_task_state(struct pfm_context *ctx, int check_mask, ++ unsigned long *flags, void **resume) ++{ ++ struct task_struct *task; ++ unsigned long local_flags, new_flags; ++ int state, ret; ++ ++ *resume = NULL; ++ ++recheck: ++ /* ++ * task is NULL for system-wide context ++ */ ++ task = ctx->task; ++ state = ctx->state; ++ local_flags = *flags; ++ ++ PFM_DBG("state=%d check_mask=0x%x", state, check_mask); ++ /* ++ * if the context is detached, then we do not touch ++ * hardware, therefore there is not restriction on when we can ++ * access it. ++ */ ++ if (state == PFM_CTX_UNLOADED) ++ return 0; ++ /* ++ * no command can operate on a zombie context. ++ * A context becomes zombie when the file that identifies ++ * it is closed while the context is still attached to the ++ * thread it monitors. ++ */ ++ if (state == PFM_CTX_ZOMBIE) ++ return -EINVAL; ++ ++ /* ++ * at this point, state is PFM_CTX_LOADED or PFM_CTX_MASKED ++ */ ++ ++ /* ++ * some commands require the context to be unloaded to operate ++ */ ++ if (check_mask & PFM_CMD_UNLOADED) { ++ PFM_DBG("state=%d, cmd needs context unloaded", state); ++ return -EBUSY; ++ } ++ ++ /* ++ * self-monitoring always ok. ++ */ ++ if (task == current) ++ return 0; ++ ++ /* ++ * for syswide, the calling thread must be running on the cpu ++ * the context is bound to. 
++ */ ++ if (ctx->flags.system) { ++ if (ctx->cpu != smp_processor_id()) ++ return -EBUSY; ++ return 0; ++ } ++ ++ /* ++ * at this point, monitoring another thread ++ */ ++ ++ /* ++ * the pfm_unload_context() command is allowed on masked context ++ */ ++ if (state == PFM_CTX_MASKED && !(check_mask & PFM_CMD_UNLOAD)) ++ return 0; ++ ++ /* ++ * When we operate on another thread, we must wait for it to be ++ * stopped and completely off any CPU as we need to access the ++ * PMU state (or machine state). ++ * ++ * A thread can be put in the STOPPED state in various ways ++ * including PTRACE_ATTACH, or when it receives a SIGSTOP signal. ++ * We enforce that the thread must be ptraced, so it is stopped ++ * AND it CANNOT wake up while we operate on it because this ++ * would require an action from the ptracing parent which is the ++ * thread that is calling this function. ++ * ++ * The dependency on ptrace, imposes that only the ptracing ++ * parent can issue command on a thread. This is unfortunate ++ * but we do not know of a better way of doing this. ++ */ ++ if (check_mask & PFM_CMD_STOPPED) { ++ ++ spin_unlock_irqrestore(&ctx->lock, local_flags); ++ ++ /* ++ * check that the thread is ptraced AND STOPPED ++ */ ++ ret = pfm_wait_task_stopped(task, resume); ++ ++ spin_lock_irqsave(&ctx->lock, new_flags); ++ ++ /* ++ * flags may be different than when we released the lock ++ */ ++ *flags = new_flags; ++ ++ if (ret) ++ return ret; ++ /* ++ * we must recheck to verify if state has changed ++ */ ++ if (unlikely(ctx->state != state)) { ++ PFM_DBG("old_state=%d new_state=%d", ++ state, ++ ctx->state); ++ goto recheck; ++ } ++ } ++ return 0; ++} ++ ++/* ++ * pfm_get_args - Function used to copy the syscall argument into kernel memory. 
++ * @ureq: user argument ++ * @sz: user argument size ++ * @lsz: size of stack buffer ++ * @laddr: stack buffer address ++ * @req: point to start of kernel copy of the argument ++ * @ptr_free: address of kernel copy to free ++ * ++ * There are two options: ++ * - use a stack buffer described by laddr (addresses) and lsz (size) ++ * - allocate memory ++ * ++ * return: ++ * < 0 : in case of error (ptr_free may not be updated) ++ * 0 : success ++ * - req: points to base of kernel copy of arguments ++ * - ptr_free: address of buffer to free by caller on exit. ++ * NULL if using the stack buffer ++ * ++ * when ptr_free is not NULL upon return, the caller must kfree() ++ */ ++int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr, ++ void **req, void **ptr_free) ++{ ++ void *addr; ++ ++ /* ++ * check syadmin argument limit ++ */ ++ if (unlikely(sz > pfm_controls.arg_mem_max)) { ++ PFM_DBG("argument too big %zu max=%zu", ++ sz, ++ pfm_controls.arg_mem_max); ++ return -E2BIG; ++ } ++ ++ /* ++ * check if vector fits on stack buffer ++ */ ++ if (sz > lsz) { ++ addr = kmalloc(sz, GFP_KERNEL); ++ if (unlikely(addr == NULL)) ++ return -ENOMEM; ++ *ptr_free = addr; ++ } else { ++ addr = laddr; ++ *req = laddr; ++ *ptr_free = NULL; ++ } ++ ++ /* ++ * bring the data in ++ */ ++ if (unlikely(copy_from_user(addr, ureq, sz))) { ++ if (addr != laddr) ++ kfree(addr); ++ return -EFAULT; ++ } ++ ++ /* ++ * base address of kernel buffer ++ */ ++ *req = addr; ++ ++ return 0; ++} ++ ++/** ++ * pfm_acquire_ctx_from_fd -- get ctx from file descriptor ++ * @fd: file descriptor ++ * @ctx: pointer to pointer of context updated on return ++ * @cookie: opaque structure to use for release ++ * ++ * This helper function extracts the ctx from the file descriptor. ++ * It also increments the refcount of the file structure. 
Thus ++ * it updates the cookie so the refcount can be decreased when ++ * leaving the perfmon syscall via pfm_release_ctx_from_fd ++ */ ++static int pfm_acquire_ctx_from_fd(int fd, struct pfm_context **ctx, ++ struct pfm_syscall_cookie *cookie) ++{ ++ struct file *filp; ++ int fput_needed; ++ ++ filp = fget_light(fd, &fput_needed); ++ if (unlikely(filp == NULL)) { ++ PFM_DBG("invalid fd %d", fd); ++ return -EBADF; ++ } ++ ++ *ctx = filp->private_data; ++ ++ if (unlikely(!*ctx || filp->f_op != &pfm_file_ops)) { ++ PFM_DBG("fd %d not related to perfmon", fd); ++ return -EBADF; ++ } ++ cookie->filp = filp; ++ cookie->fput_needed = fput_needed; ++ ++ return 0; ++} ++ ++/** ++ * pfm_release_ctx_from_fd -- decrease refcount of file associated with context ++ * @cookie: the cookie structure initialized by pfm_acquire_ctx_from_fd ++ */ ++static inline void pfm_release_ctx_from_fd(struct pfm_syscall_cookie *cookie) ++{ ++ fput_light(cookie->filp, cookie->fput_needed); ++} ++ ++/* ++ * unlike the other perfmon system calls, this one returns a file descriptor ++ * or a value < 0 in case of error, very much like open() or socket() ++ */ ++asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq, ++ char __user *fmt_name, ++ void __user *fmt_uarg, size_t fmt_size) ++{ ++ struct pfarg_ctx req; ++ struct pfm_smpl_fmt *fmt = NULL; ++ void *fmt_arg = NULL; ++ int ret; ++ ++ PFM_DBG("req=%p fmt=%p fmt_arg=%p size=%zu", ++ ureq, fmt_name, fmt_uarg, fmt_size); ++ ++ if (perfmon_disabled) ++ return -ENOSYS; ++ ++ if (copy_from_user(&req, ureq, sizeof(req))) ++ return -EFAULT; ++ ++ if (fmt_name) { ++ ret = pfm_get_smpl_arg(fmt_name, fmt_uarg, fmt_size, &fmt_arg, &fmt); ++ if (ret) ++ goto abort; ++ } ++ ++ ret = __pfm_create_context(&req, fmt, fmt_arg, PFM_NORMAL, NULL); ++ ++ kfree(fmt_arg); ++abort: ++ return ret; ++} ++ ++asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ 
struct pfm_syscall_cookie cookie; ++ struct pfarg_pmc pmcs[PFM_PMC_STK_ARG]; ++ struct pfarg_pmc *req; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret; ++ ++ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); ++ ++ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) { ++ PFM_DBG("invalid arg count %d", count); ++ return -EINVAL; ++ } ++ ++ sz = count*sizeof(*ureq); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ ret = pfm_get_args(ureq, sz, sizeof(pmcs), pmcs, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (!ret) ++ ret = __pfm_write_pmcs(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ /* ++ * This function may be on the critical path. ++ * We want to avoid the branch if unecessary. ++ */ ++ if (fptr) ++ kfree(fptr); ++error: ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct pfm_syscall_cookie cookie; ++ struct pfarg_pmd pmds[PFM_PMD_STK_ARG]; ++ struct pfarg_pmd *req; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret; ++ ++ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); ++ ++ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) { ++ PFM_DBG("invalid arg count %d", count); ++ return -EINVAL; ++ } ++ ++ sz = count*sizeof(*ureq); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (!ret) ++ ret = 
__pfm_write_pmds(ctx, req, count, 0); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ if (fptr) ++ kfree(fptr); ++error: ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct pfm_syscall_cookie cookie; ++ struct pfarg_pmd pmds[PFM_PMD_STK_ARG]; ++ struct pfarg_pmd *req; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret; ++ ++ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); ++ ++ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*ureq); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (!ret) ++ ret = __pfm_read_pmds(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (copy_to_user(ureq, req, sz)) ++ ret = -EFAULT; ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ if (fptr) ++ kfree(fptr); ++error: ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_restart(int fd) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct pfm_syscall_cookie cookie; ++ void *resume; ++ unsigned long flags; ++ int ret, info; ++ ++ PFM_DBG("fd=%d", fd); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, 0, &flags, &resume); ++ if (!ret) ++ ret = __pfm_restart(ctx, &info); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ /* ++ * In per-thread mode with 
blocking notification, i.e. ++ * ctx->flags.blocking=1, we need to defer issuing the ++ * complete to unblock the blocked monitored thread. ++ * Otherwise we have a potential deadlock due to a lock ++ * inversion between the context lock and the task_rq_lock() ++ * which can happen if one thread is in this call and the other ++ * (the monitored thread) is in the context switch code. ++ * ++ * It is safe to access the context outside the critical section ++ * because: ++ * - we are protected by the fget_light(), thus the context ++ * cannot disappear ++ */ ++ if (ret == 0 && info == 1) ++ complete(&ctx->restart_complete); ++ ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_stop(int fd) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct pfm_syscall_cookie cookie; ++ void *resume; ++ unsigned long flags; ++ int ret; ++ int release_info; ++ ++ PFM_DBG("fd=%d", fd); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (!ret) ++ ret = __pfm_stop(ctx, &release_info); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ /* ++ * defer cancellation of timer to avoid race ++ * with pfm_handle_switch_timeout() ++ * ++ * applies only when self-monitoring ++ */ ++ if (release_info & 0x2) ++ hrtimer_cancel(&__get_cpu_var(pfm_hrtimer)); ++ ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct pfm_syscall_cookie cookie; ++ void *resume; ++ struct pfarg_start req; ++ unsigned long flags; ++ int ret; ++ ++ PFM_DBG("fd=%d req=%p", fd, ureq); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ /* ++ * the one argument is 
actually optional ++ */ ++ if (ureq && copy_from_user(&req, ureq, sizeof(req))) ++ return -EFAULT; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); ++ if (!ret) ++ ret = __pfm_start(ctx, ureq ? &req : NULL); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct pfm_syscall_cookie cookie; ++ void *resume, *dummy_resume; ++ unsigned long flags; ++ struct pfarg_load req; ++ int ret; ++ ++ PFM_DBG("fd=%d req=%p", fd, ureq); ++ ++ if (copy_from_user(&req, ureq, sizeof(req))) ++ return -EFAULT; ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ task = current; ++ ++ /* ++ * in per-thread mode (not self-monitoring), get a reference ++ * on task to monitor. This must be done with interrupts enabled ++ * Upon succesful return, refcount on task is increased. ++ * ++ * fget_light() is protecting the context. 
++ */ ++ if (!ctx->flags.system && req.load_pid != current->pid) { ++ ret = pfm_get_task(ctx, req.load_pid, &task, &resume); ++ if (ret) ++ goto error; ++ } ++ ++ /* ++ * irqsave is required to avoid race in case context is already ++ * loaded or with switch timeout in the case of self-monitoring ++ */ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &dummy_resume); ++ if (!ret) ++ ret = __pfm_load_context(ctx, &req, task); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ /* ++ * in per-thread mode (not self-monitoring), we need ++ * to decrease refcount on task to monitor: ++ * - load successful: we have a reference to the task in ctx->task ++ * - load failed : undo the effect of pfm_get_task() ++ */ ++ if (task != current) ++ put_task_struct(task); ++error: ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_unload_context(int fd) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct pfm_syscall_cookie cookie; ++ void *resume; ++ unsigned long flags; ++ int ret; ++ int is_system, release_info = 0; ++ u32 cpu; ++ ++ PFM_DBG("fd=%d", fd); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ is_system = ctx->flags.system; ++ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ cpu = ctx->cpu; ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED|PFM_CMD_UNLOAD, ++ &flags, &resume); ++ if (!ret) ++ ret = __pfm_unload_context(ctx, &release_info); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ /* ++ * cancel time now that context is unlocked ++ * avoid race with pfm_handle_switch_timeout() ++ */ ++ if (release_info & 0x2) { ++ int r; ++ r = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer)); ++ PFM_DBG("timeout cancel=%d", r); ++ } ++ ++ if (release_info & 0x1) ++ pfm_session_release(is_system, cpu); ++ ++ 
pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_create_evtsets(int fd, struct pfarg_setdesc __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct pfm_syscall_cookie cookie; ++ struct pfarg_setdesc *req; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret; ++ ++ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); ++ ++ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*ureq); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ /* ++ * must mask interrupts because we do not know the state of context, ++ * could be attached and we could be getting PMU interrupts. So ++ * we mask and lock context and we check and possibly relax masking ++ */ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &resume); ++ if (!ret) ++ ret = __pfm_create_evtsets(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ /* ++ * context must be unloaded for this command. 
The resume pointer ++ * is necessarily NULL, thus no need to call pfm_resume_task() ++ */ ++ kfree(fptr); ++ ++error: ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_getinfo_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct task_struct *task; ++ struct pfm_syscall_cookie cookie; ++ struct pfarg_setinfo *req; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret; ++ ++ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); ++ ++ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*ureq); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ /* ++ * this command operates even when context is loaded, so we need ++ * to keep interrupts masked to avoid a race with PMU interrupt ++ * which may switch the active set ++ */ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ task = ctx->task; ++ ++ ret = pfm_check_task_state(ctx, 0, &flags, &resume); ++ if (!ret) ++ ret = __pfm_getinfo_evtsets(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ ++ if (resume) ++ pfm_resume_task(task, resume); ++ ++ if (copy_to_user(ureq, req, sz)) ++ ret = -EFAULT; ++ ++ kfree(fptr); ++error: ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} ++ ++asmlinkage long sys_pfm_delete_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count) ++{ ++ struct pfm_context *ctx; ++ struct pfm_syscall_cookie cookie; ++ struct pfarg_setinfo *req; ++ void *fptr, *resume; ++ unsigned long flags; ++ size_t sz; ++ int ret; ++ ++ PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); ++ ++ if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) ++ return -EINVAL; ++ ++ sz = count*sizeof(*ureq); ++ ++ ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); ++ if (ret) ++ return ret; ++ ++ ret = pfm_get_args(ureq, sz, 0, NULL, (void 
**)&req, &fptr); ++ if (ret) ++ goto error; ++ ++ /* ++ * must mask interrupts because we do not know the state of context, ++ * could be attached and we could be getting PMU interrupts ++ */ ++ spin_lock_irqsave(&ctx->lock, flags); ++ ++ ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &resume); ++ if (!ret) ++ ret = __pfm_delete_evtsets(ctx, req, count); ++ ++ spin_unlock_irqrestore(&ctx->lock, flags); ++ /* ++ * context must be unloaded for this command. The resume pointer ++ * is necessarily NULL, thus no need to call pfm_resume_task() ++ */ ++ kfree(fptr); ++ ++error: ++ pfm_release_ctx_from_fd(&cookie); ++ return ret; ++} +diff --git a/perfmon/perfmon_sysfs.c b/perfmon/perfmon_sysfs.c +new file mode 100644 +index 0000000..7353c3b +--- /dev/null ++++ b/perfmon/perfmon_sysfs.c +@@ -0,0 +1,525 @@ ++/* ++ * perfmon_sysfs.c: perfmon2 sysfs interface ++ * ++ * This file implements the perfmon2 interface which ++ * provides access to the hardware performance counters ++ * of the host processor. ++ * ++ * The initial version of perfmon.c was written by ++ * Ganesh Venkitachalam, IBM Corp. ++ * ++ * Then it was modified for perfmon-1.x by Stephane Eranian and ++ * David Mosberger, Hewlett Packard Co. ++ * ++ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x ++ * by Stephane Eranian, Hewlett Packard Co. ++ * ++ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. ++ * Contributed by Stephane Eranian <eranian@hpl.hp.com> ++ * David Mosberger-Tang <davidm@hpl.hp.com> ++ * ++ * More information about perfmon available at: ++ * http://perfmon2.sf.net ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of version 2 of the GNU General Public ++ * License as published by the Free Software Foundation. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA ++ * 02111-1307 USA ++ */ ++#include <linux/kernel.h> ++#include <linux/module.h> /* for EXPORT_SYMBOL */ ++#include <linux/perfmon_kern.h> ++#include "perfmon_priv.h" ++ ++struct pfm_attribute { ++ struct attribute attr; ++ ssize_t (*show)(void *, struct pfm_attribute *attr, char *); ++ ssize_t (*store)(void *, const char *, size_t); ++}; ++#define to_attr(n) container_of(n, struct pfm_attribute, attr); ++ ++#define PFM_RO_ATTR(_name, _show) \ ++ struct kobj_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL) ++ ++#define PFM_RW_ATTR(_name, _show, _store) \ ++ struct kobj_attribute attr_##_name = __ATTR(_name, 0644, _show, _store) ++ ++#define PFM_ROS_ATTR(_name, _show) \ ++ struct pfm_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL) ++ ++#define is_attr_name(a, n) (!strcmp((a)->attr.name, n)) ++int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu); ++ ++static struct kobject *pfm_kernel_kobj, *pfm_fmt_kobj; ++static struct kobject *pfm_pmu_kobj; ++ ++static ssize_t pfm_regs_attr_show(struct kobject *kobj, ++ struct attribute *attr, char *buf) ++{ ++ struct pfm_regmap_desc *reg = to_reg(kobj); ++ struct pfm_attribute *attribute = to_attr(attr); ++ return attribute->show ? attribute->show(reg, attribute, buf) : -EIO; ++} ++ ++static ssize_t pfm_fmt_attr_show(struct kobject *kobj, ++ struct attribute *attr, char *buf) ++{ ++ struct pfm_smpl_fmt *fmt = to_smpl_fmt(kobj); ++ struct pfm_attribute *attribute = to_attr(attr); ++ return attribute->show ? 
attribute->show(fmt, attribute, buf) : -EIO; ++} ++ ++static struct sysfs_ops pfm_regs_sysfs_ops = { ++ .show = pfm_regs_attr_show ++}; ++ ++static struct sysfs_ops pfm_fmt_sysfs_ops = { ++ .show = pfm_fmt_attr_show ++}; ++ ++static struct kobj_type pfm_regs_ktype = { ++ .sysfs_ops = &pfm_regs_sysfs_ops, ++}; ++ ++static struct kobj_type pfm_fmt_ktype = { ++ .sysfs_ops = &pfm_fmt_sysfs_ops, ++}; ++ ++static ssize_t pfm_controls_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ int base; ++ ++ if (is_attr_name(attr, "version")) ++ return snprintf(buf, PAGE_SIZE, "%u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN); ++ ++ if (is_attr_name(attr, "task_sessions_count")) ++ return pfm_sysfs_res_show(buf, PAGE_SIZE, 0); ++ ++ if (is_attr_name(attr, "debug")) ++ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug); ++ ++ if (is_attr_name(attr, "task_group")) ++ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.task_group); ++ ++ if (is_attr_name(attr, "mode")) ++ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.flags); ++ ++ if (is_attr_name(attr, "arg_mem_max")) ++ return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.arg_mem_max); ++ ++ if (is_attr_name(attr, "syscall")) { ++ base = pfm_arch_get_base_syscall(); ++ return snprintf(buf, PAGE_SIZE, "%d\n", base); ++ } ++ ++ if (is_attr_name(attr, "sys_sessions_count")) ++ return pfm_sysfs_res_show(buf, PAGE_SIZE, 1); ++ ++ if (is_attr_name(attr, "smpl_buffer_mem_max")) ++ return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.smpl_buffer_mem_max); ++ ++ if (is_attr_name(attr, "smpl_buffer_mem_cur")) ++ return pfm_sysfs_res_show(buf, PAGE_SIZE, 2); ++ ++ if (is_attr_name(attr, "sys_group")) ++ return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.sys_group); ++ ++ /* XXX: could be set to write-only */ ++ if (is_attr_name(attr, "reset_stats")) { ++ buf[0] = '0'; ++ buf[1] = '\0'; ++ return strnlen(buf, PAGE_SIZE); ++ } ++ return 0; ++} ++ ++static ssize_t pfm_controls_store(struct kobject *kobj, 
struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i; ++ size_t d; ++ ++ if (sscanf(buf, "%zu", &d) != 1) ++ goto skip; ++ ++ if (is_attr_name(attr, "debug")) ++ pfm_controls.debug = d; ++ ++ if (is_attr_name(attr, "task_group")) ++ pfm_controls.task_group = d; ++ ++ if (is_attr_name(attr, "sys_group")) ++ pfm_controls.sys_group = d; ++ ++ if (is_attr_name(attr, "mode")) ++ pfm_controls.flags = d ? PFM_CTRL_FL_RW_EXPERT : 0; ++ ++ if (is_attr_name(attr, "arg_mem_max")) { ++ /* ++ * we impose a page as the minimum. ++ * ++ * This limit may be smaller than the stack buffer ++ * available and that is fine. ++ */ ++ if (d >= PAGE_SIZE) ++ pfm_controls.arg_mem_max = d; ++ } ++ if (is_attr_name(attr, "reset_stats")) { ++ for_each_online_cpu(i) { ++ pfm_reset_stats(i); ++ } ++ } ++ ++ if (is_attr_name(attr, "smpl_buffer_mem_max")) { ++ if (d >= PAGE_SIZE) ++ pfm_controls.smpl_buffer_mem_max = d; ++ } ++skip: ++ return count; ++} ++ ++/* ++ * /sys/kernel/perfmon attributes ++ */ ++static PFM_RO_ATTR(version, pfm_controls_show); ++static PFM_RO_ATTR(task_sessions_count, pfm_controls_show); ++static PFM_RO_ATTR(syscall, pfm_controls_show); ++static PFM_RO_ATTR(sys_sessions_count, pfm_controls_show); ++static PFM_RO_ATTR(smpl_buffer_mem_cur, pfm_controls_show); ++ ++static PFM_RW_ATTR(debug, pfm_controls_show, pfm_controls_store); ++static PFM_RW_ATTR(task_group, pfm_controls_show, pfm_controls_store); ++static PFM_RW_ATTR(mode, pfm_controls_show, pfm_controls_store); ++static PFM_RW_ATTR(sys_group, pfm_controls_show, pfm_controls_store); ++static PFM_RW_ATTR(arg_mem_max, pfm_controls_show, pfm_controls_store); ++static PFM_RW_ATTR(smpl_buffer_mem_max, pfm_controls_show, pfm_controls_store); ++static PFM_RW_ATTR(reset_stats, pfm_controls_show, pfm_controls_store); ++ ++static struct attribute *pfm_kernel_attrs[] = { ++ &attr_version.attr, ++ &attr_syscall.attr, ++ &attr_task_sessions_count.attr, ++ &attr_sys_sessions_count.attr, ++ 
&attr_smpl_buffer_mem_cur.attr, ++ &attr_debug.attr, ++ &attr_reset_stats.attr, ++ &attr_sys_group.attr, ++ &attr_task_group.attr, ++ &attr_mode.attr, ++ &attr_smpl_buffer_mem_max.attr, ++ &attr_arg_mem_max.attr, ++ NULL ++}; ++ ++static struct attribute_group pfm_kernel_attr_group = { ++ .attrs = pfm_kernel_attrs, ++}; ++ ++/* ++ * per-reg attributes ++ */ ++static ssize_t pfm_reg_show(void *data, struct pfm_attribute *attr, char *buf) ++{ ++ struct pfm_regmap_desc *reg; ++ int w; ++ ++ reg = data; ++ ++ if (is_attr_name(attr, "name")) ++ return snprintf(buf, PAGE_SIZE, "%s\n", reg->desc); ++ ++ if (is_attr_name(attr, "dfl_val")) ++ return snprintf(buf, PAGE_SIZE, "0x%llx\n", ++ (unsigned long long)reg->dfl_val); ++ ++ if (is_attr_name(attr, "width")) { ++ w = (reg->type & PFM_REG_C64) ? ++ pfm_pmu_conf->counter_width : 64; ++ return snprintf(buf, PAGE_SIZE, "%d\n", w); ++ } ++ ++ if (is_attr_name(attr, "rsvd_msk")) ++ return snprintf(buf, PAGE_SIZE, "0x%llx\n", ++ (unsigned long long)reg->rsvd_msk); ++ ++ if (is_attr_name(attr, "addr")) ++ return snprintf(buf, PAGE_SIZE, "0x%lx\n", reg->hw_addr); ++ ++ return 0; ++} ++ ++static PFM_ROS_ATTR(name, pfm_reg_show); ++static PFM_ROS_ATTR(dfl_val, pfm_reg_show); ++static PFM_ROS_ATTR(rsvd_msk, pfm_reg_show); ++static PFM_ROS_ATTR(width, pfm_reg_show); ++static PFM_ROS_ATTR(addr, pfm_reg_show); ++ ++static struct attribute *pfm_reg_attrs[] = { ++ &attr_name.attr, ++ &attr_dfl_val.attr, ++ &attr_rsvd_msk.attr, ++ &attr_width.attr, ++ &attr_addr.attr, ++ NULL ++}; ++ ++static struct attribute_group pfm_reg_attr_group = { ++ .attrs = pfm_reg_attrs, ++}; ++ ++static ssize_t pfm_pmu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ if (is_attr_name(attr, "model")) ++ return snprintf(buf, PAGE_SIZE, "%s\n", pfm_pmu_conf->pmu_name); ++ return 0; ++} ++static PFM_RO_ATTR(model, pfm_pmu_show); ++ ++static struct attribute *pfm_pmu_desc_attrs[] = { ++ &attr_model.attr, ++ NULL ++}; ++ ++static struct 
attribute_group pfm_pmu_desc_attr_group = { ++ .attrs = pfm_pmu_desc_attrs, ++}; ++ ++static int pfm_sysfs_add_pmu_regs(struct pfm_pmu_config *pmu) ++{ ++ struct pfm_regmap_desc *reg; ++ unsigned int i, k; ++ int ret; ++ ++ reg = pmu->pmc_desc; ++ for (i = 0; i < pmu->num_pmc_entries; i++, reg++) { ++ ++ if (!(reg->type & PFM_REG_I)) ++ continue; ++ ++ ret = kobject_init_and_add(®->kobj, &pfm_regs_ktype, ++ pfm_pmu_kobj, "pmc%u", i); ++ if (ret) ++ goto undo_pmcs; ++ ++ ret = sysfs_create_group(®->kobj, &pfm_reg_attr_group); ++ if (ret) { ++ kobject_del(®->kobj); ++ goto undo_pmcs; ++ } ++ } ++ ++ reg = pmu->pmd_desc; ++ for (i = 0; i < pmu->num_pmd_entries; i++, reg++) { ++ ++ if (!(reg->type & PFM_REG_I)) ++ continue; ++ ++ ret = kobject_init_and_add(®->kobj, &pfm_regs_ktype, ++ pfm_pmu_kobj, "pmd%u", i); ++ if (ret) ++ goto undo_pmds; ++ ++ ret = sysfs_create_group(®->kobj, &pfm_reg_attr_group); ++ if (ret) { ++ kobject_del(®->kobj); ++ goto undo_pmds; ++ } ++ } ++ return 0; ++undo_pmds: ++ reg = pmu->pmd_desc; ++ for (k = 0; k < i; k++, reg++) { ++ if (!(reg->type & PFM_REG_I)) ++ continue; ++ sysfs_remove_group(®->kobj, &pfm_reg_attr_group); ++ kobject_del(®->kobj); ++ } ++ i = pmu->num_pmc_entries; ++ /* fall through */ ++undo_pmcs: ++ reg = pmu->pmc_desc; ++ for (k = 0; k < i; k++, reg++) { ++ if (!(reg->type & PFM_REG_I)) ++ continue; ++ sysfs_remove_group(®->kobj, &pfm_reg_attr_group); ++ kobject_del(®->kobj); ++ } ++ return ret; ++} ++ ++static int pfm_sysfs_del_pmu_regs(struct pfm_pmu_config *pmu) ++{ ++ struct pfm_regmap_desc *reg; ++ unsigned int i; ++ ++ reg = pmu->pmc_desc; ++ for (i = 0; i < pmu->num_pmc_entries; i++, reg++) { ++ ++ if (!(reg->type & PFM_REG_I)) ++ continue; ++ ++ sysfs_remove_group(®->kobj, &pfm_reg_attr_group); ++ kobject_del(®->kobj); ++ } ++ ++ reg = pmu->pmd_desc; ++ for (i = 0; i < pmu->num_pmd_entries; i++, reg++) { ++ ++ if (!(reg->type & PFM_REG_I)) ++ continue; ++ ++ sysfs_remove_group(®->kobj, &pfm_reg_attr_group); ++ 
kobject_del(®->kobj); ++ } ++ return 0; ++} ++ ++/* ++ * when a PMU description module is inserted, we create ++ * a pmu_desc subdir in sysfs and we populate it with ++ * PMU specific information, such as register mappings ++ */ ++int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu) ++{ ++ int ret; ++ ++ pfm_pmu_kobj = kobject_create_and_add("pmu_desc", pfm_kernel_kobj); ++ if (!pfm_pmu_kobj) ++ return -ENOMEM; ++ ++ ret = sysfs_create_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group); ++ if (ret) { ++ /* will release pfm_pmu_kobj */ ++ kobject_put(pfm_pmu_kobj); ++ return ret; ++ } ++ ++ ret = pfm_sysfs_add_pmu_regs(pmu); ++ if (ret) { ++ sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group); ++ /* will release pfm_pmu_kobj */ ++ kobject_put(pfm_pmu_kobj); ++ } else ++ kobject_uevent(pfm_pmu_kobj, KOBJ_ADD); ++ ++ return ret; ++} ++ ++/* ++ * when a PMU description module is removed, we also remove ++ * all its information from sysfs, i.e., the pmu_desc subdir ++ * disappears ++ */ ++int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu) ++{ ++ pfm_sysfs_del_pmu_regs(pmu); ++ sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group); ++ kobject_uevent(pfm_pmu_kobj, KOBJ_REMOVE); ++ kobject_put(pfm_pmu_kobj); ++ pfm_pmu_kobj = NULL; ++ return 0; ++} ++ ++static ssize_t pfm_fmt_show(void *data, struct pfm_attribute *attr, char *buf) ++{ ++ struct pfm_smpl_fmt *fmt = data; ++ ++ if (is_attr_name(attr, "version")) ++ return snprintf(buf, PAGE_SIZE, "%u.%u\n", ++ fmt->fmt_version >> 16 & 0xffff, ++ fmt->fmt_version & 0xffff); ++ return 0; ++} ++ ++/* ++ * do not use predefined macros because of name conflict ++ * with /sys/kernel/perfmon/version ++ */ ++struct pfm_attribute attr_fmt_version = { ++ .attr = { .name = "version", .mode = 0444 }, ++ .show = pfm_fmt_show, ++}; ++ ++static struct attribute *pfm_fmt_attrs[] = { ++ &attr_fmt_version.attr, ++ NULL ++}; ++ ++static struct attribute_group pfm_fmt_attr_group = { ++ .attrs = pfm_fmt_attrs, ++}; ++ ++/* ++ * when a 
sampling format module is inserted, we populate ++ * sysfs with some information ++ */ ++int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt) ++{ ++ int ret; ++ ++ ret = kobject_init_and_add(&fmt->kobj, &pfm_fmt_ktype, ++ pfm_fmt_kobj, fmt->fmt_name); ++ if (ret) ++ return ret; ++ ++ ret = sysfs_create_group(&fmt->kobj, &pfm_fmt_attr_group); ++ if (ret) ++ kobject_del(&fmt->kobj); ++ else ++ kobject_uevent(&fmt->kobj, KOBJ_ADD); ++ ++ return ret; ++} ++ ++/* ++ * when a sampling format module is removed, its information ++ * must also be removed from sysfs ++ */ ++void pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt) ++{ ++ sysfs_remove_group(&fmt->kobj, &pfm_fmt_attr_group); ++ kobject_uevent(&fmt->kobj, KOBJ_REMOVE); ++ kobject_del(&fmt->kobj); ++} ++ ++int __init pfm_init_sysfs(void) ++{ ++ int ret; ++ ++ pfm_kernel_kobj = kobject_create_and_add("perfmon", kernel_kobj); ++ if (!pfm_kernel_kobj) { ++ PFM_ERR("cannot add kernel object: /sys/kernel/perfmon"); ++ return -ENOMEM; ++ } ++ ++ ret = sysfs_create_group(pfm_kernel_kobj, &pfm_kernel_attr_group); ++ if (ret) { ++ kobject_put(pfm_kernel_kobj); ++ return ret; ++ } ++ ++ pfm_fmt_kobj = kobject_create_and_add("formats", pfm_kernel_kobj); ++ if (ret) { ++ PFM_ERR("cannot add fmt object: %d", ret); ++ goto error_fmt; ++ } ++ if (pfm_pmu_conf) ++ pfm_sysfs_add_pmu(pfm_pmu_conf); ++ ++ pfm_sysfs_builtin_fmt_add(); ++ ++ return 0; ++ ++error_fmt: ++ kobject_del(pfm_kernel_kobj); ++ return ret; ++} |