diff --git a/Documentation/ABI/testing/sysfs-perfmon b/Documentation/ABI/testing/sysfs-perfmon new file mode 100644 index 0000000..bde434c --- /dev/null +++ b/Documentation/ABI/testing/sysfs-perfmon @@ -0,0 +1,87 @@ +What: /sys/kernel/perfmon +Date: Nov 2007 +KernelVersion: 2.6.24 +Contact: eranian@gmail.com + +Description: provides the configuration interface for the perfmon2 subsystem. + The tree contains information about the detected hardware, current + state of the subsystem as well as some configuration parameters. + + The tree consists of the following entries: + + /sys/kernel/perfmon/debug (read-write): + + Enable perfmon2 debugging output via klogd. Debug messages produced during + PMU interrupt handling are not controlled by this entry. The traces are rate-limited + to avoid flooding of the console. It is possible to change the throttling + via /proc/sys/kernel/printk_ratelimit. The value is interpreted as a bitmask. + Each bit enables a particular type of debug message. Refer to the file + include/linux/perfmon_kern.h for more information. + + /sys/kernel/perfmon/pmc_max_fast_arg (read-only): + + Number of perfmon2 syscall arguments copied directly onto the + stack (copy_from_user) for pfm_write_pmcs(). Copying to the stack avoids + having to allocate a buffer. The unit is the number of pfarg_pmc_t + structures. + + /sys/kernel/perfmon/pmd_max_fast_arg (read-only): + + Number of perfmon2 syscall arguments copied directly onto the + stack (copy_from_user) for pfm_write_pmds()/pfm_read_pmds(). Copying + to the stack avoids having to allocate a buffer. The unit is the number + of pfarg_pmd_t structures. + + /sys/kernel/perfmon/reset_stats (write-only): + + Reset the statistics collected by perfmon2. Stats are available + per-cpu via debugfs. + + /sys/kernel/perfmon/smpl_buffer_mem_cur (read-only): + + Reports the amount of kernel memory currently dedicated to sampling + buffers. The unit is bytes. + + /sys/kernel/perfmon/smpl_buffer_mem_max (read-write): + + Maximum amount of kernel memory usable for sampling buffers. -1 means + everything that is available. The unit is bytes. + + /sys/kernel/perfmon/sys_group (read-write): + + User group allowed to create a system-wide perfmon2 context (session). + -1 means any group. This control will be kept until we find a package + able to control capabilities via PAM. + + /sys/kernel/perfmon/task_group (read-write): + + User group allowed to create a per-thread context (session). + -1 means any group. This control will be kept until we find a + package able to control capabilities via PAM. + + /sys/kernel/perfmon/sys_sessions_count (read-only): + + Number of system-wide contexts currently attached to CPUs. + + /sys/kernel/perfmon/task_sessions_count (read-only): + + Number of per-thread contexts currently attached to threads. + + /sys/kernel/perfmon/version (read-only): + + Perfmon2 interface revision number. + + /sys/kernel/perfmon/arg_mem_max (read-write): + + Maximum size of vector arguments expressed in bytes. Can be modified. + + /sys/kernel/perfmon/mode (read-write): + + Bitmask to enable/disable certain perfmon2 features.
+ Currently defined: + - bit 0: if set, then reserved bitfields are ignored on PMC writes diff --git a/Documentation/ABI/testing/sysfs-perfmon-fmt b/Documentation/ABI/testing/sysfs-perfmon-fmt new file mode 100644 index 0000000..1b45270 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-perfmon-fmt @@ -0,0 +1,18 @@ +What: /sys/kernel/perfmon/formats +Date: 2007 +KernelVersion: 2.6.24 +Contact: eranian@gmail.com + +Description: provides a description of the available perfmon2 custom sampling buffer formats, + which are implemented as independent kernel modules. Each format gets + a subdir with a few entries. + + The name of the subdir is the name of the sampling format. The same name + must be passed to pfm_create_context() to use the format. + + Each subdir XX contains the following entries: + + /sys/kernel/perfmon/formats/XX/version (read-only): + + Version number of the format in clear text and null terminated. + diff --git a/Documentation/ABI/testing/sysfs-perfmon-pmu b/Documentation/ABI/testing/sysfs-perfmon-pmu new file mode 100644 index 0000000..a1afc7e --- /dev/null +++ b/Documentation/ABI/testing/sysfs-perfmon-pmu @@ -0,0 +1,46 @@ +What: /sys/kernel/perfmon/pmu +Date: Nov 2007 +KernelVersion: 2.6.24 +Contact: eranian@gmail.com + +Description: provides information about the currently loaded PMU description module. + The module contains the mapping of the actual performance counter registers + onto the logical PMU exposed by perfmon. There is at most one PMU description + module loaded at any time. + + The sysfs PMU tree provides a description of the mapping for each register. + There is one subdir per config and data register, along with an entry for the + name of the PMU model. + + The model entry is as follows: + + /sys/kernel/perfmon/pmu_desc/model (read-only): + + Name of the PMU model in clear text, zero terminated. + + Then each logical PMU register, XX, gets a subtree with the following entries: + + /sys/kernel/perfmon/pmu_desc/pm*XX/addr (read-only): + + The physical address or index of the actual underlying hardware register. + On Itanium, it corresponds to the index. On X86 processors, this is + the actual MSR address. + + /sys/kernel/perfmon/pmu_desc/pm*XX/dfl_val (read-only): + + The default value of the register in hexadecimal. + + /sys/kernel/perfmon/pmu_desc/pm*XX/name (read-only): + + The name of the hardware register. + + /sys/kernel/perfmon/pmu_desc/pm*XX/rsvd_msk (read-only): + + The bitmask of reserved bits, i.e., bits which cannot be changed by + applications. When a bit is set, it means the corresponding bit in the + actual register is reserved. + + /sys/kernel/perfmon/pmu_desc/pm*XX/width (read-only): + + The width in bits of the register. This field is only relevant for counter + registers. diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 1150444..2652b6c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1643,6 +1643,9 @@ and is between 256 and 4096 characters. It is defined in the file Format: { 0 | 1 } See arch/parisc/kernel/pdc_chassis.c + perfmon_debug [PERFMON] Enables Perfmon debug messages. Needed + to see traces of the early startup phase. + pf. [PARIDE] See Documentation/paride.txt.
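As an illustration, the sysfs entries documented above are meant to be consumed by user-level tools with plain file reads and writes. The following minimal C sketch is not part of the patch; it merely assumes the perfmon2 sysfs tree is present at the documented paths and omits error reporting:

    #include <stdio.h>

    /* Dump a few of the /sys/kernel/perfmon entries documented above. */
    static void show(const char *path)
    {
            char buf[128];
            FILE *f = fopen(path, "r");

            if (!f)
                    return;
            if (fgets(buf, sizeof(buf), f))
                    printf("%-40s %s", path, buf);
            fclose(f);
    }

    int main(void)
    {
            show("/sys/kernel/perfmon/version");
            show("/sys/kernel/perfmon/task_sessions_count");
            show("/sys/kernel/perfmon/pmu_desc/model");
            return 0;
    }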
diff --git a/Documentation/perfmon2-debugfs.txt b/Documentation/perfmon2-debugfs.txt new file mode 100644 index 0000000..b30cae8 --- /dev/null +++ b/Documentation/perfmon2-debugfs.txt @@ -0,0 +1,126 @@ + The perfmon2 debug and statistics interface + ------------------------------------------ + Stephane Eranian + + +The perfmon2 interface exports a set of statistics which are used to tune and +debug the implementation. The data is composed of a set of very simple metrics, +mostly aggregated counts and durations. They instrument key points in the +perfmon2 code, such as context switch and interrupt handling. + +The data is accessible via the debug filesystem (debugfs). Thus you need to +have the filesystem support enabled in your kernel. Furthermore, since 2.6.25, +the perfmon2 statistics interface is an optional component. It needs to be +explicitly enabled in the kernel config file (CONFIG_PERFMON_DEBUG_FS). + +To access the data, the debugfs filesystem must be mounted. Supposing the mount +point is /debugfs, you would need to do: + $ mount -t debugfs none /debugfs + +The data is located under the perfmon subdirectory and is organized per CPU. +For each CPU, the same set of metrics is available, one metric per file in +clear ASCII text. + +The metrics are as follows: + + ctxswin_count (read-only): + + Number of PMU context switches in. + + ctxswin_ns (read-only): + + Number of nanoseconds spent in the PMU context switch in + routine. Dividing this number by the value of ctxswin_count + yields the average cost of the PMU context switch in. + + ctxswout_count (read-only): + + Number of PMU context switches out. + + ctxswout_ns (read-only): + + Number of nanoseconds spent in the PMU context switch out + routine. Dividing this number by the value of ctxswout_count + yields the average cost of the PMU context switch out. + + fmt_handler_calls (read-only): + + Number of calls to the sampling format routine that handles + PMU interrupts, i.e., typically the routine that records a + sample. + + fmt_handler_ns (read-only): + + Number of nanoseconds spent in the routine that handles PMU + interrupts in the sampling format. Dividing this number by + the number of calls provided by fmt_handler_calls yields the + average time spent in this routine. + + ovfl_intr_all_count (read-only): + + Number of PMU interrupts received by the kernel. + + ovfl_intr_nmi_count (read-only): + + Number of Non-Maskable Interrupts (NMI) received by the kernel + for perfmon. This is relevant only on X86 hardware. + + ovfl_intr_ns (read-only): + + Number of nanoseconds spent in the perfmon2 PMU interrupt + handler routine. Dividing this number by ovfl_intr_all_count + yields the average time to handle one PMU interrupt. + + ovfl_intr_regular_count (read-only): + + Number of PMU interrupts which are actually processed by + the perfmon interrupt handler. There may be spurious or replay + interrupts. + + ovfl_intr_replay_count (read-only): + + Number of PMU interrupts which were replayed on context switch + in or on event set switching. Interrupts get replayed when they + were in flight at the time monitoring had to be stopped. + + ovfl_intr_spurious_count (read-only): + + Number of PMU interrupts which were dropped because there was + no active context (session). + + ovfl_notify_count (read-only): + + Number of user level notifications sent. Notifications are + appended as messages to the context queue. Notifications may + be sent on PMU interrupts. + + pfm_restart_count (read-only): + + Number of times pfm_restart() is called.
+ + reset_pmds_count (read-only): + + Number of times pfm_reset_pmds() is called. + + set_switch_count (read-only): + + Number of event set switches. + + set_switch_ns (read-only): + + Number of nanoseconds spent in the set switching routine. + Dividing this number by set_switch_count yields the average + cost of switching sets. + + handle_timeout_count (read-only): + + Number of times the pfm_handle_timeout() routine is called. + It is used for timeout-based set switching. + + handle_work_count (read-only): + + Number of times pfm_handle_work() is called. The routine + handles asynchronous perfmon2 work for per-thread contexts + (sessions). + diff --git a/Documentation/perfmon2.txt b/Documentation/perfmon2.txt new file mode 100644 index 0000000..4a8fada --- /dev/null +++ b/Documentation/perfmon2.txt @@ -0,0 +1,213 @@ + The perfmon2 hardware monitoring interface + ------------------------------------------ + Stephane Eranian + + +I/ Introduction + + The perfmon2 interface provides access to the hardware performance counters of + major processors. Nowadays, all processors implement some flavor of performance + counters which capture micro-architectural level information such as the number + of elapsed cycles, number of cache misses, and so on. + + The interface is implemented as a set of new system calls and a set of config files + in /sys. + + It is possible to monitor a single thread or a CPU. In either mode, applications + can count or collect samples. System-wide monitoring is supported by running a + monitoring session on each CPU. The interface supports event-based sampling where the + sampling period is expressed as a number of occurrences of an event, instead of just a + timeout. This approach provides much better granularity and flexibility. + + For performance reasons, it is possible to use a kernel-level sampling buffer to minimize + the overhead incurred by sampling. The format of the buffer, i.e., what is recorded, how + it is recorded, and how it is exported to user-land, is controlled by a kernel module called + a custom sampling format. The current implementation comes with a default format but + it is possible to create additional formats. There is an in-kernel registration + interface for formats. Each format is identified by a simple string which a tool + can pass when a monitoring session is created. + + The interface also provides support for event sets and multiplexing to work around + hardware limitations in the number of available counters or in how events can be + combined. Each set defines as many counters as the hardware can support. The kernel + then multiplexes the sets. The interface supports time-based switching as well as + overflow-based switching, i.e., after n overflows of designated counters. + + Applications never manipulate the actual performance counter registers. Instead they see + a logical Performance Monitoring Unit (PMU) composed of a set of config registers (PMC) + and a set of data registers (PMD). Note that PMDs are not necessarily counters; they + can be buffers. The logical PMU is then mapped onto the actual PMU using a mapping + table which is implemented as a kernel module. The mapping is chosen once for each + new processor. It is visible in /sys/kernel/perfmon/pmu_desc. The kernel module + is automatically loaded on first use. + + A monitoring session, or context, is uniquely identified by a file descriptor + obtained when the context is created. File sharing semantics apply to access + the context inside a process.
A context is never inherited across fork. The file + descriptor can be used to receive counter overflow notifications or notifications + that the sampling buffer is full. It is possible to use poll/select on the descriptor + to wait for notifications from multiple contexts. Similarly, the descriptor + supports asynchronous notification via SIGIO. + + Counters are always exported as being 64-bit wide regardless of what the underlying + hardware implements. + +II/ Kernel compilation + + To enable perfmon2, you need to enable CONFIG_PERFMON. + +III/ OProfile interactions + + The set of features offered by perfmon2 is rich enough to support migrating + Oprofile on top of it. That means that PMU programming and low-level interrupt + handling could be done by perfmon2. The Oprofile sampling buffer management code + in the kernel as well as how samples are exported to users could remain through + the use of a custom sampling buffer format. This is how Oprofile works on Itanium. + + The current interactions with Oprofile are: + - on X86: Both subsystems can be compiled into the same kernel. There is enforced + mutual exclusion between the two subsystems. When there is an Oprofile + session, no perfmon2 session can exist and vice-versa. A perfmon2 session + encapsulates both per-thread and system-wide sessions here. + + - On IA-64: Oprofile works on top of perfmon2. Oprofile being a system-wide monitoring + tool, the regular per-thread vs. system-wide session restrictions apply. + + - on PPC: no integration yet. You need to enable/disable one of the two subsystems. + - on MIPS: no integration yet. You need to enable/disable one of the two subsystems. + +IV/ User tools + + We have released a simple monitoring tool to demonstrate the features of the + interface. The tool is called pfmon and it comes with a simple helper library + called libpfm. The library comes with a set of examples to show how to use the + kernel perfmon2 interface. Visit http://perfmon2.sf.net for details. + + There may be other tools available for perfmon2. + +V/ How to program? + + The best way to learn how to program perfmon2 is to take a look at the source + code for the examples in libpfm. The source code is available from: + http://perfmon2.sf.net + +VI/ System calls overview + + The interface is implemented by the following system calls: + + * int pfm_create_context(pfarg_ctx_t *ctx, char *fmt, void *arg, size_t arg_size) + + This function creates a perfmon2 context. The type of context is per-thread by + default unless PFM_FL_SYSTEM_WIDE is passed in ctx. The sampling format name + is passed in fmt. Arguments to the format are passed in arg which is of size + arg_size. Upon successful return, the file descriptor identifying the context + is returned. + + * int pfm_write_pmds(int fd, pfarg_pmd_t *pmds, int n) + + This function is used to program the PMD registers. It is possible to pass + vectors of PMDs. + + * int pfm_write_pmcs(int fd, pfarg_pmc_t *pmcs, int n) + + This function is used to program the PMC registers. It is possible to pass + vectors of PMCs. + + * int pfm_read_pmds(int fd, pfarg_pmd_t *pmds, int n) + + This function is used to read the PMD registers. It is possible to pass + vectors of PMDs. + + * int pfm_load_context(int fd, pfarg_load_t *load) + + This function is used to attach the context to a thread or CPU. + Thread means kernel-visible thread (NPTL). The thread identification + as obtained by gettid must be passed to load->load_target.
+ + To operate on another thread (not self), it is mandatory that the thread + be stopped via ptrace(). + + To attach to a CPU, the CPU number must be specified in load->load_target + AND the call must be issued on that CPU. To monitor a CPU, a thread MUST + be pinned on that CPU. + + Until the context is attached, the actual counters are not accessed. + + * int pfm_unload_context(int fd) + + The context is detached from the thread or CPU it was attached to. + As a consequence, monitoring is stopped. + + When monitoring another thread, the thread MUST be stopped via ptrace() + for this function to succeed. + + * int pfm_start(int fd, pfarg_start_t *st) + + Start monitoring. The context must be attached for this function to succeed. + Optionally, it is possible to specify the event set on which to start using the + st argument, otherwise just pass NULL. + + When monitoring another thread, the thread MUST be stopped via ptrace() + for this function to succeed. + + * int pfm_stop(int fd) + + Stop monitoring. The context must be attached for this function to succeed. + + When monitoring another thread, the thread MUST be stopped via ptrace() + for this function to succeed. + + * int pfm_create_evtsets(int fd, pfarg_setdesc_t *sets, int n) + + This function is used to create or change event sets. By default set 0 exists. + It is possible to create/change multiple sets in one call. + + The context must be detached for this call to succeed. + + Sets are identified by a 16-bit integer. They are sorted based on this + identifier and switching occurs in a round-robin fashion. + + * int pfm_delete_evtsets(int fd, pfarg_setdesc_t *sets, int n) + + Delete event sets. The context must be detached for this call to succeed. + + * int pfm_getinfo_evtsets(int fd, pfarg_setinfo_t *sets, int n) + + Retrieve information about event sets. In particular it is possible + to get the number of activations of a set. It is possible to retrieve + information about multiple sets in one call. + + * int pfm_restart(int fd) + + Indicate to the kernel that the application is done processing an overflow + notification. A consequence of this call could be that monitoring resumes. + + * int read(fd, pfm_msg_t *msg, sizeof(pfm_msg_t)) + + The regular read() system call can be used with the context file descriptor to + receive overflow notification messages. Non-blocking read() is supported. + + Each message carries information about the overflow such as which counter overflowed + and where the program was (interrupted instruction pointer). + + * int close(int fd) + + To destroy a context, the regular close() system call is used. + + A sketch combining these calls into a minimal counting session is shown + after section VIII below. + +VII/ /sys interface overview + + Refer to Documentation/ABI/testing/sysfs-perfmon-* for a detailed description + of the sysfs interface of perfmon2. + +VIII/ debugfs interface overview + + Refer to Documentation/perfmon2-debugfs.txt for a detailed description of the + debug and statistics interface of perfmon2.
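+
+ A minimal counting session (sketch)
+
+ The fragment below combines the calls from section VI into a minimal
+ self-monitoring counting session. It is only an illustrative sketch, not
+ part of the kernel or of libpfm: the user-level header path, the pfarg_*
+ field names (reg_num, reg_value, load_target) and the PMC/PMD register
+ numbers and event encoding are assumptions that may differ between
+ interface revisions. Refer to the libpfm examples for authoritative code.
+
+   #include <stdio.h>
+   #include <string.h>
+   #include <unistd.h>
+   #include <sys/syscall.h>
+   #include <perfmon/perfmon.h>  /* assumed libpfm/perfmon2 user header */
+
+   int main(void)
+   {
+           pfarg_ctx_t  ctx;
+           pfarg_pmc_t  pmc;
+           pfarg_pmd_t  pmd;
+           pfarg_load_t load;
+           int fd;
+
+           memset(&ctx,  0, sizeof(ctx));
+           memset(&pmc,  0, sizeof(pmc));
+           memset(&pmd,  0, sizeof(pmd));
+           memset(&load, 0, sizeof(load));
+
+           /* per-thread context; no sampling format needed for counting */
+           fd = pfm_create_context(&ctx, NULL, NULL, 0);
+           if (fd < 0)
+                   return 1;
+
+           /* program one config/data register pair; the event encoding
+            * written to reg_value is PMU-specific and normally computed
+            * by libpfm, so the register number and value here are
+            * placeholders */
+           pmc.reg_num   = 4;
+           pmc.reg_value = 0; /* PMU-specific event encoding */
+           pfm_write_pmcs(fd, &pmc, 1);
+
+           pmd.reg_num   = 4;
+           pmd.reg_value = 0;
+           pfm_write_pmds(fd, &pmd, 1);
+
+           /* attach to the calling thread, then count around the
+            * workload of interest */
+           load.load_target = syscall(SYS_gettid);
+           pfm_load_context(fd, &load);
+
+           pfm_start(fd, NULL);
+           /* ... workload to measure ... */
+           pfm_stop(fd);
+
+           pfm_read_pmds(fd, &pmd, 1);
+           printf("count=%llu\n", (unsigned long long)pmd.reg_value);
+
+           close(fd);
+           return 0;
+   }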
+ +IX/ Documentation + + Visit http://perfmon2.sf.net diff --git a/MAINTAINERS b/MAINTAINERS index 8dae455..fb38c2a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3239,6 +3239,14 @@ M: balbir@linux.vnet.ibm.com L: linux-kernel@vger.kernel.org S: Maintained +PERFMON SUBSYSTEM +P: Stephane Eranian +M: eranian@gmail.com +L: perfmon2-devel@lists.sf.net +W: http://perfmon2.sf.net +T: git kernel.org:/pub/scm/linux/kernel/git/eranian/linux-2.6 +S: Maintained + PERSONALITY HANDLING P: Christoph Hellwig M: hch@infradead.org diff --git a/Makefile b/Makefile index 16e3fbb..7bb1320 100644 --- a/Makefile +++ b/Makefile @@ -620,6 +620,7 @@ export mod_strip_cmd ifeq ($(KBUILD_EXTMOD),) core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ +core-$(CONFIG_PERFMON) += perfmon/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 48e496f..1d79b01 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -470,14 +470,6 @@ config COMPAT_FOR_U64_ALIGNMENT config IA64_MCA_RECOVERY tristate "MCA recovery from errors other than TLB." -config PERFMON - bool "Performance monitor support" - help - Selects whether support for the IA-64 performance monitor hardware - is included in the kernel. This makes some kernel data-structures a - little bigger and slows down execution a bit, but it is generally - a good idea to turn this on. If you're unsure, say Y. - config IA64_PALINFO tristate "/proc/pal support" help @@ -549,6 +541,8 @@ source "drivers/firmware/Kconfig" source "fs/Kconfig.binfmt" +source "arch/ia64/perfmon/Kconfig" + endmenu menu "Power management and ACPI" diff --git a/arch/ia64/Makefile b/arch/ia64/Makefile index 905d25b..9aa622d 100644 --- a/arch/ia64/Makefile +++ b/arch/ia64/Makefile @@ -57,6 +57,7 @@ core-$(CONFIG_IA64_GENERIC) += arch/ia64/dig/ core-$(CONFIG_IA64_HP_ZX1) += arch/ia64/dig/ core-$(CONFIG_IA64_HP_ZX1_SWIOTLB) += arch/ia64/dig/ core-$(CONFIG_IA64_SGI_SN2) += arch/ia64/sn/ +core-$(CONFIG_PERFMON) += arch/ia64/perfmon/ core-$(CONFIG_IA64_SGI_UV) += arch/ia64/uv/ core-$(CONFIG_KVM) += arch/ia64/kvm/ diff --git a/arch/ia64/configs/generic_defconfig b/arch/ia64/configs/generic_defconfig index 9f48397..ff9572a 100644 --- a/arch/ia64/configs/generic_defconfig +++ b/arch/ia64/configs/generic_defconfig @@ -209,7 +209,6 @@ CONFIG_IA32_SUPPORT=y CONFIG_COMPAT=y CONFIG_COMPAT_FOR_U64_ALIGNMENT=y CONFIG_IA64_MCA_RECOVERY=y -CONFIG_PERFMON=y CONFIG_IA64_PALINFO=y # CONFIG_IA64_MC_ERR_INJECT is not set CONFIG_SGI_SN=y @@ -234,6 +233,16 @@ CONFIG_BINFMT_ELF=y CONFIG_BINFMT_MISC=m # +# Hardware Performance Monitoring support +# +CONFIG_PERFMON=y +CONFIG_IA64_PERFMON_COMPAT=y +CONFIG_IA64_PERFMON_GENERIC=m +CONFIG_IA64_PERFMON_ITANIUM=y +CONFIG_IA64_PERFMON_MCKINLEY=y +CONFIG_IA64_PERFMON_MONTECITO=y + +# # Power management and ACPI # CONFIG_PM=y diff --git a/arch/ia64/include/asm/Kbuild b/arch/ia64/include/asm/Kbuild index ccbe8ae..cf64b3b 100644 --- a/arch/ia64/include/asm/Kbuild +++ b/arch/ia64/include/asm/Kbuild @@ -5,10 +5,12 @@ header-y += fpu.h header-y += fpswa.h header-y += ia64regs.h header-y += intel_intrin.h -header-y += perfmon_default_smpl.h header-y += ptrace_offsets.h header-y += rse.h header-y += ucontext.h +header-y += perfmon.h +header-y += perfmon_compat.h +header-y += perfmon_default_smpl.h unifdef-y += gcc_intrin.h unifdef-y += intrinsics.h diff --git a/arch/ia64/include/asm/hw_irq.h b/arch/ia64/include/asm/hw_irq.h index 5c99cbc..4a45cb0 100644 --- 
a/arch/ia64/include/asm/hw_irq.h +++ b/arch/ia64/include/asm/hw_irq.h @@ -67,9 +67,9 @@ extern int ia64_last_device_vector; #define IA64_NUM_DEVICE_VECTORS (IA64_LAST_DEVICE_VECTOR - IA64_FIRST_DEVICE_VECTOR + 1) #define IA64_MCA_RENDEZ_VECTOR 0xe8 /* MCA rendez interrupt */ -#define IA64_PERFMON_VECTOR 0xee /* performance monitor interrupt vector */ #define IA64_TIMER_VECTOR 0xef /* use highest-prio group 15 interrupt for timer */ #define IA64_MCA_WAKEUP_VECTOR 0xf0 /* MCA wakeup (must be >MCA_RENDEZ_VECTOR) */ +#define IA64_PERFMON_VECTOR 0xf1 /* performance monitor interrupt vector */ #define IA64_IPI_LOCAL_TLB_FLUSH 0xfc /* SMP flush local TLB */ #define IA64_IPI_RESCHEDULE 0xfd /* SMP reschedule */ #define IA64_IPI_VECTOR 0xfe /* inter-processor interrupt vector */ diff --git a/arch/ia64/include/asm/perfmon.h b/arch/ia64/include/asm/perfmon.h index 7f3333d..150c4b4 100644 --- a/arch/ia64/include/asm/perfmon.h +++ b/arch/ia64/include/asm/perfmon.h @@ -1,279 +1,59 @@ /* - * Copyright (C) 2001-2003 Hewlett-Packard Co - * Stephane Eranian - */ - -#ifndef _ASM_IA64_PERFMON_H -#define _ASM_IA64_PERFMON_H - -/* - * perfmon comamnds supported on all CPU models - */ -#define PFM_WRITE_PMCS 0x01 -#define PFM_WRITE_PMDS 0x02 -#define PFM_READ_PMDS 0x03 -#define PFM_STOP 0x04 -#define PFM_START 0x05 -#define PFM_ENABLE 0x06 /* obsolete */ -#define PFM_DISABLE 0x07 /* obsolete */ -#define PFM_CREATE_CONTEXT 0x08 -#define PFM_DESTROY_CONTEXT 0x09 /* obsolete use close() */ -#define PFM_RESTART 0x0a -#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */ -#define PFM_GET_FEATURES 0x0c -#define PFM_DEBUG 0x0d -#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */ -#define PFM_GET_PMC_RESET_VAL 0x0f -#define PFM_LOAD_CONTEXT 0x10 -#define PFM_UNLOAD_CONTEXT 0x11 - -/* - * PMU model specific commands (may not be supported on all PMU models) - */ -#define PFM_WRITE_IBRS 0x20 -#define PFM_WRITE_DBRS 0x21 - -/* - * context flags - */ -#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user level notifications */ -#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ -#define PFM_FL_OVFL_NO_MSG 0x80 /* do not post overflow/end messages for notification */ - -/* - * event set flags - */ -#define PFM_SETFL_EXCL_IDLE 0x01 /* exclude idle task (syswide only) XXX: DO NOT USE YET */ - -/* - * PMC flags - */ -#define PFM_REGFL_OVFL_NOTIFY 0x1 /* send notification on overflow */ -#define PFM_REGFL_RANDOM 0x2 /* randomize sampling interval */ - -/* - * PMD/PMC/IBR/DBR return flags (ignored on input) + * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian * - * Those flags are used on output and must be checked in case EAGAIN is returned - * by any of the calls using a pfarg_reg_t or pfarg_dbreg_t structure. 
- */ -#define PFM_REG_RETFL_NOTAVAIL (1UL<<31) /* set if register is implemented but not available */ -#define PFM_REG_RETFL_EINVAL (1UL<<30) /* set if register entry is invalid */ -#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|PFM_REG_RETFL_EINVAL) - -#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0) - -typedef unsigned char pfm_uuid_t[16]; /* custom sampling buffer identifier type */ - -/* - * Request structure used to define a context - */ -typedef struct { - pfm_uuid_t ctx_smpl_buf_id; /* which buffer format to use (if needed) */ - unsigned long ctx_flags; /* noblock/block */ - unsigned short ctx_nextra_sets; /* number of extra event sets (you always get 1) */ - unsigned short ctx_reserved1; /* for future use */ - int ctx_fd; /* return arg: unique identification for context */ - void *ctx_smpl_vaddr; /* return arg: virtual address of sampling buffer, is used */ - unsigned long ctx_reserved2[11];/* for future use */ -} pfarg_context_t; - -/* - * Request structure used to write/read a PMC or PMD - */ -typedef struct { - unsigned int reg_num; /* which register */ - unsigned short reg_set; /* event set for this register */ - unsigned short reg_reserved1; /* for future use */ - - unsigned long reg_value; /* initial pmc/pmd value */ - unsigned long reg_flags; /* input: pmc/pmd flags, return: reg error */ - - unsigned long reg_long_reset; /* reset after buffer overflow notification */ - unsigned long reg_short_reset; /* reset after counter overflow */ - - unsigned long reg_reset_pmds[4]; /* which other counters to reset on overflow */ - unsigned long reg_random_seed; /* seed value when randomization is used */ - unsigned long reg_random_mask; /* bitmask used to limit random value */ - unsigned long reg_last_reset_val;/* return: PMD last reset value */ - - unsigned long reg_smpl_pmds[4]; /* which pmds are accessed when PMC overflows */ - unsigned long reg_smpl_eventid; /* opaque sampling event identifier */ - - unsigned long reg_reserved2[3]; /* for future use */ -} pfarg_reg_t; - -typedef struct { - unsigned int dbreg_num; /* which debug register */ - unsigned short dbreg_set; /* event set for this register */ - unsigned short dbreg_reserved1; /* for future use */ - unsigned long dbreg_value; /* value for debug register */ - unsigned long dbreg_flags; /* return: dbreg error */ - unsigned long dbreg_reserved2[1]; /* for future use */ -} pfarg_dbreg_t; - -typedef struct { - unsigned int ft_version; /* perfmon: major [16-31], minor [0-15] */ - unsigned int ft_reserved; /* reserved for future use */ - unsigned long reserved[4]; /* for future use */ -} pfarg_features_t; - -typedef struct { - pid_t load_pid; /* process to load the context into */ - unsigned short load_set; /* first event set to load */ - unsigned short load_reserved1; /* for future use */ - unsigned long load_reserved2[3]; /* for future use */ -} pfarg_load_t; - -typedef struct { - int msg_type; /* generic message header */ - int msg_ctx_fd; /* generic message header */ - unsigned long msg_ovfl_pmds[4]; /* which PMDs overflowed */ - unsigned short msg_active_set; /* active set at the time of overflow */ - unsigned short msg_reserved1; /* for future use */ - unsigned int msg_reserved2; /* for future use */ - unsigned long msg_tstamp; /* for perf tuning/debug */ -} pfm_ovfl_msg_t; - -typedef struct { - int msg_type; /* generic message header */ - int msg_ctx_fd; /* generic message header */ - unsigned long msg_tstamp; /* for perf tuning */ -} pfm_end_msg_t; - -typedef struct { - int msg_type; /* type of the 
message */ - int msg_ctx_fd; /* unique identifier for the context */ - unsigned long msg_tstamp; /* for perf tuning */ -} pfm_gen_msg_t; - -#define PFM_MSG_OVFL 1 /* an overflow happened */ -#define PFM_MSG_END 2 /* task to which context was attached ended */ - -typedef union { - pfm_ovfl_msg_t pfm_ovfl_msg; - pfm_end_msg_t pfm_end_msg; - pfm_gen_msg_t pfm_gen_msg; -} pfm_msg_t; - -/* - * Define the version numbers for both perfmon as a whole and the sampling buffer format. + * This file contains Itanium Processor Family specific definitions + * for the perfmon interface. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA */ -#define PFM_VERSION_MAJ 2U -#define PFM_VERSION_MIN 0U -#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|(PFM_VERSION_MIN & 0xffff)) -#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) -#define PFM_VERSION_MINOR(x) ((x) & 0xffff) - +#ifndef _ASM_IA64_PERFMON_H_ +#define _ASM_IA64_PERFMON_H_ /* - * miscellaneous architected definitions + * arch-specific user visible interface definitions */ -#define PMU_FIRST_COUNTER 4 /* first counting monitor (PMC/PMD) */ -#define PMU_MAX_PMCS 256 /* maximum architected number of PMC registers */ -#define PMU_MAX_PMDS 256 /* maximum architected number of PMD registers */ - -#ifdef __KERNEL__ - -extern long perfmonctl(int fd, int cmd, void *arg, int narg); - -typedef struct { - void (*handler)(int irq, void *arg, struct pt_regs *regs); -} pfm_intr_handler_desc_t; - -extern void pfm_save_regs (struct task_struct *); -extern void pfm_load_regs (struct task_struct *); -extern void pfm_exit_thread(struct task_struct *); -extern int pfm_use_debug_registers(struct task_struct *); -extern int pfm_release_debug_registers(struct task_struct *); -extern void pfm_syst_wide_update_task(struct task_struct *, unsigned long info, int is_ctxswin); -extern void pfm_inherit(struct task_struct *task, struct pt_regs *regs); -extern void pfm_init_percpu(void); -extern void pfm_handle_work(void); -extern int pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *h); -extern int pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *h); +#define PFM_ARCH_MAX_PMCS (256+64) +#define PFM_ARCH_MAX_PMDS (256+64) - - -/* - * Reset PMD register flags - */ -#define PFM_PMD_SHORT_RESET 0 -#define PFM_PMD_LONG_RESET 1 - -typedef union { - unsigned int val; - struct { - unsigned int notify_user:1; /* notify user program of overflow */ - unsigned int reset_ovfl_pmds:1; /* reset overflowed PMDs */ - unsigned int block_task:1; /* block monitored task on kernel exit */ - unsigned int mask_monitoring:1; /* mask monitors via PMCx.plm */ - unsigned int reserved:28; /* for future use */ - } bits; -} pfm_ovfl_ctrl_t; - -typedef struct { - unsigned char ovfl_pmd; /* index of overflowed PMD */ - unsigned char ovfl_notify; /* =1 if monitor requested overflow notification */ - unsigned short active_set; /* event set active at the time of the overflow */ - pfm_ovfl_ctrl_t 
ovfl_ctrl; /* return: perfmon controls to set by handler */ - - unsigned long pmd_last_reset; /* last reset value of of the PMD */ - unsigned long smpl_pmds[4]; /* bitmask of other PMD of interest on overflow */ - unsigned long smpl_pmds_values[PMU_MAX_PMDS]; /* values for the other PMDs of interest */ - unsigned long pmd_value; /* current 64-bit value of the PMD */ - unsigned long pmd_eventid; /* eventid associated with PMD */ -} pfm_ovfl_arg_t; - - -typedef struct { - char *fmt_name; - pfm_uuid_t fmt_uuid; - size_t fmt_arg_size; - unsigned long fmt_flags; - - int (*fmt_validate)(struct task_struct *task, unsigned int flags, int cpu, void *arg); - int (*fmt_getsize)(struct task_struct *task, unsigned int flags, int cpu, void *arg, unsigned long *size); - int (*fmt_init)(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *arg); - int (*fmt_handler)(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp); - int (*fmt_restart)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); - int (*fmt_restart_active)(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs); - int (*fmt_exit)(struct task_struct *task, void *buf, struct pt_regs *regs); - - struct list_head fmt_list; -} pfm_buffer_fmt_t; - -extern int pfm_register_buffer_fmt(pfm_buffer_fmt_t *fmt); -extern int pfm_unregister_buffer_fmt(pfm_uuid_t uuid); +#define PFM_ARCH_PMD_STK_ARG 8 +#define PFM_ARCH_PMC_STK_ARG 8 /* - * perfmon interface exported to modules + * Itanium specific context flags + * + * bits[00-15]: generic flags (see asm/perfmon.h) + * bits[16-31]: arch-specific flags */ -extern int pfm_mod_read_pmds(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_pmcs(struct task_struct *, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_ibrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs); -extern int pfm_mod_write_dbrs(struct task_struct *task, void *req, unsigned int nreq, struct pt_regs *regs); +#define PFM_ITA_FL_INSECURE 0x10000 /* clear psr.sp on non system, non self */ /* - * describe the content of the local_cpu_date->pfm_syst_info field + * Itanium specific public event set flags (set_flags) + * + * event set flags layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags */ -#define PFM_CPUINFO_SYST_WIDE 0x1 /* if set a system wide session exists */ -#define PFM_CPUINFO_DCR_PP 0x2 /* if set the system wide session has started */ -#define PFM_CPUINFO_EXCL_IDLE 0x4 /* the system wide session excludes the idle task */ +#define PFM_ITA_SETFL_EXCL_INTR 0x10000 /* exclude interrupt execution */ +#define PFM_ITA_SETFL_INTR_ONLY 0x20000 /* include only interrupt execution */ +#define PFM_ITA_SETFL_IDLE_EXCL 0x40000 /* stop monitoring in idle loop */ /* - * sysctl control structure. 
visible to sampling formats + * compatibility for version v2.0 of the interface */ -typedef struct { - int debug; /* turn on/off debugging via syslog */ - int debug_ovfl; /* turn on/off debug printk in overflow handler */ - int fastctxsw; /* turn on/off fast (unsecure) ctxsw */ - int expert_mode; /* turn on/off value checking */ -} pfm_sysctl_t; -extern pfm_sysctl_t pfm_sysctl; - - -#endif /* __KERNEL__ */ +#include -#endif /* _ASM_IA64_PERFMON_H */ +#endif /* _ASM_IA64_PERFMON_H_ */ diff --git a/arch/ia64/include/asm/perfmon_compat.h b/arch/ia64/include/asm/perfmon_compat.h new file mode 100644 index 0000000..5c14514 --- /dev/null +++ b/arch/ia64/include/asm/perfmon_compat.h @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This header file contains perfmon interface definition + * that are now obsolete and should be dropped in favor + * of their equivalent functions as explained below. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef _ASM_IA64_PERFMON_COMPAT_H_ +#define _ASM_IA64_PERFMON_COMPAT_H_ + +/* + * custom sampling buffer identifier type + */ +typedef __u8 pfm_uuid_t[16]; + +/* + * obsolete perfmon commands. Supported only on IA-64 for + * backward compatiblity reasons with perfmon v2.0. 
+ */ +#define PFM_WRITE_PMCS 0x01 /* use pfm_write_pmcs */ +#define PFM_WRITE_PMDS 0x02 /* use pfm_write_pmds */ +#define PFM_READ_PMDS 0x03 /* use pfm_read_pmds */ +#define PFM_STOP 0x04 /* use pfm_stop */ +#define PFM_START 0x05 /* use pfm_start */ +#define PFM_ENABLE 0x06 /* obsolete */ +#define PFM_DISABLE 0x07 /* obsolete */ +#define PFM_CREATE_CONTEXT 0x08 /* use pfm_create_context */ +#define PFM_DESTROY_CONTEXT 0x09 /* use close() */ +#define PFM_RESTART 0x0a /* use pfm_restart */ +#define PFM_PROTECT_CONTEXT 0x0b /* obsolete */ +#define PFM_GET_FEATURES 0x0c /* use /proc/sys/perfmon */ +#define PFM_DEBUG 0x0d /* /proc/sys/kernel/perfmon/debug */ +#define PFM_UNPROTECT_CONTEXT 0x0e /* obsolete */ +#define PFM_GET_PMC_RESET_VAL 0x0f /* use /proc/perfmon_map */ +#define PFM_LOAD_CONTEXT 0x10 /* use pfm_load_context */ +#define PFM_UNLOAD_CONTEXT 0x11 /* use pfm_unload_context */ + +/* + * PMU model specific commands (may not be supported on all PMU models) + */ +#define PFM_WRITE_IBRS 0x20 /* obsolete: use PFM_WRITE_PMCS[256-263]*/ +#define PFM_WRITE_DBRS 0x21 /* obsolete: use PFM_WRITE_PMCS[264-271]*/ + +/* + * argument to PFM_CREATE_CONTEXT + */ +struct pfarg_context { + pfm_uuid_t ctx_smpl_buf_id; /* buffer format to use */ + unsigned long ctx_flags; /* noblock/block */ + unsigned int ctx_reserved1; /* for future use */ + int ctx_fd; /* return: fildesc */ + void *ctx_smpl_vaddr; /* return: vaddr of buffer */ + unsigned long ctx_reserved3[11];/* for future use */ +}; + +/* + * argument structure for PFM_WRITE_PMCS/PFM_WRITE_PMDS/PFM_WRITE_PMDS + */ +struct pfarg_reg { + unsigned int reg_num; /* which register */ + unsigned short reg_set; /* event set for this register */ + unsigned short reg_reserved1; /* for future use */ + + unsigned long reg_value; /* initial pmc/pmd value */ + unsigned long reg_flags; /* input: flags, ret: error */ + + unsigned long reg_long_reset; /* reset value after notification */ + unsigned long reg_short_reset; /* reset after counter overflow */ + + unsigned long reg_reset_pmds[4]; /* registers to reset on overflow */ + unsigned long reg_random_seed; /* seed for randomization */ + unsigned long reg_random_mask; /* random range limit */ + unsigned long reg_last_reset_val;/* return: PMD last reset value */ + + unsigned long reg_smpl_pmds[4]; /* pmds to be saved on overflow */ + unsigned long reg_smpl_eventid; /* opaque sampling event id */ + unsigned long reg_ovfl_switch_cnt;/* #overflows to switch */ + + unsigned long reg_reserved2[2]; /* for future use */ +}; + +/* + * argument to PFM_WRITE_IBRS/PFM_WRITE_DBRS + */ +struct pfarg_dbreg { + unsigned int dbreg_num; /* which debug register */ + unsigned short dbreg_set; /* event set */ + unsigned short dbreg_reserved1; /* for future use */ + unsigned long dbreg_value; /* value for debug register */ + unsigned long dbreg_flags; /* return: dbreg error */ + unsigned long dbreg_reserved2[1]; /* for future use */ +}; + +/* + * argument to PFM_GET_FEATURES + */ +struct pfarg_features { + unsigned int ft_version; /* major [16-31], minor [0-15] */ + unsigned int ft_reserved; /* reserved for future use */ + unsigned long reserved[4]; /* for future use */ +}; + +typedef struct { + int msg_type; /* generic message header */ + int msg_ctx_fd; /* generic message header */ + unsigned long msg_ovfl_pmds[4]; /* which PMDs overflowed */ + unsigned short msg_active_set; /* active set on overflow */ + unsigned short msg_reserved1; /* for future use */ + unsigned int msg_reserved2; /* for future use */ + unsigned long 
msg_tstamp; /* for perf tuning/debug */ +} pfm_ovfl_msg_t; + +typedef struct { + int msg_type; /* generic message header */ + int msg_ctx_fd; /* generic message header */ + unsigned long msg_tstamp; /* for perf tuning */ +} pfm_end_msg_t; + +typedef struct { + int msg_type; /* type of the message */ + int msg_ctx_fd; /* context file descriptor */ + unsigned long msg_tstamp; /* for perf tuning */ +} pfm_gen_msg_t; + +typedef union { + int type; + pfm_ovfl_msg_t pfm_ovfl_msg; + pfm_end_msg_t pfm_end_msg; + pfm_gen_msg_t pfm_gen_msg; +} pfm_msg_t; + +/* + * PMD/PMC return flags in case of error (ignored on input) + * + * reg_flags layout: + * bit 00-15 : generic flags + * bits[16-23] : arch-specific flags (see asm/perfmon.h) + * bit 24-31 : error codes + * + * Those flags are used on output and must be checked in case EINVAL is + * returned by a command accepting a vector of values and each has a flag + * field, such as pfarg_reg or pfarg_reg + */ +#define PFM_REG_RETFL_NOTAVAIL (1<<31) /* not implemented or unaccessible */ +#define PFM_REG_RETFL_EINVAL (1<<30) /* entry is invalid */ +#define PFM_REG_RETFL_MASK (PFM_REG_RETFL_NOTAVAIL|\ + PFM_REG_RETFL_EINVAL) + +#define PFM_REG_HAS_ERROR(flag) (((flag) & PFM_REG_RETFL_MASK) != 0) + +#endif /* _ASM_IA64_PERFMON_COMPAT_H_ */ diff --git a/arch/ia64/include/asm/perfmon_default_smpl.h b/arch/ia64/include/asm/perfmon_default_smpl.h index 48822c0..8234f32 100644 --- a/arch/ia64/include/asm/perfmon_default_smpl.h +++ b/arch/ia64/include/asm/perfmon_default_smpl.h @@ -1,83 +1,106 @@ /* - * Copyright (C) 2002-2003 Hewlett-Packard Co - * Stephane Eranian + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian * - * This file implements the default sampling buffer format - * for Linux/ia64 perfmon subsystem. + * This file implements the old default sampling buffer format + * for the perfmon2 subsystem. For IA-64 only. + * + * It requires the use of the perfmon_compat.h header. It is recommended + * that applications be ported to the new format instead. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA */ -#ifndef __PERFMON_DEFAULT_SMPL_H__ -#define __PERFMON_DEFAULT_SMPL_H__ 1 +#ifndef __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ +#define __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ 1 + +#ifndef __ia64__ +#error "this file must be used for compatibility reasons only on IA-64" +#endif #define PFM_DEFAULT_SMPL_UUID { \ - 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82, 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} + 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\ + 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} /* * format specific parameters (passed at context creation) */ -typedef struct { +struct pfm_default_smpl_arg { unsigned long buf_size; /* size of the buffer in bytes */ unsigned int flags; /* buffer specific flags */ unsigned int res1; /* for future use */ unsigned long reserved[2]; /* for future use */ -} pfm_default_smpl_arg_t; +}; /* * combined context+format specific structure. Can be passed - * to PFM_CONTEXT_CREATE + * to PFM_CONTEXT_CREATE (not PFM_CONTEXT_CREATE2) */ -typedef struct { - pfarg_context_t ctx_arg; - pfm_default_smpl_arg_t buf_arg; -} pfm_default_smpl_ctx_arg_t; +struct pfm_default_smpl_ctx_arg { + struct pfarg_context ctx_arg; + struct pfm_default_smpl_arg buf_arg; +}; /* * This header is at the beginning of the sampling buffer returned to the user. * It is directly followed by the first record. */ -typedef struct { - unsigned long hdr_count; /* how many valid entries */ - unsigned long hdr_cur_offs; /* current offset from top of buffer */ - unsigned long hdr_reserved2; /* reserved for future use */ +struct pfm_default_smpl_hdr { + u64 hdr_count; /* how many valid entries */ + u64 hdr_cur_offs; /* current offset from top of buffer */ + u64 dr_reserved2; /* reserved for future use */ - unsigned long hdr_overflows; /* how many times the buffer overflowed */ - unsigned long hdr_buf_size; /* how many bytes in the buffer */ + u64 hdr_overflows; /* how many times the buffer overflowed */ + u64 hdr_buf_size; /* how many bytes in the buffer */ - unsigned int hdr_version; /* contains perfmon version (smpl format diffs) */ - unsigned int hdr_reserved1; /* for future use */ - unsigned long hdr_reserved[10]; /* for future use */ -} pfm_default_smpl_hdr_t; + u32 hdr_version; /* smpl format version*/ + u32 hdr_reserved1; /* for future use */ + u64 hdr_reserved[10]; /* for future use */ +}; /* * Entry header in the sampling buffer. The header is directly followed - * with the values of the PMD registers of interest saved in increasing - * index order: PMD4, PMD5, and so on. How many PMDs are present depends + * with the values of the PMD registers of interest saved in increasing + * index order: PMD4, PMD5, and so on. How many PMDs are present depends * on how the session was programmed. * * In the case where multiple counters overflow at the same time, multiple * entries are written consecutively. * - * last_reset_value member indicates the initial value of the overflowed PMD. + * last_reset_value member indicates the initial value of the overflowed PMD. 
*/ -typedef struct { - int pid; /* thread id (for NPTL, this is gettid()) */ - unsigned char reserved1[3]; /* reserved for future use */ - unsigned char ovfl_pmd; /* index of overflowed PMD */ - - unsigned long last_reset_val; /* initial value of overflowed PMD */ - unsigned long ip; /* where did the overflow interrupt happened */ - unsigned long tstamp; /* ar.itc when entering perfmon intr. handler */ - - unsigned short cpu; /* cpu on which the overfow occured */ - unsigned short set; /* event set active when overflow ocurred */ - int tgid; /* thread group id (for NPTL, this is getpid()) */ -} pfm_default_smpl_entry_t; +struct pfm_default_smpl_entry { + pid_t pid; /* thread id (for NPTL, this is gettid()) */ + uint8_t reserved1[3]; /* for future use */ + uint8_t ovfl_pmd; /* overflow pmd for this sample */ + u64 last_reset_val; /* initial value of overflowed PMD */ + unsigned long ip; /* where did the overflow interrupt happened */ + u64 tstamp; /* overflow timetamp */ + u16 cpu; /* cpu on which the overfow occured */ + u16 set; /* event set active when overflow ocurred */ + pid_t tgid; /* thread group id (for NPTL, this is getpid()) */ +}; -#define PFM_DEFAULT_MAX_PMDS 64 /* how many pmds supported by data structures (sizeof(unsigned long) */ -#define PFM_DEFAULT_MAX_ENTRY_SIZE (sizeof(pfm_default_smpl_entry_t)+(sizeof(unsigned long)*PFM_DEFAULT_MAX_PMDS)) -#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE (sizeof(pfm_default_smpl_hdr_t)+PFM_DEFAULT_MAX_ENTRY_SIZE) +#define PFM_DEFAULT_MAX_PMDS 64 /* #pmds supported */ +#define PFM_DEFAULT_MAX_ENTRY_SIZE (sizeof(struct pfm_default_smpl_entry)+\ + (sizeof(u64)*PFM_DEFAULT_MAX_PMDS)) +#define PFM_DEFAULT_SMPL_MIN_BUF_SIZE (sizeof(struct pfm_default_smpl_hdr)+\ + PFM_DEFAULT_MAX_ENTRY_SIZE) #define PFM_DEFAULT_SMPL_VERSION_MAJ 2U -#define PFM_DEFAULT_SMPL_VERSION_MIN 0U -#define PFM_DEFAULT_SMPL_VERSION (((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|(PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff)) +#define PFM_DEFAULT_SMPL_VERSION_MIN 1U +#define PFM_DEFAULT_SMPL_VERSION (((PFM_DEFAULT_SMPL_VERSION_MAJ&0xffff)<<16)|\ + (PFM_DEFAULT_SMPL_VERSION_MIN & 0xffff)) -#endif /* __PERFMON_DEFAULT_SMPL_H__ */ +#endif /* __ASM_IA64_PERFMON_DEFAULT_SMPL_H__ */ diff --git a/arch/ia64/include/asm/perfmon_kern.h b/arch/ia64/include/asm/perfmon_kern.h new file mode 100644 index 0000000..fb40459 --- /dev/null +++ b/arch/ia64/include/asm/perfmon_kern.h @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2001-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains Itanium Processor Family specific definitions + * for the perfmon interface. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_IA64_PERFMON_KERN_H_ +#define _ASM_IA64_PERFMON_KERN_H_ + +#ifdef __KERNEL__ + +#ifdef CONFIG_PERFMON +#include +#include + +/* + * describe the content of the pfm_syst_info field + * layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags + */ +#define PFM_ITA_CPUINFO_IDLE_EXCL 0x10000 /* stop monitoring in idle loop */ + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, + unsigned int cnum) +{} + +/* + * called from __pfm_interrupt_handler(). ctx is not NULL. + * ctx is locked. PMU interrupt is masked. + * + * must stop all monitoring to ensure handler has consistent view. + * must collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 tmp; + + /* + * do not overwrite existing value, must + * process those first (coming from context switch replay) + */ + if (set->npend_ovfls) + return; + + ia64_srlz_d(); + + tmp = ia64_get_pmc(0) & ~0xf; + + set->povfl_pmds[0] = tmp; + + set->npend_ovfls = ia64_popcnt(tmp); +} + +static inline int pfm_arch_init_pmu_config(void) +{ + return 0; +} + +static inline void pfm_arch_resend_irq(struct pfm_context *ctx) +{ + ia64_resend_irq(IA64_PERFMON_VECTOR); +} + +static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_serialize(void) +{ + ia64_srlz_d(); +} + +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + PFM_DBG_ovfl("state=%d", ctx->state); + ia64_set_pmc(0, 0); + /* no serialization */ +} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + if (cnum < 256) { + ia64_set_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); + } else if (cnum < 264) { + ia64_set_ibr(cnum-256, value); + ia64_dv_serialize_instruction(); + } else { + ia64_set_dbr(cnum-264, value); + ia64_dv_serialize_instruction(); + } +} + +/* + * On IA-64, for per-thread context which have the ITA_FL_INSECURE + * flag, it is possible to start/stop monitoring directly from user evel + * without calling pfm_start()/pfm_stop. This allows very lightweight + * control yet the kernel sometimes needs to know if monitoring is actually + * on or off. + * + * Tracking of this information is normally done by pfm_start/pfm_stop + * in flags.started. Here we need to compensate by checking actual + * psr bit. 
+ */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started + || ia64_getreg(_IA64_REG_PSR) & (IA64_PSR_UP|IA64_PSR_PP); +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * for a counting PMD, overflow bit must be cleared + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) + value &= pfm_pmu_conf->ovfl_mask; + + /* + * for counters, write to upper bits are ignored, no need to mask + */ + ia64_set_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + return ia64_get_pmd(pfm_pmu_conf->pmd_desc[cnum].hw_addr); +} + +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + return ia64_get_pmc(pfm_pmu_conf->pmc_desc[cnum].hw_addr); +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx) +{ + struct pt_regs *regs; + + regs = task_pt_regs(task); + ia64_psr(regs)->pp = 0; +} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx) +{ + struct pt_regs *regs; + + if (!(ctx->active_set->flags & PFM_ITA_SETFL_INTR_ONLY)) { + regs = task_pt_regs(task); + ia64_psr(regs)->pp = 1; + } +} + +/* + * On IA-64, the PMDs are NOT saved by pfm_arch_freeze_pmu() + * when entering the PMU interrupt handler, thus, we need + * to save them in pfm_switch_sets_from_intr() + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags); + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx); + +void pfm_arch_unload_context(struct pfm_context *ctx); +int pfm_arch_load_context(struct pfm_context *ctx); +int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags); + +void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set); +void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set); + +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); + +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); + +int pfm_arch_init(void); +void pfm_arch_init_percpu(void); +char *pfm_arch_get_pmu_module_name(void); + +int __pfm_use_dbregs(struct task_struct *task); +int __pfm_release_dbregs(struct task_struct *task); +int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx, + struct pfm_event_set *set); + +void pfm_arch_show_session(struct seq_file *m); + +static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) +{ + return 0; +} + +static inline void pfm_arch_pmu_release(void) +{} + +/* not necessary on IA-64 */ +static inline void pfm_cacheflush(void *addr, unsigned int len) +{} + +/* + * miscellaneous architected definitions + */ +#define PFM_ITA_FCNTR 4 /* first counting monitor (PMC/PMD) */ + +/* + * private event set flags (set_priv_flags) + */ +#define PFM_ITA_SETFL_USE_DBR 0x1000000 /* set uses debug registers */ + + +/* + * Itanium-specific data structures + */ +struct pfm_ia64_context_flags { + unsigned int use_dbr:1; /* use range 
restrictions (debug registers) */ + unsigned int insecure:1; /* insecure monitoring for non-self session */ + unsigned int reserved:30;/* for future use */ +}; + +struct pfm_arch_context { + struct pfm_ia64_context_flags flags; /* arch specific ctx flags */ + u64 ctx_saved_psr_up;/* storage for psr_up */ +#ifdef CONFIG_IA64_PERFMON_COMPAT + void *ctx_smpl_vaddr; /* vaddr of user mapping */ +#endif +}; + +#ifdef CONFIG_IA64_PERFMON_COMPAT +ssize_t pfm_arch_compat_read(struct pfm_context *ctx, + char __user *buf, + int non_block, + size_t size); +int pfm_ia64_compat_init(void); +int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp); +#else +static inline ssize_t pfm_arch_compat_read(struct pfm_context *ctx, + char __user *buf, + int non_block, + size_t size) +{ + return -EINVAL; +} + +static inline int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx, + size_t rsize, struct file *filp) +{ + return -EINVAL; +} +#endif + +static inline void pfm_arch_arm_handle_work(struct task_struct *task) +{ + /* + * On IA-64, we ran out of bits in the bottom 7 bits of the + * threadinfo bitmask. Thus we use a 2-stage approach by piggybacking + * on NOTIFY_RESUME and then in do_notify_resume() we demultiplex and + * call pfm_handle_work() if needed + */ + set_tsk_thread_flag(task, TIF_NOTIFY_RESUME); +} + +static inline void pfm_arch_disarm_handle_work(struct task_struct *task) +{ + /* + * we cannot just clear TIF_NOTIFY_RESUME because other TIF flags are + * piggybacked onto it: TIF_PERFMON_WORK, TIF_RESTORE_RSE + * + * The tsk_clear_notify_resume() checks if any of those are set before + * clearing the bit + */ + tsk_clear_notify_resume(task); +} + +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} + +extern struct pfm_ia64_pmu_info *pfm_ia64_pmu_info; + +#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) + +/* + * IA-64 does not need extra alignment requirements for the sampling buffer + */ +#define PFM_ARCH_SMPL_ALIGN_SIZE 0 + + +static inline void pfm_release_dbregs(struct task_struct *task) +{ + if (task->thread.flags & IA64_THREAD_DBG_VALID) + __pfm_release_dbregs(task); +} + +#define pfm_use_dbregs(_t) __pfm_use_dbregs(_t) + +static inline int pfm_arch_get_base_syscall(void) +{ + return __NR_pfm_create_context; +} + +struct pfm_arch_pmu_info { + unsigned long mask_pmcs[PFM_PMC_BV]; /* modified when masking */ +}; + +DECLARE_PER_CPU(u32, pfm_syst_info); +#else /* !CONFIG_PERFMON */ +/* + * perfmon ia64-specific hooks + */ +#define pfm_release_dbregs(_t) do { } while (0) +#define pfm_use_dbregs(_t) (0) + +#endif /* CONFIG_PERFMON */ + +#endif /* __KERNEL__ */ +#endif /* _ASM_IA64_PERFMON_KERN_H_ */ diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index f88fa05..9d6af9c 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -42,7 +42,6 @@ #define IA64_THREAD_FPH_VALID (__IA64_UL(1) << 0) /* floating-point high state valid? */ #define IA64_THREAD_DBG_VALID (__IA64_UL(1) << 1) /* debug registers valid? */ -#define IA64_THREAD_PM_VALID (__IA64_UL(1) << 2) /* performance registers valid? */ #define IA64_THREAD_UAC_NOPRINT (__IA64_UL(1) << 3) /* don't log unaligned accesses */ #define IA64_THREAD_UAC_SIGBUS (__IA64_UL(1) << 4) /* generate SIGBUS on unaligned acc.
*/ #define IA64_THREAD_MIGRATION (__IA64_UL(1) << 5) /* require migration @@ -321,14 +320,6 @@ struct thread_struct { #else # define INIT_THREAD_IA32 #endif /* CONFIG_IA32_SUPPORT */ -#ifdef CONFIG_PERFMON - void *pfm_context; /* pointer to detailed PMU context */ - unsigned long pfm_needs_checking; /* when >0, pending perfmon work on kernel exit */ -# define INIT_THREAD_PM .pfm_context = NULL, \ - .pfm_needs_checking = 0UL, -#else -# define INIT_THREAD_PM -#endif __u64 dbr[IA64_NUM_DBG_REGS]; __u64 ibr[IA64_NUM_DBG_REGS]; struct ia64_fpreg fph[96]; /* saved/loaded on demand */ @@ -343,7 +334,6 @@ struct thread_struct { .task_size = DEFAULT_TASK_SIZE, \ .last_fph_cpu = -1, \ INIT_THREAD_IA32 \ - INIT_THREAD_PM \ .dbr = {0, }, \ .ibr = {0, }, \ .fph = {{{{0}}}, } \ diff --git a/arch/ia64/include/asm/system.h b/arch/ia64/include/asm/system.h index 927a381..ab5aeea 100644 --- a/arch/ia64/include/asm/system.h +++ b/arch/ia64/include/asm/system.h @@ -217,6 +217,7 @@ struct task_struct; extern void ia64_save_extra (struct task_struct *task); extern void ia64_load_extra (struct task_struct *task); + #ifdef CONFIG_VIRT_CPU_ACCOUNTING extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct *next); # define IA64_ACCOUNT_ON_SWITCH(p,n) ia64_account_on_switch(p,n) @@ -224,16 +225,9 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct # define IA64_ACCOUNT_ON_SWITCH(p,n) #endif -#ifdef CONFIG_PERFMON - DECLARE_PER_CPU(unsigned long, pfm_syst_info); -# define PERFMON_IS_SYSWIDE() (__get_cpu_var(pfm_syst_info) & 0x1) -#else -# define PERFMON_IS_SYSWIDE() (0) -#endif - -#define IA64_HAS_EXTRA_STATE(t) \ - ((t)->thread.flags & (IA64_THREAD_DBG_VALID|IA64_THREAD_PM_VALID) \ - || IS_IA32_PROCESS(task_pt_regs(t)) || PERFMON_IS_SYSWIDE()) +#define IA64_HAS_EXTRA_STATE(t) \ + (((t)->thread.flags & IA64_THREAD_DBG_VALID) \ + || IS_IA32_PROCESS(task_pt_regs(t))) #define __switch_to(prev,next,last) do { \ IA64_ACCOUNT_ON_SWITCH(prev, next); \ @@ -241,6 +235,10 @@ extern void ia64_account_on_switch (struct task_struct *prev, struct task_struct ia64_save_extra(prev); \ if (IA64_HAS_EXTRA_STATE(next)) \ ia64_load_extra(next); \ + if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw_out(prev, next); \ + if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw_in(prev, next); \ ia64_psr(task_pt_regs(next))->dfh = !ia64_is_local_fpu_owner(next); \ (last) = ia64_switch_to((next)); \ } while (0) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 7c60fcd..3355332 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -110,6 +110,8 @@ extern void tsk_clear_notify_resume(struct task_struct *tsk); #define TIF_DB_DISABLED 19 /* debug trap disabled for fsyscall */ #define TIF_FREEZE 20 /* is freezing for suspend */ #define TIF_RESTORE_RSE 21 /* user RBS is newer than kernel RBS */ +#define TIF_PERFMON_CTXSW 22 /* perfmon needs ctxsw calls */ +#define TIF_PERFMON_WORK 23 /* work for pfm_handle_work() */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) @@ -123,6 +125,8 @@ extern void tsk_clear_notify_resume(struct task_struct *tsk); #define _TIF_DB_DISABLED (1 << TIF_DB_DISABLED) #define _TIF_FREEZE (1 << TIF_FREEZE) #define _TIF_RESTORE_RSE (1 << TIF_RESTORE_RSE) +#define _TIF_PERFMON_CTXSW (1 << TIF_PERFMON_CTXSW) +#define _TIF_PERFMON_WORK (1 << TIF_PERFMON_WORK) /* "work to do on user-return" bits */ #define 
TIF_ALLWORK_MASK (_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SYSCALL_AUDIT|\ diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index d535833..29a43bc 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -308,11 +308,23 @@ #define __NR_dup3 1316 #define __NR_pipe2 1317 #define __NR_inotify_init1 1318 +#define __NR_pfm_create_context 1319 +#define __NR_pfm_write_pmcs (__NR_pfm_create_context+1) +#define __NR_pfm_write_pmds (__NR_pfm_create_context+2) +#define __NR_pfm_read_pmds (__NR_pfm_create_context+3) +#define __NR_pfm_load_context (__NR_pfm_create_context+4) +#define __NR_pfm_start (__NR_pfm_create_context+5) +#define __NR_pfm_stop (__NR_pfm_create_context+6) +#define __NR_pfm_restart (__NR_pfm_create_context+7) +#define __NR_pfm_create_evtsets (__NR_pfm_create_context+8) +#define __NR_pfm_getinfo_evtsets (__NR_pfm_create_context+9) +#define __NR_pfm_delete_evtsets (__NR_pfm_create_context+10) +#define __NR_pfm_unload_context (__NR_pfm_create_context+11) #ifdef __KERNEL__ -#define NR_syscalls 295 /* length of syscall table */ +#define NR_syscalls 307 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 87fea11..b5ac54c 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -5,7 +5,7 @@ extra-y := head.o init_task.o vmlinux.lds obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \ - irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \ + irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o ptrace.o sal.o \ salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \ unwind.o mca.o mca_asm.o topology.o @@ -23,7 +23,6 @@ obj-$(CONFIG_IOSAPIC) += iosapic.o obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_SMP) += smp.o smpboot.o obj-$(CONFIG_NUMA) += numa.o -obj-$(CONFIG_PERFMON) += perfmon_default_smpl.o obj-$(CONFIG_IA64_CYCLONE) += cyclone.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_IA64_MCA_RECOVERY) += mca_recovery.o diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 0dd6c14..f1c3e41 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1697,6 +1697,18 @@ sys_call_table: data8 sys_dup3 data8 sys_pipe2 data8 sys_inotify_init1 + data8 sys_pfm_create_context + data8 sys_pfm_write_pmcs // 1320 + data8 sys_pfm_write_pmds + data8 sys_pfm_read_pmds + data8 sys_pfm_load_context + data8 sys_pfm_start + data8 sys_pfm_stop // 1325 + data8 sys_pfm_restart + data8 sys_pfm_create_evtsets + data8 sys_pfm_getinfo_evtsets + data8 sys_pfm_delete_evtsets + data8 sys_pfm_unload_context // 1330 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c index 28d3d48..ede8024 100644 --- a/arch/ia64/kernel/irq_ia64.c +++ b/arch/ia64/kernel/irq_ia64.c @@ -40,10 +40,6 @@ #include #include -#ifdef CONFIG_PERFMON -# include -#endif - #define IRQ_DEBUG 0 #define IRQ_VECTOR_UNASSIGNED (0) @@ -660,9 +656,6 @@ init_IRQ (void) } #endif #endif -#ifdef CONFIG_PERFMON - pfm_init_percpu(); -#endif platform_irq_init(); } diff --git a/arch/ia64/kernel/perfmon_default_smpl.c b/arch/ia64/kernel/perfmon_default_smpl.c deleted file mode 100644 index 5f637bb..0000000 --- a/arch/ia64/kernel/perfmon_default_smpl.c +++ /dev/null @@ -1,296 +0,0 @@ -/* - * Copyright (C) 2002-2003 
Hewlett-Packard Co - * Stephane Eranian - * - * This file implements the default sampling buffer format - * for the Linux/ia64 perfmon-2 subsystem. - */ -#include -#include -#include -#include -#include -#include - -#include -#include - -MODULE_AUTHOR("Stephane Eranian "); -MODULE_DESCRIPTION("perfmon default sampling format"); -MODULE_LICENSE("GPL"); - -#define DEFAULT_DEBUG 1 - -#ifdef DEFAULT_DEBUG -#define DPRINT(a) \ - do { \ - if (unlikely(pfm_sysctl.debug >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \ - } while (0) - -#define DPRINT_ovfl(a) \ - do { \ - if (unlikely(pfm_sysctl.debug > 0 && pfm_sysctl.debug_ovfl >0)) { printk("%s.%d: CPU%d ", __func__, __LINE__, smp_processor_id()); printk a; } \ - } while (0) - -#else -#define DPRINT(a) -#define DPRINT_ovfl(a) -#endif - -static int -default_validate(struct task_struct *task, unsigned int flags, int cpu, void *data) -{ - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t*)data; - int ret = 0; - - if (data == NULL) { - DPRINT(("[%d] no argument passed\n", task_pid_nr(task))); - return -EINVAL; - } - - DPRINT(("[%d] validate flags=0x%x CPU%d\n", task_pid_nr(task), flags, cpu)); - - /* - * must hold at least the buffer header + one minimally sized entry - */ - if (arg->buf_size < PFM_DEFAULT_SMPL_MIN_BUF_SIZE) return -EINVAL; - - DPRINT(("buf_size=%lu\n", arg->buf_size)); - - return ret; -} - -static int -default_get_size(struct task_struct *task, unsigned int flags, int cpu, void *data, unsigned long *size) -{ - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; - - /* - * size has been validated in default_validate - */ - *size = arg->buf_size; - - return 0; -} - -static int -default_init(struct task_struct *task, void *buf, unsigned int flags, int cpu, void *data) -{ - pfm_default_smpl_hdr_t *hdr; - pfm_default_smpl_arg_t *arg = (pfm_default_smpl_arg_t *)data; - - hdr = (pfm_default_smpl_hdr_t *)buf; - - hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; - hdr->hdr_buf_size = arg->buf_size; - hdr->hdr_cur_offs = sizeof(*hdr); - hdr->hdr_overflows = 0UL; - hdr->hdr_count = 0UL; - - DPRINT(("[%d] buffer=%p buf_size=%lu hdr_size=%lu hdr_version=%u cur_offs=%lu\n", - task_pid_nr(task), - buf, - hdr->hdr_buf_size, - sizeof(*hdr), - hdr->hdr_version, - hdr->hdr_cur_offs)); - - return 0; -} - -static int -default_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, struct pt_regs *regs, unsigned long stamp) -{ - pfm_default_smpl_hdr_t *hdr; - pfm_default_smpl_entry_t *ent; - void *cur, *last; - unsigned long *e, entry_size; - unsigned int npmds, i; - unsigned char ovfl_pmd; - unsigned char ovfl_notify; - - if (unlikely(buf == NULL || arg == NULL|| regs == NULL || task == NULL)) { - DPRINT(("[%d] invalid arguments buf=%p arg=%p\n", task->pid, buf, arg)); - return -EINVAL; - } - - hdr = (pfm_default_smpl_hdr_t *)buf; - cur = buf+hdr->hdr_cur_offs; - last = buf+hdr->hdr_buf_size; - ovfl_pmd = arg->ovfl_pmd; - ovfl_notify = arg->ovfl_notify; - - /* - * precheck for sanity - */ - if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; - - npmds = hweight64(arg->smpl_pmds[0]); - - ent = (pfm_default_smpl_entry_t *)cur; - - prefetch(arg->smpl_pmds_values); - - entry_size = sizeof(*ent) + (npmds << 3); - - /* position for first pmd */ - e = (unsigned long *)(ent+1); - - hdr->hdr_count++; - - DPRINT_ovfl(("[%d] count=%lu cur=%p last=%p free_bytes=%lu ovfl_pmd=%d ovfl_notify=%d npmds=%u\n", - task->pid, - hdr->hdr_count, - cur, last, - last-cur, - ovfl_pmd, - ovfl_notify, npmds)); - - /* - 
* current = task running at the time of the overflow. - * - * per-task mode: - * - this is ususally the task being monitored. - * Under certain conditions, it might be a different task - * - * system-wide: - * - this is not necessarily the task controlling the session - */ - ent->pid = current->pid; - ent->ovfl_pmd = ovfl_pmd; - ent->last_reset_val = arg->pmd_last_reset; //pmd[0].reg_last_reset_val; - - /* - * where did the fault happen (includes slot number) - */ - ent->ip = regs->cr_iip | ((regs->cr_ipsr >> 41) & 0x3); - - ent->tstamp = stamp; - ent->cpu = smp_processor_id(); - ent->set = arg->active_set; - ent->tgid = current->tgid; - - /* - * selectively store PMDs in increasing index number - */ - if (npmds) { - unsigned long *val = arg->smpl_pmds_values; - for(i=0; i < npmds; i++) { - *e++ = *val++; - } - } - - /* - * update position for next entry - */ - hdr->hdr_cur_offs += entry_size; - cur += entry_size; - - /* - * post check to avoid losing the last sample - */ - if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) goto full; - - /* - * keep same ovfl_pmds, ovfl_notify - */ - arg->ovfl_ctrl.bits.notify_user = 0; - arg->ovfl_ctrl.bits.block_task = 0; - arg->ovfl_ctrl.bits.mask_monitoring = 0; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; /* reset before returning from interrupt handler */ - - return 0; -full: - DPRINT_ovfl(("sampling buffer full free=%lu, count=%lu, ovfl_notify=%d\n", last-cur, hdr->hdr_count, ovfl_notify)); - - /* - * increment number of buffer overflow. - * important to detect duplicate set of samples. - */ - hdr->hdr_overflows++; - - /* - * if no notification requested, then we saturate the buffer - */ - if (ovfl_notify == 0) { - arg->ovfl_ctrl.bits.notify_user = 0; - arg->ovfl_ctrl.bits.block_task = 0; - arg->ovfl_ctrl.bits.mask_monitoring = 1; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; - } else { - arg->ovfl_ctrl.bits.notify_user = 1; - arg->ovfl_ctrl.bits.block_task = 1; /* ignored for non-blocking context */ - arg->ovfl_ctrl.bits.mask_monitoring = 1; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 0; /* no reset now */ - } - return -1; /* we are full, sorry */ -} - -static int -default_restart(struct task_struct *task, pfm_ovfl_ctrl_t *ctrl, void *buf, struct pt_regs *regs) -{ - pfm_default_smpl_hdr_t *hdr; - - hdr = (pfm_default_smpl_hdr_t *)buf; - - hdr->hdr_count = 0UL; - hdr->hdr_cur_offs = sizeof(*hdr); - - ctrl->bits.mask_monitoring = 0; - ctrl->bits.reset_ovfl_pmds = 1; /* uses long-reset values */ - - return 0; -} - -static int -default_exit(struct task_struct *task, void *buf, struct pt_regs *regs) -{ - DPRINT(("[%d] exit(%p)\n", task_pid_nr(task), buf)); - return 0; -} - -static pfm_buffer_fmt_t default_fmt={ - .fmt_name = "default_format", - .fmt_uuid = PFM_DEFAULT_SMPL_UUID, - .fmt_arg_size = sizeof(pfm_default_smpl_arg_t), - .fmt_validate = default_validate, - .fmt_getsize = default_get_size, - .fmt_init = default_init, - .fmt_handler = default_handler, - .fmt_restart = default_restart, - .fmt_restart_active = default_restart, - .fmt_exit = default_exit, -}; - -static int __init -pfm_default_smpl_init_module(void) -{ - int ret; - - ret = pfm_register_buffer_fmt(&default_fmt); - if (ret == 0) { - printk("perfmon_default_smpl: %s v%u.%u registered\n", - default_fmt.fmt_name, - PFM_DEFAULT_SMPL_VERSION_MAJ, - PFM_DEFAULT_SMPL_VERSION_MIN); - } else { - printk("perfmon_default_smpl: %s cannot register ret=%d\n", - default_fmt.fmt_name, - ret); - } - - return ret; -} - -static void __exit -pfm_default_smpl_cleanup_module(void) -{ - int ret; - ret = 
pfm_unregister_buffer_fmt(default_fmt.fmt_uuid); - - printk("perfmon_default_smpl: unregister %s=%d\n", default_fmt.fmt_name, ret); -} - -module_init(pfm_default_smpl_init_module); -module_exit(pfm_default_smpl_cleanup_module); - diff --git a/arch/ia64/kernel/perfmon_generic.h b/arch/ia64/kernel/perfmon_generic.h deleted file mode 100644 index 6748947..0000000 --- a/arch/ia64/kernel/perfmon_generic.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * This file contains the generic PMU register description tables - * and pmc checker used by perfmon.c. - * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ - -static pfm_reg_desc_t pfm_gen_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_gen_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd1 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd2 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd3 */ { PFM_REG_NOTIMPL , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_gen={ - .pmu_name = "Generic", - .pmu_family = 0xff, /* any */ - .ovfl_val = (1UL << 32) - 1, - .num_ibrs = 0, /* does not use */ - .num_dbrs = 0, /* does not use */ - .pmd_desc = pfm_gen_pmd_desc, - .pmc_desc = pfm_gen_pmc_desc -}; - diff --git a/arch/ia64/kernel/perfmon_itanium.h b/arch/ia64/kernel/perfmon_itanium.h deleted file mode 100644 index d1d508a..0000000 --- a/arch/ia64/kernel/perfmon_itanium.h +++ /dev/null @@ -1,115 +0,0 @@ -/* - * This file contains the Itanium PMU register description tables - * and pmc checker used by perfmon.c. 
- * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ -static int pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -static pfm_reg_desc_t pfm_ita_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, -1UL, NULL, NULL, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc8 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc9 */ { PFM_REG_CONFIG , 0, 0xf00000003ffffff8UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc10 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0000000010000000UL, -1UL, NULL, pfm_ita_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc13 */ { PFM_REG_CONFIG , 0, 0x0003ffff00000001UL, -1UL, NULL, pfm_ita_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_ita_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd1 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd2 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd3 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, -/* pmd8 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd9 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd10 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd11 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, 
{RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd12 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd13 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd14 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd15 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd16 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd17 */ { PFM_REG_BUFFER , 0, 0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static int -pfm_ita_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret; - int is_loaded; - - /* sanitfy check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the (instruction) debug registers if pmc13.ta bit is cleared - * before they are written (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 13 && is_loaded && ((*val & 0x1) == 0UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc13.ta cleared, clearing ibr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(1, ctx, NULL, 0, regs); - if (ret) return ret; - } - - /* - * we must clear the (data) debug registers if pmc11.pt bit is cleared - * before they are written (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 11 && is_loaded && ((*val >> 28)& 0x1) == 0 && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc11.pt cleared, clearing dbr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(0, ctx, NULL, 0, regs); - if (ret) return ret; - } - return 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_ita={ - .pmu_name = "Itanium", - .pmu_family = 0x7, - .ovfl_val = (1UL << 32) - 1, - .pmd_desc = pfm_ita_pmd_desc, - .pmc_desc = pfm_ita_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1, /* debug register are use for range retrictions */ -}; - - diff --git a/arch/ia64/kernel/perfmon_mckinley.h b/arch/ia64/kernel/perfmon_mckinley.h deleted file mode 100644 index c4bec7a..0000000 --- a/arch/ia64/kernel/perfmon_mckinley.h +++ /dev/null @@ -1,187 +0,0 @@ -/* - * This file contains the McKinley PMU register description tables - * and pmc checker used by perfmon.c. 
- * - * Copyright (C) 2002-2003 Hewlett Packard Co - * Stephane Eranian - */ -static int pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -static pfm_reg_desc_t pfm_mck_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x1UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x0000000000800000UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(4),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(5),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(6),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x0UL, 0xfffff7fUL, NULL, pfm_mck_pmc_check, {RDEP(7),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc8 */ { PFM_REG_CONFIG , 0, 0xffffffff3fffffffUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc9 */ { PFM_REG_CONFIG , 0, 0xffffffff3ffffffcUL, 0xffffffff3ffffffbUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc10 */ { PFM_REG_MONITOR , 4, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(0)|RDEP(1),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc11 */ { PFM_REG_MONITOR , 6, 0x0UL, 0x30f01cf, NULL, pfm_mck_pmc_check, {RDEP(2)|RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc12 */ { PFM_REG_MONITOR , 6, 0x0UL, 0xffffUL, NULL, pfm_mck_pmc_check, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc13 */ { PFM_REG_CONFIG , 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc14 */ { PFM_REG_CONFIG , 0, 0x0db60db60db60db6UL, 0x2492UL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, -/* pmc15 */ { PFM_REG_CONFIG , 0, 0x00000000fffffff0UL, 0xfUL, NULL, pfm_mck_pmc_check, {0UL,0UL, 0UL, 0UL}, {0UL,0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_mck_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(1),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd1 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(0),0UL, 0UL, 0UL}, {RDEP(10),0UL, 0UL, 0UL}}, -/* pmd2 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(3)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd3 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(17),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(4),0UL, 0UL, 0UL}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(5),0UL, 0UL, 0UL}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(6),0UL, 0UL, 0UL}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0UL, -1UL, NULL, NULL, {0UL,0UL, 0UL, 0UL}, {RDEP(7),0UL, 0UL, 0UL}}, -/* pmd8 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 
0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd9 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd10 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd11 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd12 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(13)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd13 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(14)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd14 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(15)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd15 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(16),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd16 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15),0UL, 0UL, 0UL}, {RDEP(12),0UL, 0UL, 0UL}}, -/* pmd17 */ { PFM_REG_BUFFER , 0, 0x0UL, -1UL, NULL, NULL, {RDEP(2)|RDEP(3),0UL, 0UL, 0UL}, {RDEP(11),0UL, 0UL, 0UL}}, - { PFM_REG_END , 0, 0x0UL, -1UL, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * PMC reserved fields must have their power-up values preserved - */ -static int -pfm_mck_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - unsigned long tmp1, tmp2, ival = *val; - - /* remove reserved areas from user value */ - tmp1 = ival & PMC_RSVD_MASK(cnum); - - /* get reserved fields values */ - tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); - - *val = tmp1 | tmp2; - - DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", - cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); - return 0; -} - -/* - * task can be NULL if the context is unloaded - */ -static int -pfm_mck_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret = 0, check_case1 = 0; - unsigned long val8 = 0, val14 = 0, val13 = 0; - int is_loaded; - - /* first preserve the reserved fields */ - pfm_mck_reserved(cnum, val, regs); - - /* sanitfy check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the debug registers if pmc13 has a value which enable - * memory pipeline event constraints. In this case we need to clear the - * the debug registers if they have not yet been accessed. This is required - * to avoid picking stale state. - * PMC13 is "active" if: - * one of the pmc13.cfg_dbrpXX field is different from 0x3 - * AND - * at the corresponding pmc13.ena_dbrpXX is set. 
- */ - DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, *val, ctx->ctx_fl_using_dbreg, is_loaded)); - - if (cnum == 13 && is_loaded - && (*val & 0x1e00000000000UL) && (*val & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc13 settings, clearing dbr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); - if (ret) return ret; - } - /* - * we must clear the (instruction) debug registers if any pmc14.ibrpX bit is enabled - * before they are (fl_using_dbreg==0) to avoid picking up stale information. - */ - if (cnum == 14 && is_loaded && ((*val & 0x2222UL) != 0x2222UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc14 settings, clearing ibr\n", cnum, *val)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); - if (ret) return ret; - - } - - switch(cnum) { - case 4: *val |= 1UL << 23; /* force power enable bit */ - break; - case 8: val8 = *val; - val13 = ctx->ctx_pmcs[13]; - val14 = ctx->ctx_pmcs[14]; - check_case1 = 1; - break; - case 13: val8 = ctx->ctx_pmcs[8]; - val13 = *val; - val14 = ctx->ctx_pmcs[14]; - check_case1 = 1; - break; - case 14: val8 = ctx->ctx_pmcs[8]; - val13 = ctx->ctx_pmcs[13]; - val14 = *val; - check_case1 = 1; - break; - } - /* check illegal configuration which can produce inconsistencies in tagging - * i-side events in L1D and L2 caches - */ - if (check_case1) { - ret = ((val13 >> 45) & 0xf) == 0 - && ((val8 & 0x1) == 0) - && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) - ||(((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); - - if (ret) DPRINT((KERN_DEBUG "perfmon: failure check_case1\n")); - } - - return ret ? -EINVAL : 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! - */ -static pmu_config_t pmu_conf_mck={ - .pmu_name = "Itanium 2", - .pmu_family = 0x1f, - .flags = PFM_PMU_IRQ_RESEND, - .ovfl_val = (1UL << 47) - 1, - .pmd_desc = pfm_mck_pmd_desc, - .pmc_desc = pfm_mck_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1 /* debug register are use for range restrictions */ -}; - - diff --git a/arch/ia64/kernel/perfmon_montecito.h b/arch/ia64/kernel/perfmon_montecito.h deleted file mode 100644 index 7f8da4c..0000000 --- a/arch/ia64/kernel/perfmon_montecito.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * This file contains the Montecito PMU register description tables - * and pmc checker used by perfmon.c. - * - * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. 
- * Contributed by Stephane Eranian - */ -static int pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs); - -#define RDEP_MONT_ETB (RDEP(38)|RDEP(39)|RDEP(48)|RDEP(49)|RDEP(50)|RDEP(51)|RDEP(52)|RDEP(53)|RDEP(54)|\ - RDEP(55)|RDEP(56)|RDEP(57)|RDEP(58)|RDEP(59)|RDEP(60)|RDEP(61)|RDEP(62)|RDEP(63)) -#define RDEP_MONT_DEAR (RDEP(32)|RDEP(33)|RDEP(36)) -#define RDEP_MONT_IEAR (RDEP(34)|RDEP(35)) - -static pfm_reg_desc_t pfm_mont_pmc_desc[PMU_MAX_PMCS]={ -/* pmc0 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc1 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc2 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc3 */ { PFM_REG_CONTROL , 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc4 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(4),0, 0, 0}, {0,0, 0, 0}}, -/* pmc5 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(5),0, 0, 0}, {0,0, 0, 0}}, -/* pmc6 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(6),0, 0, 0}, {0,0, 0, 0}}, -/* pmc7 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(7),0, 0, 0}, {0,0, 0, 0}}, -/* pmc8 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(8),0, 0, 0}, {0,0, 0, 0}}, -/* pmc9 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(9),0, 0, 0}, {0,0, 0, 0}}, -/* pmc10 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(10),0, 0, 0}, {0,0, 0, 0}}, -/* pmc11 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(11),0, 0, 0}, {0,0, 0, 0}}, -/* pmc12 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(12),0, 0, 0}, {0,0, 0, 0}}, -/* pmc13 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(13),0, 0, 0}, {0,0, 0, 0}}, -/* pmc14 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(14),0, 0, 0}, {0,0, 0, 0}}, -/* pmc15 */ { PFM_REG_COUNTING, 6, 0x2000000, 0x7c7fff7f, NULL, pfm_mont_pmc_check, {RDEP(15),0, 0, 0}, {0,0, 0, 0}}, -/* pmc16 */ { PFM_REG_NOTIMPL, }, -/* pmc17 */ { PFM_REG_NOTIMPL, }, -/* pmc18 */ { PFM_REG_NOTIMPL, }, -/* pmc19 */ { PFM_REG_NOTIMPL, }, -/* pmc20 */ { PFM_REG_NOTIMPL, }, -/* pmc21 */ { PFM_REG_NOTIMPL, }, -/* pmc22 */ { PFM_REG_NOTIMPL, }, -/* pmc23 */ { PFM_REG_NOTIMPL, }, -/* pmc24 */ { PFM_REG_NOTIMPL, }, -/* pmc25 */ { PFM_REG_NOTIMPL, }, -/* pmc26 */ { PFM_REG_NOTIMPL, }, -/* pmc27 */ { PFM_REG_NOTIMPL, }, -/* pmc28 */ { PFM_REG_NOTIMPL, }, -/* pmc29 */ { PFM_REG_NOTIMPL, }, -/* pmc30 */ { PFM_REG_NOTIMPL, }, -/* pmc31 */ { PFM_REG_NOTIMPL, }, -/* pmc32 */ { PFM_REG_CONFIG, 0, 0x30f01ffffffffffUL, 0x30f01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc33 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc34 */ { PFM_REG_CONFIG, 0, 0xf01ffffffffffUL, 0xf01ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc35 */ { PFM_REG_CONFIG, 0, 0x0, 0x1ffffffffffUL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc36 */ { PFM_REG_CONFIG, 0, 0xfffffff0, 0xf, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc37 */ { PFM_REG_MONITOR, 4, 0x0, 0x3fff, NULL, pfm_mont_pmc_check, {RDEP_MONT_IEAR, 0, 0, 0}, {0, 0, 0, 0}}, -/* pmc38 */ { 
PFM_REG_CONFIG, 0, 0xdb6, 0x2492, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc39 */ { PFM_REG_MONITOR, 6, 0x0, 0xffcf, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, -/* pmc40 */ { PFM_REG_MONITOR, 6, 0x2000000, 0xf01cf, NULL, pfm_mont_pmc_check, {RDEP_MONT_DEAR,0, 0, 0}, {0,0, 0, 0}}, -/* pmc41 */ { PFM_REG_CONFIG, 0, 0x00002078fefefefeUL, 0x1e00018181818UL, NULL, pfm_mont_pmc_check, {0,0, 0, 0}, {0,0, 0, 0}}, -/* pmc42 */ { PFM_REG_MONITOR, 6, 0x0, 0x7ff4f, NULL, pfm_mont_pmc_check, {RDEP_MONT_ETB,0, 0, 0}, {0,0, 0, 0}}, - { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -static pfm_reg_desc_t pfm_mont_pmd_desc[PMU_MAX_PMDS]={ -/* pmd0 */ { PFM_REG_NOTIMPL, }, -/* pmd1 */ { PFM_REG_NOTIMPL, }, -/* pmd2 */ { PFM_REG_NOTIMPL, }, -/* pmd3 */ { PFM_REG_NOTIMPL, }, -/* pmd4 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(4),0, 0, 0}}, -/* pmd5 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(5),0, 0, 0}}, -/* pmd6 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(6),0, 0, 0}}, -/* pmd7 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(7),0, 0, 0}}, -/* pmd8 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(8),0, 0, 0}}, -/* pmd9 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(9),0, 0, 0}}, -/* pmd10 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(10),0, 0, 0}}, -/* pmd11 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(11),0, 0, 0}}, -/* pmd12 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(12),0, 0, 0}}, -/* pmd13 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(13),0, 0, 0}}, -/* pmd14 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(14),0, 0, 0}}, -/* pmd15 */ { PFM_REG_COUNTING, 0, 0x0, -1, NULL, NULL, {0,0, 0, 0}, {RDEP(15),0, 0, 0}}, -/* pmd16 */ { PFM_REG_NOTIMPL, }, -/* pmd17 */ { PFM_REG_NOTIMPL, }, -/* pmd18 */ { PFM_REG_NOTIMPL, }, -/* pmd19 */ { PFM_REG_NOTIMPL, }, -/* pmd20 */ { PFM_REG_NOTIMPL, }, -/* pmd21 */ { PFM_REG_NOTIMPL, }, -/* pmd22 */ { PFM_REG_NOTIMPL, }, -/* pmd23 */ { PFM_REG_NOTIMPL, }, -/* pmd24 */ { PFM_REG_NOTIMPL, }, -/* pmd25 */ { PFM_REG_NOTIMPL, }, -/* pmd26 */ { PFM_REG_NOTIMPL, }, -/* pmd27 */ { PFM_REG_NOTIMPL, }, -/* pmd28 */ { PFM_REG_NOTIMPL, }, -/* pmd29 */ { PFM_REG_NOTIMPL, }, -/* pmd30 */ { PFM_REG_NOTIMPL, }, -/* pmd31 */ { PFM_REG_NOTIMPL, }, -/* pmd32 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(33)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd33 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(36),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd34 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(35),0, 0, 0}, {RDEP(37),0, 0, 0}}, -/* pmd35 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(34),0, 0, 0}, {RDEP(37),0, 0, 0}}, -/* pmd36 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP(32)|RDEP(33),0, 0, 0}, {RDEP(40),0, 0, 0}}, -/* pmd37 */ { PFM_REG_NOTIMPL, }, -/* pmd38 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd39 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd40 */ { PFM_REG_NOTIMPL, }, -/* pmd41 */ { PFM_REG_NOTIMPL, }, -/* pmd42 */ { PFM_REG_NOTIMPL, }, -/* pmd43 */ { PFM_REG_NOTIMPL, }, -/* pmd44 */ { PFM_REG_NOTIMPL, }, -/* pmd45 */ { PFM_REG_NOTIMPL, }, -/* pmd46 */ { PFM_REG_NOTIMPL, }, -/* pmd47 */ { PFM_REG_NOTIMPL, }, -/* pmd48 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, 
{RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd49 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd50 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd51 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd52 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd53 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd54 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd55 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd56 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd57 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd58 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd59 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd60 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd61 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd62 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, -/* pmd63 */ { PFM_REG_BUFFER, 0, 0x0, -1, NULL, NULL, {RDEP_MONT_ETB,0, 0, 0}, {RDEP(39),0, 0, 0}}, - { PFM_REG_END , 0, 0x0, -1, NULL, NULL, {0,}, {0,}}, /* end marker */ -}; - -/* - * PMC reserved fields must have their power-up values preserved - */ -static int -pfm_mont_reserved(unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - unsigned long tmp1, tmp2, ival = *val; - - /* remove reserved areas from user value */ - tmp1 = ival & PMC_RSVD_MASK(cnum); - - /* get reserved fields values */ - tmp2 = PMC_DFL_VAL(cnum) & ~PMC_RSVD_MASK(cnum); - - *val = tmp1 | tmp2; - - DPRINT(("pmc[%d]=0x%lx, mask=0x%lx, reset=0x%lx, val=0x%lx\n", - cnum, ival, PMC_RSVD_MASK(cnum), PMC_DFL_VAL(cnum), *val)); - return 0; -} - -/* - * task can be NULL if the context is unloaded - */ -static int -pfm_mont_pmc_check(struct task_struct *task, pfm_context_t *ctx, unsigned int cnum, unsigned long *val, struct pt_regs *regs) -{ - int ret = 0; - unsigned long val32 = 0, val38 = 0, val41 = 0; - unsigned long tmpval; - int check_case1 = 0; - int is_loaded; - - /* first preserve the reserved fields */ - pfm_mont_reserved(cnum, val, regs); - - tmpval = *val; - - /* sanity check */ - if (ctx == NULL) return -EINVAL; - - is_loaded = ctx->ctx_state == PFM_CTX_LOADED || ctx->ctx_state == PFM_CTX_MASKED; - - /* - * we must clear the debug registers if pmc41 has a value which enable - * memory pipeline event constraints. In this case we need to clear the - * the debug registers if they have not yet been accessed. This is required - * to avoid picking stale state. - * PMC41 is "active" if: - * one of the pmc41.cfg_dtagXX field is different from 0x3 - * AND - * at the corresponding pmc41.en_dbrpXX is set. 
- * AND - * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) - */ - DPRINT(("cnum=%u val=0x%lx, using_dbreg=%d loaded=%d\n", cnum, tmpval, ctx->ctx_fl_using_dbreg, is_loaded)); - - if (cnum == 41 && is_loaded - && (tmpval & 0x1e00000000000UL) && (tmpval & 0x18181818UL) != 0x18181818UL && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc[%d]=0x%lx has active pmc41 settings, clearing dbr\n", cnum, tmpval)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers if: - * AND - */ - ret = pfm_write_ibr_dbr(PFM_DATA_RR, ctx, NULL, 0, regs); - if (ret) return ret; - } - /* - * we must clear the (instruction) debug registers if: - * pmc38.ig_ibrpX is 0 (enabled) - * AND - * ctx_fl_using_dbreg == 0 (i.e., dbr not yet used) - */ - if (cnum == 38 && is_loaded && ((tmpval & 0x492UL) != 0x492UL) && ctx->ctx_fl_using_dbreg == 0) { - - DPRINT(("pmc38=0x%lx has active pmc38 settings, clearing ibr\n", tmpval)); - - /* don't mix debug with perfmon */ - if (task && (task->thread.flags & IA64_THREAD_DBG_VALID) != 0) return -EINVAL; - - /* - * a count of 0 will mark the debug registers as in use and also - * ensure that they are properly cleared. - */ - ret = pfm_write_ibr_dbr(PFM_CODE_RR, ctx, NULL, 0, regs); - if (ret) return ret; - - } - switch(cnum) { - case 32: val32 = *val; - val38 = ctx->ctx_pmcs[38]; - val41 = ctx->ctx_pmcs[41]; - check_case1 = 1; - break; - case 38: val38 = *val; - val32 = ctx->ctx_pmcs[32]; - val41 = ctx->ctx_pmcs[41]; - check_case1 = 1; - break; - case 41: val41 = *val; - val32 = ctx->ctx_pmcs[32]; - val38 = ctx->ctx_pmcs[38]; - check_case1 = 1; - break; - } - /* check illegal configuration which can produce inconsistencies in tagging - * i-side events in L1D and L2 caches - */ - if (check_case1) { - ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) - && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) - || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); - if (ret) { - DPRINT(("invalid config pmc38=0x%lx pmc41=0x%lx pmc32=0x%lx\n", val38, val41, val32)); - return -EINVAL; - } - } - *val = tmpval; - return 0; -} - -/* - * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
- */ -static pmu_config_t pmu_conf_mont={ - .pmu_name = "Montecito", - .pmu_family = 0x20, - .flags = PFM_PMU_IRQ_RESEND, - .ovfl_val = (1UL << 47) - 1, - .pmd_desc = pfm_mont_pmd_desc, - .pmc_desc = pfm_mont_pmc_desc, - .num_ibrs = 8, - .num_dbrs = 8, - .use_rr_dbregs = 1 /* debug register are use for range retrictions */ -}; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 3ab8373..a7dfb39 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -45,10 +46,6 @@ #include "entry.h" -#ifdef CONFIG_PERFMON -# include -#endif - #include "sigframe.h" void (*ia64_mark_idle)(int); @@ -162,10 +159,8 @@ show_regs (struct pt_regs *regs) void tsk_clear_notify_resume(struct task_struct *tsk) { -#ifdef CONFIG_PERFMON - if (tsk->thread.pfm_needs_checking) + if (test_ti_thread_flag(task_thread_info(tsk), TIF_PERFMON_WORK)) return; -#endif if (test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_RSE)) return; clear_ti_thread_flag(task_thread_info(tsk), TIF_NOTIFY_RESUME); @@ -188,14 +183,9 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) return; } -#ifdef CONFIG_PERFMON - if (current->thread.pfm_needs_checking) - /* - * Note: pfm_handle_work() allow us to call it with interrupts - * disabled, and may enable interrupts within the function. - */ - pfm_handle_work(); -#endif + /* process perfmon asynchronous work (e.g. block thread or reset) */ + if (test_thread_flag(TIF_PERFMON_WORK)) + pfm_handle_work(task_pt_regs(current)); /* deal with pending signal delivery */ if (test_thread_flag(TIF_SIGPENDING)) { @@ -212,22 +202,15 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) local_irq_disable(); /* force interrupt disable */ } -static int pal_halt = 1; static int can_do_pal_halt = 1; static int __init nohalt_setup(char * str) { - pal_halt = can_do_pal_halt = 0; + can_do_pal_halt = 0; return 1; } __setup("nohalt", nohalt_setup); -void -update_pal_halt_status(int status) -{ - can_do_pal_halt = pal_halt && status; -} - /* * We use this if we don't have any better idle routine.. */ @@ -236,6 +219,22 @@ default_idle (void) { local_irq_enable(); while (!need_resched()) { +#ifdef CONFIG_PERFMON + u64 psr = 0; + /* + * If requested, we stop the PMU to avoid + * measuring across the core idle loop. 
+ * + * dcr.pp is not modified on purpose + * it is used when coming out of + * safe_halt() via interrupt + */ + if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) { + psr = ia64_getreg(_IA64_REG_PSR); + if (psr & IA64_PSR_PP) + ia64_rsm(IA64_PSR_PP); + } +#endif if (can_do_pal_halt) { local_irq_disable(); if (!need_resched()) { @@ -244,6 +243,12 @@ default_idle (void) local_irq_enable(); } else cpu_relax(); +#ifdef CONFIG_PERFMON + if ((__get_cpu_var(pfm_syst_info) & PFM_ITA_CPUINFO_IDLE_EXCL)) { + if (psr & IA64_PSR_PP) + ia64_ssm(IA64_PSR_PP); + } +#endif } } @@ -344,22 +349,9 @@ cpu_idle (void) void ia64_save_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON - unsigned long info; -#endif - if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_save_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON - if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) - pfm_save_regs(task); - - info = __get_cpu_var(pfm_syst_info); - if (info & PFM_CPUINFO_SYST_WIDE) - pfm_syst_wide_update_task(task, info, 0); -#endif - #ifdef CONFIG_IA32_SUPPORT if (IS_IA32_PROCESS(task_pt_regs(task))) ia32_save_state(task); @@ -369,22 +361,9 @@ ia64_save_extra (struct task_struct *task) void ia64_load_extra (struct task_struct *task) { -#ifdef CONFIG_PERFMON - unsigned long info; -#endif - if ((task->thread.flags & IA64_THREAD_DBG_VALID) != 0) ia64_load_debug_regs(&task->thread.dbr[0]); -#ifdef CONFIG_PERFMON - if ((task->thread.flags & IA64_THREAD_PM_VALID) != 0) - pfm_load_regs(task); - - info = __get_cpu_var(pfm_syst_info); - if (info & PFM_CPUINFO_SYST_WIDE) - pfm_syst_wide_update_task(task, info, 1); -#endif - #ifdef CONFIG_IA32_SUPPORT if (IS_IA32_PROCESS(task_pt_regs(task))) ia32_load_state(task); @@ -510,8 +489,7 @@ copy_thread (int nr, unsigned long clone_flags, * call behavior where scratch registers are preserved across * system calls (unless used by the system call itself). 
*/ -# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID \ - | IA64_THREAD_PM_VALID) +# define THREAD_FLAGS_TO_CLEAR (IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID) # define THREAD_FLAGS_TO_SET 0 p->thread.flags = ((current->thread.flags & ~THREAD_FLAGS_TO_CLEAR) | THREAD_FLAGS_TO_SET); @@ -533,10 +511,8 @@ copy_thread (int nr, unsigned long clone_flags, } #endif -#ifdef CONFIG_PERFMON - if (current->thread.pfm_context) - pfm_inherit(p, child_ptregs); -#endif + pfm_copy_thread(p); + return retval; } @@ -745,15 +721,13 @@ exit_thread (void) { ia64_drop_fpu(current); -#ifdef CONFIG_PERFMON - /* if needed, stop monitoring and flush state to perfmon context */ - if (current->thread.pfm_context) - pfm_exit_thread(current); + + /* if needed, stop monitoring and flush state to perfmon context */ + pfm_exit_thread(); /* free debug register resources */ - if (current->thread.flags & IA64_THREAD_DBG_VALID) - pfm_release_debug_registers(current); -#endif + pfm_release_dbregs(current); + if (IS_IA32_PROCESS(task_pt_regs(current))) ia32_drop_ia64_partial_page_list(current); } diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 2a9943b..bb1ca1e 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -30,9 +31,6 @@ #include #include #include -#ifdef CONFIG_PERFMON -#include -#endif #include "entry.h" @@ -2124,7 +2122,6 @@ access_uarea(struct task_struct *child, unsigned long addr, "address 0x%lx\n", addr); return -1; } -#ifdef CONFIG_PERFMON /* * Check if debug registers are used by perfmon. This * test must be done once we know that we can do the @@ -2142,9 +2139,8 @@ access_uarea(struct task_struct *child, unsigned long addr, * IA64_THREAD_DBG_VALID. The registers are restored * by the PMU context switch code. 
*/ - if (pfm_use_debug_registers(child)) + if (pfm_use_dbregs(child)) return -1; -#endif if (!(child->thread.flags & IA64_THREAD_DBG_VALID)) { child->thread.flags |= IA64_THREAD_DBG_VALID; diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index de636b2..677fa68 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -1051,6 +1052,8 @@ cpu_init (void) } platform_cpu_init(); pm_idle = default_idle; + + pfm_init_percpu(); } void __init diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index d8f05e5..3d7a739 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -381,10 +382,6 @@ smp_callin (void) extern void ia64_init_itm(void); extern volatile int time_keeper_id; -#ifdef CONFIG_PERFMON - extern void pfm_init_percpu(void); -#endif - cpuid = smp_processor_id(); phys_id = hard_smp_processor_id(); itc_master = time_keeper_id; @@ -410,10 +407,6 @@ smp_callin (void) ia64_mca_cmc_vector_setup(); /* Setup vector on AP */ -#ifdef CONFIG_PERFMON - pfm_init_percpu(); -#endif - local_irq_enable(); if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT)) { @@ -751,6 +744,7 @@ int __cpu_disable(void) cpu_clear(cpu, cpu_online_map); local_flush_tlb_all(); cpu_clear(cpu, cpu_callin_map); + pfm_cpu_disable(); return 0; } diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c index bcbb6d8..a0ed33a 100644 --- a/arch/ia64/kernel/sys_ia64.c +++ b/arch/ia64/kernel/sys_ia64.c @@ -284,3 +284,11 @@ sys_pciconfig_write (unsigned long bus, unsigned long dfn, unsigned long off, un } #endif /* CONFIG_PCI */ + +#ifndef CONFIG_IA64_PERFMON_COMPAT +asmlinkage long +sys_perfmonctl (int fd, int cmd, void __user *arg, int count) +{ + return -ENOSYS; +} +#endif diff --git a/arch/ia64/lib/Makefile b/arch/ia64/lib/Makefile index 98771e2..077fd09 100644 --- a/arch/ia64/lib/Makefile +++ b/arch/ia64/lib/Makefile @@ -13,7 +13,6 @@ lib-y := __divsi3.o __udivsi3.o __modsi3.o __umodsi3.o \ obj-$(CONFIG_ITANIUM) += copy_page.o copy_user.o memcpy.o obj-$(CONFIG_MCKINLEY) += copy_page_mck.o memcpy_mck.o -lib-$(CONFIG_PERFMON) += carta_random.o AFLAGS___divdi3.o = AFLAGS___udivdi3.o = -DUNSIGNED diff --git a/arch/ia64/oprofile/init.c b/arch/ia64/oprofile/init.c index 125a602..892de6a 100644 --- a/arch/ia64/oprofile/init.c +++ b/arch/ia64/oprofile/init.c @@ -12,8 +12,8 @@ #include #include -extern int perfmon_init(struct oprofile_operations * ops); -extern void perfmon_exit(void); +extern int op_perfmon_init(struct oprofile_operations * ops); +extern void op_perfmon_exit(void); extern void ia64_backtrace(struct pt_regs * const regs, unsigned int depth); int __init oprofile_arch_init(struct oprofile_operations * ops) @@ -22,7 +22,7 @@ int __init oprofile_arch_init(struct oprofile_operations * ops) #ifdef CONFIG_PERFMON /* perfmon_init() can fail, but we have no way to report it */ - ret = perfmon_init(ops); + ret = op_perfmon_init(ops); #endif ops->backtrace = ia64_backtrace; @@ -33,6 +33,6 @@ int __init oprofile_arch_init(struct oprofile_operations * ops) void oprofile_arch_exit(void) { #ifdef CONFIG_PERFMON - perfmon_exit(); + op_perfmon_exit(); #endif } diff --git a/arch/ia64/oprofile/perfmon.c b/arch/ia64/oprofile/perfmon.c index bc41dd3..6fa9d17 100644 --- a/arch/ia64/oprofile/perfmon.c +++ b/arch/ia64/oprofile/perfmon.c @@ -10,25 +10,30 @@ #include #include #include -#include +#include +#include 
#include #include static int allow_ints; static int -perfmon_handler(struct task_struct *task, void *buf, pfm_ovfl_arg_t *arg, - struct pt_regs *regs, unsigned long stamp) +perfmon_handler(struct pfm_context *ctx, + unsigned long ip, u64 stamp, void *data) { - int event = arg->pmd_eventid; + struct pt_regs *regs; + struct pfm_ovfl_arg *arg; + + regs = data; + arg = &ctx->ovfl_arg; - arg->ovfl_ctrl.bits.reset_ovfl_pmds = 1; + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; /* the owner of the oprofile event buffer may have exited * without perfmon being shutdown (e.g. SIGSEGV) */ if (allow_ints) - oprofile_add_sample(regs, event); + oprofile_add_sample(regs, arg->pmd_eventid); return 0; } @@ -45,17 +50,13 @@ static void perfmon_stop(void) allow_ints = 0; } - -#define OPROFILE_FMT_UUID { \ - 0x77, 0x7a, 0x6e, 0x61, 0x20, 0x65, 0x73, 0x69, 0x74, 0x6e, 0x72, 0x20, 0x61, 0x65, 0x0a, 0x6c } - -static pfm_buffer_fmt_t oprofile_fmt = { - .fmt_name = "oprofile_format", - .fmt_uuid = OPROFILE_FMT_UUID, - .fmt_handler = perfmon_handler, +static struct pfm_smpl_fmt oprofile_fmt = { + .fmt_name = "OProfile", + .fmt_handler = perfmon_handler, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE }; - static char * get_cpu_type(void) { __u8 family = local_cpu_data->family; @@ -75,9 +76,9 @@ static char * get_cpu_type(void) static int using_perfmon; -int perfmon_init(struct oprofile_operations * ops) +int __init op_perfmon_init(struct oprofile_operations * ops) { - int ret = pfm_register_buffer_fmt(&oprofile_fmt); + int ret = pfm_fmt_register(&oprofile_fmt); if (ret) return -ENODEV; @@ -90,10 +91,10 @@ int perfmon_init(struct oprofile_operations * ops) } -void perfmon_exit(void) +void __exit op_perfmon_exit(void) { if (!using_perfmon) return; - pfm_unregister_buffer_fmt(oprofile_fmt.fmt_uuid); + pfm_fmt_unregister(&oprofile_fmt); } diff --git a/arch/ia64/perfmon/Kconfig b/arch/ia64/perfmon/Kconfig new file mode 100644 index 0000000..99c68bd --- /dev/null +++ b/arch/ia64/perfmon/Kconfig @@ -0,0 +1,67 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_DEBUG_FS + bool "Enable perfmon statistics reporting via debugfs" + default y + depends on PERFMON && DEBUG_FS + help + Enable collection and reporting of perfmon timing statistics under + debugfs. This is used for debugging and performance analysis of the + subsystem. The debugfs filesystem must be mounted. + +config IA64_PERFMON_COMPAT + bool "Enable old perfmon-2 compatbility mode" + default n + depends on PERFMON + help + Enable this option to allow performance tools which used the old + perfmon-2 interface to continue to work. Old tools are those using + the obsolete commands and arguments. Check your programs and look + in include/asm-ia64/perfmon_compat.h for more information. + +config IA64_PERFMON_GENERIC + tristate "Generic IA-64 PMU support" + depends on PERFMON + default n + help + Enables generic IA-64 PMU support. + The generic PMU is defined by the IA-64 architecture document. + This option should only be necessary when running with a PMU that + is not yet explicitely supported. Even then, there is no guarantee + that this support will work. 
+ +config IA64_PERFMON_ITANIUM + tristate "Itanium (Merced) Performance Monitoring support" + depends on PERFMON + default n + help + Enables Itanium (Merced) PMU support. + +config IA64_PERFMON_MCKINLEY + tristate "Itanium 2 (McKinley) Performance Monitoring support" + depends on PERFMON + default n + help + Enables Itanium 2 (McKinley, Madison, Deerfield) PMU support. + +config IA64_PERFMON_MONTECITO + tristate "Itanium 2 9000 (Montecito) Performance Monitoring support" + depends on PERFMON + default n + help + Enables support for Itanium 2 9000 (Montecito) PMU. +endmenu diff --git a/arch/ia64/perfmon/Makefile b/arch/ia64/perfmon/Makefile new file mode 100644 index 0000000..c9cdf9f --- /dev/null +++ b/arch/ia64/perfmon/Makefile @@ -0,0 +1,11 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. +# Contributed by Stephane Eranian +# +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_IA64_PERFMON_COMPAT) += perfmon_default_smpl.o \ + perfmon_compat.o +obj-$(CONFIG_IA64_PERFMON_GENERIC) += perfmon_generic.o +obj-$(CONFIG_IA64_PERFMON_ITANIUM) += perfmon_itanium.o +obj-$(CONFIG_IA64_PERFMON_MCKINLEY) += perfmon_mckinley.o +obj-$(CONFIG_IA64_PERFMON_MONTECITO) += perfmon_montecito.o diff --git a/arch/ia64/perfmon/perfmon.c b/arch/ia64/perfmon/perfmon.c new file mode 100644 index 0000000..3f59410 --- /dev/null +++ b/arch/ia64/perfmon/perfmon.c @@ -0,0 +1,946 @@ +/* + * This file implements the IA-64 specific + * support for the perfmon2 interface + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +struct pfm_arch_session { + u32 pfs_sys_use_dbr; /* syswide session uses dbr */ + u32 pfs_ptrace_use_dbr; /* a thread uses dbr via ptrace()*/ +}; + +DEFINE_PER_CPU(u32, pfm_syst_info); + +static struct pfm_arch_session pfm_arch_sessions; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_arch_sessions_lock); + +static inline void pfm_clear_psr_pp(void) +{ + ia64_rsm(IA64_PSR_PP); +} + +static inline void pfm_set_psr_pp(void) +{ + ia64_ssm(IA64_PSR_PP); +} + +static inline void pfm_clear_psr_up(void) +{ + ia64_rsm(IA64_PSR_UP); +} + +static inline void pfm_set_psr_up(void) +{ + ia64_ssm(IA64_PSR_UP); +} + +static inline void pfm_set_psr_l(u64 val) +{ + ia64_setreg(_IA64_REG_PSR_L, val); +} + +static inline void pfm_restore_ibrs(u64 *ibrs, unsigned int nibrs) +{ + unsigned int i; + + for (i = 0; i < nibrs; i++) { + ia64_set_ibr(i, ibrs[i]); + ia64_dv_serialize_instruction(); + } + ia64_srlz_i(); +} + +static inline void pfm_restore_dbrs(u64 *dbrs, unsigned int ndbrs) +{ + unsigned int i; + + for (i = 0; i < ndbrs; i++) { + ia64_set_dbr(i, dbrs[i]); + ia64_dv_serialize_data(); + } + ia64_srlz_d(); +} + +irqreturn_t pmu_interrupt_handler(int irq, void *arg) +{ + struct pt_regs *regs; + regs = get_irq_regs(); + irq_enter(); + pfm_interrupt_handler(instruction_pointer(regs), regs); + irq_exit(); + return IRQ_HANDLED; +} +static struct irqaction perfmon_irqaction = { + .handler = pmu_interrupt_handler, + .flags = IRQF_DISABLED, /* means keep interrupts masked */ + .name = "perfmon" +}; + +void pfm_arch_quiesce_pmu_percpu(void) +{ + u64 dcr; + /* + * make sure no measurement is active + * (may inherit programmed PMCs from EFI). + */ + pfm_clear_psr_pp(); + pfm_clear_psr_up(); + + /* + * ensure dcr.pp is cleared + */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + + /* + * we run with the PMU not frozen at all times + */ + ia64_set_pmc(0, 0); + ia64_srlz_d(); +} + +void pfm_arch_init_percpu(void) +{ + pfm_arch_quiesce_pmu_percpu(); + /* + * program PMU interrupt vector + */ + ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); + ia64_srlz_d(); +} + +int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + ctx_arch->flags.use_dbr = 0; + ctx_arch->flags.insecure = (ctx_flags & PFM_ITA_FL_INSECURE) ? 1: 0; + + PFM_DBG("insecure=%d", ctx_arch->flags.insecure); + + return 0; +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring may be active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. 
+ * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_event_set *set; + u64 psr, tmp; + + ctx_arch = pfm_ctx_arch(ctx); + set = ctx->active_set; + + /* + * save current PSR: needed because we modify it + */ + ia64_srlz_d(); + psr = ia64_getreg(_IA64_REG_PSR); + + /* + * stop monitoring: + * This is the last instruction which may generate an overflow + * + * we do not clear ipsr.up + */ + pfm_clear_psr_up(); + ia64_srlz_d(); + + /* + * extract overflow status bits + */ + tmp = ia64_get_pmc(0) & ~0xf; + + /* + * keep a copy of psr.up (for reload) + */ + ctx_arch->ctx_saved_psr_up = psr & IA64_PSR_UP; + + /* + * save overflow status bits + */ + set->povfl_pmds[0] = tmp; + + /* + * record how many pending overflows + * XXX: assume identity mapping for counters + */ + set->npend_ovfls = ia64_popcnt(tmp); + + /* + * make sure the PMU is unfrozen for the next task + */ + if (set->npend_ovfls) { + ia64_set_pmc(0, 0); + ia64_srlz_d(); + } + return 1; +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * set cannot be NULL. Context is locked. Interrupts are masked. + * Caller has already restored all PMD and PMC registers. + * + * must reactivate monitoring + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * when monitoring is not explicitly started + * then psr_up = 0, in which case we do not + * need to restore + */ + if (likely(ctx_arch->ctx_saved_psr_up)) { + pfm_set_psr_up(); + ia64_srlz_d(); + } +} + +int pfm_arch_reserve_session(struct pfm_context *ctx, u32 cpu) +{ + struct pfm_arch_context *ctx_arch; + int is_system; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + spin_lock(&pfm_arch_sessions_lock); + + if (is_system && ctx_arch->flags.use_dbr) { + PFM_DBG("syswide context uses dbregs"); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr) { + PFM_DBG("cannot reserve syswide context: " + "dbregs in use by ptrace"); + ret = -EBUSY; + } else { + pfm_arch_sessions.pfs_sys_use_dbr++; + } + } + spin_unlock(&pfm_arch_sessions_lock); + + return ret; +} + +void pfm_arch_release_session(struct pfm_context *ctx, u32 cpu) +{ + struct pfm_arch_context *ctx_arch; + int is_system; + + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + spin_lock(&pfm_arch_sessions_lock); + + if (is_system && ctx_arch->flags.use_dbr) + pfm_arch_sessions.pfs_sys_use_dbr--; + spin_unlock(&pfm_arch_sessions_lock); +} + +/* + * function called from pfm_load_context_*(). Task is not guaranteed to be + * current task. If not then other task is guaranteed stopped and off any CPU. + * context is locked and interrupts are masked. + * + * On PFM_LOAD_CONTEXT, the interface guarantees monitoring is stopped. + * + * For system-wide task is NULL + */ +int pfm_arch_load_context(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * cannot load a context which is using range restrictions, + * into a thread that is being debugged. + * + * if one set out of several is using the debug registers, then + * we assume the context as whole is using them. 
+ */ + if (ctx_arch->flags.use_dbr) { + if (ctx->flags.system) { + spin_lock(&pfm_arch_sessions_lock); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr) { + PFM_DBG("cannot reserve syswide context: " + "dbregs in use by ptrace"); + ret = -EBUSY; + } else { + pfm_arch_sessions.pfs_sys_use_dbr++; + PFM_DBG("pfs_sys_use_dbr=%u", + pfm_arch_sessions.pfs_sys_use_dbr); + } + spin_unlock(&pfm_arch_sessions_lock); + + } else if (ctx->task->thread.flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("load_pid [%d] thread is debugged, cannot " + "use range restrictions", ctx->task->pid); + ret = -EBUSY; + } + if (ret) + return ret; + } + + /* + * We need to intervene on context switch to toggle the + * psr.pp bit in system-wide. As such, we set the TIF + * flag so that pfm_arch_ctxswout_sys() and the + * pfm_arch_ctxswin_sys() functions get called + * from pfm_ctxsw_sys(); + */ + if (ctx->flags.system) { + set_thread_flag(TIF_PERFMON_CTXSW); + PFM_DBG("[%d] set TIF", current->pid); + return 0; + } + + regs = task_pt_regs(ctx->task); + + /* + * self-monitoring systematically allows user level control + */ + if (ctx->task != current) { + /* + * when not current, task is stopped, so this is safe + */ + ctx_arch->ctx_saved_psr_up = 0; + ia64_psr(regs)->up = ia64_psr(regs)->pp = 0; + } else + ctx_arch->flags.insecure = 1; + + /* + * allow user level control (start/stop/read pmd) if: + * - self-monitoring + * - requested at context creation (PFM_IA64_FL_INSECURE) + * + * There is not security hole with PFM_IA64_FL_INSECURE because + * when not self-monitored, the caller must have permissions to + * attached to the task. + */ + if (ctx_arch->flags.insecure) { + ia64_psr(regs)->sp = 0; + PFM_DBG("clearing psr.sp for [%d]", ctx->task->pid); + } + return 0; +} + +int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ +#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH) +#define PFM_ITA_SETFL_BOTH_INTR (PFM_ITA_SETFL_INTR_ONLY|\ + PFM_ITA_SETFL_EXCL_INTR) + +/* exclude return value field */ +#define PFM_SETFL_ALL_MASK (PFM_ITA_SETFL_BOTH_INTR \ + | PFM_SETFL_BOTH_SWITCH \ + | PFM_ITA_SETFL_IDLE_EXCL) + + if ((flags & ~PFM_SETFL_ALL_MASK)) { + PFM_DBG("invalid flags=0x%x", flags); + return -EINVAL; + } + + if ((flags & PFM_ITA_SETFL_BOTH_INTR) == PFM_ITA_SETFL_BOTH_INTR) { + PFM_DBG("both excl intr and ontr only are set"); + return -EINVAL; + } + + if ((flags & PFM_ITA_SETFL_IDLE_EXCL) && !ctx->flags.system) { + PFM_DBG("idle exclude flag only for system-wide context"); + return -EINVAL; + } + return 0; +} + +/* + * function called from pfm_unload_context_*(). Context is locked. + * interrupts are masked. task is not guaranteed to be current task. + * Access to PMU is not guaranteed. + * + * function must do whatever arch-specific action is required on unload + * of a context. + * + * called for both system-wide and per-thread. 
task is NULL for ssytem-wide + */ +void pfm_arch_unload_context(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx->flags.system) { + /* + * disable context switch hook + */ + clear_thread_flag(TIF_PERFMON_CTXSW); + + if (ctx_arch->flags.use_dbr) { + spin_lock(&pfm_arch_sessions_lock); + pfm_arch_sessions.pfs_sys_use_dbr--; + PFM_DBG("sys_use_dbr=%u", pfm_arch_sessions.pfs_sys_use_dbr); + spin_unlock(&pfm_arch_sessions_lock); + } + } else { + regs = task_pt_regs(ctx->task); + + /* + * cancel user level control for per-task context + */ + ia64_psr(regs)->sp = 1; + PFM_DBG("setting psr.sp for [%d]", ctx->task->pid); + } +} + +/* + * mask monitoring by setting the privilege level to 0 + * we cannot use psr.pp/psr.up for this, it is controlled by + * the user + */ +void pfm_arch_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info; + unsigned long mask; + unsigned int i; + + arch_info = pfm_pmu_info(); + /* + * as an optimization we look at the first 64 PMC + * registers only starting at PMC4. + */ + mask = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR; + for (i = PFM_ITA_FCNTR; mask; i++, mask >>= 1) { + if (likely(mask & 0x1)) + ia64_set_pmc(i, set->pmcs[i] & ~0xfUL); + } + /* + * make changes visisble + */ + ia64_srlz_d(); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + unsigned long *mask; + u16 i, num; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.insecure) { + num = ctx->regs.num_rw_pmd; + mask = ctx->regs.rw_pmds; + } else { + num = set->nused_pmds; + mask = set->used_pmds; + } + /* + * must restore all implemented read-write PMDS to avoid leaking + * information especially when PFM_IA64_FL_INSECURE is set. + * + * XXX: should check PFM_IA64_FL_INSECURE==0 and use used_pmd instead + */ + for (i = 0; num; i++) { + if (likely(test_bit(i, mask))) { + pfm_arch_write_pmd(ctx, i, set->pmds[i].value); + num--; + } + } + ia64_srlz_d(); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set if needed + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info; + u64 mask2 = 0, val, plm; + unsigned long impl_mask, mask_pmcs; + unsigned int i; + + arch_info = pfm_pmu_info(); + /* + * as an optimization we only look at the first 64 + * PMC registers. In fact, we should never scan the + * entire impl_pmcs because ibr/dbr are implemented + * separately. + * + * always skip PMC0-PMC3. PMC0 taken care of when saving + * state. PMC1-PMC3 not used until we get counters in + * the 60 and above index range. + */ + impl_mask = ctx->regs.pmcs[0] >> PFM_ITA_FCNTR; + mask_pmcs = arch_info->mask_pmcs[0] >> PFM_ITA_FCNTR; + plm = ctx->state == PFM_CTX_MASKED ? 
~0xf : ~0x0; + + for (i = PFM_ITA_FCNTR; + impl_mask; + i++, impl_mask >>= 1, mask_pmcs >>= 1) { + if (likely(impl_mask & 0x1)) { + mask2 = mask_pmcs & 0x1 ? plm : ~0; + val = set->pmcs[i] & mask2; + ia64_set_pmc(i, val); + PFM_DBG_ovfl("pmc%u=0x%lx", i, val); + } + } + /* + * restore DBR/IBR + */ + if (set->priv_flags & PFM_ITA_SETFL_USE_DBR) { + pfm_restore_ibrs(set->pmcs+256, 8); + pfm_restore_dbrs(set->pmcs+264, 8); + } + ia64_srlz_d(); +} + +void pfm_arch_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 psr; + int is_system; + + is_system = ctx->flags.system; + + psr = ia64_getreg(_IA64_REG_PSR); + + /* + * monitoring is masked via the PMC.plm + * + * As we restore their value, we do not want each counter to + * restart right away. We stop monitoring using the PSR, + * restore the PMC (and PMD) and then re-establish the psr + * as it was. Note that there can be no pending overflow at + * this point, because monitoring is still MASKED. + * + * Because interrupts are masked we can avoid changing + * DCR.pp. + */ + if (is_system) + pfm_clear_psr_pp(); + else + pfm_clear_psr_up(); + + ia64_srlz_d(); + + pfm_arch_restore_pmcs(ctx, set); + + /* + * restore psr + * + * monitoring may start right now but interrupts + * are still masked + */ + pfm_set_psr_l(psr); + ia64_srlz_d(); +} + +/* + * Called from pfm_stop() + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + u64 dcr, psr; + + ctx_arch = pfm_ctx_arch(ctx); + regs = task_pt_regs(task); + + if (!ctx->flags.system) { + /* + * in ZOMBIE state we always have task == current due to + * pfm_exit_thread() + */ + ia64_psr(regs)->up = 0; + ctx_arch->ctx_saved_psr_up = 0; + + /* + * in case of ZOMBIE state, there is no unload to clear + * insecure monitoring, so we do it in stop instead. + */ + if (ctx->state == PFM_CTX_ZOMBIE) + ia64_psr(regs)->sp = 1; + + if (task == current) { + pfm_clear_psr_up(); + ia64_srlz_d(); + } + } else if (ctx->flags.started) { /* do not stop twice */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + psr = ia64_getreg(_IA64_REG_PSR); + + ia64_psr(regs)->pp = 0; + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + pfm_clear_psr_pp(); + ia64_srlz_d(); + + if (ctx->active_set->flags & PFM_ITA_SETFL_IDLE_EXCL) { + PFM_DBG("disabling idle exclude"); + __get_cpu_var(pfm_syst_info) &= ~PFM_ITA_CPUINFO_IDLE_EXCL; + } + } +} + +/* + * called from pfm_start() + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU is task + * is not current. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. 
+ */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pt_regs *regs; + u64 dcr, dcr_pp, psr_pp; + u32 flags; + + ctx_arch = pfm_ctx_arch(ctx); + regs = task_pt_regs(task); + flags = ctx->active_set->flags; + + /* + * per-thread mode + */ + if (!ctx->flags.system) { + + ia64_psr(regs)->up = 1; + + if (task == current) { + pfm_set_psr_up(); + ia64_srlz_d(); + } else { + /* + * activate monitoring at next ctxswin + */ + ctx_arch->ctx_saved_psr_up = IA64_PSR_UP; + } + return; + } + + /* + * system-wide mode + */ + dcr = ia64_getreg(_IA64_REG_CR_DCR); + if (flags & PFM_ITA_SETFL_INTR_ONLY) { + dcr_pp = 1; + psr_pp = 0; + } else if (flags & PFM_ITA_SETFL_EXCL_INTR) { + dcr_pp = 0; + psr_pp = 1; + } else { + dcr_pp = psr_pp = 1; + } + PFM_DBG("dcr_pp=%lu psr_pp=%lu", dcr_pp, psr_pp); + + /* + * update dcr_pp and psr_pp + */ + if (dcr_pp) + ia64_setreg(_IA64_REG_CR_DCR, dcr | IA64_DCR_PP); + else + ia64_setreg(_IA64_REG_CR_DCR, dcr & ~IA64_DCR_PP); + + if (psr_pp) { + pfm_set_psr_pp(); + ia64_psr(regs)->pp = 1; + } else { + pfm_clear_psr_pp(); + ia64_psr(regs)->pp = 0; + } + ia64_srlz_d(); + + if (ctx->active_set->flags & PFM_ITA_SETFL_IDLE_EXCL) { + PFM_DBG("enable idle exclude"); + __get_cpu_var(pfm_syst_info) |= PFM_ITA_CPUINFO_IDLE_EXCL; + } +} + +/* + * Only call this function when a process is trying to + * write the debug registers (reading is always allowed) + * called from arch/ia64/kernel/ptrace.c:access_uarea() + */ +int __pfm_use_dbregs(struct task_struct *task) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_context *ctx; + unsigned long flags; + int ret = 0; + + PFM_DBG("called for [%d]", task->pid); + + ctx = task->pfm_context; + + /* + * do it only once + */ + if (task->thread.flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("IA64_THREAD_DBG_VALID already set"); + return 0; + } + if (ctx) { + spin_lock_irqsave(&ctx->lock, flags); + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.use_dbr == 1) { + PFM_DBG("PMU using dbregs already, no ptrace access"); + ret = -1; + } + spin_unlock_irqrestore(&ctx->lock, flags); + if (ret) + return ret; + } + + spin_lock(&pfm_arch_sessions_lock); + + /* + * We cannot allow setting breakpoints when system wide monitoring + * sessions are using the debug registers. + */ + if (!pfm_arch_sessions.pfs_sys_use_dbr) + pfm_arch_sessions.pfs_ptrace_use_dbr++; + else + ret = -1; + + PFM_DBG("ptrace_use_dbr=%u sys_use_dbr=%u by [%d] ret = %d", + pfm_arch_sessions.pfs_ptrace_use_dbr, + pfm_arch_sessions.pfs_sys_use_dbr, + task->pid, ret); + + spin_unlock(&pfm_arch_sessions_lock); + if (ret) + return ret; +#ifndef CONFIG_SMP + /* + * in UP, we need to check whether the current + * owner of the PMU is not using the debug registers + * for monitoring. Because we are using a lazy + * save on ctxswout, we must force a save in this + * case because the debug registers are being + * modified by another task. We save the current + * PMD registers, and clear ownership. In ctxswin, + * full state will be reloaded. + * + * Note: we overwrite task. + */ + task = __get_cpu_var(pmu_owner); + ctx = __get_cpu_var(pmu_ctx); + + if (task == NULL) + return 0; + + ctx_arch = pfm_ctx_arch(ctx); + + if (ctx_arch->flags.use_dbr) + pfm_save_pmds_release(ctx); +#endif + return 0; +} + +/* + * This function is called for every task that exits with the + * IA64_THREAD_DBG_VALID set. This indicates a task which was + * able to use the debug registers for debugging purposes via + * ptrace(). 
Therefore we know it was not using them for + * perfmormance monitoring, so we only decrement the number + * of "ptraced" debug register users to keep the count up to date + */ +int __pfm_release_dbregs(struct task_struct *task) +{ + int ret; + + spin_lock(&pfm_arch_sessions_lock); + + if (pfm_arch_sessions.pfs_ptrace_use_dbr == 0) { + PFM_ERR("invalid release for [%d] ptrace_use_dbr=0", task->pid); + ret = -1; + } else { + pfm_arch_sessions.pfs_ptrace_use_dbr--; + ret = 0; + } + spin_unlock(&pfm_arch_sessions_lock); + + return ret; +} + +int pfm_ia64_mark_dbregs_used(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + struct task_struct *task; + struct thread_struct *thread; + int ret = 0, state; + int i, can_access_pmu = 0; + int is_loaded, is_system; + + ctx_arch = pfm_ctx_arch(ctx); + state = ctx->state; + task = ctx->task; + is_loaded = state == PFM_CTX_LOADED || state == PFM_CTX_MASKED; + is_system = ctx->flags.system; + can_access_pmu = __get_cpu_var(pmu_owner) == task || is_system; + + if (is_loaded == 0) + goto done; + + if (is_system == 0) { + thread = &(task->thread); + + /* + * cannot use debug registers for montioring if they are + * already used for debugging + */ + if (thread->flags & IA64_THREAD_DBG_VALID) { + PFM_DBG("debug registers already in use for [%d]", + task->pid); + return -EBUSY; + } + } + + /* + * check for debug registers in system wide mode + */ + spin_lock(&pfm_arch_sessions_lock); + + if (is_system) { + if (pfm_arch_sessions.pfs_ptrace_use_dbr) + ret = -EBUSY; + else + pfm_arch_sessions.pfs_sys_use_dbr++; + } + + spin_unlock(&pfm_arch_sessions_lock); + + if (ret != 0) + return ret; + + /* + * clear hardware registers to make sure we don't + * pick up stale state. + */ + if (can_access_pmu) { + PFM_DBG("clearing ibrs, dbrs"); + for (i = 0; i < 8; i++) { + ia64_set_ibr(i, 0); + ia64_dv_serialize_instruction(); + } + ia64_srlz_i(); + for (i = 0; i < 8; i++) { + ia64_set_dbr(i, 0); + ia64_dv_serialize_data(); + } + ia64_srlz_d(); + } +done: + /* + * debug registers are now in use + */ + ctx_arch->flags.use_dbr = 1; + set->priv_flags |= PFM_ITA_SETFL_USE_DBR; + PFM_DBG("set%u use_dbr=1", set->id); + return 0; +} +EXPORT_SYMBOL(pfm_ia64_mark_dbregs_used); + +char *pfm_arch_get_pmu_module_name(void) +{ + switch (local_cpu_data->family) { + case 0x07: + return "perfmon_itanium"; + case 0x1f: + return "perfmon_mckinley"; + case 0x20: + return "perfmon_montecito"; + default: + return "perfmon_generic"; + } + return NULL; +} + +/* + * global arch-specific intialization, called only once + */ +int __init pfm_arch_init(void) +{ + int ret; + + spin_lock_init(&pfm_arch_sessions_lock); + +#ifdef CONFIG_IA64_PERFMON_COMPAT + ret = pfm_ia64_compat_init(); + if (ret) + return ret; +#endif + register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); + + + return 0; +} diff --git a/arch/ia64/perfmon/perfmon_compat.c b/arch/ia64/perfmon/perfmon_compat.c new file mode 100644 index 0000000..2fd3d3c --- /dev/null +++ b/arch/ia64/perfmon/perfmon_compat.c @@ -0,0 +1,1210 @@ +/* + * This file implements the IA-64 specific + * support for the perfmon2 interface + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +asmlinkage long sys_pfm_stop(int fd); +asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *st); +asmlinkage long sys_pfm_unload_context(int fd); +asmlinkage long sys_pfm_restart(int fd); +asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ld); + +ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what); + +extern ssize_t __pfm_read(struct pfm_context *ctx, + union pfarg_msg *msg_buf, + int non_block); +/* + * function providing some help for backward compatiblity with old IA-64 + * applications. In the old model, certain attributes of a counter were + * passed via the PMC, now they are passed via the PMD. + */ +static int pfm_compat_update_pmd(struct pfm_context *ctx, u16 set_id, u16 cnum, + u32 rflags, + unsigned long *smpl_pmds, + unsigned long *reset_pmds, + u64 eventid) +{ + struct pfm_event_set *set; + int is_counting; + unsigned long *impl_pmds; + u32 flags = 0; + u16 max_pmd; + + impl_pmds = ctx->regs.pmds; + max_pmd = ctx->regs.max_pmd; + + /* + * given that we do not maintain PMC ->PMD dependencies + * we cannot figure out what to do in case PMCxx != PMDxx + */ + if (cnum > max_pmd) + return 0; + + /* + * assumes PMCxx controls PMDxx which is always true for counters + * on Itanium PMUs. + */ + is_counting = pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64; + set = pfm_find_set(ctx, set_id, 0); + + /* + * for v2.0, we only allowed counting PMD to generate + * user-level notifications. Same thing with randomization. 
+ */ + if (is_counting) { + if (rflags & PFM_REGFL_OVFL_NOTIFY) + flags |= PFM_REGFL_OVFL_NOTIFY; + if (rflags & PFM_REGFL_RANDOM) + flags |= PFM_REGFL_RANDOM; + /* + * verify validity of smpl_pmds + */ + if (unlikely(bitmap_subset(smpl_pmds, + impl_pmds, max_pmd) == 0)) { + PFM_DBG("invalid smpl_pmds=0x%llx for pmd%u", + (unsigned long long)smpl_pmds[0], cnum); + return -EINVAL; + } + /* + * verify validity of reset_pmds + */ + if (unlikely(bitmap_subset(reset_pmds, + impl_pmds, max_pmd) == 0)) { + PFM_DBG("invalid reset_pmds=0x%lx for pmd%u", + reset_pmds[0], cnum); + return -EINVAL; + } + /* + * ensures that a PFM_READ_PMDS succeeds with a + * corresponding PFM_WRITE_PMDS + */ + __set_bit(cnum, set->used_pmds); + + } else if (rflags & (PFM_REGFL_OVFL_NOTIFY|PFM_REGFL_RANDOM)) { + PFM_DBG("cannot set ovfl_notify or random on pmd%u", cnum); + return -EINVAL; + } + + set->pmds[cnum].flags = flags; + + if (is_counting) { + bitmap_copy(set->pmds[cnum].reset_pmds, + reset_pmds, + max_pmd); + + bitmap_copy(set->pmds[cnum].smpl_pmds, + smpl_pmds, + max_pmd); + + set->pmds[cnum].eventid = eventid; + + /* + * update ovfl_notify + */ + if (rflags & PFM_REGFL_OVFL_NOTIFY) + __set_bit(cnum, set->ovfl_notify); + else + __clear_bit(cnum, set->ovfl_notify); + + } + PFM_DBG("pmd%u flags=0x%x eventid=0x%lx r_pmds=0x%lx s_pmds=0x%lx", + cnum, flags, + eventid, + reset_pmds[0], + smpl_pmds[0]); + + return 0; +} + + +int __pfm_write_ibrs_old(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_dbreg *req = arg; + struct pfarg_pmc pmc; + int i, ret = 0; + + memset(&pmc, 0, sizeof(pmc)); + + for (i = 0; i < count; i++, req++) { + pmc.reg_num = 256+req->dbreg_num; + pmc.reg_value = req->dbreg_value; + pmc.reg_flags = 0; + pmc.reg_set = req->dbreg_set; + + ret = __pfm_write_pmcs(ctx, &pmc, 1); + + req->dbreg_flags &= ~PFM_REG_RETFL_MASK; + req->dbreg_flags |= pmc.reg_flags; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_ibrs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + struct pfarg_dbreg *req = NULL; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); + if (ret == 0) + ret = __pfm_write_ibrs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (resume) + pfm_resume_task(task, resume); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_dbrs_old(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_dbreg *req = arg; + struct pfarg_pmc pmc; + int i, ret = 0; + + memset(&pmc, 0, sizeof(pmc)); + + for (i = 0; i < count; i++, req++) { + pmc.reg_num = 264+req->dbreg_num; + pmc.reg_value = req->dbreg_value; + pmc.reg_flags = 0; + pmc.reg_set = req->dbreg_set; + + ret = __pfm_write_pmcs(ctx, &pmc, 1); + + req->dbreg_flags &= 
~PFM_REG_RETFL_MASK; + req->dbreg_flags |= pmc.reg_flags; + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_dbrs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + struct pfarg_dbreg *req = NULL; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); + if (ret == 0) + ret = __pfm_write_dbrs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (resume) + pfm_resume_task(task, resume); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_pmcs_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmc req; + unsigned int i; + int ret, error_code; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + req.reg_flags = 0; + req.reg_value = req_old->reg_value; + + ret = __pfm_write_pmcs(ctx, (void *)&req, 1); + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + + if (ret) + return ret; + + ret = pfm_compat_update_pmd(ctx, req_old->reg_set, + req_old->reg_num, + (u32)req_old->reg_flags, + req_old->reg_smpl_pmds, + req_old->reg_reset_pmds, + req_old->reg_smpl_eventid); + + error_code = ret ? 
PFM_REG_RETFL_EINVAL : 0; + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= error_code; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_pmcs_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); + if (ret == 0) + ret = __pfm_write_pmcs_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (resume) + pfm_resume_task(task, resume); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); + +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_write_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmd req; + int i, ret; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + req.reg_value = req_old->reg_value; + /* flags passed with pmcs in v2.0 */ + + req.reg_long_reset = req_old->reg_long_reset; + req.reg_short_reset = req_old->reg_short_reset; + req.reg_random_mask = req_old->reg_random_mask; + /* + * reg_random_seed is ignored since v2.3 + */ + + /* + * skip last_reset_val not used for writing + * skip smpl_pmds, reset_pmds, eventid, ovfl_swtch_cnt + * as set in pfm_write_pmcs_old. 
+ * + * ovfl_switch_cnt ignored, not implemented in v2.0 + */ + ret = __pfm_write_pmds(ctx, (void *)&req, 1, 1); + + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + + if (ret) + return ret; + } + return 0; +} + +static long pfm_write_pmds_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); + if (ret == 0) + ret = __pfm_write_pmds_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + if (resume) + pfm_resume_task(task, resume); + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +int __pfm_read_pmds_old(struct pfm_context *ctx, struct pfarg_reg *req_old, + int count) +{ + struct pfarg_pmd req; + int i, ret; + + memset(&req, 0, sizeof(req)); + + for (i = 0; i < count; i++, req_old++) { + req.reg_num = req_old->reg_num; + req.reg_set = req_old->reg_set; + + /* skip value not used for reading */ + req.reg_flags = req_old->reg_flags; + + /* skip short/long_reset not used for reading */ + /* skip last_reset_val not used for reading */ + /* skip ovfl_switch_cnt not used for reading */ + + ret = __pfm_read_pmds(ctx, (void *)&req, 1); + + req_old->reg_flags &= ~PFM_REG_RETFL_MASK; + req_old->reg_flags |= req.reg_flags; + if (ret) + return ret; + + /* update fields */ + req_old->reg_value = req.reg_value; + + req_old->reg_last_reset_val = req.reg_last_reset_val; + req_old->reg_ovfl_switch_cnt = req.reg_ovfl_switch_cnt; + } + return 0; +} + +static long pfm_read_pmds_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct file *filp; + struct pfarg_reg *req = NULL; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret, fput_needed; + + if (count < 1 || count >= PFM_MAX_ARG_COUNT(req)) + return -EINVAL; + + sz = count*sizeof(*req); + + filp = fget_light(fd, &fput_needed); + if (unlikely(filp == NULL)) { + PFM_DBG("invalid fd %d", fd); + return -EBADF; + } + + ctx = filp->private_data; + ret = -EBADF; + + if (unlikely(!ctx || filp->f_op != &pfm_file_ops)) { + PFM_DBG("fd %d not related to perfmon", fd); + goto error; + } + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); + if (ret == 0) + ret = __pfm_read_pmds_old(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (resume) + pfm_resume_task(task, resume); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + fput_light(filp, fput_needed); + return ret; +} + +/* + * OBSOLETE: use /proc/perfmon_map instead + */ +static long 
pfm_get_default_pmcs_old(int fd, void __user *ureq, int count) +{ + struct pfarg_reg *req = NULL; + void *fptr; + size_t sz; + int ret, i; + unsigned int cnum; + + if (count < 1) + return -EINVAL; + + /* + * ensure the pfm_pmu_conf does not disappear while + * we use it + */ + ret = pfm_pmu_conf_get(1); + if (ret) + return ret; + + sz = count*sizeof(*ureq); + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + + for (i = 0; i < count; i++, req++) { + cnum = req->reg_num; + + if (i >= PFM_MAX_PMCS || + (pfm_pmu_conf->pmc_desc[cnum].type & PFM_REG_I) == 0) { + req->reg_flags = PFM_REG_RETFL_EINVAL; + break; + } + req->reg_value = pfm_pmu_conf->pmc_desc[cnum].dfl_val; + req->reg_flags = 0; + + PFM_DBG("pmc[%u]=0x%lx", cnum, req->reg_value); + } + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + pfm_pmu_conf_put(); + + return ret; +} + +/* + * allocate a sampling buffer and remaps it into the user address space of + * the task. This is only in compatibility mode + * + * function called ONLY on current task + */ +int pfm_smpl_buf_alloc_compat(struct pfm_context *ctx, size_t rsize, + struct file *filp) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct pfm_arch_context *ctx_arch; + size_t size; + int ret; + extern struct vm_operations_struct pfm_buf_map_vm_ops; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * allocate buffer + map desc + */ + ret = pfm_smpl_buf_alloc(ctx, rsize); + if (ret) + return ret; + + size = ctx->smpl_size; + + + /* allocate vma */ + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!vma) { + PFM_DBG("Cannot allocate vma"); + goto error_kmem; + } + memset(vma, 0, sizeof(*vma)); + + /* + * partially initialize the vma for the sampling buffer + */ + vma->vm_mm = mm; + vma->vm_flags = VM_READ | VM_MAYREAD | VM_RESERVED; + vma->vm_page_prot = PAGE_READONLY; + vma->vm_ops = &pfm_buf_map_vm_ops; + vma->vm_file = filp; + vma->vm_private_data = ctx; + vma->vm_pgoff = 0; + + /* + * simulate effect of mmap() + */ + get_file(filp); + + /* + * Let's do the difficult operations next. + * + * now we atomically find some area in the address space and + * remap the buffer into it. 
+ */ + down_write(¤t->mm->mmap_sem); + + /* find some free area in address space, must have mmap sem held */ + vma->vm_start = get_unmapped_area(NULL, 0, size, 0, + MAP_PRIVATE|MAP_ANONYMOUS); + if (vma->vm_start == 0) { + PFM_DBG("cannot find unmapped area of size %zu", size); + up_write(¤t->mm->mmap_sem); + goto error; + } + vma->vm_end = vma->vm_start + size; + + PFM_DBG("aligned_size=%zu mapped @0x%lx", size, vma->vm_start); + /* + * now insert the vma in the vm list for the process, must be + * done with mmap lock held + */ + insert_vm_struct(mm, vma); + + mm->total_vm += size >> PAGE_SHIFT; + + up_write(¤t->mm->mmap_sem); + + /* + * IMPORTANT: we do not issue the fput() + * because we want to increase the ref count + * on the descriptor to simulate what mmap() + * would do + */ + + /* + * used to propagate vaddr to syscall stub + */ + ctx_arch->ctx_smpl_vaddr = (void *)vma->vm_start; + + return 0; +error: + kmem_cache_free(vm_area_cachep, vma); +error_kmem: + pfm_smpl_buf_space_release(ctx, ctx->smpl_size); + vfree(ctx->smpl_addr); + return -ENOMEM; +} + +#define PFM_DEFAULT_SMPL_UUID { \ + 0x4d, 0x72, 0xbe, 0xc0, 0x06, 0x64, 0x41, 0x43, 0x82,\ + 0xb4, 0xd3, 0xfd, 0x27, 0x24, 0x3c, 0x97} + +static pfm_uuid_t old_default_uuid = PFM_DEFAULT_SMPL_UUID; +static pfm_uuid_t null_uuid; + +/* + * function invoked in case, pfm_context_create fails + * at the last operation, copy_to_user. It needs to + * undo memory allocations and free the file descriptor + */ +static void pfm_undo_create_context_fd(int fd, struct pfm_context *ctx) +{ + struct files_struct *files = current->files; + struct file *file; + int fput_needed; + + file = fget_light(fd, &fput_needed); + /* + * there is no fd_uninstall(), so we do it + * here. put_unused_fd() does not remove the + * effect of fd_install(). 
+ */ + + spin_lock(&files->file_lock); + files->fd_array[fd] = NULL; + spin_unlock(&files->file_lock); + + fput_light(file, fput_needed); + + /* + * decrement ref count and kill file + */ + put_filp(file); + + put_unused_fd(fd); + + pfm_free_context(ctx); +} + +static int pfm_get_smpl_arg_old(pfm_uuid_t uuid, void __user *fmt_uarg, + size_t usize, void **arg, + struct pfm_smpl_fmt **fmt) +{ + struct pfm_smpl_fmt *f; + void *addr = NULL; + size_t sz; + int ret; + + if (!memcmp(uuid, null_uuid, sizeof(pfm_uuid_t))) + return 0; + + if (memcmp(uuid, old_default_uuid, sizeof(pfm_uuid_t))) { + PFM_DBG("compatibility mode supports only default sampling format"); + return -EINVAL; + } + /* + * find fmt and increase refcount + */ + f = pfm_smpl_fmt_get("default-old"); + if (f == NULL) { + PFM_DBG("default-old buffer format not found"); + return -EINVAL; + } + + /* + * expected format argument size + */ + sz = f->fmt_arg_size; + + /* + * check user size matches expected size + * usize = -1 is for IA-64 backward compatibility + */ + ret = -EINVAL; + if (sz != usize && usize != -1) { + PFM_DBG("invalid arg size %zu, format expects %zu", + usize, sz); + goto error; + } + + ret = -ENOMEM; + addr = kmalloc(sz, GFP_KERNEL); + if (addr == NULL) + goto error; + + ret = -EFAULT; + if (copy_from_user(addr, fmt_uarg, sz)) + goto error; + + *arg = addr; + *fmt = f; + return 0; + +error: + kfree(addr); + pfm_smpl_fmt_put(f); + return ret; +} + +static long pfm_create_context_old(int fd, void __user *ureq, int count) +{ + struct pfm_context *new_ctx; + struct pfm_arch_context *ctx_arch; + struct pfm_smpl_fmt *fmt = NULL; + struct pfarg_context req_old; + void __user *usmpl_arg; + void *smpl_arg = NULL; + struct pfarg_ctx req; + int ret; + + if (count != 1) + return -EINVAL; + + if (copy_from_user(&req_old, ureq, sizeof(req_old))) + return -EFAULT; + + memset(&req, 0, sizeof(req)); + + /* + * sampling format args are following pfarg_context + */ + usmpl_arg = ureq+sizeof(req_old); + + ret = pfm_get_smpl_arg_old(req_old.ctx_smpl_buf_id, usmpl_arg, -1, + &smpl_arg, &fmt); + if (ret) + return ret; + + req.ctx_flags = req_old.ctx_flags; + + /* + * returns file descriptor if >=0, or error code */ + ret = __pfm_create_context(&req, fmt, smpl_arg, PFM_COMPAT, &new_ctx); + if (ret >= 0) { + ctx_arch = pfm_ctx_arch(new_ctx); + req_old.ctx_fd = ret; + req_old.ctx_smpl_vaddr = ctx_arch->ctx_smpl_vaddr; + } + + if (copy_to_user(ureq, &req_old, sizeof(req_old))) { + pfm_undo_create_context_fd(req_old.ctx_fd, new_ctx); + ret = -EFAULT; + } + + kfree(smpl_arg); + + return ret; +} + +/* + * obsolete call: use /proc/perfmon + */ +static long pfm_get_features_old(int fd, void __user *arg, int count) +{ + struct pfarg_features req; + int ret = 0; + + if (count != 1) + return -EINVAL; + + memset(&req, 0, sizeof(req)); + + req.ft_version = PFM_VERSION; + + if (copy_to_user(arg, &req, sizeof(req))) + ret = -EFAULT; + + return ret; +} + +static long pfm_debug_old(int fd, void __user *arg, int count) +{ + int m; + + if (count != 1) + return -EINVAL; + + if (get_user(m, (int __user *)arg)) + return -EFAULT; + + + pfm_controls.debug = m == 0 ? 0 : 1; + + PFM_INFO("debugging %s (timing reset)", + pfm_controls.debug ? 
"on" : "off"); + + if (m == 0) + for_each_online_cpu(m) { + memset(&per_cpu(pfm_stats, m), 0, + sizeof(struct pfm_stats)); + } + return 0; +} + +static long pfm_unload_context_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_unload_context(fd); +} + +static long pfm_restart_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_restart(fd); +} + +static long pfm_stop_old(int fd, void __user *arg, int count) +{ + if (count) + return -EINVAL; + + return sys_pfm_stop(fd); +} + +static long pfm_start_old(int fd, void __user *arg, int count) +{ + if (count > 1) + return -EINVAL; + + return sys_pfm_start(fd, arg); +} + +static long pfm_load_context_old(int fd, void __user *ureq, int count) +{ + if (count != 1) + return -EINVAL; + + return sys_pfm_load_context(fd, ureq); +} + +/* + * perfmon command descriptions + */ +struct pfm_cmd_desc { + long (*cmd_func)(int fd, void __user *arg, int count); +}; + +/* + * functions MUST be listed in the increasing order of + * their index (see permfon.h) + */ +#define PFM_CMD(name) \ + { .cmd_func = name, \ + } +#define PFM_CMD_NONE \ + { .cmd_func = NULL \ + } + +static struct pfm_cmd_desc pfm_cmd_tab[] = { +/* 0 */PFM_CMD_NONE, +/* 1 */PFM_CMD(pfm_write_pmcs_old), +/* 2 */PFM_CMD(pfm_write_pmds_old), +/* 3 */PFM_CMD(pfm_read_pmds_old), +/* 4 */PFM_CMD(pfm_stop_old), +/* 5 */PFM_CMD(pfm_start_old), +/* 6 */PFM_CMD_NONE, +/* 7 */PFM_CMD_NONE, +/* 8 */PFM_CMD(pfm_create_context_old), +/* 9 */PFM_CMD_NONE, +/* 10 */PFM_CMD(pfm_restart_old), +/* 11 */PFM_CMD_NONE, +/* 12 */PFM_CMD(pfm_get_features_old), +/* 13 */PFM_CMD(pfm_debug_old), +/* 14 */PFM_CMD_NONE, +/* 15 */PFM_CMD(pfm_get_default_pmcs_old), +/* 16 */PFM_CMD(pfm_load_context_old), +/* 17 */PFM_CMD(pfm_unload_context_old), +/* 18 */PFM_CMD_NONE, +/* 19 */PFM_CMD_NONE, +/* 20 */PFM_CMD_NONE, +/* 21 */PFM_CMD_NONE, +/* 22 */PFM_CMD_NONE, +/* 23 */PFM_CMD_NONE, +/* 24 */PFM_CMD_NONE, +/* 25 */PFM_CMD_NONE, +/* 26 */PFM_CMD_NONE, +/* 27 */PFM_CMD_NONE, +/* 28 */PFM_CMD_NONE, +/* 29 */PFM_CMD_NONE, +/* 30 */PFM_CMD_NONE, +/* 31 */PFM_CMD_NONE, +/* 32 */PFM_CMD(pfm_write_ibrs_old), +/* 33 */PFM_CMD(pfm_write_dbrs_old), +}; +#define PFM_CMD_COUNT ARRAY_SIZE(pfm_cmd_tab) + +/* + * system-call entry point (must return long) + */ +asmlinkage long sys_perfmonctl(int fd, int cmd, void __user *arg, int count) +{ + if (perfmon_disabled) + return -ENOSYS; + + if (unlikely(cmd < 0 || cmd >= PFM_CMD_COUNT + || pfm_cmd_tab[cmd].cmd_func == NULL)) { + PFM_DBG("invalid cmd=%d", cmd); + return -EINVAL; + } + return (long)pfm_cmd_tab[cmd].cmd_func(fd, arg, count); +} + +/* + * Called from pfm_read() for a perfmon v2.0 context. + * + * compatibility mode pfm_read() routine. We need a separate + * routine because the definition of the message has changed. + * The pfm_msg and pfarg_msg structures are different. + * + * return: sizeof(pfm_msg_t) on success, -errno otherwise + */ +ssize_t pfm_arch_compat_read(struct pfm_context *ctx, + char __user *buf, + int non_block, + size_t size) +{ + union pfarg_msg msg_buf; + pfm_msg_t old_msg_buf; + pfm_ovfl_msg_t *o_msg; + struct pfarg_ovfl_msg *n_msg; + int ret; + + PFM_DBG("msg=%p size=%zu", buf, size); + + /* + * cannot extract partial messages. + * check even when there is no message + * + * cannot extract more than one message per call. Bytes + * above sizeof(msg) are ignored. 
+ */ + if (size < sizeof(old_msg_buf)) { + PFM_DBG("message is too small size=%zu must be >=%zu)", + size, + sizeof(old_msg_buf)); + return -EINVAL; + } + + ret = __pfm_read(ctx, &msg_buf, non_block); + if (ret < 1) + return ret; + + /* + * force return value to old message size + */ + ret = sizeof(old_msg_buf); + + o_msg = &old_msg_buf.pfm_ovfl_msg; + n_msg = &msg_buf.pfm_ovfl_msg; + + switch (msg_buf.type) { + case PFM_MSG_OVFL: + o_msg->msg_type = PFM_MSG_OVFL; + o_msg->msg_ctx_fd = 0; + o_msg->msg_active_set = n_msg->msg_active_set; + o_msg->msg_tstamp = 0; + + o_msg->msg_ovfl_pmds[0] = n_msg->msg_ovfl_pmds[0]; + o_msg->msg_ovfl_pmds[1] = n_msg->msg_ovfl_pmds[1]; + o_msg->msg_ovfl_pmds[2] = n_msg->msg_ovfl_pmds[2]; + o_msg->msg_ovfl_pmds[3] = n_msg->msg_ovfl_pmds[3]; + break; + case PFM_MSG_END: + o_msg->msg_type = PFM_MSG_END; + o_msg->msg_ctx_fd = 0; + o_msg->msg_tstamp = 0; + break; + default: + PFM_DBG("unknown msg type=%d", msg_buf.type); + } + if (copy_to_user(buf, &old_msg_buf, sizeof(old_msg_buf))) + ret = -EFAULT; + PFM_DBG_ovfl("ret=%d", ret); + return ret; +} + +/* + * legacy /proc/perfmon simplified interface (we only maintain the + * global information (no more per-cpu stats, use + * /sys/devices/system/cpu/cpuXX/perfmon + */ +static struct proc_dir_entry *perfmon_proc; + +static void *pfm_proc_start(struct seq_file *m, loff_t *pos) +{ + if (*pos == 0) + return (void *)1; + + return NULL; +} + +static void *pfm_proc_next(struct seq_file *m, void *v, loff_t *pos) +{ + ++*pos; + return pfm_proc_start(m, pos); +} + +static void pfm_proc_stop(struct seq_file *m, void *v) +{ +} + +/* + * this is a simplified version of the legacy /proc/perfmon. + * We have retained ONLY the key information that tools are actually + * using + */ +static void pfm_proc_show_header(struct seq_file *m) +{ + char buf[128]; + + pfm_sysfs_res_show(buf, sizeof(buf), 3); + + seq_printf(m, "perfmon version : %u.%u\n", + PFM_VERSION_MAJ, PFM_VERSION_MIN); + + seq_printf(m, "model : %s", buf); +} + +static int pfm_proc_show(struct seq_file *m, void *v) +{ + pfm_proc_show_header(m); + return 0; +} + +struct seq_operations pfm_proc_seq_ops = { + .start = pfm_proc_start, + .next = pfm_proc_next, + .stop = pfm_proc_stop, + .show = pfm_proc_show +}; + +static int pfm_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &pfm_proc_seq_ops); +} + + +static struct file_operations pfm_proc_fops = { + .open = pfm_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * called from pfm_arch_init(), global initialization, called once + */ +int __init pfm_ia64_compat_init(void) +{ + /* + * create /proc/perfmon + */ + perfmon_proc = create_proc_entry("perfmon", S_IRUGO, NULL); + if (perfmon_proc == NULL) { + PFM_ERR("cannot create /proc entry, perfmon disabled"); + return -1; + } + perfmon_proc->proc_fops = &pfm_proc_fops; + return 0; +} diff --git a/arch/ia64/perfmon/perfmon_default_smpl.c b/arch/ia64/perfmon/perfmon_default_smpl.c new file mode 100644 index 0000000..b408a13 --- /dev/null +++ b/arch/ia64/perfmon/perfmon_default_smpl.c @@ -0,0 +1,273 @@ +/* + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the old default sampling buffer format + * for the Linux/ia64 perfmon-2 subsystem. This is for backward + * compatibility only. 
use the new default format in perfmon/ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include + +#ifdef MODULE +#define FMT_FLAGS 0 +#else +#define FMT_FLAGS PFM_FMTFL_IS_BUILTIN +#endif + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("perfmon old default sampling format"); +MODULE_LICENSE("GPL"); + +static int pfm_default_fmt_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_default_smpl_arg *arg = data; + size_t min_buf_size; + + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * compute min buf size. All PMD are manipulated as 64bit entities + */ + min_buf_size = sizeof(struct pfm_default_smpl_hdr) + + (sizeof(struct pfm_default_smpl_entry) + (npmds*sizeof(u64))); + + PFM_DBG("validate flags=0x%x npmds=%u min_buf_size=%lu " + "buf_size=%lu CPU%d", flags, npmds, min_buf_size, + arg->buf_size, smp_processor_id()); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) + return -EINVAL; + + return 0; +} + +static int pfm_default_fmt_get_size(unsigned int flags, void *data, + size_t *size) +{ + struct pfm_default_smpl_arg *arg = data; + + /* + * size has been validated in default_validate + */ + *size = arg->buf_size; + + return 0; +} + +static int pfm_default_fmt_init(struct pfm_context *ctx, void *buf, + u32 flags, u16 npmds, void *data) +{ + struct pfm_default_smpl_hdr *hdr; + struct pfm_default_smpl_arg *arg = data; + + hdr = buf; + + hdr->hdr_version = PFM_DEFAULT_SMPL_VERSION; + hdr->hdr_buf_size = arg->buf_size; + hdr->hdr_cur_offs = sizeof(*hdr); + hdr->hdr_overflows = 0; + hdr->hdr_count = 0; + + PFM_DBG("buffer=%p buf_size=%lu hdr_size=%lu " + "hdr_version=%u cur_offs=%lu", + buf, + hdr->hdr_buf_size, + sizeof(*hdr), + hdr->hdr_version, + hdr->hdr_cur_offs); + + return 0; +} + +static int pfm_default_fmt_handler(struct pfm_context *ctx, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_default_smpl_hdr *hdr; + struct pfm_default_smpl_entry *ent; + void *cur, *last, *buf; + u64 *e; + size_t entry_size; + u16 npmds, i, ovfl_pmd; + struct pfm_ovfl_arg *arg; + + hdr = ctx->smpl_addr; + arg = &ctx->ovfl_arg; + + buf = hdr; + cur = buf+hdr->hdr_cur_offs; + last = buf+hdr->hdr_buf_size; + ovfl_pmd = arg->ovfl_pmd; + + /* + * precheck for sanity + */ + if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) + goto full; + + npmds = arg->num_smpl_pmds; + + ent = cur; + + prefetch(arg->smpl_pmds_values); + + entry_size = sizeof(*ent) + (npmds << 3); + + /* position for first pmd */ + e = (unsigned long *)(ent+1); + + hdr->hdr_count++; + + PFM_DBG_ovfl("count=%lu cur=%p last=%p free_bytes=%lu " + "ovfl_pmd=%d npmds=%u", + hdr->hdr_count, + cur, last, + last-cur, + ovfl_pmd, + npmds); + + /* + * current = task running at the time of the overflow. 
+ * + * per-task mode: + * - this is usually the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - this is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->ovfl_pmd = ovfl_pmd; + ent->last_reset_val = arg->pmd_last_reset; + + /* + * where did the fault happen (includes slot number) + */ + ent->ip = ip; + + ent->tstamp = tstamp; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->tgid = current->tgid; + + /* + * selectively store PMDs in increasing index number + */ + if (npmds) { + u64 *val = arg->smpl_pmds_values; + for (i = 0; i < npmds; i++) + *e++ = *val++; + } + + /* + * update position for next entry + */ + hdr->hdr_cur_offs += entry_size; + cur += entry_size; + + /* + * post check to avoid losing the last sample + */ + if ((last - cur) < PFM_DEFAULT_MAX_ENTRY_SIZE) + goto full; + + /* + * reset before returning from interrupt handler + */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + return 0; +full: + PFM_DBG_ovfl("smpl buffer full free=%lu, count=%lu", + last-cur, hdr->hdr_count); + + /* + * increment the number of buffer overflows. + * important to detect duplicate sets of samples. + */ + hdr->hdr_overflows++; + + /* + * request notification and masking of monitoring. + * Notification is still subject to the overflowed + * register having notification enabled. + */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK; + + return -ENOBUFS; /* we are full, sorry */ +} + +static int pfm_default_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf) +{ + struct pfm_default_smpl_hdr *hdr; + + hdr = buf; + + hdr->hdr_count = 0; + hdr->hdr_cur_offs = sizeof(*hdr); + + *ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +} + +static int pfm_default_fmt_exit(void *buf) +{ + return 0; +} + +static struct pfm_smpl_fmt default_fmt = { + .fmt_name = "default-old", + .fmt_version = 0x10000, + .fmt_arg_size = sizeof(struct pfm_default_smpl_arg), + .fmt_validate = pfm_default_fmt_validate, + .fmt_getsize = pfm_default_fmt_get_size, + .fmt_init = pfm_default_fmt_init, + .fmt_handler = pfm_default_fmt_handler, + .fmt_restart = pfm_default_fmt_restart, + .fmt_exit = pfm_default_fmt_exit, + .fmt_flags = FMT_FLAGS, + .owner = THIS_MODULE +}; + +static int pfm_default_fmt_init_module(void) +{ + return pfm_fmt_register(&default_fmt); +} + +static void pfm_default_fmt_cleanup_module(void) +{ + pfm_fmt_unregister(&default_fmt); +} + +module_init(pfm_default_fmt_init_module); +module_exit(pfm_default_fmt_cleanup_module); diff --git a/arch/ia64/perfmon/perfmon_generic.c b/arch/ia64/perfmon/perfmon_generic.c new file mode 100644 index 0000000..47b1870 --- /dev/null +++ b/arch/ia64/perfmon/perfmon_generic.c @@ -0,0 +1,148 @@ +/* + * This file contains the generic PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Generic IA-64 PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_IA64GEN_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)) +#define PFM_IA64GEN_RSVD (0xffffffffffff0080UL) +#define PFM_IA64GEN_NO64 (1UL<<5) + +/* forward declaration */ +static struct pfm_pmu_config pfm_ia64gen_pmu_conf; + +static struct pfm_arch_pmu_info pfm_ia64gen_pmu_info = { + .mask_pmcs = {PFM_IA64GEN_MASK_PMCS,}, +}; + +static struct pfm_regmap_desc pfm_ia64gen_pmc_desc[] = { +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7", 0x0, PFM_IA64GEN_RSVD, PFM_IA64GEN_NO64, 7) +}; +#define PFM_IA64GEN_NUM_PMCS ARRAY_SIZE(pfm_ia64gen_pmc_desc) + +static struct pfm_regmap_desc pfm_ia64gen_pmd_desc[] = { +/* pmd0 */ PMX_NA, +/* pmd1 */ PMX_NA, +/* pmd2 */ PMX_NA, +/* pmd3 */ PMX_NA, +/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4), +/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5), +/* pmd6 */ PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6), +/* pmd7 */ PMD_DP(PFM_REG_C, "PMD7", 7, 1ull << 7) +}; +#define PFM_IA64GEN_NUM_PMDS ARRAY_SIZE(pfm_ia64gen_pmd_desc) + +static int pfm_ia64gen_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ +#define PFM_IA64GEN_PMC_PM_POS6 (1UL<<6) + u64 tmpval; + int is_system; + + is_system = ctx->flags.system; + tmpval = req->reg_value; + + switch (req->reg_num) { + case 4: + case 5: + case 6: + case 7: + /* set pmc.oi for 64-bit emulation */ + tmpval |= 1UL << 5; + + if (is_system) + tmpval |= PFM_IA64GEN_PMC_PM_POS6; + else + tmpval &= ~PFM_IA64GEN_PMC_PM_POS6; + break; + + } + req->reg_value = tmpval; + + return 0; +} + +/* + * matches anything + */ +static int pfm_ia64gen_probe_pmu(void) +{ + u64 pm_buffer[16]; + pal_perf_mon_info_u_t pm_info; + + /* + * call PAL_PERFMON_INFO to retrieve counter width which + * is implementation specific + */ + if (ia64_pal_perf_mon_info(pm_buffer, &pm_info)) + return -1; + + pfm_ia64gen_pmu_conf.counter_width = pm_info.pal_perf_mon_info_s.width; + + return 0; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
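+ * (presumably the perfmon core derives the implemented-register bitmasks from the pmc_desc/pmd_desc tables above when pfm_pmu_register() is called)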
+ */ +static struct pfm_pmu_config pfm_ia64gen_pmu_conf = { + .pmu_name = "Generic IA-64", + .counter_width = 0, /* computed from PAL_PERFMON_INFO */ + .pmd_desc = pfm_ia64gen_pmd_desc, + .pmc_desc = pfm_ia64gen_pmc_desc, + .probe_pmu = pfm_ia64gen_probe_pmu, + .num_pmc_entries = PFM_IA64GEN_NUM_PMCS, + .num_pmd_entries = PFM_IA64GEN_NUM_PMDS, + .pmc_write_check = pfm_ia64gen_pmc_check, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmu_info = &pfm_ia64gen_pmu_info + /* no read/write checkers */ +}; + +static int __init pfm_gen_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ia64gen_pmu_conf); +} + +static void __exit pfm_gen_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_ia64gen_pmu_conf); +} + +module_init(pfm_gen_pmu_init_module); +module_exit(pfm_gen_pmu_cleanup_module); diff --git a/arch/ia64/perfmon/perfmon_itanium.c b/arch/ia64/perfmon/perfmon_itanium.c new file mode 100644 index 0000000..094b31b --- /dev/null +++ b/arch/ia64/perfmon/perfmon_itanium.c @@ -0,0 +1,232 @@ +/* + * This file contains the Itanium PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Itanium (Merced) PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1ULL << (x)) + +#define PFM_ITA_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\ + RDEP(12)) + +#define PFM_ITA_NO64 (1ULL<<5) + +static struct pfm_arch_pmu_info pfm_ita_pmu_info = { + .mask_pmcs = {PFM_ITA_MASK_PMCS,}, +}; +/* reserved bits are 1 in the mask */ +#define PFM_ITA_RSVD 0xfffffffffc8000a0UL +/* + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. 
+ */ +static struct pfm_regmap_desc pfm_ita_pmc_desc[] = { +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20, PFM_ITA_RSVD, PFM_ITA_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 8), +/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xfffffffe3ffffff8UL, 0xfff00000001c0000UL, 0, 9), +/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xfffffffff3f0ff30UL, 0, 10), +/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x10000000UL, 0xffffffffecf0ff30UL, 0, 11), +/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0030UL, 0, 12), +/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x3ffff00000001UL, 0xfffffffffffffffeUL, 0, 13), +/* pmc14 */ PMX_NA, +/* pmc15 */ PMX_NA, +/* pmc16 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc24 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc32 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc40 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 
0x8000000000000000UL, 0, 3), +/* pmc260 */ PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_ITA_NUM_PMCS ARRAY_SIZE(pfm_ita_pmc_desc) + +static struct pfm_regmap_desc pfm_ita_pmd_desc[] = { +/* pmd0 */ PMD_DP(PFM_REG_I , "PMD0", 0, 1ull << 10), +/* pmd1 */ PMD_DP(PFM_REG_I , "PMD1", 1, 1ull << 10), +/* pmd2 */ PMD_DP(PFM_REG_I , "PMD2", 2, 1ull << 11), +/* pmd3 */ PMD_DP(PFM_REG_I , "PMD3", 3, 1ull << 11), +/* pmd4 */ PMD_DP(PFM_REG_C , "PMD4", 4, 1ull << 4), +/* pmd5 */ PMD_DP(PFM_REG_C , "PMD5", 5, 1ull << 5), +/* pmd6 */ PMD_DP(PFM_REG_C , "PMD6", 6, 1ull << 6), +/* pmd7 */ PMD_DP(PFM_REG_C , "PMD7", 7, 1ull << 7), +/* pmd8 */ PMD_DP(PFM_REG_I , "PMD8", 8, 1ull << 12), +/* pmd9 */ PMD_DP(PFM_REG_I , "PMD9", 9, 1ull << 12), +/* pmd10 */ PMD_DP(PFM_REG_I , "PMD10", 10, 1ull << 12), +/* pmd11 */ PMD_DP(PFM_REG_I , "PMD11", 11, 1ull << 12), +/* pmd12 */ PMD_DP(PFM_REG_I , "PMD12", 12, 1ull << 12), +/* pmd13 */ PMD_DP(PFM_REG_I , "PMD13", 13, 1ull << 12), +/* pmd14 */ PMD_DP(PFM_REG_I , "PMD14", 14, 1ull << 12), +/* pmd15 */ PMD_DP(PFM_REG_I , "PMD15", 15, 1ull << 12), +/* pmd16 */ PMD_DP(PFM_REG_I , "PMD16", 16, 1ull << 12), +/* pmd17 */ PMD_DP(PFM_REG_I , "PMD17", 17, 1ull << 11) +}; +#define PFM_ITA_NUM_PMDS ARRAY_SIZE(pfm_ita_pmd_desc) + +static int pfm_ita_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ +#define PFM_ITA_PMC_PM_POS6 (1UL<<6) + struct pfm_arch_context *ctx_arch; + u64 tmpval; + u16 cnum; + int ret = 0, is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + + switch (cnum) { + case 4: + case 5: + case 6: + case 7: + case 10: + case 11: + case 12: + if (is_system) + tmpval |= PFM_ITA_PMC_PM_POS6; + else + tmpval &= ~PFM_ITA_PMC_PM_POS6; + break; + } + + /* + * we must clear the (instruction) debug registers if pmc13.ta bit is + * cleared before they are written (fl_using_dbreg==0) to avoid + * picking up stale information. + */ + if (cnum == 13 && ((tmpval & 0x1) == 0) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc13 has pmc13.ta cleared, clearing ibr"); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) + return ret; + } + + /* + * we must clear the (data) debug registers if pmc11.pt bit is cleared + * before they are written (fl_using_dbreg==0) to avoid picking up + * stale information. + */ + if (cnum == 11 && ((tmpval >> 28) & 0x1) == 0 + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc11 has pmc11.pt cleared, clearing dbr"); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) + return ret; + } + + req->reg_value = tmpval; + + return 0; +} + +static int pfm_ita_probe_pmu(void) +{ + return local_cpu_data->family == 0x7 && !ia64_platform_is("hpsim") + ? 
0 : -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_ita_pmu_conf = { + .pmu_name = "Itanium", + .counter_width = 32, + .pmd_desc = pfm_ita_pmd_desc, + .pmc_desc = pfm_ita_pmc_desc, + .pmc_write_check = pfm_ita_pmc_check, + .num_pmc_entries = PFM_ITA_NUM_PMCS, + .num_pmd_entries = PFM_ITA_NUM_PMDS, + .probe_pmu = pfm_ita_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmu_info = &pfm_ita_pmu_info +}; + +static int __init pfm_ita_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ita_pmu_conf); +} + +static void __exit pfm_ita_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_ita_pmu_conf); +} + +module_init(pfm_ita_pmu_init_module); +module_exit(pfm_ita_pmu_cleanup_module); + diff --git a/arch/ia64/perfmon/perfmon_mckinley.c b/arch/ia64/perfmon/perfmon_mckinley.c new file mode 100644 index 0000000..dc59092 --- /dev/null +++ b/arch/ia64/perfmon/perfmon_mckinley.c @@ -0,0 +1,290 @@ +/* + * This file contains the McKinley PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2002-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Itanium 2 (McKinley) PMU description tables"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_MCK_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|RDEP(10)|RDEP(11)|\ + RDEP(12)) + +#define PFM_MCK_NO64 (1UL<<5) + +static struct pfm_arch_pmu_info pfm_mck_pmu_info = { + .mask_pmcs = {PFM_MCK_MASK_PMCS,}, +}; + +/* reserved bits are 1 in the mask */ +#define PFM_ITA2_RSVD 0xfffffffffc8000a0UL + +/* + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. 
+ */ +static struct pfm_regmap_desc pfm_mck_pmc_desc[] = { +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x800020UL, 0xfffffffffc8000a0, PFM_MCK_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x20UL, PFM_ITA2_RSVD, PFM_MCK_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W , "PMC8" , 0xffffffff3fffffffUL, 0xc0000004UL, 0, 8), +/* pmc9 */ PMC_D(PFM_REG_W , "PMC9" , 0xffffffff3ffffffcUL, 0xc0000004UL, 0, 9), +/* pmc10 */ PMC_D(PFM_REG_W , "PMC10", 0x0, 0xffffffffffff0000UL, 0, 10), +/* pmc11 */ PMC_D(PFM_REG_W , "PMC11", 0x0, 0xfffffffffcf0fe30UL, 0, 11), +/* pmc12 */ PMC_D(PFM_REG_W , "PMC12", 0x0, 0xffffffffffff0000UL, 0, 12), +/* pmc13 */ PMC_D(PFM_REG_W , "PMC13", 0x2078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 13), +/* pmc14 */ PMC_D(PFM_REG_W , "PMC14", 0x0db60db60db60db6UL, 0xffffffffffffdb6dUL, 0, 14), +/* pmc15 */ PMC_D(PFM_REG_W , "PMC15", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 15), +/* pmc16 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc24 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc32 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc40 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W , "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W , "IBR1", 0x0, 
0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W , "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W , "IBR3", 0x0, 0x8000000000000000UL, 0, 3), +/* pmc260 */ PMC_D(PFM_REG_W , "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W , "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ PMC_D(PFM_REG_W , "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W , "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W , "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W , "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W , "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W , "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W , "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W , "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W , "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W , "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_MCK_NUM_PMCS ARRAY_SIZE(pfm_mck_pmc_desc) + +static struct pfm_regmap_desc pfm_mck_pmd_desc[] = { +/* pmd0 */ PMD_DP(PFM_REG_I, "PMD0", 0, 1ull << 10), +/* pmd1 */ PMD_DP(PFM_REG_I, "PMD1", 1, 1ull << 10), +/* pmd2 */ PMD_DP(PFM_REG_I, "PMD2", 2, 1ull << 11), +/* pmd3 */ PMD_DP(PFM_REG_I, "PMD3", 3, 1ull << 11), +/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4), +/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5), +/* pmd6 */ PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6), +/* pmd7 */ PMD_DP(PFM_REG_C, "PMD7", 7, 1ull << 7), +/* pmd8 */ PMD_DP(PFM_REG_I, "PMD8", 8, 1ull << 12), +/* pmd9 */ PMD_DP(PFM_REG_I, "PMD9", 9, 1ull << 12), +/* pmd10 */ PMD_DP(PFM_REG_I, "PMD10", 10, 1ull << 12), +/* pmd11 */ PMD_DP(PFM_REG_I, "PMD11", 11, 1ull << 12), +/* pmd12 */ PMD_DP(PFM_REG_I, "PMD12", 12, 1ull << 12), +/* pmd13 */ PMD_DP(PFM_REG_I, "PMD13", 13, 1ull << 12), +/* pmd14 */ PMD_DP(PFM_REG_I, "PMD14", 14, 1ull << 12), +/* pmd15 */ PMD_DP(PFM_REG_I, "PMD15", 15, 1ull << 12), +/* pmd16 */ PMD_DP(PFM_REG_I, "PMD16", 16, 1ull << 12), +/* pmd17 */ PMD_DP(PFM_REG_I, "PMD17", 17, 1ull << 11) +}; +#define PFM_MCK_NUM_PMDS ARRAY_SIZE(pfm_mck_pmd_desc) + +static int pfm_mck_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + u64 val8 = 0, val14 = 0, val13 = 0; + u64 tmpval; + u16 cnum; + int ret = 0, check_case1 = 0; + int is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + +#define PFM_MCK_PMC_PM_POS6 (1UL<<6) +#define PFM_MCK_PMC_PM_POS4 (1UL<<4) + + switch (cnum) { + case 4: + case 5: + case 6: + case 7: + case 11: + case 12: + if (is_system) + tmpval |= PFM_MCK_PMC_PM_POS6; + else + tmpval &= ~PFM_MCK_PMC_PM_POS6; + break; + + case 8: + val8 = tmpval; + val13 = set->pmcs[13]; + val14 = set->pmcs[14]; + check_case1 = 1; + break; + + case 10: + if (is_system) + tmpval |= PFM_MCK_PMC_PM_POS4; + else + tmpval &= ~PFM_MCK_PMC_PM_POS4; + break; + + case 13: + val8 = set->pmcs[8]; + val13 = tmpval; + val14 = set->pmcs[14]; + check_case1 = 1; + break; + + case 14: + val8 = set->pmcs[8]; + val13 = set->pmcs[13]; + val14 = tmpval; + check_case1 = 1; + break; + } + + /* + * check illegal configuration which can produce inconsistencies + * in tagging i-side events in L1D and L2 caches + */ + if (check_case1) { + ret = (((val13 >> 45) & 0xf) == 0 && ((val8 & 0x1) == 0)) + && ((((val14>>1) & 0x3) == 0x2 || ((val14>>1) & 0x3) == 0x0) + || (((val14>>4) & 0x3) == 0x2 || ((val14>>4) & 0x3) == 0x0)); + + if (ret) { + PFM_DBG("perfmon: invalid 
config pmc8=0x%lx " + "pmc13=0x%lx pmc14=0x%lx", + val8, val13, val14); + return -EINVAL; + } + } + + /* + * check if configuration implicitely activates the use of + * the debug registers. If true, then we ensure that this is + * possible and that we do not pick up stale value in the HW + * registers. + * + * We postpone the checks of pmc13 and pmc14 to avoid side effects + * in case of errors + */ + + /* + * pmc13 is "active" if: + * one of the pmc13.cfg_dbrpXX field is different from 0x3 + * AND + * at the corresponding pmc13.ena_dbrpXX is set. + */ + if (cnum == 13 && (tmpval & 0x1e00000000000UL) + && (tmpval & 0x18181818UL) != 0x18181818UL + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc13=0x%lx active", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) + return ret; + } + + /* + * if any pmc14.ibrpX bit is enabled we must clear the ibrs + */ + if (cnum == 14 && ((tmpval & 0x2222UL) != 0x2222UL) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc14=0x%lx active", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) + return ret; + } + + req->reg_value = tmpval; + + return 0; +} + +static int pfm_mck_probe_pmu(void) +{ + return local_cpu_data->family == 0x1f ? 0 : -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_mck_pmu_conf = { + .pmu_name = "Itanium 2", + .counter_width = 47, + .pmd_desc = pfm_mck_pmd_desc, + .pmc_desc = pfm_mck_pmc_desc, + .pmc_write_check = pfm_mck_pmc_check, + .num_pmc_entries = PFM_MCK_NUM_PMCS, + .num_pmd_entries = PFM_MCK_NUM_PMDS, + .probe_pmu = pfm_mck_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmu_info = &pfm_mck_pmu_info, +}; + +static int __init pfm_mck_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_mck_pmu_conf); +} + +static void __exit pfm_mck_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mck_pmu_conf); +} + +module_init(pfm_mck_pmu_init_module); +module_exit(pfm_mck_pmu_cleanup_module); diff --git a/arch/ia64/perfmon/perfmon_montecito.c b/arch/ia64/perfmon/perfmon_montecito.c new file mode 100644 index 0000000..3f76f73 --- /dev/null +++ b/arch/ia64/perfmon/perfmon_montecito.c @@ -0,0 +1,412 @@ +/* + * This file contains the McKinley PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Dual-Core Itanium 2 (Montecito) PMU description table"); +MODULE_LICENSE("GPL"); + +#define RDEP(x) (1UL << (x)) + +#define PFM_MONT_MASK_PMCS (RDEP(4)|RDEP(5)|RDEP(6)|RDEP(7)|\ + RDEP(8)|RDEP(9)|RDEP(10)|RDEP(11)|\ + RDEP(12)|RDEP(13)|RDEP(14)|RDEP(15)|\ + RDEP(37)|RDEP(39)|RDEP(40)|RDEP(42)) + +#define PFM_MONT_NO64 (1UL<<5) + +static struct pfm_arch_pmu_info pfm_mont_pmu_info = { + .mask_pmcs = {PFM_MONT_MASK_PMCS,}, +}; + +#define PFM_MONT_RSVD 0xffffffff838000a0UL +/* + * + * For debug registers, writing xBR(y) means we use also xBR(y+1). Hence using + * PMC256+y means we use PMC256+y+1. Yet, we do not have dependency information + * but this is fine because they are handled separately in the IA-64 specific + * code. + * + * For PMC4-PMC15, PMC40: we force pmc.ism=2 (IA-64 mode only) + */ +static struct pfm_regmap_desc pfm_mont_pmc_desc[] = { +/* pmc0 */ PMX_NA, +/* pmc1 */ PMX_NA, +/* pmc2 */ PMX_NA, +/* pmc3 */ PMX_NA, +/* pmc4 */ PMC_D(PFM_REG_W64, "PMC4" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 4), +/* pmc5 */ PMC_D(PFM_REG_W64, "PMC5" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 5), +/* pmc6 */ PMC_D(PFM_REG_W64, "PMC6" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 6), +/* pmc7 */ PMC_D(PFM_REG_W64, "PMC7" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 7), +/* pmc8 */ PMC_D(PFM_REG_W64, "PMC8" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 8), +/* pmc9 */ PMC_D(PFM_REG_W64, "PMC9" , 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 9), +/* pmc10 */ PMC_D(PFM_REG_W64, "PMC10", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 10), +/* pmc11 */ PMC_D(PFM_REG_W64, "PMC11", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 11), +/* pmc12 */ PMC_D(PFM_REG_W64, "PMC12", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 12), +/* pmc13 */ PMC_D(PFM_REG_W64, "PMC13", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 13), +/* pmc14 */ PMC_D(PFM_REG_W64, "PMC14", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 14), +/* pmc15 */ PMC_D(PFM_REG_W64, "PMC15", 0x2000020UL, PFM_MONT_RSVD, PFM_MONT_NO64, 15), +/* pmc16 */ PMX_NA, +/* pmc17 */ PMX_NA, +/* pmc18 */ PMX_NA, +/* pmc19 */ PMX_NA, +/* pmc20 */ PMX_NA, +/* pmc21 */ PMX_NA, +/* pmc22 */ PMX_NA, +/* pmc23 */ PMX_NA, +/* pmc24 */ PMX_NA, +/* pmc25 */ PMX_NA, +/* pmc26 */ PMX_NA, +/* pmc27 */ PMX_NA, +/* pmc28 */ PMX_NA, +/* pmc29 */ PMX_NA, +/* pmc30 */ PMX_NA, +/* pmc31 */ PMX_NA, +/* pmc32 */ PMC_D(PFM_REG_W , "PMC32", 0x30f01ffffffffffUL, 0xfcf0fe0000000000UL, 0, 32), +/* pmc33 */ PMC_D(PFM_REG_W , "PMC33", 0x0, 0xfffffe0000000000UL, 0, 33), +/* pmc34 */ PMC_D(PFM_REG_W , "PMC34", 0xf01ffffffffffUL, 0xfff0fe0000000000UL, 0, 34), +/* pmc35 */ PMC_D(PFM_REG_W , "PMC35", 0x0, 0x1ffffffffffUL, 0, 35), +/* pmc36 */ PMC_D(PFM_REG_W , "PMC36", 0xfffffff0UL, 0xfffffffffffffff0UL, 0, 36), +/* pmc37 */ PMC_D(PFM_REG_W , "PMC37", 0x0, 0xffffffffffffc000UL, 0, 37), +/* pmc38 */ PMC_D(PFM_REG_W , "PMC38", 0xdb6UL, 0xffffffffffffdb6dUL, 0, 38), +/* pmc39 */ PMC_D(PFM_REG_W , "PMC39", 0x0, 0xffffffffffff0030UL, 0, 39), +/* pmc40 */ PMC_D(PFM_REG_W , "PMC40", 0x2000000UL, 0xfffffffffff0fe30UL, 0, 40), +/* pmc41 */ PMC_D(PFM_REG_W , "PMC41", 0x00002078fefefefeUL, 0xfffe1fffe7e7e7e7UL, 0, 41), +/* pmc42 */ PMC_D(PFM_REG_W , "PMC42", 0x0, 0xfff800b0UL, 0, 42), +/* pmc43 */ PMX_NA, PMX_NA, 
PMX_NA, PMX_NA, PMX_NA, +/* pmc48 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc56 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc64 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc72 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc80 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc88 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc96 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc104 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc112 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc120 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc128 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc136 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc144 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc152 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc160 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc168 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc176 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc184 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc192 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc200 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc208 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc216 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc224 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc232 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc240 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc248 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc256 */ PMC_D(PFM_REG_W, "IBR0", 0x0, 0, 0, 0), +/* pmc257 */ PMC_D(PFM_REG_W, "IBR1", 0x0, 0x8000000000000000UL, 0, 1), +/* pmc258 */ PMC_D(PFM_REG_W, "IBR2", 0x0, 0, 0, 2), +/* pmc259 */ PMC_D(PFM_REG_W, "IBR3", 0x0, 0x8000000000000000UL, 0, 3), +/* pmc260 */ PMC_D(PFM_REG_W, "IBR4", 0x0, 0, 0, 4), +/* pmc261 */ PMC_D(PFM_REG_W, "IBR5", 0x0, 0x8000000000000000UL, 0, 5), +/* pmc262 */ PMC_D(PFM_REG_W, "IBR6", 0x0, 0, 0, 6), +/* pmc263 */ PMC_D(PFM_REG_W, "IBR7", 0x0, 0x8000000000000000UL, 0, 7), +/* pmc264 */ PMC_D(PFM_REG_W, "DBR0", 0x0, 0, 0, 0), +/* pmc265 */ PMC_D(PFM_REG_W, "DBR1", 0x0, 0xc000000000000000UL, 0, 1), +/* pmc266 */ PMC_D(PFM_REG_W, "DBR2", 0x0, 0, 0, 2), +/* pmc267 */ PMC_D(PFM_REG_W, "DBR3", 0x0, 0xc000000000000000UL, 0, 3), +/* pmc268 */ PMC_D(PFM_REG_W, "DBR4", 0x0, 0, 0, 4), +/* pmc269 */ PMC_D(PFM_REG_W, "DBR5", 0x0, 0xc000000000000000UL, 0, 5), +/* pmc270 */ PMC_D(PFM_REG_W, "DBR6", 0x0, 0, 0, 6), +/* pmc271 */ PMC_D(PFM_REG_W, "DBR7", 0x0, 0xc000000000000000UL, 0, 7) +}; +#define PFM_MONT_NUM_PMCS ARRAY_SIZE(pfm_mont_pmc_desc) + +static struct pfm_regmap_desc pfm_mont_pmd_desc[] = { +/* pmd0 */ PMX_NA, +/* pmd1 */ PMX_NA, +/* pmd2 */ PMX_NA, +/* pmd3 */ PMX_NA, +/* pmd4 */ PMD_DP(PFM_REG_C, "PMD4", 4, 1ull << 4), +/* pmd5 */ PMD_DP(PFM_REG_C, "PMD5", 5, 1ull << 5), +/* pmd6 */ PMD_DP(PFM_REG_C, "PMD6", 6, 1ull << 6), +/* pmd7 */ PMD_DP(PFM_REG_C, "PMD7", 7, 1ull << 7), +/* pmd8 */ PMD_DP(PFM_REG_C, "PMD8", 8, 1ull << 8), +/* pmd9 */ PMD_DP(PFM_REG_C, 
"PMD9", 9, 1ull << 9), +/* pmd10 */ PMD_DP(PFM_REG_C, "PMD10", 10, 1ull << 10), +/* pmd11 */ PMD_DP(PFM_REG_C, "PMD11", 11, 1ull << 11), +/* pmd12 */ PMD_DP(PFM_REG_C, "PMD12", 12, 1ull << 12), +/* pmd13 */ PMD_DP(PFM_REG_C, "PMD13", 13, 1ull << 13), +/* pmd14 */ PMD_DP(PFM_REG_C, "PMD14", 14, 1ull << 14), +/* pmd15 */ PMD_DP(PFM_REG_C, "PMD15", 15, 1ull << 15), +/* pmd16 */ PMX_NA, +/* pmd17 */ PMX_NA, +/* pmd18 */ PMX_NA, +/* pmd19 */ PMX_NA, +/* pmd20 */ PMX_NA, +/* pmd21 */ PMX_NA, +/* pmd22 */ PMX_NA, +/* pmd23 */ PMX_NA, +/* pmd24 */ PMX_NA, +/* pmd25 */ PMX_NA, +/* pmd26 */ PMX_NA, +/* pmd27 */ PMX_NA, +/* pmd28 */ PMX_NA, +/* pmd29 */ PMX_NA, +/* pmd30 */ PMX_NA, +/* pmd31 */ PMX_NA, +/* pmd32 */ PMD_DP(PFM_REG_I, "PMD32", 32, 1ull << 40), +/* pmd33 */ PMD_DP(PFM_REG_I, "PMD33", 33, 1ull << 40), +/* pmd34 */ PMD_DP(PFM_REG_I, "PMD34", 34, 1ull << 37), +/* pmd35 */ PMD_DP(PFM_REG_I, "PMD35", 35, 1ull << 37), +/* pmd36 */ PMD_DP(PFM_REG_I, "PMD36", 36, 1ull << 40), +/* pmd37 */ PMX_NA, +/* pmd38 */ PMD_DP(PFM_REG_I, "PMD38", 38, (1ull<<39)|(1ull<<42)), +/* pmd39 */ PMD_DP(PFM_REG_I, "PMD39", 39, (1ull<<39)|(1ull<<42)), +/* pmd40 */ PMX_NA, +/* pmd41 */ PMX_NA, +/* pmd42 */ PMX_NA, +/* pmd43 */ PMX_NA, +/* pmd44 */ PMX_NA, +/* pmd45 */ PMX_NA, +/* pmd46 */ PMX_NA, +/* pmd47 */ PMX_NA, +/* pmd48 */ PMD_DP(PFM_REG_I, "PMD48", 48, (1ull<<39)|(1ull<<42)), +/* pmd49 */ PMD_DP(PFM_REG_I, "PMD49", 49, (1ull<<39)|(1ull<<42)), +/* pmd50 */ PMD_DP(PFM_REG_I, "PMD50", 50, (1ull<<39)|(1ull<<42)), +/* pmd51 */ PMD_DP(PFM_REG_I, "PMD51", 51, (1ull<<39)|(1ull<<42)), +/* pmd52 */ PMD_DP(PFM_REG_I, "PMD52", 52, (1ull<<39)|(1ull<<42)), +/* pmd53 */ PMD_DP(PFM_REG_I, "PMD53", 53, (1ull<<39)|(1ull<<42)), +/* pmd54 */ PMD_DP(PFM_REG_I, "PMD54", 54, (1ull<<39)|(1ull<<42)), +/* pmd55 */ PMD_DP(PFM_REG_I, "PMD55", 55, (1ull<<39)|(1ull<<42)), +/* pmd56 */ PMD_DP(PFM_REG_I, "PMD56", 56, (1ull<<39)|(1ull<<42)), +/* pmd57 */ PMD_DP(PFM_REG_I, "PMD57", 57, (1ull<<39)|(1ull<<42)), +/* pmd58 */ PMD_DP(PFM_REG_I, "PMD58", 58, (1ull<<39)|(1ull<<42)), +/* pmd59 */ PMD_DP(PFM_REG_I, "PMD59", 59, (1ull<<39)|(1ull<<42)), +/* pmd60 */ PMD_DP(PFM_REG_I, "PMD60", 60, (1ull<<39)|(1ull<<42)), +/* pmd61 */ PMD_DP(PFM_REG_I, "PMD61", 61, (1ull<<39)|(1ull<<42)), +/* pmd62 */ PMD_DP(PFM_REG_I, "PMD62", 62, (1ull<<39)|(1ull<<42)), +/* pmd63 */ PMD_DP(PFM_REG_I, "PMD63", 63, (1ull<<39)|(1ull<<42)) +}; +#define PFM_MONT_NUM_PMDS ARRAY_SIZE(pfm_mont_pmd_desc) + +static int pfm_mont_has_ht; + +static int pfm_mont_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + u64 val32 = 0, val38 = 0, val41 = 0; + u64 tmpval; + u16 cnum; + int ret = 0, check_case1 = 0; + int is_system; + + tmpval = req->reg_value; + cnum = req->reg_num; + ctx_arch = pfm_ctx_arch(ctx); + is_system = ctx->flags.system; + +#define PFM_MONT_PMC_PM_POS6 (1UL<<6) +#define PFM_MONT_PMC_PM_POS4 (1UL<<4) + + switch (cnum) { + case 4: + case 5: + case 6: + case 7: + case 8: + case 9: + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + if ((req->reg_flags & PFM_REGFL_NO_EMUL64) == 0) { + if (pfm_mont_has_ht) { + PFM_INFO("perfmon: Errata 121 PMD10/PMD15 cannot be used to overflow" + "when threads on on"); + return -EINVAL; + } + } + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + case 39: + case 40: + case 42: + if 
(pfm_mont_has_ht && ((req->reg_value >> 8) & 0x7) == 4) { + PFM_INFO("perfmon: Errata 120: IP-EAR not available when threads are on"); + return -EINVAL; + } + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS6; + else + tmpval &= ~PFM_MONT_PMC_PM_POS6; + break; + + case 32: + val32 = tmpval; + val38 = set->pmcs[38]; + val41 = set->pmcs[41]; + check_case1 = 1; + break; + + case 37: + if (is_system) + tmpval |= PFM_MONT_PMC_PM_POS4; + else + tmpval &= ~PFM_MONT_PMC_PM_POS4; + break; + + case 38: + val38 = tmpval; + val32 = set->pmcs[32]; + val41 = set->pmcs[41]; + check_case1 = 1; + break; + case 41: + val41 = tmpval; + val32 = set->pmcs[32]; + val38 = set->pmcs[38]; + check_case1 = 1; + break; + } + + if (check_case1) { + ret = (((val41 >> 45) & 0xf) == 0 && ((val32>>57) & 0x1) == 0) + && ((((val38>>1) & 0x3) == 0x2 || ((val38>>1) & 0x3) == 0) + || (((val38>>4) & 0x3) == 0x2 || ((val38>>4) & 0x3) == 0)); + if (ret) { + PFM_DBG("perfmon: invalid config pmc38=0x%lx " + "pmc41=0x%lx pmc32=0x%lx", + val38, val41, val32); + return -EINVAL; + } + } + + /* + * check if configuration implicitly activates the use of the + * debug registers. If true, then we ensure that this is possible + * and that we do not pick up stale values in the HW registers. + */ + + /* + * + * pmc41 is "active" if: + * one of the pmc41.cfgdtagXX fields is different from 0x3 + * AND + * the corresponding pmc41.en_dbrpXX is set. + * AND + * ctx_fl_use_dbr (dbr not yet used) + */ + if (cnum == 41 + && (tmpval & 0x1e00000000000) + && (tmpval & 0x18181818) != 0x18181818 + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc41=0x%lx active, clearing dbr", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) + return ret; + } + /* + * we must clear the (instruction) debug registers if: + * pmc38.ig_ibrpX is 0 (enabled) + * and + * fl_use_dbr == 0 (dbr not yet used) + */ + if (cnum == 38 && ((tmpval & 0x492) != 0x492) + && ctx_arch->flags.use_dbr == 0) { + PFM_DBG("pmc38=0x%lx active, clearing ibr", tmpval); + ret = pfm_ia64_mark_dbregs_used(ctx, set); + if (ret) + return ret; + + } + req->reg_value = tmpval; + return 0; +} + +static void pfm_handle_errata(void) +{ + pfm_mont_has_ht = 1; + + PFM_INFO("activating workaround for errata 120 " + "(Disable IP-EAR when threads are on)"); + + PFM_INFO("activating workaround for Errata 121 " + "(PMC10-PMC15 cannot be used to overflow" + " when threads are on)"); +} + +static int pfm_mont_probe_pmu(void) +{ + if (local_cpu_data->family != 0x20) + return -1; + + /* + * the 2 errata must be activated when + * threads are/can be enabled + */ + if (is_multithreading_enabled()) + pfm_handle_errata(); + + return 0; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors!
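+ * (Montecito-specific constraints, including the errata handled above, are enforced at write time by pfm_mont_pmc_check via .pmc_write_check rather than encoded in these tables)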
+ */ +static struct pfm_pmu_config pfm_mont_pmu_conf = { + .pmu_name = "Montecito", + .counter_width = 47, + .pmd_desc = pfm_mont_pmd_desc, + .pmc_desc = pfm_mont_pmc_desc, + .num_pmc_entries = PFM_MONT_NUM_PMCS, + .num_pmd_entries = PFM_MONT_NUM_PMDS, + .pmc_write_check = pfm_mont_pmc_check, + .probe_pmu = pfm_mont_probe_pmu, + .version = "1.0", + .pmu_info = &pfm_mont_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_mont_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_mont_pmu_conf); +} + +static void __exit pfm_mont_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mont_pmu_conf); +} + +module_init(pfm_mont_pmu_init_module); +module_exit(pfm_mont_pmu_cleanup_module); diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 1e06d23..b87f445 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1857,6 +1857,8 @@ config SECCOMP If unsure, say Y. Only embedded should say N here. +source "arch/mips/perfmon/Kconfig" + endmenu config RWSEM_GENERIC_SPINLOCK diff --git a/arch/mips/Makefile b/arch/mips/Makefile index 9aab51c..712acf7 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -154,6 +154,12 @@ endif endif # +# Perfmon support +# + +core-$(CONFIG_PERFMON) += arch/mips/perfmon/ + +# # Firmware support # libs-$(CONFIG_ARC) += arch/mips/fw/arc/ diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 22fc19b..4467361 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -94,6 +95,7 @@ void start_thread(struct pt_regs * regs, unsigned long pc, unsigned long sp) void exit_thread(void) { + pfm_exit_thread(); } void flush_thread(void) @@ -162,6 +164,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, if (clone_flags & CLONE_SETTLS) ti->tp_value = regs->regs[7]; + pfm_copy_thread(p); + return 0; } diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 5e75a31..e96ddd6 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -653,6 +653,18 @@ einval: li v0, -EINVAL sys sys_dup3 3 sys sys_pipe2 2 sys sys_inotify_init1 1 + sys sys_pfm_create_context 4 /* 4330 */ + sys sys_pfm_write_pmcs 3 + sys sys_pfm_write_pmds 4 + sys sys_pfm_read_pmds 3 + sys sys_pfm_load_context 2 + sys sys_pfm_start 2 /* 4335 */ + sys sys_pfm_stop 1 + sys sys_pfm_restart 1 + sys sys_pfm_create_evtsets 3 + sys sys_pfm_getinfo_evtsets 3 + sys sys_pfm_delete_evtsets 3 /* 4340 */ + sys sys_pfm_unload_context 1 .endm /* We pre-compute the number of _instruction_ bytes needed to diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index 3d58204..adb2ba9 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -487,4 +487,16 @@ sys_call_table: PTR sys_dup3 PTR sys_pipe2 PTR sys_inotify_init1 + PTR sys_pfm_create_context + PTR sys_pfm_write_pmcs /* 5290 */ + PTR sys_pfm_write_pmds + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start + PTR sys_pfm_stop /* 5295 */ + PTR sys_pfm_restart + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets + PTR sys_pfm_unload_context /* 5300 */ .size sys_call_table,.-sys_call_table diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index da7f1b6..6d12095 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -400,12 +400,12 @@ EXPORT(sysn32_call_table) PTR sys_ioprio_set PTR sys_ioprio_get PTR 
compat_sys_utimensat - PTR compat_sys_signalfd /* 5280 */ + PTR compat_sys_signalfd /* 6280 */ PTR sys_ni_syscall PTR sys_eventfd PTR sys_fallocate PTR sys_timerfd_create - PTR sys_timerfd_gettime /* 5285 */ + PTR sys_timerfd_gettime /* 6285 */ PTR sys_timerfd_settime PTR sys_signalfd4 PTR sys_eventfd2 @@ -413,4 +413,16 @@ EXPORT(sysn32_call_table) PTR sys_dup3 /* 5290 */ PTR sys_pipe2 PTR sys_inotify_init1 + PTR sys_pfm_create_context + PTR sys_pfm_write_pmcs + PTR sys_pfm_write_pmds /* 6295 */ + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start + PTR sys_pfm_stop + PTR sys_pfm_restart /* 6300 */ + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets + PTR sys_pfm_unload_context .size sysn32_call_table,.-sysn32_call_table diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index d7cd1aa..e77f55a 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -535,4 +535,16 @@ sys_call_table: PTR sys_dup3 PTR sys_pipe2 PTR sys_inotify_init1 + PTR sys_pfm_create_context /* 4330 */ + PTR sys_pfm_write_pmcs + PTR sys_pfm_write_pmds + PTR sys_pfm_read_pmds + PTR sys_pfm_load_context + PTR sys_pfm_start /* 4335 */ + PTR sys_pfm_stop + PTR sys_pfm_restart + PTR sys_pfm_create_evtsets + PTR sys_pfm_getinfo_evtsets + PTR sys_pfm_delete_evtsets /* 4340 */ + PTR sys_pfm_unload_context .size sys_call_table,.-sys_call_table diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index a4e106c..6a7e60c 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -694,8 +695,11 @@ static void do_signal(struct pt_regs *regs) * - triggered by the TIF_WORK_MASK flags */ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, - __u32 thread_info_flags) + __u32 thread_info_flags) { + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs); diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index 1f467d5..163dfe4 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -49,10 +49,11 @@ int update_persistent_clock(struct timespec now) return rtc_mips_set_mmss(now.tv_sec); } -static int null_perf_irq(void) +int null_perf_irq(void) { return 0; } +EXPORT_SYMBOL(null_perf_irq); int (*perf_irq)(void) = null_perf_irq; diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index b602ac6..9cbd75f 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -92,17 +92,15 @@ static void show_raw_backtrace(unsigned long reg29) #ifdef CONFIG_KALLSYMS printk("\n"); #endif - while (!kstack_end(sp)) { - unsigned long __user *p = - (unsigned long __user *)(unsigned long)sp++; - if (__get_user(addr, p)) { - printk(" (Bad stack address)"); - break; +#define IS_KVA01(a) ((((unsigned long)a) & 0xc0000000) == 0x80000000) + if (IS_KVA01(sp)) { + while (!kstack_end(sp)) { + addr = *sp++; + if (__kernel_text_address(addr)) + print_ip_sym(addr); } - if (__kernel_text_address(addr)) - print_ip_sym(addr); + printk("\n"); } - printk("\n"); } #ifdef CONFIG_KALLSYMS diff --git a/arch/mips/mti-malta/malta-time.c b/arch/mips/mti-malta/malta-time.c index 0b97d47..d8f36b5 100644 --- a/arch/mips/mti-malta/malta-time.c +++ b/arch/mips/mti-malta/malta-time.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git 
a/arch/mips/perfmon/Kconfig b/arch/mips/perfmon/Kconfig new file mode 100644 index 0000000..b426eea --- /dev/null +++ b/arch/mips/perfmon/Kconfig @@ -0,0 +1,53 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support. + +config PERFMON_DEBUG_FS + bool "Enable perfmon statistics reporting via debugfs" + default y + depends on PERFMON && DEBUG_FS + help + Enable collection and reporting of perfmon timing statistics under + debugfs. This is used for debugging and performance analysis of the + subsystem. The debugfs filesystem must be mounted. + +config PERFMON_FLUSH + bool "Flush sampling buffer when modified" + depends on PERFMON + default n + help + On some MIPS models, cache aliasing may cause invalid + data to be read from the perfmon sampling buffer. Use this option + to flush the buffer when it is modified to ensure valid data is + visible at the user level. + +config PERFMON_ALIGN + bool "Align sampling buffer to avoid cache aliasing" + depends on PERFMON + default n + help + On some MIPS models, cache aliasing may cause invalid + data to be read from the perfmon sampling buffer. By forcing a bigger + page alignment (4-page), one can guarantee the buffer virtual address + will conflict in the cache with the user level mapping of the buffer + thereby ensuring a consistent view by user programs. + +config PERFMON_MIPS64 + tristate "Support for MIPS64 hardware performance counters" + depends on PERFMON + default n + help + Enables support for the MIPS64 hardware performance counters +endmenu diff --git a/arch/mips/perfmon/Makefile b/arch/mips/perfmon/Makefile new file mode 100644 index 0000000..153b83f --- /dev/null +++ b/arch/mips/perfmon/Makefile @@ -0,0 +1,2 @@ +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_PERFMON_MIPS64) += perfmon_mips64.o diff --git a/arch/mips/perfmon/perfmon.c b/arch/mips/perfmon/perfmon.c new file mode 100644 index 0000000..6615a77 --- /dev/null +++ b/arch/mips/perfmon/perfmon.c @@ -0,0 +1,313 @@ +/* + * This file implements the MIPS64 specific + * support for the perfmon2 interface + * + * Copyright (c) 2005 Philip J. Mucci + * + * based on versions for other architectures: + * Copyright (c) 2005 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +/* + * collect pending overflowed PMDs. Called from pfm_ctxsw() + * and from PMU interrupt handler.
Must fill in set->povfl_pmds[] + * and set->npend_ovfls. Interrupts are masked + */ +static void __pfm_get_ovfl_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 new_val, wmask; + u64 *used_mask, *intr_pmds; + u64 mask[PFM_PMD_BV]; + unsigned int i, max; + + max = ctx->regs.max_intr_pmd; + intr_pmds = ctx->regs.intr_pmds; + used_mask = set->used_pmds; + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + bitmap_and(cast_ulp(mask), + cast_ulp(intr_pmds), + cast_ulp(used_mask), + max); + + /* + * check all PMD that can generate interrupts + * (that includes counters) + */ + for (i = 0; i < max; i++) { + if (test_bit(i, mask)) { + new_val = pfm_arch_read_pmd(ctx, i); + + PFM_DBG_ovfl("pmd%u new_val=0x%llx bit=%d\n", + i, (unsigned long long)new_val, + (new_val&wmask) ? 1 : 0); + + if (new_val & wmask) { + __set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_stop_active(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max; + + max = ctx->regs.max_pmc; + + /* + * clear enable bits, assume all pmcs are enable pmcs + */ + for (i = 0; i < max; i++) { + if (test_bit(i, set->used_pmcs)) + pfm_arch_write_pmc(ctx, i, 0); + } + + if (set->npend_ovfls) + return; + + __pfm_get_ovfl_pmds(ctx, set); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring is active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * for per-thread: + * must stop monitoring for the task + * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) +{ + /* + * disable lazy restore of PMC registers. + */ + ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + /* + * if masked, monitoring is stopped, thus there is no + * need to stop the PMU again and there is no need to + * check for pending overflows. This is not just an + * optimization, this is also for correctness as you + * may end up detecting overflows twice. + */ + if (ctx->state == PFM_CTX_MASKED) + return 1; + + pfm_stop_active(task, ctx, ctx->active_set); + + return 1; +} + +/* + * Called from pfm_stop() and pfm_ctxsw() + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED) + return; + + /* + * stop live registers and collect pending overflow + */ + if (task == current) + pfm_stop_active(task, ctx, ctx->active_set); +} + +/* + * called from pfm_start() or pfm_ctxsw() when idle task and + * EXCL_IDLE is on. + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-trhead: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. Access to PMU is not guaranteed. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. 
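+ * (here this simply rewrites every used PMC from the active set, as done below; nothing is done when task is not current)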
+ */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_event_set *set; + unsigned int i, max_pmc; + + if (task != current) + return; + + set = ctx->active_set; + max_pmc = ctx->regs.max_pmc; + + for (i = 0; i < max_pmc; i++) { + if (test_bit(i, set->used_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 ovfl_mask, val; + u64 *impl_pmds; + unsigned int i; + unsigned int max_pmd; + + max_pmd = ctx->regs.max_pmd; + ovfl_mask = pfm_pmu_conf->ovfl_mask; + impl_pmds = ctx->regs.pmds; + + /* + * must restore all pmds to avoid leaking + * information to user. + */ + for (i = 0; i < max_pmd; i++) { + + if (test_bit(i, impl_pmds) == 0) + continue; + + val = set->pmds[i].value; + + /* + * set upper bits for counter to ensure + * overflow will trigger + */ + val &= ovfl_mask; + + pfm_arch_write_pmd(ctx, i, val); + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(). + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set, if needed. + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 *impl_pmcs; + unsigned int i, max_pmc; + + max_pmc = ctx->regs.max_pmc; + impl_pmcs = ctx->regs.pmcs; + + /* + * - by default no PMCS measures anything + * - on ctxswout, all used PMCs are disabled (cccr enable bit cleared) + * hence when masked we do not need to restore anything + */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + /* + * restore all pmcs + */ + for (i = 0; i < max_pmc; i++) + if (test_bit(i, impl_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); +} + +char *pfm_arch_get_pmu_module_name(void) +{ + switch (cpu_data->cputype) { +#ifndef CONFIG_SMP + case CPU_34K: +#if defined(CPU_74K) + case CPU_74K: +#endif +#endif + case CPU_SB1: + case CPU_SB1A: + case CPU_R12000: + case CPU_25KF: + case CPU_24K: + case CPU_20KC: + case CPU_5KC: + return "perfmon_mips64"; + default: + return NULL; + } + return NULL; +} + +int perfmon_perf_irq(void) +{ + /* BLATANTLY STOLEN FROM OPROFILE, then modified */ + struct pt_regs *regs; + unsigned int counters = pfm_pmu_conf->regs_all.max_pmc; + unsigned int control; + unsigned int counter; + + regs = get_irq_regs(); + switch (counters) { +#define HANDLE_COUNTER(n) \ + case n + 1: \ + control = read_c0_perfctrl ## n(); \ + counter = read_c0_perfcntr ## n(); \ + if ((control & MIPS64_PMC_INT_ENABLE_MASK) && \ + (counter & MIPS64_PMD_INTERRUPT)) { \ + pfm_interrupt_handler(instruction_pointer(regs),\ + regs); \ + return(1); \ + } + HANDLE_COUNTER(3) + HANDLE_COUNTER(2) + HANDLE_COUNTER(1) + HANDLE_COUNTER(0) + } + + return 0; +} +EXPORT_SYMBOL(perfmon_perf_irq); diff --git a/arch/mips/perfmon/perfmon_mips64.c b/arch/mips/perfmon/perfmon_mips64.c new file mode 100644 index 0000000..78cb43d --- /dev/null +++ b/arch/mips/perfmon/perfmon_mips64.c @@ -0,0 +1,218 @@ +/* + * This file contains the MIPS64 and decendent PMU register description tables + * and pmc checker used by perfmon.c. 
+ * + * Copyright (c) 2005 Philip Mucci + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Philip Mucci "); +MODULE_DESCRIPTION("MIPS64 PMU description tables"); +MODULE_LICENSE("GPL"); + +/* + * reserved: + * - bit 63-9 + * RSVD: reserved bits must be 1 + */ +#define PFM_MIPS64_PMC_RSVD 0xfffffffffffff810ULL +#define PFM_MIPS64_PMC_VAL (1ULL<<4) + +extern int null_perf_irq(struct pt_regs *regs); +extern int (*perf_irq)(struct pt_regs *regs); +extern int perfmon_perf_irq(struct pt_regs *regs); + +static struct pfm_arch_pmu_info pfm_mips64_pmu_info; + +static struct pfm_regmap_desc pfm_mips64_pmc_desc[] = { +/* pmc0 */ PMC_D(PFM_REG_I64, "CP0_25_0", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 0), +/* pmc1 */ PMC_D(PFM_REG_I64, "CP0_25_1", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 1), +/* pmc2 */ PMC_D(PFM_REG_I64, "CP0_25_2", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 2), +/* pmc3 */ PMC_D(PFM_REG_I64, "CP0_25_3", PFM_MIPS64_PMC_VAL, PFM_MIPS64_PMC_RSVD, 0, 3) +}; +#define PFM_MIPS64_NUM_PMCS ARRAY_SIZE(pfm_mips64_pmc_desc) + +static struct pfm_regmap_desc pfm_mips64_pmd_desc[] = { +/* pmd0 */ PMD_D(PFM_REG_C, "CP0_25_0", 0), +/* pmd1 */ PMD_D(PFM_REG_C, "CP0_25_1", 1), +/* pmd2 */ PMD_D(PFM_REG_C, "CP0_25_2", 2), +/* pmd3 */ PMD_D(PFM_REG_C, "CP0_25_3", 3) +}; +#define PFM_MIPS64_NUM_PMDS ARRAY_SIZE(pfm_mips64_pmd_desc) + +static int pfm_mips64_probe_pmu(void) +{ + struct cpuinfo_mips *c = ¤t_cpu_data; + + switch (c->cputype) { +#ifndef CONFIG_SMP + case CPU_34K: +#if defined(CPU_74K) + case CPU_74K: +#endif +#endif + case CPU_SB1: + case CPU_SB1A: + case CPU_R12000: + case CPU_25KF: + case CPU_24K: + case CPU_20KC: + case CPU_5KC: + return 0; + break; + default: + PFM_INFO("Unknown cputype 0x%x", c->cputype); + } + return -1; +} + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
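+ *
+ * The init routine below probes the number of usable counters (n_counters(),
+ * or a hard-coded value for R10000/R12000/R14000), marks the remaining
+ * pmc/pmd descriptors PFM_REG_NA, and selects the reserved-bit mask that
+ * matches the event-field width of the detected cputype.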
+ */ +static struct pfm_pmu_config pfm_mips64_pmu_conf = { + .pmu_name = "MIPS", /* placeholder */ + .counter_width = 31, + .pmd_desc = pfm_mips64_pmd_desc, + .pmc_desc = pfm_mips64_pmc_desc, + .num_pmc_entries = PFM_MIPS64_NUM_PMCS, + .num_pmd_entries = PFM_MIPS64_NUM_PMDS, + .probe_pmu = pfm_mips64_probe_pmu, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmu_info = &pfm_mips64_pmu_info +}; + +static inline int n_counters(void) +{ + if (!(read_c0_config1() & MIPS64_CONFIG_PMC_MASK)) + return 0; + if (!(read_c0_perfctrl0() & MIPS64_PMC_CTR_MASK)) + return 1; + if (!(read_c0_perfctrl1() & MIPS64_PMC_CTR_MASK)) + return 2; + if (!(read_c0_perfctrl2() & MIPS64_PMC_CTR_MASK)) + return 3; + return 4; +} + +static int __init pfm_mips64_pmu_init_module(void) +{ + struct cpuinfo_mips *c = ¤t_cpu_data; + int i, ret, num; + u64 temp_mask; + + switch (c->cputype) { + case CPU_5KC: + pfm_mips64_pmu_conf.pmu_name = "MIPS5KC"; + break; + case CPU_R12000: + pfm_mips64_pmu_conf.pmu_name = "MIPSR12000"; + break; + case CPU_20KC: + pfm_mips64_pmu_conf.pmu_name = "MIPS20KC"; + break; + case CPU_24K: + pfm_mips64_pmu_conf.pmu_name = "MIPS24K"; + break; + case CPU_25KF: + pfm_mips64_pmu_conf.pmu_name = "MIPS25KF"; + break; + case CPU_SB1: + pfm_mips64_pmu_conf.pmu_name = "SB1"; + break; + case CPU_SB1A: + pfm_mips64_pmu_conf.pmu_name = "SB1A"; + break; +#ifndef CONFIG_SMP + case CPU_34K: + pfm_mips64_pmu_conf.pmu_name = "MIPS34K"; + break; +#if defined(CPU_74K) + case CPU_74K: + pfm_mips64_pmu_conf.pmu_name = "MIPS74K"; + break; +#endif +#endif + default: + PFM_INFO("Unknown cputype 0x%x", c->cputype); + return -1; + } + + /* The R14k and older performance counters have to */ + /* be hard-coded, as there is no support for auto-detection */ + if ((c->cputype == CPU_R12000) || (c->cputype == CPU_R14000)) + num = 4; + else if (c->cputype == CPU_R10000) + num = 2; + else + num = n_counters(); + + if (num == 0) { + PFM_INFO("cputype 0x%x has no counters", c->cputype); + return -1; + } + /* mark remaining counters unavailable */ + for (i = num; i < PFM_MIPS64_NUM_PMCS; i++) + pfm_mips64_pmc_desc[i].type = PFM_REG_NA; + + for (i = num; i < PFM_MIPS64_NUM_PMDS; i++) + pfm_mips64_pmd_desc[i].type = PFM_REG_NA; + + /* set the PMC_RSVD mask */ + switch (c->cputype) { + case CPU_5KC: + case CPU_R10000: + case CPU_20KC: + /* 4-bits for event */ + temp_mask = 0xfffffffffffffe10ULL; + break; + case CPU_R12000: + case CPU_R14000: + /* 5-bits for event */ + temp_mask = 0xfffffffffffffc10ULL; + break; + default: + /* 6-bits for event */ + temp_mask = 0xfffffffffffff810ULL; + } + for (i = 0; i < PFM_MIPS64_NUM_PMCS; i++) + pfm_mips64_pmc_desc[i].rsvd_msk = temp_mask; + + pfm_mips64_pmu_conf.num_pmc_entries = num; + pfm_mips64_pmu_conf.num_pmd_entries = num; + + pfm_mips64_pmu_info.pmu_style = c->cputype; + + ret = pfm_pmu_register(&pfm_mips64_pmu_conf); + if (ret == 0) + perf_irq = perfmon_perf_irq; + return ret; +} + +static void __exit pfm_mips64_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_mips64_pmu_conf); + perf_irq = null_perf_irq; +} + +module_init(pfm_mips64_pmu_init_module); +module_exit(pfm_mips64_pmu_cleanup_module); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 587da5e..a411389 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -230,6 +230,8 @@ source "init/Kconfig" source "arch/powerpc/sysdev/Kconfig" source "arch/powerpc/platforms/Kconfig" +source "arch/powerpc/perfmon/Kconfig" + menu "Kernel options" config HIGHMEM diff --git a/arch/powerpc/Makefile 
b/arch/powerpc/Makefile index c6be19e..7ea20cb 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -146,6 +146,7 @@ core-y += arch/powerpc/kernel/ \ arch/powerpc/platforms/ core-$(CONFIG_MATH_EMULATION) += arch/powerpc/math-emu/ core-$(CONFIG_XMON) += arch/powerpc/xmon/ +core-$(CONFIG_PERFMON) += arch/powerpc/perfmon/ core-$(CONFIG_KVM) += arch/powerpc/kvm/ drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/ diff --git a/arch/powerpc/include/asm/Kbuild b/arch/powerpc/include/asm/Kbuild index 5ab7d7f..88cb533 100644 --- a/arch/powerpc/include/asm/Kbuild +++ b/arch/powerpc/include/asm/Kbuild @@ -21,6 +21,7 @@ header-y += resource.h header-y += sigcontext.h header-y += statfs.h header-y += ps3fb.h +header-y += perfmon.h unifdef-y += bootx.h unifdef-y += byteorder.h diff --git a/arch/powerpc/include/asm/cell-pmu.h b/arch/powerpc/include/asm/cell-pmu.h index 8066eed..981db26 100644 --- a/arch/powerpc/include/asm/cell-pmu.h +++ b/arch/powerpc/include/asm/cell-pmu.h @@ -61,6 +61,11 @@ /* Macros for the pm_status register. */ #define CBE_PM_CTR_OVERFLOW_INTR(ctr) (1 << (31 - ((ctr) & 7))) +#define CBE_PM_OVERFLOW_CTRS(pm_status) (((pm_status) >> 24) & 0xff) +#define CBE_PM_ALL_OVERFLOW_INTR 0xff000000 +#define CBE_PM_INTERVAL_INTR 0x00800000 +#define CBE_PM_TRACE_BUFFER_FULL_INTR 0x00400000 +#define CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR 0x00200000 enum pm_reg_name { group_control, diff --git a/arch/powerpc/include/asm/cell-regs.h b/arch/powerpc/include/asm/cell-regs.h index fd6fd00..580786d 100644 --- a/arch/powerpc/include/asm/cell-regs.h +++ b/arch/powerpc/include/asm/cell-regs.h @@ -117,8 +117,9 @@ struct cbe_pmd_regs { u8 pad_0x0c1c_0x0c20 [4]; /* 0x0c1c */ #define CBE_PMD_FIR_MODE_M8 0x00800 u64 fir_enable_mask; /* 0x0c20 */ - - u8 pad_0x0c28_0x0ca8 [0x0ca8 - 0x0c28]; /* 0x0c28 */ + u8 pad_0x0c28_0x0c98 [0x0c98 - 0x0c28]; /* 0x0c28 */ + u64 on_ramp_trace; /* 0x0c98 */ + u64 pad_0x0ca0; /* 0x0ca0 */ u64 ras_esc_0; /* 0x0ca8 */ u8 pad_0x0cb0_0x1000 [0x1000 - 0x0cb0]; /* 0x0cb0 */ }; @@ -218,7 +219,11 @@ extern struct cbe_iic_regs __iomem *cbe_get_cpu_iic_regs(int cpu); struct cbe_mic_tm_regs { - u8 pad_0x0000_0x0040[0x0040 - 0x0000]; /* 0x0000 */ + u8 pad_0x0000_0x0010[0x0010 - 0x0000]; /* 0x0000 */ + + u64 MBL_debug; /* 0x0010 */ + + u8 pad_0x0018_0x0040[0x0040 - 0x0018]; /* 0x0018 */ u64 mic_ctl_cnfg2; /* 0x0040 */ #define CBE_MIC_ENABLE_AUX_TRC 0x8000000000000000LL @@ -303,6 +308,25 @@ struct cbe_mic_tm_regs { extern struct cbe_mic_tm_regs __iomem *cbe_get_mic_tm_regs(struct device_node *np); extern struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu); +/* + * + * PPE Privileged MMIO Registers definition. 
(offset 0x500000 - 0x500fff) + * + */ +struct cbe_ppe_priv_regs { + u8 pad_0x0000_0x0858[0x0858 - 0x0000]; /* 0x0000 */ + + u64 L2_debug1; /* 0x0858 */ + + u8 pad_0x0860_0x0958[0x0958 - 0x0860]; /* 0x0860 */ + + u64 ciu_dr1; /* 0x0958 */ + + u8 pad_0x0960_0x1000[0x1000 - 0x0960]; /* 0x0960 */ +}; + +extern struct cbe_ppe_priv_regs __iomem *cbe_get_cpu_ppe_priv_regs(int cpu); + /* some utility functions to deal with SMT */ extern u32 cbe_get_hw_thread_id(int cpu); extern u32 cbe_cpu_to_node(int cpu); diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6493a39..ba9ead4 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -97,6 +97,10 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ +#ifdef CONFIG_PERFMON + u8 pmu_except_pending; /* PMU exception occurred while soft + * disabled */ +#endif /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/include/asm/perfmon.h b/arch/powerpc/include/asm/perfmon.h new file mode 100644 index 0000000..da0ae3b --- /dev/null +++ b/arch/powerpc/include/asm/perfmon.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains powerpc specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_POWERPC_PERFMON_H_ +#define _ASM_POWERPC_PERFMON_H_ + +/* + * arch-specific user visible interface definitions + */ +#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */ +#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */ + +#endif /* _ASM_POWERPC_PERFMON_H_ */ diff --git a/arch/powerpc/include/asm/perfmon_kern.h b/arch/powerpc/include/asm/perfmon_kern.h new file mode 100644 index 0000000..65ec984 --- /dev/null +++ b/arch/powerpc/include/asm/perfmon_kern.h @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on other versions: + * Copyright (c) 2005 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains powerpc specific definitions for the perfmon + * interface. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_POWERPC_PERFMON_KERN_H_ +#define _ASM_POWERPC_PERFMON_KERN_H_ + +#ifdef __KERNEL__ + +#ifdef CONFIG_PERFMON + +#include +#include + +#define HID0_PMC5_6_GR_MODE (1UL << (63 - 40)) + +enum powerpc_pmu_type { + PFM_POWERPC_PMU_NONE, + PFM_POWERPC_PMU_604, + PFM_POWERPC_PMU_604e, + PFM_POWERPC_PMU_750, /* XXX: Minor event set diffs between IBM and Moto. */ + PFM_POWERPC_PMU_7400, + PFM_POWERPC_PMU_7450, + PFM_POWERPC_PMU_POWER4, + PFM_POWERPC_PMU_POWER5, + PFM_POWERPC_PMU_POWER5p, + PFM_POWERPC_PMU_POWER6, + PFM_POWERPC_PMU_CELL, +}; + +struct pfm_arch_pmu_info { + enum powerpc_pmu_type pmu_style; + + void (*write_pmc)(unsigned int cnum, u64 value); + void (*write_pmd)(unsigned int cnum, u64 value); + + u64 (*read_pmd)(unsigned int cnum); + + void (*enable_counters)(struct pfm_context *ctx, + struct pfm_event_set *set); + void (*disable_counters)(struct pfm_context *ctx, + struct pfm_event_set *set); + + void (*irq_handler)(struct pt_regs *regs, struct pfm_context *ctx); + void (*get_ovfl_pmds)(struct pfm_context *ctx, + struct pfm_event_set *set); + + /* The following routines are optional. */ + void (*restore_pmcs)(struct pfm_context *ctx, + struct pfm_event_set *set); + void (*restore_pmds)(struct pfm_context *ctx, + struct pfm_event_set *set); + + int (*ctxswout_thread)(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + void (*ctxswin_thread)(struct task_struct *task, + struct pfm_context *ctx, + struct pfm_event_set *set); + int (*load_context)(struct pfm_context *ctx); + void (*unload_context)(struct pfm_context *ctx); + int (*acquire_pmu)(u64 *unavail_pmcs, u64 *unavail_pmds); + void (*release_pmu)(void); + void *platform_info; + void (*resend_irq)(struct pfm_context *ctx); +}; + +#ifdef CONFIG_PPC32 +#define PFM_ARCH_PMD_STK_ARG 6 /* conservative value */ +#define PFM_ARCH_PMC_STK_ARG 6 /* conservative value */ +#else +#define PFM_ARCH_PMD_STK_ARG 8 /* conservative value */ +#define PFM_ARCH_PMC_STK_ARG 8 /* conservative value */ +#endif + +static inline void pfm_arch_resend_irq(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + arch_info->resend_irq(ctx); +} + +static inline void pfm_arch_serialize(void) +{} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, + u64 value) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + BUG_ON(!arch_info->write_pmc); + + arch_info->write_pmc(cnum, value); +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + + value &= pfm_pmu_conf->ovfl_mask; + + BUG_ON(!arch_info->write_pmd); + + arch_info->write_pmd(cnum, value); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + + BUG_ON(!arch_info->read_pmd); + + return arch_info->read_pmd(cnum); +} + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. 
On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, + unsigned int cnum) +{ + u64 val = pfm_arch_read_pmd(ctx, cnum); + + /* This masks out overflow bit 31 */ + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function must be used to determine by + * an arch-specific mean if monitoring is actually started/stopped. + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx) +{} + +void pfm_arch_init_percpu(void); +int pfm_arch_is_monitoring_active(struct pfm_context *ctx); +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, struct pfm_event_set *set); +int pfm_arch_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set); +char *pfm_arch_get_pmu_module_name(void); +/* + * called from __pfm_interrupt_handler(). ctx is not NULL. + * ctx is locked. PMU interrupt is masked. + * + * must stop all monitoring to ensure handler has consistent view. + * must collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. + */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, struct pfm_event_set *set) +{ + pfm_arch_stop(current, ctx); +} + +void powerpc_irq_handler(struct pt_regs *regs); + +/* + * unfreeze PMU from pfm_do_interrupt_handler() + * ctx may be NULL for spurious + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + + if (!ctx) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + arch_info = pfm_pmu_info(); + BUG_ON(!arch_info->enable_counters); + arch_info->enable_counters(ctx, ctx->active_set); +} + +/* + * PowerPC does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus + * this routine needs to do it when switching sets on overflow + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On PPC, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. 
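+ *
+ * Unmasking, by contrast, simply restarts monitoring: see
+ * pfm_arch_unmask_monitoring() below, which calls pfm_arch_start().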
+ */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * Simply need to start the context in order to unmask. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx); +} + + +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} + +static inline int pfm_arch_context_create(struct pfm_context *ctx, + u32 ctx_flags) +{ + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + +/* not necessary on PowerPC */ +static inline void pfm_cacheflush(void *addr, unsigned int len) +{} + +/* + * function called from pfm_setfl_sane(). Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline int pfm_arch_load_context(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + int rc = 0; + + arch_info = pfm_pmu_info(); + if (arch_info->load_context) + rc = arch_info->load_context(ctx); + + return rc; +} + +static inline void pfm_arch_unload_context(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + if (arch_info->unload_context) + arch_info->unload_context(ctx); +} + +static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) +{ + struct pfm_arch_pmu_info *arch_info; + int rc = 0; + + arch_info = pfm_pmu_info(); + if (arch_info->acquire_pmu) { + rc = arch_info->acquire_pmu(unavail_pmcs, unavail_pmds); + if (rc) + return rc; + } + + return reserve_pmc_hardware(powerpc_irq_handler); +} + +static inline void pfm_arch_pmu_release(void) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + if (arch_info->release_pmu) + arch_info->release_pmu(); + + release_pmc_hardware(); +} + +static inline void pfm_arch_arm_handle_work(struct task_struct *task) +{} + +static inline void pfm_arch_disarm_handle_work(struct task_struct *task) +{} + +static inline int pfm_arch_get_base_syscall(void) +{ + return __NR_pfm_create_context; +} + +struct pfm_arch_context { + /* Cell: Most recent value of the pm_status + * register read by the interrupt handler. 
+ * + * Interrupt handler sets last_read_updated if it + * just read and updated last_read_pm_status + */ + u32 last_read_pm_status; + u32 last_read_updated; + u64 powergs_pmc5, powergs_pmc6; + u64 delta_tb, delta_tb_start; + u64 delta_purr, delta_purr_start; +}; + +#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) +/* + * PowerPC does not need extra alignment requirements for the sampling buffer + */ +#define PFM_ARCH_SMPL_ALIGN_SIZE 0 + +#endif /* CONFIG_PERFMON */ + +#endif /* __KERNEL__ */ +#endif /* _ASM_POWERPC_PERFMON_KERN_H_ */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index c6d1ab6..a9f3ad0 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -698,6 +698,7 @@ #define PV_POWER5 0x003A #define PV_POWER5p 0x003B #define PV_970FX 0x003C +#define PV_POWER6 0x003E #define PV_630 0x0040 #define PV_630p 0x0041 #define PV_970MP 0x0044 diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index f6cc7a4..0164841 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,3 +322,15 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) +SYSCALL(pfm_create_context) +SYSCALL(pfm_write_pmcs) +SYSCALL(pfm_write_pmds) +SYSCALL(pfm_read_pmds) +SYSCALL(pfm_load_context) +SYSCALL(pfm_start) +SYSCALL(pfm_stop) +SYSCALL(pfm_restart) +SYSCALL(pfm_create_evtsets) +SYSCALL(pfm_getinfo_evtsets) +SYSCALL(pfm_delete_evtsets) +SYSCALL(pfm_unload_context) diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 9665a26..6cda9f9 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -130,10 +130,12 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_FREEZE (1< 0x10000 on 4xx/Book-E since it include MSR_CE. 
*/ #if MSR_KERNEL >= 0x10000 -#define LOAD_MSR_KERNEL(r, x) lis r,(x)@h; ori r,r,(x)@l +#define LOAD_MSR_KERNEL(r, x) lis r,(x)@ha; ori r,r,(x)@l #else #define LOAD_MSR_KERNEL(r, x) li r,(x) #endif diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 2d802e9..77a090d 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -643,6 +643,10 @@ user_work: b .ret_from_except_lite 1: bl .save_nvgprs +#ifdef CONFIG_PERFMON + addi r3,r1,STACK_FRAME_OVERHEAD + bl .pfm_handle_work +#endif /* CONFIG_PERFMON */ addi r3,r1,STACK_FRAME_OVERHEAD bl .do_signal b .ret_from_except diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index d972dec..b255fba 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,6 +104,24 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } +#ifdef CONFIG_PERFMON +static inline unsigned long get_pmu_except_pending(void) +{ + unsigned long pending; + + __asm__ __volatile__("lbz %0,%1(13)" + : "=r" (pending) : "i" (offsetof(struct paca_struct, pmu_except_pending))); + + return pending; +} + +static inline void set_pmu_except_pending(unsigned long pending) +{ + __asm__ __volatile__("stb %0,%1(13)" + : : "r" (pending), "i" (offsetof(struct paca_struct, pmu_except_pending))); +} +#endif /* CONFIG_PERFMON */ + notrace void raw_local_irq_restore(unsigned long en) { /* @@ -162,6 +180,19 @@ notrace void raw_local_irq_restore(unsigned long en) lv1_get_version_info(&tmp); } +#ifdef CONFIG_PERFMON + /* + * If a PMU exception occurred while interrupts were soft disabled, + * force a PMU exception. + */ + if (get_pmu_except_pending()) { + set_pmu_except_pending(0); + /* Make sure we trigger the edge detection circuitry */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO); + } +#endif /* CONFIG_PERFMON */ + __hard_irq_enable(); } EXPORT_SYMBOL(raw_local_irq_restore); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 957bded..32dbc8e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -393,9 +394,14 @@ struct task_struct *__switch_to(struct task_struct *prev, new_thread->start_tb = current_tb; } #endif - local_irq_save(flags); + if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) + pfm_ctxsw_out(prev, new); + + if (test_tsk_thread_flag(new, TIF_PERFMON_CTXSW)) + pfm_ctxsw_in(prev, new); + account_system_vtime(current); account_process_vtime(current); calculate_steal_time(); @@ -544,6 +550,7 @@ void show_regs(struct pt_regs * regs) void exit_thread(void) { discard_lazy_cpu_state(); + pfm_exit_thread(); } void flush_thread(void) @@ -669,6 +676,7 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long usp, #else kregs->nip = (unsigned long)ret_from_fork; #endif + pfm_copy_thread(p); return 0; } diff --git a/arch/powerpc/perfmon/Kconfig b/arch/powerpc/perfmon/Kconfig new file mode 100644 index 0000000..3f4bbf2 --- /dev/null +++ b/arch/powerpc/perfmon/Kconfig @@ -0,0 +1,67 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. 
+ +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_DEBUG_FS + bool "Enable perfmon statistics reporting via debugfs" + default y + depends on PERFMON && DEBUG_FS + help + Enable collection and reporting of perfmon timing statistics under + debugfs. This is used for debugging and performance analysis of the + subsystem. The debugfs filesystem must be mounted. + +config PERFMON_POWER4 + tristate "Support for Power4 hardware performance counters" + depends on PERFMON && PPC64 + default n + help + Enables support for the Power 4 hardware performance counters + If unsure, say M. + +config PERFMON_POWER5 + tristate "Support for Power5 hardware performance counters" + depends on PERFMON && PPC64 + default n + help + Enables support for the Power 5 hardware performance counters + If unsure, say M. + +config PERFMON_POWER6 + tristate "Support for Power6 hardware performance counters" + depends on PERFMON && PPC64 + default n + help + Enables support for the Power 6 hardware performance counters + If unsure, say M. + +config PERFMON_PPC32 + tristate "Support for PPC32 hardware performance counters" + depends on PERFMON && PPC32 + default n + help + Enables support for the PPC32 hardware performance counters + If unsure, say M. + +config PERFMON_CELL + tristate "Support for Cell hardware performance counters" + depends on PERFMON && PPC_CELL + select PS3_LPM if PPC_PS3 + default n + help + Enables support for the Cell hardware performance counters. + If unsure, say M. + +endmenu diff --git a/arch/powerpc/perfmon/Makefile b/arch/powerpc/perfmon/Makefile new file mode 100644 index 0000000..300661f --- /dev/null +++ b/arch/powerpc/perfmon/Makefile @@ -0,0 +1,6 @@ +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_PERFMON_POWER4) += perfmon_power4.o +obj-$(CONFIG_PERFMON_POWER5) += perfmon_power5.o +obj-$(CONFIG_PERFMON_POWER6) += perfmon_power6.o +obj-$(CONFIG_PERFMON_PPC32) += perfmon_ppc32.o +obj-$(CONFIG_PERFMON_CELL) += perfmon_cell.o diff --git a/arch/powerpc/perfmon/perfmon.c b/arch/powerpc/perfmon/perfmon.c new file mode 100644 index 0000000..51a8b6a --- /dev/null +++ b/arch/powerpc/perfmon/perfmon.c @@ -0,0 +1,334 @@ +/* + * This file implements the powerpc specific + * support for the perfmon2 interface + * + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * based on versions for other architectures: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +static void pfm_stop_active(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + BUG_ON(!arch_info->disable_counters || !arch_info->get_ovfl_pmds); + + arch_info->disable_counters(ctx, set); + + if (set->npend_ovfls) + return; + + arch_info->get_ovfl_pmds(ctx, set); +} + +/* + * Called from pfm_save_pmds(). Interrupts are masked. Registers are + * already saved away. + */ +void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + int i, num; + u64 *used_pmds, *intr_pmds; + + num = set->nused_pmds; + used_pmds = set->used_pmds; + intr_pmds = ctx->regs.intr_pmds; + + for (i = 0; num; i++) + if (likely(test_bit(i, used_pmds))) { + if (likely(test_bit(i, intr_pmds))) + pfm_write_pmd(ctx, i, 0); + num--; + } +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring is active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * for per-thread: + * must stop monitoring for the task + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + /* + * disable lazy restore of the PMC/PMD registers. + */ + ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; + + if (ctx->state == PFM_CTX_MASKED) + return 1; + + pfm_stop_active(task, ctx, ctx->active_set); + + if (arch_info->ctxswout_thread) + arch_info->ctxswout_thread(task, ctx, ctx->active_set); + + return pfm_arch_is_active(ctx); +} + +/* + * Called from pfm_ctxsw + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + if (ctx->state != PFM_CTX_MASKED && ctx->flags.started == 1) { + BUG_ON(!arch_info->enable_counters); + arch_info->enable_counters(ctx, ctx->active_set); + } + + if (arch_info->ctxswin_thread) + arch_info->ctxswin_thread(task, ctx, ctx->active_set); +} + +/* + * Called from pfm_stop() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED) + return; + + /* + * stop live registers and collect pending overflow + */ + if (task == current) + pfm_stop_active(task, ctx, ctx->active_set); +} + +/* + * Enable active monitoring. Called from pfm_start() and + * pfm_arch_unmask_monitoring(). + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * Task is not necessarily current. 
If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU if task + * is not current. + * + * For system-wide: + * Task is always current + */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *arch_info; + + arch_info = pfm_pmu_info(); + if (task != current) + return; + + BUG_ON(!arch_info->enable_counters); + + arch_info->enable_counters(ctx, ctx->active_set); +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info; + u64 *used_pmds; + u16 i, num; + + arch_info = pfm_pmu_info(); + + /* The model-specific module can override the default + * restore-PMD method. + */ + if (arch_info->restore_pmds) + return arch_info->restore_pmds(ctx, set); + + num = set->nused_pmds; + used_pmds = set->used_pmds; + + for (i = 0; num; i++) { + if (likely(test_bit(i, used_pmds))) { + pfm_write_pmd(ctx, i, set->pmds[i].value); + num--; + } + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set, if needed. + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *arch_info; + u64 *impl_pmcs; + unsigned int i, max_pmc, reg; + + arch_info = pfm_pmu_info(); + /* The model-specific module can override the default + * restore-PMC method. + */ + if (arch_info->restore_pmcs) + return arch_info->restore_pmcs(ctx, set); + + /* The "common" powerpc model's enable the counters simply by writing + * all the control registers. Therefore, if we're masked or stopped we + * don't need to bother restoring the PMCs now. + */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + max_pmc = ctx->regs.max_pmc; + impl_pmcs = ctx->regs.pmcs; + + /* + * Restore all pmcs in reverse order to ensure the counters aren't + * enabled before their event selectors are set correctly. 
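+ * In other words, the loop below walks reg = max_pmc-1, ..., 0 so that
+ * the lower-numbered control registers (typically the ones holding the
+ * global enable bits, e.g. MMCR0) are written after the event selectors.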
+ */ + reg = max_pmc - 1; + for (i = 0; i < max_pmc; i++) { + if (test_bit(reg, impl_pmcs)) + pfm_arch_write_pmc(ctx, reg, set->pmcs[reg]); + reg--; + } +} + +char *pfm_arch_get_pmu_module_name(void) +{ + unsigned int pvr = mfspr(SPRN_PVR); + + switch (PVR_VER(pvr)) { + case 0x0004: /* 604 */ + case 0x0009: /* 604e; */ + case 0x000A: /* 604ev */ + case 0x0008: /* 750/740 */ + case 0x7000: /* 750FX */ + case 0x7001: + case 0x7002: /* 750GX */ + case 0x000C: /* 7400 */ + case 0x800C: /* 7410 */ + case 0x8000: /* 7451/7441 */ + case 0x8001: /* 7455/7445 */ + case 0x8002: /* 7457/7447 */ + case 0x8003: /* 7447A */ + case 0x8004: /* 7448 */ + return("perfmon_ppc32"); + case PV_POWER4: + case PV_POWER4p: + return "perfmon_power4"; + case PV_POWER5: + return "perfmon_power5"; + case PV_POWER5p: + if (PVR_REV(pvr) < 0x300) + /* PMU behaves like POWER5 */ + return "perfmon_power5"; + else + /* PMU behaves like POWER6 */ + return "perfmon_power6"; + case PV_POWER6: + return "perfmon_power6"; + case PV_970: + case PV_970FX: + case PV_970MP: + return "perfmon_ppc970"; + case PV_BE: + return "perfmon_cell"; + } + return NULL; +} + +void pfm_arch_init_percpu(void) +{ +#ifdef CONFIG_PPC64 + extern void ppc64_enable_pmcs(void); + ppc64_enable_pmcs(); +#endif +} + +/** + * powerpc_irq_handler + * + * Get the perfmon context that belongs to the current CPU, and call the + * model-specific interrupt handler. + **/ +void powerpc_irq_handler(struct pt_regs *regs) +{ + struct pfm_arch_pmu_info *arch_info; + struct pfm_context *ctx; + + if (! regs->softe) { + /* + * We got a PMU interrupt while interrupts were soft + * disabled. Disable hardware interrupts by clearing + * MSR_EE and also clear PMAO because we will need to set + * that again later when interrupts are re-enabled and + * raw_local_irq_restore() sees that the pmu_except_pending + * flag is set. + */ + regs->msr &= ~MSR_EE; + get_paca()->pmu_except_pending = 1; + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); + return; + } + + arch_info = pfm_pmu_info(); + if (arch_info->irq_handler) { + ctx = __get_cpu_var(pmu_ctx); + if (likely(ctx)) + arch_info->irq_handler(regs, ctx); + } +} diff --git a/arch/powerpc/perfmon/perfmon_cell.c b/arch/powerpc/perfmon/perfmon_cell.c new file mode 100644 index 0000000..e1ae12c --- /dev/null +++ b/arch/powerpc/perfmon/perfmon_cell.c @@ -0,0 +1,1449 @@ +/* + * This file contains the Cell PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright IBM Corporation 2007 + * (C) Copyright 2007 TOSHIBA CORPORATION + * + * Based on other Perfmon2 PMU modules. + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Kevin Corry , " + "Carl Love "); +MODULE_DESCRIPTION("Cell PMU description table"); +MODULE_LICENSE("GPL"); + +struct pfm_cell_platform_pmu_info { + u32 (*read_ctr)(u32 cpu, u32 ctr); + void (*write_ctr)(u32 cpu, u32 ctr, u32 val); + void (*write_pm07_control)(u32 cpu, u32 ctr, u32 val); + void (*write_pm)(u32 cpu, enum pm_reg_name reg, u32 val); + void (*enable_pm)(u32 cpu); + void (*disable_pm)(u32 cpu); + void (*enable_pm_interrupts)(u32 cpu, u32 thread, u32 mask); + u32 (*get_and_clear_pm_interrupts)(u32 cpu); + u32 (*get_hw_thread_id)(int cpu); + struct cbe_ppe_priv_regs __iomem *(*get_cpu_ppe_priv_regs)(int cpu); + struct cbe_pmd_regs __iomem *(*get_cpu_pmd_regs)(int cpu); + struct cbe_mic_tm_regs __iomem *(*get_cpu_mic_tm_regs)(int cpu); + int (*rtas_token)(const char *service); + int (*rtas_call)(int token, int param1, int param2, int *param3, ...); +}; + +/* + * Mapping from Perfmon logical control registers to Cell hardware registers. + */ +static struct pfm_regmap_desc pfm_cell_pmc_desc[] = { + /* Per-counter control registers. */ + PMC_D(PFM_REG_I, "pm0_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm1_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm2_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm3_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm4_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm5_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm6_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm7_control", 0, 0, 0, 0), + + /* Per-counter RTAS arguments. Each of these registers has three fields. + * bits 63-48: debug-bus word + * bits 47-32: sub-unit + * bits 31-0 : full signal number + * (MSB = 63, LSB = 0) + */ + PMC_D(PFM_REG_I, "pm0_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm1_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm2_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm3_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm4_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm5_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm6_event", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm7_event", 0, 0, 0, 0), + + /* Global control registers. Same order as enum pm_reg_name. */ + PMC_D(PFM_REG_I, "group_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "debug_bus_control", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "trace_address", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "ext_trace_timer", 0, 0, 0, 0), + PMC_D(PFM_REG_I, "pm_status", 0, 0, 0, 0), + /* set the interrupt overflow bit for the four 32 bit counters + * that is currently supported. Will need to fix when 32 and 16 + * bit counters are supported. + */ + PMC_D(PFM_REG_I, "pm_control", 0xF0000000, 0xF0000000, 0, 0), + PMC_D(PFM_REG_I, "pm_interval", 0, 0, 0, 0), /* FIX: Does user-space also need read access to this one? */ + PMC_D(PFM_REG_I, "pm_start_stop", 0, 0, 0, 0), +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_cell_pmc_desc) + +#define CELL_PMC_GROUP_CONTROL 16 +#define CELL_PMC_PM_STATUS 20 +#define CELL_PMC_PM_CONTROL 21 +#define CELL_PMC_PM_CONTROL_CNTR_MASK 0x01E00000UL +#define CELL_PMC_PM_CONTROL_CNTR_16 0x01E00000UL + +/* + * Mapping from Perfmon logical data counters to Cell hardware counters. 
+ */ +static struct pfm_regmap_desc pfm_cell_pmd_desc[] = { + PMD_D(PFM_REG_C, "pm0", 0), + PMD_D(PFM_REG_C, "pm1", 0), + PMD_D(PFM_REG_C, "pm2", 0), + PMD_D(PFM_REG_C, "pm3", 0), + PMD_D(PFM_REG_C, "pm4", 0), + PMD_D(PFM_REG_C, "pm5", 0), + PMD_D(PFM_REG_C, "pm6", 0), + PMD_D(PFM_REG_C, "pm7", 0), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_cell_pmd_desc) + +#define PFM_EVENT_PMC_BUS_WORD(x) (((x) >> 48) & 0x00ff) +#define PFM_EVENT_PMC_FULL_SIGNAL_NUMBER(x) ((x) & 0xffffffff) +#define PFM_EVENT_PMC_SIGNAL_GROUP(x) (((x) & 0xffffffff) / 100) +#define PFM_PM_CTR_INPUT_MUX_BIT(pm07_control) (((pm07_control) >> 26) & 0x1f) +#define PFM_PM_CTR_INPUT_MUX_GROUP_INDEX(pm07_control) ((pm07_control) >> 31) +#define PFM_GROUP_CONTROL_GROUP0_WORD(grp_ctrl) ((grp_ctrl) >> 30) +#define PFM_GROUP_CONTROL_GROUP1_WORD(grp_ctrl) (((grp_ctrl) >> 28) & 0x3) +#define PFM_NUM_OF_GROUPS 2 +#define PFM_PPU_IU1_THREAD1_BASE_BIT 19 +#define PFM_PPU_XU_THREAD1_BASE_BIT 16 +#define PFM_COUNTER_CTRL_PMC_PPU_TH0 0x100000000ULL +#define PFM_COUNTER_CTRL_PMC_PPU_TH1 0x200000000ULL + +/* + * Debug-bus signal handling. + * + * Some Cell systems have firmware that can handle the debug-bus signal + * routing. For systems without this firmware, we have a minimal in-kernel + * implementation as well. + */ + +/* The firmware only sees physical CPUs, so divide by 2 if SMT is on. */ +#ifdef CONFIG_SCHED_SMT +#define RTAS_CPU(cpu) ((cpu) / 2) +#else +#define RTAS_CPU(cpu) (cpu) +#endif +#define RTAS_BUS_WORD(x) (u16)(((x) >> 48) & 0x0000ffff) +#define RTAS_SUB_UNIT(x) (u16)(((x) >> 32) & 0x0000ffff) +#define RTAS_SIGNAL_NUMBER(x) (s32)( (x) & 0xffffffff) +#define RTAS_SIGNAL_GROUP(x) (RTAS_SIGNAL_NUMBER(x) / 100) + +#define subfunc_RESET 1 +#define subfunc_ACTIVATE 2 + +#define passthru_ENABLE 1 +#define passthru_DISABLE 2 + +/** + * struct cell_rtas_arg + * + * @cpu: Processor to modify. Linux numbers CPUs based on SMT IDs, but the + * firmware only sees the physical CPUs. So this value should be the + * SMT ID (from smp_processor_id() or get_cpu()) divided by 2. + * @sub_unit: Hardware subunit this applies to (if applicable). + * @signal_group: Signal group to enable/disable on the trace bus. + * @bus_word: For signal groups that propagate via the trace bus, this trace + * bus word will be used. This is a mask of (1 << TraceBusWord). + * For other signal groups, this specifies the trigger or event bus. + * @bit: Trigger/Event bit, if applicable for the signal group. + * + * An array of these structures are passed to rtas_call() to set up the + * signals on the debug bus. + **/ +struct cell_rtas_arg { + u16 cpu; + u16 sub_unit; + s16 signal_group; + u8 bus_word; + u8 bit; +}; + +/** + * rtas_reset_signals + * + * Use the firmware RTAS call to disable signal pass-thru and to reset the + * debug-bus signals. + **/ +static int rtas_reset_signals(u32 cpu) +{ + struct cell_rtas_arg signal; + u64 real_addr = virt_to_phys(&signal); + int rc; + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + memset(&signal, 0, sizeof(signal)); + signal.cpu = RTAS_CPU(cpu); + rc = info->rtas_call(info->rtas_token("ibm,cbe-perftools"), + 5, 1, NULL, + subfunc_RESET, + passthru_DISABLE, + real_addr >> 32, + real_addr & 0xffffffff, + sizeof(signal)); + + return rc; +} + +/** + * rtas_activate_signals + * + * Use the firmware RTAS call to enable signal pass-thru and to activate the + * desired signal groups on the debug-bus. 
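+ *
+ * A minimal, illustrative caller (field values are made up; real users go
+ * through the activate_signals() wrapper below, see write_pm07_event()):
+ *
+ *	struct cell_rtas_arg sig = {
+ *		.cpu          = RTAS_CPU(smp_processor_id()),
+ *		.signal_group = SIG_GROUP_PPU_IU1,
+ *		.bus_word     = BUS_WORD_0,
+ *	};
+ *	activate_signals(&sig, 1);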
+ **/ +static int rtas_activate_signals(struct cell_rtas_arg *signals, + int num_signals) +{ + u64 real_addr = virt_to_phys(signals); + int rc; + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + rc = info->rtas_call(info->rtas_token("ibm,cbe-perftools"), + 5, 1, NULL, + subfunc_ACTIVATE, + passthru_ENABLE, + real_addr >> 32, + real_addr & 0xffffffff, + num_signals * sizeof(*signals)); + + return rc; +} + +#define HID1_RESET_MASK (~0x00000001ffffffffUL) +#define PPU_IU1_WORD0_HID1_EN_MASK (~0x00000001f0c0802cUL) +#define PPU_IU1_WORD0_HID1_EN_WORD ( 0x00000001f0400000UL) +#define PPU_IU1_WORD1_HID1_EN_MASK (~0x000000010fc08023UL) +#define PPU_IU1_WORD1_HID1_EN_WORD ( 0x000000010f400001UL) +#define PPU_XU_WORD0_HID1_EN_MASK (~0x00000001f038402cUL) +#define PPU_XU_WORD0_HID1_EN_WORD ( 0x00000001f0080008UL) +#define PPU_XU_WORD1_HID1_EN_MASK (~0x000000010f074023UL) +#define PPU_XU_WORD1_HID1_EN_WORD ( 0x000000010f030002UL) + +/* The bus_word field in the cell_rtas_arg structure is a bit-mask + * indicating which debug-bus word(s) to use. + */ +enum { + BUS_WORD_0 = 1, + BUS_WORD_1 = 2, + BUS_WORD_2 = 4, + BUS_WORD_3 = 8, +}; + +/* Definitions of the signal-groups that the built-in signal-activation + * code can handle. + */ +enum { + SIG_GROUP_NONE = 0, + + /* 2.x PowerPC Processor Unit (PPU) Signal Groups */ + SIG_GROUP_PPU_BASE = 20, + SIG_GROUP_PPU_IU1 = 21, + SIG_GROUP_PPU_XU = 22, + + /* 3.x PowerPC Storage Subsystem (PPSS) Signal Groups */ + SIG_GROUP_PPSS_BASE = 30, + + /* 4.x Synergistic Processor Unit (SPU) Signal Groups */ + SIG_GROUP_SPU_BASE = 40, + + /* 5.x Memory Flow Controller (MFC) Signal Groups */ + SIG_GROUP_MFC_BASE = 50, + + /* 6.x Element )nterconnect Bus (EIB) Signal Groups */ + SIG_GROUP_EIB_BASE = 60, + + /* 7.x Memory Interface Controller (MIC) Signal Groups */ + SIG_GROUP_MIC_BASE = 70, + + /* 8.x Cell Broadband Engine Interface (BEI) Signal Groups */ + SIG_GROUP_BEI_BASE = 80, +}; + +/** + * rmw_spr + * + * Read-modify-write for a special-purpose-register. + **/ +#define rmw_spr(spr_id, a_mask, o_mask) \ + do { \ + u64 value = mfspr(spr_id); \ + value &= (u64)(a_mask); \ + value |= (u64)(o_mask); \ + mtspr((spr_id), value); \ + } while (0) + +/** + * rmw_mmio_reg64 + * + * Read-modify-write for a 64-bit MMIO register. + **/ +#define rmw_mmio_reg64(mem, a_mask, o_mask) \ + do { \ + u64 value = in_be64(&(mem)); \ + value &= (u64)(a_mask); \ + value |= (u64)(o_mask); \ + out_be64(&(mem), value); \ + } while (0) + +/** + * rmwb_mmio_reg64 + * + * Set or unset a specified bit within a 64-bit MMIO register. + **/ +#define rmwb_mmio_reg64(mem, bit_num, set_bit) \ + rmw_mmio_reg64((mem), ~(1UL << (63 - (bit_num))), \ + ((set_bit) << (63 - (bit_num)))) + +/** + * passthru + * + * Enable or disable passthru mode in all the Cell signal islands. 
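+ *
+ * Each island exposes a single pass-through bit that is flipped with
+ * rmwb_mmio_reg64(), e.g. bit 39 of pmd_regs->on_ramp_trace; note that the
+ * bit numbering is big-endian (bit 0 is the MSB), as in the macro above.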
+ **/ +static int passthru(u32 cpu, u64 enable) +{ + struct cbe_ppe_priv_regs __iomem *ppe_priv_regs; + struct cbe_pmd_regs __iomem *pmd_regs; + struct cbe_mic_tm_regs __iomem *mic_tm_regs; + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + ppe_priv_regs = info->get_cpu_ppe_priv_regs(cpu); + pmd_regs = info->get_cpu_pmd_regs(cpu); + mic_tm_regs = info->get_cpu_mic_tm_regs(cpu); + + if (!ppe_priv_regs || !pmd_regs || !mic_tm_regs) { + PFM_ERR("Error getting Cell PPE, PMD, and MIC " + "register maps: 0x%p, 0x%p, 0x%p", + ppe_priv_regs, pmd_regs, mic_tm_regs); + return -EINVAL; + } + + rmwb_mmio_reg64(ppe_priv_regs->L2_debug1, 61, enable); + rmwb_mmio_reg64(ppe_priv_regs->ciu_dr1, 5, enable); + rmwb_mmio_reg64(pmd_regs->on_ramp_trace, 39, enable); + rmwb_mmio_reg64(mic_tm_regs->MBL_debug, 20, enable); + + return 0; +} + +#define passthru_enable(cpu) passthru(cpu, 1) +#define passthru_disable(cpu) passthru(cpu, 0) + +static inline void reset_signal_registers(u32 cpu) +{ + rmw_spr(SPRN_HID1, HID1_RESET_MASK, 0); +} + +/** + * celleb_reset_signals + * + * Non-rtas version of resetting the debug-bus signals. + **/ +static int celleb_reset_signals(u32 cpu) +{ + int rc; + rc = passthru_disable(cpu); + if (!rc) + reset_signal_registers(cpu); + return rc; +} + +/** + * ppu_selection + * + * Write the HID1 register to connect the specified PPU signal-group to the + * debug-bus. + **/ +static int ppu_selection(struct cell_rtas_arg *signal) +{ + u64 hid1_enable_word = 0; + u64 hid1_enable_mask = 0; + + switch (signal->signal_group) { + + case SIG_GROUP_PPU_IU1: /* 2.1 PPU Instruction Unit - Group 1 */ + switch (signal->bus_word) { + case BUS_WORD_0: + hid1_enable_mask = PPU_IU1_WORD0_HID1_EN_MASK; + hid1_enable_word = PPU_IU1_WORD0_HID1_EN_WORD; + break; + case BUS_WORD_1: + hid1_enable_mask = PPU_IU1_WORD1_HID1_EN_MASK; + hid1_enable_word = PPU_IU1_WORD1_HID1_EN_WORD; + break; + default: + PFM_ERR("Invalid bus-word (0x%x) for signal-group %d.", + signal->bus_word, signal->signal_group); + return -EINVAL; + } + break; + + case SIG_GROUP_PPU_XU: /* 2.2 PPU Execution Unit */ + switch (signal->bus_word) { + case BUS_WORD_0: + hid1_enable_mask = PPU_XU_WORD0_HID1_EN_MASK; + hid1_enable_word = PPU_XU_WORD0_HID1_EN_WORD; + break; + case BUS_WORD_1: + hid1_enable_mask = PPU_XU_WORD1_HID1_EN_MASK; + hid1_enable_word = PPU_XU_WORD1_HID1_EN_WORD; + break; + default: + PFM_ERR("Invalid bus-word (0x%x) for signal-group %d.", + signal->bus_word, signal->signal_group); + return -EINVAL; + } + break; + + default: + PFM_ERR("Signal-group %d not implemented.", + signal->signal_group); + return -EINVAL; + } + + rmw_spr(SPRN_HID1, hid1_enable_mask, hid1_enable_word); + + return 0; +} + +/** + * celleb_activate_signals + * + * Non-rtas version of activating the debug-bus signals. + **/ +static int celleb_activate_signals(struct cell_rtas_arg *signals, + int num_signals) +{ + int i, rc = -EINVAL; + + for (i = 0; i < num_signals; i++) { + switch (signals[i].signal_group) { + + /* 2.x PowerPC Processor Unit (PPU) Signal Selection */ + case SIG_GROUP_PPU_IU1: + case SIG_GROUP_PPU_XU: + rc = ppu_selection(signals + i); + if (rc) + return rc; + break; + + default: + PFM_ERR("Signal-group %d not implemented.", + signals[i].signal_group); + return -EINVAL; + } + } + + if (0 < i) + rc = passthru_enable(signals[0].cpu); + + return rc; +} + +/** + * ps3_reset_signals + * + * ps3 version of resetting the debug-bus signals. 
+ **/ +static int ps3_reset_signals(u32 cpu) +{ +#ifdef CONFIG_PPC_PS3 + return ps3_set_signal(0, 0, 0, 0); +#else + return 0; +#endif +} + +/** + * ps3_activate_signals + * + * ps3 version of activating the debug-bus signals. + **/ +static int ps3_activate_signals(struct cell_rtas_arg *signals, + int num_signals) +{ +#ifdef CONFIG_PPC_PS3 + int i; + + for (i = 0; i < num_signals; i++) + ps3_set_signal(signals[i].signal_group, signals[i].bit, + signals[i].sub_unit, signals[i].bus_word); +#endif + return 0; +} + + +/** + * reset_signals + * + * Call to the firmware (if available) to reset the debug-bus signals. + * Otherwise call the built-in version. + **/ +int reset_signals(u32 cpu) +{ + int rc; + + if (machine_is(celleb)) + rc = celleb_reset_signals(cpu); + else if (machine_is(ps3)) + rc = ps3_reset_signals(cpu); + else + rc = rtas_reset_signals(cpu); + + return rc; +} + +/** + * activate_signals + * + * Call to the firmware (if available) to activate the debug-bus signals. + * Otherwise call the built-in version. + **/ +int activate_signals(struct cell_rtas_arg *signals, int num_signals) +{ + int rc; + + if (machine_is(celleb)) + rc = celleb_activate_signals(signals, num_signals); + else if (machine_is(ps3)) + rc = ps3_activate_signals(signals, num_signals); + else + rc = rtas_activate_signals(signals, num_signals); + + return rc; +} + +/** + * pfm_cell_pmc_check + * + * Verify that we are going to write a valid value to the specified PMC. + **/ +int pfm_cell_pmc_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + u16 cnum, reg_num = req->reg_num; + s16 signal_group = RTAS_SIGNAL_GROUP(req->reg_value); + u8 bus_word = RTAS_BUS_WORD(req->reg_value); + + if (reg_num < NR_CTRS || reg_num >= (NR_CTRS * 2)) + return -EINVAL; + + switch (signal_group) { + case SIG_GROUP_PPU_IU1: + case SIG_GROUP_PPU_XU: + if ((bus_word != 0) && (bus_word != 1)) { + PFM_ERR("Invalid bus word (%d) for signal-group %d", + bus_word, signal_group); + return -EINVAL; + } + break; + default: + PFM_ERR("Signal-group %d not implemented.", signal_group); + return -EINVAL; + } + + for (cnum = NR_CTRS; cnum < (NR_CTRS * 2); cnum++) { + if (test_bit(cnum, cast_ulp(set->used_pmcs)) && + bus_word == RTAS_BUS_WORD(set->pmcs[cnum]) && + signal_group != RTAS_SIGNAL_GROUP(set->pmcs[cnum])) { + PFM_ERR("Impossible signal-group combination: " + "(%u,%u,%d) (%u,%u,%d)", + reg_num, bus_word, signal_group, cnum, + RTAS_BUS_WORD(set->pmcs[cnum]), + RTAS_SIGNAL_GROUP(set->pmcs[cnum])); + return -EBUSY; + } + } + + return 0; +} + +/** + * write_pm07_event + * + * Pull out the RTAS arguments from the 64-bit register value and make the + * RTAS activate-signals call. + **/ +static void write_pm07_event(int cpu, unsigned int ctr, u64 value) +{ + struct cell_rtas_arg signal; + s32 signal_number; + int rc; + + signal_number = RTAS_SIGNAL_NUMBER(value); + if (!signal_number) { + /* Don't include counters that are counting cycles. */ + return; + } + + signal.cpu = RTAS_CPU(cpu); + signal.bus_word = 1 << RTAS_BUS_WORD(value); + signal.sub_unit = RTAS_SUB_UNIT(value); + signal.signal_group = signal_number / 100; + signal.bit = abs(signal_number) % 100; + + rc = activate_signals(&signal, 1); + if (rc) { + PFM_WARN("%s(%d, %u, %lu): Error calling " + "activate_signals(): %d\n", __func__, + cpu, ctr, (unsigned long)value, rc); + /* FIX: Could we change this routine to return an error? 
*/ + } +} + +/** + * pfm_cell_probe_pmu + * + * Simply check the processor version register to see if we're currently + * on a Cell system. + **/ +static int pfm_cell_probe_pmu(void) +{ + unsigned long pvr = mfspr(SPRN_PVR); + + if (PVR_VER(pvr) != PV_BE) + return -1; + + return 0; +} + +/** + * pfm_cell_write_pmc + **/ +static void pfm_cell_write_pmc(unsigned int cnum, u64 value) +{ + int cpu = smp_processor_id(); + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + if (cnum < NR_CTRS) { + info->write_pm07_control(cpu, cnum, value); + + } else if (cnum < NR_CTRS * 2) { + write_pm07_event(cpu, cnum - NR_CTRS, value); + + } else if (cnum == CELL_PMC_PM_STATUS) { + /* The pm_status register must be treated separately from + * the other "global" PMCs. This call will ensure that + * the interrupts are routed to the correct CPU, as well + * as writing the desired value to the pm_status register. + */ + info->enable_pm_interrupts(cpu, info->get_hw_thread_id(cpu), + value); + + } else if (cnum < PFM_PM_NUM_PMCS) { + info->write_pm(cpu, cnum - (NR_CTRS * 2), value); + } +} + +/** + * pfm_cell_write_pmd + **/ +static void pfm_cell_write_pmd(unsigned int cnum, u64 value) +{ + int cpu = smp_processor_id(); + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + if (cnum < NR_CTRS) + info->write_ctr(cpu, cnum, value); +} + +/** + * pfm_cell_read_pmd + **/ +static u64 pfm_cell_read_pmd(unsigned int cnum) +{ + int cpu = smp_processor_id(); + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + if (cnum < NR_CTRS) + return info->read_ctr(cpu, cnum); + + return -EINVAL; +} + +/** + * pfm_cell_enable_counters + * + * Just need to turn on the global disable bit in pm_control. + **/ +static void pfm_cell_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + info->enable_pm(smp_processor_id()); +} + +/** + * pfm_cell_disable_counters + * + * Just need to turn off the global disable bit in pm_control. + **/ +static void pfm_cell_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + info->disable_pm(smp_processor_id()); + if (machine_is(ps3)) + reset_signals(smp_processor_id()); +} + +/* + * Return the thread id of the specified ppu signal. + */ +static inline u32 get_target_ppu_thread_id(u32 group, u32 bit) +{ + if ((group == SIG_GROUP_PPU_IU1 && + bit < PFM_PPU_IU1_THREAD1_BASE_BIT) || + (group == SIG_GROUP_PPU_XU && + bit < PFM_PPU_XU_THREAD1_BASE_BIT)) + return 0; + else + return 1; +} + +/* + * Return whether the specified counter is for PPU signal group. + */ +static inline int is_counter_for_ppu_sig_grp(u32 counter_control, u32 sig_grp) +{ + if (!(counter_control & CBE_PM_CTR_INPUT_CONTROL) && + (counter_control & CBE_PM_CTR_ENABLE) && + ((sig_grp == SIG_GROUP_PPU_IU1) || (sig_grp == SIG_GROUP_PPU_XU))) + return 1; + else + return 0; +} + +/* + * Search ppu signal groups. 
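+ *
+ * Scans the pmX_event PMCs of @set, records which PPU signal group (IU1 or
+ * XU) feeds the debug-bus words selected by group 0 and group 1 of the
+ * group_control PMC, and returns how many of the two slots were filled.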
+ */ +static int get_ppu_signal_groups(struct pfm_event_set *set, + u32 *ppu_sig_grp0, u32 *ppu_sig_grp1) +{ + u64 pm_event, *used_pmcs = set->used_pmcs; + int i, j; + u32 grp0_wd, grp1_wd, wd, sig_grp; + + *ppu_sig_grp0 = 0; + *ppu_sig_grp1 = 0; + grp0_wd = PFM_GROUP_CONTROL_GROUP0_WORD( + set->pmcs[CELL_PMC_GROUP_CONTROL]); + grp1_wd = PFM_GROUP_CONTROL_GROUP1_WORD( + set->pmcs[CELL_PMC_GROUP_CONTROL]); + + for (i = 0, j = 0; (i < NR_CTRS) && (j < PFM_NUM_OF_GROUPS); i++) { + if (test_bit(i + NR_CTRS, used_pmcs)) { + pm_event = set->pmcs[i + NR_CTRS]; + wd = PFM_EVENT_PMC_BUS_WORD(pm_event); + sig_grp = PFM_EVENT_PMC_SIGNAL_GROUP(pm_event); + if ((sig_grp == SIG_GROUP_PPU_IU1) || + (sig_grp == SIG_GROUP_PPU_XU)) { + + if (wd == grp0_wd && *ppu_sig_grp0 == 0) { + *ppu_sig_grp0 = sig_grp; + j++; + } else if (wd == grp1_wd && + *ppu_sig_grp1 == 0) { + *ppu_sig_grp1 = sig_grp; + j++; + } + } + } + } + return j; +} + +/** + * pfm_cell_restore_pmcs + * + * Write all control register values that are saved in the specified event + * set. We could use the pfm_arch_write_pmc() function to restore each PMC + * individually (as is done in other architectures), but that results in + * multiple RTAS calls. As an optimization, we will setup the RTAS argument + * array so we can do all event-control registers in one RTAS call. + * + * In per-thread mode, + * The counter enable bit of the pmX_control PMC is enabled while the target + * task runs on the target HW thread. + **/ +void pfm_cell_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 ctr_ctrl; + u64 *used_pmcs = set->used_pmcs; + int i; + int cpu = smp_processor_id(); + u32 current_th_id; + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + for (i = 0; i < NR_CTRS; i++) { + ctr_ctrl = set->pmcs[i]; + + if (ctr_ctrl & PFM_COUNTER_CTRL_PMC_PPU_TH0) { + current_th_id = info->get_hw_thread_id(cpu); + + /* + * Set the counter enable bit down if the current + * HW thread is NOT 0 + **/ + if (current_th_id) + ctr_ctrl = ctr_ctrl & ~CBE_PM_CTR_ENABLE; + + } else if (ctr_ctrl & PFM_COUNTER_CTRL_PMC_PPU_TH1) { + current_th_id = info->get_hw_thread_id(cpu); + + /* + * Set the counter enable bit down if the current + * HW thread is 0 + **/ + if (!current_th_id) + ctr_ctrl = ctr_ctrl & ~CBE_PM_CTR_ENABLE; + } + + /* Write the per-counter control register. If the PMC is not + * in use, then it will simply clear the register, which will + * disable the associated counter. + */ + info->write_pm07_control(cpu, i, ctr_ctrl); + + if (test_bit(i + NR_CTRS, used_pmcs)) + write_pm07_event(cpu, 0, set->pmcs[i + NR_CTRS]); + } + + /* Write all the global PMCs. Need to call pfm_cell_write_pmc() + * instead of cbe_write_pm() due to special handling for the + * pm_status register. + */ + for (i *= 2; i < PFM_PM_NUM_PMCS; i++) + pfm_cell_write_pmc(i, set->pmcs[i]); +} + +/** + * pfm_cell_restore_pmds + * + * Write to pm_control register before writing to counter registers + * so that we can decide the counter width berfore writing to the couters. 
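The per-thread gating applied in pfm_cell_restore_pmcs() above can be read as a small helper; this is a sketch only, reusing the thread marks and enable bit the patch already defines:

/* Sketch, not part of the patch: a counter marked for the other hardware
 * thread has its enable bit cleared before the control register is written,
 * exactly as pfm_cell_restore_pmcs() does inline above.
 */
static u64 example_gate_ctr_for_thread(u64 ctr_ctrl, u32 hw_thread_id)
{
	if ((ctr_ctrl & PFM_COUNTER_CTRL_PMC_PPU_TH0) && hw_thread_id)
		ctr_ctrl &= ~CBE_PM_CTR_ENABLE;
	else if ((ctr_ctrl & PFM_COUNTER_CTRL_PMC_PPU_TH1) && !hw_thread_id)
		ctr_ctrl &= ~CBE_PM_CTR_ENABLE;

	return ctr_ctrl;
}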
+ **/ +void pfm_cell_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 *used_pmds; + unsigned int i, max_pmd; + int cpu = smp_processor_id(); + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + /* + * Write pm_control register value + */ + info->write_pm(cpu, pm_control, + set->pmcs[CELL_PMC_PM_CONTROL] & + ~CBE_PM_ENABLE_PERF_MON); + PFM_DBG("restore pm_control(0x%lx) before restoring pmds", + set->pmcs[CELL_PMC_PM_CONTROL]); + + max_pmd = ctx->regs.max_pmd; + used_pmds = set->used_pmds; + + for (i = 0; i < max_pmd; i++) + if (test_bit(i, used_pmds) && + !(pfm_pmu_conf->pmd_desc[i].type & PFM_REG_RO)) + pfm_cell_write_pmd(i, set->pmds[i].value); +} + +/** + * pfm_cell_get_cntr_width + * + * This function check the 16bit counter field in pm_control pmc. + * + * Return value + * 16 : all counters are 16bit width. + * 32 : all counters are 32bit width. + * 0 : several counter width exists. + **/ +static int pfm_cell_get_cntr_width(struct pfm_context *ctx, + struct pfm_event_set *s) +{ + int width = 0; + int tmp = 0; + u64 cntr_field; + + if (ctx->flags.switch_ovfl || ctx->flags.switch_time) { + list_for_each_entry(s, &ctx->set_list, list) { + cntr_field = s->pmcs[CELL_PMC_PM_CONTROL] & + CELL_PMC_PM_CONTROL_CNTR_MASK; + + if (cntr_field == CELL_PMC_PM_CONTROL_CNTR_16) + tmp = 16; + else if (cntr_field == 0x0) + tmp = 32; + else + return 0; + + if (tmp != width && width != 0) + return 0; + + width = tmp; + } + } else { + cntr_field = s->pmcs[CELL_PMC_PM_CONTROL] & + CELL_PMC_PM_CONTROL_CNTR_MASK; + + if (cntr_field == CELL_PMC_PM_CONTROL_CNTR_16) + width = 16; + else if (cntr_field == 0x0) + width = 32; + else + width = 0; + } + return width; +} + +/** + * pfm_cell_check_cntr_ovfl_mask + * + * Return value + * 1 : cntr_ovfl interrupt is used. + * 0 : cntr_ovfl interrupt is not used. + **/ +static int pfm_cell_check_cntr_ovfl(struct pfm_context *ctx, + struct pfm_event_set *s) +{ + if (ctx->flags.switch_ovfl || ctx->flags.switch_time) { + list_for_each_entry(s, &ctx->set_list, list) { + if (CBE_PM_OVERFLOW_CTRS(s->pmcs[CELL_PMC_PM_STATUS])) + return 1; + } + } else { + if (CBE_PM_OVERFLOW_CTRS(s->pmcs[CELL_PMC_PM_STATUS])) + return 1; + } + return 0; +} + +#ifdef CONFIG_PPC_PS3 +/** + * update_sub_unit_field + * + **/ +static inline u64 update_sub_unit_field(u64 pm_event, u64 spe_id) +{ + return ((pm_event & 0xFFFF0000FFFFFFFF) | (spe_id << 32)); +} + +/** + * pfm_get_spe_id + * + **/ +static u64 pfm_get_spe_id(void *arg) +{ + struct spu *spu = arg; + u64 spe_id; + + if (machine_is(ps3)) + spe_id = ps3_get_spe_id(arg); + else + spe_id = spu->spe_id; + + return spe_id; +} + +/** + * pfm_spu_number_to_id + * + **/ +static int pfm_spu_number_to_id(int number, u64 *spe_id) +{ + struct spu *spu; + int i; + + for (i = 0; i < MAX_NUMNODES; i++) { + if (cbe_spu_info[i].n_spus == 0) + continue; + + list_for_each_entry(spu, &cbe_spu_info[i].spus, cbe_list) + if (spu->number == number) { + *spe_id = pfm_get_spe_id(spu); + return 0; + } + } + return -ENODEV; +} + +/** + * pfm_update_pmX_event_subunit_field + * + * In system wide mode, + * This function updates the subunit field of SPE pmX_event. 
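The mask arithmetic in update_sub_unit_field() above is clearer with concrete numbers; an illustrative sketch (the values are made up):

/* Worked example: update_sub_unit_field() keeps bits 63:48 and 31:0 of the
 * pmX_event value and replaces bits 47:32 with the SPE id.
 */
static u64 example_update_sub_unit(void)
{
	u64 pm_event = 0x1234567812345678ULL;
	u64 spe_id = 0x5;

	/* (0x1234567812345678 & 0xFFFF0000FFFFFFFF) | (0x5 << 32)
	 *  == 0x1234000512345678 */
	return (pm_event & 0xFFFF0000FFFFFFFFULL) | (spe_id << 32);
}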
+ **/ +static int pfm_update_pmX_event_subunit_field(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + int i, last_pmc, ret; + u64 signal_group, spe_id; + int sub_unit; + u64 *used_pmcs; + + last_pmc = NR_CTRS + 8; + ret = 0; + list_for_each_entry(set, &ctx->set_list, list) { + + used_pmcs = set->used_pmcs; + for (i = NR_CTRS; i < last_pmc; i++) { + if (!test_bit(i, used_pmcs)) + continue; + + signal_group = PFM_EVENT_PMC_SIGNAL_GROUP(set->pmcs[i]); + + /* + * If the target event is a SPE signal group event, + * The sub_unit field in pmX_event pmc is changed to the + * specified spe_id. + */ + if (SIG_GROUP_SPU_BASE < signal_group && + signal_group < SIG_GROUP_EIB_BASE) { + sub_unit = RTAS_SUB_UNIT(set->pmcs[i]); + + ret = pfm_spu_number_to_id(sub_unit, &spe_id); + if (ret) + return ret; + + set->pmcs[i] = update_sub_unit_field( + set->pmcs[i], spe_id); + } + } + } + return 0; +} +#endif + +/** + * pfm_cell_load_context + * + * In per-thread mode, + * The pmX_control PMCs which are used for PPU IU/XU event are marked with + * the thread id(PFM_COUNTER_CTRL_PMC_PPU_TH0/TH1). + **/ +static int pfm_cell_load_context(struct pfm_context *ctx) +{ + int i; + u32 ppu_sig_grp[PFM_NUM_OF_GROUPS] = {SIG_GROUP_NONE, SIG_GROUP_NONE}; + u32 bit; + int index; + u32 target_th_id; + int ppu_sig_num = 0; + struct pfm_event_set *s; + int cntr_width = 32; + int ret = 0; + + if (pfm_cell_check_cntr_ovfl(ctx, ctx->active_set)) { + cntr_width = pfm_cell_get_cntr_width(ctx, ctx->active_set); + + /* + * Counter overflow interrupt works with only 32bit counter, + * because perfmon core uses pfm_cell_pmu_conf.counter_width + * to deal with the counter overflow. we can't change the + * counter width here. + */ + if (cntr_width != 32) + return -EINVAL; + } + + if (ctx->flags.system) { +#ifdef CONFIG_PPC_PS3 + if (machine_is(ps3)) + ret = pfm_update_pmX_event_subunit_field(ctx); +#endif + return ret; + } + + list_for_each_entry(s, &ctx->set_list, list) { + ppu_sig_num = get_ppu_signal_groups(s, &ppu_sig_grp[0], + &ppu_sig_grp[1]); + + for (i = 0; i < NR_CTRS; i++) { + index = PFM_PM_CTR_INPUT_MUX_GROUP_INDEX(s->pmcs[i]); + if (ppu_sig_num && + (ppu_sig_grp[index] != SIG_GROUP_NONE) && + is_counter_for_ppu_sig_grp(s->pmcs[i], + ppu_sig_grp[index])) { + + bit = PFM_PM_CTR_INPUT_MUX_BIT(s->pmcs[i]); + target_th_id = get_target_ppu_thread_id( + ppu_sig_grp[index], bit); + if (!target_th_id) + s->pmcs[i] |= + PFM_COUNTER_CTRL_PMC_PPU_TH0; + else + s->pmcs[i] |= + PFM_COUNTER_CTRL_PMC_PPU_TH1; + PFM_DBG("set:%d mark ctr:%d target_thread:%d", + s->id, i, target_th_id); + } + } + } + + return ret; +} + +/** + * pfm_cell_unload_context + * + * For system-wide contexts and self-monitored contexts, make the RTAS call + * to reset the debug-bus signals. + * + * For non-self-monitored contexts, the monitored thread will already have + * been taken off the CPU and we don't need to do anything additional. + **/ +static void pfm_cell_unload_context(struct pfm_context *ctx) +{ + if (ctx->task == current || ctx->flags.system) + reset_signals(smp_processor_id()); +} + +/** + * pfm_cell_ctxswout_thread + * + * When a monitored thread is switched out (self-monitored or externally + * monitored) we need to reset the debug-bus signals so the next context that + * gets switched in can start from a clean set of signals. 
+ **/ +int pfm_cell_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, struct pfm_event_set *set) +{ + reset_signals(smp_processor_id()); + return 0; +} + +/** + * pfm_cell_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. On Cell, the pm_status + * register contains a bit for each counter to indicate overflow. However, + * those 8 bits are in the reverse order than what Perfmon2 is expecting, + * so we need to reverse the order of the overflow bits. + **/ +static void pfm_cell_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); + u32 pm_status, ovfl_ctrs; + u64 povfl_pmds = 0; + int i; + struct pfm_cell_platform_pmu_info *info = + ((struct pfm_arch_pmu_info *) + (pfm_pmu_conf->pmu_info))->platform_info; + + if (!ctx_arch->last_read_updated) + /* This routine was not called via the interrupt handler. + * Need to start by getting interrupts and updating + * last_read_pm_status. + */ + ctx_arch->last_read_pm_status = + info->get_and_clear_pm_interrupts(smp_processor_id()); + + /* Reset the flag that the interrupt handler last read pm_status. */ + ctx_arch->last_read_updated = 0; + + pm_status = ctx_arch->last_read_pm_status & + set->pmcs[CELL_PMC_PM_STATUS]; + ovfl_ctrs = CBE_PM_OVERFLOW_CTRS(pm_status); + + /* Reverse the order of the bits in ovfl_ctrs + * and store the result in povfl_pmds. + */ + for (i = 0; i < PFM_PM_NUM_PMDS; i++) { + povfl_pmds = (povfl_pmds << 1) | (ovfl_ctrs & 1); + ovfl_ctrs >>= 1; + } + + /* Mask povfl_pmds with set->used_pmds to get set->povfl_pmds. + * Count the bits set in set->povfl_pmds to get set->npend_ovfls. + */ + bitmap_and(set->povfl_pmds, &povfl_pmds, + set->used_pmds, PFM_PM_NUM_PMDS); + set->npend_ovfls = bitmap_weight(set->povfl_pmds, PFM_PM_NUM_PMDS); +} + +/** + * pfm_cell_acquire_pmu + * + * acquire PMU resource. + * This acquisition is done when the first context is created. + **/ +int pfm_cell_acquire_pmu(u64 *unavail_pmcs, u64 *unavail_pmds) +{ +#ifdef CONFIG_PPC_PS3 + int ret; + + if (machine_is(ps3)) { + PFM_DBG(""); + ret = ps3_lpm_open(PS3_LPM_TB_TYPE_INTERNAL, NULL, 0); + if (ret) { + PFM_ERR("Can't create PS3 lpm. error:%d", ret); + return -EFAULT; + } + } +#endif + return 0; +} + +/** + * pfm_cell_release_pmu + * + * release PMU resource. + * actual release happens when last context is destroyed + **/ +void pfm_cell_release_pmu(void) +{ +#ifdef CONFIG_PPC_PS3 + if (machine_is(ps3)) { + if (ps3_lpm_close()) + PFM_ERR("Can't delete PS3 lpm."); + } +#endif +} + +/** + * handle_trace_buffer_interrupts + * + * This routine is for processing just the interval timer and trace buffer + * overflow interrupts. Performance counter interrupts are handled by the + * perf_irq_handler() routine, which reads and saves the pm_status register. + * This routine should not read the actual pm_status register, but rather + * the value passed in. + **/ +static void handle_trace_buffer_interrupts(unsigned long iip, + struct pt_regs *regs, + struct pfm_context *ctx, + u32 pm_status) +{ + /* FIX: Currently ignoring trace-buffer interrupts. */ + return; +} + +/** + * pfm_cell_irq_handler + * + * Handler for all Cell performance-monitor interrupts. 
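The bit reversal performed in pfm_cell_get_ovfl_pmds() above is easier to see on a concrete value; a sketch, not part of the patch, shown over 8 bits for brevity (the real loop runs over PFM_PM_NUM_PMDS bits):

/* Sketch: the hardware reports the first counter in the most-significant
 * overflow bit, while perfmon expects it in bit 0, so the bits are reversed.
 * Over 8 bits, 0x06 becomes 0x60.
 */
static u32 example_reverse_ovfl_bits(u32 ovfl_ctrs)
{
	u32 reversed = 0;
	int i;

	for (i = 0; i < 8; i++) {
		reversed = (reversed << 1) | (ovfl_ctrs & 1);
		ovfl_ctrs >>= 1;
	}
	return reversed;	/* 0x06 -> 0x60 */
}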
+ **/
+static void pfm_cell_irq_handler(struct pt_regs *regs, struct pfm_context *ctx)
+{
+	struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx);
+	u32 last_read_pm_status;
+	int cpu = smp_processor_id();
+	struct pfm_cell_platform_pmu_info *info =
+		((struct pfm_arch_pmu_info *)
+		 (pfm_pmu_conf->pmu_info))->platform_info;
+
+	/* Need to disable and reenable the performance counters to get the
+	 * desired behavior from the hardware. This is specific to the Cell
+	 * PMU hardware.
+	 */
+	info->disable_pm(cpu);
+
+	/* Read the pm_status register to get the interrupt bits. If a
+	 * performance counter overflow interrupt occurred, call the core
+	 * perfmon interrupt handler to service the counter overflow. If the
+	 * interrupt was for the interval timer or the trace_buffer,
+	 * call the interval timer and trace buffer interrupt handler.
+	 *
+	 * The value read from the pm_status register is stored in the
+	 * pfm_arch_context structure for use by other routines. Note that
+	 * reading the pm_status register resets the interrupt flags to zero.
+	 * Hence, it is important that the register is only read in one place.
+	 *
+	 * The pm_status register interrupt format is:
+	 * [pmd0:pmd1:pmd2:pmd3:pmd4:pmd5:pmd6:pmd7:intt:tbf:tbu:]
+	 * - pmd0 to pmd7 are the perf counter overflow interrupts.
+	 * - intt is the interval timer overflowed interrupt.
+	 * - tbf is the trace buffer full interrupt.
+	 * - tbu is the trace buffer underflow interrupt.
+	 * - The pmd0 bit is the MSB of the 32 bit register.
+	 */
+	ctx_arch->last_read_pm_status = last_read_pm_status =
+			info->get_and_clear_pm_interrupts(cpu);
+
+	/* Set flag for pfm_cell_get_ovfl_pmds() routine so it knows
+	 * last_read_pm_status was updated by the interrupt handler.
+	 */
+	ctx_arch->last_read_updated = 1;
+
+	if (last_read_pm_status & CBE_PM_ALL_OVERFLOW_INTR)
+		/* At least one counter overflowed. */
+		pfm_interrupt_handler(instruction_pointer(regs), regs);
+
+	if (last_read_pm_status & (CBE_PM_INTERVAL_INTR |
+				   CBE_PM_TRACE_BUFFER_FULL_INTR |
+				   CBE_PM_TRACE_BUFFER_UNDERFLOW_INTR))
+		/* Trace buffer or interval timer overflow. */
+		handle_trace_buffer_interrupts(instruction_pointer(regs),
+					       regs, ctx, last_read_pm_status);
+
+	/* The interrupt setting is the value written to the pm_status
+	 * register. It is saved in the context when the register is
+	 * written.
+	 */
+	info->enable_pm_interrupts(cpu, info->get_hw_thread_id(cpu),
+				   ctx->active_set->pmcs[CELL_PMC_PM_STATUS]);
+
+	/* The writes to the various performance counters only write to a
+	 * latch. The new values (interrupt setting bits, reset counter value
+	 * etc.) are not copied to the actual registers until the performance
+	 * monitor is enabled. In order to get this to work as desired, the
+	 * performance monitor needs to be disabled while writing to the
+	 * latches. This is a HW design issue.
+ */ + info->enable_pm(cpu); +} + + +static struct pfm_cell_platform_pmu_info ps3_platform_pmu_info = { +#ifdef CONFIG_PPC_PS3 + .read_ctr = ps3_read_ctr, + .write_ctr = ps3_write_ctr, + .write_pm07_control = ps3_write_pm07_control, + .write_pm = ps3_write_pm, + .enable_pm = ps3_enable_pm, + .disable_pm = ps3_disable_pm, + .enable_pm_interrupts = ps3_enable_pm_interrupts, + .get_and_clear_pm_interrupts = ps3_get_and_clear_pm_interrupts, + .get_hw_thread_id = ps3_get_hw_thread_id, + .get_cpu_ppe_priv_regs = NULL, + .get_cpu_pmd_regs = NULL, + .get_cpu_mic_tm_regs = NULL, + .rtas_token = NULL, + .rtas_call = NULL, +#endif +}; + +static struct pfm_cell_platform_pmu_info native_platform_pmu_info = { +#ifdef CONFIG_PPC_CELL_NATIVE + .read_ctr = cbe_read_ctr, + .write_ctr = cbe_write_ctr, + .write_pm07_control = cbe_write_pm07_control, + .write_pm = cbe_write_pm, + .enable_pm = cbe_enable_pm, + .disable_pm = cbe_disable_pm, + .enable_pm_interrupts = cbe_enable_pm_interrupts, + .get_and_clear_pm_interrupts = cbe_get_and_clear_pm_interrupts, + .get_hw_thread_id = cbe_get_hw_thread_id, + .get_cpu_ppe_priv_regs = cbe_get_cpu_ppe_priv_regs, + .get_cpu_pmd_regs = cbe_get_cpu_pmd_regs, + .get_cpu_mic_tm_regs = cbe_get_cpu_mic_tm_regs, + .rtas_token = rtas_token, + .rtas_call = rtas_call, +#endif +}; + +static struct pfm_arch_pmu_info pfm_cell_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_CELL, + .acquire_pmu = pfm_cell_acquire_pmu, + .release_pmu = pfm_cell_release_pmu, + .write_pmc = pfm_cell_write_pmc, + .write_pmd = pfm_cell_write_pmd, + .read_pmd = pfm_cell_read_pmd, + .enable_counters = pfm_cell_enable_counters, + .disable_counters = pfm_cell_disable_counters, + .irq_handler = pfm_cell_irq_handler, + .get_ovfl_pmds = pfm_cell_get_ovfl_pmds, + .restore_pmcs = pfm_cell_restore_pmcs, + .restore_pmds = pfm_cell_restore_pmds, + .ctxswout_thread = pfm_cell_ctxswout_thread, + .load_context = pfm_cell_load_context, + .unload_context = pfm_cell_unload_context, +}; + +static struct pfm_pmu_config pfm_cell_pmu_conf = { + .pmu_name = "Cell", + .version = "0.1", + .counter_width = 32, + .pmd_desc = pfm_cell_pmd_desc, + .pmc_desc = pfm_cell_pmc_desc, + .num_pmc_entries = PFM_PM_NUM_PMCS, + .num_pmd_entries = PFM_PM_NUM_PMDS, + .probe_pmu = pfm_cell_probe_pmu, + .pmu_info = &pfm_cell_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +/** + * pfm_cell_platform_probe + * + * If we're on a system without the firmware rtas call available, set up the + * PMC write-checker for all the pmX_event control registers. 
+ **/ +static void pfm_cell_platform_probe(void) +{ + if (machine_is(celleb)) { + int cnum; + pfm_cell_pmu_conf.pmc_write_check = pfm_cell_pmc_check; + for (cnum = NR_CTRS; cnum < (NR_CTRS * 2); cnum++) + pfm_cell_pmc_desc[cnum].type |= PFM_REG_WC; + } + + if (machine_is(ps3)) + pfm_cell_pmu_info.platform_info = &ps3_platform_pmu_info; + else + pfm_cell_pmu_info.platform_info = &native_platform_pmu_info; +} + +static int __init pfm_cell_pmu_init_module(void) +{ + pfm_cell_platform_probe(); + return pfm_pmu_register(&pfm_cell_pmu_conf); +} + +static void __exit pfm_cell_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_cell_pmu_conf); +} + +module_init(pfm_cell_pmu_init_module); +module_exit(pfm_cell_pmu_cleanup_module); diff --git a/arch/powerpc/perfmon/perfmon_power4.c b/arch/powerpc/perfmon/perfmon_power4.c new file mode 100644 index 0000000..eba9e8c --- /dev/null +++ b/arch/powerpc/perfmon/perfmon_power4.c @@ -0,0 +1,309 @@ +/* + * This file contains the POWER4 PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2007, IBM Corporation. + * + * Based on a simple modification of perfmon_power5.c for POWER4 by + * Corey Ashford . + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Corey Ashford "); +MODULE_DESCRIPTION("POWER4 PMU description table"); +MODULE_LICENSE("GPL"); + +static struct pfm_regmap_desc pfm_power4_pmc_desc[] = { +/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0), +/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1), +/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA) +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power4_pmc_desc) + +/* The TB and PURR registers are read-only. Also, note that the TB register + * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers. + * For Perfmon2's purposes, we'll treat it as a single 64-bit register. 
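pfm_power4_read_pmd() below composes the timebase from two back-to-back mfspr reads of TBU and TBL; a common guard against a low-word carry between the two reads is to re-read the upper word and retry. This is shown only as an illustration, not something the patch does:

/* Illustrative sketch, not part of the patch. */
static u64 example_read_timebase(void)
{
	u32 hi, lo;

	do {
		hi = mfspr(SPRN_TBRU);
		lo = mfspr(SPRN_TBRL);
	} while (mfspr(SPRN_TBRU) != hi);

	return ((u64)hi << 32) | lo;
}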
+ */ +static struct pfm_regmap_desc pfm_power4_pmd_desc[] = { +/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL), +/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), +/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), +/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), +/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), +/* pmd7 */ PMD_D(PFM_REG_C, "PMC7", SPRN_PMC7), +/* pmd8 */ PMD_D(PFM_REG_C, "PMC8", SPRN_PMC8) +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power4_pmd_desc) + +static int pfm_power4_probe_pmu(void) +{ + unsigned long pvr = mfspr(SPRN_PVR); + int ver = PVR_VER(pvr); + + if ((ver == PV_POWER4) || (ver == PV_POWER4p)) + return 0; + + return -1; +} + +static void pfm_power4_write_pmc(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case SPRN_MMCR0: + mtspr(SPRN_MMCR0, value); + break; + case SPRN_MMCR1: + mtspr(SPRN_MMCR1, value); + break; + case SPRN_MMCRA: + mtspr(SPRN_MMCRA, value); + break; + default: + BUG(); + } +} + +static void pfm_power4_write_pmd(unsigned int cnum, u64 value) +{ + u64 ovfl_mask = pfm_pmu_conf->ovfl_mask; + + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + mtspr(SPRN_PMC1, value & ovfl_mask); + break; + case SPRN_PMC2: + mtspr(SPRN_PMC2, value & ovfl_mask); + break; + case SPRN_PMC3: + mtspr(SPRN_PMC3, value & ovfl_mask); + break; + case SPRN_PMC4: + mtspr(SPRN_PMC4, value & ovfl_mask); + break; + case SPRN_PMC5: + mtspr(SPRN_PMC5, value & ovfl_mask); + break; + case SPRN_PMC6: + mtspr(SPRN_PMC6, value & ovfl_mask); + break; + case SPRN_PMC7: + mtspr(SPRN_PMC7, value & ovfl_mask); + break; + case SPRN_PMC8: + mtspr(SPRN_PMC8, value & ovfl_mask); + break; + case SPRN_TBRL: + case SPRN_PURR: + /* Ignore writes to read-only registers. 
*/ + break; + default: + BUG(); + } +} + +static u64 pfm_power4_read_pmd(unsigned int cnum) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + return mfspr(SPRN_PMC1); + case SPRN_PMC2: + return mfspr(SPRN_PMC2); + case SPRN_PMC3: + return mfspr(SPRN_PMC3); + case SPRN_PMC4: + return mfspr(SPRN_PMC4); + case SPRN_PMC5: + return mfspr(SPRN_PMC5); + case SPRN_PMC6: + return mfspr(SPRN_PMC6); + case SPRN_PMC7: + return mfspr(SPRN_PMC7); + case SPRN_PMC8: + return mfspr(SPRN_PMC8); + case SPRN_TBRL: + return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); + case SPRN_PURR: + if (cpu_has_feature(CPU_FTR_PURR)) + return mfspr(SPRN_PURR); + else + return 0; + default: + BUG(); + } +} + +/* forward decl */ +static void pfm_power4_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set); + +/** + * pfm_power4_enable_counters + * + **/ +static void pfm_power4_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max_pmc; + + /* Make sure the counters are disabled before touching the other + control registers */ + pfm_power4_disable_counters(ctx, set); + + max_pmc = ctx->regs.max_pmc; + + /* Write MMCR0 last, and a fairly easy way to do this is to write + the registers in the reverse order */ + for (i = max_pmc; i != 0; i--) + if (test_bit(i - 1, set->used_pmcs)) + pfm_power4_write_pmc(i - 1, set->pmcs[i - 1]); +} + +/** + * pfm_power4_disable_counters + * + **/ +static void pfm_power4_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* Set the Freeze Counters bit */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); + asm volatile ("sync"); +} + +/** + * pfm_power4_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. + **/ +static void pfm_power4_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i; + unsigned int max_pmd = ctx->regs.max_intr_pmd; + u64 *used_pmds = set->used_pmds; + u64 *cntr_pmds = ctx->regs.cnt_pmds; + u64 width_mask = 1 << pfm_pmu_conf->counter_width; + u64 new_val, mask[PFM_PMD_BV]; + + bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), + cast_ulp(used_pmds), max_pmd); + + for (i = 0; i < max_pmd; i++) { + if (test_bit(i, mask)) { + new_val = pfm_power4_read_pmd(i); + if (new_val & width_mask) { + set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_power4_irq_handler(struct pt_regs *regs, + struct pfm_context *ctx) +{ + u32 mmcr0; + + /* Disable the counters (set the freeze bit) to not polute + * the counts. + */ + mmcr0 = mfspr(SPRN_MMCR0); + mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC)); + + /* Set the PMM bit (see comment below). */ + mtmsrd(mfmsr() | MSR_PMM); + + pfm_interrupt_handler(instruction_pointer(regs), regs); + + mmcr0 = mfspr(SPRN_MMCR0); + + /* + * Reset the perfmon trigger if + * not in masking mode. + */ + if (ctx->state != PFM_CTX_MASKED) + mmcr0 |= MMCR0_PMXE; + + /* + * We must clear the PMAO bit on some (GQ) chips. Just do it + * all the time. + */ + mmcr0 &= ~MMCR0_PMAO; + + /* + * Now clear the freeze bit, counting will not start until we + * rfid from this exception, because only at that point will + * the PMM bit be cleared. + */ + mmcr0 &= ~MMCR0_FC; + mtspr(SPRN_MMCR0, mmcr0); +} + +static void pfm_power4_resend_irq(struct pfm_context *ctx) +{ + /* + * Assert the PMAO bit to cause a PMU interrupt. 
Make sure we + * trigger the edge detection circuitry for PMAO + */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO); +} + +struct pfm_arch_pmu_info pfm_power4_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_POWER4, + .write_pmc = pfm_power4_write_pmc, + .write_pmd = pfm_power4_write_pmd, + .read_pmd = pfm_power4_read_pmd, + .irq_handler = pfm_power4_irq_handler, + .get_ovfl_pmds = pfm_power4_get_ovfl_pmds, + .enable_counters = pfm_power4_enable_counters, + .disable_counters = pfm_power4_disable_counters, + .resend_irq = pfm_power4_resend_irq +}; + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_power4_pmu_conf = { + .pmu_name = "POWER4", + .counter_width = 31, + .pmd_desc = pfm_power4_pmd_desc, + .pmc_desc = pfm_power4_pmc_desc, + .num_pmc_entries = PFM_PM_NUM_PMCS, + .num_pmd_entries = PFM_PM_NUM_PMDS, + .probe_pmu = pfm_power4_probe_pmu, + .pmu_info = &pfm_power4_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_power4_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_power4_pmu_conf); +} + +static void __exit pfm_power4_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_power4_pmu_conf); +} + +module_init(pfm_power4_pmu_init_module); +module_exit(pfm_power4_pmu_cleanup_module); diff --git a/arch/powerpc/perfmon/perfmon_power5.c b/arch/powerpc/perfmon/perfmon_power5.c new file mode 100644 index 0000000..f4bb1ac --- /dev/null +++ b/arch/powerpc/perfmon/perfmon_power5.c @@ -0,0 +1,326 @@ +/* + * This file contains the POWER5 PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("David Gibson "); +MODULE_DESCRIPTION("POWER5 PMU description table"); +MODULE_LICENSE("GPL"); + +static struct pfm_regmap_desc pfm_power5_pmc_desc[] = { +/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0), +/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1), +/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA) +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power5_pmc_desc) + +/* The TB and PURR registers are read-only. Also, note that the TB register + * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers. + * For Perfmon2's purposes, we'll treat it as a single 64-bit register. 
+ */ +static struct pfm_regmap_desc pfm_power5_pmd_desc[] = { +/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL), +/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), +/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), +/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), +/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), +/* pmd5 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), +/* pmd6 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), +/* purr */ PMD_D((PFM_REG_I|PFM_REG_RO), "PURR", SPRN_PURR), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power5_pmd_desc) + +/* forward decl */ +static void pfm_power5_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set); + +static int pfm_power5_probe_pmu(void) +{ + unsigned long pvr = mfspr(SPRN_PVR); + + switch (PVR_VER(pvr)) { + case PV_POWER5: + return 0; + case PV_POWER5p: + return (PVR_REV(pvr) < 0x300) ? 0 : -1; + default: + return -1; + } +} + +static void pfm_power5_write_pmc(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case SPRN_MMCR0: + mtspr(SPRN_MMCR0, value); + break; + case SPRN_MMCR1: + mtspr(SPRN_MMCR1, value); + break; + case SPRN_MMCRA: + mtspr(SPRN_MMCRA, value); + break; + default: + BUG(); + } +} + +static void pfm_power5_write_pmd(unsigned int cnum, u64 value) +{ + u64 ovfl_mask = pfm_pmu_conf->ovfl_mask; + + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + mtspr(SPRN_PMC1, value & ovfl_mask); + break; + case SPRN_PMC2: + mtspr(SPRN_PMC2, value & ovfl_mask); + break; + case SPRN_PMC3: + mtspr(SPRN_PMC3, value & ovfl_mask); + break; + case SPRN_PMC4: + mtspr(SPRN_PMC4, value & ovfl_mask); + break; + case SPRN_PMC5: + mtspr(SPRN_PMC5, value & ovfl_mask); + break; + case SPRN_PMC6: + mtspr(SPRN_PMC6, value & ovfl_mask); + break; + case SPRN_TBRL: + case SPRN_PURR: + /* Ignore writes to read-only registers. */ + break; + default: + BUG(); + } +} + +static u64 pfm_power5_read_pmd(unsigned int cnum) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + return mfspr(SPRN_PMC1); + case SPRN_PMC2: + return mfspr(SPRN_PMC2); + case SPRN_PMC3: + return mfspr(SPRN_PMC3); + case SPRN_PMC4: + return mfspr(SPRN_PMC4); + case SPRN_PMC5: + return mfspr(SPRN_PMC5); + case SPRN_PMC6: + return mfspr(SPRN_PMC6); + case SPRN_TBRL: + return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); + case SPRN_PURR: + if (cpu_has_feature(CPU_FTR_PURR)) + return mfspr(SPRN_PURR); + else + return 0; + default: + BUG(); + } +} + +/** + * pfm_power5_enable_counters + * + **/ +static void pfm_power5_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max_pmc; + + /* + * Make sure the counters are disabled before touching the + * other control registers + */ + pfm_power5_disable_counters(ctx, set); + + max_pmc = ctx->regs.max_pmc; + + /* + * Write MMCR0 last, and a fairly easy way to do + * this is to write the registers in the reverse + * order + */ + for (i = max_pmc; i != 0; i--) + if (test_bit(i - 1, set->used_pmcs)) + pfm_power5_write_pmc(i - 1, set->pmcs[i - 1]); +} + +/** + * pfm_power5_disable_counters + * + * Just need to zero all the control registers. + **/ +static void pfm_power5_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* Set the Freeze Counters bit */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); + asm volatile ("sync"); +} + +/** + * pfm_power5_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. 
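With counter_width set to 31 further down, the width_mask test in the overflow scan amounts to checking bit 31 of the counter value; a minimal sketch with sample values:

/* Sketch, not part of the patch: a PMD that has counted past 2^31 - 1 has
 * bit 31 set and is reported as a pending overflow.
 */
static int example_pmd_overflowed(u64 pmd_val)
{
	u64 width_mask = 1ULL << 31;	/* counter_width == 31 */

	return (pmd_val & width_mask) != 0;	/* 0x80000003 -> 1, 0x7FFFFFFF -> 0 */
}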
+ **/ +static void pfm_power5_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i; + unsigned int max = ctx->regs.max_intr_pmd; + u64 *used_pmds = set->used_pmds; + u64 *intr_pmds = ctx->regs.intr_pmds; + u64 width_mask = 1 << pfm_pmu_conf->counter_width; + u64 new_val, mask[PFM_PMD_BV]; + + bitmap_and(cast_ulp(mask), cast_ulp(intr_pmds), + cast_ulp(used_pmds), max); + /* + * If either PMC5 or PMC6 are not being used, just zero out the unused + * ones so that they won't interrupt again for another 2^31 counts. + * Note that if no other counters overflowed, set->npend_ovfls will + * be zero upon returning from this call (i.e. a spurious + * interrupt), but that should be ok. + * + * If neither PMC5 nor PMC6 are used, the counters should be frozen + * via MMCR0_FC5_6 and zeroed out. + * + * If both PMC5 and PMC6 are used, they can be handled correctly by + * the loop that follows. + */ + + if (!test_bit(5, cast_ulp(used_pmds))) + mtspr(SPRN_PMC5, 0); + if (!test_bit(6, cast_ulp(used_pmds))) + mtspr(SPRN_PMC6, 0); + + for (i = 0; i < max; i++) { + if (test_bit(i, mask)) { + new_val = pfm_power5_read_pmd(i); + if (new_val & width_mask) { + set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_power5_irq_handler(struct pt_regs *regs, + struct pfm_context *ctx) +{ + u32 mmcr0; + + /* Disable the counters (set the freeze bit) to not polute + * the counts. + */ + mmcr0 = mfspr(SPRN_MMCR0); + mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC)); + + /* Set the PMM bit (see comment below). */ + mtmsrd(mfmsr() | MSR_PMM); + + pfm_interrupt_handler(instruction_pointer(regs), regs); + + mmcr0 = mfspr(SPRN_MMCR0); + + /* + * Reset the perfmon trigger if + * not in masking mode. + */ + if (ctx->state != PFM_CTX_MASKED) + mmcr0 |= MMCR0_PMXE; + + /* + * We must clear the PMAO bit on some (GQ) chips. Just do it + * all the time. + */ + mmcr0 &= ~MMCR0_PMAO; + + /* + * Now clear the freeze bit, counting will not start until we + * rfid from this exception, because only at that point will + * the PMM bit be cleared. + */ + mmcr0 &= ~MMCR0_FC; + mtspr(SPRN_MMCR0, mmcr0); +} + +static void pfm_power5_resend_irq(struct pfm_context *ctx) +{ + /* + * Assert the PMAO bit to cause a PMU interrupt. Make sure we + * trigger the edge detection circuitry for PMAO + */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO); +} + +struct pfm_arch_pmu_info pfm_power5_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_POWER5, + .write_pmc = pfm_power5_write_pmc, + .write_pmd = pfm_power5_write_pmd, + .read_pmd = pfm_power5_read_pmd, + .irq_handler = pfm_power5_irq_handler, + .get_ovfl_pmds = pfm_power5_get_ovfl_pmds, + .enable_counters = pfm_power5_enable_counters, + .disable_counters = pfm_power5_disable_counters, + .resend_irq = pfm_power5_resend_irq +}; + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! 
+ */ +static struct pfm_pmu_config pfm_power5_pmu_conf = { + .pmu_name = "POWER5", + .counter_width = 31, + .pmd_desc = pfm_power5_pmd_desc, + .pmc_desc = pfm_power5_pmc_desc, + .num_pmc_entries = PFM_PM_NUM_PMCS, + .num_pmd_entries = PFM_PM_NUM_PMDS, + .probe_pmu = pfm_power5_probe_pmu, + .pmu_info = &pfm_power5_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_power5_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_power5_pmu_conf); +} + +static void __exit pfm_power5_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_power5_pmu_conf); +} + +module_init(pfm_power5_pmu_init_module); +module_exit(pfm_power5_pmu_cleanup_module); diff --git a/arch/powerpc/perfmon/perfmon_power6.c b/arch/powerpc/perfmon/perfmon_power6.c new file mode 100644 index 0000000..7882feb --- /dev/null +++ b/arch/powerpc/perfmon/perfmon_power6.c @@ -0,0 +1,520 @@ +/* + * This file contains the POWER6 PMU register description tables + * and pmc checker used by perfmon.c. + * + * Copyright (c) 2007, IBM Corporation + * + * Based on perfmon_power5.c, and written by Carl Love + * and Kevin Corry . Some fixes and refinement by + * Corey Ashford + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include + +MODULE_AUTHOR("Corey Ashford "); +MODULE_DESCRIPTION("POWER6 PMU description table"); +MODULE_LICENSE("GPL"); + +static struct pfm_regmap_desc pfm_power6_pmc_desc[] = { +/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", MMCR0_FC, 0, 0, SPRN_MMCR0), +/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0, 0, 0, SPRN_MMCR1), +/* mmcra */ PMC_D(PFM_REG_I, "MMCRA", 0, 0, 0, SPRN_MMCRA) +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_power6_pmc_desc) +#define PFM_DELTA_TB 10000 /* Not a real registers */ +#define PFM_DELTA_PURR 10001 + +/* + * counters wrap to zero at transition from 2^32-1 to 2^32. Note: + * interrupt generated at transition from 2^31-1 to 2^31 + */ +#define OVERFLOW_VALUE 0x100000000UL + +/* The TB and PURR registers are read-only. Also, note that the TB register + * actually consists of both the 32-bit SPRN_TBRU and SPRN_TBRL registers. + * For Perfmon2's purposes, we'll treat it as a single 64-bit register. + */ +static struct pfm_regmap_desc pfm_power6_pmd_desc[] = { + /* On POWER 6 PMC5 and PMC6 are not writable, they do not + * generate interrupts, and do not qualify their counts + * based on problem mode, supervisor mode or hypervisor mode. + * These two counters are implemented as virtual counters + * to make the appear to work like the other counters. A + * kernel timer is used sample the real PMC5 and PMC6 and + * update the virtual counters. 
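The timer period chosen by the module init code further down can be made concrete with assumed numbers; the core clock and HZ below are assumptions for illustration only:

/* Worked example (assumed 4 GHz core clock, HZ == 250): MAX_EVENT_RATE is
 * taken as 2 * ppc_proc_freq = 8e9 events/s, so PMC5/6 can wrap no faster
 * than every 2^32 / 8e9 ~= 0.54 s.  Sampling at twice that rate gives
 *
 *	update_time = (HZ * OVERFLOW_VALUE / MAX_EVENT_RATE) / 2
 *	            = (250 * 4294967296 / 8000000000) / 2 ~= 67 jiffies,
 *
 * i.e. the virtual counters are refreshed roughly every quarter second.
 */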
+ */ +/* tb */ PMD_D((PFM_REG_I|PFM_REG_RO), "TB", SPRN_TBRL), +/* pmd1 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), +/* pmd2 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), +/* pmd3 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), +/* pmd4 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), +/* pmd5 */ PMD_D((PFM_REG_I|PFM_REG_V), "PMC5", SPRN_PMC5), +/* pmd6 */ PMD_D((PFM_REG_I|PFM_REG_V), "PMC6", SPRN_PMC6), +/* purr */ PMD_D((PFM_REG_I|PFM_REG_RO), "PURR", SPRN_PURR), +/* delta purr */ PMD_D((PFM_REG_I|PFM_REG_V), "DELTA_TB", PFM_DELTA_TB), +/* delta tb */ PMD_D((PFM_REG_I|PFM_REG_V), "DELTA_PURR", PFM_DELTA_PURR), +}; + +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_power6_pmd_desc) + +u32 pmc5_start_save[NR_CPUS]; +u32 pmc6_start_save[NR_CPUS]; + +static struct timer_list pmc5_6_update[NR_CPUS]; +u64 enable_cntrs_cnt; +u64 disable_cntrs_cnt; +u64 call_delta; +u64 pm5_6_interrupt; +u64 pm1_4_interrupt; +/* need ctx_arch for kernel timer. Can't get it in context of the kernel + * timer. + */ +struct pfm_arch_context *pmc5_6_ctx_arch[NR_CPUS]; +long int update_time; + +static void delta(int cpu_num, struct pfm_arch_context *ctx_arch) +{ + u32 tmp5, tmp6; + + call_delta++; + + tmp5 = (u32) mfspr(SPRN_PMC5); + tmp6 = (u32) mfspr(SPRN_PMC6); + + /* + * The following difference calculation relies on 32-bit modular + * arithmetic for the deltas to come out correct (especially in the + * presence of a 32-bit counter wrap). + */ + ctx_arch->powergs_pmc5 += (u64)(tmp5 - pmc5_start_save[cpu_num]); + ctx_arch->powergs_pmc6 += (u64)(tmp6 - pmc6_start_save[cpu_num]); + + pmc5_start_save[cpu_num] = tmp5; + pmc6_start_save[cpu_num] = tmp6; + + return; +} + + +static void pmc5_6_updater(unsigned long cpu_num) +{ + /* update the virtual pmd 5 and pmd 6 counters */ + + delta(cpu_num, pmc5_6_ctx_arch[cpu_num]); + mod_timer(&pmc5_6_update[cpu_num], jiffies + update_time); +} + + +static int pfm_power6_probe_pmu(void) +{ + unsigned long pvr = mfspr(SPRN_PVR); + + switch (PVR_VER(pvr)) { + case PV_POWER6: + return 0; + case PV_POWER5p: + /* If this is a POWER5+ and the revision is less than 0x300, + don't treat it as a POWER6. */ + return (PVR_REV(pvr) < 0x300) ? -1 : 0; + default: + return -1; + } +} + +static void pfm_power6_write_pmc(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case SPRN_MMCR0: + mtspr(SPRN_MMCR0, value); + break; + case SPRN_MMCR1: + mtspr(SPRN_MMCR1, value); + break; + case SPRN_MMCRA: + mtspr(SPRN_MMCRA, value); + break; + default: + BUG(); + } +} + +static void pfm_power6_write_pmd(unsigned int cnum, u64 value) +{ + /* On POWER 6 PMC5 and PMC6 are implemented as + * virtual counters. See comment in pfm_power6_pmd_desc + * definition. + */ + u64 ovfl_mask = pfm_pmu_conf->ovfl_mask; + + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + mtspr(SPRN_PMC1, value & ovfl_mask); + break; + case SPRN_PMC2: + mtspr(SPRN_PMC2, value & ovfl_mask); + break; + case SPRN_PMC3: + mtspr(SPRN_PMC3, value & ovfl_mask); + break; + case SPRN_PMC4: + mtspr(SPRN_PMC4, value & ovfl_mask); + break; + case SPRN_TBRL: + case SPRN_PURR: + /* Ignore writes to read-only registers. */ + break; + default: + BUG(); + } +} + +static u64 pfm_power6_sread(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); + int cpu_num = smp_processor_id(); + + /* On POWER 6 PMC5 and PMC6 are implemented as + * virtual counters. See comment in pfm_power6_pmd_desc + * definition. 
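The wrap handling that delta() above relies on is plain 32-bit modular arithmetic; a sketch with sample values:

/* Sketch, not part of the patch: because the subtraction is done in 32 bits,
 * a counter that wrapped still yields the right increment, e.g.
 * prev = 0xFFFFFFF0 and now = 0x00000010 gives 0x20.
 */
static u64 example_pmc_delta(u32 prev, u32 now)
{
	return (u64)(u32)(now - prev);
}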
+ */ + + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC5: + return ctx_arch->powergs_pmc5 + (u64)((u32)mfspr(SPRN_PMC5) - pmc5_start_save[cpu_num]); + break; + + case SPRN_PMC6: + return ctx_arch->powergs_pmc6 + (u64)((u32)mfspr(SPRN_PMC6) - pmc6_start_save[cpu_num]); + break; + + case PFM_DELTA_TB: + return ctx_arch->delta_tb + + (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL)) + - ctx_arch->delta_tb_start; + break; + + case PFM_DELTA_PURR: + return ctx_arch->delta_purr + + mfspr(SPRN_PURR) + - ctx_arch->delta_purr_start; + break; + + default: + BUG(); + } +} + +void pfm_power6_swrite(struct pfm_context *ctx, unsigned int cnum, + u64 val) +{ + struct pfm_arch_context *ctx_arch = pfm_ctx_arch(ctx); + int cpu_num = smp_processor_id(); + + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC5: + pmc5_start_save[cpu_num] = mfspr(SPRN_PMC5); + ctx_arch->powergs_pmc5 = val; + break; + + case SPRN_PMC6: + pmc6_start_save[cpu_num] = mfspr(SPRN_PMC6); + ctx_arch->powergs_pmc6 = val; + break; + + case PFM_DELTA_TB: + ctx_arch->delta_tb_start = + (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL)); + ctx_arch->delta_tb = val; + break; + + case PFM_DELTA_PURR: + ctx_arch->delta_purr_start = mfspr(SPRN_PURR); + ctx_arch->delta_purr = val; + break; + + default: + BUG(); + } +} + +static u64 pfm_power6_read_pmd(unsigned int cnum) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + return mfspr(SPRN_PMC1); + case SPRN_PMC2: + return mfspr(SPRN_PMC2); + case SPRN_PMC3: + return mfspr(SPRN_PMC3); + case SPRN_PMC4: + return mfspr(SPRN_PMC4); + case SPRN_TBRL: + return ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); + case SPRN_PURR: + if (cpu_has_feature(CPU_FTR_PURR)) + return mfspr(SPRN_PURR); + else + return 0; + default: + BUG(); + } +} + + +/** + * pfm_power6_enable_counters + * + **/ +static void pfm_power6_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + + unsigned int i, max_pmc; + int cpu_num = smp_processor_id(); + struct pfm_arch_context *ctx_arch; + + enable_cntrs_cnt++; + + /* need the ctx passed down to the routine */ + ctx_arch = pfm_ctx_arch(ctx); + max_pmc = ctx->regs.max_pmc; + + /* Write MMCR0 last, and a fairly easy way to do this is to write + the registers in the reverse order */ + for (i = max_pmc; i != 0; i--) + if (test_bit(i - 1, set->used_pmcs)) + pfm_power6_write_pmc(i - 1, set->pmcs[i - 1]); + + /* save current free running HW event count */ + pmc5_start_save[cpu_num] = mfspr(SPRN_PMC5); + pmc6_start_save[cpu_num] = mfspr(SPRN_PMC6); + + ctx_arch->delta_purr_start = mfspr(SPRN_PURR); + + if (cpu_has_feature(CPU_FTR_PURR)) + ctx_arch->delta_tb_start = + ((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL); + else + ctx_arch->delta_tb_start = 0; + + /* Start kernel timer for this cpu to periodically update + * the virtual counters. + */ + init_timer(&pmc5_6_update[cpu_num]); + pmc5_6_update[cpu_num].function = pmc5_6_updater; + pmc5_6_update[cpu_num].data = (unsigned long) cpu_num; + pmc5_6_update[cpu_num].expires = jiffies + update_time; + /* context for this timer, timer will be removed if context + * is switched because the counters will be stopped first. + * NEEDS WORK, I think this is all ok, a little concerned about a + * race between the kernel timer going off right as the counters + * are being stopped and the context switching. Need to think + * about this. 
+ */ + pmc5_6_ctx_arch[cpu_num] = ctx_arch; + add_timer(&pmc5_6_update[cpu_num]); +} + +/** + * pfm_power6_disable_counters + * + **/ +static void pfm_power6_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + int cpu_num = smp_processor_id(); + + disable_cntrs_cnt++; + + /* Set the Freeze Counters bit */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); + asm volatile ("sync"); + + /* delete kernel update timer */ + del_timer_sync(&pmc5_6_update[cpu_num]); + + /* Update the virtual pmd 5 and 6 counters from the free running + * HW counters + */ + ctx_arch = pfm_ctx_arch(ctx); + delta(cpu_num, ctx_arch); + + ctx_arch->delta_tb += + (((u64)mfspr(SPRN_TBRU) << 32) | mfspr(SPRN_TBRL)) + - ctx_arch->delta_tb_start; + + ctx_arch->delta_purr += mfspr(SPRN_PURR) + - ctx_arch->delta_purr_start; +} + +/** + * pfm_power6_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. + **/ +static void pfm_power6_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i; + unsigned int first_intr_pmd = ctx->regs.first_intr_pmd; + unsigned int max_intr_pmd = ctx->regs.max_intr_pmd; + u64 *used_pmds = set->used_pmds; + u64 *cntr_pmds = ctx->regs.cnt_pmds; + u64 width_mask = 1 << pfm_pmu_conf->counter_width; + u64 new_val, mask[PFM_PMD_BV]; + + bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), cast_ulp(used_pmds), max_intr_pmd); + + /* max_intr_pmd is actually the last interrupting pmd register + 1 */ + for (i = first_intr_pmd; i < max_intr_pmd; i++) { + if (test_bit(i, mask)) { + new_val = pfm_power6_read_pmd(i); + if (new_val & width_mask) { + set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_power6_irq_handler(struct pt_regs *regs, + struct pfm_context *ctx) +{ + u32 mmcr0; + u64 mmcra; + + /* Disable the counters (set the freeze bit) to not polute + * the counts. + */ + mmcr0 = mfspr(SPRN_MMCR0); + mtspr(SPRN_MMCR0, (mmcr0 | MMCR0_FC)); + mmcra = mfspr(SPRN_MMCRA); + + /* Set the PMM bit (see comment below). */ + mtmsrd(mfmsr() | MSR_PMM); + + pm1_4_interrupt++; + + pfm_interrupt_handler(instruction_pointer(regs), regs); + + mmcr0 = mfspr(SPRN_MMCR0); + + /* + * Reset the perfmon trigger if + * not in masking mode. + */ + if (ctx->state != PFM_CTX_MASKED) + mmcr0 |= MMCR0_PMXE; + + /* + * Clear the PMU Alert Occurred bit + */ + mmcr0 &= ~MMCR0_PMAO; + + /* Clear the appropriate bits in the MMCRA. */ + mmcra &= ~(POWER6_MMCRA_THRM | POWER6_MMCRA_OTHER); + mtspr(SPRN_MMCRA, mmcra); + + /* + * Now clear the freeze bit, counting will not start until we + * rfid from this exception, because only at that point will + * the PMM bit be cleared. + */ + mmcr0 &= ~MMCR0_FC; + mtspr(SPRN_MMCR0, mmcr0); +} + +static void pfm_power6_resend_irq(struct pfm_context *ctx) +{ + /* + * Assert the PMAO bit to cause a PMU interrupt. 
Make sure we + * trigger the edge detection circuitry for PMAO + */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_PMAO); + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_PMAO); +} + +struct pfm_arch_pmu_info pfm_power6_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_POWER6, + .write_pmc = pfm_power6_write_pmc, + .write_pmd = pfm_power6_write_pmd, + .read_pmd = pfm_power6_read_pmd, + .irq_handler = pfm_power6_irq_handler, + .get_ovfl_pmds = pfm_power6_get_ovfl_pmds, + .enable_counters = pfm_power6_enable_counters, + .disable_counters = pfm_power6_disable_counters, + .resend_irq = pfm_power6_resend_irq +}; + +/* + * impl_pmcs, impl_pmds are computed at runtime to minimize errors! + */ +static struct pfm_pmu_config pfm_power6_pmu_conf = { + .pmu_name = "POWER6", + .counter_width = 31, + .pmd_desc = pfm_power6_pmd_desc, + .pmc_desc = pfm_power6_pmc_desc, + .num_pmc_entries = PFM_PM_NUM_PMCS, + .num_pmd_entries = PFM_PM_NUM_PMDS, + .probe_pmu = pfm_power6_probe_pmu, + .pmu_info = &pfm_power6_pmu_info, + .pmd_sread = pfm_power6_sread, + .pmd_swrite = pfm_power6_swrite, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int __init pfm_power6_pmu_init_module(void) +{ + int ret; + disable_cntrs_cnt = 0; + enable_cntrs_cnt = 0; + call_delta = 0; + pm5_6_interrupt = 0; + pm1_4_interrupt = 0; + + /* calculate the time for updating counters 5 and 6 */ + + /* + * MAX_EVENT_RATE assumes a max instruction issue rate of 2 + * instructions per clock cycle. Experience shows that this factor + * of 2 is more than adequate. + */ + +# define MAX_EVENT_RATE (ppc_proc_freq * 2) + + /* + * Calculate the time, in jiffies, it takes for event counter 5 or + * 6 to completely wrap when counting at the max event rate, and + * then figure on sampling at twice that rate. + */ + update_time = (((unsigned long)HZ * OVERFLOW_VALUE) + / ((unsigned long)MAX_EVENT_RATE)) / 2; + + ret = pfm_pmu_register(&pfm_power6_pmu_conf); + return ret; +} + +static void __exit pfm_power6_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_power6_pmu_conf); +} + +module_init(pfm_power6_pmu_init_module); +module_exit(pfm_power6_pmu_cleanup_module); diff --git a/arch/powerpc/perfmon/perfmon_ppc32.c b/arch/powerpc/perfmon/perfmon_ppc32.c new file mode 100644 index 0000000..76f0b84 --- /dev/null +++ b/arch/powerpc/perfmon/perfmon_ppc32.c @@ -0,0 +1,340 @@ +/* + * This file contains the PPC32 PMU register description tables + * and pmc checker used by perfmon.c. + * + * Philip Mucci, mucci@cs.utk.edu + * + * Based on code from: + * Copyright (c) 2005 David Gibson, IBM Corporation. + * + * Based on perfmon_p6.c: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +MODULE_AUTHOR("Philip Mucci "); +MODULE_DESCRIPTION("PPC32 PMU description table"); +MODULE_LICENSE("GPL"); + +static struct pfm_pmu_config pfm_ppc32_pmu_conf; + +static struct pfm_regmap_desc pfm_ppc32_pmc_desc[] = { +/* mmcr0 */ PMC_D(PFM_REG_I, "MMCR0", 0x0, 0, 0, SPRN_MMCR0), +/* mmcr1 */ PMC_D(PFM_REG_I, "MMCR1", 0x0, 0, 0, SPRN_MMCR1), +/* mmcr2 */ PMC_D(PFM_REG_I, "MMCR2", 0x0, 0, 0, SPRN_MMCR2), +}; +#define PFM_PM_NUM_PMCS ARRAY_SIZE(pfm_ppc32_pmc_desc) + +static struct pfm_regmap_desc pfm_ppc32_pmd_desc[] = { +/* pmd0 */ PMD_D(PFM_REG_C, "PMC1", SPRN_PMC1), +/* pmd1 */ PMD_D(PFM_REG_C, "PMC2", SPRN_PMC2), +/* pmd2 */ PMD_D(PFM_REG_C, "PMC3", SPRN_PMC3), +/* pmd3 */ PMD_D(PFM_REG_C, "PMC4", SPRN_PMC4), +/* pmd4 */ PMD_D(PFM_REG_C, "PMC5", SPRN_PMC5), +/* pmd5 */ PMD_D(PFM_REG_C, "PMC6", SPRN_PMC6), +}; +#define PFM_PM_NUM_PMDS ARRAY_SIZE(pfm_ppc32_pmd_desc) + +static void perfmon_perf_irq(struct pt_regs *regs) +{ + u32 mmcr0; + + /* BLATANTLY STOLEN FROM OPROFILE, then modified */ + + /* set the PMM bit (see comment below) */ + mtmsr(mfmsr() | MSR_PMM); + + pfm_interrupt_handler(instruction_pointer(regs), regs); + + /* The freeze bit was set by the interrupt. + * Clear the freeze bit, and reenable the interrupt. + * The counters won't actually start until the rfi clears + * the PMM bit. + */ + + /* Unfreezes the counters on this CPU, enables the interrupt, + * enables the counters to trigger the interrupt, and sets the + * counters to only count when the mark bit is not set. + */ + mmcr0 = mfspr(SPRN_MMCR0); + + mmcr0 &= ~(MMCR0_FC | MMCR0_FCM0); + mmcr0 |= (MMCR0_FCECE | MMCR0_PMC1CE | MMCR0_PMCnCE | MMCR0_PMXE); + + mtspr(SPRN_MMCR0, mmcr0); +} + +static int pfm_ppc32_probe_pmu(void) +{ + enum ppc32_pmu_type pm_type; + int nmmcr = 0, npmds = 0, intsok = 0, i; + unsigned int pvr; + char *str; + + pvr = mfspr(SPRN_PVR); + + switch (PVR_VER(pvr)) { + case 0x0004: /* 604 */ + str = "PPC604"; + pm_type = PFM_POWERPC_PMU_604; + nmmcr = 1; + npmds = 2; + break; + case 0x0009: /* 604e; */ + case 0x000A: /* 604ev */ + str = "PPC604e"; + pm_type = PFM_POWERPC_PMU_604e; + nmmcr = 2; + npmds = 4; + break; + case 0x0008: /* 750/740 */ + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + break; + case 0x7000: /* 750FX */ + case 0x7001: + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + if ((pvr & 0xFF0F) >= 0x0203) + intsok = 1; + break; + case 0x7002: /* 750GX */ + str = "PPC750"; + pm_type = PFM_POWERPC_PMU_750; + nmmcr = 2; + npmds = 4; + intsok = 1; + case 0x000C: /* 7400 */ + str = "PPC7400"; + pm_type = PFM_POWERPC_PMU_7400; + nmmcr = 3; + npmds = 4; + break; + case 0x800C: /* 7410 */ + str = "PPC7410"; + pm_type = PFM_POWERPC_PMU_7400; + nmmcr = 3; + npmds = 4; + if ((pvr & 0xFFFF) >= 0x01103) + intsok = 1; + break; + case 0x8000: /* 7451/7441 */ + case 0x8001: /* 7455/7445 */ + case 0x8002: /* 7457/7447 */ + case 0x8003: /* 7447A */ + case 0x8004: /* 7448 */ + str = "PPC7450"; + pm_type = PFM_POWERPC_PMU_7450; + nmmcr = 3; npmds = 6; + intsok = 1; + break; + default: + PFM_INFO("Unknown PVR_VER(0x%x)\n", PVR_VER(pvr)); + return -1; + } + + /* + * deconfigure unimplemented registers + */ + for (i = npmds; i < PFM_PM_NUM_PMDS; i++) + pfm_ppc32_pmd_desc[i].type = PFM_REG_NA; + + for 
(i = nmmcr; i < PFM_PM_NUM_PMCS; i++) + pfm_ppc32_pmc_desc[i].type = PFM_REG_NA; + + /* + * update PMU description structure + */ + pfm_ppc32_pmu_conf.pmu_name = str; + pfm_ppc32_pmu_info.pmu_style = pm_type; + pfm_ppc32_pmu_conf.num_pmc_entries = nmmcr; + pfm_ppc32_pmu_conf.num_pmd_entries = npmds; + + if (intsok == 0) + PFM_INFO("Interrupts unlikely to work\n"); + + return reserve_pmc_hardware(perfmon_perf_irq); +} + +static void pfm_ppc32_write_pmc(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case SPRN_MMCR0: + mtspr(SPRN_MMCR0, value); + break; + case SPRN_MMCR1: + mtspr(SPRN_MMCR1, value); + break; + case SPRN_MMCR2: + mtspr(SPRN_MMCR2, value); + break; + default: + BUG(); + } +} + +static void pfm_ppc32_write_pmd(unsigned int cnum, u64 value) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + mtspr(SPRN_PMC1, value); + break; + case SPRN_PMC2: + mtspr(SPRN_PMC2, value); + break; + case SPRN_PMC3: + mtspr(SPRN_PMC3, value); + break; + case SPRN_PMC4: + mtspr(SPRN_PMC4, value); + break; + case SPRN_PMC5: + mtspr(SPRN_PMC5, value); + break; + case SPRN_PMC6: + mtspr(SPRN_PMC6, value); + break; + default: + BUG(); + } +} + +static u64 pfm_ppc32_read_pmd(unsigned int cnum) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case SPRN_PMC1: + return mfspr(SPRN_PMC1); + case SPRN_PMC2: + return mfspr(SPRN_PMC2); + case SPRN_PMC3: + return mfspr(SPRN_PMC3); + case SPRN_PMC4: + return mfspr(SPRN_PMC4); + case SPRN_PMC5: + return mfspr(SPRN_PMC5); + case SPRN_PMC6: + return mfspr(SPRN_PMC6); + default: + BUG(); + } +} + +/** + * pfm_ppc32_enable_counters + * + * Just need to load the current values into the control registers. + **/ +static void pfm_ppc32_enable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max_pmc; + + max_pmc = pfm_pmu_conf->regs.max_pmc; + + for (i = 0; i < max_pmc; i++) + if (test_bit(i, set->used_pmcs)) + pfm_ppc32_write_pmc(i, set->pmcs[i]); +} + +/** + * pfm_ppc32_disable_counters + * + * Just need to zero all the control registers. + **/ +static void pfm_ppc32_disable_counters(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max; + + max = pfm_pmu_conf->regs.max_pmc; + + for (i = 0; i < max; i++) + if (test_bit(i, set->used_pmcs)) + pfm_ppc32_write_pmc(ctx, 0); +} + +/** + * pfm_ppc32_get_ovfl_pmds + * + * Determine which counters in this set have overflowed and fill in the + * set->povfl_pmds mask and set->npend_ovfls count. 
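The register-trimming loops in pfm_ppc32_probe_pmu() above, specialized by hand for one core as an illustration (the PPC750 case, where nmmcr == 2 and npmds == 4):

/* Sketch, not part of the patch: on a PPC750 the probe marks the registers
 * the core does not implement as PFM_REG_NA, so perfmon never exposes them.
 */
static void example_trim_for_ppc750(void)
{
	pfm_ppc32_pmc_desc[2].type = PFM_REG_NA;	/* MMCR2 unimplemented */
	pfm_ppc32_pmd_desc[4].type = PFM_REG_NA;	/* PMC5 unimplemented */
	pfm_ppc32_pmd_desc[5].type = PFM_REG_NA;	/* PMC6 unimplemented */
}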
+ **/ +static void pfm_ppc32_get_ovfl_pmds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i; + unsigned int max_pmd = pfm_pmu_conf->regs.max_cnt_pmd; + u64 *used_pmds = set->used_pmds; + u64 *cntr_pmds = pfm_pmu_conf->regs.cnt_pmds; + u64 width_mask = 1 << pfm_pmu_conf->counter_width; + u64 new_val, mask[PFM_PMD_BV]; + + bitmap_and(cast_ulp(mask), cast_ulp(cntr_pmds), + cast_ulp(used_pmds), max_pmd); + + for (i = 0; i < max_pmd; i++) { + if (test_bit(i, mask)) { + new_val = pfm_ppc32_read_pmd(i); + if (new_val & width_mask) { + set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +struct pfm_arch_pmu_info pfm_ppc32_pmu_info = { + .pmu_style = PFM_POWERPC_PMU_NONE, + .write_pmc = pfm_ppc32_write_pmc, + .write_pmd = pfm_ppc32_write_pmd, + .read_pmd = pfm_ppc32_read_pmd, + .get_ovfl_pmds = pfm_ppc32_get_ovfl_pmds, + .enable_counters = pfm_ppc32_enable_counters, + .disable_counters = pfm_ppc32_disable_counters, +}; + +static struct pfm_pmu_config pfm_ppc32_pmu_conf = { + .counter_width = 31, + .pmd_desc = pfm_ppc32_pmd_desc, + .pmc_desc = pfm_ppc32_pmc_desc, + .probe_pmu = pfm_ppc32_probe_pmu, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .version = "0.1", + .arch_info = &pfm_ppc32_pmu_info, +}; + +static int __init pfm_ppc32_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_ppc32_pmu_conf); +} + +static void __exit pfm_ppc32_pmu_cleanup_module(void) +{ + release_pmc_hardware(); + pfm_pmu_unregister(&pfm_ppc32_pmu_conf); +} + +module_init(pfm_ppc32_pmu_init_module); +module_exit(pfm_ppc32_pmu_cleanup_module); diff --git a/arch/powerpc/platforms/cell/cbe_regs.c b/arch/powerpc/platforms/cell/cbe_regs.c index dbc338f..e24320e 100644 --- a/arch/powerpc/platforms/cell/cbe_regs.c +++ b/arch/powerpc/platforms/cell/cbe_regs.c @@ -33,6 +33,7 @@ static struct cbe_regs_map struct cbe_iic_regs __iomem *iic_regs; struct cbe_mic_tm_regs __iomem *mic_tm_regs; struct cbe_pmd_shadow_regs pmd_shadow_regs; + struct cbe_ppe_priv_regs __iomem *ppe_priv_regs; } cbe_regs_maps[MAX_CBE]; static int cbe_regs_map_count; @@ -145,6 +146,23 @@ struct cbe_mic_tm_regs __iomem *cbe_get_cpu_mic_tm_regs(int cpu) } EXPORT_SYMBOL_GPL(cbe_get_cpu_mic_tm_regs); +struct cbe_ppe_priv_regs __iomem *cbe_get_ppe_priv_regs(struct device_node *np) +{ + struct cbe_regs_map *map = cbe_find_map(np); + if (map == NULL) + return NULL; + return map->ppe_priv_regs; +} + +struct cbe_ppe_priv_regs __iomem *cbe_get_cpu_ppe_priv_regs(int cpu) +{ + struct cbe_regs_map *map = cbe_thread_map[cpu].regs; + if (map == NULL) + return NULL; + return map->ppe_priv_regs; +} +EXPORT_SYMBOL_GPL(cbe_get_cpu_ppe_priv_regs); + u32 cbe_get_hw_thread_id(int cpu) { return cbe_thread_map[cpu].thread_id; @@ -206,6 +224,11 @@ void __init cbe_fill_regs_map(struct cbe_regs_map *map) for_each_node_by_type(np, "mic-tm") if (of_get_parent(np) == be) map->mic_tm_regs = of_iomap(np, 0); + + for_each_node_by_type(np, "ppe-mmio") + if (of_get_parent(np) == be) + map->ppe_priv_regs = of_iomap(np, 0); + } else { struct device_node *cpu; /* That hack must die die die ! 
*/ @@ -227,6 +250,10 @@ void __init cbe_fill_regs_map(struct cbe_regs_map *map) prop = of_get_property(cpu, "mic-tm", NULL); if (prop != NULL) map->mic_tm_regs = ioremap(prop->address, prop->len); + + prop = of_get_property(cpu, "ppe-mmio", NULL); + if (prop != NULL) + map->ppe_priv_regs = ioremap(prop->address, prop->len); } } diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h index 109ae24..bafe5a6 100644 --- a/arch/sparc/include/asm/hypervisor.h +++ b/arch/sparc/include/asm/hypervisor.h @@ -2713,6 +2713,30 @@ extern unsigned long sun4v_ldc_revoke(unsigned long channel, */ #define HV_FAST_SET_PERFREG 0x101 +#define HV_N2_PERF_SPARC_CTL 0x0 +#define HV_N2_PERF_DRAM_CTL0 0x1 +#define HV_N2_PERF_DRAM_CNT0 0x2 +#define HV_N2_PERF_DRAM_CTL1 0x3 +#define HV_N2_PERF_DRAM_CNT1 0x4 +#define HV_N2_PERF_DRAM_CTL2 0x5 +#define HV_N2_PERF_DRAM_CNT2 0x6 +#define HV_N2_PERF_DRAM_CTL3 0x7 +#define HV_N2_PERF_DRAM_CNT3 0x8 + +#define HV_FAST_N2_GET_PERFREG 0x104 +#define HV_FAST_N2_SET_PERFREG 0x105 + +#ifndef __ASSEMBLY__ +extern unsigned long sun4v_niagara_getperf(unsigned long reg, + unsigned long *val); +extern unsigned long sun4v_niagara_setperf(unsigned long reg, + unsigned long val); +extern unsigned long sun4v_niagara2_getperf(unsigned long reg, + unsigned long *val); +extern unsigned long sun4v_niagara2_setperf(unsigned long reg, + unsigned long val); +#endif + /* MMU statistics services. * * The hypervisor maintains MMU statistics and privileged code provides diff --git a/arch/sparc/include/asm/irq_64.h b/arch/sparc/include/asm/irq_64.h index e3dd930..6cf3aec 100644 --- a/arch/sparc/include/asm/irq_64.h +++ b/arch/sparc/include/asm/irq_64.h @@ -67,6 +67,9 @@ extern void virt_irq_free(unsigned int virt_irq); extern void __init init_IRQ(void); extern void fixup_irqs(void); +extern int register_perfctr_intr(void (*handler)(struct pt_regs *)); +extern void release_perfctr_intr(void (*handler)(struct pt_regs *)); + static inline void set_softint(unsigned long bits) { __asm__ __volatile__("wr %0, 0x0, %%set_softint" diff --git a/arch/sparc/include/asm/perfmon.h b/arch/sparc/include/asm/perfmon.h new file mode 100644 index 0000000..f20cbfa --- /dev/null +++ b/arch/sparc/include/asm/perfmon.h @@ -0,0 +1,11 @@ +#ifndef _SPARC64_PERFMON_H_ +#define _SPARC64_PERFMON_H_ + +/* + * arch-specific user visible interface definitions + */ + +#define PFM_ARCH_MAX_PMCS 2 +#define PFM_ARCH_MAX_PMDS 3 + +#endif /* _SPARC64_PERFMON_H_ */ diff --git a/arch/sparc/include/asm/perfmon_kern.h b/arch/sparc/include/asm/perfmon_kern.h new file mode 100644 index 0000000..033eff5 --- /dev/null +++ b/arch/sparc/include/asm/perfmon_kern.h @@ -0,0 +1,286 @@ +#ifndef _SPARC64_PERFMON_KERN_H_ +#define _SPARC64_PERFMON_KERN_H_ + +#ifdef __KERNEL__ + +#ifdef CONFIG_PERFMON + +#include +#include + +#define PFM_ARCH_PMD_STK_ARG 2 +#define PFM_ARCH_PMC_STK_ARG 1 + +struct pfm_arch_pmu_info { + u32 pmu_style; +}; + +static inline void pfm_arch_resend_irq(struct pfm_context *ctx) +{ +} + +static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_serialize(void) +{ +} + +/* + * SPARC does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus + * this routine needs to do it when switching sets on overflow + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +extern void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, 
u64 value); +extern u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum); + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + u64 pic; + + value &= pfm_pmu_conf->ovfl_mask; + + read_pic(pic); + + switch (cnum) { + case 0: + pic = (pic & 0xffffffff00000000UL) | + (value & 0xffffffffUL); + break; + case 1: + pic = (pic & 0xffffffffUL) | + (value << 32UL); + break; + default: + BUG(); + } + + write_pic(pic); +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, + unsigned int cnum) +{ + u64 pic; + + read_pic(pic); + + switch (cnum) { + case 0: + return pic & 0xffffffffUL; + case 1: + return pic >> 32UL; + default: + BUG(); + return 0; + } +} + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, + unsigned int cnum) +{ + u64 val = pfm_arch_read_pmd(ctx, cnum); + + /* This masks out overflow bit 31 */ + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in ctx.flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function must be used to determine by + * an arch-specific mean if monitoring is actually started/stopped. + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx) +{ +} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx) +{ +} + +static inline void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx) +{ +} + +int pfm_arch_is_monitoring_active(struct pfm_context *ctx); +int pfm_arch_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +char *pfm_arch_get_pmu_module_name(void); + +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_stop(current, ctx); + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_switch_sets_from_intr() on + * pfm_arch_restore_pmcs() + */ + ctx->flags.started = 0; +} + +/* + * unfreeze PMU from pfm_do_interrupt_handler() + * ctx may be NULL for spurious + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + if (!ctx) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On SPARC, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. 
Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. + */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ +} + +/* + * on MIPS masking/unmasking uses the start/stop mechanism, so we simply + * need to start here. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx); +} + +static inline void pfm_arch_pmu_config_remove(void) +{ +} + +static inline int pfm_arch_context_create(struct pfm_context *ctx, + u32 ctx_flags) +{ + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{ +} + +/* + * function called from pfm_setfl_sane(). Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline void pfm_arch_init_percpu(void) +{ +} + +static inline int pfm_arch_load_context(struct pfm_context *ctx) +{ + return 0; +} + +static inline void pfm_arch_unload_context(struct pfm_context *ctx) +{} + +extern void perfmon_interrupt(struct pt_regs *); + +static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) +{ + return register_perfctr_intr(perfmon_interrupt); +} + +static inline void pfm_arch_pmu_release(void) +{ + release_perfctr_intr(perfmon_interrupt); +} + +static inline void pfm_arch_arm_handle_work(struct task_struct *task) +{} + +static inline void pfm_arch_disarm_handle_work(struct task_struct *task) +{} + +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} + +static inline int pfm_arch_get_base_syscall(void) +{ + return __NR_pfm_create_context; +} + +struct pfm_arch_context { + /* empty */ +}; + +#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) +/* + * SPARC needs extra alignment for the sampling buffer + */ +#define PFM_ARCH_SMPL_ALIGN_SIZE (16 * 1024) + +static inline void pfm_cacheflush(void *addr, unsigned int len) +{ +} + +#endif /* CONFIG_PERFMON */ + +#endif /* __KERNEL__ */ + +#endif /* _SPARC64_PERFMON_KERN_H_ */ diff --git a/arch/sparc/include/asm/system_64.h b/arch/sparc/include/asm/system_64.h index db9e742..2a9ddb9 100644 --- a/arch/sparc/include/asm/system_64.h +++ b/arch/sparc/include/asm/system_64.h @@ -30,6 +30,9 @@ enum sparc_cpu { #define ARCH_SUN4C_SUN4 0 #define ARCH_SUN4 0 +extern char *sparc_cpu_type; +extern char *sparc_fpu_type; +extern char *sparc_pmu_type; extern char reboot_command[]; /* These are here in an effort to more fully work around Spitfire Errata @@ -104,15 +107,13 @@ do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ #define write_pcr(__p) __asm__ __volatile__("wr %0, 0x0, %%pcr" : : "r" (__p)) #define read_pic(__p) __asm__ __volatile__("rd %%pic, %0" : "=r" (__p)) -/* Blackbird errata workaround. See commentary in - * arch/sparc64/kernel/smp.c:smp_percpu_timer_interrupt() - * for more information. - */ -#define reset_pic() \ - __asm__ __volatile__("ba,pt %xcc, 99f\n\t" \ +/* Blackbird errata workaround. 
*/ +#define write_pic(val) \ + __asm__ __volatile__("ba,pt %%xcc, 99f\n\t" \ ".align 64\n" \ - "99:wr %g0, 0x0, %pic\n\t" \ - "rd %pic, %g0") + "99:wr %0, 0x0, %%pic\n\t" \ + "rd %%pic, %%g0" : : "r" (val)) +#define reset_pic() write_pic(0) #ifndef __ASSEMBLY__ @@ -145,14 +146,10 @@ do { \ * and 2 stores in this critical code path. -DaveM */ #define switch_to(prev, next, last) \ -do { if (test_thread_flag(TIF_PERFCTR)) { \ - unsigned long __tmp; \ - read_pcr(__tmp); \ - current_thread_info()->pcr_reg = __tmp; \ - read_pic(__tmp); \ - current_thread_info()->kernel_cntd0 += (unsigned int)(__tmp);\ - current_thread_info()->kernel_cntd1 += ((__tmp) >> 32); \ - } \ +do { if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw_out(prev, next); \ + if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw_in(prev, next); \ flush_tlb_pending(); \ save_and_clear_fpu(); \ /* If you are tempted to conditionalize the following */ \ @@ -197,11 +194,6 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \ "l1", "l2", "l3", "l4", "l5", "l6", "l7", \ "i0", "i1", "i2", "i3", "i4", "i5", \ "o0", "o1", "o2", "o3", "o4", "o5", "o7"); \ - /* If you fuck with this, update ret_from_syscall code too. */ \ - if (test_thread_flag(TIF_PERFCTR)) { \ - write_pcr(current_thread_info()->pcr_reg); \ - reset_pic(); \ - } \ } while(0) static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val) diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index c0a737d..53857f7 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -58,11 +58,6 @@ struct thread_info { unsigned long gsr[7]; unsigned long xfsr[7]; - __u64 __user *user_cntd0; - __u64 __user *user_cntd1; - __u64 kernel_cntd0, kernel_cntd1; - __u64 pcr_reg; - struct restart_block restart_block; struct pt_regs *kern_una_regs; @@ -96,15 +91,10 @@ struct thread_info { #define TI_RWIN_SPTRS 0x000003c8 #define TI_GSR 0x00000400 #define TI_XFSR 0x00000438 -#define TI_USER_CNTD0 0x00000470 -#define TI_USER_CNTD1 0x00000478 -#define TI_KERN_CNTD0 0x00000480 -#define TI_KERN_CNTD1 0x00000488 -#define TI_PCR 0x00000490 -#define TI_RESTART_BLOCK 0x00000498 -#define TI_KUNA_REGS 0x000004c0 -#define TI_KUNA_INSN 0x000004c8 -#define TI_FPREGS 0x00000500 +#define TI_RESTART_BLOCK 0x00000470 +#define TI_KUNA_REGS 0x00000498 +#define TI_KUNA_INSN 0x000004a0 +#define TI_FPREGS 0x000004c0 /* We embed this in the uppermost byte of thread_info->flags */ #define FAULT_CODE_WRITE 0x01 /* Write access, implies D-TLB */ @@ -222,11 +212,11 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ -#define TIF_PERFCTR 4 /* performance counters active */ +/* Bit 4 is available */ #define TIF_UNALIGNED 5 /* allowed to do unaligned accesses */ /* flag bit 6 is available */ #define TIF_32BIT 7 /* 32-bit binary */ -/* flag bit 8 is available */ +#define TIF_PERFMON_WORK 8 /* work for pfm_handle_work() */ #define TIF_SECCOMP 9 /* secure computing */ #define TIF_SYSCALL_AUDIT 10 /* syscall auditing active */ /* flag bit 11 is available */ @@ -237,22 +227,24 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TIF_ABI_PENDING 12 #define TIF_MEMDIE 13 #define TIF_POLLING_NRFLAG 14 +#define TIF_PERFMON_CTXSW 15 /* perfmon needs ctxsw calls */ #define _TIF_SYSCALL_TRACE (1< #include #include 
+#include #include #include @@ -385,11 +386,7 @@ void exit_thread(void) t->utraps[0]--; } - if (test_and_clear_thread_flag(TIF_PERFCTR)) { - t->user_cntd0 = t->user_cntd1 = NULL; - t->pcr_reg = 0; - write_pcr(0); - } + pfm_exit_thread(); } void flush_thread(void) @@ -411,13 +408,6 @@ void flush_thread(void) set_thread_wsaved(0); - /* Turn off performance counters if on. */ - if (test_and_clear_thread_flag(TIF_PERFCTR)) { - t->user_cntd0 = t->user_cntd1 = NULL; - t->pcr_reg = 0; - write_pcr(0); - } - /* Clear FPU register state. */ t->fpsaved[0] = 0; @@ -631,16 +621,6 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, t->kregs->u_regs[UREG_FP] = ((unsigned long) child_sf) - STACK_BIAS; - /* Special case, if we are spawning a kernel thread from - * a userspace task (usermode helper, NFS or similar), we - * must disable performance counters in the child because - * the address space and protection realm are changing. - */ - if (t->flags & _TIF_PERFCTR) { - t->user_cntd0 = t->user_cntd1 = NULL; - t->pcr_reg = 0; - t->flags &= ~_TIF_PERFCTR; - } t->flags |= ((long)ASI_P << TI_FLAG_CURRENT_DS_SHIFT); t->kregs->u_regs[UREG_G6] = (unsigned long) t; t->kregs->u_regs[UREG_G4] = (unsigned long) t->task; @@ -673,6 +653,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (clone_flags & CLONE_SETTLS) t->kregs->u_regs[UREG_G7] = regs->u_regs[UREG_I3]; + pfm_copy_thread(p); + return 0; } diff --git a/arch/sparc64/kernel/rtrap.S b/arch/sparc64/kernel/rtrap.S index 97a993c..c2af29d 100644 --- a/arch/sparc64/kernel/rtrap.S +++ b/arch/sparc64/kernel/rtrap.S @@ -65,55 +65,14 @@ __handle_user_windows: ba,pt %xcc, __handle_user_windows_continue andn %l1, %l4, %l1 -__handle_perfctrs: - call update_perfctrs - wrpr %g0, RTRAP_PSTATE, %pstate - wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate - ldub [%g6 + TI_WSAVED], %o2 - brz,pt %o2, 1f - nop - /* Redo userwin+sched+sig checks */ - call fault_in_user_windows - - wrpr %g0, RTRAP_PSTATE, %pstate - wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate - ldx [%g6 + TI_FLAGS], %l0 - andcc %l0, _TIF_NEED_RESCHED, %g0 - be,pt %xcc, 1f - - nop - call schedule - wrpr %g0, RTRAP_PSTATE, %pstate - wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate - ldx [%g6 + TI_FLAGS], %l0 -1: andcc %l0, _TIF_DO_NOTIFY_RESUME_MASK, %g0 - - be,pt %xcc, __handle_perfctrs_continue - sethi %hi(TSTATE_PEF), %o0 - mov %l5, %o1 - add %sp, PTREGS_OFF, %o0 - mov %l0, %o2 - call do_notify_resume - - wrpr %g0, RTRAP_PSTATE, %pstate - wrpr %g0, RTRAP_PSTATE_IRQOFF, %pstate - /* Signal delivery can modify pt_regs tstate, so we must - * reload it. 
- */ - ldx [%sp + PTREGS_OFF + PT_V9_TSTATE], %l1 - sethi %hi(0xf << 20), %l4 - and %l1, %l4, %l4 - andn %l1, %l4, %l1 - ba,pt %xcc, __handle_perfctrs_continue - - sethi %hi(TSTATE_PEF), %o0 __handle_userfpu: rd %fprs, %l5 andcc %l5, FPRS_FEF, %g0 sethi %hi(TSTATE_PEF), %o0 be,a,pn %icc, __handle_userfpu_continue andn %l1, %o0, %l1 - ba,a,pt %xcc, __handle_userfpu_continue + ba,pt %xcc, __handle_userfpu_continue + nop __handle_signal: mov %l5, %o1 @@ -202,12 +161,8 @@ __handle_signal_continue: brnz,pn %o2, __handle_user_windows nop __handle_user_windows_continue: - ldx [%g6 + TI_FLAGS], %l5 - andcc %l5, _TIF_PERFCTR, %g0 sethi %hi(TSTATE_PEF), %o0 - bne,pn %xcc, __handle_perfctrs -__handle_perfctrs_continue: - andcc %l1, %o0, %g0 + andcc %l1, %o0, %g0 /* This fpdepth clear is necessary for non-syscall rtraps only */ user_nowork: diff --git a/arch/sparc64/kernel/setup.c b/arch/sparc64/kernel/setup.c index c8b03a4..248aa1f 100644 --- a/arch/sparc64/kernel/setup.c +++ b/arch/sparc64/kernel/setup.c @@ -352,6 +352,7 @@ static int show_cpuinfo(struct seq_file *m, void *__unused) seq_printf(m, "cpu\t\t: %s\n" "fpu\t\t: %s\n" + "pmu\t\t: %s\n" "prom\t\t: %s\n" "type\t\t: %s\n" "ncpus probed\t: %d\n" @@ -364,6 +365,7 @@ static int show_cpuinfo(struct seq_file *m, void *__unused) , sparc_cpu_type, sparc_fpu_type, + sparc_pmu_type, prom_version, ((tlb_type == hypervisor) ? "sun4v" : diff --git a/arch/sparc64/kernel/signal.c b/arch/sparc64/kernel/signal.c index ec82d76..cea1082 100644 --- a/arch/sparc64/kernel/signal.c +++ b/arch/sparc64/kernel/signal.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -608,6 +609,9 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags) { + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, orig_i0); if (thread_info_flags & _TIF_NOTIFY_RESUME) { diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c index 39749e3..384004b 100644 --- a/arch/sparc64/kernel/sys_sparc.c +++ b/arch/sparc64/kernel/sys_sparc.c @@ -26,7 +26,6 @@ #include #include -#include #include #include "entry.h" @@ -791,106 +790,10 @@ asmlinkage long sys_rt_sigaction(int sig, return ret; } -/* Invoked by rtrap code to update performance counters in - * user space. 
- */ -asmlinkage void update_perfctrs(void) -{ - unsigned long pic, tmp; - - read_pic(pic); - tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic); - __put_user(tmp, current_thread_info()->user_cntd0); - tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32)); - __put_user(tmp, current_thread_info()->user_cntd1); - reset_pic(); -} - asmlinkage long sys_perfctr(int opcode, unsigned long arg0, unsigned long arg1, unsigned long arg2) { - int err = 0; - - switch(opcode) { - case PERFCTR_ON: - current_thread_info()->pcr_reg = arg2; - current_thread_info()->user_cntd0 = (u64 __user *) arg0; - current_thread_info()->user_cntd1 = (u64 __user *) arg1; - current_thread_info()->kernel_cntd0 = - current_thread_info()->kernel_cntd1 = 0; - write_pcr(arg2); - reset_pic(); - set_thread_flag(TIF_PERFCTR); - break; - - case PERFCTR_OFF: - err = -EINVAL; - if (test_thread_flag(TIF_PERFCTR)) { - current_thread_info()->user_cntd0 = - current_thread_info()->user_cntd1 = NULL; - current_thread_info()->pcr_reg = 0; - write_pcr(0); - clear_thread_flag(TIF_PERFCTR); - err = 0; - } - break; - - case PERFCTR_READ: { - unsigned long pic, tmp; - - if (!test_thread_flag(TIF_PERFCTR)) { - err = -EINVAL; - break; - } - read_pic(pic); - tmp = (current_thread_info()->kernel_cntd0 += (unsigned int)pic); - err |= __put_user(tmp, current_thread_info()->user_cntd0); - tmp = (current_thread_info()->kernel_cntd1 += (pic >> 32)); - err |= __put_user(tmp, current_thread_info()->user_cntd1); - reset_pic(); - break; - } - - case PERFCTR_CLRPIC: - if (!test_thread_flag(TIF_PERFCTR)) { - err = -EINVAL; - break; - } - current_thread_info()->kernel_cntd0 = - current_thread_info()->kernel_cntd1 = 0; - reset_pic(); - break; - - case PERFCTR_SETPCR: { - u64 __user *user_pcr = (u64 __user *)arg0; - - if (!test_thread_flag(TIF_PERFCTR)) { - err = -EINVAL; - break; - } - err |= __get_user(current_thread_info()->pcr_reg, user_pcr); - write_pcr(current_thread_info()->pcr_reg); - current_thread_info()->kernel_cntd0 = - current_thread_info()->kernel_cntd1 = 0; - reset_pic(); - break; - } - - case PERFCTR_GETPCR: { - u64 __user *user_pcr = (u64 __user *)arg0; - - if (!test_thread_flag(TIF_PERFCTR)) { - err = -EINVAL; - break; - } - err |= __put_user(current_thread_info()->pcr_reg, user_pcr); - break; - } - - default: - err = -EINVAL; - break; - }; - return err; + /* Superceded by perfmon2 */ + return -ENOSYS; } /* diff --git a/arch/sparc64/kernel/syscalls.S b/arch/sparc64/kernel/syscalls.S index a2f2427..b20bf1e 100644 --- a/arch/sparc64/kernel/syscalls.S +++ b/arch/sparc64/kernel/syscalls.S @@ -117,26 +117,9 @@ ret_from_syscall: stb %g0, [%g6 + TI_NEW_CHILD] ldx [%g6 + TI_FLAGS], %l0 call schedule_tail - mov %g7, %o0 - andcc %l0, _TIF_PERFCTR, %g0 - be,pt %icc, 1f - nop - ldx [%g6 + TI_PCR], %o7 - wr %g0, %o7, %pcr - - /* Blackbird errata workaround. See commentary in - * smp.c:smp_percpu_timer_interrupt() for more - * information. 
- */ - ba,pt %xcc, 99f - nop - - .align 64 -99: wr %g0, %g0, %pic - rd %pic, %g0 - -1: ba,pt %xcc, ret_sys_call - ldx [%sp + PTREGS_OFF + PT_V9_I0], %o0 + mov %g7, %o0 + ba,pt %xcc, ret_sys_call + ldx [%sp + PTREGS_OFF + PT_V9_I0], %o0 .globl sparc_exit .type sparc_exit,#function diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index 0fdbf3b..1a1a296 100644 --- a/arch/sparc64/kernel/systbls.S +++ b/arch/sparc64/kernel/systbls.S @@ -82,7 +82,9 @@ sys_call_table32: .word compat_sys_set_mempolicy, compat_sys_kexec_load, compat_sys_move_pages, sys_getcpu, compat_sys_epoll_pwait /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1 -/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1 +/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_pfm_create_context, sys_pfm_write_pmcs + .word sys_pfm_write_pmds, sys_pfm_read_pmds, sys_pfm_load_context, sys_pfm_start, sys_pfm_stop +/*330*/ .word sys_pfm_restart, sys_pfm_create_evtsets, sys_pfm_getinfo_evtsets, sys_pfm_delete_evtsets, sys_pfm_unload_context #endif /* CONFIG_COMPAT */ @@ -156,4 +158,6 @@ sys_call_table: .word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 -/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1 +/*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_pfm_create_context, sys_pfm_write_pmcs + .word sys_pfm_write_pmds, sys_pfm_read_pmds, sys_pfm_load_context, sys_pfm_start, sys_pfm_stop +/*330*/ .word sys_pfm_restart, sys_pfm_create_evtsets, sys_pfm_getinfo_evtsets, sys_pfm_delete_evtsets, sys_pfm_unload_context diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index c824df1..be45d09 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -2470,86 +2470,90 @@ extern void tsb_config_offsets_are_bolixed_dave(void); /* Only invoked on boot processor. */ void __init trap_init(void) { - /* Compile time sanity check. 
*/ - if (TI_TASK != offsetof(struct thread_info, task) || - TI_FLAGS != offsetof(struct thread_info, flags) || - TI_CPU != offsetof(struct thread_info, cpu) || - TI_FPSAVED != offsetof(struct thread_info, fpsaved) || - TI_KSP != offsetof(struct thread_info, ksp) || - TI_FAULT_ADDR != offsetof(struct thread_info, fault_address) || - TI_KREGS != offsetof(struct thread_info, kregs) || - TI_UTRAPS != offsetof(struct thread_info, utraps) || - TI_EXEC_DOMAIN != offsetof(struct thread_info, exec_domain) || - TI_REG_WINDOW != offsetof(struct thread_info, reg_window) || - TI_RWIN_SPTRS != offsetof(struct thread_info, rwbuf_stkptrs) || - TI_GSR != offsetof(struct thread_info, gsr) || - TI_XFSR != offsetof(struct thread_info, xfsr) || - TI_USER_CNTD0 != offsetof(struct thread_info, user_cntd0) || - TI_USER_CNTD1 != offsetof(struct thread_info, user_cntd1) || - TI_KERN_CNTD0 != offsetof(struct thread_info, kernel_cntd0) || - TI_KERN_CNTD1 != offsetof(struct thread_info, kernel_cntd1) || - TI_PCR != offsetof(struct thread_info, pcr_reg) || - TI_PRE_COUNT != offsetof(struct thread_info, preempt_count) || - TI_NEW_CHILD != offsetof(struct thread_info, new_child) || - TI_SYS_NOERROR != offsetof(struct thread_info, syscall_noerror) || - TI_RESTART_BLOCK != offsetof(struct thread_info, restart_block) || - TI_KUNA_REGS != offsetof(struct thread_info, kern_una_regs) || - TI_KUNA_INSN != offsetof(struct thread_info, kern_una_insn) || - TI_FPREGS != offsetof(struct thread_info, fpregs) || - (TI_FPREGS & (64 - 1))) - thread_info_offsets_are_bolixed_dave(); - - if (TRAP_PER_CPU_THREAD != offsetof(struct trap_per_cpu, thread) || - (TRAP_PER_CPU_PGD_PADDR != - offsetof(struct trap_per_cpu, pgd_paddr)) || - (TRAP_PER_CPU_CPU_MONDO_PA != - offsetof(struct trap_per_cpu, cpu_mondo_pa)) || - (TRAP_PER_CPU_DEV_MONDO_PA != - offsetof(struct trap_per_cpu, dev_mondo_pa)) || - (TRAP_PER_CPU_RESUM_MONDO_PA != - offsetof(struct trap_per_cpu, resum_mondo_pa)) || - (TRAP_PER_CPU_RESUM_KBUF_PA != - offsetof(struct trap_per_cpu, resum_kernel_buf_pa)) || - (TRAP_PER_CPU_NONRESUM_MONDO_PA != - offsetof(struct trap_per_cpu, nonresum_mondo_pa)) || - (TRAP_PER_CPU_NONRESUM_KBUF_PA != - offsetof(struct trap_per_cpu, nonresum_kernel_buf_pa)) || - (TRAP_PER_CPU_FAULT_INFO != - offsetof(struct trap_per_cpu, fault_info)) || - (TRAP_PER_CPU_CPU_MONDO_BLOCK_PA != - offsetof(struct trap_per_cpu, cpu_mondo_block_pa)) || - (TRAP_PER_CPU_CPU_LIST_PA != - offsetof(struct trap_per_cpu, cpu_list_pa)) || - (TRAP_PER_CPU_TSB_HUGE != - offsetof(struct trap_per_cpu, tsb_huge)) || - (TRAP_PER_CPU_TSB_HUGE_TEMP != - offsetof(struct trap_per_cpu, tsb_huge_temp)) || - (TRAP_PER_CPU_IRQ_WORKLIST_PA != - offsetof(struct trap_per_cpu, irq_worklist_pa)) || - (TRAP_PER_CPU_CPU_MONDO_QMASK != - offsetof(struct trap_per_cpu, cpu_mondo_qmask)) || - (TRAP_PER_CPU_DEV_MONDO_QMASK != - offsetof(struct trap_per_cpu, dev_mondo_qmask)) || - (TRAP_PER_CPU_RESUM_QMASK != - offsetof(struct trap_per_cpu, resum_qmask)) || - (TRAP_PER_CPU_NONRESUM_QMASK != - offsetof(struct trap_per_cpu, nonresum_qmask))) - trap_per_cpu_offsets_are_bolixed_dave(); - - if ((TSB_CONFIG_TSB != - offsetof(struct tsb_config, tsb)) || - (TSB_CONFIG_RSS_LIMIT != - offsetof(struct tsb_config, tsb_rss_limit)) || - (TSB_CONFIG_NENTRIES != - offsetof(struct tsb_config, tsb_nentries)) || - (TSB_CONFIG_REG_VAL != - offsetof(struct tsb_config, tsb_reg_val)) || - (TSB_CONFIG_MAP_VADDR != - offsetof(struct tsb_config, tsb_map_vaddr)) || - (TSB_CONFIG_MAP_PTE != - offsetof(struct tsb_config, tsb_map_pte))) - 
tsb_config_offsets_are_bolixed_dave(); - + BUILD_BUG_ON(TI_TASK != offsetof(struct thread_info, task)); + BUILD_BUG_ON(TI_FLAGS != offsetof(struct thread_info, flags)); + BUILD_BUG_ON(TI_CPU != offsetof(struct thread_info, cpu)); + BUILD_BUG_ON(TI_FPSAVED != offsetof(struct thread_info, fpsaved)); + BUILD_BUG_ON(TI_KSP != offsetof(struct thread_info, ksp)); + BUILD_BUG_ON(TI_FAULT_ADDR != + offsetof(struct thread_info, fault_address)); + BUILD_BUG_ON(TI_KREGS != offsetof(struct thread_info, kregs)); + BUILD_BUG_ON(TI_UTRAPS != offsetof(struct thread_info, utraps)); + BUILD_BUG_ON(TI_EXEC_DOMAIN != + offsetof(struct thread_info, exec_domain)); + BUILD_BUG_ON(TI_REG_WINDOW != + offsetof(struct thread_info, reg_window)); + BUILD_BUG_ON(TI_RWIN_SPTRS != + offsetof(struct thread_info, rwbuf_stkptrs)); + BUILD_BUG_ON(TI_GSR != offsetof(struct thread_info, gsr)); + BUILD_BUG_ON(TI_XFSR != offsetof(struct thread_info, xfsr)); + BUILD_BUG_ON(TI_PRE_COUNT != + offsetof(struct thread_info, preempt_count)); + BUILD_BUG_ON(TI_NEW_CHILD != + offsetof(struct thread_info, new_child)); + BUILD_BUG_ON(TI_SYS_NOERROR != + offsetof(struct thread_info, syscall_noerror)); + BUILD_BUG_ON(TI_RESTART_BLOCK != + offsetof(struct thread_info, restart_block)); + BUILD_BUG_ON(TI_KUNA_REGS != + offsetof(struct thread_info, kern_una_regs)); + BUILD_BUG_ON(TI_KUNA_INSN != + offsetof(struct thread_info, kern_una_insn)); + BUILD_BUG_ON(TI_FPREGS != offsetof(struct thread_info, fpregs)); + BUILD_BUG_ON((TI_FPREGS & (64 - 1))); + + BUILD_BUG_ON(TRAP_PER_CPU_THREAD != + offsetof(struct trap_per_cpu, thread)); + BUILD_BUG_ON(TRAP_PER_CPU_PGD_PADDR != + offsetof(struct trap_per_cpu, pgd_paddr)); + BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_PA != + offsetof(struct trap_per_cpu, cpu_mondo_pa)); + BUILD_BUG_ON(TRAP_PER_CPU_DEV_MONDO_PA != + offsetof(struct trap_per_cpu, dev_mondo_pa)); + BUILD_BUG_ON(TRAP_PER_CPU_RESUM_MONDO_PA != + offsetof(struct trap_per_cpu, resum_mondo_pa)); + BUILD_BUG_ON(TRAP_PER_CPU_RESUM_KBUF_PA != + offsetof(struct trap_per_cpu, resum_kernel_buf_pa)); + BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_MONDO_PA != + offsetof(struct trap_per_cpu, nonresum_mondo_pa)); + BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_KBUF_PA != + offsetof(struct trap_per_cpu, nonresum_kernel_buf_pa)); + BUILD_BUG_ON(TRAP_PER_CPU_FAULT_INFO != + offsetof(struct trap_per_cpu, fault_info)); + BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_BLOCK_PA != + offsetof(struct trap_per_cpu, cpu_mondo_block_pa)); + BUILD_BUG_ON(TRAP_PER_CPU_CPU_LIST_PA != + offsetof(struct trap_per_cpu, cpu_list_pa)); + BUILD_BUG_ON(TRAP_PER_CPU_TSB_HUGE != + offsetof(struct trap_per_cpu, tsb_huge)); + BUILD_BUG_ON(TRAP_PER_CPU_TSB_HUGE_TEMP != + offsetof(struct trap_per_cpu, tsb_huge_temp)); +#if 0 + BUILD_BUG_ON(TRAP_PER_CPU_IRQ_WORKLIST != + offsetof(struct trap_per_cpu, irq_worklist)); +#endif + BUILD_BUG_ON(TRAP_PER_CPU_CPU_MONDO_QMASK != + offsetof(struct trap_per_cpu, cpu_mondo_qmask)); + BUILD_BUG_ON(TRAP_PER_CPU_DEV_MONDO_QMASK != + offsetof(struct trap_per_cpu, dev_mondo_qmask)); + BUILD_BUG_ON(TRAP_PER_CPU_RESUM_QMASK != + offsetof(struct trap_per_cpu, resum_qmask)); + BUILD_BUG_ON(TRAP_PER_CPU_NONRESUM_QMASK != + offsetof(struct trap_per_cpu, nonresum_qmask)); + + BUILD_BUG_ON(TSB_CONFIG_TSB != + offsetof(struct tsb_config, tsb)); + BUILD_BUG_ON(TSB_CONFIG_RSS_LIMIT != + offsetof(struct tsb_config, tsb_rss_limit)); + BUILD_BUG_ON(TSB_CONFIG_NENTRIES != + offsetof(struct tsb_config, tsb_nentries)); + BUILD_BUG_ON(TSB_CONFIG_REG_VAL != + offsetof(struct tsb_config, tsb_reg_val)); + 
BUILD_BUG_ON(TSB_CONFIG_MAP_VADDR != + offsetof(struct tsb_config, tsb_map_vaddr)); + BUILD_BUG_ON(TSB_CONFIG_MAP_PTE != + offsetof(struct tsb_config, tsb_map_pte)); + /* Attach to the address space of init_task. On SMP we * do this in smp.c:smp_callin for other cpus. */ diff --git a/arch/sparc64/kernel/ttable.S b/arch/sparc64/kernel/ttable.S index 1ade3d6..2a31ffa 100644 --- a/arch/sparc64/kernel/ttable.S +++ b/arch/sparc64/kernel/ttable.S @@ -66,7 +66,7 @@ tl0_irq6: BTRAP(0x46) tl0_irq7: BTRAP(0x47) BTRAP(0x48) BTRAP(0x49) tl0_irq10: BTRAP(0x4a) BTRAP(0x4b) BTRAP(0x4c) BTRAP(0x4d) tl0_irq14: TRAP_IRQ(timer_interrupt, 14) -tl0_irq15: TRAP_IRQ(handler_irq, 15) +tl0_irq15: TRAP_IRQ(perfctr_irq, 15) tl0_resv050: BTRAP(0x50) BTRAP(0x51) BTRAP(0x52) BTRAP(0x53) BTRAP(0x54) BTRAP(0x55) tl0_resv056: BTRAP(0x56) BTRAP(0x57) BTRAP(0x58) BTRAP(0x59) BTRAP(0x5a) BTRAP(0x5b) tl0_resv05c: BTRAP(0x5c) BTRAP(0x5d) BTRAP(0x5e) BTRAP(0x5f) diff --git a/arch/sparc64/perfmon/Kconfig b/arch/sparc64/perfmon/Kconfig new file mode 100644 index 0000000..4672024 --- /dev/null +++ b/arch/sparc64/perfmon/Kconfig @@ -0,0 +1,26 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. + +config PERFMON_DEBUG + bool "Perfmon debugging" + depends on PERFMON + default n + help + Enables perfmon debugging support + +config PERFMON_DEBUG_FS + bool "Enable perfmon statistics reporting via debugfs" + default y + depends on PERFMON && DEBUG_FS + help + Enable collection and reporting of perfmon timing statistics under + debugfs. This is used for debugging and performance analysis of the + subsystem. The debugfs filesystem must be mounted. + +endmenu diff --git a/arch/sparc64/perfmon/Makefile b/arch/sparc64/perfmon/Makefile new file mode 100644 index 0000000..ad2d907 --- /dev/null +++ b/arch/sparc64/perfmon/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_PERFMON) += perfmon.o diff --git a/arch/sparc64/perfmon/perfmon.c b/arch/sparc64/perfmon/perfmon.c new file mode 100644 index 0000000..9e29833 --- /dev/null +++ b/arch/sparc64/perfmon/perfmon.c @@ -0,0 +1,422 @@ +/* perfmon.c: sparc64 perfmon support + * + * Copyright (C) 2007 David S. Miller (davem@davemloft.net) + */ + +#include +#include +#include +#include + +#include +#include +#include + +struct pcr_ops { + void (*write)(u64); + u64 (*read)(void); +}; + +static void direct_write_pcr(u64 val) +{ + write_pcr(val); +} + +static u64 direct_read_pcr(void) +{ + u64 pcr; + + read_pcr(pcr); + + return pcr; +} + +static struct pcr_ops direct_pcr_ops = { + .write = direct_write_pcr, + .read = direct_read_pcr, +}; + +/* Using the hypervisor call is needed so that we can set the + * hypervisor trace bit correctly, which is hyperprivileged. 
+ */ +static void n2_write_pcr(u64 val) +{ + unsigned long ret; + + ret = sun4v_niagara2_setperf(HV_N2_PERF_SPARC_CTL, val); + if (ret != HV_EOK) + write_pcr(val); +} + +static u64 n2_read_pcr(void) +{ + u64 pcr; + + read_pcr(pcr); + + return pcr; +} + +static struct pcr_ops n2_pcr_ops = { + .write = n2_write_pcr, + .read = n2_read_pcr, +}; + +static struct pcr_ops *pcr_ops; + +void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + pcr_ops->write(value); +} + +u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + return pcr_ops->read(); +} + +/* + * collect pending overflowed PMDs. Called from pfm_ctxsw() + * and from PMU interrupt handler. Must fill in set->povfl_pmds[] + * and set->npend_ovfls. Interrupts are masked + */ +static void __pfm_get_ovfl_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + unsigned int max = ctx->regs.max_intr_pmd; + u64 wmask = 1ULL << pfm_pmu_conf->counter_width; + u64 *intr_pmds = ctx->regs.intr_pmds; + u64 *used_mask = set->used_pmds; + u64 mask[PFM_PMD_BV]; + unsigned int i; + + bitmap_and(cast_ulp(mask), + cast_ulp(intr_pmds), + cast_ulp(used_mask), + max); + + /* + * check all PMD that can generate interrupts + * (that includes counters) + */ + for (i = 0; i < max; i++) { + if (test_bit(i, mask)) { + u64 new_val = pfm_arch_read_pmd(ctx, i); + + PFM_DBG_ovfl("pmd%u new_val=0x%llx bit=%d\n", + i, (unsigned long long)new_val, + (new_val&wmask) ? 1 : 0); + + if (new_val & wmask) { + __set_bit(i, set->povfl_pmds); + set->npend_ovfls++; + } + } + } +} + +static void pfm_stop_active(struct task_struct *task, struct pfm_context *ctx, + struct pfm_event_set *set) +{ + unsigned int i, max = ctx->regs.max_pmc; + + /* + * clear enable bits, assume all pmcs are enable pmcs + */ + for (i = 0; i < max; i++) { + if (test_bit(i, set->used_pmcs)) + pfm_arch_write_pmc(ctx, i, 0); + } + + if (set->npend_ovfls) + return; + + __pfm_get_ovfl_pmds(ctx, set); +} + +/* + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring is active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. + * + * for per-thread: + * must stop monitoring for the task + * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) +{ + /* + * disable lazy restore of PMC registers. + */ + ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + pfm_stop_active(task, ctx, ctx->active_set); + + return 1; +} + +/* + * Called from pfm_stop() and idle notifier + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. Interrupts are masked. Context is locked. + * Set is the active set. + * + * For system-wide: + * task is current + * + * must disable active monitoring. 
ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) +{ + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED) + return; + + /* + * stop live registers and collect pending overflow + */ + if (task == current) + pfm_stop_active(task, ctx, ctx->active_set); +} + +/* + * Enable active monitoring. Called from pfm_start() and + * pfm_arch_unmask_monitoring(). + * + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-trhead: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. Access to PMU is not guaranteed. + * + * For system-wide: + * task is always current + * + * must enable active monitoring. + */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_event_set *set; + unsigned int max_pmc = ctx->regs.max_pmc; + unsigned int i; + + if (task != current) + return; + + set = ctx->active_set; + for (i = 0; i < max_pmc; i++) { + if (test_bit(i, set->used_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(), pfm_switch_sets() + * context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMD registers from set. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + unsigned int max_pmd = ctx->regs.max_pmd; + u64 ovfl_mask = pfm_pmu_conf->ovfl_mask; + u64 *impl_pmds = ctx->regs.pmds; + unsigned int i; + + /* + * must restore all pmds to avoid leaking + * information to user. + */ + for (i = 0; i < max_pmd; i++) { + u64 val; + + if (test_bit(i, impl_pmds) == 0) + continue; + + val = set->pmds[i].value; + + /* + * set upper bits for counter to ensure + * overflow will trigger + */ + val &= ovfl_mask; + + pfm_arch_write_pmd(ctx, i, val); + } +} + +/* + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(). + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set, if needed. + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + unsigned int max_pmc = ctx->regs.max_pmc; + u64 *impl_pmcs = ctx->regs.pmcs; + unsigned int i; + + /* If we're masked or stopped we don't need to bother restoring + * the PMCs now. 
+ */ + if (ctx->state == PFM_CTX_MASKED || ctx->flags.started == 0) + return; + + /* + * restore all pmcs + */ + for (i = 0; i < max_pmc; i++) + if (test_bit(i, impl_pmcs)) + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); +} + +char *pfm_arch_get_pmu_module_name(void) +{ + return NULL; +} + +void perfmon_interrupt(struct pt_regs *regs) +{ + pfm_interrupt_handler(instruction_pointer(regs), regs); +} + +static struct pfm_regmap_desc pfm_sparc64_pmc_desc[] = { + PMC_D(PFM_REG_I, "PCR", 0, 0, 0, 0), +}; + +static struct pfm_regmap_desc pfm_sparc64_pmd_desc[] = { + PMD_D(PFM_REG_C, "PIC0", 0), + PMD_D(PFM_REG_C, "PIC1", 0), +}; + +static int pfm_sparc64_probe(void) +{ + return 0; +} + +static struct pfm_pmu_config pmu_sparc64_pmu_conf = { + .counter_width = 31, + .pmd_desc = pfm_sparc64_pmd_desc, + .num_pmd_entries = 2, + .pmc_desc = pfm_sparc64_pmc_desc, + .num_pmc_entries = 1, + .probe_pmu = pfm_sparc64_probe, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +static unsigned long perf_hsvc_group; +static unsigned long perf_hsvc_major; +static unsigned long perf_hsvc_minor; + +static int __init register_perf_hsvc(void) +{ + if (tlb_type == hypervisor) { + switch (sun4v_chip_type) { + case SUN4V_CHIP_NIAGARA1: + perf_hsvc_group = HV_GRP_N2_CPU; + break; + + case SUN4V_CHIP_NIAGARA2: + perf_hsvc_group = HV_GRP_N2_CPU; + break; + + default: + return -ENODEV; + } + + + perf_hsvc_major = 1; + perf_hsvc_minor = 0; + if (sun4v_hvapi_register(perf_hsvc_group, + perf_hsvc_major, + &perf_hsvc_minor)) { + printk("perfmon: Could not register N2 hvapi.\n"); + return -ENODEV; + } + } + return 0; +} + +static void unregister_perf_hsvc(void) +{ + if (tlb_type != hypervisor) + return; + sun4v_hvapi_unregister(perf_hsvc_group); +} + +static int __init pfm_sparc64_pmu_init(void) +{ + u64 mask; + int err; + + err = register_perf_hsvc(); + if (err) + return err; + + if (tlb_type == hypervisor && + sun4v_chip_type == SUN4V_CHIP_NIAGARA2) + pcr_ops = &n2_pcr_ops; + else + pcr_ops = &direct_pcr_ops; + + if (!strcmp(sparc_pmu_type, "ultra12")) + mask = (0xf << 11) | (0xf << 4) | 0x7; + else if (!strcmp(sparc_pmu_type, "ultra3") || + !strcmp(sparc_pmu_type, "ultra3i") || + !strcmp(sparc_pmu_type, "ultra3+") || + !strcmp(sparc_pmu_type, "ultra4+")) + mask = (0x3f << 11) | (0x3f << 4) | 0x7; + else if (!strcmp(sparc_pmu_type, "niagara2")) + mask = ((1UL << 63) | (1UL << 62) | + (1UL << 31) | (0xfUL << 27) | (0xffUL << 19) | + (1UL << 18) | (0xfUL << 14) | (0xff << 6) | + (0x3UL << 4) | 0x7UL); + else if (!strcmp(sparc_pmu_type, "niagara")) + mask = ((1UL << 9) | (1UL << 8) | + (0x7UL << 4) | 0x7UL); + else { + err = -ENODEV; + goto out_err; + } + + pmu_sparc64_pmu_conf.pmu_name = sparc_pmu_type; + pfm_sparc64_pmc_desc[0].rsvd_msk = ~mask; + + return pfm_pmu_register(&pmu_sparc64_pmu_conf); + +out_err: + unregister_perf_hsvc(); + return err; +} + +static void __exit pfm_sparc64_pmu_exit(void) +{ + unregister_perf_hsvc(); + return pfm_pmu_unregister(&pmu_sparc64_pmu_conf); +} + +module_init(pfm_sparc64_pmu_init); +module_exit(pfm_sparc64_pmu_exit); diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864..3a2b544 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1378,6 +1378,8 @@ config COMPAT_VDSO If unsure, say Y. 
+source "arch/x86/perfmon/Kconfig" + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f5631da..c868ad6 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -150,6 +150,8 @@ core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ core-y += arch/x86/kernel/ core-y += arch/x86/mm/ +core-$(CONFIG_PERFMON) += arch/x86/perfmon/ + # Remaining sub architecture files core-y += $(mcore-y) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffc1bb4..58e00cb 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -832,4 +832,16 @@ ia32_sys_call_table: .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad sys_pfm_create_context + .quad sys_pfm_write_pmcs + .quad sys_pfm_write_pmds /* 335 */ + .quad sys_pfm_read_pmds + .quad sys_pfm_load_context + .quad sys_pfm_start + .quad sys_pfm_stop + .quad sys_pfm_restart /* 340 */ + .quad sys_pfm_create_evtsets + .quad sys_pfm_getinfo_evtsets + .quad sys_pfm_delete_evtsets + .quad sys_pfm_unload_context ia32_syscall_end: diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f88bd0d..53fe335 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -669,6 +670,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL(setup_APIC_eilvt_ibs); /* * Local APIC start and shutdown @@ -1367,6 +1369,9 @@ void __init apic_intr_init(void) #ifdef CONFIG_X86_MCE_P4THERMAL alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); #endif +#ifdef CONFIG_PERFMON + set_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt); +#endif } /** diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 446c062..574cd3b 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -228,6 +228,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL(setup_APIC_eilvt_ibs); /* * Program the next event, relative to now diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4e456bd..5b6d6ca 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -726,6 +727,8 @@ void __cpuinit cpu_init(void) current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); + + pfm_init_percpu(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 109792b..0b6e34c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -513,7 +513,7 @@ ENDPROC(system_call) ALIGN RING0_PTREGS_FRAME # can't unwind into user space anyway work_pending: - testb $_TIF_NEED_RESCHED, %cl + testw $(_TIF_NEED_RESCHED|_TIF_PERFMON_WORK), %cx jz work_notifysig work_resched: call schedule diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 89434d4..34e44f5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -888,7 +888,13 @@ END(error_interrupt) ENTRY(spurious_interrupt) apicinterrupt SPURIOUS_APIC_VECTOR,smp_spurious_interrupt END(spurious_interrupt) - + +#ifdef CONFIG_PERFMON +ENTRY(pmu_interrupt) + apicinterrupt LOCAL_PERFMON_VECTOR,smp_pmu_interrupt +END(pmu_interrupt) +#endif + /* * Exception entry points. 
*/ diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1f26fd9..83f6bc1 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -217,6 +218,10 @@ void __init native_init_IRQ(void) alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#ifdef CONFIG_PERFMON + alloc_intr_gate(LOCAL_PERFMON_VECTOR, pmu_interrupt); +#endif + if (!acpi_ioapic) setup_irq(2, &irq2); } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 31f40b2..ed27150 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -277,6 +278,7 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } + pfm_exit_thread(); } void flush_thread(void) @@ -334,6 +336,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, savesegment(gs, p->thread.gs); + pfm_copy_thread(p); + tsk = current; if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, @@ -448,6 +452,9 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, prev = &prev_p->thread; next = &next_p->thread; + if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_out(prev_p, next_p); + debugctl = prev->debugctlmsr; if (next->ds_area_msr != prev->ds_area_msr) { /* we clear debugctl to make sure DS @@ -460,6 +467,9 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_in(prev_p, next_p); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { set_debugreg(next->debugreg0, 0); set_debugreg(next->debugreg1, 1); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e12e0e4..97d49ce 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -240,6 +241,7 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } + pfm_exit_thread(); } void flush_thread(void) @@ -344,6 +346,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, savesegment(es, p->thread.es); savesegment(ds, p->thread.ds); + pfm_copy_thread(p); + if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { @@ -472,6 +476,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, prev = &prev_p->thread, next = &next_p->thread; + if (test_tsk_thread_flag(prev_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_out(prev_p, next_p); + debugctl = prev->debugctlmsr; if (next->ds_area_msr != prev->ds_area_msr) { /* we clear debugctl to make sure DS @@ -484,6 +491,9 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); + if (test_tsk_thread_flag(next_p, TIF_PERFMON_CTXSW)) + pfm_ctxsw_in(prev_p, next_p); + if (test_tsk_thread_flag(next_p, TIF_DEBUG)) { loaddebug(next, 0); loaddebug(next, 1); diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 6fb5bcd..53e6665 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include 
@@ -657,6 +658,10 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { + /* process perfmon asynchronous work (e.g. block thread or reset) */ + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ca316b5..6e9fa74 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -549,12 +550,17 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { + #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); #endif /* CONFIG_X86_MCE */ + /* process perfmon asynchronous work (e.g. block thread or reset) */ + if (thread_info_flags & _TIF_PERFMON_WORK) + pfm_handle_work(regs); + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7985c5b..9ddf6db 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -1382,6 +1383,7 @@ int __cpu_disable(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(cpu_online_map); + pfm_cpu_disable(); return 0; } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395f..e1384a9 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,15 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_pfm_create_context + .long sys_pfm_write_pmcs + .long sys_pfm_write_pmds /* 335 */ + .long sys_pfm_read_pmds + .long sys_pfm_load_context + .long sys_pfm_start + .long sys_pfm_stop + .long sys_pfm_restart /* 340 */ + .long sys_pfm_create_evtsets + .long sys_pfm_getinfo_evtsets + .long sys_pfm_delete_evtsets + .long sys_pfm_unload_context diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 8a5f161..10faef5 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -217,12 +218,18 @@ static int nmi_setup(void) int err = 0; int cpu; - if (!allocate_msrs()) + if (pfm_session_allcpus_acquire()) + return -EBUSY; + + if (!allocate_msrs()) { + pfm_session_allcpus_release(); return -ENOMEM; + } err = register_die_notifier(&profile_exceptions_nb); if (err) { free_msrs(); + pfm_session_allcpus_release(); return err; } @@ -304,6 +311,7 @@ static void nmi_shutdown(void) model->shutdown(msrs); free_msrs(); put_cpu_var(cpu_msrs); + pfm_session_allcpus_release(); } static void nmi_cpu_start(void *dummy) diff --git a/arch/x86/perfmon/Kconfig b/arch/x86/perfmon/Kconfig new file mode 100644 index 0000000..08842e6 --- /dev/null +++ b/arch/x86/perfmon/Kconfig @@ -0,0 +1,89 @@ +menu "Hardware Performance Monitoring support" +config PERFMON + bool "Perfmon2 performance monitoring interface" + select X86_LOCAL_APIC + default n + help + Enables the perfmon2 interface to access the hardware + performance counters. See for + more details. 
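[Editorial illustration, not part of the patch: once CONFIG_PERFMON is enabled and the sys_pfm_* entries are wired into the syscall tables above, a self-monitoring program drives the interface roughly as sketched below. The __NR_pfm_* numbers come from the updated unistd headers; the pfarg_ctx/pfarg_load structures and the <perfmon/perfmon.h> include are assumptions about the matching user-level perfmon2 headers, named here only to make the call sequence concrete.]

    /* Hedged sketch: minimal self-monitoring flow over the new syscalls. */
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <perfmon/perfmon.h>   /* assumed userland header providing pfarg_* types */

    int monitor_self(void)
    {
            struct pfarg_ctx ctx_arg = { 0 };    /* context attributes (assumed layout) */
            struct pfarg_load load_arg = { 0 };  /* attach target (assumed layout) */
            int fd;

            /* create a per-thread context; the syscall returns a file descriptor */
            fd = syscall(__NR_pfm_create_context, &ctx_arg, NULL, NULL, 0);
            if (fd < 0)
                    return -1;

            /* PMCs/PMDs would be programmed here via pfm_write_pmcs/pfm_write_pmds */

            load_arg.load_pid = getpid();        /* attach to the calling thread */
            syscall(__NR_pfm_load_context, fd, &load_arg);
            syscall(__NR_pfm_start, fd, NULL);
            /* ... monitored workload ... */
            syscall(__NR_pfm_stop, fd);
            /* counter values are read back with pfm_read_pmds before close() */
            close(fd);
            return 0;
    }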
+ +config PERFMON_DEBUG + bool "Perfmon debugging" + default n + depends on PERFMON + help + Enables perfmon debugging support + +config PERFMON_DEBUG_FS + bool "Enable perfmon statistics reporting via debugfs" + default y + depends on PERFMON && DEBUG_FS + help + Enable collection and reporting of perfmon timing statistics under + debugfs. This is used for debugging and performance analysis of the + subsystem. The debugfs filesystem must be mounted. + +config X86_PERFMON_P6 + tristate "Support for Intel P6/Pentium M processor hardware performance counters" + depends on PERFMON && X86_32 + default n + help + Enables support for Intel P6-style hardware performance counters. + To be used with Intel Pentium III, Pentium Pro and Pentium M processors. + +config X86_PERFMON_P4 + tristate "Support for Intel Pentium 4/Xeon hardware performance counters" + depends on PERFMON + default n + help + Enables support for Intel Pentium 4/Xeon (Netburst) hardware performance + counters. + +config X86_PERFMON_PEBS_P4 + tristate "Support for Intel Netburst Precise Event-Based Sampling (PEBS)" + depends on PERFMON && X86_PERFMON_P4 + default n + help + Enables support for Precise Event-Based Sampling (PEBS) on the Intel + Netburst processors such as Pentium 4 and Xeon which support it. + +config X86_PERFMON_CORE + tristate "Support for Intel Core-based performance counters" + depends on PERFMON + default n + help + Enables support for Intel Core-based performance counters. Enable + this option to support Intel Core 2 processors. + +config X86_PERFMON_PEBS_CORE + tristate "Support for Intel Core Precise Event-Based Sampling (PEBS)" + depends on PERFMON && X86_PERFMON_CORE + default n + help + Enables support for Precise Event-Based Sampling (PEBS) on the Intel + Core processors. + +config X86_PERFMON_INTEL_ATOM + tristate "Support for Intel Atom processor" + depends on PERFMON + default n + help + Enables support for Intel Atom processors. + +config X86_PERFMON_INTEL_ARCH + tristate "Support for Intel architectural perfmon v1/v2" + depends on PERFMON + default n + help + Enables support for Intel architectural performance counters. + This feature was introduced with Intel Core Solo/Core Duo processors. + +config X86_PERFMON_AMD64 + tristate "Support AMD Athlon64/Opteron64 hardware performance counters" + depends on PERFMON + default n + help + Enables support for Athlon64/Opteron64 hardware performance counters. + Supports family 6, 15 and 16 (10H) processors. +endmenu diff --git a/arch/x86/perfmon/Makefile b/arch/x86/perfmon/Makefile new file mode 100644 index 0000000..1cbed3e --- /dev/null +++ b/arch/x86/perfmon/Makefile @@ -0,0 +1,13 @@ +# +# Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. 
+# Contributed by Stephane Eranian +# +obj-$(CONFIG_PERFMON) += perfmon.o +obj-$(CONFIG_X86_PERFMON_P6) += perfmon_p6.o +obj-$(CONFIG_X86_PERFMON_P4) += perfmon_p4.o +obj-$(CONFIG_X86_PERFMON_CORE) += perfmon_intel_core.o +obj-$(CONFIG_X86_PERFMON_INTEL_ARCH) += perfmon_intel_arch.o +obj-$(CONFIG_X86_PERFMON_PEBS_P4) += perfmon_pebs_p4_smpl.o +obj-$(CONFIG_X86_PERFMON_PEBS_CORE) += perfmon_pebs_core_smpl.o +obj-$(CONFIG_X86_PERFMON_AMD64) += perfmon_amd64.o +obj-$(CONFIG_X86_PERFMON_INTEL_ATOM) += perfmon_intel_atom.o diff --git a/arch/x86/perfmon/perfmon.c b/arch/x86/perfmon/perfmon.c new file mode 100644 index 0000000..e727fed --- /dev/null +++ b/arch/x86/perfmon/perfmon.c @@ -0,0 +1,761 @@ +/* + * This file implements the X86 specific support for the perfmon2 interface + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +#include + +DEFINE_PER_CPU(unsigned long, real_iip); +DEFINE_PER_CPU(int, pfm_using_nmi); +DEFINE_PER_CPU(unsigned long, saved_lvtpc); + +/** + * pfm_arch_ctxswin_thread - thread context switch in + * @task: task switched in + * @ctx: context for the task + * + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * set cannot be NULL. Context is locked. Interrupts are masked. + * + * Caller has already restored all PMD and PMC registers, if + * necessary (i.e., lazy restore scheme). + * + * On x86, the only common code just needs to unsecure RDPMC if necessary + * + * On model-specific features, e.g., PEBS, IBS, are taken care of in the + * corresponding PMU description module + */ +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * restore saved real iip + */ + if (ctx->active_set->npend_ovfls) + __get_cpu_var(real_iip) = ctx_arch->saved_real_iip; + + /* + * enable RDPMC on this CPU + */ + if (ctx_arch->flags.insecure) + set_in_cr4(X86_CR4_PCE); +} + +/** + * pfm_arch_ctxswout_thread - context switch out thread + * @task: task switched out + * @ctx : context switched out + * + * Called from pfm_ctxsw(). Task is guaranteed to be current. + * Context is locked. Interrupts are masked. Monitoring may be active. + * PMU access is guaranteed. PMC and PMD registers are live in PMU. 
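+ *
+ * Two pieces of state are carried across the switch: the active set is
+ * flagged PFM_SETFL_PRIV_MOD_PMCS so the next switch-in performs a full
+ * PMC reload (stop_save() below clears the enable bits), and, if
+ * overflows are still pending, the per-CPU real_iip is saved in the
+ * context so pfm_arch_ctxswin_thread() can restore it on whichever CPU
+ * the thread lands on and the reposted interrupt still reports the
+ * correct instruction pointer.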
+ * + * Return: + * non-zero : did not save PMDs (as part of stopping the PMU) + * 0 : saved PMDs (no need to save them in caller) + */ +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_arch_pmu_info *pmu_info; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + /* + * disable lazy restore of PMCS on ctxswin because + * we modify some of them. + */ + ctx->active_set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + + if (ctx->active_set->npend_ovfls) + ctx_arch->saved_real_iip = __get_cpu_var(real_iip); + + /* + * disable RDPMC on this CPU + */ + if (ctx_arch->flags.insecure) + clear_in_cr4(X86_CR4_PCE); + + if (ctx->state == PFM_CTX_MASKED) + return 1; + + return pmu_info->stop_save(ctx, ctx->active_set); +} + +/** + * pfm_arch_stop - deactivate monitoring + * @task: task to stop + * @ctx: context to stop + * + * Called from pfm_stop() + * Interrupts are masked. Context is locked. Set is the active set. + * + * For per-thread: + * task is not necessarily current. If not current task, then + * task is guaranteed stopped and off any cpu. Access to PMU + * is not guaranteed. + * + * For system-wide: + * task is current + * + * must disable active monitoring. ctx cannot be NULL + */ +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_info(); + + /* + * no need to go through stop_save() + * if we are already stopped + */ + if (!ctx->flags.started || ctx->state == PFM_CTX_MASKED) + return; + + if (task != current) + return; + + pmu_info->stop_save(ctx, ctx->active_set); +} + + +/** + * pfm_arch_start - activate monitoring + * @task: task to start + * @ctx: context to stop + * + * Interrupts are masked. Context is locked. + * + * For per-thread: + * Task is not necessarily current. If not current task, then task + * is guaranteed stopped and off any cpu. No access to PMU is task + * is not current. + * + * For system-wide: + * task is always current + */ +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx) +{ + struct pfm_event_set *set; + + set = ctx->active_set; + + if (task != current) + return; + + /* + * cannot restore PMC if no access to PMU. Will be done + * when the thread is switched back in + */ + + pfm_arch_restore_pmcs(ctx, set); +} + +/** + * pfm_arch_restore_pmds - reload PMD registers + * @ctx: context to restore from + * @set: current event set + * + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw() + * + * Context is locked. Interrupts are masked. Set cannot be NULL. + * Access to the PMU is guaranteed. + */ +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *pmu_info; + u16 i, num; + + pmu_info = pfm_pmu_info(); + + num = set->nused_pmds; + + /* + * model-specific override + */ + if (pmu_info->restore_pmds) { + pmu_info->restore_pmds(ctx, set); + return; + } + + /* + * we can restore only the PMD we use because: + * + * - can only read with pfm_read_pmds() the registers + * declared used via pfm_write_pmds(), smpl_pmds, reset_pmds + * + * - if cr4.pce=1, only counters are exposed to user. 
RDPMC + * does not work with other types of PMU registers.Thus, no + * address is ever exposed by counters + * + * - there is never a dependency between one pmd register and + * another + */ + for (i = 0; num; i++) { + if (likely(test_bit(i, cast_ulp(set->used_pmds)))) { + pfm_write_pmd(ctx, i, set->pmds[i].value); + num--; + } + } +} + +/** + * pfm_arch_restore_pmcs - reload PMC registers + * @ctx: context to restore from + * @set: current event set + * + * function called from pfm_switch_sets(), pfm_context_load_thread(), + * pfm_context_load_sys(), pfm_ctxsw(). + * + * Context is locked. Interrupts are masked. set cannot be NULL. + * Access to the PMU is guaranteed. + * + * function must restore all PMC registers from set + */ +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *pmu_info; + u64 *mask; + u16 i, num; + + pmu_info = pfm_pmu_info(); + + /* + * we need to restore PMCs only when: + * - context is not masked + * - monitoring activated + * + * Masking monitoring after an overflow does not change the + * value of flags.started + */ + if (ctx->state == PFM_CTX_MASKED || !ctx->flags.started) + return; + + /* + * model-specific override + */ + if (pmu_info->restore_pmcs) { + pmu_info->restore_pmcs(ctx, set); + return; + } + /* + * restore all pmcs + * + * It is not possible to restore only the pmcs we used because + * certain PMU models (e.g. Pentium 4) have dependencies. Thus + * we do not want one application using stale PMC coming from + * another one. + * + * On PMU models where there is no dependencies between pmc, then + * it is possible to optimize by only restoring the registers that + * are used, and this can be done with the models-specific override + * for this function. + * + * The default code takes the safest approach, i.e., assume the worse + */ + mask = ctx->regs.pmcs; + num = ctx->regs.num_pmcs; + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + num--; + } + } +} + +/** + * smp_pmu_interrupt - lowest level PMU interrupt handler for X86 + * @regs: machine state + * + * The PMU interrupt is handled through an interrupt gate, therefore + * the CPU automatically clears the EFLAGS.IF, i.e., masking interrupts. + * + * The perfmon interrupt handler MUST run with interrupts disabled due + * to possible race with other, higher priority interrupts, such as timer + * or IPI function calls. + * + * See description in IA-32 architecture manual, Vol 3 section 5.8.1 + */ +void smp_pmu_interrupt(struct pt_regs *regs) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_context *ctx; + unsigned long iip; + int using_nmi; + + using_nmi = __get_cpu_var(pfm_using_nmi); + + ack_APIC_irq(); + + irq_enter(); + + /* + * when using NMI, pfm_handle_nmi() gets called + * first. It stops monitoring and record the + * iip into real_iip, then it repost the interrupt + * using the lower priority vector LOCAL_PERFMON_VECTOR + * + * On some processors, e.g., P4, it may be that some + * state is already recorded from pfm_handle_nmi() + * and it only needs to be copied back into the normal + * fields so it can be used transparently by higher level + * code. 
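+ *
+ * The resulting two-stage delivery is, schematically:
+ *
+ *   counter overflow -> NMI -> pfm_handle_nmi():
+ *       quiesce the PMU, save the iip in real_iip, self-IPI
+ *   -> smp_pmu_interrupt() on LOCAL_PERFMON_VECTOR:
+ *       pfm_interrupt_handler(real_iip, regs)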
+ */ + if (using_nmi) { + ctx = __get_cpu_var(pmu_ctx); + pmu_info = pfm_pmu_info(); + iip = __get_cpu_var(real_iip); + if (ctx && pmu_info->nmi_copy_state) + pmu_info->nmi_copy_state(ctx); + } else + iip = instruction_pointer(regs); + + pfm_interrupt_handler(iip, regs); + + /* + * On Intel P6, Pentium M, P4, Intel Core: + * - it is necessary to clear the MASK field for the LVTPC + * vector. Otherwise interrupts remain masked. See + * section 8.5.1 + * AMD X86-64: + * - the documentation does not stipulate the behavior. + * To be safe, we also rewrite the vector to clear the + * mask field + */ + if (!using_nmi && current_cpu_data.x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, LOCAL_PERFMON_VECTOR); + + irq_exit(); +} + +/** + * pfm_handle_nmi - PMU NMI handler notifier callback + * @nb ; notifier block + * @val: type of die notifier + * @data: die notifier-specific data + * + * called from notify_die() notifier from an trap handler path. We only + * care about NMI related callbacks, and ignore everything else. + * + * Cannot grab any locks, include the perfmon context lock + * + * Must detect if NMI interrupt comes from perfmon, and if so it must + * stop the PMU and repost a lower-priority interrupt. The perfmon interrupt + * handler needs to grab the context lock, thus is cannot be run directly + * from the NMI interrupt call path. + */ +static int __kprobes pfm_handle_nmi(struct notifier_block *nb, + unsigned long val, + void *data) +{ + struct die_args *args = data; + struct pfm_context *ctx; + struct pfm_arch_pmu_info *pmu_info; + + /* + * only NMI related calls + */ + if (val != DIE_NMI_IPI) + return NOTIFY_DONE; + + /* + * perfmon not using NMI + */ + if (!__get_cpu_var(pfm_using_nmi)) + return NOTIFY_DONE; + + /* + * No context + */ + ctx = __get_cpu_var(pmu_ctx); + if (!ctx) { + PFM_DBG_ovfl("no ctx"); + return NOTIFY_DONE; + } + + /* + * Detect if we have overflows, i.e., NMI interrupt + * caused by PMU + */ + pmu_info = pfm_pmu_conf->pmu_info; + if (!pmu_info->has_ovfls(ctx)) { + PFM_DBG_ovfl("no ovfl"); + return NOTIFY_DONE; + } + + /* + * we stop the PMU to avoid further overflow before this + * one is treated by lower priority interrupt handler + */ + pmu_info->quiesce(); + + /* + * record actual instruction pointer + */ + __get_cpu_var(real_iip) = instruction_pointer(args->regs); + + /* + * post lower priority interrupt (LOCAL_PERFMON_VECTOR) + */ + pfm_arch_resend_irq(ctx); + + pfm_stats_inc(ovfl_intr_nmi_count); + + /* + * we need to rewrite the APIC vector on Intel + */ + if (current_cpu_data.x86_vendor == X86_VENDOR_INTEL) + apic_write(APIC_LVTPC, APIC_DM_NMI); + + /* + * the notification was for us + */ + return NOTIFY_STOP; +} + +static struct notifier_block pfm_nmi_nb = { + .notifier_call = pfm_handle_nmi +}; + +/** + * pfm_arch_get_pmu_module_name - get PMU description module name for autoload + * + * called from pfm_pmu_request_module + */ +char *pfm_arch_get_pmu_module_name(void) +{ + switch (current_cpu_data.x86) { + case 6: + switch (current_cpu_data.x86_model) { + case 3: /* Pentium II */ + case 7 ... 
11: + case 13: + return "perfmon_p6"; + case 15: /* Merom */ + case 23: /* Penryn */ + return "perfmon_intel_core"; + case 28: /* Atom/Silverthorne */ + return "perfmon_intel_atom"; + case 29: /* Dunnington */ + return "perfmon_intel_core"; + default: + goto try_arch; + } + case 15: + case 16: + /* All Opteron processors */ + if (current_cpu_data.x86_vendor == X86_VENDOR_AMD) + return "perfmon_amd64"; + + switch (current_cpu_data.x86_model) { + case 0 ... 6: + return "perfmon_p4"; + } + /* FALL THROUGH */ + default: +try_arch: + if (boot_cpu_has(X86_FEATURE_ARCH_PERFMON)) + return "perfmon_intel_arch"; + return NULL; + } + return NULL; +} + +/** + * pfm_arch_resend_irq - post perfmon interrupt on regular vector + * + * called from pfm_ctxswin_thread() and pfm_handle_nmi() + */ +void pfm_arch_resend_irq(struct pfm_context *ctx) +{ + unsigned long val, dest; + /* + * we cannot use hw_resend_irq() because it goes to + * the I/O APIC. We need to go to the Local APIC. + * + * The "int vec" is not the right solution either + * because it triggers a software intr. We need + * to regenerate the interrupt and have it pended + * until we unmask interrupts. + * + * Instead we send ourself an IPI on the perfmon + * vector. + */ + val = APIC_DEST_SELF|APIC_INT_ASSERT| + APIC_DM_FIXED|LOCAL_PERFMON_VECTOR; + + dest = apic_read(APIC_ID); + apic_write(APIC_ICR2, dest); + apic_write(APIC_ICR, val); +} + +/** + * pfm_arch_pmu_acquire_percpu - setup APIC per CPU + * @data: contains pmu flags + */ +static void pfm_arch_pmu_acquire_percpu(void *data) +{ + + struct pfm_arch_pmu_info *pmu_info; + unsigned int tmp, vec; + unsigned long flags = (unsigned long)data; + unsigned long lvtpc; + + pmu_info = pfm_pmu_conf->pmu_info; + + /* + * we only reprogram the LVTPC vector if we have detected + * no sharing, otherwise it means the APIC is already programmed + * and we use whatever vector (likely NMI) is there + */ + if (!(flags & PFM_X86_FL_SHARING)) { + if (flags & PFM_X86_FL_USE_NMI) + vec = APIC_DM_NMI; + else + vec = LOCAL_PERFMON_VECTOR; + + tmp = apic_read(APIC_LVTERR); + apic_write(APIC_LVTERR, tmp | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, vec); + apic_write(APIC_LVTERR, tmp); + } + lvtpc = (unsigned long)apic_read(APIC_LVTPC); + + __get_cpu_var(pfm_using_nmi) = lvtpc == APIC_DM_NMI; + + PFM_DBG("LTVPC=0x%lx using_nmi=%d", lvtpc, __get_cpu_var(pfm_using_nmi)); + + /* + * invoke model specific acquire routine. 
May be used for + * model-specific initializations + */ + if (pmu_info->acquire_pmu_percpu) + pmu_info->acquire_pmu_percpu(); +} + +/** + * pfm_arch_pmu_acquire - acquire PMU resource from system + * @unavail_pmcs : bitmask to use to set unavailable pmcs + * @unavail_pmds : bitmask to use to set unavailable pmds + * + * interrupts are not masked + * + * Grab PMU registers from lower level MSR allocator + * + * Program the APIC according the possible interrupt vector + * either LOCAL_PERFMON_VECTOR or NMI + */ +int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_regmap_desc *d; + u16 i, nlost; + + pmu_info = pfm_pmu_conf->pmu_info; + pmu_info->flags &= ~PFM_X86_FL_SHARING; + + nlost = 0; + + d = pfm_pmu_conf->pmc_desc; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + + if (d->type & PFM_REG_V) + continue; + /* + * reserve register with lower-level allocator + */ + if (!reserve_evntsel_nmi(d->hw_addr)) { + PFM_DBG("pmc%d(%s) already used", i, d->desc); + __set_bit(i, cast_ulp(unavail_pmcs)); + nlost++; + continue; + } + } + PFM_DBG("nlost=%d info_flags=0x%x\n", nlost, pmu_info->flags); + /* + * some PMU models (e.g., P6) do not support sharing + * so check if we found less than the expected number of PMC registers + */ + if (nlost) { + if (pmu_info->flags & PFM_X86_FL_NO_SHARING) { + PFM_INFO("PMU already used by another subsystem, " + "PMU does not support sharing, " + "try disabling Oprofile or " + "reboot with nmi_watchdog=0"); + goto undo; + } + pmu_info->flags |= PFM_X86_FL_SHARING; + } + + d = pfm_pmu_conf->pmd_desc; + for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + + if (d->type & PFM_REG_V) + continue; + + if (!reserve_perfctr_nmi(d->hw_addr)) { + PFM_DBG("pmd%d(%s) already used", i, d->desc); + __set_bit(i, cast_ulp(unavail_pmds)); + } + } + /* + * program APIC on each CPU + */ + on_each_cpu(pfm_arch_pmu_acquire_percpu, + (void *)(unsigned long)pmu_info->flags , 1); + + return 0; +undo: + /* + * must undo reservation of pmcs in case of error + */ + d = pfm_pmu_conf->pmc_desc; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { + if (!(d->type & (PFM_REG_I|PFM_REG_V))) + continue; + if (!test_bit(i, cast_ulp(unavail_pmcs))) + release_evntsel_nmi(d->hw_addr); + } + return -EBUSY; +} +/** + * pfm-arch_pmu_release_percpu - clear NMI state for one CPU + * + */ +static void pfm_arch_pmu_release_percpu(void *data) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_conf->pmu_info; + + __get_cpu_var(pfm_using_nmi) = 0; + + /* + * invoke model specific release routine. + * May be used to undo certain initializations + * or free some model-specific ressources. 
+ */ + if (pmu_info->release_pmu_percpu) + pmu_info->release_pmu_percpu(); +} + +/** + * pfm_arch_pmu_release - release PMU resource to system + * + * called from pfm_pmu_release() + * interrupts are not masked + * + * On x86, we return the PMU registers to the MSR allocator + */ +void pfm_arch_pmu_release(void) +{ + struct pfm_regmap_desc *d; + u16 i, n; + + d = pfm_pmu_conf->pmc_desc; + n = pfm_pmu_conf->regs_all.num_pmcs; + for (i = 0; n; i++, d++) { + if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + continue; + release_evntsel_nmi(d->hw_addr); + n--; + PFM_DBG("pmc%u released", i); + } + d = pfm_pmu_conf->pmd_desc; + n = pfm_pmu_conf->regs_all.num_pmds; + for (i = 0; n; i++, d++) { + if (!test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmds))) + continue; + release_perfctr_nmi(d->hw_addr); + n--; + PFM_DBG("pmd%u released", i); + } + + /* clear NMI variable if used */ + if (__get_cpu_var(pfm_using_nmi)) + on_each_cpu(pfm_arch_pmu_release_percpu, NULL , 1); +} + +/** + * pfm_arch_pmu_config_init - validate PMU description structure + * @cfg: PMU description structure + * + * return: + * 0 if valid + * errno otherwise + * + * called from pfm_pmu_register() + */ +int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_info(); + if (!pmu_info) { + PFM_DBG("%s missing pmu_info", cfg->pmu_name); + return -EINVAL; + } + if (!pmu_info->has_ovfls) { + PFM_DBG("%s missing has_ovfls callback", cfg->pmu_name); + return -EINVAL; + } + if (!pmu_info->quiesce) { + PFM_DBG("%s missing quiesce callback", cfg->pmu_name); + return -EINVAL; + } + if (!pmu_info->stop_save) { + PFM_DBG("%s missing stop_save callback", cfg->pmu_name); + return -EINVAL; + } + return 0; +} + +/** + * pfm_arch_init - one time global arch-specific initialization + * + * called from pfm_init() + */ +int __init pfm_arch_init(void) +{ + /* + * we need to register our NMI handler when the kernels boots + * to avoid a deadlock condition with the NMI watchdog or Oprofile + * if we were to try and register/unregister on-demand. + */ + register_die_notifier(&pfm_nmi_nb); + return 0; +} diff --git a/arch/x86/perfmon/perfmon_amd64.c b/arch/x86/perfmon/perfmon_amd64.c new file mode 100644 index 0000000..f9b5f9c --- /dev/null +++ b/arch/x86/perfmon/perfmon_amd64.c @@ -0,0 +1,754 @@ +/* + * This file contains the PMU description for the Athlon64 and Opteron64 + * processors. It supports 32 and 64-bit modes. + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_AUTHOR("Robert Richter "); +MODULE_DESCRIPTION("AMD64 PMU description table"); +MODULE_LICENSE("GPL"); + +#define PCI_DEVICE_ID_AMD_10H_NB_MISC 0x1203 + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +#define HAS_IBS 0x01 /* has IBS support */ + +static u8 ibs_eilvt_off, ibs_status; /* AMD: extended interrupt LVT offset */ + +static void pfm_amd64_restore_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set); +static void __kprobes pfm_amd64_quiesce(void); +static int pfm_amd64_has_ovfls(struct pfm_context *ctx); +static int pfm_amd64_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); + +#define IBSFETCHCTL_PMC 4 /* pmc4 */ +#define IBSFETCHCTL_PMD 4 /* pmd4 */ +#define IBSOPSCTL_PMC 5 /* pmc5 */ +#define IBSOPSCTL_PMD 7 /* pmd7 */ + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; + +static struct pfm_arch_pmu_info pfm_amd64_pmu_info = { + .stop_save = pfm_amd64_stop_save, + .has_ovfls = pfm_amd64_has_ovfls, + .quiesce = pfm_amd64_quiesce, + .restore_pmcs = pfm_amd64_restore_pmcs +}; + +#define PFM_AMD64_IBSFETCHVAL (1ULL<<49) /* valid fetch sample */ +#define PFM_AMD64_IBSFETCHEN (1ULL<<48) /* fetch sampling enabled */ +#define PFM_AMD64_IBSOPVAL (1ULL<<18) /* valid execution sample */ +#define PFM_AMD64_IBSOPEN (1ULL<<17) /* execution sampling enabled */ + +/* + * force Local APIC interrupt on overflow + */ +#define PFM_K8_VAL (1ULL<<20) +#define PFM_K8_NO64 (1ULL<<20) + +/* + * reserved bits must be 1 + * + * for family 15: + * - upper 32 bits are reserved + * - bit 20, bit 21 + * + * for family 16: + * - bits 36-39 are reserved + * - bits 42-63 are reserved + * - bit 20, bit 21 + * + * for IBS registers: + * IBSFETCHCTL: all bits are reserved except bits 57, 48, 15:0 + * IBSOPSCTL : all bits are reserved except bits 17, 15:0 + */ +#define PFM_K8_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21)) +#define PFM_16_RSVD ((0x3fffffULL<<42) | (0xfULL<<36) | (1ULL<<20) | (1ULL<<21)) +#define PFM_AMD64_IBSFETCHCTL_RSVD (~((1ULL<<48)|(1ULL<<57)|0xffffULL)) +#define PFM_AMD64_IBSOPCTL_RSVD (~((1ULL<<17)|0xffffULL)) + +static struct pfm_regmap_desc pfm_amd64_pmc_desc[] = { +/* pmc0 */ PMC_D(PFM_REG_I64, "PERFSEL0", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL0), +/* pmc1 */ PMC_D(PFM_REG_I64, "PERFSEL1", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL1), +/* pmc2 */ PMC_D(PFM_REG_I64, "PERFSEL2", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL2), +/* pmc3 */ PMC_D(PFM_REG_I64, "PERFSEL3", PFM_K8_VAL, PFM_K8_RSVD, PFM_K8_NO64, MSR_K7_EVNTSEL3), +/* pmc4 */ PMC_D(PFM_REG_I, "IBSFETCHCTL", 0, PFM_AMD64_IBSFETCHCTL_RSVD, 0, MSR_AMD64_IBSFETCHCTL), +/* pmc5 */ PMC_D(PFM_REG_I, "IBSOPCTL", 0, PFM_AMD64_IBSOPCTL_RSVD, 0, MSR_AMD64_IBSOPCTL), +}; +#define PFM_AMD_NUM_PMCS ARRAY_SIZE(pfm_amd64_pmc_desc) + +#define PFM_REG_IBS (PFM_REG_I|PFM_REG_INTR) + +/* + * AMD64 counters are 48 bits, upper bits are reserved + */ +#define PFM_AMD64_CTR_RSVD (~((1ULL<<48)-1)) + +#define PFM_AMD_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PERFCTR"#n, \ + .hw_addr = MSR_K7_PERFCTR0+n, \ + .rsvd_msk = PFM_AMD64_CTR_RSVD, \ + .dep_pmcs[0] = 
1ULL << n \ + } + +#define PFM_AMD_IBSO(t, s, a) \ + { .type = t, \ + .desc = s, \ + .hw_addr = a, \ + .rsvd_msk = 0, \ + .dep_pmcs[0] = 1ULL << 5 \ + } + +#define PFM_AMD_IBSF(t, s, a) \ + { .type = t, \ + .desc = s, \ + .hw_addr = a, \ + .rsvd_msk = 0, \ + .dep_pmcs[0] = 1ULL << 6 \ + } + +static struct pfm_regmap_desc pfm_amd64_pmd_desc[] = { +/* pmd0 */ PFM_AMD_D(0), +/* pmd1 */ PFM_AMD_D(1), +/* pmd2 */ PFM_AMD_D(2), +/* pmd3 */ PFM_AMD_D(3), +/* pmd4 */ PFM_AMD_IBSF(PFM_REG_IBS, "IBSFETCHCTL", MSR_AMD64_IBSFETCHCTL), +/* pmd5 */ PFM_AMD_IBSF(PFM_REG_IRO, "IBSFETCHLINAD", MSR_AMD64_IBSFETCHLINAD), +/* pmd6 */ PFM_AMD_IBSF(PFM_REG_IRO, "IBSFETCHPHYSAD", MSR_AMD64_IBSFETCHPHYSAD), +/* pmd7 */ PFM_AMD_IBSO(PFM_REG_IBS, "IBSOPCTL", MSR_AMD64_IBSOPCTL), +/* pmd8 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPRIP", MSR_AMD64_IBSOPRIP), +/* pmd9 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA", MSR_AMD64_IBSOPDATA), +/* pmd10 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA2", MSR_AMD64_IBSOPDATA2), +/* pmd11 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSOPDATA3", MSR_AMD64_IBSOPDATA3), +/* pmd12 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSDCLINAD", MSR_AMD64_IBSDCLINAD), +/* pmd13 */ PFM_AMD_IBSO(PFM_REG_IRO, "IBSDCPHYSAD", MSR_AMD64_IBSDCPHYSAD), +}; +#define PFM_AMD_NUM_PMDS ARRAY_SIZE(pfm_amd64_pmd_desc) + +static struct pfm_context **pfm_nb_sys_owners; +static struct pfm_context *pfm_nb_task_owner; + +static struct pfm_pmu_config pfm_amd64_pmu_conf; + +#define is_ibs_pmc(x) (x == 4 || x == 5) + +static void pfm_amd64_setup_eilvt_per_cpu(void *info) +{ + u8 lvt_off; + + /* program the IBS vector to the perfmon vector */ + lvt_off = setup_APIC_eilvt_ibs(LOCAL_PERFMON_VECTOR, + APIC_EILVT_MSG_FIX, 0); + PFM_DBG("APIC_EILVT%d set to 0x%x", lvt_off, LOCAL_PERFMON_VECTOR); + ibs_eilvt_off = lvt_off; +} + +static int pfm_amd64_setup_eilvt(void) +{ +#define IBSCTL_LVTOFFSETVAL (1 << 8) +#define IBSCTL 0x1cc + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + /* per CPU setup */ + on_each_cpu(pfm_amd64_setup_eilvt_per_cpu, NULL, 1); + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVTOFFSETVAL); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + PFM_DBG("Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x", value); + return 1; + } + } while (1); + + if (!nodes) { + PFM_DBG("No CPU node configured for IBS"); + return 1; + } + +#ifdef CONFIG_NUMA + /* Sanity check */ + /* Works only for 64bit with proper numa implementation. 
*/ + if (nodes != num_possible_nodes()) { + PFM_DBG("Failed to setup CPU node(s) for IBS, " + "found: %d, expected %d", + nodes, num_possible_nodes()); + return 1; + } +#endif + return 0; +} + +/* + * There can only be one user per socket for the Northbridge (NB) events, + * so we enforce mutual exclusion as follows: + * - per-thread : only one context machine-wide can use NB events + * - system-wide: only one context per processor socket + * + * Exclusion is enforced at: + * - pfm_load_context() + * - pfm_write_pmcs() for attached contexts + * + * Exclusion is released at: + * - pfm_unload_context() or any calls that implicitely uses it + * + * return: + * 0 : successfully acquire NB access + * < 0: errno, failed to acquire NB access + */ +static int pfm_amd64_acquire_nb(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = cpu_data(smp_processor_id()).phys_proc_id; +#else + proc_id = 0; +#endif + + if (ctx->flags.system) + entry = &pfm_nb_sys_owners[proc_id]; + else + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, NULL, ctx); + if (!old) { + if (ctx->flags.system) + PFM_DBG("acquired Northbridge event access on socket %u", proc_id); + else + PFM_DBG("acquired Northbridge event access globally"); + } else if (old != ctx) { + if (ctx->flags.system) + PFM_DBG("NorthBridge event conflict on socket %u", proc_id); + else + PFM_DBG("global NorthBridge event conflict"); + return -EBUSY; + } + return 0; +} + +/* + * invoked from pfm_write_pmcs() when pfm_nb_sys_owners is not NULL,i.e., + * when we have detected a multi-core processor. + * + * context is locked, interrupts are masked + */ +static int pfm_amd64_pmc_write_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + unsigned int event; + + /* + * delay checking NB event until we load the context + */ + if (ctx->state == PFM_CTX_UNLOADED) + return 0; + + /* + * check event is NB event + */ + event = (unsigned int)(req->reg_value & 0xff); + if (event < 0xee) + return 0; + + return pfm_amd64_acquire_nb(ctx); +} + +/* + * invoked on pfm_load_context(). 
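+ * The scan below mirrors pfm_amd64_pmc_write_check(): any non-IBS PMC
+ * whose event-select field (low 8 bits) is >= 0xee is treated as a
+ * NorthBridge event and triggers the per-socket claim in
+ * pfm_amd64_acquire_nb().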
+ * context is locked, interrupts are masked + */ +static int pfm_amd64_load_context(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + unsigned int i, n; + + /* + * scan all sets for NB events + */ + list_for_each_entry(set, &ctx->set_list, list) { + n = set->nused_pmcs; + for (i = 0; n; i++) { + if (!test_bit(i, cast_ulp(set->used_pmcs))) + continue; + + if (!is_ibs_pmc(i) && (set->pmcs[i] & 0xff) >= 0xee) + goto found; + n--; + } + } + return 0; +found: + return pfm_amd64_acquire_nb(ctx); +} + +/* + * invoked on pfm_unload_context() + */ +static void pfm_amd64_unload_context(struct pfm_context *ctx) +{ + struct pfm_context **entry, *old; + int proc_id; + +#ifdef CONFIG_SMP + proc_id = cpu_data(smp_processor_id()).phys_proc_id; +#else + proc_id = 0; +#endif + + /* + * unload always happens on the monitored CPU in system-wide + */ + if (ctx->flags.system) + entry = &pfm_nb_sys_owners[proc_id]; + else + entry = &pfm_nb_task_owner; + + old = cmpxchg(entry, ctx, NULL); + if (old == ctx) { + if (ctx->flags.system) + PFM_DBG("released NorthBridge on socket %u", proc_id); + else + PFM_DBG("released NorthBridge events globally"); + } +} + +/* + * detect if we need to activate NorthBridge event access control + */ +static int pfm_amd64_setup_nb_event_control(void) +{ + unsigned int c, n = 0; + unsigned int max_phys = 0; + +#ifdef CONFIG_SMP + for_each_possible_cpu(c) { + if (cpu_data(c).phys_proc_id > max_phys) + max_phys = cpu_data(c).phys_proc_id; + } +#else + max_phys = 0; +#endif + if (max_phys > 255) { + PFM_INFO("socket id %d is too big to handle", max_phys); + return -ENOMEM; + } + + n = max_phys + 1; + if (n < 2) + return 0; + + pfm_nb_sys_owners = vmalloc(n * sizeof(*pfm_nb_sys_owners)); + if (!pfm_nb_sys_owners) + return -ENOMEM; + + memset(pfm_nb_sys_owners, 0, n * sizeof(*pfm_nb_sys_owners)); + pfm_nb_task_owner = NULL; + + /* + * activate write-checker for PMC registers + */ + for (c = 0; c < PFM_AMD_NUM_PMCS; c++) { + if (!is_ibs_pmc(c)) + pfm_amd64_pmc_desc[c].type |= PFM_REG_WC; + } + + pfm_amd64_pmu_info.load_context = pfm_amd64_load_context; + pfm_amd64_pmu_info.unload_context = pfm_amd64_unload_context; + + pfm_amd64_pmu_conf.pmc_write_check = pfm_amd64_pmc_write_check; + + PFM_INFO("NorthBridge event access control enabled"); + + return 0; +} + +/* + * disable registers which are not available on + * the host (applies to IBS registers) + */ +static void pfm_amd64_check_registers(void) +{ + u16 i; + + PFM_DBG("has_ibs=%d", !!(ibs_status & HAS_IBS)); + + __set_bit(0, cast_ulp(enable_mask)); + __set_bit(1, cast_ulp(enable_mask)); + __set_bit(2, cast_ulp(enable_mask)); + __set_bit(3, cast_ulp(enable_mask)); + max_enable = 3+1; + + + /* + * remove IBS registers if feature not present + */ + if (!(ibs_status & HAS_IBS)) { + pfm_amd64_pmc_desc[4].type = PFM_REG_NA; + pfm_amd64_pmc_desc[5].type = PFM_REG_NA; + for (i = 4; i < 14; i++) + pfm_amd64_pmd_desc[i].type = PFM_REG_NA; + } else { + __set_bit(16, cast_ulp(enable_mask)); + __set_bit(17, cast_ulp(enable_mask)); + max_enable = 17 + 1; + } + + /* + * adjust reserved bit fields for family 16 + */ + if (current_cpu_data.x86 == 16) { + for (i = 0; i < PFM_AMD_NUM_PMCS; i++) + if (pfm_amd64_pmc_desc[i].rsvd_msk == PFM_K8_RSVD) + pfm_amd64_pmc_desc[i].rsvd_msk = PFM_16_RSVD; + } +} + +static int pfm_amd64_probe_pmu(void) +{ + u64 val = 0; + if (current_cpu_data.x86_vendor != X86_VENDOR_AMD) { + PFM_INFO("not an AMD processor"); + return -1; + } + + switch (current_cpu_data.x86) { + case 16: + case 15: + case 6: + break; + 
default: + PFM_INFO("unsupported family=%d", current_cpu_data.x86); + return -1; + } + + /* check for IBS */ + if (cpu_has(¤t_cpu_data, X86_FEATURE_IBS)) { + ibs_status |= HAS_IBS; + rdmsrl(MSR_AMD64_IBSCTL, val); + } + + PFM_INFO("found family=%d IBSCTL=0x%llx", current_cpu_data.x86, (unsigned long long)val); + + /* + * check for local APIC (required) + */ + if (!cpu_has_apic) { + PFM_INFO("no local APIC, unsupported"); + return -1; + } + + if (current_cpu_data.x86_max_cores > 1 + && pfm_amd64_setup_nb_event_control()) + return -1; + + if (force_nmi) + pfm_amd64_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + if (ibs_status & HAS_IBS) { + /* Setup extended interrupt */ + if (pfm_amd64_setup_eilvt()) { + PFM_INFO("Failed to initialize extended interrupts " + "for IBS"); + ibs_status &= ~HAS_IBS; + PFM_INFO("Unable to use IBS"); + } else { + PFM_INFO("IBS supported"); + } + } + + pfm_amd64_check_registers(); + + return 0; +} + +/* + * detect is counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + */ +static int __kprobes pfm_amd64_has_ovfls(struct pfm_context *ctx) +{ + struct pfm_regmap_desc *xrd; + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + /* + * Check for IBS events + */ + if (ibs_status & HAS_IBS) { + rdmsrl(MSR_AMD64_IBSFETCHCTL, val); + if (val & PFM_AMD64_IBSFETCHVAL) + return 1; + rdmsrl(MSR_AMD64_IBSOPCTL, val); + if (val & PFM_AMD64_IBSOPVAL) + return 1; + } + /* + * Check regular counters + */ + cnt_mask = ctx->regs.cnt_pmds; + num = ctx->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + xrd = pfm_amd64_pmd_desc; + + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(cnt_mask))) { + rdmsrl(xrd[i].hw_addr, val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +/* + * Must check for IBS event BEFORE stop_save_p6 because + * stopping monitoring does destroy IBS state information + * in IBSFETCHCTL/IBSOPCTL because they are tagged as enable + * registers. + */ +static int pfm_amd64_stop_save(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *pmu_info; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_pmds; + u64 val, wmask, ovfl_mask; + u32 i, count, use_ibs; + + pmu_info = pfm_pmu_info(); + + /* + * IBS used if: + * - on family 10h processor with IBS + * - at least one of the IBS PMD registers is used + */ + use_ibs = (ibs_status & HAS_IBS) + && (test_bit(IBSFETCHCTL_PMD, cast_ulp(set->used_pmds)) + || test_bit(IBSOPSCTL_PMD, cast_ulp(set->used_pmds))); + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(enable_mask), + max_enable); + + count = bitmap_weight(cast_ulp(used_mask), max_enable); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. 
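+ *
+ * Note: wmask (bit counter_width, i.e. bit 47 on this PMU) is how a
+ * pending overflow is recognized further down: the perfmon2 core arms
+ * counters with that bit set (not visible in this hunk), so a counter
+ * whose bit 47 reads back as zero has wrapped since it was last written.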
+ * + * With IBS, we need to do read-modify-write to preserve the content + * for OpsCTL and FetchCTL because they are also used as PMDs and saved + * below + */ + if (use_ibs) { + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + if (i == IBSFETCHCTL_PMC) { + rdmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val); + val &= ~PFM_AMD64_IBSFETCHEN; + } else if (i == IBSOPSCTL_PMC) { + rdmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val); + val &= ~PFM_AMD64_IBSOPEN; + } else + val = 0; + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, val); + count--; + } + } + } else { + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. + */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_pmds = ctx->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * we employ used_pmds because we also need to save + * and not just check for pending interrupts. + * + * Must check for counting PMDs because of virtual PMDs and IBS + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(set->used_pmds))) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_pmds)))) { + if (!(val & wmask)) { + __set_bit(i, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask); + } + set->pmds[i].value = val; + count--; + } + } + + /* + * check if IBS contains valid data, and mark the corresponding + * PMD has overflowed + */ + if (use_ibs) { + if (set->pmds[IBSFETCHCTL_PMD].value & PFM_AMD64_IBSFETCHVAL) { + __set_bit(IBSFETCHCTL_PMD, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + if (set->pmds[IBSOPSCTL_PMD].value & PFM_AMD64_IBSOPVAL) { + __set_bit(IBSOPSCTL_PMD, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_amd64_quiesce_pmu -- stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon related locks + */ +static void __kprobes pfm_amd64_quiesce(void) +{ + /* + * quiesce PMU by clearing available registers that have + * the start/stop capability + */ + if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_K7_EVNTSEL0, 0); + if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_K7_EVNTSEL0+1, 0); + if (test_bit(2, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_K7_EVNTSEL0+2, 0); + if (test_bit(3, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_K7_EVNTSEL0+3, 0); + + if (test_bit(4, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); + if (test_bit(5, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_AMD64_IBSOPCTL, 0); +} + +/** + * pfm_amd64_restore_pmcs - reload PMC registers + * @ctx: context to restore from + * @set: current event set + * + * optimized version of pfm_arch_restore_pmcs(). On AMD64, we can + * afford to only restore the pmcs registers we use, because they are + * all independent from each other. 
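+ * This is the opposite trade-off from the generic pfm_arch_restore_pmcs()
+ * above, which reloads every implemented PMC because some PMU models
+ * (e.g. Pentium 4) have cross-register dependencies.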
+ */ +static void pfm_amd64_restore_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 *mask; + u16 i, num; + + mask = set->used_pmcs; + num = set->nused_pmcs; + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + wrmsrl(pfm_amd64_pmc_desc[i].hw_addr, set->pmcs[i]); + num--; + } + } +} + +static struct pfm_pmu_config pfm_amd64_pmu_conf = { + .pmu_name = "AMD64", + .counter_width = 47, + .pmd_desc = pfm_amd64_pmd_desc, + .pmc_desc = pfm_amd64_pmc_desc, + .num_pmc_entries = PFM_AMD_NUM_PMCS, + .num_pmd_entries = PFM_AMD_NUM_PMDS, + .probe_pmu = pfm_amd64_probe_pmu, + .version = "1.2", + .pmu_info = &pfm_amd64_pmu_info, + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +static int __init pfm_amd64_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_amd64_pmu_conf); +} + +static void __exit pfm_amd64_pmu_cleanup_module(void) +{ + if (pfm_nb_sys_owners) + vfree(pfm_nb_sys_owners); + + pfm_pmu_unregister(&pfm_amd64_pmu_conf); +} + +module_init(pfm_amd64_pmu_init_module); +module_exit(pfm_amd64_pmu_cleanup_module); diff --git a/arch/x86/perfmon/perfmon_intel_arch.c b/arch/x86/perfmon/perfmon_intel_arch.c new file mode 100644 index 0000000..e27a732 --- /dev/null +++ b/arch/x86/perfmon/perfmon_intel_arch.c @@ -0,0 +1,610 @@ +/* + * This file contains the Intel architectural perfmon v1, v2, v3 + * description tables. + * + * Architectural perfmon was introduced with Intel Core Solo/Duo + * processors. + * + * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel architectural perfmon v1"); +MODULE_LICENSE("GPL"); + +static int force, force_nmi; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force, bool, 0600); +module_param(force_nmi, bool, 0600); + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits are 1 + */ +#define PFM_IA_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_IA_PMC_VAL (1ULL<<20) +#define PFM_IA_NO64 (1ULL<<20) + +/* + * architectuture specifies that: + * IA32_PMCx MSR : starts at 0x0c1 & occupy a contiguous block of MSR + * IA32_PERFEVTSELx MSR : starts at 0x186 & occupy a contiguous block of MSR + * MSR_GEN_FIXED_CTR0 : starts at 0x309 & occupy a contiguous block of MSR + */ +#define MSR_GEN_SEL_BASE MSR_P6_EVNTSEL0 +#define MSR_GEN_PMC_BASE MSR_P6_PERFCTR0 +#define MSR_GEN_FIXED_PMC_BASE MSR_CORE_PERF_FIXED_CTR0 + +/* + * layout of EAX for CPUID.0xa leaf function + */ +struct pmu_eax { + unsigned int version:8; /* architectural perfmon version */ + unsigned int num_cnt:8; /* number of generic counters */ + unsigned int cnt_width:8; /* width of generic counters */ + unsigned int ebx_length:8; /* number of architected events */ +}; + +/* + * layout of EDX for CPUID.0xa leaf function when perfmon v2 is detected + */ +struct pmu_edx { + unsigned int num_cnt:5; /* number of fixed counters */ + unsigned int cnt_width:8; /* width of fixed counters */ + unsigned int reserved:19; +}; + +static void pfm_intel_arch_restore_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set); +static int pfm_intel_arch_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); +static int pfm_intel_arch_has_ovfls(struct pfm_context *ctx); +static void __kprobes pfm_intel_arch_quiesce(void); + +/* + * physical addresses of MSR controlling the perfevtsel and counter registers + */ +struct pfm_arch_pmu_info pfm_intel_arch_pmu_info = { + .stop_save = pfm_intel_arch_stop_save, + .has_ovfls = pfm_intel_arch_has_ovfls, + .quiesce = pfm_intel_arch_quiesce, + .restore_pmcs = pfm_intel_arch_restore_pmcs +}; + +#define PFM_IA_C(n) { \ + .type = PFM_REG_I64, \ + .desc = "PERFEVTSEL"#n, \ + .dfl_val = PFM_IA_PMC_VAL, \ + .rsvd_msk = PFM_IA_PMC_RSVD, \ + .no_emul64_msk = PFM_IA_NO64, \ + .hw_addr = MSR_GEN_SEL_BASE+(n) \ + } + +#define PFM_IA_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PMC"#n, \ + .hw_addr = MSR_P6_PERFCTR0+n, \ + .dep_pmcs[0] = 1ULL << n \ + } + +#define PFM_IA_FD(n) \ + { .type = PFM_REG_C, \ + .desc = "FIXED_CTR"#n, \ + .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\ + .dep_pmcs[0] = 1ULL << 16 \ + } + +static struct pfm_regmap_desc pfm_intel_arch_pmc_desc[] = { +/* pmc0 */ PFM_IA_C(0), PFM_IA_C(1), PFM_IA_C(2), PFM_IA_C(3), +/* pmc4 */ PFM_IA_C(4), PFM_IA_C(5), PFM_IA_C(6), PFM_IA_C(7), +/* pmc8 */ PFM_IA_C(8), PFM_IA_C(9), PFM_IA_C(10), PFM_IA_C(11), +/* pmc12 */ PFM_IA_C(12), PFM_IA_C(13), PFM_IA_C(14), PFM_IA_C(15), + +/* pmc16 */ { 
.type = PFM_REG_I, + .desc = "FIXED_CTRL", + .dfl_val = 0x8888888888888888ULL, /* force PMI */ + .rsvd_msk = 0, /* set dynamically */ + .no_emul64_msk = 0, + .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL + }, +}; +#define PFM_IA_MAX_PMCS ARRAY_SIZE(pfm_intel_arch_pmc_desc) + +static struct pfm_regmap_desc pfm_intel_arch_pmd_desc[] = { +/* pmd0 */ PFM_IA_D(0), PFM_IA_D(1), PFM_IA_D(2), PFM_IA_D(3), +/* pmd4 */ PFM_IA_D(4), PFM_IA_D(5), PFM_IA_D(6), PFM_IA_D(7), +/* pmd8 */ PFM_IA_D(8), PFM_IA_D(9), PFM_IA_D(10), PFM_IA_D(11), +/* pmd12 */ PFM_IA_D(12), PFM_IA_D(13), PFM_IA_D(14), PFM_IA_D(15), + +/* pmd16 */ PFM_IA_FD(0), PFM_IA_FD(1), PFM_IA_FD(2), PFM_IA_FD(3), +/* pmd20 */ PFM_IA_FD(4), PFM_IA_FD(5), PFM_IA_FD(6), PFM_IA_FD(7), +/* pmd24 */ PFM_IA_FD(8), PFM_IA_FD(9), PFM_IA_FD(10), PFM_IA_FD(11), +/* pmd28 */ PFM_IA_FD(16), PFM_IA_FD(17), PFM_IA_FD(18), PFM_IA_FD(19) +}; +#define PFM_IA_MAX_PMDS ARRAY_SIZE(pfm_intel_arch_pmd_desc) + +#define PFM_IA_MAX_CNT 16 /* # generic counters in mapping table */ +#define PFM_IA_MAX_FCNT 16 /* # of fixed counters in mapping table */ +#define PFM_IA_FCNT_BASE 16 /* base index of fixed counters PMD */ + +static struct pfm_pmu_config pfm_intel_arch_pmu_conf; + +static void pfm_intel_arch_check_errata(void) +{ + /* + * Core Duo errata AE49 (no fix). Both counters share a single + * enable bit in PERFEVTSEL0 + */ + if (current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 14) + pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_NO_SHARING; +} + +static inline void set_enable_mask(unsigned int i) +{ + __set_bit(i, cast_ulp(enable_mask)); + + /* max_enable = highest + 1 */ + if ((i+1) > max_enable) + max_enable = i+ 1; +} + +static void pfm_intel_arch_setup_generic(unsigned int version, + unsigned int width, + unsigned int count) +{ + u64 rsvd; + unsigned int i; + + /* + * first we handle the generic counters: + * + * - ensure HW does not have more registers than hardcoded in the tables + * - adjust rsvd_msk to actual counter width + * - initialize enable_mask (list of PMC with start/stop capability) + * - mark unused hardcoded generic counters as unimplemented + */ + + /* + * min of number of Hw counters and hardcoded in the tables + */ + if (count >= PFM_IA_MAX_CNT) { + printk(KERN_INFO "perfmon: Limiting number of generic counters" + " to %u, HW supports %u", + PFM_IA_MAX_CNT, count); + count = PFM_IA_MAX_CNT; + } + + /* + * adjust rsvd_msk for generic counters based on actual width + * initialize enable_mask (1 per pmd) + */ + rsvd = ~((1ULL << width)-1); + for (i = 0; i < count; i++) { + pfm_intel_arch_pmd_desc[i].rsvd_msk = rsvd; + set_enable_mask(i); + } + + /* + * handle version 3 new anythread bit (21) + */ + if (version == 3) { + for (i = 0; i < count; i++) + pfm_intel_arch_pmc_desc[i].rsvd_msk &= ~(1ULL << 21); + } + + + /* + * mark unused generic counters as not available + */ + for (i = count ; i < PFM_IA_MAX_CNT; i++) { + pfm_intel_arch_pmd_desc[i].type = PFM_REG_NA; + pfm_intel_arch_pmc_desc[i].type = PFM_REG_NA; + } +} + +static void pfm_intel_arch_setup_fixed(unsigned int version, + unsigned int width, + unsigned int count) +{ + u64 rsvd, dfl; + unsigned int i; + + /* + * handle the fixed counters (if any): + * + * - ensure HW does not have more registers than hardcoded in the tables + * - adjust rsvd_msk to actual counter width + * - initialize enable_mask (list of PMC with start/stop capability) + * - mark unused hardcoded generic counters as unimplemented + */ + if (count >= PFM_IA_MAX_FCNT) { + printk(KERN_INFO "perfmon: Limiting number of 
fixed counters" + " to %u, HW supports %u", + PFM_IA_MAX_FCNT, count); + count = PFM_IA_MAX_FCNT; + } + /* + * adjust rsvd_msk for fixed counters based on actual width + */ + rsvd = ~((1ULL << width)-1); + for (i = 0; i < count; i++) + pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].rsvd_msk = rsvd; + + /* + * handle version new anythread bit (bit 2) + */ + if (version == 3) + rsvd = 1ULL << 3; + else + rsvd = 3ULL << 2; + + pfm_intel_arch_pmc_desc[16].rsvd_msk = 0; + for (i = 0; i < count; i++) + pfm_intel_arch_pmc_desc[16].rsvd_msk |= rsvd << (i<<2); + + /* + * mark unused fixed counters as unimplemented + * + * update the rsvd_msk, dfl_val in FIXED_CTRL: + * - rsvd_msk: set all 4 bits + * - dfl_val : clear all 4 bits + */ + dfl = pfm_intel_arch_pmc_desc[16].dfl_val; + rsvd = pfm_intel_arch_pmc_desc[16].rsvd_msk; + + for (i = count ; i < PFM_IA_MAX_FCNT; i++) { + pfm_intel_arch_pmd_desc[PFM_IA_FCNT_BASE+i].type = PFM_REG_NA; + rsvd |= 0xfULL << (i<<2); + dfl &= ~(0xfULL << (i<<2)); + } + + /* + * FIXED_CTR_CTRL unavailable when no fixed counters are defined + */ + if (!count) { + pfm_intel_arch_pmc_desc[16].type = PFM_REG_NA; + } else { + /* update rsvd_mask and dfl_val */ + pfm_intel_arch_pmc_desc[16].rsvd_msk = rsvd; + pfm_intel_arch_pmc_desc[16].dfl_val = dfl; + set_enable_mask(16); + } +} + +static int pfm_intel_arch_probe_pmu(void) +{ + union { + unsigned int val; + struct pmu_eax eax; + struct pmu_edx edx; + } eax, edx; + unsigned int ebx, ecx; + unsigned int width = 0; + + edx.val = 0; + + if (!(cpu_has_arch_perfmon || force)) { + PFM_INFO("no support for Intel architectural PMU"); + return -1; + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with lapic option"); + return -1; + } + + /* cpuid() call protected by cpu_has_arch_perfmon */ + cpuid(0xa, &eax.val, &ebx, &ecx, &edx.val); + + /* + * reject processors supported by perfmon_intel_core + * + * We need to do this explicitely to avoid depending + * on the link order in case, the modules are compiled as + * builtin. + * + * non Intel processors are rejected by cpu_has_arch_perfmon + */ + if (current_cpu_data.x86 == 6 && !force) { + switch (current_cpu_data.x86_model) { + case 15: /* Merom: use perfmon_intel_core */ + case 23: /* Penryn: use perfmon_intel_core */ + return -1; + default: + break; + } + } + + /* + * some 6/15 models have buggy BIOS + */ + if (eax.eax.version == 0 + && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { + PFM_INFO("buggy v2 BIOS, adjusting for 2 generic counters"); + eax.eax.version = 2; + eax.eax.num_cnt = 2; + eax.eax.cnt_width = 40; + } + + /* + * Intel Atom processors have a buggy firmware which does not report + * the correct number of fixed counters + */ + if (eax.eax.version == 3 && edx.edx.num_cnt < 3 + && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 28) { + PFM_INFO("buggy v3 BIOS, adjusting for 3 fixed counters"); + edx.edx.num_cnt = 3; + } + + /* + * some v2 BIOSes are incomplete + */ + if (eax.eax.version == 2 && !edx.edx.num_cnt) { + PFM_INFO("buggy v2 BIOS, adjusting for 3 fixed counters"); + edx.edx.num_cnt = 3; + edx.edx.cnt_width = 40; + } + + /* + * no fixed counters on earlier versions + */ + if (eax.eax.version < 2) { + edx.val = 0; + } else { + /* + * use the min value of both widths until we support + * variable width counters + */ + width = eax.eax.cnt_width < edx.edx.cnt_width ? 
+ eax.eax.cnt_width : edx.edx.cnt_width; + } + + PFM_INFO("detected architecural perfmon v%d", eax.eax.version); + PFM_INFO("num_gen=%d width=%d num_fixed=%d width=%d", + eax.eax.num_cnt, + eax.eax.cnt_width, + edx.edx.num_cnt, + edx.edx.cnt_width); + + + pfm_intel_arch_setup_generic(eax.eax.version, + width, + eax.eax.num_cnt); + + pfm_intel_arch_setup_fixed(eax.eax.version, + width, + edx.edx.num_cnt); + + if (force_nmi) + pfm_intel_arch_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + pfm_intel_arch_check_errata(); + + return 0; +} + +/** + * pfm_intel_arch_has_ovfls - check for pending overflow condition + * @ctx: context to work on + * + * detect if counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + */ +static int __kprobes pfm_intel_arch_has_ovfls(struct pfm_context *ctx) +{ + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + cnt_mask = ctx->regs.cnt_pmds; + num = ctx->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + + /* + * we can leverage the fact that we know the mapping + * to hardcode the MSR address and avoid accessing + * more cachelines + * + * We need to check cnt_mask because not all registers + * may be available. + */ + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(cnt_mask))) { + rdmsrl(pfm_intel_arch_pmd_desc[i].hw_addr, val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +static int pfm_intel_arch_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_pmds; + u64 val, wmask, ovfl_mask; + u32 i, count; + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(enable_mask), + max_enable); + + count = bitmap_weight(cast_ulp(used_mask), max_enable); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. + */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_pmds = ctx->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * we employ used_pmds because we also need to save + * and not just check for pending interrupts. 
+ * + * Must check for counting PMDs because of virtual PMDs + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(set->used_pmds))) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_pmds)))) { + if (!(val & wmask)) { + __set_bit(i, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + val = (set->pmds[i].value & ~ovfl_mask) + | (val & ovfl_mask); + } + set->pmds[i].value = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_intel_arch_quiesce - stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon related locks + */ +static void __kprobes pfm_intel_arch_quiesce(void) +{ + u16 i; + + /* + * PMC16 is the fixed control control register so it has a + * distinct MSR address + * + * We do not use the hw_addr field in the table to avoid touching + * too many cachelines + */ + for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) { + if (test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) { + if (i == 16) + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); + else + wrmsrl(MSR_P6_EVNTSEL0+i, 0); + } + } +} + +/** + * pfm_intel_arch_restore_pmcs - reload PMC registers + * @ctx: context to restore from + * @set: current event set + * + * optimized version of pfm_arch_restore_pmcs(). On architectural perfmon, + * we can afford to only restore the pmcs registers we use, because they + * are all independent from each other. + */ +static void pfm_intel_arch_restore_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 *mask; + u16 i, num; + + mask = set->used_pmcs; + num = set->nused_pmcs; + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, set->pmcs[i]); + num--; + } + } +} +/* + * Counters may have model-specific width. Yet the documentation says + * that only the lower 32 bits can be written to due to the specification + * of wrmsr. bits [32-(w-1)] are sign extensions of bit 31. Bits [w-63] must + * not be set (see rsvd_msk for PMDs). As such the effective width of a + * counter is 31 bits only regardless of what CPUID.0xa returns. 
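+ *
+ * Worked example: to take a PMI every 250000 events, the counter is
+ * written with bit 31 set and 2^31 - 250000 = 0x7ffc2f70 in the low 31
+ * bits; the sign extension propagates bit 31 through the upper counter
+ * bits, leaving e.g. a 40-bit counter exactly 250000 increments away
+ * from rolling over.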
+ * + * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18 + */ +static struct pfm_pmu_config pfm_intel_arch_pmu_conf = { + .pmu_name = "Intel architectural", + .pmd_desc = pfm_intel_arch_pmd_desc, + .counter_width = 31, + .num_pmc_entries = PFM_IA_MAX_PMCS, + .num_pmd_entries = PFM_IA_MAX_PMDS, + .pmc_desc = pfm_intel_arch_pmc_desc, + .probe_pmu = pfm_intel_arch_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmu_info = &pfm_intel_arch_pmu_info +}; + +static int __init pfm_intel_arch_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_intel_arch_pmu_conf); +} + +static void __exit pfm_intel_arch_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_intel_arch_pmu_conf); +} + +module_init(pfm_intel_arch_pmu_init_module); +module_exit(pfm_intel_arch_pmu_cleanup_module); diff --git a/arch/x86/perfmon/perfmon_intel_atom.c b/arch/x86/perfmon/perfmon_intel_atom.c new file mode 100644 index 0000000..9b94863 --- /dev/null +++ b/arch/x86/perfmon/perfmon_intel_atom.c @@ -0,0 +1,541 @@ +/* + * perfmon support for Intel Atom (architectural perfmon v3 + PEBS) + * + * Copyright (c) 2008 Google,Inc + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel Atom"); +MODULE_LICENSE("GPL"); + +static int force, force_nmi; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force, bool, 0600); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * + * RSVD: reserved bits are 1 + */ +#define PFM_ATOM_PMC_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_ATOM_PMC_VAL (1ULL<<20) +#define PFM_ATOM_NO64 (1ULL<<20) + +/* + * Atom counters are 40-bits. 40-bits can be read but ony 31 can be written + * to due to a limitation of wrmsr. Bits [[63-32] are sign extensions of bit 31. 
+ * Bits [63-40] must not be set + * + * See IA-32 Intel Architecture Software developer manual Vol 3B chapter 18 + */ +#define PFM_ATOM_PMD_WIDTH 31 +#define PFM_ATOM_PMD_RSVD ~((1ULL << 40)-1) + +static void pfm_intel_atom_acquire_pmu_percpu(void); +static void pfm_intel_atom_release_pmu_percpu(void); +static void pfm_intel_atom_restore_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set); +static int pfm_intel_atom_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); +static int pfm_intel_atom_has_ovfls(struct pfm_context *ctx); +static void __kprobes pfm_intel_atom_quiesce(void); + +struct pfm_arch_pmu_info pfm_intel_atom_pmu_info = { + .stop_save = pfm_intel_atom_stop_save, + .has_ovfls = pfm_intel_atom_has_ovfls, + .quiesce = pfm_intel_atom_quiesce, + .restore_pmcs = pfm_intel_atom_restore_pmcs, + .acquire_pmu_percpu = pfm_intel_atom_acquire_pmu_percpu, + .release_pmu_percpu = pfm_intel_atom_release_pmu_percpu + +}; + +#define PFM_ATOM_C(n) { \ + .type = PFM_REG_I64, \ + .desc = "PERFEVTSEL"#n, \ + .dfl_val = PFM_ATOM_PMC_VAL, \ + .rsvd_msk = PFM_ATOM_PMC_RSVD, \ + .no_emul64_msk = PFM_ATOM_NO64, \ + .hw_addr = MSR_P6_EVNTSEL0 + (n) \ + } + + +static struct pfm_regmap_desc pfm_intel_atom_pmc_desc[] = { +/* pmc0 */ PFM_ATOM_C(0), +/* pmc1 */ PFM_ATOM_C(1), +/* pmc2 */ PMX_NA, PMX_NA, +/* pmc4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc16 */ { .type = PFM_REG_I, + .desc = "FIXED_CTRL", + .dfl_val = 0x0000000000000888ULL, /* force PMI */ + .rsvd_msk = 0xfffffffffffffcccULL, /* 3 fixed counters defined */ + .no_emul64_msk = 0, + .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL + }, +/* pmc17 */{ .type = PFM_REG_W, + .desc = "PEBS_ENABLE", + .dfl_val = 0, + .rsvd_msk = 0xfffffffffffffffeULL, + .no_emul64_msk = 0, + .hw_addr = MSR_IA32_PEBS_ENABLE + } +}; +#define PFM_ATOM_MAX_PMCS ARRAY_SIZE(pfm_intel_atom_pmc_desc) + +#define PFM_ATOM_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PMC"#n, \ + .rsvd_msk = PFM_ATOM_PMD_RSVD, \ + .hw_addr = MSR_P6_PERFCTR0+n, \ + .dep_pmcs[0] = 1ULL << n \ + } + +#define PFM_ATOM_FD(n) \ + { .type = PFM_REG_C, \ + .desc = "FIXED_CTR"#n, \ + .rsvd_msk = PFM_ATOM_PMD_RSVD, \ + .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\ + .dep_pmcs[0] = 1ULL << 16 \ + } + +static struct pfm_regmap_desc pfm_intel_atom_pmd_desc[] = { +/* pmd0 */ PFM_ATOM_D(0), +/* pmd1 */ PFM_ATOM_D(1), +/* pmd2 */ PMX_NA, +/* pmd3 */ PMX_NA, +/* pmd4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmd8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmd12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmd16 */ PFM_ATOM_FD(0), +/* pmd17 */ PFM_ATOM_FD(1), +/* pmd18 */ PFM_ATOM_FD(2) +}; +#define PFM_ATOM_MAX_PMDS ARRAY_SIZE(pfm_intel_atom_pmd_desc) + +static struct pfm_pmu_config pfm_intel_atom_pmu_conf; + +static int pfm_intel_atom_probe_pmu(void) +{ + if (force) + goto doit; + + if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return -1; + + if (current_cpu_data.x86 != 6) + return -1; + + if (current_cpu_data.x86_model != 28) + return -1; +doit: + /* + * having APIC is mandatory, so disregard force option + */ + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with lapic option"); + return -1; + } + + PFM_INFO("detected Intel Atom PMU"); + + if (force_nmi) + pfm_intel_atom_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +/** + * pfm_intel_atom_has_ovfls - check for pending overflow condition + * @ctx: context to work on + * + * detect if counters have overflowed. 
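+ * Overflow state is taken from MSR_CORE_PERF_GLOBAL_STATUS: bits 0-1 cover the + * two generic counters, bits 32-34 the three fixed counters, and bit 62 the + * PEBS buffer overflow (checked together with generic counter 0 below).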
+ * return: + * 0 : no overflow + * 1 : at least one overflow + */ +static int __kprobes pfm_intel_atom_has_ovfls(struct pfm_context *ctx) +{ + struct pfm_regmap_desc *d; + u64 ovf; + + d = pfm_pmu_conf->pmd_desc; + /* + * read global overflow status register + * if sharing PMU, then not all bits are ours so we must + * check only the ones we actually use + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf); + + /* + * for pmd0, we also check PEBS overflow on bit 62 + */ + if ((d[0].type & PFM_REG_I) && (ovf & ((1ull << 62) | 1ull))) + return 1; + + if ((d[1].type & PFM_REG_I) && (ovf & 2ull)) + return 1; + + if ((d[16].type & PFM_REG_I) && (ovf & (1ull << 32))) + return 1; + + if ((d[17].type & PFM_REG_I) && (ovf & (2ull << 32))) + return 1; + + if ((d[18].type & PFM_REG_I) && (ovf & (4ull << 32))) + return 1; + + return 0; +} + +/** + * pfm_intel_atom_stop_save - stop monitoring, collect pending overflow, save pmds + * @ctx: context to work on + * @set: active set + * + * return: + * 1: caller needs to save pmds + * 0: caller does not need to save pmds, they have been saved by this call + */ +static int pfm_intel_atom_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set) +{ +#define PFM_ATOM_WMASK (1ULL << 31) +#define PFM_ATOM_OMASK ((1ULL << 31)-1) + u64 clear_ovf = 0; + u64 ovf, ovf2, val; + + /* + * read global overflow status register + * if sharing PMU, then not all bits are ours so we must + * check only the ones we actually use. + * + * XXX: Atom seems to have a bug with the stickiness of + * GLOBAL_STATUS. If we read GLOBAL_STATUS after we + * clear the generic counters, then their bits in + * GLOBAL_STATUS are cleared. This should not be the + * case according to the architected PMU. To work around + * the problem, we read GLOBAL_STATUS BEFORE we stop + * all monitoring. + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf); + + /* + * stop monitoring + */ + if (test_bit(0, cast_ulp(set->used_pmcs))) + wrmsrl(MSR_P6_EVNTSEL0, 0); + + if (test_bit(1, cast_ulp(set->used_pmcs))) + wrmsrl(MSR_P6_EVNTSEL1, 0); + + if (test_bit(16, cast_ulp(set->used_pmcs))) + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); + + if (test_bit(17, cast_ulp(set->used_pmcs))) + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); + + /* + * XXX: related to the bug mentioned above + * + * read GLOBAL_STATUS again to avoid a race condition + * with overflows happening after the first read and + * before the stop. That avoids missing overflows on + * the fixed counters and PEBS + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, ovf2); + ovf |= ovf2; + + /* + * if we already have a pending overflow condition, we simply + * return to take care of it first.
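+ * (returning 1 tells the caller that the PMD values still need to be + * saved at the upper level)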
+ */ + if (set->npend_ovfls) + return 1; + + /* + * check PMD 0,1,16,17,18 for overflow and save their value + */ + if (test_bit(0, cast_ulp(set->used_pmds))) { + rdmsrl(MSR_P6_PERFCTR0, val); + if (ovf & ((1ull<<62)|1ull)) { + __set_bit(0, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + clear_ovf = (1ull << 62) | 1ull; + } + val = (set->pmds[0].value & ~PFM_ATOM_OMASK) + | (val & PFM_ATOM_OMASK); + set->pmds[0].value = val; + } + + if (test_bit(1, cast_ulp(set->used_pmds))) { + rdmsrl(MSR_P6_PERFCTR1, val); + if (ovf & 2ull) { + __set_bit(1, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + clear_ovf |= 2ull; + } + val = (set->pmds[1].value & ~PFM_ATOM_OMASK) + | (val & PFM_ATOM_OMASK); + set->pmds[1].value = val; + } + + if (test_bit(16, cast_ulp(set->used_pmds))) { + rdmsrl(MSR_CORE_PERF_FIXED_CTR0, val); + if (ovf & (1ull << 32)) { + __set_bit(16, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + clear_ovf |= 1ull << 32; + } + val = (set->pmds[16].value & ~PFM_ATOM_OMASK) + | (val & PFM_ATOM_OMASK); + set->pmds[16].value = val; + } + + if (test_bit(17, cast_ulp(set->used_pmds))) { + rdmsrl(MSR_CORE_PERF_FIXED_CTR0+1, val); + if (ovf & (2ull << 32)) { + __set_bit(17, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + clear_ovf |= 2ull << 32; + } + val = (set->pmds[17].value & ~PFM_ATOM_OMASK) + | (val & PFM_ATOM_OMASK); + set->pmds[17].value = val; + } + + if (test_bit(18, cast_ulp(set->used_pmds))) { + rdmsrl(MSR_CORE_PERF_FIXED_CTR0+2, val); + if (ovf & (4ull << 32)) { + __set_bit(18, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + clear_ovf |= 4ull << 32; + } + val = (set->pmds[18].value & ~PFM_ATOM_OMASK) + | (val & PFM_ATOM_OMASK); + set->pmds[18].value = val; + } + + if (clear_ovf) + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, clear_ovf); + + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_intel_atom_quiesce - stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon related locks + */ +static void __kprobes pfm_intel_atom_quiesce(void) +{ + /* + * quiesce PMU by clearing available registers that have + * the start/stop capability + */ + if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_P6_EVNTSEL0, 0); + + if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_P6_EVNTSEL1, 0); + + if (test_bit(16, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); + + if (test_bit(17, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} + +/** + * pfm_intel_atom_restore_pmcs - reload PMC registers + * @ctx: context to restore from + * @set: current event set + * + * restores pmcs and also PEBS Data Save area pointer + */ +static void pfm_intel_atom_restore_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + u64 clear_ovf = 0; + + ctx_arch = pfm_ctx_arch(ctx); + /* + * must restore DS pointer before restoring PMCs + * as this can potentially reactivate monitoring + */ + if (ctx_arch->flags.use_ds) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area); + + if (test_bit(0, cast_ulp(set->used_pmcs))) { + wrmsrl(MSR_P6_EVNTSEL0, set->pmcs[0]); + clear_ovf = 1ull; + } + + if (test_bit(1, cast_ulp(set->used_pmcs))) { + wrmsrl(MSR_P6_EVNTSEL1, set->pmcs[1]); + clear_ovf |= 2ull; + } + + if (test_bit(16, cast_ulp(set->used_pmcs))) { + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, set->pmcs[16]); + clear_ovf |= 7ull << 32; + } + + if 
(test_bit(17, cast_ulp(set->used_pmcs))) { + wrmsrl(MSR_IA32_PEBS_ENABLE, set->pmcs[17]); + clear_ovf |= 1ull << 62; + } + + if (clear_ovf) + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, clear_ovf); +} + +static int pfm_intel_atom_pmc17_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + ctx_arch = pfm_ctx_arch(ctx); + + /* + * if the user activates PEBS_ENABLE, then we need to have a valid + * DS Area setup. This only happens when the PEBS sampling format is + * used, in which case PFM_X86_USE_PEBS is set. We must reject all other + * requests. + * + * Otherwise we may pick up stale MSR_IA32_DS_AREA values. It appears + * that a value of 0 for this MSR does crash the system with + * PEBS_ENABLE=1. + */ + if (!ctx_arch->flags.use_pebs && req->reg_value) { + PFM_DBG("pmc17 usable only with a PEBS sampling format"); + return -EINVAL; + } + return 0; +} + +DEFINE_PER_CPU(u64, saved_global_ctrl); + +/** + * pfm_intel_atom_acquire_pmu_percpu - acquire PMU resource per CPU + * + * For Atom, it is necessary to enable all available + * registers. The firmware rightfully has the fixed counters + * disabled for backward compatibility with architectural perfmon + * v1 + * + * This function is invoked on each online CPU + */ +static void pfm_intel_atom_acquire_pmu_percpu(void) +{ + struct pfm_regmap_desc *d; + u64 mask = 0; + unsigned int i; + + /* + * build a bitmask of the registers that are available to + * us. In some cases, there may be fewer registers than + * what Atom supports due to sharing with other kernel + * subsystems, such as NMI + */ + d = pfm_pmu_conf->pmd_desc; + for (i = 0; i < 16; i++) { + if ((d[i].type & PFM_REG_I) == 0) + continue; + mask |= 1ull << i; + } + for (i = 16; i < PFM_ATOM_MAX_PMDS; i++) { + if ((d[i].type & PFM_REG_I) == 0) + continue; + mask |= 1ull << (32+i-16); + } + + /* + * keep a local copy of the current MSR_CORE_PERF_GLOBAL_CTRL + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl)); + + PFM_DBG("global=0x%llx set to 0x%llx", + __get_cpu_var(saved_global_ctrl), + mask); + + /* + * enable all registers + * + * No need to quiesce PMU.
If there is a overflow, it will be + * treated as spurious by the handler + */ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, mask); +} + +/** + * pfm_intel_atom_release_pmu_percpu - release PMU resource per CPU + * + * For Atom, we restore MSR_CORE_PERF_GLOBAL_CTRL to its orginal value + */ +static void pfm_intel_atom_release_pmu_percpu(void) +{ + PFM_DBG("global_ctrl restored to 0x%llx\n", + __get_cpu_var(saved_global_ctrl)); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, __get_cpu_var(saved_global_ctrl)); +} + +static struct pfm_pmu_config pfm_intel_atom_pmu_conf = { + .pmu_name = "Intel Atom", + .pmd_desc = pfm_intel_atom_pmd_desc, + .counter_width = PFM_ATOM_PMD_WIDTH, + .num_pmc_entries = PFM_ATOM_MAX_PMCS, + .num_pmd_entries = PFM_ATOM_MAX_PMDS, + .pmc_desc = pfm_intel_atom_pmc_desc, + .probe_pmu = pfm_intel_atom_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmc_write_check = pfm_intel_atom_pmc17_check, + .pmu_info = &pfm_intel_atom_pmu_info +}; + +static int __init pfm_intel_atom_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_intel_atom_pmu_conf); +} + +static void __exit pfm_intel_atom_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_intel_atom_pmu_conf); +} + +module_init(pfm_intel_atom_pmu_init_module); +module_exit(pfm_intel_atom_pmu_cleanup_module); diff --git a/arch/x86/perfmon/perfmon_intel_core.c b/arch/x86/perfmon/perfmon_intel_core.c new file mode 100644 index 0000000..fddc436 --- /dev/null +++ b/arch/x86/perfmon/perfmon_intel_core.c @@ -0,0 +1,449 @@ +/* + * This file contains the Intel Core PMU registers description tables. + * Intel Core-based processors support architectural perfmon v2 + PEBS + * + * Copyright (c) 2006-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel Core"); +MODULE_LICENSE("GPL"); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * + * RSVD: reserved bits must be 1 + */ +#define PFM_CORE_PMC_RSVD ((~((1ULL<<32)-1)) \ + | (1ULL<<20) \ + | (1ULL<<21)) + +/* + * Core counters are 40-bits + */ +#define PFM_CORE_CTR_RSVD (~((1ULL<<40)-1)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_CORE_PMC_VAL (1ULL<<20) +#define PFM_CORE_NO64 (1ULL<<20) + +#define PFM_CORE_NA { .reg_type = PFM_REGT_NA} + +#define PFM_CORE_CA(m, c, t) \ + { \ + .addrs[0] = m, \ + .ctr = c, \ + .reg_type = t \ + } + +struct pfm_ds_area_intel_core { + u64 bts_buf_base; + u64 bts_index; + u64 bts_abs_max; + u64 bts_intr_thres; + u64 pebs_buf_base; + u64 pebs_index; + u64 pebs_abs_max; + u64 pebs_intr_thres; + u64 pebs_cnt_reset; +}; + +static void pfm_core_restore_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set); +static int pfm_core_has_ovfls(struct pfm_context *ctx); +static int pfm_core_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); +static void __kprobes pfm_core_quiesce(void); + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; + +struct pfm_arch_pmu_info pfm_core_pmu_info = { + .stop_save = pfm_core_stop_save, + .has_ovfls = pfm_core_has_ovfls, + .quiesce = pfm_core_quiesce, + .restore_pmcs = pfm_core_restore_pmcs +}; + +static struct pfm_regmap_desc pfm_core_pmc_desc[] = { +/* pmc0 */ { + .type = PFM_REG_I64, + 
.desc = "PERFEVTSEL0", + .dfl_val = PFM_CORE_PMC_VAL, + .rsvd_msk = PFM_CORE_PMC_RSVD, + .no_emul64_msk = PFM_CORE_NO64, + .hw_addr = MSR_P6_EVNTSEL0 + }, +/* pmc1 */ { + .type = PFM_REG_I64, + .desc = "PERFEVTSEL1", + .dfl_val = PFM_CORE_PMC_VAL, + .rsvd_msk = PFM_CORE_PMC_RSVD, + .no_emul64_msk = PFM_CORE_NO64, + .hw_addr = MSR_P6_EVNTSEL1 + }, +/* pmc2 */ PMX_NA, PMX_NA, +/* pmc4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmc16 */ { .type = PFM_REG_I, + .desc = "FIXED_CTRL", + .dfl_val = 0x888ULL, + .rsvd_msk = 0xfffffffffffffcccULL, + .no_emul64_msk = 0, + .hw_addr = MSR_CORE_PERF_FIXED_CTR_CTRL + }, +/* pmc17 */ { .type = PFM_REG_W, + .desc = "PEBS_ENABLE", + .dfl_val = 0, + .rsvd_msk = 0xfffffffffffffffeULL, + .no_emul64_msk = 0, + .hw_addr = MSR_IA32_PEBS_ENABLE + } +}; + +#define PFM_CORE_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PMC"#n, \ + .rsvd_msk = PFM_CORE_CTR_RSVD, \ + .hw_addr = MSR_P6_PERFCTR0+n, \ + .dep_pmcs[0] = 1ULL << n \ + } + +#define PFM_CORE_FD(n) \ + { .type = PFM_REG_C, \ + .desc = "FIXED_CTR"#n, \ + .rsvd_msk = PFM_CORE_CTR_RSVD, \ + .hw_addr = MSR_CORE_PERF_FIXED_CTR0+n,\ + .dep_pmcs[0] = 1ULL << 16 \ + } + +static struct pfm_regmap_desc pfm_core_pmd_desc[] = { +/* pmd0 */ PFM_CORE_D(0), +/* pmd1 */ PFM_CORE_D(1), +/* pmd2 */ PMX_NA, PMX_NA, +/* pmd4 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmd8 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmd12 */ PMX_NA, PMX_NA, PMX_NA, PMX_NA, +/* pmd16 */ PFM_CORE_FD(0), +/* pmd17 */ PFM_CORE_FD(1), +/* pmd18 */ PFM_CORE_FD(2) +}; +#define PFM_CORE_NUM_PMCS ARRAY_SIZE(pfm_core_pmc_desc) +#define PFM_CORE_NUM_PMDS ARRAY_SIZE(pfm_core_pmd_desc) + +static struct pfm_pmu_config pfm_core_pmu_conf; + +static int pfm_core_probe_pmu(void) +{ + /* + * Check for an Intel Core processor explicitly. + * Checking for cpu_has_perfmon is not enough as this + * matches Intel Core Duo/Core Solo but neither supports + * PEBS. + * + * Intel Core = arch perfmon v2 + PEBS + */ + if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not an Intel processor"); + return -1; + } + + if (current_cpu_data.x86 != 6) + return -1; + + switch (current_cpu_data.x86_model) { + case 15: /* Merom */ + break; + case 23: /* Penryn */ + break; + case 29: /* Dunnington */ + break; + default: + return -1; + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, unsupported"); + return -1; + } + + PFM_INFO("nmi_watchdog=%d nmi_active=%d force_nmi=%d", + nmi_watchdog, atomic_read(&nmi_active), force_nmi); + + /* + * Intel Core processors implement DS and PEBS, no need to check + */ + if (cpu_has_pebs) + PFM_INFO("PEBS supported, enabled"); + + /* + * initialize the bitmask of registers with enable capability, i.e., + * start/stop. This is used to restrict the number of registers to + * touch on start/stop + * max_enable: number of bits to scan in enable_mask = highest + 1 + * + * may be adjusted in pfm_arch_pmu_acquire() + */ + __set_bit(0, cast_ulp(enable_mask)); + __set_bit(1, cast_ulp(enable_mask)); + __set_bit(16, cast_ulp(enable_mask)); + __set_bit(17, cast_ulp(enable_mask)); + max_enable = 17+1; + + if (force_nmi) + pfm_core_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +static int pfm_core_pmc17_check(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req) +{ + struct pfm_arch_context *ctx_arch; + ctx_arch = pfm_ctx_arch(ctx); + + /* + * if the user activates PEBS_ENABLE, then we need to have a valid + * DS Area setup.
This only happens when the PEBS sampling format is + * used, in which case PFM_X86_USE_PEBS is set. We must reject all other + * requests. + * + * Otherwise we may pick up stale MSR_IA32_DS_AREA values. It appears + * that a value of 0 for this MSR does crash the system with + * PEBS_ENABLE=1. + */ + if (!ctx_arch->flags.use_pebs && req->reg_value) { + PFM_DBG("pmc17 usable only with a PEBS sampling format"); + return -EINVAL; + } + return 0; +} + +/* + * detect if counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + * + * used by Intel Core-based processors + */ +static int __kprobes pfm_core_has_ovfls(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + pmu_info = &pfm_core_pmu_info; + cnt_mask = ctx->regs.cnt_pmds; + num = ctx->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(cnt_mask))) { + rdmsrl(pfm_core_pmd_desc[i].hw_addr, val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +static int pfm_core_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_ds_area_intel_core *ds = NULL; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_mask; + u64 val, wmask, ovfl_mask; + u16 count, has_ovfl; + u16 i, pebs_idx = ~0; + + ctx_arch = pfm_ctx_arch(ctx); + + wmask = 1ULL << pfm_pmu_conf->counter_width; + + /* + * used enable pmc bitmask + */ + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(enable_mask), + max_enable); + + count = bitmap_weight(cast_ulp(used_mask), max_enable); + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, 0); + count--; + } + } + /* + * if we already have a pending overflow condition, we simply + * return to take care of it first. + */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_mask = ctx->regs.cnt_pmds; + + if (ctx_arch->flags.use_pebs) { + ds = ctx_arch->ds_area; + pebs_idx = 0; /* PMC0/PMD0 */ + PFM_DBG("ds=%p pebs_idx=0x%llx thres=0x%llx", + ds, + (unsigned long long)ds->pebs_index, + (unsigned long long)ds->pebs_intr_thres); + } + + /* + * Check for pending overflows and save PMDs (combo) + * We employ used_pmds and not intr_pmds because we must + * also save the PMD values, not just check them. + * Must check for counting PMDs because of virtual PMDs + * + * XXX: should use the ovf_status register instead, yet + * we would have to check if NMI is used and fall back + * to individual pmd inspection.
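+ * Note: for the PEBS-managed counter (pebs_idx), overflow is detected by the + * DS area index reaching the interrupt threshold rather than by the counter + * value (see the has_ovfl test below).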
+ */ + count = set->nused_pmds; + + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(set->used_pmds))) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_mask)))) { + if (i == pebs_idx) + has_ovfl = (ds->pebs_index >= + ds->pebs_intr_thres); + else + has_ovfl = !(val & wmask); + if (has_ovfl) { + __set_bit(i, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + val = (set->pmds[i].value & ~ovfl_mask) + | (val & ovfl_mask); + } + set->pmds[i].value = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_core_quiesce - stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon related locks + */ +static void __kprobes pfm_core_quiesce(void) +{ + /* + * quiesce PMU by clearing available registers that have + * the start/stop capability + */ + if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_P6_EVNTSEL0, 0); + if (test_bit(1, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_P6_EVNTSEL1, 0); + if (test_bit(16, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_CORE_PERF_FIXED_CTR_CTRL, 0); + if (test_bit(17, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} +/** + * pfm_core_restore_pmcs - reload PMC registers + * @ctx: context to restore from + * @set: current event set + * + * optimized version of pfm_arch_restore_pmcs(). On Core, we can + * afford to only restore the pmcs registers we use, because they are + * all independent from each other. + */ +static void pfm_core_restore_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + struct pfm_arch_context *ctx_arch; + u64 *mask; + u16 i, num; + + ctx_arch = pfm_ctx_arch(ctx); + + /* + * must restore DS pointer before restoring PMCs + * as this can potentially reactivate monitoring + */ + if (ctx_arch->flags.use_ds) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area); + + mask = set->used_pmcs; + num = set->nused_pmcs; + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + wrmsrl(pfm_pmu_conf->pmc_desc[i].hw_addr, set->pmcs[i]); + num--; + } + } +} + +/* + * Counters may have model-specific width which can be probed using + * the CPUID.0xa leaf. Yet, the documentation says: " + * In the initial implementation, only the read bit width is reported + * by CPUID, write operations are limited to the low 32 bits. + * Bits [w-32] are sign extensions of bit 31. As such the effective width + * of a counter is 31 bits only. 
+ */ +static struct pfm_pmu_config pfm_core_pmu_conf = { + .pmu_name = "Intel Core", + .pmd_desc = pfm_core_pmd_desc, + .counter_width = 31, + .num_pmc_entries = PFM_CORE_NUM_PMCS, + .num_pmd_entries = PFM_CORE_NUM_PMDS, + .pmc_desc = pfm_core_pmc_desc, + .probe_pmu = pfm_core_probe_pmu, + .version = "1.2", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmu_info = &pfm_core_pmu_info, + .pmc_write_check = pfm_core_pmc17_check +}; + +static int __init pfm_core_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_core_pmu_conf); +} + +static void __exit pfm_core_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_core_pmu_conf); +} + +module_init(pfm_core_pmu_init_module); +module_exit(pfm_core_pmu_cleanup_module); diff --git a/arch/x86/perfmon/perfmon_p4.c b/arch/x86/perfmon/perfmon_p4.c new file mode 100644 index 0000000..1ffcf3c --- /dev/null +++ b/arch/x86/perfmon/perfmon_p4.c @@ -0,0 +1,913 @@ +/* + * This file contains the P4/Xeon PMU register description tables + * for both 32 and 64 bit modes. + * + * Copyright (c) 2005 Intel Corporation + * Contributed by Bryan Wilkerson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Bryan Wilkerson "); +MODULE_DESCRIPTION("P4/Xeon/EM64T PMU description table"); +MODULE_LICENSE("GPL"); + +static int force; +MODULE_PARM_DESC(force, "bool: force module to load succesfully"); +module_param(force, bool, 0600); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * For extended register information in addition to address that is used + * at runtime to figure out the mapping of reg addresses to logical procs + * and association of registers to hardware specific features + */ +struct pfm_p4_regmap { + /* + * one each for the logical CPUs. Index 0 corresponds to T0 and + * index 1 corresponds to T1. Index 1 can be zero if no T1 + * complement reg exists. + */ + unsigned long addrs[2]; /* 2 = number of threads */ + unsigned int ctr; /* for CCCR/PERFEVTSEL, associated counter */ + unsigned int reg_type; +}; + +/* + * bitmask for pfm_p4_regmap.reg_type + */ +#define PFM_REGT_NA 0x0000 /* not available */ +#define PFM_REGT_EN 0x0001 /* has enable bit (cleared on ctxsw) */ +#define PFM_REGT_ESCR 0x0002 /* P4: ESCR */ +#define PFM_REGT_CCCR 0x0004 /* P4: CCCR */ +#define PFM_REGT_PEBS 0x0010 /* PEBS related */ +#define PFM_REGT_NOHT 0x0020 /* unavailable with HT */ +#define PFM_REGT_CTR 0x0040 /* counter */ + +/* + * architecture specific context extension. 
+ * located at: (struct pfm_arch_context *)(ctx+1) + */ +struct pfm_arch_p4_context { + u32 npend_ovfls; /* P4 NMI #pending ovfls */ + u32 reserved; + u64 povfl_pmds[PFM_PMD_BV]; /* P4 NMI overflowed counters */ + u64 saved_cccrs[PFM_MAX_PMCS]; +}; + +/* + * ESCR reserved bitmask: + * - bits 31 - 63 reserved + * - T1_OS and T1_USR bits are reserved - set depending on logical proc + * user mode application should use T0_OS and T0_USR to indicate + * RSVD: reserved bits must be 1 + */ +#define PFM_ESCR_RSVD ~0x000000007ffffffcULL + +/* + * CCCR default value: + * - OVF_PMI_T0=1 (bit 26) + * - OVF_PMI_T1=0 (bit 27) (set if necessary in pfm_write_reg()) + * - all other bits are zero + * + * OVF_PMI is forced to zero if PFM_REGFL_NO_EMUL64 is set on CCCR + */ +#define PFM_CCCR_DFL (1ULL<<26) | (3ULL<<16) + +/* + * CCCR reserved fields: + * - bits 0-11, 25-29, 31-63 + * - OVF_PMI (26-27), override with REGFL_NO_EMUL64 + * + * RSVD: reserved bits must be 1 + */ +#define PFM_CCCR_RSVD ~((0xfull<<12) \ + | (0x7full<<18) \ + | (0x1ull<<30)) + +#define PFM_P4_NO64 (3ULL<<26) /* use 3 even in non HT mode */ + +#define PEBS_PMD 8 /* thread0: IQ_CTR4, thread1: IQ_CTR5 */ + +/* + * With HyperThreading enabled: + * + * The ESCRs and CCCRs are divided in half with the top half + * belonging to logical processor 0 and the bottom half going to + * logical processor 1. Thus only half of the PMU resources are + * accessible to applications. + * + * PEBS is not available due to the fact that: + * - MSR_PEBS_MATRIX_VERT is shared between the threads + * - IA32_PEBS_ENABLE is shared between the threads + * + * With HyperThreading disabled: + * + * The full set of PMU resources is exposed to applications. + * + * The mapping is chosen such that PMCxx -> MSR is the same + * in HT and non HT mode, if register is present in HT mode. 
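+ * Registers tagged PFM_REGT_NOHT in the tables below are marked unavailable + * at probe time when HyperThreading is enabled (see pfm_p4_probe_pmu()).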
+ * + */ +#define PFM_REGT_NHTESCR (PFM_REGT_ESCR|PFM_REGT_NOHT) +#define PFM_REGT_NHTCCCR (PFM_REGT_CCCR|PFM_REGT_NOHT|PFM_REGT_EN) +#define PFM_REGT_NHTPEBS (PFM_REGT_PEBS|PFM_REGT_NOHT|PFM_REGT_EN) +#define PFM_REGT_NHTCTR (PFM_REGT_CTR|PFM_REGT_NOHT) +#define PFM_REGT_ENAC (PFM_REGT_CCCR|PFM_REGT_EN) + +static void pfm_p4_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value); +static void pfm_p4_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value); +static u64 pfm_p4_read_pmd(struct pfm_context *ctx, unsigned int cnum); +static u64 pfm_p4_read_pmc(struct pfm_context *ctx, unsigned int cnum); +static int pfm_p4_create_context(struct pfm_context *ctx, u32 ctx_flags); +static void pfm_p4_free_context(struct pfm_context *ctx); +static int pfm_p4_has_ovfls(struct pfm_context *ctx); +static int pfm_p4_stop_save(struct pfm_context *ctx, struct pfm_event_set *set); +static void pfm_p4_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +static void pfm_p4_nmi_copy_state(struct pfm_context *ctx); +static void __kprobes pfm_p4_quiesce(void); + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; + +static struct pfm_p4_regmap pmc_addrs[PFM_MAX_PMCS] = { + /*pmc 0 */ {{MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1}, 0, PFM_REGT_ESCR}, /* BPU_ESCR0,1 */ + /*pmc 1 */ {{MSR_P4_IS_ESCR0, MSR_P4_IS_ESCR1}, 0, PFM_REGT_ESCR}, /* IS_ESCR0,1 */ + /*pmc 2 */ {{MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1}, 0, PFM_REGT_ESCR}, /* MOB_ESCR0,1 */ + /*pmc 3 */ {{MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1}, 0, PFM_REGT_ESCR}, /* ITLB_ESCR0,1 */ + /*pmc 4 */ {{MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1}, 0, PFM_REGT_ESCR}, /* PMH_ESCR0,1 */ + /*pmc 5 */ {{MSR_P4_IX_ESCR0, MSR_P4_IX_ESCR1}, 0, PFM_REGT_ESCR}, /* IX_ESCR0,1 */ + /*pmc 6 */ {{MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1}, 0, PFM_REGT_ESCR}, /* FSB_ESCR0,1 */ + /*pmc 7 */ {{MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1}, 0, PFM_REGT_ESCR}, /* BSU_ESCR0,1 */ + /*pmc 8 */ {{MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1}, 0, PFM_REGT_ESCR}, /* MS_ESCR0,1 */ + /*pmc 9 */ {{MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1}, 0, PFM_REGT_ESCR}, /* TC_ESCR0,1 */ + /*pmc 10*/ {{MSR_P4_TBPU_ESCR0, MSR_P4_TBPU_ESCR1}, 0, PFM_REGT_ESCR}, /* TBPU_ESCR0,1 */ + /*pmc 11*/ {{MSR_P4_FLAME_ESCR0, MSR_P4_FLAME_ESCR1}, 0, PFM_REGT_ESCR}, /* FLAME_ESCR0,1 */ + /*pmc 12*/ {{MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1}, 0, PFM_REGT_ESCR}, /* FIRM_ESCR0,1 */ + /*pmc 13*/ {{MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1}, 0, PFM_REGT_ESCR}, /* SAAT_ESCR0,1 */ + /*pmc 14*/ {{MSR_P4_U2L_ESCR0, MSR_P4_U2L_ESCR1}, 0, PFM_REGT_ESCR}, /* U2L_ESCR0,1 */ + /*pmc 15*/ {{MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1}, 0, PFM_REGT_ESCR}, /* DAC_ESCR0,1 */ + /*pmc 16*/ {{MSR_P4_IQ_ESCR0, MSR_P4_IQ_ESCR1}, 0, PFM_REGT_ESCR}, /* IQ_ESCR0,1 (only model 1 and 2) */ + /*pmc 17*/ {{MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1}, 0, PFM_REGT_ESCR}, /* ALF_ESCR0,1 */ + /*pmc 18*/ {{MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1}, 0, PFM_REGT_ESCR}, /* RAT_ESCR0,1 */ + /*pmc 19*/ {{MSR_P4_SSU_ESCR0, 0}, 0, PFM_REGT_ESCR}, /* SSU_ESCR0 */ + /*pmc 20*/ {{MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1}, 0, PFM_REGT_ESCR}, /* CRU_ESCR0,1 */ + /*pmc 21*/ {{MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3}, 0, PFM_REGT_ESCR}, /* CRU_ESCR2,3 */ + /*pmc 22*/ {{MSR_P4_CRU_ESCR4, MSR_P4_CRU_ESCR5}, 0, PFM_REGT_ESCR}, /* CRU_ESCR4,5 */ + + /*pmc 23*/ {{MSR_P4_BPU_CCCR0, MSR_P4_BPU_CCCR2}, 0, PFM_REGT_ENAC}, /* BPU_CCCR0,2 */ + /*pmc 24*/ {{MSR_P4_BPU_CCCR1, MSR_P4_BPU_CCCR3}, 1, PFM_REGT_ENAC}, /* BPU_CCCR1,3 */ + /*pmc 25*/ {{MSR_P4_MS_CCCR0, MSR_P4_MS_CCCR2}, 2, PFM_REGT_ENAC}, /* MS_CCCR0,2 */ + /*pmc 26*/ 
{{MSR_P4_MS_CCCR1, MSR_P4_MS_CCCR3}, 3, PFM_REGT_ENAC}, /* MS_CCCR1,3 */ + /*pmc 27*/ {{MSR_P4_FLAME_CCCR0, MSR_P4_FLAME_CCCR2}, 4, PFM_REGT_ENAC}, /* FLAME_CCCR0,2 */ + /*pmc 28*/ {{MSR_P4_FLAME_CCCR1, MSR_P4_FLAME_CCCR3}, 5, PFM_REGT_ENAC}, /* FLAME_CCCR1,3 */ + /*pmc 29*/ {{MSR_P4_IQ_CCCR0, MSR_P4_IQ_CCCR2}, 6, PFM_REGT_ENAC}, /* IQ_CCCR0,2 */ + /*pmc 30*/ {{MSR_P4_IQ_CCCR1, MSR_P4_IQ_CCCR3}, 7, PFM_REGT_ENAC}, /* IQ_CCCR1,3 */ + /*pmc 31*/ {{MSR_P4_IQ_CCCR4, MSR_P4_IQ_CCCR5}, 8, PFM_REGT_ENAC}, /* IQ_CCCR4,5 */ + /* non HT extensions */ + /*pmc 32*/ {{MSR_P4_BPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BPU_ESCR1 */ + /*pmc 33*/ {{MSR_P4_IS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IS_ESCR1 */ + /*pmc 34*/ {{MSR_P4_MOB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MOB_ESCR1 */ + /*pmc 35*/ {{MSR_P4_ITLB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ITLB_ESCR1 */ + /*pmc 36*/ {{MSR_P4_PMH_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* PMH_ESCR1 */ + /*pmc 37*/ {{MSR_P4_IX_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IX_ESCR1 */ + /*pmc 38*/ {{MSR_P4_FSB_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FSB_ESCR1 */ + /*pmc 39*/ {{MSR_P4_BSU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* BSU_ESCR1 */ + /*pmc 40*/ {{MSR_P4_MS_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* MS_ESCR1 */ + /*pmc 41*/ {{MSR_P4_TC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TC_ESCR1 */ + /*pmc 42*/ {{MSR_P4_TBPU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* TBPU_ESCR1 */ + /*pmc 43*/ {{MSR_P4_FLAME_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FLAME_ESCR1 */ + /*pmc 44*/ {{MSR_P4_FIRM_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* FIRM_ESCR1 */ + /*pmc 45*/ {{MSR_P4_SAAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* SAAT_ESCR1 */ + /*pmc 46*/ {{MSR_P4_U2L_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* U2L_ESCR1 */ + /*pmc 47*/ {{MSR_P4_DAC_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* DAC_ESCR1 */ + /*pmc 48*/ {{MSR_P4_IQ_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* IQ_ESCR1 (only model 1 and 2) */ + /*pmc 49*/ {{MSR_P4_ALF_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* ALF_ESCR1 */ + /*pmc 50*/ {{MSR_P4_RAT_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* RAT_ESCR1 */ + /*pmc 51*/ {{MSR_P4_CRU_ESCR1, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR1 */ + /*pmc 52*/ {{MSR_P4_CRU_ESCR3, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR3 */ + /*pmc 53*/ {{MSR_P4_CRU_ESCR5, 0}, 0, PFM_REGT_NHTESCR}, /* CRU_ESCR5 */ + /*pmc 54*/ {{MSR_P4_BPU_CCCR1, 0}, 9, PFM_REGT_NHTCCCR}, /* BPU_CCCR1 */ + /*pmc 55*/ {{MSR_P4_BPU_CCCR3, 0}, 10, PFM_REGT_NHTCCCR}, /* BPU_CCCR3 */ + /*pmc 56*/ {{MSR_P4_MS_CCCR1, 0}, 11, PFM_REGT_NHTCCCR}, /* MS_CCCR1 */ + /*pmc 57*/ {{MSR_P4_MS_CCCR3, 0}, 12, PFM_REGT_NHTCCCR}, /* MS_CCCR3 */ + /*pmc 58*/ {{MSR_P4_FLAME_CCCR1, 0}, 13, PFM_REGT_NHTCCCR}, /* FLAME_CCCR1 */ + /*pmc 59*/ {{MSR_P4_FLAME_CCCR3, 0}, 14, PFM_REGT_NHTCCCR}, /* FLAME_CCCR3 */ + /*pmc 60*/ {{MSR_P4_IQ_CCCR2, 0}, 15, PFM_REGT_NHTCCCR}, /* IQ_CCCR2 */ + /*pmc 61*/ {{MSR_P4_IQ_CCCR3, 0}, 16, PFM_REGT_NHTCCCR}, /* IQ_CCCR3 */ + /*pmc 62*/ {{MSR_P4_IQ_CCCR5, 0}, 17, PFM_REGT_NHTCCCR}, /* IQ_CCCR5 */ + /*pmc 63*/ {{0x3f2, 0}, 0, PFM_REGT_NHTPEBS},/* PEBS_MATRIX_VERT */ + /*pmc 64*/ {{0x3f1, 0}, 0, PFM_REGT_NHTPEBS} /* PEBS_ENABLE */ +}; + +static struct pfm_p4_regmap pmd_addrs[PFM_MAX_PMDS] = { + /*pmd 0 */ {{MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_PERFCTR2}, 0, PFM_REGT_CTR}, /* BPU_CTR0,2 */ + /*pmd 1 */ {{MSR_P4_BPU_PERFCTR1, MSR_P4_BPU_PERFCTR3}, 0, PFM_REGT_CTR}, /* BPU_CTR1,3 */ + /*pmd 2 */ {{MSR_P4_MS_PERFCTR0, MSR_P4_MS_PERFCTR2}, 0, PFM_REGT_CTR}, /* MS_CTR0,2 */ + /*pmd 3 */ {{MSR_P4_MS_PERFCTR1, MSR_P4_MS_PERFCTR3}, 0, PFM_REGT_CTR}, /* MS_CTR1,3 */ + /*pmd 4 */ {{MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_PERFCTR2}, 0, PFM_REGT_CTR}, /* 
FLAME_CTR0,2 */ + /*pmd 5 */ {{MSR_P4_FLAME_PERFCTR1, MSR_P4_FLAME_PERFCTR3}, 0, PFM_REGT_CTR}, /* FLAME_CTR1,3 */ + /*pmd 6 */ {{MSR_P4_IQ_PERFCTR0, MSR_P4_IQ_PERFCTR2}, 0, PFM_REGT_CTR}, /* IQ_CTR0,2 */ + /*pmd 7 */ {{MSR_P4_IQ_PERFCTR1, MSR_P4_IQ_PERFCTR3}, 0, PFM_REGT_CTR}, /* IQ_CTR1,3 */ + /*pmd 8 */ {{MSR_P4_IQ_PERFCTR4, MSR_P4_IQ_PERFCTR5}, 0, PFM_REGT_CTR}, /* IQ_CTR4,5 */ + /* + * non HT extensions + */ + /*pmd 9 */ {{MSR_P4_BPU_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR2 */ + /*pmd 10*/ {{MSR_P4_BPU_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* BPU_CTR3 */ + /*pmd 11*/ {{MSR_P4_MS_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR2 */ + /*pmd 12*/ {{MSR_P4_MS_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* MS_CTR3 */ + /*pmd 13*/ {{MSR_P4_FLAME_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR2 */ + /*pmd 14*/ {{MSR_P4_FLAME_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* FLAME_CTR3 */ + /*pmd 15*/ {{MSR_P4_IQ_PERFCTR2, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR2 */ + /*pmd 16*/ {{MSR_P4_IQ_PERFCTR3, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR3 */ + /*pmd 17*/ {{MSR_P4_IQ_PERFCTR5, 0}, 0, PFM_REGT_NHTCTR}, /* IQ_CTR5 */ +}; + +static struct pfm_arch_pmu_info pfm_p4_pmu_info = { + .write_pmc = pfm_p4_write_pmc, + .write_pmd = pfm_p4_write_pmd, + .read_pmc = pfm_p4_read_pmc, + .read_pmd = pfm_p4_read_pmd, + .create_context = pfm_p4_create_context, + .free_context = pfm_p4_free_context, + .has_ovfls = pfm_p4_has_ovfls, + .stop_save = pfm_p4_stop_save, + .restore_pmcs = pfm_p4_restore_pmcs, + .nmi_copy_state = pfm_p4_nmi_copy_state, + .quiesce = pfm_p4_quiesce +}; + +static struct pfm_regmap_desc pfm_p4_pmc_desc[] = { +/* pmc0 */ PMC_D(PFM_REG_I, "BPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR0), +/* pmc1 */ PMC_D(PFM_REG_I, "IS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0), +/* pmc2 */ PMC_D(PFM_REG_I, "MOB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR0), +/* pmc3 */ PMC_D(PFM_REG_I, "ITLB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR0), +/* pmc4 */ PMC_D(PFM_REG_I, "PMH_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR0), +/* pmc5 */ PMC_D(PFM_REG_I, "IX_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR0), +/* pmc6 */ PMC_D(PFM_REG_I, "FSB_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR0), +/* pmc7 */ PMC_D(PFM_REG_I, "BSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR0), +/* pmc8 */ PMC_D(PFM_REG_I, "MS_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR0), +/* pmc9 */ PMC_D(PFM_REG_I, "TC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR0), +/* pmc10 */ PMC_D(PFM_REG_I, "TBPU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR0), +/* pmc11 */ PMC_D(PFM_REG_I, "FLAME_ESCR0", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR0), +/* pmc12 */ PMC_D(PFM_REG_I, "FIRM_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR0), +/* pmc13 */ PMC_D(PFM_REG_I, "SAAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR0), +/* pmc14 */ PMC_D(PFM_REG_I, "U2L_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR0), +/* pmc15 */ PMC_D(PFM_REG_I, "DAC_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR0), +/* pmc16 */ PMC_D(PFM_REG_I, "IQ_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR0), /* only model 1 and 2*/ +/* pmc17 */ PMC_D(PFM_REG_I, "ALF_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR0), +/* pmc18 */ PMC_D(PFM_REG_I, "RAT_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR0), +/* pmc19 */ PMC_D(PFM_REG_I, "SSU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SSU_ESCR0), +/* pmc20 */ PMC_D(PFM_REG_I, "CRU_ESCR0" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR0), +/* pmc21 */ PMC_D(PFM_REG_I, "CRU_ESCR2" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR2), +/* pmc22 */ PMC_D(PFM_REG_I, "CRU_ESCR4" , 
0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR4), +/* pmc23 */ PMC_D(PFM_REG_I64, "BPU_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR0), +/* pmc24 */ PMC_D(PFM_REG_I64, "BPU_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR1), +/* pmc25 */ PMC_D(PFM_REG_I64, "MS_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR0), +/* pmc26 */ PMC_D(PFM_REG_I64, "MS_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR1), +/* pmc27 */ PMC_D(PFM_REG_I64, "FLAME_CCCR0", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR0), +/* pmc28 */ PMC_D(PFM_REG_I64, "FLAME_CCCR1", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR1), +/* pmc29 */ PMC_D(PFM_REG_I64, "IQ_CCCR0" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR0), +/* pmc30 */ PMC_D(PFM_REG_I64, "IQ_CCCR1" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR1), +/* pmc31 */ PMC_D(PFM_REG_I64, "IQ_CCCR4" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR4), + /* No HT extension */ +/* pmc32 */ PMC_D(PFM_REG_I, "BPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BPU_ESCR1), +/* pmc33 */ PMC_D(PFM_REG_I, "IS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IS_ESCR1), +/* pmc34 */ PMC_D(PFM_REG_I, "MOB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MOB_ESCR1), +/* pmc35 */ PMC_D(PFM_REG_I, "ITLB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ITLB_ESCR1), +/* pmc36 */ PMC_D(PFM_REG_I, "PMH_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_PMH_ESCR1), +/* pmc37 */ PMC_D(PFM_REG_I, "IX_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IX_ESCR1), +/* pmc38 */ PMC_D(PFM_REG_I, "FSB_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FSB_ESCR1), +/* pmc39 */ PMC_D(PFM_REG_I, "BSU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_BSU_ESCR1), +/* pmc40 */ PMC_D(PFM_REG_I, "MS_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_MS_ESCR1), +/* pmc41 */ PMC_D(PFM_REG_I, "TC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TC_ESCR1), +/* pmc42 */ PMC_D(PFM_REG_I, "TBPU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_TBPU_ESCR1), +/* pmc43 */ PMC_D(PFM_REG_I, "FLAME_ESCR1", 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FLAME_ESCR1), +/* pmc44 */ PMC_D(PFM_REG_I, "FIRM_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_FIRM_ESCR1), +/* pmc45 */ PMC_D(PFM_REG_I, "SAAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_SAAT_ESCR1), +/* pmc46 */ PMC_D(PFM_REG_I, "U2L_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_U2L_ESCR1), +/* pmc47 */ PMC_D(PFM_REG_I, "DAC_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_DAC_ESCR1), +/* pmc48 */ PMC_D(PFM_REG_I, "IQ_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_IQ_ESCR1), /* only model 1 and 2 */ +/* pmc49 */ PMC_D(PFM_REG_I, "ALF_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_ALF_ESCR1), +/* pmc50 */ PMC_D(PFM_REG_I, "RAT_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_RAT_ESCR1), +/* pmc51 */ PMC_D(PFM_REG_I, "CRU_ESCR1" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR1), +/* pmc52 */ PMC_D(PFM_REG_I, "CRU_ESCR3" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR3), +/* pmc53 */ PMC_D(PFM_REG_I, "CRU_ESCR5" , 0x0, PFM_ESCR_RSVD, 0, MSR_P4_CRU_ESCR5), +/* pmc54 */ PMC_D(PFM_REG_I64, "BPU_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR2), +/* pmc55 */ PMC_D(PFM_REG_I64, "BPU_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_BPU_CCCR3), +/* pmc56 */ PMC_D(PFM_REG_I64, "MS_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR2), +/* pmc57 */ PMC_D(PFM_REG_I64, "MS_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_MS_CCCR3), +/* pmc58 */ PMC_D(PFM_REG_I64, "FLAME_CCCR2", PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_FLAME_CCCR2), +/* pmc59 */ PMC_D(PFM_REG_I64, "FLAME_CCCR3", PFM_CCCR_DFL, PFM_CCCR_RSVD, 
PFM_P4_NO64, MSR_P4_FLAME_CCCR3), +/* pmc60 */ PMC_D(PFM_REG_I64, "IQ_CCCR2" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR2), +/* pmc61 */ PMC_D(PFM_REG_I64, "IQ_CCCR3" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR3), +/* pmc62 */ PMC_D(PFM_REG_I64, "IQ_CCCR5" , PFM_CCCR_DFL, PFM_CCCR_RSVD, PFM_P4_NO64, MSR_P4_IQ_CCCR5), +/* pmc63 */ PMC_D(PFM_REG_I, "PEBS_MATRIX_VERT", 0, 0xffffffffffffffecULL, 0, 0x3f2), +/* pmc64 */ PMC_D(PFM_REG_I, "PEBS_ENABLE", 0, 0xfffffffff8ffe000ULL, 0, 0x3f1) +}; +#define PFM_P4_NUM_PMCS ARRAY_SIZE(pfm_p4_pmc_desc) + +/* + * See section 15.10.6.6 for details about the IQ block + */ +static struct pfm_regmap_desc pfm_p4_pmd_desc[] = { +/* pmd0 */ PMD_D(PFM_REG_C, "BPU_CTR0", MSR_P4_BPU_PERFCTR0), +/* pmd1 */ PMD_D(PFM_REG_C, "BPU_CTR1", MSR_P4_BPU_PERFCTR1), +/* pmd2 */ PMD_D(PFM_REG_C, "MS_CTR0", MSR_P4_MS_PERFCTR0), +/* pmd3 */ PMD_D(PFM_REG_C, "MS_CTR1", MSR_P4_MS_PERFCTR1), +/* pmd4 */ PMD_D(PFM_REG_C, "FLAME_CTR0", MSR_P4_FLAME_PERFCTR0), +/* pmd5 */ PMD_D(PFM_REG_C, "FLAME_CTR1", MSR_P4_FLAME_PERFCTR1), +/* pmd6 */ PMD_D(PFM_REG_C, "IQ_CTR0", MSR_P4_IQ_PERFCTR0), +/* pmd7 */ PMD_D(PFM_REG_C, "IQ_CTR1", MSR_P4_IQ_PERFCTR1), +/* pmd8 */ PMD_D(PFM_REG_C, "IQ_CTR4", MSR_P4_IQ_PERFCTR4), + /* no HT extension */ +/* pmd9 */ PMD_D(PFM_REG_C, "BPU_CTR2", MSR_P4_BPU_PERFCTR2), +/* pmd10 */ PMD_D(PFM_REG_C, "BPU_CTR3", MSR_P4_BPU_PERFCTR3), +/* pmd11 */ PMD_D(PFM_REG_C, "MS_CTR2", MSR_P4_MS_PERFCTR2), +/* pmd12 */ PMD_D(PFM_REG_C, "MS_CTR3", MSR_P4_MS_PERFCTR3), +/* pmd13 */ PMD_D(PFM_REG_C, "FLAME_CTR2", MSR_P4_FLAME_PERFCTR2), +/* pmd14 */ PMD_D(PFM_REG_C, "FLAME_CTR3", MSR_P4_FLAME_PERFCTR3), +/* pmd15 */ PMD_D(PFM_REG_C, "IQ_CTR2", MSR_P4_IQ_PERFCTR2), +/* pmd16 */ PMD_D(PFM_REG_C, "IQ_CTR3", MSR_P4_IQ_PERFCTR3), +/* pmd17 */ PMD_D(PFM_REG_C, "IQ_CTR5", MSR_P4_IQ_PERFCTR5) +}; +#define PFM_P4_NUM_PMDS ARRAY_SIZE(pfm_p4_pmd_desc) + +/* + * Due to hotplug CPU support, threads may not necessarily + * be activated at the time the module is inserted. We need + * to check whether they could be activated by looking at + * the present CPU (present != online). + */ +static int pfm_p4_probe_pmu(void) +{ + unsigned int i; + int ht_enabled; + + /* + * only works on Intel processors + */ + if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not running on Intel processor"); + return -1; + } + + if (current_cpu_data.x86 != 15) { + PFM_INFO("unsupported family=%d", current_cpu_data.x86); + return -1; + } + + switch (current_cpu_data.x86_model) { + case 0 ... 2: + break; + case 3 ... 6: + /* + * IQ_ESCR0, IQ_ESCR1 only present on model 1, 2 + */ + pfm_p4_pmc_desc[16].type = PFM_REG_NA; + pfm_p4_pmc_desc[48].type = PFM_REG_NA; + break; + default: + /* + * do not know if they all work the same, so reject + * for now + */ + if (!force) { + PFM_INFO("unsupported model %d", + current_cpu_data.x86_model); + return -1; + } + } + + /* + * check for local APIC (required) + */ + if (!cpu_has_apic) { + PFM_INFO("no local APIC, unsupported"); + return -1; + } +#ifdef CONFIG_SMP + ht_enabled = (cpus_weight(__get_cpu_var(cpu_core_map)) + / current_cpu_data.x86_max_cores) > 1; +#else + ht_enabled = 0; +#endif + if (cpu_has_ht) { + + PFM_INFO("HyperThreading supported, status %s", + ht_enabled ? 
"on": "off"); + /* + * disable registers not supporting HT + */ + if (ht_enabled) { + PFM_INFO("disabling half the registers for HT"); + for (i = 0; i < PFM_P4_NUM_PMCS; i++) { + if (pmc_addrs[(i)].reg_type & PFM_REGT_NOHT) + pfm_p4_pmc_desc[i].type = PFM_REG_NA; + } + for (i = 0; i < PFM_P4_NUM_PMDS; i++) { + if (pmd_addrs[(i)].reg_type & PFM_REGT_NOHT) + pfm_p4_pmd_desc[i].type = PFM_REG_NA; + } + } + } + + if (cpu_has_ds) { + PFM_INFO("Data Save Area (DS) supported"); + + if (cpu_has_pebs) { + /* + * PEBS does not work with HyperThreading enabled + */ + if (ht_enabled) + PFM_INFO("PEBS supported, status off (because of HT)"); + else + PFM_INFO("PEBS supported, status on"); + } + } + + /* + * build enable mask + */ + for (i = 0; i < PFM_P4_NUM_PMCS; i++) { + if (pmc_addrs[(i)].reg_type & PFM_REGT_EN) { + __set_bit(i, cast_ulp(enable_mask)); + max_enable = i + 1; + } + } + + if (force_nmi) + pfm_p4_pmu_info.flags |= PFM_X86_FL_USE_NMI; + return 0; +} +static inline int get_smt_id(void) +{ +#ifdef CONFIG_SMP + int cpu = smp_processor_id(); + return (cpu != first_cpu(__get_cpu_var(cpu_sibling_map))); +#else + return 0; +#endif +} + +static void __pfm_write_reg_p4(const struct pfm_p4_regmap *xreg, u64 val) +{ + u64 pmi; + int smt_id; + + smt_id = get_smt_id(); + /* + * HT is only supported by P4-style PMU + * + * Adjust for T1 if necessary: + * + * - move the T0_OS/T0_USR bits into T1 slots + * - move the OVF_PMI_T0 bits into T1 slot + * + * The P4/EM64T T1 is cleared by description table. + * User only works with T0. + */ + if (smt_id) { + if (xreg->reg_type & PFM_REGT_ESCR) { + + /* copy T0_USR & T0_OS to T1 */ + val |= ((val & 0xc) >> 2); + + /* clear bits T0_USR & T0_OS */ + val &= ~0xc; + + } else if (xreg->reg_type & PFM_REGT_CCCR) { + pmi = (val >> 26) & 0x1; + if (pmi) { + val &= ~(1UL<<26); + val |= 1UL<<27; + } + } + } + if (xreg->addrs[smt_id]) + wrmsrl(xreg->addrs[smt_id], val); +} + +void __pfm_read_reg_p4(const struct pfm_p4_regmap *xreg, u64 *val) +{ + int smt_id; + + smt_id = get_smt_id(); + + if (likely(xreg->addrs[smt_id])) { + rdmsrl(xreg->addrs[smt_id], *val); + /* + * HT is only supported by P4-style PMU + * + * move the Tx_OS and Tx_USR bits into + * T0 slots setting the T1 slots to zero + */ + if (xreg->reg_type & PFM_REGT_ESCR) { + if (smt_id) + *val |= (((*val) & 0x3) << 2); + + /* + * zero out bits that are reserved + * (including T1_OS and T1_USR) + */ + *val &= PFM_ESCR_RSVD; + } + } else { + *val = 0; + } +} +static void pfm_p4_write_pmc(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + __pfm_write_reg_p4(&pmc_addrs[cnum], value); +} + +static void pfm_p4_write_pmd(struct pfm_context *ctx, unsigned int cnum, u64 value) +{ + __pfm_write_reg_p4(&pmd_addrs[cnum], value); +} + +static u64 pfm_p4_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + u64 tmp; + __pfm_read_reg_p4(&pmd_addrs[cnum], &tmp); + return tmp; +} + +static u64 pfm_p4_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + u64 tmp; + __pfm_read_reg_p4(&pmc_addrs[cnum], &tmp); + return tmp; +} + +struct pfm_ds_area_p4 { + unsigned long bts_buf_base; + unsigned long bts_index; + unsigned long bts_abs_max; + unsigned long bts_intr_thres; + unsigned long pebs_buf_base; + unsigned long pebs_index; + unsigned long pebs_abs_max; + unsigned long pebs_intr_thres; + u64 pebs_cnt_reset; +}; + + +static int pfm_p4_stop_save(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_arch_context *ctx_arch; + struct pfm_ds_area_p4 *ds = NULL; + u64 
used_mask[PFM_PMC_BV]; + u16 i, j, count, pebs_idx = ~0; + u16 max_pmc; + u64 cccr, ctr1, ctr2, ovfl_mask; + + pmu_info = &pfm_p4_pmu_info; + ctx_arch = pfm_ctx_arch(ctx); + max_pmc = ctx->regs.max_pmc; + ovfl_mask = pfm_pmu_conf->ovfl_mask; + + /* + * build used enable PMC bitmask + * if user did not set any CCCR, then mask is + * empty and there is nothing to do because nothing + * was started + */ + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(enable_mask), + max_enable); + + count = bitmap_weight(cast_ulp(used_mask), max_enable); + + PFM_DBG_ovfl("npend=%u ena_mask=0x%llx u_pmcs=0x%llx count=%u num=%u", + set->npend_ovfls, + (unsigned long long)enable_mask[0], + (unsigned long long)set->used_pmcs[0], + count, max_enable); + + /* + * ensures we do not destroy pending overflow + * information. If pended interrupts are already + * known, then we just stop monitoring. + */ + if (set->npend_ovfls) { + /* + * clear enable bit + * unfortunately, this is very expensive! + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + __pfm_write_reg_p4(pmc_addrs+i, 0); + count--; + } + } + /* need save PMDs at upper level */ + return 1; + } + + if (ctx_arch->flags.use_pebs) { + ds = ctx_arch->ds_area; + pebs_idx = PEBS_PMD; + PFM_DBG("ds=%p pebs_idx=0x%llx thres=0x%llx", + ds, + (unsigned long long)ds->pebs_index, + (unsigned long long)ds->pebs_intr_thres); + } + + /* + * stop monitoring AND collect pending overflow information AND + * save pmds. + * + * We need to access the CCCR twice, once to get overflow info + * and a second to stop monitoring (which destroys the OVF flag) + * Similarly, we need to read the counter twice to check whether + * it did overflow between the CCR read and the CCCR write. + */ + for (i = 0; count; i++) { + if (i != pebs_idx && test_bit(i, cast_ulp(used_mask))) { + /* + * controlled counter + */ + j = pmc_addrs[i].ctr; + + /* read CCCR (PMC) value */ + __pfm_read_reg_p4(pmc_addrs+i, &cccr); + + /* read counter (PMD) controlled by PMC */ + __pfm_read_reg_p4(pmd_addrs+j, &ctr1); + + /* clear CCCR value: stop counter but destroy OVF */ + __pfm_write_reg_p4(pmc_addrs+i, 0); + + /* read counter controlled by CCCR again */ + __pfm_read_reg_p4(pmd_addrs+j, &ctr2); + + /* + * there is an overflow if either: + * - CCCR.ovf is set (and we just cleared it) + * - ctr2 < ctr1 + * in that case we set the bit corresponding to the + * overflowed PMD in povfl_pmds. 
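+ * (the second counter read catches a wrap that occurred after the first + * read but before the counter was stopped by clearing the CCCR)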
+ */ + if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) { + __set_bit(j, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + ctr2 = (set->pmds[j].value & ~ovfl_mask) | (ctr2 & ovfl_mask); + set->pmds[j].value = ctr2; + count--; + } + } + /* + * check for PEBS buffer full and set the corresponding PMD overflow + */ + if (ctx_arch->flags.use_pebs) { + PFM_DBG("ds=%p pebs_idx=0x%lx thres=0x%lx", ds, ds->pebs_index, ds->pebs_intr_thres); + if (ds->pebs_index >= ds->pebs_intr_thres + && test_bit(PEBS_PMD, cast_ulp(set->used_pmds))) { + __set_bit(PEBS_PMD, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + } + /* 0 means: no need to save the PMD at higher level */ + return 0; +} + +static int pfm_p4_create_context(struct pfm_context *ctx, u32 ctx_flags) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + + ctx_arch->data = kzalloc(sizeof(struct pfm_arch_p4_context), GFP_KERNEL); + if (!ctx_arch->data) + return -ENOMEM; + + return 0; +} + +static void pfm_p4_free_context(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + /* + * we do not check if P4, because it would be NULL and + * kfree can deal with NULL + */ + kfree(ctx_arch->data); +} + +/* + * detect is counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + * + * used by Intel P4 + */ +static int __kprobes pfm_p4_has_ovfls(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_p4_regmap *xrc, *xrd; + struct pfm_arch_context *ctx_arch; + struct pfm_arch_p4_context *p4; + u64 ena_mask[PFM_PMC_BV]; + u64 cccr, ctr1, ctr2; + int n, i, j; + + pmu_info = &pfm_p4_pmu_info; + + ctx_arch = pfm_ctx_arch(ctx); + xrc = pmc_addrs; + xrd = pmd_addrs; + p4 = ctx_arch->data; + + bitmap_and(cast_ulp(ena_mask), + cast_ulp(ctx->regs.pmcs), + cast_ulp(enable_mask), + max_enable); + + n = bitmap_weight(cast_ulp(ena_mask), max_enable); + + for (i = 0; n; i++) { + if (!test_bit(i, cast_ulp(ena_mask))) + continue; + /* + * controlled counter + */ + j = xrc[i].ctr; + + /* read CCCR (PMC) value */ + __pfm_read_reg_p4(xrc+i, &cccr); + + /* read counter (PMD) controlled by PMC */ + __pfm_read_reg_p4(xrd+j, &ctr1); + + /* clear CCCR value: stop counter but destroy OVF */ + __pfm_write_reg_p4(xrc+i, 0); + + /* read counter controlled by CCCR again */ + __pfm_read_reg_p4(xrd+j, &ctr2); + + /* + * there is an overflow if either: + * - CCCR.ovf is set (and we just cleared it) + * - ctr2 < ctr1 + * in that case we set the bit corresponding to the + * overflowed PMD in povfl_pmds. 
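+ * Note: monitoring stays stopped at this point; if no overflow is found at + * all, the saved CCCR values are restored below and the NMI is treated as + * not ours.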
+ */ + if ((cccr & (1ULL<<31)) || (ctr2 < ctr1)) { + __set_bit(j, cast_ulp(p4->povfl_pmds)); + p4->npend_ovfls++; + } + p4->saved_cccrs[i] = cccr; + n--; + } + /* + * if there was no overflow, then it means the NMI was not really + * for us, so we have to resume monitoring + */ + if (unlikely(!p4->npend_ovfls)) { + for (i = 0; n; i++) { + if (!test_bit(i, cast_ulp(ena_mask))) + continue; + __pfm_write_reg_p4(xrc+i, p4->saved_cccrs[i]); + } + } + return 0; +} + +void pfm_p4_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_arch_context *ctx_arch; + u64 *mask; + u16 i, num; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + /* + * must restore DS pointer before restoring PMCs + * as this can potentially reactivate monitoring + */ + if (ctx_arch->flags.use_ds) + wrmsrl(MSR_IA32_DS_AREA, (unsigned long)ctx_arch->ds_area); + + /* + * must restore everything because there are some dependencies + * (e.g., ESCR and CCCR) + */ + num = ctx->regs.num_pmcs; + mask = ctx->regs.pmcs; + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(mask))) { + pfm_arch_write_pmc(ctx, i, set->pmcs[i]); + num--; + } + } +} + +/* + * invoked only when NMI is used. Called from the LOCAL_PERFMON_VECTOR + * handler to copy P4 overflow state captured when the NMI triggered. + * Given that on P4, stopping monitoring destroy the overflow information + * we save it in pfm_has_ovfl_p4() where monitoring is also stopped. + * + * Here we propagate the overflow state to current active set. The + * freeze_pmu() call we not overwrite this state because npend_ovfls + * is non-zero. + */ +static void pfm_p4_nmi_copy_state(struct pfm_context *ctx) +{ + struct pfm_arch_context *ctx_arch; + struct pfm_event_set *set; + struct pfm_arch_p4_context *p4; + + ctx_arch = pfm_ctx_arch(ctx); + p4 = ctx_arch->data; + set = ctx->active_set; + + if (p4->npend_ovfls) { + set->npend_ovfls = p4->npend_ovfls; + + bitmap_copy(cast_ulp(set->povfl_pmds), + cast_ulp(p4->povfl_pmds), + ctx->regs.max_pmd); + + p4->npend_ovfls = 0; + } +} + +/** + * pfm_p4_quiesce - stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon related locks + */ +static void __kprobes pfm_p4_quiesce(void) +{ + u16 i; + /* + * quiesce PMU by clearing available registers that have + * the start/stop capability + */ + for (i = 0; i < pfm_pmu_conf->regs_all.max_pmc; i++) { + if (test_bit(i, cast_ulp(pfm_pmu_conf->regs_all.pmcs)) + && test_bit(i, cast_ulp(enable_mask))) + __pfm_write_reg_p4(pmc_addrs+i, 0); + } +} + + +static struct pfm_pmu_config pfm_p4_pmu_conf = { + .pmu_name = "Intel P4", + .counter_width = 40, + .pmd_desc = pfm_p4_pmd_desc, + .pmc_desc = pfm_p4_pmc_desc, + .num_pmc_entries = PFM_P4_NUM_PMCS, + .num_pmd_entries = PFM_P4_NUM_PMDS, + .probe_pmu = pfm_p4_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmu_info = &pfm_p4_pmu_info +}; + +static int __init pfm_p4_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_p4_pmu_conf); +} + +static void __exit pfm_p4_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_p4_pmu_conf); +} + +module_init(pfm_p4_pmu_init_module); +module_exit(pfm_p4_pmu_cleanup_module); diff --git a/arch/x86/perfmon/perfmon_p6.c b/arch/x86/perfmon/perfmon_p6.c new file mode 100644 index 0000000..47c0a46 --- /dev/null +++ b/arch/x86/perfmon/perfmon_p6.c @@ -0,0 +1,310 @@ +/* + * This file contains the P6 family 
processor PMU register description tables + * + * This module supports original P6 processors + * (Pentium II, Pentium Pro, Pentium III) and Pentium M. + * + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("P6 PMU description table"); +MODULE_LICENSE("GPL"); + +static int force_nmi; +MODULE_PARM_DESC(force_nmi, "bool: force use of NMI for PMU interrupt"); +module_param(force_nmi, bool, 0600); + +/* + * - upper 32 bits are reserved + * - INT: APIC enable bit is reserved (forced to 1) + * - bit 21 is reserved + * - bit 22 is reserved on PEREVNTSEL1 + * + * RSVD: reserved bits are 1 + */ +#define PFM_P6_PMC0_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (1ULL<<21)) +#define PFM_P6_PMC1_RSVD ((~((1ULL<<32)-1)) | (1ULL<<20) | (3ULL<<21)) + +/* + * force Local APIC interrupt on overflow + * disable with NO_EMUL64 + */ +#define PFM_P6_PMC_VAL (1ULL<<20) +#define PFM_P6_NO64 (1ULL<<20) + + +static void __kprobes pfm_p6_quiesce(void); +static int pfm_p6_has_ovfls(struct pfm_context *ctx); +static int pfm_p6_stop_save(struct pfm_context *ctx, + struct pfm_event_set *set); + +static u64 enable_mask[PFM_MAX_PMCS]; +static u16 max_enable; + +/* + * PFM_X86_FL_NO_SHARING: because of the single enable bit on MSR_P6_EVNTSEL0 + * the PMU cannot be shared with NMI watchdog or Oprofile + */ +struct pfm_arch_pmu_info pfm_p6_pmu_info = { + .stop_save = pfm_p6_stop_save, + .has_ovfls = pfm_p6_has_ovfls, + .quiesce = pfm_p6_quiesce, + .flags = PFM_X86_FL_NO_SHARING, +}; + +static struct pfm_regmap_desc pfm_p6_pmc_desc[] = { +/* pmc0 */ PMC_D(PFM_REG_I64, "PERFEVTSEL0", PFM_P6_PMC_VAL, PFM_P6_PMC0_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL0), +/* pmc1 */ PMC_D(PFM_REG_I64, "PERFEVTSEL1", PFM_P6_PMC_VAL, PFM_P6_PMC1_RSVD, PFM_P6_NO64, MSR_P6_EVNTSEL1) +}; +#define PFM_P6_NUM_PMCS ARRAY_SIZE(pfm_p6_pmc_desc) + +#define PFM_P6_D(n) \ + { .type = PFM_REG_C, \ + .desc = "PERFCTR"#n, \ + .hw_addr = MSR_P6_PERFCTR0+n, \ + .rsvd_msk = 0, \ + .dep_pmcs[0] = 1ULL << n \ + } + +static struct pfm_regmap_desc pfm_p6_pmd_desc[] = { +/* pmd0 */ PFM_P6_D(0), +/* pmd1 */ PFM_P6_D(1) +}; +#define PFM_P6_NUM_PMDS ARRAY_SIZE(pfm_p6_pmd_desc) + +static int pfm_p6_probe_pmu(void) +{ + int high, low; + + if (current_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + PFM_INFO("not an Intel processor"); + return -1; + } + + /* + * check for P6 processor family + */ + if (current_cpu_data.x86 != 6) { + PFM_INFO("unsupported family=%d", current_cpu_data.x86); + return -1; + } + + switch (current_cpu_data.x86_model) { + case 1: /* Pentium Pro */ + case 3: + case 5: /* Pentium II Deschutes */ + case 7 ... 
11: + break; + case 13: + /* for Pentium M, we need to check if PMU exist */ + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (low & (1U << 7)) + break; + default: + PFM_INFO("unsupported CPU model %d", + current_cpu_data.x86_model); + return -1; + + } + + if (!cpu_has_apic) { + PFM_INFO("no Local APIC, try rebooting with lapic"); + return -1; + } + __set_bit(0, cast_ulp(enable_mask)); + __set_bit(1, cast_ulp(enable_mask)); + max_enable = 1 + 1; + /* + * force NMI interrupt? + */ + if (force_nmi) + pfm_p6_pmu_info.flags |= PFM_X86_FL_USE_NMI; + + return 0; +} + +/** + * pfm_p6_has_ovfls - check for pending overflow condition + * @ctx: context to work on + * + * detect if counters have overflowed. + * return: + * 0 : no overflow + * 1 : at least one overflow + */ +static int __kprobes pfm_p6_has_ovfls(struct pfm_context *ctx) +{ + u64 *cnt_mask; + u64 wmask, val; + u16 i, num; + + cnt_mask = ctx->regs.cnt_pmds; + num = ctx->regs.num_counters; + wmask = 1ULL << pfm_pmu_conf->counter_width; + + /* + * we can leverage the fact that we know the mapping + * to hardcode the MSR address and avoid accessing + * more cachelines + * + * We need to check cnt_mask because not all registers + * may be available. + */ + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(cnt_mask))) { + rdmsrl(MSR_P6_PERFCTR0+i, val); + if (!(val & wmask)) + return 1; + num--; + } + } + return 0; +} + +/** + * pfm_p6_stop_save -- stop monitoring and save PMD values + * @ctx: context to work on + * @set: current event set + * + * return value: + * 0 - no need to save PMDs in caller + * 1 - need to save PMDs in caller + */ +static int pfm_p6_stop_save(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct pfm_arch_pmu_info *pmu_info; + u64 used_mask[PFM_PMC_BV]; + u64 *cnt_pmds; + u64 val, wmask, ovfl_mask; + u32 i, count; + + pmu_info = pfm_pmu_info(); + + wmask = 1ULL << pfm_pmu_conf->counter_width; + bitmap_and(cast_ulp(used_mask), + cast_ulp(set->used_pmcs), + cast_ulp(enable_mask), + max_enable); + + count = bitmap_weight(cast_ulp(used_mask), ctx->regs.max_pmc); + + /* + * stop monitoring + * Unfortunately, this is very expensive! + * wrmsrl() is serializing. + */ + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(used_mask))) { + wrmsrl(MSR_P6_EVNTSEL0+i, 0); + count--; + } + } + + /* + * if we already having a pending overflow condition, we simply + * return to take care of this first. + */ + if (set->npend_ovfls) + return 1; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + cnt_pmds = ctx->regs.cnt_pmds; + + /* + * check for pending overflows and save PMDs (combo) + * we employ used_pmds because we also need to save + * and not just check for pending interrupts. 
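+ * a counting PMD has overflowed when bit counter_width is clear:
+ * counters are programmed with the upper bits set (see
+ * pfm_arch_write_pmd), so that bit goes away when the counter wraps.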
+ * + * Must check for counting PMDs because of virtual PMDs + */ + count = set->nused_pmds; + for (i = 0; count; i++) { + if (test_bit(i, cast_ulp(set->used_pmds))) { + val = pfm_arch_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_pmds)))) { + if (!(val & wmask)) { + __set_bit(i, cast_ulp(set->povfl_pmds)); + set->npend_ovfls++; + } + val = (set->pmds[i].value & ~ovfl_mask) | (val & ovfl_mask); + } + set->pmds[i].value = val; + count--; + } + } + /* 0 means: no need to save PMDs at upper level */ + return 0; +} + +/** + * pfm_p6_quiesce_pmu -- stop monitoring without grabbing any lock + * + * called from NMI interrupt handler to immediately stop monitoring + * cannot grab any lock, including perfmon related locks + */ +static void __kprobes pfm_p6_quiesce(void) +{ + /* + * quiesce PMU by clearing available registers that have + * the start/stop capability + * + * P6 processors only have enable bit on PERFEVTSEL0 + */ + if (test_bit(0, cast_ulp(pfm_pmu_conf->regs_all.pmcs))) + wrmsrl(MSR_P6_EVNTSEL0, 0); +} + +/* + * Counters have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a counter for P6-like PMU is 31 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ +static struct pfm_pmu_config pfm_p6_pmu_conf = { + .pmu_name = "Intel P6 processor Family", + .counter_width = 31, + .pmd_desc = pfm_p6_pmd_desc, + .pmc_desc = pfm_p6_pmc_desc, + .num_pmc_entries = PFM_P6_NUM_PMCS, + .num_pmd_entries = PFM_P6_NUM_PMDS, + .probe_pmu = pfm_p6_probe_pmu, + .version = "1.0", + .flags = PFM_PMU_BUILTIN_FLAG, + .owner = THIS_MODULE, + .pmu_info = &pfm_p6_pmu_info +}; + +static int __init pfm_p6_pmu_init_module(void) +{ + return pfm_pmu_register(&pfm_p6_pmu_conf); +} + +static void __exit pfm_p6_pmu_cleanup_module(void) +{ + pfm_pmu_unregister(&pfm_p6_pmu_conf); +} + +module_init(pfm_p6_pmu_init_module); +module_exit(pfm_p6_pmu_cleanup_module); diff --git a/arch/x86/perfmon/perfmon_pebs_core_smpl.c b/arch/x86/perfmon/perfmon_pebs_core_smpl.c new file mode 100644 index 0000000..eeb9174 --- /dev/null +++ b/arch/x86/perfmon/perfmon_pebs_core_smpl.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the Precise Event Based Sampling (PEBS) + * sampling format for Intel Core and Atom processors. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel Core Precise Event-Based Sampling (PEBS)"); +MODULE_LICENSE("GPL"); + +#define ALIGN_PEBS(a, order) \ + ((a)+(1UL<<(order))-1) & ~((1UL<<(order))-1) + +#define PEBS_PADDING_ORDER 8 /* log2(256) padding for PEBS alignment constraint */ + +static int pfm_pebs_core_fmt_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_pebs_core_smpl_arg *arg = data; + size_t min_buf_size; + + /* + * need to define at least the size of the buffer + */ + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * compute min buf size. npmds is the maximum number + * of implemented PMD registers. + */ + min_buf_size = sizeof(struct pfm_pebs_core_smpl_hdr) + + sizeof(struct pfm_pebs_core_smpl_entry) + + (1UL<buf_size); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) + return -EINVAL; + + return 0; +} + +static int pfm_pebs_core_fmt_get_size(unsigned int flags, void *data, size_t *size) +{ + struct pfm_pebs_core_smpl_arg *arg = data; + + /* + * size has been validated in pfm_pebs_core_fmt_validate() + */ + *size = arg->buf_size + (1UL<ds; + + /* + * align PEBS buffer base + */ + pebs_start = ALIGN_PEBS((unsigned long)(hdr+1), PEBS_PADDING_ORDER); + pebs_end = pebs_start + arg->buf_size + 1; + + hdr->version = PFM_PEBS_CORE_SMPL_VERSION; + hdr->buf_size = arg->buf_size; + hdr->overflows = 0; + + /* + * express PEBS buffer base as offset from the end of the header + */ + hdr->start_offs = pebs_start - (unsigned long)(hdr+1); + + /* + * PEBS buffer boundaries + */ + ds->pebs_buf_base = pebs_start; + ds->pebs_abs_max = pebs_end; + + /* + * PEBS starting position + */ + ds->pebs_index = pebs_start; + + /* + * PEBS interrupt threshold + */ + ds->pebs_intr_thres = pebs_start + + arg->intr_thres + * sizeof(struct pfm_pebs_core_smpl_entry); + + /* + * save counter reset value for PEBS counter + */ + ds->pebs_cnt_reset = arg->cnt_reset; + + /* + * keep track of DS AREA + */ + ctx_arch->ds_area = ds; + ctx_arch->flags.use_ds = 1; + ctx_arch->flags.use_pebs = 1; + + PFM_DBG("buffer=%p buf_size=%llu offs=%llu pebs_start=0x%llx " + "pebs_end=0x%llx ds=%p pebs_thres=0x%llx cnt_reset=0x%llx", + buf, + (unsigned long long)hdr->buf_size, + (unsigned long long)hdr->start_offs, + (unsigned long long)pebs_start, + (unsigned long long)pebs_end, + ds, + (unsigned long long)ds->pebs_intr_thres, + (unsigned long long)ds->pebs_cnt_reset); + + return 0; +} + +static int pfm_pebs_core_fmt_handler(struct pfm_context *ctx, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_pebs_core_smpl_hdr *hdr; + struct pfm_ovfl_arg *arg; + + hdr = ctx->smpl_addr; + arg = &ctx->ovfl_arg; + + PFM_DBG_ovfl("buffer full"); + /* + * increment number of buffer overflows. + * important to detect duplicate set of samples. + */ + hdr->overflows++; + + /* + * request notification and masking of monitoring. + * Notification is still subject to the overflowed + * register having the FL_NOTIFY flag set. 
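+ * monitoring then stays masked until the buffer is rearmed through the
+ * restart callback (pfm_pebs_core_fmt_restart below), which resets
+ * ds.pebs_index back to the buffer base.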
+ */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK; + + return -ENOBUFS; /* we are full, sorry */ +} + +static int pfm_pebs_core_fmt_restart(int is_active, u32 *ovfl_ctrl, + void *buf) +{ + struct pfm_pebs_core_smpl_hdr *hdr = buf; + + /* + * reset index to base of buffer + */ + hdr->ds.pebs_index = hdr->ds.pebs_buf_base; + + *ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +} + +static int pfm_pebs_core_fmt_exit(void *buf) +{ + return 0; +} + +static struct pfm_smpl_fmt pebs_core_fmt = { + .fmt_name = PFM_PEBS_CORE_SMPL_NAME, + .fmt_version = 0x1, + .fmt_arg_size = sizeof(struct pfm_pebs_core_smpl_arg), + .fmt_validate = pfm_pebs_core_fmt_validate, + .fmt_getsize = pfm_pebs_core_fmt_get_size, + .fmt_init = pfm_pebs_core_fmt_init, + .fmt_handler = pfm_pebs_core_fmt_handler, + .fmt_restart = pfm_pebs_core_fmt_restart, + .fmt_exit = pfm_pebs_core_fmt_exit, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +static int __init pfm_pebs_core_fmt_init_module(void) +{ + if (!cpu_has_pebs) { + PFM_INFO("processor does not have PEBS support"); + return -1; + } + /* + * cpu_has_pebs is not enough to identify Intel Core PEBS + * which is different fro Pentium 4 PEBS. Therefore we do + * a more detailed check here + */ + if (current_cpu_data.x86 != 6) { + PFM_INFO("not a supported Intel processor"); + return -1; + } + + switch (current_cpu_data.x86_model) { + case 15: /* Merom */ + case 23: /* Penryn */ + case 28: /* Atom (Silverthorne) */ + case 29: /* Dunnington */ + break; + default: + PFM_INFO("not a supported Intel processor"); + return -1; + } + return pfm_fmt_register(&pebs_core_fmt); +} + +static void __exit pfm_pebs_core_fmt_cleanup_module(void) +{ + pfm_fmt_unregister(&pebs_core_fmt); +} + +module_init(pfm_pebs_core_fmt_init_module); +module_exit(pfm_pebs_core_fmt_cleanup_module); diff --git a/arch/x86/perfmon/perfmon_pebs_p4_smpl.c b/arch/x86/perfmon/perfmon_pebs_p4_smpl.c new file mode 100644 index 0000000..f4e9fd2 --- /dev/null +++ b/arch/x86/perfmon/perfmon_pebs_p4_smpl.c @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the Precise Event Based Sampling (PEBS) + * sampling format. It supports the following processors: + * - 32-bit Pentium 4 or other Netburst-based processors + * - 64-bit Pentium 4 or other Netburst-based processors + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Stephane Eranian "); +MODULE_DESCRIPTION("Intel P4 Precise Event-Based Sampling (PEBS)"); +MODULE_LICENSE("GPL"); + +#define ALIGN_PEBS(a, order) \ + ((a)+(1UL<<(order))-1) & ~((1UL<<(order))-1) + +#define PEBS_PADDING_ORDER 8 /* log2(256) padding for PEBS alignment constraint */ + +static int pfm_pebs_p4_fmt_validate(u32 flags, u16 npmds, void *data) +{ + struct pfm_pebs_p4_smpl_arg *arg = data; + size_t min_buf_size; + + /* + * need to define at least the size of the buffer + */ + if (data == NULL) { + PFM_DBG("no argument passed"); + return -EINVAL; + } + + /* + * compute min buf size. npmds is the maximum number + * of implemented PMD registers. + */ + min_buf_size = sizeof(struct pfm_pebs_p4_smpl_hdr) + + sizeof(struct pfm_pebs_p4_smpl_entry) + + (1UL<buf_size); + + /* + * must hold at least the buffer header + one minimally sized entry + */ + if (arg->buf_size < min_buf_size) + return -EINVAL; + + return 0; +} + +static int pfm_pebs_p4_fmt_get_size(unsigned int flags, void *data, size_t *size) +{ + struct pfm_pebs_p4_smpl_arg *arg = data; + + /* + * size has been validated in pfm_pebs_p4_fmt_validate() + */ + *size = arg->buf_size + (1UL<ds; + + /* + * align PEBS buffer base + */ + pebs_start = ALIGN_PEBS((unsigned long)(hdr+1), PEBS_PADDING_ORDER); + pebs_end = pebs_start + arg->buf_size + 1; + + hdr->version = PFM_PEBS_P4_SMPL_VERSION; + hdr->buf_size = arg->buf_size; + hdr->overflows = 0; + + /* + * express PEBS buffer base as offset from the end of the header + */ + hdr->start_offs = pebs_start - (unsigned long)(hdr+1); + + /* + * PEBS buffer boundaries + */ + ds->pebs_buf_base = pebs_start; + ds->pebs_abs_max = pebs_end; + + /* + * PEBS starting position + */ + ds->pebs_index = pebs_start; + + /* + * PEBS interrupt threshold + */ + ds->pebs_intr_thres = pebs_start + + arg->intr_thres * sizeof(struct pfm_pebs_p4_smpl_entry); + + /* + * save counter reset value for PEBS counter + */ + ds->pebs_cnt_reset = arg->cnt_reset; + + /* + * keep track of DS AREA + */ + ctx_arch->ds_area = ds; + ctx_arch->flags.use_pebs = 1; + ctx_arch->flags.use_ds = 1; + + PFM_DBG("buffer=%p buf_size=%llu offs=%llu pebs_start=0x%lx " + "pebs_end=0x%lx ds=%p pebs_thres=0x%lx cnt_reset=0x%llx", + buf, + (unsigned long long)hdr->buf_size, + (unsigned long long)hdr->start_offs, + pebs_start, + pebs_end, + ds, + ds->pebs_intr_thres, + (unsigned long long)ds->pebs_cnt_reset); + + return 0; +} + +static int pfm_pebs_p4_fmt_handler(struct pfm_context *ctx, + unsigned long ip, u64 tstamp, void *data) +{ + struct pfm_pebs_p4_smpl_hdr *hdr; + struct pfm_ovfl_arg *arg; + + hdr = ctx->smpl_addr; + arg = &ctx->ovfl_arg; + + PFM_DBG_ovfl("buffer full"); + /* + * increment number of buffer overflows. + * important to detect duplicate set of samples. + */ + hdr->overflows++; + + /* + * request notification and masking of monitoring. + * Notification is still subject to the overflowed + * register having the FL_NOTIFY flag set. 
+ */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK; + + return -ENOBUFS; /* we are full, sorry */ +} + +static int pfm_pebs_p4_fmt_restart(int is_active, u32 *ovfl_ctrl, + void *buf) +{ + struct pfm_pebs_p4_smpl_hdr *hdr = buf; + + /* + * reset index to base of buffer + */ + hdr->ds.pebs_index = hdr->ds.pebs_buf_base; + + *ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +} + +static int pfm_pebs_p4_fmt_exit(void *buf) +{ + return 0; +} + +static struct pfm_smpl_fmt pebs_p4_fmt = { + .fmt_name = PFM_PEBS_P4_SMPL_NAME, + .fmt_version = 0x1, + .fmt_arg_size = sizeof(struct pfm_pebs_p4_smpl_arg), + .fmt_validate = pfm_pebs_p4_fmt_validate, + .fmt_getsize = pfm_pebs_p4_fmt_get_size, + .fmt_init = pfm_pebs_p4_fmt_init, + .fmt_handler = pfm_pebs_p4_fmt_handler, + .fmt_restart = pfm_pebs_p4_fmt_restart, + .fmt_exit = pfm_pebs_p4_fmt_exit, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE, +}; + +static int __init pfm_pebs_p4_fmt_init_module(void) +{ + int ht_enabled; + + if (!cpu_has_pebs) { + PFM_INFO("processor does not have PEBS support"); + return -1; + } + if (current_cpu_data.x86 != 15) { + PFM_INFO("not an Intel Pentium 4"); + return -1; + } +#ifdef CONFIG_SMP + ht_enabled = (cpus_weight(__get_cpu_var(cpu_core_map)) + / current_cpu_data.x86_max_cores) > 1; +#else + ht_enabled = 0; +#endif + if (ht_enabled) { + PFM_INFO("PEBS not available because HyperThreading is on"); + return -1; + } + return pfm_fmt_register(&pebs_p4_fmt); +} + +static void __exit pfm_pebs_p4_fmt_cleanup_module(void) +{ + pfm_fmt_unregister(&pebs_p4_fmt); +} + +module_init(pfm_pebs_p4_fmt_init_module); +module_exit(pfm_pebs_p4_fmt_cleanup_module); diff --git a/include/asm-mips/Kbuild b/include/asm-mips/Kbuild index 7897f05..7ed16fc 100644 --- a/include/asm-mips/Kbuild +++ b/include/asm-mips/Kbuild @@ -1,3 +1,4 @@ include include/asm-generic/Kbuild.asm header-y += cachectl.h sgidefs.h sysmips.h +header-y += perfmon.h diff --git a/include/asm-mips/perfmon.h b/include/asm-mips/perfmon.h new file mode 100644 index 0000000..7915c17 --- /dev/null +++ b/include/asm-mips/perfmon.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains mips64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_MIPS64_PERFMON_H_ +#define _ASM_MIPS64_PERFMON_H_ + +/* + * arch-specific user visible interface definitions + */ + +#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */ +#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */ + +#endif /* _ASM_MIPS64_PERFMON_H_ */ diff --git a/include/asm-mips/perfmon_kern.h b/include/asm-mips/perfmon_kern.h new file mode 100644 index 0000000..7d213df --- /dev/null +++ b/include/asm-mips/perfmon_kern.h @@ -0,0 +1,412 @@ +/* + * Copyright (c) 2005 Philip Mucci. + * + * Based on other versions: + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file contains mips64 specific definitions for the perfmon + * interface. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_MIPS64_PERFMON_KERN_H_ +#define _ASM_MIPS64_PERFMON_KERN_H_ + +#ifdef __KERNEL__ + +#ifdef CONFIG_PERFMON +#include +#include + +#define PFM_ARCH_PMD_STK_ARG 2 +#define PFM_ARCH_PMC_STK_ARG 2 + +struct pfm_arch_pmu_info { + u32 pmu_style; +}; + +#define MIPS64_CONFIG_PMC_MASK (1 << 4) +#define MIPS64_PMC_INT_ENABLE_MASK (1 << 4) +#define MIPS64_PMC_CNT_ENABLE_MASK (0xf) +#define MIPS64_PMC_EVT_MASK (0x7 << 6) +#define MIPS64_PMC_CTR_MASK (1 << 31) +#define MIPS64_PMD_INTERRUPT (1 << 31) + +/* Coprocessor register 25 contains the PMU interface. */ +/* Sel 0 is control for counter 0 */ +/* Sel 1 is count for counter 0. */ +/* Sel 2 is control for counter 1. */ +/* Sel 3 is count for counter 1. */ + +/* + +31 30 29 28 27 26 25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 +M 0--------------------------------------------------------------0 Event-- IE U S K EXL + +M 31 If this bit is one, another pair of Performance Control +and Counter registers is implemented at a MTC0 + +Event 8:5 Counter event enabled for this counter. Possible events +are listed in Table 6-30. R/W Undefined + +IE 4 Counter Interrupt Enable. This bit masks bit 31 of the +associated count register from the interrupt exception +request output. R/W 0 + +U 3 Count in User Mode. When this bit is set, the specified +event is counted in User Mode. R/W Undefined + +S 2 Count in Supervisor Mode. When this bit is set, the +specified event is counted in Supervisor Mode. R/W Undefined + +K 1 Count in Kernel Mode. When this bit is set, count the +event in Kernel Mode when EXL and ERL both are 0. R/W Undefined + +EXL 0 Count when EXL. When this bit is set, count the event +when EXL = 1 and ERL = 0. 
R/W Undefined +*/ + +static inline void pfm_arch_resend_irq(struct pfm_context *ctx) +{} + +static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +static inline void pfm_arch_serialize(void) +{} + + +/* + * MIPS does not save the PMDs during pfm_arch_intr_freeze_pmu(), thus + * this routine needs to do it when switching sets on overflow + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_save_pmds(ctx, set); +} + +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && (ctx->flags.started == 0)) + return; + + switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case 0: + write_c0_perfctrl0(value); + break; + case 1: + write_c0_perfctrl1(value); + break; + case 2: + write_c0_perfctrl2(value); + break; + case 3: + write_c0_perfctrl3(value); + break; + default: + BUG(); + } +} + +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + value &= pfm_pmu_conf->ovfl_mask; + + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case 0: + write_c0_perfcntr0(value); + break; + case 1: + write_c0_perfcntr1(value); + break; + case 2: + write_c0_perfcntr2(value); + break; + case 3: + write_c0_perfcntr3(value); + break; + default: + BUG(); + } +} + +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + switch (pfm_pmu_conf->pmd_desc[cnum].hw_addr) { + case 0: + return read_c0_perfcntr0(); + break; + case 1: + return read_c0_perfcntr1(); + break; + case 2: + return read_c0_perfcntr2(); + break; + case 3: + return read_c0_perfcntr3(); + break; + default: + BUG(); + return 0; + } +} + +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + switch (pfm_pmu_conf->pmc_desc[cnum].hw_addr) { + case 0: + return read_c0_perfctrl0(); + break; + case 1: + return read_c0_perfctrl1(); + break; + case 2: + return read_c0_perfctrl2(); + break; + case 3: + return read_c0_perfctrl3(); + break; + default: + BUG(); + return 0; + } +} + +/* + * For some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, + unsigned int cnum) +{ + u64 val; + val = pfm_arch_read_pmd(ctx, cnum); + /* This masks out overflow bit 31 */ + pfm_arch_write_pmd(ctx, cnum, val); +} + +/* + * At certain points, perfmon needs to know if monitoring has been + * explicitely started/stopped by user via pfm_start/pfm_stop. The + * information is tracked in ctx.flags.started. However on certain + * architectures, it may be possible to start/stop directly from + * user level with a single assembly instruction bypassing + * the kernel. This function must be used to determine by + * an arch-specific mean if monitoring is actually started/stopped. 
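+ * On MIPS the generic flags.started state is taken as authoritative.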
+ */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx) +{} + +static inline void pfm_arch_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx) +{} +int pfm_arch_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx); + +int pfm_arch_is_monitoring_active(struct pfm_context *ctx); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +char *pfm_arch_get_pmu_module_name(void); + +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_stop(current, ctx); + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_switch_sets_from_intr() on + * pfm_arch_restore_pmcs() + */ + ctx->flags.started = 0; +} + +/* + * unfreeze PMU from pfm_do_interrupt_handler() + * ctx may be NULL for spurious + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + if (!ctx) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On MIPS, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. + */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * on MIPS masking/unmasking uses the start/stop mechanism, so we simply + * need to start here. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx); +} + +static inline int pfm_arch_context_create(struct pfm_context *ctx, + u32 ctx_flags) +{ + return 0; +} + +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{} + + + + + +/* + * function called from pfm_setfl_sane(). Context is locked + * and interrupts are masked. + * The value of flags is the value of ctx_flags as passed by + * user. + * + * function must check arch-specific set flags. + * Return: + * 1 when flags are valid + * 0 on error + */ +static inline int +pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +static inline int pfm_arch_init(void) +{ + return 0; +} + +static inline void pfm_arch_init_percpu(void) +{} + +static inline int pfm_arch_load_context(struct pfm_context *ctx) +{ + return 0; +} + +static inline void pfm_arch_unload_context(struct pfm_context *ctx) +{} + +static inline int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds) +{ + return 0; +} + +static inline void pfm_arch_pmu_release(void) +{} + +#ifdef CONFIG_PERFMON_FLUSH +/* + * due to cache aliasing problem on MIPS, it is necessary to flush + * pages out of the cache when they are modified. 
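+ * the helper below walks the region page by page and flushes every
+ * page overlapping [addr, addr+len).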
+ */ +static inline void pfm_cacheflush(void *addr, unsigned int len) +{ + unsigned long start, end; + + start = (unsigned long)addr & PAGE_MASK; + end = ((unsigned long)addr + len + PAGE_SIZE - 1) & PAGE_MASK; + + while (start < end) { + flush_data_cache_page(start); + start += PAGE_SIZE; + } +} +#else +static inline void pfm_cacheflush(void *addr, unsigned int len) +{} +#endif + +static inline void pfm_arch_arm_handle_work(struct task_struct *task) +{} + +static inline void pfm_arch_disarm_handle_work(struct task_struct *task) +{} + +static inline int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg) +{ + return 0; +} + +static inline int pfm_arch_get_base_syscall(void) +{ + if (test_thread_flag(TIF_32BIT_ADDR)) { + if (test_thread_flag(TIF_32BIT_REGS)) + return __NR_O32_Linux+330; + return __NR_N32_Linux+293; + } + return __NR_64_Linux+289; +} + +struct pfm_arch_context { + /* empty */ +}; + +#define PFM_ARCH_CTX_SIZE sizeof(struct pfm_arch_context) +/* + * MIPS may need extra alignment requirements for the sampling buffer + */ +#ifdef CONFIG_PERFMON_SMPL_ALIGN +#define PFM_ARCH_SMPL_ALIGN_SIZE 0x4000 +#else +#define PFM_ARCH_SMPL_ALIGN_SIZE 0 +#endif + +#endif /* CONFIG_PERFMON */ + +#endif /* __KERNEL__ */ +#endif /* _ASM_MIPS64_PERFMON_KERN_H_ */ diff --git a/include/asm-mips/system.h b/include/asm-mips/system.h index a944eda..470cdfc 100644 --- a/include/asm-mips/system.h +++ b/include/asm-mips/system.h @@ -67,6 +67,10 @@ do { \ __mips_mt_fpaff_switch_to(prev); \ if (cpu_has_dsp) \ __save_dsp(prev); \ + if (test_tsk_thread_flag(prev, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw_out(prev, next); \ + if (test_tsk_thread_flag(next, TIF_PERFMON_CTXSW)) \ + pfm_ctxsw_in(prev, next); \ (last) = resume(prev, next, task_thread_info(next)); \ } while (0) diff --git a/include/asm-mips/thread_info.h b/include/asm-mips/thread_info.h index bb30606..34fd6aa 100644 --- a/include/asm-mips/thread_info.h +++ b/include/asm-mips/thread_info.h @@ -114,6 +114,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define TIF_NEED_RESCHED 2 /* rescheduling necessary */ #define TIF_SYSCALL_AUDIT 3 /* syscall auditing active */ #define TIF_SECCOMP 4 /* secure computing */ +#define TIF_PERFMON_WORK 5 /* work for pfm_handle_work() */ #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */ #define TIF_USEDFPU 16 /* FPU was used by this task this quantum (SMP) */ #define TIF_POLLING_NRFLAG 17 /* true if poll_idle() is polling TIF_NEED_RESCHED */ @@ -124,6 +125,7 @@ register struct thread_info *__current_thread_info __asm__("$28"); #define TIF_32BIT_REGS 22 /* also implies 16/32 fprs */ #define TIF_32BIT_ADDR 23 /* 32-bit address space (o32/n32) */ #define TIF_FPUBOUND 24 /* thread bound to FPU-full CPU set */ +#define TIF_PERFMON_CTXSW 25 /* perfmon needs ctxsw calls */ #define TIF_SYSCALL_TRACE 31 /* syscall trace active */ #define _TIF_SYSCALL_TRACE (1< + * + * This file contains i386/x86_64 specific definitions for the perfmon + * interface. + * + * This file MUST never be included directly. Use linux/perfmon.h. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_X86_PERFMON__H_ +#define _ASM_X86_PERFMON__H_ + +/* + * arch-specific user visible interface definitions + */ + +#define PFM_ARCH_MAX_PMCS (256+64) /* 256 HW 64 SW */ +#define PFM_ARCH_MAX_PMDS (256+64) /* 256 HW 64 SW */ + +#endif /* _ASM_X86_PERFMON_H_ */ diff --git a/include/asm-x86/perfmon_kern.h b/include/asm-x86/perfmon_kern.h new file mode 100644 index 0000000..0e5d3a5 --- /dev/null +++ b/include/asm-x86/perfmon_kern.h @@ -0,0 +1,548 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Copyright (c) 2007 Advanced Micro Devices, Inc. + * Contributed by Robert Richter + * + * This file contains X86 Processor Family specific definitions + * for the perfmon interface. This covers P6, Pentium M, P4/Xeon + * (32-bit and 64-bit, i.e., EM64T) and AMD X86-64. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef _ASM_X86_PERFMON_KERN_H_ +#define _ASM_X86_PERFMON_KERN_H_ + +#ifdef CONFIG_PERFMON +#include +#ifdef CONFIG_4KSTACKS +#define PFM_ARCH_PMD_STK_ARG 2 +#define PFM_ARCH_PMC_STK_ARG 2 +#else +#define PFM_ARCH_PMD_STK_ARG 4 /* about 700 bytes of stack space */ +#define PFM_ARCH_PMC_STK_ARG 4 /* about 200 bytes of stack space */ +#endif + +struct pfm_arch_pmu_info { + u32 flags; /* PMU feature flags */ + /* + * mandatory model-specific callbacks + */ + int (*stop_save)(struct pfm_context *ctx, struct pfm_event_set *set); + int (*has_ovfls)(struct pfm_context *ctx); + void (*quiesce)(void); + + /* + * optional model-specific callbacks + */ + void (*acquire_pmu_percpu)(void); + void (*release_pmu_percpu)(void); + int (*create_context)(struct pfm_context *ctx, u32 ctx_flags); + void (*free_context)(struct pfm_context *ctx); + int (*load_context)(struct pfm_context *ctx); + void (*unload_context)(struct pfm_context *ctx); + void (*write_pmc)(struct pfm_context *ctx, unsigned int cnum, u64 value); + void (*write_pmd)(struct pfm_context *ctx, unsigned int cnum, u64 value); + u64 (*read_pmd)(struct pfm_context *ctx, unsigned int cnum); + u64 (*read_pmc)(struct pfm_context *ctx, unsigned int cnum); + void (*nmi_copy_state)(struct pfm_context *ctx); + void (*restore_pmcs)(struct pfm_context *ctx, + struct pfm_event_set *set); + void (*restore_pmds)(struct pfm_context *ctx, + struct pfm_event_set *set); +}; + +/* + * PMU feature flags + */ +#define PFM_X86_FL_USE_NMI 0x01 /* user asking for NMI */ +#define PFM_X86_FL_NO_SHARING 0x02 /* no sharing with other subsystems */ +#define PFM_X86_FL_SHARING 0x04 /* PMU is being shared */ + +struct pfm_x86_ctx_flags { + unsigned int insecure:1; /* rdpmc per-thread self-monitoring */ + unsigned int use_pebs:1; /* PEBS used */ + unsigned int 
use_ds:1; /* DS used */ + unsigned int reserved:29; /* for future use */ +}; + +struct pfm_arch_context { + u64 saved_real_iip; /* instr pointer of last NMI intr */ + struct pfm_x86_ctx_flags flags; /* flags */ + void *ds_area; /* address of DS area (to go away) */ + void *data; /* model-specific data */ +}; + +/* + * functions implemented as inline on x86 + */ + +/** + * pfm_arch_write_pmc - write a single PMC register + * @ctx: context to work on + * @cnum: PMC index + * @value: PMC 64-bit value + * + * in certain situations, ctx may be NULL + */ +static inline void pfm_arch_write_pmc(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_info(); + + /* + * we only write to the actual register when monitoring is + * active (pfm_start was issued) + */ + if (ctx && ctx->flags.started == 0) + return; + + /* + * model-specific override, if any + */ + if (pmu_info->write_pmc) { + pmu_info->write_pmc(ctx, cnum, value); + return; + } + + PFM_DBG_ovfl("pfm_arch_write_pmc(0x%lx, 0x%Lx)", + pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) value); + + wrmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, value); +} + +/** + * pfm_arch_write_pmd - write a single PMD register + * @ctx: context to work on + * @cnum: PMD index + * @value: PMD 64-bit value + */ +static inline void pfm_arch_write_pmd(struct pfm_context *ctx, + unsigned int cnum, u64 value) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_info(); + + /* + * to make sure the counter overflows, we set the + * upper bits. we also clear any other unimplemented + * bits as this may cause crash on some processors. + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_C64) + value = (value | ~pfm_pmu_conf->ovfl_mask) + & ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk; + + PFM_DBG_ovfl("pfm_arch_write_pmd(0x%lx, 0x%Lx)", + pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) value); + + /* + * model-specific override, if any + */ + if (pmu_info->write_pmd) { + pmu_info->write_pmd(ctx, cnum, value); + return; + } + + wrmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, value); +} + +/** + * pfm_arch_read_pmd - read a single PMD register + * @ctx: context to work on + * @cnum: PMD index + * + * return value is register 64-bit value + */ +static inline u64 pfm_arch_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *pmu_info; + u64 tmp; + + pmu_info = pfm_pmu_info(); + + /* + * model-specific override, if any + */ + if (pmu_info->read_pmd) + tmp = pmu_info->read_pmd(ctx, cnum); + else + rdmsrl(pfm_pmu_conf->pmd_desc[cnum].hw_addr, tmp); + + PFM_DBG_ovfl("pfm_arch_read_pmd(0x%lx) = 0x%Lx", + pfm_pmu_conf->pmd_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +/** + * pfm_arch_read_pmc - read a single PMC register + * @ctx: context to work on + * @cnum: PMC index + * + * return value is register 64-bit value + */ +static inline u64 pfm_arch_read_pmc(struct pfm_context *ctx, unsigned int cnum) +{ + struct pfm_arch_pmu_info *pmu_info; + u64 tmp; + + pmu_info = pfm_pmu_info(); + + /* + * model-specific override, if any + */ + if (pmu_info->read_pmc) + tmp = pmu_info->read_pmc(ctx, cnum); + else + rdmsrl(pfm_pmu_conf->pmc_desc[cnum].hw_addr, tmp); + + PFM_DBG_ovfl("pfm_arch_read_pmc(0x%lx) = 0x%016Lx", + pfm_pmu_conf->pmc_desc[cnum].hw_addr, + (unsigned long long) tmp); + return tmp; +} + +/** + * pfm_arch_is_active - return non-zero is monitoring has been started + * @ctx: context to check + * + * At certain points, perfmon needs 
to know if monitoring has been + * explicitly started. + * + * On x86, there is not other way but to use pfm_start/pfm_stop + * to activate monitoring, thus we can simply check flags.started + */ +static inline int pfm_arch_is_active(struct pfm_context *ctx) +{ + return ctx->flags.started; +} + + +/** + * pfm_arch_unload_context - detach context from thread or CPU + * @ctx: context to detach + * + * in system-wide ctx->task is NULL, otherwise it points to the + * attached thread + */ +static inline void pfm_arch_unload_context(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_arch_context *ctx_arch; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + if (ctx_arch->flags.insecure) { + PFM_DBG("clear cr4.pce"); + clear_in_cr4(X86_CR4_PCE); + } + + if (pmu_info->unload_context) + pmu_info->unload_context(ctx); +} + +/** + * pfm_arch_load_context - attach context to thread or CPU + * @ctx: context to attach + */ +static inline int pfm_arch_load_context(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + struct pfm_arch_context *ctx_arch; + int ret = 0; + + ctx_arch = pfm_ctx_arch(ctx); + pmu_info = pfm_pmu_info(); + + /* + * RDPMC authorized in system-wide and + * per-thread self-monitoring. + * + * RDPMC only gives access to counts. + * + * The context-switch routine code does not restore + * all the PMD registers (optimization), thus there + * is a possible leak of counts there in per-thread + * mode. + */ + if (ctx->task == current || ctx->flags.system) { + PFM_DBG("set cr4.pce"); + set_in_cr4(X86_CR4_PCE); + ctx_arch->flags.insecure = 1; + } + + if (pmu_info->load_context) + ret = pmu_info->load_context(ctx); + + return ret; +} + +void pfm_arch_restore_pmcs(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_arch_start(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_stop(struct task_struct *task, struct pfm_context *ctx); + +/** + * pfm_arch_unmask_monitoring - unmask monitoring + * @ctx: context to mask + * @set: current event set + * + * masking is slightly different from stopping in that, it does not undo + * the pfm_start() issued by user. This is used in conjunction with + * sampling. Masking means stop monitoring, but do not authorize user + * to issue pfm_start/stop during that time. Unmasking is achieved via + * pfm_restart() and also may also depend on the sampling format used. + * + * on x86 masking/unmasking use the start/stop mechanism, except + * that flags.started is not modified. + */ +static inline void pfm_arch_unmask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + pfm_arch_start(current, ctx); +} + +/** + * pfm_arch_intr_freeze_pmu - stop monitoring when handling PMU interrupt + * @ctx: current context + * @set: current event set + * + * called from __pfm_interrupt_handler(). + * ctx is not NULL. ctx is locked. interrupts are masked + * + * The following actions must take place: + * - stop all monitoring to ensure handler has consistent view. + * - collect overflowed PMDs bitmask into povfls_pmds and + * npend_ovfls. If no interrupt detected then npend_ovfls + * must be set to zero. 
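+ * the PMD collection itself is implemented by the model-specific
+ * stop_save() callbacks (see pfm_p6_stop_save and pfm_p4_stop_save).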
+ */ +static inline void pfm_arch_intr_freeze_pmu(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + /* + * on X86, freezing is equivalent to stopping + */ + pfm_arch_stop(current, ctx); + + /* + * we mark monitoring as stopped to avoid + * certain side effects especially in + * pfm_switch_sets_from_intr() and + * pfm_arch_restore_pmcs() + */ + ctx->flags.started = 0; +} + +/** + * pfm_arch_intr_unfreeze_pmu - conditionally reactive monitoring + * @ctx: current context + * + * current context may be not when dealing when spurious interrupts + * + * Must re-activate monitoring if context is not MASKED. + * interrupts are masked. + */ +static inline void pfm_arch_intr_unfreeze_pmu(struct pfm_context *ctx) +{ + if (ctx == NULL) + return; + + PFM_DBG_ovfl("state=%d", ctx->state); + + /* + * restore flags.started which is cleared in + * pfm_arch_intr_freeze_pmu() + */ + ctx->flags.started = 1; + + if (ctx->state == PFM_CTX_MASKED) + return; + + pfm_arch_restore_pmcs(ctx, ctx->active_set); +} + +/** + * pfm_arch_setfl_sane - check arch/model specific event set flags + * @ctx: context to work on + * @flags: event set flags as passed by user + * + * called from pfm_setfl_sane(). Context is locked. Interrupts are masked. + * + * Return: + * 0 when flags are valid + * 1 on error + */ +static inline int pfm_arch_setfl_sane(struct pfm_context *ctx, u32 flags) +{ + return 0; +} + +/** + * pfm_arch_ovfl_reset_pmd - reset pmd on overflow + * @ctx: current context + * @cnum: PMD index + * + * On some CPUs, the upper bits of a counter must be set in order for the + * overflow interrupt to happen. On overflow, the counter has wrapped around, + * and the upper bits are cleared. This function may be used to set them back. + * + * For x86, the current version loses whatever is remaining in the counter, + * which is usually has a small count. In order not to loose this count, + * we do a read-modify-write to set the upper bits while preserving the + * low-order bits. This is slow but works. + */ +static inline void pfm_arch_ovfl_reset_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + u64 val; + val = pfm_arch_read_pmd(ctx, cnum); + pfm_arch_write_pmd(ctx, cnum, val); +} + +/** + * pfm_arch_context_create - create context + * @ctx: newly created context + * @flags: context flags as passed by user + * + * called from __pfm_create_context() + */ +static inline int pfm_arch_context_create(struct pfm_context *ctx, u32 ctx_flags) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_info(); + + if (pmu_info->create_context) + return pmu_info->create_context(ctx, ctx_flags); + + return 0; +} + +/** + * pfm_arch_context_free - free context + * @ctx: context to free + */ +static inline void pfm_arch_context_free(struct pfm_context *ctx) +{ + struct pfm_arch_pmu_info *pmu_info; + + pmu_info = pfm_pmu_info(); + + if (pmu_info->free_context) + pmu_info->free_context(ctx); +} + +/* + * pfm_arch_clear_pmd_ovfl_cond - alter the pmds in such a way that they + * will not cause cause interrupts when unused. 
+ * + * This is a nop on x86 + */ +static inline void pfm_arch_clear_pmd_ovfl_cond(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + +/* + * functions implemented in arch/x86/perfmon/perfmon.c + */ +int pfm_arch_init(void); +void pfm_arch_resend_irq(struct pfm_context *ctx); + +int pfm_arch_ctxswout_thread(struct task_struct *task, struct pfm_context *ctx); +void pfm_arch_ctxswin_thread(struct task_struct *task, struct pfm_context *ctx); + +void pfm_arch_restore_pmds(struct pfm_context *ctx, struct pfm_event_set *set); +int pfm_arch_pmu_config_init(struct pfm_pmu_config *cfg); +void pfm_arch_pmu_config_remove(void); +char *pfm_arch_get_pmu_module_name(void); +int pfm_arch_pmu_acquire(u64 *unavail_pmcs, u64 *unavail_pmds); +void pfm_arch_pmu_release(void); + +/* + * pfm_arch_serialize - make PMU modifications visible to subsequent instructions + * + * This is a nop on x86 + */ +static inline void pfm_arch_serialize(void) +{} + +/* + * on x86, the PMDs are already saved by pfm_arch_freeze_pmu() + * when entering the PMU interrupt handler, thus, we do not need + * to save them again in pfm_switch_sets_from_intr() + */ +static inline void pfm_arch_save_pmds_from_intr(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + + +static inline void pfm_arch_ctxswout_sys(struct task_struct *task, + struct pfm_context *ctx) +{} + +static inline void pfm_arch_ctxswin_sys(struct task_struct *task, + struct pfm_context *ctx) +{} + +static inline void pfm_arch_init_percpu(void) +{} + +static inline void pfm_cacheflush(void *addr, unsigned int len) +{} + +/* + * this function is called from the PMU interrupt handler ONLY. + * On x86, the PMU is frozen via arch_stop, masking would be implemented + * via arch-stop as well. Given that the PMU is already stopped when + * entering the interrupt handler, we do not need to stop it again, so + * this function is a nop. + */ +static inline void pfm_arch_mask_monitoring(struct pfm_context *ctx, + struct pfm_event_set *set) +{} + + +static inline void pfm_arch_arm_handle_work(struct task_struct *task) +{} + +static inline void pfm_arch_disarm_handle_work(struct task_struct *task) +{} + +static inline int pfm_arch_get_base_syscall(void) +{ +#ifdef __x86_64__ + /* 32-bit syscall definition coming from ia32_unistd.h */ + if (test_thread_flag(TIF_IA32)) + return __NR_ia32_pfm_create_context; +#endif + return __NR_pfm_create_context; +} + +#define PFM_ARCH_CTX_SIZE (sizeof(struct pfm_arch_context)) +/* + * x86 does not need extra alignment requirements for the sampling buffer + */ +#define PFM_ARCH_SMPL_ALIGN_SIZE 0 + +asmlinkage void pmu_interrupt(void); + +#endif /* CONFIG_PEFMON */ + +#endif /* _ASM_X86_PERFMON_KERN_H_ */ diff --git a/include/asm-x86/perfmon_pebs_core_smpl.h b/include/asm-x86/perfmon_pebs_core_smpl.h new file mode 100644 index 0000000..4a12e0d --- /dev/null +++ b/include/asm-x86/perfmon_pebs_core_smpl.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2005-2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + * + * This file implements the sampling format to support Intel + * Precise Event Based Sampling (PEBS) feature of Intel Core + * processors, such as Intel Core 2. + * + * What is PEBS? + * ------------ + * This is a hardware feature to enhance sampling by providing + * better precision as to where a sample is taken. This avoids the + * typical skew in the instruction one can observe with any + * interrupt-based sampling technique. + * + * PEBS also lowers sampling overhead significantly by having the + * processor store samples instead of the OS. PMU interrupt are only + * generated after multiple samples are written. + * + * Another benefit of PEBS is that samples can be captured inside + * critical sections where interrupts are masked. + * + * How does it work? + * PEBS effectively implements a Hw buffer. The Os must pass a region + * of memory where samples are to be stored. The region can have any + * size. The OS must also specify the sampling period to reload. The PMU + * will interrupt when it reaches the end of the buffer or a specified + * threshold location inside the memory region. + * + * The description of the buffer is stored in the Data Save Area (DS). + * The samples are stored sequentially in the buffer. The format of the + * buffer is fixed and specified in the PEBS documentation. The sample + * format does not change between 32-bit and 64-bit modes unlike on the + * Pentium 4 version of PEBS. + * + * PEBS does not work when HyperThreading is enabled due to certain MSR + * being shared being to two threads. + * + * What does the format do? + * It provides access to the PEBS feature for both 32-bit and 64-bit + * processors that support it. + * + * The same code and data structures are used for both 32-bit and 64-bi + * modes. A single format name is used for both modes. In 32-bit mode, + * some of the extended registers are written to zero in each sample. + * + * It is important to realize that the format provides a zero-copy + * environment for the samples, i.e,, the OS never touches the + * samples. Whatever the processor write is directly accessible to + * the user. + * + * Parameters to the buffer can be passed via pfm_create_context() in + * the pfm_pebs_smpl_arg structure. + */ +#ifndef __PERFMON_PEBS_CORE_SMPL_H__ +#define __PERFMON_PEBS_CORE_SMPL_H__ 1 + +/* + * The 32-bit and 64-bit formats are identical, thus we use only + * one name for the format. + */ +#define PFM_PEBS_CORE_SMPL_NAME "pebs_core" + +/* + * format specific parameters (passed at context creation) + * + * intr_thres: index from start of buffer of entry where the + * PMU interrupt must be triggered. It must be several samples + * short of the end of the buffer. + */ +struct pfm_pebs_core_smpl_arg { + u64 cnt_reset; /* counter reset value */ + size_t buf_size; /* size of the PEBS buffer in bytes */ + size_t intr_thres;/* index of PEBS interrupt threshold entry */ + u64 reserved[6]; /* for future use */ +}; + +/* + * Data Save Area (32 and 64-bit mode) + * + * The DS area is exposed to the user. To determine the number + * of samples available in PEBS, it is necessary to substract + * pebs_index from pebs_base. + * + * Layout of the structure is mandated by hardware and specified + * in the Intel documentation. 
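+ * For example, the number of valid samples in the buffer can be
+ * derived as:
+ *   nsamples = (ds.pebs_index - ds.pebs_buf_base)
+ *              / sizeof(struct pfm_pebs_core_smpl_entry);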
+ */ +struct pfm_ds_area_core { + u64 bts_buf_base; + u64 bts_index; + u64 bts_abs_max; + u64 bts_intr_thres; + u64 pebs_buf_base; + u64 pebs_index; + u64 pebs_abs_max; + u64 pebs_intr_thres; + u64 pebs_cnt_reset; +}; + +/* + * This header is at the beginning of the sampling buffer returned to the user. + * + * Because of PEBS alignement constraints, the actual PEBS buffer area does + * not necessarily begin right after the header. The hdr_start_offs must be + * used to compute the first byte of the buffer. The offset is defined as + * the number of bytes between the end of the header and the beginning of + * the buffer. As such the formula is: + * actual_buffer = (unsigned long)(hdr+1)+hdr->hdr_start_offs + */ +struct pfm_pebs_core_smpl_hdr { + u64 overflows; /* #overflows for buffer */ + size_t buf_size; /* bytes in the buffer */ + size_t start_offs; /* actual buffer start offset */ + u32 version; /* smpl format version */ + u32 reserved1; /* for future use */ + u64 reserved2[5]; /* for future use */ + struct pfm_ds_area_core ds; /* data save area */ +}; + +/* + * Sample format as mandated by Intel documentation. + * The same format is used in both 32 and 64 bit modes. + */ +struct pfm_pebs_core_smpl_entry { + u64 eflags; + u64 ip; + u64 eax; + u64 ebx; + u64 ecx; + u64 edx; + u64 esi; + u64 edi; + u64 ebp; + u64 esp; + u64 r8; /* 0 in 32-bit mode */ + u64 r9; /* 0 in 32-bit mode */ + u64 r10; /* 0 in 32-bit mode */ + u64 r11; /* 0 in 32-bit mode */ + u64 r12; /* 0 in 32-bit mode */ + u64 r13; /* 0 in 32-bit mode */ + u64 r14; /* 0 in 32-bit mode */ + u64 r15; /* 0 in 32-bit mode */ +}; + +#define PFM_PEBS_CORE_SMPL_VERSION_MAJ 1U +#define PFM_PEBS_CORE_SMPL_VERSION_MIN 0U +#define PFM_PEBS_CORE_SMPL_VERSION (((PFM_PEBS_CORE_SMPL_VERSION_MAJ&0xffff)<<16)|\ + (PFM_PEBS_CORE_SMPL_VERSION_MIN & 0xffff)) + +#endif /* __PERFMON_PEBS_CORE_SMPL_H__ */ diff --git a/include/asm-x86/perfmon_pebs_p4_smpl.h b/include/asm-x86/perfmon_pebs_p4_smpl.h new file mode 100644 index 0000000..26b51b4 --- /dev/null +++ b/include/asm-x86/perfmon_pebs_p4_smpl.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + * + * This file implements the sampling format to support Intel + * Precise Event Based Sampling (PEBS) feature of Pentium 4 + * and other Netburst-based processors. Not to be used for + * Intel Core-based processors. + * + * What is PEBS? + * ------------ + * This is a hardware feature to enhance sampling by providing + * better precision as to where a sample is taken. This avoids the + * typical skew in the instruction one can observe with any + * interrupt-based sampling technique. + * + * PEBS also lowers sampling overhead significantly by having the + * processor store samples instead of the OS. PMU interrupt are only + * generated after multiple samples are written. 
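+ * The point at which that interrupt fires is controlled by the
+ * intr_thres parameter passed at context creation (see
+ * struct pfm_pebs_p4_smpl_arg below).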
+ * + * Another benefit of PEBS is that samples can be captured inside + * critical sections where interrupts are masked. + * + * How does it work? + * PEBS effectively implements a Hw buffer. The Os must pass a region + * of memory where samples are to be stored. The region can have any + * size. The OS must also specify the sampling period to reload. The PMU + * will interrupt when it reaches the end of the buffer or a specified + * threshold location inside the memory region. + * + * The description of the buffer is stored in the Data Save Area (DS). + * The samples are stored sequentially in the buffer. The format of the + * buffer is fixed and specified in the PEBS documentation. The sample + * format changes between 32-bit and 64-bit modes due to extended register + * file. + * + * PEBS does not work when HyperThreading is enabled due to certain MSR + * being shared being to two threads. + * + * What does the format do? + * It provides access to the PEBS feature for both 32-bit and 64-bit + * processors that support it. + * + * The same code is used for both 32-bit and 64-bit modes, but different + * format names are used because the two modes are not compatible due to + * data model and register file differences. Similarly the public data + * structures describing the samples are different. + * + * It is important to realize that the format provides a zero-copy environment + * for the samples, i.e,, the OS never touches the samples. Whatever the + * processor write is directly accessible to the user. + * + * Parameters to the buffer can be passed via pfm_create_context() in + * the pfm_pebs_smpl_arg structure. + * + * It is not possible to mix a 32-bit PEBS application on top of a 64-bit + * host kernel. + */ +#ifndef __PERFMON_PEBS_P4_SMPL_H__ +#define __PERFMON_PEBS_P4_SMPL_H__ 1 + +#ifdef __i386__ +/* + * The 32-bit and 64-bit formats are not compatible, thus we have + * two different identifications so that 32-bit programs running on + * 64-bit OS will fail to use the 64-bit PEBS support. + */ +#define PFM_PEBS_P4_SMPL_NAME "pebs32_p4" +#else +#define PFM_PEBS_P4_SMPL_NAME "pebs64_p4" +#endif + +/* + * format specific parameters (passed at context creation) + * + * intr_thres: index from start of buffer of entry where the + * PMU interrupt must be triggered. It must be several samples + * short of the end of the buffer. + */ +struct pfm_pebs_p4_smpl_arg { + u64 cnt_reset; /* counter reset value */ + size_t buf_size; /* size of the PEBS buffer in bytes */ + size_t intr_thres;/* index of PEBS interrupt threshold entry */ + u64 reserved[6]; /* for future use */ +}; + +/* + * Data Save Area (32 and 64-bit mode) + * + * The DS area must be exposed to the user because this is the only + * way to report on the number of valid entries recorded by the CPU. + * This is required when the buffer is not full, i..e, there was not + * PMU interrupt. + * + * Layout of the structure is mandated by hardware and specified in + * the Intel documentation. + */ +struct pfm_ds_area_p4 { + unsigned long bts_buf_base; + unsigned long bts_index; + unsigned long bts_abs_max; + unsigned long bts_intr_thres; + unsigned long pebs_buf_base; + unsigned long pebs_index; + unsigned long pebs_abs_max; + unsigned long pebs_intr_thres; + u64 pebs_cnt_reset; +}; + +/* + * This header is at the beginning of the sampling buffer returned to the user. + * + * Because of PEBS alignement constraints, the actual PEBS buffer area does + * not necessarily begin right after the header. 
The hdr_start_offs must be + * used to compute the first byte of the buffer. The offset is defined as + * the number of bytes between the end of the header and the beginning of + * the buffer. As such the formula is: + * actual_buffer = (unsigned long)(hdr+1)+hdr->hdr_start_offs + */ +struct pfm_pebs_p4_smpl_hdr { + u64 overflows; /* #overflows for buffer */ + size_t buf_size; /* bytes in the buffer */ + size_t start_offs; /* actual buffer start offset */ + u32 version; /* smpl format version */ + u32 reserved1; /* for future use */ + u64 reserved2[5]; /* for future use */ + struct pfm_ds_area_p4 ds; /* data save area */ +}; + +/* + * 64-bit PEBS record format is described in + * http://www.intel.com/technology/64bitextensions/30083502.pdf + * + * The format does not peek at samples. The sample structure is only + * used to ensure that the buffer is large enough to accomodate one + * sample. + */ +#ifdef __i386__ +struct pfm_pebs_p4_smpl_entry { + u32 eflags; + u32 ip; + u32 eax; + u32 ebx; + u32 ecx; + u32 edx; + u32 esi; + u32 edi; + u32 ebp; + u32 esp; +}; +#else +struct pfm_pebs_p4_smpl_entry { + u64 eflags; + u64 ip; + u64 eax; + u64 ebx; + u64 ecx; + u64 edx; + u64 esi; + u64 edi; + u64 ebp; + u64 esp; + u64 r8; + u64 r9; + u64 r10; + u64 r11; + u64 r12; + u64 r13; + u64 r14; + u64 r15; +}; +#endif + +#define PFM_PEBS_P4_SMPL_VERSION_MAJ 1U +#define PFM_PEBS_P4_SMPL_VERSION_MIN 0U +#define PFM_PEBS_P4_SMPL_VERSION (((PFM_PEBS_P4_SMPL_VERSION_MAJ&0xffff)<<16)|\ + (PFM_PEBS_P4_SMPL_VERSION_MIN & 0xffff)) + +#endif /* __PERFMON_PEBS_P4_SMPL_H__ */ diff --git a/include/asm-x86/thread_info.h b/include/asm-x86/thread_info.h index da0a675..b3a6ae9 100644 --- a/include/asm-x86/thread_info.h +++ b/include/asm-x86/thread_info.h @@ -71,6 +71,7 @@ struct thread_info { * Warning: layout of LSW is hardcoded in entry.S */ #define TIF_SYSCALL_TRACE 0 /* syscall trace active */ +#define TIF_PERFMON_WORK 1 /* work for pfm_handle_work() */ #define TIF_SIGPENDING 2 /* signal pending */ #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ #define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ @@ -91,6 +92,7 @@ struct thread_info { #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ #define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */ +#define TIF_PERFMON_CTXSW 28 /* perfmon needs ctxsw calls */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) @@ -112,6 +114,8 @@ struct thread_info { #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) #define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS) +#define _TIF_PERFMON_WORK (1< + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef __LINUX_PERFMON_H__ +#define __LINUX_PERFMON_H__ + +/* + * This file contains all the user visible generic definitions for the + * interface. Model-specific user-visible definitions are located in + * the asm/perfmon.h file. + */ + +/* + * include arch-specific user interface definitions + */ +#include + +/* + * defined by each arch + */ +#define PFM_MAX_PMCS PFM_ARCH_MAX_PMCS +#define PFM_MAX_PMDS PFM_ARCH_MAX_PMDS + +/* + * number of elements for each type of bitvector + * all bitvectors use u64 fixed size type on all architectures. + */ +#define PFM_BVSIZE(x) (((x)+(sizeof(__u64)<<3)-1) / (sizeof(__u64)<<3)) +#define PFM_PMD_BV PFM_BVSIZE(PFM_MAX_PMDS) +#define PFM_PMC_BV PFM_BVSIZE(PFM_MAX_PMCS) + +/* + * register flags layout: + * bit[00-15] : generic flags + * bit[16-31] : arch-specific flags + * + * PFM_REGFL_NO_EMUL64: must be set on the PMC controlling the PMD + */ +#define PFM_REGFL_OVFL_NOTIFY 0x1 /* PMD: send notification on event */ +#define PFM_REGFL_RANDOM 0x2 /* PMD: randomize value after event */ +#define PFM_REGFL_NO_EMUL64 0x4 /* PMC: no 64-bit emulation */ + +/* + * event set flags layout: + * bits[00-15] : generic flags + * bits[16-31] : arch-specific flags (see asm/perfmon.h) + */ +#define PFM_SETFL_OVFL_SWITCH 0x01 /* enable switch on overflow */ +#define PFM_SETFL_TIME_SWITCH 0x02 /* enable switch on timeout */ + +/* + * argument to pfm_create_context() system call + * structure shared with user level + */ +struct pfarg_ctx { + __u32 ctx_flags; /* noblock/block/syswide */ + __u32 ctx_reserved1; /* for future use */ + __u64 ctx_reserved2[7]; /* for future use */ +}; + +/* + * context flags layout: + * bits[00-15]: generic flags + * bits[16-31]: arch-specific flags (see perfmon_const.h) + */ +#define PFM_FL_NOTIFY_BLOCK 0x01 /* block task on user notifications */ +#define PFM_FL_SYSTEM_WIDE 0x02 /* create a system wide context */ +#define PFM_FL_OVFL_NO_MSG 0x80 /* no overflow msgs */ + +/* + * argument to pfm_write_pmcs() system call. + * structure shared with user level + */ +struct pfarg_pmc { + __u16 reg_num; /* which register */ + __u16 reg_set; /* event set for this register */ + __u32 reg_flags; /* REGFL flags */ + __u64 reg_value; /* pmc value */ + __u64 reg_reserved2[4]; /* for future use */ +}; + +/* + * argument to pfm_write_pmds() and pfm_read_pmds() system calls. + * structure shared with user level + */ +struct pfarg_pmd { + __u16 reg_num; /* which register */ + __u16 reg_set; /* event set for this register */ + __u32 reg_flags; /* REGFL flags */ + __u64 reg_value; /* initial pmc/pmd value */ + __u64 reg_long_reset; /* value to reload after notification */ + __u64 reg_short_reset; /* reset after counter overflow */ + __u64 reg_last_reset_val; /* return: PMD last reset value */ + __u64 reg_ovfl_switch_cnt; /* #overflows before switch */ + __u64 reg_reset_pmds[PFM_PMD_BV]; /* reset on overflow */ + __u64 reg_smpl_pmds[PFM_PMD_BV]; /* record in sample */ + __u64 reg_smpl_eventid; /* opaque event identifier */ + __u64 reg_random_mask; /* bitmask used to limit random value */ + __u32 reg_random_seed; /* seed for randomization (OBSOLETE) */ + __u32 reg_reserved2[7]; /* for future use */ +}; + +/* + * optional argument to pfm_start() system call. Pass NULL if not needed. 
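+ * When provided, start_set designates the event set to activate first;
+ * when NULL is passed, the last active set is used (set0 on the very
+ * first activation). See __pfm_start() in perfmon/perfmon_activate.c.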
+ * structure shared with user level + */ +struct pfarg_start { + __u16 start_set; /* event set to start with */ + __u16 start_reserved1; /* for future use */ + __u32 start_reserved2; /* for future use */ + __u64 reserved3[3]; /* for future use */ +}; + +/* + * argument to pfm_load_context() system call. + * structure shared with user level + */ +struct pfarg_load { + __u32 load_pid; /* thread or CPU to attach to */ + __u16 load_set; /* set to load first */ + __u16 load_reserved1; /* for future use */ + __u64 load_reserved2[3]; /* for future use */ +}; + +/* + * argument to pfm_create_evtsets() and pfm_delete_evtsets() system calls. + * structure shared with user level. + */ +struct pfarg_setdesc { + __u16 set_id; /* which set */ + __u16 set_reserved1; /* for future use */ + __u32 set_flags; /* SETFL flags */ + __u64 set_timeout; /* switch timeout in nsecs */ + __u64 reserved[6]; /* for future use */ +}; + +/* + * argument to pfm_getinfo_evtsets() system call. + * structure shared with user level + */ +struct pfarg_setinfo { + __u16 set_id; /* which set */ + __u16 set_reserved1; /* for future use */ + __u32 set_flags; /* out: SETFL flags */ + __u64 set_ovfl_pmds[PFM_PMD_BV]; /* out: last ovfl PMDs */ + __u64 set_runs; /* out: #times the set was active */ + __u64 set_timeout; /* out: eff/leftover timeout (nsecs) */ + __u64 set_act_duration; /* out: time set was active in nsecs */ + __u64 set_avail_pmcs[PFM_PMC_BV];/* out: available PMCs */ + __u64 set_avail_pmds[PFM_PMD_BV];/* out: available PMDs */ + __u64 set_reserved3[6]; /* for future use */ +}; + +/* + * default value for the user and group security parameters in + * /proc/sys/kernel/perfmon/sys_group + * /proc/sys/kernel/perfmon/task_group + */ +#define PFM_GROUP_PERM_ANY -1 /* any user/group */ + +/* + * overflow notification message. + * structure shared with user level + */ +struct pfarg_ovfl_msg { + __u32 msg_type; /* message type: PFM_MSG_OVFL */ + __u32 msg_ovfl_pid; /* process id */ + __u16 msg_active_set; /* active set at overflow */ + __u16 msg_ovfl_cpu; /* cpu of PMU interrupt */ + __u32 msg_ovfl_tid; /* thread id */ + __u64 msg_ovfl_ip; /* IP on PMU intr */ + __u64 msg_ovfl_pmds[PFM_PMD_BV];/* overflowed PMDs */ +}; + +#define PFM_MSG_OVFL 1 /* an overflow happened */ +#define PFM_MSG_END 2 /* task to which context was attached ended */ + +/* + * generic notification message (union). + * union shared with user level + */ +union pfarg_msg { + __u32 type; + struct pfarg_ovfl_msg pfm_ovfl_msg; +}; + +/* + * perfmon version number + */ +#define PFM_VERSION_MAJ 2U +#define PFM_VERSION_MIN 82U +#define PFM_VERSION (((PFM_VERSION_MAJ&0xffff)<<16)|\ + (PFM_VERSION_MIN & 0xffff)) +#define PFM_VERSION_MAJOR(x) (((x)>>16) & 0xffff) +#define PFM_VERSION_MINOR(x) ((x) & 0xffff) + +#endif /* __LINUX_PERFMON_H__ */ diff --git a/include/linux/perfmon_dfl_smpl.h b/include/linux/perfmon_dfl_smpl.h new file mode 100644 index 0000000..e0817a8 --- /dev/null +++ b/include/linux/perfmon_dfl_smpl.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the new dfl sampling buffer format + * for perfmon2 subsystem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_DFL_SMPL_H__ +#define __PERFMON_DFL_SMPL_H__ 1 + +/* + * format specific parameters (passed at context creation) + */ +struct pfm_dfl_smpl_arg { + __u64 buf_size; /* size of the buffer in bytes */ + __u32 buf_flags; /* buffer specific flags */ + __u32 reserved1; /* for future use */ + __u64 reserved[6]; /* for future use */ +}; + +/* + * This header is at the beginning of the sampling buffer returned to the user. + * It is directly followed by the first record. + */ +struct pfm_dfl_smpl_hdr { + __u64 hdr_count; /* how many valid entries */ + __u64 hdr_cur_offs; /* current offset from top of buffer */ + __u64 hdr_overflows; /* #overflows for buffer */ + __u64 hdr_buf_size; /* bytes in the buffer */ + __u64 hdr_min_buf_space;/* minimal buffer size (internal use) */ + __u32 hdr_version; /* smpl format version */ + __u32 hdr_buf_flags; /* copy of buf_flags */ + __u64 hdr_reserved[10]; /* for future use */ +}; + +/* + * Entry header in the sampling buffer. The header is directly followed + * with the values of the PMD registers of interest saved in increasing + * index order: PMD4, PMD5, and so on. How many PMDs are present depends + * on how the session was programmed. + * + * In the case where multiple counters overflow at the same time, multiple + * entries are written consecutively. + * + * last_reset_value member indicates the initial value of the overflowed PMD. + */ +struct pfm_dfl_smpl_entry { + __u32 pid; /* thread id (for NPTL, this is gettid()) */ + __u16 ovfl_pmd; /* index of overflowed PMD for this sample */ + __u16 reserved; /* for future use */ + __u64 last_reset_val; /* initial value of overflowed PMD */ + __u64 ip; /* where did the overflow intr happened */ + __u64 tstamp; /* overflow timetamp */ + __u16 cpu; /* cpu on which the overfow occurred */ + __u16 set; /* event set active when overflow ocurred */ + __u32 tgid; /* thread group id (getpid() for NPTL) */ +}; + +#define PFM_DFL_SMPL_VERSION_MAJ 1U +#define PFM_DFL_SMPL_VERSION_MIN 0U +#define PFM_DFL_SMPL_VERSION (((PFM_DFL_SMPL_VERSION_MAJ&0xffff)<<16)|\ + (PFM_DFL_SMPL_VERSION_MIN & 0xffff)) + +#endif /* __PERFMON_DFL_SMPL_H__ */ diff --git a/include/linux/perfmon_fmt.h b/include/linux/perfmon_fmt.h new file mode 100644 index 0000000..82a6a90 --- /dev/null +++ b/include/linux/perfmon_fmt.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Interface for custom sampling buffer format modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_FMT_H__ +#define __PERFMON_FMT_H__ 1 + +#include + +typedef int (*fmt_validate_t)(u32 flags, u16 npmds, void *arg); +typedef int (*fmt_getsize_t)(u32 flags, void *arg, size_t *size); +typedef int (*fmt_init_t)(struct pfm_context *ctx, void *buf, u32 flags, + u16 nmpds, void *arg); +typedef int (*fmt_restart_t)(int is_active, u32 *ovfl_ctrl, void *buf); +typedef int (*fmt_exit_t)(void *buf); +typedef int (*fmt_handler_t)(struct pfm_context *ctx, + unsigned long ip, u64 stamp, void *data); + +struct pfm_smpl_fmt { + char *fmt_name; /* name of the format (required) */ + size_t fmt_arg_size; /* size of fmt args for ctx create */ + u32 fmt_flags; /* format specific flags */ + u32 fmt_version; /* format version number */ + + fmt_validate_t fmt_validate; /* validate context flags */ + fmt_getsize_t fmt_getsize; /* get size for sampling buffer */ + fmt_init_t fmt_init; /* initialize buffer area */ + fmt_handler_t fmt_handler; /* overflow handler (required) */ + fmt_restart_t fmt_restart; /* restart after notification */ + fmt_exit_t fmt_exit; /* context termination */ + + struct list_head fmt_list; /* internal use only */ + + struct kobject kobj; /* sysfs internal use only */ + struct module *owner; /* pointer to module owner */ + u32 fmt_qdepth; /* Max notify queue depth (required) */ +}; +#define to_smpl_fmt(n) container_of(n, struct pfm_smpl_fmt, kobj) + +#define PFM_FMTFL_IS_BUILTIN 0x1 /* fmt is compiled in */ +/* + * we need to know whether the format is builtin or compiled + * as a module + */ +#ifdef MODULE +#define PFM_FMT_BUILTIN_FLAG 0 /* not built as a module */ +#else +#define PFM_FMT_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* built as a module */ +#endif + +int pfm_fmt_register(struct pfm_smpl_fmt *fmt); +int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt); +void pfm_sysfs_builtin_fmt_add(void); + +int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt); +void pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt); + +#endif /* __PERFMON_FMT_H__ */ diff --git a/include/linux/perfmon_kern.h b/include/linux/perfmon_kern.h new file mode 100644 index 0000000..6c3b527 --- /dev/null +++ b/include/linux/perfmon_kern.h @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef __LINUX_PERFMON_KERN_H__ +#define __LINUX_PERFMON_KERN_H__ +/* + * This file contains all the definitions of data structures, variables, macros + * that are to be shared between generic code and arch-specific code + * + * For generic only definitions, use perfmon/perfmon_priv.h + */ +#ifdef CONFIG_PERFMON + +#include +#include +#include + +/* + * system adminstrator configuration controls available via + * the /sys/kerne/perfmon interface + */ +struct pfm_controls { + u32 debug; /* debugging control bitmask */ + gid_t sys_group; /* gid to create a syswide context */ + gid_t task_group; /* gid to create a per-task context */ + u32 flags; /* control flags (see below) */ + size_t arg_mem_max; /* maximum vector argument size */ + size_t smpl_buffer_mem_max; /* max buf mem, -1 for infinity */ +}; +extern struct pfm_controls pfm_controls; + +/* + * control flags + */ +#define PFM_CTRL_FL_RW_EXPERT 0x1 /* bypass reserved fields on read/write */ + +/* + * software PMD + */ +struct pfm_pmd { + u64 value; /* 64-bit value */ + u64 lval; /* last reset value */ + u64 ovflsw_thres; /* #ovfls left before switch */ + u64 long_reset; /* long reset value on overflow */ + u64 short_reset; /* short reset value on overflow */ + u64 reset_pmds[PFM_PMD_BV]; /* pmds to reset on overflow */ + u64 smpl_pmds[PFM_PMD_BV]; /* pmds to record on overflow */ + u64 mask; /* range mask for random value */ + u64 ovflsw_ref_thres; /* #ovfls before next set */ + u64 eventid; /* opaque event identifier */ + u32 flags; /* notify/do not notify */ +}; + +/* + * event_set: encapsulates the full PMU state + */ +struct pfm_event_set { + struct list_head list; /* ordered chain of sets */ + u16 id; /* set identification */ + u16 nused_pmds; /* max number of used PMDs */ + u16 nused_pmcs; /* max number of used PMCs */ + u16 pad1; /* paddding */ + u32 flags; /* public flags */ + u32 priv_flags; /* private flags (see below) */ + u64 runs; /* # of activations */ + u32 npend_ovfls; /* number of pending PMD overflow */ + u32 pad2; /* padding */ + u64 used_pmds[PFM_PMD_BV]; /* used PMDs */ + u64 povfl_pmds[PFM_PMD_BV]; /* pending overflowed PMDs */ + u64 ovfl_pmds[PFM_PMD_BV]; /* last overflowed PMDs */ + u64 reset_pmds[PFM_PMD_BV]; /* PMDs to reset after overflow */ + u64 ovfl_notify[PFM_PMD_BV]; /* notify on overflow */ + u64 used_pmcs[PFM_PMC_BV]; /* used PMCs */ + u64 pmcs[PFM_MAX_PMCS]; /* PMC values */ + + struct pfm_pmd pmds[PFM_MAX_PMDS]; + + ktime_t hrtimer_exp; /* switch timeout reference */ + ktime_t hrtimer_rem; /* per-thread remainder timeout */ + + u64 duration_start; /* start time in ns */ + u64 duration; /* total active ns */ +}; + +/* + * common private event set flags (priv_flags) + * + * upper 16 bits: for arch-specific use + * lower 16 bits: for common use + */ +#define PFM_SETFL_PRIV_MOD_PMDS 0x1 /* PMD register(s) modified */ +#define PFM_SETFL_PRIV_MOD_PMCS 0x2 /* PMC register(s) modified */ +#define PFM_SETFL_PRIV_SWITCH 0x4 /* must switch set on restart */ +#define PFM_SETFL_PRIV_MOD_BOTH (PFM_SETFL_PRIV_MOD_PMDS \ + | PFM_SETFL_PRIV_MOD_PMCS) + +/* + * context flags + */ +struct pfm_context_flags { + unsigned int block:1; /* task blocks on user notifications */ + unsigned int system:1; /* do system wide monitoring */ + unsigned int no_msg:1; /* no message sent on overflow */ + unsigned 
int switch_ovfl:1; /* switch set on counter ovfl */ + unsigned int switch_time:1; /* switch set on timeout */ + unsigned int started:1; /* pfm_start() issued */ + unsigned int work_type:2; /* type of work for pfm_handle_work */ + unsigned int mmap_nlock:1; /* no lock in pfm_release_buf_space */ + unsigned int ia64_v20_compat:1; /* context is IA-64 v2.0 mode */ + unsigned int can_restart:8; /* allowed to issue a PFM_RESTART */ + unsigned int reset_count:8; /* number of pending resets */ + unsigned int is_self:1; /* per-thread and self-montoring */ + unsigned int reserved:5; /* for future use */ +}; + +/* + * values for work_type (TIF_PERFMON_WORK must be set) + */ +#define PFM_WORK_NONE 0 /* nothing to do */ +#define PFM_WORK_RESET 1 /* reset overflowed counters */ +#define PFM_WORK_BLOCK 2 /* block current thread */ +#define PFM_WORK_ZOMBIE 3 /* cleanup zombie context */ + +/* + * overflow description argument passed to sampling format + */ +struct pfm_ovfl_arg { + u16 ovfl_pmd; /* index of overflowed PMD */ + u16 active_set; /* set active at the time of the overflow */ + u32 ovfl_ctrl; /* control flags */ + u64 pmd_last_reset; /* last reset value of overflowed PMD */ + u64 smpl_pmds_values[PFM_MAX_PMDS]; /* values of other PMDs */ + u64 pmd_eventid; /* eventid associated with PMD */ + u16 num_smpl_pmds; /* number of PMDS in smpl_pmd_values */ +}; +/* + * depth of message queue + * + * Depth cannot be bigger than 255 (see reset_count) + */ +#define PFM_MSGS_ORDER 3 /* log2(number of messages) */ +#define PFM_MSGS_COUNT (1</proc/sys/kernel/printk_ratelimit + * + * debug is a bitmask where bits are defined as follows: + * bit 0: enable non-interrupt code degbug messages + * bit 1: enable interrupt code debug messages + */ +#ifdef CONFIG_PERFMON_DEBUG +#define _PFM_DBG(lm, f, x...) \ + do { \ + if (unlikely((pfm_controls.debug & lm) && printk_ratelimit())) { \ + preempt_disable(); \ + printk("perfmon: %s.%d: CPU%d [%d]: " f "\n", \ + __func__, __LINE__, \ + smp_processor_id(), current->pid , ## x); \ + preempt_enable(); \ + } \ + } while (0) + +#define PFM_DBG(f, x...) _PFM_DBG(0x1, f, ##x) +#define PFM_DBG_ovfl(f, x...) _PFM_DBG(0x2, f, ## x) +#else +#define PFM_DBG(f, x...) do {} while (0) +#define PFM_DBG_ovfl(f, x...) 
do {} while (0) +#endif + +extern struct pfm_pmu_config *pfm_pmu_conf; +extern int perfmon_disabled; + +static inline struct pfm_arch_context *pfm_ctx_arch(struct pfm_context *c) +{ + return (struct pfm_arch_context *)(c+1); +} + +int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr, + void **req, void **to_free); + +int pfm_get_smpl_arg(char __user *fmt_uname, void __user *uaddr, size_t usize, + void **arg, struct pfm_smpl_fmt **fmt); + +int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, + int count); +int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count, + int compat); +int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count); + +int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *req, + struct task_struct *task); +int __pfm_unload_context(struct pfm_context *ctx, int *can_release); + +int __pfm_stop(struct pfm_context *ctx, int *release_info); +int __pfm_restart(struct pfm_context *ctx, int *unblock); +int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start); + +void pfm_free_context(struct pfm_context *ctx); + +void pfm_smpl_buf_space_release(struct pfm_context *ctx, size_t size); + +int pfm_check_task_state(struct pfm_context *ctx, int check_mask, + unsigned long *flags, void **resume); +/* + * check_mask bitmask values for pfm_check_task_state() + */ +#define PFM_CMD_STOPPED 0x01 /* command needs thread stopped */ +#define PFM_CMD_UNLOADED 0x02 /* command needs ctx unloaded */ +#define PFM_CMD_UNLOAD 0x04 /* command is unload */ + +int __pfm_create_context(struct pfarg_ctx *req, + struct pfm_smpl_fmt *fmt, + void *fmt_arg, + int mode, + struct pfm_context **new_ctx); + +struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, + int alloc); + +int pfm_pmu_conf_get(int autoload); +void pfm_pmu_conf_put(void); + +int pfm_session_allcpus_acquire(void); +void pfm_session_allcpus_release(void); + +int pfm_smpl_buf_alloc(struct pfm_context *ctx, size_t rsize); +void pfm_smpl_buf_free(struct pfm_context *ctx); + +struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name); +void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt); + +void pfm_interrupt_handler(unsigned long iip, struct pt_regs *regs); + +void pfm_resume_task(struct task_struct *t, void *data); + +#include +#include + +extern const struct file_operations pfm_file_ops; +/* + * upper limit for count in calls that take vector arguments. This is used + * to prevent for multiplication overflow when we compute actual storage size + */ +#define PFM_MAX_ARG_COUNT(m) (INT_MAX/sizeof(*(m))) + +#define cast_ulp(_x) ((unsigned long *)_x) + +#define PFM_NORMAL 0 +#define PFM_COMPAT 1 + +void __pfm_exit_thread(void); +void pfm_ctxsw_in(struct task_struct *prev, struct task_struct *next); +void pfm_ctxsw_out(struct task_struct *prev, struct task_struct *next); +void pfm_handle_work(struct pt_regs *regs); +void __pfm_init_percpu(void *dummy); +void pfm_save_pmds(struct pfm_context *ctx, struct pfm_event_set *set); + +static inline void pfm_exit_thread(void) +{ + if (current->pfm_context) + __pfm_exit_thread(); +} + +/* + * include arch-specific kernel level definitions + */ +#include + +static inline void pfm_copy_thread(struct task_struct *task) +{ + /* + * context or perfmon TIF state is NEVER inherited + * in child task. 
Holds for per-thread and system-wide + */ + task->pfm_context = NULL; + clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + clear_tsk_thread_flag(task, TIF_PERFMON_WORK); + pfm_arch_disarm_handle_work(task); +} + + +/* + * read a single PMD register. + * + * virtual PMD registers have special handler. + * Depends on definitions in asm/perfmon_kern.h + */ +static inline u64 pfm_read_pmd(struct pfm_context *ctx, unsigned int cnum) +{ + if (unlikely(pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V)) + return pfm_pmu_conf->pmd_sread(ctx, cnum); + + return pfm_arch_read_pmd(ctx, cnum); +} +/* + * write a single PMD register. + * + * virtual PMD registers have special handler. + * Depends on definitions in asm/perfmon_kern.h + */ +static inline void pfm_write_pmd(struct pfm_context *ctx, unsigned int cnum, + u64 value) +{ + /* + * PMD writes are ignored for read-only registers + */ + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_RO) + return; + + if (pfm_pmu_conf->pmd_desc[cnum].type & PFM_REG_V) { + pfm_pmu_conf->pmd_swrite(ctx, cnum, value); + return; + } + /* + * clear unimplemented bits + */ + value &= ~pfm_pmu_conf->pmd_desc[cnum].rsvd_msk; + + pfm_arch_write_pmd(ctx, cnum, value); +} + +void __pfm_init_percpu(void *dummy); + +static inline void pfm_init_percpu(void) +{ + __pfm_init_percpu(NULL); +} + +/* + * pfm statistics are available via debugfs + * and perfmon subdir. + * + * When adding/removing new stats, make sure you also + * update the name table in perfmon_debugfs.c + */ +enum pfm_stats_names { + PFM_ST_ovfl_intr_all_count = 0, + PFM_ST_ovfl_intr_ns, + PFM_ST_ovfl_intr_spurious_count, + PFM_ST_ovfl_intr_replay_count, + PFM_ST_ovfl_intr_regular_count, + PFM_ST_handle_work_count, + PFM_ST_ovfl_notify_count, + PFM_ST_reset_pmds_count, + PFM_ST_pfm_restart_count, + PFM_ST_fmt_handler_calls, + PFM_ST_fmt_handler_ns, + PFM_ST_set_switch_count, + PFM_ST_set_switch_ns, + PFM_ST_set_switch_exp, + PFM_ST_ctxswin_count, + PFM_ST_ctxswin_ns, + PFM_ST_handle_timeout_count, + PFM_ST_ovfl_intr_nmi_count, + PFM_ST_ctxswout_count, + PFM_ST_ctxswout_ns, + PFM_ST_LAST /* last entry marked */ +}; +#define PFM_NUM_STATS PFM_ST_LAST + +struct pfm_stats { + u64 v[PFM_NUM_STATS]; + struct dentry *dirs[PFM_NUM_STATS]; + struct dentry *cpu_dir; + char cpu_name[8]; +}; + +#ifdef CONFIG_PERFMON_DEBUG_FS +#define pfm_stats_get(x) __get_cpu_var(pfm_stats).v[PFM_ST_##x] +#define pfm_stats_inc(x) __get_cpu_var(pfm_stats).v[PFM_ST_##x]++ +#define pfm_stats_add(x, y) __get_cpu_var(pfm_stats).v[PFM_ST_##x] += (y) +void pfm_reset_stats(int cpu); +#else +#define pfm_stats_get(x) +#define pfm_stats_inc(x) +#define pfm_stats_add(x, y) +static inline void pfm_reset_stats(int cpu) +{} +#endif + + + +DECLARE_PER_CPU(struct pfm_context *, pmu_ctx); +DECLARE_PER_CPU(struct pfm_stats, pfm_stats); +DECLARE_PER_CPU(struct task_struct *, pmu_owner); + +void pfm_cpu_disable(void); + + +/* + * max vector argument elements for local storage (no kmalloc/kfree) + * The PFM_ARCH_PM*_ARG should be defined in perfmon_kern.h. 
+ * If not, default (conservative) values are used + */ +#ifndef PFM_ARCH_PMC_STK_ARG +#define PFM_ARCH_PMC_STK_ARG 1 +#endif + +#ifndef PFM_ARCH_PMD_STK_ARG +#define PFM_ARCH_PMD_STK_ARG 1 +#endif + +#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG +#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG + +#else /* !CONFIG_PERFMON */ + + +/* + * perfmon hooks are nops when CONFIG_PERFMON is undefined + */ +static inline void pfm_cpu_disable(void) +{} + +static inline void pfm_exit_thread(void) +{} + +static inline void pfm_handle_work(struct pt_regs *regs) +{} + +static inline void pfm_copy_thread(struct task_struct *t) +{} + +static inline void pfm_ctxsw_in(struct task_struct *p, struct task_struct *n) +{} + +static inline void pfm_ctxsw_out(struct task_struct *p, struct task_struct *n) +{} + +static inline void pfm_session_allcpus_release(void) +{} + +static inline int pfm_session_allcpus_acquire(void) +{ + return 0; +} + +static inline void pfm_init_percpu(void) +{} + +#endif /* CONFIG_PERFMON */ + +#endif /* __LINUX_PERFMON_KERN_H__ */ diff --git a/include/linux/perfmon_pmu.h b/include/linux/perfmon_pmu.h new file mode 100644 index 0000000..3f5f9e8 --- /dev/null +++ b/include/linux/perfmon_pmu.h @@ -0,0 +1,192 @@ +/* + * Copyright (c) 2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * Interface for PMU description modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#ifndef __PERFMON_PMU_H__ +#define __PERFMON_PMU_H__ 1 + +/* + * generic information about a PMC or PMD register + * + * Dependency bitmasks: + * They are used to allow lazy save/restore in the context switch + * code. To avoid picking up stale configuration from a previous + * thread. Usng the bitmask, the generic read/write routines can + * ensure that all registers needed to support the measurement are + * restored properly on context switch in. 
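+ *
+ * For example, a counter PMD whose event is selected by PMC0 would set
+ * bit 0 in its dep_pmcs bitmask (see the PMD_DP() helper below), telling
+ * the generic code that PMC0 must also be restored whenever that PMD is
+ * in use.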
+ */ +struct pfm_regmap_desc { + u16 type; /* role of the register */ + u16 reserved1; /* for future use */ + u32 reserved2; /* for future use */ + u64 dfl_val; /* power-on default value (quiescent) */ + u64 rsvd_msk; /* reserved bits: 1 means reserved */ + u64 no_emul64_msk; /* bits to clear for PFM_REGFL_NO_EMUL64 */ + unsigned long hw_addr; /* HW register address or index */ + struct kobject kobj; /* for internal use only */ + char *desc; /* HW register description string */ + u64 dep_pmcs[PFM_PMC_BV];/* depending PMC registers */ +}; +#define to_reg(n) container_of(n, struct pfm_regmap_desc, kobj) + +/* + * pfm_reg_desc helper macros + */ +#define PMC_D(t, d, v, r, n, h) \ + { .type = t, \ + .desc = d, \ + .dfl_val = v, \ + .rsvd_msk = r, \ + .no_emul64_msk = n, \ + .hw_addr = h \ + } + +#define PMD_D(t, d, h) \ + { .type = t, \ + .desc = d, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .hw_addr = h \ + } + +#define PMD_DR(t, d, h, r) \ + { .type = t, \ + .desc = d, \ + .rsvd_msk = r, \ + .no_emul64_msk = 0, \ + .hw_addr = h \ + } + +#define PMX_NA \ + { .type = PFM_REG_NA } + +#define PMD_DP(t, d, h, p) \ + { .type = t, \ + .desc = d, \ + .rsvd_msk = 0, \ + .no_emul64_msk = 0, \ + .dep_pmcs[0] = p, \ + .hw_addr = h \ + } + +/* + * type of a PMU register (16-bit bitmask) for use with pfm_reg_desc.type + */ +#define PFM_REG_NA 0x00 /* not avail. (not impl.,no access) must be 0 */ +#define PFM_REG_I 0x01 /* PMC/PMD: implemented */ +#define PFM_REG_WC 0x02 /* PMC: has write_checker */ +#define PFM_REG_C64 0x04 /* PMD: 64-bit virtualization */ +#define PFM_REG_RO 0x08 /* PMD: read-only (writes ignored) */ +#define PFM_REG_V 0x10 /* PMD: virtual reg */ +#define PFM_REG_INTR 0x20 /* PMD: register can generate interrupt */ +#define PFM_REG_SYS 0x40 /* PMC/PMD: register is for system-wide only */ +#define PFM_REG_THR 0x80 /* PMC/PMD: register is for per-thread only */ +#define PFM_REG_NO64 0x100 /* PMC: supports PFM_REGFL_NO_EMUL64 */ + +/* + * define some shortcuts for common types + */ +#define PFM_REG_W (PFM_REG_WC|PFM_REG_I) +#define PFM_REG_W64 (PFM_REG_WC|PFM_REG_NO64|PFM_REG_I) +#define PFM_REG_C (PFM_REG_C64|PFM_REG_INTR|PFM_REG_I) +#define PFM_REG_I64 (PFM_REG_NO64|PFM_REG_I) +#define PFM_REG_IRO (PFM_REG_I|PFM_REG_RO) + +typedef int (*pfm_pmc_check_t)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmc *req); + +typedef int (*pfm_pmd_check_t)(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_pmd *req); + + +typedef u64 (*pfm_sread_t)(struct pfm_context *ctx, unsigned int cnum); +typedef void (*pfm_swrite_t)(struct pfm_context *ctx, unsigned int cnum, u64 val); + +/* + * structure used by pmu description modules + * + * probe_pmu() routine return value: + * - 1 means recognized PMU + * - 0 means not recognized PMU + */ +struct pfm_pmu_config { + char *pmu_name; /* PMU family name */ + char *version; /* config module version */ + + int counter_width; /* width of hardware counter */ + + struct pfm_regmap_desc *pmc_desc; /* PMC register descriptions */ + struct pfm_regmap_desc *pmd_desc; /* PMD register descriptions */ + + pfm_pmc_check_t pmc_write_check;/* write checker (optional) */ + pfm_pmd_check_t pmd_write_check;/* write checker (optional) */ + pfm_pmd_check_t pmd_read_check; /* read checker (optional) */ + + pfm_sread_t pmd_sread; /* virtual pmd read */ + pfm_swrite_t pmd_swrite; /* virtual pmd write */ + + int (*probe_pmu)(void);/* probe PMU routine */ + + u16 num_pmc_entries;/* #entries in pmc_desc */ + u16 num_pmd_entries;/* #entries in 
pmd_desc */ + + void *pmu_info; /* model-specific infos */ + u32 flags; /* set of flags */ + + struct module *owner; /* pointer to module struct */ + + /* + * fields computed internally, do not set in module + */ + struct pfm_regdesc regs_all; /* regs available to all */ + struct pfm_regdesc regs_thr; /* regs avail per-thread */ + struct pfm_regdesc regs_sys; /* regs avail system-wide */ + + u64 ovfl_mask; /* overflow mask */ +}; + +static inline void *pfm_pmu_info(void) +{ + return pfm_pmu_conf->pmu_info; +} + +/* + * pfm_pmu_config flags + */ +#define PFM_PMUFL_IS_BUILTIN 0x1 /* pmu config is compiled in */ + +/* + * we need to know whether the PMU description is builtin or compiled + * as a module + */ +#ifdef MODULE +#define PFM_PMU_BUILTIN_FLAG 0 /* not built as a module */ +#else +#define PFM_PMU_BUILTIN_FLAG PFM_PMUFL_IS_BUILTIN /* built as a module */ +#endif + +int pfm_pmu_register(struct pfm_pmu_config *cfg); +void pfm_pmu_unregister(struct pfm_pmu_config *cfg); + +int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu); +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu); + +#endif /* __PERFMON_PMU_H__ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3d9120c..8fb3b55 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,6 +96,7 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; +struct pfm_context; /* * List of flags we want to share for kernel threads, @@ -1301,6 +1302,9 @@ struct task_struct { int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; #endif +#ifdef CONFIG_PERFMON + struct pfm_context *pfm_context; +#endif }; /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index d6ff145..e308523 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -29,6 +29,13 @@ struct msqid_ds; struct new_utsname; struct nfsctl_arg; struct __old_kernel_stat; +struct pfarg_ctx; +struct pfarg_pmc; +struct pfarg_pmd; +struct pfarg_start; +struct pfarg_load; +struct pfarg_setinfo; +struct pfarg_setdesc; struct pollfd; struct rlimit; struct rusage; @@ -625,4 +632,27 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq, + void __user *uarg, size_t smpl_size); +asmlinkage long sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, + int count); +asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, + int count); +asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, + int count); +asmlinkage long sys_pfm_restart(int fd); +asmlinkage long sys_pfm_stop(int fd); +asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq); +asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq); +asmlinkage long sys_pfm_unload_context(int fd); +asmlinkage long sys_pfm_delete_evtsets(int fd, + struct pfarg_setinfo __user *ureq, + int count); +asmlinkage long sys_pfm_create_evtsets(int fd, + struct pfarg_setdesc __user *ureq, + int count); +asmlinkage long sys_pfm_getinfo_evtsets(int fd, + struct pfarg_setinfo __user *ureq, + int count); + #endif diff --git a/kernel/sched.c b/kernel/sched.c index ad1962d..1bc8fcf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 08d6e1b..61f4155 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ 
-126,6 +126,19 @@ cond_syscall(sys_vm86); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); +cond_syscall(sys_pfm_create_context); +cond_syscall(sys_pfm_write_pmcs); +cond_syscall(sys_pfm_write_pmds); +cond_syscall(sys_pfm_read_pmds); +cond_syscall(sys_pfm_restart); +cond_syscall(sys_pfm_start); +cond_syscall(sys_pfm_stop); +cond_syscall(sys_pfm_load_context); +cond_syscall(sys_pfm_unload_context); +cond_syscall(sys_pfm_create_evtsets); +cond_syscall(sys_pfm_delete_evtsets); +cond_syscall(sys_pfm_getinfo_evtsets); + /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); cond_syscall(sys_pciconfig_write); diff --git a/perfmon/Makefile b/perfmon/Makefile new file mode 100644 index 0000000..32ff037 --- /dev/null +++ b/perfmon/Makefile @@ -0,0 +1,12 @@ +# +# Copyright (c) 2005-2006 Hewlett-Packard Development Company, L.P. +# Contributed by Stephane Eranian +# +obj-y = perfmon_init.o perfmon_rw.o perfmon_res.o \ + perfmon_pmu.o perfmon_sysfs.o perfmon_syscalls.o \ + perfmon_file.o perfmon_ctxsw.o perfmon_intr.o \ + perfmon_dfl_smpl.o perfmon_sets.o perfmon_hotplug.o \ + perfmon_msg.o perfmon_smpl.o perfmon_attach.o \ + perfmon_activate.o perfmon_ctx.o perfmon_fmt.o + +obj-$(CONFIG_PERFMON_DEBUG_FS) += perfmon_debugfs.o diff --git a/perfmon/perfmon_activate.c b/perfmon/perfmon_activate.c new file mode 100644 index 0000000..d9f501d --- /dev/null +++ b/perfmon/perfmon_activate.c @@ -0,0 +1,265 @@ +/* + * perfmon_activate.c: perfmon2 start/stop functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include "perfmon_priv.h" + +/** + * __pfm_start - activate monitoring + * @ctx: context to operate on + * @start: pfarg_start as passed by user + * + * When operating in per-thread mode and not self-monitoring, the monitored + * thread must be stopped. Activation will be effective next time the thread + * is context switched in. + * + * The pfarg_start argument is optional and may be used to designate + * the initial event set to activate. When not provided, the last active + * set is used. For the first activation, set0 is used when start is NULL. 
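+ *
+ * For illustration only (prototypes simplified, see the sys_pfm_*()
+ * declarations in include/linux/syscalls.h), a self-monitoring thread
+ * typically reaches this point through a sequence such as:
+ *
+ *	fd = pfm_create_context(&ctx, NULL, 0);
+ *	pfm_write_pmcs(fd, pmcs, npmcs);
+ *	pfm_write_pmds(fd, pmds, npmds);
+ *	load.load_pid = gettid();
+ *	pfm_load_context(fd, &load);
+ *	pfm_start(fd, NULL);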
+ * + * On some architectures, e.g., IA-64, it may be possible to start monitoring + * without calling this function under certain conditions (per-thread and self + * monitoring). In this case, either set0 or the last active set is used. + * + * the context is locked and interrupts are disabled. + */ +int __pfm_start(struct pfm_context *ctx, struct pfarg_start *start) +{ + struct task_struct *task, *owner_task; + struct pfm_event_set *new_set, *old_set; + int is_self; + + task = ctx->task; + + /* + * UNLOADED: error + * LOADED : normal start, nop if started unless set is different + * MASKED : nop or change set when unmasking + * ZOMBIE : cannot happen + */ + if (ctx->state == PFM_CTX_UNLOADED) + return -EINVAL; + + old_set = new_set = ctx->active_set; + + /* + * always the case for system-wide + */ + if (task == NULL) + task = current; + + is_self = task == current; + + /* + * argument is provided? + */ + if (start) { + /* + * find the set to load first + */ + new_set = pfm_find_set(ctx, start->start_set, 0); + if (new_set == NULL) { + PFM_DBG("event set%u does not exist", + start->start_set); + return -EINVAL; + } + } + + PFM_DBG("cur_set=%u req_set=%u", old_set->id, new_set->id); + + /* + * if we need to change the active set we need + * to check if we can access the PMU + */ + if (new_set != old_set) { + + owner_task = __get_cpu_var(pmu_owner); + /* + * system-wide: must run on the right CPU + * per-thread : must be the owner of the PMU context + * + * pfm_switch_sets() returns with monitoring stopped + */ + if (is_self) { + pfm_switch_sets(ctx, new_set, PFM_PMD_RESET_LONG, 1); + } else { + /* + * In a UP kernel, the PMU may contain the state + * of the task we want to operate on, yet the task + * may be switched out (lazy save). We need to save + * current state (old_set), switch active_set and + * mark it for reload. + */ + if (owner_task == task) + pfm_save_pmds(ctx, old_set); + ctx->active_set = new_set; + new_set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; + } + } + + /* + * mark as started + * must be done before calling pfm_arch_start() + */ + ctx->flags.started = 1; + + pfm_arch_start(task, ctx); + + /* + * we check whether we had a pending ovfl before restarting. + * If so we need to regenerate the interrupt to make sure we + * keep recorded samples. For non-self monitoring this check + * is done in the pfm_ctxswin_thread() routine. + * + * we check new_set/old_set because pfm_switch_sets() already + * takes care of replaying the pending interrupts + */ + if (is_self && new_set != old_set && new_set->npend_ovfls) { + pfm_arch_resend_irq(ctx); + pfm_stats_inc(ovfl_intr_replay_count); + } + + /* + * always start with full timeout + */ + new_set->hrtimer_rem = new_set->hrtimer_exp; + + /* + * activate timeout for system-wide, self-montoring + * Always start with full timeout + * Timeout is at least one tick away, so no risk of + * having hrtimer_start() trying to wakeup softirqd + * and thus causing troubles. This cannot happen anmyway + * because cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ + */ + if (is_self && new_set->flags & PFM_SETFL_TIME_SWITCH) { + hrtimer_start(&__get_cpu_var(pfm_hrtimer), + new_set->hrtimer_rem, + HRTIMER_MODE_REL); + + PFM_DBG("set%u started timeout=%lld", + new_set->id, + (unsigned long long)new_set->hrtimer_rem.tv64); + } + + /* + * we restart total duration even if context was + * already started. In that case, counts are simply + * reset. + * + * For per-thread, if not self-monitoring, the statement + * below will have no effect because thread is stopped. 
+ * The field is reset of ctxsw in. + */ + new_set->duration_start = sched_clock(); + + return 0; +} + +/** + * __pfm_stop - stop monitoring + * @ctx: context to operate on + * @release_info: infos for caller (see below) + * + * When operating in per-thread* mode and when not self-monitoring, + * the monitored thread must be stopped. + * + * the context is locked and interrupts are disabled. + * + * release_info value upon return: + * - bit 0 : unused + * - bit 1 : when set, must cancel hrtimer + */ +int __pfm_stop(struct pfm_context *ctx, int *release_info) +{ + struct pfm_event_set *set; + struct task_struct *task; + u64 now; + int state; + + *release_info = 0; + + now = sched_clock(); + state = ctx->state; + set = ctx->active_set; + + /* + * context must be attached (zombie cannot happen) + */ + if (state == PFM_CTX_UNLOADED) + return -EINVAL; + + task = ctx->task; + + PFM_DBG("ctx_task=[%d] ctx_state=%d is_system=%d", + task ? task->pid : -1, + state, + !task); + + /* + * this happens for system-wide context + */ + if (task == NULL) + task = current; + + /* + * compute elapsed time + * + * unless masked, compute elapsed duration, stop timeout + */ + if (task == current && state == PFM_CTX_LOADED) { + /* + * timeout cancel must be deferred until context is + * unlocked to avoid race with pfm_handle_switch_timeout() + */ + if (set->flags & PFM_SETFL_TIME_SWITCH) + *release_info |= 0x2; + + set->duration += now - set->duration_start; + } + + pfm_arch_stop(task, ctx); + + ctx->flags.started = 0; + /* + * starting now, in-flight PMU interrupt for this context + * are treated as spurious + */ + return 0; +} diff --git a/perfmon/perfmon_attach.c b/perfmon/perfmon_attach.c new file mode 100644 index 0000000..bbd1d1e --- /dev/null +++ b/perfmon/perfmon_attach.c @@ -0,0 +1,474 @@ +/* + * perfmon_attach.c: perfmon2 load/unload functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include "perfmon_priv.h" + +/** + * __pfm_load_context_sys - attach context to a CPU in system-wide mode + * @ctx: context to operate on + * @set_id: set to activate first + * @cpu: CPU to monitor + * + * The cpu specified in the pfarg_load.load_pid argument must be the current + * CPU. 
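+ *
+ * In other words, for a system-wide session load_pid carries a CPU
+ * number rather than a thread id: a tool monitoring CPU2 would
+ * typically bind itself to CPU2 (e.g., with sched_setaffinity()) and
+ * then call pfm_load_context() with load_pid = 2.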
+ * + * The function must be called with the context locked and interrupts disabled. + */ +static int pfm_load_ctx_sys(struct pfm_context *ctx, u16 set_id, u32 cpu) +{ + struct pfm_event_set *set; + int mycpu; + int ret; + + mycpu = smp_processor_id(); + + /* + * system-wide: check we are running on the desired CPU + */ + if (cpu != mycpu) { + PFM_DBG("wrong CPU: asking %u but on %u", cpu, mycpu); + return -EINVAL; + } + + /* + * initialize sets + */ + set = pfm_prepare_sets(ctx, set_id); + if (!set) { + PFM_DBG("event set%u does not exist", set_id); + return -EINVAL; + } + + PFM_DBG("set=%u set_flags=0x%x", set->id, set->flags); + + ctx->cpu = mycpu; + ctx->task = NULL; + ctx->active_set = set; + + /* + * perform any architecture specific actions + */ + ret = pfm_arch_load_context(ctx); + if (ret) + goto error_noload; + + /* + * now reserve the session, before we can proceed with + * actually accessing the PMU hardware + */ + ret = pfm_session_acquire(1, mycpu); + if (ret) + goto error; + + + /* + * caller must be on monitored CPU to access PMU, thus this is + * a form of self-monitoring + */ + ctx->flags.is_self = 1; + + set->runs++; + + /* + * load PMD from set + * load PMC from set + */ + pfm_arch_restore_pmds(ctx, set); + pfm_arch_restore_pmcs(ctx, set); + + /* + * set new ownership + */ + pfm_set_pmu_owner(NULL, ctx); + + /* + * reset pending work + */ + ctx->flags.work_type = PFM_WORK_NONE; + ctx->flags.reset_count = 0; + + /* + * reset message queue + */ + ctx->msgq_head = ctx->msgq_tail = 0; + + ctx->state = PFM_CTX_LOADED; + + return 0; +error: + pfm_arch_unload_context(ctx); +error_noload: + return ret; +} + +/** + * __pfm_load_context_thread - attach context to a thread + * @ctx: context to operate on + * @set_id: first set + * @task: threadf to attach to + * + * The function must be called with the context locked and interrupts disabled. + */ +static int pfm_load_ctx_thread(struct pfm_context *ctx, u16 set_id, + struct task_struct *task) +{ + struct pfm_event_set *set; + struct pfm_context *old; + int ret; + + PFM_DBG("load_pid=%d set=%u", task->pid, set_id); + /* + * per-thread: + * - task to attach to is checked in sys_pfm_load_context() to avoid + * locking issues. if found, and not self, task refcount was + * incremented. 
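+ *
+ * The cmpxchg() below atomically publishes the context in the target
+ * task: if a context is already attached, or a concurrent attach wins
+ * the race, the exchange fails and we return -EEXIST without modifying
+ * the task.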
+ */ + old = cmpxchg(&task->pfm_context, NULL, ctx); + if (old) { + PFM_DBG("load_pid=%d has a context " + "old=%p new=%p cur=%p", + task->pid, + old, + ctx, + task->pfm_context); + return -EEXIST; + } + + /* + * initialize sets + */ + set = pfm_prepare_sets(ctx, set_id); + if (!set) { + PFM_DBG("event set%u does not exist", set_id); + return -EINVAL; + } + + + ctx->task = task; + ctx->cpu = -1; + ctx->active_set = set; + + /* + * perform any architecture specific actions + */ + ret = pfm_arch_load_context(ctx); + if (ret) + goto error_noload; + + /* + * now reserve the session, before we can proceed with + * actually accessing the PMU hardware + */ + ret = pfm_session_acquire(0, -1); + if (ret) + goto error; + + + set->runs++; + if (ctx->task != current) { + + ctx->flags.is_self = 0; + + /* force a full reload */ + ctx->last_act = PFM_INVALID_ACTIVATION; + ctx->last_cpu = -1; + set->priv_flags |= PFM_SETFL_PRIV_MOD_BOTH; + + } else { + pfm_check_save_prev_ctx(); + + ctx->last_cpu = smp_processor_id(); + __get_cpu_var(pmu_activation_number)++; + ctx->last_act = __get_cpu_var(pmu_activation_number); + + ctx->flags.is_self = 1; + + /* + * load PMD from set + * load PMC from set + */ + pfm_arch_restore_pmds(ctx, set); + pfm_arch_restore_pmcs(ctx, set); + + /* + * set new ownership + */ + pfm_set_pmu_owner(ctx->task, ctx); + } + set_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + + /* + * reset pending work + */ + ctx->flags.work_type = PFM_WORK_NONE; + ctx->flags.reset_count = 0; + + /* + * reset message queue + */ + ctx->msgq_head = ctx->msgq_tail = 0; + + ctx->state = PFM_CTX_LOADED; + + return 0; + +error: + pfm_arch_unload_context(ctx); + ctx->task = NULL; +error_noload: + /* + * detach context + */ + task->pfm_context = NULL; + return ret; +} + +/** + * __pfm_load_context - attach context to a CPU or thread + * @ctx: context to operate on + * @load: pfarg_load as passed by user + * @task: thread to attach to, NULL for system-wide + */ +int __pfm_load_context(struct pfm_context *ctx, struct pfarg_load *load, + struct task_struct *task) +{ + if (ctx->flags.system) + return pfm_load_ctx_sys(ctx, load->load_set, load->load_pid); + return pfm_load_ctx_thread(ctx, load->load_set, task); +} + +/** + * pfm_update_ovfl_pmds - account for pending ovfls on PMDs + * @ctx: context to operate on + * + * This function is always called after pfm_stop has been issued + */ +static void pfm_update_ovfl_pmds(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + u64 *cnt_pmds; + u64 ovfl_mask; + u16 num_ovfls, i, first; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + first = ctx->regs.first_intr_pmd; + cnt_pmds = ctx->regs.cnt_pmds; + + /* + * look for pending interrupts and adjust PMD values accordingly + */ + list_for_each_entry(set, &ctx->set_list, list) { + + if (!set->npend_ovfls) + continue; + + num_ovfls = set->npend_ovfls; + PFM_DBG("set%u nintrs=%u", set->id, num_ovfls); + + for (i = first; num_ovfls; i++) { + if (test_bit(i, cast_ulp(set->povfl_pmds))) { + /* only correct value for counters */ + if (test_bit(i, cast_ulp(cnt_pmds))) + set->pmds[i].value += 1 + ovfl_mask; + num_ovfls--; + } + PFM_DBG("pmd%u set=%u val=0x%llx", + i, + set->id, + (unsigned long long)set->pmds[i].value); + } + /* + * we need to clear to prevent a pfm_getinfo_evtsets() from + * returning stale data even after the context is unloaded + */ + set->npend_ovfls = 0; + bitmap_zero(cast_ulp(set->povfl_pmds), ctx->regs.max_intr_pmd); + } +} + + +/** + * __pfm_unload_context - detach context from CPU or thread + * @ctx: context to 
operate on + * @release_info: pointer to return info (see below) + * + * The function must be called with the context locked and interrupts disabled. + * + * release_info value upon return: + * - bit 0: when set, must free context + * - bit 1: when set, must cancel hrtimer + */ +int __pfm_unload_context(struct pfm_context *ctx, int *release_info) +{ + struct task_struct *task; + int ret; + + PFM_DBG("ctx_state=%d task [%d]", + ctx->state, + ctx->task ? ctx->task->pid : -1); + + *release_info = 0; + + /* + * unload only when necessary + */ + if (ctx->state == PFM_CTX_UNLOADED) + return 0; + + task = ctx->task; + + /* + * stop monitoring + */ + ret = __pfm_stop(ctx, release_info); + if (ret) + return ret; + + ctx->state = PFM_CTX_UNLOADED; + ctx->flags.can_restart = 0; + + /* + * save active set + * UP: + * if not current task and due to lazy, state may + * still be live + * for system-wide, guaranteed to run on correct CPU + */ + if (__get_cpu_var(pmu_ctx) == ctx) { + /* + * pending overflows have been saved by pfm_stop() + */ + pfm_save_pmds(ctx, ctx->active_set); + pfm_set_pmu_owner(NULL, NULL); + PFM_DBG("released ownership"); + } + + /* + * account for pending overflows + */ + pfm_update_ovfl_pmds(ctx); + + /* + * arch-specific unload operations + */ + pfm_arch_unload_context(ctx); + + /* + * per-thread: disconnect from monitored task + */ + if (task) { + task->pfm_context = NULL; + ctx->task = NULL; + clear_tsk_thread_flag(task, TIF_PERFMON_CTXSW); + clear_tsk_thread_flag(task, TIF_PERFMON_WORK); + pfm_arch_disarm_handle_work(task); + } + /* + * session can be freed, must have interrupts enabled + * thus we release in the caller. Bit 0 signals to the + * caller that the session can be released. + */ + *release_info |= 0x1; + + return 0; +} + +/** + * __pfm_exit_thread - detach and free context on thread exit + */ +void __pfm_exit_thread(void) +{ + struct pfm_context *ctx; + unsigned long flags; + int free_ok = 0, release_info = 0; + int ret; + + ctx = current->pfm_context; + + BUG_ON(ctx->flags.system); + + spin_lock_irqsave(&ctx->lock, flags); + + PFM_DBG("state=%d is_self=%d", ctx->state, ctx->flags.is_self); + + /* + * __pfm_unload_context() cannot fail + * in the context states we are interested in + */ + switch (ctx->state) { + case PFM_CTX_LOADED: + case PFM_CTX_MASKED: + __pfm_unload_context(ctx, &release_info); + /* + * end notification only sent for non + * self-monitoring context + */ + if (!ctx->flags.is_self) + pfm_end_notify(ctx); + break; + case PFM_CTX_ZOMBIE: + __pfm_unload_context(ctx, &release_info); + free_ok = 1; + break; + default: + BUG_ON(ctx->state != PFM_CTX_LOADED); + break; + } + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * cancel timer now that context is unlocked + */ + if (release_info & 0x2) { + ret = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer)); + PFM_DBG("timeout cancel=%d", ret); + } + + if (release_info & 0x1) + pfm_session_release(0, 0); + + /* + * All memory free operations (especially for vmalloc'ed memory) + * MUST be done with interrupts ENABLED. + */ + if (free_ok) + pfm_free_context(ctx); +} diff --git a/perfmon/perfmon_ctx.c b/perfmon/perfmon_ctx.c new file mode 100644 index 0000000..afe6078 --- /dev/null +++ b/perfmon/perfmon_ctx.c @@ -0,0 +1,314 @@ +/* + * perfmon_ctx.c: perfmon2 context functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. 
+ * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include "perfmon_priv.h" + +/* + * context memory pool pointer + */ +static struct kmem_cache *pfm_ctx_cachep; + +/** + * pfm_free_context - de-allocate context and associated resources + * @ctx: context to free + */ +void pfm_free_context(struct pfm_context *ctx) +{ + pfm_arch_context_free(ctx); + + pfm_free_sets(ctx); + + pfm_smpl_buf_free(ctx); + + PFM_DBG("free ctx @0x%p", ctx); + kmem_cache_free(pfm_ctx_cachep, ctx); + /* + * decrease refcount on: + * - PMU description table + * - sampling format + */ + pfm_pmu_conf_put(); + pfm_pmu_release(); +} + +/** + * pfm_ctx_flags_sane - check if context flags passed by user are okay + * @ctx_flags: flags passed user on pfm_create_context + * + * return: + * 0 if successful + * <0 and error code otherwise + */ +static inline int pfm_ctx_flags_sane(u32 ctx_flags) +{ + if (ctx_flags & PFM_FL_SYSTEM_WIDE) { + if (ctx_flags & PFM_FL_NOTIFY_BLOCK) { + PFM_DBG("cannot use blocking mode in syswide mode"); + return -EINVAL; + } + } + return 0; +} + +/** + * pfm_ctx_permissions - check authorization to create new context + * @ctx_flags: context flags passed by user + * + * check for permissions to create a context. + * + * A sysadmin may decide to restrict creation of per-thread + * and/or system-wide context to a group of users using the + * group id via /sys/kernel/perfmon/task_group and + * /sys/kernel/perfmon/sys_group. + * + * Once we identify a user level package which can be used + * to grant/revoke Linux capabilites at login via PAM, we will + * be able to use capabilities. We would also need to increase + * the size of cap_t to support more than 32 capabilities (it + * is currently defined as u32 and 32 capabilities are alrady + * defined). 
+ */ +static inline int pfm_ctx_permissions(u32 ctx_flags) +{ + if ((ctx_flags & PFM_FL_SYSTEM_WIDE) + && pfm_controls.sys_group != PFM_GROUP_PERM_ANY + && !in_group_p(pfm_controls.sys_group)) { + PFM_DBG("user group not allowed to create a syswide ctx"); + return -EPERM; + } else if (pfm_controls.task_group != PFM_GROUP_PERM_ANY + && !in_group_p(pfm_controls.task_group)) { + PFM_DBG("user group not allowed to create a task context"); + return -EPERM; + } + return 0; +} + +/** + * __pfm_create_context - allocate and initialize a perfmon context + * @req : pfarg_ctx from user + * @fmt : pointer sampling format, NULL if not used + * @fmt_arg: pointer to argument to sampling format, NULL if not used + * @mode: PFM_NORMAL or PFM_COMPAT(IA-64 v2.0 compatibility) + * @ctx : address of new context upon succesful return, undefined otherwise + * + * function used to allocate a new context. A context is allocated along + * with the default event set. If a sampling format is used, the buffer + * may be allocated and initialized. + * + * The file descriptor identifying the context is allocated and returned + * to caller. + * + * This function operates with no locks and interrupts are enabled. + * return: + * >=0: the file descriptor to identify the context + * <0 : the error code + */ +int __pfm_create_context(struct pfarg_ctx *req, + struct pfm_smpl_fmt *fmt, + void *fmt_arg, + int mode, + struct pfm_context **new_ctx) +{ + struct pfm_context *ctx; + struct file *filp = NULL; + u32 ctx_flags; + int fd = 0, ret; + + ctx_flags = req->ctx_flags; + + /* Increase refcount on PMU description */ + ret = pfm_pmu_conf_get(1); + if (ret < 0) + goto error_conf; + + ret = pfm_ctx_flags_sane(ctx_flags); + if (ret < 0) + goto error_alloc; + + ret = pfm_ctx_permissions(ctx_flags); + if (ret < 0) + goto error_alloc; + + /* + * we can use GFP_KERNEL and potentially sleep because we do + * not hold any lock at this point. + */ + might_sleep(); + ret = -ENOMEM; + ctx = kmem_cache_zalloc(pfm_ctx_cachep, GFP_KERNEL); + if (!ctx) + goto error_alloc; + + PFM_DBG("alloc ctx @0x%p", ctx); + + INIT_LIST_HEAD(&ctx->set_list); + spin_lock_init(&ctx->lock); + init_completion(&ctx->restart_complete); + init_waitqueue_head(&ctx->msgq_wait); + + /* + * context is unloaded + */ + ctx->state = PFM_CTX_UNLOADED; + + /* + * initialization of context's flags + * must be done before pfm_find_set() + */ + ctx->flags.block = (ctx_flags & PFM_FL_NOTIFY_BLOCK) ? 1 : 0; + ctx->flags.system = (ctx_flags & PFM_FL_SYSTEM_WIDE) ? 1: 0; + ctx->flags.no_msg = (ctx_flags & PFM_FL_OVFL_NO_MSG) ? 1: 0; + ctx->flags.ia64_v20_compat = mode == PFM_COMPAT ? 1 : 0; + + ret = pfm_pmu_acquire(ctx); + if (ret) + goto error_file; + /* + * check if PMU is usable + */ + if (!(ctx->regs.num_pmcs && ctx->regs.num_pmcs)) { + PFM_DBG("no usable PMU registers"); + ret = -EBUSY; + goto error_file; + } + + /* + * link to format, must be done first for correct + * error handling in pfm_context_free() + */ + ctx->smpl_fmt = fmt; + + ret = -ENFILE; + fd = pfm_alloc_fd(&filp); + if (fd < 0) + goto error_file; + + /* + * initialize arch-specific section + * must be done before fmt_init() + */ + ret = pfm_arch_context_create(ctx, ctx_flags); + if (ret) + goto error_set; + + ret = -ENOMEM; + + /* + * add initial set + */ + if (pfm_create_initial_set(ctx)) + goto error_set; + + /* + * does the user want to sample? 
+ * must be done after pfm_pmu_acquire() because + * needs ctx->regs + */ + if (fmt) { + ret = pfm_setup_smpl_fmt(ctx, ctx_flags, fmt_arg, filp); + if (ret) + goto error_set; + } + + filp->private_data = ctx; + + ctx->last_act = PFM_INVALID_ACTIVATION; + ctx->last_cpu = -1; + + /* + * initialize notification message queue + */ + ctx->msgq_head = ctx->msgq_tail = 0; + + PFM_DBG("flags=0x%x system=%d notify_block=%d no_msg=%d" + " use_fmt=%d ctx_fd=%d mode=%d", + ctx_flags, + ctx->flags.system, + ctx->flags.block, + ctx->flags.no_msg, + !!fmt, + fd, mode); + + if (new_ctx) + *new_ctx = ctx; + + /* + * we defer the fd_install until we are certain the call succeeded + * to ensure we do not have to undo its effect. Neither put_filp() + * nor put_unused_fd() undoes the effect of fd_install(). + */ + fd_install(fd, filp); + + return fd; + +error_set: + put_filp(filp); + put_unused_fd(fd); +error_file: + /* + * calls the right *_put() functions + * calls pfm_release_pmu() + */ + pfm_free_context(ctx); + return ret; +error_alloc: + pfm_pmu_conf_put(); +error_conf: + pfm_smpl_fmt_put(fmt); + return ret; +} + +/** + * pfm_init_ctx -- initialize context SLAB + * + * called from pfm_init + */ +int __init pfm_init_ctx(void) +{ + pfm_ctx_cachep = kmem_cache_create("pfm_context", + sizeof(struct pfm_context)+PFM_ARCH_CTX_SIZE, + SLAB_HWCACHE_ALIGN, 0, NULL); + if (!pfm_ctx_cachep) { + PFM_ERR("cannot initialize context slab"); + return -ENOMEM; + } + return 0; +} diff --git a/perfmon/perfmon_ctxsw.c b/perfmon/perfmon_ctxsw.c new file mode 100644 index 0000000..9a28d13 --- /dev/null +++ b/perfmon/perfmon_ctxsw.c @@ -0,0 +1,342 @@ +/* + * perfmon_cxtsw.c: perfmon2 context switch code + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include "perfmon_priv.h" + +void pfm_save_pmds(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 val, ovfl_mask; + u64 *used_pmds, *cnt_pmds; + u16 i, num; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + num = set->nused_pmds; + cnt_pmds = ctx->regs.cnt_pmds; + used_pmds = set->used_pmds; + + /* + * save HW PMD, for counters, reconstruct 64-bit value + */ + for (i = 0; num; i++) { + if (test_bit(i, cast_ulp(used_pmds))) { + val = pfm_read_pmd(ctx, i); + if (likely(test_bit(i, cast_ulp(cnt_pmds)))) + val = (set->pmds[i].value & ~ovfl_mask) | + (val & ovfl_mask); + set->pmds[i].value = val; + num--; + } + } + pfm_arch_clear_pmd_ovfl_cond(ctx, set); +} + +/* + * interrupts are disabled (no preemption) + */ +void __pfm_ctxswin_thread(struct task_struct *task, + struct pfm_context *ctx, u64 now) +{ + u64 cur_act; + struct pfm_event_set *set; + int reload_pmcs, reload_pmds; + int mycpu, is_active; + + mycpu = smp_processor_id(); + + cur_act = __get_cpu_var(pmu_activation_number); + /* + * we need to lock context because it could be accessed + * from another CPU. Normally the schedule() functions + * has masked interrupts which should be enough to + * protect against PMU interrupts. + */ + spin_lock(&ctx->lock); + + is_active = pfm_arch_is_active(ctx); + + set = ctx->active_set; + + /* + * in case fo zombie, we do not complete ctswin of the + * PMU, and we force a call to pfm_handle_work() to finish + * cleanup, i.e., free context + smpl_buff. The reason for + * deferring to pfm_handle_work() is that it is not possible + * to vfree() with interrupts disabled. + */ + if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) { + pfm_post_work(task, ctx, PFM_WORK_ZOMBIE); + goto done; + } + + /* + * if we were the last user of the PMU on that CPU, + * then nothing to do except restore psr + */ + if (ctx->last_cpu == mycpu && ctx->last_act == cur_act) { + /* + * check for forced reload conditions + */ + reload_pmcs = set->priv_flags & PFM_SETFL_PRIV_MOD_PMCS; + reload_pmds = set->priv_flags & PFM_SETFL_PRIV_MOD_PMDS; + } else { +#ifndef CONFIG_SMP + pfm_check_save_prev_ctx(); +#endif + reload_pmcs = 1; + reload_pmds = 1; + } + /* consumed */ + set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + + if (reload_pmds) + pfm_arch_restore_pmds(ctx, set); + + /* + * need to check if had in-flight interrupt in + * pfm_ctxswout_thread(). If at least one bit set, then we must replay + * the interrupt to avoid losing some important performance data. + * + * npend_ovfls is cleared in interrupt handler + */ + if (set->npend_ovfls) { + pfm_arch_resend_irq(ctx); + pfm_stats_inc(ovfl_intr_replay_count); + } + + if (reload_pmcs) + pfm_arch_restore_pmcs(ctx, set); + + /* + * record current activation for this context + */ + __get_cpu_var(pmu_activation_number)++; + ctx->last_cpu = mycpu; + ctx->last_act = __get_cpu_var(pmu_activation_number); + + /* + * establish new ownership. + */ + pfm_set_pmu_owner(task, ctx); + + pfm_arch_ctxswin_thread(task, ctx); + /* + * set->duration does not count when context in MASKED state. + * set->duration_start is reset in unmask_monitoring() + */ + set->duration_start = now; + + /* + * re-arm switch timeout, if necessary + * Timeout is active only if monitoring is active, + * i.e., LOADED + started + * + * We reload the remainder timeout or the full timeout. 
+ * Remainder is recorded on context switch out or in + * pfm_load_context() + */ + if (ctx->state == PFM_CTX_LOADED + && (set->flags & PFM_SETFL_TIME_SWITCH) && is_active) { + pfm_restart_timer(ctx, set); + /* careful here as pfm_restart_timer may switch sets */ + } +done: + spin_unlock(&ctx->lock); +} + +/* + * interrupts are masked, runqueue lock is held. + * + * In UP. we simply stop monitoring and leave the state + * in place, i.e., lazy save + */ +void __pfm_ctxswout_thread(struct task_struct *task, + struct pfm_context *ctx, u64 now) +{ + struct pfm_event_set *set; + int need_save_pmds, is_active; + + /* + * we need to lock context because it could be accessed + * from another CPU. Normally the schedule() functions + * has masked interrupts which should be enough to + * protect against PMU interrupts. + */ + + spin_lock(&ctx->lock); + + is_active = pfm_arch_is_active(ctx); + set = ctx->active_set; + + /* + * stop monitoring and + * collect pending overflow information + * needed on ctxswin. We cannot afford to lose + * a PMU interrupt. + */ + need_save_pmds = pfm_arch_ctxswout_thread(task, ctx); + + if (ctx->state == PFM_CTX_LOADED) { + /* + * accumulate only when set is actively monitoring, + */ + set->duration += now - set->duration_start; + + /* + * record remaining timeout + * reload in pfm_ctxsw_in() + */ + if (is_active && (set->flags & PFM_SETFL_TIME_SWITCH)) { + struct hrtimer *h = NULL; + h = &__get_cpu_var(pfm_hrtimer); + hrtimer_cancel(h); + set->hrtimer_rem = hrtimer_get_remaining(h); + PFM_DBG_ovfl("hrtimer=%lld", + (long long)set->hrtimer_rem.tv64); + } + } + +#ifdef CONFIG_SMP + /* + * in SMP, release ownership of this PMU. + * PMU interrupts are masked, so nothing + * can happen. + */ + pfm_set_pmu_owner(NULL, NULL); + + /* + * On some architectures, it is necessary to read the + * PMD registers to check for pending overflow in + * pfm_arch_ctxswout_thread(). In that case, saving of + * the PMDs may be done there and not here. 
+ */ + if (need_save_pmds) + pfm_save_pmds(ctx, set); +#endif + spin_unlock(&ctx->lock); +} + +/* + * + */ +static void __pfm_ctxswout_sys(struct task_struct *prev, + struct task_struct *next) +{ + struct pfm_context *ctx; + + ctx = __get_cpu_var(pmu_ctx); + BUG_ON(!ctx); + + /* + * propagate TIF_PERFMON_CTXSW to ensure that: + * - previous task has TIF_PERFMON_CTXSW cleared, in case it is + * scheduled onto another CPU where there is syswide monitoring + * - next task has TIF_PERFMON_CTXSW set to ensure it will come back + * here when context switched out + */ + clear_tsk_thread_flag(prev, TIF_PERFMON_CTXSW); + set_tsk_thread_flag(next, TIF_PERFMON_CTXSW); + + /* + * nothing to do until actually started + * XXX: assumes no mean to start from user level + */ + if (!ctx->flags.started) + return; + + pfm_arch_ctxswout_sys(prev, ctx); +} + +/* + * + */ +static void __pfm_ctxswin_sys(struct task_struct *prev, + struct task_struct *next) +{ + struct pfm_context *ctx; + + ctx = __get_cpu_var(pmu_ctx); + BUG_ON(!ctx); + + /* + * nothing to do until actually started + * XXX: assumes no mean to start from user level + */ + if (!ctx->flags.started) + return; + + pfm_arch_ctxswin_sys(next, ctx); +} + +void pfm_ctxsw_out(struct task_struct *prev, + struct task_struct *next) +{ + struct pfm_context *ctxp; + u64 now; + + now = sched_clock(); + + ctxp = prev->pfm_context; + + if (ctxp) + __pfm_ctxswout_thread(prev, ctxp, now); + else + __pfm_ctxswout_sys(prev, next); + + pfm_stats_inc(ctxswout_count); + pfm_stats_add(ctxswout_ns, sched_clock() - now); +} + +void pfm_ctxsw_in(struct task_struct *prev, + struct task_struct *next) +{ + struct pfm_context *ctxn; + u64 now; + + now = sched_clock(); + + ctxn = next->pfm_context; + + if (ctxn) + __pfm_ctxswin_thread(next, ctxn, now); + else + __pfm_ctxswin_sys(prev, next); + + pfm_stats_inc(ctxswin_count); + pfm_stats_add(ctxswin_ns, sched_clock() - now); +} diff --git a/perfmon/perfmon_debugfs.c b/perfmon/perfmon_debugfs.c new file mode 100644 index 0000000..e4d2fad --- /dev/null +++ b/perfmon/perfmon_debugfs.c @@ -0,0 +1,168 @@ +/* + * perfmon_debugfs.c: perfmon2 statistics interface to debugfs + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 2007 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +/* + * to make the statistics visible to user space: + * $ mount -t debugfs none /mnt + * $ cd /mnt/perfmon + * then choose a CPU subdir + */ +DECLARE_PER_CPU(struct pfm_stats, pfm_stats); + +static struct dentry *pfm_debugfs_dir; + +void pfm_reset_stats(int cpu) +{ + struct pfm_stats *st; + unsigned long flags; + + st = &per_cpu(pfm_stats, cpu); + + local_irq_save(flags); + memset(st->v, 0, sizeof(st->v)); + local_irq_restore(flags); +} + +static const char *pfm_stats_strs[] = { + "ovfl_intr_all_count", + "ovfl_intr_ns", + "ovfl_intr_spurious_count", + "ovfl_intr_replay_count", + "ovfl_intr_regular_count", + "handle_work_count", + "ovfl_notify_count", + "reset_pmds_count", + "pfm_restart_count", + "fmt_handler_calls", + "fmt_handler_ns", + "set_switch_count", + "set_switch_ns", + "set_switch_exp", + "ctxswin_count", + "ctxswin_ns", + "handle_timeout_count", + "ovfl_intr_nmi_count", + "ctxswout_count", + "ctxswout_ns", +}; +#define PFM_NUM_STRS ARRAY_SIZE(pfm_stats_strs) + +void pfm_debugfs_del_cpu(int cpu) +{ + struct pfm_stats *st; + int i; + + st = &per_cpu(pfm_stats, cpu); + + for (i = 0; i < PFM_NUM_STATS; i++) { + if (st->dirs[i]) + debugfs_remove(st->dirs[i]); + st->dirs[i] = NULL; + } + if (st->cpu_dir) + debugfs_remove(st->cpu_dir); + st->cpu_dir = NULL; +} + +int pfm_debugfs_add_cpu(int cpu) +{ + struct pfm_stats *st; + int i; + + /* + * sanity check between stats names and the number + * of entries in the pfm_stats value array. + */ + if (PFM_NUM_STRS != PFM_NUM_STATS) { + PFM_ERR("PFM_NUM_STRS != PFM_NUM_STATS error"); + return -1; + } + + st = &per_cpu(pfm_stats, cpu); + sprintf(st->cpu_name, "cpu%d", cpu); + + st->cpu_dir = debugfs_create_dir(st->cpu_name, pfm_debugfs_dir); + if (!st->cpu_dir) + return -1; + + for (i = 0; i < PFM_NUM_STATS; i++) { + st->dirs[i] = debugfs_create_u64(pfm_stats_strs[i], + S_IRUGO, + st->cpu_dir, + &st->v[i]); + if (!st->dirs[i]) + goto error; + } + pfm_reset_stats(cpu); + return 0; +error: + while (i >= 0) { + debugfs_remove(st->dirs[i]); + i--; + } + debugfs_remove(st->cpu_dir); + return -1; +} + +/* + * called once from pfm_init() + */ +int __init pfm_init_debugfs(void) +{ + int cpu1, cpu2, ret; + + pfm_debugfs_dir = debugfs_create_dir("perfmon", NULL); + if (!pfm_debugfs_dir) + return -1; + + for_each_online_cpu(cpu1) { + ret = pfm_debugfs_add_cpu(cpu1); + if (ret) + goto error; + } + return 0; +error: + for_each_online_cpu(cpu2) { + if (cpu2 == cpu1) + break; + pfm_debugfs_del_cpu(cpu2); + } + return -1; +} diff --git a/perfmon/perfmon_dfl_smpl.c b/perfmon/perfmon_dfl_smpl.c new file mode 100644 index 0000000..8c83489 --- /dev/null +++ b/perfmon/perfmon_dfl_smpl.c @@ -0,0 +1,298 @@ +/* + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This file implements the new default sampling buffer format + * for the perfmon2 subsystem. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+MODULE_AUTHOR("Stephane Eranian ");
+MODULE_DESCRIPTION("new perfmon default sampling format");
+MODULE_LICENSE("GPL");
+
+static int pfm_dfl_fmt_validate(u32 ctx_flags, u16 npmds, void *data)
+{
+	struct pfm_dfl_smpl_arg *arg = data;
+	u64 min_buf_size;
+
+	if (data == NULL) {
+		PFM_DBG("no argument passed");
+		return -EINVAL;
+	}
+
+	/*
+	 * sanity check in case size_t is smaller than u64
+	 */
+#if BITS_PER_LONG == 32
+#define MAX_SIZE_T	(1ULL<<(sizeof(size_t)<<3))
+	if (sizeof(size_t) < sizeof(arg->buf_size)) {
+		if (arg->buf_size >= MAX_SIZE_T)
+			return -E2BIG;
+	}
+#endif
+
+	/*
+	 * compute min buf size. npmds is the maximum number
+	 * of implemented PMD registers.
+	 */
+	min_buf_size = sizeof(struct pfm_dfl_smpl_hdr) +
+		(sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64)));
+
+	PFM_DBG("validate ctx_flags=0x%x flags=0x%x npmds=%u "
+		"min_buf_size=%llu buf_size=%llu\n",
+		ctx_flags,
+		arg->buf_flags,
+		npmds,
+		(unsigned long long)min_buf_size,
+		(unsigned long long)arg->buf_size);
+
+	/*
+	 * must hold at least the buffer header + one minimally sized entry
+	 */
+	if (arg->buf_size < min_buf_size)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int pfm_dfl_fmt_get_size(u32 flags, void *data, size_t *size)
+{
+	struct pfm_dfl_smpl_arg *arg = data;
+
+	/*
+	 * size has been validated in pfm_dfl_fmt_validate()
+	 * we can never lose bits from buf_size.
+	 */
+	*size = (size_t)arg->buf_size;
+
+	return 0;
+}
+
+static int pfm_dfl_fmt_init(struct pfm_context *ctx, void *buf, u32 ctx_flags,
+			    u16 npmds, void *data)
+{
+	struct pfm_dfl_smpl_hdr *hdr;
+	struct pfm_dfl_smpl_arg *arg = data;
+
+	hdr = buf;
+
+	hdr->hdr_version = PFM_DFL_SMPL_VERSION;
+	hdr->hdr_buf_size = arg->buf_size;
+	hdr->hdr_buf_flags = arg->buf_flags;
+	hdr->hdr_cur_offs = sizeof(*hdr);
+	hdr->hdr_overflows = 0;
+	hdr->hdr_count = 0;
+	hdr->hdr_min_buf_space = sizeof(struct pfm_dfl_smpl_entry) + (npmds*sizeof(u64));
+	/*
+	 * due to cache aliasing, it may be necessary to flush the cache
+	 * on certain architectures (e.g., MIPS)
+	 */
+	pfm_cacheflush(hdr, sizeof(*hdr));
+
+	PFM_DBG("buffer=%p buf_size=%llu hdr_size=%zu hdr_version=%u.%u "
+		"min_space=%llu npmds=%u",
+		buf,
+		(unsigned long long)hdr->hdr_buf_size,
+		sizeof(*hdr),
+		PFM_VERSION_MAJOR(hdr->hdr_version),
+		PFM_VERSION_MINOR(hdr->hdr_version),
+		(unsigned long long)hdr->hdr_min_buf_space,
+		npmds);
+
+	return 0;
+}
+
+/*
+ * called from pfm_overflow_handler() to record a new sample
+ *
+ * context is locked, interrupts are disabled (no preemption)
+ */
+static int pfm_dfl_fmt_handler(struct pfm_context *ctx,
+			       unsigned long ip, u64 tstamp, void *data)
+{
+	struct pfm_dfl_smpl_hdr *hdr;
+	struct pfm_dfl_smpl_entry *ent;
+	struct pfm_ovfl_arg *arg;
+	void *cur, *last;
+	u64 *e;
+	size_t entry_size, min_size;
+	u16 npmds, i;
+	u16 ovfl_pmd;
+	void *buf;
+
+	hdr = ctx->smpl_addr;
+	arg = &ctx->ovfl_arg;
+
+	buf = hdr;
+	cur = buf+hdr->hdr_cur_offs;
+	last = buf+hdr->hdr_buf_size;
+	ovfl_pmd = arg->ovfl_pmd;
+	min_size = hdr->hdr_min_buf_space;
+
+	/*
+	 * precheck for sanity
+	 */
+	if ((last - cur) < min_size)
+		goto full;
+
+	npmds = arg->num_smpl_pmds;
+
+	ent = (struct pfm_dfl_smpl_entry *)cur;
+
+	entry_size =
sizeof(*ent) + (npmds << 3); + + /* position for first pmd */ + e = (u64 *)(ent+1); + + hdr->hdr_count++; + + PFM_DBG_ovfl("count=%llu cur=%p last=%p free_bytes=%zu ovfl_pmd=%d " + "npmds=%u", + (unsigned long long)hdr->hdr_count, + cur, last, + (last-cur), + ovfl_pmd, + npmds); + + /* + * current = task running at the time of the overflow. + * + * per-task mode: + * - this is usually the task being monitored. + * Under certain conditions, it might be a different task + * + * system-wide: + * - this is not necessarily the task controlling the session + */ + ent->pid = current->pid; + ent->ovfl_pmd = ovfl_pmd; + ent->last_reset_val = arg->pmd_last_reset; + + /* + * where did the fault happen (includes slot number) + */ + ent->ip = ip; + + ent->tstamp = tstamp; + ent->cpu = smp_processor_id(); + ent->set = arg->active_set; + ent->tgid = current->tgid; + + /* + * selectively store PMDs in increasing index number + */ + if (npmds) { + u64 *val = arg->smpl_pmds_values; + for (i = 0; i < npmds; i++) + *e++ = *val++; + } + + /* + * update position for next entry + */ + hdr->hdr_cur_offs += entry_size; + cur += entry_size; + + pfm_cacheflush(hdr, sizeof(*hdr)); + pfm_cacheflush(ent, entry_size); + + /* + * post check to avoid losing the last sample + */ + if ((last - cur) < min_size) + goto full; + + /* reset before returning from interrupt handler */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +full: + PFM_DBG_ovfl("sampling buffer full free=%zu, count=%llu", + last-cur, + (unsigned long long)hdr->hdr_count); + + /* + * increment number of buffer overflows. + * important to detect duplicate set of samples. + */ + hdr->hdr_overflows++; + + /* + * request notification and masking of monitoring. + * Notification is still subject to the overflowed + * register having the FL_NOTIFY flag set. + */ + arg->ovfl_ctrl = PFM_OVFL_CTRL_NOTIFY | PFM_OVFL_CTRL_MASK; + + return -ENOBUFS; /* we are full, sorry */ +} + +static int pfm_dfl_fmt_restart(int is_active, u32 *ovfl_ctrl, void *buf) +{ + struct pfm_dfl_smpl_hdr *hdr; + + hdr = buf; + + hdr->hdr_count = 0; + hdr->hdr_cur_offs = sizeof(*hdr); + + pfm_cacheflush(hdr, sizeof(*hdr)); + + *ovfl_ctrl = PFM_OVFL_CTRL_RESET; + + return 0; +} + +static int pfm_dfl_fmt_exit(void *buf) +{ + return 0; +} + +static struct pfm_smpl_fmt dfl_fmt = { + .fmt_name = "default", + .fmt_version = 0x10000, + .fmt_arg_size = sizeof(struct pfm_dfl_smpl_arg), + .fmt_validate = pfm_dfl_fmt_validate, + .fmt_getsize = pfm_dfl_fmt_get_size, + .fmt_init = pfm_dfl_fmt_init, + .fmt_handler = pfm_dfl_fmt_handler, + .fmt_restart = pfm_dfl_fmt_restart, + .fmt_exit = pfm_dfl_fmt_exit, + .fmt_flags = PFM_FMT_BUILTIN_FLAG, + .owner = THIS_MODULE +}; + +static int pfm_dfl_fmt_init_module(void) +{ + return pfm_fmt_register(&dfl_fmt); +} + +static void pfm_dfl_fmt_cleanup_module(void) +{ + pfm_fmt_unregister(&dfl_fmt); +} + +module_init(pfm_dfl_fmt_init_module); +module_exit(pfm_dfl_fmt_cleanup_module); diff --git a/perfmon/perfmon_file.c b/perfmon/perfmon_file.c new file mode 100644 index 0000000..1cde81b --- /dev/null +++ b/perfmon/perfmon_file.c @@ -0,0 +1,751 @@ +/* + * perfmon_file.c: perfmon2 file input/output functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. 
+ * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "perfmon_priv.h" + +#define PFMFS_MAGIC 0xa0b4d889 /* perfmon filesystem magic number */ + +struct pfm_controls pfm_controls = { + .sys_group = PFM_GROUP_PERM_ANY, + .task_group = PFM_GROUP_PERM_ANY, + .arg_mem_max = PAGE_SIZE, + .smpl_buffer_mem_max = ~0, +}; +EXPORT_SYMBOL(pfm_controls); + +static int __init enable_debug(char *str) +{ + pfm_controls.debug = 1; + PFM_INFO("debug output enabled\n"); + return 1; +} +__setup("perfmon_debug", enable_debug); + +static int pfmfs_delete_dentry(struct dentry *dentry) +{ + return 1; +} + +static struct dentry_operations pfmfs_dentry_operations = { + .d_delete = pfmfs_delete_dentry, +}; + +int pfm_buf_map_pagefault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + void *kaddr; + unsigned long address; + struct pfm_context *ctx; + size_t size; + + address = (unsigned long)vmf->virtual_address; + + ctx = vma->vm_private_data; + if (ctx == NULL) { + PFM_DBG("no ctx"); + return VM_FAULT_SIGBUS; + } + /* + * size available to user (maybe different from real_smpl_size + */ + size = ctx->smpl_size; + + if ((address < vma->vm_start) || + (address >= (vma->vm_start + size))) + return VM_FAULT_SIGBUS; + + kaddr = ctx->smpl_addr + (address - vma->vm_start); + + vmf->page = vmalloc_to_page(kaddr); + get_page(vmf->page); + + PFM_DBG("[%d] start=%p ref_count=%d", + current->pid, + kaddr, page_count(vmf->page)); + + return 0; +} + +/* + * we need to determine whther or not we are closing the last reference + * to the file and thus are going to end up in pfm_close() which eventually + * calls pfm_release_buf_space(). In that function, we update the accouting + * for locked_vm given that we are actually freeing the sampling buffer. The + * issue is that there are multiple paths leading to pfm_release_buf_space(), + * from exit(), munmap(), close(). The path coming from munmap() is problematic + * becuse do_munmap() grabs mmap_sem in write-mode which is also what + * pfm_release_buf_space does. To avoid deadlock, we need to determine where + * we are calling from and skip the locking. The vm_ops->close() callback + * is invoked for each remove_vma() independently of the number of references + * left on the file descriptor, therefore simple reference counter does not + * work. We need to determine if this is the last call, and then set a flag + * to skip the locking. 
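For reference, the user-space side of this mapping is a plain mmap() of the context file descriptor. A minimal sketch, assuming the caller passes the same buffer size it handed to the sampling format at context creation; the mapping is read-only because pfm_mmap() below rejects VM_WRITE:

    /* illustrative sketch, not part of the patch */
    #include <stddef.h>
    #include <sys/mman.h>

    void *map_smpl_buffer(int ctx_fd, size_t buf_size)
    {
        /* pfm_mmap() refuses writable mappings and sizes larger than the buffer */
        void *buf = mmap(NULL, buf_size, PROT_READ, MAP_PRIVATE, ctx_fd, 0);

        return buf == MAP_FAILED ? NULL : buf;
    }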
+ */ +static void pfm_buf_map_close(struct vm_area_struct *vma) +{ + struct file *file; + struct pfm_context *ctx; + + file = vma->vm_file; + ctx = vma->vm_private_data; + + /* + * if file is going to close, then pfm_close() will + * be called, do not lock in pfm_release_buf + */ + if (atomic_read(&file->f_count) == 1) + ctx->flags.mmap_nlock = 1; +} + +/* + * we do not have a close callback because, the locked + * memory accounting must be done when the actual buffer + * is freed. Munmap does not free the page backing the vma + * because they may still be in use by the PMU interrupt handler. + */ +struct vm_operations_struct pfm_buf_map_vm_ops = { + .fault = pfm_buf_map_pagefault, + .close = pfm_buf_map_close +}; + +static int pfm_mmap_buffer(struct pfm_context *ctx, struct vm_area_struct *vma, + size_t size) +{ + if (ctx->smpl_addr == NULL) { + PFM_DBG("no sampling buffer to map"); + return -EINVAL; + } + + if (size > ctx->smpl_size) { + PFM_DBG("mmap size=%zu >= actual buf size=%zu", + size, + ctx->smpl_size); + return -EINVAL; + } + + vma->vm_ops = &pfm_buf_map_vm_ops; + vma->vm_private_data = ctx; + + return 0; +} + +static int pfm_mmap(struct file *file, struct vm_area_struct *vma) +{ + size_t size; + struct pfm_context *ctx; + unsigned long flags; + int ret; + + PFM_DBG("pfm_file_ops"); + + ctx = file->private_data; + size = (vma->vm_end - vma->vm_start); + + if (ctx == NULL) + return -EINVAL; + + ret = -EINVAL; + + spin_lock_irqsave(&ctx->lock, flags); + + if (vma->vm_flags & VM_WRITE) { + PFM_DBG("cannot map buffer for writing"); + goto done; + } + + PFM_DBG("vm_pgoff=%lu size=%zu vm_start=0x%lx", + vma->vm_pgoff, + size, + vma->vm_start); + + ret = pfm_mmap_buffer(ctx, vma, size); + if (ret == 0) + vma->vm_flags |= VM_RESERVED; + + PFM_DBG("ret=%d vma_flags=0x%lx vma_start=0x%lx vma_size=%lu", + ret, + vma->vm_flags, + vma->vm_start, + vma->vm_end-vma->vm_start); +done: + spin_unlock_irqrestore(&ctx->lock, flags); + + return ret; +} + +/* + * Extract one message from queue. + * + * return: + * -EAGAIN: when non-blocking and nothing is* in the queue. + * -ERESTARTSYS: when blocking and signal is pending + * Otherwise returns size of message (sizeof(pfarg_msg)) + */ +ssize_t __pfm_read(struct pfm_context *ctx, union pfarg_msg *msg_buf, int non_block) +{ + ssize_t ret = 0; + unsigned long flags; + DECLARE_WAITQUEUE(wait, current); + + /* + * we must masks interrupts to avoid a race condition + * with the PMU interrupt handler. + */ + spin_lock_irqsave(&ctx->lock, flags); + + while (pfm_msgq_is_empty(ctx)) { + + /* + * handle non-blocking reads + * return -EAGAIN + */ + ret = -EAGAIN; + if (non_block) + break; + + add_wait_queue(&ctx->msgq_wait, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + spin_unlock_irqrestore(&ctx->lock, flags); + + schedule(); + + /* + * during this window, another thread may call + * pfm_read() and steal our message + */ + + spin_lock_irqsave(&ctx->lock, flags); + + remove_wait_queue(&ctx->msgq_wait, &wait); + set_current_state(TASK_RUNNING); + + /* + * check for pending signals + * return -ERESTARTSYS + */ + ret = -ERESTARTSYS; + if (signal_pending(current)) + break; + + /* + * we may have a message + */ + ret = 0; + } + + /* + * extract message + */ + if (ret == 0) { + /* + * copy the oldest message into msg_buf. + * We cannot directly call copy_to_user() + * because interrupts masked. 
This is done + * in the caller + */ + pfm_get_next_msg(ctx, msg_buf); + + ret = sizeof(*msg_buf); + + PFM_DBG("extracted type=%d", msg_buf->type); + } + + spin_unlock_irqrestore(&ctx->lock, flags); + + PFM_DBG("blocking=%d ret=%zd", non_block, ret); + + return ret; +} + +static ssize_t pfm_read(struct file *filp, char __user *buf, size_t size, + loff_t *ppos) +{ + struct pfm_context *ctx; + union pfarg_msg msg_buf; + int non_block, ret; + + PFM_DBG_ovfl("buf=%p size=%zu", buf, size); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("no ctx for pfm_read"); + return -EINVAL; + } + + non_block = filp->f_flags & O_NONBLOCK; + +#ifdef CONFIG_IA64_PERFMON_COMPAT + /* + * detect IA-64 v2.0 context read (message size is different) + * nops on all other architectures + */ + if (unlikely(ctx->flags.ia64_v20_compat)) + return pfm_arch_compat_read(ctx, buf, non_block, size); +#endif + /* + * cannot extract partial messages. + * check even when there is no message + * + * cannot extract more than one message per call. Bytes + * above sizeof(msg) are ignored. + */ + if (size < sizeof(msg_buf)) { + PFM_DBG("message is too small size=%zu must be >=%zu)", + size, + sizeof(msg_buf)); + return -EINVAL; + } + + ret = __pfm_read(ctx, &msg_buf, non_block); + if (ret > 0) { + if (copy_to_user(buf, &msg_buf, sizeof(msg_buf))) + ret = -EFAULT; + } + PFM_DBG_ovfl("ret=%d", ret); + return ret; +} + +static ssize_t pfm_write(struct file *file, const char __user *ubuf, + size_t size, loff_t *ppos) +{ + PFM_DBG("pfm_write called"); + return -EINVAL; +} + +static unsigned int pfm_poll(struct file *filp, poll_table *wait) +{ + struct pfm_context *ctx; + unsigned long flags; + unsigned int mask = 0; + + PFM_DBG("pfm_file_ops"); + + if (filp->f_op != &pfm_file_ops) { + PFM_ERR("pfm_poll bad magic"); + return 0; + } + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("pfm_poll no ctx"); + return 0; + } + + PFM_DBG("before poll_wait"); + + poll_wait(filp, &ctx->msgq_wait, wait); + + /* + * pfm_msgq_is_empty() is non-atomic + * + * filp is protected by fget() at upper level + * context cannot be closed by another thread. + * + * There may be a race with a PMU interrupt adding + * messages to the queue. But we are interested in + * queue not empty, so adding more messages should + * not really be a problem. + * + * There may be a race with another thread issuing + * a read() and stealing messages from the queue thus + * may return the wrong answer. 
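Seen from user space, the poll()/read() semantics implemented by pfm_poll() and pfm_read() amount to the loop sketched below. The sketch assumes a user-level header that provides union pfarg_msg; everything else is ordinary poll()/read() usage:

    /* illustrative sketch, not part of the patch */
    #include <poll.h>
    #include <unistd.h>
    #include <perfmon/perfmon.h>    /* assumed, provides union pfarg_msg */

    int wait_for_msg(int ctx_fd, union pfarg_msg *msg)
    {
        struct pollfd pfd = { .fd = ctx_fd, .events = POLLIN };
        ssize_t n;

        /* pfm_poll() reports POLLIN|POLLRDNORM when the queue is not empty */
        if (poll(&pfd, 1, -1) < 0)
            return -1;

        /* pfm_read() delivers exactly one message and rejects shorter buffers */
        n = read(ctx_fd, msg, sizeof(*msg));
        return n == (ssize_t)sizeof(*msg) ? 0 : -1;
    }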
This could potentially + * lead to a blocking read, because nothing is + * available in the queue + */ + spin_lock_irqsave(&ctx->lock, flags); + + if (!pfm_msgq_is_empty(ctx)) + mask = POLLIN | POLLRDNORM; + + spin_unlock_irqrestore(&ctx->lock, flags); + + PFM_DBG("after poll_wait mask=0x%x", mask); + + return mask; +} + +static int pfm_ioctl(struct inode *inode, struct file *file, unsigned int cmd, + unsigned long arg) +{ + PFM_DBG("pfm_ioctl called"); + return -EINVAL; +} + +/* + * interrupt cannot be masked when entering this function + */ +static inline int __pfm_fasync(int fd, struct file *filp, + struct pfm_context *ctx, int on) +{ + int ret; + + PFM_DBG("in fd=%d on=%d async_q=%p", + fd, + on, + ctx->async_queue); + + ret = fasync_helper(fd, filp, on, &ctx->async_queue); + + PFM_DBG("out fd=%d on=%d async_q=%p ret=%d", + fd, + on, + ctx->async_queue, ret); + + return ret; +} + +static int pfm_fasync(int fd, struct file *filp, int on) +{ + struct pfm_context *ctx; + int ret; + + PFM_DBG("pfm_file_ops"); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("pfm_fasync no ctx"); + return -EBADF; + } + + /* + * we cannot mask interrupts during this call because this may + * may go to sleep if memory is not readily avalaible. + * + * We are protected from the context disappearing by the + * get_fd()/put_fd() done in caller. Serialization of this function + * is ensured by caller. + */ + ret = __pfm_fasync(fd, filp, ctx, on); + + PFM_DBG("pfm_fasync called on fd=%d on=%d async_queue=%p ret=%d", + fd, + on, + ctx->async_queue, ret); + + return ret; +} + +#ifdef CONFIG_SMP +static void __pfm_close_remote_cpu(void *info) +{ + struct pfm_context *ctx = info; + int can_release; + + BUG_ON(ctx != __get_cpu_var(pmu_ctx)); + + /* + * we are in IPI interrupt handler which has always higher + * priority than PMU interrupt, therefore we do not need to + * mask interrupts. context locking is not needed because we + * are in close(), no more user references. + * + * can_release is ignored, release done on calling CPU + */ + __pfm_unload_context(ctx, &can_release); + + /* + * we cannot free context here because we are in_interrupt(). + * we free on the calling CPU + */ +} + +static int pfm_close_remote_cpu(u32 cpu, struct pfm_context *ctx) +{ + BUG_ON(irqs_disabled()); + return smp_call_function_single(cpu, __pfm_close_remote_cpu, ctx, 1); +} +#endif /* CONFIG_SMP */ + +/* + * called either on explicit close() or from exit_files(). + * Only the LAST user of the file gets to this point, i.e., it is + * called only ONCE. + * + * IMPORTANT: we get called ONLY when the refcnt on the file gets to zero + * (fput()),i.e, last task to access the file. Nobody else can access the + * file at this point. + * + * When called from exit_files(), the VMA has been freed because exit_mm() + * is executed before exit_files(). + * + * When called from exit_files(), the current task is not yet ZOMBIE but we + * flush the PMU state to the context. + */ +int __pfm_close(struct pfm_context *ctx, struct file *filp) +{ + unsigned long flags; + int state; + int can_free = 1, can_unload = 1; + int is_system, can_release = 0; + u32 cpu; + + /* + * no risk of ctx of filp disappearing so we can operate outside + * of spin_lock(). 
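The fasync support above also lets a monitoring tool receive SIGIO when a new message is queued; enabling it is ordinary fcntl() usage, sketched here for completeness:

    /* illustrative sketch, not part of the patch */
    #include <fcntl.h>
    #include <unistd.h>

    int enable_sigio(int ctx_fd)
    {
        int flags;

        /* direct SIGIO to this process, then switch the fd to async mode */
        if (fcntl(ctx_fd, F_SETOWN, getpid()) < 0)
            return -1;
        flags = fcntl(ctx_fd, F_GETFL);
        if (flags < 0)
            return -1;
        return fcntl(ctx_fd, F_SETFL, flags | O_ASYNC);
    }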
fasync_helper() runs with interrupts masked, + * thus there is no risk with the PMU interrupt handler + * + * In case of zombie, we will not have the async struct anymore + * thus kill_fasync() will not do anything + * + * fd is not used when removing the entry so we pass -1 + */ + if (filp->f_flags & FASYNC) + __pfm_fasync (-1, filp, ctx, 0); + + spin_lock_irqsave(&ctx->lock, flags); + + state = ctx->state; + is_system = ctx->flags.system; + cpu = ctx->cpu; + + PFM_DBG("state=%d", state); + + /* + * check if unload is needed + */ + if (state == PFM_CTX_UNLOADED) + goto doit; + +#ifdef CONFIG_SMP + /* + * we need to release the resource on the ORIGINAL cpu. + * we need to release the context lock to avoid deadlocks + * on the original CPU, especially in the context switch + * routines. It is safe to unlock because we are in close(), + * in other words, there is no more access from user level. + * we can also unmask interrupts on this CPU because the + * context is running on the original CPU. Context will be + * unloaded and the session will be released on the original + * CPU. Upon return, the caller is guaranteed that the context + * is gone from original CPU. + */ + if (is_system && cpu != smp_processor_id()) { + spin_unlock_irqrestore(&ctx->lock, flags); + pfm_close_remote_cpu(cpu, ctx); + can_release = 1; + goto free_it; + } + + if (!is_system && ctx->task != current) { + /* + * switch context to zombie state + */ + ctx->state = PFM_CTX_ZOMBIE; + + PFM_DBG("zombie ctx for [%d]", ctx->task->pid); + /* + * must check if other thread is using block overflow + * notification mode. If so make sure it will not block + * because there will not be any pfm_restart() issued. + * When the thread notices the ZOMBIE state, it will clean + * up what is left of the context + */ + if (state == PFM_CTX_MASKED && ctx->flags.block) { + /* + * force task to wake up from MASKED state + */ + PFM_DBG("waking up [%d]", ctx->task->pid); + + complete(&ctx->restart_complete); + } + /* + * PMU session will be release by monitored task when it notices + * ZOMBIE state as part of pfm_unload_context() + */ + can_unload = can_free = 0; + } +#endif + if (can_unload) + __pfm_unload_context(ctx, &can_release); +doit: + spin_unlock_irqrestore(&ctx->lock, flags); + +#ifdef CONFIG_SMP +free_it: +#endif + if (can_release) + pfm_session_release(is_system, cpu); + + if (can_free) + pfm_free_context(ctx); + + return 0; +} + +static int pfm_close(struct inode *inode, struct file *filp) +{ + struct pfm_context *ctx; + + PFM_DBG("called filp=%p", filp); + + ctx = filp->private_data; + if (ctx == NULL) { + PFM_ERR("no ctx"); + return -EBADF; + } + return __pfm_close(ctx, filp); +} + +static int pfm_no_open(struct inode *irrelevant, struct file *dontcare) +{ + PFM_DBG("pfm_file_ops"); + + return -ENXIO; +} + + +const struct file_operations pfm_file_ops = { + .llseek = no_llseek, + .read = pfm_read, + .write = pfm_write, + .poll = pfm_poll, + .ioctl = pfm_ioctl, + .open = pfm_no_open, /* special open to disallow open via /proc */ + .fasync = pfm_fasync, + .release = pfm_close, + .mmap = pfm_mmap +}; + +static int pfmfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data, struct vfsmount *mnt) +{ + return get_sb_pseudo(fs_type, "pfm:", NULL, PFMFS_MAGIC, mnt); +} + +static struct file_system_type pfm_fs_type = { + .name = "pfmfs", + .get_sb = pfmfs_get_sb, + .kill_sb = kill_anon_super, +}; + +/* + * pfmfs should _never_ be mounted by userland - too much of security hassle, + * no real gain from 
having the whole whorehouse mounted. So we don't need + * any operations on the root directory. However, we need a non-trivial + * d_name - pfm: will go nicely and kill the special-casing in procfs. + */ +static struct vfsmount *pfmfs_mnt; + +int __init pfm_init_fs(void) +{ + int err = register_filesystem(&pfm_fs_type); + if (!err) { + pfmfs_mnt = kern_mount(&pfm_fs_type); + err = PTR_ERR(pfmfs_mnt); + if (IS_ERR(pfmfs_mnt)) + unregister_filesystem(&pfm_fs_type); + else + err = 0; + } + return err; +} + +int pfm_alloc_fd(struct file **cfile) +{ + int fd, ret = 0; + struct file *file = NULL; + struct inode * inode; + char name[32]; + struct qstr this; + + fd = get_unused_fd(); + if (fd < 0) + return -ENFILE; + + ret = -ENFILE; + + file = get_empty_filp(); + if (!file) + goto out; + + /* + * allocate a new inode + */ + inode = new_inode(pfmfs_mnt->mnt_sb); + if (!inode) + goto out; + + PFM_DBG("new inode ino=%ld @%p", inode->i_ino, inode); + + inode->i_sb = pfmfs_mnt->mnt_sb; + inode->i_mode = S_IFCHR|S_IRUGO; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + + sprintf(name, "[%lu]", inode->i_ino); + this.name = name; + this.hash = inode->i_ino; + this.len = strlen(name); + + ret = -ENOMEM; + + /* + * allocate a new dcache entry + */ + file->f_dentry = d_alloc(pfmfs_mnt->mnt_sb->s_root, &this); + if (!file->f_dentry) + goto out; + + file->f_dentry->d_op = &pfmfs_dentry_operations; + + d_add(file->f_dentry, inode); + file->f_vfsmnt = mntget(pfmfs_mnt); + file->f_mapping = inode->i_mapping; + + file->f_op = &pfm_file_ops; + file->f_mode = FMODE_READ; + file->f_flags = O_RDONLY; + file->f_pos = 0; + + *cfile = file; + + return fd; +out: + if (file) + put_filp(file); + put_unused_fd(fd); + return ret; +} diff --git a/perfmon/perfmon_fmt.c b/perfmon/perfmon_fmt.c new file mode 100644 index 0000000..27c4340 --- /dev/null +++ b/perfmon/perfmon_fmt.c @@ -0,0 +1,219 @@ +/* + * perfmon_fmt.c: perfmon2 sampling buffer format management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ * 02111-1307 USA
+ */
+#include
+#include
+#include "perfmon_priv.h"
+
+static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_smpl_fmt_lock);
+static LIST_HEAD(pfm_smpl_fmt_list);
+
+static inline int fmt_is_mod(struct pfm_smpl_fmt *f)
+{
+	return !(f->fmt_flags & PFM_FMTFL_IS_BUILTIN);
+}
+
+static struct pfm_smpl_fmt *pfm_find_fmt(char *name)
+{
+	struct pfm_smpl_fmt *entry;
+
+	list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) {
+		if (!strcmp(entry->fmt_name, name))
+			return entry;
+	}
+	return NULL;
+}
+/*
+ * find a buffer format based on its name
+ */
+struct pfm_smpl_fmt *pfm_smpl_fmt_get(char *name)
+{
+	struct pfm_smpl_fmt *fmt;
+
+	spin_lock(&pfm_smpl_fmt_lock);
+
+	fmt = pfm_find_fmt(name);
+
+	/*
+	 * increase module refcount
+	 */
+	if (fmt && fmt_is_mod(fmt) && !try_module_get(fmt->owner))
+		fmt = NULL;
+
+	spin_unlock(&pfm_smpl_fmt_lock);
+
+	return fmt;
+}
+
+void pfm_smpl_fmt_put(struct pfm_smpl_fmt *fmt)
+{
+	if (fmt == NULL || !fmt_is_mod(fmt))
+		return;
+	BUG_ON(fmt->owner == NULL);
+
+	spin_lock(&pfm_smpl_fmt_lock);
+	module_put(fmt->owner);
+	spin_unlock(&pfm_smpl_fmt_lock);
+}
+
+int pfm_fmt_register(struct pfm_smpl_fmt *fmt)
+{
+	int ret = 0;
+
+	if (perfmon_disabled) {
+		PFM_INFO("perfmon disabled, cannot add sampling format");
+		return -ENOSYS;
+	}
+
+	/* some sanity checks */
+	if (fmt == NULL) {
+		PFM_INFO("perfmon: NULL format for register");
+		return -EINVAL;
+	}
+
+	if (fmt->fmt_name == NULL) {
+		PFM_INFO("perfmon: format has no name");
+		return -EINVAL;
+	}
+
+	if (fmt->fmt_qdepth > PFM_MSGS_COUNT) {
+		PFM_INFO("perfmon: format %s requires %u msg queue depth (max %d)",
+			 fmt->fmt_name,
+			 fmt->fmt_qdepth,
+			 PFM_MSGS_COUNT);
+		return -EINVAL;
+	}
+
+	/*
+	 * fmt is missing the initialization of .owner = THIS_MODULE
+	 * this is only valid when format is compiled as a module
+	 */
+	if (fmt->owner == NULL && fmt_is_mod(fmt)) {
+		PFM_INFO("format %s has no module owner", fmt->fmt_name);
+		return -EINVAL;
+	}
+	/*
+	 * we need at least a handler
+	 */
+	if (fmt->fmt_handler == NULL) {
+		PFM_INFO("format %s has no handler", fmt->fmt_name);
+		return -EINVAL;
+	}
+
+	/*
+	 * format argument size cannot be bigger than PAGE_SIZE
+	 */
+	if (fmt->fmt_arg_size > PAGE_SIZE) {
+		PFM_INFO("format %s arguments too big", fmt->fmt_name);
+		return -EINVAL;
+	}
+
+	spin_lock(&pfm_smpl_fmt_lock);
+
+	/*
+	 * because of sysfs, we cannot have two formats with the same name
+	 */
+	if (pfm_find_fmt(fmt->fmt_name)) {
+		PFM_INFO("format %s already registered", fmt->fmt_name);
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = pfm_sysfs_add_fmt(fmt);
+	if (ret) {
+		PFM_INFO("sysfs cannot add format entry for %s", fmt->fmt_name);
+		goto out;
+	}
+
+	list_add(&fmt->fmt_list, &pfm_smpl_fmt_list);
+
+	PFM_INFO("added sampling format %s", fmt->fmt_name);
+out:
+	spin_unlock(&pfm_smpl_fmt_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(pfm_fmt_register);
+
+int pfm_fmt_unregister(struct pfm_smpl_fmt *fmt)
+{
+	struct pfm_smpl_fmt *fmt2;
+	int ret = 0;
+
+	if (!fmt || !fmt->fmt_name) {
+		PFM_DBG("invalid fmt");
+		return -EINVAL;
+	}
+
+	spin_lock(&pfm_smpl_fmt_lock);
+
+	fmt2 = pfm_find_fmt(fmt->fmt_name);
+	if (!fmt2) {
+		PFM_INFO("unregister failed, format not registered");
+		ret = -EINVAL;
+		goto out;
+	}
+	list_del_init(&fmt->fmt_list);
+
+	pfm_sysfs_remove_fmt(fmt);
+
+	PFM_INFO("removed sampling format: %s",
fmt->fmt_name); + +out: + spin_unlock(&pfm_smpl_fmt_lock); + return ret; + +} +EXPORT_SYMBOL(pfm_fmt_unregister); + +/* + * we defer adding the builtin formats to /sys/kernel/perfmon/formats + * until after the pfm sysfs subsystem is initialized. This function + * is called from pfm_init_sysfs() + */ +void __init pfm_sysfs_builtin_fmt_add(void) +{ + struct pfm_smpl_fmt *entry; + + /* + * locking not needed, kernel not fully booted + * when called + */ + list_for_each_entry(entry, &pfm_smpl_fmt_list, fmt_list) { + pfm_sysfs_add_fmt(entry); + } +} diff --git a/perfmon/perfmon_hotplug.c b/perfmon/perfmon_hotplug.c new file mode 100644 index 0000000..eaaba81 --- /dev/null +++ b/perfmon/perfmon_hotplug.c @@ -0,0 +1,151 @@ +/* + * perfmon_hotplug.c: handling of CPU hotplug + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include "perfmon_priv.h" + +#ifndef CONFIG_HOTPLUG_CPU +void pfm_cpu_disable(void) +{} + +int __init pfm_init_hotplug(void) +{ + return 0; +} +#else /* CONFIG_HOTPLUG_CPU */ +/* + * CPU hotplug event nofication callback + * + * We use the callback to do manage the sysfs interface. + * Note that the actual shutdown of monitoring on the CPU + * is done in pfm_cpu_disable(), see comments there for more + * information. + */ +static int pfm_cpu_notify(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + int ret = NOTIFY_OK; + + pfm_pmu_conf_get(0); + + switch (action) { + case CPU_ONLINE: + pfm_debugfs_add_cpu(cpu); + PFM_INFO("CPU%d is online", cpu); + break; + case CPU_UP_PREPARE: + PFM_INFO("CPU%d prepare online", cpu); + break; + case CPU_UP_CANCELED: + pfm_debugfs_del_cpu(cpu); + PFM_INFO("CPU%d is up canceled", cpu); + break; + case CPU_DOWN_PREPARE: + PFM_INFO("CPU%d prepare offline", cpu); + break; + case CPU_DOWN_FAILED: + PFM_INFO("CPU%d is down failed", cpu); + break; + case CPU_DEAD: + pfm_debugfs_del_cpu(cpu); + PFM_INFO("CPU%d is offline", cpu); + break; + } + pfm_pmu_conf_put(); + return ret; +} + +/* + * called from cpu_disable() to detach the perfmon context + * from the CPU going down. 
+ * + * We cannot use the cpu hotplug notifier because we MUST run + * on the CPU that is going down to save the PMU state + */ +void pfm_cpu_disable(void) +{ + struct pfm_context *ctx; + unsigned long flags; + int is_system, release_info = 0; + u32 cpu; + int r; + + ctx = __get_cpu_var(pmu_ctx); + if (ctx == NULL) + return; + + is_system = ctx->flags.system; + cpu = ctx->cpu; + + /* + * context is LOADED or MASKED + * + * we unload from CPU. That stops monitoring and does + * all the bookeeping of saving values and updating duration + */ + spin_lock_irqsave(&ctx->lock, flags); + if (is_system) + __pfm_unload_context(ctx, &release_info); + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * cancel timer + */ + if (release_info & 0x2) { + r = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer)); + PFM_DBG("timeout cancel=%d", r); + } + + if (release_info & 0x1) + pfm_session_release(is_system, cpu); +} + +static struct notifier_block pfm_cpu_notifier = { + .notifier_call = pfm_cpu_notify +}; + +int __init pfm_init_hotplug(void) +{ + int ret = 0; + /* + * register CPU hotplug event notifier + */ + ret = register_cpu_notifier(&pfm_cpu_notifier); + if (!ret) + PFM_LOG("CPU hotplug support enabled"); + return ret; +} +#endif /* CONFIG_HOTPLUG_CPU */ diff --git a/perfmon/perfmon_init.c b/perfmon/perfmon_init.c new file mode 100644 index 0000000..bbb6e4d --- /dev/null +++ b/perfmon/perfmon_init.c @@ -0,0 +1,131 @@ +/* + * perfmon.c: perfmon2 global initialization functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include "perfmon_priv.h" + +/* + * external variables + */ +DEFINE_PER_CPU(struct task_struct *, pmu_owner); +DEFINE_PER_CPU(struct pfm_context *, pmu_ctx); +DEFINE_PER_CPU(u64, pmu_activation_number); +DEFINE_PER_CPU(struct pfm_stats, pfm_stats); +DEFINE_PER_CPU(struct hrtimer, pfm_hrtimer); + + +int perfmon_disabled; /* >0 if perfmon is disabled */ + +/* + * called from cpu_init() and pfm_pmu_register() + */ +void __pfm_init_percpu(void *dummy) +{ + struct hrtimer *h; + + h = &__get_cpu_var(pfm_hrtimer); + + pfm_arch_init_percpu(); + + /* + * initialize per-cpu high res timer + */ + hrtimer_init(h, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +#ifdef CONFIG_HIGH_RES_TIMERS + /* + * avoid potential deadlock on the runqueue lock + * during context switch when multiplexing. Situation + * arises on architectures which run switch_to() with + * the runqueue lock held, e.g., x86. On others, e.g., + * IA-64, the problem does not exist. + * Setting the callback mode to HRTIMER_CB_IRQSAFE_UNOCKED + * such that the callback routine is only called on hardirq + * context not on softirq, thus the context switch will not + * end up trying to wakeup the softirqd + */ + h->cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED; +#endif + h->function = pfm_handle_switch_timeout; +} + +/* + * global initialization routine, executed only once + */ +int __init pfm_init(void) +{ + PFM_LOG("version %u.%u", PFM_VERSION_MAJ, PFM_VERSION_MIN); + + if (pfm_init_ctx()) + goto error_disable; + + + if (pfm_init_sets()) + goto error_disable; + + if (pfm_init_fs()) + goto error_disable; + + if (pfm_init_sysfs()) + goto error_disable; + + /* not critical, so no error checking */ + pfm_init_debugfs(); + + /* + * one time, arch-specific global initialization + */ + if (pfm_arch_init()) + goto error_disable; + + if (pfm_init_hotplug()) + goto error_disable; + return 0; + +error_disable: + PFM_ERR("perfmon is disabled due to initialization error"); + perfmon_disabled = 1; + return -1; +} + +/* + * must use subsys_initcall() to ensure that the perfmon2 core + * is initialized before any PMU description module when they are + * compiled in. + */ +subsys_initcall(pfm_init); diff --git a/perfmon/perfmon_intr.c b/perfmon/perfmon_intr.c new file mode 100644 index 0000000..c5e3cda --- /dev/null +++ b/perfmon/perfmon_intr.c @@ -0,0 +1,648 @@ +/* + * perfmon_intr.c: perfmon2 interrupt handling + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include "perfmon_priv.h" + +/** + * pfm_intr_process_64bit_ovfls - handle 64-bit counter emulation + * @ctx: context to operate on + * @set: set to operate on + * + * The function returns the number of 64-bit overflows detected. + * + * 64-bit software pmds are updated for overflowed pmd registers + * the set->reset_pmds is updated to the list of pmds to reset + * + * In any case, set->npend_ovfls is cleared + */ +static u16 pfm_intr_process_64bit_ovfls(struct pfm_context *ctx, + struct pfm_event_set *set, + u32 *ovfl_ctrl) +{ + u16 i, num_ovfls, max_pmd, max_intr; + u16 num_64b_ovfls, has_ovfl_sw, must_switch; + u64 ovfl_thres, old_val, new_val, ovfl_mask; + + num_64b_ovfls = must_switch = 0; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + max_pmd = ctx->regs.max_pmd; + max_intr = ctx->regs.max_intr_pmd; + + num_ovfls = set->npend_ovfls; + has_ovfl_sw = set->flags & PFM_SETFL_OVFL_SWITCH; + + bitmap_zero(cast_ulp(set->reset_pmds), max_pmd); + + for (i = ctx->regs.first_intr_pmd; num_ovfls; i++) { + /* + * skip pmd which did not overflow + */ + if (!test_bit(i, cast_ulp(set->povfl_pmds))) + continue; + + num_ovfls--; + + /* + * Update software value for counters ONLY + * + * Note that the pmd is not necessarily 0 at this point as + * qualified events may have happened before the PMU was + * frozen. The residual count is not taken into consideration + * here but will be with any read of the pmd + */ + ovfl_thres = set->pmds[i].ovflsw_thres; + + if (likely(test_bit(i, cast_ulp(ctx->regs.cnt_pmds)))) { + old_val = new_val = set->pmds[i].value; + new_val += 1 + ovfl_mask; + set->pmds[i].value = new_val; + } else { + /* + * for non counters which interrupt, e.g., AMD IBS, + * we consider this equivalent to a 64-bit counter + * overflow. + */ + old_val = 1; new_val = 0; + } + + /* + * check for 64-bit overflow condition + */ + if (likely(old_val > new_val)) { + num_64b_ovfls++; + if (has_ovfl_sw && ovfl_thres > 0) { + if (ovfl_thres == 1) + must_switch = 1; + set->pmds[i].ovflsw_thres = ovfl_thres - 1; + } + + /* + * what to reset because of this overflow + * - the overflowed register + * - its reset_smpls + */ + __set_bit(i, cast_ulp(set->reset_pmds)); + + bitmap_or(cast_ulp(set->reset_pmds), + cast_ulp(set->reset_pmds), + cast_ulp(set->pmds[i].reset_pmds), + max_pmd); + } else { + /* + * only keep track of 64-bit overflows or + * assimilated + */ + __clear_bit(i, cast_ulp(set->povfl_pmds)); + + /* + * on some PMU, it may be necessary to re-arm the PMD + */ + pfm_arch_ovfl_reset_pmd(ctx, i); + } + + PFM_DBG_ovfl("ovfl=%s pmd%u new=0x%llx old=0x%llx " + "hw_pmd=0x%llx o_pmds=0x%llx must_switch=%u " + "o_thres=%llu o_thres_ref=%llu", + old_val > new_val ? 
"64-bit" : "HW", + i, + (unsigned long long)new_val, + (unsigned long long)old_val, + (unsigned long long)pfm_read_pmd(ctx, i), + (unsigned long long)set->povfl_pmds[0], + must_switch, + (unsigned long long)set->pmds[i].ovflsw_thres, + (unsigned long long)set->pmds[i].ovflsw_ref_thres); + } + /* + * update public bitmask of 64-bit overflowed pmds + */ + if (num_64b_ovfls) + bitmap_copy(cast_ulp(set->ovfl_pmds), cast_ulp(set->povfl_pmds), + max_intr); + + if (must_switch) + *ovfl_ctrl |= PFM_OVFL_CTRL_SWITCH; + + /* + * mark the overflows as consumed + */ + set->npend_ovfls = 0; + bitmap_zero(cast_ulp(set->povfl_pmds), max_intr); + + return num_64b_ovfls; +} + +/** + * pfm_intr_get_smpl_pmds_values - copy 64-bit pmd values for sampling format + * @ctx: context to work on + * @set: current event set + * @arg: overflow arg to be passed to format + * @smpl_pmds: list of PMDs of interest for the overflowed register + * + * build an array of 46-bit PMD values based on smpl_pmds. Values are + * stored in increasing order of the PMD indexes + */ +static void pfm_intr_get_smpl_pmds_values(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfm_ovfl_arg *arg, + u64 *smpl_pmds) +{ + u16 j, k, max_pmd; + u64 new_val, ovfl_mask; + u64 *cnt_pmds; + + cnt_pmds = ctx->regs.cnt_pmds; + max_pmd = ctx->regs.max_pmd; + ovfl_mask = pfm_pmu_conf->ovfl_mask; + + for (j = k = 0; j < max_pmd; j++) { + + if (!test_bit(j, cast_ulp(smpl_pmds))) + continue; + + new_val = pfm_read_pmd(ctx, j); + + /* for counters, build 64-bit value */ + if (test_bit(j, cast_ulp(cnt_pmds))) + new_val = (set->pmds[j].value & ~ovfl_mask) + | (new_val & ovfl_mask); + + arg->smpl_pmds_values[k++] = new_val; + + PFM_DBG_ovfl("s_pmd_val[%u]=pmd%u=0x%llx", k, j, + (unsigned long long)new_val); + } + arg->num_smpl_pmds = k; +} + +/** + * pfm_intr_process_smpl_fmt -- handle sampling format callback + * @ctx: context to work on + * @set: current event set + * @ip: interrupted instruction pointer + * @now: timestamp + * @num_ovfls: number of 64-bit overflows + * @ovfl_ctrl: set of controls for interrupt handler tail processing + * @regs: register state + * + * Prepare argument (ovfl_arg) to be passed to sampling format callback, then + * invoke the callback (fmt_handler) + */ +static int pfm_intr_process_smpl_fmt(struct pfm_context *ctx, + struct pfm_event_set *set, + unsigned long ip, + u64 now, + u64 num_ovfls, + u32 *ovfl_ctrl, + struct pt_regs *regs) +{ + struct pfm_ovfl_arg *ovfl_arg; + u64 start_cycles, end_cycles; + u16 i, max_pmd; + int ret = 0; + + ovfl_arg = &ctx->ovfl_arg; + + ovfl_arg->active_set = set->id; + max_pmd = ctx->regs.max_pmd; + + /* + * first_intr_pmd: first PMD which can generate PMU interrupts + */ + for (i = ctx->regs.first_intr_pmd; num_ovfls; i++) { + /* + * skip pmd which did not have 64-bit overflows + */ + if (!test_bit(i, cast_ulp(set->ovfl_pmds))) + continue; + + num_ovfls--; + + /* + * prepare argument to fmt_handler + */ + ovfl_arg->ovfl_pmd = i; + ovfl_arg->ovfl_ctrl = 0; + + ovfl_arg->pmd_last_reset = set->pmds[i].lval; + ovfl_arg->pmd_eventid = set->pmds[i].eventid; + ovfl_arg->num_smpl_pmds = 0; + + /* + * copy values of pmds of interest, if any + * Sampling format may use them + * We do not initialize the unused smpl_pmds_values + */ + if (!bitmap_empty(cast_ulp(set->pmds[i].smpl_pmds), max_pmd)) + pfm_intr_get_smpl_pmds_values(ctx, set, ovfl_arg, + set->pmds[i].smpl_pmds); + + pfm_stats_inc(fmt_handler_calls); + + /* + * call format record (handler) routine + */ + start_cycles = sched_clock(); 
+ ret = (*ctx->smpl_fmt->fmt_handler)(ctx, ip, now, regs); + end_cycles = sched_clock(); + + /* + * The reset_pmds mask is constructed automatically + * on overflow. When the actual reset takes place + * depends on the masking, switch and notification + * status. It may be deferred until pfm_restart(). + */ + *ovfl_ctrl |= ovfl_arg->ovfl_ctrl; + + pfm_stats_add(fmt_handler_ns, end_cycles - start_cycles); + } + /* + * when the format cannot handle the rest of the overflow, we abort + */ + if (ret) + PFM_DBG_ovfl("handler aborted at PMD%u ret=%d", i, ret); + return ret; +} +/** + * pfm_overflow_handler - main overflow processing routine. + * @ctx: context to work on (always current context) + * @set: current event set + * @ip: interrupt instruction pointer + * @regs: machine state + * + * set->num_ovfl_pmds is 0 when returning from this function even though + * set->ovfl_pmds[] may have bits set. When leaving set->num_ovfl_pmds + * must never be used to determine if there was a pending overflow. + */ +static void pfm_overflow_handler(struct pfm_context *ctx, + struct pfm_event_set *set, + unsigned long ip, + struct pt_regs *regs) +{ + struct pfm_event_set *set_orig; + u64 now; + u32 ovfl_ctrl; + u16 max_intr, max_pmd; + u16 num_ovfls; + int ret, has_notify; + + /* + * take timestamp + */ + now = sched_clock(); + + max_pmd = ctx->regs.max_pmd; + max_intr = ctx->regs.max_intr_pmd; + + set_orig = set; + ovfl_ctrl = 0; + + /* + * skip ZOMBIE case + */ + if (unlikely(ctx->state == PFM_CTX_ZOMBIE)) + goto stop_monitoring; + + PFM_DBG_ovfl("intr_pmds=0x%llx npend=%u ip=%p, blocking=%d " + "u_pmds=0x%llx use_fmt=%u", + (unsigned long long)set->povfl_pmds[0], + set->npend_ovfls, + (void *)ip, + ctx->flags.block, + (unsigned long long)set->used_pmds[0], + !!ctx->smpl_fmt); + + /* + * return number of 64-bit overflows + */ + num_ovfls = pfm_intr_process_64bit_ovfls(ctx, set, &ovfl_ctrl); + + /* + * there were no 64-bit overflows + * nothing else to do + */ + if (!num_ovfls) + return; + + /* + * tmp_ovfl_notify = ovfl_pmds & ovfl_notify + * with: + * - ovfl_pmds: last 64-bit overflowed pmds + * - ovfl_notify: notify on overflow registers + */ + bitmap_and(cast_ulp(ctx->tmp_ovfl_notify), + cast_ulp(set->ovfl_pmds), + cast_ulp(set->ovfl_notify), + max_intr); + + has_notify = !bitmap_empty(cast_ulp(ctx->tmp_ovfl_notify), max_intr); + + /* + * check for sampling format and invoke fmt_handler + */ + if (likely(ctx->smpl_fmt)) { + pfm_intr_process_smpl_fmt(ctx, set, ip, now, num_ovfls, + &ovfl_ctrl, regs); + } else { + /* + * When no sampling format is used, the default + * is: + * - mask monitoring if not switching + * - notify user if requested + * + * If notification is not requested, monitoring is masked + * and overflowed registers are not reset (saturation). + * This mimics the behavior of the default sampling format. 
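+	 *
+	 * For instance, with no set switching pending and notification
+	 * requested on the overflowed PMD, the statements below yield
+	 * ovfl_ctrl = NOTIFY|MASK: monitoring is masked and the reset of
+	 * the overflowed PMDs is deferred until pfm_restart().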
+ */ + ovfl_ctrl |= PFM_OVFL_CTRL_NOTIFY; + if (has_notify || !(ovfl_ctrl & PFM_OVFL_CTRL_SWITCH)) + ovfl_ctrl |= PFM_OVFL_CTRL_MASK; + } + + PFM_DBG_ovfl("set%u o_notify=0x%llx o_pmds=0x%llx " + "r_pmds=0x%llx ovfl_ctrl=0x%x", + set->id, + (unsigned long long)ctx->tmp_ovfl_notify[0], + (unsigned long long)set->ovfl_pmds[0], + (unsigned long long)set->reset_pmds[0], + ovfl_ctrl); + + /* + * execute the various controls + * ORDER MATTERS + */ + + + /* + * mask monitoring + */ + if (ovfl_ctrl & PFM_OVFL_CTRL_MASK) { + pfm_mask_monitoring(ctx, set); + /* + * when masking, reset is deferred until + * pfm_restart() + */ + ovfl_ctrl &= ~PFM_OVFL_CTRL_RESET; + + /* + * when masking, switching is deferred until + * pfm_restart and we need to remember it + */ + if (ovfl_ctrl & PFM_OVFL_CTRL_SWITCH) { + set->priv_flags |= PFM_SETFL_PRIV_SWITCH; + ovfl_ctrl &= ~PFM_OVFL_CTRL_SWITCH; + } + } + + /* + * switch event set + */ + if (ovfl_ctrl & PFM_OVFL_CTRL_SWITCH) { + pfm_switch_sets_from_intr(ctx); + /* update view of active set */ + set = ctx->active_set; + } + /* + * send overflow notification + * + * only necessary if at least one overflowed + * register had the notify flag set + */ + if (has_notify && (ovfl_ctrl & PFM_OVFL_CTRL_NOTIFY)) { + /* + * block on notify, not on masking + */ + if (ctx->flags.block) + pfm_post_work(current, ctx, PFM_WORK_BLOCK); + + /* + * send notification and passed original set id + * if error, queue full, for instance, then default + * to masking monitoring, i.e., saturate + */ + ret = pfm_ovfl_notify(ctx, set_orig, ip); + if (unlikely(ret)) { + if (ctx->state == PFM_CTX_LOADED) { + pfm_mask_monitoring(ctx, set); + ovfl_ctrl &= ~PFM_OVFL_CTRL_RESET; + } + } else { + ctx->flags.can_restart++; + PFM_DBG_ovfl("can_restart=%u", ctx->flags.can_restart); + } + } + + /* + * reset overflowed registers + */ + if (ovfl_ctrl & PFM_OVFL_CTRL_RESET) { + u16 nn; + nn = bitmap_weight(cast_ulp(set->reset_pmds), max_pmd); + if (nn) + pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_SHORT); + } + return; + +stop_monitoring: + /* + * Does not happen for a system-wide context nor for a + * self-monitored context. We cannot attach to kernel-only + * thread, thus it is safe to set TIF bits, i.e., the thread + * will eventually leave the kernel or die and either we will + * catch the context and clean it up in pfm_handler_work() or + * pfm_exit_thread(). + * + * Mask until we get to pfm_handle_work() + */ + pfm_mask_monitoring(ctx, set); + + PFM_DBG_ovfl("ctx is zombie, converted to spurious"); + pfm_post_work(current, ctx, PFM_WORK_ZOMBIE); +} + +/** + * __pfm_interrupt_handler - 1st level interrupt handler + * @ip: interrupted instruction pointer + * @regs: machine state + * + * Function is static because we use a wrapper to easily capture timing infos. + * + * + * Context locking necessary to avoid concurrent accesses from other CPUs + * - For per-thread, we must prevent pfm_restart() which works when + * context is LOADED or MASKED + */ +static void __pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs) +{ + struct task_struct *task; + struct pfm_context *ctx; + struct pfm_event_set *set; + + + task = __get_cpu_var(pmu_owner); + ctx = __get_cpu_var(pmu_ctx); + + /* + * verify if there is a context on this CPU + */ + if (unlikely(ctx == NULL)) { + PFM_DBG_ovfl("no ctx"); + goto spurious; + } + + /* + * we need to lock context because it could be accessed + * from another CPU. 
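+	 * pfm_restart(), notably, can run from another CPU while the
+	 * context is LOADED or MASKED (see the function header above).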
Depending on the priority level of + * the PMU interrupt or the arch, it may be necessary to + * mask interrupts alltogether to avoid race condition with + * the timer interrupt in case of time-based set switching, + * for instance. + */ + spin_lock(&ctx->lock); + + set = ctx->active_set; + + /* + * For SMP per-thread, it is not possible to have + * owner != NULL && task != current. + * + * For UP per-thread, because of lazy save, it + * is possible to receive an interrupt in another task + * which is not using the PMU. This means + * that the interrupt was in-flight at the + * time of pfm_ctxswout_thread(). In that + * case, it will be replayed when the task + * is scheduled again. Hence we convert to spurious. + * + * The basic rule is that an overflow is always + * processed in the context of the task that + * generated it for all per-thread contexts. + * + * for system-wide, task is always NULL + */ +#ifndef CONFIG_SMP + if (unlikely((task && current->pfm_context != ctx))) { + PFM_DBG_ovfl("spurious: not owned by current task"); + goto spurious; + } +#endif + if (unlikely(ctx->state == PFM_CTX_MASKED)) { + PFM_DBG_ovfl("spurious: monitoring masked"); + goto spurious; + } + + /* + * check that monitoring is active, otherwise convert + * to spurious + */ + if (unlikely(!pfm_arch_is_active(ctx))) { + PFM_DBG_ovfl("spurious: monitoring non active"); + goto spurious; + } + + /* + * freeze PMU and collect overflowed PMD registers + * into set->povfl_pmds. Number of overflowed PMDs + * reported in set->npend_ovfls + */ + pfm_arch_intr_freeze_pmu(ctx, set); + + /* + * no overflow detected, interrupt may have come + * from the previous thread running on this CPU + */ + if (unlikely(!set->npend_ovfls)) { + PFM_DBG_ovfl("no npend_ovfls"); + goto spurious; + } + + pfm_stats_inc(ovfl_intr_regular_count); + + /* + * invoke actual handler + */ + pfm_overflow_handler(ctx, set, ip, regs); + + /* + * unfreeze PMU, monitoring may not actual be restarted + * if context is MASKED + */ + pfm_arch_intr_unfreeze_pmu(ctx); + + spin_unlock(&ctx->lock); + + return; + +spurious: + /* ctx may be NULL */ + pfm_arch_intr_unfreeze_pmu(ctx); + if (ctx) + spin_unlock(&ctx->lock); + + pfm_stats_inc(ovfl_intr_spurious_count); +} + + +/** + * pfm_interrupt_handler - 1st level interrupt handler + * @ip: interrupt instruction pointer + * @regs: machine state + * + * Function called from the low-level assembly code or arch-specific perfmon + * code. Simple wrapper used for timing purpose. Actual work done in + * __pfm_overflow_handler() + */ +void pfm_interrupt_handler(unsigned long ip, struct pt_regs *regs) +{ + u64 start; + + pfm_stats_inc(ovfl_intr_all_count); + + BUG_ON(!irqs_disabled()); + + start = sched_clock(); + + __pfm_interrupt_handler(ip, regs); + + pfm_stats_add(ovfl_intr_ns, sched_clock() - start); +} +EXPORT_SYMBOL(pfm_interrupt_handler); + diff --git a/perfmon/perfmon_msg.c b/perfmon/perfmon_msg.c new file mode 100644 index 0000000..b8a1e4c --- /dev/null +++ b/perfmon/perfmon_msg.c @@ -0,0 +1,229 @@ +/* + * perfmon_msg.c: perfmon2 notification message queue management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. 
+ * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include + +/** + * pfm_get_new_msg - get a new message slot from the queue + * @ctx: context to operate on + * + * if queue if full NULL is returned + */ +static union pfarg_msg *pfm_get_new_msg(struct pfm_context *ctx) +{ + int next; + + next = ctx->msgq_head & PFM_MSGQ_MASK; + + if ((ctx->msgq_head - ctx->msgq_tail) == PFM_MSGS_COUNT) + return NULL; + + /* + * move to next possible slot + */ + ctx->msgq_head++; + + PFM_DBG_ovfl("head=%d tail=%d msg=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK, + next); + + return ctx->msgq+next; +} + +/** + * pfm_notify_user - wakeup any thread wiating on msg queue, post SIGIO + * @ctx: context to operate on + * + * message is already enqueued + */ +static void pfm_notify_user(struct pfm_context *ctx) +{ + if (ctx->state == PFM_CTX_ZOMBIE) { + PFM_DBG("no notification, context is zombie"); + return; + } + + PFM_DBG_ovfl("waking up"); + + wake_up_interruptible(&ctx->msgq_wait); + + /* + * it is safe to call kill_fasync() from an interrupt + * handler. kill_fasync() grabs two RW locks (fasync_lock, + * tasklist_lock) in read mode. There is conflict only in + * case the PMU interrupt occurs during a write mode critical + * section. This cannot happen because for both locks, the + * write mode is always using interrupt masking (write_lock_irq). + */ + kill_fasync(&ctx->async_queue, SIGIO, POLL_IN); +} + +/** + * pfm_ovfl_notify - send overflow notification + * @ctx: context to operate on + * @set: which set the overflow comes from + * @ip: overflow interrupt instruction address (IIP) + * + * Appends an overflow notification message to context queue. + * call pfm_notify() to wakeup any threads and/or send a signal + * + * Context is locked and interrupts are disabled (no preemption). + */ +int pfm_ovfl_notify(struct pfm_context *ctx, + struct pfm_event_set *set, + unsigned long ip) +{ + union pfarg_msg *msg = NULL; + u64 *ovfl_pmds; + + if (!ctx->flags.no_msg) { + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + /* + * when message queue fills up it is because the user + * did not extract the message, yet issued + * pfm_restart(). At this point, we stop sending + * notification, thus the user will not be able to get + * new samples when using the default format. 
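+			 * A slot only becomes free once pfm_get_next_msg()
+			 * has advanced the tail; for example, assuming
+			 * PFM_MSGS_COUNT = 8, head = 10 and tail = 2 is
+			 * exactly the full condition checked in
+			 * pfm_get_new_msg() above.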
+ */ + PFM_DBG_ovfl("no more notification msgs"); + return -1; + } + + msg->pfm_ovfl_msg.msg_type = PFM_MSG_OVFL; + msg->pfm_ovfl_msg.msg_ovfl_pid = current->pid; + msg->pfm_ovfl_msg.msg_active_set = set->id; + + ovfl_pmds = msg->pfm_ovfl_msg.msg_ovfl_pmds; + + /* + * copy bitmask of all pmd that interrupted last + */ + bitmap_copy(cast_ulp(ovfl_pmds), cast_ulp(set->ovfl_pmds), + ctx->regs.max_intr_pmd); + + msg->pfm_ovfl_msg.msg_ovfl_cpu = smp_processor_id(); + msg->pfm_ovfl_msg.msg_ovfl_tid = current->tgid; + msg->pfm_ovfl_msg.msg_ovfl_ip = ip; + + pfm_stats_inc(ovfl_notify_count); + } + + PFM_DBG_ovfl("ip=0x%lx o_pmds=0x%llx", + ip, + (unsigned long long)set->ovfl_pmds[0]); + + pfm_notify_user(ctx); + return 0; +} + +/** + * pfm_end_notify_user - notify of thread termination + * @ctx: context to operate on + * + * In per-thread mode, when not self-monitoring, perfmon + * sends a 'end' notification message when the monitored + * thread where the context is attached is exiting. + * + * This helper message alleviates the need to track the activity + * of the thread/process when it is not directly related, i.e., + * was attached. In other words, no needto keep the thread + * ptraced. + * + * The context must be locked and interrupts disabled. + */ +int pfm_end_notify(struct pfm_context *ctx) +{ + union pfarg_msg *msg; + + msg = pfm_get_new_msg(ctx); + if (msg == NULL) { + PFM_ERR("%s no more msgs", __func__); + return -1; + } + /* no leak */ + memset(msg, 0, sizeof(*msg)); + + msg->type = PFM_MSG_END; + + PFM_DBG("end msg: msg=%p no_msg=%d", + msg, + ctx->flags.no_msg); + + pfm_notify_user(ctx); + return 0; +} + +/** + * pfm_get_next_msg - copy the oldest message from the queue and move tail + * @ctx: context to use + * @m: where to copy the message into + * + * The tail of the queue is moved as a consequence of this call + */ +void pfm_get_next_msg(struct pfm_context *ctx, union pfarg_msg *m) +{ + union pfarg_msg *next; + + PFM_DBG_ovfl("in head=%d tail=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK); + + /* + * get oldest message + */ + next = ctx->msgq + (ctx->msgq_tail & PFM_MSGQ_MASK); + + /* + * move tail forward + */ + ctx->msgq_tail++; + + /* + * copy message, we cannot simply point to it + * as it may be re-used before we copy it out + */ + *m = *next; + + PFM_DBG_ovfl("out head=%d tail=%d type=%d", + ctx->msgq_head & PFM_MSGQ_MASK, + ctx->msgq_tail & PFM_MSGQ_MASK, + m->type); +} diff --git a/perfmon/perfmon_pmu.c b/perfmon/perfmon_pmu.c new file mode 100644 index 0000000..df7a9c9 --- /dev/null +++ b/perfmon/perfmon_pmu.c @@ -0,0 +1,590 @@ +/* + * perfmon_pmu.c: perfmon2 PMU configuration management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include "perfmon_priv.h" + +#ifndef CONFIG_MODULE_UNLOAD +#define module_refcount(n) 1 +#endif + +static __cacheline_aligned_in_smp int request_mod_in_progress; +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_conf_lock); + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_pmu_acq_lock); +static u32 pfm_pmu_acquired; + +/* + * perfmon core must acces PMU information ONLY through pfm_pmu_conf + * if pfm_pmu_conf is NULL, then no description is registered + */ +struct pfm_pmu_config *pfm_pmu_conf; +EXPORT_SYMBOL(pfm_pmu_conf); + +static inline int pmu_is_module(struct pfm_pmu_config *c) +{ + return !(c->flags & PFM_PMUFL_IS_BUILTIN); +} +/** + * pfm_pmu_regdesc_init -- initialize regdesc structure from PMU table + * @regs: the regdesc structure to initialize + * @excl_type: the register type(s) to exclude from this regdesc + * @unvail_pmcs: unavailable PMC registers + * @unavail_pmds: unavailable PMD registers + * + * Return: + * 0 success + * errno in case of error + */ +static int pfm_pmu_regdesc_init(struct pfm_regdesc *regs, int excl_type, + u64 *unavail_pmcs, u64 *unavail_pmds) +{ + struct pfm_regmap_desc *d; + u16 n, n2, n_counters, i; + int first_intr_pmd = -1, max1, max2, max3; + + /* + * compute the number of implemented PMC from the + * description table + */ + n = 0; + max1 = max2 = -1; + d = pfm_pmu_conf->pmc_desc; + for (i = 0; i < pfm_pmu_conf->num_pmc_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + + if (test_bit(i, cast_ulp(unavail_pmcs))) + continue; + + if (d->type & excl_type) + continue; + + __set_bit(i, cast_ulp(regs->pmcs)); + + max1 = i; + n++; + } + + if (!n) { + PFM_INFO("%s PMU description has no PMC registers", + pfm_pmu_conf->pmu_name); + return -EINVAL; + } + + regs->max_pmc = max1 + 1; + regs->num_pmcs = n; + + n = n_counters = n2 = 0; + max1 = max2 = max3 = -1; + d = pfm_pmu_conf->pmd_desc; + for (i = 0; i < pfm_pmu_conf->num_pmd_entries; i++, d++) { + if (!(d->type & PFM_REG_I)) + continue; + + if (test_bit(i, cast_ulp(unavail_pmds))) + continue; + + if (d->type & excl_type) + continue; + + __set_bit(i, cast_ulp(regs->pmds)); + max1 = i; + n++; + + /* + * read-write registers + */ + if (!(d->type & PFM_REG_RO)) { + __set_bit(i, cast_ulp(regs->rw_pmds)); + max3 = i; + n2++; + } + + /* + * counter registers + */ + if (d->type & PFM_REG_C64) { + __set_bit(i, cast_ulp(regs->cnt_pmds)); + n_counters++; + } + + /* + * PMD with intr capabilities + */ + if (d->type & PFM_REG_INTR) { + __set_bit(i, cast_ulp(regs->intr_pmds)); + if (first_intr_pmd == -1) + first_intr_pmd = i; + max2 = i; + } + } + + if (!n) { + PFM_INFO("%s PMU description has no PMD registers", + pfm_pmu_conf->pmu_name); + return -EINVAL; + } + + regs->max_pmd = max1 + 1; + regs->first_intr_pmd = first_intr_pmd; + regs->max_intr_pmd = max2 + 1; + + regs->num_counters = n_counters; + regs->num_pmds = n; + regs->max_rw_pmd = max3 + 1; + regs->num_rw_pmd = n2; + + return 0; +} + +/** + * pfm_pmu_regdesc_init_all -- initialize all regdesc structures + * @una_pmcs : unavailable PMC 
registers + * @una_pmds : unavailable PMD registers + * + * Return: + * 0 sucess + * errno if error + * + * We maintain 3 regdesc: + * regs_all: all available registers + * regs_sys: registers available to system-wide contexts only + * regs_thr: registers available to per-thread contexts only + */ +static int pfm_pmu_regdesc_init_all(u64 *una_pmcs, u64 *una_pmds) +{ + int ret; + + memset(&pfm_pmu_conf->regs_all, 0, sizeof(struct pfm_regdesc)); + memset(&pfm_pmu_conf->regs_thr, 0, sizeof(struct pfm_regdesc)); + memset(&pfm_pmu_conf->regs_sys, 0, sizeof(struct pfm_regdesc)); + + ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_all, + 0, + una_pmcs, una_pmds); + if (ret) + return ret; + + PFM_DBG("regs_all.pmcs=0x%llx", + (unsigned long long)pfm_pmu_conf->regs_all.pmcs[0]); + + ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_thr, + PFM_REG_SYS, + una_pmcs, una_pmds); + if (ret) + return ret; + PFM_DBG("regs.thr.pmcs=0x%llx", + (unsigned long long)pfm_pmu_conf->regs_thr.pmcs[0]); + + ret = pfm_pmu_regdesc_init(&pfm_pmu_conf->regs_sys, + PFM_REG_THR, + una_pmcs, una_pmds); + + PFM_DBG("regs_sys.pmcs=0x%llx", + (unsigned long long)pfm_pmu_conf->regs_sys.pmcs[0]); + + return ret; +} + +int pfm_pmu_register(struct pfm_pmu_config *cfg) +{ + u16 i, nspec, nspec_ro, num_pmcs, num_pmds, num_wc = 0; + int type, ret = -EBUSY; + + if (perfmon_disabled) { + PFM_INFO("perfmon disabled, cannot add PMU description"); + return -ENOSYS; + } + + nspec = nspec_ro = num_pmds = num_pmcs = 0; + + /* some sanity checks */ + if (cfg == NULL || cfg->pmu_name == NULL) { + PFM_INFO("PMU config descriptor is invalid"); + return -EINVAL; + } + + /* must have a probe */ + if (cfg->probe_pmu == NULL) { + PFM_INFO("PMU config has no probe routine"); + return -EINVAL; + } + + /* + * execute probe routine before anything else as it + * may update configuration tables + */ + if ((*cfg->probe_pmu)() == -1) { + PFM_INFO("%s PMU detection failed", cfg->pmu_name); + return -EINVAL; + } + + if (!(cfg->flags & PFM_PMUFL_IS_BUILTIN) && cfg->owner == NULL) { + PFM_INFO("PMU config %s is missing owner", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->num_pmd_entries) { + PFM_INFO("%s needs to define num_pmd_entries", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->num_pmc_entries) { + PFM_INFO("%s needs to define num_pmc_entries", cfg->pmu_name); + return -EINVAL; + } + + if (!cfg->counter_width) { + PFM_INFO("PMU config %s, zero width counters", cfg->pmu_name); + return -EINVAL; + } + + /* + * REG_RO, REG_V not supported on PMC registers + */ + for (i = 0; i < cfg->num_pmc_entries; i++) { + + type = cfg->pmc_desc[i].type; + + if (type & PFM_REG_I) + num_pmcs++; + + if (type & PFM_REG_WC) + num_wc++; + + if (type & PFM_REG_V) { + PFM_INFO("PFM_REG_V is not supported on " + "PMCs (PMC%d)", i); + return -EINVAL; + } + if (type & PFM_REG_RO) { + PFM_INFO("PFM_REG_RO meaningless on " + "PMCs (PMC%u)", i); + return -EINVAL; + } + } + + if (num_wc && cfg->pmc_write_check == NULL) { + PFM_INFO("some PMCs have write-checker but no callback provided\n"); + return -EINVAL; + } + + /* + * check virtual PMD registers + */ + num_wc = 0; + for (i = 0; i < cfg->num_pmd_entries; i++) { + + type = cfg->pmd_desc[i].type; + + if (type & PFM_REG_I) + num_pmds++; + + if (type & PFM_REG_V) { + nspec++; + if (type & PFM_REG_RO) + nspec_ro++; + } + + if (type & PFM_REG_WC) + num_wc++; + } + + if (num_wc && cfg->pmd_write_check == NULL) { + PFM_INFO("PMD have write-checker but no callback provided\n"); + return -EINVAL; + } + + if (nspec && cfg->pmd_sread 
== NULL) { + PFM_INFO("PMU config is missing pmd_sread()"); + return -EINVAL; + } + + nspec = nspec - nspec_ro; + if (nspec && cfg->pmd_swrite == NULL) { + PFM_INFO("PMU config is missing pmd_swrite()"); + return -EINVAL; + } + + if (num_pmcs >= PFM_MAX_PMCS) { + PFM_INFO("%s PMCS registers exceed name space [0-%u]", + cfg->pmu_name, + PFM_MAX_PMCS); + return -EINVAL; + } + if (num_pmds >= PFM_MAX_PMDS) { + PFM_INFO("%s PMDS registers exceed name space [0-%u]", + cfg->pmu_name, + PFM_MAX_PMDS); + return -EINVAL; + } + spin_lock(&pfm_pmu_conf_lock); + + if (pfm_pmu_conf) + goto unlock; + + if (!cfg->version) + cfg->version = "0.0"; + + pfm_pmu_conf = cfg; + pfm_pmu_conf->ovfl_mask = (1ULL << cfg->counter_width) - 1; + + ret = pfm_arch_pmu_config_init(cfg); + if (ret) + goto unlock; + + ret = pfm_sysfs_add_pmu(pfm_pmu_conf); + if (ret) + pfm_pmu_conf = NULL; + +unlock: + spin_unlock(&pfm_pmu_conf_lock); + + if (ret) { + PFM_INFO("register %s PMU error %d", cfg->pmu_name, ret); + } else { + PFM_INFO("%s PMU installed", cfg->pmu_name); + /* + * (re)initialize PMU on each PMU now that we have a description + */ + on_each_cpu(__pfm_init_percpu, cfg, 0); + } + return ret; +} +EXPORT_SYMBOL(pfm_pmu_register); + +/* + * remove PMU description. Caller must pass address of current + * configuration. This is mostly for sanity checking as only + * one config can exist at any time. + * + * We are using the module refcount mechanism to protect against + * removal while the configuration is being used. As long as there is + * one context, a PMU configuration cannot be removed. The protection is + * managed in module logic. + */ +void pfm_pmu_unregister(struct pfm_pmu_config *cfg) +{ + if (!(cfg || pfm_pmu_conf)) + return; + + spin_lock(&pfm_pmu_conf_lock); + + BUG_ON(module_refcount(pfm_pmu_conf->owner)); + + if (cfg->owner == pfm_pmu_conf->owner) { + pfm_sysfs_remove_pmu(pfm_pmu_conf); + pfm_pmu_conf = NULL; + } + + spin_unlock(&pfm_pmu_conf_lock); +} +EXPORT_SYMBOL(pfm_pmu_unregister); + +static int pfm_pmu_request_module(void) +{ + char *mod_name; + int ret; + + mod_name = pfm_arch_get_pmu_module_name(); + if (mod_name == NULL) + return -ENOSYS; + + ret = request_module(mod_name); + + PFM_DBG("mod=%s ret=%d\n", mod_name, ret); + return ret; +} + +/* + * autoload: + * 0 : do not try to autoload the PMU description module + * not 0 : try to autoload the PMU description module + */ +int pfm_pmu_conf_get(int autoload) +{ + int ret; + + spin_lock(&pfm_pmu_conf_lock); + + if (request_mod_in_progress) { + ret = -ENOSYS; + goto skip; + } + + if (autoload && pfm_pmu_conf == NULL) { + + request_mod_in_progress = 1; + + spin_unlock(&pfm_pmu_conf_lock); + + pfm_pmu_request_module(); + + spin_lock(&pfm_pmu_conf_lock); + + request_mod_in_progress = 0; + + /* + * request_module() may succeed but the module + * may not have registered properly so we need + * to check + */ + } + + ret = pfm_pmu_conf == NULL ? -ENOSYS : 0; + if (!ret && pmu_is_module(pfm_pmu_conf) + && !try_module_get(pfm_pmu_conf->owner)) + ret = -ENOSYS; + +skip: + spin_unlock(&pfm_pmu_conf_lock); + + return ret; +} + +void pfm_pmu_conf_put(void) +{ + if (pfm_pmu_conf == NULL || !pmu_is_module(pfm_pmu_conf)) + return; + + spin_lock(&pfm_pmu_conf_lock); + module_put(pfm_pmu_conf->owner); + spin_unlock(&pfm_pmu_conf_lock); +} + + +/* + * acquire PMU resource from lower-level PMU register allocator + * (currently perfctr-watchdog.c) + * + * acquisition is done when the first context is created (and not + * when it is loaded). 
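+ * Only the first acquisition reserves the hardware: pfm_pmu_acquired
+ * goes from 0 to 1, pfm_arch_pmu_acquire() is called and the regdesc
+ * tables are rebuilt below; subsequent contexts merely bump the count
+ * and copy the relevant regdesc.
+ *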
We grab all that is defined in the description + * module and then we make adjustments at the arch-specific level. + * + * The PMU resource is released when the last perfmon context is + * destroyed. + * + * interrupts are not masked + */ +int pfm_pmu_acquire(struct pfm_context *ctx) +{ + u64 unavail_pmcs[PFM_PMC_BV]; + u64 unavail_pmds[PFM_PMD_BV]; + int ret = 0; + + spin_lock(&pfm_pmu_acq_lock); + + PFM_DBG("pmu_acquired=%u", pfm_pmu_acquired); + + pfm_pmu_acquired++; + + /* + * we need to initialize regdesc each time we re-acquire + * the PMU for the first time as there may have been changes + * in the list of available registers, e.g., NMI may have + * been disabled. Checking on PMU module insert is not + * enough + */ + if (pfm_pmu_acquired == 1) { + memset(unavail_pmcs, 0, sizeof(unavail_pmcs)); + memset(unavail_pmds, 0, sizeof(unavail_pmds)); + + ret = pfm_arch_pmu_acquire(unavail_pmcs, unavail_pmds); + if (ret) { + pfm_pmu_acquired--; + } else { + pfm_pmu_regdesc_init_all(unavail_pmcs, unavail_pmds); + + /* available PMU ressources */ + PFM_DBG("PMU acquired: %u PMCs, %u PMDs, %u counters", + pfm_pmu_conf->regs_all.num_pmcs, + pfm_pmu_conf->regs_all.num_pmds, + pfm_pmu_conf->regs_all.num_counters); + } + } + spin_unlock(&pfm_pmu_acq_lock); + + /* + * copy the regdesc that corresponds to the context + * we copy and not just point because it helps with + * memory locality. the regdesc structure is accessed + * very frequently in performance critical code such + * as context switch and interrupt handling. By using + * a local copy, we increase memory footprint, but + * increase chance to have local memory access, + * especially for system-wide contexts. + */ + if (ctx->flags.system) + ctx->regs = pfm_pmu_conf->regs_sys; + else + ctx->regs = pfm_pmu_conf->regs_thr; + + return ret; +} + +/* + * release the PMU resource + * + * actual release happens when last context is destroyed + * + * interrupts are not masked + */ +void pfm_pmu_release(void) +{ + BUG_ON(irqs_disabled()); + + /* + * we need to use a spinlock because release takes some time + * and we may have a race with pfm_pmu_acquire() + */ + spin_lock(&pfm_pmu_acq_lock); + + PFM_DBG("pmu_acquired=%d", pfm_pmu_acquired); + + /* + * we decouple test and decrement because if we had errors + * in pfm_pmu_acquire(), we still come here on pfm_context_free() + * but with pfm_pmu_acquire=0 + */ + if (pfm_pmu_acquired > 0 && --pfm_pmu_acquired == 0) { + pfm_arch_pmu_release(); + PFM_DBG("PMU released"); + } + spin_unlock(&pfm_pmu_acq_lock); +} diff --git a/perfmon/perfmon_priv.h b/perfmon/perfmon_priv.h new file mode 100644 index 0000000..5b485de --- /dev/null +++ b/perfmon/perfmon_priv.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2001-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ + +#ifndef __PERFMON_PRIV_H__ +#define __PERFMON_PRIV_H__ +/* + * This file contains all the definitions of data structures, variables, macros + * that are to private to the generic code, i.e., not shared with any code that + * lives under arch/ or include/asm-XX + * + * For shared definitions, use include/linux/perfmon_kern.h + */ + +#ifdef CONFIG_PERFMON + +/* + * type of PMD reset for pfm_reset_pmds() or pfm_switch_sets*() + */ +#define PFM_PMD_RESET_SHORT 1 /* use short reset value */ +#define PFM_PMD_RESET_LONG 2 /* use long reset value */ + +/* + * context lazy save/restore activation count + */ +#define PFM_INVALID_ACTIVATION ((u64)~0) + +DECLARE_PER_CPU(u64, pmu_activation_number); +DECLARE_PER_CPU(struct hrtimer, pfm_hrtimer); + +static inline void pfm_set_pmu_owner(struct task_struct *task, + struct pfm_context *ctx) +{ + __get_cpu_var(pmu_owner) = task; + __get_cpu_var(pmu_ctx) = ctx; +} + +static inline int pfm_msgq_is_empty(struct pfm_context *ctx) +{ + return ctx->msgq_head == ctx->msgq_tail; +} + +void pfm_get_next_msg(struct pfm_context *ctx, union pfarg_msg *m); +int pfm_end_notify(struct pfm_context *ctx); +int pfm_ovfl_notify(struct pfm_context *ctx, struct pfm_event_set *set, + unsigned long ip); + +int pfm_alloc_fd(struct file **cfile); + +int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count); +int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req, + int count); +int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req, + int count); + + +int pfm_init_ctx(void); + +int pfm_pmu_acquire(struct pfm_context *ctx); +void pfm_pmu_release(void); + +int pfm_session_acquire(int is_system, u32 cpu); +void pfm_session_release(int is_system, u32 cpu); + +int pfm_smpl_buf_space_acquire(struct pfm_context *ctx, size_t size); +int pfm_smpl_buf_load_context(struct pfm_context *ctx); +void pfm_smpl_buf_unload_context(struct pfm_context *ctx); + +int pfm_init_sysfs(void); + +#ifdef CONFIG_PERFMON_DEBUG_FS +int pfm_init_debugfs(void); +int pfm_debugfs_add_cpu(int mycpu); +void pfm_debugfs_del_cpu(int mycpu); +#else +static inline int pfm_init_debugfs(void) +{ + return 0; +} +static inline int pfm_debugfs_add_cpu(int mycpu) +{ + return 0; +} + +static inline void pfm_debugfs_del_cpu(int mycpu) +{} +#endif + + +void pfm_reset_pmds(struct pfm_context *ctx, struct pfm_event_set *set, + int num_pmds, + int reset_mode); + +struct pfm_event_set *pfm_prepare_sets(struct pfm_context *ctx, u16 load_set); +int pfm_init_sets(void); + +ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what); + +void pfm_free_sets(struct pfm_context *ctx); +int pfm_create_initial_set(struct pfm_context *ctx); +void pfm_switch_sets_from_intr(struct pfm_context *ctx); +void pfm_restart_timer(struct pfm_context *ctx, struct pfm_event_set *set); +enum hrtimer_restart pfm_handle_switch_timeout(struct hrtimer *t); + +enum hrtimer_restart pfm_switch_sets(struct pfm_context *ctx, + struct pfm_event_set *new_set, + int reset_mode, + int no_restart); + +/** + * pfm_save_prev_ctx - check if previous context exists and save state + * + * called from pfm_load_ctx_thread() and __pfm_ctxsin_thread() to + * check if previous context exists. If so saved its PMU state. This is used + * only for UP kernels. 
+ * + * PMU ownership is not cleared because the function is always called while + * trying to install a new owner. + */ +static inline void pfm_check_save_prev_ctx(void) +{ +#ifdef CONFIG_SMP + struct pfm_event_set *set; + struct pfm_context *ctxp; + + ctxp = __get_cpu_var(pmu_ctx); + if (!ctxp) + return; + /* + * in UP per-thread, due to lazy save + * there could be a context from another + * task. We need to push it first before + * installing our new state + */ + set = ctxp->active_set; + pfm_save_pmds(ctxp, set); + /* + * do not clear ownership because we rewrite + * right away + */ +#endif +} + + +int pfm_init_fs(void); + +int pfm_init_hotplug(void); + +void pfm_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set); +void pfm_resume_after_ovfl(struct pfm_context *ctx); +int pfm_setup_smpl_fmt(struct pfm_context *ctx, u32 ctx_flags, void *fmt_arg, + struct file *filp); + +static inline void pfm_post_work(struct task_struct *task, + struct pfm_context *ctx, int type) +{ + ctx->flags.work_type = type; + set_tsk_thread_flag(task, TIF_PERFMON_WORK); + pfm_arch_arm_handle_work(task); +} + +#define PFM_PMC_STK_ARG PFM_ARCH_PMC_STK_ARG +#define PFM_PMD_STK_ARG PFM_ARCH_PMD_STK_ARG + +#endif /* CONFIG_PERFMON */ + +#endif /* __PERFMON_PRIV_H__ */ diff --git a/perfmon/perfmon_res.c b/perfmon/perfmon_res.c new file mode 100644 index 0000000..7b0382b --- /dev/null +++ b/perfmon/perfmon_res.c @@ -0,0 +1,450 @@ +/* + * perfmon_res.c: perfmon2 resource allocations + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include "perfmon_priv.h" + +/* + * global information about all sessions + * mostly used to synchronize between system wide and per-process + */ +struct pfm_resources { + size_t smpl_buf_mem_cur;/* current smpl buf mem usage */ + cpumask_t sys_cpumask; /* bitmask of used cpus */ + u32 thread_sessions; /* #num loaded per-thread sessions */ +}; + +static struct pfm_resources pfm_res; + +static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pfm_res_lock); + +/** + * pfm_smpl_buf_space_acquire - check memory resource usage for sampling buffer + * @ctx: context of interest + * @size: size fo requested buffer + * + * sampling buffer allocated by perfmon must be + * checked against max locked memory usage thresholds + * for security reasons. + * + * The first level check is against the system wide limit + * as indicated by the system administrator in /sys/kernel/perfmon + * + * The second level check is on a per-process basis using + * RLIMIT_MEMLOCK limit. + * + * Operating on the current task only. + */ +int pfm_smpl_buf_space_acquire(struct pfm_context *ctx, size_t size) +{ + struct mm_struct *mm; + unsigned long locked; + unsigned long buf_mem, buf_mem_max; + unsigned long flags; + + spin_lock_irqsave(&pfm_res_lock, flags); + + /* + * check against global buffer limit + */ + buf_mem_max = pfm_controls.smpl_buffer_mem_max; + buf_mem = pfm_res.smpl_buf_mem_cur + size; + + if (buf_mem <= buf_mem_max) { + pfm_res.smpl_buf_mem_cur = buf_mem; + + PFM_DBG("buf_mem_max=%lu current_buf_mem=%lu", + buf_mem_max, + buf_mem); + } + + spin_unlock_irqrestore(&pfm_res_lock, flags); + + if (buf_mem > buf_mem_max) { + PFM_DBG("smpl buffer memory threshold reached"); + return -ENOMEM; + } + + /* + * check against per-process RLIMIT_MEMLOCK + */ + mm = get_task_mm(current); + + down_write(&mm->mmap_sem); + + locked = mm->locked_vm << PAGE_SHIFT; + locked += size; + + if (locked > current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur) { + + PFM_DBG("RLIMIT_MEMLOCK reached ask_locked=%lu rlim_cur=%lu", + locked, + current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur); + + up_write(&mm->mmap_sem); + mmput(mm); + goto unres; + } + + mm->locked_vm = locked >> PAGE_SHIFT; + + up_write(&mm->mmap_sem); + + mmput(mm); + + return 0; + +unres: + /* + * remove global buffer memory allocation + */ + spin_lock_irqsave(&pfm_res_lock, flags); + + pfm_res.smpl_buf_mem_cur -= size; + + spin_unlock_irqrestore(&pfm_res_lock, flags); + + return -ENOMEM; +} +/** + * pfm_smpl_buf_space_release - release resource usage for sampling buffer + * @ctx: perfmon context of interest + * + * There exist multiple paths leading to this function. We need to + * be very careful withlokcing on the mmap_sem as it may already be + * held by the time we come here. 
+ * The following paths exist: + * + * exit path: + * sys_exit_group + * do_group_exit + * do_exit + * exit_mm + * mmput + * exit_mmap + * remove_vma + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * munmap path: + * sys_munmap + * do_munmap + * remove_vma + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * + * close path: + * sys_close + * filp_close + * fput + * __fput + * pfm_close + * __pfm_close + * pfm_context_free + * pfm_release_buf_space + * + * The issue is that on the munmap() path, the mmap_sem is already held + * in write-mode by the time we come here. To avoid the deadlock, we need + * to know where we are coming from and skip down_write(). If is fairly + * difficult to know this because of the lack of good hooks and + * the fact that, there may not have been any mmap() of the sampling buffer + * (i.e. create_context() followed by close() or exit()). + * + * We use a set flag ctx->flags.mmap_nlock which is toggled in the vm_ops + * callback in remove_vma() which is called systematically for the call, so + * on all but the pure close() path. The exit path does not already hold + * the lock but this is exit so there is no task->mm by the time we come here. + * + * The mmap_nlock is set only when unmapping and this is the LAST reference + * to the file (i.e., close() followed by munmap()). + */ +void pfm_smpl_buf_space_release(struct pfm_context *ctx, size_t size) +{ + unsigned long flags; + struct mm_struct *mm; + + mm = get_task_mm(current); + if (mm) { + if (ctx->flags.mmap_nlock == 0) { + PFM_DBG("doing down_write"); + down_write(&mm->mmap_sem); + } + + mm->locked_vm -= size >> PAGE_SHIFT; + + PFM_DBG("size=%zu locked_vm=%lu", size, mm->locked_vm); + + if (ctx->flags.mmap_nlock == 0) + up_write(&mm->mmap_sem); + + mmput(mm); + } + + spin_lock_irqsave(&pfm_res_lock, flags); + + pfm_res.smpl_buf_mem_cur -= size; + + spin_unlock_irqrestore(&pfm_res_lock, flags); +} + +/** + * pfm_session_acquire - reserve a per-thread or per-cpu session + * @is_system: true if per-cpu session + * @cpu: cpu number for per-cpu session + * + * return: + * 0 : success + * -EBUSY: if conflicting session exist + */ +int pfm_session_acquire(int is_system, u32 cpu) +{ + unsigned long flags; + u32 nsys_cpus; + int ret = 0; + + /* + * validy checks on cpu_mask have been done upstream + */ + spin_lock_irqsave(&pfm_res_lock, flags); + + nsys_cpus = cpus_weight(pfm_res.sys_cpumask); + + PFM_DBG("in sys=%u task=%u is_sys=%d cpu=%u", + nsys_cpus, + pfm_res.thread_sessions, + is_system, + cpu); + + if (is_system) { + /* + * cannot mix system wide and per-task sessions + */ + if (pfm_res.thread_sessions > 0) { + PFM_DBG("%u conflicting thread_sessions", + pfm_res.thread_sessions); + ret = -EBUSY; + goto abort; + } + + if (cpu_isset(cpu, pfm_res.sys_cpumask)) { + PFM_DBG("conflicting session on CPU%u", cpu); + ret = -EBUSY; + goto abort; + } + + PFM_DBG("reserved session on CPU%u", cpu); + + cpu_set(cpu, pfm_res.sys_cpumask); + nsys_cpus++; + } else { + if (nsys_cpus) { + ret = -EBUSY; + goto abort; + } + pfm_res.thread_sessions++; + } + + PFM_DBG("out sys=%u task=%u is_sys=%d cpu=%u", + nsys_cpus, + pfm_res.thread_sessions, + is_system, + cpu); + +abort: + spin_unlock_irqrestore(&pfm_res_lock, flags); + + return ret; +} + +/** + * pfm_session_release - release a per-cpu or per-thread session + * @is_system: true if per-cpu session + * @cpu: cpu number for per-cpu session + * + * called from __pfm_unload_context() 
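+ * and undoes a prior successful pfm_session_acquire() made with the
+ * same arguments.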
+ */ +void pfm_session_release(int is_system, u32 cpu) +{ + unsigned long flags; + + spin_lock_irqsave(&pfm_res_lock, flags); + + PFM_DBG("in sys_sessions=%u thread_sessions=%u syswide=%d cpu=%u", + cpus_weight(pfm_res.sys_cpumask), + pfm_res.thread_sessions, + is_system, cpu); + + if (is_system) + cpu_clear(cpu, pfm_res.sys_cpumask); + else + pfm_res.thread_sessions--; + + PFM_DBG("out sys_sessions=%u thread_sessions=%u syswide=%d cpu=%u", + cpus_weight(pfm_res.sys_cpumask), + pfm_res.thread_sessions, + is_system, cpu); + + spin_unlock_irqrestore(&pfm_res_lock, flags); +} + +/** + * pfm_session_allcpus_acquire - acquire per-cpu sessions on all available cpus + * + * currently used by Oprofile on X86 + */ +int pfm_session_allcpus_acquire(void) +{ + unsigned long flags; + u32 nsys_cpus, cpu; + int ret = -EBUSY; + + spin_lock_irqsave(&pfm_res_lock, flags); + + nsys_cpus = cpus_weight(pfm_res.sys_cpumask); + + PFM_DBG("in sys=%u task=%u", + nsys_cpus, + pfm_res.thread_sessions); + + if (nsys_cpus) { + PFM_DBG("already some system-wide sessions"); + goto abort; + } + + /* + * cannot mix system wide and per-task sessions + */ + if (pfm_res.thread_sessions) { + PFM_DBG("%u conflicting thread_sessions", + pfm_res.thread_sessions); + goto abort; + } + + for_each_online_cpu(cpu) { + cpu_set(cpu, pfm_res.sys_cpumask); + nsys_cpus++; + } + + PFM_DBG("out sys=%u task=%u", + nsys_cpus, + pfm_res.thread_sessions); + + ret = 0; +abort: + spin_unlock_irqrestore(&pfm_res_lock, flags); + + return ret; +} +EXPORT_SYMBOL(pfm_session_allcpus_acquire); + +/** + * pfm_session_allcpus_release - relase per-cpu sessions on all cpus + * + * currently used by Oprofile code + */ +void pfm_session_allcpus_release(void) +{ + unsigned long flags; + u32 nsys_cpus, cpu; + + spin_lock_irqsave(&pfm_res_lock, flags); + + nsys_cpus = cpus_weight(pfm_res.sys_cpumask); + + PFM_DBG("in sys=%u task=%u", + nsys_cpus, + pfm_res.thread_sessions); + + /* + * XXX: could use __cpus_clear() with nbits + */ + for_each_online_cpu(cpu) { + cpu_clear(cpu, pfm_res.sys_cpumask); + nsys_cpus--; + } + + PFM_DBG("out sys=%u task=%u", + nsys_cpus, + pfm_res.thread_sessions); + + spin_unlock_irqrestore(&pfm_res_lock, flags); +} +EXPORT_SYMBOL(pfm_session_allcpus_release); + +/** + * pfm_sysfs_res_show - return currnt resourcde usage for sysfs + * @buf: buffer to hold string in return + * @sz: size of buf + * @what: what to produce + * what=0 : thread_sessions + * what=1 : cpus_weight(sys_cpumask) + * what=2 : smpl_buf_mem_cur + * what=3 : pmu model name + * + * called from perfmon_sysfs.c + * return number of bytes written into buf (up to sz) + */ +ssize_t pfm_sysfs_res_show(char *buf, size_t sz, int what) +{ + unsigned long flags; + + spin_lock_irqsave(&pfm_res_lock, flags); + + switch (what) { + case 0: snprintf(buf, sz, "%u\n", pfm_res.thread_sessions); + break; + case 1: snprintf(buf, sz, "%d\n", cpus_weight(pfm_res.sys_cpumask)); + break; + case 2: snprintf(buf, sz, "%zu\n", pfm_res.smpl_buf_mem_cur); + break; + case 3: + snprintf(buf, sz, "%s\n", + pfm_pmu_conf ? pfm_pmu_conf->pmu_name + : "unknown\n"); + } + spin_unlock_irqrestore(&pfm_res_lock, flags); + return strlen(buf); +} diff --git a/perfmon/perfmon_rw.c b/perfmon/perfmon_rw.c new file mode 100644 index 0000000..3168eb7 --- /dev/null +++ b/perfmon/perfmon_rw.c @@ -0,0 +1,733 @@ +/* + * perfmon.c: perfmon2 PMC/PMD read/write system calls + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. 
+ * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net/ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include "perfmon_priv.h" + +#define PFM_REGFL_PMC_ALL (PFM_REGFL_NO_EMUL64) +#define PFM_REGFL_PMD_ALL (PFM_REGFL_RANDOM|PFM_REGFL_OVFL_NOTIFY) + +/** + * update_used_reg -- updated used_pmcs for a single PMD + * @set: set to update + * @cnum: new PMD to add + * + * This function adds the pmds and pmcs depending on PMD cnum + */ +static inline void update_used_reg(struct pfm_context *ctx, + struct pfm_event_set *set, u16 cnum) +{ + bitmap_or(cast_ulp(set->used_pmcs), + cast_ulp(set->used_pmcs), + cast_ulp(pfm_pmu_conf->pmd_desc[cnum].dep_pmcs), + ctx->regs.max_pmc); +} + +/** + * update_used -- update used_pmcs bitmask + * @set: event set to update + * @bv: bitmask to inspect for new PMD registers + * + * This function updates the used_pmcs bitmask for + * the set using bv, a bitmask of pmds. For each pmd in bv, + * its depending pmcs are added to used_pmcs. + */ +static void update_used_pmcs(struct pfm_context *ctx, + struct pfm_event_set *set, unsigned long *bv) +{ + u16 max_pmd; + int n, p, q; + + max_pmd = ctx->regs.max_pmd; + + n = bitmap_weight(bv, max_pmd); + for(p = 0; n; n--, p = q+1) { + q = find_next_bit(bv, max_pmd, p); + update_used_reg(ctx, set, q); + } +} + +/** + * update_changes -- update nused_pmcs, nused_pmds, write newly touched pmcs + * @ctx: context to use + * @set: event set to use + * @old_used_pmcs: former used_pmc bitmask + * @can_access: non-zero if PMU is accessible, i.e., can be written to + * + * This function updates nused_pmcs and nused_pmds after the last modificiation + * to an event set. When new pmcs are used, then they must be initialized such + * that we do not pick up stale values from another session. 
+ */ +static inline int update_changes(struct pfm_context *ctx, struct pfm_event_set *set, + unsigned long *old_used_pmcs) +{ + struct pfarg_pmc req; + u16 max_pmc, max_pmd; + int n, p, q, ret = 0; + + max_pmd = ctx->regs.max_pmd; + max_pmc = ctx->regs.max_pmc; + + /* + * update used counts + */ + set->nused_pmds = bitmap_weight(cast_ulp(set->used_pmds), max_pmd); + set->nused_pmcs = bitmap_weight(cast_ulp(set->used_pmcs), max_pmc); + + PFM_DBG("set%u u_pmds=0x%llx nu_pmds=%u u_pmcs=0x%llx nu_pmcs=%u", + set->id, + (unsigned long long)set->used_pmds[0], + set->nused_pmds, + (unsigned long long)set->used_pmcs[0], + set->nused_pmcs); + + memset(&req, 0, sizeof(req)); + + n = bitmap_weight(cast_ulp(set->used_pmcs), max_pmc); + for(p = 0; n; n--, p = q+1) { + q = find_next_bit(cast_ulp(set->used_pmcs), max_pmc, p); + + if (test_bit(q, cast_ulp(old_used_pmcs))) + continue; + + req.reg_num = q; + req.reg_value = set->pmcs[q]; + + ret = __pfm_write_pmcs(ctx, &req, 1); + if (ret) + break; + } + return ret; +} + +/** + * handle_smpl_bv - checks sampling bitmasks for new PMDs + * @ctx: context to use + * @set: set to use + * @bv: sampling bitmask + * + * scans the smpl bitmask looking for new PMDs (not yet used), if found + * invoke pfm_write_pmds() on them to get them initialized and marked used + */ +static int handle_smpl_bv(struct pfm_context *ctx, struct pfm_event_set *set, + unsigned long *bv) +{ + struct pfarg_pmd req; + int p, q, n, ret = 0; + u16 max_pmd; + + memset(&req, 0, sizeof(req)); + + max_pmd = ctx->regs.max_pmd; + + n = bitmap_weight(cast_ulp(bv), max_pmd); + + for(p = 0; n; n--, p = q+1) { + q = find_next_bit(cast_ulp(bv), max_pmd, p); + + if (test_bit(q, cast_ulp(set->used_pmds))) + continue; + + req.reg_num = q; + req.reg_value = 0; + + ret = __pfm_write_pmds(ctx, &req, 1, 0); + if (ret) + break; + } + return ret; +} + +/** + * is_invalid -- check if register index is within limits + * @cnum: register index + * @impl: bitmask of implemented registers + * @max: highest implemented registers + 1 + * + * return: + * 0 is register index is valid + * 1 if invalid + */ +static inline int is_invalid(u16 cnum, unsigned long *impl, u16 max) +{ + return cnum >= max || !test_bit(cnum, impl); +} + +/** + * __pfm_write_pmds - modified data registers + * @ctx: context to operate on + * @req: pfarg_pmd_t request from user + * @count: number of element in the pfarg_pmd_t vector + * @compat: used only on IA-64 to maintain backward compatibility with v2.0 + * + * The function succeeds whether the context is attached or not. + * When attached to another thread, that thread must be stopped. + * + * The context is locked and interrupts are disabled. 
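
update_used_pmcs(), update_changes() and handle_smpl_bv() above all walk a bitmap with the same pattern: count the set bits once with bitmap_weight(), then repeatedly call find_next_bit() until that count is consumed. Here is a minimal user-space model of that loop; the two helpers are toy replacements of mine, not the ones from the kernel's bitmap API.

/* User-space model of the bitmap walk used above (toy helpers, single word only). */
#include <stdio.h>

/* return the next set bit at or after position p, or max if none */
static int find_next_bit(unsigned long bv, int max, int p)
{
        for (; p < max; p++)
                if (bv & (1UL << p))
                        return p;
        return max;
}

static int bitmap_weight(unsigned long bv, int max)
{
        int n = 0, i;

        for (i = 0; i < max; i++)
                if (bv & (1UL << i))
                        n++;
        return n;
}

int main(void)
{
        unsigned long used = 0x8a;      /* bits 1, 3, 7 set */
        int max = 16;
        int n, p, q;

        /* same shape as the loops in update_used_pmcs()/update_changes() */
        n = bitmap_weight(used, max);
        for (p = 0; n; n--, p = q + 1) {
                q = find_next_bit(used, max, p);
                printf("visit bit %d\n", q);    /* per-register work goes here */
        }
        return 0;
}
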
+ */ +int __pfm_write_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count, + int compat) +{ + struct pfm_event_set *set, *active_set; + u64 old_used_pmcs[PFM_PMC_BV]; + unsigned long *smpl_pmds, *reset_pmds, *impl_pmds, *impl_rw_pmds; + u32 req_flags, flags; + u16 cnum, pmd_type, max_pmd; + u16 set_id; + int i, can_access_pmu; + int ret; + pfm_pmd_check_t wr_func; + + active_set = ctx->active_set; + max_pmd = ctx->regs.max_pmd; + impl_pmds = cast_ulp(ctx->regs.pmds); + impl_rw_pmds = cast_ulp(ctx->regs.rw_pmds); + wr_func = pfm_pmu_conf->pmd_write_check; + set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); + + can_access_pmu = 0; + + /* + * we cannot access the actual PMD registers when monitoring is masked + */ + if (unlikely(ctx->state == PFM_CTX_LOADED)) + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + bitmap_copy(cast_ulp(old_used_pmcs), + cast_ulp(set->used_pmcs), + ctx->regs.max_pmc); + + ret = -EINVAL; + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + set_id = req->reg_set; + req_flags = req->reg_flags; + smpl_pmds = cast_ulp(req->reg_smpl_pmds); + reset_pmds = cast_ulp(req->reg_reset_pmds); + flags = 0; + + /* + * cannot write to unexisting + * writes to read-only register are ignored + */ + if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) { + PFM_DBG("pmd%u is not available", cnum); + goto error; + } + + pmd_type = pfm_pmu_conf->pmd_desc[cnum].type; + + /* + * ensure only valid flags are set + */ + if (req_flags & ~(PFM_REGFL_PMD_ALL)) { + PFM_DBG("pmd%u: invalid flags=0x%x", + cnum, req_flags); + goto error; + } + + /* + * OVFL_NOTIFY is valid for all types of PMD. + * non counting PMD may trigger PMU interrupt + * and thus may trigger recording of a sample. + * This is true with IBS on AMD family 16. 
+ */ + if (req_flags & PFM_REGFL_OVFL_NOTIFY) + flags |= PFM_REGFL_OVFL_NOTIFY; + + /* + * We allow randomization to non counting PMD + */ + if (req_flags & PFM_REGFL_RANDOM) + flags |= PFM_REGFL_RANDOM; + + /* + * verify validity of smpl_pmds + */ + if (unlikely(!bitmap_subset(smpl_pmds, impl_pmds, PFM_MAX_PMDS))) { + PFM_DBG("invalid smpl_pmds=0x%llx for pmd%u", + (unsigned long long)req->reg_smpl_pmds[0], + cnum); + goto error; + } + + /* + * verify validity of reset_pmds + * check against impl_rw_pmds because it is not + * possible to reset read-only PMDs + */ + if (unlikely(!bitmap_subset(reset_pmds, impl_rw_pmds, PFM_MAX_PMDS))) { + PFM_DBG("invalid reset_pmds=0x%llx for pmd%u", + (unsigned long long)req->reg_reset_pmds[0], + cnum); + goto error; + } + + /* + * locate event set + */ + if (set_id != set->id) { + /* update number of used register for previous set */ + if (i) { + ret = update_changes(ctx, set, cast_ulp(old_used_pmcs)); + if (ret) + goto error; + } + + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + goto error; + } + bitmap_copy(cast_ulp(old_used_pmcs), + cast_ulp(set->used_pmcs), + ctx->regs.max_pmc); + } + + /* + * execute write checker, if any + */ + if (unlikely(wr_func && (pmd_type & PFM_REG_WC))) { + ret = (*wr_func)(ctx, set, req); + if (ret) + goto error; + + } + + + /* + * now commit changes to software state + */ + + if (unlikely(compat)) + goto skip_set; + + if (bitmap_weight(smpl_pmds, max_pmd)) { + ret = handle_smpl_bv(ctx, set, smpl_pmds); + if (ret) + goto error; + update_used_pmcs(ctx, set, cast_ulp(smpl_pmds)); + } + + bitmap_copy(cast_ulp(set->pmds[cnum].smpl_pmds), + smpl_pmds, + max_pmd); + + + if (bitmap_weight(reset_pmds, max_pmd)) { + ret = handle_smpl_bv(ctx, set, reset_pmds); + if (ret) + goto error; + update_used_pmcs(ctx, set, cast_ulp(reset_pmds)); + } + + bitmap_copy(cast_ulp(set->pmds[cnum].reset_pmds), + reset_pmds, + max_pmd); + + set->pmds[cnum].flags = flags; + + __set_bit(cnum, cast_ulp(set->used_pmds)); + update_used_reg(ctx, set, cnum); + + /* + * we reprogram the PMD hence, we clear any pending + * ovfl. 
Does affect ovfl switch on restart but new + * value has already been established here + */ + if (test_bit(cnum, cast_ulp(set->povfl_pmds))) { + set->npend_ovfls--; + __clear_bit(cnum, cast_ulp(set->povfl_pmds)); + } + __clear_bit(cnum, cast_ulp(set->ovfl_pmds)); + + /* + * update ovfl_notify + */ + if (flags & PFM_REGFL_OVFL_NOTIFY) + __set_bit(cnum, cast_ulp(set->ovfl_notify)); + else + __clear_bit(cnum, cast_ulp(set->ovfl_notify)); + + /* + * establish new switch count + */ + set->pmds[cnum].ovflsw_thres = req->reg_ovfl_switch_cnt; + set->pmds[cnum].ovflsw_ref_thres = req->reg_ovfl_switch_cnt; +skip_set: + + /* + * set last value to new value for all types of PMD + */ + set->pmds[cnum].lval = req->reg_value; + set->pmds[cnum].value = req->reg_value; + + /* + * update reset values (not just for counters) + */ + set->pmds[cnum].long_reset = req->reg_long_reset; + set->pmds[cnum].short_reset = req->reg_short_reset; + + /* + * update randomization mask + */ + set->pmds[cnum].mask = req->reg_random_mask; + + set->pmds[cnum].eventid = req->reg_smpl_eventid; + + if (set == active_set) { + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMDS; + if (can_access_pmu) + pfm_write_pmd(ctx, cnum, req->reg_value); + } + + + PFM_DBG("set%u pmd%u=0x%llx flags=0x%x a_pmu=%d " + "ctx_pmd=0x%llx s_reset=0x%llx " + "l_reset=0x%llx s_pmds=0x%llx " + "r_pmds=0x%llx o_pmds=0x%llx " + "o_thres=%llu compat=%d eventid=%llx", + set->id, + cnum, + (unsigned long long)req->reg_value, + set->pmds[cnum].flags, + can_access_pmu, + (unsigned long long)set->pmds[cnum].value, + (unsigned long long)set->pmds[cnum].short_reset, + (unsigned long long)set->pmds[cnum].long_reset, + (unsigned long long)set->pmds[cnum].smpl_pmds[0], + (unsigned long long)set->pmds[cnum].reset_pmds[0], + (unsigned long long)set->ovfl_pmds[0], + (unsigned long long)set->pmds[cnum].ovflsw_thres, + compat, + (unsigned long long)set->pmds[cnum].eventid); + } + ret = 0; + +error: + update_changes(ctx, set, cast_ulp(old_used_pmcs)); + + /* + * make changes visible + */ + if (can_access_pmu) + pfm_arch_serialize(); + + return ret; +} + +/** + * __pfm_write_pmcs - modified config registers + * @ctx: context to operate on + * @req: pfarg_pmc_t request from user + * @count: number of element in the pfarg_pmc_t vector + * + * + * The function succeeds whether the context is * attached or not. + * When attached to another thread, that thread must be stopped. + * + * The context is locked and interrupts are disabled. 
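
The per-entry fields consumed by __pfm_write_pmds() above are reg_num, reg_set, reg_value, reg_flags, the long/short reset values, the overflow switch count and the smpl/reset bitmasks. The fragment below sketches how a caller might fill one such entry to arm a counter with overflow notification. The struct here is a local mock that only mirrors the field names used in this file, it is not the real pfarg_pmd definition, and the flag value is an assumption.

/* Mock of a request as filled by a caller of pfm_write_pmds() (illustrative only). */
#include <stdint.h>
#include <stdio.h>

#define PFM_REGFL_OVFL_NOTIFY  0x1      /* value assumed here, see the perfmon headers */

struct pfarg_pmd_mock {                 /* mirrors the fields used above, nothing more */
        uint16_t reg_num;               /* PMD index */
        uint16_t reg_set;               /* event set id */
        uint32_t reg_flags;             /* PFM_REGFL_* */
        uint64_t reg_value;             /* initial value */
        uint64_t reg_long_reset;        /* value after user-level notification */
        uint64_t reg_short_reset;       /* value after in-kernel reset */
        uint64_t reg_ovfl_switch_cnt;   /* overflows before a set switch */
};

int main(void)
{
        /* sample every 100000 occurrences: counters count upward, so program -period */
        uint64_t period = 100000;
        struct pfarg_pmd_mock pd = {
                .reg_num         = 0,
                .reg_set         = 0,
                .reg_flags       = PFM_REGFL_OVFL_NOTIFY,
                .reg_value       = (uint64_t)-period,
                .reg_long_reset  = (uint64_t)-period,
                .reg_short_reset = (uint64_t)-period,
        };

        printf("pmd%u armed with value 0x%llx\n",
               (unsigned)pd.reg_num, (unsigned long long)pd.reg_value);
        return 0;
}
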
+ */ +int __pfm_write_pmcs(struct pfm_context *ctx, struct pfarg_pmc *req, int count) +{ + struct pfm_event_set *set, *active_set; + u64 value, dfl_val, rsvd_msk; + unsigned long *impl_pmcs; + int i, can_access_pmu; + int ret; + u16 set_id; + u16 cnum, pmc_type, max_pmc; + u32 flags, expert; + pfm_pmc_check_t wr_func; + + active_set = ctx->active_set; + + wr_func = pfm_pmu_conf->pmc_write_check; + max_pmc = ctx->regs.max_pmc; + impl_pmcs = cast_ulp(ctx->regs.pmcs); + set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); + + expert = pfm_controls.flags & PFM_CTRL_FL_RW_EXPERT; + + can_access_pmu = 0; + + /* + * we cannot access the actual PMC registers when monitoring is masked + */ + if (unlikely(ctx->state == PFM_CTX_LOADED)) + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + ret = -EINVAL; + + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + set_id = req->reg_set; + value = req->reg_value; + flags = req->reg_flags; + + /* + * no access to unavailable PMC register + */ + if (unlikely(is_invalid(cnum, impl_pmcs, max_pmc))) { + PFM_DBG("pmc%u is not available", cnum); + goto error; + } + + pmc_type = pfm_pmu_conf->pmc_desc[cnum].type; + dfl_val = pfm_pmu_conf->pmc_desc[cnum].dfl_val; + rsvd_msk = pfm_pmu_conf->pmc_desc[cnum].rsvd_msk; + + /* + * ensure only valid flags are set + */ + if (flags & ~PFM_REGFL_PMC_ALL) { + PFM_DBG("pmc%u: invalid flags=0x%x", cnum, flags); + goto error; + } + + /* + * locate event set + */ + if (set_id != set->id) { + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + goto error; + } + } + + /* + * set reserved bits to default values + * (reserved bits must be 1 in rsvd_msk) + * + * bypass via /sys/kernel/perfmon/mode = 1 + */ + if (likely(!expert)) + value = (value & ~rsvd_msk) | (dfl_val & rsvd_msk); + + if (flags & PFM_REGFL_NO_EMUL64) { + if (!(pmc_type & PFM_REG_NO64)) { + PFM_DBG("pmc%u no support for " + "PFM_REGFL_NO_EMUL64", cnum); + goto error; + } + value &= ~pfm_pmu_conf->pmc_desc[cnum].no_emul64_msk; + } + + /* + * execute write checker, if any + */ + if (likely(wr_func && (pmc_type & PFM_REG_WC))) { + req->reg_value = value; + ret = (*wr_func)(ctx, set, req); + if (ret) + goto error; + value = req->reg_value; + } + + /* + * Now we commit the changes + */ + + /* + * mark PMC register as used + * We do not track associated PMC register based on + * the fact that they will likely need to be written + * in order to become useful at which point the statement + * below will catch that. + * + * The used_pmcs bitmask is only useful on architectures where + * the PMC needs to be modified for particular bits, especially + * on overflow or to stop/start. 
+ */ + if (!test_bit(cnum, cast_ulp(set->used_pmcs))) { + __set_bit(cnum, cast_ulp(set->used_pmcs)); + set->nused_pmcs++; + } + + set->pmcs[cnum] = value; + + if (set == active_set) { + set->priv_flags |= PFM_SETFL_PRIV_MOD_PMCS; + if (can_access_pmu) + pfm_arch_write_pmc(ctx, cnum, value); + } + + PFM_DBG("set%u pmc%u=0x%llx a_pmu=%d " + "u_pmcs=0x%llx nu_pmcs=%u", + set->id, + cnum, + (unsigned long long)value, + can_access_pmu, + (unsigned long long)set->used_pmcs[0], + set->nused_pmcs); + } + ret = 0; +error: + /* + * make sure the changes are visible + */ + if (can_access_pmu) + pfm_arch_serialize(); + + return ret; +} + +/** + * __pfm_read_pmds - read data registers + * @ctx: context to operate on + * @req: pfarg_pmd_t request from user + * @count: number of element in the pfarg_pmd_t vector + * + * + * The function succeeds whether the context is attached or not. + * When attached to another thread, that thread must be stopped. + * + * The context is locked and interrupts are disabled. + */ +int __pfm_read_pmds(struct pfm_context *ctx, struct pfarg_pmd *req, int count) +{ + u64 val = 0, lval, ovfl_mask, hw_val; + u64 sw_cnt; + unsigned long *impl_pmds; + struct pfm_event_set *set, *active_set; + int i, ret, can_access_pmu = 0; + u16 cnum, pmd_type, set_id, max_pmd; + + ovfl_mask = pfm_pmu_conf->ovfl_mask; + impl_pmds = cast_ulp(ctx->regs.pmds); + max_pmd = ctx->regs.max_pmd; + active_set = ctx->active_set; + set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); + + if (likely(ctx->state == PFM_CTX_LOADED)) { + can_access_pmu = __get_cpu_var(pmu_owner) == ctx->task + || ctx->flags.system; + + if (can_access_pmu) + pfm_arch_serialize(); + } + + /* + * on both UP and SMP, we can only read the PMD from the hardware + * register when the task is the owner of the local PMU. + */ + ret = -EINVAL; + for (i = 0; i < count; i++, req++) { + + cnum = req->reg_num; + set_id = req->reg_set; + + if (unlikely(is_invalid(cnum, impl_pmds, max_pmd))) { + PFM_DBG("pmd%u is not implemented/unaccessible", cnum); + goto error; + } + + pmd_type = pfm_pmu_conf->pmd_desc[cnum].type; + + /* + * locate event set + */ + if (set_id != set->id) { + set = pfm_find_set(ctx, set_id, 0); + if (set == NULL) { + PFM_DBG("event set%u does not exist", + set_id); + goto error; + } + } + /* + * it is not possible to read a PMD which was not requested: + * - explicitly written via pfm_write_pmds() + * - provided as a reg_smpl_pmds[] to another PMD during + * pfm_write_pmds() + * + * This is motivated by security and for optimization purposes: + * - on context switch restore, we can restore only what + * we use (except when regs directly readable at user + * level, e.g., IA-64 self-monitoring, I386 RDPMC). + * - do not need to maintain PMC -> PMD dependencies + */ + if (unlikely(!test_bit(cnum, cast_ulp(set->used_pmds)))) { + PFM_DBG("pmd%u cannot read, because not used", cnum); + goto error; + } + + val = set->pmds[cnum].value; + lval = set->pmds[cnum].lval; + + /* + * extract remaining ovfl to switch + */ + sw_cnt = set->pmds[cnum].ovflsw_thres; + + /* + * If the task is not the current one, then we check if the + * PMU state is still in the local live register due to lazy + * ctxsw. If true, then we read directly from the registers. 
+ */ + if (set == active_set && can_access_pmu) { + hw_val = pfm_read_pmd(ctx, cnum); + if (pmd_type & PFM_REG_C64) + val = (val & ~ovfl_mask) | (hw_val & ovfl_mask); + else + val = hw_val; + } + + PFM_DBG("set%u pmd%u=0x%llx sw_thr=%llu lval=0x%llx", + set->id, + cnum, + (unsigned long long)val, + (unsigned long long)sw_cnt, + (unsigned long long)lval); + + req->reg_value = val; + req->reg_last_reset_val = lval; + req->reg_ovfl_switch_cnt = sw_cnt; + } + ret = 0; +error: + return ret; +} diff --git a/perfmon/perfmon_sets.c b/perfmon/perfmon_sets.c new file mode 100644 index 0000000..24534cb --- /dev/null +++ b/perfmon/perfmon_sets.c @@ -0,0 +1,873 @@ +/* + * perfmon_sets.c: perfmon2 event sets and multiplexing functions + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include "perfmon_priv.h" + +static struct kmem_cache *pfm_set_cachep; + +/** + * pfm_reload_switch_thresholds - reload overflow-based switch thresholds per set + * @set: the set for which to reload thresholds + * + */ +static void pfm_reload_switch_thresholds(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 *used_pmds; + u16 i, max, first; + + used_pmds = set->used_pmds; + first = ctx->regs.first_intr_pmd; + max = ctx->regs.max_intr_pmd; + + for (i = first; i < max; i++) { + if (test_bit(i, cast_ulp(used_pmds))) { + set->pmds[i].ovflsw_thres = set->pmds[i].ovflsw_ref_thres; + + PFM_DBG("set%u pmd%u ovflsw_thres=%llu", + set->id, + i, + (unsigned long long)set->pmds[i].ovflsw_thres); + } + } +} + +/** + * pfm_prepare_sets - initialize sets on pfm_load_context + * @ctx : context to operate on + * @load_set: set to activate first + * + * connect all sets, reset internal fields + */ +struct pfm_event_set *pfm_prepare_sets(struct pfm_context *ctx, u16 load_set) +{ + struct pfm_event_set *set, *p; + u16 max; + + /* + * locate first set to activate + */ + set = pfm_find_set(ctx, load_set, 0); + if (!set) + return NULL; + + if (set->flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(ctx, set); + + max = ctx->regs.max_intr_pmd; + + list_for_each_entry(p, &ctx->set_list, list) { + /* + * cleanup bitvectors + */ + bitmap_zero(cast_ulp(p->ovfl_pmds), max); + bitmap_zero(cast_ulp(p->povfl_pmds), max); + + p->npend_ovfls = 0; + + /* + * we cannot just use plain clear because of arch-specific flags + */ + p->priv_flags &= ~(PFM_SETFL_PRIV_MOD_BOTH|PFM_SETFL_PRIV_SWITCH); + /* + * neither duration nor runs are reset because typically loading/unloading + * does not mean counts are reset. 
To reset, the set must be modified + */ + } + return set; +} + +/* + * called by hrtimer_interrupt() + * + * This is the only function where we come with + * cpu_base->lock held before ctx->lock + * + * interrupts are disabled + */ +enum hrtimer_restart pfm_handle_switch_timeout(struct hrtimer *t) +{ + struct pfm_event_set *set; + struct pfm_context *ctx; + unsigned long flags; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + /* + * prevent against race with unload + */ + ctx = __get_cpu_var(pmu_ctx); + if (!ctx) + return HRTIMER_NORESTART; + + spin_lock_irqsave(&ctx->lock, flags); + + set = ctx->active_set; + + /* + * switching occurs only when context is attached + */ + if (ctx->state != PFM_CTX_LOADED) + goto done; + /* + * timer does not run while monitoring is inactive (not started) + */ + if (!pfm_arch_is_active(ctx)) + goto done; + + pfm_stats_inc(handle_timeout_count); + + ret = pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_SHORT, 0); +done: + spin_unlock_irqrestore(&ctx->lock, flags); + return ret; +} + +/* + * + * always operating on the current task + * interrupts are masked + * + * input: + * - new_set: new set to switch to, if NULL follow normal chain + */ +enum hrtimer_restart pfm_switch_sets(struct pfm_context *ctx, + struct pfm_event_set *new_set, + int reset_mode, + int no_restart) +{ + struct pfm_event_set *set; + u64 now, end; + u32 new_flags; + int is_system, is_active, nn; + enum hrtimer_restart ret = HRTIMER_NORESTART; + + now = sched_clock(); + set = ctx->active_set; + is_active = pfm_arch_is_active(ctx); + + /* + * if no set is explicitly requested, + * use the set_switch_next field + */ + if (!new_set) { + /* + * we use round-robin unless the user specified + * a particular set to go to. + */ + new_set = list_first_entry(&set->list, struct pfm_event_set, list); + if (&new_set->list == &ctx->set_list) + new_set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); + } + + PFM_DBG_ovfl("state=%d act=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u " + "next_runs=%llu new_npend=%d reset_mode=%d reset_pmds=%llx", + ctx->state, + is_active, + set->id, + (unsigned long long)set->runs, + set->npend_ovfls, + new_set->id, + (unsigned long long)new_set->runs, + new_set->npend_ovfls, + reset_mode, + (unsigned long long)new_set->reset_pmds[0]); + + is_system = ctx->flags.system; + new_flags = new_set->flags; + + /* + * nothing more to do + */ + if (new_set == set) + goto skip_same_set; + + if (is_active) { + pfm_arch_stop(current, ctx); + pfm_save_pmds(ctx, set); + /* + * compute elapsed ns for active set + */ + set->duration += now - set->duration_start; + } + + pfm_arch_restore_pmds(ctx, new_set); + /* + * if masked, we must restore the pmcs such that they + * do not capture anything. 
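
The round-robin selection earlier in pfm_switch_sets() relies on a property of the kernel's circular lists: the entry "after" the current set may be the list head itself, in which case the walk wraps back to the first set. The following is a self-contained model of that wrap-around; the tiny list implementation stands in for <linux/list.h> and is mine, not the kernel's.

/* Stand-alone model of the wrap-around "next set" pick (toy list implementation). */
#include <stddef.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))
#define first_entry(head, type, member) \
        container_of((head)->next, type, member)

struct event_set {
        unsigned int id;
        struct list_head list;
};

static void list_add_tail(struct list_head *new, struct list_head *head)
{
        new->prev = head->prev;
        new->next = head;
        head->prev->next = new;
        head->prev = new;
}

/* same logic as the new_set computation in pfm_switch_sets() */
static struct event_set *next_set(struct list_head *head, struct event_set *cur)
{
        struct event_set *n = first_entry(&cur->list, struct event_set, list);

        if (&n->list == head)   /* walked onto the head: wrap to the first set */
                n = first_entry(head, struct event_set, list);
        return n;
}

int main(void)
{
        struct list_head head = { &head, &head };
        struct event_set s[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };
        struct event_set *cur = &s[0];
        int i;

        for (i = 0; i < 3; i++)
                list_add_tail(&s[i].list, &head);

        for (i = 0; i < 5; i++) {       /* prints set1 set2 set0 set1 set2 */
                cur = next_set(&head, cur);
                printf("switch to set%u\n", cur->id);
        }
        return 0;
}
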
+ */ + pfm_arch_restore_pmcs(ctx, new_set); + + if (new_set->npend_ovfls) { + pfm_arch_resend_irq(ctx); + pfm_stats_inc(ovfl_intr_replay_count); + } + + new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + +skip_same_set: + new_set->runs++; + /* + * reset switch threshold + */ + if (new_flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(ctx, new_set); + + /* + * reset overflowed PMD registers in new set + */ + nn = bitmap_weight(cast_ulp(new_set->reset_pmds), ctx->regs.max_pmd); + if (nn) + pfm_reset_pmds(ctx, new_set, nn, reset_mode); + + + /* + * This is needed when coming from pfm_start() + * + * When switching to the same set, there is no + * need to restart + */ + if (no_restart) + goto skip_restart; + + if (is_active) { + /* + * do not need to restart when same set + */ + if (new_set != set) { + ctx->active_set = new_set; + new_set->duration_start = now; + pfm_arch_start(current, ctx); + } + /* + * install new timeout if necessary + */ + if (new_flags & PFM_SETFL_TIME_SWITCH) { + struct hrtimer *h; + h = &__get_cpu_var(pfm_hrtimer); + hrtimer_forward(h, h->base->get_time(), new_set->hrtimer_exp); + new_set->hrtimer_rem = new_set->hrtimer_exp; + ret = HRTIMER_RESTART; + } + } + +skip_restart: + ctx->active_set = new_set; + + end = sched_clock(); + + pfm_stats_inc(set_switch_count); + pfm_stats_add(set_switch_ns, end - now); + + return ret; +} + +/* + * called from __pfm_overflow_handler() to switch event sets. + * monitoring is stopped, task is current, interrupts are masked. + * compared to pfm_switch_sets(), this version is simplified because + * it knows about the call path. There is no need to stop monitoring + * because it is already frozen by PMU handler. + */ +void pfm_switch_sets_from_intr(struct pfm_context *ctx) +{ + struct pfm_event_set *set, *new_set; + u64 now, end; + u32 new_flags; + int is_system, n; + + now = sched_clock(); + set = ctx->active_set; + new_set = list_first_entry(&set->list, struct pfm_event_set, list); + if (&new_set->list == &ctx->set_list) + new_set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); + + PFM_DBG_ovfl("state=%d cur_set=%u cur_runs=%llu cur_npend=%d next_set=%u " + "next_runs=%llu new_npend=%d new_r_pmds=%llx", + ctx->state, + set->id, + (unsigned long long)set->runs, + set->npend_ovfls, + new_set->id, + (unsigned long long)new_set->runs, + new_set->npend_ovfls, + (unsigned long long)new_set->reset_pmds[0]); + + is_system = ctx->flags.system; + new_flags = new_set->flags; + + /* + * nothing more to do + */ + if (new_set == set) + goto skip_same_set; + + /* + * switch on intr only when set has OVFL_SWITCH + */ + BUG_ON(set->flags & PFM_SETFL_TIME_SWITCH); + + /* + * when called from PMU intr handler, monitoring + * is already stopped + * + * save current PMD registers, we use a special + * form for performance reason. On some architectures, + * such as x86, the pmds are already saved when entering + * the PMU interrupt handler via pfm-arch_intr_freeze() + * so we don't need to save them again. On the contrary, + * on IA-64, they are not saved by freeze, thus we have to + * to it here. + */ + pfm_arch_save_pmds_from_intr(ctx, set); + + /* + * compute elapsed ns for active set + */ + set->duration += now - set->duration_start; + + pfm_arch_restore_pmds(ctx, new_set); + + /* + * must not be restored active as we are still executing in the + * PMU interrupt handler. activation is deferred to unfreeze PMU + */ + pfm_arch_restore_pmcs(ctx, new_set); + + /* + * check for pending interrupt on incoming set. 
+ * interrupts are masked so handler call deferred + */ + if (new_set->npend_ovfls) { + pfm_arch_resend_irq(ctx); + pfm_stats_inc(ovfl_intr_replay_count); + } + /* + * no need to restore anything, that is already done + */ + new_set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + /* + * reset duration counter + */ + new_set->duration_start = now; + +skip_same_set: + new_set->runs++; + + /* + * reset switch threshold + */ + if (new_flags & PFM_SETFL_OVFL_SWITCH) + pfm_reload_switch_thresholds(ctx, new_set); + + /* + * reset overflowed PMD registers + */ + n = bitmap_weight(cast_ulp(new_set->reset_pmds), ctx->regs.max_pmd); + if (n) + pfm_reset_pmds(ctx, new_set, n, PFM_PMD_RESET_SHORT); + + /* + * XXX: isactive? + * + * Came here following a interrupt which triggered a switch, i.e., + * previous set was using OVFL_SWITCH, thus we just need to arm + * check if the next set is using timeout, and if so arm the timer. + * + * Timeout is always at least one tick away. No risk of having to + * invoke the timeout handler right now. In any case, cb_mode is + * set to HRTIMER_CB_IRQSAFE_NO_SOFTIRQ such that hrtimer_start + * will not try to wakeup the softirqd which could cause a locking + * problem. + */ + if (new_flags & PFM_SETFL_TIME_SWITCH) { + hrtimer_start(&__get_cpu_var(pfm_hrtimer), set->hrtimer_exp, HRTIMER_MODE_REL); + PFM_DBG("armed new timeout for set%u", new_set->id); + } + + ctx->active_set = new_set; + + end = sched_clock(); + + pfm_stats_inc(set_switch_count); + pfm_stats_add(set_switch_ns, end - now); +} + + +static int pfm_setfl_sane(struct pfm_context *ctx, u32 flags) +{ +#define PFM_SETFL_BOTH_SWITCH (PFM_SETFL_OVFL_SWITCH|PFM_SETFL_TIME_SWITCH) + int ret; + + ret = pfm_arch_setfl_sane(ctx, flags); + if (ret) + return ret; + + if ((flags & PFM_SETFL_BOTH_SWITCH) == PFM_SETFL_BOTH_SWITCH) { + PFM_DBG("both switch ovfl and switch time are set"); + return -EINVAL; + } + return 0; +} + +/* + * it is never possible to change the identification of an existing set + */ +static int pfm_change_evtset(struct pfm_context *ctx, + struct pfm_event_set *set, + struct pfarg_setdesc *req) +{ + struct timeval tv; + struct timespec ts; + ktime_t kt; + long d, res_ns; + s32 rem; + u32 flags; + int ret; + u16 set_id; + + BUG_ON(ctx->state == PFM_CTX_LOADED); + + set_id = req->set_id; + flags = req->set_flags; + + ret = pfm_setfl_sane(ctx, flags); + if (ret) { + PFM_DBG("invalid flags 0x%x set %u", flags, set_id); + return -EINVAL; + } + + /* + * compute timeout value + */ + if (flags & PFM_SETFL_TIME_SWITCH) { + /* + * timeout value of zero is illegal + */ + if (req->set_timeout == 0) { + PFM_DBG("invalid timeout 0"); + return -EINVAL; + } + + hrtimer_get_res(CLOCK_MONOTONIC, &ts); + res_ns = (long)ktime_to_ns(timespec_to_ktime(ts)); + + /* + * round-up to multiple of clock resolution + * timeout = ((req->set_timeout+res_ns-1)/res_ns)*res_ns; + * + * u64 division missing on 32-bit arch, so use div_s64_rem + */ + d = div_s64_rem(req->set_timeout, res_ns, &rem); + + PFM_DBG("set%u flags=0x%x req_timeout=%lluns " + "HZ=%u TICK_NSEC=%lu clock_res=%ldns rem=%dns", + set_id, + flags, + (unsigned long long)req->set_timeout, + HZ, TICK_NSEC, + res_ns, + rem); + + /* + * Only accept timeout, we can actually achieve. 
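
The granularity check above rejects any timeout that is not an exact multiple of the hrtimer resolution; div_s64_rem() is used rather than a plain 64-bit division because the latter is not natively available on 32-bit architectures. A tiny user-space model of the same accept/reject decision follows; the 1 ms resolution is only an example value, not what any particular system reports.

/* Model of the timeout-granularity check (res_ns is an example value). */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const int64_t res_ns = 1000000;                         /* pretend 1 ms resolution */
        const int64_t requests[] = { 4000000, 2500000 };        /* 4 ms ok, 2.5 ms rejected */
        int i;

        for (i = 0; i < 2; i++) {
                int64_t rem = requests[i] % res_ns;     /* div_s64_rem() in the kernel */

                if (rem)
                        printf("%lld ns rejected (rem=%lld ns)\n",
                               (long long)requests[i], (long long)rem);
                else
                        printf("%lld ns accepted\n", (long long)requests[i]);
        }
        return 0;
}
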
+ * users can invoke clock_getres(CLOCK_MONOTONIC) + * to figure out resolution and adjust timeout + */ + if (rem) { + PFM_DBG("set%u invalid timeout=%llu", + set_id, + (unsigned long long)req->set_timeout); + return -EINVAL; + } + + tv = ns_to_timeval(req->set_timeout); + kt = timeval_to_ktime(tv); + set->hrtimer_exp = kt; + } else { + set->hrtimer_exp = ktime_set(0, 0); + } + + /* + * commit changes + */ + set->id = set_id; + set->flags = flags; + set->priv_flags = 0; + + /* + * activation and duration counters are reset as + * most likely major things will change in the set + */ + set->runs = 0; + set->duration = 0; + + return 0; +} + +/* + * this function does not modify the next field + */ +static void pfm_initialize_set(struct pfm_context *ctx, + struct pfm_event_set *set) +{ + u64 *impl_pmcs; + u16 i, max_pmc; + + max_pmc = ctx->regs.max_pmc; + impl_pmcs = ctx->regs.pmcs; + + /* + * install default values for all PMC registers + */ + for (i = 0; i < max_pmc; i++) { + if (test_bit(i, cast_ulp(impl_pmcs))) { + set->pmcs[i] = pfm_pmu_conf->pmc_desc[i].dfl_val; + PFM_DBG("set%u pmc%u=0x%llx", + set->id, + i, + (unsigned long long)set->pmcs[i]); + } + } + + /* + * PMD registers are set to 0 when the event set is allocated, + * hence we do not need to explicitly initialize them. + * + * For virtual PMD registers (i.e., those tied to a SW resource) + * their value becomes meaningful once the context is attached. + */ +} + +/* + * look for an event set using its identification. If the set does not + * exist: + * - if alloc == 0 then return error + * - if alloc == 1 then allocate set + * + * alloc is one ONLY when coming from pfm_create_evtsets() which can only + * be called when the context is detached, i.e. monitoring is stopped. + */ +struct pfm_event_set *pfm_find_set(struct pfm_context *ctx, u16 set_id, int alloc) +{ + struct pfm_event_set *set = NULL, *prev, *new_set; + + PFM_DBG("looking for set=%u", set_id); + + prev = NULL; + list_for_each_entry(set, &ctx->set_list, list) { + if (set->id == set_id) + return set; + if (set->id > set_id) + break; + prev = set; + } + + if (!alloc) + return NULL; + + /* + * we are holding the context spinlock and interrupts + * are unmasked. We must use GFP_ATOMIC as we cannot + * sleep while holding a spin lock. + */ + new_set = kmem_cache_zalloc(pfm_set_cachep, GFP_ATOMIC); + if (!new_set) + return NULL; + + new_set->id = set_id; + + INIT_LIST_HEAD(&new_set->list); + + if (prev == NULL) { + list_add(&(new_set->list), &ctx->set_list); + } else { + PFM_DBG("add after set=%u", prev->id); + list_add(&(new_set->list), &prev->list); + } + return new_set; +} + +/** + * pfm_create_initial_set - create initial set from __pfm_c reate_context + * @ctx: context to atatched the set to + */ +int pfm_create_initial_set(struct pfm_context *ctx) +{ + struct pfm_event_set *set; + + /* + * create initial set0 + */ + if (!pfm_find_set(ctx, 0, 1)) + return -ENOMEM; + + set = list_first_entry(&ctx->set_list, struct pfm_event_set, list); + + pfm_initialize_set(ctx, set); + + return 0; +} + +/* + * context is unloaded for this command. 
Interrupts are enabled + */ +int __pfm_create_evtsets(struct pfm_context *ctx, struct pfarg_setdesc *req, + int count) +{ + struct pfm_event_set *set; + u16 set_id; + int i, ret; + + for (i = 0; i < count; i++, req++) { + set_id = req->set_id; + + PFM_DBG("set_id=%u", set_id); + + set = pfm_find_set(ctx, set_id, 1); + if (set == NULL) + goto error_mem; + + ret = pfm_change_evtset(ctx, set, req); + if (ret) + goto error_params; + + pfm_initialize_set(ctx, set); + } + return 0; +error_mem: + PFM_DBG("cannot allocate set %u", set_id); + return -ENOMEM; +error_params: + return ret; +} + +int __pfm_getinfo_evtsets(struct pfm_context *ctx, struct pfarg_setinfo *req, + int count) +{ + struct pfm_event_set *set; + int i, is_system, is_loaded, is_self, ret; + u16 set_id; + u64 end; + + end = sched_clock(); + + is_system = ctx->flags.system; + is_loaded = ctx->state == PFM_CTX_LOADED; + is_self = ctx->task == current || is_system; + + ret = -EINVAL; + for (i = 0; i < count; i++, req++) { + + set_id = req->set_id; + + list_for_each_entry(set, &ctx->set_list, list) { + if (set->id == set_id) + goto found; + if (set->id > set_id) + goto error; + } +found: + req->set_flags = set->flags; + + /* + * compute leftover timeout + * + * lockdep may complain about lock inversion + * because of get_remaining() however, this + * applies to self-montoring only, thus the + * thread cannot be in the timeout handler + * and here at the same time given that we + * run with interrupts disabled + */ + if (is_loaded && is_self) { + struct hrtimer *h; + h = &__get_cpu_var(pfm_hrtimer); + req->set_timeout = ktime_to_ns(hrtimer_get_remaining(h)); + } else { + /* + * hrtimer_rem zero when not using + * timeout-based switching + */ + req->set_timeout = ktime_to_ns(set->hrtimer_rem); + } + + req->set_runs = set->runs; + req->set_act_duration = set->duration; + + /* + * adjust for active set if needed + */ + if (is_system && is_loaded && ctx->flags.started + && set == ctx->active_set) + req->set_act_duration += end - set->duration_start; + + /* + * copy the list of pmds which last overflowed + */ + bitmap_copy(cast_ulp(req->set_ovfl_pmds), + cast_ulp(set->ovfl_pmds), + PFM_MAX_PMDS); + + /* + * copy bitmask of available PMU registers + * + * must copy over the entire vector to avoid + * returning bogus upper bits pass by user + */ + bitmap_copy(cast_ulp(req->set_avail_pmcs), + cast_ulp(ctx->regs.pmcs), + PFM_MAX_PMCS); + + bitmap_copy(cast_ulp(req->set_avail_pmds), + cast_ulp(ctx->regs.pmds), + PFM_MAX_PMDS); + + PFM_DBG("set%u flags=0x%x eff_usec=%llu runs=%llu " + "a_pmcs=0x%llx a_pmds=0x%llx", + set_id, + set->flags, + (unsigned long long)req->set_timeout, + (unsigned long long)set->runs, + (unsigned long long)ctx->regs.pmcs[0], + (unsigned long long)ctx->regs.pmds[0]); + } + ret = 0; +error: + return ret; +} + +/* + * context is unloaded for this command. Interrupts are enabled + */ +int __pfm_delete_evtsets(struct pfm_context *ctx, void *arg, int count) +{ + struct pfarg_setdesc *req = arg; + struct pfm_event_set *set; + u16 set_id; + int i, ret; + + ret = -EINVAL; + for (i = 0; i < count; i++, req++) { + set_id = req->set_id; + + list_for_each_entry(set, &ctx->set_list, list) { + if (set->id == set_id) + goto found; + if (set->id > set_id) + goto error; + } + goto error; +found: + /* + * clear active set if necessary. 
+ * will be updated when context is loaded + */ + if (set == ctx->active_set) + ctx->active_set = NULL; + + list_del(&set->list); + + kmem_cache_free(pfm_set_cachep, set); + + PFM_DBG("set%u deleted", set_id); + } + ret = 0; +error: + return ret; +} + +/* + * called from pfm_context_free() to free all sets + */ +void pfm_free_sets(struct pfm_context *ctx) +{ + struct pfm_event_set *set, *tmp; + + list_for_each_entry_safe(set, tmp, &ctx->set_list, list) { + list_del(&set->list); + kmem_cache_free(pfm_set_cachep, set); + } +} + +/** + * pfm_restart_timer - restart hrtimer taking care of expired timeout + * @ctx : context to work with + * @set : current active set + * + * Must be called on the processor on which the timer is to be armed. + * Assumes context is locked and interrupts are masked + * + * Upon return the active set for the context may have changed + */ +void pfm_restart_timer(struct pfm_context *ctx, struct pfm_event_set *set) +{ + struct hrtimer *h; + enum hrtimer_restart ret; + + h = &__get_cpu_var(pfm_hrtimer); + + PFM_DBG_ovfl("hrtimer=%lld", (long long)ktime_to_ns(set->hrtimer_rem)); + + if (ktime_to_ns(set->hrtimer_rem) > 0) { + hrtimer_start(h, set->hrtimer_rem, HRTIMER_MODE_REL); + } else { + /* + * timer was not re-armed because it has already expired + * timer was not enqueued, we need to switch set now + */ + pfm_stats_inc(set_switch_exp); + + ret = pfm_switch_sets(ctx, NULL, 1, 0); + set = ctx->active_set; + if (ret == HRTIMER_RESTART) + hrtimer_start(h, set->hrtimer_rem, HRTIMER_MODE_REL); + } +} + +int __init pfm_init_sets(void) +{ + pfm_set_cachep = kmem_cache_create("pfm_event_set", + sizeof(struct pfm_event_set), + SLAB_HWCACHE_ALIGN, 0, NULL); + if (!pfm_set_cachep) { + PFM_ERR("cannot initialize event set slab"); + return -ENOMEM; + } + return 0; +} diff --git a/perfmon/perfmon_smpl.c b/perfmon/perfmon_smpl.c new file mode 100644 index 0000000..e31fb15 --- /dev/null +++ b/perfmon/perfmon_smpl.c @@ -0,0 +1,865 @@ +/* + * perfmon_smpl.c: perfmon2 sampling management + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. + * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "perfmon_priv.h" + +/** + * pfm_smpl_buf_alloc - allocate memory for sampling buffer + * @ctx: context to operate on + * @rsize: requested size + * + * called from pfm_smpl_buffer_alloc_old() (IA64-COMPAT) + * and pfm_setup_smpl_fmt() + * + * interrupts are enabled, context is not locked. + * + * function is not static because it is called from the IA-64 + * compatibility module (perfmon_compat.c) + */ +int pfm_smpl_buf_alloc(struct pfm_context *ctx, size_t rsize) +{ +#if PFM_ARCH_SMPL_ALIGN_SIZE > 0 +#define PFM_ALIGN_SMPL(a, f) (void *)((((unsigned long)(a))+(f-1)) & ~(f-1)) +#else +#define PFM_ALIGN_SMPL(a, f) (a) +#endif + void *addr, *real_addr; + size_t size, real_size; + int ret; + + might_sleep(); + + /* + * align page boundary + */ + size = PAGE_ALIGN(rsize); + + /* + * On some arch, it may be necessary to get an alignment greater + * than page size to avoid certain cache effects (e.g., MIPS). + * This is the reason for PFM_ARCH_SMPL_ALIGN_SIZE. + */ + real_size = size + PFM_ARCH_SMPL_ALIGN_SIZE; + + PFM_DBG("req_size=%zu size=%zu real_size=%zu", + rsize, + size, + real_size); + + ret = pfm_smpl_buf_space_acquire(ctx, real_size); + if (ret) + return ret; + + /* + * vmalloc can sleep. we do not hold + * any spinlock and interrupts are enabled + */ + real_addr = addr = vmalloc(real_size); + if (!real_addr) { + PFM_DBG("cannot allocate sampling buffer"); + goto unres; + } + + /* + * align the useable sampling buffer address to the arch requirement + * This is a nop on most architectures + */ + addr = PFM_ALIGN_SMPL(real_addr, PFM_ARCH_SMPL_ALIGN_SIZE); + + memset(addr, 0, real_size); + + /* + * due to cache aliasing, it may be necessary to flush the pages + * on certain architectures (e.g., MIPS) + */ + pfm_cacheflush(addr, real_size); + + /* + * what needs to be freed + */ + ctx->smpl_real_addr = real_addr; + ctx->smpl_real_size = real_size; + + /* + * what is actually available to user + */ + ctx->smpl_addr = addr; + ctx->smpl_size = size; + + PFM_DBG("addr=%p real_addr=%p", addr, real_addr); + + return 0; +unres: + /* + * smpl_addr is NULL, no double freeing possible in pfm_context_free() + */ + pfm_smpl_buf_space_release(ctx, real_size); + + return -ENOMEM; +} + +/** + * pfm_smpl_buf_free - free resources associated with sampling + * @ctx: context to operate on + */ +void pfm_smpl_buf_free(struct pfm_context *ctx) +{ + struct pfm_smpl_fmt *fmt; + + fmt = ctx->smpl_fmt; + + /* + * some formats may not use a buffer, yet they may + * need to be called on exit + */ + if (fmt) { + if (fmt->fmt_exit) + (*fmt->fmt_exit)(ctx->smpl_addr); + /* + * decrease refcount of sampling format + */ + pfm_smpl_fmt_put(fmt); + } + + if (ctx->smpl_addr) { + pfm_smpl_buf_space_release(ctx, ctx->smpl_real_size); + + PFM_DBG("free buffer real_addr=0x%p real_size=%zu", + ctx->smpl_real_addr, + ctx->smpl_real_size); + + vfree(ctx->smpl_real_addr); + } +} + +/** + * pfm_setup_smpl_fmt - initialization of sampling format and buffer + * @ctx: context to operate on + * @fmt_arg: smapling format arguments + * @ctx_flags: context flags as passed by user + * @filp: file descriptor associated with context + * + * called from __pfm_create_context() + */ +int pfm_setup_smpl_fmt(struct pfm_context *ctx, u32 
ctx_flags, void *fmt_arg, + struct file *filp) +{ + struct pfm_smpl_fmt *fmt; + size_t size = 0; + int ret = 0; + + fmt = ctx->smpl_fmt; + + /* + * validate parameters + */ + if (fmt->fmt_validate) { + ret = (*fmt->fmt_validate)(ctx_flags, + ctx->regs.num_pmds, + fmt_arg); + PFM_DBG("validate(0x%x,%p)=%d", ctx_flags, fmt_arg, ret); + if (ret) + goto error; + } + + /* + * check if buffer format needs buffer allocation + */ + size = 0; + if (fmt->fmt_getsize) { + ret = (*fmt->fmt_getsize)(ctx_flags, fmt_arg, &size); + if (ret) { + PFM_DBG("cannot get size ret=%d", ret); + goto error; + } + } + + /* + * allocate buffer + * v20_compat is for IA-64 backward compatibility with perfmon v2.0 + */ + if (size) { +#ifdef CONFIG_IA64_PERFMON_COMPAT + /* + * backward compatibility with perfmon v2.0 on Ia-64 + */ + if (ctx->flags.ia64_v20_compat) + ret = pfm_smpl_buf_alloc_compat(ctx, size, filp); + else +#endif + ret = pfm_smpl_buf_alloc(ctx, size); + + if (ret) + goto error; + + } + + if (fmt->fmt_init) { + ret = (*fmt->fmt_init)(ctx, ctx->smpl_addr, ctx_flags, + ctx->regs.num_pmds, + fmt_arg); + } + /* + * if there was an error, the buffer/resource will be freed by + * via pfm_context_free() + */ +error: + return ret; +} + +void pfm_mask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) +{ + u64 now; + + now = sched_clock(); + + /* + * we save the PMD values such that we can read them while + * MASKED without having the thread stopped + * because monitoring is stopped + * + * pfm_save_pmds() could be avoided if we knew + * that pfm_arch_intr_freeze() had saved them already + */ + pfm_save_pmds(ctx, set); + pfm_arch_mask_monitoring(ctx, set); + /* + * accumulate the set duration up to this point + */ + set->duration += now - set->duration_start; + + ctx->state = PFM_CTX_MASKED; + + /* + * need to stop timer and remember remaining time + * will be reloaded in pfm_unmask_monitoring + * hrtimer is cancelled in the tail of the interrupt + * handler once the context is unlocked + */ + if (set->flags & PFM_SETFL_TIME_SWITCH) { + struct hrtimer *h = &__get_cpu_var(pfm_hrtimer); + hrtimer_cancel(h); + set->hrtimer_rem = hrtimer_get_remaining(h); + } + PFM_DBG_ovfl("can_restart=%u", ctx->flags.can_restart); +} + +/** + * pfm_unmask_monitoring - unmask monitoring + * @ctx: context to work with + * @set: current active set + * + * interrupts are masked when entering this function. + * context must be in MASKED state when calling. + * + * Upon return, the active set may have changed when using timeout + * based switching. + */ +static void pfm_unmask_monitoring(struct pfm_context *ctx, struct pfm_event_set *set) +{ + if (ctx->state != PFM_CTX_MASKED) + return; + + PFM_DBG_ovfl("unmasking monitoring"); + + /* + * must be done before calling + * pfm_arch_unmask_monitoring() + */ + ctx->state = PFM_CTX_LOADED; + + /* + * we need to restore the PMDs because they + * may have been modified by user while MASKED in + * which case the actual registers have no yet + * been updated + */ + pfm_arch_restore_pmds(ctx, set); + + /* + * call arch specific handler + */ + pfm_arch_unmask_monitoring(ctx, set); + + /* + * clear force reload flag. 
May have been set + * in pfm_write_pmcs or pfm_write_pmds + */ + set->priv_flags &= ~PFM_SETFL_PRIV_MOD_BOTH; + + /* + * reset set duration timer + */ + set->duration_start = sched_clock(); + + /* + * restart hrtimer if needed + */ + if (set->flags & PFM_SETFL_TIME_SWITCH) { + pfm_restart_timer(ctx, set); + /* careful here as pfm_restart_timer may switch sets */ + } +} + +void pfm_reset_pmds(struct pfm_context *ctx, + struct pfm_event_set *set, + int num_pmds, + int reset_mode) +{ + u64 val, mask, new_seed; + struct pfm_pmd *reg; + unsigned int i, not_masked; + + not_masked = ctx->state != PFM_CTX_MASKED; + + PFM_DBG_ovfl("%s r_pmds=0x%llx not_masked=%d", + reset_mode == PFM_PMD_RESET_LONG ? "long" : "short", + (unsigned long long)set->reset_pmds[0], + not_masked); + + pfm_stats_inc(reset_pmds_count); + + for (i = 0; num_pmds; i++) { + if (test_bit(i, cast_ulp(set->reset_pmds))) { + num_pmds--; + + reg = set->pmds + i; + + val = reset_mode == PFM_PMD_RESET_LONG ? + reg->long_reset : reg->short_reset; + + if (reg->flags & PFM_REGFL_RANDOM) { + mask = reg->mask; + new_seed = random32(); + + /* construct a full 64-bit random value: */ + if ((unlikely(mask >> 32) != 0)) + new_seed |= (u64)random32() << 32; + + /* counter values are negative numbers! */ + val -= (new_seed & mask); + } + + set->pmds[i].value = val; + reg->lval = val; + + /* + * not all PMD to reset are necessarily + * counters + */ + if (not_masked) + pfm_write_pmd(ctx, i, val); + + PFM_DBG_ovfl("set%u pmd%u sval=0x%llx", + set->id, + i, + (unsigned long long)val); + } + } + + /* + * done with reset + */ + bitmap_zero(cast_ulp(set->reset_pmds), i); + + /* + * make changes visible + */ + if (not_masked) + pfm_arch_serialize(); +} + +/* + * called from pfm_handle_work() and __pfm_restart() + * for system-wide and per-thread context to resume + * monitoring after a user level notification. + * + * In both cases, the context is locked and interrupts + * are disabled. + */ +void pfm_resume_after_ovfl(struct pfm_context *ctx) +{ + struct pfm_smpl_fmt *fmt; + u32 rst_ctrl; + struct pfm_event_set *set; + u64 *reset_pmds; + void *hdr; + int state, ret; + + hdr = ctx->smpl_addr; + fmt = ctx->smpl_fmt; + state = ctx->state; + set = ctx->active_set; + ret = 0; + + if (hdr) { + rst_ctrl = 0; + prefetch(hdr); + } else { + rst_ctrl = PFM_OVFL_CTRL_RESET; + } + + /* + * if using a sampling buffer format and it has a restart callback, + * then invoke it. hdr may be NULL, if the format does not use a + * perfmon buffer + */ + if (fmt && fmt->fmt_restart) + ret = (*fmt->fmt_restart)(state == PFM_CTX_LOADED, &rst_ctrl, + hdr); + + reset_pmds = set->reset_pmds; + + PFM_DBG("fmt_restart=%d reset_count=%d set=%u r_pmds=0x%llx switch=%d " + "ctx_state=%d", + ret, + ctx->flags.reset_count, + set->id, + (unsigned long long)reset_pmds[0], + (set->priv_flags & PFM_SETFL_PRIV_SWITCH), + state); + + if (!ret) { + /* + * switch set if needed + */ + if (set->priv_flags & PFM_SETFL_PRIV_SWITCH) { + set->priv_flags &= ~PFM_SETFL_PRIV_SWITCH; + pfm_switch_sets(ctx, NULL, PFM_PMD_RESET_LONG, 0); + set = ctx->active_set; + } else if (rst_ctrl & PFM_OVFL_CTRL_RESET) { + int nn; + nn = bitmap_weight(cast_ulp(set->reset_pmds), + ctx->regs.max_pmd); + if (nn) + pfm_reset_pmds(ctx, set, nn, PFM_PMD_RESET_LONG); + } + + if (!(rst_ctrl & PFM_OVFL_CTRL_MASK)) + pfm_unmask_monitoring(ctx, set); + else + PFM_DBG("stopping monitoring?"); + ctx->state = PFM_CTX_LOADED; + } +} + +/* + * This function is called when we need to perform asynchronous + * work on a context. 
This function is called ONLY when about to + * return to user mode (very much like with signal handling). + * + * There are several reasons why we come here: + * + * - per-thread mode, not self-monitoring, to reset the counters + * after a pfm_restart() + * + * - we are zombie and we need to cleanup our state + * + * - we need to block after an overflow notification + * on a context with the PFM_OVFL_NOTIFY_BLOCK flag + * + * This function is never called for a system-wide context. + * + * pfm_handle_work() can be called with interrupts enabled + * (TIF_NEED_RESCHED) or disabled. The down_interruptible + * call may sleep, therefore we must re-enable interrupts + * to avoid deadlocks. It is safe to do so because this function + * is called ONLY when returning to user level, in which case + * there is no risk of kernel stack overflow due to deep + * interrupt nesting. + */ +void pfm_handle_work(struct pt_regs *regs) +{ + struct pfm_context *ctx; + unsigned long flags, dummy_flags; + int type, ret, info; + +#ifdef CONFIG_PPC + /* + * This is just a temporary fix. Obviously we'd like to fix the powerpc + * code to make that check before calling __pfm_handle_work() to + * prevent the function call overhead, but the call is made from + * assembly code, so it will take a little while to figure out how to + * perform the check correctly. + */ + if (!test_thread_flag(TIF_PERFMON_WORK)) + return; +#endif + + if (!user_mode(regs)) + return; + + clear_thread_flag(TIF_PERFMON_WORK); + + pfm_stats_inc(handle_work_count); + + ctx = current->pfm_context; + if (ctx == NULL) { + PFM_DBG("[%d] has no ctx", current->pid); + return; + } + + BUG_ON(ctx->flags.system); + + spin_lock_irqsave(&ctx->lock, flags); + + type = ctx->flags.work_type; + ctx->flags.work_type = PFM_WORK_NONE; + + PFM_DBG("work_type=%d reset_count=%d", + type, + ctx->flags.reset_count); + + switch (type) { + case PFM_WORK_ZOMBIE: + goto do_zombie; + case PFM_WORK_RESET: + /* simply reset, no blocking */ + goto skip_blocking; + case PFM_WORK_NONE: + PFM_DBG("unexpected PFM_WORK_NONE"); + goto nothing_todo; + case PFM_WORK_BLOCK: + break; + default: + PFM_DBG("unkown type=%d", type); + goto nothing_todo; + } + + /* + * restore interrupt mask to what it was on entry. + * Could be enabled/disabled. + */ + spin_unlock_irqrestore(&ctx->lock, flags); + + /* + * force interrupt enable because of down_interruptible() + */ + local_irq_enable(); + + PFM_DBG("before block sleeping"); + + /* + * may go through without blocking on SMP systems + * if restart has been received already by the time we call down() + */ + ret = wait_for_completion_interruptible(&ctx->restart_complete); + + PFM_DBG("after block sleeping ret=%d", ret); + + /* + * lock context and mask interrupts again + * We save flags into a dummy because we may have + * altered interrupts mask compared to entry in this + * function. + */ + spin_lock_irqsave(&ctx->lock, dummy_flags); + + if (ctx->state == PFM_CTX_ZOMBIE) + goto do_zombie; + + /* + * in case of interruption of down() we don't restart anything + */ + if (ret < 0) + goto nothing_todo; + +skip_blocking: + /* + * iterate over the number of pending resets + * There are certain situations where there may be + * multiple notifications sent before a pfm_restart(). + * As such, it may be that multiple pfm_restart() are + * issued before the monitored thread gets to + * pfm_handle_work(). To avoid losing restarts, pfm_restart() + * increments a counter (reset_counts). 
Here, we take this + * into account by potentially calling pfm_resume_after_ovfl() + * multiple times. It is up to the sampling format to take the + * appropriate actions. + */ + while (ctx->flags.reset_count) { + pfm_resume_after_ovfl(ctx); + /* careful as active set may have changed */ + ctx->flags.reset_count--; + } + +nothing_todo: + /* + * restore flags as they were upon entry + */ + spin_unlock_irqrestore(&ctx->lock, flags); + return; + +do_zombie: + PFM_DBG("context is zombie, bailing out"); + + __pfm_unload_context(ctx, &info); + + /* + * keep the spinlock check happy + */ + spin_unlock(&ctx->lock); + + /* + * enable interrupt for vfree() + */ + local_irq_enable(); + + /* + * cancel timer now that context is unlocked + */ + if (info & 0x2) { + ret = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer)); + PFM_DBG("timeout cancel=%d", ret); + } + + /* + * actual context free + */ + pfm_free_context(ctx); + + /* + * restore interrupts as they were upon entry + */ + local_irq_restore(flags); + + /* always true */ + if (info & 0x1) + pfm_session_release(0, 0); +} + +/** + * __pfm_restart - resume monitoring after user-level notification + * @ctx: context to operate on + * @info: return information used to free resource once unlocked + * + * function called from sys_pfm_restart(). It is used when overflow + * notification is requested. For each notification received, the user + * must call pfm_restart() to indicate to the kernel that it is done + * processing the notification. + * + * When the caller is doing user level sampling, this function resets + * the overflowed counters and resumes monitoring which is normally stopped + * during notification (always the consequence of a counter overflow). + * + * When using a sampling format, the format restart() callback is invoked, + * overflowed PMDS may be reset based upon decision from sampling format. + * + * When operating in per-thread mode, and when not self-monitoring, the + * monitored thread DOES NOT need to be stopped, unlike for many other calls. + * + * This means that the effect of the restart may not necessarily be observed + * right when returning from the call. For instance, counters may not already + * be reset in the other thread. + * + * When operating in system-wide, the caller must be running on the monitored + * CPU. + * + * The context is locked and interrupts are disabled. + * + * info value upon return: + * - bit 0: when set, mudt issue complete() on restart semaphore + */ +int __pfm_restart(struct pfm_context *ctx, int *info) +{ + int state; + + state = ctx->state; + + PFM_DBG("state=%d can_restart=%d reset_count=%d", + state, + ctx->flags.can_restart, + ctx->flags.reset_count); + + *info = 0; + + switch (state) { + case PFM_CTX_MASKED: + break; + case PFM_CTX_LOADED: + if (ctx->smpl_addr && ctx->smpl_fmt->fmt_restart) + break; + default: + PFM_DBG("invalid state=%d", state); + return -EBUSY; + } + + /* + * first check if allowed to restart, i.e., notifications received + */ + if (!ctx->flags.can_restart) { + PFM_DBG("no restart can_restart=0"); + return -EBUSY; + } + + pfm_stats_inc(pfm_restart_count); + + /* + * at this point, the context is either LOADED or MASKED + */ + ctx->flags.can_restart--; + + /* + * handle self-monitoring case and system-wide + */ + if (ctx->task == current || ctx->flags.system) { + pfm_resume_after_ovfl(ctx); + return 0; + } + + /* + * restart another task + */ + + /* + * if blocking, then post the semaphore if PFM_CTX_MASKED, i.e. + * the task is blocked or on its way to block. 
+	 * restart path. If the monitoring is not masked, then the task
+	 * can be actively monitoring and we cannot directly intervene.
+	 * Therefore we use the trap mechanism to catch the task and
+	 * force it to reset the buffer/reset PMDs.
+	 *
+	 * if non-blocking, then we ensure that the task will go into
+	 * pfm_handle_work() before returning to user mode.
+	 *
+	 * We cannot explicitly reset another task; it MUST always
+	 * be done by the task itself. This works for system-wide because
+	 * the tool that is controlling the session is logically doing
+	 * "self-monitoring".
+	 */
+	if (ctx->flags.block && state == PFM_CTX_MASKED) {
+		PFM_DBG("unblocking [%d]", ctx->task->pid);
+		/*
+		 * It is not possible to call complete() with the context locked
+		 * otherwise we have a potential deadlock with the PMU context
+		 * switch code due to a lock inversion between task_rq_lock()
+		 * and the context lock.
+		 * Instead we mark whether or not we need to issue the complete
+		 * and we invoke the function once the context lock is released
+		 * in sys_pfm_restart()
+		 */
+		*info = 1;
+	} else {
+		PFM_DBG("[%d] armed exit trap", ctx->task->pid);
+		pfm_post_work(ctx->task, ctx, PFM_WORK_RESET);
+	}
+	ctx->flags.reset_count++;
+	return 0;
+}
+
+/**
+ * pfm_get_smpl_arg -- copy user arguments to pfm_create_context() related to sampling format
+ * @fmt_uname: format name as passed by user
+ * @fmt_uarg: optional format argument as passed by user
+ * @usize: size of the structure passed in fmt_uarg
+ * @arg: kernel copy of fmt_uarg
+ * @fmt: pointer to sampling format upon success
+ *
+ * arg is kmalloc'ed, thus the caller must kfree() it
+ */
+int pfm_get_smpl_arg(char __user *fmt_uname, void __user *fmt_uarg, size_t usize, void **arg,
+		     struct pfm_smpl_fmt **fmt)
+{
+	struct pfm_smpl_fmt *f;
+	char *fmt_name;
+	void *addr = NULL;
+	size_t sz;
+	int ret;
+
+	fmt_name = getname(fmt_uname);
+	if (!fmt_name) {
+		PFM_DBG("getname failed");
+		return -ENOMEM;
+	}
+
+	/*
+	 * find fmt and increase refcount
+	 */
+	f = pfm_smpl_fmt_get(fmt_name);
+
+	putname(fmt_name);
+
+	if (f == NULL) {
+		PFM_DBG("buffer format not found");
+		return -EINVAL;
+	}
+
+	/*
+	 * expected format argument size
+	 */
+	sz = f->fmt_arg_size;
+
+	/*
+	 * check user size matches expected size
+	 * usize = -1 is for IA-64 backward compatibility
+	 */
+	ret = -EINVAL;
+	if (sz != usize && usize != -1) {
+		PFM_DBG("invalid arg size %zu, format expects %zu",
+			usize, sz);
+		goto error;
+	}
+
+	if (sz) {
+		ret = -ENOMEM;
+		addr = kmalloc(sz, GFP_KERNEL);
+		if (addr == NULL)
+			goto error;
+
+		ret = -EFAULT;
+		if (copy_from_user(addr, fmt_uarg, sz))
+			goto error;
+	}
+	*arg = addr;
+	*fmt = f;
+	return 0;
+
+error:
+	kfree(addr);
+	pfm_smpl_fmt_put(f);
+	return ret;
+}
diff --git a/perfmon/perfmon_syscalls.c b/perfmon/perfmon_syscalls.c
new file mode 100644
index 0000000..8777b58
--- /dev/null
+++ b/perfmon/perfmon_syscalls.c
@@ -0,0 +1,1060 @@
+/*
+ * perfmon_syscalls.c: perfmon2 system call interface
+ *
+ * This file implements the perfmon2 interface which
+ * provides access to the hardware performance counters
+ * of the host processor.
+ *
+ * The initial version of perfmon.c was written by
+ * Ganesh Venkitachalam, IBM Corp.
+ *
+ * Then it was modified for perfmon-1.x by Stephane Eranian and
+ * David Mosberger, Hewlett Packard Co.
+ *
+ * Version Perfmon-2.x is a complete rewrite of perfmon-1.x
+ * by Stephane Eranian, Hewlett Packard Co.
+ *
+ * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P.
+ * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include +#include +#include +#include +#include "perfmon_priv.h" + +/* + * Context locking rules: + * --------------------- + * - any thread with access to the file descriptor of a context can + * potentially issue perfmon calls + * + * - calls must be serialized to guarantee correctness + * + * - as soon as a context is attached to a thread or CPU, it may be + * actively monitoring. On some architectures, such as IA-64, this + * is true even though the pfm_start() call has not been made. This + * comes from the fact that on some architectures, it is possible to + * start/stop monitoring from userland. + * + * - If monitoring is active, then there can PMU interrupts. Because + * context accesses must be serialized, the perfmon system calls + * must mask interrupts as soon as the context is attached. + * + * - perfmon system calls that operate with the context unloaded cannot + * assume it is actually unloaded when they are called. They first need + * to check and for that they need interrupts masked. Then, if the + * context is actually unloaded, they can unmask interrupts. + * + * - interrupt masking holds true for other internal perfmon functions as + * well. Except for PMU interrupt handler because those interrupts + * cannot be nested. + * + * - we mask ALL interrupts instead of just the PMU interrupt because we + * also need to protect against timer interrupts which could trigger + * a set switch. 
+ */ +#ifdef CONFIG_UTRACE +#include + +static u32 +stopper_quiesce(struct utrace_attached_engine *engine, struct task_struct *tsk) +{ + PFM_DBG("quiesced [%d]", tsk->pid); + complete(engine->data); + return UTRACE_ACTION_RESUME; +} + +void +pfm_resume_task(struct task_struct *t, void *data) +{ + PFM_DBG("utrace detach [%d]", t->pid); + (void) utrace_detach(t, data); +} + +static const struct utrace_engine_ops utrace_ops = +{ + .report_quiesce = stopper_quiesce, +}; + +static int pfm_wait_task_stopped(struct task_struct *task, void **data) +{ + DECLARE_COMPLETION_ONSTACK(done); + struct utrace_attached_engine *eng; + int ret; + + eng = utrace_attach(task, UTRACE_ATTACH_CREATE, &utrace_ops, &done); + if (IS_ERR(eng)) + return PTR_ERR(eng); + + ret = utrace_set_flags(task, eng, + UTRACE_ACTION_QUIESCE | UTRACE_EVENT(QUIESCE)); + PFM_DBG("wait quiesce [%d]", task->pid); + if (!ret) + ret = wait_for_completion_interruptible(&done); + + if (ret) + (void) utrace_detach(task, eng); + else + *data = eng; + return 0; +} +#else /* !CONFIG_UTRACE */ +static int pfm_wait_task_stopped(struct task_struct *task, void **data) +{ + int ret; + + *data = NULL; + + /* + * returns 0 if cannot attach + */ + ret = ptrace_may_access(task, PTRACE_MODE_ATTACH); + PFM_DBG("may_attach=%d", ret); + if (!ret) + return -EPERM; + + ret = ptrace_check_attach(task, 0); + PFM_DBG("check_attach=%d", ret); + return ret; +} +void pfm_resume_task(struct task_struct *t, void *data) +{} +#endif + +struct pfm_syscall_cookie { + struct file *filp; + int fput_needed; +}; + +/* + * cannot attach if : + * - kernel task + * - task not owned by caller (checked by ptrace_may_attach()) + * - task is dead or zombie + * - cannot use blocking notification when self-monitoring + */ +static int pfm_task_incompatible(struct pfm_context *ctx, + struct task_struct *task) +{ + /* + * cannot attach to a kernel thread + */ + if (!task->mm) { + PFM_DBG("cannot attach to kernel thread [%d]", task->pid); + return -EPERM; + } + + /* + * cannot use block on notification when + * self-monitoring. + */ + if (ctx->flags.block && task == current) { + PFM_DBG("cannot use block on notification when self-monitoring" + "[%d]", task->pid); + return -EINVAL; + } + /* + * cannot attach to a zombie task + */ + if (task->exit_state == EXIT_ZOMBIE || task->exit_state == EXIT_DEAD) { + PFM_DBG("cannot attach to zombie/dead task [%d]", task->pid); + return -EBUSY; + } + return 0; +} + +/** + * pfm_get_task -- check permission and acquire task to monitor + * @ctx: perfmon context + * @pid: identification of the task to check + * @task: upon return, a pointer to the task to monitor + * + * This function is used in per-thread mode only AND when not + * self-monitoring. It finds the task to monitor and checks + * that the caller has permissions to attach. It also checks + * that the task is stopped via ptrace so that we can safely + * modify its state. + * + * task refcount is incremented when succesful. + */ +static int pfm_get_task(struct pfm_context *ctx, pid_t pid, + struct task_struct **task, void **data) +{ + struct task_struct *p; + int ret = 0, ret1 = 0; + + *data = NULL; + + /* + * When attaching to another thread we must ensure + * that the thread is actually stopped. + * + * As a consequence, only the ptracing parent can actually + * attach a context to a thread. Obviously, this constraint + * does not exist for self-monitoring threads. + * + * We use ptrace_may_attach() to check for permission. 
+ */ + read_lock(&tasklist_lock); + + p = find_task_by_vpid(pid); + if (p) + get_task_struct(p); + + read_unlock(&tasklist_lock); + + if (!p) { + PFM_DBG("task not found %d", pid); + return -ESRCH; + } + + ret = pfm_task_incompatible(ctx, p); + if (ret) + goto error; + + ret = pfm_wait_task_stopped(p, data); + if (ret) + goto error; + + *task = p; + + return 0; +error: + if (!(ret1 || ret)) + ret = -EPERM; + + put_task_struct(p); + + return ret; +} + +/* + * context must be locked when calling this function + */ +int pfm_check_task_state(struct pfm_context *ctx, int check_mask, + unsigned long *flags, void **resume) +{ + struct task_struct *task; + unsigned long local_flags, new_flags; + int state, ret; + + *resume = NULL; + +recheck: + /* + * task is NULL for system-wide context + */ + task = ctx->task; + state = ctx->state; + local_flags = *flags; + + PFM_DBG("state=%d check_mask=0x%x", state, check_mask); + /* + * if the context is detached, then we do not touch + * hardware, therefore there is not restriction on when we can + * access it. + */ + if (state == PFM_CTX_UNLOADED) + return 0; + /* + * no command can operate on a zombie context. + * A context becomes zombie when the file that identifies + * it is closed while the context is still attached to the + * thread it monitors. + */ + if (state == PFM_CTX_ZOMBIE) + return -EINVAL; + + /* + * at this point, state is PFM_CTX_LOADED or PFM_CTX_MASKED + */ + + /* + * some commands require the context to be unloaded to operate + */ + if (check_mask & PFM_CMD_UNLOADED) { + PFM_DBG("state=%d, cmd needs context unloaded", state); + return -EBUSY; + } + + /* + * self-monitoring always ok. + */ + if (task == current) + return 0; + + /* + * for syswide, the calling thread must be running on the cpu + * the context is bound to. + */ + if (ctx->flags.system) { + if (ctx->cpu != smp_processor_id()) + return -EBUSY; + return 0; + } + + /* + * at this point, monitoring another thread + */ + + /* + * the pfm_unload_context() command is allowed on masked context + */ + if (state == PFM_CTX_MASKED && !(check_mask & PFM_CMD_UNLOAD)) + return 0; + + /* + * When we operate on another thread, we must wait for it to be + * stopped and completely off any CPU as we need to access the + * PMU state (or machine state). + * + * A thread can be put in the STOPPED state in various ways + * including PTRACE_ATTACH, or when it receives a SIGSTOP signal. + * We enforce that the thread must be ptraced, so it is stopped + * AND it CANNOT wake up while we operate on it because this + * would require an action from the ptracing parent which is the + * thread that is calling this function. + * + * The dependency on ptrace, imposes that only the ptracing + * parent can issue command on a thread. This is unfortunate + * but we do not know of a better way of doing this. + */ + if (check_mask & PFM_CMD_STOPPED) { + + spin_unlock_irqrestore(&ctx->lock, local_flags); + + /* + * check that the thread is ptraced AND STOPPED + */ + ret = pfm_wait_task_stopped(task, resume); + + spin_lock_irqsave(&ctx->lock, new_flags); + + /* + * flags may be different than when we released the lock + */ + *flags = new_flags; + + if (ret) + return ret; + /* + * we must recheck to verify if state has changed + */ + if (unlikely(ctx->state != state)) { + PFM_DBG("old_state=%d new_state=%d", + state, + ctx->state); + goto recheck; + } + } + return 0; +} + +/* + * pfm_get_args - Function used to copy the syscall argument into kernel memory. 
* @ureq: user argument
+ * @sz: user argument size
+ * @lsz: size of stack buffer
+ * @laddr: stack buffer address
+ * @req: points to start of kernel copy of the argument
+ * @ptr_free: address of kernel copy to free
+ *
+ * There are two options:
+ *	- use a stack buffer described by laddr (address) and lsz (size)
+ *	- allocate memory
+ *
+ * return:
+ *	< 0 : in case of error (ptr_free may not be updated)
+ *	  0 : success
+ *	- req: points to base of kernel copy of arguments
+ *	- ptr_free: address of buffer to free by caller on exit.
+ *		    NULL if using the stack buffer
+ *
+ * when ptr_free is not NULL upon return, the caller must kfree()
+ */
+int pfm_get_args(void __user *ureq, size_t sz, size_t lsz, void *laddr,
+		 void **req, void **ptr_free)
+{
+	void *addr;
+
+	/*
+	 * check sysadmin argument limit
+	 */
+	if (unlikely(sz > pfm_controls.arg_mem_max)) {
+		PFM_DBG("argument too big %zu max=%zu",
+			sz,
+			pfm_controls.arg_mem_max);
+		return -E2BIG;
+	}
+
+	/*
+	 * check if vector fits on stack buffer
+	 */
+	if (sz > lsz) {
+		addr = kmalloc(sz, GFP_KERNEL);
+		if (unlikely(addr == NULL))
+			return -ENOMEM;
+		*ptr_free = addr;
+	} else {
+		addr = laddr;
+		*req = laddr;
+		*ptr_free = NULL;
+	}
+
+	/*
+	 * bring the data in
+	 */
+	if (unlikely(copy_from_user(addr, ureq, sz))) {
+		if (addr != laddr)
+			kfree(addr);
+		return -EFAULT;
+	}
+
+	/*
+	 * base address of kernel buffer
+	 */
+	*req = addr;
+
+	return 0;
+}
+
+/**
+ * pfm_acquire_ctx_from_fd -- get ctx from file descriptor
+ * @fd: file descriptor
+ * @ctx: pointer to pointer of context updated on return
+ * @cookie: opaque structure to use for release
+ *
+ * This helper function extracts the ctx from the file descriptor.
+ * It also increments the refcount of the file structure. Thus
+ * it updates the cookie so the refcount can be decreased when
+ * leaving the perfmon syscall via pfm_release_ctx_from_fd()
+ */
+static int pfm_acquire_ctx_from_fd(int fd, struct pfm_context **ctx,
+				   struct pfm_syscall_cookie *cookie)
+{
+	struct file *filp;
+	int fput_needed;
+
+	filp = fget_light(fd, &fput_needed);
+	if (unlikely(filp == NULL)) {
+		PFM_DBG("invalid fd %d", fd);
+		return -EBADF;
+	}
+
+	*ctx = filp->private_data;
+
+	if (unlikely(!*ctx || filp->f_op != &pfm_file_ops)) {
+		PFM_DBG("fd %d not related to perfmon", fd);
+		fput_light(filp, fput_needed);
+		return -EBADF;
+	}
+	cookie->filp = filp;
+	cookie->fput_needed = fput_needed;
+
+	return 0;
+}
+
+/**
+ * pfm_release_ctx_from_fd -- decrease refcount of file associated with context
+ * @cookie: the cookie structure initialized by pfm_acquire_ctx_from_fd
+ */
+static inline void pfm_release_ctx_from_fd(struct pfm_syscall_cookie *cookie)
+{
+	fput_light(cookie->filp, cookie->fput_needed);
+}
+
+/*
+ * unlike the other perfmon system calls, this one returns a file descriptor
+ * or a value < 0 in case of error, very much like open() or socket()
+ */
+asmlinkage long sys_pfm_create_context(struct pfarg_ctx __user *ureq,
+				       char __user *fmt_name,
+				       void __user *fmt_uarg, size_t fmt_size)
+{
+	struct pfarg_ctx req;
+	struct pfm_smpl_fmt *fmt = NULL;
+	void *fmt_arg = NULL;
+	int ret;
+
+	PFM_DBG("req=%p fmt=%p fmt_arg=%p size=%zu",
+		ureq, fmt_name, fmt_uarg, fmt_size);
+
+	if (perfmon_disabled)
+		return -ENOSYS;
+
+	if (copy_from_user(&req, ureq, sizeof(req)))
+		return -EFAULT;
+
+	if (fmt_name) {
+		ret = pfm_get_smpl_arg(fmt_name, fmt_uarg, fmt_size, &fmt_arg, &fmt);
+		if (ret)
+			goto abort;
+	}
+
+	ret = __pfm_create_context(&req, fmt, fmt_arg, PFM_NORMAL, NULL);
+
+	kfree(fmt_arg);
+abort:
+	return ret;
+}
+
+asmlinkage long
sys_pfm_write_pmcs(int fd, struct pfarg_pmc __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct pfm_syscall_cookie cookie; + struct pfarg_pmc pmcs[PFM_PMC_STK_ARG]; + struct pfarg_pmc *req; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret; + + PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) { + PFM_DBG("invalid arg count %d", count); + return -EINVAL; + } + + sz = count*sizeof(*ureq); + + ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); + if (ret) + return ret; + + ret = pfm_get_args(ureq, sz, sizeof(pmcs), pmcs, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); + if (!ret) + ret = __pfm_write_pmcs(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (resume) + pfm_resume_task(task, resume); + + /* + * This function may be on the critical path. + * We want to avoid the branch if unecessary. + */ + if (fptr) + kfree(fptr); +error: + pfm_release_ctx_from_fd(&cookie); + return ret; +} + +asmlinkage long sys_pfm_write_pmds(int fd, struct pfarg_pmd __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct pfm_syscall_cookie cookie; + struct pfarg_pmd pmds[PFM_PMD_STK_ARG]; + struct pfarg_pmd *req; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret; + + PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) { + PFM_DBG("invalid arg count %d", count); + return -EINVAL; + } + + sz = count*sizeof(*ureq); + + ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); + if (ret) + return ret; + + ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); + if (!ret) + ret = __pfm_write_pmds(ctx, req, count, 0); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (resume) + pfm_resume_task(task, resume); + + if (fptr) + kfree(fptr); +error: + pfm_release_ctx_from_fd(&cookie); + return ret; +} + +asmlinkage long sys_pfm_read_pmds(int fd, struct pfarg_pmd __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct pfm_syscall_cookie cookie; + struct pfarg_pmd pmds[PFM_PMD_STK_ARG]; + struct pfarg_pmd *req; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret; + + PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); + if (ret) + return ret; + + ret = pfm_get_args(ureq, sz, sizeof(pmds), pmds, (void **)&req, &fptr); + if (ret) + goto error; + + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume); + if (!ret) + ret = __pfm_read_pmds(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + if (resume) + pfm_resume_task(task, resume); + + if (fptr) + kfree(fptr); +error: + pfm_release_ctx_from_fd(&cookie); + return ret; +} + +asmlinkage long sys_pfm_restart(int fd) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct pfm_syscall_cookie cookie; + void *resume; + unsigned long flags; + int ret, info; + + PFM_DBG("fd=%d", fd); 
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	task = ctx->task;
+
+	ret = pfm_check_task_state(ctx, 0, &flags, &resume);
+	if (!ret)
+		ret = __pfm_restart(ctx, &info);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (resume)
+		pfm_resume_task(task, resume);
+	/*
+	 * In per-thread mode with blocking notification, i.e.
+	 * ctx->flags.block=1, we need to defer issuing the
+	 * complete to unblock the blocked monitored thread.
+	 * Otherwise we have a potential deadlock due to a lock
+	 * inversion between the context lock and the task_rq_lock()
+	 * which can happen if one thread is in this call and the other
+	 * (the monitored thread) is in the context switch code.
+	 *
+	 * It is safe to access the context outside the critical section
+	 * because:
+	 *   - we are protected by the fget_light(), thus the context
+	 *     cannot disappear
+	 */
+	if (ret == 0 && info == 1)
+		complete(&ctx->restart_complete);
+
+	pfm_release_ctx_from_fd(&cookie);
+	return ret;
+}
+
+asmlinkage long sys_pfm_stop(int fd)
+{
+	struct pfm_context *ctx;
+	struct task_struct *task;
+	struct pfm_syscall_cookie cookie;
+	void *resume;
+	unsigned long flags;
+	int ret;
+	int release_info = 0;
+
+	PFM_DBG("fd=%d", fd);
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	task = ctx->task;
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
+	if (!ret)
+		ret = __pfm_stop(ctx, &release_info);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (resume)
+		pfm_resume_task(task, resume);
+
+	/*
+	 * defer cancellation of timer to avoid race
+	 * with pfm_handle_switch_timeout()
+	 *
+	 * applies only when self-monitoring
+	 */
+	if (release_info & 0x2)
+		hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));
+
+	pfm_release_ctx_from_fd(&cookie);
+	return ret;
+}
+
+asmlinkage long sys_pfm_start(int fd, struct pfarg_start __user *ureq)
+{
+	struct pfm_context *ctx;
+	struct task_struct *task;
+	struct pfm_syscall_cookie cookie;
+	void *resume;
+	struct pfarg_start req;
+	unsigned long flags;
+	int ret;
+
+	PFM_DBG("fd=%d req=%p", fd, ureq);
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	/*
+	 * the one argument is actually optional
+	 */
+	if (ureq && copy_from_user(&req, ureq, sizeof(req))) {
+		pfm_release_ctx_from_fd(&cookie);
+		return -EFAULT;
+	}
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	task = ctx->task;
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED, &flags, &resume);
+	if (!ret)
+		ret = __pfm_start(ctx, ureq ? &req : NULL);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (resume)
+		pfm_resume_task(task, resume);
+
+	pfm_release_ctx_from_fd(&cookie);
+	return ret;
+}
+
+asmlinkage long sys_pfm_load_context(int fd, struct pfarg_load __user *ureq)
+{
+	struct pfm_context *ctx;
+	struct task_struct *task;
+	struct pfm_syscall_cookie cookie;
+	void *resume = NULL, *dummy_resume;
+	unsigned long flags;
+	struct pfarg_load req;
+	int ret;
+
+	PFM_DBG("fd=%d req=%p", fd, ureq);
+
+	if (copy_from_user(&req, ureq, sizeof(req)))
+		return -EFAULT;
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	task = current;
+
+	/*
+	 * in per-thread mode (not self-monitoring), get a reference
+	 * on the task to monitor. This must be done with interrupts enabled.
+	 * Upon successful return, the refcount on the task is increased.
+	 *
+	 * fget_light() is protecting the context.
+	 */
+	if (!ctx->flags.system && req.load_pid != current->pid) {
+		ret = pfm_get_task(ctx, req.load_pid, &task, &resume);
+		if (ret)
+			goto error;
+	}
+
+	/*
+	 * irqsave is required to avoid race in case context is already
+	 * loaded or with switch timeout in the case of self-monitoring
+	 */
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &dummy_resume);
+	if (!ret)
+		ret = __pfm_load_context(ctx, &req, task);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (resume)
+		pfm_resume_task(task, resume);
+
+	/*
+	 * in per-thread mode (not self-monitoring), we need
+	 * to decrease refcount on task to monitor:
+	 *   - load successful: we have a reference to the task in ctx->task
+	 *   - load failed    : undo the effect of pfm_get_task()
+	 */
+	if (task != current)
+		put_task_struct(task);
+error:
+	pfm_release_ctx_from_fd(&cookie);
+	return ret;
+}
+
+asmlinkage long sys_pfm_unload_context(int fd)
+{
+	struct pfm_context *ctx;
+	struct task_struct *task;
+	struct pfm_syscall_cookie cookie;
+	void *resume;
+	unsigned long flags;
+	int ret;
+	int is_system, release_info = 0;
+	u32 cpu;
+
+	PFM_DBG("fd=%d", fd);
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	is_system = ctx->flags.system;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	cpu = ctx->cpu;
+	task = ctx->task;
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_STOPPED|PFM_CMD_UNLOAD,
+				   &flags, &resume);
+	if (!ret)
+		ret = __pfm_unload_context(ctx, &release_info);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+
+	if (resume)
+		pfm_resume_task(task, resume);
+
+	/*
+	 * cancel timer now that context is unlocked
+	 * avoid race with pfm_handle_switch_timeout()
+	 */
+	if (release_info & 0x2) {
+		int r;
+		r = hrtimer_cancel(&__get_cpu_var(pfm_hrtimer));
+		PFM_DBG("timeout cancel=%d", r);
+	}
+
+	if (release_info & 0x1)
+		pfm_session_release(is_system, cpu);
+
+	pfm_release_ctx_from_fd(&cookie);
+	return ret;
+}
+
+asmlinkage long sys_pfm_create_evtsets(int fd, struct pfarg_setdesc __user *ureq, int count)
+{
+	struct pfm_context *ctx;
+	struct pfm_syscall_cookie cookie;
+	struct pfarg_setdesc *req;
+	void *fptr, *resume;
+	unsigned long flags;
+	size_t sz;
+	int ret;
+
+	PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count);
+
+	if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq))
+		return -EINVAL;
+
+	sz = count*sizeof(*ureq);
+
+	ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie);
+	if (ret)
+		return ret;
+
+	ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr);
+	if (ret)
+		goto error;
+
+	/*
+	 * must mask interrupts because we do not know the state of context,
+	 * could be attached and we could be getting PMU interrupts. So
+	 * we mask and lock context and we check and possibly relax masking
+	 */
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &resume);
+	if (!ret)
+		ret = __pfm_create_evtsets(ctx, req, count);
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	/*
+	 * context must be unloaded for this command.
The resume pointer + * is necessarily NULL, thus no need to call pfm_resume_task() + */ + kfree(fptr); + +error: + pfm_release_ctx_from_fd(&cookie); + return ret; +} + +asmlinkage long sys_pfm_getinfo_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count) +{ + struct pfm_context *ctx; + struct task_struct *task; + struct pfm_syscall_cookie cookie; + struct pfarg_setinfo *req; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret; + + PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); + if (ret) + return ret; + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + /* + * this command operates even when context is loaded, so we need + * to keep interrupts masked to avoid a race with PMU interrupt + * which may switch the active set + */ + spin_lock_irqsave(&ctx->lock, flags); + + task = ctx->task; + + ret = pfm_check_task_state(ctx, 0, &flags, &resume); + if (!ret) + ret = __pfm_getinfo_evtsets(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + + if (resume) + pfm_resume_task(task, resume); + + if (copy_to_user(ureq, req, sz)) + ret = -EFAULT; + + kfree(fptr); +error: + pfm_release_ctx_from_fd(&cookie); + return ret; +} + +asmlinkage long sys_pfm_delete_evtsets(int fd, struct pfarg_setinfo __user *ureq, int count) +{ + struct pfm_context *ctx; + struct pfm_syscall_cookie cookie; + struct pfarg_setinfo *req; + void *fptr, *resume; + unsigned long flags; + size_t sz; + int ret; + + PFM_DBG("fd=%d req=%p count=%d", fd, ureq, count); + + if (count < 0 || count >= PFM_MAX_ARG_COUNT(ureq)) + return -EINVAL; + + sz = count*sizeof(*ureq); + + ret = pfm_acquire_ctx_from_fd(fd, &ctx, &cookie); + if (ret) + return ret; + + ret = pfm_get_args(ureq, sz, 0, NULL, (void **)&req, &fptr); + if (ret) + goto error; + + /* + * must mask interrupts because we do not know the state of context, + * could be attached and we could be getting PMU interrupts + */ + spin_lock_irqsave(&ctx->lock, flags); + + ret = pfm_check_task_state(ctx, PFM_CMD_UNLOADED, &flags, &resume); + if (!ret) + ret = __pfm_delete_evtsets(ctx, req, count); + + spin_unlock_irqrestore(&ctx->lock, flags); + /* + * context must be unloaded for this command. The resume pointer + * is necessarily NULL, thus no need to call pfm_resume_task() + */ + kfree(fptr); + +error: + pfm_release_ctx_from_fd(&cookie); + return ret; +} diff --git a/perfmon/perfmon_sysfs.c b/perfmon/perfmon_sysfs.c new file mode 100644 index 0000000..7353c3b --- /dev/null +++ b/perfmon/perfmon_sysfs.c @@ -0,0 +1,525 @@ +/* + * perfmon_sysfs.c: perfmon2 sysfs interface + * + * This file implements the perfmon2 interface which + * provides access to the hardware performance counters + * of the host processor. + * + * The initial version of perfmon.c was written by + * Ganesh Venkitachalam, IBM Corp. + * + * Then it was modified for perfmon-1.x by Stephane Eranian and + * David Mosberger, Hewlett Packard Co. + * + * Version Perfmon-2.x is a complete rewrite of perfmon-1.x + * by Stephane Eranian, Hewlett Packard Co. + * + * Copyright (c) 1999-2006 Hewlett-Packard Development Company, L.P. 
+ * Contributed by Stephane Eranian + * David Mosberger-Tang + * + * More information about perfmon available at: + * http://perfmon2.sf.net + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + * 02111-1307 USA + */ +#include +#include /* for EXPORT_SYMBOL */ +#include +#include "perfmon_priv.h" + +struct pfm_attribute { + struct attribute attr; + ssize_t (*show)(void *, struct pfm_attribute *attr, char *); + ssize_t (*store)(void *, const char *, size_t); +}; +#define to_attr(n) container_of(n, struct pfm_attribute, attr); + +#define PFM_RO_ATTR(_name, _show) \ + struct kobj_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL) + +#define PFM_RW_ATTR(_name, _show, _store) \ + struct kobj_attribute attr_##_name = __ATTR(_name, 0644, _show, _store) + +#define PFM_ROS_ATTR(_name, _show) \ + struct pfm_attribute attr_##_name = __ATTR(_name, 0444, _show, NULL) + +#define is_attr_name(a, n) (!strcmp((a)->attr.name, n)) +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu); + +static struct kobject *pfm_kernel_kobj, *pfm_fmt_kobj; +static struct kobject *pfm_pmu_kobj; + +static ssize_t pfm_regs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_regmap_desc *reg = to_reg(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? attribute->show(reg, attribute, buf) : -EIO; +} + +static ssize_t pfm_fmt_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct pfm_smpl_fmt *fmt = to_smpl_fmt(kobj); + struct pfm_attribute *attribute = to_attr(attr); + return attribute->show ? 
attribute->show(fmt, attribute, buf) : -EIO; +} + +static struct sysfs_ops pfm_regs_sysfs_ops = { + .show = pfm_regs_attr_show +}; + +static struct sysfs_ops pfm_fmt_sysfs_ops = { + .show = pfm_fmt_attr_show +}; + +static struct kobj_type pfm_regs_ktype = { + .sysfs_ops = &pfm_regs_sysfs_ops, +}; + +static struct kobj_type pfm_fmt_ktype = { + .sysfs_ops = &pfm_fmt_sysfs_ops, +}; + +static ssize_t pfm_controls_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + int base; + + if (is_attr_name(attr, "version")) + return snprintf(buf, PAGE_SIZE, "%u.%u\n", PFM_VERSION_MAJ, PFM_VERSION_MIN); + + if (is_attr_name(attr, "task_sessions_count")) + return pfm_sysfs_res_show(buf, PAGE_SIZE, 0); + + if (is_attr_name(attr, "debug")) + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.debug); + + if (is_attr_name(attr, "task_group")) + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.task_group); + + if (is_attr_name(attr, "mode")) + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.flags); + + if (is_attr_name(attr, "arg_mem_max")) + return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.arg_mem_max); + + if (is_attr_name(attr, "syscall")) { + base = pfm_arch_get_base_syscall(); + return snprintf(buf, PAGE_SIZE, "%d\n", base); + } + + if (is_attr_name(attr, "sys_sessions_count")) + return pfm_sysfs_res_show(buf, PAGE_SIZE, 1); + + if (is_attr_name(attr, "smpl_buffer_mem_max")) + return snprintf(buf, PAGE_SIZE, "%zu\n", pfm_controls.smpl_buffer_mem_max); + + if (is_attr_name(attr, "smpl_buffer_mem_cur")) + return pfm_sysfs_res_show(buf, PAGE_SIZE, 2); + + if (is_attr_name(attr, "sys_group")) + return snprintf(buf, PAGE_SIZE, "%d\n", pfm_controls.sys_group); + + /* XXX: could be set to write-only */ + if (is_attr_name(attr, "reset_stats")) { + buf[0] = '0'; + buf[1] = '\0'; + return strnlen(buf, PAGE_SIZE); + } + return 0; +} + +static ssize_t pfm_controls_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int i; + size_t d; + + if (sscanf(buf, "%zu", &d) != 1) + goto skip; + + if (is_attr_name(attr, "debug")) + pfm_controls.debug = d; + + if (is_attr_name(attr, "task_group")) + pfm_controls.task_group = d; + + if (is_attr_name(attr, "sys_group")) + pfm_controls.sys_group = d; + + if (is_attr_name(attr, "mode")) + pfm_controls.flags = d ? PFM_CTRL_FL_RW_EXPERT : 0; + + if (is_attr_name(attr, "arg_mem_max")) { + /* + * we impose a page as the minimum. + * + * This limit may be smaller than the stack buffer + * available and that is fine. 
+ */ + if (d >= PAGE_SIZE) + pfm_controls.arg_mem_max = d; + } + if (is_attr_name(attr, "reset_stats")) { + for_each_online_cpu(i) { + pfm_reset_stats(i); + } + } + + if (is_attr_name(attr, "smpl_buffer_mem_max")) { + if (d >= PAGE_SIZE) + pfm_controls.smpl_buffer_mem_max = d; + } +skip: + return count; +} + +/* + * /sys/kernel/perfmon attributes + */ +static PFM_RO_ATTR(version, pfm_controls_show); +static PFM_RO_ATTR(task_sessions_count, pfm_controls_show); +static PFM_RO_ATTR(syscall, pfm_controls_show); +static PFM_RO_ATTR(sys_sessions_count, pfm_controls_show); +static PFM_RO_ATTR(smpl_buffer_mem_cur, pfm_controls_show); + +static PFM_RW_ATTR(debug, pfm_controls_show, pfm_controls_store); +static PFM_RW_ATTR(task_group, pfm_controls_show, pfm_controls_store); +static PFM_RW_ATTR(mode, pfm_controls_show, pfm_controls_store); +static PFM_RW_ATTR(sys_group, pfm_controls_show, pfm_controls_store); +static PFM_RW_ATTR(arg_mem_max, pfm_controls_show, pfm_controls_store); +static PFM_RW_ATTR(smpl_buffer_mem_max, pfm_controls_show, pfm_controls_store); +static PFM_RW_ATTR(reset_stats, pfm_controls_show, pfm_controls_store); + +static struct attribute *pfm_kernel_attrs[] = { + &attr_version.attr, + &attr_syscall.attr, + &attr_task_sessions_count.attr, + &attr_sys_sessions_count.attr, + &attr_smpl_buffer_mem_cur.attr, + &attr_debug.attr, + &attr_reset_stats.attr, + &attr_sys_group.attr, + &attr_task_group.attr, + &attr_mode.attr, + &attr_smpl_buffer_mem_max.attr, + &attr_arg_mem_max.attr, + NULL +}; + +static struct attribute_group pfm_kernel_attr_group = { + .attrs = pfm_kernel_attrs, +}; + +/* + * per-reg attributes + */ +static ssize_t pfm_reg_show(void *data, struct pfm_attribute *attr, char *buf) +{ + struct pfm_regmap_desc *reg; + int w; + + reg = data; + + if (is_attr_name(attr, "name")) + return snprintf(buf, PAGE_SIZE, "%s\n", reg->desc); + + if (is_attr_name(attr, "dfl_val")) + return snprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long)reg->dfl_val); + + if (is_attr_name(attr, "width")) { + w = (reg->type & PFM_REG_C64) ? 
+ pfm_pmu_conf->counter_width : 64; + return snprintf(buf, PAGE_SIZE, "%d\n", w); + } + + if (is_attr_name(attr, "rsvd_msk")) + return snprintf(buf, PAGE_SIZE, "0x%llx\n", + (unsigned long long)reg->rsvd_msk); + + if (is_attr_name(attr, "addr")) + return snprintf(buf, PAGE_SIZE, "0x%lx\n", reg->hw_addr); + + return 0; +} + +static PFM_ROS_ATTR(name, pfm_reg_show); +static PFM_ROS_ATTR(dfl_val, pfm_reg_show); +static PFM_ROS_ATTR(rsvd_msk, pfm_reg_show); +static PFM_ROS_ATTR(width, pfm_reg_show); +static PFM_ROS_ATTR(addr, pfm_reg_show); + +static struct attribute *pfm_reg_attrs[] = { + &attr_name.attr, + &attr_dfl_val.attr, + &attr_rsvd_msk.attr, + &attr_width.attr, + &attr_addr.attr, + NULL +}; + +static struct attribute_group pfm_reg_attr_group = { + .attrs = pfm_reg_attrs, +}; + +static ssize_t pfm_pmu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + if (is_attr_name(attr, "model")) + return snprintf(buf, PAGE_SIZE, "%s\n", pfm_pmu_conf->pmu_name); + return 0; +} +static PFM_RO_ATTR(model, pfm_pmu_show); + +static struct attribute *pfm_pmu_desc_attrs[] = { + &attr_model.attr, + NULL +}; + +static struct attribute_group pfm_pmu_desc_attr_group = { + .attrs = pfm_pmu_desc_attrs, +}; + +static int pfm_sysfs_add_pmu_regs(struct pfm_pmu_config *pmu) +{ + struct pfm_regmap_desc *reg; + unsigned int i, k; + int ret; + + reg = pmu->pmc_desc; + for (i = 0; i < pmu->num_pmc_entries; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + ret = kobject_init_and_add(®->kobj, &pfm_regs_ktype, + pfm_pmu_kobj, "pmc%u", i); + if (ret) + goto undo_pmcs; + + ret = sysfs_create_group(®->kobj, &pfm_reg_attr_group); + if (ret) { + kobject_del(®->kobj); + goto undo_pmcs; + } + } + + reg = pmu->pmd_desc; + for (i = 0; i < pmu->num_pmd_entries; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + ret = kobject_init_and_add(®->kobj, &pfm_regs_ktype, + pfm_pmu_kobj, "pmd%u", i); + if (ret) + goto undo_pmds; + + ret = sysfs_create_group(®->kobj, &pfm_reg_attr_group); + if (ret) { + kobject_del(®->kobj); + goto undo_pmds; + } + } + return 0; +undo_pmds: + reg = pmu->pmd_desc; + for (k = 0; k < i; k++, reg++) { + if (!(reg->type & PFM_REG_I)) + continue; + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + i = pmu->num_pmc_entries; + /* fall through */ +undo_pmcs: + reg = pmu->pmc_desc; + for (k = 0; k < i; k++, reg++) { + if (!(reg->type & PFM_REG_I)) + continue; + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + return ret; +} + +static int pfm_sysfs_del_pmu_regs(struct pfm_pmu_config *pmu) +{ + struct pfm_regmap_desc *reg; + unsigned int i; + + reg = pmu->pmc_desc; + for (i = 0; i < pmu->num_pmc_entries; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + + reg = pmu->pmd_desc; + for (i = 0; i < pmu->num_pmd_entries; i++, reg++) { + + if (!(reg->type & PFM_REG_I)) + continue; + + sysfs_remove_group(®->kobj, &pfm_reg_attr_group); + kobject_del(®->kobj); + } + return 0; +} + +/* + * when a PMU description module is inserted, we create + * a pmu_desc subdir in sysfs and we populate it with + * PMU specific information, such as register mappings + */ +int pfm_sysfs_add_pmu(struct pfm_pmu_config *pmu) +{ + int ret; + + pfm_pmu_kobj = kobject_create_and_add("pmu_desc", pfm_kernel_kobj); + if (!pfm_pmu_kobj) + return -ENOMEM; + + ret = sysfs_create_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group); + if (ret) { + /* will release 
pfm_pmu_kobj */
+		kobject_put(pfm_pmu_kobj);
+		return ret;
+	}
+
+	ret = pfm_sysfs_add_pmu_regs(pmu);
+	if (ret) {
+		sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
+		/* will release pfm_pmu_kobj */
+		kobject_put(pfm_pmu_kobj);
+	} else
+		kobject_uevent(pfm_pmu_kobj, KOBJ_ADD);
+
+	return ret;
+}
+
+/*
+ * when a PMU description module is removed, we also remove
+ * all its information from sysfs, i.e., the pmu_desc subdir
+ * disappears
+ */
+int pfm_sysfs_remove_pmu(struct pfm_pmu_config *pmu)
+{
+	pfm_sysfs_del_pmu_regs(pmu);
+	sysfs_remove_group(pfm_pmu_kobj, &pfm_pmu_desc_attr_group);
+	kobject_uevent(pfm_pmu_kobj, KOBJ_REMOVE);
+	kobject_put(pfm_pmu_kobj);
+	pfm_pmu_kobj = NULL;
+	return 0;
+}
+
+static ssize_t pfm_fmt_show(void *data, struct pfm_attribute *attr, char *buf)
+{
+	struct pfm_smpl_fmt *fmt = data;
+
+	if (is_attr_name(attr, "version"))
+		return snprintf(buf, PAGE_SIZE, "%u.%u\n",
+				fmt->fmt_version >> 16 & 0xffff,
+				fmt->fmt_version & 0xffff);
+	return 0;
+}
+
+/*
+ * do not use predefined macros because of name conflict
+ * with /sys/kernel/perfmon/version
+ */
+struct pfm_attribute attr_fmt_version = {
+	.attr	= { .name = "version", .mode = 0444 },
+	.show	= pfm_fmt_show,
+};
+
+static struct attribute *pfm_fmt_attrs[] = {
+	&attr_fmt_version.attr,
+	NULL
+};
+
+static struct attribute_group pfm_fmt_attr_group = {
+	.attrs = pfm_fmt_attrs,
+};
+
+/*
+ * when a sampling format module is inserted, we populate
+ * sysfs with some information
+ */
+int pfm_sysfs_add_fmt(struct pfm_smpl_fmt *fmt)
+{
+	int ret;
+
+	ret = kobject_init_and_add(&fmt->kobj, &pfm_fmt_ktype,
+				   pfm_fmt_kobj, fmt->fmt_name);
+	if (ret)
+		return ret;
+
+	ret = sysfs_create_group(&fmt->kobj, &pfm_fmt_attr_group);
+	if (ret)
+		kobject_del(&fmt->kobj);
+	else
+		kobject_uevent(&fmt->kobj, KOBJ_ADD);
+
+	return ret;
+}
+
+/*
+ * when a sampling format module is removed, its information
+ * must also be removed from sysfs
+ */
+void pfm_sysfs_remove_fmt(struct pfm_smpl_fmt *fmt)
+{
+	sysfs_remove_group(&fmt->kobj, &pfm_fmt_attr_group);
+	kobject_uevent(&fmt->kobj, KOBJ_REMOVE);
+	kobject_del(&fmt->kobj);
+}
+
+int __init pfm_init_sysfs(void)
+{
+	int ret;
+
+	pfm_kernel_kobj = kobject_create_and_add("perfmon", kernel_kobj);
+	if (!pfm_kernel_kobj) {
+		PFM_ERR("cannot add kernel object: /sys/kernel/perfmon");
+		return -ENOMEM;
+	}
+
+	ret = sysfs_create_group(pfm_kernel_kobj, &pfm_kernel_attr_group);
+	if (ret) {
+		kobject_put(pfm_kernel_kobj);
+		return ret;
+	}
+
+	pfm_fmt_kobj = kobject_create_and_add("formats", pfm_kernel_kobj);
+	if (!pfm_fmt_kobj) {
+		ret = -ENOMEM;
+		PFM_ERR("cannot add fmt object: %d", ret);
+		goto error_fmt;
+	}
+	if (pfm_pmu_conf)
+		pfm_sysfs_add_pmu(pfm_pmu_conf);
+
+	pfm_sysfs_builtin_fmt_add();
+
+	return 0;
+
+error_fmt:
+	kobject_del(pfm_kernel_kobj);
+	return ret;
+}
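
For reference, a typical self-monitoring sequence built on the syscalls added above is sketched below. This is an illustration only, not part of the patch: the __NR_pfm_* syscall numbers, the header providing the pfarg_* definitions, and the pfarg_pmc/pfarg_pmd field names (reg_num, reg_value) are assumptions taken from the perfmon2 user ABI; only load_pid is visible in the kernel code above. Self-monitoring is chosen deliberately, since (as the comments in pfm_check_task_state() explain) it avoids the requirement that the monitored thread be ptrace-stopped.

/*
 * Minimal user-space sketch of the perfmon2 syscall flow (assumptions noted above).
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <perfmon/perfmon.h>	/* assumed location of the pfarg_* definitions */

int main(void)
{
	struct pfarg_ctx ctx;
	struct pfarg_pmc pmc;
	struct pfarg_pmd pmd;
	struct pfarg_load load;
	int fd;

	memset(&ctx, 0, sizeof(ctx));
	memset(&pmc, 0, sizeof(pmc));
	memset(&pmd, 0, sizeof(pmd));
	memset(&load, 0, sizeof(load));

	/* returns a file descriptor, like open(); no sampling format used here */
	fd = syscall(__NR_pfm_create_context, &ctx, NULL, NULL, 0);
	if (fd < 0)
		return 1;

	pmc.reg_num = 0;	/* assumed field names; value would be an event encoding */
	pmc.reg_value = 0;
	pmd.reg_num = 0;
	pmd.reg_value = 0;

	syscall(__NR_pfm_write_pmcs, fd, &pmc, 1);
	syscall(__NR_pfm_write_pmds, fd, &pmd, 1);

	load.load_pid = getpid();	/* self-monitoring: no ptrace stop needed */
	syscall(__NR_pfm_load_context, fd, &load);

	syscall(__NR_pfm_start, fd, NULL);	/* the pfarg_start argument is optional */
	/* ... workload to measure ... */
	syscall(__NR_pfm_stop, fd);

	syscall(__NR_pfm_read_pmds, fd, &pmd, 1);
	printf("counter 0: %llu\n", (unsigned long long)pmd.reg_value);

	close(fd);	/* releases the context */
	return 0;
}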