 docs/misc/xen-command-line.markdown |   8
 xen/arch/x86/acpi/cpu_idle.c        |  66
 xen/arch/x86/cpu/Makefile           |   1
 xen/arch/x86/cpu/mwait-idle.c       | 513
 xen/include/asm-x86/cpuidle.h       |  35
 xen/include/asm-x86/msr-index.h     |   5
 xen/include/asm-x86/mwait.h         |  17
 7 files changed, 610 insertions(+), 35 deletions(-)
diff --git a/docs/misc/xen-command-line.markdown b/docs/misc/xen-command-line.markdown
index e771fed3b2..716f3692c2 100644
--- a/docs/misc/xen-command-line.markdown
+++ b/docs/misc/xen-command-line.markdown
@@ -620,6 +620,14 @@ limit is ignored by Xen.
Specify if the MMConfig space should be enabled.
+### mwait-idle
+> `= <boolean>`
+
+> Default: `true`
+
+Use the MWAIT idle driver (with model-specific C-state knowledge) instead
+of the ACPI-based one.
+
### nmi
> `= ignore | dom0 | fatal`
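
For illustration, assuming a GRUB-style multiboot setup, falling back to the
ACPI-based driver would look like:

    multiboot /boot/xen.gz mwait-idle=no

`mwait-idle=no` (equivalently `=false` or `=0`) leaves the ACPI idle driver in
charge; `max_cstate` still caps the deepest C-state in either case.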
diff --git a/xen/arch/x86/acpi/cpu_idle.c b/xen/arch/x86/acpi/cpu_idle.c
index db9a3d89fd..f6ec64a0ae 100644
--- a/xen/arch/x86/acpi/cpu_idle.c
+++ b/xen/arch/x86/acpi/cpu_idle.c
@@ -39,7 +39,6 @@
#include <xen/smp.h>
#include <xen/guest_access.h>
#include <xen/keyhandler.h>
-#include <xen/cpuidle.h>
#include <xen/trace.h>
#include <xen/sched-if.h>
#include <xen/irq.h>
@@ -54,6 +53,8 @@
#include <public/sysctl.h>
#include <acpi/cpufreq/cpufreq.h>
#include <asm/apic.h>
+#include <asm/cpuidle.h>
+#include <asm/mwait.h>
#include <xen/notifier.h>
#include <xen/cpu.h>
@@ -70,18 +71,18 @@
#define GET_CC7_RES(val) GET_HW_RES_IN_NS(0x3FE, val) /* SNB only */
static void lapic_timer_nop(void) { }
-static void (*lapic_timer_off)(void);
-static void (*lapic_timer_on)(void);
+void (*__read_mostly lapic_timer_off)(void);
+void (*__read_mostly lapic_timer_on)(void);
static uint64_t (*__read_mostly tick_to_ns)(uint64_t) = acpi_pm_tick_to_ns;
-static void (*pm_idle_save) (void) __read_mostly;
+void (*__read_mostly pm_idle_save)(void);
unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER - 1;
integer_param("max_cstate", max_cstate);
static bool_t __read_mostly local_apic_timer_c2_ok;
boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok);
-static struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
+struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
struct hw_residencies
{
@@ -236,12 +237,10 @@ static uint64_t acpi_pm_ticks_elapsed(uint64_t t1, uint64_t t2)
return ((0xFFFFFFFF - t1) + t2 +1);
}
-static uint64_t (*__read_mostly get_tick)(void) = get_acpi_pm_tick;
+uint64_t (*__read_mostly cpuidle_get_tick)(void) = get_acpi_pm_tick;
static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t)
= acpi_pm_ticks_elapsed;
-#define MWAIT_ECX_INTERRUPT_BREAK (0x1)
-
/*
* The bit is set iff the cpu uses monitor/mwait to enter a C state;
* with this flag set, the CPU can be woken up from a C state
@@ -263,7 +262,7 @@ void cpuidle_wakeup_mwait(cpumask_t *mask)
cpumask_andnot(mask, mask, &target);
}
-static void mwait_idle_with_hints(unsigned long eax, unsigned long ecx)
+void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
{
unsigned int cpu = smp_processor_id();
s_time_t expires = per_cpu(timer_deadline, cpu);
@@ -334,7 +333,7 @@ static struct {
unsigned int count;
} c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED };
-static inline void trace_exit_reason(u32 *irq_traced)
+void trace_exit_reason(u32 *irq_traced)
{
if ( unlikely(tb_init_done) )
{
@@ -354,15 +353,6 @@ static inline void trace_exit_reason(u32 *irq_traced)
}
}
-/* vcpu is urgent if vcpu is polling event channel
- *
- * if urgent vcpu exists, CPU should not enter deep C state
- */
-static int sched_has_urgent_vcpu(void)
-{
- return atomic_read(&this_cpu(schedule_data).urgent_count);
-}
-
/*
* "AAJ72. EOI Transaction May Not be Sent if Software Enters Core C6 During
* an Interrupt Service Routine"
@@ -388,10 +378,11 @@ bool_t errata_c6_eoi_workaround(void)
return (fix_needed && cpu_has_pending_apic_eoi());
}
-static inline void acpi_update_idle_stats(struct acpi_processor_power *power,
- struct acpi_processor_cx *cx,
- int64_t sleep_ticks)
+void update_idle_stats(struct acpi_processor_power *power,
+ struct acpi_processor_cx *cx,
+ uint64_t before, uint64_t after)
{
+ int64_t sleep_ticks = ticks_elapsed(before, after);
/* Interrupts are disabled */
spin_lock(&power->stat_lock);
@@ -472,19 +463,19 @@ static void acpi_processor_idle(void)
if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
{
/* Get start time (ticks) */
- t1 = get_tick();
+ t1 = cpuidle_get_tick();
/* Trace cpu idle entry */
TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
/* Invoke C2 */
acpi_idle_do_entry(cx);
/* Get end time (ticks) */
- t2 = get_tick();
+ t2 = cpuidle_get_tick();
trace_exit_reason(irq_traced);
/* Trace cpu idle exit */
TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
/* Update statistics */
- acpi_update_idle_stats(power, cx, ticks_elapsed(t1, t2));
+ update_idle_stats(power, cx, t1, t2);
/* Re-enable interrupts */
local_irq_enable();
break;
@@ -500,7 +491,7 @@ static void acpi_processor_idle(void)
lapic_timer_off();
/* Get start time (ticks) */
- t1 = get_tick();
+ t1 = cpuidle_get_tick();
/* Trace cpu idle entry */
TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
@@ -549,7 +540,7 @@ static void acpi_processor_idle(void)
}
/* Get end time (ticks) */
- t2 = get_tick();
+ t2 = cpuidle_get_tick();
/* recovering TSC */
cstate_restore_tsc();
@@ -559,7 +550,7 @@ static void acpi_processor_idle(void)
irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
/* Update statistics */
- acpi_update_idle_stats(power, cx, ticks_elapsed(t1, t2));
+ update_idle_stats(power, cx, t1, t2);
/* Re-enable interrupts */
local_irq_enable();
/* recovering APIC */
@@ -586,7 +577,7 @@ static void acpi_processor_idle(void)
cpuidle_current_governor->reflect(power);
}
-static void acpi_dead_idle(void)
+void acpi_dead_idle(void)
{
struct acpi_processor_power *power;
struct acpi_processor_cx *cx;
@@ -649,7 +640,7 @@ default_halt:
halt();
}
-static int cpuidle_init_cpu(int cpu)
+int cpuidle_init_cpu(unsigned int cpu)
{
struct acpi_processor_power *acpi_power;
@@ -660,7 +651,7 @@ static int cpuidle_init_cpu(int cpu)
if ( cpu == 0 && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
{
- get_tick = get_stime_tick;
+ cpuidle_get_tick = get_stime_tick;
ticks_elapsed = stime_ticks_elapsed;
tick_to_ns = stime_tick_to_ns;
}
@@ -685,9 +676,6 @@ static int cpuidle_init_cpu(int cpu)
return 0;
}
-#define MWAIT_SUBSTATE_MASK (0xf)
-#define MWAIT_SUBSTATE_SIZE (4)
-
static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
{
struct cpuinfo_x86 *c = &current_cpu_data;
@@ -1026,6 +1014,9 @@ long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
if ( unlikely(!guest_handle_okay(power->states, power->count)) )
return -EFAULT;
+ if ( pm_idle_save && pm_idle != acpi_processor_idle )
+ return 0;
+
print_cx_pminfo(cpu, power);
/* map from acpi_id to cpu_id */
@@ -1195,7 +1186,12 @@ static struct notifier_block cpu_nfb = {
static int __init cpuidle_presmp_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
- cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+
+ if ( !xen_cpuidle )
+ return 0;
+
+ mwait_idle_init(&cpu_nfb);
+ cpu_nfb.notifier_call(&cpu_nfb, CPU_ONLINE, cpu);
register_cpu_notifier(&cpu_nfb);
return 0;
}
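
The ordering in cpuidle_presmp_init() is the interesting part: mwait_idle_init()
runs first and, if its probe succeeds, swaps in both the idle routine and the
CPU notifier before the boot CPU's CPU_ONLINE callback fires. A rough sketch of
the resulting boot flow, assuming the probe succeeds:

    cpuidle_presmp_init()
        mwait_idle_init(&cpu_nfb)
            mwait_idle_probe()                     /* vendor/model/CPUID checks */
            cpu_nfb.notifier_call = mwait_idle_cpu_init
            mwait_idle_cpu_init(CPU_UP_PREPARE)    /* allocates CPU0's power data */
            pm_idle_save = pm_idle; pm_idle = mwait_idle
        cpu_nfb.notifier_call(CPU_ONLINE, cpu0)    /* fills in CPU0's C-states */
        register_cpu_notifier(&cpu_nfb)            /* covers secondary CPUs */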
diff --git a/xen/arch/x86/cpu/Makefile b/xen/arch/x86/cpu/Makefile
index f0eeba85b4..98f283682b 100644
--- a/xen/arch/x86/cpu/Makefile
+++ b/xen/arch/x86/cpu/Makefile
@@ -5,6 +5,7 @@ obj-y += amd.o
obj-y += common.o
obj-y += intel.o
obj-y += intel_cacheinfo.o
+obj-y += mwait-idle.o
# Keeping around for VIA support (JBeulich)
# obj-$(x86_32) += centaur.o
diff --git a/xen/arch/x86/cpu/mwait-idle.c b/xen/arch/x86/cpu/mwait-idle.c
new file mode 100644
index 0000000000..5c10fd206d
--- /dev/null
+++ b/xen/arch/x86/cpu/mwait-idle.c
@@ -0,0 +1,513 @@
+/*
+ * mwait-idle.c - native hardware idle loop for modern processors
+ *
+ * Copyright (c) 2010, Intel Corporation.
+ * Len Brown <len.brown@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * mwait_idle is a cpuidle driver that loads on specific processors
+ * in lieu of the legacy ACPI processor_idle driver. The intent is to
+ * make Xen more efficient on these processors, as mwait_idle knows
+ * more than ACPI, as well as make Xen more immune to ACPI BIOS bugs.
+ */
+
+/*
+ * Design Assumptions
+ *
+ * All CPUs have same idle states as boot CPU
+ *
+ * Chipset BM_STS (bus master status) bit is a NOP
+ * for preventing entry into deep C-states
+ */
+
+/*
+ * Known limitations
+ *
+ * The driver currently initializes each online CPU at load time.
+ * It is unaware of processors subsequently hot-added to the system.
+ * This means that if you boot with maxcpus=n and later online
+ * processors above n, those processors will use C1 only.
+ *
+ * ACPI has a .suspend hack to turn off deep C-states during suspend
+ * to avoid complications with the lapic timer workaround.
+ * Have not seen issues with suspend, but may need same workaround here.
+ */
+
+/* un-comment DEBUG to enable pr_debug() statements */
+#define DEBUG
+
+#include <xen/lib.h>
+#include <xen/cpu.h>
+#include <xen/init.h>
+#include <xen/softirq.h>
+#include <xen/trace.h>
+#include <asm/cpuidle.h>
+#include <asm/mwait.h>
+#include <asm/msr.h>
+#include <acpi/cpufreq/cpufreq.h>
+
+#define MWAIT_IDLE_VERSION "0.4"
+#undef PREFIX
+#define PREFIX "mwait-idle: "
+
+#ifdef DEBUG
+# define pr_debug(fmt...) printk(KERN_DEBUG fmt)
+#else
+# define pr_debug(fmt...)
+#endif
+
+static __initdata bool_t no_mwait_idle;
+invbool_param("mwait-idle", no_mwait_idle);
+
+static unsigned int mwait_substates;
+
+#define LAPIC_TIMER_ALWAYS_RELIABLE 0xFFFFFFFF
+/* Reliable LAPIC Timer States, bit 1 for C1 etc. Default to only C1. */
+static unsigned int lapic_timer_reliable_states = (1 << 1);
+
+struct idle_cpu {
+ const struct cpuidle_state *state_table;
+
+ /*
+ * Hardware C-state auto-demotion may not always be optimal.
+ * Indicate which enable bits to clear here.
+ */
+ unsigned long auto_demotion_disable_flags;
+};
+
+static const struct idle_cpu *icpu;
+
+static const struct cpuidle_state {
+ char name[16];
+ unsigned int flags;
+ unsigned int exit_latency; /* in us */
+ int power_usage; /* in mW */
+ unsigned int target_residency; /* in us */
+} *cpuidle_state_table;
+
+/*
+ * Set this flag for states where the HW flushes the TLB for us
+ * and so we don't need cross-calls to keep it consistent.
+ * If this flag is set, SW flushes the TLB, so even if the
+ * HW doesn't do the flushing, this flag is safe to use.
+ */
+#define CPUIDLE_FLAG_TLB_FLUSHED 0x10000
+
+/*
+ * States are indexed by the cstate number,
+ * which is also the index into the MWAIT hint array.
+ * Thus C0 is a dummy.
+ */
+static const struct cpuidle_state nehalem_cstates[MWAIT_MAX_NUM_CSTATES] = {
+ { /* MWAIT C0 */ },
+ { /* MWAIT C1 */
+ .name = "C1-NHM",
+ .exit_latency = 3,
+ .target_residency = 6,
+ },
+ { /* MWAIT C2 */
+ .name = "C3-NHM",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 20,
+ .target_residency = 80,
+ },
+ { /* MWAIT C3 */
+ .name = "C6-NHM",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 200,
+ .target_residency = 800,
+ }
+};
+
+static const struct cpuidle_state snb_cstates[MWAIT_MAX_NUM_CSTATES] = {
+ { /* MWAIT C0 */ },
+ { /* MWAIT C1 */
+ .name = "C1-SNB",
+ .exit_latency = 1,
+ .target_residency = 1,
+ },
+ { /* MWAIT C2 */
+ .name = "C3-SNB",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 80,
+ .target_residency = 211,
+ },
+ { /* MWAIT C3 */
+ .name = "C6-SNB",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 104,
+ .target_residency = 345,
+ },
+ { /* MWAIT C4 */
+ .name = "C7-SNB",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 109,
+ .target_residency = 345,
+ }
+};
+
+static const struct cpuidle_state ivb_cstates[MWAIT_MAX_NUM_CSTATES] = {
+ { /* MWAIT C0 */ },
+ { /* MWAIT C1 */
+ .name = "C1-IVB",
+ .exit_latency = 1,
+ .target_residency = 1,
+ },
+ { /* MWAIT C2 */
+ .name = "C3-IVB",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 59,
+ .target_residency = 156,
+ },
+ { /* MWAIT C3 */
+ .name = "C6-IVB",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 80,
+ .target_residency = 300,
+ },
+ { /* MWAIT C4 */
+ .name = "C7-IVB",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 87,
+ .target_residency = 300,
+ }
+};
+
+static const struct cpuidle_state atom_cstates[MWAIT_MAX_NUM_CSTATES] = {
+ { /* MWAIT C0 */ },
+ { /* MWAIT C1 */
+ .name = "C1-ATM",
+ .exit_latency = 1,
+ .target_residency = 4,
+ },
+ { /* MWAIT C2 */
+ .name = "C2-ATM",
+ .exit_latency = 20,
+ .target_residency = 80,
+ },
+ { /* MWAIT C3 */ },
+ { /* MWAIT C4 */
+ .name = "C4-ATM",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 100,
+ .target_residency = 400,
+ },
+ { /* MWAIT C5 */ },
+ { /* MWAIT C6 */
+ .name = "C6-ATM",
+ .flags = CPUIDLE_FLAG_TLB_FLUSHED,
+ .exit_latency = 140,
+ .target_residency = 560,
+ }
+};
+
+static u32 get_driver_data(unsigned int cstate)
+{
+ static const u32 driver_data[] = {
+ [1] /* MWAIT C1 */ = 0x00,
+ [2] /* MWAIT C2 */ = 0x10,
+ [3] /* MWAIT C3 */ = 0x20,
+ [4] /* MWAIT C4 */ = 0x30,
+ [5] /* MWAIT C5 */ = 0x40,
+ [6] /* MWAIT C6 */ = 0x52,
+ };
+
+ return driver_data[cstate < ARRAY_SIZE(driver_data) ? cstate : 0];
+}
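
The hint values above encode the C-state number minus one in bits 7:4 and a
sub-state in bits 3:0; mwait_idle() below relies on this to recover the
C-state from cx->address. A worked example for the C6 hint:

    eax = 0x52;                       /* MWAIT hint for C6, sub-state 2 */
    cstate = ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
                                      /* ((0x52 >> 4) & 0xf) + 1 == 6 */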
+
+static void mwait_idle(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct acpi_processor_power *power = processor_powers[cpu];
+ struct acpi_processor_cx *cx = NULL;
+ unsigned int eax, next_state, cstate;
+ u64 before, after;
+ u32 exp = 0, pred = 0, irq_traced[4] = { 0 };
+
+ if (max_cstate > 0 && power && !sched_has_urgent_vcpu() &&
+ (next_state = cpuidle_current_governor->select(power)) > 0) {
+ do {
+ cx = &power->states[next_state];
+ } while (cx->type > max_cstate && --next_state);
+ if (!next_state)
+ cx = NULL;
+ menu_get_trace_data(&exp, &pred);
+ }
+ if (!cx) {
+ if (pm_idle_save)
+ pm_idle_save();
+ else
+ safe_halt();
+ return;
+ }
+
+ cpufreq_dbs_timer_suspend();
+
+ sched_tick_suspend();
+ /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
+ process_pending_softirqs();
+
+ /* Interrupts must be disabled for C2 and higher transitions. */
+ local_irq_disable();
+
+ if (!cpu_is_haltable(cpu)) {
+ local_irq_enable();
+ sched_tick_resume();
+ cpufreq_dbs_timer_resume();
+ return;
+ }
+
+ power->last_state = cx;
+ eax = cx->address;
+ cstate = ((eax >> MWAIT_SUBSTATE_SIZE) & MWAIT_CSTATE_MASK) + 1;
+
+#if 0 /* XXX Can we/do we need to do something similar on Xen? */
+ /*
+ * leave_mm() to avoid costly and often unnecessary wakeups
+ * for flushing the user TLB's associated with the active mm.
+ */
+ if (cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_TLB_FLUSHED)
+ leave_mm(cpu);
+#endif
+
+ if (!(lapic_timer_reliable_states & (1 << cstate)))
+ lapic_timer_off();
+
+ before = cpuidle_get_tick();
+ TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, before, exp, pred);
+
+ if (cpu_is_haltable(cpu))
+ mwait_idle_with_hints(eax, MWAIT_ECX_INTERRUPT_BREAK);
+
+ after = cpuidle_get_tick();
+
+ cstate_restore_tsc();
+ trace_exit_reason(irq_traced);
+ TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, after,
+ irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
+
+ update_idle_stats(power, cx, before, after);
+ local_irq_enable();
+
+ if (!(lapic_timer_reliable_states & (1 << cstate)))
+ lapic_timer_on();
+
+ /* Now back in C0. */
+ power->last_state = &power->states[0];
+
+ sched_tick_resume();
+ cpufreq_dbs_timer_resume();
+
+ if (cpuidle_current_governor->reflect)
+ cpuidle_current_governor->reflect(power);
+}
+
+static void auto_demotion_disable(void *dummy)
+{
+ u64 msr_bits;
+
+ rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
+ msr_bits &= ~(icpu->auto_demotion_disable_flags);
+ wrmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr_bits);
+}
+
+static const struct idle_cpu idle_cpu_nehalem = {
+ .state_table = nehalem_cstates,
+ .auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
+};
+
+static const struct idle_cpu idle_cpu_atom = {
+ .state_table = atom_cstates,
+};
+
+static const struct idle_cpu idle_cpu_lincroft = {
+ .state_table = atom_cstates,
+ .auto_demotion_disable_flags = ATM_LNC_C6_AUTO_DEMOTE,
+};
+
+static const struct idle_cpu idle_cpu_snb = {
+ .state_table = snb_cstates,
+};
+
+static const struct idle_cpu idle_cpu_ivb = {
+ .state_table = ivb_cstates,
+};
+
+#define ICPU(model, cpu) { 6, model, &idle_cpu_##cpu }
+
+static struct intel_idle_id {
+ unsigned int family, model;
+ const struct idle_cpu *data;
+} intel_idle_ids[] __initdata = {
+ ICPU(0x1a, nehalem),
+ ICPU(0x1e, nehalem),
+ ICPU(0x1f, nehalem),
+ ICPU(0x25, nehalem),
+ ICPU(0x2c, nehalem),
+ ICPU(0x2e, nehalem),
+ ICPU(0x2f, nehalem),
+ ICPU(0x1c, atom),
+ ICPU(0x26, lincroft),
+ ICPU(0x2a, snb),
+ ICPU(0x2d, snb),
+ ICPU(0x3a, ivb),
+ {}
+};
+
+static int __init mwait_idle_probe(void)
+{
+ unsigned int eax, ebx, ecx;
+ const struct intel_idle_id *id;
+
+ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+ !boot_cpu_has(X86_FEATURE_MWAIT) ||
+ boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
+ return -ENODEV;
+
+ for (id = intel_idle_ids; id->family; ++id)
+ if (id->family == boot_cpu_data.x86 &&
+ id->model == boot_cpu_data.x86_model)
+ break;
+ if (!id->family) {
+ pr_debug(PREFIX "does not run on family %d model %d\n",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &mwait_substates);
+
+ if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
+ !(ecx & CPUID5_ECX_INTERRUPT_BREAK) ||
+ !mwait_substates)
+ return -ENODEV;
+
+ if (!max_cstate || no_mwait_idle) {
+ pr_debug(PREFIX "disabled\n");
+ return -EPERM;
+ }
+
+ pr_debug(PREFIX "MWAIT substates: %#x\n", mwait_substates);
+
+ icpu = id->data;
+ cpuidle_state_table = icpu->state_table;
+
+ if (boot_cpu_has(X86_FEATURE_ARAT))
+ lapic_timer_reliable_states = LAPIC_TIMER_ALWAYS_RELIABLE;
+
+ pr_debug(PREFIX "v" MWAIT_IDLE_VERSION " model %#x\n",
+ boot_cpu_data.x86_model);
+
+ pr_debug(PREFIX "lapic_timer_reliable_states %#x\n",
+ lapic_timer_reliable_states);
+ return 0;
+}
+
+static int mwait_idle_cpu_init(struct notifier_block *nfb,
+ unsigned long action, void *hcpu)
+{
+ unsigned int cpu = (unsigned long)hcpu, cstate;
+ struct acpi_processor_power *dev = processor_powers[cpu];
+
+ switch (action) {
+ default:
+ return NOTIFY_DONE;
+
+ case CPU_UP_PREPARE:
+ cpuidle_init_cpu(cpu);
+ return NOTIFY_DONE;
+
+ case CPU_ONLINE:
+ if (!dev)
+ return NOTIFY_DONE;
+ break;
+ }
+
+ dev->count = 1;
+
+ for (cstate = 1; cstate < MWAIT_MAX_NUM_CSTATES; ++cstate) {
+ unsigned int num_substates;
+ struct acpi_processor_cx *cx;
+
+ if (cstate > max_cstate) {
+ printk(PREFIX "max C-state %u reached\n", max_cstate);
+ break;
+ }
+
+ /* Does the state exist in CPUID.MWAIT? */
+ num_substates = (mwait_substates >> (cstate * 4))
+ & MWAIT_SUBSTATE_MASK;
+ if (!num_substates)
+ continue;
+ /* Is the state not enabled? */
+ if (!cpuidle_state_table[cstate].target_residency) {
+ /* does the driver not know about the state? */
+ if (!pm_idle_save && !*cpuidle_state_table[cstate].name)
+ pr_debug(PREFIX "unaware of family %#x model %#x MWAIT %u\n",
+ boot_cpu_data.x86,
+ boot_cpu_data.x86_model, cstate);
+ continue;
+ }
+
+ if (dev->count >= ACPI_PROCESSOR_MAX_POWER) {
+ printk(PREFIX "max C-state count of %u reached\n",
+ ACPI_PROCESSOR_MAX_POWER);
+ break;
+ }
+
+ if (cstate > 2 && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) {
+ if (pm_idle_save)
+ continue;
+ setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
+ }
+
+ cx = dev->states + dev->count;
+ cx->type = cstate;
+ cx->address = get_driver_data(cstate);
+ cx->entry_method = ACPI_CSTATE_EM_FFH;
+ cx->power = cpuidle_state_table[cstate].power_usage;
+ cx->latency = cpuidle_state_table[cstate].exit_latency;
+ cx->target_residency =
+ cpuidle_state_table[cstate].target_residency;
+
+ dev->count++;
+ }
+
+ if (icpu->auto_demotion_disable_flags)
+ on_selected_cpus(cpumask_of(cpu), auto_demotion_disable, NULL, 1);
+
+ return NOTIFY_DONE;
+}
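
To make the CPUID.MWAIT check above concrete: leaf 5's EDX packs a 4-bit
sub-state count per C-state, lowest nibble belonging to C0. With a
hypothetical mwait_substates value of 0x1120, the loop would keep MWAIT
C1..C3 and skip the rest:

    (0x1120 >> (1 * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK  /* == 2, C1 kept */
    (0x1120 >> (2 * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK  /* == 1, C2 kept */
    (0x1120 >> (3 * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK  /* == 1, C3 kept */
    (0x1120 >> (4 * MWAIT_SUBSTATE_SIZE)) & MWAIT_SUBSTATE_MASK  /* == 0, C4 skipped */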
+
+int __init mwait_idle_init(struct notifier_block *nfb)
+{
+ int err;
+
+ if (pm_idle_save)
+ return -ENODEV;
+
+ err = mwait_idle_probe();
+ if (!err) {
+ nfb->notifier_call = mwait_idle_cpu_init;
+ mwait_idle_cpu_init(nfb, CPU_UP_PREPARE, NULL);
+
+ pm_idle_save = pm_idle;
+ pm_idle = mwait_idle;
+ dead_idle = acpi_dead_idle;
+ }
+
+ return err;
+}
diff --git a/xen/include/asm-x86/cpuidle.h b/xen/include/asm-x86/cpuidle.h
new file mode 100644
index 0000000000..73edf90efd
--- /dev/null
+++ b/xen/include/asm-x86/cpuidle.h
@@ -0,0 +1,35 @@
+#ifndef __ASM_X86_CPUIDLE_H__
+#define __ASM_X86_CPUIDLE_H__
+
+#include <xen/cpuidle.h>
+#include <xen/notifier.h>
+#include <xen/sched.h>
+#include <xen/sched-if.h>
+
+extern struct acpi_processor_power *processor_powers[];
+
+extern void (*pm_idle_save)(void);
+
+extern void (*lapic_timer_off)(void);
+extern void (*lapic_timer_on)(void);
+
+extern uint64_t (*cpuidle_get_tick)(void);
+
+int mwait_idle_init(struct notifier_block *);
+int cpuidle_init_cpu(unsigned int cpu);
+void acpi_dead_idle(void);
+void trace_exit_reason(u32 *irq_traced);
+void update_idle_stats(struct acpi_processor_power *,
+ struct acpi_processor_cx *, uint64_t, uint64_t);
+
+/*
+ * A vcpu is urgent if it is polling an event channel.
+ *
+ * If an urgent vcpu exists, the CPU should not enter a deep C state.
+ */
+static inline int sched_has_urgent_vcpu(void)
+{
+ return atomic_read(&this_cpu(schedule_data).urgent_count);
+}
+
+#endif /* __ASM_X86_CPUIDLE_H__ */
diff --git a/xen/include/asm-x86/msr-index.h b/xen/include/asm-x86/msr-index.h
index ccb961bd5b..0d6c67b120 100644
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -36,6 +36,11 @@
#define MSR_IA32_PERFCTR1 0x000000c2
#define MSR_FSB_FREQ 0x000000cd
+#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
+#define NHM_C3_AUTO_DEMOTE (1UL << 25)
+#define NHM_C1_AUTO_DEMOTE (1UL << 26)
+#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
+
#define MSR_MTRRcap 0x000000fe
#define MSR_IA32_BBL_CR_CTL 0x00000119
diff --git a/xen/include/asm-x86/mwait.h b/xen/include/asm-x86/mwait.h
new file mode 100644
index 0000000000..3ad3d9ca1d
--- /dev/null
+++ b/xen/include/asm-x86/mwait.h
@@ -0,0 +1,17 @@
+#ifndef __ASM_X86_MWAIT_H__
+#define __ASM_X86_MWAIT_H__
+
+#define MWAIT_SUBSTATE_MASK 0xf
+#define MWAIT_CSTATE_MASK 0xf
+#define MWAIT_SUBSTATE_SIZE 4
+#define MWAIT_MAX_NUM_CSTATES 8
+
+#define CPUID_MWAIT_LEAF 5
+#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
+#define CPUID5_ECX_INTERRUPT_BREAK 0x2
+
+#define MWAIT_ECX_INTERRUPT_BREAK 0x1
+
+void mwait_idle_with_hints(unsigned int eax, unsigned int ecx);
+
+#endif /* __ASM_X86_MWAIT_H__ */
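
For reference, mwait_idle_with_hints() is implemented in cpu_idle.c; at its
core sits the MONITOR/MWAIT pair. A minimal sketch (hypothetical helper,
omitting the softirq polling and timer-deadline handling the real routine
performs):

    static inline void mwait_sketch(const void *monitor_addr,
                                    unsigned int eax, unsigned int ecx)
    {
        /* Arm the monitor on the cache line holding monitor_addr. */
        asm volatile ( "monitor" :: "a" (monitor_addr), "c" (0), "d" (0) );
        /* Idle in the hinted C-state until the line is written or, with
         * MWAIT_ECX_INTERRUPT_BREAK in ecx, an interrupt arrives. */
        asm volatile ( "mwait" :: "a" (eax), "c" (ecx) );
    }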