1 /*
2  * cpu_idle - xen idle state module derived from Linux
3  *            drivers/acpi/processor_idle.c &
4  *            arch/x86/kernel/acpi/cstate.c
5  *
6  *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
7  *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
8  *  Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
9  *  Copyright (C) 2004  Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
10  *                      - Added processor hotplug support
11  *  Copyright (C) 2005  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
12  *                      - Added support for C3 on SMP
13  *  Copyright (C) 2007, 2008 Intel Corporation
14  *
15  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16  *
17  *  This program is free software; you can redistribute it and/or modify
18  *  it under the terms of the GNU General Public License as published by
19  *  the Free Software Foundation; either version 2 of the License, or (at
20  *  your option) any later version.
21  *
22  *  This program is distributed in the hope that it will be useful, but
23  *  WITHOUT ANY WARRANTY; without even the implied warranty of
24  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
25  *  General Public License for more details.
26  *
27  *  You should have received a copy of the GNU General Public License along
28  *  with this program; If not, see <http://www.gnu.org/licenses/>.
29  *
30  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31  */
32 
33 #include <xen/errno.h>
34 #include <xen/lib.h>
35 #include <xen/types.h>
36 #include <xen/acpi.h>
37 #include <xen/smp.h>
38 #include <xen/guest_access.h>
39 #include <xen/keyhandler.h>
40 #include <xen/param.h>
41 #include <xen/trace.h>
42 #include <xen/irq.h>
43 #include <asm/cache.h>
44 #include <asm/io.h>
45 #include <asm/iocap.h>
46 #include <asm/hpet.h>
47 #include <asm/processor.h>
48 #include <xen/pmstat.h>
49 #include <xen/softirq.h>
50 #include <public/platform.h>
51 #include <public/sysctl.h>
52 #include <acpi/cpufreq/cpufreq.h>
53 #include <asm/apic.h>
54 #include <asm/cpuidle.h>
55 #include <asm/mwait.h>
56 #include <xen/notifier.h>
57 #include <xen/cpu.h>
58 #include <asm/spec_ctrl.h>
59 
60 /*#define DEBUG_PM_CX*/
61 
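/*
 * Helpers to read the hardware residency MSRs used below.  The counters are
 * assumed to accumulate in TSC ticks and are converted to nanoseconds via
 * tsc_ticks2ns(); which of them are valid depends on the CPU model (see
 * do_get_hw_residencies()).
 */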
62 #define GET_HW_RES_IN_NS(msr, val) \
63     do { rdmsrl(msr, val); val = tsc_ticks2ns(val); } while( 0 )
64 #define GET_MC6_RES(val)  GET_HW_RES_IN_NS(0x664, val)
65 #define GET_PC2_RES(val)  GET_HW_RES_IN_NS(0x60D, val) /* SNB onwards */
66 #define GET_PC3_RES(val)  GET_HW_RES_IN_NS(0x3F8, val)
67 #define GET_PC6_RES(val)  GET_HW_RES_IN_NS(0x3F9, val)
68 #define GET_PC7_RES(val)  GET_HW_RES_IN_NS(0x3FA, val)
69 #define GET_PC8_RES(val)  GET_HW_RES_IN_NS(0x630, val) /* some Haswells only */
70 #define GET_PC9_RES(val)  GET_HW_RES_IN_NS(0x631, val) /* some Haswells only */
71 #define GET_PC10_RES(val) GET_HW_RES_IN_NS(0x632, val) /* some Haswells only */
72 #define GET_CC1_RES(val)  GET_HW_RES_IN_NS(0x660, val)
73 #define GET_CC3_RES(val)  GET_HW_RES_IN_NS(0x3FC, val)
74 #define GET_CC6_RES(val)  GET_HW_RES_IN_NS(0x3FD, val)
75 #define GET_CC7_RES(val)  GET_HW_RES_IN_NS(0x3FE, val) /* SNB onwards */
76 #define PHI_CC6_RES(val)  GET_HW_RES_IN_NS(0x3FF, val) /* Xeon Phi only */
77 
78 static void lapic_timer_nop(void) { }
79 void (*__read_mostly lapic_timer_off)(void);
80 void (*__read_mostly lapic_timer_on)(void);
81 
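/*
 * Choose how timer interrupts are kept alive across deep C-states: with ARAT
 * the local APIC timer keeps running and nothing needs to be done; otherwise
 * fall back to HPET or PIT broadcast.  Returns false if no suitable mechanism
 * is available, in which case C2/C3 are rejected by check_cx().
 */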
82 bool lapic_timer_init(void)
83 {
84     if ( boot_cpu_has(X86_FEATURE_ARAT) )
85     {
86         lapic_timer_off = lapic_timer_nop;
87         lapic_timer_on = lapic_timer_nop;
88     }
89     else if ( hpet_broadcast_is_available() )
90     {
91         lapic_timer_off = hpet_broadcast_enter;
92         lapic_timer_on = hpet_broadcast_exit;
93     }
94     else if ( pit_broadcast_is_available() )
95     {
96         lapic_timer_off = pit_broadcast_enter;
97         lapic_timer_on = pit_broadcast_exit;
98     }
99     else
100         return false;
101 
102     return true;
103 }
104 
105 void (*__read_mostly pm_idle_save)(void);
106 unsigned int max_cstate __read_mostly = UINT_MAX;
107 unsigned int max_csubstate __read_mostly = UINT_MAX;
108 
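/*
 * Handler for the "max_cstate" command line option.  It accepts
 * "<state>[,<substate>]"; e.g. "max_cstate=3,0" (example values) would cap
 * the C-state at C3 and the sub-state at 0.  Both default to unlimited.
 */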
109 static int __init parse_cstate(const char *s)
110 {
111     max_cstate = simple_strtoul(s, &s, 0);
112     if ( *s == ',' )
113         max_csubstate = simple_strtoul(s + 1, NULL, 0);
114     return 0;
115 }
116 custom_param("max_cstate", parse_cstate);
117 
118 static bool __read_mostly local_apic_timer_c2_ok;
119 boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok);
120 
121 struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
122 
123 /*
124  * This field starts out as zero, and can be set to -1 to signal that the
125  * vendor specific logic has been tried and failed (and shouldn't be tried
126  * again), or to +1 to ignore Dom0 side uploads of C-state ACPI data.
127  */
128 static int8_t __read_mostly vendor_override;
129 
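/*
 * Raw hardware residency values (in ns) for module (MC), package (PC) and
 * core (CC) C-states, as far as the current CPU model exposes them.
 */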
130 struct hw_residencies
131 {
132     uint64_t mc0;
133     uint64_t mc6;
134     uint64_t pc2;
135     uint64_t pc3;
136     uint64_t pc4;
137     uint64_t pc6;
138     uint64_t pc7;
139     uint64_t pc8;
140     uint64_t pc9;
141     uint64_t pc10;
142     uint64_t cc1;
143     uint64_t cc3;
144     uint64_t cc6;
145     uint64_t cc7;
146 };
147 
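/*
 * Read the residency MSRs known to be valid for the current CPU model.
 * Must run on the CPU being queried (see get_hw_residencies()); counters a
 * model doesn't provide are left at zero.
 */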
148 static void do_get_hw_residencies(void *arg)
149 {
150     struct cpuinfo_x86 *c = &current_cpu_data;
151     struct hw_residencies *hw_res = arg;
152 
153     if ( c->x86_vendor != X86_VENDOR_INTEL || c->x86 != 6 )
154         return;
155 
156     switch ( c->x86_model )
157     {
158     /* 4th generation Intel Core (Haswell) */
159     case 0x45:
160         GET_PC8_RES(hw_res->pc8);
161         GET_PC9_RES(hw_res->pc9);
162         GET_PC10_RES(hw_res->pc10);
163         /* fall through */
164     /* Sandy bridge */
165     case 0x2A:
166     case 0x2D:
167     /* Ivy bridge */
168     case 0x3A:
169     case 0x3E:
170     /* Haswell */
171     case 0x3C:
172     case 0x3F:
173     case 0x46:
174     /* Broadwell */
175     case 0x3D:
176     case 0x47:
177     case 0x4F:
178     case 0x56:
179     /* Skylake */
180     case 0x4E:
181     case 0x55:
182     case 0x5E:
183     /* Ice Lake */
184     case 0x7D:
185     case 0x7E:
186     /* Kaby Lake */
187     case 0x8E:
188     case 0x9E:
189     /* Comet Lake */
190     case 0xA5:
191     case 0xA6:
192         GET_PC2_RES(hw_res->pc2);
193         GET_CC7_RES(hw_res->cc7);
194         /* fall through */
195     /* Nehalem */
196     case 0x1A:
197     case 0x1E:
198     case 0x1F:
199     case 0x2E:
200     /* Westmere */
201     case 0x25:
202     case 0x2C:
203     case 0x2F:
204         GET_PC3_RES(hw_res->pc3);
205         GET_PC6_RES(hw_res->pc6);
206         GET_PC7_RES(hw_res->pc7);
207         GET_CC3_RES(hw_res->cc3);
208         GET_CC6_RES(hw_res->cc6);
209         break;
210     /* Cannon Lake */
211     case 0x66:
212         GET_PC2_RES(hw_res->pc2);
213         GET_PC3_RES(hw_res->pc3);
214         GET_PC6_RES(hw_res->pc6);
215         GET_PC7_RES(hw_res->pc7);
216         GET_CC1_RES(hw_res->cc1);
217         GET_CC6_RES(hw_res->cc6);
218         GET_CC7_RES(hw_res->cc7);
219         break;
220     /* Xeon Phi Knights Landing */
221     case 0x57:
222     /* Xeon Phi Knights Mill */
223     case 0x85:
224         GET_CC3_RES(hw_res->mc0); /* abusing GET_CC3_RES */
225         GET_CC6_RES(hw_res->mc6); /* abusing GET_CC6_RES */
226         GET_PC2_RES(hw_res->pc2);
227         GET_PC3_RES(hw_res->pc3);
228         GET_PC6_RES(hw_res->pc6);
229         GET_PC7_RES(hw_res->pc7);
230         PHI_CC6_RES(hw_res->cc6);
231         break;
232     /* various Atoms */
233     case 0x27:
234         GET_PC3_RES(hw_res->pc2); /* abusing GET_PC3_RES */
235         GET_PC6_RES(hw_res->pc4); /* abusing GET_PC6_RES */
236         GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
237         break;
238     /* Silvermont */
239     case 0x37:
240     case 0x4A:
241     case 0x4D:
242     case 0x5A:
243     case 0x5D:
244     /* Airmont */
245     case 0x4C:
246         GET_MC6_RES(hw_res->mc6);
247         GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
248         GET_CC1_RES(hw_res->cc1);
249         GET_CC6_RES(hw_res->cc6);
250         break;
251     /* Goldmont */
252     case 0x5C:
253     case 0x5F:
254     /* Goldmont Plus */
255     case 0x7A:
256     /* Tremont */
257     case 0x86:
258         GET_PC2_RES(hw_res->pc2);
259         GET_PC3_RES(hw_res->pc3);
260         GET_PC6_RES(hw_res->pc6);
261         GET_PC10_RES(hw_res->pc10);
262         GET_CC1_RES(hw_res->cc1);
263         GET_CC3_RES(hw_res->cc3);
264         GET_CC6_RES(hw_res->cc6);
265         break;
266     }
267 }
268 
269 static void get_hw_residencies(uint32_t cpu, struct hw_residencies *hw_res)
270 {
271     memset(hw_res, 0, sizeof(*hw_res));
272 
273     if ( smp_processor_id() == cpu )
274         do_get_hw_residencies(hw_res);
275     else
276         on_selected_cpus(cpumask_of(cpu), do_get_hw_residencies, hw_res, 1);
277 }
278 
279 static void print_hw_residencies(uint32_t cpu)
280 {
281     struct hw_residencies hw_res;
282 
283     get_hw_residencies(cpu, &hw_res);
284 
285     if ( hw_res.mc0 | hw_res.mc6 )
286         printk("MC0[%"PRIu64"] MC6[%"PRIu64"]\n",
287                hw_res.mc0, hw_res.mc6);
288     printk("PC2[%"PRIu64"] PC%d[%"PRIu64"] PC6[%"PRIu64"] PC7[%"PRIu64"]\n",
289            hw_res.pc2,
290            hw_res.pc4 ? 4 : 3, hw_res.pc4 ?: hw_res.pc3,
291            hw_res.pc6, hw_res.pc7);
292     if ( hw_res.pc8 | hw_res.pc9 | hw_res.pc10 )
293         printk("PC8[%"PRIu64"] PC9[%"PRIu64"] PC10[%"PRIu64"]\n",
294                hw_res.pc8, hw_res.pc9, hw_res.pc10);
295     printk("CC%d[%"PRIu64"] CC6[%"PRIu64"] CC7[%"PRIu64"]\n",
296            hw_res.cc1 ? 1 : 3, hw_res.cc1 ?: hw_res.cc3,
297            hw_res.cc6, hw_res.cc7);
298 }
299 
300 static const char *const acpi_cstate_method_name[] =
301 {
302     "NONE",
303     "SYSIO",
304     "FFH",
305     "HALT"
306 };
307 
308 static uint64_t get_stime_tick(void) { return (uint64_t)NOW(); }
309 static uint64_t stime_ticks_elapsed(uint64_t t1, uint64_t t2) { return t2 - t1; }
310 static uint64_t stime_tick_to_ns(uint64_t ticks) { return ticks; }
311 
312 static uint64_t get_acpi_pm_tick(void) { return (uint64_t)inl(pmtmr_ioport); }
313 static uint64_t acpi_pm_ticks_elapsed(uint64_t t1, uint64_t t2)
314 {
315     if ( t2 >= t1 )
316         return (t2 - t1);
317     else if ( !(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER) )
318         return (((0x00FFFFFF - t1) + t2 + 1) & 0x00FFFFFF);
319     else
320         return ((0xFFFFFFFF - t1) + t2 + 1);
321 }
322 
323 uint64_t (*__read_mostly cpuidle_get_tick)(void);
324 static uint64_t (*__read_mostly tick_to_ns)(uint64_t);
325 static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t);
326 
327 static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
328 {
329     uint64_t idle_res = 0, idle_usage = 0;
330     uint64_t last_state_update_tick, current_tick, current_stime;
331     uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
332     uint64_t res_tick[ACPI_PROCESSOR_MAX_POWER] = { 0 };
333     unsigned int i;
334     signed int last_state_idx;
335 
336     printk("==cpu%d==\n", cpu);
337     last_state_idx = power->last_state ? power->last_state->idx : -1;
338 
339     spin_lock_irq(&power->stat_lock);
340     current_tick = cpuidle_get_tick();
341     current_stime = NOW();
342     for ( i = 1; i < power->count; i++ )
343     {
344         res_tick[i] = power->states[i].time;
345         usage[i] = power->states[i].usage;
346     }
347     last_state_update_tick = power->last_state_update_tick;
348     spin_unlock_irq(&power->stat_lock);
349 
350     if ( last_state_idx >= 0 )
351     {
352         res_tick[last_state_idx] += ticks_elapsed(last_state_update_tick,
353                                                   current_tick);
354         usage[last_state_idx]++;
355     }
356 
357     for ( i = 1; i < power->count; i++ )
358     {
359         idle_usage += usage[i];
360         idle_res += tick_to_ns(res_tick[i]);
361 
362         printk("   %cC%u:\ttype[C%d] latency[%3u] usage[%8"PRIu64"] method[%5s] duration[%"PRIu64"]\n",
363                (last_state_idx == i) ? '*' : ' ', i,
364                power->states[i].type, power->states[i].latency, usage[i],
365                acpi_cstate_method_name[power->states[i].entry_method],
366                tick_to_ns(res_tick[i]));
367     }
368     printk("   %cC0:\tusage[%8"PRIu64"] duration[%"PRIu64"]\n",
369            (last_state_idx == 0) ? '*' : ' ',
370            usage[0] + idle_usage, current_stime - idle_res);
371 
372     print_hw_residencies(cpu);
373 }
374 
375 static void dump_cx(unsigned char key)
376 {
377     unsigned int cpu;
378 
379     printk("'%c' pressed -> printing ACPI Cx structures\n", key);
380     if ( max_cstate < UINT_MAX )
381     {
382         printk("max state: C%u\n", max_cstate);
383         if ( max_csubstate < UINT_MAX )
384             printk("max sub-state: %u\n", max_csubstate);
385         else
386             printk("max sub-state: unlimited\n");
387     }
388     else
389         printk("max state: unlimited\n");
390     for_each_present_cpu ( cpu )
391     {
392         struct acpi_processor_power *power = processor_powers[cpu];
393 
394         if ( !power )
395             continue;
396 
397         if ( cpu_online(cpu) )
398             print_acpi_power(cpu, power);
399         else if ( park_offline_cpus )
400             printk("CPU%u parked in state %u (C%u)\n", cpu,
401                    power->last_state ? power->last_state->idx : 1,
402                    power->last_state ? power->last_state->type : 1);
403 
404         process_pending_softirqs();
405     }
406 }
407 
408 static int __init cpu_idle_key_init(void)
409 {
410     register_keyhandler('c', dump_cx, "dump ACPI Cx structures", 1);
411     return 0;
412 }
413 __initcall(cpu_idle_key_init);
414 
415 /*
416  * The bit is set iff the CPU uses MONITOR/MWAIT to enter a C-state.
417  * With this flag set, the CPU can be woken from the C-state by writing
418  * to a specific memory address, instead of by sending an IPI.
419  */
420 static cpumask_t cpuidle_mwait_flags;
421 
422 void cpuidle_wakeup_mwait(cpumask_t *mask)
423 {
424     cpumask_t target;
425     unsigned int cpu;
426 
427     cpumask_and(&target, mask, &cpuidle_mwait_flags);
428 
429     /* CPU is MWAITing on the cpuidle_mwait_wakeup flag. */
430     for_each_cpu(cpu, &target)
431         mwait_wakeup(cpu) = 0;
432 
433     cpumask_andnot(mask, mask, &target);
434 }
435 
436 bool arch_skip_send_event_check(unsigned int cpu)
437 {
438     /*
439      * This relies on softirq_pending() and mwait_wakeup() to access data
440      * on the same cache line.
441      */
442     smp_mb();
443     return !!cpumask_test_cpu(cpu, &cpuidle_mwait_flags);
444 }
445 
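/*
 * Enter an MWAIT-based C-state.  The monitor is armed on the per-CPU
 * mwait_wakeup() flag, so besides interrupts (when MWAIT_ECX_INTERRUPT_BREAK
 * is passed in @ecx) the CPU can also be woken by cpuidle_wakeup_mwait()
 * writing that flag, e.g. when its timer deadline passes.
 */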
446 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
447 {
448     unsigned int cpu = smp_processor_id();
449     s_time_t expires = per_cpu(timer_deadline, cpu);
450     const void *monitor_addr = &mwait_wakeup(cpu);
451 
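    /*
     * CPUs with the CLFLUSH monitor erratum (cf. the AAI65 note in
     * acpi_dead_idle() below) need the monitored cache line flushed before
     * the monitor is armed.
     */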
452     if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
453     {
454         mb();
455         clflush(monitor_addr);
456         mb();
457     }
458 
459     __monitor(monitor_addr, 0, 0);
460     smp_mb();
461 
462     /*
463      * Timer deadline passing is the event on which we will be woken via
464      * cpuidle_mwait_wakeup. So check it now that the location is armed.
465      */
466     if ( (expires > NOW() || expires == 0) && !softirq_pending(cpu) )
467     {
468         struct cpu_info *info = get_cpu_info();
469 
470         cpumask_set_cpu(cpu, &cpuidle_mwait_flags);
471 
472         spec_ctrl_enter_idle(info);
473         __mwait(eax, ecx);
474         spec_ctrl_exit_idle(info);
475 
476         cpumask_clear_cpu(cpu, &cpuidle_mwait_flags);
477     }
478 
479     if ( expires <= NOW() && expires > 0 )
480         raise_softirq(TIMER_SOFTIRQ);
481 }
482 
483 static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
484 {
485     mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
486 }
487 
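/*
 * Actually enter the given C-state, using whichever entry method was
 * recorded for it: FFH (MWAIT), a SYSIO port read, or plain HLT.  Called
 * with interrupts disabled; speculation controls are relaxed around the
 * idle period via spec_ctrl_{enter,exit}_idle().
 */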
488 static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
489 {
490     struct cpu_info *info = get_cpu_info();
491 
492     switch ( cx->entry_method )
493     {
494     case ACPI_CSTATE_EM_FFH:
495         /* Call into architectural FFH based C-state */
496         acpi_processor_ffh_cstate_enter(cx);
497         return;
498     case ACPI_CSTATE_EM_SYSIO:
499         spec_ctrl_enter_idle(info);
500         /* IO port based C-state */
501         inb(cx->address);
502         /* Dummy wait op - must do something useless after the P_LVL2 read,
503            because chipsets cannot guarantee that the STPCLK# signal gets
504            asserted in time to freeze execution properly. */
505         inl(pmtmr_ioport);
506         spec_ctrl_exit_idle(info);
507         return;
508     case ACPI_CSTATE_EM_HALT:
509         spec_ctrl_enter_idle(info);
510         safe_halt();
511         spec_ctrl_exit_idle(info);
512         local_irq_disable();
513         return;
514     }
515 }
516 
517 static int acpi_idle_bm_check(void)
518 {
519     u32 bm_status = 0;
520 
521     acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
522     if ( bm_status )
523         acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
524     /*
525      * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
526      * the true state of bus mastering activity; forcing us to
527      * manually check the BMIDEA bit of each IDE channel.
528      */
529     return bm_status;
530 }
531 
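/*
 * Tracks how many online CPUs are in (or entering) C3, so that bus master
 * arbitration is only disabled (ARB_DIS) once the last CPU goes idle, and
 * re-enabled as soon as the first one wakes up again.
 */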
532 static struct {
533     spinlock_t lock;
534     unsigned int count;
535 } c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED };
536 
537 void trace_exit_reason(u32 *irq_traced)
538 {
539     if ( unlikely(tb_init_done) )
540     {
541         int i, curbit;
542         u32 irr_status[8] = { 0 };
543 
544         /* Get local apic IRR register */
545         for ( i = 0; i < 8; i++ )
546             irr_status[i] = apic_read(APIC_IRR + (i << 4));
547         i = 0;
548         curbit = find_first_bit((const unsigned long *)irr_status, 256);
549         while ( i < 4 && curbit < 256 )
550         {
551             irq_traced[i++] = curbit;
552             curbit = find_next_bit((const unsigned long *)irr_status, 256, curbit + 1);
553         }
554     }
555 }
556 
557 bool errata_c6_workaround(void)
558 {
559     static int8_t __read_mostly fix_needed = -1;
560 
561     if ( unlikely(fix_needed == -1) )
562     {
563 #define INTEL_FAM6_MODEL(m) { X86_VENDOR_INTEL, 6, m, X86_FEATURE_ALWAYS }
564         /*
565          * Errata AAJ72: EOI Transaction May Not be Sent if Software Enters
566          * Core C6 During an Interrupt Service Routine"
567          *
568          * There was an errata with some Core i7 processors that an EOI
569          * transaction may not be sent if software enters core C6 during an
570          * interrupt service routine. So we don't enter deep Cx state if
571          * there is an EOI pending.
572          */
573         static const struct x86_cpu_id eoi_errata[] = {
574             INTEL_FAM6_MODEL(0x1a),
575             INTEL_FAM6_MODEL(0x1e),
576             INTEL_FAM6_MODEL(0x1f),
577             INTEL_FAM6_MODEL(0x25),
578             INTEL_FAM6_MODEL(0x2c),
579             INTEL_FAM6_MODEL(0x2f),
580             { }
581         };
582         /*
583          * Errata BDX99, CLX30, SKX100, CFW125, BDF104, BDH85, BDM135, KWB131:
584          * A Pending Fixed Interrupt May Be Dispatched Before an Interrupt of
585          * The Same Priority Completes.
586          *
587          * Resuming from C6 Sleep-State, with Fixed Interrupts of the same
588          * priority queued (in the corresponding bits of the IRR and ISR APIC
589          * registers), the processor may dispatch the second interrupt (from
590          * the IRR bit) before the first interrupt has completed and written to
591          * the EOI register, causing the first interrupt to never complete.
592          *
593          * Note: Haswell hasn't had an erratum issued for this, but it is
594          * affected; the issue was first discovered on Haswell hardware.
595          */
596         static const struct x86_cpu_id isr_errata[] = {
597             /* Haswell */
598             INTEL_FAM6_MODEL(0x3c),
599             INTEL_FAM6_MODEL(0x3f),
600             INTEL_FAM6_MODEL(0x45),
601             INTEL_FAM6_MODEL(0x46),
602             /* Broadwell */
603             INTEL_FAM6_MODEL(0x47),
604             INTEL_FAM6_MODEL(0x3d),
605             INTEL_FAM6_MODEL(0x4f),
606             INTEL_FAM6_MODEL(0x56),
607             /* Skylake (client) */
608             INTEL_FAM6_MODEL(0x5e),
609             INTEL_FAM6_MODEL(0x4e),
610             /* {Sky/Cascade}lake (server) */
611             INTEL_FAM6_MODEL(0x55),
612             /* {Kaby/Coffee/Whiskey/Amber} Lake */
613             INTEL_FAM6_MODEL(0x9e),
614             INTEL_FAM6_MODEL(0x8e),
615             /* Cannon Lake */
616             INTEL_FAM6_MODEL(0x66),
617             { }
618         };
619 #undef INTEL_FAM6_MODEL
620 
621         fix_needed = cpu_has_apic &&
622                      ((!directed_eoi_enabled && x86_match_cpu(eoi_errata)) ||
623                       x86_match_cpu(isr_errata));
624     }
625 
626     return (fix_needed && cpu_has_pending_apic_eoi());
627 }
628 
629 void update_last_cx_stat(struct acpi_processor_power *power,
630                          struct acpi_processor_cx *cx, uint64_t ticks)
631 {
632     ASSERT(!local_irq_is_enabled());
633 
634     spin_lock(&power->stat_lock);
635     power->last_state = cx;
636     power->last_state_update_tick = ticks;
637     spin_unlock(&power->stat_lock);
638 }
639 
640 void update_idle_stats(struct acpi_processor_power *power,
641                        struct acpi_processor_cx *cx,
642                        uint64_t before, uint64_t after)
643 {
644     int64_t sleep_ticks = alternative_call(ticks_elapsed, before, after);
645     /* Interrupts are disabled */
646 
647     spin_lock(&power->stat_lock);
648 
649     cx->usage++;
650     if ( sleep_ticks > 0 )
651     {
652         power->last_residency = alternative_call(tick_to_ns, sleep_ticks) /
653                                 1000UL;
654         cx->time += sleep_ticks;
655     }
656     power->last_state = &power->states[0];
657     power->last_state_update_tick = after;
658 
659     spin_unlock(&power->stat_lock);
660 }
661 
662 static void acpi_processor_idle(void)
663 {
664     unsigned int cpu = smp_processor_id();
665     struct acpi_processor_power *power = processor_powers[cpu];
666     struct acpi_processor_cx *cx = NULL;
667     int next_state;
668     uint64_t t1, t2 = 0;
669     u32 exp = 0, pred = 0;
670     u32 irq_traced[4] = { 0 };
671 
672     if ( max_cstate > 0 && power &&
673          (next_state = cpuidle_current_governor->select(power)) > 0 )
674     {
675         unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
676                                                          : max_cstate;
677 
678         do {
679             cx = &power->states[next_state];
680         } while ( (cx->type > max_state ||
681                    cx->entry_method == ACPI_CSTATE_EM_NONE ||
682                    (cx->entry_method == ACPI_CSTATE_EM_FFH &&
683                     cx->type == max_cstate &&
684                     (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) &&
685                   --next_state );
686         if ( next_state )
687         {
688             if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
689                  acpi_idle_bm_check() )
690                 cx = power->safe_state;
691             if ( tb_init_done )
692                 menu_get_trace_data(&exp, &pred);
693         }
694         else
695             cx = NULL;
696     }
697     if ( !cx )
698     {
699         if ( pm_idle_save )
700             pm_idle_save();
701         else
702         {
703             struct cpu_info *info = get_cpu_info();
704 
705             spec_ctrl_enter_idle(info);
706             safe_halt();
707             spec_ctrl_exit_idle(info);
708         }
709         return;
710     }
711 
712     cpufreq_dbs_timer_suspend();
713 
714     rcu_idle_enter(cpu);
715     /* rcu_idle_enter() can raise TIMER_SOFTIRQ. Process it now. */
716     process_pending_softirqs();
717 
718     /*
719      * Interrupts must be disabled during bus mastering calculations and
720      * for C2/C3 transitions.
721      */
722     local_irq_disable();
723 
724     if ( !cpu_is_haltable(cpu) )
725     {
726         local_irq_enable();
727         rcu_idle_exit(cpu);
728         cpufreq_dbs_timer_resume();
729         return;
730     }
731 
732     if ( (cx->type >= ACPI_STATE_C3) && errata_c6_workaround() )
733         cx = power->safe_state;
734 
735 
736     /*
737      * Sleep:
738      * ------
739      * Invoke the current Cx state to put the processor to sleep.
740      */
741     switch ( cx->type )
742     {
743     case ACPI_STATE_C1:
744     case ACPI_STATE_C2:
745         if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
746         {
747             /* Get start time (ticks) */
748             t1 = alternative_call(cpuidle_get_tick);
749             /* Trace cpu idle entry */
750             TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
751 
752             update_last_cx_stat(power, cx, t1);
753 
754             /* Invoke C2 */
755             acpi_idle_do_entry(cx);
756             /* Get end time (ticks) */
757             t2 = alternative_call(cpuidle_get_tick);
758             trace_exit_reason(irq_traced);
759             /* Trace cpu idle exit */
760             TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
761                      irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
762             /* Update statistics */
763             update_idle_stats(power, cx, t1, t2);
764             /* Re-enable interrupts */
765             local_irq_enable();
766             break;
767         }
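        /* Fall through: C2 without a C2-safe LAPIC timer is handled like C3. */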
768 
769     case ACPI_STATE_C3:
770         /*
771          * Before invoking C3, be aware that the TSC/APIC timer may be
772          * stopped by hardware. Without careful handling of TSC/APIC stop
773          * issues, deep C-states can't work correctly.
774          */
775         /* preparing APIC stop */
776         lapic_timer_off();
777 
778         /* Get start time (ticks) */
779         t1 = alternative_call(cpuidle_get_tick);
780         /* Trace cpu idle entry */
781         TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
782 
783         update_last_cx_stat(power, cx, t1);
784 
785         /*
786          * Disable bus mastering:
787          * bm_check implies we need ARB_DIS
788          * !bm_check implies we need a cache flush
789          * bm_control indicates whether we can do ARB_DIS
790          *
791          * That leaves the case where bm_check is set and bm_control is
792          * not. There we cannot do much, so we enter C3 without doing
793          * anything.
794          */
795         if ( cx->type != ACPI_STATE_C3 )
796             /* nothing to be done here */;
797         else if ( power->flags.bm_check && power->flags.bm_control )
798         {
799             spin_lock(&c3_cpu_status.lock);
800             if ( ++c3_cpu_status.count == num_online_cpus() )
801             {
802                 /*
803                  * All CPUs are trying to go to C3
804                  * Disable bus master arbitration
805                  */
806                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
807             }
808             spin_unlock(&c3_cpu_status.lock);
809         }
810         else if ( !power->flags.bm_check )
811         {
812             /* SMP with no shared cache... Invalidate cache  */
813             ACPI_FLUSH_CPU_CACHE();
814         }
815 
816         /* Invoke C3 */
817         acpi_idle_do_entry(cx);
818 
819         if ( (cx->type == ACPI_STATE_C3) &&
820              power->flags.bm_check && power->flags.bm_control )
821         {
822             /* Enable bus master arbitration */
823             spin_lock(&c3_cpu_status.lock);
824             if ( c3_cpu_status.count-- == num_online_cpus() )
825                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
826             spin_unlock(&c3_cpu_status.lock);
827         }
828 
829         /* Get end time (ticks) */
830         t2 = alternative_call(cpuidle_get_tick);
831 
832         /* recovering TSC */
833         cstate_restore_tsc();
834         trace_exit_reason(irq_traced);
835         /* Trace cpu idle exit */
836         TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
837                  irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
838 
839         /* Update statistics */
840         update_idle_stats(power, cx, t1, t2);
841         /* Re-enable interrupts */
842         local_irq_enable();
843         /* recovering APIC */
844         lapic_timer_on();
845 
846         break;
847 
848     default:
849         /* Now in C0 */
850         power->last_state = &power->states[0];
851         local_irq_enable();
852         rcu_idle_exit(cpu);
853         cpufreq_dbs_timer_resume();
854         return;
855     }
856 
857     /* Now in C0 */
858     power->last_state = &power->states[0];
859 
860     rcu_idle_exit(cpu);
861     cpufreq_dbs_timer_resume();
862 
863     if ( cpuidle_current_governor->reflect )
864         cpuidle_current_governor->reflect(power);
865 }
866 
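/*
 * Idle loop for offlined (dead or parked) CPUs: put the CPU into the deepest
 * known C-state and never return.  Falls back to default_dead_idle() when no
 * suitable deep state is available.
 */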
867 void acpi_dead_idle(void)
868 {
869     struct acpi_processor_power *power;
870     struct acpi_processor_cx *cx;
871 
872     if ( (power = processor_powers[smp_processor_id()]) == NULL ||
873          power->count < 2 )
874         goto default_halt;
875 
876     cx = &power->states[power->count - 1];
877     power->last_state = cx;
878 
879     if ( cx->entry_method == ACPI_CSTATE_EM_FFH )
880     {
881         void *mwait_ptr = &mwait_wakeup(smp_processor_id());
882 
883         /*
884          * Cache must be flushed as the last operation before sleeping.
885          * Otherwise, the CPU may still hold dirty data, breaking cache
886          * coherency and leading to strange errors.
887          */
888         spec_ctrl_enter_idle(get_cpu_info());
889         wbinvd();
890 
891         while ( 1 )
892         {
893             /*
894              * 1. The CLFLUSH is a workaround for erratum AAI65 for
895              * the Xeon 7400 series.
896              * 2. The WBINVD is insufficient due to the spurious-wakeup
897              * case where we return around the loop.
898              * 3. Unlike WBINVD, CLFLUSH is a lightweight but non-serializing
899              * instruction, hence a memory fence is necessary to make sure all
900              * loads/stores are visible before flushing the cache line.
901              */
902             mb();
903             clflush(mwait_ptr);
904             __monitor(mwait_ptr, 0, 0);
905             mb();
906             __mwait(cx->address, 0);
907         }
908     }
909     else if ( (current_cpu_data.x86_vendor &
910                (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
911               cx->entry_method == ACPI_CSTATE_EM_SYSIO )
912     {
913         /* Intel prefers not to use SYSIO */
914 
915         /* Avoid references to shared data after the cache flush */
916         u32 address = cx->address;
917         u32 pmtmr_ioport_local = pmtmr_ioport;
918 
919         spec_ctrl_enter_idle(get_cpu_info());
920         wbinvd();
921 
922         while ( 1 )
923         {
924             inb(address);
925             inl(pmtmr_ioport_local);
926         }
927     }
928 
929 default_halt:
930     default_dead_idle();
931 }
932 
933 int cpuidle_init_cpu(unsigned int cpu)
934 {
935     struct acpi_processor_power *acpi_power;
936 
937     acpi_power = processor_powers[cpu];
938     if ( !acpi_power )
939     {
940         unsigned int i;
941 
942         if ( cpu == 0 && system_state < SYS_STATE_active )
943         {
944             if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
945             {
946                 cpuidle_get_tick = get_stime_tick;
947                 ticks_elapsed = stime_ticks_elapsed;
948                 tick_to_ns = stime_tick_to_ns;
949             }
950             else
951             {
952                 cpuidle_get_tick = get_acpi_pm_tick;
953                 ticks_elapsed = acpi_pm_ticks_elapsed;
954                 tick_to_ns = acpi_pm_tick_to_ns;
955             }
956         }
957 
958         acpi_power = xzalloc(struct acpi_processor_power);
959         if ( !acpi_power )
960             return -ENOMEM;
961 
962         for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
963             acpi_power->states[i].idx = i;
964 
965         acpi_power->cpu = cpu;
966 
967         spin_lock_init(&acpi_power->stat_lock);
968 
969         processor_powers[cpu] = acpi_power;
970     }
971 
972     acpi_power->count = 2;
973     acpi_power->states[1].type = ACPI_STATE_C1;
974     acpi_power->states[1].entry_method = ACPI_CSTATE_EM_HALT;
975     acpi_power->safe_state = &acpi_power->states[1];
976 
977     return 0;
978 }
979 
980 static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
981 {
982     struct cpuinfo_x86 *c = &current_cpu_data;
983     unsigned int eax, ebx, ecx, edx;
984     unsigned int edx_part;
985     unsigned int cstate_type; /* C-state type and not ACPI C-state type */
986     unsigned int num_cstate_subtype;
987     int ret = 0;
988     static unsigned long printed;
989 
990     if ( c->cpuid_level < CPUID_MWAIT_LEAF )
991     {
992         printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
993         return -EFAULT;
994     }
995 
996     cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
997     if ( opt_cpu_info )
998         printk(XENLOG_DEBUG "cpuid.MWAIT[eax=%x ebx=%x ecx=%x edx=%x]\n",
999                eax, ebx, ecx, edx);
1000 
1001     /* Check whether this particular cx_type (in CST) is supported or not */
1002     cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
1003     edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
1004     num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
1005 
1006     if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
1007         ret = -ERANGE;
1008     /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
1009     else if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
1010               !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
1011         ret = -ENODEV;
1012     else if ( opt_cpu_info || cx->type >= BITS_PER_LONG ||
1013               !test_and_set_bit(cx->type, &printed) )
1014         printk(XENLOG_INFO "Monitor-Mwait will be used to enter C%d state\n",
1015                cx->type);
1016     return ret;
1017 }
1018 
1019 /*
1020  * Initialize bm_flags based on the CPU cache properties.
1021  * On SMP it depends on the cache configuration:
1022  * - When the cache is not shared among all CPUs, we flush the cache
1023  *   before entering C3.
1024  * - When the cache is shared among all CPUs, we use the bm_check
1025  *   mechanism as in the UP case.
1026  *
1027  * This routine is called only after all the CPUs are online.
1028  */
1029 static void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags)
1030 {
1031     struct cpuinfo_x86 *c = &current_cpu_data;
1032 
1033     flags->bm_check = 0;
1034     if ( num_online_cpus() == 1 )
1035         flags->bm_check = 1;
1036     else if ( (c->x86_vendor == X86_VENDOR_INTEL) ||
1037               ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 0x15)) )
1038     {
1039         /*
1040          * Today all MP CPUs that support C3 share cache.
1041          * And caches should not be flushed by software while
1042          * entering C3 type state.
1043          */
1044         flags->bm_check = 1;
1045     }
1046 
1047     /*
1048      * On all recent platforms, ARB_DISABLE is a nop.
1049      * So, set bm_control to zero to indicate that ARB_DISABLE
1050      * is not required while entering C3 type state on
1051      * P4, Core and beyond CPUs
1052      */
1053     if ( c->x86_vendor == X86_VENDOR_INTEL &&
1054          (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)) )
1055         flags->bm_control = 0;
1056 }
1057 
1058 #define VENDOR_INTEL                   (1)
1059 #define NATIVE_CSTATE_BEYOND_HALT      (2)
1060 
1061 static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
1062 {
1063     static int bm_check_flag = -1;
1064     static int bm_control_flag = -1;
1065 
1066     switch ( cx->reg.space_id )
1067     {
1068     case ACPI_ADR_SPACE_SYSTEM_IO:
1069         if ( cx->reg.address == 0 )
1070             return -EINVAL;
1071         break;
1072 
1073     case ACPI_ADR_SPACE_FIXED_HARDWARE:
1074         if ( cx->reg.bit_width != VENDOR_INTEL ||
1075              cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
1076             return -EINVAL;
1077 
1078         /* Assume all logical CPUs have the same support for MWAIT. */
1079         if ( acpi_processor_ffh_cstate_probe(cx) )
1080             return -EINVAL;
1081         break;
1082 
1083     default:
1084         return -ENODEV;
1085     }
1086 
1087     switch ( cx->type )
1088     {
1089     case ACPI_STATE_C2:
1090         if ( local_apic_timer_c2_ok )
1091             break;
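        /* Fall through: without a C2-safe LAPIC timer, apply the C3 checks. */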
1092     case ACPI_STATE_C3:
1093         if ( !lapic_timer_init() )
1094             return -EINVAL;
1095 
1096         /* All the logic here assumes flags.bm_check is the same across all CPUs. */
1097         if ( bm_check_flag < 0 )
1098         {
1099             /* Determine whether bm_check is needed based on CPU  */
1100             acpi_processor_power_init_bm_check(&(power->flags));
1101         }
1102         else
1103         {
1104             power->flags.bm_check = bm_check_flag;
1105             power->flags.bm_control = bm_control_flag;
1106         }
1107 
1108         if ( power->flags.bm_check )
1109         {
1110             if ( !power->flags.bm_control )
1111             {
1112                 if ( power->flags.has_cst != 1 )
1113                 {
1114                     /* bus mastering control is necessary */
1115                     ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1116                         "C3 support requires BM control\n"));
1117                     return -EINVAL;
1118                 }
1119                 else
1120                 {
1121                     /* Here we enter C3 without bus mastering */
1122                     ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1123                         "C3 support without BM control\n"));
1124                 }
1125             }
1126             /*
1127              * On older chipsets, BM_RLD needs to be set in order for Bus
1128              * Master activity to wake the system from C3, hence
1129              * acpi_set_register() is always being called once below.  Newer
1130              * chipsets handle DMA during C3 automatically and BM_RLD is a
1131              * NOP.  In either case, the proper way to handle BM_RLD is to
1132              * set it and leave it set.
1133              */
1134         }
1135         else
1136         {
1137             /*
1138              * The WBINVD flag should be set in the FADT for the C3 state
1139              * to be supported when bm_check is not required.
1140              */
1141             if ( !(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD) )
1142             {
1143                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1144                           "Cache invalidation should work properly"
1145                           " for C3 to be enabled on SMP systems\n"));
1146                 return -EINVAL;
1147             }
1148         }
1149 
1150         if ( bm_check_flag < 0 )
1151         {
1152             bm_check_flag = power->flags.bm_check;
1153             bm_control_flag = power->flags.bm_control;
1154             acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, bm_check_flag);
1155         }
1156 
1157         break;
1158     }
1159 
1160     return 0;
1161 }
1162 
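/*
 * A state's target residency is derived as latency * latency_factor (see
 * set_cx() and amd_cpuidle_init()), so raising "idle_latency_factor" on the
 * command line demands longer predicted idle periods before the governor
 * picks a deep state.
 */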
1163 static unsigned int latency_factor = 2;
1164 integer_param("idle_latency_factor", latency_factor);
1165 
1166 static void set_cx(
1167     struct acpi_processor_power *acpi_power,
1168     xen_processor_cx_t *xen_cx)
1169 {
1170     struct acpi_processor_cx *cx;
1171 
1172     if ( check_cx(acpi_power, xen_cx) != 0 )
1173         return;
1174 
1175     switch ( xen_cx->type )
1176     {
1177     case ACPI_STATE_C1:
1178         cx = &acpi_power->states[1];
1179         break;
1180     default:
1181         if ( acpi_power->count >= ACPI_PROCESSOR_MAX_POWER )
1182         {
1183     case ACPI_STATE_C0:
1184             printk(XENLOG_WARNING "CPU%u: C%d data ignored\n",
1185                    acpi_power->cpu, xen_cx->type);
1186             return;
1187         }
1188         cx = &acpi_power->states[acpi_power->count];
1189         cx->type = xen_cx->type;
1190         break;
1191     }
1192 
1193     cx->address = xen_cx->reg.address;
1194 
1195     switch ( xen_cx->reg.space_id )
1196     {
1197     case ACPI_ADR_SPACE_FIXED_HARDWARE:
1198         if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
1199              xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT &&
1200              boot_cpu_has(X86_FEATURE_MONITOR) )
1201             cx->entry_method = ACPI_CSTATE_EM_FFH;
1202         else
1203             cx->entry_method = ACPI_CSTATE_EM_HALT;
1204         break;
1205     case ACPI_ADR_SPACE_SYSTEM_IO:
1206         if ( ioports_deny_access(hardware_domain, cx->address, cx->address) )
1207             printk(XENLOG_WARNING "Could not deny access to port %04x\n",
1208                    cx->address);
1209         cx->entry_method = ACPI_CSTATE_EM_SYSIO;
1210         break;
1211     default:
1212         cx->entry_method = ACPI_CSTATE_EM_NONE;
1213         break;
1214     }
1215 
1216     cx->latency = xen_cx->latency;
1217     cx->target_residency = cx->latency * latency_factor;
1218 
1219     smp_wmb();
1220     acpi_power->count += (cx->type != ACPI_STATE_C1);
1221     if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 )
1222         acpi_power->safe_state = cx;
1223 }
1224 
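/*
 * Translate an ACPI processor ID into a Xen CPU number via the MADT-derived
 * ACPI-ID -> APIC-ID -> CPU mappings.  Returns -1 if no such CPU exists.
 */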
1225 int get_cpu_id(u32 acpi_id)
1226 {
1227     int i;
1228     u32 apic_id;
1229 
1230     if ( acpi_id >= MAX_MADT_ENTRIES )
1231         return -1;
1232 
1233     apic_id = x86_acpiid_to_apicid[acpi_id];
1234     if ( apic_id == BAD_APICID )
1235         return -1;
1236 
1237     for ( i = 0; i < nr_cpu_ids; i++ )
1238     {
1239         if ( apic_id == x86_cpu_to_apicid[i] )
1240             return i;
1241     }
1242 
1243     return -1;
1244 }
1245 
1246 #ifdef DEBUG_PM_CX
1247 static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
1248 {
1249     XEN_GUEST_HANDLE(xen_processor_cx_t) states;
1250     xen_processor_cx_t  state;
1251     XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
1252     xen_processor_csd_t dp;
1253     uint32_t i;
1254 
1255     printk("cpu%d cx acpi info:\n", cpu);
1256     printk("\tcount = %d\n", power->count);
1257     printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
1258            "\t       pwr_setup_done[%d], bm_rld_set[%d]\n",
1259            power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
1260            power->flags.power_setup_done, power->flags.bm_rld_set);
1261 
1262     states = power->states;
1263 
1264     for ( i = 0; i < power->count; i++ )
1265     {
1266         if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) )
1267             return;
1268 
1269         printk("\tstates[%d]:\n", i);
1270         printk("\t\treg.space_id = %#x\n", state.reg.space_id);
1271         printk("\t\treg.bit_width = %#x\n", state.reg.bit_width);
1272         printk("\t\treg.bit_offset = %#x\n", state.reg.bit_offset);
1273         printk("\t\treg.access_size = %#x\n", state.reg.access_size);
1274         printk("\t\treg.address = %#"PRIx64"\n", state.reg.address);
1275         printk("\t\ttype    = %d\n", state.type);
1276         printk("\t\tlatency = %d\n", state.latency);
1277         printk("\t\tpower   = %d\n", state.power);
1278 
1279         csd = state.dp;
1280         printk("\t\tdp(@0x%p)\n", csd.p);
1281 
1282         if ( csd.p != NULL )
1283         {
1284             if ( unlikely(copy_from_guest(&dp, csd, 1)) )
1285                 return;
1286             printk("\t\t\tdomain = %d\n", dp.domain);
1287             printk("\t\t\tcoord_type   = %d\n", dp.coord_type);
1288             printk("\t\t\tnum = %d\n", dp.num);
1289         }
1290     }
1291 }
1292 #else
1293 #define print_cx_pminfo(c, p)
1294 #endif
1295 
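/*
 * Install the C-state (_CST) data uploaded by Dom0 for the CPU with the
 * given ACPI ID.  Once data for CPU0 has arrived, the idle handlers are
 * switched over to acpi_processor_idle() / acpi_dead_idle(); a parked CPU
 * is kicked with an NMI so it can re-enter a possibly deeper sleep state.
 */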
1296 long set_cx_pminfo(uint32_t acpi_id, struct xen_processor_power *power)
1297 {
1298     XEN_GUEST_HANDLE(xen_processor_cx_t) states;
1299     xen_processor_cx_t xen_cx;
1300     struct acpi_processor_power *acpi_power;
1301     int cpu_id, i, ret;
1302 
1303     if ( unlikely(!guest_handle_okay(power->states, power->count)) )
1304         return -EFAULT;
1305 
1306     if ( pm_idle_save && pm_idle != acpi_processor_idle )
1307         return 0;
1308 
1309     if ( vendor_override > 0 )
1310         return 0;
1311 
1312     print_cx_pminfo(acpi_id, power);
1313 
1314     cpu_id = get_cpu_id(acpi_id);
1315     if ( cpu_id == -1 )
1316     {
1317         static bool warn_once = true;
1318 
1319         if ( warn_once || opt_cpu_info )
1320             printk(XENLOG_WARNING "No CPU for ACPI ID %#x\n", acpi_id);
1321         warn_once = false;
1322         return -EINVAL;
1323     }
1324 
1325     ret = cpuidle_init_cpu(cpu_id);
1326     if ( ret < 0 )
1327         return ret;
1328 
1329     acpi_power = processor_powers[cpu_id];
1330     acpi_power->flags.bm_check = power->flags.bm_check;
1331     acpi_power->flags.bm_control = power->flags.bm_control;
1332     acpi_power->flags.has_cst = power->flags.has_cst;
1333 
1334     states = power->states;
1335     for ( i = 0; i < power->count; i++ )
1336     {
1337         if ( unlikely(copy_from_guest_offset(&xen_cx, states, i, 1)) )
1338             return -EFAULT;
1339 
1340         set_cx(acpi_power, &xen_cx);
1341     }
1342 
1343     if ( !cpu_online(cpu_id) )
1344     {
1345         uint32_t apic_id = x86_cpu_to_apicid[cpu_id];
1346 
1347         /*
1348          * If we've just learned of more available C states, wake the CPU if
1349          * it's parked, so it can go back to sleep in perhaps a deeper state.
1350          */
1351         if ( park_offline_cpus && apic_id != BAD_APICID )
1352         {
1353             unsigned long flags;
1354 
1355             local_irq_save(flags);
1356             apic_wait_icr_idle();
1357             apic_icr_write(APIC_DM_NMI | APIC_DEST_PHYSICAL, apic_id);
1358             local_irq_restore(flags);
1359         }
1360     }
1361     else if ( cpuidle_current_governor->enable )
1362     {
1363         ret = cpuidle_current_governor->enable(acpi_power);
1364         if ( ret < 0 )
1365             return ret;
1366     }
1367 
1368     /* FIXME: C-state dependency is not supported so far. */
1369 
1370     if ( cpu_id == 0 )
1371     {
1372         if ( pm_idle_save == NULL )
1373         {
1374             pm_idle_save = pm_idle;
1375             pm_idle = acpi_processor_idle;
1376         }
1377 
1378         dead_idle = acpi_dead_idle;
1379     }
1380 
1381     return 0;
1382 }
1383 
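/*
 * Vendor-specific C-state setup for AMD/Hygon, which don't rely on Dom0
 * uploading _CST data: install a small static table - C1 via MWAIT on
 * Fam17h and later (when the required MWAIT extensions are present) and
 * C2 via HLT - and take over the idle handlers on first success.
 */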
1384 static void amd_cpuidle_init(struct acpi_processor_power *power)
1385 {
1386     unsigned int i, nr = 0;
1387     const struct cpuinfo_x86 *c = &current_cpu_data;
1388     const unsigned int ecx_req = CPUID5_ECX_EXTENSIONS_SUPPORTED |
1389                                  CPUID5_ECX_INTERRUPT_BREAK;
1390     const struct acpi_processor_cx *cx = NULL;
1391     static const struct acpi_processor_cx fam17[] = {
1392         {
1393             .type = ACPI_STATE_C1,
1394             .entry_method = ACPI_CSTATE_EM_FFH,
1395             .latency = 1,
1396         },
1397         {
1398             .type = ACPI_STATE_C2,
1399             .entry_method = ACPI_CSTATE_EM_HALT,
1400             .latency = 400,
1401         },
1402     };
1403 
1404     if ( pm_idle_save && pm_idle != acpi_processor_idle )
1405         return;
1406 
1407     if ( vendor_override < 0 )
1408         return;
1409 
1410     switch ( c->x86 )
1411     {
1412     case 0x19:
1413     case 0x18:
1414         if ( boot_cpu_data.x86_vendor != X86_VENDOR_HYGON )
1415         {
1416     default:
1417             vendor_override = -1;
1418             return;
1419         }
1420         /* fall through */
1421     case 0x17:
1422         if ( cpu_has_monitor && c->cpuid_level >= CPUID_MWAIT_LEAF &&
1423              (cpuid_ecx(CPUID_MWAIT_LEAF) & ecx_req) == ecx_req )
1424         {
1425             cx = fam17;
1426             nr = ARRAY_SIZE(fam17);
1427             local_apic_timer_c2_ok = true;
1428             break;
1429         }
1430         /* fall through */
1431     case 0x15:
1432     case 0x16:
1433         cx = &fam17[1];
1434         nr = ARRAY_SIZE(fam17) - 1;
1435         break;
1436     }
1437 
1438     power->flags.has_cst = true;
1439 
1440     for ( i = 0; i < nr; ++i )
1441     {
1442         if ( cx[i].type > max_cstate )
1443             break;
1444         power->states[i + 1] = cx[i];
1445         power->states[i + 1].idx = i + 1;
1446         power->states[i + 1].target_residency = cx[i].latency * latency_factor;
1447     }
1448 
1449     if ( i )
1450     {
1451         power->count = i + 1;
1452         power->safe_state = &power->states[i];
1453 
1454         if ( !vendor_override )
1455         {
1456             if ( !boot_cpu_has(X86_FEATURE_ARAT) )
1457                 hpet_broadcast_init();
1458 
1459             if ( !lapic_timer_init() )
1460             {
1461                 vendor_override = -1;
1462                 cpuidle_init_cpu(power->cpu);
1463                 return;
1464             }
1465 
1466             if ( !pm_idle_save )
1467             {
1468                 pm_idle_save = pm_idle;
1469                 pm_idle = acpi_processor_idle;
1470             }
1471 
1472             dead_idle = acpi_dead_idle;
1473 
1474             vendor_override = 1;
1475         }
1476     }
1477     else
1478         vendor_override = -1;
1479 }
1480 
1481 uint32_t pmstat_get_cx_nr(uint32_t cpuid)
1482 {
1483     return processor_powers[cpuid] ? processor_powers[cpuid]->count : 0;
1484 }
1485 
1486 int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
1487 {
1488     struct acpi_processor_power *power = processor_powers[cpuid];
1489     uint64_t idle_usage = 0, idle_res = 0;
1490     uint64_t last_state_update_tick, current_stime, current_tick;
1491     uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
1492     uint64_t res[ACPI_PROCESSOR_MAX_POWER] = { 0 };
1493     unsigned int i, nr, nr_pc = 0, nr_cc = 0;
1494 
1495     if ( power == NULL )
1496     {
1497         stat->last = 0;
1498         stat->nr = 0;
1499         stat->idle_time = 0;
1500         stat->nr_pc = 0;
1501         stat->nr_cc = 0;
1502         return 0;
1503     }
1504 
1505     stat->idle_time = get_cpu_idle_time(cpuid);
1506     nr = min(stat->nr, power->count);
1507 
1508     /* Mimic the stats when detailed info hasn't been registered by dom0. */
1509     if ( pm_idle_save == NULL )
1510     {
1511         stat->nr = 2;
1512         stat->last = power->last_state ? power->last_state->idx : 0;
1513 
1514         usage[1] = idle_usage = 1;
1515         res[1] = idle_res = stat->idle_time;
1516 
1517         current_stime = NOW();
1518     }
1519     else
1520     {
1521         struct hw_residencies hw_res;
1522         signed int last_state_idx;
1523 
1524         stat->nr = power->count;
1525 
1526         spin_lock_irq(&power->stat_lock);
1527         current_tick = cpuidle_get_tick();
1528         current_stime = NOW();
1529         for ( i = 1; i < nr; i++ )
1530         {
1531             usage[i] = power->states[i].usage;
1532             res[i] = power->states[i].time;
1533         }
1534         last_state_update_tick = power->last_state_update_tick;
1535         last_state_idx = power->last_state ? power->last_state->idx : -1;
1536         spin_unlock_irq(&power->stat_lock);
1537 
1538         if ( last_state_idx >= 0 )
1539         {
1540             usage[last_state_idx]++;
1541             res[last_state_idx] += ticks_elapsed(last_state_update_tick,
1542                                                  current_tick);
1543             stat->last = last_state_idx;
1544         }
1545         else
1546             stat->last = 0;
1547 
1548         for ( i = 1; i < nr; i++ )
1549         {
1550             res[i] = tick_to_ns(res[i]);
1551             idle_usage += usage[i];
1552             idle_res += res[i];
1553         }
1554 
1555         get_hw_residencies(cpuid, &hw_res);
1556 
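        /*
         * Copy the hardware residency of package/core C-state <n> into the
         * guest buffer slot n-1 (if the guest provided room for it), and
         * record the highest state with a non-zero value in nr_pc / nr_cc.
         */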
1557 #define PUT_xC(what, n) do { \
1558         if ( stat->nr_##what >= n && \
1559              copy_to_guest_offset(stat->what, n - 1, &hw_res.what##n, 1) ) \
1560             return -EFAULT; \
1561         if ( hw_res.what##n ) \
1562             nr_##what = n; \
1563     } while ( 0 )
1564 #define PUT_PC(n) PUT_xC(pc, n)
1565         PUT_PC(2);
1566         PUT_PC(3);
1567         PUT_PC(4);
1568         PUT_PC(6);
1569         PUT_PC(7);
1570         PUT_PC(8);
1571         PUT_PC(9);
1572         PUT_PC(10);
1573 #undef PUT_PC
1574 #define PUT_CC(n) PUT_xC(cc, n)
1575         PUT_CC(1);
1576         PUT_CC(3);
1577         PUT_CC(6);
1578         PUT_CC(7);
1579 #undef PUT_CC
1580 #undef PUT_xC
1581     }
1582 
1583     usage[0] += idle_usage;
1584     res[0] = current_stime - idle_res;
1585 
1586     if ( copy_to_guest(stat->triggers, usage, nr) ||
1587          copy_to_guest(stat->residencies, res, nr) )
1588         return -EFAULT;
1589 
1590     stat->nr_pc = nr_pc;
1591     stat->nr_cc = nr_cc;
1592 
1593     return 0;
1594 }
1595 
1596 int pmstat_reset_cx_stat(uint32_t cpuid)
1597 {
1598     return 0;
1599 }
1600 
1601 void cpuidle_disable_deep_cstate(void)
1602 {
1603     if ( max_cstate > ACPI_STATE_C1 )
1604     {
1605         if ( local_apic_timer_c2_ok )
1606             max_cstate = ACPI_STATE_C2;
1607         else
1608             max_cstate = ACPI_STATE_C1;
1609     }
1610 
1611     hpet_disable_legacy_broadcast();
1612 }
1613 
1614 bool cpuidle_using_deep_cstate(void)
1615 {
1616     return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
1617                                                                : ACPI_STATE_C1);
1618 }
1619 
1620 static int cpu_callback(
1621     struct notifier_block *nfb, unsigned long action, void *hcpu)
1622 {
1623     unsigned int cpu = (unsigned long)hcpu;
1624     int rc = 0;
1625 
1626     /*
1627      * Only hook CPU_UP_PREPARE / CPU_ONLINE (and don't tear down on CPU_DEAD),
1628      * because a dead CPU may still utilize the info to enter a deep C-state.
1629      */
1630     switch ( action )
1631     {
1632     case CPU_UP_PREPARE:
1633         rc = cpuidle_init_cpu(cpu);
1634         if ( !rc && cpuidle_current_governor->enable )
1635             rc = cpuidle_current_governor->enable(processor_powers[cpu]);
1636         break;
1637 
1638     case CPU_ONLINE:
1639         if ( (boot_cpu_data.x86_vendor &
1640               (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
1641              processor_powers[cpu] )
1642             amd_cpuidle_init(processor_powers[cpu]);
1643         break;
1644     }
1645 
1646     return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1647 }
1648 
1649 static struct notifier_block cpu_nfb = {
1650     .notifier_call = cpu_callback
1651 };
1652 
1653 static int __init cpuidle_presmp_init(void)
1654 {
1655     void *cpu = (void *)(long)smp_processor_id();
1656 
1657     if ( !xen_cpuidle )
1658         return 0;
1659 
1660     mwait_idle_init(&cpu_nfb);
1661     cpu_nfb.notifier_call(&cpu_nfb, CPU_UP_PREPARE, cpu);
1662     cpu_nfb.notifier_call(&cpu_nfb, CPU_ONLINE, cpu);
1663     register_cpu_notifier(&cpu_nfb);
1664     return 0;
1665 }
1666 presmp_initcall(cpuidle_presmp_init);
1667 
1668