1 /*
2 * cpu_idle - xen idle state module derived from Linux
3 * drivers/acpi/processor_idle.c &
4 * arch/x86/kernel/acpi/cstate.c
5 *
6 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
7 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
8 * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
9 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
10 * - Added processor hotplug support
11 * Copyright (C) 2005 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
12 * - Added support for C3 on SMP
13 * Copyright (C) 2007, 2008 Intel Corporation
14 *
15 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16 *
17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or (at
20 * your option) any later version.
21 *
22 * This program is distributed in the hope that it will be useful, but
23 * WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 * General Public License for more details.
26 *
27 * You should have received a copy of the GNU General Public License along
28 * with this program; If not, see <http://www.gnu.org/licenses/>.
29 *
30 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31 */
32
33 #include <xen/errno.h>
34 #include <xen/lib.h>
35 #include <xen/types.h>
36 #include <xen/acpi.h>
37 #include <xen/smp.h>
38 #include <xen/guest_access.h>
39 #include <xen/keyhandler.h>
40 #include <xen/param.h>
41 #include <xen/trace.h>
42 #include <xen/irq.h>
43 #include <asm/cache.h>
44 #include <asm/io.h>
45 #include <asm/iocap.h>
46 #include <asm/hpet.h>
47 #include <asm/processor.h>
48 #include <xen/pmstat.h>
49 #include <xen/softirq.h>
50 #include <public/platform.h>
51 #include <public/sysctl.h>
52 #include <acpi/cpufreq/cpufreq.h>
53 #include <asm/apic.h>
54 #include <asm/cpuidle.h>
55 #include <asm/mwait.h>
56 #include <xen/notifier.h>
57 #include <xen/cpu.h>
58 #include <asm/spec_ctrl.h>
59
60 /*#define DEBUG_PM_CX*/
61
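/*
 * The package/core/module residency MSRs below count in TSC ticks; each
 * GET_*_RES() helper reads the MSR and converts the value to nanoseconds
 * via tsc_ticks2ns().
 */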
62 #define GET_HW_RES_IN_NS(msr, val) \
63     do { rdmsrl(msr, val); val = tsc_ticks2ns(val); } while ( 0 )
64 #define GET_MC6_RES(val) GET_HW_RES_IN_NS(0x664, val)
65 #define GET_PC2_RES(val) GET_HW_RES_IN_NS(0x60D, val) /* SNB onwards */
66 #define GET_PC3_RES(val) GET_HW_RES_IN_NS(0x3F8, val)
67 #define GET_PC6_RES(val) GET_HW_RES_IN_NS(0x3F9, val)
68 #define GET_PC7_RES(val) GET_HW_RES_IN_NS(0x3FA, val)
69 #define GET_PC8_RES(val) GET_HW_RES_IN_NS(0x630, val) /* some Haswells only */
70 #define GET_PC9_RES(val) GET_HW_RES_IN_NS(0x631, val) /* some Haswells only */
71 #define GET_PC10_RES(val) GET_HW_RES_IN_NS(0x632, val) /* some Haswells only */
72 #define GET_CC1_RES(val) GET_HW_RES_IN_NS(0x660, val)
73 #define GET_CC3_RES(val) GET_HW_RES_IN_NS(0x3FC, val)
74 #define GET_CC6_RES(val) GET_HW_RES_IN_NS(0x3FD, val)
75 #define GET_CC7_RES(val) GET_HW_RES_IN_NS(0x3FE, val) /* SNB onwards */
76 #define PHI_CC6_RES(val) GET_HW_RES_IN_NS(0x3FF, val) /* Xeon Phi only */
77
78 static void lapic_timer_nop(void) { }
79 void (*__read_mostly lapic_timer_off)(void);
80 void (*__read_mostly lapic_timer_on)(void);
81
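/*
 * Select how timer interrupts survive deep C-states: with ARAT the local
 * APIC timer keeps ticking and nothing needs doing; otherwise fall back to
 * HPET or PIT broadcast around idle entry/exit.
 */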
82 bool lapic_timer_init(void)
83 {
84 if ( boot_cpu_has(X86_FEATURE_ARAT) )
85 {
86 lapic_timer_off = lapic_timer_nop;
87 lapic_timer_on = lapic_timer_nop;
88 }
89 else if ( hpet_broadcast_is_available() )
90 {
91 lapic_timer_off = hpet_broadcast_enter;
92 lapic_timer_on = hpet_broadcast_exit;
93 }
94 else if ( pit_broadcast_is_available() )
95 {
96 lapic_timer_off = pit_broadcast_enter;
97 lapic_timer_on = pit_broadcast_exit;
98 }
99 else
100 return false;
101
102 return true;
103 }
104
105 void (*__read_mostly pm_idle_save)(void);
106 unsigned int max_cstate __read_mostly = UINT_MAX;
107 unsigned int max_csubstate __read_mostly = UINT_MAX;
108
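/* Command line format: "max_cstate=<state>[,<sub-state>]", e.g. "max_cstate=3,0". */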
109 static int __init parse_cstate(const char *s)
110 {
111 max_cstate = simple_strtoul(s, &s, 0);
112 if ( *s == ',' )
113 max_csubstate = simple_strtoul(s + 1, NULL, 0);
114 return 0;
115 }
116 custom_param("max_cstate", parse_cstate);
117
118 static bool __read_mostly local_apic_timer_c2_ok;
119 boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok);
120
121 struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
122
123 /*
124 * This field starts out as zero, and can be set to -1 just to signal it has
125 * been set (and that vendor specific logic has failed, and shouldn't be
126 * tried again), or to +1 to ignore Dom0 side uploads of C-state ACPI data.
127 */
128 static int8_t __read_mostly vendor_override;
129
130 struct hw_residencies
131 {
132 uint64_t mc0;
133 uint64_t mc6;
134 uint64_t pc2;
135 uint64_t pc3;
136 uint64_t pc4;
137 uint64_t pc6;
138 uint64_t pc7;
139 uint64_t pc8;
140 uint64_t pc9;
141 uint64_t pc10;
142 uint64_t cc1;
143 uint64_t cc3;
144 uint64_t cc6;
145 uint64_t cc7;
146 };
147
148 static void do_get_hw_residencies(void *arg)
149 {
150     struct cpuinfo_x86 *c = &current_cpu_data;
151 struct hw_residencies *hw_res = arg;
152
153 if ( c->x86_vendor != X86_VENDOR_INTEL || c->x86 != 6 )
154 return;
155
156 switch ( c->x86_model )
157 {
158 /* 4th generation Intel Core (Haswell) */
159 case 0x45:
160 GET_PC8_RES(hw_res->pc8);
161 GET_PC9_RES(hw_res->pc9);
162 GET_PC10_RES(hw_res->pc10);
163 /* fall through */
164 /* Sandy bridge */
165 case 0x2A:
166 case 0x2D:
167 /* Ivy bridge */
168 case 0x3A:
169 case 0x3E:
170 /* Haswell */
171 case 0x3C:
172 case 0x3F:
173 case 0x46:
174 /* Broadwell */
175 case 0x3D:
176 case 0x47:
177 case 0x4F:
178 case 0x56:
179 /* Skylake */
180 case 0x4E:
181 case 0x55:
182 case 0x5E:
183 /* Ice Lake */
184 case 0x7D:
185 case 0x7E:
186 /* Kaby Lake */
187 case 0x8E:
188 case 0x9E:
189 /* Comet Lake */
190 case 0xA5:
191 case 0xA6:
192 GET_PC2_RES(hw_res->pc2);
193 GET_CC7_RES(hw_res->cc7);
194 /* fall through */
195 /* Nehalem */
196 case 0x1A:
197 case 0x1E:
198 case 0x1F:
199 case 0x2E:
200 /* Westmere */
201 case 0x25:
202 case 0x2C:
203 case 0x2F:
204 GET_PC3_RES(hw_res->pc3);
205 GET_PC6_RES(hw_res->pc6);
206 GET_PC7_RES(hw_res->pc7);
207 GET_CC3_RES(hw_res->cc3);
208 GET_CC6_RES(hw_res->cc6);
209 break;
210 /* Cannon Lake */
211 case 0x66:
212 GET_PC2_RES(hw_res->pc2);
213 GET_PC3_RES(hw_res->pc3);
214 GET_PC6_RES(hw_res->pc6);
215 GET_PC7_RES(hw_res->pc7);
216 GET_CC1_RES(hw_res->cc1);
217 GET_CC6_RES(hw_res->cc6);
218 GET_CC7_RES(hw_res->cc7);
219 break;
220 /* Xeon Phi Knights Landing */
221 case 0x57:
222 /* Xeon Phi Knights Mill */
223 case 0x85:
224 GET_CC3_RES(hw_res->mc0); /* abusing GET_CC3_RES */
225 GET_CC6_RES(hw_res->mc6); /* abusing GET_CC6_RES */
226 GET_PC2_RES(hw_res->pc2);
227 GET_PC3_RES(hw_res->pc3);
228 GET_PC6_RES(hw_res->pc6);
229 GET_PC7_RES(hw_res->pc7);
230 PHI_CC6_RES(hw_res->cc6);
231 break;
232 /* various Atoms */
233 case 0x27:
234 GET_PC3_RES(hw_res->pc2); /* abusing GET_PC3_RES */
235 GET_PC6_RES(hw_res->pc4); /* abusing GET_PC6_RES */
236 GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
237 break;
238 /* Silvermont */
239 case 0x37:
240 case 0x4A:
241 case 0x4D:
242 case 0x5A:
243 case 0x5D:
244 /* Airmont */
245 case 0x4C:
246 GET_MC6_RES(hw_res->mc6);
247 GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
248 GET_CC1_RES(hw_res->cc1);
249 GET_CC6_RES(hw_res->cc6);
250 break;
251 /* Goldmont */
252 case 0x5C:
253 case 0x5F:
254 /* Goldmont Plus */
255 case 0x7A:
256 /* Tremont */
257 case 0x86:
258 GET_PC2_RES(hw_res->pc2);
259 GET_PC3_RES(hw_res->pc3);
260 GET_PC6_RES(hw_res->pc6);
261 GET_PC10_RES(hw_res->pc10);
262 GET_CC1_RES(hw_res->cc1);
263 GET_CC3_RES(hw_res->cc3);
264 GET_CC6_RES(hw_res->cc6);
265 break;
266 }
267 }
268
269 static void get_hw_residencies(uint32_t cpu, struct hw_residencies *hw_res)
270 {
271 memset(hw_res, 0, sizeof(*hw_res));
272
273 if ( smp_processor_id() == cpu )
274 do_get_hw_residencies(hw_res);
275 else
276 on_selected_cpus(cpumask_of(cpu), do_get_hw_residencies, hw_res, 1);
277 }
278
279 static void print_hw_residencies(uint32_t cpu)
280 {
281 struct hw_residencies hw_res;
282
283 get_hw_residencies(cpu, &hw_res);
284
285 if ( hw_res.mc0 | hw_res.mc6 )
286 printk("MC0[%"PRIu64"] MC6[%"PRIu64"]\n",
287 hw_res.mc0, hw_res.mc6);
288 printk("PC2[%"PRIu64"] PC%d[%"PRIu64"] PC6[%"PRIu64"] PC7[%"PRIu64"]\n",
289 hw_res.pc2,
290 hw_res.pc4 ? 4 : 3, hw_res.pc4 ?: hw_res.pc3,
291 hw_res.pc6, hw_res.pc7);
292 if ( hw_res.pc8 | hw_res.pc9 | hw_res.pc10 )
293 printk("PC8[%"PRIu64"] PC9[%"PRIu64"] PC10[%"PRIu64"]\n",
294 hw_res.pc8, hw_res.pc9, hw_res.pc10);
295 printk("CC%d[%"PRIu64"] CC6[%"PRIu64"] CC7[%"PRIu64"]\n",
296 hw_res.cc1 ? 1 : 3, hw_res.cc1 ?: hw_res.cc3,
297 hw_res.cc6, hw_res.cc7);
298 }
299
300 static const char *const acpi_cstate_method_name[] =
301 {
302 "NONE",
303 "SYSIO",
304 "FFH",
305 "HALT"
306 };
307
308 static uint64_t get_stime_tick(void) { return (uint64_t)NOW(); }
309 static uint64_t stime_ticks_elapsed(uint64_t t1, uint64_t t2) { return t2 - t1; }
310 static uint64_t stime_tick_to_ns(uint64_t ticks) { return ticks; }
311
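/*
 * ACPI PM timer based tick source: the timer is 24 or 32 bits wide, so the
 * elapsed-tick calculation below allows for a single wraparound.
 */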
312 static uint64_t get_acpi_pm_tick(void) { return (uint64_t)inl(pmtmr_ioport); }
313 static uint64_t acpi_pm_ticks_elapsed(uint64_t t1, uint64_t t2)
314 {
315 if ( t2 >= t1 )
316 return (t2 - t1);
317 else if ( !(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER) )
318 return (((0x00FFFFFF - t1) + t2 + 1) & 0x00FFFFFF);
319 else
320         return ((0xFFFFFFFF - t1) + t2 + 1);
321 }
322
323 uint64_t (*__read_mostly cpuidle_get_tick)(void);
324 static uint64_t (*__read_mostly tick_to_ns)(uint64_t);
325 static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t);
326
327 static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
328 {
329 uint64_t idle_res = 0, idle_usage = 0;
330 uint64_t last_state_update_tick, current_tick, current_stime;
331 uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
332 uint64_t res_tick[ACPI_PROCESSOR_MAX_POWER] = { 0 };
333 unsigned int i;
334 signed int last_state_idx;
335
336 printk("==cpu%d==\n", cpu);
337 last_state_idx = power->last_state ? power->last_state->idx : -1;
338
339 spin_lock_irq(&power->stat_lock);
340 current_tick = cpuidle_get_tick();
341 current_stime = NOW();
342 for ( i = 1; i < power->count; i++ )
343 {
344 res_tick[i] = power->states[i].time;
345 usage[i] = power->states[i].usage;
346 }
347 last_state_update_tick = power->last_state_update_tick;
348 spin_unlock_irq(&power->stat_lock);
349
350 if ( last_state_idx >= 0 )
351 {
352 res_tick[last_state_idx] += ticks_elapsed(last_state_update_tick,
353 current_tick);
354 usage[last_state_idx]++;
355 }
356
357 for ( i = 1; i < power->count; i++ )
358 {
359 idle_usage += usage[i];
360 idle_res += tick_to_ns(res_tick[i]);
361
362 printk(" %cC%u:\ttype[C%d] latency[%3u] usage[%8"PRIu64"] method[%5s] duration[%"PRIu64"]\n",
363 (last_state_idx == i) ? '*' : ' ', i,
364 power->states[i].type, power->states[i].latency, usage[i],
365 acpi_cstate_method_name[power->states[i].entry_method],
366 tick_to_ns(res_tick[i]));
367 }
368 printk(" %cC0:\tusage[%8"PRIu64"] duration[%"PRIu64"]\n",
369 (last_state_idx == 0) ? '*' : ' ',
370 usage[0] + idle_usage, current_stime - idle_res);
371
372 print_hw_residencies(cpu);
373 }
374
375 static void dump_cx(unsigned char key)
376 {
377 unsigned int cpu;
378
379 printk("'%c' pressed -> printing ACPI Cx structures\n", key);
380 if ( max_cstate < UINT_MAX )
381 {
382 printk("max state: C%u\n", max_cstate);
383 if ( max_csubstate < UINT_MAX )
384 printk("max sub-state: %u\n", max_csubstate);
385 else
386 printk("max sub-state: unlimited\n");
387 }
388 else
389 printk("max state: unlimited\n");
390 for_each_present_cpu ( cpu )
391 {
392 struct acpi_processor_power *power = processor_powers[cpu];
393
394 if ( !power )
395 continue;
396
397 if ( cpu_online(cpu) )
398 print_acpi_power(cpu, power);
399 else if ( park_offline_cpus )
400 printk("CPU%u parked in state %u (C%u)\n", cpu,
401 power->last_state ? power->last_state->idx : 1,
402 power->last_state ? power->last_state->type : 1);
403
404 process_pending_softirqs();
405 }
406 }
407
408 static int __init cpu_idle_key_init(void)
409 {
410 register_keyhandler('c', dump_cx, "dump ACPI Cx structures", 1);
411 return 0;
412 }
413 __initcall(cpu_idle_key_init);
414
415 /*
416  * A CPU's bit is set iff it uses MONITOR/MWAIT to enter a C-state.  With
417  * the flag set, the CPU can be woken from the C-state by writing to a
418  * specific memory address, instead of by sending an IPI.
419 */
420 static cpumask_t cpuidle_mwait_flags;
421
422 void cpuidle_wakeup_mwait(cpumask_t *mask)
423 {
424 cpumask_t target;
425 unsigned int cpu;
426
427 cpumask_and(&target, mask, &cpuidle_mwait_flags);
428
429 /* CPU is MWAITing on the cpuidle_mwait_wakeup flag. */
430 for_each_cpu(cpu, &target)
431 mwait_wakeup(cpu) = 0;
432
433 cpumask_andnot(mask, mask, &target);
434 }
435
436 bool arch_skip_send_event_check(unsigned int cpu)
437 {
438 /*
439 * This relies on softirq_pending() and mwait_wakeup() to access data
440 * on the same cache line.
441 */
442 smp_mb();
443 return !!cpumask_test_cpu(cpu, &cpuidle_mwait_flags);
444 }
445
446 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
447 {
448 unsigned int cpu = smp_processor_id();
449 s_time_t expires = per_cpu(timer_deadline, cpu);
450 const void *monitor_addr = &mwait_wakeup(cpu);
451
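/*
 * Where X86_FEATURE_CLFLUSH_MONITOR is set, flush the monitored cache line
 * (fenced on both sides) before arming MONITOR; cf. the erratum AAI65 note
 * in acpi_dead_idle() below.
 */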
452 if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
453 {
454 mb();
455 clflush(monitor_addr);
456 mb();
457 }
458
459 __monitor(monitor_addr, 0, 0);
460 smp_mb();
461
462 /*
463 * Timer deadline passing is the event on which we will be woken via
464 * cpuidle_mwait_wakeup. So check it now that the location is armed.
465 */
466 if ( (expires > NOW() || expires == 0) && !softirq_pending(cpu) )
467 {
468 struct cpu_info *info = get_cpu_info();
469
470 cpumask_set_cpu(cpu, &cpuidle_mwait_flags);
471
472 spec_ctrl_enter_idle(info);
473 __mwait(eax, ecx);
474 spec_ctrl_exit_idle(info);
475
476 cpumask_clear_cpu(cpu, &cpuidle_mwait_flags);
477 }
478
479 if ( expires <= NOW() && expires > 0 )
480 raise_softirq(TIMER_SOFTIRQ);
481 }
482
483 static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
484 {
485 mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
486 }
487
488 static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
489 {
490 struct cpu_info *info = get_cpu_info();
491
492 switch ( cx->entry_method )
493 {
494 case ACPI_CSTATE_EM_FFH:
495 /* Call into architectural FFH based C-state */
496 acpi_processor_ffh_cstate_enter(cx);
497 return;
498 case ACPI_CSTATE_EM_SYSIO:
499 spec_ctrl_enter_idle(info);
500 /* IO port based C-state */
501 inb(cx->address);
502 /* Dummy wait op - must do something useless after P_LVL2 read
503 because chipsets cannot guarantee that STPCLK# signal
504 gets asserted in time to freeze execution properly. */
505 inl(pmtmr_ioport);
506 spec_ctrl_exit_idle(info);
507 return;
508 case ACPI_CSTATE_EM_HALT:
509 spec_ctrl_enter_idle(info);
510 safe_halt();
511 spec_ctrl_exit_idle(info);
512 local_irq_disable();
513 return;
514 }
515 }
516
517 static int acpi_idle_bm_check(void)
518 {
519 u32 bm_status = 0;
520
521 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
522 if ( bm_status )
523 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
524 /*
525 * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
526 * the true state of bus mastering activity; forcing us to
527 * manually check the BMIDEA bit of each IDE channel.
528 */
529 return bm_status;
530 }
531
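/*
 * Number of CPUs currently heading into/in C3: bus master arbitration
 * (ARB_DIS) is only disabled once every online CPU has got here, and is
 * re-enabled by the first CPU to leave.
 */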
532 static struct {
533 spinlock_t lock;
534 unsigned int count;
535 } c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED };
536
537 void trace_exit_reason(u32 *irq_traced)
538 {
539 if ( unlikely(tb_init_done) )
540 {
541 int i, curbit;
542 u32 irr_status[8] = { 0 };
543
544 /* Get local apic IRR register */
545 for ( i = 0; i < 8; i++ )
546 irr_status[i] = apic_read(APIC_IRR + (i << 4));
547 i = 0;
548 curbit = find_first_bit((const unsigned long *)irr_status, 256);
549 while ( i < 4 && curbit < 256 )
550 {
551 irq_traced[i++] = curbit;
552 curbit = find_next_bit((const unsigned long *)irr_status, 256, curbit + 1);
553 }
554 }
555 }
556
557 bool errata_c6_workaround(void)
558 {
559 static int8_t __read_mostly fix_needed = -1;
560
561 if ( unlikely(fix_needed == -1) )
562 {
563 #define INTEL_FAM6_MODEL(m) { X86_VENDOR_INTEL, 6, m, X86_FEATURE_ALWAYS }
564 /*
565  * Errata AAJ72: "EOI Transaction May Not be Sent if Software Enters
566 * Core C6 During an Interrupt Service Routine"
567 *
568 * There was an errata with some Core i7 processors that an EOI
569 * transaction may not be sent if software enters core C6 during an
570 * interrupt service routine. So we don't enter deep Cx state if
571 * there is an EOI pending.
572 */
573 static const struct x86_cpu_id eoi_errata[] = {
574 INTEL_FAM6_MODEL(0x1a),
575 INTEL_FAM6_MODEL(0x1e),
576 INTEL_FAM6_MODEL(0x1f),
577 INTEL_FAM6_MODEL(0x25),
578 INTEL_FAM6_MODEL(0x2c),
579 INTEL_FAM6_MODEL(0x2f),
580 { }
581 };
582 /*
583 * Errata BDX99, CLX30, SKX100, CFW125, BDF104, BDH85, BDM135, KWB131:
584 * A Pending Fixed Interrupt May Be Dispatched Before an Interrupt of
585 * The Same Priority Completes.
586 *
587 * Resuming from C6 Sleep-State, with Fixed Interrupts of the same
588 * priority queued (in the corresponding bits of the IRR and ISR APIC
589 * registers), the processor may dispatch the second interrupt (from
590 * the IRR bit) before the first interrupt has completed and written to
591 * the EOI register, causing the first interrupt to never complete.
592 *
593  * Note: Haswell hasn't had an erratum issued, but this issue was first
594  * discovered on Haswell hardware, which is affected.
595 */
596 static const struct x86_cpu_id isr_errata[] = {
597 /* Haswell */
598 INTEL_FAM6_MODEL(0x3c),
599 INTEL_FAM6_MODEL(0x3f),
600 INTEL_FAM6_MODEL(0x45),
601 INTEL_FAM6_MODEL(0x46),
602 /* Broadwell */
603 INTEL_FAM6_MODEL(0x47),
604 INTEL_FAM6_MODEL(0x3d),
605 INTEL_FAM6_MODEL(0x4f),
606 INTEL_FAM6_MODEL(0x56),
607 /* Skylake (client) */
608 INTEL_FAM6_MODEL(0x5e),
609 INTEL_FAM6_MODEL(0x4e),
610 /* {Sky/Cascade}lake (server) */
611 INTEL_FAM6_MODEL(0x55),
612 /* {Kaby/Coffee/Whiskey/Amber} Lake */
613 INTEL_FAM6_MODEL(0x9e),
614 INTEL_FAM6_MODEL(0x8e),
615 /* Cannon Lake */
616 INTEL_FAM6_MODEL(0x66),
617 { }
618 };
619 #undef INTEL_FAM6_MODEL
620
621 fix_needed = cpu_has_apic &&
622 ((!directed_eoi_enabled && x86_match_cpu(eoi_errata)) ||
623 x86_match_cpu(isr_errata));
624 }
625
626 return (fix_needed && cpu_has_pending_apic_eoi());
627 }
628
629 void update_last_cx_stat(struct acpi_processor_power *power,
630 struct acpi_processor_cx *cx, uint64_t ticks)
631 {
632 ASSERT(!local_irq_is_enabled());
633
634 spin_lock(&power->stat_lock);
635 power->last_state = cx;
636 power->last_state_update_tick = ticks;
637 spin_unlock(&power->stat_lock);
638 }
639
640 void update_idle_stats(struct acpi_processor_power *power,
641 struct acpi_processor_cx *cx,
642 uint64_t before, uint64_t after)
643 {
644 int64_t sleep_ticks = alternative_call(ticks_elapsed, before, after);
645 /* Interrupts are disabled */
646
647 spin_lock(&power->stat_lock);
648
649 cx->usage++;
650 if ( sleep_ticks > 0 )
651 {
652 power->last_residency = alternative_call(tick_to_ns, sleep_ticks) /
653 1000UL;
654 cx->time += sleep_ticks;
655 }
656 power->last_state = &power->states[0];
657 power->last_state_update_tick = after;
658
659 spin_unlock(&power->stat_lock);
660 }
661
662 static void acpi_processor_idle(void)
663 {
664 unsigned int cpu = smp_processor_id();
665 struct acpi_processor_power *power = processor_powers[cpu];
666 struct acpi_processor_cx *cx = NULL;
667 int next_state;
668 uint64_t t1, t2 = 0;
669 u32 exp = 0, pred = 0;
670 u32 irq_traced[4] = { 0 };
671
672 if ( max_cstate > 0 && power &&
673 (next_state = cpuidle_current_governor->select(power)) > 0 )
674 {
675 unsigned int max_state = sched_has_urgent_vcpu() ? ACPI_STATE_C1
676 : max_cstate;
677
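/*
 * Walk down from the governor's choice to the deepest state that is still
 * permitted by max_cstate/max_csubstate and has a usable entry method.
 */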
678 do {
679 cx = &power->states[next_state];
680 } while ( (cx->type > max_state ||
681 cx->entry_method == ACPI_CSTATE_EM_NONE ||
682 (cx->entry_method == ACPI_CSTATE_EM_FFH &&
683 cx->type == max_cstate &&
684 (cx->address & MWAIT_SUBSTATE_MASK) > max_csubstate)) &&
685 --next_state );
686 if ( next_state )
687 {
688 if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
689 acpi_idle_bm_check() )
690 cx = power->safe_state;
691 if ( tb_init_done )
692 menu_get_trace_data(&exp, &pred);
693 }
694 else
695 cx = NULL;
696 }
697 if ( !cx )
698 {
699 if ( pm_idle_save )
700 pm_idle_save();
701 else
702 {
703 struct cpu_info *info = get_cpu_info();
704
705 spec_ctrl_enter_idle(info);
706 safe_halt();
707 spec_ctrl_exit_idle(info);
708 }
709 return;
710 }
711
712 cpufreq_dbs_timer_suspend();
713
714 rcu_idle_enter(cpu);
715 /* rcu_idle_enter() can raise TIMER_SOFTIRQ. Process it now. */
716 process_pending_softirqs();
717
718 /*
719 * Interrupts must be disabled during bus mastering calculations and
720 * for C2/C3 transitions.
721 */
722 local_irq_disable();
723
724 if ( !cpu_is_haltable(cpu) )
725 {
726 local_irq_enable();
727 rcu_idle_exit(cpu);
728 cpufreq_dbs_timer_resume();
729 return;
730 }
731
732 if ( (cx->type >= ACPI_STATE_C3) && errata_c6_workaround() )
733 cx = power->safe_state;
734
735
736 /*
737 * Sleep:
738 * ------
739 * Invoke the current Cx state to put the processor to sleep.
740 */
741 switch ( cx->type )
742 {
743 case ACPI_STATE_C1:
744 case ACPI_STATE_C2:
745 if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
746 {
747 /* Get start time (ticks) */
748 t1 = alternative_call(cpuidle_get_tick);
749 /* Trace cpu idle entry */
750 TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
751
752 update_last_cx_stat(power, cx, t1);
753
754 /* Invoke C2 */
755 acpi_idle_do_entry(cx);
756 /* Get end time (ticks) */
757 t2 = alternative_call(cpuidle_get_tick);
758 trace_exit_reason(irq_traced);
759 /* Trace cpu idle exit */
760 TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
761 irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
762 /* Update statistics */
763 update_idle_stats(power, cx, t1, t2);
764 /* Re-enable interrupts */
765 local_irq_enable();
766 break;
767 }
768
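/*
 * Deliberate fall-through: a C2 entry whose LAPIC timer may stop (i.e.
 * !local_apic_timer_c2_ok) is handled via the C3 path below.
 */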
769 case ACPI_STATE_C3:
770 /*
771  * Before invoking C3, be aware that the TSC/APIC timer may be
772  * stopped by hardware. Without careful handling of TSC/APIC-stop issues,
773  * deep C-states can't work correctly.
774 */
775 /* preparing APIC stop */
776 lapic_timer_off();
777
778 /* Get start time (ticks) */
779 t1 = alternative_call(cpuidle_get_tick);
780 /* Trace cpu idle entry */
781 TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
782
783 update_last_cx_stat(power, cx, t1);
784
785 /*
786  * Disable bus mastering:
787  * bm_check implies we need ARB_DIS
788  * !bm_check implies we need a cache flush
789  * bm_control indicates whether we can do ARB_DIS
790  *
791  * That leaves the case where bm_check is set and bm_control is
792  * not set. In that case we cannot do much; we enter C3
793  * without doing anything.
794 */
795 if ( cx->type != ACPI_STATE_C3 )
796 /* nothing to be done here */;
797 else if ( power->flags.bm_check && power->flags.bm_control )
798 {
799 spin_lock(&c3_cpu_status.lock);
800 if ( ++c3_cpu_status.count == num_online_cpus() )
801 {
802 /*
803 * All CPUs are trying to go to C3
804 * Disable bus master arbitration
805 */
806 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
807 }
808 spin_unlock(&c3_cpu_status.lock);
809 }
810 else if ( !power->flags.bm_check )
811 {
812 /* SMP with no shared cache... Invalidate cache */
813 ACPI_FLUSH_CPU_CACHE();
814 }
815
816 /* Invoke C3 */
817 acpi_idle_do_entry(cx);
818
819 if ( (cx->type == ACPI_STATE_C3) &&
820 power->flags.bm_check && power->flags.bm_control )
821 {
822 /* Enable bus master arbitration */
823 spin_lock(&c3_cpu_status.lock);
824 if ( c3_cpu_status.count-- == num_online_cpus() )
825 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
826 spin_unlock(&c3_cpu_status.lock);
827 }
828
829 /* Get end time (ticks) */
830 t2 = alternative_call(cpuidle_get_tick);
831
832 /* recovering TSC */
833 cstate_restore_tsc();
834 trace_exit_reason(irq_traced);
835 /* Trace cpu idle exit */
836 TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
837 irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
838
839 /* Update statistics */
840 update_idle_stats(power, cx, t1, t2);
841 /* Re-enable interrupts */
842 local_irq_enable();
843 /* recovering APIC */
844 lapic_timer_on();
845
846 break;
847
848 default:
849 /* Now in C0 */
850 power->last_state = &power->states[0];
851 local_irq_enable();
852 rcu_idle_exit(cpu);
853 cpufreq_dbs_timer_resume();
854 return;
855 }
856
857 /* Now in C0 */
858 power->last_state = &power->states[0];
859
860 rcu_idle_exit(cpu);
861 cpufreq_dbs_timer_resume();
862
863 if ( cpuidle_current_governor->reflect )
864 cpuidle_current_governor->reflect(power);
865 }
866
867 void acpi_dead_idle(void)
868 {
869 struct acpi_processor_power *power;
870 struct acpi_processor_cx *cx;
871
872 if ( (power = processor_powers[smp_processor_id()]) == NULL ||
873 power->count < 2 )
874 goto default_halt;
875
876 cx = &power->states[power->count - 1];
877 power->last_state = cx;
878
879 if ( cx->entry_method == ACPI_CSTATE_EM_FFH )
880 {
881 void *mwait_ptr = &mwait_wakeup(smp_processor_id());
882
883 /*
884 * Cache must be flushed as the last operation before sleeping.
885 * Otherwise, CPU may still hold dirty data, breaking cache coherency,
886 * leading to strange errors.
887 */
888 spec_ctrl_enter_idle(get_cpu_info());
889 wbinvd();
890
891 while ( 1 )
892 {
893 /*
894 * 1. The CLFLUSH is a workaround for erratum AAI65 for
895 * the Xeon 7400 series.
896 * 2. The WBINVD is insufficient due to the spurious-wakeup
897 * case where we return around the loop.
898  * 3. Unlike WBINVD, CLFLUSH is a lightweight but non-serializing
899  *    instruction, hence a memory fence is necessary to make sure all
900  *    loads/stores are visible before flushing the cache line.
901 */
902 mb();
903 clflush(mwait_ptr);
904 __monitor(mwait_ptr, 0, 0);
905 mb();
906 __mwait(cx->address, 0);
907 }
908 }
909 else if ( (current_cpu_data.x86_vendor &
910 (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
911 cx->entry_method == ACPI_CSTATE_EM_SYSIO )
912 {
913 /* Intel prefers not to use SYSIO */
914
915 /* Avoid references to shared data after the cache flush */
916 u32 address = cx->address;
917 u32 pmtmr_ioport_local = pmtmr_ioport;
918
919 spec_ctrl_enter_idle(get_cpu_info());
920 wbinvd();
921
922 while ( 1 )
923 {
924 inb(address);
925 inl(pmtmr_ioport_local);
926 }
927 }
928
929 default_halt:
930 default_dead_idle();
931 }
932
933 int cpuidle_init_cpu(unsigned int cpu)
934 {
935 struct acpi_processor_power *acpi_power;
936
937 acpi_power = processor_powers[cpu];
938 if ( !acpi_power )
939 {
940 unsigned int i;
941
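/*
 * Choose the idle-time clock source once, on the boot CPU: prefer the TSC
 * when it keeps counting across deep C-states (NONSTOP_TSC), else fall
 * back to the ACPI PM timer.
 */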
942 if ( cpu == 0 && system_state < SYS_STATE_active )
943 {
944 if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
945 {
946 cpuidle_get_tick = get_stime_tick;
947 ticks_elapsed = stime_ticks_elapsed;
948 tick_to_ns = stime_tick_to_ns;
949 }
950 else
951 {
952 cpuidle_get_tick = get_acpi_pm_tick;
953 ticks_elapsed = acpi_pm_ticks_elapsed;
954 tick_to_ns = acpi_pm_tick_to_ns;
955 }
956 }
957
958 acpi_power = xzalloc(struct acpi_processor_power);
959 if ( !acpi_power )
960 return -ENOMEM;
961
962 for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
963 acpi_power->states[i].idx = i;
964
965 acpi_power->cpu = cpu;
966
967 spin_lock_init(&acpi_power->stat_lock);
968
969 processor_powers[cpu] = acpi_power;
970 }
971
972 acpi_power->count = 2;
973 acpi_power->states[1].type = ACPI_STATE_C1;
974 acpi_power->states[1].entry_method = ACPI_CSTATE_EM_HALT;
975 acpi_power->safe_state = &acpi_power->states[1];
976
977 return 0;
978 }
979
980 static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
981 {
982     struct cpuinfo_x86 *c = &current_cpu_data;
983 unsigned int eax, ebx, ecx, edx;
984 unsigned int edx_part;
985 unsigned int cstate_type; /* C-state type and not ACPI C-state type */
986 unsigned int num_cstate_subtype;
987 int ret = 0;
988 static unsigned long printed;
989
990 if ( c->cpuid_level < CPUID_MWAIT_LEAF )
991 {
992 printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
993 return -EFAULT;
994 }
995
996 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
997 if ( opt_cpu_info )
998 printk(XENLOG_DEBUG "cpuid.MWAIT[eax=%x ebx=%x ecx=%x edx=%x]\n",
999 eax, ebx, ecx, edx);
1000
1001 /* Check whether this particular cx_type (in CST) is supported or not */
1002 cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
1003 edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
1004 num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
1005
1006 if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
1007 ret = -ERANGE;
1008 /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
1009 else if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
1010 !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
1011 ret = -ENODEV;
1012 else if ( opt_cpu_info || cx->type >= BITS_PER_LONG ||
1013 !test_and_set_bit(cx->type, &printed) )
1014 printk(XENLOG_INFO "Monitor-Mwait will be used to enter C%d state\n",
1015 cx->type);
1016 return ret;
1017 }
1018
1019 /*
1020  * Initialize bm_flags based on the CPU cache properties.
1021  * On SMP it depends on the cache configuration:
1022  * - When the cache is not shared among all CPUs, we flush the cache
1023  *   before entering C3.
1024  * - When the cache is shared among all CPUs, we use the bm_check
1025  *   mechanism as in the UP case.
1026 *
1027 * This routine is called only after all the CPUs are online
1028 */
1029 static void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags)
1030 {
1031     struct cpuinfo_x86 *c = &current_cpu_data;
1032
1033 flags->bm_check = 0;
1034 if ( num_online_cpus() == 1 )
1035 flags->bm_check = 1;
1036 else if ( (c->x86_vendor == X86_VENDOR_INTEL) ||
1037 ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 0x15)) )
1038 {
1039 /*
1040 * Today all MP CPUs that support C3 share cache.
1041 * And caches should not be flushed by software while
1042 * entering C3 type state.
1043 */
1044 flags->bm_check = 1;
1045 }
1046
1047 /*
1048 * On all recent platforms, ARB_DISABLE is a nop.
1049 * So, set bm_control to zero to indicate that ARB_DISABLE
1050 * is not required while entering C3 type state on
1051 * P4, Core and beyond CPUs
1052 */
1053 if ( c->x86_vendor == X86_VENDOR_INTEL &&
1054 (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)) )
1055 flags->bm_control = 0;
1056 }
1057
1058 #define VENDOR_INTEL (1)
1059 #define NATIVE_CSTATE_BEYOND_HALT (2)
1060
1061 static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
1062 {
1063 static int bm_check_flag = -1;
1064 static int bm_control_flag = -1;
1065
1066 switch ( cx->reg.space_id )
1067 {
1068 case ACPI_ADR_SPACE_SYSTEM_IO:
1069 if ( cx->reg.address == 0 )
1070 return -EINVAL;
1071 break;
1072
1073 case ACPI_ADR_SPACE_FIXED_HARDWARE:
1074 if ( cx->reg.bit_width != VENDOR_INTEL ||
1075 cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
1076 return -EINVAL;
1077
1078         /* Assume all logical CPUs have the same MWAIT support. */
1079 if ( acpi_processor_ffh_cstate_probe(cx) )
1080 return -EINVAL;
1081 break;
1082
1083 default:
1084 return -ENODEV;
1085 }
1086
1087 switch ( cx->type )
1088 {
1089 case ACPI_STATE_C2:
1090 if ( local_apic_timer_c2_ok )
1091 break;
1092 case ACPI_STATE_C3:
1093 if ( !lapic_timer_init() )
1094 return -EINVAL;
1095
1096         /* All the logic here assumes flags.bm_check is the same across all CPUs */
1097 if ( bm_check_flag < 0 )
1098 {
1099 /* Determine whether bm_check is needed based on CPU */
1100 acpi_processor_power_init_bm_check(&(power->flags));
1101 }
1102 else
1103 {
1104 power->flags.bm_check = bm_check_flag;
1105 power->flags.bm_control = bm_control_flag;
1106 }
1107
1108 if ( power->flags.bm_check )
1109 {
1110 if ( !power->flags.bm_control )
1111 {
1112 if ( power->flags.has_cst != 1 )
1113 {
1114 /* bus mastering control is necessary */
1115 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1116 "C3 support requires BM control\n"));
1117 return -EINVAL;
1118 }
1119 else
1120 {
1121 /* Here we enter C3 without bus mastering */
1122 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1123 "C3 support without BM control\n"));
1124 }
1125 }
1126 /*
1127 * On older chipsets, BM_RLD needs to be set in order for Bus
1128 * Master activity to wake the system from C3, hence
1129 * acpi_set_register() is always being called once below. Newer
1130 * chipsets handle DMA during C3 automatically and BM_RLD is a
1131 * NOP. In either case, the proper way to handle BM_RLD is to
1132 * set it and leave it set.
1133 */
1134 }
1135 else
1136 {
1137 /*
1138              * The WBINVD flag should be set in the FADT for the C3 state to be
1139              * supported when bm_check is not required.
1140 */
1141 if ( !(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD) )
1142 {
1143 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1144 "Cache invalidation should work properly"
1145 " for C3 to be enabled on SMP systems\n"));
1146 return -EINVAL;
1147 }
1148 }
1149
1150 if ( bm_check_flag < 0 )
1151 {
1152 bm_check_flag = power->flags.bm_check;
1153 bm_control_flag = power->flags.bm_control;
1154 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, bm_check_flag);
1155 }
1156
1157 break;
1158 }
1159
1160 return 0;
1161 }
1162
1163 static unsigned int latency_factor = 2;
1164 integer_param("idle_latency_factor", latency_factor);
1165
1166 static void set_cx(
1167 struct acpi_processor_power *acpi_power,
1168 xen_processor_cx_t *xen_cx)
1169 {
1170 struct acpi_processor_cx *cx;
1171
1172 if ( check_cx(acpi_power, xen_cx) != 0 )
1173 return;
1174
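/*
 * Note the unusual switch layout below: C0 data, as well as any state
 * beyond ACPI_PROCESSOR_MAX_POWER, ends up on the warning/ignore path
 * nested inside "default:".
 */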
1175 switch ( xen_cx->type )
1176 {
1177 case ACPI_STATE_C1:
1178 cx = &acpi_power->states[1];
1179 break;
1180 default:
1181 if ( acpi_power->count >= ACPI_PROCESSOR_MAX_POWER )
1182 {
1183 case ACPI_STATE_C0:
1184 printk(XENLOG_WARNING "CPU%u: C%d data ignored\n",
1185 acpi_power->cpu, xen_cx->type);
1186 return;
1187 }
1188 cx = &acpi_power->states[acpi_power->count];
1189 cx->type = xen_cx->type;
1190 break;
1191 }
1192
1193 cx->address = xen_cx->reg.address;
1194
1195 switch ( xen_cx->reg.space_id )
1196 {
1197 case ACPI_ADR_SPACE_FIXED_HARDWARE:
1198 if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
1199 xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT &&
1200 boot_cpu_has(X86_FEATURE_MONITOR) )
1201 cx->entry_method = ACPI_CSTATE_EM_FFH;
1202 else
1203 cx->entry_method = ACPI_CSTATE_EM_HALT;
1204 break;
1205 case ACPI_ADR_SPACE_SYSTEM_IO:
1206 if ( ioports_deny_access(hardware_domain, cx->address, cx->address) )
1207 printk(XENLOG_WARNING "Could not deny access to port %04x\n",
1208 cx->address);
1209 cx->entry_method = ACPI_CSTATE_EM_SYSIO;
1210 break;
1211 default:
1212 cx->entry_method = ACPI_CSTATE_EM_NONE;
1213 break;
1214 }
1215
1216 cx->latency = xen_cx->latency;
1217 cx->target_residency = cx->latency * latency_factor;
1218
1219 smp_wmb();
1220 acpi_power->count += (cx->type != ACPI_STATE_C1);
1221 if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 )
1222 acpi_power->safe_state = cx;
1223 }
1224
1225 int get_cpu_id(u32 acpi_id)
1226 {
1227 int i;
1228 u32 apic_id;
1229
1230 if ( acpi_id >= MAX_MADT_ENTRIES )
1231 return -1;
1232
1233 apic_id = x86_acpiid_to_apicid[acpi_id];
1234 if ( apic_id == BAD_APICID )
1235 return -1;
1236
1237 for ( i = 0; i < nr_cpu_ids; i++ )
1238 {
1239 if ( apic_id == x86_cpu_to_apicid[i] )
1240 return i;
1241 }
1242
1243 return -1;
1244 }
1245
1246 #ifdef DEBUG_PM_CX
1247 static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
1248 {
1249 XEN_GUEST_HANDLE(xen_processor_cx_t) states;
1250 xen_processor_cx_t state;
1251 XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
1252 xen_processor_csd_t dp;
1253 uint32_t i;
1254
1255 printk("cpu%d cx acpi info:\n", cpu);
1256 printk("\tcount = %d\n", power->count);
1257 printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
1258 "\t pwr_setup_done[%d], bm_rld_set[%d]\n",
1259 power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
1260 power->flags.power_setup_done, power->flags.bm_rld_set);
1261
1262 states = power->states;
1263
1264 for ( i = 0; i < power->count; i++ )
1265 {
1266 if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) )
1267 return;
1268
1269 printk("\tstates[%d]:\n", i);
1270 printk("\t\treg.space_id = %#x\n", state.reg.space_id);
1271 printk("\t\treg.bit_width = %#x\n", state.reg.bit_width);
1272 printk("\t\treg.bit_offset = %#x\n", state.reg.bit_offset);
1273 printk("\t\treg.access_size = %#x\n", state.reg.access_size);
1274 printk("\t\treg.address = %#"PRIx64"\n", state.reg.address);
1275 printk("\t\ttype = %d\n", state.type);
1276 printk("\t\tlatency = %d\n", state.latency);
1277 printk("\t\tpower = %d\n", state.power);
1278
1279 csd = state.dp;
1280 printk("\t\tdp(@0x%p)\n", csd.p);
1281
1282 if ( csd.p != NULL )
1283 {
1284 if ( unlikely(copy_from_guest(&dp, csd, 1)) )
1285 return;
1286 printk("\t\t\tdomain = %d\n", dp.domain);
1287 printk("\t\t\tcoord_type = %d\n", dp.coord_type);
1288 printk("\t\t\tnum = %d\n", dp.num);
1289 }
1290 }
1291 }
1292 #else
1293 #define print_cx_pminfo(c, p)
1294 #endif
1295
1296 long set_cx_pminfo(uint32_t acpi_id, struct xen_processor_power *power)
1297 {
1298 XEN_GUEST_HANDLE(xen_processor_cx_t) states;
1299 xen_processor_cx_t xen_cx;
1300 struct acpi_processor_power *acpi_power;
1301 int cpu_id, i, ret;
1302
1303 if ( unlikely(!guest_handle_okay(power->states, power->count)) )
1304 return -EFAULT;
1305
1306 if ( pm_idle_save && pm_idle != acpi_processor_idle )
1307 return 0;
1308
1309 if ( vendor_override > 0 )
1310 return 0;
1311
1312 print_cx_pminfo(acpi_id, power);
1313
1314 cpu_id = get_cpu_id(acpi_id);
1315 if ( cpu_id == -1 )
1316 {
1317 static bool warn_once = true;
1318
1319 if ( warn_once || opt_cpu_info )
1320 printk(XENLOG_WARNING "No CPU for ACPI ID %#x\n", acpi_id);
1321 warn_once = false;
1322 return -EINVAL;
1323 }
1324
1325 ret = cpuidle_init_cpu(cpu_id);
1326 if ( ret < 0 )
1327 return ret;
1328
1329 acpi_power = processor_powers[cpu_id];
1330 acpi_power->flags.bm_check = power->flags.bm_check;
1331 acpi_power->flags.bm_control = power->flags.bm_control;
1332 acpi_power->flags.has_cst = power->flags.has_cst;
1333
1334 states = power->states;
1335 for ( i = 0; i < power->count; i++ )
1336 {
1337 if ( unlikely(copy_from_guest_offset(&xen_cx, states, i, 1)) )
1338 return -EFAULT;
1339
1340 set_cx(acpi_power, &xen_cx);
1341 }
1342
1343 if ( !cpu_online(cpu_id) )
1344 {
1345 uint32_t apic_id = x86_cpu_to_apicid[cpu_id];
1346
1347 /*
1348 * If we've just learned of more available C states, wake the CPU if
1349 * it's parked, so it can go back to sleep in perhaps a deeper state.
1350 */
1351 if ( park_offline_cpus && apic_id != BAD_APICID )
1352 {
1353 unsigned long flags;
1354
1355 local_irq_save(flags);
1356 apic_wait_icr_idle();
1357 apic_icr_write(APIC_DM_NMI | APIC_DEST_PHYSICAL, apic_id);
1358 local_irq_restore(flags);
1359 }
1360 }
1361 else if ( cpuidle_current_governor->enable )
1362 {
1363 ret = cpuidle_current_governor->enable(acpi_power);
1364 if ( ret < 0 )
1365 return ret;
1366 }
1367
1368     /* FIXME: C-state dependency is not supported so far */
1369
1370 if ( cpu_id == 0 )
1371 {
1372 if ( pm_idle_save == NULL )
1373 {
1374 pm_idle_save = pm_idle;
1375 pm_idle = acpi_processor_idle;
1376 }
1377
1378 dead_idle = acpi_dead_idle;
1379 }
1380
1381 return 0;
1382 }
1383
1384 static void amd_cpuidle_init(struct acpi_processor_power *power)
1385 {
1386 unsigned int i, nr = 0;
1387     const struct cpuinfo_x86 *c = &current_cpu_data;
1388 const unsigned int ecx_req = CPUID5_ECX_EXTENSIONS_SUPPORTED |
1389 CPUID5_ECX_INTERRUPT_BREAK;
1390 const struct acpi_processor_cx *cx = NULL;
1391 static const struct acpi_processor_cx fam17[] = {
1392 {
1393 .type = ACPI_STATE_C1,
1394 .entry_method = ACPI_CSTATE_EM_FFH,
1395 .latency = 1,
1396 },
1397 {
1398 .type = ACPI_STATE_C2,
1399 .entry_method = ACPI_CSTATE_EM_HALT,
1400 .latency = 400,
1401 },
1402 };
1403
1404 if ( pm_idle_save && pm_idle != acpi_processor_idle )
1405 return;
1406
1407 if ( vendor_override < 0 )
1408 return;
1409
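/*
 * Families 18h/19h are treated like 17h only on Hygon parts here; 18h/19h
 * on other vendors, and any family not listed, take the nested "default:"
 * path, which disables the vendor-specific setup (vendor_override = -1).
 */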
1410 switch ( c->x86 )
1411 {
1412 case 0x19:
1413 case 0x18:
1414 if ( boot_cpu_data.x86_vendor != X86_VENDOR_HYGON )
1415 {
1416 default:
1417 vendor_override = -1;
1418 return;
1419 }
1420 /* fall through */
1421 case 0x17:
1422 if ( cpu_has_monitor && c->cpuid_level >= CPUID_MWAIT_LEAF &&
1423 (cpuid_ecx(CPUID_MWAIT_LEAF) & ecx_req) == ecx_req )
1424 {
1425 cx = fam17;
1426 nr = ARRAY_SIZE(fam17);
1427 local_apic_timer_c2_ok = true;
1428 break;
1429 }
1430 /* fall through */
1431 case 0x15:
1432 case 0x16:
1433 cx = &fam17[1];
1434 nr = ARRAY_SIZE(fam17) - 1;
1435 break;
1436 }
1437
1438 power->flags.has_cst = true;
1439
1440 for ( i = 0; i < nr; ++i )
1441 {
1442 if ( cx[i].type > max_cstate )
1443 break;
1444 power->states[i + 1] = cx[i];
1445 power->states[i + 1].idx = i + 1;
1446 power->states[i + 1].target_residency = cx[i].latency * latency_factor;
1447 }
1448
1449 if ( i )
1450 {
1451 power->count = i + 1;
1452 power->safe_state = &power->states[i];
1453
1454 if ( !vendor_override )
1455 {
1456 if ( !boot_cpu_has(X86_FEATURE_ARAT) )
1457 hpet_broadcast_init();
1458
1459 if ( !lapic_timer_init() )
1460 {
1461 vendor_override = -1;
1462 cpuidle_init_cpu(power->cpu);
1463 return;
1464 }
1465
1466 if ( !pm_idle_save )
1467 {
1468 pm_idle_save = pm_idle;
1469 pm_idle = acpi_processor_idle;
1470 }
1471
1472 dead_idle = acpi_dead_idle;
1473
1474 vendor_override = 1;
1475 }
1476 }
1477 else
1478 vendor_override = -1;
1479 }
1480
1481 uint32_t pmstat_get_cx_nr(uint32_t cpuid)
1482 {
1483 return processor_powers[cpuid] ? processor_powers[cpuid]->count : 0;
1484 }
1485
1486 int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
1487 {
1488 struct acpi_processor_power *power = processor_powers[cpuid];
1489 uint64_t idle_usage = 0, idle_res = 0;
1490 uint64_t last_state_update_tick, current_stime, current_tick;
1491 uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
1492 uint64_t res[ACPI_PROCESSOR_MAX_POWER] = { 0 };
1493 unsigned int i, nr, nr_pc = 0, nr_cc = 0;
1494
1495 if ( power == NULL )
1496 {
1497 stat->last = 0;
1498 stat->nr = 0;
1499 stat->idle_time = 0;
1500 stat->nr_pc = 0;
1501 stat->nr_cc = 0;
1502 return 0;
1503 }
1504
1505 stat->idle_time = get_cpu_idle_time(cpuid);
1506 nr = min(stat->nr, power->count);
1507
1508     /* mimic the stats when detailed info hasn't been registered by dom0 */
1509 if ( pm_idle_save == NULL )
1510 {
1511 stat->nr = 2;
1512 stat->last = power->last_state ? power->last_state->idx : 0;
1513
1514 usage[1] = idle_usage = 1;
1515 res[1] = idle_res = stat->idle_time;
1516
1517 current_stime = NOW();
1518 }
1519 else
1520 {
1521 struct hw_residencies hw_res;
1522 signed int last_state_idx;
1523
1524 stat->nr = power->count;
1525
1526 spin_lock_irq(&power->stat_lock);
1527 current_tick = cpuidle_get_tick();
1528 current_stime = NOW();
1529 for ( i = 1; i < nr; i++ )
1530 {
1531 usage[i] = power->states[i].usage;
1532 res[i] = power->states[i].time;
1533 }
1534 last_state_update_tick = power->last_state_update_tick;
1535 last_state_idx = power->last_state ? power->last_state->idx : -1;
1536 spin_unlock_irq(&power->stat_lock);
1537
1538 if ( last_state_idx >= 0 )
1539 {
1540 usage[last_state_idx]++;
1541 res[last_state_idx] += ticks_elapsed(last_state_update_tick,
1542 current_tick);
1543 stat->last = last_state_idx;
1544 }
1545 else
1546 stat->last = 0;
1547
1548 for ( i = 1; i < nr; i++ )
1549 {
1550 res[i] = tick_to_ns(res[i]);
1551 idle_usage += usage[i];
1552 idle_res += res[i];
1553 }
1554
1555 get_hw_residencies(cpuid, &hw_res);
1556
1557 #define PUT_xC(what, n) do { \
1558 if ( stat->nr_##what >= n && \
1559 copy_to_guest_offset(stat->what, n - 1, &hw_res.what##n, 1) ) \
1560 return -EFAULT; \
1561 if ( hw_res.what##n ) \
1562 nr_##what = n; \
1563 } while ( 0 )
1564 #define PUT_PC(n) PUT_xC(pc, n)
1565 PUT_PC(2);
1566 PUT_PC(3);
1567 PUT_PC(4);
1568 PUT_PC(6);
1569 PUT_PC(7);
1570 PUT_PC(8);
1571 PUT_PC(9);
1572 PUT_PC(10);
1573 #undef PUT_PC
1574 #define PUT_CC(n) PUT_xC(cc, n)
1575 PUT_CC(1);
1576 PUT_CC(3);
1577 PUT_CC(6);
1578 PUT_CC(7);
1579 #undef PUT_CC
1580 #undef PUT_xC
1581 }
1582
1583 usage[0] += idle_usage;
1584 res[0] = current_stime - idle_res;
1585
1586 if ( copy_to_guest(stat->triggers, usage, nr) ||
1587 copy_to_guest(stat->residencies, res, nr) )
1588 return -EFAULT;
1589
1590 stat->nr_pc = nr_pc;
1591 stat->nr_cc = nr_cc;
1592
1593 return 0;
1594 }
1595
1596 int pmstat_reset_cx_stat(uint32_t cpuid)
1597 {
1598 return 0;
1599 }
1600
1601 void cpuidle_disable_deep_cstate(void)
1602 {
1603 if ( max_cstate > ACPI_STATE_C1 )
1604 {
1605 if ( local_apic_timer_c2_ok )
1606 max_cstate = ACPI_STATE_C2;
1607 else
1608 max_cstate = ACPI_STATE_C1;
1609 }
1610
1611 hpet_disable_legacy_broadcast();
1612 }
1613
1614 bool cpuidle_using_deep_cstate(void)
1615 {
1616 return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? ACPI_STATE_C2
1617 : ACPI_STATE_C1);
1618 }
1619
1620 static int cpu_callback(
1621 struct notifier_block *nfb, unsigned long action, void *hcpu)
1622 {
1623 unsigned int cpu = (unsigned long)hcpu;
1624 int rc = 0;
1625
1626 /*
1627  * Only hook CPU_UP_PREPARE / CPU_ONLINE because a dead CPU may utilize
1628  * the info to enter a deep C-state.
1629 */
1630 switch ( action )
1631 {
1632 case CPU_UP_PREPARE:
1633 rc = cpuidle_init_cpu(cpu);
1634 if ( !rc && cpuidle_current_governor->enable )
1635 rc = cpuidle_current_governor->enable(processor_powers[cpu]);
1636 break;
1637
1638 case CPU_ONLINE:
1639 if ( (boot_cpu_data.x86_vendor &
1640 (X86_VENDOR_AMD | X86_VENDOR_HYGON)) &&
1641 processor_powers[cpu] )
1642 amd_cpuidle_init(processor_powers[cpu]);
1643 break;
1644 }
1645
1646 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1647 }
1648
1649 static struct notifier_block cpu_nfb = {
1650 .notifier_call = cpu_callback
1651 };
1652
1653 static int __init cpuidle_presmp_init(void)
1654 {
1655 void *cpu = (void *)(long)smp_processor_id();
1656
1657 if ( !xen_cpuidle )
1658 return 0;
1659
1660 mwait_idle_init(&cpu_nfb);
1661 cpu_nfb.notifier_call(&cpu_nfb, CPU_UP_PREPARE, cpu);
1662 cpu_nfb.notifier_call(&cpu_nfb, CPU_ONLINE, cpu);
1663 register_cpu_notifier(&cpu_nfb);
1664 return 0;
1665 }
1666 presmp_initcall(cpuidle_presmp_init);
1667
1668