1 /****************************************************************************
2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3 * (C) 2002-2003 University of Cambridge
4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
5 ****************************************************************************
6 *
7 * File: common/schedule.c
8 * Author: Rolf Neugebauer & Keir Fraser
9 * Updated for generic API by Mark Williamson
10 *
11 * Description: Generic CPU scheduling code
12 * implements support functionality for the Xen scheduler API.
13 *
14 */
15
16 #ifndef COMPAT
17 #include <xen/init.h>
18 #include <xen/lib.h>
19 #include <xen/param.h>
20 #include <xen/sched.h>
21 #include <xen/domain.h>
22 #include <xen/delay.h>
23 #include <xen/event.h>
24 #include <xen/time.h>
25 #include <xen/timer.h>
26 #include <xen/perfc.h>
27 #include <xen/softirq.h>
28 #include <xen/trace.h>
29 #include <xen/mm.h>
30 #include <xen/err.h>
31 #include <xen/guest_access.h>
32 #include <xen/hypercall.h>
33 #include <xen/multicall.h>
34 #include <xen/cpu.h>
35 #include <xen/preempt.h>
36 #include <xen/event.h>
37 #include <public/sched.h>
38 #include <xsm/xsm.h>
39 #include <xen/err.h>
40
41 #include "private.h"
42
43 #ifdef CONFIG_XEN_GUEST
44 #include <asm/guest.h>
45 #else
46 #define pv_shim false
47 #endif
48
49 /* opt_sched: scheduler - default to configured value */
50 static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
51 string_param("sched", opt_sched);
52
53 /* If sched_smt_power_savings is set, the scheduler will give preference
54  * to a partially idle package over a fully idle one when picking a pCPU
55  * on which to schedule a vCPU.
56  */
57 bool sched_smt_power_savings;
58 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
59
60 /* Default scheduling rate limit: 1ms.
61  * Behaviour is undefined if sched_ratelimit_us exceeds sched_credit_tslice_ms.
62  */
63 int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
64 integer_param("sched_ratelimit_us", sched_ratelimit_us);
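/*
 * An illustrative Xen boot command line exercising the parameters above
 * (scheduler name and values are examples only; availability depends on
 * the build configuration):
 *
 *   sched=credit2 sched_ratelimit_us=1000 sched_smt_power_savings=1
 */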
65
66 /* If set, runtime switching of SMT on/off is not allowed. */
67 bool __read_mostly sched_disable_smt_switching;
68 cpumask_t sched_res_mask;
69
70 /* Common lock for free cpus. */
71 static DEFINE_SPINLOCK(sched_free_cpu_lock);
72
73 /* Various timer handlers. */
74 static void s_timer_fn(void *unused);
75 static void vcpu_periodic_timer_fn(void *data);
76 static void vcpu_singleshot_timer_fn(void *data);
77 static void poll_timer_fn(void *data);
78
79 /* This is global for now so that private implementations can reach it */
80 DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res);
81 static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx);
82 DEFINE_RCU_READ_LOCK(sched_res_rculock);
83
84 /* Scratch space for cpumasks. */
85 DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
86
87 /* How many urgent vcpus. */
88 DEFINE_PER_CPU(atomic_t, sched_urgent_count);
89
90 extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[];
91 #define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
92 #define schedulers __start_schedulers_array
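/*
 * No scheduler is listed here explicitly: each implementation registers
 * itself via the REGISTER_SCHEDULER() macro (see private.h), which places
 * a pointer to its struct scheduler in a dedicated linker section whose
 * bounds are exported as __start_schedulers_array/__end_schedulers_array.
 */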
93
94 static struct scheduler __read_mostly ops;
95
96 static bool scheduler_active;
97
98 static void sched_set_affinity(
99 struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft);
100
101 static struct sched_resource *
102 sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
103 {
104 return unit->res;
105 }
106
107 static void *
108 sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
109 void *dd)
110 {
111 /* Any non-NULL pointer is fine here. */
112 return ZERO_BLOCK_PTR;
113 }
114
115 static void
116 sched_idle_free_udata(const struct scheduler *ops, void *priv)
117 {
118 }
119
120 static void sched_idle_schedule(
121 const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
122 bool tasklet_work_scheduled)
123 {
124 const unsigned int cpu = smp_processor_id();
125
126 unit->next_time = -1;
127 unit->next_task = sched_idle_unit(cpu);
128 }
129
130 static struct scheduler sched_idle_ops = {
131 .name = "Idle Scheduler",
132 .opt_name = "idle",
133 .sched_data = NULL,
134
135 .pick_resource = sched_idle_res_pick,
136 .do_schedule = sched_idle_schedule,
137
138 .alloc_udata = sched_idle_alloc_udata,
139 .free_udata = sched_idle_free_udata,
140 };
141
142 static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit,
143 unsigned int cpu)
144 {
145 unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
146 const struct domain *d = unit->domain;
147
148 return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL;
149 }
150
151 static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit,
152 unsigned int cpu)
153 {
154 struct vcpu *v = unit2vcpu_cpu(unit, cpu);
155
156 return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu];
157 }
158
159 static inline struct scheduler *dom_scheduler(const struct domain *d)
160 {
161 if ( likely(d->cpupool != NULL) )
162 return d->cpupool->sched;
163
164 /*
165 * If d->cpupool is NULL, this is the idle domain. This is special
166 * because the idle domain does not really belong to any cpupool, and,
167 * hence, does not really have a scheduler.
168 *
169 * This is (should be!) only called like this for allocating the idle
170 * vCPUs for the first time, during boot, in which case what we want
171 * is the default scheduler, i.e. the one that has been chosen at boot.
172 */
173 ASSERT(is_idle_domain(d));
174 return &ops;
175 }
176
177 static inline struct scheduler *unit_scheduler(const struct sched_unit *unit)
178 {
179 const struct domain *d = unit->domain;
180
181 if ( likely(d->cpupool != NULL) )
182 return d->cpupool->sched;
183
184 /*
185 * If d->cpupool is NULL, this is a unit of the idle domain. And this
186 * case is special because the idle domain does not really belong to
187 * a cpupool and, hence, doesn't really have a scheduler. In fact, its
188 * units (may) run on pCPUs which are in different pools, with different
189 * schedulers.
190 *
191 * What we want, in this case, is the scheduler of the pCPU where this
192 * particular idle unit is running. And, since unit->res never changes
193 * for idle units, it is safe to use it, with no locks, to figure that out.
194 */
195
196 ASSERT(is_idle_domain(d));
197 return unit->res->scheduler;
198 }
199
200 static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
201 {
202 return unit_scheduler(v->sched_unit);
203 }
204 #define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain)
205
206 static inline void trace_runstate_change(const struct vcpu *v, int new_state)
207 {
208 struct { uint32_t vcpu:16, domain:16; } d;
209 uint32_t event;
210
211 if ( likely(!tb_init_done) )
212 return;
213
214 d.vcpu = v->vcpu_id;
215 d.domain = v->domain->domain_id;
216
217 event = TRC_SCHED_RUNSTATE_CHANGE;
218 event |= ( v->runstate.state & 0x3 ) << 8;
219 event |= ( new_state & 0x3 ) << 4;
220
221 __trace_var(event, 1/*tsc*/, sizeof(d), &d);
222 }
223
224 static inline void trace_continue_running(const struct vcpu *v)
225 {
226 struct { uint32_t vcpu:16, domain:16; } d;
227
228 if ( likely(!tb_init_done) )
229 return;
230
231 d.vcpu = v->vcpu_id;
232 d.domain = v->domain->domain_id;
233
234 __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
235 }
236
237 static inline void vcpu_urgent_count_update(struct vcpu *v)
238 {
239 if ( is_idle_vcpu(v) )
240 return;
241
242 if ( unlikely(v->is_urgent) )
243 {
244 if ( !(v->pause_flags & VPF_blocked) ||
245 !test_bit(v->vcpu_id, v->domain->poll_mask) )
246 {
247 v->is_urgent = 0;
248 atomic_dec(&per_cpu(sched_urgent_count, v->processor));
249 }
250 }
251 else
252 {
253 if ( unlikely(v->pause_flags & VPF_blocked) &&
254 unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
255 {
256 v->is_urgent = 1;
257 atomic_inc(&per_cpu(sched_urgent_count, v->processor));
258 }
259 }
260 }
261
262 static inline void vcpu_runstate_change(
263 struct vcpu *v, int new_state, s_time_t new_entry_time)
264 {
265 s_time_t delta;
266 struct sched_unit *unit = v->sched_unit;
267
268 ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
269 if ( v->runstate.state == new_state )
270 return;
271
272 vcpu_urgent_count_update(v);
273
274 trace_runstate_change(v, new_state);
275
276 if ( !is_idle_vcpu(v) )
277 {
278 unit->runstate_cnt[v->runstate.state]--;
279 unit->runstate_cnt[new_state]++;
280 }
281
282 delta = new_entry_time - v->runstate.state_entry_time;
283 if ( delta > 0 )
284 {
285 v->runstate.time[v->runstate.state] += delta;
286 v->runstate.state_entry_time = new_entry_time;
287 }
288
289 v->runstate.state = new_state;
290 }
291
292 void sched_guest_idle(void (*idle) (void), unsigned int cpu)
293 {
294 /*
295 * Another vcpu of the unit is active in guest context while this one is
296 * idle. In case of a scheduling event we don't want to have high latencies
297 * due to a cpu needing to wake up from deep C state for joining the
298 * rendezvous, so avoid those deep C states by incrementing the urgent
299 * count of the cpu.
300 */
301 atomic_inc(&per_cpu(sched_urgent_count, cpu));
302 idle();
303 atomic_dec(&per_cpu(sched_urgent_count, cpu));
304 }
305
306 void vcpu_runstate_get(const struct vcpu *v,
307 struct vcpu_runstate_info *runstate)
308 {
309 spinlock_t *lock;
310 s_time_t delta;
311 struct sched_unit *unit;
312
313 rcu_read_lock(&sched_res_rculock);
314
315 /*
316 * Be careful in case of an idle vcpu: the assignment to a unit might
317 * change even with the scheduling lock held, so be sure to use the
318 * correct unit for locking in order to avoid triggering an ASSERT() in
319 * the unlock function.
320 */
321 unit = is_idle_vcpu(v) ? get_sched_res(v->processor)->sched_unit_idle
322 : v->sched_unit;
323 lock = likely(v == current) ? NULL : unit_schedule_lock_irq(unit);
324 memcpy(runstate, &v->runstate, sizeof(*runstate));
325 delta = NOW() - runstate->state_entry_time;
326 if ( delta > 0 )
327 runstate->time[runstate->state] += delta;
328
329 if ( unlikely(lock != NULL) )
330 unit_schedule_unlock_irq(lock, unit);
331
332 rcu_read_unlock(&sched_res_rculock);
333 }
334
335 uint64_t get_cpu_idle_time(unsigned int cpu)
336 {
337 struct vcpu_runstate_info state = { 0 };
338 const struct vcpu *v = idle_vcpu[cpu];
339
340 if ( cpu_online(cpu) && v )
341 vcpu_runstate_get(v, &state);
342
343 return state.time[RUNSTATE_running];
344 }
345
346 /*
347 * If locks are different, take the one with the lower address first.
348 * This avoids dead- or live-locks when this code is running on both
349 * cpus at the same time.
350 */
351 static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
352 unsigned long *flags)
353 {
354 if ( lock1 == lock2 )
355 {
356 spin_lock_irqsave(lock1, *flags);
357 }
358 else if ( lock1 < lock2 )
359 {
360 spin_lock_irqsave(lock1, *flags);
361 spin_lock(lock2);
362 }
363 else
364 {
365 spin_lock_irqsave(lock2, *flags);
366 spin_lock(lock1);
367 }
368 }
369
370 static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
371 unsigned long flags)
372 {
373 if ( lock1 != lock2 )
374 spin_unlock(lock2);
375 spin_unlock_irqrestore(lock1, flags);
376 }
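/*
 * A minimal usage sketch for the two helpers above (this is the pattern
 * used by sched_unit_migrate_finish() further down): take both per-CPU
 * scheduler locks in a deadlock-free order, do the cross-CPU work, then
 * drop them again.
 *
 *   unsigned long flags;
 *   spinlock_t *l1 = get_sched_res(cpu1)->schedule_lock;
 *   spinlock_t *l2 = get_sched_res(cpu2)->schedule_lock;
 *
 *   sched_spin_lock_double(l1, l2, &flags);
 *   ... operate on scheduling state of both cpus ...
 *   sched_spin_unlock_double(l1, l2, flags);
 */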
377
378 static void sched_free_unit_mem(struct sched_unit *unit)
379 {
380 struct sched_unit *prev_unit;
381 struct domain *d = unit->domain;
382
383 if ( d->sched_unit_list == unit )
384 d->sched_unit_list = unit->next_in_list;
385 else
386 {
387 for_each_sched_unit ( d, prev_unit )
388 {
389 if ( prev_unit->next_in_list == unit )
390 {
391 prev_unit->next_in_list = unit->next_in_list;
392 break;
393 }
394 }
395 }
396
397 free_cpumask_var(unit->cpu_hard_affinity);
398 free_cpumask_var(unit->cpu_hard_affinity_saved);
399 free_cpumask_var(unit->cpu_soft_affinity);
400
401 xfree(unit);
402 }
403
404 static void sched_free_unit(struct sched_unit *unit, struct vcpu *v)
405 {
406 const struct vcpu *vunit;
407 unsigned int cnt = 0;
408
409 /* Don't count the to-be-released vcpu; it might not be in the vcpu list yet. */
410 for_each_sched_unit_vcpu ( unit, vunit )
411 if ( vunit != v )
412 cnt++;
413
414 v->sched_unit = NULL;
415 unit->runstate_cnt[v->runstate.state]--;
416
417 if ( unit->vcpu_list == v )
418 unit->vcpu_list = v->next_in_list;
419
420 if ( !cnt )
421 sched_free_unit_mem(unit);
422 }
423
424 static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
425 {
426 v->sched_unit = unit;
427
428 /* All but idle vcpus are allocated with sequential vcpu_id. */
429 if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
430 {
431 unit->vcpu_list = v;
432 /*
433 * unit_id is always the same as the lowest vcpu_id of the unit.
434 * This is used to stop the for_each_sched_unit_vcpu() loop and to
435 * support cpupools with different granularities.
436 */
437 unit->unit_id = v->vcpu_id;
438 }
439 unit->runstate_cnt[v->runstate.state]++;
440 }
441
442 static struct sched_unit *sched_alloc_unit_mem(void)
443 {
444 struct sched_unit *unit;
445
446 unit = xzalloc(struct sched_unit);
447 if ( !unit )
448 return NULL;
449
450 if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
451 !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
452 !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
453 {
454 sched_free_unit_mem(unit);
455 unit = NULL;
456 }
457
458 return unit;
459 }
460
461 static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d)
462 {
463 struct sched_unit **prev_unit;
464
465 unit->domain = d;
466
467 for ( prev_unit = &d->sched_unit_list; *prev_unit;
468 prev_unit = &(*prev_unit)->next_in_list )
469 if ( (*prev_unit)->next_in_list &&
470 (*prev_unit)->next_in_list->unit_id > unit->unit_id )
471 break;
472
473 unit->next_in_list = *prev_unit;
474 *prev_unit = unit;
475 }
476
477 static struct sched_unit *sched_alloc_unit(struct vcpu *v)
478 {
479 struct sched_unit *unit;
480 struct domain *d = v->domain;
481 unsigned int gran = cpupool_get_granularity(d->cpupool);
482
483 for_each_sched_unit ( d, unit )
484 if ( unit->unit_id / gran == v->vcpu_id / gran )
485 break;
486
487 if ( unit )
488 {
489 sched_unit_add_vcpu(unit, v);
490 return unit;
491 }
492
493 if ( (unit = sched_alloc_unit_mem()) == NULL )
494 return NULL;
495
496 sched_unit_add_vcpu(unit, v);
497 sched_domain_insert_unit(unit, d);
498
499 return unit;
500 }
501
502 static unsigned int sched_select_initial_cpu(const struct vcpu *v)
503 {
504 const struct domain *d = v->domain;
505 nodeid_t node;
506 spinlock_t *lock;
507 unsigned long flags;
508 unsigned int cpu_ret, cpu = smp_processor_id();
509 cpumask_t *cpus = cpumask_scratch_cpu(cpu);
510
511 lock = pcpu_schedule_lock_irqsave(cpu, &flags);
512 cpumask_clear(cpus);
513 for_each_node_mask ( node, d->node_affinity )
514 cpumask_or(cpus, cpus, &node_to_cpumask(node));
515 cpumask_and(cpus, cpus, d->cpupool->cpu_valid);
516 if ( cpumask_empty(cpus) )
517 cpumask_copy(cpus, d->cpupool->cpu_valid);
518
519 if ( v->vcpu_id == 0 )
520 cpu_ret = cpumask_first(cpus);
521 else
522 {
523 /* We can rely on previous vcpu being available. */
524 ASSERT(!is_idle_domain(d));
525
526 cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus);
527 }
528
529 pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
530
531 return cpu_ret;
532 }
533
534 int sched_init_vcpu(struct vcpu *v)
535 {
536 const struct domain *d = v->domain;
537 struct sched_unit *unit;
538 unsigned int processor;
539
540 if ( (unit = sched_alloc_unit(v)) == NULL )
541 return 1;
542
543 if ( is_idle_domain(d) )
544 processor = v->vcpu_id;
545 else
546 processor = sched_select_initial_cpu(v);
547
548 /* Initialise the per-vcpu timers. */
549 spin_lock_init(&v->periodic_timer_lock);
550 init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor);
551 init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor);
552 init_timer(&v->poll_timer, poll_timer_fn, v, processor);
553
554 /* If this is not the first vcpu of the unit we are done. */
555 if ( unit->priv != NULL )
556 {
557 v->processor = processor;
558 return 0;
559 }
560
561 rcu_read_lock(&sched_res_rculock);
562
563 /* The first vcpu of a unit can be set via sched_set_res(). */
564 sched_set_res(unit, get_sched_res(processor));
565
566 unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv);
567 if ( unit->priv == NULL )
568 {
569 sched_free_unit(unit, v);
570 rcu_read_unlock(&sched_res_rculock);
571 return 1;
572 }
573
574 /*
575 * Initialize affinity settings. The idler, and potentially
576 * domain-0 VCPUs, are pinned onto their respective physical CPUs.
577 */
578 if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) )
579 sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
580 else
581 sched_set_affinity(unit, &cpumask_all, &cpumask_all);
582
583 /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
584 if ( is_idle_domain(d) )
585 {
586 get_sched_res(v->processor)->curr = unit;
587 get_sched_res(v->processor)->sched_unit_idle = unit;
588 v->is_running = true;
589 unit->is_running = true;
590 unit->state_entry_time = NOW();
591 }
592 else
593 {
594 sched_insert_unit(dom_scheduler(d), unit);
595 }
596
597 rcu_read_unlock(&sched_res_rculock);
598
599 return 0;
600 }
601
602 static void vcpu_move_irqs(struct vcpu *v)
603 {
604 arch_move_irqs(v);
605 evtchn_move_pirqs(v);
606 }
607
608 static void sched_move_irqs(const struct sched_unit *unit)
609 {
610 struct vcpu *v;
611
612 for_each_sched_unit_vcpu ( unit, v )
613 vcpu_move_irqs(v);
614 }
615
616 int sched_move_domain(struct domain *d, struct cpupool *c)
617 {
618 struct vcpu *v;
619 struct sched_unit *unit;
620 unsigned int new_p, unit_idx;
621 void **unit_priv;
622 void *domdata;
623 void *unitdata;
624 struct scheduler *old_ops;
625 void *old_domdata;
626 unsigned int gran = cpupool_get_granularity(c);
627 int ret = 0;
628
629 for_each_vcpu ( d, v )
630 {
631 if ( v->affinity_broken )
632 return -EBUSY;
633 }
634
635 rcu_read_lock(&sched_res_rculock);
636
637 domdata = sched_alloc_domdata(c->sched, d);
638 if ( IS_ERR(domdata) )
639 {
640 ret = PTR_ERR(domdata);
641 goto out;
642 }
643
644 unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran));
645 if ( unit_priv == NULL )
646 {
647 sched_free_domdata(c->sched, domdata);
648 ret = -ENOMEM;
649 goto out;
650 }
651
652 unit_idx = 0;
653 for_each_sched_unit ( d, unit )
654 {
655 unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata);
656 if ( unit_priv[unit_idx] == NULL )
657 {
658 for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
659 sched_free_udata(c->sched, unit_priv[unit_idx]);
660 xfree(unit_priv);
661 sched_free_domdata(c->sched, domdata);
662 ret = -ENOMEM;
663 goto out;
664 }
665 unit_idx++;
666 }
667
668 domain_pause(d);
669
670 old_ops = dom_scheduler(d);
671 old_domdata = d->sched_priv;
672
673 for_each_sched_unit ( d, unit )
674 {
675 sched_remove_unit(old_ops, unit);
676 }
677
678 d->cpupool = c;
679 d->sched_priv = domdata;
680
681 new_p = cpumask_first(c->cpu_valid);
682 unit_idx = 0;
683 for_each_sched_unit ( d, unit )
684 {
685 spinlock_t *lock;
686 unsigned int unit_p = new_p;
687
688 unitdata = unit->priv;
689 unit->priv = unit_priv[unit_idx];
690
691 for_each_sched_unit_vcpu ( unit, v )
692 {
693 migrate_timer(&v->periodic_timer, new_p);
694 migrate_timer(&v->singleshot_timer, new_p);
695 migrate_timer(&v->poll_timer, new_p);
696 new_p = cpumask_cycle(new_p, c->cpu_valid);
697 }
698
699 lock = unit_schedule_lock_irq(unit);
700
701 sched_set_affinity(unit, &cpumask_all, &cpumask_all);
702
703 sched_set_res(unit, get_sched_res(unit_p));
704 /*
705 * With v->processor modified we must not
706 * - make any further changes assuming we hold the scheduler lock,
707 * - use unit_schedule_unlock_irq().
708 */
709 spin_unlock_irq(lock);
710
711 if ( !d->is_dying )
712 sched_move_irqs(unit);
713
714 sched_insert_unit(c->sched, unit);
715
716 sched_free_udata(old_ops, unitdata);
717
718 unit_idx++;
719 }
720
721 domain_update_node_affinity(d);
722
723 domain_unpause(d);
724
725 sched_free_domdata(old_ops, old_domdata);
726
727 xfree(unit_priv);
728
729 out:
730 rcu_read_unlock(&sched_res_rculock);
731
732 return ret;
733 }
734
735 void sched_destroy_vcpu(struct vcpu *v)
736 {
737 struct sched_unit *unit = v->sched_unit;
738
739 kill_timer(&v->periodic_timer);
740 kill_timer(&v->singleshot_timer);
741 kill_timer(&v->poll_timer);
742 if ( test_and_clear_bool(v->is_urgent) )
743 atomic_dec(&per_cpu(sched_urgent_count, v->processor));
744 /*
745 * Vcpus are being destroyed top-down. So being the first vcpu of a unit
746 * is the same as being the only one.
747 */
748 if ( unit->vcpu_list == v )
749 {
750 rcu_read_lock(&sched_res_rculock);
751
752 sched_remove_unit(vcpu_scheduler(v), unit);
753 sched_free_udata(vcpu_scheduler(v), unit->priv);
754 sched_free_unit(unit, v);
755
756 rcu_read_unlock(&sched_res_rculock);
757 }
758 }
759
760 int sched_init_domain(struct domain *d, int poolid)
761 {
762 void *sdom;
763 int ret;
764
765 ASSERT(d->cpupool == NULL);
766 ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
767
768 if ( (ret = cpupool_add_domain(d, poolid)) )
769 return ret;
770
771 SCHED_STAT_CRANK(dom_init);
772 TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
773
774 rcu_read_lock(&sched_res_rculock);
775
776 sdom = sched_alloc_domdata(dom_scheduler(d), d);
777
778 rcu_read_unlock(&sched_res_rculock);
779
780 if ( IS_ERR(sdom) )
781 return PTR_ERR(sdom);
782
783 d->sched_priv = sdom;
784
785 return 0;
786 }
787
788 void sched_destroy_domain(struct domain *d)
789 {
790 ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
791
792 if ( d->cpupool )
793 {
794 SCHED_STAT_CRANK(dom_destroy);
795 TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
796
797 rcu_read_lock(&sched_res_rculock);
798
799 sched_free_domdata(dom_scheduler(d), d->sched_priv);
800 d->sched_priv = NULL;
801
802 rcu_read_unlock(&sched_res_rculock);
803
804 cpupool_rm_domain(d);
805 }
806 }
807
808 static void vcpu_sleep_nosync_locked(struct vcpu *v)
809 {
810 struct sched_unit *unit = v->sched_unit;
811
812 ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
813
814 if ( likely(!vcpu_runnable(v)) )
815 {
816 if ( v->runstate.state == RUNSTATE_runnable )
817 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
818
819 /* Only put the unit to sleep if none of its vcpus is runnable. */
820 if ( likely(!unit_runnable(unit)) )
821 sched_sleep(unit_scheduler(unit), unit);
822 else if ( unit_running(unit) > 1 && v->is_running &&
823 !v->force_context_switch )
824 {
825 v->force_context_switch = true;
826 cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
827 }
828 }
829 }
830
831 void vcpu_sleep_nosync(struct vcpu *v)
832 {
833 unsigned long flags;
834 spinlock_t *lock;
835
836 TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
837
838 rcu_read_lock(&sched_res_rculock);
839
840 lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
841
842 vcpu_sleep_nosync_locked(v);
843
844 unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit);
845
846 rcu_read_unlock(&sched_res_rculock);
847 }
848
849 void vcpu_sleep_sync(struct vcpu *v)
850 {
851 vcpu_sleep_nosync(v);
852
853 while ( !vcpu_runnable(v) && v->is_running )
854 cpu_relax();
855
856 sync_vcpu_execstate(v);
857 }
858
859 void vcpu_wake(struct vcpu *v)
860 {
861 unsigned long flags;
862 spinlock_t *lock;
863 struct sched_unit *unit = v->sched_unit;
864
865 TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
866
867 rcu_read_lock(&sched_res_rculock);
868
869 lock = unit_schedule_lock_irqsave(unit, &flags);
870
871 if ( likely(vcpu_runnable(v)) )
872 {
873 if ( v->runstate.state >= RUNSTATE_blocked )
874 vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
875 /*
876 * Call sched_wake() unconditionally, even if unit is running already.
877 * We might have not been de-scheduled after vcpu_sleep_nosync_locked()
878 * and are now to be woken up again.
879 */
880 sched_wake(unit_scheduler(unit), unit);
881 if ( unit->is_running && !v->is_running && !v->force_context_switch )
882 {
883 v->force_context_switch = true;
884 cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
885 }
886 }
887 else if ( !(v->pause_flags & VPF_blocked) )
888 {
889 if ( v->runstate.state == RUNSTATE_blocked )
890 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
891 }
892
893 unit_schedule_unlock_irqrestore(lock, flags, unit);
894
895 rcu_read_unlock(&sched_res_rculock);
896 }
897
898 void vcpu_unblock(struct vcpu *v)
899 {
900 if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
901 return;
902
903 /* Polling period ends when a VCPU is unblocked. */
904 if ( unlikely(v->poll_evtchn != 0) )
905 {
906 v->poll_evtchn = 0;
907 /*
908 * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
909 * this VCPU (and it then going back to sleep on poll_mask).
910 * Test-and-clear is idiomatic and ensures clear_bit not reordered.
911 */
912 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
913 clear_bit(_VPF_blocked, &v->pause_flags);
914 }
915
916 vcpu_wake(v);
917 }
918
919 /*
920 * Do the actual movement of a unit from old to new CPU. Locks for *both*
921 * CPUs need to have been taken already when calling this!
922 */
923 static void sched_unit_move_locked(struct sched_unit *unit,
924 unsigned int new_cpu)
925 {
926 unsigned int old_cpu = unit->res->master_cpu;
927 const struct vcpu *v;
928
929 rcu_read_lock(&sched_res_rculock);
930
931 /*
932 * Transfer urgency status to new CPU before switching CPUs, as
933 * once the switch occurs, v->is_urgent is no longer protected by
934 * the per-CPU scheduler lock we are holding.
935 */
936 for_each_sched_unit_vcpu ( unit, v )
937 {
938 if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
939 {
940 atomic_inc(&per_cpu(sched_urgent_count, new_cpu));
941 atomic_dec(&per_cpu(sched_urgent_count, old_cpu));
942 }
943 }
944
945 /*
946 * Actual CPU switch to new CPU. This is safe because the lock
947 * pointer can't change while the current lock is held.
948 */
949 sched_migrate(unit_scheduler(unit), unit, new_cpu);
950
951 rcu_read_unlock(&sched_res_rculock);
952 }
953
954 /*
955 * Initiating migration
956 *
957 * In order to migrate, we need the unit in question to have stopped
958 * running and have called sched_sleep() (to take it off any
959 * runqueues, for instance); and if it is currently running, it needs
960 * to be scheduled out. Finally, we need to hold the scheduling locks
961 * for both the processor we're migrating from, and the processor
962 * we're migrating to.
963 *
964 * In order to avoid deadlock while satisfying the final requirement,
965 * we must release any scheduling lock we hold, then try to grab both
966 * locks we want, then double-check to make sure that what we started
967 * to do hasn't been changed in the mean time.
968 *
969 * These steps are encapsulated in the following two functions; they
970 * should be called like this:
971 *
972 * lock = unit_schedule_lock_irq(unit);
973 * sched_unit_migrate_start(unit);
974 * unit_schedule_unlock_irq(lock, unit)
975 * sched_unit_migrate_finish(unit);
976 *
977 * sched_unit_migrate_finish() will do the work now if it can, or simply
978 * return if it can't (because unit is still running); in that case
979 * sched_unit_migrate_finish() will be called by unit_context_saved().
980 */
981 static void sched_unit_migrate_start(struct sched_unit *unit)
982 {
983 struct vcpu *v;
984
985 for_each_sched_unit_vcpu ( unit, v )
986 {
987 set_bit(_VPF_migrating, &v->pause_flags);
988 vcpu_sleep_nosync_locked(v);
989 }
990 }
991
992 static void sched_unit_migrate_finish(struct sched_unit *unit)
993 {
994 unsigned long flags;
995 unsigned int old_cpu, new_cpu;
996 spinlock_t *old_lock, *new_lock;
997 bool pick_called = false;
998 struct vcpu *v;
999
1000 /*
1001 * If the unit is currently running, this will be handled by
1002 * unit_context_saved(); and in any case, if the bit is cleared, then
1003 * someone else has already done the work so we don't need to.
1004 */
1005 if ( unit->is_running )
1006 return;
1007 for_each_sched_unit_vcpu ( unit, v )
1008 if ( !test_bit(_VPF_migrating, &v->pause_flags) )
1009 return;
1010
1011 old_cpu = new_cpu = unit->res->master_cpu;
1012 for ( ; ; )
1013 {
1014 /*
1015 * We need another iteration if the pre-calculated lock addresses
1016 * are no longer correct after re-evaluating the old and new cpu while
1017 * holding the locks.
1018 */
1019 old_lock = get_sched_res(old_cpu)->schedule_lock;
1020 new_lock = get_sched_res(new_cpu)->schedule_lock;
1021
1022 sched_spin_lock_double(old_lock, new_lock, &flags);
1023
1024 old_cpu = unit->res->master_cpu;
1025 if ( old_lock == get_sched_res(old_cpu)->schedule_lock )
1026 {
1027 /*
1028 * If we selected a CPU on the previous iteration, check if it
1029 * remains suitable for running this vCPU.
1030 */
1031 if ( pick_called &&
1032 (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
1033 cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) &&
1034 cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
1035 break;
1036
1037 /* Select a new CPU. */
1038 new_cpu = sched_pick_resource(unit_scheduler(unit),
1039 unit)->master_cpu;
1040 if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
1041 cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
1042 break;
1043 pick_called = true;
1044 }
1045 else
1046 {
1047 /*
1048 * We do not hold the scheduler lock appropriate for this vCPU.
1049 * Thus we cannot select a new CPU on this iteration. Try again.
1050 */
1051 pick_called = false;
1052 }
1053
1054 sched_spin_unlock_double(old_lock, new_lock, flags);
1055 }
1056
1057 /*
1058 * NB. Check of v->running happens /after/ setting migration flag
1059 * because they both happen in (different) spinlock regions, and those
1060 * regions are strictly serialised.
1061 */
1062 if ( unit->is_running )
1063 {
1064 sched_spin_unlock_double(old_lock, new_lock, flags);
1065 return;
1066 }
1067 for_each_sched_unit_vcpu ( unit, v )
1068 {
1069 if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
1070 {
1071 sched_spin_unlock_double(old_lock, new_lock, flags);
1072 return;
1073 }
1074 }
1075
1076 sched_unit_move_locked(unit, new_cpu);
1077
1078 sched_spin_unlock_double(old_lock, new_lock, flags);
1079
1080 if ( old_cpu != new_cpu )
1081 sched_move_irqs(unit);
1082
1083 /* Wake on new CPU. */
1084 for_each_sched_unit_vcpu ( unit, v )
1085 vcpu_wake(v);
1086 }
1087
1088 static bool sched_check_affinity_broken(const struct sched_unit *unit)
1089 {
1090 const struct vcpu *v;
1091
1092 for_each_sched_unit_vcpu ( unit, v )
1093 if ( v->affinity_broken )
1094 return true;
1095
1096 return false;
1097 }
1098
1099 static void sched_reset_affinity_broken(const struct sched_unit *unit)
1100 {
1101 struct vcpu *v;
1102
1103 for_each_sched_unit_vcpu ( unit, v )
1104 v->affinity_broken = false;
1105 }
1106
1107 void restore_vcpu_affinity(struct domain *d)
1108 {
1109 unsigned int cpu = smp_processor_id();
1110 struct sched_unit *unit;
1111
1112 ASSERT(system_state == SYS_STATE_resume);
1113
1114 rcu_read_lock(&sched_res_rculock);
1115
1116 for_each_sched_unit ( d, unit )
1117 {
1118 spinlock_t *lock;
1119 unsigned int old_cpu = sched_unit_master(unit);
1120 struct sched_resource *res;
1121
1122 ASSERT(!unit_runnable(unit));
1123
1124 /*
1125 * Re-assign the initial processor as after resume we have no
1126 * guarantee the old processor has come back to life again.
1127 *
1128 * Therefore, here, before actually unpausing the domains, we should
1129 * set v->processor of each of their vCPUs to something that will
1130 * make sense for the scheduler of the cpupool they are in.
1131 */
1132 lock = unit_schedule_lock_irq(unit);
1133
1134 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
1135 cpupool_domain_master_cpumask(d));
1136 if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
1137 {
1138 if ( sched_check_affinity_broken(unit) )
1139 {
1140 sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
1141 sched_reset_affinity_broken(unit);
1142 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
1143 cpupool_domain_master_cpumask(d));
1144 }
1145
1146 if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
1147 {
1148 /* Affinity settings of one vcpu are for the complete unit. */
1149 printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
1150 unit->vcpu_list);
1151 sched_set_affinity(unit, &cpumask_all, NULL);
1152 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
1153 cpupool_domain_master_cpumask(d));
1154 }
1155 }
1156
1157 res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu)));
1158 sched_set_res(unit, res);
1159
1160 spin_unlock_irq(lock);
1161
1162 /* v->processor might have changed, so reacquire the lock. */
1163 lock = unit_schedule_lock_irq(unit);
1164 res = sched_pick_resource(unit_scheduler(unit), unit);
1165 sched_set_res(unit, res);
1166 spin_unlock_irq(lock);
1167
1168 if ( old_cpu != sched_unit_master(unit) )
1169 sched_move_irqs(unit);
1170 }
1171
1172 rcu_read_unlock(&sched_res_rculock);
1173
1174 domain_update_node_affinity(d);
1175 }
1176
1177 /*
1178 * This function is used by cpu_hotplug code via cpu notifier chain
1179 * and from cpupools to switch schedulers on a cpu.
1180 * Caller must get domlist_read_lock.
1181 */
1182 int cpu_disable_scheduler(unsigned int cpu)
1183 {
1184 struct domain *d;
1185 const struct cpupool *c;
1186 int ret = 0;
1187
1188 rcu_read_lock(&sched_res_rculock);
1189
1190 c = get_sched_res(cpu)->cpupool;
1191 if ( c == NULL )
1192 goto out;
1193
1194 for_each_domain_in_cpupool ( d, c )
1195 {
1196 struct sched_unit *unit;
1197
1198 for_each_sched_unit ( d, unit )
1199 {
1200 unsigned long flags;
1201 spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags);
1202
1203 if ( !cpumask_intersects(unit->cpu_hard_affinity, c->cpu_valid) &&
1204 cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
1205 {
1206 if ( sched_check_affinity_broken(unit) )
1207 {
1208 /* The unit is temporarily pinned, can't move it. */
1209 unit_schedule_unlock_irqrestore(lock, flags, unit);
1210 ret = -EADDRINUSE;
1211 break;
1212 }
1213
1214 printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
1215 unit->vcpu_list);
1216
1217 sched_set_affinity(unit, &cpumask_all, NULL);
1218 }
1219
1220 if ( unit->res != get_sched_res(cpu) )
1221 {
1222 /* The unit is not on this cpu, so we can move on. */
1223 unit_schedule_unlock_irqrestore(lock, flags, unit);
1224 continue;
1225 }
1226
1227 /* If it is on this cpu, we must send it away.
1228 * We are doing some cpupool manipulations:
1229 * we want to call the scheduler, and let it re-evaluate
1230 * the placement of the vcpu, taking into account the new
1231 * cpupool configuration;
1232 * * the scheduler will always find a suitable solution, or
1233 * things would have failed before getting in here.
1234 */
1235 sched_unit_migrate_start(unit);
1236 unit_schedule_unlock_irqrestore(lock, flags, unit);
1237 sched_unit_migrate_finish(unit);
1238
1239 /*
1240 * The only caveat, in this case, is a vcpu active in the
1241 * hypervisor that isn't migratable. In that case, the caller
1242 * should try again after releasing and reacquiring all locks.
1243 */
1244 if ( unit->res == get_sched_res(cpu) )
1245 ret = -EAGAIN;
1246 }
1247 }
1248
1249 out:
1250 rcu_read_unlock(&sched_res_rculock);
1251
1252 return ret;
1253 }
1254
1255 static int cpu_disable_scheduler_check(unsigned int cpu)
1256 {
1257 struct domain *d;
1258 const struct vcpu *v;
1259 const struct cpupool *c;
1260
1261 c = get_sched_res(cpu)->cpupool;
1262 if ( c == NULL )
1263 return 0;
1264
1265 for_each_domain_in_cpupool ( d, c )
1266 for_each_vcpu ( d, v )
1267 if ( v->affinity_broken )
1268 return -EADDRINUSE;
1269
1270 return 0;
1271 }
1272
1273 /*
1274 * In general, this must be called with the scheduler lock held, because the
1275 * adjust_affinity hook may want to modify the vCPU state. However, when the
1276 * vCPU is being initialized (either for dom0 or domU) there is no risk of
1277 * races, and it's fine to not take the lock (we're talking about
1278 * sched_setup_dom0_vcpus() and sched_init_vcpu()).
1279 */
1280 static void sched_set_affinity(
1281 struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft)
1282 {
1283 rcu_read_lock(&sched_res_rculock);
1284 sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft);
1285 rcu_read_unlock(&sched_res_rculock);
1286
1287 if ( hard )
1288 cpumask_copy(unit->cpu_hard_affinity, hard);
1289 if ( soft )
1290 cpumask_copy(unit->cpu_soft_affinity, soft);
1291
1292 unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity,
1293 unit->cpu_soft_affinity) &&
1294 cpumask_intersects(unit->cpu_soft_affinity,
1295 unit->cpu_hard_affinity);
1296 }
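/*
 * Note on soft_aff_effective above: soft affinity only matters if it
 * actually narrows the choice, so it is considered ineffective both when
 * the hard affinity is already a subset of the soft one (soft adds no
 * information) and when soft and hard do not intersect (soft cannot be
 * honoured at all).
 */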
1297
1298 static int vcpu_set_affinity(
1299 struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which)
1300 {
1301 struct sched_unit *unit = v->sched_unit;
1302 spinlock_t *lock;
1303 int ret = 0;
1304
1305 rcu_read_lock(&sched_res_rculock);
1306
1307 lock = unit_schedule_lock_irq(unit);
1308
1309 if ( v->affinity_broken )
1310 ret = -EBUSY;
1311 else
1312 {
1313 /*
1314 * Tell the scheduler we changed something about affinity,
1315 * and ask to re-evaluate vcpu placement.
1316 */
1317 if ( which == unit->cpu_hard_affinity )
1318 {
1319 sched_set_affinity(unit, affinity, NULL);
1320 }
1321 else
1322 {
1323 ASSERT(which == unit->cpu_soft_affinity);
1324 sched_set_affinity(unit, NULL, affinity);
1325 }
1326 sched_unit_migrate_start(unit);
1327 }
1328
1329 unit_schedule_unlock_irq(lock, unit);
1330
1331 domain_update_node_affinity(v->domain);
1332
1333 sched_unit_migrate_finish(unit);
1334
1335 rcu_read_unlock(&sched_res_rculock);
1336
1337 return ret;
1338 }
1339
1340 int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
1341 {
1342 cpumask_t *online;
1343
1344 online = VCPU2ONLINE(v);
1345 if ( !cpumask_intersects(online, affinity) )
1346 return -EINVAL;
1347
1348 return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity);
1349 }
1350
1351 static int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
1352 {
1353 return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity);
1354 }
1355
1356 /* Block the currently-executing domain until a pertinent event occurs. */
1357 void vcpu_block(void)
1358 {
1359 struct vcpu *v = current;
1360
1361 set_bit(_VPF_blocked, &v->pause_flags);
1362
1363 arch_vcpu_block(v);
1364
1365 /* Check for events /after/ blocking: avoids wakeup waiting race. */
1366 if ( local_events_need_delivery() )
1367 {
1368 clear_bit(_VPF_blocked, &v->pause_flags);
1369 }
1370 else
1371 {
1372 TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
1373 raise_softirq(SCHEDULE_SOFTIRQ);
1374 }
1375 }
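/*
 * Sketch of the race the ordering above avoids: _VPF_blocked is set
 * *before* checking for pending events, so either the check here sees an
 * event that arrived early and clears the flag again, or a later
 * vcpu_unblock() sees the flag set and wakes us. A wakeup can therefore
 * not be lost between deciding to block and actually descheduling.
 */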
1376
1377 static void vcpu_block_enable_events(void)
1378 {
1379 local_event_delivery_enable();
1380 vcpu_block();
1381 }
1382
1383 static long do_poll(struct sched_poll *sched_poll)
1384 {
1385 struct vcpu *v = current;
1386 struct domain *d = v->domain;
1387 evtchn_port_t port = 0;
1388 long rc;
1389 unsigned int i;
1390
1391 /* Fairly arbitrary limit. */
1392 if ( sched_poll->nr_ports > 128 )
1393 return -EINVAL;
1394
1395 if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
1396 return -EFAULT;
1397
1398 set_bit(_VPF_blocked, &v->pause_flags);
1399 v->poll_evtchn = -1;
1400 set_bit(v->vcpu_id, d->poll_mask);
1401
1402 arch_vcpu_block(v);
1403
1404 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
1405 /* Check for events /after/ setting flags: avoids wakeup waiting race. */
1406 smp_mb();
1407
1408 /*
1409 * Someone may have seen we are blocked but not that we are polling, or
1410 * vice versa. We are certainly being woken, so clean up and bail. Beyond
1411 * this point others can be guaranteed to clean up for us if they wake us.
1412 */
1413 rc = 0;
1414 if ( (v->poll_evtchn == 0) ||
1415 !test_bit(_VPF_blocked, &v->pause_flags) ||
1416 !test_bit(v->vcpu_id, d->poll_mask) )
1417 goto out;
1418 #endif
1419
1420 rc = 0;
1421 if ( local_events_need_delivery() )
1422 goto out;
1423
1424 for ( i = 0; i < sched_poll->nr_ports; i++ )
1425 {
1426 rc = -EFAULT;
1427 if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
1428 goto out;
1429
1430 rc = -EINVAL;
1431 if ( !port_is_valid(d, port) )
1432 goto out;
1433
1434 rc = 0;
1435 if ( evtchn_port_is_pending(d, port) )
1436 goto out;
1437 }
1438
1439 if ( sched_poll->nr_ports == 1 )
1440 v->poll_evtchn = port;
1441
1442 if ( sched_poll->timeout != 0 )
1443 set_timer(&v->poll_timer, sched_poll->timeout);
1444
1445 TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
1446 raise_softirq(SCHEDULE_SOFTIRQ);
1447
1448 return 0;
1449
1450 out:
1451 v->poll_evtchn = 0;
1452 clear_bit(v->vcpu_id, d->poll_mask);
1453 clear_bit(_VPF_blocked, &v->pause_flags);
1454 return rc;
1455 }
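/*
 * A guest-side sketch of SCHEDOP_poll as handled above (the port number
 * and the use of a single port are illustrative; see public/sched.h for
 * the authoritative struct sched_poll layout):
 *
 *   evtchn_port_t port = my_port;               // hypothetical port
 *   struct sched_poll poll = {
 *       .nr_ports = 1,
 *       .timeout  = 0,                          // 0 == wait until an event
 *   };
 *   set_xen_guest_handle(poll.ports, &port);
 *   HYPERVISOR_sched_op(SCHEDOP_poll, &poll);
 */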
1456
1457 /* Voluntarily yield the processor for this allocation. */
1458 long vcpu_yield(void)
1459 {
1460 struct vcpu * v=current;
1461 spinlock_t *lock;
1462
1463 rcu_read_lock(&sched_res_rculock);
1464
1465 lock = unit_schedule_lock_irq(v->sched_unit);
1466 sched_yield(vcpu_scheduler(v), v->sched_unit);
1467 unit_schedule_unlock_irq(lock, v->sched_unit);
1468
1469 rcu_read_unlock(&sched_res_rculock);
1470
1471 SCHED_STAT_CRANK(vcpu_yield);
1472
1473 TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
1474 raise_softirq(SCHEDULE_SOFTIRQ);
1475 return 0;
1476 }
1477
1478 static void domain_watchdog_timeout(void *data)
1479 {
1480 struct domain *d = data;
1481
1482 if ( d->is_shutting_down || d->is_dying )
1483 return;
1484
1485 printk("Watchdog timer fired for domain %u\n", d->domain_id);
1486 domain_shutdown(d, SHUTDOWN_watchdog);
1487 }
1488
1489 static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
1490 {
1491 if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
1492 return -EINVAL;
1493
1494 spin_lock(&d->watchdog_lock);
1495
1496 if ( id == 0 )
1497 {
1498 for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
1499 {
1500 if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
1501 continue;
1502 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1503 break;
1504 }
1505 spin_unlock(&d->watchdog_lock);
1506 return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1;
1507 }
1508
1509 id -= 1;
1510 if ( !test_bit(id, &d->watchdog_inuse_map) )
1511 {
1512 spin_unlock(&d->watchdog_lock);
1513 return -EINVAL;
1514 }
1515
1516 if ( timeout == 0 )
1517 {
1518 stop_timer(&d->watchdog_timer[id]);
1519 clear_bit(id, &d->watchdog_inuse_map);
1520 }
1521 else
1522 {
1523 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1524 }
1525
1526 spin_unlock(&d->watchdog_lock);
1527 return 0;
1528 }
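/*
 * Usage sketch for the watchdog interface above (reached via
 * SCHEDOP_watchdog below): id == 0 allocates a free slot and returns its
 * handle (slot index + 1), a non-zero timeout (in seconds) re-arms that
 * slot, and timeout == 0 releases it again.
 *
 *   struct sched_watchdog wd = { .id = 0, .timeout = 30 };
 *   id = HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);   // allocate, 30s
 *   wd.id = id;
 *   HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);        // kick it
 *   wd.timeout = 0;
 *   HYPERVISOR_sched_op(SCHEDOP_watchdog, &wd);        // release the slot
 */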
1529
1530 void watchdog_domain_init(struct domain *d)
1531 {
1532 unsigned int i;
1533
1534 spin_lock_init(&d->watchdog_lock);
1535
1536 d->watchdog_inuse_map = 0;
1537
1538 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1539 init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
1540 }
1541
1542 void watchdog_domain_destroy(struct domain *d)
1543 {
1544 unsigned int i;
1545
1546 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1547 kill_timer(&d->watchdog_timer[i]);
1548 }
1549
1550 /*
1551 * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if
1552 * cpu is NR_CPUS).
1553 * Temporary pinning can be done for two reasons, which may be nested:
1554 * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case
1555 * of a conflict (e.g. the cpupool doesn't include the requested CPU, or
1556 * another conflicting temporary pinning is already in effect).
1557 * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the
1558 * CPU it is just running on. Can't fail if used properly.
1559 */
1560 int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason)
1561 {
1562 struct sched_unit *unit = v->sched_unit;
1563 spinlock_t *lock;
1564 int ret = -EINVAL;
1565 bool migrate;
1566
1567 rcu_read_lock(&sched_res_rculock);
1568
1569 lock = unit_schedule_lock_irq(unit);
1570
1571 if ( cpu == NR_CPUS )
1572 {
1573 if ( v->affinity_broken & reason )
1574 {
1575 ret = 0;
1576 v->affinity_broken &= ~reason;
1577 }
1578 if ( !ret && !sched_check_affinity_broken(unit) )
1579 sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
1580 }
1581 else if ( cpu < nr_cpu_ids )
1582 {
1583 if ( (v->affinity_broken & reason) ||
1584 (sched_check_affinity_broken(unit) && v->processor != cpu) )
1585 ret = -EBUSY;
1586 else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
1587 {
1588 if ( !sched_check_affinity_broken(unit) )
1589 {
1590 cpumask_copy(unit->cpu_hard_affinity_saved,
1591 unit->cpu_hard_affinity);
1592 sched_set_affinity(unit, cpumask_of(cpu), NULL);
1593 }
1594 v->affinity_broken |= reason;
1595 ret = 0;
1596 }
1597 }
1598
1599 migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity);
1600 if ( migrate )
1601 sched_unit_migrate_start(unit);
1602
1603 unit_schedule_unlock_irq(lock, unit);
1604
1605 if ( migrate )
1606 sched_unit_migrate_finish(unit);
1607
1608 rcu_read_unlock(&sched_res_rculock);
1609
1610 return ret;
1611 }
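/*
 * A sketch of the VCPU_AFFINITY_WAIT usage described in the comment above
 * (this is the wait_event() style pattern; the exact caller is not shown
 * here): pin the current vcpu to the cpu it runs on, then undo the pinning
 * by passing NR_CPUS.
 *
 *   vcpu_temporary_affinity(current, smp_processor_id(), VCPU_AFFINITY_WAIT);
 *   ... code that must keep running on this cpu ...
 *   vcpu_temporary_affinity(current, NR_CPUS, VCPU_AFFINITY_WAIT);
 */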
1612
1613 static inline
1614 int vcpuaffinity_params_invalid(const struct xen_domctl_vcpuaffinity *vcpuaff)
1615 {
1616 return vcpuaff->flags == 0 ||
1617 ((vcpuaff->flags & XEN_VCPUAFFINITY_HARD) &&
1618 guest_handle_is_null(vcpuaff->cpumap_hard.bitmap)) ||
1619 ((vcpuaff->flags & XEN_VCPUAFFINITY_SOFT) &&
1620 guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
1621 }
1622
1623 int vcpu_affinity_domctl(struct domain *d, uint32_t cmd,
1624 struct xen_domctl_vcpuaffinity *vcpuaff)
1625 {
1626 struct vcpu *v;
1627 const struct sched_unit *unit;
1628 int ret = 0;
1629
1630 if ( vcpuaff->vcpu >= d->max_vcpus )
1631 return -EINVAL;
1632
1633 if ( (v = d->vcpu[vcpuaff->vcpu]) == NULL )
1634 return -ESRCH;
1635
1636 if ( vcpuaffinity_params_invalid(vcpuaff) )
1637 return -EINVAL;
1638
1639 unit = v->sched_unit;
1640
1641 if ( cmd == XEN_DOMCTL_setvcpuaffinity )
1642 {
1643 cpumask_var_t new_affinity, old_affinity;
1644 cpumask_t *online = cpupool_domain_master_cpumask(v->domain);
1645
1646 /*
1647 * We want to be able to restore hard affinity if we are trying to
1648 * set both, and changing soft affinity (which happens later, when
1649 * hard affinity has already been successfully changed) fails.
1650 */
1651 if ( !alloc_cpumask_var(&old_affinity) )
1652 return -ENOMEM;
1653
1654 cpumask_copy(old_affinity, unit->cpu_hard_affinity);
1655
1656 if ( !alloc_cpumask_var(&new_affinity) )
1657 {
1658 free_cpumask_var(old_affinity);
1659 return -ENOMEM;
1660 }
1661
1662 /* Undo a stuck SCHED_pin_override? */
1663 if ( vcpuaff->flags & XEN_VCPUAFFINITY_FORCE )
1664 vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_OVERRIDE);
1665
1666 ret = 0;
1667
1668 /*
1669 * We both set a new affinity and report back to the caller what
1670 * the scheduler will be effectively using.
1671 */
1672 if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
1673 {
1674 ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
1675 &vcpuaff->cpumap_hard, nr_cpu_ids);
1676 if ( !ret )
1677 ret = vcpu_set_hard_affinity(v, new_affinity);
1678 if ( ret )
1679 goto setvcpuaffinity_out;
1680
1681 /*
1682 * For hard affinity, what we return is the intersection of
1683 * cpupool's online mask and the new hard affinity.
1684 */
1685 cpumask_and(new_affinity, online, unit->cpu_hard_affinity);
1686 ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard, new_affinity);
1687 }
1688 if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
1689 {
1690 ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
1691 &vcpuaff->cpumap_soft, nr_cpu_ids);
1692 if ( !ret )
1693 ret = vcpu_set_soft_affinity(v, new_affinity);
1694 if ( ret )
1695 {
1696 /*
1697 * Since we're returning error, the caller expects nothing
1698 * happened, so we roll back the changes to hard affinity
1699 * (if any).
1700 */
1701 if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
1702 vcpu_set_hard_affinity(v, old_affinity);
1703 goto setvcpuaffinity_out;
1704 }
1705
1706 /*
1707 * For soft affinity, we return the intersection between the
1708 * new soft affinity, the cpupool's online map and the (new)
1709 * hard affinity.
1710 */
1711 cpumask_and(new_affinity, new_affinity, online);
1712 cpumask_and(new_affinity, new_affinity, unit->cpu_hard_affinity);
1713 ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft, new_affinity);
1714 }
1715
1716 setvcpuaffinity_out:
1717 free_cpumask_var(new_affinity);
1718 free_cpumask_var(old_affinity);
1719 }
1720 else
1721 {
1722 if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
1723 ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
1724 unit->cpu_hard_affinity);
1725 if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
1726 ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
1727 unit->cpu_soft_affinity);
1728 }
1729
1730 return ret;
1731 }
1732
1733 void domain_update_node_affinity(struct domain *d)
1734 {
1735 cpumask_var_t dom_cpumask, dom_cpumask_soft;
1736 cpumask_t *dom_affinity;
1737 const cpumask_t *online;
1738 struct sched_unit *unit;
1739 unsigned int cpu;
1740
1741 /* Do we have vcpus already? If not, no need to update node-affinity. */
1742 if ( !d->vcpu || !d->vcpu[0] )
1743 return;
1744
1745 if ( !zalloc_cpumask_var(&dom_cpumask) )
1746 return;
1747 if ( !zalloc_cpumask_var(&dom_cpumask_soft) )
1748 {
1749 free_cpumask_var(dom_cpumask);
1750 return;
1751 }
1752
1753 online = cpupool_domain_master_cpumask(d);
1754
1755 spin_lock(&d->node_affinity_lock);
1756
1757 /*
1758 * If d->auto_node_affinity is true, let's compute the domain's
1759 * node-affinity and update d->node_affinity accordingly. If false,
1760 * just leave d->node_affinity alone.
1761 */
1762 if ( d->auto_node_affinity )
1763 {
1764 /*
1765 * We want the narrowest possible set of pcpus (to get the narrowest
1766 * possible set of nodes). What we need is the cpumask of where the
1767 * domain can run (the union of the hard affinity of all its vcpus),
1768 * and the full mask of where it would prefer to run (the union of
1769 * the soft affinity of all its various vcpus). Let's build them.
1770 */
1771 for_each_sched_unit ( d, unit )
1772 {
1773 cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity);
1774 cpumask_or(dom_cpumask_soft, dom_cpumask_soft,
1775 unit->cpu_soft_affinity);
1776 }
1777 /* Filter out non-online cpus */
1778 cpumask_and(dom_cpumask, dom_cpumask, online);
1779 ASSERT(!cpumask_empty(dom_cpumask));
1780 /* And compute the intersection between hard, online and soft */
1781 cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask);
1782
1783 /*
1784 * If not empty, the intersection of hard, soft and online is the
1785 * narrowest set we want. If empty, we fall back to hard&online.
1786 */
1787 dom_affinity = cpumask_empty(dom_cpumask_soft) ?
1788 dom_cpumask : dom_cpumask_soft;
1789
1790 nodes_clear(d->node_affinity);
1791 for_each_cpu ( cpu, dom_affinity )
1792 node_set(cpu_to_node(cpu), d->node_affinity);
1793 }
1794
1795 spin_unlock(&d->node_affinity_lock);
1796
1797 free_cpumask_var(dom_cpumask_soft);
1798 free_cpumask_var(dom_cpumask);
1799 }
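/*
 * Worked example of the automatic node-affinity computation above: if the
 * units' hard affinities (restricted to online cpus) span nodes 0 and 1,
 * but their soft affinities only intersect cpus of node 0, the soft mask
 * wins and d->node_affinity becomes {0}; if soft does not intersect
 * hard&online at all, we fall back to hard&online and get {0,1}.
 */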
1800
1801 typedef long ret_t;
1802
1803 #endif /* !COMPAT */
1804
1805 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
1806 {
1807 ret_t ret = 0;
1808
1809 switch ( cmd )
1810 {
1811 case SCHEDOP_yield:
1812 {
1813 ret = vcpu_yield();
1814 break;
1815 }
1816
1817 case SCHEDOP_block:
1818 {
1819 vcpu_block_enable_events();
1820 break;
1821 }
1822
1823 case SCHEDOP_shutdown:
1824 {
1825 struct sched_shutdown sched_shutdown;
1826
1827 ret = -EFAULT;
1828 if ( copy_from_guest(&sched_shutdown, arg, 1) )
1829 break;
1830
1831 TRACE_3D(TRC_SCHED_SHUTDOWN,
1832 current->domain->domain_id, current->vcpu_id,
1833 sched_shutdown.reason);
1834 ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason);
1835
1836 break;
1837 }
1838
1839 case SCHEDOP_shutdown_code:
1840 {
1841 struct sched_shutdown sched_shutdown;
1842 struct domain *d = current->domain;
1843
1844 ret = -EFAULT;
1845 if ( copy_from_guest(&sched_shutdown, arg, 1) )
1846 break;
1847
1848 TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
1849 d->domain_id, current->vcpu_id, sched_shutdown.reason);
1850
1851 spin_lock(&d->shutdown_lock);
1852 if ( d->shutdown_code == SHUTDOWN_CODE_INVALID )
1853 d->shutdown_code = (u8)sched_shutdown.reason;
1854 spin_unlock(&d->shutdown_lock);
1855
1856 ret = 0;
1857 break;
1858 }
1859
1860 case SCHEDOP_poll:
1861 {
1862 struct sched_poll sched_poll;
1863
1864 ret = -EFAULT;
1865 if ( copy_from_guest(&sched_poll, arg, 1) )
1866 break;
1867
1868 ret = do_poll(&sched_poll);
1869
1870 break;
1871 }
1872
1873 case SCHEDOP_remote_shutdown:
1874 {
1875 struct domain *d;
1876 struct sched_remote_shutdown sched_remote_shutdown;
1877
1878 ret = -EFAULT;
1879 if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
1880 break;
1881
1882 ret = -ESRCH;
1883 d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
1884 if ( d == NULL )
1885 break;
1886
1887 ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d);
1888 if ( likely(!ret) )
1889 domain_shutdown(d, sched_remote_shutdown.reason);
1890
1891 rcu_unlock_domain(d);
1892
1893 break;
1894 }
1895
1896 case SCHEDOP_watchdog:
1897 {
1898 struct sched_watchdog sched_watchdog;
1899
1900 ret = -EFAULT;
1901 if ( copy_from_guest(&sched_watchdog, arg, 1) )
1902 break;
1903
1904 ret = domain_watchdog(
1905 current->domain, sched_watchdog.id, sched_watchdog.timeout);
1906 break;
1907 }
1908
1909 case SCHEDOP_pin_override:
1910 {
1911 struct sched_pin_override sched_pin_override;
1912 unsigned int cpu;
1913
1914 ret = -EPERM;
1915 if ( !is_hardware_domain(current->domain) )
1916 break;
1917
1918 ret = -EFAULT;
1919 if ( copy_from_guest(&sched_pin_override, arg, 1) )
1920 break;
1921
1922 ret = -EINVAL;
1923 if ( sched_pin_override.pcpu >= NR_CPUS )
1924 break;
1925
1926 cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu;
1927 ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE);
1928
1929 break;
1930 }
1931
1932 default:
1933 ret = -ENOSYS;
1934 }
1935
1936 return ret;
1937 }
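/*
 * Illustrative sketch (not part of this file; the guest-side wrapper name is
 * an assumption and depends on the guest OS): a guest typically reaches
 * do_sched_op() through its hypercall wrapper, roughly along the lines of
 *
 *     (void)HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
 *
 *     struct sched_shutdown s = { .reason = SHUTDOWN_poweroff };
 *     (void)HYPERVISOR_sched_op(SCHEDOP_shutdown, &s);
 *
 * The sub-ops and struct sched_shutdown layout come from public/sched.h.
 */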
1938
1939 #ifndef COMPAT
1940
1941 /* Per-vcpu oneshot-timer hypercall. */
1942 long do_set_timer_op(s_time_t timeout)
1943 {
1944 struct vcpu *v = current;
1945 s_time_t offset = timeout - NOW();
1946
1947 if ( timeout == 0 )
1948 {
1949 stop_timer(&v->singleshot_timer);
1950 }
1951 else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
1952 unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
1953 {
1954 /*
1955 * Linux workaround: occasionally we will see timeouts a long way in
1956 * the future due to wrapping in Linux's jiffy time handling. We check
1957 * for timeouts wrapped negative, and for positive timeouts more than
1958 * about 13 days in the future (2^50ns). The correct fix is to trigger
1959 * an interrupt immediately (since Linux in fact has pending work to
1960 * do in this situation). However, older guests also set a long timeout
1961 * when they have *no* pending timers at all: setting an immediate
1962 * timeout in this case can burn a lot of CPU. We therefore go for a
1963          * reasonable middle ground of triggering a timer event in 100ms.
1964 */
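        /*
         * For reference: 2^50 ns is about 1.13e15 ns, i.e. roughly 1.13e6
         * seconds, or a bit over 13 days.
         */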
1965 gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n",
1966 timeout);
1967 set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
1968 }
1969 else
1970 {
1971 migrate_timer(&v->singleshot_timer, smp_processor_id());
1972 set_timer(&v->singleshot_timer, timeout);
1973 }
1974
1975 return 0;
1976 }
1977
1978 /* sched_id - fetch ID of current scheduler */
1979 int sched_id(void)
1980 {
1981 return ops.sched_id;
1982 }
1983
1984 /* Adjust scheduling parameter for a given domain. */
1985 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
1986 {
1987 long ret;
1988
1989 ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd);
1990 if ( ret )
1991 return ret;
1992
1993 if ( op->sched_id != dom_scheduler(d)->sched_id )
1994 return -EINVAL;
1995
1996 switch ( op->cmd )
1997 {
1998 case XEN_DOMCTL_SCHEDOP_putinfo:
1999 case XEN_DOMCTL_SCHEDOP_getinfo:
2000 case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
2001 case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
2002 break;
2003 default:
2004 return -EINVAL;
2005 }
2006
2007 /* NB: the pluggable scheduler code needs to take care
2008 * of locking by itself. */
2009 rcu_read_lock(&sched_res_rculock);
2010
2011 if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 )
2012 TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
2013
2014 rcu_read_unlock(&sched_res_rculock);
2015
2016 return ret;
2017 }
2018
2019 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
2020 {
2021 struct cpupool *pool;
2022 int rc;
2023
2024 rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd);
2025 if ( rc )
2026 return rc;
2027
2028 if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) &&
2029 (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) )
2030 return -EINVAL;
2031
2032 pool = cpupool_get_by_id(op->cpupool_id);
2033 if ( pool == NULL )
2034 return -ESRCH;
2035
2036 rcu_read_lock(&sched_res_rculock);
2037
2038 rc = ((op->sched_id == pool->sched->sched_id)
2039 ? sched_adjust_cpupool(pool->sched, op) : -EINVAL);
2040
2041 rcu_read_unlock(&sched_res_rculock);
2042
2043 cpupool_put(pool);
2044
2045 return rc;
2046 }
2047
2048 static void vcpu_periodic_timer_work_locked(struct vcpu *v)
2049 {
2050 s_time_t now;
2051 s_time_t periodic_next_event;
2052
2053 now = NOW();
2054 periodic_next_event = v->periodic_last_event + v->periodic_period;
2055
2056 if ( now >= periodic_next_event )
2057 {
2058 send_timer_event(v);
2059 v->periodic_last_event = now;
2060 periodic_next_event = now + v->periodic_period;
2061 }
2062
2063 migrate_timer(&v->periodic_timer, v->processor);
2064 set_timer(&v->periodic_timer, periodic_next_event);
2065 }
2066
2067 static void vcpu_periodic_timer_work(struct vcpu *v)
2068 {
2069 if ( v->periodic_period == 0 )
2070 return;
2071
2072 spin_lock(&v->periodic_timer_lock);
2073 if ( v->periodic_period )
2074 vcpu_periodic_timer_work_locked(v);
2075 spin_unlock(&v->periodic_timer_lock);
2076 }
2077
2078 /*
2079 * Set the periodic timer of a vcpu.
2080 */
2081 void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value)
2082 {
2083 spin_lock(&v->periodic_timer_lock);
2084
2085 stop_timer(&v->periodic_timer);
2086
2087 v->periodic_period = value;
2088 if ( value )
2089 vcpu_periodic_timer_work_locked(v);
2090
2091 spin_unlock(&v->periodic_timer_lock);
2092 }
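/*
 * Usage sketch (assuming the public VCPUOP interface is the caller): a guest
 * asking for a 10ms periodic tick via VCPUOP_set_periodic_timer with
 * period_ns = 10000000 ends up here with value = MILLISECS(10); a value of 0
 * (e.g. via VCPUOP_stop_periodic_timer) disables the tick again.
 */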
2093
2094 static void sched_switch_units(struct sched_resource *sr,
2095 struct sched_unit *next, struct sched_unit *prev,
2096 s_time_t now)
2097 {
2098 unsigned int cpu;
2099
2100 ASSERT(unit_running(prev));
2101
2102 if ( prev != next )
2103 {
2104 sr->curr = next;
2105 sr->prev = prev;
2106
2107 TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id,
2108 prev->unit_id, now - prev->state_entry_time);
2109 TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id,
2110 next->unit_id,
2111 (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
2112 (now - next->state_entry_time) : 0, prev->next_time);
2113 TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
2114 next->domain->domain_id, next->unit_id);
2115
2116 ASSERT(!unit_running(next));
2117
2118 /*
2119 * NB. Don't add any trace records from here until the actual context
2120 * switch, else lost_records resume will not work properly.
2121 */
2122
2123 ASSERT(!next->is_running);
2124 next->is_running = true;
2125 next->state_entry_time = now;
2126
2127 if ( is_idle_unit(prev) )
2128 {
2129 prev->runstate_cnt[RUNSTATE_running] = 0;
2130 prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity;
2131 }
2132 if ( is_idle_unit(next) )
2133 {
2134 next->runstate_cnt[RUNSTATE_running] = sr->granularity;
2135 next->runstate_cnt[RUNSTATE_runnable] = 0;
2136 }
2137 }
2138
2139 for_each_cpu ( cpu, sr->cpus )
2140 {
2141 struct vcpu *vprev = get_cpu_current(cpu);
2142 struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu);
2143
2144 if ( vprev != vnext || vprev->runstate.state != vnext->new_state )
2145 {
2146 vcpu_runstate_change(vprev,
2147 ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
2148 (vcpu_runnable(vprev) ? RUNSTATE_runnable : RUNSTATE_offline)),
2149 now);
2150 vcpu_runstate_change(vnext, vnext->new_state, now);
2151 }
2152
2153 vnext->is_running = true;
2154
2155 if ( is_idle_vcpu(vnext) )
2156 vnext->sched_unit = next;
2157 }
2158 }
2159
2160 static bool sched_tasklet_check_cpu(unsigned int cpu)
2161 {
2162 unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu);
2163
2164 switch ( *tasklet_work )
2165 {
2166 case TASKLET_enqueued:
2167 set_bit(_TASKLET_scheduled, tasklet_work);
2168 /* fallthrough */
2169 case TASKLET_enqueued|TASKLET_scheduled:
2170 return true;
2171 break;
2172 case TASKLET_scheduled:
2173 clear_bit(_TASKLET_scheduled, tasklet_work);
2174 /* fallthrough */
2175 case 0:
2176 /* return false; */
2177 break;
2178 default:
2179 BUG();
2180 }
2181
2182 return false;
2183 }
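/*
 * Summary of the transitions handled above (TASKLET_enqueued/_scheduled bits
 * of tasklet_work_to_do):
 *   enqueued           -> enqueued|scheduled : work to run (return true)
 *   enqueued|scheduled -> unchanged          : work to run (return true)
 *   scheduled          -> 0                  : tasklet finished (return false)
 *   0                  -> unchanged          : nothing to do (return false)
 */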
2184
2185 static bool sched_tasklet_check(unsigned int cpu)
2186 {
2187 bool tasklet_work_scheduled = false;
2188 const cpumask_t *mask = get_sched_res(cpu)->cpus;
2189 unsigned int cpu_iter;
2190
2191 for_each_cpu ( cpu_iter, mask )
2192 if ( sched_tasklet_check_cpu(cpu_iter) )
2193 tasklet_work_scheduled = true;
2194
2195 return tasklet_work_scheduled;
2196 }
2197
2198 static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
2199 unsigned int cpu)
2200 {
2201 struct sched_resource *sr = get_sched_res(cpu);
2202 struct scheduler *sched = sr->scheduler;
2203 struct sched_unit *next;
2204
2205 /* get policy-specific decision on scheduling... */
2206 sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu));
2207
2208 next = prev->next_task;
2209
2210 if ( prev->next_time >= 0 ) /* -ve means no limit */
2211 set_timer(&sr->s_timer, now + prev->next_time);
2212
2213 sched_switch_units(sr, next, prev, now);
2214
2215 return next;
2216 }
2217
2218 static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext)
2219 {
2220 /* Clear running flag /after/ writing context to memory. */
2221 smp_wmb();
2222
2223 if ( vprev != vnext )
2224 vprev->is_running = false;
2225 }
2226
2227 static void unit_context_saved(struct sched_resource *sr)
2228 {
2229 struct sched_unit *unit = sr->prev;
2230
2231 if ( !unit )
2232 return;
2233
2234 unit->is_running = false;
2235 unit->state_entry_time = NOW();
2236 sr->prev = NULL;
2237
2238 /* Check for migration request /after/ clearing running flag. */
2239 smp_mb();
2240
2241 sched_context_saved(unit_scheduler(unit), unit);
2242
2243 /* Idle never migrates and idle vcpus might belong to other units. */
2244 if ( !is_idle_unit(unit) )
2245 sched_unit_migrate_finish(unit);
2246 }
2247
2248 /*
2249 * Rendezvous on end of context switch.
2250 * As no lock is protecting this rendezvous function we need to use atomic
2251 * access functions on the counter.
2252 * The counter will be 0 in case no rendezvous is needed. For the rendezvous
2253 * case it is initialised to the number of cpus to rendezvous plus 1. Each
2254 * member entering decrements the counter. The last one will decrement it to
2255 * 1 and perform the final needed action in that case (call of
2256 * unit_context_saved()), and then set the counter to zero. The other members
2257  * will wait until the counter becomes zero before they proceed.
2258 */
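/*
 * Worked example for a 2-cpu unit (granularity 2): do_schedule() sets
 * rendezvous_out_cnt = 2 + 1 = 3. The first cpu reaching
 * sched_context_switched() decrements it to 2 and spins; the second one
 * decrements it to 1, calls unit_context_saved(), and then resets the
 * counter to 0, releasing the spinning cpu.
 */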
2259 void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
2260 {
2261 struct sched_unit *next = vnext->sched_unit;
2262 struct sched_resource *sr;
2263
2264 rcu_read_lock(&sched_res_rculock);
2265
2266 sr = get_sched_res(smp_processor_id());
2267
2268 if ( atomic_read(&next->rendezvous_out_cnt) )
2269 {
2270 int cnt = atomic_dec_return(&next->rendezvous_out_cnt);
2271
2272 vcpu_context_saved(vprev, vnext);
2273
2274 /* Call unit_context_saved() before releasing other waiters. */
2275 if ( cnt == 1 )
2276 {
2277 unit_context_saved(sr);
2278 atomic_set(&next->rendezvous_out_cnt, 0);
2279 }
2280 else
2281 while ( atomic_read(&next->rendezvous_out_cnt) )
2282 cpu_relax();
2283 }
2284 else
2285 {
2286 vcpu_context_saved(vprev, vnext);
2287 if ( sr->granularity == 1 )
2288 unit_context_saved(sr);
2289 }
2290
2291 if ( is_idle_vcpu(vprev) && vprev != vnext )
2292 vprev->sched_unit = sr->sched_unit_idle;
2293
2294 rcu_read_unlock(&sched_res_rculock);
2295 }
2296
2297 /*
2298 * Switch to a new context or keep the current one running.
2299 * On x86 it won't return, so it needs to drop the still held sched_res_rculock.
2300 */
2301 static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
2302 bool reset_idle_unit, s_time_t now)
2303 {
2304 if ( unlikely(vprev == vnext) )
2305 {
2306 TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
2307 vnext->domain->domain_id, vnext->sched_unit->unit_id,
2308 now - vprev->runstate.state_entry_time,
2309 vprev->sched_unit->next_time);
2310 sched_context_switched(vprev, vnext);
2311
2312 /*
2313 * We are switching from a non-idle to an idle unit.
2314 * A vcpu of the idle unit might have been running before due to
2315 * the guest vcpu being blocked. We must adjust the unit of the idle
2316 * vcpu which might have been set to the guest's one.
2317 */
2318 if ( reset_idle_unit )
2319 vnext->sched_unit =
2320 get_sched_res(smp_processor_id())->sched_unit_idle;
2321
2322 rcu_read_unlock(&sched_res_rculock);
2323
2324 trace_continue_running(vnext);
2325 return continue_running(vprev);
2326 }
2327
2328 SCHED_STAT_CRANK(sched_ctx);
2329
2330 stop_timer(&vprev->periodic_timer);
2331
2332 if ( vnext->sched_unit->migrated )
2333 vcpu_move_irqs(vnext);
2334
2335 vcpu_periodic_timer_work(vnext);
2336
2337 rcu_read_unlock(&sched_res_rculock);
2338
2339 context_switch(vprev, vnext);
2340 }
2341
2342 /*
2343  * Force a context switch of a single vcpu of a unit.
2344  * Might be called either if a vcpu of an already running unit is woken up
2345  * or if a vcpu of a running unit is put to sleep with other vcpus of the same
2346 * unit still running.
2347 * Returns either NULL if v is already in the correct state or the vcpu to
2348 * run next.
2349 */
2350 static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
2351 struct vcpu *v,
2352 unsigned int cpu, s_time_t now)
2353 {
2354 v->force_context_switch = false;
2355
2356 if ( vcpu_runnable(v) == v->is_running )
2357 return NULL;
2358
2359 if ( vcpu_runnable(v) )
2360 {
2361 if ( is_idle_vcpu(vprev) )
2362 {
2363 vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
2364 vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
2365 }
2366 vcpu_runstate_change(v, RUNSTATE_running, now);
2367 }
2368 else
2369 {
2370 /* Make sure not to switch last vcpu of an unit away. */
2371 if ( unit_running(v->sched_unit) == 1 )
2372 return NULL;
2373
2374 v->new_state = vcpu_runstate_blocked(v);
2375 vcpu_runstate_change(v, v->new_state, now);
2376 v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu);
2377 if ( v != vprev )
2378 {
2379 if ( is_idle_vcpu(vprev) )
2380 {
2381 vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
2382 vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
2383 }
2384 else
2385 {
2386 v->sched_unit = vprev->sched_unit;
2387 vcpu_runstate_change(v, RUNSTATE_running, now);
2388 }
2389 }
2390 }
2391
2392 /* This vcpu will be switched to. */
2393 v->is_running = true;
2394
2395     /* Make sure not to lose another slave call. */
2396 raise_softirq(SCHED_SLAVE_SOFTIRQ);
2397
2398 return v;
2399 }
2400
2401 /*
2402 * Rendezvous before taking a scheduling decision.
2403 * Called with schedule lock held, so all accesses to the rendezvous counter
2404 * can be normal ones (no atomic accesses needed).
2405  * The counter is initialized to the number of cpus to rendezvous.
2406  * Each cpu entering will decrement the counter. In case the counter becomes
2407  * zero, do_schedule() is called and the rendezvous counter for leaving
2408  * context_switch() is set. All other members will wait until the counter
2409  * becomes zero, dropping the schedule lock in between.
2410 * Either returns the new unit to run, or NULL if no context switch is
2411 * required or (on Arm) has already been performed. If NULL is returned
2412 * sched_res_rculock has been dropped.
2413 */
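/*
 * Worked example (granularity 2): schedule() sets rendezvous_in_cnt = 2 and
 * kicks the sibling cpu via SCHED_SLAVE_SOFTIRQ. Whichever cpu decrements
 * the counter to 0 calls do_schedule() and primes rendezvous_out_cnt for
 * leaving context_switch(); the other cpu spins (dropping and re-taking the
 * schedule lock) until the counter is 0 and then picks up prev->next_task.
 */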
2414 static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
2415 spinlock_t **lock, int cpu,
2416 s_time_t now)
2417 {
2418 struct sched_unit *next;
2419 struct vcpu *v;
2420 struct sched_resource *sr = get_sched_res(cpu);
2421 unsigned int gran = sr->granularity;
2422
2423 if ( !--prev->rendezvous_in_cnt )
2424 {
2425 next = do_schedule(prev, now, cpu);
2426 atomic_set(&next->rendezvous_out_cnt, gran + 1);
2427 return next;
2428 }
2429
2430 v = unit2vcpu_cpu(prev, cpu);
2431 while ( prev->rendezvous_in_cnt )
2432 {
2433 if ( v && v->force_context_switch )
2434 {
2435 struct vcpu *vprev = current;
2436
2437 v = sched_force_context_switch(vprev, v, cpu, now);
2438
2439 if ( v )
2440 {
2441 /* We'll come back another time, so adjust rendezvous_in_cnt. */
2442 prev->rendezvous_in_cnt++;
2443 atomic_set(&prev->rendezvous_out_cnt, 0);
2444
2445 pcpu_schedule_unlock_irq(*lock, cpu);
2446
2447 sched_context_switch(vprev, v, false, now);
2448
2449 return NULL; /* ARM only. */
2450 }
2451
2452 v = unit2vcpu_cpu(prev, cpu);
2453 }
2454 /*
2455 * Check for any work to be done which might need cpu synchronization.
2456 * This is either pending RCU work, or tasklet work when coming from
2457 * idle. It is mandatory that RCU softirqs are of higher priority
2458 * than scheduling ones as otherwise a deadlock might occur.
2459          * In order to avoid deadlocks we can't do that work here, but have to
2460          * schedule the previous vcpu again, which will lead to the desired
2461          * processing being done.
2462 * Undo the rendezvous_in_cnt decrement and schedule another call of
2463 * sched_slave().
2464 */
2465 BUILD_BUG_ON(RCU_SOFTIRQ > SCHED_SLAVE_SOFTIRQ ||
2466 RCU_SOFTIRQ > SCHEDULE_SOFTIRQ);
2467 if ( rcu_pending(cpu) ||
2468 (is_idle_unit(prev) && sched_tasklet_check_cpu(cpu)) )
2469 {
2470 struct vcpu *vprev = current;
2471
2472 prev->rendezvous_in_cnt++;
2473 atomic_set(&prev->rendezvous_out_cnt, 0);
2474
2475 pcpu_schedule_unlock_irq(*lock, cpu);
2476
2477 raise_softirq(SCHED_SLAVE_SOFTIRQ);
2478 sched_context_switch(vprev, vprev, false, now);
2479
2480 return NULL; /* ARM only. */
2481 }
2482
2483 pcpu_schedule_unlock_irq(*lock, cpu);
2484
2485 cpu_relax();
2486
2487 *lock = pcpu_schedule_lock_irq(cpu);
2488
2489 /*
2490          * Check whether the scheduling resource has been switched. This happens
2491          * when we have been moved away from our cpupool and the cpus are now
2492          * handled by the idle scheduler.
2493 *
2494 * This is also a bail out case when scheduler_disable() has been
2495 * called.
2496 */
2497 if ( unlikely(sr != get_sched_res(cpu) || !scheduler_active) )
2498 {
2499 ASSERT(is_idle_unit(prev));
2500 atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
2501 prev->rendezvous_in_cnt = 0;
2502 pcpu_schedule_unlock_irq(*lock, cpu);
2503 rcu_read_unlock(&sched_res_rculock);
2504 return NULL;
2505 }
2506 }
2507
2508 return prev->next_task;
2509 }
2510
2511 static void sched_slave(void)
2512 {
2513 struct vcpu *v, *vprev = current;
2514 struct sched_unit *prev = vprev->sched_unit, *next;
2515 s_time_t now;
2516 spinlock_t *lock;
2517 bool do_softirq = false;
2518 unsigned int cpu = smp_processor_id();
2519
2520 ASSERT_NOT_IN_ATOMIC();
2521
2522 rcu_read_lock(&sched_res_rculock);
2523
2524 lock = pcpu_schedule_lock_irq(cpu);
2525
2526 now = NOW();
2527
2528 v = unit2vcpu_cpu(prev, cpu);
2529 if ( v && v->force_context_switch )
2530 {
2531 v = sched_force_context_switch(vprev, v, cpu, now);
2532
2533 if ( v )
2534 {
2535 pcpu_schedule_unlock_irq(lock, cpu);
2536
2537 sched_context_switch(vprev, v, false, now);
2538
2539 return;
2540 }
2541
2542 do_softirq = true;
2543 }
2544
2545 if ( !prev->rendezvous_in_cnt )
2546 {
2547 pcpu_schedule_unlock_irq(lock, cpu);
2548
2549 rcu_read_unlock(&sched_res_rculock);
2550
2551 /* Check for failed forced context switch. */
2552 if ( do_softirq )
2553 raise_softirq(SCHEDULE_SOFTIRQ);
2554
2555 return;
2556 }
2557
2558 stop_timer(&get_sched_res(cpu)->s_timer);
2559
2560 next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
2561 if ( !next )
2562 return;
2563
2564 pcpu_schedule_unlock_irq(lock, cpu);
2565
2566 sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
2567 is_idle_unit(next) && !is_idle_unit(prev), now);
2568 }
2569
2570 /*
2571 * The main function
2572 * - deschedule the current domain (scheduler independent).
2573 * - pick a new domain (scheduler dependent).
2574 */
2575 static void schedule(void)
2576 {
2577 struct vcpu *vnext, *vprev = current;
2578 struct sched_unit *prev = vprev->sched_unit, *next = NULL;
2579 s_time_t now;
2580 struct sched_resource *sr;
2581 spinlock_t *lock;
2582 int cpu = smp_processor_id();
2583 unsigned int gran;
2584
2585 ASSERT_NOT_IN_ATOMIC();
2586
2587 SCHED_STAT_CRANK(sched_run);
2588
2589 rcu_read_lock(&sched_res_rculock);
2590
2591 lock = pcpu_schedule_lock_irq(cpu);
2592
2593 sr = get_sched_res(cpu);
2594 gran = sr->granularity;
2595
2596 if ( prev->rendezvous_in_cnt )
2597 {
2598 /*
2599 * We have a race: sched_slave() should be called, so raise a softirq
2600 * in order to re-enter schedule() later and call sched_slave() now.
2601 */
2602 pcpu_schedule_unlock_irq(lock, cpu);
2603
2604 rcu_read_unlock(&sched_res_rculock);
2605
2606 raise_softirq(SCHEDULE_SOFTIRQ);
2607 return sched_slave();
2608 }
2609
2610 stop_timer(&sr->s_timer);
2611
2612 now = NOW();
2613
2614 if ( gran > 1 )
2615 {
2616 cpumask_t *mask = cpumask_scratch_cpu(cpu);
2617
2618 prev->rendezvous_in_cnt = gran;
2619 cpumask_andnot(mask, sr->cpus, cpumask_of(cpu));
2620 cpumask_raise_softirq(mask, SCHED_SLAVE_SOFTIRQ);
2621 next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
2622 if ( !next )
2623 return;
2624 }
2625 else
2626 {
2627 prev->rendezvous_in_cnt = 0;
2628 next = do_schedule(prev, now, cpu);
2629 atomic_set(&next->rendezvous_out_cnt, 0);
2630 }
2631
2632 pcpu_schedule_unlock_irq(lock, cpu);
2633
2634 vnext = sched_unit2vcpu_cpu(next, cpu);
2635 sched_context_switch(vprev, vnext,
2636 !is_idle_unit(prev) && is_idle_unit(next), now);
2637 }
2638
2639 /* The scheduler timer: force a run through the scheduler */
2640 static void s_timer_fn(void *unused)
2641 {
2642 raise_softirq(SCHEDULE_SOFTIRQ);
2643 SCHED_STAT_CRANK(sched_irq);
2644 }
2645
2646 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
2647 static void vcpu_periodic_timer_fn(void *data)
2648 {
2649 struct vcpu *v = data;
2650 vcpu_periodic_timer_work(v);
2651 }
2652
2653 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
2654 static void vcpu_singleshot_timer_fn(void *data)
2655 {
2656 struct vcpu *v = data;
2657 send_timer_event(v);
2658 }
2659
2660 /* SCHEDOP_poll timeout callback. */
2661 static void poll_timer_fn(void *data)
2662 {
2663 struct vcpu *v = data;
2664
2665 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
2666 vcpu_unblock(v);
2667 }
2668
2669 static struct sched_resource *sched_alloc_res(void)
2670 {
2671 struct sched_resource *sr;
2672
2673 sr = xzalloc(struct sched_resource);
2674 if ( sr == NULL )
2675 return NULL;
2676 if ( !zalloc_cpumask_var(&sr->cpus) )
2677 {
2678 xfree(sr);
2679 return NULL;
2680 }
2681 return sr;
2682 }
2683
2684 static int cpu_schedule_up(unsigned int cpu)
2685 {
2686 struct sched_resource *sr;
2687
2688 sr = sched_alloc_res();
2689 if ( sr == NULL )
2690 return -ENOMEM;
2691
2692 sr->master_cpu = cpu;
2693 cpumask_copy(sr->cpus, cpumask_of(cpu));
2694 set_sched_res(cpu, sr);
2695
2696 sr->scheduler = &sched_idle_ops;
2697 spin_lock_init(&sr->_lock);
2698 sr->schedule_lock = &sched_free_cpu_lock;
2699 init_timer(&sr->s_timer, s_timer_fn, NULL, cpu);
2700 atomic_set(&per_cpu(sched_urgent_count, cpu), 0);
2701
2702 /* We start with cpu granularity. */
2703 sr->granularity = 1;
2704
2705 cpumask_set_cpu(cpu, &sched_res_mask);
2706
2707 /* Boot CPU is dealt with later in scheduler_init(). */
2708 if ( cpu == 0 )
2709 return 0;
2710
2711 if ( idle_vcpu[cpu] == NULL )
2712 vcpu_create(idle_vcpu[0]->domain, cpu);
2713 else
2714 idle_vcpu[cpu]->sched_unit->res = sr;
2715
2716 if ( idle_vcpu[cpu] == NULL )
2717 return -ENOMEM;
2718
2719 idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0;
2720
2721 /*
2722 * No need to allocate any scheduler data, as cpus coming online are
2723 * free initially and the idle scheduler doesn't need any data areas
2724 * allocated.
2725 */
2726
2727 sr->curr = idle_vcpu[cpu]->sched_unit;
2728 sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit;
2729
2730 sr->sched_priv = NULL;
2731
2732 return 0;
2733 }
2734
2735 static void sched_res_free(struct rcu_head *head)
2736 {
2737 struct sched_resource *sr = container_of(head, struct sched_resource, rcu);
2738
2739 free_cpumask_var(sr->cpus);
2740 if ( sr->sched_unit_idle )
2741 sched_free_unit_mem(sr->sched_unit_idle);
2742 xfree(sr);
2743 }
2744
2745 static void cpu_schedule_down(unsigned int cpu)
2746 {
2747 struct sched_resource *sr;
2748
2749 rcu_read_lock(&sched_res_rculock);
2750
2751 sr = get_sched_res(cpu);
2752
2753 kill_timer(&sr->s_timer);
2754
2755 cpumask_clear_cpu(cpu, &sched_res_mask);
2756 set_sched_res(cpu, NULL);
2757
2758 /* Keep idle unit. */
2759 sr->sched_unit_idle = NULL;
2760 call_rcu(&sr->rcu, sched_res_free);
2761
2762 rcu_read_unlock(&sched_res_rculock);
2763 }
2764
2765 void sched_rm_cpu(unsigned int cpu)
2766 {
2767 int rc;
2768
2769 rcu_read_lock(&domlist_read_lock);
2770 rc = cpu_disable_scheduler(cpu);
2771 BUG_ON(rc);
2772 rcu_read_unlock(&domlist_read_lock);
2773 cpu_schedule_down(cpu);
2774 }
2775
2776 static int cpu_schedule_callback(
2777 struct notifier_block *nfb, unsigned long action, void *hcpu)
2778 {
2779 unsigned int cpu = (unsigned long)hcpu;
2780 int rc = 0;
2781
2782 /*
2783 * All scheduler related suspend/resume handling needed is done in
2784 * cpupool.c.
2785 */
2786 if ( system_state > SYS_STATE_active )
2787 return NOTIFY_DONE;
2788
2789 rcu_read_lock(&sched_res_rculock);
2790
2791 /*
2792 * From the scheduler perspective, bringing up a pCPU requires
2793 * allocating and initializing the per-pCPU scheduler specific data,
2794 * as well as "registering" this pCPU to the scheduler (which may
2795 * involve modifying some scheduler wide data structures).
2796 * As new pCPUs always start as "free" cpus with the minimal idle
2797 * scheduler being in charge, we don't need any of that.
2798 *
2799 * On the other hand, at teardown, we need to reverse what has been done
2800 * during initialization, and then free the per-pCPU specific data. A
2801 * pCPU brought down is not forced through "free" cpus, so here we need to
2802 * use the appropriate hooks.
2803 *
2804 * This happens by calling the deinit_pdata and free_pdata hooks, in this
2805 * order. If no per-pCPU memory was allocated, there is no need to
2806 * provide an implementation of free_pdata. deinit_pdata may, however,
2807 * be necessary/useful in this case too (e.g., it can undo something done
2808 * on scheduler wide data structure during switch_sched). Both deinit_pdata
2809 * and free_pdata are called during CPU_DEAD.
2810 *
2811 * If something goes wrong during bringup, we go to CPU_UP_CANCELLED.
2812 */
2813 switch ( action )
2814 {
2815 case CPU_UP_PREPARE:
2816 rc = cpu_schedule_up(cpu);
2817 break;
2818 case CPU_DOWN_PREPARE:
2819 rcu_read_lock(&domlist_read_lock);
2820 rc = cpu_disable_scheduler_check(cpu);
2821 rcu_read_unlock(&domlist_read_lock);
2822 break;
2823 case CPU_DEAD:
2824 sched_rm_cpu(cpu);
2825 break;
2826 case CPU_UP_CANCELED:
2827 cpu_schedule_down(cpu);
2828 break;
2829 default:
2830 break;
2831 }
2832
2833 rcu_read_unlock(&sched_res_rculock);
2834
2835 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
2836 }
2837
2838 static struct notifier_block cpu_schedule_nfb = {
2839 .notifier_call = cpu_schedule_callback
2840 };
2841
2842 const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu)
2843 {
2844 const cpumask_t *mask;
2845
2846 switch ( opt )
2847 {
2848 case SCHED_GRAN_cpu:
2849 mask = cpumask_of(cpu);
2850 break;
2851 case SCHED_GRAN_core:
2852 mask = per_cpu(cpu_sibling_mask, cpu);
2853 break;
2854 case SCHED_GRAN_socket:
2855 mask = per_cpu(cpu_core_mask, cpu);
2856 break;
2857 default:
2858 ASSERT_UNREACHABLE();
2859 return NULL;
2860 }
2861
2862 return mask;
2863 }
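/*
 * Example (assuming a conventional topology with 2 threads per core): for
 * cpu 3, SCHED_GRAN_cpu yields {3}, SCHED_GRAN_core yields the sibling mask
 * {2,3}, and SCHED_GRAN_socket yields all cpus sharing cpu 3's socket.
 */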
2864
2865 static void schedule_dummy(void)
2866 {
2867 sched_tasklet_check_cpu(smp_processor_id());
2868 }
2869
2870 void scheduler_disable(void)
2871 {
2872 scheduler_active = false;
2873 open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy);
2874 open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy);
2875 }
2876
2877 void scheduler_enable(void)
2878 {
2879 open_softirq(SCHEDULE_SOFTIRQ, schedule);
2880 open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave);
2881 scheduler_active = true;
2882 }
2883
2884 /* Initialise the data structures. */
2885 void __init scheduler_init(void)
2886 {
2887 struct domain *idle_domain;
2888 int i;
2889
2890 scheduler_enable();
2891
2892 for ( i = 0; i < NUM_SCHEDULERS; i++)
2893 {
2894 #define sched_test_func(f) \
2895 if ( !schedulers[i]->f ) \
2896 { \
2897 printk("scheduler %s misses .%s, dropped\n", \
2898 schedulers[i]->opt_name, #f); \
2899 schedulers[i] = NULL; \
2900 }
2901
2902 sched_test_func(init);
2903 sched_test_func(deinit);
2904 sched_test_func(pick_resource);
2905 sched_test_func(alloc_udata);
2906 sched_test_func(free_udata);
2907 sched_test_func(switch_sched);
2908 sched_test_func(do_schedule);
2909
2910 #undef sched_test_func
2911
2912 if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 )
2913 {
2914 printk("scheduler %s failed initialization, dropped\n",
2915 schedulers[i]->opt_name);
2916 schedulers[i] = NULL;
2917 }
2918
2919 if ( schedulers[i] && !ops.name &&
2920 !strcmp(schedulers[i]->opt_name, opt_sched) )
2921 ops = *schedulers[i];
2922 }
2923
2924 if ( !ops.name )
2925 {
2926 printk("Could not find scheduler: %s\n", opt_sched);
2927 for ( i = 0; i < NUM_SCHEDULERS; i++ )
2928 if ( schedulers[i] &&
2929 !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) )
2930 {
2931 ops = *schedulers[i];
2932 break;
2933 }
2934 BUG_ON(!ops.name);
2935 printk("Using '%s' (%s)\n", ops.name, ops.opt_name);
2936 }
2937
2938 if ( cpu_schedule_up(0) )
2939 BUG();
2940 register_cpu_notifier(&cpu_schedule_nfb);
2941
2942 printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
2943 if ( sched_init(&ops) )
2944 panic("scheduler returned error on init\n");
2945
2946 if ( sched_ratelimit_us &&
2947 (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
2948 || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
2949 {
2950 printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
2951 " Resetting to default %u\n",
2952 XEN_SYSCTL_SCHED_RATELIMIT_MIN,
2953 XEN_SYSCTL_SCHED_RATELIMIT_MAX,
2954 SCHED_DEFAULT_RATELIMIT_US);
2955 sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
2956 }
2957
2958 idle_domain = domain_create(DOMID_IDLE, NULL, false);
2959 BUG_ON(IS_ERR(idle_domain));
2960 BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu));
2961 idle_domain->vcpu = idle_vcpu;
2962 idle_domain->max_vcpus = nr_cpu_ids;
2963 if ( vcpu_create(idle_domain, 0) == NULL )
2964 BUG();
2965
2966 rcu_read_lock(&sched_res_rculock);
2967
2968 get_sched_res(0)->curr = idle_vcpu[0]->sched_unit;
2969 get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit;
2970
2971 rcu_read_unlock(&sched_res_rculock);
2972 }
2973
2974 /*
2975 * Move a pCPU from free cpus (running the idle scheduler) to a cpupool
2976 * using any "real" scheduler.
2977 * The cpu is still marked as "free" and not yet valid for its cpupool.
2978 */
2979 int schedule_cpu_add(unsigned int cpu, struct cpupool *c)
2980 {
2981 struct vcpu *idle;
2982 void *ppriv, *vpriv;
2983 struct scheduler *new_ops = c->sched;
2984 struct sched_resource *sr;
2985 spinlock_t *old_lock, *new_lock;
2986 unsigned long flags;
2987 int ret = 0;
2988
2989 rcu_read_lock(&sched_res_rculock);
2990
2991 sr = get_sched_res(cpu);
2992
2993 ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
2994 ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid));
2995 ASSERT(get_sched_res(cpu)->cpupool == NULL);
2996
2997 /*
2998 * To setup the cpu for the new scheduler we need:
2999 * - a valid instance of per-CPU scheduler specific data, as it is
3000 * allocated by sched_alloc_pdata(). Note that we do not want to
3001 * initialize it yet, as that will be done by the target scheduler,
3002 * in sched_switch_sched(), in proper ordering and with locking.
3003 * - a valid instance of per-vCPU scheduler specific data, for the idle
3004 * vCPU of cpu. That is what the target scheduler will use for the
3005 * sched_priv field of the per-vCPU info of the idle domain.
3006 */
3007 idle = idle_vcpu[cpu];
3008 ppriv = sched_alloc_pdata(new_ops, cpu);
3009 if ( IS_ERR(ppriv) )
3010 {
3011 ret = PTR_ERR(ppriv);
3012 goto out;
3013 }
3014
3015 vpriv = sched_alloc_udata(new_ops, idle->sched_unit,
3016 idle->domain->sched_priv);
3017 if ( vpriv == NULL )
3018 {
3019 sched_free_pdata(new_ops, ppriv, cpu);
3020 ret = -ENOMEM;
3021 goto out;
3022 }
3023
3024 /*
3025 * The actual switch, including the rerouting of the scheduler lock to
3026 * whatever new_ops prefers, needs to happen in one critical section,
3027 * protected by old_ops' lock, or races are possible.
3028 * It is, in fact, the lock of the idle scheduler that we are taking.
3029  * But that is ok as anyone trying to schedule on this cpu will spin until
3030  * we release that lock (bottom of this function). When it finally gets the
3031  * lock (thanks to the loop inside the *_schedule_lock() functions) it will
3032  * notice that the lock itself changed, and retry acquiring the new one
3033  * (which will be the correct, remapped one, at that point).
3034 */
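    /*
     * Sketch of that retry loop (simplified view of the pcpu_schedule_lock()
     * helpers; details may differ):
     *
     *     for ( ; ; )
     *     {
     *         spinlock_t *l = sr->schedule_lock;
     *
     *         spin_lock(l);
     *         if ( likely(l == sr->schedule_lock) )
     *             break;             // still the right lock, keep it
     *         spin_unlock(l);        // lock was remapped, try again
     *     }
     */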
3035 old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
3036
3037 if ( cpupool_get_granularity(c) > 1 )
3038 {
3039 const cpumask_t *mask;
3040 unsigned int cpu_iter, idx = 0;
3041 struct sched_unit *old_unit, *master_unit;
3042 struct sched_resource *sr_old;
3043
3044 /*
3045 * We need to merge multiple idle_vcpu units and sched_resource structs
3046 * into one. As the free cpus all share the same lock we are fine doing
3047      * that now. The worst that could happen would be someone waiting for
3048 * the lock, thus dereferencing sched_res->schedule_lock. This is the
3049 * reason we are freeing struct sched_res via call_rcu() to avoid the
3050 * lock pointer suddenly disappearing.
3051 */
3052 mask = sched_get_opt_cpumask(c->gran, cpu);
3053 master_unit = idle_vcpu[cpu]->sched_unit;
3054
3055 for_each_cpu ( cpu_iter, mask )
3056 {
3057 if ( idx )
3058 cpumask_clear_cpu(cpu_iter, &sched_res_mask);
3059
3060 per_cpu(sched_res_idx, cpu_iter) = idx++;
3061
3062 if ( cpu == cpu_iter )
3063 continue;
3064
3065 old_unit = idle_vcpu[cpu_iter]->sched_unit;
3066 sr_old = get_sched_res(cpu_iter);
3067 kill_timer(&sr_old->s_timer);
3068 idle_vcpu[cpu_iter]->sched_unit = master_unit;
3069 master_unit->runstate_cnt[RUNSTATE_running]++;
3070 set_sched_res(cpu_iter, sr);
3071 cpumask_set_cpu(cpu_iter, sr->cpus);
3072
3073 call_rcu(&sr_old->rcu, sched_res_free);
3074 }
3075 }
3076
3077 new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);
3078
3079 sr->scheduler = new_ops;
3080 sr->sched_priv = ppriv;
3081
3082 /*
3083 * Reroute the lock to the per pCPU lock as /last/ thing. In fact,
3084 * if it is free (and it can be) we want that anyone that manages
3085 * taking it, finds all the initializations we've done above in place.
3086 */
3087 smp_wmb();
3088 sr->schedule_lock = new_lock;
3089
3090 /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */
3091 spin_unlock_irqrestore(old_lock, flags);
3092
3093 sr->granularity = cpupool_get_granularity(c);
3094 sr->cpupool = c;
3095     /* The cpu is added to a pool; trigger it to go pick up some work. */
3096 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
3097
3098 out:
3099 rcu_read_unlock(&sched_res_rculock);
3100
3101 return ret;
3102 }
3103
3104 /*
3105 * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops
3106 * (the idle scheduler).
3107 * The cpu is already marked as "free" and not valid any longer for its
3108 * cpupool.
3109 */
3110 int schedule_cpu_rm(unsigned int cpu)
3111 {
3112 void *ppriv_old, *vpriv_old;
3113 struct sched_resource *sr, **sr_new = NULL;
3114 struct sched_unit *unit;
3115 struct scheduler *old_ops;
3116 spinlock_t *old_lock;
3117 unsigned long flags;
3118 int idx, ret = -ENOMEM;
3119 unsigned int cpu_iter;
3120
3121 rcu_read_lock(&sched_res_rculock);
3122
3123 sr = get_sched_res(cpu);
3124 old_ops = sr->scheduler;
3125
3126 if ( sr->granularity > 1 )
3127 {
3128 sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1);
3129 if ( !sr_new )
3130 goto out;
3131 for ( idx = 0; idx < sr->granularity - 1; idx++ )
3132 {
3133 sr_new[idx] = sched_alloc_res();
3134 if ( sr_new[idx] )
3135 {
3136 sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem();
3137 if ( !sr_new[idx]->sched_unit_idle )
3138 {
3139 sched_res_free(&sr_new[idx]->rcu);
3140 sr_new[idx] = NULL;
3141 }
3142 }
3143 if ( !sr_new[idx] )
3144 {
3145 for ( idx--; idx >= 0; idx-- )
3146 sched_res_free(&sr_new[idx]->rcu);
3147 goto out;
3148 }
3149 sr_new[idx]->curr = sr_new[idx]->sched_unit_idle;
3150 sr_new[idx]->scheduler = &sched_idle_ops;
3151 sr_new[idx]->granularity = 1;
3152
3153 /* We want the lock not to change when replacing the resource. */
3154 sr_new[idx]->schedule_lock = sr->schedule_lock;
3155 }
3156 }
3157
3158 ret = 0;
3159 ASSERT(sr->cpupool != NULL);
3160 ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
3161 ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid));
3162
3163 /* See comment in schedule_cpu_add() regarding lock switching. */
3164 old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
3165
3166 vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
3167 ppriv_old = sr->sched_priv;
3168
3169 idx = 0;
3170 for_each_cpu ( cpu_iter, sr->cpus )
3171 {
3172 per_cpu(sched_res_idx, cpu_iter) = 0;
3173 if ( cpu_iter == cpu )
3174 {
3175 unit = idle_vcpu[cpu_iter]->sched_unit;
3176 unit->priv = NULL;
3177 atomic_set(&unit->next_task->rendezvous_out_cnt, 0);
3178 unit->rendezvous_in_cnt = 0;
3179 }
3180 else
3181 {
3182 /* Initialize unit. */
3183 unit = sr_new[idx]->sched_unit_idle;
3184 unit->res = sr_new[idx];
3185 unit->is_running = true;
3186 sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
3187 sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);
3188
3189 /* Adjust cpu masks of resources (old and new). */
3190 cpumask_clear_cpu(cpu_iter, sr->cpus);
3191 cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus);
3192
3193 /* Init timer. */
3194 init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
3195
3196 /* Last resource initializations and insert resource pointer. */
3197 sr_new[idx]->master_cpu = cpu_iter;
3198 set_sched_res(cpu_iter, sr_new[idx]);
3199
3200 /* Last action: set the new lock pointer. */
3201 smp_mb();
3202 sr_new[idx]->schedule_lock = &sched_free_cpu_lock;
3203
3204 idx++;
3205 }
3206 }
3207 sr->scheduler = &sched_idle_ops;
3208 sr->sched_priv = NULL;
3209 sr->granularity = 1;
3210 sr->cpupool = NULL;
3211
3212 smp_mb();
3213 sr->schedule_lock = &sched_free_cpu_lock;
3214
3215 /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
3216 spin_unlock_irqrestore(old_lock, flags);
3217
3218 sched_deinit_pdata(old_ops, ppriv_old, cpu);
3219
3220 sched_free_udata(old_ops, vpriv_old);
3221 sched_free_pdata(old_ops, ppriv_old, cpu);
3222
3223 out:
3224 rcu_read_unlock(&sched_res_rculock);
3225 xfree(sr_new);
3226
3227 return ret;
3228 }
3229
3230 struct scheduler *scheduler_get_default(void)
3231 {
3232 return &ops;
3233 }
3234
3235 struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
3236 {
3237 int i;
3238 struct scheduler *sched;
3239
3240 for ( i = 0; i < NUM_SCHEDULERS; i++ )
3241 if ( schedulers[i] && schedulers[i]->sched_id == sched_id )
3242 goto found;
3243 *perr = -ENOENT;
3244 return NULL;
3245
3246 found:
3247 *perr = -ENOMEM;
3248 if ( (sched = xmalloc(struct scheduler)) == NULL )
3249 return NULL;
3250 memcpy(sched, schedulers[i], sizeof(*sched));
3251 if ( (*perr = sched_init(sched)) != 0 )
3252 {
3253 xfree(sched);
3254 sched = NULL;
3255 }
3256
3257 return sched;
3258 }
3259
3260 void scheduler_free(struct scheduler *sched)
3261 {
3262 BUG_ON(sched == &ops);
3263 sched_deinit(sched);
3264 xfree(sched);
3265 }
3266
3267 void schedule_dump(struct cpupool *c)
3268 {
3269 unsigned int i, j;
3270 struct scheduler *sched;
3271 cpumask_t *cpus;
3272
3273     /* Locking, if necessary, must be handled within each scheduler */
3274
3275 rcu_read_lock(&sched_res_rculock);
3276
3277 if ( c != NULL )
3278 {
3279 sched = c->sched;
3280 cpus = c->res_valid;
3281 printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
3282 sched_dump_settings(sched);
3283 }
3284 else
3285 {
3286 sched = &ops;
3287 cpus = &cpupool_free_cpus;
3288 }
3289
3290 printk("CPUs info:\n");
3291 for_each_cpu (i, cpus)
3292 {
3293 struct sched_resource *sr = get_sched_res(i);
3294 unsigned long flags;
3295 spinlock_t *lock;
3296
3297 lock = pcpu_schedule_lock_irqsave(i, &flags);
3298
3299 printk("CPU[%02d] current=%pv, curr=%pv, prev=%pv\n", i,
3300 get_cpu_current(i), sr->curr ? sr->curr->vcpu_list : NULL,
3301 sr->prev ? sr->prev->vcpu_list : NULL);
3302 for_each_cpu (j, sr->cpus)
3303 if ( i != j )
3304 printk("CPU[%02d] current=%pv\n", j, get_cpu_current(j));
3305
3306 pcpu_schedule_unlock_irqrestore(lock, flags, i);
3307
3308 sched_dump_cpu_state(sched, i);
3309 }
3310
3311 rcu_read_unlock(&sched_res_rculock);
3312 }
3313
3314 void wait(void)
3315 {
3316 schedule();
3317 }
3318
3319 #ifdef CONFIG_X86
3320 void __init sched_setup_dom0_vcpus(struct domain *d)
3321 {
3322 unsigned int i;
3323 struct sched_unit *unit;
3324
3325 for ( i = 1; i < d->max_vcpus; i++ )
3326 vcpu_create(d, i);
3327
3328 /*
3329 * PV-shim: vcpus are pinned 1:1.
3330 * Initially only 1 cpu is online, others will be dealt with when
3331 * onlining them. This avoids pinning a vcpu to a not yet online cpu here.
3332 */
3333 if ( pv_shim )
3334 sched_set_affinity(d->vcpu[0]->sched_unit,
3335 cpumask_of(0), cpumask_of(0));
3336 else
3337 {
3338 for_each_sched_unit ( d, unit )
3339 {
3340 if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed )
3341 sched_set_affinity(unit, &dom0_cpus, NULL);
3342 sched_set_affinity(unit, NULL, &dom0_cpus);
3343 }
3344 }
3345
3346 domain_update_node_affinity(d);
3347 }
3348 #endif
3349
3350 #ifdef CONFIG_COMPAT
3351 #include "compat.c"
3352 #endif
3353
3354 #endif /* !COMPAT */
3355
3356 /*
3357 * Local variables:
3358 * mode: C
3359 * c-file-style: "BSD"
3360 * c-basic-offset: 4
3361 * tab-width: 4
3362 * indent-tabs-mode: nil
3363 * End:
3364 */
3365