1 /****************************************************************************
2  * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3  * (C) 2002-2003 University of Cambridge
4  * (C) 2004      - Mark Williamson - Intel Research Cambridge
5  ****************************************************************************
6  *
7  *        File: common/schedule.c
8  *      Author: Rolf Neugebauer & Keir Fraser
9  *              Updated for generic API by Mark Williamson
10  *
11  * Description: Generic CPU scheduling code
12  *              implements support functionality for the Xen scheduler API.
13  *
14  */
15 
16 #ifndef COMPAT
17 #include <xen/init.h>
18 #include <xen/lib.h>
19 #include <xen/param.h>
20 #include <xen/sched.h>
21 #include <xen/domain.h>
22 #include <xen/delay.h>
23 #include <xen/event.h>
24 #include <xen/time.h>
25 #include <xen/timer.h>
26 #include <xen/perfc.h>
27 #include <xen/softirq.h>
28 #include <xen/trace.h>
29 #include <xen/mm.h>
30 #include <xen/err.h>
31 #include <xen/guest_access.h>
32 #include <xen/hypercall.h>
33 #include <xen/multicall.h>
34 #include <xen/cpu.h>
35 #include <xen/preempt.h>
36 #include <xen/event.h>
37 #include <public/sched.h>
38 #include <xsm/xsm.h>
39 #include <xen/err.h>
40 
41 #include "private.h"
42 
43 #ifdef CONFIG_XEN_GUEST
44 #include <asm/guest.h>
45 #else
46 #define pv_shim false
47 #endif
48 
49 /* opt_sched: scheduler - default to configured value */
50 static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
51 string_param("sched", opt_sched);
52 
53 /* If sched_smt_power_savings is set, the scheduler will give preference
54  * to a partially idle package over a fully idle package when picking a
55  * pCPU to schedule a vCPU on.
56  */
57 bool sched_smt_power_savings;
58 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
59 
60 /* Default scheduling rate limit: 1ms
61  * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined
62  */
63 int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
64 integer_param("sched_ratelimit_us", sched_ratelimit_us);
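/*
 * For illustration: the options above are taken from the hypervisor command
 * line, e.g. "sched=credit2 sched_smt_power_savings=1 sched_ratelimit_us=500",
 * where "sched=" must match the opt_name of a scheduler built into Xen.
 */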
65 
66 /* Number of vcpus per struct sched_unit. */
67 bool __read_mostly sched_disable_smt_switching;
68 cpumask_t sched_res_mask;
69 
70 /* Common lock for free cpus. */
71 static DEFINE_SPINLOCK(sched_free_cpu_lock);
72 
73 /* Various timer handlers. */
74 static void s_timer_fn(void *unused);
75 static void vcpu_periodic_timer_fn(void *data);
76 static void vcpu_singleshot_timer_fn(void *data);
77 static void poll_timer_fn(void *data);
78 
79 /* This is global for now so that private implementations can reach it */
80 DEFINE_PER_CPU_READ_MOSTLY(struct sched_resource *, sched_res);
81 static DEFINE_PER_CPU_READ_MOSTLY(unsigned int, sched_res_idx);
82 DEFINE_RCU_READ_LOCK(sched_res_rculock);
83 
84 /* Scratch space for cpumasks. */
85 DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
86 
87 /* How many urgent vcpus. */
88 DEFINE_PER_CPU(atomic_t, sched_urgent_count);
89 
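/*
 * All compiled-in schedulers end up in a dedicated linker section (each one
 * is expected to register itself there, e.g. via a REGISTER_SCHEDULER()-style
 * macro), bounded by the two symbols below; hence NUM_SCHEDULERS is only
 * known at link time.
 */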
90 extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[];
91 #define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
92 #define schedulers __start_schedulers_array
93 
94 static struct scheduler __read_mostly ops;
95 
96 static bool scheduler_active;
97 
98 static void sched_set_affinity(
99     struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft);
100 
101 static struct sched_resource *
102 sched_idle_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
103 {
104     return unit->res;
105 }
106 
107 static void *
108 sched_idle_alloc_udata(const struct scheduler *ops, struct sched_unit *unit,
109                        void *dd)
110 {
111     /* Any non-NULL pointer is fine here. */
112     return ZERO_BLOCK_PTR;
113 }
114 
115 static void
116 sched_idle_free_udata(const struct scheduler *ops, void *priv)
117 {
118 }
119 
120 static void sched_idle_schedule(
121     const struct scheduler *ops, struct sched_unit *unit, s_time_t now,
122     bool tasklet_work_scheduled)
123 {
124     const unsigned int cpu = smp_processor_id();
125 
126     unit->next_time = -1;
127     unit->next_task = sched_idle_unit(cpu);
128 }
129 
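/*
 * A minimal scheduler used (among other things) for pCPUs not assigned to
 * any cpupool: it never picks anything but the pCPU's own idle unit and
 * never sets a timeout (next_time == -1).
 */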
130 static struct scheduler sched_idle_ops = {
131     .name           = "Idle Scheduler",
132     .opt_name       = "idle",
133     .sched_data     = NULL,
134 
135     .pick_resource  = sched_idle_res_pick,
136     .do_schedule    = sched_idle_schedule,
137 
138     .alloc_udata    = sched_idle_alloc_udata,
139     .free_udata     = sched_idle_free_udata,
140 };
141 
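/*
 * With a scheduling granularity larger than one (core/socket scheduling) a
 * sched_unit spans several vcpus.  sched_res_idx is the index of a cpu within
 * its scheduling resource, so unit_id + sched_res_idx selects the vcpu of the
 * unit meant to run on this particular cpu (if the domain has that many vcpus
 * at all).
 */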
142 static inline struct vcpu *unit2vcpu_cpu(const struct sched_unit *unit,
143                                          unsigned int cpu)
144 {
145     unsigned int idx = unit->unit_id + per_cpu(sched_res_idx, cpu);
146     const struct domain *d = unit->domain;
147 
148     return (idx < d->max_vcpus) ? d->vcpu[idx] : NULL;
149 }
150 
151 static inline struct vcpu *sched_unit2vcpu_cpu(const struct sched_unit *unit,
152                                                unsigned int cpu)
153 {
154     struct vcpu *v = unit2vcpu_cpu(unit, cpu);
155 
156     return (v && v->new_state == RUNSTATE_running) ? v : idle_vcpu[cpu];
157 }
158 
159 static inline struct scheduler *dom_scheduler(const struct domain *d)
160 {
161     if ( likely(d->cpupool != NULL) )
162         return d->cpupool->sched;
163 
164     /*
165      * If d->cpupool is NULL, this is the idle domain. This is special
166      * because the idle domain does not really belong to any cpupool, and,
167      * hence, does not really have a scheduler.
168      *
169      * This is (should be!) only called like this for allocating the idle
170      * vCPUs for the first time, during boot, in which case what we want
171      * is the default scheduler that has been chosen at boot.
172      */
173     ASSERT(is_idle_domain(d));
174     return &ops;
175 }
176 
177 static inline struct scheduler *unit_scheduler(const struct sched_unit *unit)
178 {
179     const struct domain *d = unit->domain;
180 
181     if ( likely(d->cpupool != NULL) )
182         return d->cpupool->sched;
183 
184     /*
185      * If d->cpupool is NULL, this is a unit of the idle domain. And this
186      * case is special because the idle domain does not really belong to
187      * a cpupool and, hence, doesn't really have a scheduler. In fact, its
188      * units (may) run on pCPUs which are in different pools, with different
189      * schedulers.
190      *
191      * What we want, in this case, is the scheduler of the pCPU where this
192      * particular idle unit is running. And, since unit->res never changes
193      * for idle units, it is safe to use it, with no locks, to figure that out.
194      */
195 
196     ASSERT(is_idle_domain(d));
197     return unit->res->scheduler;
198 }
199 
200 static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
201 {
202     return unit_scheduler(v->sched_unit);
203 }
204 #define VCPU2ONLINE(_v) cpupool_domain_master_cpumask((_v)->domain)
205 
206 static inline void trace_runstate_change(const struct vcpu *v, int new_state)
207 {
208     struct { uint32_t vcpu:16, domain:16; } d;
209     uint32_t event;
210 
211     if ( likely(!tb_init_done) )
212         return;
213 
214     d.vcpu = v->vcpu_id;
215     d.domain = v->domain->domain_id;
216 
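    /* Encode the old runstate in bits 8-9 and the new one in bits 4-5. */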
217     event = TRC_SCHED_RUNSTATE_CHANGE;
218     event |= ( v->runstate.state & 0x3 ) << 8;
219     event |= ( new_state & 0x3 ) << 4;
220 
221     __trace_var(event, 1/*tsc*/, sizeof(d), &d);
222 }
223 
224 static inline void trace_continue_running(const struct vcpu *v)
225 {
226     struct { uint32_t vcpu:16, domain:16; } d;
227 
228     if ( likely(!tb_init_done) )
229         return;
230 
231     d.vcpu = v->vcpu_id;
232     d.domain = v->domain->domain_id;
233 
234     __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
235 }
236 
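/*
 * Keep the per-cpu count of "urgent" vcpus up to date: a vcpu is urgent while
 * it is blocked in a polling wait (VPF_blocked set and its bit set in the
 * domain's poll_mask).  The count lets idle/power management code avoid deep
 * sleep states on cpus that may need to react quickly (see also
 * sched_guest_idle() below).
 */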
237 static inline void vcpu_urgent_count_update(struct vcpu *v)
238 {
239     if ( is_idle_vcpu(v) )
240         return;
241 
242     if ( unlikely(v->is_urgent) )
243     {
244         if ( !(v->pause_flags & VPF_blocked) ||
245              !test_bit(v->vcpu_id, v->domain->poll_mask) )
246         {
247             v->is_urgent = 0;
248             atomic_dec(&per_cpu(sched_urgent_count, v->processor));
249         }
250     }
251     else
252     {
253         if ( unlikely(v->pause_flags & VPF_blocked) &&
254              unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
255         {
256             v->is_urgent = 1;
257             atomic_inc(&per_cpu(sched_urgent_count, v->processor));
258         }
259     }
260 }
261 
262 static inline void vcpu_runstate_change(
263     struct vcpu *v, int new_state, s_time_t new_entry_time)
264 {
265     s_time_t delta;
266     struct sched_unit *unit = v->sched_unit;
267 
268     ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
269     if ( v->runstate.state == new_state )
270         return;
271 
272     vcpu_urgent_count_update(v);
273 
274     trace_runstate_change(v, new_state);
275 
276     if ( !is_idle_vcpu(v) )
277     {
278         unit->runstate_cnt[v->runstate.state]--;
279         unit->runstate_cnt[new_state]++;
280     }
281 
282     delta = new_entry_time - v->runstate.state_entry_time;
283     if ( delta > 0 )
284     {
285         v->runstate.time[v->runstate.state] += delta;
286         v->runstate.state_entry_time = new_entry_time;
287     }
288 
289     v->runstate.state = new_state;
290 }
291 
292 void sched_guest_idle(void (*idle) (void), unsigned int cpu)
293 {
294     /*
295      * Another vcpu of the unit is active in guest context while this one is
296      * idle. In case of a scheduling event we don't want to have high latencies
297      * due to a cpu needing to wake up from deep C state for joining the
298      * rendezvous, so avoid those deep C states by incrementing the urgent
299      * count of the cpu.
300      */
301     atomic_inc(&per_cpu(sched_urgent_count, cpu));
302     idle();
303     atomic_dec(&per_cpu(sched_urgent_count, cpu));
304 }
305 
306 void vcpu_runstate_get(const struct vcpu *v,
307                        struct vcpu_runstate_info *runstate)
308 {
309     spinlock_t *lock;
310     s_time_t delta;
311     struct sched_unit *unit;
312 
313     rcu_read_lock(&sched_res_rculock);
314 
315     /*
316      * Be careful in case of an idle vcpu: the assignment to a unit might
317      * change even with the scheduling lock held, so be sure to use the
318      * correct unit for locking in order to avoid triggering an ASSERT() in
319      * the unlock function.
320      */
321     unit = is_idle_vcpu(v) ? get_sched_res(v->processor)->sched_unit_idle
322                            : v->sched_unit;
323     lock = likely(v == current) ? NULL : unit_schedule_lock_irq(unit);
324     memcpy(runstate, &v->runstate, sizeof(*runstate));
325     delta = NOW() - runstate->state_entry_time;
326     if ( delta > 0 )
327         runstate->time[runstate->state] += delta;
328 
329     if ( unlikely(lock != NULL) )
330         unit_schedule_unlock_irq(lock, unit);
331 
332     rcu_read_unlock(&sched_res_rculock);
333 }
334 
335 uint64_t get_cpu_idle_time(unsigned int cpu)
336 {
337     struct vcpu_runstate_info state = { 0 };
338     const struct vcpu *v = idle_vcpu[cpu];
339 
340     if ( cpu_online(cpu) && v )
341         vcpu_runstate_get(v, &state);
342 
343     return state.time[RUNSTATE_running];
344 }
345 
346 /*
347  * If locks are different, take the one with the lower address first.
348  * This avoids dead- or live-locks when this code is running on both
349  * cpus at the same time.
350  */
351 static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
352                                    unsigned long *flags)
353 {
354     if ( lock1 == lock2 )
355     {
356         spin_lock_irqsave(lock1, *flags);
357     }
358     else if ( lock1 < lock2 )
359     {
360         spin_lock_irqsave(lock1, *flags);
361         spin_lock(lock2);
362     }
363     else
364     {
365         spin_lock_irqsave(lock2, *flags);
366         spin_lock(lock1);
367     }
368 }
369 
370 static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
371                                      unsigned long flags)
372 {
373     if ( lock1 != lock2 )
374         spin_unlock(lock2);
375     spin_unlock_irqrestore(lock1, flags);
376 }
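/*
 * A typical (sketched) use of the pair above, as done further down in
 * sched_unit_migrate_finish():
 *
 *     sched_spin_lock_double(old_lock, new_lock, &flags);
 *     ... work needing both pcpus' schedule locks ...
 *     sched_spin_unlock_double(old_lock, new_lock, flags);
 */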
377 
378 static void sched_free_unit_mem(struct sched_unit *unit)
379 {
380     struct sched_unit *prev_unit;
381     struct domain *d = unit->domain;
382 
383     if ( d->sched_unit_list == unit )
384         d->sched_unit_list = unit->next_in_list;
385     else
386     {
387         for_each_sched_unit ( d, prev_unit )
388         {
389             if ( prev_unit->next_in_list == unit )
390             {
391                 prev_unit->next_in_list = unit->next_in_list;
392                 break;
393             }
394         }
395     }
396 
397     free_cpumask_var(unit->cpu_hard_affinity);
398     free_cpumask_var(unit->cpu_hard_affinity_saved);
399     free_cpumask_var(unit->cpu_soft_affinity);
400 
401     xfree(unit);
402 }
403 
404 static void sched_free_unit(struct sched_unit *unit, struct vcpu *v)
405 {
406     const struct vcpu *vunit;
407     unsigned int cnt = 0;
408 
409     /* Don't count the vcpu being released; it might not be in the vcpu list yet. */
410     for_each_sched_unit_vcpu ( unit, vunit )
411         if ( vunit != v )
412             cnt++;
413 
414     v->sched_unit = NULL;
415     unit->runstate_cnt[v->runstate.state]--;
416 
417     if ( unit->vcpu_list == v )
418         unit->vcpu_list = v->next_in_list;
419 
420     if ( !cnt )
421         sched_free_unit_mem(unit);
422 }
423 
424 static void sched_unit_add_vcpu(struct sched_unit *unit, struct vcpu *v)
425 {
426     v->sched_unit = unit;
427 
428     /* All but idle vcpus are allocated with sequential vcpu_id. */
429     if ( !unit->vcpu_list || unit->vcpu_list->vcpu_id > v->vcpu_id )
430     {
431         unit->vcpu_list = v;
432         /*
433          * unit_id is always the same as the lowest vcpu_id of the unit.
434          * This is used to terminate the for_each_sched_unit_vcpu() loop and
435          * to support cpupools with different granularities.
436          */
437         unit->unit_id = v->vcpu_id;
438     }
439     unit->runstate_cnt[v->runstate.state]++;
440 }
441 
442 static struct sched_unit *sched_alloc_unit_mem(void)
443 {
444     struct sched_unit *unit;
445 
446     unit = xzalloc(struct sched_unit);
447     if ( !unit )
448         return NULL;
449 
450     if ( !zalloc_cpumask_var(&unit->cpu_hard_affinity) ||
451          !zalloc_cpumask_var(&unit->cpu_hard_affinity_saved) ||
452          !zalloc_cpumask_var(&unit->cpu_soft_affinity) )
453     {
454         sched_free_unit_mem(unit);
455         unit = NULL;
456     }
457 
458     return unit;
459 }
460 
461 static void sched_domain_insert_unit(struct sched_unit *unit, struct domain *d)
462 {
463     struct sched_unit **prev_unit;
464 
465     unit->domain = d;
466 
467     for ( prev_unit = &d->sched_unit_list; *prev_unit;
468           prev_unit = &(*prev_unit)->next_in_list )
469         if ( (*prev_unit)->next_in_list &&
470              (*prev_unit)->next_in_list->unit_id > unit->unit_id )
471             break;
472 
473     unit->next_in_list = *prev_unit;
474     *prev_unit = unit;
475 }
476 
477 static struct sched_unit *sched_alloc_unit(struct vcpu *v)
478 {
479     struct sched_unit *unit;
480     struct domain *d = v->domain;
481     unsigned int gran = cpupool_get_granularity(d->cpupool);
482 
483     for_each_sched_unit ( d, unit )
484         if ( unit->unit_id / gran == v->vcpu_id / gran )
485             break;
486 
487     if ( unit )
488     {
489         sched_unit_add_vcpu(unit, v);
490         return unit;
491     }
492 
493     if ( (unit = sched_alloc_unit_mem()) == NULL )
494         return NULL;
495 
496     sched_unit_add_vcpu(unit, v);
497     sched_domain_insert_unit(unit, d);
498 
499     return unit;
500 }
501 
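/*
 * Pick a reasonable initial pcpu for a new vcpu: start from the cpus of the
 * domain's node affinity intersected with its cpupool (falling back to the
 * whole pool), take the first such cpu for vcpu 0 and, for the others, cycle
 * onwards from the pcpu of the previous vcpu so vcpus get spread out.
 */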
502 static unsigned int sched_select_initial_cpu(const struct vcpu *v)
503 {
504     const struct domain *d = v->domain;
505     nodeid_t node;
506     spinlock_t *lock;
507     unsigned long flags;
508     unsigned int cpu_ret, cpu = smp_processor_id();
509     cpumask_t *cpus = cpumask_scratch_cpu(cpu);
510 
511     lock = pcpu_schedule_lock_irqsave(cpu, &flags);
512     cpumask_clear(cpus);
513     for_each_node_mask ( node, d->node_affinity )
514         cpumask_or(cpus, cpus, &node_to_cpumask(node));
515     cpumask_and(cpus, cpus, d->cpupool->cpu_valid);
516     if ( cpumask_empty(cpus) )
517         cpumask_copy(cpus, d->cpupool->cpu_valid);
518 
519     if ( v->vcpu_id == 0 )
520         cpu_ret = cpumask_first(cpus);
521     else
522     {
523         /* We can rely on previous vcpu being available. */
524         ASSERT(!is_idle_domain(d));
525 
526         cpu_ret = cpumask_cycle(d->vcpu[v->vcpu_id - 1]->processor, cpus);
527     }
528 
529     pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
530 
531     return cpu_ret;
532 }
533 
534 int sched_init_vcpu(struct vcpu *v)
535 {
536     const struct domain *d = v->domain;
537     struct sched_unit *unit;
538     unsigned int processor;
539 
540     if ( (unit = sched_alloc_unit(v)) == NULL )
541         return 1;
542 
543     if ( is_idle_domain(d) )
544         processor = v->vcpu_id;
545     else
546         processor = sched_select_initial_cpu(v);
547 
548     /* Initialise the per-vcpu timers. */
549     spin_lock_init(&v->periodic_timer_lock);
550     init_timer(&v->periodic_timer, vcpu_periodic_timer_fn, v, processor);
551     init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn, v, processor);
552     init_timer(&v->poll_timer, poll_timer_fn, v, processor);
553 
554     /* If this is not the first vcpu of the unit we are done. */
555     if ( unit->priv != NULL )
556     {
557         v->processor = processor;
558         return 0;
559     }
560 
561     rcu_read_lock(&sched_res_rculock);
562 
563     /* The first vcpu of a unit can be set via sched_set_res(). */
564     sched_set_res(unit, get_sched_res(processor));
565 
566     unit->priv = sched_alloc_udata(dom_scheduler(d), unit, d->sched_priv);
567     if ( unit->priv == NULL )
568     {
569         sched_free_unit(unit, v);
570         rcu_read_unlock(&sched_res_rculock);
571         return 1;
572     }
573 
574     /*
575      * Initialize affinity settings. The idler, and potentially
576      * domain-0 VCPUs, are pinned onto their respective physical CPUs.
577      */
578     if ( is_idle_domain(d) || (is_hardware_domain(d) && opt_dom0_vcpus_pin) )
579         sched_set_affinity(unit, cpumask_of(processor), &cpumask_all);
580     else
581         sched_set_affinity(unit, &cpumask_all, &cpumask_all);
582 
583     /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
584     if ( is_idle_domain(d) )
585     {
586         get_sched_res(v->processor)->curr = unit;
587         get_sched_res(v->processor)->sched_unit_idle = unit;
588         v->is_running = true;
589         unit->is_running = true;
590         unit->state_entry_time = NOW();
591     }
592     else
593     {
594         sched_insert_unit(dom_scheduler(d), unit);
595     }
596 
597     rcu_read_unlock(&sched_res_rculock);
598 
599     return 0;
600 }
601 
602 static void vcpu_move_irqs(struct vcpu *v)
603 {
604     arch_move_irqs(v);
605     evtchn_move_pirqs(v);
606 }
607 
608 static void sched_move_irqs(const struct sched_unit *unit)
609 {
610     struct vcpu *v;
611 
612     for_each_sched_unit_vcpu ( unit, v )
613         vcpu_move_irqs(v);
614 }
615 
616 int sched_move_domain(struct domain *d, struct cpupool *c)
617 {
618     struct vcpu *v;
619     struct sched_unit *unit;
620     unsigned int new_p, unit_idx;
621     void **unit_priv;
622     void *domdata;
623     void *unitdata;
624     struct scheduler *old_ops;
625     void *old_domdata;
626     unsigned int gran = cpupool_get_granularity(c);
627     int ret = 0;
628 
629     for_each_vcpu ( d, v )
630     {
631         if ( v->affinity_broken )
632             return -EBUSY;
633     }
634 
635     rcu_read_lock(&sched_res_rculock);
636 
637     domdata = sched_alloc_domdata(c->sched, d);
638     if ( IS_ERR(domdata) )
639     {
640         ret = PTR_ERR(domdata);
641         goto out;
642     }
643 
644     unit_priv = xzalloc_array(void *, DIV_ROUND_UP(d->max_vcpus, gran));
645     if ( unit_priv == NULL )
646     {
647         sched_free_domdata(c->sched, domdata);
648         ret = -ENOMEM;
649         goto out;
650     }
651 
652     unit_idx = 0;
653     for_each_sched_unit ( d, unit )
654     {
655         unit_priv[unit_idx] = sched_alloc_udata(c->sched, unit, domdata);
656         if ( unit_priv[unit_idx] == NULL )
657         {
658             for ( unit_idx = 0; unit_priv[unit_idx]; unit_idx++ )
659                 sched_free_udata(c->sched, unit_priv[unit_idx]);
660             xfree(unit_priv);
661             sched_free_domdata(c->sched, domdata);
662             ret = -ENOMEM;
663             goto out;
664         }
665         unit_idx++;
666     }
667 
668     domain_pause(d);
669 
670     old_ops = dom_scheduler(d);
671     old_domdata = d->sched_priv;
672 
673     for_each_sched_unit ( d, unit )
674     {
675         sched_remove_unit(old_ops, unit);
676     }
677 
678     d->cpupool = c;
679     d->sched_priv = domdata;
680 
681     new_p = cpumask_first(c->cpu_valid);
682     unit_idx = 0;
683     for_each_sched_unit ( d, unit )
684     {
685         spinlock_t *lock;
686         unsigned int unit_p = new_p;
687 
688         unitdata = unit->priv;
689         unit->priv = unit_priv[unit_idx];
690 
691         for_each_sched_unit_vcpu ( unit, v )
692         {
693             migrate_timer(&v->periodic_timer, new_p);
694             migrate_timer(&v->singleshot_timer, new_p);
695             migrate_timer(&v->poll_timer, new_p);
696             new_p = cpumask_cycle(new_p, c->cpu_valid);
697         }
698 
699         lock = unit_schedule_lock_irq(unit);
700 
701         sched_set_affinity(unit, &cpumask_all, &cpumask_all);
702 
703         sched_set_res(unit, get_sched_res(unit_p));
704         /*
705          * With v->processor modified we must not
706          * - make any further changes assuming we hold the scheduler lock,
707          * - use unit_schedule_unlock_irq().
708          */
709         spin_unlock_irq(lock);
710 
711         if ( !d->is_dying )
712             sched_move_irqs(unit);
713 
714         sched_insert_unit(c->sched, unit);
715 
716         sched_free_udata(old_ops, unitdata);
717 
718         unit_idx++;
719     }
720 
721     domain_update_node_affinity(d);
722 
723     domain_unpause(d);
724 
725     sched_free_domdata(old_ops, old_domdata);
726 
727     xfree(unit_priv);
728 
729 out:
730     rcu_read_unlock(&sched_res_rculock);
731 
732     return ret;
733 }
734 
735 void sched_destroy_vcpu(struct vcpu *v)
736 {
737     struct sched_unit *unit = v->sched_unit;
738 
739     kill_timer(&v->periodic_timer);
740     kill_timer(&v->singleshot_timer);
741     kill_timer(&v->poll_timer);
742     if ( test_and_clear_bool(v->is_urgent) )
743         atomic_dec(&per_cpu(sched_urgent_count, v->processor));
744     /*
745      * Vcpus are being destroyed top-down. So being the first vcpu of a unit
746      * is the same as being the only one.
747      */
748     if ( unit->vcpu_list == v )
749     {
750         rcu_read_lock(&sched_res_rculock);
751 
752         sched_remove_unit(vcpu_scheduler(v), unit);
753         sched_free_udata(vcpu_scheduler(v), unit->priv);
754         sched_free_unit(unit, v);
755 
756         rcu_read_unlock(&sched_res_rculock);
757     }
758 }
759 
760 int sched_init_domain(struct domain *d, int poolid)
761 {
762     void *sdom;
763     int ret;
764 
765     ASSERT(d->cpupool == NULL);
766     ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
767 
768     if ( (ret = cpupool_add_domain(d, poolid)) )
769         return ret;
770 
771     SCHED_STAT_CRANK(dom_init);
772     TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
773 
774     rcu_read_lock(&sched_res_rculock);
775 
776     sdom = sched_alloc_domdata(dom_scheduler(d), d);
777 
778     rcu_read_unlock(&sched_res_rculock);
779 
780     if ( IS_ERR(sdom) )
781         return PTR_ERR(sdom);
782 
783     d->sched_priv = sdom;
784 
785     return 0;
786 }
787 
788 void sched_destroy_domain(struct domain *d)
789 {
790     ASSERT(d->domain_id < DOMID_FIRST_RESERVED);
791 
792     if ( d->cpupool )
793     {
794         SCHED_STAT_CRANK(dom_destroy);
795         TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
796 
797         rcu_read_lock(&sched_res_rculock);
798 
799         sched_free_domdata(dom_scheduler(d), d->sched_priv);
800         d->sched_priv = NULL;
801 
802         rcu_read_unlock(&sched_res_rculock);
803 
804         cpupool_rm_domain(d);
805     }
806 }
807 
808 static void vcpu_sleep_nosync_locked(struct vcpu *v)
809 {
810     struct sched_unit *unit = v->sched_unit;
811 
812     ASSERT(spin_is_locked(get_sched_res(v->processor)->schedule_lock));
813 
814     if ( likely(!vcpu_runnable(v)) )
815     {
816         if ( v->runstate.state == RUNSTATE_runnable )
817             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
818 
819         /* Only put the unit to sleep if none of its vcpus is runnable. */
820         if ( likely(!unit_runnable(unit)) )
821             sched_sleep(unit_scheduler(unit), unit);
822         else if ( unit_running(unit) > 1 && v->is_running &&
823                   !v->force_context_switch )
824         {
825             v->force_context_switch = true;
826             cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
827         }
828     }
829 }
830 
831 void vcpu_sleep_nosync(struct vcpu *v)
832 {
833     unsigned long flags;
834     spinlock_t *lock;
835 
836     TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
837 
838     rcu_read_lock(&sched_res_rculock);
839 
840     lock = unit_schedule_lock_irqsave(v->sched_unit, &flags);
841 
842     vcpu_sleep_nosync_locked(v);
843 
844     unit_schedule_unlock_irqrestore(lock, flags, v->sched_unit);
845 
846     rcu_read_unlock(&sched_res_rculock);
847 }
848 
849 void vcpu_sleep_sync(struct vcpu *v)
850 {
851     vcpu_sleep_nosync(v);
852 
853     while ( !vcpu_runnable(v) && v->is_running )
854         cpu_relax();
855 
856     sync_vcpu_execstate(v);
857 }
858 
859 void vcpu_wake(struct vcpu *v)
860 {
861     unsigned long flags;
862     spinlock_t *lock;
863     struct sched_unit *unit = v->sched_unit;
864 
865     TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
866 
867     rcu_read_lock(&sched_res_rculock);
868 
869     lock = unit_schedule_lock_irqsave(unit, &flags);
870 
871     if ( likely(vcpu_runnable(v)) )
872     {
873         if ( v->runstate.state >= RUNSTATE_blocked )
874             vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
875         /*
876          * Call sched_wake() unconditionally, even if unit is running already.
877          * We might have not been de-scheduled after vcpu_sleep_nosync_locked()
878          * and are now to be woken up again.
879          */
880         sched_wake(unit_scheduler(unit), unit);
881         if ( unit->is_running && !v->is_running && !v->force_context_switch )
882         {
883             v->force_context_switch = true;
884             cpu_raise_softirq(v->processor, SCHED_SLAVE_SOFTIRQ);
885         }
886     }
887     else if ( !(v->pause_flags & VPF_blocked) )
888     {
889         if ( v->runstate.state == RUNSTATE_blocked )
890             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
891     }
892 
893     unit_schedule_unlock_irqrestore(lock, flags, unit);
894 
895     rcu_read_unlock(&sched_res_rculock);
896 }
897 
898 void vcpu_unblock(struct vcpu *v)
899 {
900     if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
901         return;
902 
903     /* Polling period ends when a VCPU is unblocked. */
904     if ( unlikely(v->poll_evtchn != 0) )
905     {
906         v->poll_evtchn = 0;
907         /*
908          * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
909          * this VCPU (and it then going back to sleep on poll_mask).
910          * Test-and-clear is idiomatic and ensures clear_bit is not reordered.
911          */
912         if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
913             clear_bit(_VPF_blocked, &v->pause_flags);
914     }
915 
916     vcpu_wake(v);
917 }
918 
919 /*
920  * Do the actual movement of a unit from the old to the new CPU. The locks
921  * for *both* CPUs need to have been taken already when calling this!
922  */
923 static void sched_unit_move_locked(struct sched_unit *unit,
924                                    unsigned int new_cpu)
925 {
926     unsigned int old_cpu = unit->res->master_cpu;
927     const struct vcpu *v;
928 
929     rcu_read_lock(&sched_res_rculock);
930 
931     /*
932      * Transfer urgency status to new CPU before switching CPUs, as
933      * once the switch occurs, v->is_urgent is no longer protected by
934      * the per-CPU scheduler lock we are holding.
935      */
936     for_each_sched_unit_vcpu ( unit, v )
937     {
938         if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
939         {
940             atomic_inc(&per_cpu(sched_urgent_count, new_cpu));
941             atomic_dec(&per_cpu(sched_urgent_count, old_cpu));
942         }
943     }
944 
945     /*
946      * Actual CPU switch to new CPU.  This is safe because the lock
947      * pointer can't change while the current lock is held.
948      */
949     sched_migrate(unit_scheduler(unit), unit, new_cpu);
950 
951     rcu_read_unlock(&sched_res_rculock);
952 }
953 
954 /*
955  * Initiating migration
956  *
957  * In order to migrate, we need the unit in question to have stopped
958  * running and have called sched_sleep() (to take it off any
959  * runqueues, for instance); and if it is currently running, it needs
960  * to be scheduled out.  Finally, we need to hold the scheduling locks
961  * for both the processor we're migrating from, and the processor
962  * we're migrating to.
963  *
964  * In order to avoid deadlock while satisfying the final requirement,
965  * we must release any scheduling lock we hold, then try to grab both
966  * locks we want, then double-check to make sure that what we started
967  * to do hasn't been changed in the mean time.
968  *
969  * These steps are encapsulated in the following two functions; they
970  * should be called like this:
971  *
972  *     lock = unit_schedule_lock_irq(unit);
973  *     sched_unit_migrate_start(unit);
974  *     unit_schedule_unlock_irq(lock, unit)
975  *     sched_unit_migrate_finish(unit);
976  *
977  * sched_unit_migrate_finish() will do the work now if it can, or simply
978  * return if it can't (because unit is still running); in that case
979  * sched_unit_migrate_finish() will be called by unit_context_saved().
980  */
981 static void sched_unit_migrate_start(struct sched_unit *unit)
982 {
983     struct vcpu *v;
984 
985     for_each_sched_unit_vcpu ( unit, v )
986     {
987         set_bit(_VPF_migrating, &v->pause_flags);
988         vcpu_sleep_nosync_locked(v);
989     }
990 }
991 
992 static void sched_unit_migrate_finish(struct sched_unit *unit)
993 {
994     unsigned long flags;
995     unsigned int old_cpu, new_cpu;
996     spinlock_t *old_lock, *new_lock;
997     bool pick_called = false;
998     struct vcpu *v;
999 
1000     /*
1001      * If the unit is currently running, this will be handled by
1002      * unit_context_saved(); and in any case, if the bit is cleared, then
1003      * someone else has already done the work so we don't need to.
1004      */
1005     if ( unit->is_running )
1006         return;
1007     for_each_sched_unit_vcpu ( unit, v )
1008         if ( !test_bit(_VPF_migrating, &v->pause_flags) )
1009             return;
1010 
1011     old_cpu = new_cpu = unit->res->master_cpu;
1012     for ( ; ; )
1013     {
1014         /*
1015          * We need another iteration if the pre-calculated lock addresses
1016          * turn out to no longer be correct for the old and new cpu, as
1017          * re-evaluated while holding the locks.
1018          */
1019         old_lock = get_sched_res(old_cpu)->schedule_lock;
1020         new_lock = get_sched_res(new_cpu)->schedule_lock;
1021 
1022         sched_spin_lock_double(old_lock, new_lock, &flags);
1023 
1024         old_cpu = unit->res->master_cpu;
1025         if ( old_lock == get_sched_res(old_cpu)->schedule_lock )
1026         {
1027             /*
1028              * If we selected a CPU on the previous iteration, check if it
1029              * remains suitable for running this vCPU.
1030              */
1031             if ( pick_called &&
1032                  (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
1033                  cpumask_test_cpu(new_cpu, unit->cpu_hard_affinity) &&
1034                  cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
1035                 break;
1036 
1037             /* Select a new CPU. */
1038             new_cpu = sched_pick_resource(unit_scheduler(unit),
1039                                           unit)->master_cpu;
1040             if ( (new_lock == get_sched_res(new_cpu)->schedule_lock) &&
1041                  cpumask_test_cpu(new_cpu, unit->domain->cpupool->cpu_valid) )
1042                 break;
1043             pick_called = true;
1044         }
1045         else
1046         {
1047             /*
1048              * We do not hold the scheduler lock appropriate for this vCPU.
1049              * Thus we cannot select a new CPU on this iteration. Try again.
1050              */
1051             pick_called = false;
1052         }
1053 
1054         sched_spin_unlock_double(old_lock, new_lock, flags);
1055     }
1056 
1057     /*
1058      * NB. Check of v->running happens /after/ setting migration flag
1059      * because they both happen in (different) spinlock regions, and those
1060      * regions are strictly serialised.
1061      */
1062     if ( unit->is_running )
1063     {
1064         sched_spin_unlock_double(old_lock, new_lock, flags);
1065         return;
1066     }
1067     for_each_sched_unit_vcpu ( unit, v )
1068     {
1069         if ( !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
1070         {
1071             sched_spin_unlock_double(old_lock, new_lock, flags);
1072             return;
1073         }
1074     }
1075 
1076     sched_unit_move_locked(unit, new_cpu);
1077 
1078     sched_spin_unlock_double(old_lock, new_lock, flags);
1079 
1080     if ( old_cpu != new_cpu )
1081         sched_move_irqs(unit);
1082 
1083     /* Wake on new CPU. */
1084     for_each_sched_unit_vcpu ( unit, v )
1085         vcpu_wake(v);
1086 }
1087 
1088 static bool sched_check_affinity_broken(const struct sched_unit *unit)
1089 {
1090     const struct vcpu *v;
1091 
1092     for_each_sched_unit_vcpu ( unit, v )
1093         if ( v->affinity_broken )
1094             return true;
1095 
1096     return false;
1097 }
1098 
1099 static void sched_reset_affinity_broken(const struct sched_unit *unit)
1100 {
1101     struct vcpu *v;
1102 
1103     for_each_sched_unit_vcpu ( unit, v )
1104         v->affinity_broken = false;
1105 }
1106 
1107 void restore_vcpu_affinity(struct domain *d)
1108 {
1109     unsigned int cpu = smp_processor_id();
1110     struct sched_unit *unit;
1111 
1112     ASSERT(system_state == SYS_STATE_resume);
1113 
1114     rcu_read_lock(&sched_res_rculock);
1115 
1116     for_each_sched_unit ( d, unit )
1117     {
1118         spinlock_t *lock;
1119         unsigned int old_cpu = sched_unit_master(unit);
1120         struct sched_resource *res;
1121 
1122         ASSERT(!unit_runnable(unit));
1123 
1124         /*
1125          * Re-assign the initial processor as after resume we have no
1126          * guarantee the old processor has come back to life again.
1127          *
1128          * Therefore, here, before actually unpausing the domains, we should
1129          * set v->processor of each of their vCPUs to something that will
1130          * make sense for the scheduler of the cpupool they are in.
1131          */
1132         lock = unit_schedule_lock_irq(unit);
1133 
1134         cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
1135                     cpupool_domain_master_cpumask(d));
1136         if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
1137         {
1138             if ( sched_check_affinity_broken(unit) )
1139             {
1140                 sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
1141                 sched_reset_affinity_broken(unit);
1142                 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
1143                             cpupool_domain_master_cpumask(d));
1144             }
1145 
1146             if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
1147             {
1148                 /* Affinity settings of one vcpu are for the complete unit. */
1149                 printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
1150                        unit->vcpu_list);
1151                 sched_set_affinity(unit, &cpumask_all, NULL);
1152                 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
1153                             cpupool_domain_master_cpumask(d));
1154             }
1155         }
1156 
1157         res = get_sched_res(cpumask_any(cpumask_scratch_cpu(cpu)));
1158         sched_set_res(unit, res);
1159 
1160         spin_unlock_irq(lock);
1161 
1162         /* v->processor might have changed, so reacquire the lock. */
1163         lock = unit_schedule_lock_irq(unit);
1164         res = sched_pick_resource(unit_scheduler(unit), unit);
1165         sched_set_res(unit, res);
1166         spin_unlock_irq(lock);
1167 
1168         if ( old_cpu != sched_unit_master(unit) )
1169             sched_move_irqs(unit);
1170     }
1171 
1172     rcu_read_unlock(&sched_res_rculock);
1173 
1174     domain_update_node_affinity(d);
1175 }
1176 
1177 /*
1178  * This function is used by cpu_hotplug code via cpu notifier chain
1179  * and from cpupools to switch schedulers on a cpu.
1180  * Caller must get domlist_read_lock.
1181  */
1182 int cpu_disable_scheduler(unsigned int cpu)
1183 {
1184     struct domain *d;
1185     const struct cpupool *c;
1186     int ret = 0;
1187 
1188     rcu_read_lock(&sched_res_rculock);
1189 
1190     c = get_sched_res(cpu)->cpupool;
1191     if ( c == NULL )
1192         goto out;
1193 
1194     for_each_domain_in_cpupool ( d, c )
1195     {
1196         struct sched_unit *unit;
1197 
1198         for_each_sched_unit ( d, unit )
1199         {
1200             unsigned long flags;
1201             spinlock_t *lock = unit_schedule_lock_irqsave(unit, &flags);
1202 
1203             if ( !cpumask_intersects(unit->cpu_hard_affinity, c->cpu_valid) &&
1204                  cpumask_test_cpu(cpu, unit->cpu_hard_affinity) )
1205             {
1206                 if ( sched_check_affinity_broken(unit) )
1207                 {
1208                     /* The unit is temporarily pinned, can't move it. */
1209                     unit_schedule_unlock_irqrestore(lock, flags, unit);
1210                     ret = -EADDRINUSE;
1211                     break;
1212                 }
1213 
1214                 printk(XENLOG_DEBUG "Breaking affinity for %pv\n",
1215                        unit->vcpu_list);
1216 
1217                 sched_set_affinity(unit, &cpumask_all, NULL);
1218             }
1219 
1220             if ( unit->res != get_sched_res(cpu) )
1221             {
1222                 /* The unit is not on this cpu, so we can move on. */
1223                 unit_schedule_unlock_irqrestore(lock, flags, unit);
1224                 continue;
1225             }
1226 
1227             /* If it is on this cpu, we must send it away.
1228              * We are doing some cpupool manipulations:
1229              *  * we want to call the scheduler, and let it re-evaluate
1230              *    the placement of the vcpu, taking into account the new
1231              *    cpupool configuration;
1232              *  * the scheduler will always find a suitable solution, or
1233              *    things would have failed before getting in here.
1234              */
1235             sched_unit_migrate_start(unit);
1236             unit_schedule_unlock_irqrestore(lock, flags, unit);
1237             sched_unit_migrate_finish(unit);
1238 
1239             /*
1240              * The only caveat, in this case, is that a vcpu active in
1241              * the hypervisor may not be migratable. If so, the caller
1242              * should try again after releasing and reacquiring all locks.
1243              */
1244             if ( unit->res == get_sched_res(cpu) )
1245                 ret = -EAGAIN;
1246         }
1247     }
1248 
1249 out:
1250     rcu_read_unlock(&sched_res_rculock);
1251 
1252     return ret;
1253 }
1254 
1255 static int cpu_disable_scheduler_check(unsigned int cpu)
1256 {
1257     struct domain *d;
1258     const struct vcpu *v;
1259     const struct cpupool *c;
1260 
1261     c = get_sched_res(cpu)->cpupool;
1262     if ( c == NULL )
1263         return 0;
1264 
1265     for_each_domain_in_cpupool ( d, c )
1266         for_each_vcpu ( d, v )
1267             if ( v->affinity_broken )
1268                 return -EADDRINUSE;
1269 
1270     return 0;
1271 }
1272 
1273 /*
1274  * In general, this must be called with the scheduler lock held, because the
1275  * adjust_affinity hook may want to modify the vCPU state. However, when the
1276  * vCPU is being initialized (either for dom0 or domU) there is no risk of
1277  * races, and it's fine to not take the lock (we're talking about
1278  * sched_setup_dom0_vcpus() and sched_init_vcpu()).
1279  */
1280 static void sched_set_affinity(
1281     struct sched_unit *unit, const cpumask_t *hard, const cpumask_t *soft)
1282 {
1283     rcu_read_lock(&sched_res_rculock);
1284     sched_adjust_affinity(dom_scheduler(unit->domain), unit, hard, soft);
1285     rcu_read_unlock(&sched_res_rculock);
1286 
1287     if ( hard )
1288         cpumask_copy(unit->cpu_hard_affinity, hard);
1289     if ( soft )
1290         cpumask_copy(unit->cpu_soft_affinity, soft);
1291 
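    /*
     * Soft affinity only matters ("is effective") if it actually constrains
     * placement beyond hard affinity: i.e. hard affinity is not already a
     * subset of it, and the two masks still intersect.
     */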
1292     unit->soft_aff_effective = !cpumask_subset(unit->cpu_hard_affinity,
1293                                                unit->cpu_soft_affinity) &&
1294                                cpumask_intersects(unit->cpu_soft_affinity,
1295                                                   unit->cpu_hard_affinity);
1296 }
1297 
1298 static int vcpu_set_affinity(
1299     struct vcpu *v, const cpumask_t *affinity, const cpumask_t *which)
1300 {
1301     struct sched_unit *unit = v->sched_unit;
1302     spinlock_t *lock;
1303     int ret = 0;
1304 
1305     rcu_read_lock(&sched_res_rculock);
1306 
1307     lock = unit_schedule_lock_irq(unit);
1308 
1309     if ( v->affinity_broken )
1310         ret = -EBUSY;
1311     else
1312     {
1313         /*
1314          * Tell the scheduler we changed something about affinity,
1315          * and ask to re-evaluate vcpu placement.
1316          */
1317         if ( which == unit->cpu_hard_affinity )
1318         {
1319             sched_set_affinity(unit, affinity, NULL);
1320         }
1321         else
1322         {
1323             ASSERT(which == unit->cpu_soft_affinity);
1324             sched_set_affinity(unit, NULL, affinity);
1325         }
1326         sched_unit_migrate_start(unit);
1327     }
1328 
1329     unit_schedule_unlock_irq(lock, unit);
1330 
1331     domain_update_node_affinity(v->domain);
1332 
1333     sched_unit_migrate_finish(unit);
1334 
1335     rcu_read_unlock(&sched_res_rculock);
1336 
1337     return ret;
1338 }
1339 
1340 int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
1341 {
1342     cpumask_t *online;
1343 
1344     online = VCPU2ONLINE(v);
1345     if ( !cpumask_intersects(online, affinity) )
1346         return -EINVAL;
1347 
1348     return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_hard_affinity);
1349 }
1350 
1351 static int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
1352 {
1353     return vcpu_set_affinity(v, affinity, v->sched_unit->cpu_soft_affinity);
1354 }
1355 
1356 /* Block the currently-executing domain until a pertinent event occurs. */
1357 void vcpu_block(void)
1358 {
1359     struct vcpu *v = current;
1360 
1361     set_bit(_VPF_blocked, &v->pause_flags);
1362 
1363     arch_vcpu_block(v);
1364 
1365     /* Check for events /after/ blocking: avoids wakeup waiting race. */
1366     if ( local_events_need_delivery() )
1367     {
1368         clear_bit(_VPF_blocked, &v->pause_flags);
1369     }
1370     else
1371     {
1372         TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
1373         raise_softirq(SCHEDULE_SOFTIRQ);
1374     }
1375 }
1376 
1377 static void vcpu_block_enable_events(void)
1378 {
1379     local_event_delivery_enable();
1380     vcpu_block();
1381 }
1382 
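/*
 * SCHEDOP_poll: the guest passes an array of event channel ports and an
 * optional timeout.  The vcpu blocks (marked as polling via poll_mask and,
 * for a single port, poll_evtchn) until one of the ports becomes pending,
 * the timeout fires, or an event needs delivering.
 */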
1383 static long do_poll(struct sched_poll *sched_poll)
1384 {
1385     struct vcpu   *v = current;
1386     struct domain *d = v->domain;
1387     evtchn_port_t  port = 0;
1388     long           rc;
1389     unsigned int   i;
1390 
1391     /* Fairly arbitrary limit. */
1392     if ( sched_poll->nr_ports > 128 )
1393         return -EINVAL;
1394 
1395     if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
1396         return -EFAULT;
1397 
1398     set_bit(_VPF_blocked, &v->pause_flags);
1399     v->poll_evtchn = -1;
1400     set_bit(v->vcpu_id, d->poll_mask);
1401 
1402     arch_vcpu_block(v);
1403 
1404 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
1405     /* Check for events /after/ setting flags: avoids wakeup waiting race. */
1406     smp_mb();
1407 
1408     /*
1409      * Someone may have seen we are blocked but not that we are polling, or
1410      * vice versa. We are certainly being woken, so clean up and bail. Beyond
1411      * this point others can be guaranteed to clean up for us if they wake us.
1412      */
1413     rc = 0;
1414     if ( (v->poll_evtchn == 0) ||
1415          !test_bit(_VPF_blocked, &v->pause_flags) ||
1416          !test_bit(v->vcpu_id, d->poll_mask) )
1417         goto out;
1418 #endif
1419 
1420     rc = 0;
1421     if ( local_events_need_delivery() )
1422         goto out;
1423 
1424     for ( i = 0; i < sched_poll->nr_ports; i++ )
1425     {
1426         rc = -EFAULT;
1427         if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
1428             goto out;
1429 
1430         rc = -EINVAL;
1431         if ( !port_is_valid(d, port) )
1432             goto out;
1433 
1434         rc = 0;
1435         if ( evtchn_port_is_pending(d, port) )
1436             goto out;
1437     }
1438 
1439     if ( sched_poll->nr_ports == 1 )
1440         v->poll_evtchn = port;
1441 
1442     if ( sched_poll->timeout != 0 )
1443         set_timer(&v->poll_timer, sched_poll->timeout);
1444 
1445     TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
1446     raise_softirq(SCHEDULE_SOFTIRQ);
1447 
1448     return 0;
1449 
1450  out:
1451     v->poll_evtchn = 0;
1452     clear_bit(v->vcpu_id, d->poll_mask);
1453     clear_bit(_VPF_blocked, &v->pause_flags);
1454     return rc;
1455 }
1456 
1457 /* Voluntarily yield the processor for this allocation. */
1458 long vcpu_yield(void)
1459 {
1460     struct vcpu *v = current;
1461     spinlock_t *lock;
1462 
1463     rcu_read_lock(&sched_res_rculock);
1464 
1465     lock = unit_schedule_lock_irq(v->sched_unit);
1466     sched_yield(vcpu_scheduler(v), v->sched_unit);
1467     unit_schedule_unlock_irq(lock, v->sched_unit);
1468 
1469     rcu_read_unlock(&sched_res_rculock);
1470 
1471     SCHED_STAT_CRANK(vcpu_yield);
1472 
1473     TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
1474     raise_softirq(SCHEDULE_SOFTIRQ);
1475     return 0;
1476 }
1477 
1478 static void domain_watchdog_timeout(void *data)
1479 {
1480     struct domain *d = data;
1481 
1482     if ( d->is_shutting_down || d->is_dying )
1483         return;
1484 
1485     printk("Watchdog timer fired for domain %u\n", d->domain_id);
1486     domain_shutdown(d, SHUTDOWN_watchdog);
1487 }
1488 
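/*
 * SCHEDOP_watchdog helper: id 0 allocates a free watchdog slot, arms it and
 * returns its handle (slot index + 1); a non-zero id re-arms that existing
 * watchdog, or releases it if timeout is 0.  A watchdog which fires shuts
 * the domain down (see domain_watchdog_timeout() above).
 */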
1489 static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
1490 {
1491     if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
1492         return -EINVAL;
1493 
1494     spin_lock(&d->watchdog_lock);
1495 
1496     if ( id == 0 )
1497     {
1498         for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
1499         {
1500             if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
1501                 continue;
1502             set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1503             break;
1504         }
1505         spin_unlock(&d->watchdog_lock);
1506         return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1;
1507     }
1508 
1509     id -= 1;
1510     if ( !test_bit(id, &d->watchdog_inuse_map) )
1511     {
1512         spin_unlock(&d->watchdog_lock);
1513         return -EINVAL;
1514     }
1515 
1516     if ( timeout == 0 )
1517     {
1518         stop_timer(&d->watchdog_timer[id]);
1519         clear_bit(id, &d->watchdog_inuse_map);
1520     }
1521     else
1522     {
1523         set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1524     }
1525 
1526     spin_unlock(&d->watchdog_lock);
1527     return 0;
1528 }
1529 
1530 void watchdog_domain_init(struct domain *d)
1531 {
1532     unsigned int i;
1533 
1534     spin_lock_init(&d->watchdog_lock);
1535 
1536     d->watchdog_inuse_map = 0;
1537 
1538     for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1539         init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
1540 }
1541 
1542 void watchdog_domain_destroy(struct domain *d)
1543 {
1544     unsigned int i;
1545 
1546     for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1547         kill_timer(&d->watchdog_timer[i]);
1548 }
1549 
1550 /*
1551  * Pin a vcpu temporarily to a specific CPU (or restore old pinning state if
1552  * cpu is NR_CPUS).
1553  * Temporary pinning can be done for two reasons, which may be nested:
1554  * - VCPU_AFFINITY_OVERRIDE (requested by guest): is allowed to fail in case
1555  *   of a conflict (e.g. in case cpupool doesn't include requested CPU, or
1556  *   another conflicting temporary pinning is already in effect).
1557  * - VCPU_AFFINITY_WAIT (called by wait_event()): only used to pin vcpu to the
1558  *   CPU it is just running on. Can't fail if used properly.
1559  */
1560 int vcpu_temporary_affinity(struct vcpu *v, unsigned int cpu, uint8_t reason)
1561 {
1562     struct sched_unit *unit = v->sched_unit;
1563     spinlock_t *lock;
1564     int ret = -EINVAL;
1565     bool migrate;
1566 
1567     rcu_read_lock(&sched_res_rculock);
1568 
1569     lock = unit_schedule_lock_irq(unit);
1570 
1571     if ( cpu == NR_CPUS )
1572     {
1573         if ( v->affinity_broken & reason )
1574         {
1575             ret = 0;
1576             v->affinity_broken &= ~reason;
1577         }
1578         if ( !ret && !sched_check_affinity_broken(unit) )
1579             sched_set_affinity(unit, unit->cpu_hard_affinity_saved, NULL);
1580     }
1581     else if ( cpu < nr_cpu_ids )
1582     {
1583         if ( (v->affinity_broken & reason) ||
1584              (sched_check_affinity_broken(unit) && v->processor != cpu) )
1585             ret = -EBUSY;
1586         else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
1587         {
1588             if ( !sched_check_affinity_broken(unit) )
1589             {
1590                 cpumask_copy(unit->cpu_hard_affinity_saved,
1591                              unit->cpu_hard_affinity);
1592                 sched_set_affinity(unit, cpumask_of(cpu), NULL);
1593             }
1594             v->affinity_broken |= reason;
1595             ret = 0;
1596         }
1597     }
1598 
1599     migrate = !ret && !cpumask_test_cpu(v->processor, unit->cpu_hard_affinity);
1600     if ( migrate )
1601         sched_unit_migrate_start(unit);
1602 
1603     unit_schedule_unlock_irq(lock, unit);
1604 
1605     if ( migrate )
1606         sched_unit_migrate_finish(unit);
1607 
1608     rcu_read_unlock(&sched_res_rculock);
1609 
1610     return ret;
1611 }
1612 
1613 static inline
1614 int vcpuaffinity_params_invalid(const struct xen_domctl_vcpuaffinity *vcpuaff)
1615 {
1616     return vcpuaff->flags == 0 ||
1617            ((vcpuaff->flags & XEN_VCPUAFFINITY_HARD) &&
1618             guest_handle_is_null(vcpuaff->cpumap_hard.bitmap)) ||
1619            ((vcpuaff->flags & XEN_VCPUAFFINITY_SOFT) &&
1620             guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
1621 }
1622 
1623 int vcpu_affinity_domctl(struct domain *d, uint32_t cmd,
1624                          struct xen_domctl_vcpuaffinity *vcpuaff)
1625 {
1626     struct vcpu *v;
1627     const struct sched_unit *unit;
1628     int ret = 0;
1629 
1630     if ( vcpuaff->vcpu >= d->max_vcpus )
1631         return -EINVAL;
1632 
1633     if ( (v = d->vcpu[vcpuaff->vcpu]) == NULL )
1634         return -ESRCH;
1635 
1636     if ( vcpuaffinity_params_invalid(vcpuaff) )
1637         return -EINVAL;
1638 
1639     unit = v->sched_unit;
1640 
1641     if ( cmd == XEN_DOMCTL_setvcpuaffinity )
1642     {
1643         cpumask_var_t new_affinity, old_affinity;
1644         cpumask_t *online = cpupool_domain_master_cpumask(v->domain);
1645 
1646         /*
1647          * We want to be able to restore hard affinity if we are setting
1648          * both, and changing soft affinity (which happens later, after hard
1649          * affinity has already been changed successfully) fails.
1650          */
1651         if ( !alloc_cpumask_var(&old_affinity) )
1652             return -ENOMEM;
1653 
1654         cpumask_copy(old_affinity, unit->cpu_hard_affinity);
1655 
1656         if ( !alloc_cpumask_var(&new_affinity) )
1657         {
1658             free_cpumask_var(old_affinity);
1659             return -ENOMEM;
1660         }
1661 
1662         /* Undo a stuck SCHED_pin_override? */
1663         if ( vcpuaff->flags & XEN_VCPUAFFINITY_FORCE )
1664             vcpu_temporary_affinity(v, NR_CPUS, VCPU_AFFINITY_OVERRIDE);
1665 
1666         ret = 0;
1667 
1668         /*
1669          * We both set a new affinity and report back to the caller what
1670          * the scheduler will be effectively using.
1671          */
1672         if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
1673         {
1674             ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
1675                                           &vcpuaff->cpumap_hard, nr_cpu_ids);
1676             if ( !ret )
1677                 ret = vcpu_set_hard_affinity(v, new_affinity);
1678             if ( ret )
1679                 goto setvcpuaffinity_out;
1680 
1681             /*
1682              * For hard affinity, what we return is the intersection of
1683              * cpupool's online mask and the new hard affinity.
1684              */
1685             cpumask_and(new_affinity, online, unit->cpu_hard_affinity);
1686             ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard, new_affinity);
1687         }
1688         if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
1689         {
1690             ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
1691                                           &vcpuaff->cpumap_soft, nr_cpu_ids);
1692             if ( !ret )
1693                 ret = vcpu_set_soft_affinity(v, new_affinity);
1694             if ( ret )
1695             {
1696                 /*
1697                  * Since we're returning error, the caller expects nothing
1698                  * happened, so we rollback the changes to hard affinity
1699                  * (if any).
1700                  */
1701                 if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
1702                     vcpu_set_hard_affinity(v, old_affinity);
1703                 goto setvcpuaffinity_out;
1704             }
1705 
1706             /*
1707              * For soft affinity, we return the intersection between the
1708              * new soft affinity, the cpupool's online map and the (new)
1709              * hard affinity.
1710              */
1711             cpumask_and(new_affinity, new_affinity, online);
1712             cpumask_and(new_affinity, new_affinity, unit->cpu_hard_affinity);
1713             ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft, new_affinity);
1714         }
1715 
1716  setvcpuaffinity_out:
1717         free_cpumask_var(new_affinity);
1718         free_cpumask_var(old_affinity);
1719     }
1720     else
1721     {
1722         if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
1723             ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
1724                                            unit->cpu_hard_affinity);
1725         if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
1726             ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
1727                                            unit->cpu_soft_affinity);
1728     }
1729 
1730     return ret;
1731 }
1732 
1733 void domain_update_node_affinity(struct domain *d)
1734 {
1735     cpumask_var_t dom_cpumask, dom_cpumask_soft;
1736     cpumask_t *dom_affinity;
1737     const cpumask_t *online;
1738     struct sched_unit *unit;
1739     unsigned int cpu;
1740 
1741     /* Do we have vcpus already? If not, no need to update node-affinity. */
1742     if ( !d->vcpu || !d->vcpu[0] )
1743         return;
1744 
1745     if ( !zalloc_cpumask_var(&dom_cpumask) )
1746         return;
1747     if ( !zalloc_cpumask_var(&dom_cpumask_soft) )
1748     {
1749         free_cpumask_var(dom_cpumask);
1750         return;
1751     }
1752 
1753     online = cpupool_domain_master_cpumask(d);
1754 
1755     spin_lock(&d->node_affinity_lock);
1756 
1757     /*
1758      * If d->auto_node_affinity is true, let's compute the domain's
1759      * node-affinity and update d->node_affinity accordingly. If false,
1760      * just leave d->node_affinity alone.
1761      */
1762     if ( d->auto_node_affinity )
1763     {
1764         /*
1765          * We want the narrowest possible set of pcpus (to get the narrowest
1766          * possible set of nodes). What we need is the cpumask of where the
1767          * domain can run (the union of the hard affinity of all its vcpus),
1768          * and the full mask of where it would prefer to run (the union of
1769          * the soft affinity of all its various vcpus). Let's build them.
1770          */
1771         for_each_sched_unit ( d, unit )
1772         {
1773             cpumask_or(dom_cpumask, dom_cpumask, unit->cpu_hard_affinity);
1774             cpumask_or(dom_cpumask_soft, dom_cpumask_soft,
1775                        unit->cpu_soft_affinity);
1776         }
1777         /* Filter out non-online cpus */
1778         cpumask_and(dom_cpumask, dom_cpumask, online);
1779         ASSERT(!cpumask_empty(dom_cpumask));
1780         /* And compute the intersection between hard, online and soft */
1781         cpumask_and(dom_cpumask_soft, dom_cpumask_soft, dom_cpumask);
1782 
1783         /*
1784          * If not empty, the intersection of hard, soft and online is the
1785          * narrowest set we want. If empty, we fall back to hard&online.
1786          */
1787         dom_affinity = cpumask_empty(dom_cpumask_soft) ?
1788                            dom_cpumask : dom_cpumask_soft;
1789 
1790         nodes_clear(d->node_affinity);
1791         for_each_cpu ( cpu, dom_affinity )
1792             node_set(cpu_to_node(cpu), d->node_affinity);
1793     }
1794 
1795     spin_unlock(&d->node_affinity_lock);
1796 
1797     free_cpumask_var(dom_cpumask_soft);
1798     free_cpumask_var(dom_cpumask);
1799 }
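
/*
 * Worked example for the computation above (hypothetical topology, shown
 * for illustration only): assume pCPUs 0-3 sit on node 0 and pCPUs 4-7 on
 * node 1, the cpupool's online mask is {0-5}, and the domain has two vcpus
 * with hard affinities {0-3} and {4-7} and soft affinity {2,3} each. Then:
 *   dom_cpumask      = ({0-3} | {4-7}) & {0-5} = {0-5}
 *   dom_cpumask_soft = {2,3} & {0-5}           = {2,3}   (non-empty)
 * so dom_affinity = {2,3} and the resulting node affinity is just node 0.
 * Had the soft intersection been empty, we would have fallen back to
 * hard & online, i.e. nodes 0 and 1.
 */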
1800 
1801 typedef long ret_t;
1802 
1803 #endif /* !COMPAT */
1804 
1805 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
1806 {
1807     ret_t ret = 0;
1808 
1809     switch ( cmd )
1810     {
1811     case SCHEDOP_yield:
1812     {
1813         ret = vcpu_yield();
1814         break;
1815     }
1816 
1817     case SCHEDOP_block:
1818     {
1819         vcpu_block_enable_events();
1820         break;
1821     }
1822 
1823     case SCHEDOP_shutdown:
1824     {
1825         struct sched_shutdown sched_shutdown;
1826 
1827         ret = -EFAULT;
1828         if ( copy_from_guest(&sched_shutdown, arg, 1) )
1829             break;
1830 
1831         TRACE_3D(TRC_SCHED_SHUTDOWN,
1832                  current->domain->domain_id, current->vcpu_id,
1833                  sched_shutdown.reason);
1834         ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason);
1835 
1836         break;
1837     }
1838 
1839     case SCHEDOP_shutdown_code:
1840     {
1841         struct sched_shutdown sched_shutdown;
1842         struct domain *d = current->domain;
1843 
1844         ret = -EFAULT;
1845         if ( copy_from_guest(&sched_shutdown, arg, 1) )
1846             break;
1847 
1848         TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
1849                  d->domain_id, current->vcpu_id, sched_shutdown.reason);
1850 
1851         spin_lock(&d->shutdown_lock);
1852         if ( d->shutdown_code == SHUTDOWN_CODE_INVALID )
1853             d->shutdown_code = (u8)sched_shutdown.reason;
1854         spin_unlock(&d->shutdown_lock);
1855 
1856         ret = 0;
1857         break;
1858     }
1859 
1860     case SCHEDOP_poll:
1861     {
1862         struct sched_poll sched_poll;
1863 
1864         ret = -EFAULT;
1865         if ( copy_from_guest(&sched_poll, arg, 1) )
1866             break;
1867 
1868         ret = do_poll(&sched_poll);
1869 
1870         break;
1871     }
1872 
1873     case SCHEDOP_remote_shutdown:
1874     {
1875         struct domain *d;
1876         struct sched_remote_shutdown sched_remote_shutdown;
1877 
1878         ret = -EFAULT;
1879         if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
1880             break;
1881 
1882         ret = -ESRCH;
1883         d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
1884         if ( d == NULL )
1885             break;
1886 
1887         ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d);
1888         if ( likely(!ret) )
1889             domain_shutdown(d, sched_remote_shutdown.reason);
1890 
1891         rcu_unlock_domain(d);
1892 
1893         break;
1894     }
1895 
1896     case SCHEDOP_watchdog:
1897     {
1898         struct sched_watchdog sched_watchdog;
1899 
1900         ret = -EFAULT;
1901         if ( copy_from_guest(&sched_watchdog, arg, 1) )
1902             break;
1903 
1904         ret = domain_watchdog(
1905             current->domain, sched_watchdog.id, sched_watchdog.timeout);
1906         break;
1907     }
1908 
1909     case SCHEDOP_pin_override:
1910     {
1911         struct sched_pin_override sched_pin_override;
1912         unsigned int cpu;
1913 
1914         ret = -EPERM;
1915         if ( !is_hardware_domain(current->domain) )
1916             break;
1917 
1918         ret = -EFAULT;
1919         if ( copy_from_guest(&sched_pin_override, arg, 1) )
1920             break;
1921 
1922         ret = -EINVAL;
1923         if ( sched_pin_override.pcpu >= NR_CPUS )
1924            break;
1925 
1926         cpu = sched_pin_override.pcpu < 0 ? NR_CPUS : sched_pin_override.pcpu;
1927         ret = vcpu_temporary_affinity(current, cpu, VCPU_AFFINITY_OVERRIDE);
1928 
1929         break;
1930     }
1931 
1932     default:
1933         ret = -ENOSYS;
1934     }
1935 
1936     return ret;
1937 }
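
/*
 * Guest-side sketch (illustrative only, not hypervisor code): a guest
 * typically reaches the SCHEDOP_shutdown case above via its hypercall
 * wrapper, using the public definitions from public/sched.h, roughly:
 *
 *     struct sched_shutdown ss = { .reason = SHUTDOWN_poweroff };
 *
 *     HYPERVISOR_sched_op(SCHEDOP_shutdown, &ss);
 *
 * SCHEDOP_yield and SCHEDOP_block take no argument structure at all.
 */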
1938 
1939 #ifndef COMPAT
1940 
1941 /* Per-vcpu oneshot-timer hypercall. */
1942 long do_set_timer_op(s_time_t timeout)
1943 {
1944     struct vcpu *v = current;
1945     s_time_t offset = timeout - NOW();
1946 
1947     if ( timeout == 0 )
1948     {
1949         stop_timer(&v->singleshot_timer);
1950     }
1951     else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
1952               unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
1953     {
1954         /*
1955          * Linux workaround: occasionally we will see timeouts a long way in
1956          * the future due to wrapping in Linux's jiffy time handling. We check
1957          * for timeouts wrapped negative, and for positive timeouts more than
1958          * about 13 days in the future (2^50ns). The correct fix is to trigger
1959          * an interrupt immediately (since Linux in fact has pending work to
1960          * do in this situation). However, older guests also set a long timeout
1961          * when they have *no* pending timers at all: setting an immediate
1962          * timeout in this case can burn a lot of CPU. We therefore go for a
1963          * reasonable middle ground of triggering a timer event in 100ms.
1964          */
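        /*
         * For reference: 2^50 ns is about 1.13e15 ns, i.e. roughly 13.03
         * days, which is where the "about 13 days" above comes from.
         */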
1965         gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n",
1966                  timeout);
1967         set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
1968     }
1969     else
1970     {
1971         migrate_timer(&v->singleshot_timer, smp_processor_id());
1972         set_timer(&v->singleshot_timer, timeout);
1973     }
1974 
1975     return 0;
1976 }
1977 
1978 /* sched_id - fetch ID of current scheduler */
1979 int sched_id(void)
1980 {
1981     return ops.sched_id;
1982 }
1983 
1984 /* Adjust scheduling parameter for a given domain. */
1985 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
1986 {
1987     long ret;
1988 
1989     ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd);
1990     if ( ret )
1991         return ret;
1992 
1993     if ( op->sched_id != dom_scheduler(d)->sched_id )
1994         return -EINVAL;
1995 
1996     switch ( op->cmd )
1997     {
1998     case XEN_DOMCTL_SCHEDOP_putinfo:
1999     case XEN_DOMCTL_SCHEDOP_getinfo:
2000     case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
2001     case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
2002         break;
2003     default:
2004         return -EINVAL;
2005     }
2006 
2007     /* NB: the pluggable scheduler code needs to take care
2008      * of locking by itself. */
2009     rcu_read_lock(&sched_res_rculock);
2010 
2011     if ( (ret = sched_adjust_dom(dom_scheduler(d), d, op)) == 0 )
2012         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
2013 
2014     rcu_read_unlock(&sched_res_rculock);
2015 
2016     return ret;
2017 }
2018 
2019 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
2020 {
2021     struct cpupool *pool;
2022     int rc;
2023 
2024     rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd);
2025     if ( rc )
2026         return rc;
2027 
2028     if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) &&
2029          (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) )
2030         return -EINVAL;
2031 
2032     pool = cpupool_get_by_id(op->cpupool_id);
2033     if ( pool == NULL )
2034         return -ESRCH;
2035 
2036     rcu_read_lock(&sched_res_rculock);
2037 
2038     rc = ((op->sched_id == pool->sched->sched_id)
2039           ? sched_adjust_cpupool(pool->sched, op) : -EINVAL);
2040 
2041     rcu_read_unlock(&sched_res_rculock);
2042 
2043     cpupool_put(pool);
2044 
2045     return rc;
2046 }
2047 
2048 static void vcpu_periodic_timer_work_locked(struct vcpu *v)
2049 {
2050     s_time_t now;
2051     s_time_t periodic_next_event;
2052 
2053     now = NOW();
2054     periodic_next_event = v->periodic_last_event + v->periodic_period;
2055 
2056     if ( now >= periodic_next_event )
2057     {
2058         send_timer_event(v);
2059         v->periodic_last_event = now;
2060         periodic_next_event = now + v->periodic_period;
2061     }
2062 
2063     migrate_timer(&v->periodic_timer, v->processor);
2064     set_timer(&v->periodic_timer, periodic_next_event);
2065 }
2066 
2067 static void vcpu_periodic_timer_work(struct vcpu *v)
2068 {
2069     if ( v->periodic_period == 0 )
2070         return;
2071 
2072     spin_lock(&v->periodic_timer_lock);
2073     if ( v->periodic_period )
2074         vcpu_periodic_timer_work_locked(v);
2075     spin_unlock(&v->periodic_timer_lock);
2076 }
2077 
2078 /*
2079  * Set the periodic timer of a vcpu.
2080  */
2081 void vcpu_set_periodic_timer(struct vcpu *v, s_time_t value)
2082 {
2083     spin_lock(&v->periodic_timer_lock);
2084 
2085     stop_timer(&v->periodic_timer);
2086 
2087     v->periodic_period = value;
2088     if ( value )
2089         vcpu_periodic_timer_work_locked(v);
2090 
2091     spin_unlock(&v->periodic_timer_lock);
2092 }
2093 
2094 static void sched_switch_units(struct sched_resource *sr,
2095                                struct sched_unit *next, struct sched_unit *prev,
2096                                s_time_t now)
2097 {
2098     unsigned int cpu;
2099 
2100     ASSERT(unit_running(prev));
2101 
2102     if ( prev != next )
2103     {
2104         sr->curr = next;
2105         sr->prev = prev;
2106 
2107         TRACE_3D(TRC_SCHED_SWITCH_INFPREV, prev->domain->domain_id,
2108                  prev->unit_id, now - prev->state_entry_time);
2109         TRACE_4D(TRC_SCHED_SWITCH_INFNEXT, next->domain->domain_id,
2110                  next->unit_id,
2111                  (next->vcpu_list->runstate.state == RUNSTATE_runnable) ?
2112                  (now - next->state_entry_time) : 0, prev->next_time);
2113         TRACE_4D(TRC_SCHED_SWITCH, prev->domain->domain_id, prev->unit_id,
2114                  next->domain->domain_id, next->unit_id);
2115 
2116         ASSERT(!unit_running(next));
2117 
2118         /*
2119          * NB. Don't add any trace records from here until the actual context
2120          * switch, else lost_records resume will not work properly.
2121          */
2122 
2123         ASSERT(!next->is_running);
2124         next->is_running = true;
2125         next->state_entry_time = now;
2126 
2127         if ( is_idle_unit(prev) )
2128         {
2129             prev->runstate_cnt[RUNSTATE_running] = 0;
2130             prev->runstate_cnt[RUNSTATE_runnable] = sr->granularity;
2131         }
2132         if ( is_idle_unit(next) )
2133         {
2134             next->runstate_cnt[RUNSTATE_running] = sr->granularity;
2135             next->runstate_cnt[RUNSTATE_runnable] = 0;
2136         }
2137     }
2138 
2139     for_each_cpu ( cpu, sr->cpus )
2140     {
2141         struct vcpu *vprev = get_cpu_current(cpu);
2142         struct vcpu *vnext = sched_unit2vcpu_cpu(next, cpu);
2143 
2144         if ( vprev != vnext || vprev->runstate.state != vnext->new_state )
2145         {
2146             vcpu_runstate_change(vprev,
2147                 ((vprev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
2148                  (vcpu_runnable(vprev) ? RUNSTATE_runnable : RUNSTATE_offline)),
2149                 now);
2150             vcpu_runstate_change(vnext, vnext->new_state, now);
2151         }
2152 
2153         vnext->is_running = true;
2154 
2155         if ( is_idle_vcpu(vnext) )
2156             vnext->sched_unit = next;
2157     }
2158 }
2159 
2160 static bool sched_tasklet_check_cpu(unsigned int cpu)
2161 {
2162     unsigned long *tasklet_work = &per_cpu(tasklet_work_to_do, cpu);
2163 
2164     switch ( *tasklet_work )
2165     {
2166     case TASKLET_enqueued:
2167         set_bit(_TASKLET_scheduled, tasklet_work);
2168         /* fallthrough */
2169     case TASKLET_enqueued|TASKLET_scheduled:
2170         return true;
2171         break;
2172     case TASKLET_scheduled:
2173         clear_bit(_TASKLET_scheduled, tasklet_work);
2174         /* fallthrough */
2175     case 0:
2176         /* return false; */
2177         break;
2178     default:
2179         BUG();
2180     }
2181 
2182     return false;
2183 }
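
/*
 * Summary of the state machine above (derived directly from the switch):
 *
 *   tasklet_work before       action taken              return value
 *   ------------------------  ------------------------  ------------
 *   TASKLET_enqueued          set TASKLET_scheduled     true
 *   enqueued | scheduled      none                      true
 *   TASKLET_scheduled         clear TASKLET_scheduled   false
 *   0                         none                      false
 */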
2184 
2185 static bool sched_tasklet_check(unsigned int cpu)
2186 {
2187     bool tasklet_work_scheduled = false;
2188     const cpumask_t *mask = get_sched_res(cpu)->cpus;
2189     unsigned int cpu_iter;
2190 
2191     for_each_cpu ( cpu_iter, mask )
2192         if ( sched_tasklet_check_cpu(cpu_iter) )
2193             tasklet_work_scheduled = true;
2194 
2195     return tasklet_work_scheduled;
2196 }
2197 
2198 static struct sched_unit *do_schedule(struct sched_unit *prev, s_time_t now,
2199                                       unsigned int cpu)
2200 {
2201     struct sched_resource *sr = get_sched_res(cpu);
2202     struct scheduler *sched = sr->scheduler;
2203     struct sched_unit *next;
2204 
2205     /* get policy-specific decision on scheduling... */
2206     sched->do_schedule(sched, prev, now, sched_tasklet_check(cpu));
2207 
2208     next = prev->next_task;
2209 
2210     if ( prev->next_time >= 0 ) /* -ve means no limit */
2211         set_timer(&sr->s_timer, now + prev->next_time);
2212 
2213     sched_switch_units(sr, next, prev, now);
2214 
2215     return next;
2216 }
2217 
2218 static void vcpu_context_saved(struct vcpu *vprev, struct vcpu *vnext)
2219 {
2220     /* Clear running flag /after/ writing context to memory. */
2221     smp_wmb();
2222 
2223     if ( vprev != vnext )
2224         vprev->is_running = false;
2225 }
2226 
2227 static void unit_context_saved(struct sched_resource *sr)
2228 {
2229     struct sched_unit *unit = sr->prev;
2230 
2231     if ( !unit )
2232         return;
2233 
2234     unit->is_running = false;
2235     unit->state_entry_time = NOW();
2236     sr->prev = NULL;
2237 
2238     /* Check for migration request /after/ clearing running flag. */
2239     smp_mb();
2240 
2241     sched_context_saved(unit_scheduler(unit), unit);
2242 
2243     /* Idle never migrates and idle vcpus might belong to other units. */
2244     if ( !is_idle_unit(unit) )
2245         sched_unit_migrate_finish(unit);
2246 }
2247 
2248 /*
2249  * Rendezvous on end of context switch.
2250  * As no lock is protecting this rendezvous function we need to use atomic
2251  * access functions on the counter.
2252  * The counter will be 0 in case no rendezvous is needed. For the rendezvous
2253  * case it is initialised to the number of cpus to rendezvous plus 1. Each
2254  * member entering decrements the counter. The last one will decrement it to
2255  * 1 and perform the final needed action (calling unit_context_saved()),
2256  * and then set the counter to zero. The other members will wait until the
2257  * counter becomes zero before they proceed.
2258  */
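/*
 * Worked example (core scheduling with a granularity of 2): the counter is
 * initialised to 3. The first pCPU finishing its context switch decrements
 * it to 2 and spins; the second decrements it to 1, calls
 * unit_context_saved() and then sets the counter to 0, which releases the
 * first pCPU from its wait loop.
 */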
2259 void sched_context_switched(struct vcpu *vprev, struct vcpu *vnext)
2260 {
2261     struct sched_unit *next = vnext->sched_unit;
2262     struct sched_resource *sr;
2263 
2264     rcu_read_lock(&sched_res_rculock);
2265 
2266     sr = get_sched_res(smp_processor_id());
2267 
2268     if ( atomic_read(&next->rendezvous_out_cnt) )
2269     {
2270         int cnt = atomic_dec_return(&next->rendezvous_out_cnt);
2271 
2272         vcpu_context_saved(vprev, vnext);
2273 
2274         /* Call unit_context_saved() before releasing other waiters. */
2275         if ( cnt == 1 )
2276         {
2277             unit_context_saved(sr);
2278             atomic_set(&next->rendezvous_out_cnt, 0);
2279         }
2280         else
2281             while ( atomic_read(&next->rendezvous_out_cnt) )
2282                 cpu_relax();
2283     }
2284     else
2285     {
2286         vcpu_context_saved(vprev, vnext);
2287         if ( sr->granularity == 1 )
2288             unit_context_saved(sr);
2289     }
2290 
2291     if ( is_idle_vcpu(vprev) && vprev != vnext )
2292         vprev->sched_unit = sr->sched_unit_idle;
2293 
2294     rcu_read_unlock(&sched_res_rculock);
2295 }
2296 
2297 /*
2298  * Switch to a new context or keep the current one running.
2299  * On x86 it won't return, so it needs to drop the still held sched_res_rculock.
2300  */
2301 static void sched_context_switch(struct vcpu *vprev, struct vcpu *vnext,
2302                                  bool reset_idle_unit, s_time_t now)
2303 {
2304     if ( unlikely(vprev == vnext) )
2305     {
2306         TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
2307                  vnext->domain->domain_id, vnext->sched_unit->unit_id,
2308                  now - vprev->runstate.state_entry_time,
2309                  vprev->sched_unit->next_time);
2310         sched_context_switched(vprev, vnext);
2311 
2312         /*
2313          * We are switching from a non-idle to an idle unit.
2314          * A vcpu of the idle unit might have been running before due to
2315          * the guest vcpu being blocked. We must adjust the unit of the idle
2316          * vcpu which might have been set to the guest's one.
2317          */
2318         if ( reset_idle_unit )
2319             vnext->sched_unit =
2320                 get_sched_res(smp_processor_id())->sched_unit_idle;
2321 
2322         rcu_read_unlock(&sched_res_rculock);
2323 
2324         trace_continue_running(vnext);
2325         return continue_running(vprev);
2326     }
2327 
2328     SCHED_STAT_CRANK(sched_ctx);
2329 
2330     stop_timer(&vprev->periodic_timer);
2331 
2332     if ( vnext->sched_unit->migrated )
2333         vcpu_move_irqs(vnext);
2334 
2335     vcpu_periodic_timer_work(vnext);
2336 
2337     rcu_read_unlock(&sched_res_rculock);
2338 
2339     context_switch(vprev, vnext);
2340 }
2341 
2342 /*
2343  * Force a context switch of a single vcpu of a unit.
2344  * Might be called either if a vcpu of an already running unit is woken up
2345  * or if a vcpu of a running unit is put to sleep with other vcpus of the
2346  * same unit still running.
2347  * Returns either NULL if v is already in the correct state or the vcpu to
2348  * run next.
2349  */
2350 static struct vcpu *sched_force_context_switch(struct vcpu *vprev,
2351                                                struct vcpu *v,
2352                                                unsigned int cpu, s_time_t now)
2353 {
2354     v->force_context_switch = false;
2355 
2356     if ( vcpu_runnable(v) == v->is_running )
2357         return NULL;
2358 
2359     if ( vcpu_runnable(v) )
2360     {
2361         if ( is_idle_vcpu(vprev) )
2362         {
2363             vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
2364             vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
2365         }
2366         vcpu_runstate_change(v, RUNSTATE_running, now);
2367     }
2368     else
2369     {
2370         /* Make sure not to switch the last vcpu of a unit away. */
2371         if ( unit_running(v->sched_unit) == 1 )
2372             return NULL;
2373 
2374         v->new_state = vcpu_runstate_blocked(v);
2375         vcpu_runstate_change(v, v->new_state, now);
2376         v = sched_unit2vcpu_cpu(vprev->sched_unit, cpu);
2377         if ( v != vprev )
2378         {
2379             if ( is_idle_vcpu(vprev) )
2380             {
2381                 vcpu_runstate_change(vprev, RUNSTATE_runnable, now);
2382                 vprev->sched_unit = get_sched_res(cpu)->sched_unit_idle;
2383             }
2384             else
2385             {
2386                 v->sched_unit = vprev->sched_unit;
2387                 vcpu_runstate_change(v, RUNSTATE_running, now);
2388             }
2389         }
2390     }
2391 
2392     /* This vcpu will be switched to. */
2393     v->is_running = true;
2394 
2395     /* Make sure not to lose another slave call. */
2396     raise_softirq(SCHED_SLAVE_SOFTIRQ);
2397 
2398     return v;
2399 }
2400 
2401 /*
2402  * Rendezvous before taking a scheduling decision.
2403  * Called with schedule lock held, so all accesses to the rendezvous counter
2404  * can be normal ones (no atomic accesses needed).
2405  * The counter is initialized to the number of cpus to rendezvous.
2406  * Each cpu entering will decrement the counter. When the counter reaches
2407  * zero do_schedule() is called and the rendezvous counter for leaving
2408  * context_switch() is set. All other members will wait until the counter
2409  * becomes zero, dropping the schedule lock in between.
2410  * Either returns the new unit to run, or NULL if no context switch is
2411  * required or (on Arm) has already been performed. If NULL is returned
2412  * sched_res_rculock has been dropped.
2413  */
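/*
 * Worked example (granularity 2): schedule() on the "master" pCPU sets
 * prev->rendezvous_in_cnt = 2 and kicks its sibling via SCHED_SLAVE_SOFTIRQ.
 * Whichever of the two pCPUs decrements the counter to 0 in here calls
 * do_schedule() and initialises rendezvous_out_cnt; the other one spins
 * (dropping and re-taking the schedule lock) until the counter is 0 and
 * then picks up prev->next_task.
 */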
2414 static struct sched_unit *sched_wait_rendezvous_in(struct sched_unit *prev,
2415                                                    spinlock_t **lock, int cpu,
2416                                                    s_time_t now)
2417 {
2418     struct sched_unit *next;
2419     struct vcpu *v;
2420     struct sched_resource *sr = get_sched_res(cpu);
2421     unsigned int gran = sr->granularity;
2422 
2423     if ( !--prev->rendezvous_in_cnt )
2424     {
2425         next = do_schedule(prev, now, cpu);
2426         atomic_set(&next->rendezvous_out_cnt, gran + 1);
2427         return next;
2428     }
2429 
2430     v = unit2vcpu_cpu(prev, cpu);
2431     while ( prev->rendezvous_in_cnt )
2432     {
2433         if ( v && v->force_context_switch )
2434         {
2435             struct vcpu *vprev = current;
2436 
2437             v = sched_force_context_switch(vprev, v, cpu, now);
2438 
2439             if ( v )
2440             {
2441                 /* We'll come back another time, so adjust rendezvous_in_cnt. */
2442                 prev->rendezvous_in_cnt++;
2443                 atomic_set(&prev->rendezvous_out_cnt, 0);
2444 
2445                 pcpu_schedule_unlock_irq(*lock, cpu);
2446 
2447                 sched_context_switch(vprev, v, false, now);
2448 
2449                 return NULL;     /* ARM only. */
2450             }
2451 
2452             v = unit2vcpu_cpu(prev, cpu);
2453         }
2454         /*
2455          * Check for any work to be done which might need cpu synchronization.
2456          * This is either pending RCU work, or tasklet work when coming from
2457          * idle. It is mandatory that RCU softirqs are of higher priority
2458          * than scheduling ones as otherwise a deadlock might occur.
2459          * In order to avoid deadlocks we can't do that here, but have to
2460          * schedule the previous vcpu again, which will lead to the desired
2461          * processing being done.
2462          * Undo the rendezvous_in_cnt decrement and schedule another call of
2463          * sched_slave().
2464          */
2465         BUILD_BUG_ON(RCU_SOFTIRQ > SCHED_SLAVE_SOFTIRQ ||
2466                      RCU_SOFTIRQ > SCHEDULE_SOFTIRQ);
2467         if ( rcu_pending(cpu) ||
2468              (is_idle_unit(prev) && sched_tasklet_check_cpu(cpu)) )
2469         {
2470             struct vcpu *vprev = current;
2471 
2472             prev->rendezvous_in_cnt++;
2473             atomic_set(&prev->rendezvous_out_cnt, 0);
2474 
2475             pcpu_schedule_unlock_irq(*lock, cpu);
2476 
2477             raise_softirq(SCHED_SLAVE_SOFTIRQ);
2478             sched_context_switch(vprev, vprev, false, now);
2479 
2480             return NULL;         /* ARM only. */
2481         }
2482 
2483         pcpu_schedule_unlock_irq(*lock, cpu);
2484 
2485         cpu_relax();
2486 
2487         *lock = pcpu_schedule_lock_irq(cpu);
2488 
2489         /*
2490          * Check for scheduling resource switched. This happens when we are
2491          * moved away from our cpupool and the cpus are now subject to the
2492          * idle scheduler.
2493          *
2494          * This is also a bail out case when scheduler_disable() has been
2495          * called.
2496          */
2497         if ( unlikely(sr != get_sched_res(cpu) || !scheduler_active) )
2498         {
2499             ASSERT(is_idle_unit(prev));
2500             atomic_set(&prev->next_task->rendezvous_out_cnt, 0);
2501             prev->rendezvous_in_cnt = 0;
2502             pcpu_schedule_unlock_irq(*lock, cpu);
2503             rcu_read_unlock(&sched_res_rculock);
2504             return NULL;
2505         }
2506     }
2507 
2508     return prev->next_task;
2509 }
2510 
2511 static void sched_slave(void)
2512 {
2513     struct vcpu          *v, *vprev = current;
2514     struct sched_unit    *prev = vprev->sched_unit, *next;
2515     s_time_t              now;
2516     spinlock_t           *lock;
2517     bool                  do_softirq = false;
2518     unsigned int          cpu = smp_processor_id();
2519 
2520     ASSERT_NOT_IN_ATOMIC();
2521 
2522     rcu_read_lock(&sched_res_rculock);
2523 
2524     lock = pcpu_schedule_lock_irq(cpu);
2525 
2526     now = NOW();
2527 
2528     v = unit2vcpu_cpu(prev, cpu);
2529     if ( v && v->force_context_switch )
2530     {
2531         v = sched_force_context_switch(vprev, v, cpu, now);
2532 
2533         if ( v )
2534         {
2535             pcpu_schedule_unlock_irq(lock, cpu);
2536 
2537             sched_context_switch(vprev, v, false, now);
2538 
2539             return;
2540         }
2541 
2542         do_softirq = true;
2543     }
2544 
2545     if ( !prev->rendezvous_in_cnt )
2546     {
2547         pcpu_schedule_unlock_irq(lock, cpu);
2548 
2549         rcu_read_unlock(&sched_res_rculock);
2550 
2551         /* Check for failed forced context switch. */
2552         if ( do_softirq )
2553             raise_softirq(SCHEDULE_SOFTIRQ);
2554 
2555         return;
2556     }
2557 
2558     stop_timer(&get_sched_res(cpu)->s_timer);
2559 
2560     next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
2561     if ( !next )
2562         return;
2563 
2564     pcpu_schedule_unlock_irq(lock, cpu);
2565 
2566     sched_context_switch(vprev, sched_unit2vcpu_cpu(next, cpu),
2567                          is_idle_unit(next) && !is_idle_unit(prev), now);
2568 }
2569 
2570 /*
2571  * The main function
2572  * - deschedule the current domain (scheduler independent).
2573  * - pick a new domain (scheduler dependent).
2574  */
2575 static void schedule(void)
2576 {
2577     struct vcpu          *vnext, *vprev = current;
2578     struct sched_unit    *prev = vprev->sched_unit, *next = NULL;
2579     s_time_t              now;
2580     struct sched_resource *sr;
2581     spinlock_t           *lock;
2582     int cpu = smp_processor_id();
2583     unsigned int          gran;
2584 
2585     ASSERT_NOT_IN_ATOMIC();
2586 
2587     SCHED_STAT_CRANK(sched_run);
2588 
2589     rcu_read_lock(&sched_res_rculock);
2590 
2591     lock = pcpu_schedule_lock_irq(cpu);
2592 
2593     sr = get_sched_res(cpu);
2594     gran = sr->granularity;
2595 
2596     if ( prev->rendezvous_in_cnt )
2597     {
2598         /*
2599          * We have a race: sched_slave() should be called, so raise a softirq
2600          * in order to re-enter schedule() later and call sched_slave() now.
2601          */
2602         pcpu_schedule_unlock_irq(lock, cpu);
2603 
2604         rcu_read_unlock(&sched_res_rculock);
2605 
2606         raise_softirq(SCHEDULE_SOFTIRQ);
2607         return sched_slave();
2608     }
2609 
2610     stop_timer(&sr->s_timer);
2611 
2612     now = NOW();
2613 
2614     if ( gran > 1 )
2615     {
2616         cpumask_t *mask = cpumask_scratch_cpu(cpu);
2617 
2618         prev->rendezvous_in_cnt = gran;
2619         cpumask_andnot(mask, sr->cpus, cpumask_of(cpu));
2620         cpumask_raise_softirq(mask, SCHED_SLAVE_SOFTIRQ);
2621         next = sched_wait_rendezvous_in(prev, &lock, cpu, now);
2622         if ( !next )
2623             return;
2624     }
2625     else
2626     {
2627         prev->rendezvous_in_cnt = 0;
2628         next = do_schedule(prev, now, cpu);
2629         atomic_set(&next->rendezvous_out_cnt, 0);
2630     }
2631 
2632     pcpu_schedule_unlock_irq(lock, cpu);
2633 
2634     vnext = sched_unit2vcpu_cpu(next, cpu);
2635     sched_context_switch(vprev, vnext,
2636                          !is_idle_unit(prev) && is_idle_unit(next), now);
2637 }
2638 
2639 /* The scheduler timer: force a run through the scheduler */
2640 static void s_timer_fn(void *unused)
2641 {
2642     raise_softirq(SCHEDULE_SOFTIRQ);
2643     SCHED_STAT_CRANK(sched_irq);
2644 }
2645 
2646 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
2647 static void vcpu_periodic_timer_fn(void *data)
2648 {
2649     struct vcpu *v = data;
2650     vcpu_periodic_timer_work(v);
2651 }
2652 
2653 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
2654 static void vcpu_singleshot_timer_fn(void *data)
2655 {
2656     struct vcpu *v = data;
2657     send_timer_event(v);
2658 }
2659 
2660 /* SCHEDOP_poll timeout callback. */
2661 static void poll_timer_fn(void *data)
2662 {
2663     struct vcpu *v = data;
2664 
2665     if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
2666         vcpu_unblock(v);
2667 }
2668 
2669 static struct sched_resource *sched_alloc_res(void)
2670 {
2671     struct sched_resource *sr;
2672 
2673     sr = xzalloc(struct sched_resource);
2674     if ( sr == NULL )
2675         return NULL;
2676     if ( !zalloc_cpumask_var(&sr->cpus) )
2677     {
2678         xfree(sr);
2679         return NULL;
2680     }
2681     return sr;
2682 }
2683 
2684 static int cpu_schedule_up(unsigned int cpu)
2685 {
2686     struct sched_resource *sr;
2687 
2688     sr = sched_alloc_res();
2689     if ( sr == NULL )
2690         return -ENOMEM;
2691 
2692     sr->master_cpu = cpu;
2693     cpumask_copy(sr->cpus, cpumask_of(cpu));
2694     set_sched_res(cpu, sr);
2695 
2696     sr->scheduler = &sched_idle_ops;
2697     spin_lock_init(&sr->_lock);
2698     sr->schedule_lock = &sched_free_cpu_lock;
2699     init_timer(&sr->s_timer, s_timer_fn, NULL, cpu);
2700     atomic_set(&per_cpu(sched_urgent_count, cpu), 0);
2701 
2702     /* We start with cpu granularity. */
2703     sr->granularity = 1;
2704 
2705     cpumask_set_cpu(cpu, &sched_res_mask);
2706 
2707     /* Boot CPU is dealt with later in scheduler_init(). */
2708     if ( cpu == 0 )
2709         return 0;
2710 
2711     if ( idle_vcpu[cpu] == NULL )
2712         vcpu_create(idle_vcpu[0]->domain, cpu);
2713     else
2714         idle_vcpu[cpu]->sched_unit->res = sr;
2715 
2716     if ( idle_vcpu[cpu] == NULL )
2717         return -ENOMEM;
2718 
2719     idle_vcpu[cpu]->sched_unit->rendezvous_in_cnt = 0;
2720 
2721     /*
2722      * No need to allocate any scheduler data, as cpus coming online are
2723      * free initially and the idle scheduler doesn't need any data areas
2724      * allocated.
2725      */
2726 
2727     sr->curr = idle_vcpu[cpu]->sched_unit;
2728     sr->sched_unit_idle = idle_vcpu[cpu]->sched_unit;
2729 
2730     sr->sched_priv = NULL;
2731 
2732     return 0;
2733 }
2734 
2735 static void sched_res_free(struct rcu_head *head)
2736 {
2737     struct sched_resource *sr = container_of(head, struct sched_resource, rcu);
2738 
2739     free_cpumask_var(sr->cpus);
2740     if ( sr->sched_unit_idle )
2741         sched_free_unit_mem(sr->sched_unit_idle);
2742     xfree(sr);
2743 }
2744 
2745 static void cpu_schedule_down(unsigned int cpu)
2746 {
2747     struct sched_resource *sr;
2748 
2749     rcu_read_lock(&sched_res_rculock);
2750 
2751     sr = get_sched_res(cpu);
2752 
2753     kill_timer(&sr->s_timer);
2754 
2755     cpumask_clear_cpu(cpu, &sched_res_mask);
2756     set_sched_res(cpu, NULL);
2757 
2758     /* Keep idle unit. */
2759     sr->sched_unit_idle = NULL;
2760     call_rcu(&sr->rcu, sched_res_free);
2761 
2762     rcu_read_unlock(&sched_res_rculock);
2763 }
2764 
2765 void sched_rm_cpu(unsigned int cpu)
2766 {
2767     int rc;
2768 
2769     rcu_read_lock(&domlist_read_lock);
2770     rc = cpu_disable_scheduler(cpu);
2771     BUG_ON(rc);
2772     rcu_read_unlock(&domlist_read_lock);
2773     cpu_schedule_down(cpu);
2774 }
2775 
2776 static int cpu_schedule_callback(
2777     struct notifier_block *nfb, unsigned long action, void *hcpu)
2778 {
2779     unsigned int cpu = (unsigned long)hcpu;
2780     int rc = 0;
2781 
2782     /*
2783      * All scheduler related suspend/resume handling needed is done in
2784      * cpupool.c.
2785      */
2786     if ( system_state > SYS_STATE_active )
2787         return NOTIFY_DONE;
2788 
2789     rcu_read_lock(&sched_res_rculock);
2790 
2791     /*
2792      * From the scheduler perspective, bringing up a pCPU requires
2793      * allocating and initializing the per-pCPU scheduler specific data,
2794      * as well as "registering" this pCPU to the scheduler (which may
2795      * involve modifying some scheduler wide data structures).
2796      * As new pCPUs always start as "free" cpus with the minimal idle
2797      * scheduler being in charge, we don't need any of that.
2798      *
2799      * On the other hand, at teardown, we need to reverse what has been done
2800      * during initialization, and then free the per-pCPU specific data. A
2801      * pCPU brought down is not forced through "free" cpus, so here we need to
2802      * use the appropriate hooks.
2803      *
2804      * This happens by calling the deinit_pdata and free_pdata hooks, in this
2805      * order. If no per-pCPU memory was allocated, there is no need to
2806      * provide an implementation of free_pdata. deinit_pdata may, however,
2807      * be necessary/useful in this case too (e.g., it can undo something done
2808      * on scheduler wide data structure during switch_sched). Both deinit_pdata
2809      * and free_pdata are called during CPU_DEAD.
2810      *
2811      * If something goes wrong during bringup, we go to CPU_UP_CANCELLED.
2812      */
2813     switch ( action )
2814     {
2815     case CPU_UP_PREPARE:
2816         rc = cpu_schedule_up(cpu);
2817         break;
2818     case CPU_DOWN_PREPARE:
2819         rcu_read_lock(&domlist_read_lock);
2820         rc = cpu_disable_scheduler_check(cpu);
2821         rcu_read_unlock(&domlist_read_lock);
2822         break;
2823     case CPU_DEAD:
2824         sched_rm_cpu(cpu);
2825         break;
2826     case CPU_UP_CANCELED:
2827         cpu_schedule_down(cpu);
2828         break;
2829     default:
2830         break;
2831     }
2832 
2833     rcu_read_unlock(&sched_res_rculock);
2834 
2835     return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
2836 }
2837 
2838 static struct notifier_block cpu_schedule_nfb = {
2839     .notifier_call = cpu_schedule_callback
2840 };
2841 
2842 const cpumask_t *sched_get_opt_cpumask(enum sched_gran opt, unsigned int cpu)
2843 {
2844     const cpumask_t *mask;
2845 
2846     switch ( opt )
2847     {
2848     case SCHED_GRAN_cpu:
2849         mask = cpumask_of(cpu);
2850         break;
2851     case SCHED_GRAN_core:
2852         mask = per_cpu(cpu_sibling_mask, cpu);
2853         break;
2854     case SCHED_GRAN_socket:
2855         mask = per_cpu(cpu_core_mask, cpu);
2856         break;
2857     default:
2858         ASSERT_UNREACHABLE();
2859         return NULL;
2860     }
2861 
2862     return mask;
2863 }
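
/*
 * Illustration (hypothetical topology): with two hyperthreads per core and
 * two cores per socket, numbered consecutively, cpu 5 would typically map to
 *   SCHED_GRAN_cpu:    {5}
 *   SCHED_GRAN_core:   {4,5}        (cpu_sibling_mask of cpu 5)
 *   SCHED_GRAN_socket: {4,5,6,7}    (cpu_core_mask of cpu 5)
 */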
2864 
2865 static void schedule_dummy(void)
2866 {
2867     sched_tasklet_check_cpu(smp_processor_id());
2868 }
2869 
2870 void scheduler_disable(void)
2871 {
2872     scheduler_active = false;
2873     open_softirq(SCHEDULE_SOFTIRQ, schedule_dummy);
2874     open_softirq(SCHED_SLAVE_SOFTIRQ, schedule_dummy);
2875 }
2876 
2877 void scheduler_enable(void)
2878 {
2879     open_softirq(SCHEDULE_SOFTIRQ, schedule);
2880     open_softirq(SCHED_SLAVE_SOFTIRQ, sched_slave);
2881     scheduler_active = true;
2882 }
2883 
2884 /* Initialise the data structures. */
2885 void __init scheduler_init(void)
2886 {
2887     struct domain *idle_domain;
2888     int i;
2889 
2890     scheduler_enable();
2891 
2892     for ( i = 0; i < NUM_SCHEDULERS; i++)
2893     {
2894 #define sched_test_func(f)                               \
2895         if ( !schedulers[i]->f )                         \
2896         {                                                \
2897             printk("scheduler %s misses .%s, dropped\n", \
2898                    schedulers[i]->opt_name, #f);         \
2899             schedulers[i] = NULL;                        \
2900         }
2901 
2902         sched_test_func(init);
2903         sched_test_func(deinit);
2904         sched_test_func(pick_resource);
2905         sched_test_func(alloc_udata);
2906         sched_test_func(free_udata);
2907         sched_test_func(switch_sched);
2908         sched_test_func(do_schedule);
2909 
2910 #undef sched_test_func
2911 
2912         if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 )
2913         {
2914             printk("scheduler %s failed initialization, dropped\n",
2915                    schedulers[i]->opt_name);
2916             schedulers[i] = NULL;
2917         }
2918 
2919         if ( schedulers[i] && !ops.name &&
2920              !strcmp(schedulers[i]->opt_name, opt_sched) )
2921             ops = *schedulers[i];
2922     }
2923 
2924     if ( !ops.name )
2925     {
2926         printk("Could not find scheduler: %s\n", opt_sched);
2927         for ( i = 0; i < NUM_SCHEDULERS; i++ )
2928             if ( schedulers[i] &&
2929                  !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) )
2930             {
2931                 ops = *schedulers[i];
2932                 break;
2933             }
2934         BUG_ON(!ops.name);
2935         printk("Using '%s' (%s)\n", ops.name, ops.opt_name);
2936     }
2937 
2938     if ( cpu_schedule_up(0) )
2939         BUG();
2940     register_cpu_notifier(&cpu_schedule_nfb);
2941 
2942     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
2943     if ( sched_init(&ops) )
2944         panic("scheduler returned error on init\n");
2945 
2946     if ( sched_ratelimit_us &&
2947          (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
2948           || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
2949     {
2950         printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
2951                " Resetting to default %u\n",
2952                XEN_SYSCTL_SCHED_RATELIMIT_MIN,
2953                XEN_SYSCTL_SCHED_RATELIMIT_MAX,
2954                SCHED_DEFAULT_RATELIMIT_US);
2955         sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
2956     }
2957 
2958     idle_domain = domain_create(DOMID_IDLE, NULL, false);
2959     BUG_ON(IS_ERR(idle_domain));
2960     BUG_ON(nr_cpu_ids > ARRAY_SIZE(idle_vcpu));
2961     idle_domain->vcpu = idle_vcpu;
2962     idle_domain->max_vcpus = nr_cpu_ids;
2963     if ( vcpu_create(idle_domain, 0) == NULL )
2964         BUG();
2965 
2966     rcu_read_lock(&sched_res_rculock);
2967 
2968     get_sched_res(0)->curr = idle_vcpu[0]->sched_unit;
2969     get_sched_res(0)->sched_unit_idle = idle_vcpu[0]->sched_unit;
2970 
2971     rcu_read_unlock(&sched_res_rculock);
2972 }
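
/*
 * Example (boot-time configuration, for illustration): the scheduler
 * selection and rate-limit checks above correspond to hypervisor command
 * line settings along the lines of
 *
 *     sched=credit2 sched_ratelimit_us=1000
 *
 * An unknown scheduler name falls back to CONFIG_SCHED_DEFAULT, and a
 * ratelimit outside [XEN_SYSCTL_SCHED_RATELIMIT_MIN,
 * XEN_SYSCTL_SCHED_RATELIMIT_MAX] is reset to SCHED_DEFAULT_RATELIMIT_US.
 */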
2973 
2974 /*
2975  * Move a pCPU from free cpus (running the idle scheduler) to a cpupool
2976  * using any "real" scheduler.
2977  * The cpu is still marked as "free" and not yet valid for its cpupool.
2978  */
2979 int schedule_cpu_add(unsigned int cpu, struct cpupool *c)
2980 {
2981     struct vcpu *idle;
2982     void *ppriv, *vpriv;
2983     struct scheduler *new_ops = c->sched;
2984     struct sched_resource *sr;
2985     spinlock_t *old_lock, *new_lock;
2986     unsigned long flags;
2987     int ret = 0;
2988 
2989     rcu_read_lock(&sched_res_rculock);
2990 
2991     sr = get_sched_res(cpu);
2992 
2993     ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
2994     ASSERT(!cpumask_test_cpu(cpu, c->cpu_valid));
2995     ASSERT(get_sched_res(cpu)->cpupool == NULL);
2996 
2997     /*
2998      * To setup the cpu for the new scheduler we need:
2999      *  - a valid instance of per-CPU scheduler specific data, as it is
3000      *    allocated by sched_alloc_pdata(). Note that we do not want to
3001      *    initialize it yet, as that will be done by the target scheduler,
3002      *    in sched_switch_sched(), in proper ordering and with locking.
3003      *  - a valid instance of per-vCPU scheduler specific data, for the idle
3004      *    vCPU of cpu. That is what the target scheduler will use for the
3005      *    sched_priv field of the per-vCPU info of the idle domain.
3006      */
3007     idle = idle_vcpu[cpu];
3008     ppriv = sched_alloc_pdata(new_ops, cpu);
3009     if ( IS_ERR(ppriv) )
3010     {
3011         ret = PTR_ERR(ppriv);
3012         goto out;
3013     }
3014 
3015     vpriv = sched_alloc_udata(new_ops, idle->sched_unit,
3016                               idle->domain->sched_priv);
3017     if ( vpriv == NULL )
3018     {
3019         sched_free_pdata(new_ops, ppriv, cpu);
3020         ret = -ENOMEM;
3021         goto out;
3022     }
3023 
3024     /*
3025      * The actual switch, including the rerouting of the scheduler lock to
3026      * whatever new_ops prefers, needs to happen in one critical section,
3027      * protected by old_ops' lock, or races are possible.
3028      * It is, in fact, the lock of the idle scheduler that we are taking.
3029      * But that is ok as anyone trying to schedule on this cpu will spin until
3030      * we release that lock (bottom of this function). Whoever then gets the
3031      * lock --thanks to the loop inside the *_schedule_lock() functions-- will
3032      * notice that the lock itself changed, and retry acquiring the new one
3033      * (which will be the correct, remapped one, at that point).
3034      */
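    /*
     * Rough sketch of the retry loop the comment above refers to (the real
     * lock helpers live in private.h; this is only meant to illustrate the
     * idea, not to reproduce them exactly):
     *
     *     for ( ; ; )
     *     {
     *         spinlock_t *lock = get_sched_res(cpu)->schedule_lock;
     *
     *         spin_lock(lock);
     *         if ( likely(lock == get_sched_res(cpu)->schedule_lock) )
     *             return lock;
     *         spin_unlock(lock);
     *     }
     */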
3035     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
3036 
3037     if ( cpupool_get_granularity(c) > 1 )
3038     {
3039         const cpumask_t *mask;
3040         unsigned int cpu_iter, idx = 0;
3041         struct sched_unit *old_unit, *master_unit;
3042         struct sched_resource *sr_old;
3043 
3044         /*
3045          * We need to merge multiple idle_vcpu units and sched_resource structs
3046          * into one. As the free cpus all share the same lock we are fine doing
3047          * that now. The worst that could happen would be someone waiting for
3048          * the lock, thus dereferencing sched_res->schedule_lock. This is the
3049          * reason we are freeing struct sched_res via call_rcu() to avoid the
3050          * lock pointer suddenly disappearing.
3051          */
3052         mask = sched_get_opt_cpumask(c->gran, cpu);
3053         master_unit = idle_vcpu[cpu]->sched_unit;
3054 
3055         for_each_cpu ( cpu_iter, mask )
3056         {
3057             if ( idx )
3058                 cpumask_clear_cpu(cpu_iter, &sched_res_mask);
3059 
3060             per_cpu(sched_res_idx, cpu_iter) = idx++;
3061 
3062             if ( cpu == cpu_iter )
3063                 continue;
3064 
3065             old_unit = idle_vcpu[cpu_iter]->sched_unit;
3066             sr_old = get_sched_res(cpu_iter);
3067             kill_timer(&sr_old->s_timer);
3068             idle_vcpu[cpu_iter]->sched_unit = master_unit;
3069             master_unit->runstate_cnt[RUNSTATE_running]++;
3070             set_sched_res(cpu_iter, sr);
3071             cpumask_set_cpu(cpu_iter, sr->cpus);
3072 
3073             call_rcu(&sr_old->rcu, sched_res_free);
3074         }
3075     }
3076 
3077     new_lock = sched_switch_sched(new_ops, cpu, ppriv, vpriv);
3078 
3079     sr->scheduler = new_ops;
3080     sr->sched_priv = ppriv;
3081 
3082     /*
3083      * Reroute the lock to the per-pCPU lock as the /last/ thing. In fact,
3084      * if it is free (and it can be) we want anyone who manages to take it
3085      * to find all the initializations we've done above in place.
3086      */
3087     smp_wmb();
3088     sr->schedule_lock = new_lock;
3089 
3090     /* _Not_ pcpu_schedule_unlock(): schedule_lock has changed! */
3091     spin_unlock_irqrestore(old_lock, flags);
3092 
3093     sr->granularity = cpupool_get_granularity(c);
3094     sr->cpupool = c;
3095     /* The cpu is added to a pool; trigger it to go pick up some work. */
3096     cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
3097 
3098 out:
3099     rcu_read_unlock(&sched_res_rculock);
3100 
3101     return ret;
3102 }
3103 
3104 /*
3105  * Remove a pCPU from its cpupool. Its scheduler becomes &sched_idle_ops
3106  * (the idle scheduler).
3107  * The cpu is already marked as "free" and not valid any longer for its
3108  * cpupool.
3109  */
3110 int schedule_cpu_rm(unsigned int cpu)
3111 {
3112     void *ppriv_old, *vpriv_old;
3113     struct sched_resource *sr, **sr_new = NULL;
3114     struct sched_unit *unit;
3115     struct scheduler *old_ops;
3116     spinlock_t *old_lock;
3117     unsigned long flags;
3118     int idx, ret = -ENOMEM;
3119     unsigned int cpu_iter;
3120 
3121     rcu_read_lock(&sched_res_rculock);
3122 
3123     sr = get_sched_res(cpu);
3124     old_ops = sr->scheduler;
3125 
3126     if ( sr->granularity > 1 )
3127     {
3128         sr_new = xmalloc_array(struct sched_resource *, sr->granularity - 1);
3129         if ( !sr_new )
3130             goto out;
3131         for ( idx = 0; idx < sr->granularity - 1; idx++ )
3132         {
3133             sr_new[idx] = sched_alloc_res();
3134             if ( sr_new[idx] )
3135             {
3136                 sr_new[idx]->sched_unit_idle = sched_alloc_unit_mem();
3137                 if ( !sr_new[idx]->sched_unit_idle )
3138                 {
3139                     sched_res_free(&sr_new[idx]->rcu);
3140                     sr_new[idx] = NULL;
3141                 }
3142             }
3143             if ( !sr_new[idx] )
3144             {
3145                 for ( idx--; idx >= 0; idx-- )
3146                     sched_res_free(&sr_new[idx]->rcu);
3147                 goto out;
3148             }
3149             sr_new[idx]->curr = sr_new[idx]->sched_unit_idle;
3150             sr_new[idx]->scheduler = &sched_idle_ops;
3151             sr_new[idx]->granularity = 1;
3152 
3153             /* We want the lock not to change when replacing the resource. */
3154             sr_new[idx]->schedule_lock = sr->schedule_lock;
3155         }
3156     }
3157 
3158     ret = 0;
3159     ASSERT(sr->cpupool != NULL);
3160     ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
3161     ASSERT(!cpumask_test_cpu(cpu, sr->cpupool->cpu_valid));
3162 
3163     /* See comment in schedule_cpu_add() regarding lock switching. */
3164     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
3165 
3166     vpriv_old = idle_vcpu[cpu]->sched_unit->priv;
3167     ppriv_old = sr->sched_priv;
3168 
3169     idx = 0;
3170     for_each_cpu ( cpu_iter, sr->cpus )
3171     {
3172         per_cpu(sched_res_idx, cpu_iter) = 0;
3173         if ( cpu_iter == cpu )
3174         {
3175             unit = idle_vcpu[cpu_iter]->sched_unit;
3176             unit->priv = NULL;
3177             atomic_set(&unit->next_task->rendezvous_out_cnt, 0);
3178             unit->rendezvous_in_cnt = 0;
3179         }
3180         else
3181         {
3182             /* Initialize unit. */
3183             unit = sr_new[idx]->sched_unit_idle;
3184             unit->res = sr_new[idx];
3185             unit->is_running = true;
3186             sched_unit_add_vcpu(unit, idle_vcpu[cpu_iter]);
3187             sched_domain_insert_unit(unit, idle_vcpu[cpu_iter]->domain);
3188 
3189             /* Adjust cpu masks of resources (old and new). */
3190             cpumask_clear_cpu(cpu_iter, sr->cpus);
3191             cpumask_set_cpu(cpu_iter, sr_new[idx]->cpus);
3192 
3193             /* Init timer. */
3194             init_timer(&sr_new[idx]->s_timer, s_timer_fn, NULL, cpu_iter);
3195 
3196             /* Last resource initializations and insert resource pointer. */
3197             sr_new[idx]->master_cpu = cpu_iter;
3198             set_sched_res(cpu_iter, sr_new[idx]);
3199 
3200             /* Last action: set the new lock pointer. */
3201             smp_mb();
3202             sr_new[idx]->schedule_lock = &sched_free_cpu_lock;
3203 
3204             idx++;
3205         }
3206     }
3207     sr->scheduler = &sched_idle_ops;
3208     sr->sched_priv = NULL;
3209     sr->granularity = 1;
3210     sr->cpupool = NULL;
3211 
3212     smp_mb();
3213     sr->schedule_lock = &sched_free_cpu_lock;
3214 
3215     /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
3216     spin_unlock_irqrestore(old_lock, flags);
3217 
3218     sched_deinit_pdata(old_ops, ppriv_old, cpu);
3219 
3220     sched_free_udata(old_ops, vpriv_old);
3221     sched_free_pdata(old_ops, ppriv_old, cpu);
3222 
3223 out:
3224     rcu_read_unlock(&sched_res_rculock);
3225     xfree(sr_new);
3226 
3227     return ret;
3228 }
3229 
3230 struct scheduler *scheduler_get_default(void)
3231 {
3232     return &ops;
3233 }
3234 
3235 struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
3236 {
3237     int i;
3238     struct scheduler *sched;
3239 
3240     for ( i = 0; i < NUM_SCHEDULERS; i++ )
3241         if ( schedulers[i] && schedulers[i]->sched_id == sched_id )
3242             goto found;
3243     *perr = -ENOENT;
3244     return NULL;
3245 
3246  found:
3247     *perr = -ENOMEM;
3248     if ( (sched = xmalloc(struct scheduler)) == NULL )
3249         return NULL;
3250     memcpy(sched, schedulers[i], sizeof(*sched));
3251     if ( (*perr = sched_init(sched)) != 0 )
3252     {
3253         xfree(sched);
3254         sched = NULL;
3255     }
3256 
3257     return sched;
3258 }
3259 
3260 void scheduler_free(struct scheduler *sched)
3261 {
3262     BUG_ON(sched == &ops);
3263     sched_deinit(sched);
3264     xfree(sched);
3265 }
3266 
3267 void schedule_dump(struct cpupool *c)
3268 {
3269     unsigned int      i, j;
3270     struct scheduler *sched;
3271     cpumask_t        *cpus;
3272 
3273     /* Locking, if necessary, must be handled within each scheduler. */
3274 
3275     rcu_read_lock(&sched_res_rculock);
3276 
3277     if ( c != NULL )
3278     {
3279         sched = c->sched;
3280         cpus = c->res_valid;
3281         printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
3282         sched_dump_settings(sched);
3283     }
3284     else
3285     {
3286         sched = &ops;
3287         cpus = &cpupool_free_cpus;
3288     }
3289 
3290     printk("CPUs info:\n");
3291     for_each_cpu (i, cpus)
3292     {
3293         struct sched_resource *sr = get_sched_res(i);
3294         unsigned long flags;
3295         spinlock_t *lock;
3296 
3297         lock = pcpu_schedule_lock_irqsave(i, &flags);
3298 
3299         printk("CPU[%02d] current=%pv, curr=%pv, prev=%pv\n", i,
3300                get_cpu_current(i), sr->curr ? sr->curr->vcpu_list : NULL,
3301                sr->prev ? sr->prev->vcpu_list : NULL);
3302         for_each_cpu (j, sr->cpus)
3303             if ( i != j )
3304                 printk("CPU[%02d] current=%pv\n", j, get_cpu_current(j));
3305 
3306         pcpu_schedule_unlock_irqrestore(lock, flags, i);
3307 
3308         sched_dump_cpu_state(sched, i);
3309     }
3310 
3311     rcu_read_unlock(&sched_res_rculock);
3312 }
3313 
3314 void wait(void)
3315 {
3316     schedule();
3317 }
3318 
3319 #ifdef CONFIG_X86
3320 void __init sched_setup_dom0_vcpus(struct domain *d)
3321 {
3322     unsigned int i;
3323     struct sched_unit *unit;
3324 
3325     for ( i = 1; i < d->max_vcpus; i++ )
3326         vcpu_create(d, i);
3327 
3328     /*
3329      * PV-shim: vcpus are pinned 1:1.
3330      * Initially only 1 cpu is online, others will be dealt with when
3331      * onlining them. This avoids pinning a vcpu to a not yet online cpu here.
3332      */
3333     if ( pv_shim )
3334         sched_set_affinity(d->vcpu[0]->sched_unit,
3335                            cpumask_of(0), cpumask_of(0));
3336     else
3337     {
3338         for_each_sched_unit ( d, unit )
3339         {
3340             if ( !opt_dom0_vcpus_pin && !dom0_affinity_relaxed )
3341                 sched_set_affinity(unit, &dom0_cpus, NULL);
3342             sched_set_affinity(unit, NULL, &dom0_cpus);
3343         }
3344     }
3345 
3346     domain_update_node_affinity(d);
3347 }
3348 #endif
3349 
3350 #ifdef CONFIG_COMPAT
3351 #include "compat.c"
3352 #endif
3353 
3354 #endif /* !COMPAT */
3355 
3356 /*
3357  * Local variables:
3358  * mode: C
3359  * c-file-style: "BSD"
3360  * c-basic-offset: 4
3361  * tab-width: 4
3362  * indent-tabs-mode: nil
3363  * End:
3364  */
3365