/*
 * xen/common/sched_null.c
 *
 *  Copyright (c) 2017, Dario Faggioli, Citrix Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * The 'null' scheduler always chooses to run, on each pCPU, either nothing
 * (i.e., the pCPU stays idle) or always the same vCPU.
 *
 * It is aimed at supporting static scenarios, where there are always
 * fewer vCPUs than pCPUs (and the vCPUs don't need to move among pCPUs
 * for any reason), with the least possible overhead.
 *
 * Typical use cases are embedded applications, but also HPC, especially
 * if the scheduler is used inside a cpupool.
 */

#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <xen/keyhandler.h>
#include <xen/trace.h>

/*
 * null tracing events. Check include/public/trace.h for more details.
 */
#define TRC_SNULL_PICKED_CPU    TRC_SCHED_CLASS_EVT(SNULL, 1)
#define TRC_SNULL_VCPU_ASSIGN   TRC_SCHED_CLASS_EVT(SNULL, 2)
#define TRC_SNULL_VCPU_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3)
#define TRC_SNULL_MIGRATE       TRC_SCHED_CLASS_EVT(SNULL, 4)
#define TRC_SNULL_SCHEDULE      TRC_SCHED_CLASS_EVT(SNULL, 5)
#define TRC_SNULL_TASKLET       TRC_SCHED_CLASS_EVT(SNULL, 6)

/*
 * Locking:
 * - Scheduler-lock (a.k.a. runqueue lock):
 *  + is per-pCPU;
 *  + serializes assignment and deassignment of vCPUs to a pCPU.
 * - Private data lock (a.k.a. private scheduler lock):
 *  + is scheduler-wide;
 *  + serializes accesses to the list of domains in this scheduler.
 * - Waitqueue lock:
 *  + is scheduler-wide;
 *  + serializes accesses to the list of vCPUs waiting to be assigned
 *    to pCPUs.
 *
 * Ordering is: private lock, runqueue lock, waitqueue lock. In other
 * words, the waitqueue lock nests inside the runqueue lock, which nests
 * inside the private lock. More specifically:
 *  + if we need both the runqueue and the private locks, we must acquire
 *    the private lock first;
 *  + if we need both the runqueue and the waitqueue locks, we must acquire
 *    the runqueue lock first;
 *  + if we need both the private and the waitqueue locks, we must acquire
 *    the private lock first;
 *  + if we already hold a runqueue lock, we must never acquire
 *    the private lock;
 *  + if we already hold the waitqueue lock, we must never acquire
 *    the runqueue lock or the private lock.
 */

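/*
 * Illustrative sketch of the nesting order described above. This is not an
 * actual code path in this file; pcpu_schedule_lock()/unlock() stand for
 * whatever routine takes the runqueue lock in the caller at hand:
 *
 *     spin_lock(&prv->lock);             <- private lock, outermost
 *     lock = pcpu_schedule_lock(cpu);    <- then the runqueue lock
 *     spin_lock(&prv->waitq_lock);       <- waitqueue lock, innermost
 *     ...
 *     spin_unlock(&prv->waitq_lock);
 *     pcpu_schedule_unlock(lock, cpu);
 *     spin_unlock(&prv->lock);
 */
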
/*
 * System-wide private data
 */
struct null_private {
    spinlock_t lock;        /* scheduler lock; nests inside cpupool_lock */
    struct list_head ndom;  /* Domains of this scheduler                 */
    struct list_head waitq; /* vCPUs not assigned to any pCPU            */
    spinlock_t waitq_lock;  /* serializes waitq; nests inside runq locks */
    cpumask_t cpus_free;    /* CPUs without a vCPU associated to them    */
};

/*
 * Physical CPU
 */
struct null_pcpu {
    struct vcpu *vcpu;
};
DEFINE_PER_CPU(struct null_pcpu, npc);

/*
 * Virtual CPU
 */
struct null_vcpu {
    struct list_head waitq_elem;
    struct vcpu *vcpu;
};

/*
 * Domain
 */
struct null_dom {
    struct list_head ndom_elem;
    struct domain *dom;
};

/*
 * Accessor helper functions
 */
static inline struct null_private *null_priv(const struct scheduler *ops)
{
    return ops->sched_data;
}

static inline struct null_vcpu *null_vcpu(const struct vcpu *v)
{
    return v->sched_priv;
}

static inline struct null_dom *null_dom(const struct domain *d)
{
    return d->sched_priv;
}

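/*
 * Check whether cpu is a valid placement for v at the given affinity
 * balancing step, i.e., whether cpu is in v's hard (or, when asked, soft)
 * affinity and in the cpupool of v's domain. Uses cpu's scratch cpumask.
 */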
static inline bool vcpu_check_affinity(struct vcpu *v, unsigned int cpu,
                                       unsigned int balance_step)
{
    affinity_balance_cpumask(v, balance_step, cpumask_scratch_cpu(cpu));
    cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                cpupool_domain_cpumask(v->domain));

    return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu));
}

static int null_init(struct scheduler *ops)
{
    struct null_private *prv;

    printk("Initializing null scheduler\n"
           "WARNING: This is experimental software in development.\n"
           "Use at your own risk.\n");

    prv = xzalloc(struct null_private);
    if ( prv == NULL )
        return -ENOMEM;

    spin_lock_init(&prv->lock);
    spin_lock_init(&prv->waitq_lock);
    INIT_LIST_HEAD(&prv->ndom);
    INIT_LIST_HEAD(&prv->waitq);

    ops->sched_data = prv;

    return 0;
}

static void null_deinit(struct scheduler *ops)
{
    xfree(ops->sched_data);
    ops->sched_data = NULL;
}

static void init_pdata(struct null_private *prv, unsigned int cpu)
{
    /* Mark the pCPU as free, and with no vCPU assigned */
    cpumask_set_cpu(cpu, &prv->cpus_free);
    per_cpu(npc, cpu).vcpu = NULL;
}

static void null_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
{
    struct null_private *prv = null_priv(ops);
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);

    /* alloc_pdata is not implemented, so we want this to be NULL. */
    ASSERT(!pdata);

    /*
     * The scheduler lock already points to the default per-cpu spinlock,
     * so there is no remapping to be done.
     */
    ASSERT(sd->schedule_lock == &sd->_lock && !spin_is_locked(&sd->_lock));

    init_pdata(prv, cpu);
}

static void null_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
    struct null_private *prv = null_priv(ops);

    /* alloc_pdata not implemented, so this must have stayed NULL */
    ASSERT(!pcpu);

    cpumask_clear_cpu(cpu, &prv->cpus_free);
    per_cpu(npc, cpu).vcpu = NULL;
}

static void *null_alloc_vdata(const struct scheduler *ops,
                              struct vcpu *v, void *dd)
{
    struct null_vcpu *nvc;

    nvc = xzalloc(struct null_vcpu);
    if ( nvc == NULL )
        return NULL;

    INIT_LIST_HEAD(&nvc->waitq_elem);
    nvc->vcpu = v;

    SCHED_STAT_CRANK(vcpu_alloc);

    return nvc;
}

static void null_free_vdata(const struct scheduler *ops, void *priv)
{
    struct null_vcpu *nvc = priv;

    xfree(nvc);
}

static void * null_alloc_domdata(const struct scheduler *ops,
                                 struct domain *d)
{
    struct null_private *prv = null_priv(ops);
    struct null_dom *ndom;
    unsigned long flags;

    ndom = xzalloc(struct null_dom);
    if ( ndom == NULL )
        return NULL;

    ndom->dom = d;

    spin_lock_irqsave(&prv->lock, flags);
    list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom);
    spin_unlock_irqrestore(&prv->lock, flags);

    return (void*)ndom;
}

static void null_free_domdata(const struct scheduler *ops, void *data)
{
    unsigned long flags;
    struct null_dom *ndom = data;
    struct null_private *prv = null_priv(ops);

    spin_lock_irqsave(&prv->lock, flags);
    list_del_init(&ndom->ndom_elem);
    spin_unlock_irqrestore(&prv->lock, flags);

    xfree(data);
}

static int null_dom_init(const struct scheduler *ops, struct domain *d)
{
    struct null_dom *ndom;

    if ( is_idle_domain(d) )
        return 0;

    ndom = null_alloc_domdata(ops, d);
    if ( ndom == NULL )
        return -ENOMEM;

    d->sched_priv = ndom;

    return 0;
}

static void null_dom_destroy(const struct scheduler *ops, struct domain *d)
{
    null_free_domdata(ops, null_dom(d));
}

/*
 * vCPU to pCPU assignment and placement. This _only_ happens:
 *  - on insert,
 *  - on migrate.
 *
 * Insert occurs when a vCPU joins this scheduler for the first time
 * (e.g., when the domain it's part of is moved to the scheduler's
 * cpupool).
 *
 * Migration may be necessary if a pCPU (with a vCPU assigned to it)
 * is removed from the scheduler's cpupool.
 *
 * So this is not part of any hot path.
 */
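/*
 * Pick a pCPU for v. For each affinity balancing step (soft first, then
 * hard), prefer v's current processor if it is free (or already ours) and
 * still valid, then any free pCPU within the affinity. Failing all that,
 * fall back to any valid pCPU, even if busy: the caller will only actually
 * assign v to the returned pCPU if that pCPU is free.
 */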
static unsigned int pick_cpu(struct null_private *prv, struct vcpu *v)
{
    unsigned int bs;
    unsigned int cpu = v->processor, new_cpu;
    cpumask_t *cpus = cpupool_domain_cpumask(v->domain);

    ASSERT(spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock));

    for_each_affinity_balance_step( bs )
    {
        if ( bs == BALANCE_SOFT_AFFINITY &&
             !has_soft_affinity(v, v->cpu_hard_affinity) )
            continue;

        affinity_balance_cpumask(v, bs, cpumask_scratch_cpu(cpu));
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus);

        /*
         * If v's current processor is free, or already assigned to v, and it
         * is also still valid and part of the affinity being considered,
         * just go for it.
         * (Note that we could call vcpu_check_affinity() here, but we
         * deliberately don't, so that the scratch cpumask keeps what we have
         * just put in it.)
         */
        if ( likely((per_cpu(npc, cpu).vcpu == NULL || per_cpu(npc, cpu).vcpu == v)
                    && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
        {
            new_cpu = cpu;
            goto out;
        }

        /* If not, just go for a free pCPU, within our affinity, if any */
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                    &prv->cpus_free);
        new_cpu = cpumask_first(cpumask_scratch_cpu(cpu));

        if ( likely(new_cpu != nr_cpu_ids) )
            goto out;
    }

    /*
     * If we didn't find any free pCPU, just pick any valid pCPU, even if
     * it has another vCPU assigned. This will happen during shutdown and
     * suspend/resume, but it may also happen during "normal operation", if
     * all the pCPUs are busy.
     *
     * In fact, there must always be something sane in v->processor, or
     * vcpu_schedule_lock() and friends won't work. This is not a problem,
     * as we will only actually assign the vCPU to the pCPU we return from
     * here if that pCPU is free.
     */
    cpumask_and(cpumask_scratch_cpu(cpu), cpus, v->cpu_hard_affinity);
    new_cpu = cpumask_any(cpumask_scratch_cpu(cpu));

 out:
    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t vcpu, dom;
            uint32_t new_cpu;
        } d;
        d.dom = v->domain->domain_id;
        d.vcpu = v->vcpu_id;
        d.new_cpu = new_cpu;
        __trace_var(TRC_SNULL_PICKED_CPU, 1, sizeof(d), &d);
    }

    return new_cpu;
}

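/*
 * Assign v to cpu: record the binding in the per-pCPU data, update
 * v->processor and mark cpu as no longer free.
 */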
static void vcpu_assign(struct null_private *prv, struct vcpu *v,
                        unsigned int cpu)
{
    per_cpu(npc, cpu).vcpu = v;
    v->processor = cpu;
    cpumask_clear_cpu(cpu, &prv->cpus_free);

    dprintk(XENLOG_G_INFO, "%d <-- d%dv%d\n", cpu, v->domain->domain_id, v->vcpu_id);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t vcpu, dom;
            uint32_t cpu;
        } d;
        d.dom = v->domain->domain_id;
        d.vcpu = v->vcpu_id;
        d.cpu = cpu;
        __trace_var(TRC_SNULL_VCPU_ASSIGN, 1, sizeof(d), &d);
    }
}

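/*
 * Deassign v from cpu: clear the per-pCPU binding and mark cpu as free
 * again. The caller decides what happens to v (waitqueue, another pCPU, ...).
 */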
static void vcpu_deassign(struct null_private *prv, struct vcpu *v,
                          unsigned int cpu)
{
    per_cpu(npc, cpu).vcpu = NULL;
    cpumask_set_cpu(cpu, &prv->cpus_free);

    dprintk(XENLOG_G_INFO, "%d <-- NULL (d%dv%d)\n", cpu, v->domain->domain_id, v->vcpu_id);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t vcpu, dom;
            uint32_t cpu;
        } d;
        d.dom = v->domain->domain_id;
        d.vcpu = v->vcpu_id;
        d.cpu = cpu;
        __trace_var(TRC_SNULL_VCPU_DEASSIGN, 1, sizeof(d), &d);
    }
}

/* Change the scheduler of cpu to us (null). */
static void null_switch_sched(struct scheduler *new_ops, unsigned int cpu,
                              void *pdata, void *vdata)
{
    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
    struct null_private *prv = null_priv(new_ops);
    struct null_vcpu *nvc = vdata;

    ASSERT(nvc && is_idle_vcpu(nvc->vcpu));

    idle_vcpu[cpu]->sched_priv = vdata;

    /*
     * We are already holding the runqueue lock (it's been taken in
     * schedule_cpu_switch()). It may or may not be the 'right' one for this
     * cpu, but that is fine, as holding it is enough to prevent races.
     */
    ASSERT(!local_irq_is_enabled());

    init_pdata(prv, cpu);

    per_cpu(scheduler, cpu) = new_ops;
    per_cpu(schedule_data, cpu).sched_priv = pdata;

    /*
     * (Re?)route the lock to the per-pCPU lock as the /last/ thing. In fact,
     * if the lock is free (and it can be), we want anyone who manages to
     * take it to find all the initialization we've done above already in
     * place.
     */
    smp_mb();
    sd->schedule_lock = &sd->_lock;
}

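/*
 * Insert a (non-idle) vCPU into the scheduler: assign it to a free pCPU
 * within its affinity if there is one, otherwise park it in the waitqueue.
 */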
static void null_vcpu_insert(const struct scheduler *ops, struct vcpu *v)
{
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *nvc = null_vcpu(v);
    unsigned int cpu;
    spinlock_t *lock;

    ASSERT(!is_idle_vcpu(v));

    lock = vcpu_schedule_lock_irq(v);
 retry:

    cpu = v->processor = pick_cpu(prv, v);

    spin_unlock(lock);

    lock = vcpu_schedule_lock(v);

    cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity,
                cpupool_domain_cpumask(v->domain));

    /* If the pCPU is free, we assign v to it */
    if ( likely(per_cpu(npc, cpu).vcpu == NULL) )
    {
        /*
         * Insert is followed by vcpu_wake(), so there's no need to poke
         * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that.
         */
        vcpu_assign(prv, v, cpu);
    }
    else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) )
    {
        /*
         * If the pCPU is not free (e.g., because we raced with another
         * insert or a migrate), but there are other free pCPUs, we can
         * try to pick again.
         */
        goto retry;
    }
    else
    {
        /*
         * If the pCPU is not free, and there aren't any (valid) others,
         * we have no alternative but to go into the waitqueue.
         */
        spin_lock(&prv->waitq_lock);
        list_add_tail(&nvc->waitq_elem, &prv->waitq);
        dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
                v->domain->domain_id, v->vcpu_id);
        spin_unlock(&prv->waitq_lock);
    }
    spin_unlock_irq(lock);

    SCHED_STAT_CRANK(vcpu_insert);
}

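/*
 * Deassign v from its pCPU and, if there is a suitable vCPU waiting in the
 * waitqueue, hand the now free pCPU over to it. The caller must hold v's
 * runqueue lock; the waitqueue lock is taken (and released) here.
 */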
static void _vcpu_remove(struct null_private *prv, struct vcpu *v)
{
    unsigned int bs;
    unsigned int cpu = v->processor;
    struct null_vcpu *wvc;

    ASSERT(list_empty(&null_vcpu(v)->waitq_elem));

    vcpu_deassign(prv, v, cpu);

    spin_lock(&prv->waitq_lock);

    /*
     * v was assigned to cpu, which is now free. Let's see if there is
     * someone waiting that is suitable to be assigned to it (prioritizing
     * vCPUs that have soft-affinity with cpu).
     */
    for_each_affinity_balance_step( bs )
    {
        list_for_each_entry( wvc, &prv->waitq, waitq_elem )
        {
            if ( bs == BALANCE_SOFT_AFFINITY &&
                 !has_soft_affinity(wvc->vcpu, wvc->vcpu->cpu_hard_affinity) )
                continue;

            if ( vcpu_check_affinity(wvc->vcpu, cpu, bs) )
            {
                list_del_init(&wvc->waitq_elem);
                vcpu_assign(prv, wvc->vcpu, cpu);
                cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
                spin_unlock(&prv->waitq_lock);
                return;
            }
        }
    }
    spin_unlock(&prv->waitq_lock);
}

static void null_vcpu_remove(const struct scheduler *ops, struct vcpu *v)
{
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *nvc = null_vcpu(v);
    spinlock_t *lock;

    ASSERT(!is_idle_vcpu(v));

    lock = vcpu_schedule_lock_irq(v);

    /* If v is in waitqueue, just get it out of there and bail */
    if ( unlikely(!list_empty(&nvc->waitq_elem)) )
    {
        spin_lock(&prv->waitq_lock);
        list_del_init(&nvc->waitq_elem);
        spin_unlock(&prv->waitq_lock);

        goto out;
    }

    ASSERT(per_cpu(npc, v->processor).vcpu == v);
    ASSERT(!cpumask_test_cpu(v->processor, &prv->cpus_free));

    _vcpu_remove(prv, v);

 out:
    vcpu_schedule_unlock_irq(lock, v);

    SCHED_STAT_CRANK(vcpu_remove);
}

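/*
 * Wake: if v is already running, or is sitting in the waitqueue, there is
 * nothing to do; otherwise v is assigned to a pCPU, so just kick that pCPU.
 */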
static void null_vcpu_wake(const struct scheduler *ops, struct vcpu *v)
{
    ASSERT(!is_idle_vcpu(v));

    if ( unlikely(curr_on_cpu(v->processor) == v) )
    {
        SCHED_STAT_CRANK(vcpu_wake_running);
        return;
    }

    if ( unlikely(!list_empty(&null_vcpu(v)->waitq_elem)) )
    {
        /* Not exactly "on runq", but close enough for reusing the counter */
        SCHED_STAT_CRANK(vcpu_wake_onrunq);
        return;
    }

    if ( likely(vcpu_runnable(v)) )
        SCHED_STAT_CRANK(vcpu_wake_runnable);
    else
        SCHED_STAT_CRANK(vcpu_wake_not_runnable);

    /* Note that we get here only for vCPUs assigned to a pCPU */
    cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ);
}

static void null_vcpu_sleep(const struct scheduler *ops, struct vcpu *v)
{
    ASSERT(!is_idle_vcpu(v));

    /* If v is not assigned to a pCPU, or is not running, no need to bother */
    if ( curr_on_cpu(v->processor) == v )
        cpu_raise_softirq(v->processor, SCHEDULE_SOFTIRQ);

    SCHED_STAT_CRANK(vcpu_sleep);
}

static int null_cpu_pick(const struct scheduler *ops, struct vcpu *v)
{
    ASSERT(!is_idle_vcpu(v));
    return pick_cpu(null_priv(ops), v);
}

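/*
 * Migrate v to new_cpu: free up the pCPU v was assigned to (possibly handing
 * it over to a waiting vCPU), then either assign v to new_cpu, if that is
 * free and within v's affinity, or park v in the waitqueue.
 */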
static void null_vcpu_migrate(const struct scheduler *ops, struct vcpu *v,
                              unsigned int new_cpu)
{
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *nvc = null_vcpu(v);

    ASSERT(!is_idle_vcpu(v));

    if ( v->processor == new_cpu )
        return;

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t vcpu, dom;
            uint16_t cpu, new_cpu;
        } d;
        d.dom = v->domain->domain_id;
        d.vcpu = v->vcpu_id;
        d.cpu = v->processor;
        d.new_cpu = new_cpu;
        __trace_var(TRC_SNULL_MIGRATE, 1, sizeof(d), &d);
    }

    /*
     * v is either assigned to a pCPU, or in the waitqueue.
     *
     * In the former case, the pCPU to which it was assigned would
     * become free, and we, therefore, should check whether there is
     * anyone in the waitqueue that can be assigned to it.
     *
     * In the latter, there is just nothing to do.
     */
    if ( likely(list_empty(&nvc->waitq_elem)) )
    {
        _vcpu_remove(prv, v);
        SCHED_STAT_CRANK(migrate_running);
    }
    else
        SCHED_STAT_CRANK(migrate_on_runq);

    SCHED_STAT_CRANK(migrated);

    /*
     * Let's now consider new_cpu, which is where v is being sent. It can be
     * either free, or have a vCPU already assigned to it.
     *
     * In the former case, we should assign v to it, and try to get it to run,
     * if possible, according to affinity.
     *
     * In the latter, all we can do is park v in the waitqueue.
     */
    if ( per_cpu(npc, new_cpu).vcpu == NULL &&
         vcpu_check_affinity(v, new_cpu, BALANCE_HARD_AFFINITY) )
    {
        /* v might have been in the waitqueue, so remove it */
        spin_lock(&prv->waitq_lock);
        list_del_init(&nvc->waitq_elem);
        spin_unlock(&prv->waitq_lock);

        vcpu_assign(prv, v, new_cpu);
    }
    else
    {
        /* Put v in the waitqueue, if it wasn't there already */
        spin_lock(&prv->waitq_lock);
        if ( list_empty(&nvc->waitq_elem) )
        {
            list_add_tail(&nvc->waitq_elem, &prv->waitq);
            dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
                    v->domain->domain_id, v->vcpu_id);
        }
        spin_unlock(&prv->waitq_lock);
    }

    /*
     * Whatever happened above, we always at least update v->processor.
     * This is especially important on the shutdown and suspend/resume
     * paths, where our caller (cpu_disable_scheduler()) needs to know
     * that the migration did happen, as far as that was possible at all.
     * In case of suspend, any temporary inconsistency this causes will be
     * fixed up during resume.
     */
    v->processor = new_cpu;
}

#ifndef NDEBUG
static inline void null_vcpu_check(struct vcpu *v)
{
    struct null_vcpu * const nvc = null_vcpu(v);
    struct null_dom * const ndom = null_dom(v->domain);

    BUG_ON(nvc->vcpu != v);

    if ( ndom )
        BUG_ON(is_idle_vcpu(v));
    else
        BUG_ON(!is_idle_vcpu(v));

    SCHED_STAT_CRANK(vcpu_check);
}
#define NULL_VCPU_CHECK(v)  (null_vcpu_check(v))
#else
#define NULL_VCPU_CHECK(v)
#endif


/*
 * The simplest scheduling function of all time! We either return:
 *  - the vCPU assigned to the pCPU, if there's one and it can run;
 *  - the idle vCPU, otherwise.
 */
static struct task_slice null_schedule(const struct scheduler *ops,
                                       s_time_t now,
                                       bool_t tasklet_work_scheduled)
{
    unsigned int bs;
    const unsigned int cpu = smp_processor_id();
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *wvc;
    struct task_slice ret;

    SCHED_STAT_CRANK(schedule);
    NULL_VCPU_CHECK(current);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t tasklet, cpu;
            int16_t vcpu, dom;
        } d;
        d.cpu = cpu;
        d.tasklet = tasklet_work_scheduled;
        if ( per_cpu(npc, cpu).vcpu == NULL )
        {
            d.vcpu = d.dom = -1;
        }
        else
        {
            d.vcpu = per_cpu(npc, cpu).vcpu->vcpu_id;
            d.dom = per_cpu(npc, cpu).vcpu->domain->domain_id;
        }
        __trace_var(TRC_SNULL_SCHEDULE, 1, sizeof(d), &d);
    }

    if ( tasklet_work_scheduled )
    {
        trace_var(TRC_SNULL_TASKLET, 1, 0, NULL);
        ret.task = idle_vcpu[cpu];
    }
    else
        ret.task = per_cpu(npc, cpu).vcpu;
    ret.migrated = 0;
    ret.time = -1;

    /*
     * This pCPU may have just joined the cpupool, or be coming back online.
     * In that case, there may be vCPUs in the waitqueue that we can assign
     * to it and run.
     */
    if ( unlikely(ret.task == NULL) )
    {
        spin_lock(&prv->waitq_lock);

        if ( list_empty(&prv->waitq) )
            goto unlock;

        /*
         * We scan the waitqueue twice, to prioritize vCPUs that have
         * soft-affinity with cpu. This may look like an expensive thing to
         * do here in null_schedule(), but it's actually fine, because we
         * only do it when a pCPU has no vCPU associated with it (e.g., as
         * said above, when the cpu has just joined a cpupool).
         */
        for_each_affinity_balance_step( bs )
        {
            list_for_each_entry( wvc, &prv->waitq, waitq_elem )
            {
                if ( bs == BALANCE_SOFT_AFFINITY &&
                     !has_soft_affinity(wvc->vcpu, wvc->vcpu->cpu_hard_affinity) )
                    continue;

                if ( vcpu_check_affinity(wvc->vcpu, cpu, bs) )
                {
                    vcpu_assign(prv, wvc->vcpu, cpu);
                    list_del_init(&wvc->waitq_elem);
                    ret.task = wvc->vcpu;
                    goto unlock;
                }
            }
        }
 unlock:
        spin_unlock(&prv->waitq_lock);
    }

    if ( unlikely(ret.task == NULL || !vcpu_runnable(ret.task)) )
        ret.task = idle_vcpu[cpu];

    NULL_VCPU_CHECK(ret.task);
    return ret;
}

static inline void dump_vcpu(struct null_private *prv, struct null_vcpu *nvc)
{
    printk("[%i.%i] pcpu=%d", nvc->vcpu->domain->domain_id,
            nvc->vcpu->vcpu_id, list_empty(&nvc->waitq_elem) ?
                                nvc->vcpu->processor : -1);
}

static void null_dump_pcpu(const struct scheduler *ops, int cpu)
{
    struct null_private *prv = null_priv(ops);
    struct null_vcpu *nvc;
    spinlock_t *lock;
    unsigned long flags;
#define cpustr keyhandler_scratch

    lock = pcpu_schedule_lock_irqsave(cpu, &flags);

    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_mask, cpu));
    printk("CPU[%02d] sibling=%s, ", cpu, cpustr);
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_mask, cpu));
    printk("core=%s", cpustr);
    if ( per_cpu(npc, cpu).vcpu != NULL )
        printk(", vcpu=d%dv%d", per_cpu(npc, cpu).vcpu->domain->domain_id,
               per_cpu(npc, cpu).vcpu->vcpu_id);
    printk("\n");

    /* current VCPU (nothing to say if that's the idle vcpu) */
    nvc = null_vcpu(curr_on_cpu(cpu));
    if ( nvc && !is_idle_vcpu(nvc->vcpu) )
    {
        printk("\trun: ");
        dump_vcpu(prv, nvc);
        printk("\n");
    }

    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
#undef cpustr
}

static void null_dump(const struct scheduler *ops)
{
    struct null_private *prv = null_priv(ops);
    struct list_head *iter;
    unsigned long flags;
    unsigned int loop;
#define cpustr keyhandler_scratch

    spin_lock_irqsave(&prv->lock, flags);

    cpulist_scnprintf(cpustr, sizeof(cpustr), &prv->cpus_free);
    printk("\tcpus_free = %s\n", cpustr);

    printk("Domain info:\n");
    loop = 0;
    list_for_each( iter, &prv->ndom )
    {
        struct null_dom *ndom;
        struct vcpu *v;

        ndom = list_entry(iter, struct null_dom, ndom_elem);

        printk("\tDomain: %d\n", ndom->dom->domain_id);
        for_each_vcpu( ndom->dom, v )
        {
            struct null_vcpu * const nvc = null_vcpu(v);
            spinlock_t *lock;

            lock = vcpu_schedule_lock(nvc->vcpu);

            printk("\t%3d: ", ++loop);
            dump_vcpu(prv, nvc);
            printk("\n");

            vcpu_schedule_unlock(lock, nvc->vcpu);
        }
    }

    printk("Waitqueue: ");
    loop = 0;
    spin_lock(&prv->waitq_lock);
    list_for_each( iter, &prv->waitq )
    {
        struct null_vcpu *nvc = list_entry(iter, struct null_vcpu, waitq_elem);

        if ( loop++ != 0 )
            printk(", ");
        if ( loop % 24 == 0 )
            printk("\n\t");
        printk("d%dv%d", nvc->vcpu->domain->domain_id, nvc->vcpu->vcpu_id);
    }
    printk("\n");
    spin_unlock(&prv->waitq_lock);

    spin_unlock_irqrestore(&prv->lock, flags);
#undef cpustr
}

const struct scheduler sched_null_def = {
    .name           = "null Scheduler",
    .opt_name       = "null",
    .sched_id       = XEN_SCHEDULER_NULL,
    .sched_data     = NULL,

    .init           = null_init,
    .deinit         = null_deinit,
    .init_pdata     = null_init_pdata,
    .switch_sched   = null_switch_sched,
    .deinit_pdata   = null_deinit_pdata,

    .alloc_vdata    = null_alloc_vdata,
    .free_vdata     = null_free_vdata,
    .alloc_domdata  = null_alloc_domdata,
    .free_domdata   = null_free_domdata,

    .init_domain    = null_dom_init,
    .destroy_domain = null_dom_destroy,

    .insert_vcpu    = null_vcpu_insert,
    .remove_vcpu    = null_vcpu_remove,

    .wake           = null_vcpu_wake,
    .sleep          = null_vcpu_sleep,
    .pick_cpu       = null_cpu_pick,
    .migrate        = null_vcpu_migrate,
    .do_schedule    = null_schedule,

    .dump_cpu_state = null_dump_pcpu,
    .dump_settings  = null_dump,
};

REGISTER_SCHEDULER(sched_null_def);