1 /******************************************************************************
2  * arch/x86/x86_64/mm.c
3  *
4  * Modifications to Linux original are copyright (c) 2004, K A Fraser. This
5  * program is free software; you can redistribute it and/or modify it under
6  * the terms of the GNU General Public License as published by the Free
7  * Software Foundation; either version 2 of the License, or (at your option)
8  * any later version.
9  *
10  * This program is distributed in the hope that it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along
16  * with this program; If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 EMIT_FILE;
20 
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/numa.h>
26 #include <xen/nodemask.h>
27 #include <xen/guest_access.h>
28 #include <xen/hypercall.h>
29 #include <xen/mem_access.h>
30 #include <asm/current.h>
31 #include <asm/asm_defns.h>
32 #include <asm/page.h>
33 #include <asm/flushtlb.h>
34 #include <asm/fixmap.h>
35 #include <asm/hypercall.h>
36 #include <asm/msr.h>
37 #include <asm/setup.h>
38 #include <asm/numa.h>
39 #include <asm/mem_paging.h>
40 #include <asm/mem_sharing.h>
41 #include <public/memory.h>
42 
43 unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
44 
45 l2_pgentry_t *compat_idle_pg_table_l2;
46 
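/*
 * Walk a PV vcpu's page tables to find the page backing linear address
 * @addr, handling 1G and 2M superpages.  Returns a mapping of that page
 * (offset to @addr), which the caller must unmap_domain_page(), or NULL.
 */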
47 void *do_page_walk(struct vcpu *v, unsigned long addr)
48 {
49     unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
50     l4_pgentry_t l4e, *l4t;
51     l3_pgentry_t l3e, *l3t;
52     l2_pgentry_t l2e, *l2t;
53     l1_pgentry_t l1e, *l1t;
54 
55     if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
56         return NULL;
57 
58     l4t = map_domain_page(_mfn(mfn));
59     l4e = l4t[l4_table_offset(addr)];
60     unmap_domain_page(l4t);
61     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
62         return NULL;
63 
64     l3t = map_l3t_from_l4e(l4e);
65     l3e = l3t[l3_table_offset(addr)];
66     unmap_domain_page(l3t);
67     mfn = l3e_get_pfn(l3e);
68     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
69         return NULL;
70     if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
71     {
72         mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
73         goto ret;
74     }
75 
76     l2t = map_domain_page(_mfn(mfn));
77     l2e = l2t[l2_table_offset(addr)];
78     unmap_domain_page(l2t);
79     mfn = l2e_get_pfn(l2e);
80     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
81         return NULL;
82     if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
83     {
84         mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
85         goto ret;
86     }
87 
88     l1t = map_domain_page(_mfn(mfn));
89     l1e = l1t[l1_table_offset(addr)];
90     unmap_domain_page(l1t);
91     mfn = l1e_get_pfn(l1e);
92     if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
93         return NULL;
94 
95  ret:
96     return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
97 }
98 
99 /*
100  * Hot-added page range used to allocate page-table pages for the m2p table.
101  */
102 struct mem_hotadd_info
103 {
104     unsigned long spfn;
105     unsigned long epfn;
106     unsigned long cur;
107 };
108 
109 static int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
110 {
111     return (pfn < info->epfn && pfn >= info->spfn);
112 }
113 
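/* Hand out the next 512-page (2M) chunk of the hot-added range being tracked. */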
114 static mfn_t alloc_hotadd_mfn(struct mem_hotadd_info *info)
115 {
116     mfn_t mfn;
117 
118     ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
119             info->cur >= info->spfn);
120 
121     mfn = _mfn(info->cur);
122     info->cur += (1UL << PAGETABLE_ORDER);
123     return mfn;
124 }
125 
126 #define M2P_NO_MAPPED   0
127 #define M2P_2M_MAPPED   1
128 #define M2P_1G_MAPPED   2
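/* Report whether, and at what granularity, the M2P slot covering @spfn is mapped. */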
129 static int m2p_mapped(unsigned long spfn)
130 {
131     unsigned long va;
132     l3_pgentry_t l3e;
133     l2_pgentry_t l2e;
134 
135     va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
136     l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(va)], l3_table_offset(va));
137 
138     switch ( l3e_get_flags(l3e) & (_PAGE_PRESENT | _PAGE_PSE) )
139     {
140         case _PAGE_PSE|_PAGE_PRESENT:
141             return M2P_1G_MAPPED;
142         /* Check for next level */
143         case _PAGE_PRESENT:
144             break;
145         default:
146             return M2P_NO_MAPPED;
147     }
148     l2e = l2e_from_l3e(l3e, l2_table_offset(va));
149 
150     if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
151         return M2P_2M_MAPPED;
152 
153     return M2P_NO_MAPPED;
154 }
155 
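/*
 * Share read-only with privileged guests those M2P (and compat M2P) table
 * pages which were allocated from the hot-added range.
 */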
156 static int share_hotadd_m2p_table(struct mem_hotadd_info *info)
157 {
158     unsigned long i, n, v;
159     mfn_t m2p_start_mfn = INVALID_MFN;
160     l3_pgentry_t l3e;
161     l2_pgentry_t l2e;
162 
163     /* M2P table is mappable read-only by privileged domains. */
164     for ( v  = RDWR_MPT_VIRT_START;
165           v != RDWR_MPT_VIRT_END;
166           v += n << PAGE_SHIFT )
167     {
168         n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
169         l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
170                            l3_table_offset(v));
171         if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
172             continue;
173         if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
174         {
175             n = L1_PAGETABLE_ENTRIES;
176             l2e = l2e_from_l3e(l3e, l2_table_offset(v));
177             if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
178                 continue;
179             m2p_start_mfn = l2e_get_mfn(l2e);
180         }
181         else
182             continue;
183 
184         for ( i = 0; i < n; i++ )
185         {
186             struct page_info *page = mfn_to_page(mfn_add(m2p_start_mfn, i));
187 
188             if ( hotadd_mem_valid(mfn_x(mfn_add(m2p_start_mfn, i)), info) )
189                 share_xen_page_with_privileged_guests(page, SHARE_ro);
190         }
191     }
192 
193     for ( v  = RDWR_COMPAT_MPT_VIRT_START;
194           v != RDWR_COMPAT_MPT_VIRT_END;
195           v += 1 << L2_PAGETABLE_SHIFT )
196     {
197         l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
198                            l3_table_offset(v));
199         if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
200             continue;
201         l2e = l2e_from_l3e(l3e, l2_table_offset(v));
202         if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
203             continue;
204         m2p_start_mfn = l2e_get_mfn(l2e);
205 
206         for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
207         {
208             struct page_info *page = mfn_to_page(mfn_add(m2p_start_mfn, i));
209 
210             if ( hotadd_mem_valid(mfn_x(mfn_add(m2p_start_mfn, i)), info) )
211                 share_xen_page_with_privileged_guests(page, SHARE_ro);
212         }
213     }
214     return 0;
215 }
216 
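/* Tear down compat M2P mappings backed by pages from the hot-added range. */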
217 static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
218 {
219     unsigned long i, smap = info->spfn, emap = info->epfn;
220 
221     if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
222         return;
223 
224     if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
225         emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
226 
227     for ( i = smap; i < emap; )
228     {
229         unsigned int off = i * sizeof(*compat_machine_to_phys_mapping);
230         l2_pgentry_t *pl2e = compat_idle_pg_table_l2 + l2_table_offset(off);
231 
232         if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT )
233         {
234             unsigned long pt_pfn = l2e_get_pfn(*pl2e);
235 
236             if ( hotadd_mem_valid(pt_pfn, info) )
237             {
238                 unsigned long rwva = RDWR_COMPAT_MPT_VIRT_START + off;
239 
240                 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
241                 l2e_write(pl2e, l2e_empty());
242             }
243         }
244 
245         i += 1UL << (L2_PAGETABLE_SHIFT - 2);
246     }
247 
248     return;
249 }
250 
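/*
 * Tear down M2P (and compat M2P) mappings backed by pages from the hot-added
 * range; used to roll back a failed memory_add().
 */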
251 static void destroy_m2p_mapping(struct mem_hotadd_info *info)
252 {
253     l3_pgentry_t *l3_ro_mpt;
254     unsigned long i, va, rwva;
255     unsigned long smap = info->spfn, emap = info->epfn;
256 
257     l3_ro_mpt = map_l3t_from_l4e(
258                     idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
259 
260     /*
261      * No need to clean up m2p entries that existed before the hotplug.
262      */
263     for ( i = smap; i < emap; )
264     {
265         unsigned long pt_pfn;
266         l2_pgentry_t *pl2e;
267 
268         va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
269         rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
270 
271         /* 1G mappings should not have been created by memory hot-add. */
272         if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
273             (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
274         {
275             i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
276                 (1UL << (L3_PAGETABLE_SHIFT - 3) );
277             continue;
278         }
279 
280         pl2e = map_l2t_from_l3e(l3_ro_mpt[l3_table_offset(va)]) +
281                     l2_table_offset(va);
282         if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
283         {
284             i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
285                     (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
286             UNMAP_DOMAIN_PAGE(pl2e);
287             continue;
288         }
289 
290         pt_pfn = l2e_get_pfn(*pl2e);
291         if ( hotadd_mem_valid(pt_pfn, info) )
292         {
293             destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
294 
295             l2e_write(pl2e, l2e_empty());
296         }
297         i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
298               (1UL << (L2_PAGETABLE_SHIFT - 3));
299         unmap_domain_page(pl2e);
300     }
301 
302     UNMAP_DOMAIN_PAGE(l3_ro_mpt);
303 
304     destroy_compat_m2p_mapping(info);
305 
306     /* Brute-force flush of all TLBs. */
307     flush_tlb_all();
308     return;
309 }
310 
311 /*
312  * Allocate and map the compatibility mode machine-to-phys table.
313  * info->spfn/epfn: the pfn range to be set up; page-table pages are
314  * allocated from the still-free part of that range.
315  */
316 static int setup_compat_m2p_table(struct mem_hotadd_info *info)
317 {
318     unsigned long i, smap, emap, epfn = info->epfn;
319     mfn_t mfn;
320     unsigned int n;
321     int err = 0;
322 
323     smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));
324 
325     /*
326      * Note: for hot-added memory, only the range below m2p_compat_vstart
327      * will be filled in (assuming memory is discontiguous at boot).
328      */
329     if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
330         return 0;
331 
332     if ( epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
333         epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
334 
335     emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
336                 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );
337 
338 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
339 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
340              sizeof(*compat_machine_to_phys_mapping))
341     BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
342                  sizeof(*compat_machine_to_phys_mapping));
343 
344     for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
345     {
346         unsigned int off = i * sizeof(*compat_machine_to_phys_mapping);
347         l2_pgentry_t *pl2e = compat_idle_pg_table_l2 + l2_table_offset(off);
348         unsigned long rwva = RDWR_COMPAT_MPT_VIRT_START + off;
349 
350         if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT )
351             continue;
352 
353         for ( n = 0; n < CNT; ++n)
354             if ( mfn_valid(_mfn(i + n * PDX_GROUP_COUNT)) )
355                 break;
356         if ( n == CNT )
357             continue;
358 
359         mfn = alloc_hotadd_mfn(info);
360         err = map_pages_to_xen(rwva, mfn, 1UL << PAGETABLE_ORDER,
361                                PAGE_HYPERVISOR);
362         if ( err )
363             break;
364         /* Fill with INVALID_M2P_ENTRY. */
365         memset((void *)rwva, 0xFF, 1UL << L2_PAGETABLE_SHIFT);
366         /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
367         l2e_write(pl2e, l2e_from_mfn(mfn, _PAGE_PSE|_PAGE_PRESENT));
368     }
369 #undef CNT
370 #undef MFN
371     return err;
372 }
373 
374 /*
375  * Allocate and map the machine-to-phys table.
376  * The L3 for the RO/RW MPT and the L2 for the compat MPT must already be set up.
377  */
378 static int setup_m2p_table(struct mem_hotadd_info *info)
379 {
380     unsigned long i, va, smap, emap;
381     unsigned int n;
382     l2_pgentry_t *l2_ro_mpt = NULL;
383     l3_pgentry_t *l3_ro_mpt = NULL;
384     int ret = 0;
385 
386     ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
387             & _PAGE_PRESENT);
388     l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
389 
390     smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
391     emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
392                 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));
393 
394     va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);
395 
396 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
397 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
398              sizeof(*machine_to_phys_mapping))
399 
400     BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
401                  sizeof(*machine_to_phys_mapping));
402 
403     i = smap;
404     while ( i < emap )
405     {
406         switch ( m2p_mapped(i) )
407         {
408         case M2P_1G_MAPPED:
409             i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
410                 (1UL << (L3_PAGETABLE_SHIFT - 3));
411             continue;
412         case M2P_2M_MAPPED:
413             i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
414                 (1UL << (L2_PAGETABLE_SHIFT - 3));
415             continue;
416         default:
417             break;
418         }
419 
420         va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
421 
422         for ( n = 0; n < CNT; ++n)
423             if ( mfn_valid(_mfn(i + n * PDX_GROUP_COUNT)) )
424                 break;
425         if ( n < CNT )
426         {
427             mfn_t mfn = alloc_hotadd_mfn(info);
428 
429             ret = map_pages_to_xen(
430                         RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
431                         mfn, 1UL << PAGETABLE_ORDER,
432                         PAGE_HYPERVISOR);
433             if ( ret )
434                 goto error;
435             /* Fill with INVALID_M2P_ENTRY. */
436             memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
437                    0xFF, 1UL << L2_PAGETABLE_SHIFT);
438 
439             ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
440                   _PAGE_PSE));
441             if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
442               _PAGE_PRESENT )
443                 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
444                   l2_table_offset(va);
445             else
446             {
447                 l2_ro_mpt = alloc_xen_pagetable();
448                 if ( !l2_ro_mpt )
449                 {
450                     ret = -ENOMEM;
451                     goto error;
452                 }
453 
454                 clear_page(l2_ro_mpt);
455                 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
456                           l3e_from_paddr(__pa(l2_ro_mpt),
457                                          __PAGE_HYPERVISOR_RO | _PAGE_USER));
458                 l2_ro_mpt += l2_table_offset(va);
459             }
460 
461             /* NB. Cannot be GLOBAL: guest user mode should not see it. */
462             l2e_write(l2_ro_mpt, l2e_from_mfn(mfn,
463                    /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
464         }
465         if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
466             l2_ro_mpt = NULL;
467         i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
468     }
469 #undef CNT
470 #undef MFN
471 
472     ret = setup_compat_m2p_table(info);
473 error:
474     return ret;
475 }
476 
477 void __init paging_init(void)
478 {
479     unsigned long i, mpt_size, va;
480     unsigned int n, memflags;
481     l3_pgentry_t *l3_ro_mpt;
482     l2_pgentry_t *l2_ro_mpt = NULL;
483     struct page_info *l1_pg;
484 
485     /*
486      * Set up the L3s for the 1:1 mapping if the host supports memory hotplug,
487      * to avoid having to sync the 1:1 mapping in the page fault handler.
488      */
489     for ( va = DIRECTMAP_VIRT_START;
490           va < DIRECTMAP_VIRT_END && (void *)va < __va(mem_hotplug);
491           va += (1UL << L4_PAGETABLE_SHIFT) )
492     {
493         if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
494               _PAGE_PRESENT) )
495         {
496             l3_pgentry_t *pl3t = alloc_xen_pagetable();
497 
498             if ( !pl3t )
499                 goto nomem;
500             clear_page(pl3t);
501             l4e_write(&idle_pg_table[l4_table_offset(va)],
502                       l4e_from_paddr(__pa(pl3t), __PAGE_HYPERVISOR_RW));
503         }
504     }
505 
506     /* Create user-accessible L2 directory to map the MPT for guests. */
507     if ( (l3_ro_mpt = alloc_xen_pagetable()) == NULL )
508         goto nomem;
509     clear_page(l3_ro_mpt);
510     l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
511               l4e_from_paddr(__pa(l3_ro_mpt), __PAGE_HYPERVISOR_RO | _PAGE_USER));
512 
513     /*
514      * Allocate and map the machine-to-phys table.
515      * This also ensures L3 is present for fixmaps.
516      */
517     mpt_size  = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
518     mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
519 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
520 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
521              sizeof(*machine_to_phys_mapping))
522     BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
523                  sizeof(*machine_to_phys_mapping));
524     for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
525     {
526         BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
527         va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
528         memflags = MEMF_node(phys_to_nid(i <<
529             (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
530 
531         if ( cpu_has_page1gb &&
532              !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
533              (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
534         {
535             unsigned int k, holes;
536 
537             for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
538             {
539                 for ( n = 0; n < CNT; ++n)
540                     if ( mfn_valid(_mfn(MFN(i + k) + n * PDX_GROUP_COUNT)) )
541                         break;
542                 if ( n == CNT )
543                     ++holes;
544             }
545             if ( k == holes )
546             {
547                 i += (1UL << PAGETABLE_ORDER) - 1;
548                 continue;
549             }
550             if ( holes == 0 &&
551                  (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
552                                               memflags)) != NULL )
553             {
554                 map_pages_to_xen(
555                     RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
556                     page_to_mfn(l1_pg),
557                     1UL << (2 * PAGETABLE_ORDER),
558                     PAGE_HYPERVISOR);
559                 /* Fill with INVALID_M2P_ENTRY. */
560                 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
561                        0xFF, 1UL << L3_PAGETABLE_SHIFT);
562 
563                 ASSERT(!l2_table_offset(va));
564                 /* NB. Cannot be GLOBAL: guest user mode should not see it. */
565                 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
566                     l3e_from_page(l1_pg,
567                         /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
568                 i += (1UL << PAGETABLE_ORDER) - 1;
569                 continue;
570             }
571         }
572 
573         for ( n = 0; n < CNT; ++n)
574             if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
575                 break;
576         if ( n == CNT )
577             l1_pg = NULL;
578         else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
579                                                memflags)) == NULL )
580             goto nomem;
581         else
582         {
583             map_pages_to_xen(
584                 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
585                 page_to_mfn(l1_pg),
586                 1UL << PAGETABLE_ORDER,
587                 PAGE_HYPERVISOR);
588             /* Fill with INVALID_M2P_ENTRY. */
589             memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
590                    0xFF, 1UL << L2_PAGETABLE_SHIFT);
591         }
592         if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
593         {
594             if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL )
595                 goto nomem;
596             clear_page(l2_ro_mpt);
597             l3e_write(&l3_ro_mpt[l3_table_offset(va)],
598                       l3e_from_paddr(__pa(l2_ro_mpt),
599                                      __PAGE_HYPERVISOR_RO | _PAGE_USER));
600             ASSERT(!l2_table_offset(va));
601         }
602         /* NB. Cannot be GLOBAL: guest user mode should not see it. */
603         if ( l1_pg )
604             l2e_write(l2_ro_mpt, l2e_from_page(
605                 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
606         l2_ro_mpt++;
607     }
608 #undef CNT
609 #undef MFN
610 
611     /* Create user-accessible L2 directory to map the MPT for compat guests. */
612     if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL )
613         goto nomem;
614     compat_idle_pg_table_l2 = l2_ro_mpt;
615     clear_page(l2_ro_mpt);
616     /* Allocate and map the compatibility mode machine-to-phys table. */
617     mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
618     if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
619         mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
620     mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
621     if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
622         m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
623 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
624 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
625              sizeof(*compat_machine_to_phys_mapping))
626     BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
627                  sizeof(*compat_machine_to_phys_mapping));
628     for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
629     {
630         memflags = MEMF_node(phys_to_nid(i <<
631             (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
632         for ( n = 0; n < CNT; ++n)
633             if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
634                 break;
635         if ( n == CNT )
636             continue;
637         if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
638                                                memflags)) == NULL )
639             goto nomem;
640         map_pages_to_xen(
641             RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
642             page_to_mfn(l1_pg),
643             1UL << PAGETABLE_ORDER,
644             PAGE_HYPERVISOR);
645         /* Fill with INVALID_M2P_ENTRY. */
646         memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
647                         (i << L2_PAGETABLE_SHIFT)),
648                0xFF, 1UL << L2_PAGETABLE_SHIFT);
649         /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
650         l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
651     }
652 #undef CNT
653 #undef MFN
654 
655     machine_to_phys_mapping_valid = 1;
656 
657     /* Set up linear page table mapping. */
658     l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
659               l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
660     return;
661 
662  nomem:
663     panic("Not enough memory for m2p table\n");
664 }
665 
666 void __init zap_low_mappings(void)
667 {
668     BUG_ON(num_online_cpus() != 1);
669 
670     /* Remove aliased mapping of first 1:1 PML4 entry. */
671     l4e_write(&idle_pg_table[0], l4e_empty());
672     flush_local(FLUSH_TLB_GLOBAL);
673 
674     /* Replace with mapping of the boot trampoline only. */
675     map_pages_to_xen(trampoline_phys, maddr_to_mfn(trampoline_phys),
676                      PFN_UP(trampoline_end - trampoline_start),
677                      __PAGE_HYPERVISOR_RX);
678 }
679 
680 int setup_compat_arg_xlat(struct vcpu *v)
681 {
682     return create_perdomain_mapping(v->domain, ARG_XLAT_START(v),
683                                     PFN_UP(COMPAT_ARG_XLAT_SIZE),
684                                     NULL, NIL(struct page_info *));
685 }
686 
687 void free_compat_arg_xlat(struct vcpu *v)
688 {
689     destroy_perdomain_mapping(v->domain, ARG_XLAT_START(v),
690                               PFN_UP(COMPAT_ARG_XLAT_SIZE));
691 }
692 
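/*
 * Undo the frame-table mappings created for the hot-added range, tearing down
 * only those 2M mappings whose backing pages came from that range.
 */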
693 static void cleanup_frame_table(struct mem_hotadd_info *info)
694 {
695     unsigned long sva, eva;
696     l3_pgentry_t l3e;
697     l2_pgentry_t l2e;
698     mfn_t spfn, epfn;
699 
700     spfn = _mfn(info->spfn);
701     epfn = _mfn(info->epfn);
702 
703     sva = (unsigned long)mfn_to_page(spfn);
704     eva = (unsigned long)mfn_to_page(epfn);
705 
706     /* Initialise (poison) all page structs for the range. */
707     memset((void *)sva, -1, eva - sva);
708 
709     while (sva < eva)
710     {
711         l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(sva)],
712                            l3_table_offset(sva));
713         if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
714              (l3e_get_flags(l3e) & _PAGE_PSE) )
715         {
716             sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) +
717                     (1UL << L3_PAGETABLE_SHIFT);
718             continue;
719         }
720 
721         l2e = l2e_from_l3e(l3e, l2_table_offset(sva));
722         ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
723 
724         if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
725               (_PAGE_PSE | _PAGE_PRESENT) )
726         {
727             if (hotadd_mem_valid(l2e_get_pfn(l2e), info))
728                 destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1),
729                          ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
730                             (1UL << L2_PAGETABLE_SHIFT) - 1));
731 
732             sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
733                   (1UL << L2_PAGETABLE_SHIFT);
734             continue;
735         }
736 
737         ASSERT(l1e_get_flags(l1e_from_l2e(l2e, l1_table_offset(sva))) &
738                _PAGE_PRESENT);
739 
740         sva = (sva & PAGE_MASK) + PAGE_SIZE;
741     }
742 
743     /* Brute-force flush of all TLBs. */
744     flush_tlb_all();
745 }
746 
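/*
 * Map the frame-table range [start, end) with 2M pages allocated from the
 * hot-added range, then poison the newly mapped entries.
 */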
747 static int setup_frametable_chunk(void *start, void *end,
748                                   struct mem_hotadd_info *info)
749 {
750     unsigned long s = (unsigned long)start;
751     unsigned long e = (unsigned long)end;
752     mfn_t mfn;
753     int err;
754 
755     ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
756     ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1)));
757 
758     for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT))
759     {
760         mfn = alloc_hotadd_mfn(info);
761         err = map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER,
762                                PAGE_HYPERVISOR);
763         if ( err )
764             return err;
765     }
766     memset(start, -1, s - (unsigned long)start);
767 
768     return 0;
769 }
770 
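/*
 * Extend the frame table to cover the hot-added pfn range, mapping only those
 * PDX groups which are not yet present.
 */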
771 static int extend_frame_table(struct mem_hotadd_info *info)
772 {
773     unsigned long cidx, nidx, eidx;
774     mfn_t spfn, epfn;
775 
776     spfn = _mfn(info->spfn);
777     epfn = _mfn(info->epfn);
778 
779     eidx = DIV_ROUND_UP(mfn_to_pdx(epfn), PDX_GROUP_COUNT);
780     nidx = cidx = mfn_to_pdx(spfn)/PDX_GROUP_COUNT;
781 
782     ASSERT( mfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
783             mfn_to_pdx(epfn) <= FRAMETABLE_NR );
784 
785     if ( test_bit(cidx, pdx_group_valid) )
786         cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx);
787 
788     if ( cidx >= eidx )
789         return 0;
790 
791     while ( cidx < eidx )
792     {
793         int err;
794 
795         nidx = find_next_bit(pdx_group_valid, eidx, cidx);
796         if ( nidx >= eidx )
797             nidx = eidx;
798         err = setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ),
799                                      pdx_to_page(nidx * PDX_GROUP_COUNT),
800                                      info);
801         if ( err )
802             return err;
803 
804         cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx);
805     }
806 
807     memset(mfn_to_page(spfn), 0,
808            (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
809     return 0;
810 }
811 
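/*
 * Share the read-only M2P and compat M2P pages with privileged guests, and
 * mark the direct map non-executable where the hardware supports NX.
 */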
812 void __init subarch_init_memory(void)
813 {
814     unsigned long i, n, v, m2p_start_mfn;
815     l3_pgentry_t l3e;
816     l2_pgentry_t l2e;
817 
818     BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
819     BUILD_BUG_ON(RDWR_MPT_VIRT_END   & ((1UL << L3_PAGETABLE_SHIFT) - 1));
820     /* M2P table is mappable read-only by privileged domains. */
821     for ( v  = RDWR_MPT_VIRT_START;
822           v != RDWR_MPT_VIRT_END;
823           v += n << PAGE_SHIFT )
824     {
825         n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
826         l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
827                            l3_table_offset(v));
828         if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
829             continue;
830         if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
831         {
832             n = L1_PAGETABLE_ENTRIES;
833             l2e = l2e_from_l3e(l3e, l2_table_offset(v));
834             if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
835                 continue;
836             m2p_start_mfn = l2e_get_pfn(l2e);
837         }
838         else
839         {
840             m2p_start_mfn = l3e_get_pfn(l3e);
841         }
842 
843         for ( i = 0; i < n; i++ )
844             share_xen_page_with_privileged_guests(
845                 mfn_to_page(_mfn(m2p_start_mfn + i)), SHARE_ro);
846     }
847 
848     for ( v  = RDWR_COMPAT_MPT_VIRT_START;
849           v != RDWR_COMPAT_MPT_VIRT_END;
850           v += 1 << L2_PAGETABLE_SHIFT )
851     {
852         l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
853                            l3_table_offset(v));
854         if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
855             continue;
856         l2e = l2e_from_l3e(l3e, l2_table_offset(v));
857         if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
858             continue;
859         m2p_start_mfn = l2e_get_pfn(l2e);
860 
861         for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
862             share_xen_page_with_privileged_guests(
863                 mfn_to_page(_mfn(m2p_start_mfn + i)), SHARE_ro);
864     }
865 
866     /* Mark all of direct map NX if hardware supports it. */
867     if ( !cpu_has_nx )
868         return;
869 
870     for ( i = l4_table_offset(DIRECTMAP_VIRT_START);
871           i < l4_table_offset(DIRECTMAP_VIRT_END); ++i )
872     {
873         l4_pgentry_t l4e = idle_pg_table[i];
874 
875         if ( l4e_get_flags(l4e) & _PAGE_PRESENT )
876         {
877             l4e_add_flags(l4e, _PAGE_NX_BIT);
878             idle_pg_table[i] = l4e;
879         }
880     }
881 }
882 
883 long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
884 {
885     struct xen_machphys_mfn_list xmml;
886     l3_pgentry_t l3e;
887     l2_pgentry_t l2e;
888     unsigned long v, limit;
889     xen_pfn_t mfn, last_mfn;
890     unsigned int i;
891     long rc = 0;
892 
893     switch ( cmd )
894     {
895     case XENMEM_machphys_mfn_list:
896         if ( copy_from_guest(&xmml, arg, 1) )
897             return -EFAULT;
898 
899         BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
900         BUILD_BUG_ON(RDWR_MPT_VIRT_END   & ((1UL << L3_PAGETABLE_SHIFT) - 1));
901         for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
902               (i != xmml.max_extents) &&
903               (v < (unsigned long)(machine_to_phys_mapping + max_page));
904               i++, v += 1UL << L2_PAGETABLE_SHIFT )
905         {
906             l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
907                                l3_table_offset(v));
908             if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
909                 mfn = last_mfn;
910             else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
911             {
912                 l2e = l2e_from_l3e(l3e, l2_table_offset(v));
913                 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
914                     mfn = l2e_get_pfn(l2e);
915                 else
916                     mfn = last_mfn;
917             }
918             else
919             {
920                 mfn = l3e_get_pfn(l3e)
921                     + (l2_table_offset(v) << PAGETABLE_ORDER);
922             }
923             ASSERT(mfn);
924             if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
925                 return -EFAULT;
926             last_mfn = mfn;
927         }
928 
929         xmml.nr_extents = i;
930         if ( __copy_to_guest(arg, &xmml, 1) )
931             return -EFAULT;
932 
933         break;
934 
935     case XENMEM_machphys_compat_mfn_list:
936         if ( copy_from_guest(&xmml, arg, 1) )
937             return -EFAULT;
938 
939         limit = (unsigned long)(compat_machine_to_phys_mapping + max_page);
940         if ( limit > RDWR_COMPAT_MPT_VIRT_END )
941             limit = RDWR_COMPAT_MPT_VIRT_END;
942         for ( i = 0, v = RDWR_COMPAT_MPT_VIRT_START, last_mfn = 0;
943               (i != xmml.max_extents) && (v < limit);
944               i++, v += 1 << L2_PAGETABLE_SHIFT )
945         {
946             l2e = compat_idle_pg_table_l2[l2_table_offset(v)];
947             if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
948                 mfn = l2e_get_pfn(l2e);
949             else
950                 mfn = last_mfn;
951             ASSERT(mfn);
952             if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
953                 return -EFAULT;
954             last_mfn = mfn;
955         }
956 
957         xmml.nr_extents = i;
958         if ( __copy_to_guest(arg, &xmml, 1) )
959             rc = -EFAULT;
960 
961         break;
962 
963     case XENMEM_get_sharing_freed_pages:
964         return mem_sharing_get_nr_saved_mfns();
965 
966     case XENMEM_get_sharing_shared_pages:
967         return mem_sharing_get_nr_shared_mfns();
968 
969     case XENMEM_paging_op:
970         return mem_paging_memop(guest_handle_cast(arg, xen_mem_paging_op_t));
971 
972 #ifdef CONFIG_MEM_SHARING
973     case XENMEM_sharing_op:
974         return mem_sharing_memop(guest_handle_cast(arg, xen_mem_sharing_op_t));
975 #endif
976 
977     default:
978         rc = -ENOSYS;
979         break;
980     }
981 
982     return rc;
983 }
984 
985 long do_stack_switch(unsigned long ss, unsigned long esp)
986 {
987     fixup_guest_stack_selector(current->domain, ss);
988     current->arch.pv.kernel_ss = ss;
989     current->arch.pv.kernel_sp = esp;
990     return 0;
991 }
992 
993 long do_set_segment_base(unsigned int which, unsigned long base)
994 {
995     struct vcpu *v = current;
996     long ret = 0;
997 
998     if ( is_pv_32bit_vcpu(v) )
999         return -ENOSYS; /* x86/64 only. */
1000 
1001     switch ( which )
1002     {
1003     case SEGBASE_FS:
1004         if ( is_canonical_address(base) )
1005             wrfsbase(base);
1006         else
1007             ret = -EINVAL;
1008         break;
1009 
1010     case SEGBASE_GS_USER:
1011         if ( is_canonical_address(base) )
1012         {
1013             wrgsshadow(base);
1014             v->arch.pv.gs_base_user = base;
1015         }
1016         else
1017             ret = -EINVAL;
1018         break;
1019 
1020     case SEGBASE_GS_KERNEL:
1021         if ( is_canonical_address(base) )
1022             wrgsbase(base);
1023         else
1024             ret = -EINVAL;
1025         break;
1026 
1027     case SEGBASE_GS_USER_SEL:
1028     {
1029         unsigned int sel = (uint16_t)base;
1030 
1031         /*
1032          * We wish to update the user %gs from the GDT/LDT.  Currently, the
1033          * guest kernel's GS_BASE is in context.
1034          */
1035         asm volatile ( "swapgs" );
1036 
1037         if ( sel > 3 )
1038             /* Fix up RPL for non-NUL selectors. */
1039             sel |= 3;
1040         else if ( boot_cpu_data.x86_vendor &
1041                   (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
1042             /* Work around NUL segment behaviour on AMD hardware. */
1043             asm volatile ( "mov %[sel], %%gs"
1044                            :: [sel] "r" (FLAT_USER_DS32) );
1045 
1046         /*
1047          * Load the chosen selector, with fault handling.
1048          *
1049          * Errors ought to fail the hypercall, but that was never built in
1050          * originally, and Linux will BUG() if this call fails.
1051          *
1052          * NUL the selector in the case of an error.  This too needs to deal
1053          * with the AMD NUL segment behaviour, but it is already a slowpath in
1054          * #GP context so perform the flat load unconditionally to avoid
1055          * complicated logic.
1056          *
1057          * Anyone wanting to check for errors from this hypercall should
1058          * re-read %gs and compare against the input.
1059          */
1060         asm volatile ( "1: mov %[sel], %%gs\n\t"
1061                        ".section .fixup, \"ax\", @progbits\n\t"
1062                        "2: mov %k[flat], %%gs\n\t"
1063                        "   xor %[sel], %[sel]\n\t"
1064                        "   jmp 1b\n\t"
1065                        ".previous\n\t"
1066                        _ASM_EXTABLE(1b, 2b)
1067                        : [sel] "+r" (sel)
1068                        : [flat] "r" (FLAT_USER_DS32) );
1069 
1070         /* Update the cache of the inactive base, as read from the GDT/LDT. */
1071         v->arch.pv.gs_base_user = rdgsbase();
1072 
1073         asm volatile ( safe_swapgs );
1074         break;
1075     }
1076 
1077     default:
1078         ret = -EINVAL;
1079         break;
1080     }
1081 
1082     return ret;
1083 }
1084 
1085 
1086 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
1087 int check_descriptor(const struct domain *dom, seg_desc_t *d)
1088 {
1089     u32 a = d->a, b = d->b;
1090     u16 cs;
1091     unsigned int dpl;
1092 
1093     /* A not-present descriptor will always fault, so is safe. */
1094     if ( !(b & _SEGMENT_P) )
1095         return 1;
1096 
1097     /* Check and fix up the DPL. */
1098     dpl = (b >> 13) & 3;
1099     __fixup_guest_selector(dom, dpl);
1100     b = (b & ~_SEGMENT_DPL) | (dpl << 13);
1101 
1102     /* All code and data segments are okay. No base/limit checking. */
1103     if ( (b & _SEGMENT_S) )
1104     {
1105         if ( is_pv_32bit_domain(dom) )
1106         {
1107             unsigned long base, limit;
1108 
1109             if ( b & _SEGMENT_L )
1110                 goto bad;
1111 
1112             /*
1113              * Older PAE Linux guests use segments which are limited to
1114              * 0xf6800000. Extend these to allow access to the larger read-only
1115              * M2P table available in 32on64 mode.
1116              */
1117             base = (b & 0xff000000) | ((b & 0xff) << 16) | (a >> 16);
1118 
1119             limit = (b & 0xf0000) | (a & 0xffff);
1120             limit++; /* We add one because limit is inclusive. */
1121 
1122             if ( (b & _SEGMENT_G) )
1123                 limit <<= 12;
1124 
1125             if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
1126             {
1127                 a |= 0x0000ffff;
1128                 b |= 0x000f0000;
1129             }
1130         }
1131 
1132         goto good;
1133     }
1134 
1135     /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
1136     if ( (b & _SEGMENT_TYPE) == 0x000 )
1137         return 1;
1138 
1139     /* Everything but a call gate is discarded here. */
1140     if ( (b & _SEGMENT_TYPE) != 0xc00 )
1141         goto bad;
1142 
1143     /* Validate the target code selector. */
1144     cs = a >> 16;
1145     if ( !guest_gate_selector_okay(dom, cs) )
1146         goto bad;
1147     /*
1148      * Force DPL to zero, causing a GP fault with its error code indicating
1149      * the gate in use, allowing emulation. This is necessary because with
1150      * native guests (kernel in ring 3) call gates cannot be used directly
1151      * to transition from user to kernel mode (and whether a gate is used
1152      * to enter the kernel can only be determined when the gate is being
1153      * used), and with compat guests call gates cannot be used at all as
1154      * there are only 64-bit ones.
1155      * Store the original DPL in the selector's RPL field.
1156      */
1157     b &= ~_SEGMENT_DPL;
1158     cs = (cs & ~3) | dpl;
1159     a = (a & 0xffffU) | (cs << 16);
1160 
1161     /* Reserved bits must be zero. */
1162     if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
1163         goto bad;
1164 
1165  good:
1166     d->a = a;
1167     d->b = b;
1168     return 1;
1169  bad:
1170     return 0;
1171 }
1172 
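/*
 * Check whether a fault at @addr may stem from a 32-bit PV guest touching
 * compat M2P entries added by memory hotplug after its L2 tables were built.
 */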
1173 int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs)
1174 {
1175     struct domain *d = current->domain;
1176 
1177     return mem_hotplug && guest_mode(regs) && is_pv_32bit_domain(d) &&
1178            (addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
1179            (addr < MACH2PHYS_COMPAT_VIRT_END);
1180 }
1181 
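/*
 * Fix up such a fault by copying the relevant L2 entry from
 * compat_idle_pg_table_l2 into the faulting guest's page tables.
 */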
1182 int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
1183 {
1184     struct domain *d = current->domain;
1185     l4_pgentry_t *pl4e = NULL;
1186     l4_pgentry_t l4e;
1187     l3_pgentry_t  *pl3e = NULL;
1188     l3_pgentry_t l3e;
1189     l2_pgentry_t *pl2e = NULL;
1190     l2_pgentry_t l2e, idle_l2e;
1191     unsigned long mfn, idle_index;
1192     int ret = 0;
1193 
1194     if (!is_pv_32bit_domain(d))
1195         return 0;
1196 
1197     if ( (addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
1198          (addr >= MACH2PHYS_COMPAT_VIRT_END) )
1199         return 0;
1200 
1201     mfn = (read_cr3()) >> PAGE_SHIFT;
1202 
1203     pl4e = map_domain_page(_mfn(mfn));
1204 
1205     l4e = pl4e[0];
1206 
1207     if (!(l4e_get_flags(l4e) & _PAGE_PRESENT))
1208         goto unmap;
1209 
1210     mfn = l4e_get_pfn(l4e);
1211     /* We don't need to get the page type here since it is the current CR3. */
1212     pl3e = map_domain_page(_mfn(mfn));
1213 
1214     l3e = pl3e[3];
1215 
1216     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1217         goto unmap;
1218 
1219     mfn = l3e_get_pfn(l3e);
1220     pl2e = map_domain_page(_mfn(mfn));
1221 
1222     l2e = pl2e[l2_table_offset(addr)];
1223 
1224     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT))
1225         goto unmap;
1226 
1227     idle_index = (l2_table_offset(addr) -
1228                         COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/
1229                   sizeof(l2_pgentry_t);
1230     idle_l2e = compat_idle_pg_table_l2[idle_index];
1231     if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT))
1232         goto unmap;
1233 
1234     memcpy(&pl2e[l2_table_offset(addr)],
1235             &compat_idle_pg_table_l2[idle_index],
1236             sizeof(l2_pgentry_t));
1237 
1238     ret = EXCRET_fault_fixed;
1239 
1240 unmap:
1241     if ( pl4e )
1242         unmap_domain_page(pl4e);
1243     if ( pl3e )
1244         unmap_domain_page(pl3e);
1245     if ( pl2e )
1246         unmap_domain_page(pl2e);
1247 
1248     return ret;
1249 }
1250 
1251 void domain_set_alloc_bitsize(struct domain *d)
1252 {
1253     if ( !is_pv_32bit_domain(d) ||
1254          (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
1255          d->arch.physaddr_bitsize > 0 )
1256         return;
1257     d->arch.physaddr_bitsize =
1258         /* 2^n entries can be contained in guest's p2m mapping space */
1259         fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
1260         /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
1261         + PAGE_SHIFT;
1262 }
1263 
1264 unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
1265 {
1266     if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
1267         return bits;
1268     return min(d->arch.physaddr_bitsize, bits);
1269 }
1270 
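/*
 * Hand the not-yet-consumed part of the hot-added range to the domheap
 * allocator; pages already used for page tables are marked as in use first.
 */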
1271 static int transfer_pages_to_heap(struct mem_hotadd_info *info)
1272 {
1273     unsigned long i;
1274     struct page_info *pg;
1275 
1276     /*
1277      * Mark the already-allocated pages as in use before handing the free
1278      * pages to the buddy allocator, to avoid merging in free_heap_pages().
1279      */
1280     for (i = info->spfn; i < info->cur; i++)
1281     {
1282         pg = mfn_to_page(_mfn(i));
1283         pg->count_info = PGC_state_inuse;
1284     }
1285 
1286     init_domheap_pages(pfn_to_paddr(info->cur), pfn_to_paddr(info->epfn));
1287 
1288     return 0;
1289 }
1290 
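/*
 * Validate a proposed hot-add range: ordering, alignment, absence from the
 * existing PDX map, and that the range can hold its own m2p/compat m2p/
 * frame-table pages.
 */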
1291 static int mem_hotadd_check(unsigned long spfn, unsigned long epfn)
1292 {
1293     unsigned long s, e, length, sidx, eidx;
1294 
1295     if ( (spfn >= epfn) )
1296         return 0;
1297 
1298     if (pfn_to_pdx(epfn) > FRAMETABLE_NR)
1299         return 0;
1300 
1301     if ( (spfn | epfn) & ((1UL << PAGETABLE_ORDER) - 1) )
1302         return 0;
1303 
1304     if ( (spfn | epfn) & pfn_hole_mask )
1305         return 0;
1306 
1307     /* Make sure the new range is not present now */
1308     sidx = ((pfn_to_pdx(spfn) + PDX_GROUP_COUNT - 1)  & ~(PDX_GROUP_COUNT - 1))
1309             / PDX_GROUP_COUNT;
1310     eidx = (pfn_to_pdx(epfn - 1) & ~(PDX_GROUP_COUNT - 1)) / PDX_GROUP_COUNT;
1311     if (sidx >= eidx)
1312         return 0;
1313 
1314     s = find_next_zero_bit(pdx_group_valid, eidx, sidx);
1315     if ( s > eidx )
1316         return 0;
1317     e = find_next_bit(pdx_group_valid, eidx, s);
1318     if ( e < eidx )
1319         return 0;
1320 
1321     /* Calculate the maximum number of m2p/compat m2p/frametable pages required. */
1322     s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1));
1323     e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 3)) - 1) &
1324             ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1);
1325 
1326     length = (e - s) * sizeof(unsigned long);
1327 
1328     s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1));
1329     e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) &
1330             ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1);
1331 
1332     e = min_t(unsigned long, e,
1333             (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2);
1334 
1335     if ( e > s )
1336         length += (e - s) * sizeof(unsigned int);
1337 
1338     s = pfn_to_pdx(spfn) & ~(PDX_GROUP_COUNT - 1);
1339     e = ( pfn_to_pdx(epfn) + (PDX_GROUP_COUNT - 1) ) & ~(PDX_GROUP_COUNT - 1);
1340 
1341     length += (e - s) * sizeof(struct page_info);
1342 
1343     if ((length >> PAGE_SHIFT) > (epfn - spfn))
1344         return 0;
1345 
1346     return 1;
1347 }
1348 
1349 /*
1350  * Be somewhat paranoid about memory allocation failures here, since a
1351  * shortage of memory may well be the reason for the memory add.
1352  */
1353 int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
1354 {
1355     struct mem_hotadd_info info;
1356     int ret;
1357     nodeid_t node;
1358     unsigned long old_max = max_page, old_total = total_pages;
1359     unsigned long old_node_start, old_node_span, orig_online;
1360     unsigned long i;
1361 
1362     dprintk(XENLOG_INFO, "memory_add %lx ~ %lx with pxm %x\n", spfn, epfn, pxm);
1363 
1364     if ( !mem_hotadd_check(spfn, epfn) )
1365         return -EINVAL;
1366 
1367     if ( (node = setup_node(pxm)) == NUMA_NO_NODE )
1368         return -EINVAL;
1369 
1370     if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) )
1371     {
1372         printk(XENLOG_WARNING
1373                "pfn range %lx..%lx PXM %x node %x is not NUMA-valid\n",
1374                spfn, epfn, pxm, node);
1375         return -EINVAL;
1376     }
1377 
1378     i = virt_to_mfn(HYPERVISOR_VIRT_END - 1) + 1;
1379     if ( spfn < i )
1380     {
1381         ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), _mfn(spfn),
1382                                min(epfn, i) - spfn, PAGE_HYPERVISOR);
1383         if ( ret )
1384             goto destroy_directmap;
1385     }
1386     if ( i < epfn )
1387     {
1388         if ( i < spfn )
1389             i = spfn;
1390         ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), _mfn(i),
1391                                epfn - i, __PAGE_HYPERVISOR_RW);
1392         if ( ret )
1393             goto destroy_directmap;
1394     }
1395 
1396     old_node_start = node_start_pfn(node);
1397     old_node_span = node_spanned_pages(node);
1398     orig_online = node_online(node);
1399 
1400     if ( !orig_online )
1401     {
1402         dprintk(XENLOG_WARNING, "node %x pxm %x is not online\n", node, pxm);
1403         NODE_DATA(node)->node_start_pfn = spfn;
1404         NODE_DATA(node)->node_spanned_pages =
1405                 epfn - node_start_pfn(node);
1406         node_set_online(node);
1407     }
1408     else
1409     {
1410         if (node_start_pfn(node) > spfn)
1411             NODE_DATA(node)->node_start_pfn = spfn;
1412         if (node_end_pfn(node) < epfn)
1413             NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node);
1414     }
1415 
1416     info.spfn = spfn;
1417     info.epfn = epfn;
1418     info.cur = spfn;
1419 
1420     ret = extend_frame_table(&info);
1421     if (ret)
1422         goto destroy_frametable;
1423 
1424     /* Set max_page, as setup_m2p_table() will use it. */
1425     if (max_page < epfn)
1426     {
1427         max_page = epfn;
1428         max_pdx = pfn_to_pdx(max_page - 1) + 1;
1429     }
1430     total_pages += epfn - spfn;
1431 
1432     set_pdx_range(spfn, epfn);
1433     ret = setup_m2p_table(&info);
1434 
1435     if ( ret )
1436         goto destroy_m2p;
1437 
1438     /*
1439      * If hardware domain has IOMMU mappings but page tables are not
1440      * shared or being kept in sync then newly added memory needs to be
1441      * mapped here.
1442      */
1443     if ( is_iommu_enabled(hardware_domain) &&
1444          !iommu_use_hap_pt(hardware_domain) &&
1445          !need_iommu_pt_sync(hardware_domain) )
1446     {
1447         for ( i = spfn; i < epfn; i++ )
1448             if ( iommu_legacy_map(hardware_domain, _dfn(i), _mfn(i),
1449                                   PAGE_ORDER_4K,
1450                                   IOMMUF_readable | IOMMUF_writable) )
1451                 break;
1452         if ( i != epfn )
1453         {
1454             while (i-- > old_max)
1455                 /* If statement to satisfy __must_check. */
1456                 if ( iommu_legacy_unmap(hardware_domain, _dfn(i),
1457                                         PAGE_ORDER_4K) )
1458                     continue;
1459 
1460             goto destroy_m2p;
1461         }
1462     }
1463 
1464     /* We can't revert any more */
1465     share_hotadd_m2p_table(&info);
1466     transfer_pages_to_heap(&info);
1467 
1468     return 0;
1469 
1470 destroy_m2p:
1471     destroy_m2p_mapping(&info);
1472     max_page = old_max;
1473     total_pages = old_total;
1474     max_pdx = pfn_to_pdx(max_page - 1) + 1;
1475 destroy_frametable:
1476     cleanup_frame_table(&info);
1477     if ( !orig_online )
1478         node_set_offline(node);
1479     NODE_DATA(node)->node_start_pfn = old_node_start;
1480     NODE_DATA(node)->node_spanned_pages = old_node_span;
1481  destroy_directmap:
1482     destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1483                          (unsigned long)mfn_to_virt(epfn));
1484 
1485     return ret;
1486 }
1487 
1488 #include "compat/mm.c"
1489 
1490 /*
1491  * Local variables:
1492  * mode: C
1493  * c-file-style: "BSD"
1494  * c-basic-offset: 4
1495  * tab-width: 4
1496  * indent-tabs-mode: nil
1497  * End:
1498  */
1499