1 /******************************************************************************
2 * arch/x86/x86_64/mm.c
3 *
 * Modifications to Linux original are copyright (c) 2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify it under
6 * the terms of the GNU General Public License as published by the Free
7 * Software Foundation; either version 2 of the License, or (at your option)
8 * any later version.
9 *
10 * This program is distributed in the hope that it will be useful, but WITHOUT
11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * more details.
14 *
15 * You should have received a copy of the GNU General Public License along
16 * with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 EMIT_FILE;
20
21 #include <xen/lib.h>
22 #include <xen/init.h>
23 #include <xen/mm.h>
24 #include <xen/sched.h>
25 #include <xen/numa.h>
26 #include <xen/nodemask.h>
27 #include <xen/guest_access.h>
28 #include <xen/hypercall.h>
29 #include <xen/mem_access.h>
30 #include <asm/current.h>
31 #include <asm/asm_defns.h>
32 #include <asm/page.h>
33 #include <asm/flushtlb.h>
34 #include <asm/fixmap.h>
35 #include <asm/hypercall.h>
36 #include <asm/msr.h>
37 #include <asm/setup.h>
38 #include <asm/numa.h>
39 #include <asm/mem_paging.h>
40 #include <asm/mem_sharing.h>
41 #include <public/memory.h>
42
43 unsigned int __read_mostly m2p_compat_vstart = __HYPERVISOR_COMPAT_VIRT_START;
44
45 l2_pgentry_t *compat_idle_pg_table_l2;
46
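/*
 * Walk the page tables of a PV vCPU by hand and return a mapping of the
 * page containing @addr (offset included), or NULL if the address is not
 * canonical or the translation is not present.  Superpage (1G/2M) entries
 * are handled by adding the in-superpage offset.  The caller must
 * unmap_domain_page() the returned pointer.
 */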
void *do_page_walk(struct vcpu *v, unsigned long addr)
48 {
49 unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
50 l4_pgentry_t l4e, *l4t;
51 l3_pgentry_t l3e, *l3t;
52 l2_pgentry_t l2e, *l2t;
53 l1_pgentry_t l1e, *l1t;
54
55 if ( !is_pv_vcpu(v) || !is_canonical_address(addr) )
56 return NULL;
57
58 l4t = map_domain_page(_mfn(mfn));
59 l4e = l4t[l4_table_offset(addr)];
60 unmap_domain_page(l4t);
61 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
62 return NULL;
63
64 l3t = map_l3t_from_l4e(l4e);
65 l3e = l3t[l3_table_offset(addr)];
66 unmap_domain_page(l3t);
67 mfn = l3e_get_pfn(l3e);
68 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
69 return NULL;
70 if ( (l3e_get_flags(l3e) & _PAGE_PSE) )
71 {
72 mfn += PFN_DOWN(addr & ((1UL << L3_PAGETABLE_SHIFT) - 1));
73 goto ret;
74 }
75
76 l2t = map_domain_page(_mfn(mfn));
77 l2e = l2t[l2_table_offset(addr)];
78 unmap_domain_page(l2t);
79 mfn = l2e_get_pfn(l2e);
80 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
81 return NULL;
82 if ( (l2e_get_flags(l2e) & _PAGE_PSE) )
83 {
84 mfn += PFN_DOWN(addr & ((1UL << L2_PAGETABLE_SHIFT) - 1));
85 goto ret;
86 }
87
88 l1t = map_domain_page(_mfn(mfn));
89 l1e = l1t[l1_table_offset(addr)];
90 unmap_domain_page(l1t);
91 mfn = l1e_get_pfn(l1e);
92 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || !mfn_valid(_mfn(mfn)) )
93 return NULL;
94
95 ret:
96 return map_domain_page(_mfn(mfn)) + (addr & ~PAGE_MASK);
97 }
98
99 /*
100 * Allocate page table pages for m2p table
101 */
102 struct mem_hotadd_info
103 {
104 unsigned long spfn;
105 unsigned long epfn;
106 unsigned long cur;
107 };
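/*
 * The hot-added range [spfn, epfn) doubles as a simple bump allocator for
 * the page-table and frame-table pages needed to cover it:
 * alloc_hotadd_mfn() below hands out 2MB-aligned chunks starting at 'cur',
 * and whatever remains once the metadata is built is later given to the
 * heap by transfer_pages_to_heap().
 */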
108
static int hotadd_mem_valid(unsigned long pfn, struct mem_hotadd_info *info)
110 {
111 return (pfn < info->epfn && pfn >= info->spfn);
112 }
113
static mfn_t alloc_hotadd_mfn(struct mem_hotadd_info *info)
115 {
116 mfn_t mfn;
117
118 ASSERT((info->cur + ( 1UL << PAGETABLE_ORDER) < info->epfn) &&
119 info->cur >= info->spfn);
120
121 mfn = _mfn(info->cur);
122 info->cur += (1UL << PAGETABLE_ORDER);
123 return mfn;
124 }
125
126 #define M2P_NO_MAPPED 0
127 #define M2P_2M_MAPPED 1
128 #define M2P_1G_MAPPED 2
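/*
 * Report how the (read-only) M2P slot covering @spfn is currently mapped
 * in the idle page tables: by a 1G superpage, by a 2M superpage, or not at
 * all.  Memory hotadd uses this to skip ranges already mapped at boot.
 */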
static int m2p_mapped(unsigned long spfn)
130 {
131 unsigned long va;
132 l3_pgentry_t l3e;
133 l2_pgentry_t l2e;
134
135 va = RO_MPT_VIRT_START + spfn * sizeof(*machine_to_phys_mapping);
136 l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(va)], l3_table_offset(va));
137
138 switch ( l3e_get_flags(l3e) & (_PAGE_PRESENT | _PAGE_PSE) )
139 {
140 case _PAGE_PSE|_PAGE_PRESENT:
141 return M2P_1G_MAPPED;
142 /* Check for next level */
143 case _PAGE_PRESENT:
144 break;
145 default:
146 return M2P_NO_MAPPED;
147 }
148 l2e = l2e_from_l3e(l3e, l2_table_offset(va));
149
150 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
151 return M2P_2M_MAPPED;
152
153 return M2P_NO_MAPPED;
154 }
155
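/*
 * Share with privileged guests those M2P table pages which back the newly
 * hot-added range, mirroring what subarch_init_memory() does at boot for
 * the initial M2P table.
 */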
static int share_hotadd_m2p_table(struct mem_hotadd_info *info)
157 {
158 unsigned long i, n, v;
159 mfn_t m2p_start_mfn = INVALID_MFN;
160 l3_pgentry_t l3e;
161 l2_pgentry_t l2e;
162
163 /* M2P table is mappable read-only by privileged domains. */
164 for ( v = RDWR_MPT_VIRT_START;
165 v != RDWR_MPT_VIRT_END;
166 v += n << PAGE_SHIFT )
167 {
168 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
169 l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
170 l3_table_offset(v));
171 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
172 continue;
173 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
174 {
175 n = L1_PAGETABLE_ENTRIES;
176 l2e = l2e_from_l3e(l3e, l2_table_offset(v));
177 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
178 continue;
179 m2p_start_mfn = l2e_get_mfn(l2e);
180 }
181 else
182 continue;
183
184 for ( i = 0; i < n; i++ )
185 {
186 struct page_info *page = mfn_to_page(mfn_add(m2p_start_mfn, i));
187
188 if ( hotadd_mem_valid(mfn_x(mfn_add(m2p_start_mfn, i)), info) )
189 share_xen_page_with_privileged_guests(page, SHARE_ro);
190 }
191 }
192
193 for ( v = RDWR_COMPAT_MPT_VIRT_START;
194 v != RDWR_COMPAT_MPT_VIRT_END;
195 v += 1 << L2_PAGETABLE_SHIFT )
196 {
197 l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
198 l3_table_offset(v));
199 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
200 continue;
201 l2e = l2e_from_l3e(l3e, l2_table_offset(v));
202 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
203 continue;
204 m2p_start_mfn = l2e_get_mfn(l2e);
205
206 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
207 {
208 struct page_info *page = mfn_to_page(mfn_add(m2p_start_mfn, i));
209
210 if ( hotadd_mem_valid(mfn_x(mfn_add(m2p_start_mfn, i)), info) )
211 share_xen_page_with_privileged_guests(page, SHARE_ro);
212 }
213 }
214 return 0;
215 }
216
static void destroy_compat_m2p_mapping(struct mem_hotadd_info *info)
218 {
    unsigned long i, smap = info->spfn, emap = info->epfn;
220
221 if ( smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
222 return;
223
224 if ( emap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
225 emap = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
226
227 for ( i = smap; i < emap; )
228 {
229 unsigned int off = i * sizeof(*compat_machine_to_phys_mapping);
230 l2_pgentry_t *pl2e = compat_idle_pg_table_l2 + l2_table_offset(off);
231
232 if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT )
233 {
234 unsigned long pt_pfn = l2e_get_pfn(*pl2e);
235
236 if ( hotadd_mem_valid(pt_pfn, info) )
237 {
238 unsigned long rwva = RDWR_COMPAT_MPT_VIRT_START + off;
239
240 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
241 l2e_write(pl2e, l2e_empty());
242 }
243 }
244
245 i += 1UL << (L2_PAGETABLE_SHIFT - 2);
246 }
247
248 return;
249 }
250
static void destroy_m2p_mapping(struct mem_hotadd_info *info)
252 {
253 l3_pgentry_t *l3_ro_mpt;
254 unsigned long i, va, rwva;
255 unsigned long smap = info->spfn, emap = info->epfn;
256
257 l3_ro_mpt = map_l3t_from_l4e(
258 idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
259
    /*
     * No need to clean up M2P structures that existed before the hotplug.
     */
263 for (i = smap; i < emap;)
264 {
265 unsigned long pt_pfn;
266 l2_pgentry_t *pl2e;
267
268 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
269 rwva = RDWR_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
270
271 /* 1G mapping should not be created by mem hotadd */
272 if (!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PRESENT) ||
273 (l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) & _PAGE_PSE))
274 {
275 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
276 (1UL << (L3_PAGETABLE_SHIFT - 3) );
277 continue;
278 }
279
280 pl2e = map_l2t_from_l3e(l3_ro_mpt[l3_table_offset(va)]) +
281 l2_table_offset(va);
282 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
283 {
284 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
285 (1UL << (L2_PAGETABLE_SHIFT - 3)) ;
286 UNMAP_DOMAIN_PAGE(pl2e);
287 continue;
288 }
289
290 pt_pfn = l2e_get_pfn(*pl2e);
291 if ( hotadd_mem_valid(pt_pfn, info) )
292 {
293 destroy_xen_mappings(rwva, rwva + (1UL << L2_PAGETABLE_SHIFT));
294
295 l2e_write(pl2e, l2e_empty());
296 }
297 i = ( i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
298 (1UL << (L2_PAGETABLE_SHIFT - 3));
299 unmap_domain_page(pl2e);
300 }
301
302 UNMAP_DOMAIN_PAGE(l3_ro_mpt);
303
304 destroy_compat_m2p_mapping(info);
305
    /* Brute-force flush of all TLBs. */
307 flush_tlb_all();
308 return;
309 }
310
311 /*
312 * Allocate and map the compatibility mode machine-to-phys table.
 * spfn/epfn: the pfn range to be set up
 * free_s/free_e: the pfn range that is still free
315 */
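/*
 * Note on the ">> 2" scaling used below: each compat M2P entry is a 4-byte
 * unsigned int, so a virtual window of S bytes can describe S / 4 page
 * frames, and one 2MB L2 superpage of table covers
 * 1UL << (L2_PAGETABLE_SHIFT - 2) (i.e. 512Ki) frames.
 */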
static int setup_compat_m2p_table(struct mem_hotadd_info *info)
317 {
318 unsigned long i, smap, emap, epfn = info->epfn;
319 mfn_t mfn;
320 unsigned int n;
321 int err = 0;
322
323 smap = info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 2)) -1));
324
325 /*
     * Notice: For hot-added memory, only the range below m2p_compat_vstart
     * will be filled in (assuming memory is discontiguous at boot).
328 */
329 if ((smap > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2)) )
330 return 0;
331
332 if ( epfn > ((RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2) )
333 epfn = (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2;
334
335 emap = ( (epfn + ((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1 )) &
336 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) );
337
338 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
339 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
340 sizeof(*compat_machine_to_phys_mapping))
341 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
342 sizeof(*compat_machine_to_phys_mapping));
343
344 for ( i = smap; i < emap; i += (1UL << (L2_PAGETABLE_SHIFT - 2)) )
345 {
346 unsigned int off = i * sizeof(*compat_machine_to_phys_mapping);
347 l2_pgentry_t *pl2e = compat_idle_pg_table_l2 + l2_table_offset(off);
348 unsigned long rwva = RDWR_COMPAT_MPT_VIRT_START + off;
349
350 if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT )
351 continue;
352
353 for ( n = 0; n < CNT; ++n)
354 if ( mfn_valid(_mfn(i + n * PDX_GROUP_COUNT)) )
355 break;
356 if ( n == CNT )
357 continue;
358
359 mfn = alloc_hotadd_mfn(info);
360 err = map_pages_to_xen(rwva, mfn, 1UL << PAGETABLE_ORDER,
361 PAGE_HYPERVISOR);
362 if ( err )
363 break;
364 /* Fill with INVALID_M2P_ENTRY. */
365 memset((void *)rwva, 0xFF, 1UL << L2_PAGETABLE_SHIFT);
366 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
367 l2e_write(pl2e, l2e_from_mfn(mfn, _PAGE_PSE|_PAGE_PRESENT));
368 }
369 #undef CNT
370 #undef MFN
371 return err;
372 }
373
374 /*
375 * Allocate and map the machine-to-phys table.
 * The L3 for the RO/RW MPT and the L2 for the compat MPT should already be set up.
377 */
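/*
 * The native M2P uses 8-byte entries, hence the "- 3" scaling below: one
 * 2MB L2 superpage of table covers 1UL << (L2_PAGETABLE_SHIFT - 3)
 * (i.e. 256Ki) frames, and smap/emap are rounded to that granularity.
 */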
static int setup_m2p_table(struct mem_hotadd_info *info)
379 {
380 unsigned long i, va, smap, emap;
381 unsigned int n;
382 l2_pgentry_t *l2_ro_mpt = NULL;
383 l3_pgentry_t *l3_ro_mpt = NULL;
384 int ret = 0;
385
386 ASSERT(l4e_get_flags(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)])
387 & _PAGE_PRESENT);
388 l3_ro_mpt = l4e_to_l3e(idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]);
389
390 smap = (info->spfn & (~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1)));
391 emap = ((info->epfn + ((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1 )) &
392 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) -1));
393
394 va = RO_MPT_VIRT_START + smap * sizeof(*machine_to_phys_mapping);
395
396 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
397 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
398 sizeof(*machine_to_phys_mapping))
399
400 BUILD_BUG_ON((sizeof(*frame_table) & -sizeof(*frame_table)) % \
401 sizeof(*machine_to_phys_mapping));
402
403 i = smap;
404 while ( i < emap )
405 {
406 switch ( m2p_mapped(i) )
407 {
408 case M2P_1G_MAPPED:
409 i = ( i & ~((1UL << (L3_PAGETABLE_SHIFT - 3)) - 1)) +
410 (1UL << (L3_PAGETABLE_SHIFT - 3));
411 continue;
412 case M2P_2M_MAPPED:
413 i = (i & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1)) +
414 (1UL << (L2_PAGETABLE_SHIFT - 3));
415 continue;
416 default:
417 break;
418 }
419
420 va = RO_MPT_VIRT_START + i * sizeof(*machine_to_phys_mapping);
421
422 for ( n = 0; n < CNT; ++n)
423 if ( mfn_valid(_mfn(i + n * PDX_GROUP_COUNT)) )
424 break;
425 if ( n < CNT )
426 {
427 mfn_t mfn = alloc_hotadd_mfn(info);
428
429 ret = map_pages_to_xen(
430 RDWR_MPT_VIRT_START + i * sizeof(unsigned long),
431 mfn, 1UL << PAGETABLE_ORDER,
432 PAGE_HYPERVISOR);
433 if ( ret )
434 goto error;
435 /* Fill with INVALID_M2P_ENTRY. */
436 memset((void *)(RDWR_MPT_VIRT_START + i * sizeof(unsigned long)),
437 0xFF, 1UL << L2_PAGETABLE_SHIFT);
438
439 ASSERT(!(l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
440 _PAGE_PSE));
441 if ( l3e_get_flags(l3_ro_mpt[l3_table_offset(va)]) &
442 _PAGE_PRESENT )
443 l2_ro_mpt = l3e_to_l2e(l3_ro_mpt[l3_table_offset(va)]) +
444 l2_table_offset(va);
445 else
446 {
447 l2_ro_mpt = alloc_xen_pagetable();
448 if ( !l2_ro_mpt )
449 {
450 ret = -ENOMEM;
451 goto error;
452 }
453
454 clear_page(l2_ro_mpt);
455 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
456 l3e_from_paddr(__pa(l2_ro_mpt),
457 __PAGE_HYPERVISOR_RO | _PAGE_USER));
458 l2_ro_mpt += l2_table_offset(va);
459 }
460
461 /* NB. Cannot be GLOBAL: guest user mode should not see it. */
462 l2e_write(l2_ro_mpt, l2e_from_mfn(mfn,
463 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
464 }
465 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
466 l2_ro_mpt = NULL;
467 i += ( 1UL << (L2_PAGETABLE_SHIFT - 3));
468 }
469 #undef CNT
470 #undef MFN
471
472 ret = setup_compat_m2p_table(info);
473 error:
474 return ret;
475 }
476
void __init paging_init(void)
478 {
479 unsigned long i, mpt_size, va;
480 unsigned int n, memflags;
481 l3_pgentry_t *l3_ro_mpt;
482 l2_pgentry_t *l2_ro_mpt = NULL;
483 struct page_info *l1_pg;
484
    /*
     * We set up the L3s for the 1:1 mapping here if the host supports memory
     * hotplug, so that the 1:1 mapping does not need to be synced in the
     * page fault handler.
     */
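    /*
     * Illustrative: each L4 slot spans 1UL << L4_PAGETABLE_SHIFT (512GB) of
     * the direct map, so for example a mem_hotplug boundary of 1TB needs at
     * most two L3 pages to be pre-allocated by the loop below.
     */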
489 for ( va = DIRECTMAP_VIRT_START;
490 va < DIRECTMAP_VIRT_END && (void *)va < __va(mem_hotplug);
491 va += (1UL << L4_PAGETABLE_SHIFT) )
492 {
493 if ( !(l4e_get_flags(idle_pg_table[l4_table_offset(va)]) &
494 _PAGE_PRESENT) )
495 {
496 l3_pgentry_t *pl3t = alloc_xen_pagetable();
497
498 if ( !pl3t )
499 goto nomem;
500 clear_page(pl3t);
501 l4e_write(&idle_pg_table[l4_table_offset(va)],
502 l4e_from_paddr(__pa(pl3t), __PAGE_HYPERVISOR_RW));
503 }
504 }
505
506 /* Create user-accessible L2 directory to map the MPT for guests. */
507 if ( (l3_ro_mpt = alloc_xen_pagetable()) == NULL )
508 goto nomem;
509 clear_page(l3_ro_mpt);
510 l4e_write(&idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)],
511 l4e_from_paddr(__pa(l3_ro_mpt), __PAGE_HYPERVISOR_RO | _PAGE_USER));
512
513 /*
514 * Allocate and map the machine-to-phys table.
515 * This also ensures L3 is present for fixmaps.
516 */
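    /*
     * Worked example (illustrative): with max_page = 1UL << 22 (16GB of RAM
     * at 4K pages), the M2P needs 2^22 * 8 bytes = 32MB, which the rounding
     * below keeps as sixteen 2MB superpage mappings.
     */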
517 mpt_size = (max_page * BYTES_PER_LONG) + (1UL << L2_PAGETABLE_SHIFT) - 1;
518 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
519 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned long))
520 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
521 sizeof(*machine_to_phys_mapping))
522 BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
523 sizeof(*machine_to_phys_mapping));
524 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++ )
525 {
526 BUILD_BUG_ON(RO_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
527 va = RO_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT);
528 memflags = MEMF_node(phys_to_nid(i <<
529 (L2_PAGETABLE_SHIFT - 3 + PAGE_SHIFT)));
530
531 if ( cpu_has_page1gb &&
532 !((unsigned long)l2_ro_mpt & ~PAGE_MASK) &&
533 (mpt_size >> L3_PAGETABLE_SHIFT) > (i >> PAGETABLE_ORDER) )
534 {
535 unsigned int k, holes;
536
537 for ( holes = k = 0; k < 1 << PAGETABLE_ORDER; ++k)
538 {
539 for ( n = 0; n < CNT; ++n)
540 if ( mfn_valid(_mfn(MFN(i + k) + n * PDX_GROUP_COUNT)) )
541 break;
542 if ( n == CNT )
543 ++holes;
544 }
545 if ( k == holes )
546 {
547 i += (1UL << PAGETABLE_ORDER) - 1;
548 continue;
549 }
550 if ( holes == 0 &&
551 (l1_pg = alloc_domheap_pages(NULL, 2 * PAGETABLE_ORDER,
552 memflags)) != NULL )
553 {
554 map_pages_to_xen(
555 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
556 page_to_mfn(l1_pg),
557 1UL << (2 * PAGETABLE_ORDER),
558 PAGE_HYPERVISOR);
559 /* Fill with INVALID_M2P_ENTRY. */
560 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
561 0xFF, 1UL << L3_PAGETABLE_SHIFT);
562
563 ASSERT(!l2_table_offset(va));
564 /* NB. Cannot be GLOBAL: guest user mode should not see it. */
565 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
566 l3e_from_page(l1_pg,
567 /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
568 i += (1UL << PAGETABLE_ORDER) - 1;
569 continue;
570 }
571 }
572
573 for ( n = 0; n < CNT; ++n)
574 if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
575 break;
576 if ( n == CNT )
577 l1_pg = NULL;
578 else if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
579 memflags)) == NULL )
580 goto nomem;
581 else
582 {
583 map_pages_to_xen(
584 RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
585 page_to_mfn(l1_pg),
586 1UL << PAGETABLE_ORDER,
587 PAGE_HYPERVISOR);
588 /* Fill with INVALID_M2P_ENTRY. */
589 memset((void *)(RDWR_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT)),
590 0xFF, 1UL << L2_PAGETABLE_SHIFT);
591 }
592 if ( !((unsigned long)l2_ro_mpt & ~PAGE_MASK) )
593 {
594 if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL )
595 goto nomem;
596 clear_page(l2_ro_mpt);
597 l3e_write(&l3_ro_mpt[l3_table_offset(va)],
598 l3e_from_paddr(__pa(l2_ro_mpt),
599 __PAGE_HYPERVISOR_RO | _PAGE_USER));
600 ASSERT(!l2_table_offset(va));
601 }
602 /* NB. Cannot be GLOBAL: guest user mode should not see it. */
603 if ( l1_pg )
604 l2e_write(l2_ro_mpt, l2e_from_page(
605 l1_pg, /*_PAGE_GLOBAL|*/_PAGE_PSE|_PAGE_USER|_PAGE_PRESENT));
606 l2_ro_mpt++;
607 }
608 #undef CNT
609 #undef MFN
610
611 /* Create user-accessible L2 directory to map the MPT for compat guests. */
612 if ( (l2_ro_mpt = alloc_xen_pagetable()) == NULL )
613 goto nomem;
614 compat_idle_pg_table_l2 = l2_ro_mpt;
615 clear_page(l2_ro_mpt);
616 /* Allocate and map the compatibility mode machine-to-phys table. */
617 mpt_size = (mpt_size >> 1) + (1UL << (L2_PAGETABLE_SHIFT - 1));
618 if ( mpt_size > RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START )
619 mpt_size = RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START;
620 mpt_size &= ~((1UL << L2_PAGETABLE_SHIFT) - 1UL);
621 if ( (m2p_compat_vstart + mpt_size) < MACH2PHYS_COMPAT_VIRT_END )
622 m2p_compat_vstart = MACH2PHYS_COMPAT_VIRT_END - mpt_size;
623 #define MFN(x) (((x) << L2_PAGETABLE_SHIFT) / sizeof(unsigned int))
624 #define CNT ((sizeof(*frame_table) & -sizeof(*frame_table)) / \
625 sizeof(*compat_machine_to_phys_mapping))
626 BUILD_BUG_ON((sizeof(*frame_table) & ~sizeof(*frame_table)) % \
627 sizeof(*compat_machine_to_phys_mapping));
628 for ( i = 0; i < (mpt_size >> L2_PAGETABLE_SHIFT); i++, l2_ro_mpt++ )
629 {
630 memflags = MEMF_node(phys_to_nid(i <<
631 (L2_PAGETABLE_SHIFT - 2 + PAGE_SHIFT)));
632 for ( n = 0; n < CNT; ++n)
633 if ( mfn_valid(_mfn(MFN(i) + n * PDX_GROUP_COUNT)) )
634 break;
635 if ( n == CNT )
636 continue;
637 if ( (l1_pg = alloc_domheap_pages(NULL, PAGETABLE_ORDER,
638 memflags)) == NULL )
639 goto nomem;
640 map_pages_to_xen(
641 RDWR_COMPAT_MPT_VIRT_START + (i << L2_PAGETABLE_SHIFT),
642 page_to_mfn(l1_pg),
643 1UL << PAGETABLE_ORDER,
644 PAGE_HYPERVISOR);
645 /* Fill with INVALID_M2P_ENTRY. */
646 memset((void *)(RDWR_COMPAT_MPT_VIRT_START +
647 (i << L2_PAGETABLE_SHIFT)),
648 0xFF, 1UL << L2_PAGETABLE_SHIFT);
649 /* NB. Cannot be GLOBAL as the ptes get copied into per-VM space. */
650 l2e_write(l2_ro_mpt, l2e_from_page(l1_pg, _PAGE_PSE|_PAGE_PRESENT));
651 }
652 #undef CNT
653 #undef MFN
654
655 machine_to_phys_mapping_valid = 1;
656
657 /* Set up linear page table mapping. */
658 l4e_write(&idle_pg_table[l4_table_offset(LINEAR_PT_VIRT_START)],
659 l4e_from_paddr(__pa(idle_pg_table), __PAGE_HYPERVISOR_RW));
660 return;
661
662 nomem:
663 panic("Not enough memory for m2p table\n");
664 }
665
void __init zap_low_mappings(void)
667 {
668 BUG_ON(num_online_cpus() != 1);
669
670 /* Remove aliased mapping of first 1:1 PML4 entry. */
671 l4e_write(&idle_pg_table[0], l4e_empty());
672 flush_local(FLUSH_TLB_GLOBAL);
673
674 /* Replace with mapping of the boot trampoline only. */
675 map_pages_to_xen(trampoline_phys, maddr_to_mfn(trampoline_phys),
676 PFN_UP(trampoline_end - trampoline_start),
677 __PAGE_HYPERVISOR_RX);
678 }
679
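/*
 * The argument translation area is a small per-vCPU region in the
 * per-domain mappings, used by the compat hypercall shims to copy and
 * convert 32-bit guest argument structures into their native layout.
 */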
int setup_compat_arg_xlat(struct vcpu *v)
681 {
682 return create_perdomain_mapping(v->domain, ARG_XLAT_START(v),
683 PFN_UP(COMPAT_ARG_XLAT_SIZE),
684 NULL, NIL(struct page_info *));
685 }
686
void free_compat_arg_xlat(struct vcpu *v)
688 {
689 destroy_perdomain_mapping(v->domain, ARG_XLAT_START(v),
690 PFN_UP(COMPAT_ARG_XLAT_SIZE));
691 }
692
static void cleanup_frame_table(struct mem_hotadd_info *info)
694 {
695 unsigned long sva, eva;
696 l3_pgentry_t l3e;
697 l2_pgentry_t l2e;
698 mfn_t spfn, epfn;
699
700 spfn = _mfn(info->spfn);
701 epfn = _mfn(info->epfn);
702
703 sva = (unsigned long)mfn_to_page(spfn);
704 eva = (unsigned long)mfn_to_page(epfn);
705
    /* Initialize all pages in the range. */
707 memset((void *)sva, -1, eva - sva);
708
709 while (sva < eva)
710 {
711 l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(sva)],
712 l3_table_offset(sva));
713 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) ||
714 (l3e_get_flags(l3e) & _PAGE_PSE) )
715 {
716 sva = (sva & ~((1UL << L3_PAGETABLE_SHIFT) - 1)) +
717 (1UL << L3_PAGETABLE_SHIFT);
718 continue;
719 }
720
721 l2e = l2e_from_l3e(l3e, l2_table_offset(sva));
722 ASSERT(l2e_get_flags(l2e) & _PAGE_PRESENT);
723
724 if ( (l2e_get_flags(l2e) & (_PAGE_PRESENT | _PAGE_PSE)) ==
725 (_PAGE_PSE | _PAGE_PRESENT) )
726 {
727 if (hotadd_mem_valid(l2e_get_pfn(l2e), info))
728 destroy_xen_mappings(sva & ~((1UL << L2_PAGETABLE_SHIFT) - 1),
729 ((sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
730 (1UL << L2_PAGETABLE_SHIFT) - 1));
731
732 sva = (sva & ~((1UL << L2_PAGETABLE_SHIFT) -1 )) +
733 (1UL << L2_PAGETABLE_SHIFT);
734 continue;
735 }
736
737 ASSERT(l1e_get_flags(l1e_from_l2e(l2e, l1_table_offset(sva))) &
738 _PAGE_PRESENT);
739
740 sva = (sva & PAGE_MASK) + PAGE_SIZE;
741 }
742
    /* Brute-force flush of all TLBs. */
744 flush_tlb_all();
745 }
746
static int setup_frametable_chunk(void *start, void *end,
                                  struct mem_hotadd_info *info)
749 {
750 unsigned long s = (unsigned long)start;
751 unsigned long e = (unsigned long)end;
752 mfn_t mfn;
753 int err;
754
755 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
756 ASSERT(!(e & ((1 << L2_PAGETABLE_SHIFT) - 1)));
757
758 for ( ; s < e; s += (1UL << L2_PAGETABLE_SHIFT))
759 {
760 mfn = alloc_hotadd_mfn(info);
761 err = map_pages_to_xen(s, mfn, 1UL << PAGETABLE_ORDER,
762 PAGE_HYPERVISOR);
763 if ( err )
764 return err;
765 }
766 memset(start, -1, s - (unsigned long)start);
767
768 return 0;
769 }
770
static int extend_frame_table(struct mem_hotadd_info *info)
772 {
773 unsigned long cidx, nidx, eidx;
774 mfn_t spfn, epfn;
775
776 spfn = _mfn(info->spfn);
777 epfn = _mfn(info->epfn);
778
779 eidx = DIV_ROUND_UP(mfn_to_pdx(epfn), PDX_GROUP_COUNT);
780 nidx = cidx = mfn_to_pdx(spfn)/PDX_GROUP_COUNT;
781
782 ASSERT( mfn_to_pdx(epfn) <= (DIRECTMAP_SIZE >> PAGE_SHIFT) &&
783 mfn_to_pdx(epfn) <= FRAMETABLE_NR );
784
785 if ( test_bit(cidx, pdx_group_valid) )
786 cidx = find_next_zero_bit(pdx_group_valid, eidx, cidx);
787
788 if ( cidx >= eidx )
789 return 0;
790
791 while ( cidx < eidx )
792 {
793 int err;
794
795 nidx = find_next_bit(pdx_group_valid, eidx, cidx);
796 if ( nidx >= eidx )
797 nidx = eidx;
798 err = setup_frametable_chunk(pdx_to_page(cidx * PDX_GROUP_COUNT ),
799 pdx_to_page(nidx * PDX_GROUP_COUNT),
800 info);
801 if ( err )
802 return err;
803
804 cidx = find_next_zero_bit(pdx_group_valid, eidx, nidx);
805 }
806
807 memset(mfn_to_page(spfn), 0,
808 (unsigned long)mfn_to_page(epfn) - (unsigned long)mfn_to_page(spfn));
809 return 0;
810 }
811
void __init subarch_init_memory(void)
813 {
814 unsigned long i, n, v, m2p_start_mfn;
815 l3_pgentry_t l3e;
816 l2_pgentry_t l2e;
817
818 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
819 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
820 /* M2P table is mappable read-only by privileged domains. */
821 for ( v = RDWR_MPT_VIRT_START;
822 v != RDWR_MPT_VIRT_END;
823 v += n << PAGE_SHIFT )
824 {
825 n = L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES;
826 l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
827 l3_table_offset(v));
828 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
829 continue;
830 if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
831 {
832 n = L1_PAGETABLE_ENTRIES;
833 l2e = l2e_from_l3e(l3e, l2_table_offset(v));
834 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
835 continue;
836 m2p_start_mfn = l2e_get_pfn(l2e);
837 }
838 else
839 {
840 m2p_start_mfn = l3e_get_pfn(l3e);
841 }
842
843 for ( i = 0; i < n; i++ )
844 share_xen_page_with_privileged_guests(
845 mfn_to_page(_mfn(m2p_start_mfn + i)), SHARE_ro);
846 }
847
848 for ( v = RDWR_COMPAT_MPT_VIRT_START;
849 v != RDWR_COMPAT_MPT_VIRT_END;
850 v += 1 << L2_PAGETABLE_SHIFT )
851 {
852 l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
853 l3_table_offset(v));
854 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
855 continue;
856 l2e = l2e_from_l3e(l3e, l2_table_offset(v));
857 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
858 continue;
859 m2p_start_mfn = l2e_get_pfn(l2e);
860
861 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
862 share_xen_page_with_privileged_guests(
863 mfn_to_page(_mfn(m2p_start_mfn + i)), SHARE_ro);
864 }
865
866 /* Mark all of direct map NX if hardware supports it. */
867 if ( !cpu_has_nx )
868 return;
869
870 for ( i = l4_table_offset(DIRECTMAP_VIRT_START);
871 i < l4_table_offset(DIRECTMAP_VIRT_END); ++i )
872 {
873 l4_pgentry_t l4e = idle_pg_table[i];
874
875 if ( l4e_get_flags(l4e) & _PAGE_PRESENT )
876 {
877 l4e_add_flags(l4e, _PAGE_NX_BIT);
878 idle_pg_table[i] = l4e;
879 }
880 }
881 }
882
long subarch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
884 {
885 struct xen_machphys_mfn_list xmml;
886 l3_pgentry_t l3e;
887 l2_pgentry_t l2e;
888 unsigned long v, limit;
889 xen_pfn_t mfn, last_mfn;
890 unsigned int i;
891 long rc = 0;
892
893 switch ( cmd )
894 {
895 case XENMEM_machphys_mfn_list:
896 if ( copy_from_guest(&xmml, arg, 1) )
897 return -EFAULT;
898
899 BUILD_BUG_ON(RDWR_MPT_VIRT_START & ((1UL << L3_PAGETABLE_SHIFT) - 1));
900 BUILD_BUG_ON(RDWR_MPT_VIRT_END & ((1UL << L3_PAGETABLE_SHIFT) - 1));
901 for ( i = 0, v = RDWR_MPT_VIRT_START, last_mfn = 0;
902 (i != xmml.max_extents) &&
903 (v < (unsigned long)(machine_to_phys_mapping + max_page));
904 i++, v += 1UL << L2_PAGETABLE_SHIFT )
905 {
906 l3e = l3e_from_l4e(idle_pg_table[l4_table_offset(v)],
907 l3_table_offset(v));
908 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
909 mfn = last_mfn;
910 else if ( !(l3e_get_flags(l3e) & _PAGE_PSE) )
911 {
912 l2e = l2e_from_l3e(l3e, l2_table_offset(v));
913 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
914 mfn = l2e_get_pfn(l2e);
915 else
916 mfn = last_mfn;
917 }
918 else
919 {
920 mfn = l3e_get_pfn(l3e)
921 + (l2_table_offset(v) << PAGETABLE_ORDER);
922 }
923 ASSERT(mfn);
924 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
925 return -EFAULT;
926 last_mfn = mfn;
927 }
928
929 xmml.nr_extents = i;
930 if ( __copy_to_guest(arg, &xmml, 1) )
931 return -EFAULT;
932
933 break;
934
935 case XENMEM_machphys_compat_mfn_list:
936 if ( copy_from_guest(&xmml, arg, 1) )
937 return -EFAULT;
938
939 limit = (unsigned long)(compat_machine_to_phys_mapping + max_page);
940 if ( limit > RDWR_COMPAT_MPT_VIRT_END )
941 limit = RDWR_COMPAT_MPT_VIRT_END;
942 for ( i = 0, v = RDWR_COMPAT_MPT_VIRT_START, last_mfn = 0;
943 (i != xmml.max_extents) && (v < limit);
944 i++, v += 1 << L2_PAGETABLE_SHIFT )
945 {
946 l2e = compat_idle_pg_table_l2[l2_table_offset(v)];
947 if ( l2e_get_flags(l2e) & _PAGE_PRESENT )
948 mfn = l2e_get_pfn(l2e);
949 else
950 mfn = last_mfn;
951 ASSERT(mfn);
952 if ( copy_to_guest_offset(xmml.extent_start, i, &mfn, 1) )
953 return -EFAULT;
954 last_mfn = mfn;
955 }
956
957 xmml.nr_extents = i;
958 if ( __copy_to_guest(arg, &xmml, 1) )
959 rc = -EFAULT;
960
961 break;
962
963 case XENMEM_get_sharing_freed_pages:
964 return mem_sharing_get_nr_saved_mfns();
965
966 case XENMEM_get_sharing_shared_pages:
967 return mem_sharing_get_nr_shared_mfns();
968
969 case XENMEM_paging_op:
970 return mem_paging_memop(guest_handle_cast(arg, xen_mem_paging_op_t));
971
972 #ifdef CONFIG_MEM_SHARING
973 case XENMEM_sharing_op:
974 return mem_sharing_memop(guest_handle_cast(arg, xen_mem_sharing_op_t));
975 #endif
976
977 default:
978 rc = -ENOSYS;
979 break;
980 }
981
982 return rc;
983 }
984
long do_stack_switch(unsigned long ss, unsigned long esp)
986 {
987 fixup_guest_stack_selector(current->domain, ss);
988 current->arch.pv.kernel_ss = ss;
989 current->arch.pv.kernel_sp = esp;
990 return 0;
991 }
992
long do_set_segment_base(unsigned int which, unsigned long base)
994 {
995 struct vcpu *v = current;
996 long ret = 0;
997
998 if ( is_pv_32bit_vcpu(v) )
999 return -ENOSYS; /* x86/64 only. */
1000
1001 switch ( which )
1002 {
1003 case SEGBASE_FS:
1004 if ( is_canonical_address(base) )
1005 wrfsbase(base);
1006 else
1007 ret = -EINVAL;
1008 break;
1009
1010 case SEGBASE_GS_USER:
1011 if ( is_canonical_address(base) )
1012 {
1013 wrgsshadow(base);
1014 v->arch.pv.gs_base_user = base;
1015 }
1016 else
1017 ret = -EINVAL;
1018 break;
1019
1020 case SEGBASE_GS_KERNEL:
1021 if ( is_canonical_address(base) )
1022 wrgsbase(base);
1023 else
1024 ret = -EINVAL;
1025 break;
1026
1027 case SEGBASE_GS_USER_SEL:
1028 {
1029 unsigned int sel = (uint16_t)base;
1030
1031 /*
1032 * We wish to update the user %gs from the GDT/LDT. Currently, the
1033 * guest kernel's GS_BASE is in context.
1034 */
1035 asm volatile ( "swapgs" );
1036
1037 if ( sel > 3 )
1038 /* Fix up RPL for non-NUL selectors. */
1039 sel |= 3;
1040 else if ( boot_cpu_data.x86_vendor &
1041 (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
1042 /* Work around NUL segment behaviour on AMD hardware. */
1043 asm volatile ( "mov %[sel], %%gs"
1044 :: [sel] "r" (FLAT_USER_DS32) );
1045
1046 /*
1047 * Load the chosen selector, with fault handling.
1048 *
1049 * Errors ought to fail the hypercall, but that was never built in
1050 * originally, and Linux will BUG() if this call fails.
1051 *
1052 * NUL the selector in the case of an error. This too needs to deal
1053 * with the AMD NUL segment behaviour, but it is already a slowpath in
1054 * #GP context so perform the flat load unconditionally to avoid
1055 * complicated logic.
1056 *
1057 * Anyone wanting to check for errors from this hypercall should
1058 * re-read %gs and compare against the input.
1059 */
1060 asm volatile ( "1: mov %[sel], %%gs\n\t"
1061 ".section .fixup, \"ax\", @progbits\n\t"
1062 "2: mov %k[flat], %%gs\n\t"
1063 " xor %[sel], %[sel]\n\t"
1064 " jmp 1b\n\t"
1065 ".previous\n\t"
1066 _ASM_EXTABLE(1b, 2b)
1067 : [sel] "+r" (sel)
1068 : [flat] "r" (FLAT_USER_DS32) );
1069
1070 /* Update the cache of the inactive base, as read from the GDT/LDT. */
1071 v->arch.pv.gs_base_user = rdgsbase();
1072
1073 asm volatile ( safe_swapgs );
1074 break;
1075 }
1076
1077 default:
1078 ret = -EINVAL;
1079 break;
1080 }
1081
1082 return ret;
1083 }
1084
1085
/* Returns TRUE if the given descriptor is valid for the GDT or LDT. */
int check_descriptor(const struct domain *dom, seg_desc_t *d)
1088 {
1089 u32 a = d->a, b = d->b;
1090 u16 cs;
1091 unsigned int dpl;
1092
1093 /* A not-present descriptor will always fault, so is safe. */
1094 if ( !(b & _SEGMENT_P) )
1095 return 1;
1096
1097 /* Check and fix up the DPL. */
1098 dpl = (b >> 13) & 3;
1099 __fixup_guest_selector(dom, dpl);
1100 b = (b & ~_SEGMENT_DPL) | (dpl << 13);
1101
1102 /* All code and data segments are okay. No base/limit checking. */
1103 if ( (b & _SEGMENT_S) )
1104 {
1105 if ( is_pv_32bit_domain(dom) )
1106 {
1107 unsigned long base, limit;
1108
1109 if ( b & _SEGMENT_L )
1110 goto bad;
1111
1112 /*
1113 * Older PAE Linux guests use segments which are limited to
1114 * 0xf6800000. Extend these to allow access to the larger read-only
1115 * M2P table available in 32on64 mode.
1116 */
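            /*
             * Illustrative: a descriptor word pair a = 0x000067ff,
             * b = 0x00cff300 (flat ring-3 data segment, G=1) decodes below
             * to base 0 and limit 0xf6800000 and, assuming the default
             * compat layout, is then widened to the full 4GB limit.
             */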
1117 base = (b & 0xff000000) | ((b & 0xff) << 16) | (a >> 16);
1118
1119 limit = (b & 0xf0000) | (a & 0xffff);
1120 limit++; /* We add one because limit is inclusive. */
1121
1122 if ( (b & _SEGMENT_G) )
1123 limit <<= 12;
1124
1125 if ( (base == 0) && (limit > HYPERVISOR_COMPAT_VIRT_START(dom)) )
1126 {
1127 a |= 0x0000ffff;
1128 b |= 0x000f0000;
1129 }
1130 }
1131
1132 goto good;
1133 }
1134
1135 /* Invalid type 0 is harmless. It is used for 2nd half of a call gate. */
1136 if ( (b & _SEGMENT_TYPE) == 0x000 )
1137 return 1;
1138
1139 /* Everything but a call gate is discarded here. */
1140 if ( (b & _SEGMENT_TYPE) != 0xc00 )
1141 goto bad;
1142
1143 /* Validate the target code selector. */
1144 cs = a >> 16;
1145 if ( !guest_gate_selector_okay(dom, cs) )
1146 goto bad;
1147 /*
1148 * Force DPL to zero, causing a GP fault with its error code indicating
1149 * the gate in use, allowing emulation. This is necessary because with
1150 * native guests (kernel in ring 3) call gates cannot be used directly
1151 * to transition from user to kernel mode (and whether a gate is used
1152 * to enter the kernel can only be determined when the gate is being
1153 * used), and with compat guests call gates cannot be used at all as
1154 * there are only 64-bit ones.
1155 * Store the original DPL in the selector's RPL field.
1156 */
1157 b &= ~_SEGMENT_DPL;
1158 cs = (cs & ~3) | dpl;
1159 a = (a & 0xffffU) | (cs << 16);
1160
1161 /* Reserved bits must be zero. */
1162 if ( b & (is_pv_32bit_domain(dom) ? 0xe0 : 0xff) )
1163 goto bad;
1164
1165 good:
1166 d->a = a;
1167 d->b = b;
1168 return 1;
1169 bad:
1170 return 0;
1171 }
1172
int pagefault_by_memadd(unsigned long addr, struct cpu_user_regs *regs)
1174 {
1175 struct domain *d = current->domain;
1176
1177 return mem_hotplug && guest_mode(regs) && is_pv_32bit_domain(d) &&
1178 (addr >= HYPERVISOR_COMPAT_VIRT_START(d)) &&
1179 (addr < MACH2PHYS_COMPAT_VIRT_END);
1180 }
1181
int handle_memadd_fault(unsigned long addr, struct cpu_user_regs *regs)
1183 {
1184 struct domain *d = current->domain;
1185 l4_pgentry_t *pl4e = NULL;
1186 l4_pgentry_t l4e;
1187 l3_pgentry_t *pl3e = NULL;
1188 l3_pgentry_t l3e;
1189 l2_pgentry_t *pl2e = NULL;
1190 l2_pgentry_t l2e, idle_l2e;
1191 unsigned long mfn, idle_index;
1192 int ret = 0;
1193
1194 if (!is_pv_32bit_domain(d))
1195 return 0;
1196
1197 if ( (addr < HYPERVISOR_COMPAT_VIRT_START(d)) ||
1198 (addr >= MACH2PHYS_COMPAT_VIRT_END) )
1199 return 0;
1200
1201 mfn = (read_cr3()) >> PAGE_SHIFT;
1202
1203 pl4e = map_domain_page(_mfn(mfn));
1204
1205 l4e = pl4e[0];
1206
1207 if (!(l4e_get_flags(l4e) & _PAGE_PRESENT))
1208 goto unmap;
1209
1210 mfn = l4e_get_pfn(l4e);
    /* We don't need to get the page type here since it is the current CR3. */
1212 pl3e = map_domain_page(_mfn(mfn));
1213
1214 l3e = pl3e[3];
1215
1216 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1217 goto unmap;
1218
1219 mfn = l3e_get_pfn(l3e);
1220 pl2e = map_domain_page(_mfn(mfn));
1221
1222 l2e = pl2e[l2_table_offset(addr)];
1223
1224 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT))
1225 goto unmap;
1226
1227 idle_index = (l2_table_offset(addr) -
1228 COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d))/
1229 sizeof(l2_pgentry_t);
1230 idle_l2e = compat_idle_pg_table_l2[idle_index];
1231 if (!(l2e_get_flags(idle_l2e) & _PAGE_PRESENT))
1232 goto unmap;
1233
1234 memcpy(&pl2e[l2_table_offset(addr)],
1235 &compat_idle_pg_table_l2[idle_index],
1236 sizeof(l2_pgentry_t));
1237
1238 ret = EXCRET_fault_fixed;
1239
1240 unmap:
1241 if ( pl4e )
1242 unmap_domain_page(pl4e);
1243 if ( pl3e )
1244 unmap_domain_page(pl3e);
1245 if ( pl2e )
1246 unmap_domain_page(pl2e);
1247
1248 return ret;
1249 }
1250
void domain_set_alloc_bitsize(struct domain *d)
1252 {
1253 if ( !is_pv_32bit_domain(d) ||
1254 (MACH2PHYS_COMPAT_NR_ENTRIES(d) >= max_page) ||
1255 d->arch.physaddr_bitsize > 0 )
1256 return;
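    /*
     * Illustrative: with 1UL << 20 compat M2P entries the guest can name
     * 2^20 frames, so fls(1 << 20) - 1 + PAGE_SHIFT = 20 + 12 = 32 bits.
     */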
1257 d->arch.physaddr_bitsize =
1258 /* 2^n entries can be contained in guest's p2m mapping space */
1259 fls(MACH2PHYS_COMPAT_NR_ENTRIES(d)) - 1
1260 /* 2^n pages -> 2^(n+PAGE_SHIFT) bits */
1261 + PAGE_SHIFT;
1262 }
1263
unsigned int domain_clamp_alloc_bitsize(struct domain *d, unsigned int bits)
1265 {
1266 if ( (d == NULL) || (d->arch.physaddr_bitsize == 0) )
1267 return bits;
1268 return min(d->arch.physaddr_bitsize, bits);
1269 }
1270
static int transfer_pages_to_heap(struct mem_hotadd_info *info)
1272 {
1273 unsigned long i;
1274 struct page_info *pg;
1275
    /*
     * Mark the allocated pages before handing the free pages to the buddy
     * allocator, to avoid merging in free_heap_pages().
     */
1280 for (i = info->spfn; i < info->cur; i++)
1281 {
1282 pg = mfn_to_page(_mfn(i));
1283 pg->count_info = PGC_state_inuse;
1284 }
1285
1286 init_domheap_pages(pfn_to_paddr(info->cur), pfn_to_paddr(info->epfn));
1287
1288 return 0;
1289 }
1290
static int mem_hotadd_check(unsigned long spfn, unsigned long epfn)
1292 {
1293 unsigned long s, e, length, sidx, eidx;
1294
1295 if ( (spfn >= epfn) )
1296 return 0;
1297
1298 if (pfn_to_pdx(epfn) > FRAMETABLE_NR)
1299 return 0;
1300
1301 if ( (spfn | epfn) & ((1UL << PAGETABLE_ORDER) - 1) )
1302 return 0;
1303
1304 if ( (spfn | epfn) & pfn_hole_mask )
1305 return 0;
1306
1307 /* Make sure the new range is not present now */
1308 sidx = ((pfn_to_pdx(spfn) + PDX_GROUP_COUNT - 1) & ~(PDX_GROUP_COUNT - 1))
1309 / PDX_GROUP_COUNT;
1310 eidx = (pfn_to_pdx(epfn - 1) & ~(PDX_GROUP_COUNT - 1)) / PDX_GROUP_COUNT;
1311 if (sidx >= eidx)
1312 return 0;
1313
1314 s = find_next_zero_bit(pdx_group_valid, eidx, sidx);
1315 if ( s > eidx )
1316 return 0;
1317 e = find_next_bit(pdx_group_valid, eidx, s);
1318 if ( e < eidx )
1319 return 0;
1320
    /* Calculate the maximum number of M2P/compat M2P/frametable pages required. */
1322 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1));
1323 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 3)) - 1) &
1324 ~((1UL << (L2_PAGETABLE_SHIFT - 3)) - 1);
1325
1326 length = (e - s) * sizeof(unsigned long);
1327
1328 s = (spfn & ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1));
1329 e = (epfn + (1UL << (L2_PAGETABLE_SHIFT - 2)) - 1) &
1330 ~((1UL << (L2_PAGETABLE_SHIFT - 2)) - 1);
1331
1332 e = min_t(unsigned long, e,
1333 (RDWR_COMPAT_MPT_VIRT_END - RDWR_COMPAT_MPT_VIRT_START) >> 2);
1334
1335 if ( e > s )
1336 length += (e -s) * sizeof(unsigned int);
1337
1338 s = pfn_to_pdx(spfn) & ~(PDX_GROUP_COUNT - 1);
1339 e = ( pfn_to_pdx(epfn) + (PDX_GROUP_COUNT - 1) ) & ~(PDX_GROUP_COUNT - 1);
1340
1341 length += (e - s) * sizeof(struct page_info);
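    /*
     * Rough illustrative sizing for a 1GB (2^18 page) hot-add: ~2MB of M2P,
     * ~1MB of compat M2P and, assuming a 32-byte struct page_info, ~8MB of
     * frame table -- comfortably below the size check that follows.
     */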
1342
1343 if ((length >> PAGE_SHIFT) > (epfn - spfn))
1344 return 0;
1345
1346 return 1;
1347 }
1348
/*
 * Be somewhat paranoid about memory allocation failures here, since lack of
 * memory may well be the reason for the memory add in the first place.
 */
int memory_add(unsigned long spfn, unsigned long epfn, unsigned int pxm)
1354 {
1355 struct mem_hotadd_info info;
1356 int ret;
1357 nodeid_t node;
1358 unsigned long old_max = max_page, old_total = total_pages;
1359 unsigned long old_node_start, old_node_span, orig_online;
1360 unsigned long i;
1361
1362 dprintk(XENLOG_INFO, "memory_add %lx ~ %lx with pxm %x\n", spfn, epfn, pxm);
1363
1364 if ( !mem_hotadd_check(spfn, epfn) )
1365 return -EINVAL;
1366
1367 if ( (node = setup_node(pxm)) == NUMA_NO_NODE )
1368 return -EINVAL;
1369
1370 if ( !valid_numa_range(spfn << PAGE_SHIFT, epfn << PAGE_SHIFT, node) )
1371 {
1372 printk(XENLOG_WARNING
1373 "pfn range %lx..%lx PXM %x node %x is not NUMA-valid\n",
1374 spfn, epfn, pxm, node);
1375 return -EINVAL;
1376 }
1377
1378 i = virt_to_mfn(HYPERVISOR_VIRT_END - 1) + 1;
1379 if ( spfn < i )
1380 {
1381 ret = map_pages_to_xen((unsigned long)mfn_to_virt(spfn), _mfn(spfn),
1382 min(epfn, i) - spfn, PAGE_HYPERVISOR);
1383 if ( ret )
1384 goto destroy_directmap;
1385 }
1386 if ( i < epfn )
1387 {
1388 if ( i < spfn )
1389 i = spfn;
1390 ret = map_pages_to_xen((unsigned long)mfn_to_virt(i), _mfn(i),
1391 epfn - i, __PAGE_HYPERVISOR_RW);
1392 if ( ret )
1393 goto destroy_directmap;
1394 }
1395
1396 old_node_start = node_start_pfn(node);
1397 old_node_span = node_spanned_pages(node);
1398 orig_online = node_online(node);
1399
1400 if ( !orig_online )
1401 {
1402 dprintk(XENLOG_WARNING, "node %x pxm %x is not online\n",node, pxm);
1403 NODE_DATA(node)->node_start_pfn = spfn;
1404 NODE_DATA(node)->node_spanned_pages =
1405 epfn - node_start_pfn(node);
1406 node_set_online(node);
1407 }
1408 else
1409 {
1410 if (node_start_pfn(node) > spfn)
1411 NODE_DATA(node)->node_start_pfn = spfn;
1412 if (node_end_pfn(node) < epfn)
1413 NODE_DATA(node)->node_spanned_pages = epfn - node_start_pfn(node);
1414 }
1415
1416 info.spfn = spfn;
1417 info.epfn = epfn;
1418 info.cur = spfn;
1419
1420 ret = extend_frame_table(&info);
1421 if (ret)
1422 goto destroy_frametable;
1423
    /* Set max_page, as setup_m2p_table() will use it. */
1425 if (max_page < epfn)
1426 {
1427 max_page = epfn;
1428 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1429 }
1430 total_pages += epfn - spfn;
1431
1432 set_pdx_range(spfn, epfn);
1433 ret = setup_m2p_table(&info);
1434
1435 if ( ret )
1436 goto destroy_m2p;
1437
1438 /*
1439 * If hardware domain has IOMMU mappings but page tables are not
1440 * shared or being kept in sync then newly added memory needs to be
1441 * mapped here.
1442 */
1443 if ( is_iommu_enabled(hardware_domain) &&
1444 !iommu_use_hap_pt(hardware_domain) &&
1445 !need_iommu_pt_sync(hardware_domain) )
1446 {
1447 for ( i = spfn; i < epfn; i++ )
1448 if ( iommu_legacy_map(hardware_domain, _dfn(i), _mfn(i),
1449 PAGE_ORDER_4K,
1450 IOMMUF_readable | IOMMUF_writable) )
1451 break;
1452 if ( i != epfn )
1453 {
1454 while (i-- > old_max)
1455 /* If statement to satisfy __must_check. */
1456 if ( iommu_legacy_unmap(hardware_domain, _dfn(i),
1457 PAGE_ORDER_4K) )
1458 continue;
1459
1460 goto destroy_m2p;
1461 }
1462 }
1463
1464 /* We can't revert any more */
1465 share_hotadd_m2p_table(&info);
1466 transfer_pages_to_heap(&info);
1467
1468 return 0;
1469
1470 destroy_m2p:
1471 destroy_m2p_mapping(&info);
1472 max_page = old_max;
1473 total_pages = old_total;
1474 max_pdx = pfn_to_pdx(max_page - 1) + 1;
1475 destroy_frametable:
1476 cleanup_frame_table(&info);
1477 if ( !orig_online )
1478 node_set_offline(node);
1479 NODE_DATA(node)->node_start_pfn = old_node_start;
1480 NODE_DATA(node)->node_spanned_pages = old_node_span;
1481 destroy_directmap:
1482 destroy_xen_mappings((unsigned long)mfn_to_virt(spfn),
1483 (unsigned long)mfn_to_virt(epfn));
1484
1485 return ret;
1486 }
1487
1488 #include "compat/mm.c"
1489
1490 /*
1491 * Local variables:
1492 * mode: C
1493 * c-file-style: "BSD"
1494 * c-basic-offset: 4
1495 * tab-width: 4
1496 * indent-tabs-mode: nil
1497 * End:
1498 */
1499