1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; If not, see <http://www.gnu.org/licenses/>.
15  *
16  * Copyright (C) Ashok Raj <ashok.raj@intel.com>
17  * Copyright (C) Shaohua Li <shaohua.li@intel.com>
18  * Copyright (C) Allen Kay <allen.m.kay@intel.com> - adapted to xen
19  */
20 
21 #include <xen/irq.h>
22 #include <xen/sched.h>
23 #include <xen/xmalloc.h>
24 #include <xen/domain_page.h>
25 #include <xen/iocap.h>
26 #include <xen/iommu.h>
27 #include <xen/numa.h>
28 #include <xen/softirq.h>
29 #include <xen/time.h>
30 #include <xen/pci.h>
31 #include <xen/pci_regs.h>
32 #include <xen/keyhandler.h>
33 #include <asm/msi.h>
34 #include <asm/irq.h>
35 #include <asm/hvm/vmx/vmx.h>
36 #include <asm/p2m.h>
37 #include <mach_apic.h>
38 #include "iommu.h"
39 #include "dmar.h"
40 #include "extern.h"
41 #include "vtd.h"
42 #include "../ats.h"
43 
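/* Book-keeping for RMRR regions already identity-mapped into a domain. */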
44 struct mapped_rmrr {
45     struct list_head list;
46     u64 base, end;
47     unsigned int count;
48 };
49 
50 /* Possible unfiltered LAPIC/MSI messages from untrusted sources? */
51 bool __read_mostly untrusted_msi;
52 
53 int nr_iommus;
54 
55 static struct tasklet vtd_fault_tasklet;
56 
57 static int setup_hwdom_device(u8 devfn, struct pci_dev *);
58 static void setup_hwdom_rmrr(struct domain *d);
59 
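/* Look up the hardware domain id this IOMMU uses for @d; returns -1 if none. */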
60 static int domain_iommu_domid(struct domain *d,
61                               struct iommu *iommu)
62 {
63     unsigned long nr_dom, i;
64 
65     nr_dom = cap_ndoms(iommu->cap);
66     i = find_first_bit(iommu->domid_bitmap, nr_dom);
67     while ( i < nr_dom )
68     {
69         if ( iommu->domid_map[i] == d->domain_id )
70             return i;
71 
72         i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
73     }
74 
75     dprintk(XENLOG_ERR VTDPREFIX,
76             "Cannot get valid iommu domid: domid=%d iommu->index=%d\n",
77             d->domain_id, iommu->index);
78     return -1;
79 }
80 
81 #define DID_FIELD_WIDTH 16
82 #define DID_HIGH_OFFSET 8
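/*
 * Allocate (or reuse) a hardware domain id for @d on this IOMMU and write it
 * into the context entry's DID field (16 bits starting at bit 8 of the high
 * qword).
 */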
83 static int context_set_domain_id(struct context_entry *context,
84                                  struct domain *d,
85                                  struct iommu *iommu)
86 {
87     unsigned long nr_dom, i;
88     int found = 0;
89 
90     ASSERT(spin_is_locked(&iommu->lock));
91 
92     nr_dom = cap_ndoms(iommu->cap);
93     i = find_first_bit(iommu->domid_bitmap, nr_dom);
94     while ( i < nr_dom )
95     {
96         if ( iommu->domid_map[i] == d->domain_id )
97         {
98             found = 1;
99             break;
100         }
101         i = find_next_bit(iommu->domid_bitmap, nr_dom, i+1);
102     }
103 
104     if ( found == 0 )
105     {
106         i = find_first_zero_bit(iommu->domid_bitmap, nr_dom);
107         if ( i >= nr_dom )
108         {
109             dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no free domain ids\n");
110             return -EFAULT;
111         }
112         iommu->domid_map[i] = d->domain_id;
113     }
114 
115     set_bit(i, iommu->domid_bitmap);
116     context->hi |= (i & ((1 << DID_FIELD_WIDTH) - 1)) << DID_HIGH_OFFSET;
117     return 0;
118 }
119 
120 static int context_get_domain_id(struct context_entry *context,
121                                  struct iommu *iommu)
122 {
123     unsigned long dom_index, nr_dom;
124     int domid = -1;
125 
126     if (iommu && context)
127     {
128         nr_dom = cap_ndoms(iommu->cap);
129 
130         dom_index = context_domain_id(*context);
131 
132         if ( dom_index < nr_dom && iommu->domid_map )
133             domid = iommu->domid_map[dom_index];
134         else
135             dprintk(XENLOG_DEBUG VTDPREFIX,
136                     "dom_index %lu exceeds nr_dom %lu or iommu has no domid_map\n",
137                     dom_index, nr_dom);
138     }
139     return domid;
140 }
141 
142 static struct intel_iommu *__init alloc_intel_iommu(void)
143 {
144     struct intel_iommu *intel;
145 
146     intel = xzalloc(struct intel_iommu);
147     if ( intel == NULL )
148         return NULL;
149 
150     spin_lock_init(&intel->ir_ctrl.iremap_lock);
151 
152     return intel;
153 }
154 
155 static void __init free_intel_iommu(struct intel_iommu *intel)
156 {
157     xfree(intel);
158 }
159 
160 static int iommus_incoherent;
161 static void __iommu_flush_cache(void *addr, unsigned int size)
162 {
163     int i;
164     static unsigned int clflush_size = 0;
165 
166     if ( !iommus_incoherent )
167         return;
168 
169     if ( clflush_size == 0 )
170         clflush_size = get_cache_line_size();
171 
172     for ( i = 0; i < size; i += clflush_size )
173         cacheline_flush((char *)addr + i);
174 }
175 
176 void iommu_flush_cache_entry(void *addr, unsigned int size)
177 {
178     __iommu_flush_cache(addr, size);
179 }
180 
181 void iommu_flush_cache_page(void *addr, unsigned long npages)
182 {
183     __iommu_flush_cache(addr, PAGE_SIZE * npages);
184 }
185 
186 /* Allocate page table, return its machine address */
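/*
 * Pages are taken from the NUMA node of the IOMMU (via its ACPI RHSA entry)
 * when known, zeroed, and flushed for non-coherent IOMMUs.
 */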
187 u64 alloc_pgtable_maddr(struct acpi_drhd_unit *drhd, unsigned long npages)
188 {
189     struct acpi_rhsa_unit *rhsa;
190     struct page_info *pg, *cur_pg;
191     u64 *vaddr;
192     nodeid_t node = NUMA_NO_NODE;
193     unsigned int i;
194 
195     rhsa = drhd_to_rhsa(drhd);
196     if ( rhsa )
197         node = pxm_to_node(rhsa->proximity_domain);
198 
199     pg = alloc_domheap_pages(NULL, get_order_from_pages(npages),
200                              (node == NUMA_NO_NODE) ? 0 : MEMF_node(node));
201     if ( !pg )
202         return 0;
203 
204     cur_pg = pg;
205     for ( i = 0; i < npages; i++ )
206     {
207         vaddr = __map_domain_page(cur_pg);
208         memset(vaddr, 0, PAGE_SIZE);
209 
210         iommu_flush_cache_page(vaddr, 1);
211         unmap_domain_page(vaddr);
212         cur_pg++;
213     }
214 
215     return page_to_maddr(pg);
216 }
217 
218 void free_pgtable_maddr(u64 maddr)
219 {
220     if ( maddr != 0 )
221         free_domheap_page(maddr_to_page(maddr));
222 }
223 
224 /* context entry handling */
225 static u64 bus_to_context_maddr(struct iommu *iommu, u8 bus)
226 {
227     struct acpi_drhd_unit *drhd;
228     struct root_entry *root, *root_entries;
229     u64 maddr;
230 
231     ASSERT(spin_is_locked(&iommu->lock));
232     root_entries = (struct root_entry *)map_vtd_domain_page(iommu->root_maddr);
233     root = &root_entries[bus];
234     if ( !root_present(*root) )
235     {
236         drhd = iommu_to_drhd(iommu);
237         maddr = alloc_pgtable_maddr(drhd, 1);
238         if ( maddr == 0 )
239         {
240             unmap_vtd_domain_page(root_entries);
241             return 0;
242         }
243         set_root_value(*root, maddr);
244         set_root_present(*root);
245         iommu_flush_cache_entry(root, sizeof(struct root_entry));
246     }
247     maddr = (u64) get_context_addr(*root);
248     unmap_vtd_domain_page(root_entries);
249     return maddr;
250 }
251 
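/*
 * Walk (and, if @alloc is set, populate) the VT-d page table for @addr,
 * returning the machine address of the last-level page table page, or 0 on
 * failure.  The caller must hold the domain's mapping_lock.
 */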
252 static u64 addr_to_dma_page_maddr(struct domain *domain, u64 addr, int alloc)
253 {
254     struct acpi_drhd_unit *drhd;
255     struct pci_dev *pdev;
256     struct domain_iommu *hd = dom_iommu(domain);
257     int addr_width = agaw_to_width(hd->arch.agaw);
258     struct dma_pte *parent, *pte = NULL;
259     int level = agaw_to_level(hd->arch.agaw);
260     int offset;
261     u64 pte_maddr = 0;
262 
263     addr &= (((u64)1) << addr_width) - 1;
264     ASSERT(spin_is_locked(&hd->arch.mapping_lock));
265     if ( hd->arch.pgd_maddr == 0 )
266     {
267         /*
268          * Just get any passthrough device in the domain - assume the user
269          * assigns only devices from the same node to a given guest.
270          */
271         pdev = pci_get_pdev_by_domain(domain, -1, -1, -1);
272         drhd = acpi_find_matched_drhd_unit(pdev);
273         if ( !alloc || ((hd->arch.pgd_maddr = alloc_pgtable_maddr(drhd, 1)) == 0) )
274             goto out;
275     }
276 
277     parent = (struct dma_pte *)map_vtd_domain_page(hd->arch.pgd_maddr);
278     while ( level > 1 )
279     {
280         offset = address_level_offset(addr, level);
281         pte = &parent[offset];
282 
283         pte_maddr = dma_pte_addr(*pte);
284         if ( !pte_maddr )
285         {
286             if ( !alloc )
287                 break;
288 
289             pdev = pci_get_pdev_by_domain(domain, -1, -1, -1);
290             drhd = acpi_find_matched_drhd_unit(pdev);
291             pte_maddr = alloc_pgtable_maddr(drhd, 1);
292             if ( !pte_maddr )
293                 break;
294 
295             dma_set_pte_addr(*pte, pte_maddr);
296 
297             /*
298              * Higher level tables always set r/w; the last level
299              * page table controls read/write.
300              */
301             dma_set_pte_readable(*pte);
302             dma_set_pte_writable(*pte);
303             iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
304         }
305 
306         if ( level == 2 )
307             break;
308 
309         unmap_vtd_domain_page(parent);
310         parent = map_vtd_domain_page(pte_maddr);
311         level--;
312     }
313 
314     unmap_vtd_domain_page(parent);
315  out:
316     return pte_maddr;
317 }
318 
319 static void iommu_flush_write_buffer(struct iommu *iommu)
320 {
321     u32 val;
322     unsigned long flags;
323 
324     if ( !rwbf_quirk && !cap_rwbf(iommu->cap) )
325         return;
326 
327     spin_lock_irqsave(&iommu->register_lock, flags);
328     val = dmar_readl(iommu->reg, DMAR_GSTS_REG);
329     dmar_writel(iommu->reg, DMAR_GCMD_REG, val | DMA_GCMD_WBF);
330 
331     /* Make sure the hardware completes it */
332     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
333                   !(val & DMA_GSTS_WBFS), val);
334 
335     spin_unlock_irqrestore(&iommu->register_lock, flags);
336 }
337 
338 /* The return value determines whether a write buffer flush is needed. */
339 static int __must_check flush_context_reg(void *_iommu, u16 did, u16 source_id,
340                                           u8 function_mask, u64 type,
341                                           bool_t flush_non_present_entry)
342 {
343     struct iommu *iommu = (struct iommu *) _iommu;
344     u64 val = 0;
345     unsigned long flags;
346 
347     /*
348      * In the non-present entry flush case: if the hardware doesn't cache
349      * non-present entries we do nothing; if it does cache them, we flush
350      * the entries of domain 0 (whose domain id is used to tag any
351      * non-present entries).
352      */
353     if ( flush_non_present_entry )
354     {
355         if ( !cap_caching_mode(iommu->cap) )
356             return 1;
357         else
358             did = 0;
359     }
360 
361     /* use register invalidation */
362     switch ( type )
363     {
364     case DMA_CCMD_GLOBAL_INVL:
365         val = DMA_CCMD_GLOBAL_INVL;
366         break;
367     case DMA_CCMD_DOMAIN_INVL:
368         val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
369         break;
370     case DMA_CCMD_DEVICE_INVL:
371         val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
372             |DMA_CCMD_SID(source_id)|DMA_CCMD_FM(function_mask);
373         break;
374     default:
375         BUG();
376     }
377     val |= DMA_CCMD_ICC;
378 
379     spin_lock_irqsave(&iommu->register_lock, flags);
380     dmar_writeq(iommu->reg, DMAR_CCMD_REG, val);
381 
382     /* Make sure the hardware completes it */
383     IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG, dmar_readq,
384                   !(val & DMA_CCMD_ICC), val);
385 
386     spin_unlock_irqrestore(&iommu->register_lock, flags);
387     /* flush context entry will implicitly flush write buffer */
388     return 0;
389 }
390 
391 static int __must_check iommu_flush_context_global(struct iommu *iommu,
392                                                    bool_t flush_non_present_entry)
393 {
394     struct iommu_flush *flush = iommu_get_flush(iommu);
395     return flush->context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
396                                  flush_non_present_entry);
397 }
398 
399 static int __must_check iommu_flush_context_device(struct iommu *iommu,
400                                                    u16 did, u16 source_id,
401                                                    u8 function_mask,
402                                                    bool_t flush_non_present_entry)
403 {
404     struct iommu_flush *flush = iommu_get_flush(iommu);
405     return flush->context(iommu, did, source_id, function_mask,
406                                  DMA_CCMD_DEVICE_INVL,
407                                  flush_non_present_entry);
408 }
409 
410 /* The return value determines whether a write buffer flush is needed. */
411 static int __must_check flush_iotlb_reg(void *_iommu, u16 did, u64 addr,
412                                         unsigned int size_order, u64 type,
413                                         bool_t flush_non_present_entry,
414                                         bool_t flush_dev_iotlb)
415 {
416     struct iommu *iommu = (struct iommu *) _iommu;
417     int tlb_offset = ecap_iotlb_offset(iommu->ecap);
418     u64 val = 0;
419     unsigned long flags;
420 
421     /*
422      * In the non-present entry flush case: if the hardware doesn't cache
423      * non-present entries we do nothing; if it does cache them, we flush
424      * the entries of domain 0 (whose domain id is used to tag any
425      * non-present entries).
426      */
427     if ( flush_non_present_entry )
428     {
429         if ( !cap_caching_mode(iommu->cap) )
430             return 1;
431         else
432             did = 0;
433     }
434 
435     /* use register invalidation */
436     switch ( type )
437     {
438     case DMA_TLB_GLOBAL_FLUSH:
439         val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
440         break;
441     case DMA_TLB_DSI_FLUSH:
442         val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
443         break;
444     case DMA_TLB_PSI_FLUSH:
445         val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
446         break;
447     default:
448         BUG();
449     }
450     /* Note: set drain read/write */
451     if ( cap_read_drain(iommu->cap) )
452         val |= DMA_TLB_READ_DRAIN;
453     if ( cap_write_drain(iommu->cap) )
454         val |= DMA_TLB_WRITE_DRAIN;
455 
456     spin_lock_irqsave(&iommu->register_lock, flags);
457     /* Note: Only uses first TLB reg currently */
458     if ( type == DMA_TLB_PSI_FLUSH )
459     {
460         /* Note: always flush non-leaf currently. */
461         dmar_writeq(iommu->reg, tlb_offset, size_order | addr);
462     }
463     dmar_writeq(iommu->reg, tlb_offset + 8, val);
464 
465     /* Make sure the hardware completes it */
466     IOMMU_WAIT_OP(iommu, (tlb_offset + 8), dmar_readq,
467                   !(val & DMA_TLB_IVT), val);
468     spin_unlock_irqrestore(&iommu->register_lock, flags);
469 
470     /* check IOTLB invalidation granularity */
471     if ( DMA_TLB_IAIG(val) == 0 )
472         dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: flush IOTLB failed\n");
473 
474     /* flush iotlb entry will implicitly flush write buffer */
475     return 0;
476 }
477 
478 static int __must_check iommu_flush_iotlb_global(struct iommu *iommu,
479                                                  bool_t flush_non_present_entry,
480                                                  bool_t flush_dev_iotlb)
481 {
482     struct iommu_flush *flush = iommu_get_flush(iommu);
483     int status;
484 
485     /* apply platform specific errata workarounds */
486     vtd_ops_preamble_quirk(iommu);
487 
488     status = flush->iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
489                         flush_non_present_entry, flush_dev_iotlb);
490 
491     /* undo platform specific errata workarounds */
492     vtd_ops_postamble_quirk(iommu);
493 
494     return status;
495 }
496 
497 static int __must_check iommu_flush_iotlb_dsi(struct iommu *iommu, u16 did,
498                                               bool_t flush_non_present_entry,
499                                               bool_t flush_dev_iotlb)
500 {
501     struct iommu_flush *flush = iommu_get_flush(iommu);
502     int status;
503 
504     /* apply platform specific errata workarounds */
505     vtd_ops_preamble_quirk(iommu);
506 
507     status = flush->iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
508                         flush_non_present_entry, flush_dev_iotlb);
509 
510     /* undo platform specific errata workarounds */
511     vtd_ops_postamble_quirk(iommu);
512 
513     return status;
514 }
515 
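/*
 * Page-selective IOTLB flush; falls back to a domain-selective flush when the
 * IOMMU lacks PSI support or @order exceeds its maximum address mask.
 */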
516 static int __must_check iommu_flush_iotlb_psi(struct iommu *iommu, u16 did,
517                                               u64 addr, unsigned int order,
518                                               bool_t flush_non_present_entry,
519                                               bool_t flush_dev_iotlb)
520 {
521     struct iommu_flush *flush = iommu_get_flush(iommu);
522     int status;
523 
524     ASSERT(!(addr & (~PAGE_MASK_4K)));
525 
526     /* Fallback to domain selective flush if no PSI support */
527     if ( !cap_pgsel_inv(iommu->cap) )
528         return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
529 
530     /* Fallback to domain selective flush if size is too big */
531     if ( order > cap_max_amask_val(iommu->cap) )
532         return iommu_flush_iotlb_dsi(iommu, did, flush_non_present_entry, flush_dev_iotlb);
533 
534     addr >>= PAGE_SHIFT_4K + order;
535     addr <<= PAGE_SHIFT_4K + order;
536 
537     /* apply platform specific errata workarounds */
538     vtd_ops_preamble_quirk(iommu);
539 
540     status = flush->iotlb(iommu, did, addr, order, DMA_TLB_PSI_FLUSH,
541                         flush_non_present_entry, flush_dev_iotlb);
542 
543     /* undo platform specific errata workarounds */
544     vtd_ops_postamble_quirk(iommu);
545 
546     return status;
547 }
548 
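/* Flush the context caches and IOTLBs of every IOMMU in the system. */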
549 static int __must_check iommu_flush_all(void)
550 {
551     struct acpi_drhd_unit *drhd;
552     struct iommu *iommu;
553     bool_t flush_dev_iotlb;
554     int rc = 0;
555 
556     flush_all_cache();
557     for_each_drhd_unit ( drhd )
558     {
559         int context_rc, iotlb_rc;
560 
561         iommu = drhd->iommu;
562         context_rc = iommu_flush_context_global(iommu, 0);
563         flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
564         iotlb_rc = iommu_flush_iotlb_global(iommu, 0, flush_dev_iotlb);
565 
566         /*
567          * The current logic for returns:
568          *   - positive  invoke iommu_flush_write_buffer to flush cache.
569          *   - zero      on success.
570          *   - negative  on failure. Continue to flush IOMMU IOTLB on a
571          *               best effort basis.
572          */
573         if ( context_rc > 0 || iotlb_rc > 0 )
574             iommu_flush_write_buffer(iommu);
575         if ( rc >= 0 )
576             rc = context_rc;
577         if ( rc >= 0 )
578             rc = iotlb_rc;
579     }
580 
581     if ( rc > 0 )
582         rc = 0;
583 
584     return rc;
585 }
586 
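/*
 * Flush the IOTLB entries covering @gfn on every IOMMU serving @d; a
 * page-selective flush is used for single pages, a domain-selective flush
 * otherwise.
 */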
587 static int __must_check iommu_flush_iotlb(struct domain *d,
588                                           unsigned long gfn,
589                                           bool_t dma_old_pte_present,
590                                           unsigned int page_count)
591 {
592     struct domain_iommu *hd = dom_iommu(d);
593     struct acpi_drhd_unit *drhd;
594     struct iommu *iommu;
595     bool_t flush_dev_iotlb;
596     int iommu_domid;
597     int rc = 0;
598 
599     /*
600      * No need for pcidevs_lock here because we flush
601      * when assigning/deassigning a device.
602      */
603     for_each_drhd_unit ( drhd )
604     {
605         iommu = drhd->iommu;
606 
607         if ( !test_bit(iommu->index, &hd->arch.iommu_bitmap) )
608             continue;
609 
610         flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
611         iommu_domid = domain_iommu_domid(d, iommu);
612         if ( iommu_domid == -1 )
613             continue;
614 
615         if ( page_count != 1 || gfn == gfn_x(INVALID_GFN) )
616             rc = iommu_flush_iotlb_dsi(iommu, iommu_domid,
617                                        0, flush_dev_iotlb);
618         else
619             rc = iommu_flush_iotlb_psi(iommu, iommu_domid,
620                                        (paddr_t)gfn << PAGE_SHIFT_4K,
621                                        PAGE_ORDER_4K,
622                                        !dma_old_pte_present,
623                                        flush_dev_iotlb);
624 
625         if ( rc > 0 )
626         {
627             iommu_flush_write_buffer(iommu);
628             rc = 0;
629         }
630     }
631 
632     return rc;
633 }
634 
635 static int __must_check iommu_flush_iotlb_pages(struct domain *d,
636                                                 unsigned long gfn,
637                                                 unsigned int page_count)
638 {
639     return iommu_flush_iotlb(d, gfn, 1, page_count);
640 }
641 
642 static int __must_check iommu_flush_iotlb_all(struct domain *d)
643 {
644     return iommu_flush_iotlb(d, gfn_x(INVALID_GFN), 0, 0);
645 }
646 
647 /* clear one page's page table */
648 static int __must_check dma_pte_clear_one(struct domain *domain, u64 addr)
649 {
650     struct domain_iommu *hd = dom_iommu(domain);
651     struct dma_pte *page = NULL, *pte = NULL;
652     u64 pg_maddr;
653     int rc = 0;
654 
655     spin_lock(&hd->arch.mapping_lock);
656     /* get last level pte */
657     pg_maddr = addr_to_dma_page_maddr(domain, addr, 0);
658     if ( pg_maddr == 0 )
659     {
660         spin_unlock(&hd->arch.mapping_lock);
661         return 0;
662     }
663 
664     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
665     pte = page + address_level_offset(addr, 1);
666 
667     if ( !dma_pte_present(*pte) )
668     {
669         spin_unlock(&hd->arch.mapping_lock);
670         unmap_vtd_domain_page(page);
671         return 0;
672     }
673 
674     dma_clear_pte(*pte);
675     spin_unlock(&hd->arch.mapping_lock);
676     iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
677 
678     if ( !this_cpu(iommu_dont_flush_iotlb) )
679         rc = iommu_flush_iotlb_pages(domain, addr >> PAGE_SHIFT_4K, 1);
680 
681     unmap_vtd_domain_page(page);
682 
683     return rc;
684 }
685 
686 static void iommu_free_pagetable(u64 pt_maddr, int level)
687 {
688     struct page_info *pg = maddr_to_page(pt_maddr);
689 
690     if ( pt_maddr == 0 )
691         return;
692 
693     PFN_ORDER(pg) = level;
694     spin_lock(&iommu_pt_cleanup_lock);
695     page_list_add_tail(pg, &iommu_pt_cleanup_list);
696     spin_unlock(&iommu_pt_cleanup_lock);
697 }
698 
699 static void iommu_free_page_table(struct page_info *pg)
700 {
701     unsigned int i, next_level = PFN_ORDER(pg) - 1;
702     u64 pt_maddr = page_to_maddr(pg);
703     struct dma_pte *pt_vaddr, *pte;
704 
705     PFN_ORDER(pg) = 0;
706     pt_vaddr = (struct dma_pte *)map_vtd_domain_page(pt_maddr);
707 
708     for ( i = 0; i < PTE_NUM; i++ )
709     {
710         pte = &pt_vaddr[i];
711         if ( !dma_pte_present(*pte) )
712             continue;
713 
714         if ( next_level >= 1 )
715             iommu_free_pagetable(dma_pte_addr(*pte), next_level);
716 
717         dma_clear_pte(*pte);
718         iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
719     }
720 
721     unmap_vtd_domain_page(pt_vaddr);
722     free_pgtable_maddr(pt_maddr);
723 }
724 
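/*
 * Program the root-entry table's machine address into RTADDR and latch it
 * with the SRTP command, waiting for the hardware to report RTPS.
 */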
725 static int iommu_set_root_entry(struct iommu *iommu)
726 {
727     u32 sts;
728     unsigned long flags;
729 
730     spin_lock_irqsave(&iommu->register_lock, flags);
731     dmar_writeq(iommu->reg, DMAR_RTADDR_REG, iommu->root_maddr);
732 
733     sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
734     dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_SRTP);
735 
736     /* Make sure the hardware completes it */
737     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
738                   (sts & DMA_GSTS_RTPS), sts);
739     spin_unlock_irqrestore(&iommu->register_lock, flags);
740 
741     return 0;
742 }
743 
744 static void iommu_enable_translation(struct acpi_drhd_unit *drhd)
745 {
746     u32 sts;
747     unsigned long flags;
748     struct iommu *iommu = drhd->iommu;
749 
750     if ( is_igd_drhd(drhd) )
751     {
752         if ( !iommu_igfx )
753         {
754             printk(XENLOG_INFO VTDPREFIX
755                    "Passed iommu=no-igfx option.  Disabling IGD VT-d engine.\n");
756             return;
757         }
758 
759         if ( !is_igd_vt_enabled_quirk() )
760         {
761             if ( force_iommu )
762                 panic("BIOS did not enable IGD for VT properly, crash Xen for security purpose");
763 
764             printk(XENLOG_WARNING VTDPREFIX
765                    "BIOS did not enable IGD for VT properly.  Disabling IGD VT-d engine.\n");
766             return;
767         }
768     }
769 
770     /* apply platform specific errata workarounds */
771     vtd_ops_preamble_quirk(iommu);
772 
773     if ( iommu_verbose )
774         printk(VTDPREFIX "iommu_enable_translation: iommu->reg = %p\n",
775                iommu->reg);
776     spin_lock_irqsave(&iommu->register_lock, flags);
777     sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
778     dmar_writel(iommu->reg, DMAR_GCMD_REG, sts | DMA_GCMD_TE);
779 
780     /* Make sure the hardware completes it */
781     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
782                   (sts & DMA_GSTS_TES), sts);
783     spin_unlock_irqrestore(&iommu->register_lock, flags);
784 
785     /* undo platform specific errata workarounds */
786     vtd_ops_postamble_quirk(iommu);
787 
788     /* Disable PMRs when VT-d engine takes effect per spec definition */
789     disable_pmr(iommu);
790 }
791 
792 static void iommu_disable_translation(struct iommu *iommu)
793 {
794     u32 sts;
795     unsigned long flags;
796 
797     /* apply platform specific errata workarounds */
798     vtd_ops_preamble_quirk(iommu);
799 
800     spin_lock_irqsave(&iommu->register_lock, flags);
801     sts = dmar_readl(iommu->reg, DMAR_GSTS_REG);
802     dmar_writel(iommu->reg, DMAR_GCMD_REG, sts & (~DMA_GCMD_TE));
803 
804     /* Make sure the hardware completes it */
805     IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, dmar_readl,
806                   !(sts & DMA_GSTS_TES), sts);
807     spin_unlock_irqrestore(&iommu->register_lock, flags);
808 
809     /* undo platform specific errata workarounds */
810     vtd_ops_postamble_quirk(iommu);
811 }
812 
813 enum faulttype {
814     DMA_REMAP,
815     INTR_REMAP,
816     UNKNOWN,
817 };
818 
819 static const char *dma_remap_fault_reasons[] =
820 {
821     "Software",
822     "Present bit in root entry is clear",
823     "Present bit in context entry is clear",
824     "Invalid context entry",
825     "Access beyond MGAW",
826     "PTE Write access is not set",
827     "PTE Read access is not set",
828     "Next page table ptr is invalid",
829     "Root table address invalid",
830     "Context table ptr is invalid",
831     "non-zero reserved fields in RTP",
832     "non-zero reserved fields in CTP",
833     "non-zero reserved fields in PTE",
834     "Blocked a DMA translation request",
835 };
836 
837 static const char *intr_remap_fault_reasons[] =
838 {
839     "Detected reserved fields in the decoded interrupt-remapped request",
840     "Interrupt index exceeded the interrupt-remapping table size",
841     "Present field in the IRTE entry is clear",
842     "Error accessing interrupt-remapping table pointed by IRTA_REG",
843     "Detected reserved fields in the IRTE entry",
844     "Blocked a compatibility format interrupt request",
845     "Blocked an interrupt request due to source-id verification failure",
846 };
847 
848 static const char *iommu_get_fault_reason(u8 fault_reason,
849                                           enum faulttype *fault_type)
850 {
851     if ( fault_reason >= 0x20 && ( fault_reason < 0x20 +
852                 ARRAY_SIZE(intr_remap_fault_reasons)) )
853     {
854         *fault_type = INTR_REMAP;
855         return intr_remap_fault_reasons[fault_reason - 0x20];
856     }
857     else if ( fault_reason < ARRAY_SIZE(dma_remap_fault_reasons) )
858     {
859         *fault_type = DMA_REMAP;
860         return dma_remap_fault_reasons[fault_reason];
861     }
862     else
863     {
864         *fault_type = UNKNOWN;
865         return "Unknown";
866     }
867 }
868 
869 static int iommu_page_fault_do_one(struct iommu *iommu, int type,
870                                    u8 fault_reason, u16 source_id, u64 addr)
871 {
872     const char *reason, *kind;
873     enum faulttype fault_type;
874     u16 seg = iommu->intel->drhd->segment;
875 
876     reason = iommu_get_fault_reason(fault_reason, &fault_type);
877     switch ( fault_type )
878     {
879     case DMA_REMAP:
880         printk(XENLOG_G_WARNING VTDPREFIX
881                "DMAR:[%s] Request device [%04x:%02x:%02x.%u] "
882                "fault addr %"PRIx64", iommu reg = %p\n",
883                (type ? "DMA Read" : "DMA Write"),
884                seg, PCI_BUS(source_id), PCI_SLOT(source_id),
885                PCI_FUNC(source_id), addr, iommu->reg);
886         kind = "DMAR";
887         break;
888     case INTR_REMAP:
889         printk(XENLOG_G_WARNING VTDPREFIX
890                "INTR-REMAP: Request device [%04x:%02x:%02x.%u] "
891                "fault index %"PRIx64", iommu reg = %p\n",
892                seg, PCI_BUS(source_id), PCI_SLOT(source_id),
893                PCI_FUNC(source_id), addr >> 48, iommu->reg);
894         kind = "INTR-REMAP";
895         break;
896     default:
897         printk(XENLOG_G_WARNING VTDPREFIX
898                "UNKNOWN: Request device [%04x:%02x:%02x.%u] "
899                "fault addr %"PRIx64", iommu reg = %p\n",
900                seg, PCI_BUS(source_id), PCI_SLOT(source_id),
901                PCI_FUNC(source_id), addr, iommu->reg);
902         kind = "UNKNOWN";
903         break;
904     }
905 
906     printk(XENLOG_G_WARNING VTDPREFIX "%s: reason %02x - %s\n",
907            kind, fault_reason, reason);
908 
909     if ( iommu_verbose && fault_type == DMA_REMAP )
910         print_vtd_entries(iommu, PCI_BUS(source_id), PCI_DEVFN2(source_id),
911                           addr >> PAGE_SHIFT);
912 
913     return 0;
914 }
915 
916 static void iommu_fault_status(u32 fault_status)
917 {
918     if ( fault_status & DMA_FSTS_PFO )
919         INTEL_IOMMU_DEBUG("iommu_fault_status: Fault Overflow\n");
920     if ( fault_status & DMA_FSTS_PPF )
921         INTEL_IOMMU_DEBUG("iommu_fault_status: Primary Pending Fault\n");
922     if ( fault_status & DMA_FSTS_AFO )
923         INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Fault Overflow\n");
924     if ( fault_status & DMA_FSTS_APF )
925         INTEL_IOMMU_DEBUG("iommu_fault_status: Advanced Pending Fault\n");
926     if ( fault_status & DMA_FSTS_IQE )
927         INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Queue Error\n");
928     if ( fault_status & DMA_FSTS_ICE )
929         INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Completion Error\n");
930     if ( fault_status & DMA_FSTS_ITE )
931         INTEL_IOMMU_DEBUG("iommu_fault_status: Invalidation Time-out Error\n");
932 }
933 
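/*
 * Each primary fault recording register is 128 bits (16 bytes) wide.  Walk
 * the fault records starting from the index reported in FSTS, logging and
 * clearing each pending fault, then clear any primary fault overflow.
 */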
934 #define PRIMARY_FAULT_REG_LEN (16)
935 static void __do_iommu_page_fault(struct iommu *iommu)
936 {
937     int reg, fault_index;
938     u32 fault_status;
939     unsigned long flags;
940 
941     fault_status = dmar_readl(iommu->reg, DMAR_FSTS_REG);
942 
943     iommu_fault_status(fault_status);
944 
945     /* FIXME: ignore advanced fault log */
946     if ( !(fault_status & DMA_FSTS_PPF) )
947         goto clear_overflow;
948 
949     fault_index = dma_fsts_fault_record_index(fault_status);
950     reg = cap_fault_reg_offset(iommu->cap);
951     while (1)
952     {
953         u8 fault_reason;
954         u16 source_id;
955         u32 data;
956         u64 guest_addr;
957         int type;
958 
959         /* highest 32 bits */
960         spin_lock_irqsave(&iommu->register_lock, flags);
961         data = dmar_readl(iommu->reg, reg +
962                           fault_index * PRIMARY_FAULT_REG_LEN + 12);
963         if ( !(data & DMA_FRCD_F) )
964         {
965             spin_unlock_irqrestore(&iommu->register_lock, flags);
966             break;
967         }
968 
969         fault_reason = dma_frcd_fault_reason(data);
970         type = dma_frcd_type(data);
971 
972         data = dmar_readl(iommu->reg, reg +
973                           fault_index * PRIMARY_FAULT_REG_LEN + 8);
974         source_id = dma_frcd_source_id(data);
975 
976         guest_addr = dmar_readq(iommu->reg, reg +
977                                 fault_index * PRIMARY_FAULT_REG_LEN);
978         guest_addr = dma_frcd_page_addr(guest_addr);
979         /* clear the fault */
980         dmar_writel(iommu->reg, reg +
981                     fault_index * PRIMARY_FAULT_REG_LEN + 12, DMA_FRCD_F);
982         spin_unlock_irqrestore(&iommu->register_lock, flags);
983 
984         iommu_page_fault_do_one(iommu, type, fault_reason,
985                                 source_id, guest_addr);
986 
987         pci_check_disable_device(iommu->intel->drhd->segment,
988                                  PCI_BUS(source_id), PCI_DEVFN2(source_id));
989 
990         fault_index++;
991         if ( fault_index > cap_num_fault_regs(iommu->cap) )
992             fault_index = 0;
993     }
994 clear_overflow:
995     /* clear primary fault overflow */
996     fault_status = readl(iommu->reg + DMAR_FSTS_REG);
997     if ( fault_status & DMA_FSTS_PFO )
998     {
999         spin_lock_irqsave(&iommu->register_lock, flags);
1000         dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_PFO);
1001         spin_unlock_irqrestore(&iommu->register_lock, flags);
1002     }
1003 }
1004 
1005 static void do_iommu_page_fault(unsigned long data)
1006 {
1007     struct acpi_drhd_unit *drhd;
1008 
1009     if ( list_empty(&acpi_drhd_units) )
1010     {
1011        INTEL_IOMMU_DEBUG("no device found, something must be very wrong!\n");
1012        return;
1013     }
1014 
1015     /*
1016      * No matter which IOMMU the interrupt came from, check all the
1017      * IOMMUs present in the system. This allows for having just one
1018      * tasklet (instead of one per IOMMU) and should be more than
1019      * fine, considering how rare fault events should be.
1020      */
1021     for_each_drhd_unit ( drhd )
1022         __do_iommu_page_fault(drhd->iommu);
1023 }
1024 
1025 static void iommu_page_fault(int irq, void *dev_id,
1026                              struct cpu_user_regs *regs)
1027 {
1028     /*
1029      * Just flag the tasklet as runnable. This is fine according to the
1030      * VT-d spec, since a new interrupt won't be generated until we clear all
1031      * the faults that caused this one to happen.
1032      */
1033     tasklet_schedule(&vtd_fault_tasklet);
1034 }
1035 
1036 static void dma_msi_unmask(struct irq_desc *desc)
1037 {
1038     struct iommu *iommu = desc->action->dev_id;
1039     unsigned long flags;
1040     u32 sts;
1041 
1042     /* unmask it */
1043     spin_lock_irqsave(&iommu->register_lock, flags);
1044     sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
1045     sts &= ~DMA_FECTL_IM;
1046     dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
1047     spin_unlock_irqrestore(&iommu->register_lock, flags);
1048     iommu->msi.msi_attrib.host_masked = 0;
1049 }
1050 
1051 static void dma_msi_mask(struct irq_desc *desc)
1052 {
1053     unsigned long flags;
1054     struct iommu *iommu = desc->action->dev_id;
1055     u32 sts;
1056 
1057     /* mask it */
1058     spin_lock_irqsave(&iommu->register_lock, flags);
1059     sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
1060     sts |= DMA_FECTL_IM;
1061     dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
1062     spin_unlock_irqrestore(&iommu->register_lock, flags);
1063     iommu->msi.msi_attrib.host_masked = 1;
1064 }
1065 
1066 static unsigned int dma_msi_startup(struct irq_desc *desc)
1067 {
1068     dma_msi_unmask(desc);
1069     return 0;
1070 }
1071 
1072 static void dma_msi_ack(struct irq_desc *desc)
1073 {
1074     irq_complete_move(desc);
1075     dma_msi_mask(desc);
1076     move_masked_irq(desc);
1077 }
1078 
1079 static void dma_msi_end(struct irq_desc *desc, u8 vector)
1080 {
1081     dma_msi_unmask(desc);
1082     ack_APIC_irq();
1083 }
1084 
1085 static void dma_msi_set_affinity(struct irq_desc *desc, const cpumask_t *mask)
1086 {
1087     struct msi_msg msg;
1088     unsigned int dest;
1089     unsigned long flags;
1090     struct iommu *iommu = desc->action->dev_id;
1091 
1092     dest = set_desc_affinity(desc, mask);
1093     if (dest == BAD_APICID){
1094         dprintk(XENLOG_ERR VTDPREFIX, "Set iommu interrupt affinity error!\n");
1095         return;
1096     }
1097 
1098     msi_compose_msg(desc->arch.vector, NULL, &msg);
1099     msg.dest32 = dest;
1100     if (x2apic_enabled)
1101         msg.address_hi = dest & 0xFFFFFF00;
1102     ASSERT(!(msg.address_lo & MSI_ADDR_DEST_ID_MASK));
1103     msg.address_lo |= MSI_ADDR_DEST_ID(dest);
1104     iommu->msi.msg = msg;
1105 
1106     spin_lock_irqsave(&iommu->register_lock, flags);
1107     dmar_writel(iommu->reg, DMAR_FEDATA_REG, msg.data);
1108     dmar_writeq(iommu->reg, DMAR_FEADDR_REG, msg.address);
1109     spin_unlock_irqrestore(&iommu->register_lock, flags);
1110 }
1111 
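/* IRQ controller callbacks for the IOMMU's fault-reporting MSI. */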
1112 static hw_irq_controller dma_msi_type = {
1113     .typename = "DMA_MSI",
1114     .startup = dma_msi_startup,
1115     .shutdown = dma_msi_mask,
1116     .enable = dma_msi_unmask,
1117     .disable = dma_msi_mask,
1118     .ack = dma_msi_ack,
1119     .end = dma_msi_end,
1120     .set_affinity = dma_msi_set_affinity,
1121 };
1122 
1123 static int __init iommu_set_interrupt(struct acpi_drhd_unit *drhd)
1124 {
1125     int irq, ret;
1126     struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
1127     struct iommu *iommu = drhd->iommu;
1128     struct irq_desc *desc;
1129 
1130     irq = create_irq(rhsa ? pxm_to_node(rhsa->proximity_domain)
1131                           : NUMA_NO_NODE);
1132     if ( irq <= 0 )
1133     {
1134         dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: no irq available!\n");
1135         return -EINVAL;
1136     }
1137 
1138     desc = irq_to_desc(irq);
1139     desc->handler = &dma_msi_type;
1140     ret = request_irq(irq, 0, iommu_page_fault, "dmar", iommu);
1141     if ( ret )
1142     {
1143         desc->handler = &no_irq_type;
1144         destroy_irq(irq);
1145         dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: can't request irq\n");
1146         return ret;
1147     }
1148 
1149     iommu->msi.irq = irq;
1150     iommu->msi.msi_attrib.pos = MSI_TYPE_IOMMU;
1151     iommu->msi.msi_attrib.maskbit = 1;
1152     iommu->msi.msi_attrib.is_64 = 1;
1153     desc->msi_desc = &iommu->msi;
1154 
1155     return 0;
1156 }
1157 
1158 int __init iommu_alloc(struct acpi_drhd_unit *drhd)
1159 {
1160     struct iommu *iommu;
1161     unsigned long sagaw, nr_dom;
1162     int agaw;
1163 
1164     if ( nr_iommus > MAX_IOMMUS )
1165     {
1166         dprintk(XENLOG_ERR VTDPREFIX,
1167                  "IOMMU: nr_iommus %d > MAX_IOMMUS\n", nr_iommus);
1168         return -ENOMEM;
1169     }
1170 
1171     iommu = xzalloc(struct iommu);
1172     if ( iommu == NULL )
1173         return -ENOMEM;
1174 
1175     iommu->msi.irq = -1; /* No irq assigned yet. */
1176     INIT_LIST_HEAD(&iommu->ats_devices);
1177 
1178     iommu->intel = alloc_intel_iommu();
1179     if ( iommu->intel == NULL )
1180     {
1181         xfree(iommu);
1182         return -ENOMEM;
1183     }
1184     iommu->intel->drhd = drhd;
1185     drhd->iommu = iommu;
1186 
1187     if ( !(iommu->root_maddr = alloc_pgtable_maddr(drhd, 1)) )
1188         return -ENOMEM;
1189 
1190     iommu->reg = ioremap(drhd->address, PAGE_SIZE);
1191     if ( !iommu->reg )
1192         return -ENOMEM;
1193     iommu->index = nr_iommus++;
1194 
1195     iommu->cap = dmar_readq(iommu->reg, DMAR_CAP_REG);
1196     iommu->ecap = dmar_readq(iommu->reg, DMAR_ECAP_REG);
1197 
1198     if ( iommu_verbose )
1199     {
1200         printk(VTDPREFIX "drhd->address = %"PRIx64" iommu->reg = %p\n",
1201                drhd->address, iommu->reg);
1202         printk(VTDPREFIX "cap = %"PRIx64" ecap = %"PRIx64"\n",
1203                iommu->cap, iommu->ecap);
1204     }
1205     if ( !(iommu->cap + 1) || !(iommu->ecap + 1) )
1206         return -ENODEV;
1207 
1208     if ( cap_fault_reg_offset(iommu->cap) +
1209          cap_num_fault_regs(iommu->cap) * PRIMARY_FAULT_REG_LEN >= PAGE_SIZE ||
1210          ecap_iotlb_offset(iommu->ecap) >= PAGE_SIZE )
1211     {
1212         printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported\n");
1213         print_iommu_regs(drhd);
1214         return -ENODEV;
1215     }
1216 
1217     /* Calculate number of pagetable levels: between 2 and 4. */
1218     sagaw = cap_sagaw(iommu->cap);
1219     for ( agaw = level_to_agaw(4); agaw >= 0; agaw-- )
1220         if ( test_bit(agaw, &sagaw) )
1221             break;
1222     if ( agaw < 0 )
1223     {
1224         printk(XENLOG_ERR VTDPREFIX "IOMMU: unsupported sagaw %lx\n", sagaw);
1225         print_iommu_regs(drhd);
1226         return -ENODEV;
1227     }
1228     iommu->nr_pt_levels = agaw_to_level(agaw);
1229 
1230     if ( !ecap_coherent(iommu->ecap) )
1231         iommus_incoherent = 1;
1232 
1233     /* allocate domain id bitmap */
1234     nr_dom = cap_ndoms(iommu->cap);
1235     iommu->domid_bitmap = xzalloc_array(unsigned long, BITS_TO_LONGS(nr_dom));
1236     if ( !iommu->domid_bitmap )
1237         return -ENOMEM;
1238 
1239     /*
1240      * If caching mode is set, then invalid translations are tagged with
1241      * domain id 0; hence reserve bit 0 for it.
1242      */
1243     if ( cap_caching_mode(iommu->cap) )
1244         set_bit(0, iommu->domid_bitmap);
1245 
1246     iommu->domid_map = xzalloc_array(u16, nr_dom);
1247     if ( !iommu->domid_map )
1248         return -ENOMEM;
1249 
1250     spin_lock_init(&iommu->lock);
1251     spin_lock_init(&iommu->register_lock);
1252 
1253     return 0;
1254 }
1255 
1256 void __init iommu_free(struct acpi_drhd_unit *drhd)
1257 {
1258     struct iommu *iommu = drhd->iommu;
1259 
1260     if ( iommu == NULL )
1261         return;
1262 
1263     drhd->iommu = NULL;
1264 
1265     if ( iommu->root_maddr != 0 )
1266     {
1267         free_pgtable_maddr(iommu->root_maddr);
1268         iommu->root_maddr = 0;
1269     }
1270 
1271     if ( iommu->reg )
1272         iounmap(iommu->reg);
1273 
1274     xfree(iommu->domid_bitmap);
1275     xfree(iommu->domid_map);
1276 
1277     free_intel_iommu(iommu->intel);
1278     if ( iommu->msi.irq >= 0 )
1279         destroy_irq(iommu->msi.irq);
1280     xfree(iommu);
1281 }
1282 
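/*
 * Round a guest address width up to the nearest adjusted guest address width
 * supported by VT-d (12 plus a multiple of 9 bits), capped at 64.
 */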
1283 #define guestwidth_to_adjustwidth(gaw) ({       \
1284     int agaw, r = (gaw - 12) % 9;               \
1285     agaw = (r == 0) ? gaw : (gaw + 9 - r);      \
1286     if ( agaw > 64 )                            \
1287         agaw = 64;                              \
1288     agaw; })
1289 
1290 static int intel_iommu_domain_init(struct domain *d)
1291 {
1292     dom_iommu(d)->arch.agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
1293 
1294     return 0;
1295 }
1296 
1297 static void __hwdom_init intel_iommu_hwdom_init(struct domain *d)
1298 {
1299     struct acpi_drhd_unit *drhd;
1300 
1301     if ( !iommu_passthrough && !need_iommu(d) )
1302     {
1303         /* Set up 1:1 page table for hardware domain. */
1304         vtd_set_hwdom_mapping(d);
1305     }
1306 
1307     setup_hwdom_pci_devices(d, setup_hwdom_device);
1308     setup_hwdom_rmrr(d);
1309 
1310     if ( iommu_flush_all() )
1311         printk(XENLOG_WARNING VTDPREFIX
1312                " IOMMU flush all failed for hardware domain\n");
1313 
1314     for_each_drhd_unit ( drhd )
1315     {
1316         if ( iomem_deny_access(d, PFN_DOWN(drhd->address),
1317                                PFN_DOWN(drhd->address)) )
1318             BUG();
1319         iommu_enable_translation(drhd);
1320     }
1321 }
1322 
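/*
 * Install the context entry for (@bus, @devfn) on @iommu, pointing it at
 * @domain's page tables (or pass-through for the hardware domain when
 * iommu_passthrough is set), then flush the context cache and IOTLB.
 */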
1323 int domain_context_mapping_one(
1324     struct domain *domain,
1325     struct iommu *iommu,
1326     u8 bus, u8 devfn, const struct pci_dev *pdev)
1327 {
1328     struct domain_iommu *hd = dom_iommu(domain);
1329     struct context_entry *context, *context_entries;
1330     u64 maddr, pgd_maddr;
1331     u16 seg = iommu->intel->drhd->segment;
1332     int agaw, rc, ret;
1333     bool_t flush_dev_iotlb;
1334 
1335     ASSERT(pcidevs_locked());
1336     spin_lock(&iommu->lock);
1337     maddr = bus_to_context_maddr(iommu, bus);
1338     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1339     context = &context_entries[devfn];
1340 
1341     if ( context_present(*context) )
1342     {
1343         int res = 0;
1344 
1345         /* Try to get domain ownership from device structure.  If that's
1346          * not available, try to read it from the context itself. */
1347         if ( pdev )
1348         {
1349             if ( pdev->domain != domain )
1350             {
1351                 printk(XENLOG_G_INFO VTDPREFIX
1352                        "d%d: %04x:%02x:%02x.%u owned by d%d!",
1353                        domain->domain_id,
1354                        seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1355                        pdev->domain ? pdev->domain->domain_id : -1);
1356                 res = -EINVAL;
1357             }
1358         }
1359         else
1360         {
1361             int cdomain;
1362             cdomain = context_get_domain_id(context, iommu);
1363 
1364             if ( cdomain < 0 )
1365             {
1366                 printk(XENLOG_G_WARNING VTDPREFIX
1367                        "d%d: %04x:%02x:%02x.%u mapped, but can't find owner!\n",
1368                        domain->domain_id,
1369                        seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1370                 res = -EINVAL;
1371             }
1372             else if ( cdomain != domain->domain_id )
1373             {
1374                 printk(XENLOG_G_INFO VTDPREFIX
1375                        "d%d: %04x:%02x:%02x.%u already mapped to d%d!",
1376                        domain->domain_id,
1377                        seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
1378                        cdomain);
1379                 res = -EINVAL;
1380             }
1381         }
1382 
1383         unmap_vtd_domain_page(context_entries);
1384         spin_unlock(&iommu->lock);
1385         return res;
1386     }
1387 
1388     if ( iommu_passthrough && is_hardware_domain(domain) )
1389     {
1390         context_set_translation_type(*context, CONTEXT_TT_PASS_THRU);
1391         agaw = level_to_agaw(iommu->nr_pt_levels);
1392     }
1393     else
1394     {
1395         spin_lock(&hd->arch.mapping_lock);
1396 
1397         /* Ensure we have pagetables allocated down to leaf PTE. */
1398         if ( hd->arch.pgd_maddr == 0 )
1399         {
1400             addr_to_dma_page_maddr(domain, 0, 1);
1401             if ( hd->arch.pgd_maddr == 0 )
1402             {
1403             nomem:
1404                 spin_unlock(&hd->arch.mapping_lock);
1405                 spin_unlock(&iommu->lock);
1406                 unmap_vtd_domain_page(context_entries);
1407                 return -ENOMEM;
1408             }
1409         }
1410 
1411         /* Skip top levels of page tables for 2- and 3-level DRHDs. */
1412         pgd_maddr = hd->arch.pgd_maddr;
1413         for ( agaw = level_to_agaw(4);
1414               agaw != level_to_agaw(iommu->nr_pt_levels);
1415               agaw-- )
1416         {
1417             struct dma_pte *p = map_vtd_domain_page(pgd_maddr);
1418             pgd_maddr = dma_pte_addr(*p);
1419             unmap_vtd_domain_page(p);
1420             if ( pgd_maddr == 0 )
1421                 goto nomem;
1422         }
1423 
1424         context_set_address_root(*context, pgd_maddr);
1425         if ( ats_enabled && ecap_dev_iotlb(iommu->ecap) )
1426             context_set_translation_type(*context, CONTEXT_TT_DEV_IOTLB);
1427         else
1428             context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1429 
1430         spin_unlock(&hd->arch.mapping_lock);
1431     }
1432 
1433     if ( context_set_domain_id(context, domain, iommu) )
1434     {
1435         spin_unlock(&iommu->lock);
1436         unmap_vtd_domain_page(context_entries);
1437         return -EFAULT;
1438     }
1439 
1440     context_set_address_width(*context, agaw);
1441     context_set_fault_enable(*context);
1442     context_set_present(*context);
1443     iommu_flush_cache_entry(context, sizeof(struct context_entry));
1444     spin_unlock(&iommu->lock);
1445 
1446     /* Context entry was previously non-present (with domid 0). */
1447     rc = iommu_flush_context_device(iommu, 0, PCI_BDF2(bus, devfn),
1448                                     DMA_CCMD_MASK_NOBIT, 1);
1449     flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1450     ret = iommu_flush_iotlb_dsi(iommu, 0, 1, flush_dev_iotlb);
1451 
1452     /*
1453      * The current logic for returns:
1454      *   - positive  invoke iommu_flush_write_buffer to flush cache.
1455      *   - zero      on success.
1456      *   - negative  on failure. Continue to flush IOMMU IOTLB on a
1457      *               best effort basis.
1458      */
1459     if ( rc > 0 || ret > 0 )
1460         iommu_flush_write_buffer(iommu);
1461     if ( rc >= 0 )
1462         rc = ret;
1463     if ( rc > 0 )
1464         rc = 0;
1465 
1466     set_bit(iommu->index, &hd->arch.iommu_bitmap);
1467 
1468     unmap_vtd_domain_page(context_entries);
1469 
1470     if ( !seg && !rc )
1471         rc = me_wifi_quirk(domain, bus, devfn, MAP_ME_PHANTOM_FUNC);
1472 
1473     return rc;
1474 }
1475 
1476 static int domain_context_mapping(struct domain *domain, u8 devfn,
1477                                   struct pci_dev *pdev)
1478 {
1479     struct acpi_drhd_unit *drhd;
1480     int ret = 0;
1481     u8 seg = pdev->seg, bus = pdev->bus, secbus;
1482 
1483     drhd = acpi_find_matched_drhd_unit(pdev);
1484     if ( !drhd )
1485         return -ENODEV;
1486 
1487     ASSERT(pcidevs_locked());
1488 
1489     switch ( pdev->type )
1490     {
1491     case DEV_TYPE_PCI_HOST_BRIDGE:
1492         if ( iommu_debug )
1493             printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u map\n",
1494                    domain->domain_id, seg, bus,
1495                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1496         if ( !is_hardware_domain(domain) )
1497             return -EPERM;
1498         break;
1499 
1500     case DEV_TYPE_PCIe_BRIDGE:
1501     case DEV_TYPE_PCIe2PCI_BRIDGE:
1502     case DEV_TYPE_LEGACY_PCI_BRIDGE:
1503         break;
1504 
1505     case DEV_TYPE_PCIe_ENDPOINT:
1506         if ( iommu_debug )
1507             printk(VTDPREFIX "d%d:PCIe: map %04x:%02x:%02x.%u\n",
1508                    domain->domain_id, seg, bus,
1509                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1510         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1511                                          pdev);
1512         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
1513             enable_ats_device(pdev, &drhd->iommu->ats_devices);
1514 
1515         break;
1516 
1517     case DEV_TYPE_PCI:
1518         if ( iommu_debug )
1519             printk(VTDPREFIX "d%d:PCI: map %04x:%02x:%02x.%u\n",
1520                    domain->domain_id, seg, bus,
1521                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1522 
1523         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1524                                          pdev);
1525         if ( ret )
1526             break;
1527 
1528         if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 1 )
1529             break;
1530 
1531         ret = domain_context_mapping_one(domain, drhd->iommu, bus, devfn,
1532                                          pci_get_pdev(seg, bus, devfn));
1533 
1534         /*
1535          * Devices behind a PCIe-to-PCI/PCI-X bridge may generate a different
1536          * requester-id. It may originate from devfn=0 on the secondary bus
1537          * behind the bridge. Map that id as well if we didn't already.
1538          */
1539         if ( !ret && pdev_type(seg, bus, devfn) == DEV_TYPE_PCIe2PCI_BRIDGE &&
1540              (secbus != pdev->bus || pdev->devfn != 0) )
1541             ret = domain_context_mapping_one(domain, drhd->iommu, secbus, 0,
1542                                              pci_get_pdev(seg, secbus, 0));
1543 
1544         break;
1545 
1546     default:
1547         dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n",
1548                 domain->domain_id, pdev->type,
1549                 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1550         ret = -EINVAL;
1551         break;
1552     }
1553 
1554     if ( !ret && devfn == pdev->devfn )
1555         pci_vtd_quirk(pdev);
1556 
1557     return ret;
1558 }
1559 
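/*
 * Clear the context entry for (@bus, @devfn) on @iommu and flush the context
 * cache and IOTLB for the domain's id.
 */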
1560 int domain_context_unmap_one(
1561     struct domain *domain,
1562     struct iommu *iommu,
1563     u8 bus, u8 devfn)
1564 {
1565     struct context_entry *context, *context_entries;
1566     u64 maddr;
1567     int iommu_domid, rc, ret;
1568     bool_t flush_dev_iotlb;
1569 
1570     ASSERT(pcidevs_locked());
1571     spin_lock(&iommu->lock);
1572 
1573     maddr = bus_to_context_maddr(iommu, bus);
1574     context_entries = (struct context_entry *)map_vtd_domain_page(maddr);
1575     context = &context_entries[devfn];
1576 
1577     if ( !context_present(*context) )
1578     {
1579         spin_unlock(&iommu->lock);
1580         unmap_vtd_domain_page(context_entries);
1581         return 0;
1582     }
1583 
1584     context_clear_present(*context);
1585     context_clear_entry(*context);
1586     iommu_flush_cache_entry(context, sizeof(struct context_entry));
1587 
1588     iommu_domid = domain_iommu_domid(domain, iommu);
1589     if ( iommu_domid == -1 )
1590     {
1591         spin_unlock(&iommu->lock);
1592         unmap_vtd_domain_page(context_entries);
1593         return -EINVAL;
1594     }
1595 
1596     rc = iommu_flush_context_device(iommu, iommu_domid,
1597                                     PCI_BDF2(bus, devfn),
1598                                     DMA_CCMD_MASK_NOBIT, 0);
1599 
1600     flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1601     ret = iommu_flush_iotlb_dsi(iommu, iommu_domid, 0, flush_dev_iotlb);
1602 
1603     /*
1604      * The current logic for returns:
1605      *   - positive  invoke iommu_flush_write_buffer to flush cache.
1606      *   - zero      on success.
1607      *   - negative  on failure. Continue to flush IOMMU IOTLB on a
1608      *               best effort basis.
1609      */
1610     if ( rc > 0 || ret > 0 )
1611         iommu_flush_write_buffer(iommu);
1612     if ( rc >= 0 )
1613         rc = ret;
1614     if ( rc > 0 )
1615         rc = 0;
1616 
1617     spin_unlock(&iommu->lock);
1618     unmap_vtd_domain_page(context_entries);
1619 
1620     if ( !iommu->intel->drhd->segment && !rc )
1621         rc = me_wifi_quirk(domain, bus, devfn, UNMAP_ME_PHANTOM_FUNC);
1622 
1623     return rc;
1624 }
1625 
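/*
 * Undo domain_context_mapping() for a device, including any extra entries
 * created for bridges, and release the IOMMU's domain id if this was the
 * last device of @domain behind that unit.
 */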
1626 static int domain_context_unmap(struct domain *domain, u8 devfn,
1627                                 struct pci_dev *pdev)
1628 {
1629     struct acpi_drhd_unit *drhd;
1630     struct iommu *iommu;
1631     int ret = 0;
1632     u8 seg = pdev->seg, bus = pdev->bus, tmp_bus, tmp_devfn, secbus;
1633     int found = 0;
1634 
1635     drhd = acpi_find_matched_drhd_unit(pdev);
1636     if ( !drhd )
1637         return -ENODEV;
1638     iommu = drhd->iommu;
1639 
1640     switch ( pdev->type )
1641     {
1642     case DEV_TYPE_PCI_HOST_BRIDGE:
1643         if ( iommu_debug )
1644             printk(VTDPREFIX "d%d:Hostbridge: skip %04x:%02x:%02x.%u unmap\n",
1645                    domain->domain_id, seg, bus,
1646                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1647         if ( !is_hardware_domain(domain) )
1648             return -EPERM;
1649         goto out;
1650 
1651     case DEV_TYPE_PCIe_BRIDGE:
1652     case DEV_TYPE_PCIe2PCI_BRIDGE:
1653     case DEV_TYPE_LEGACY_PCI_BRIDGE:
1654         goto out;
1655 
1656     case DEV_TYPE_PCIe_ENDPOINT:
1657         if ( iommu_debug )
1658             printk(VTDPREFIX "d%d:PCIe: unmap %04x:%02x:%02x.%u\n",
1659                    domain->domain_id, seg, bus,
1660                    PCI_SLOT(devfn), PCI_FUNC(devfn));
1661         ret = domain_context_unmap_one(domain, iommu, bus, devfn);
1662         if ( !ret && devfn == pdev->devfn && ats_device(pdev, drhd) > 0 )
1663             disable_ats_device(pdev);
1664 
1665         break;
1666 
1667     case DEV_TYPE_PCI:
1668         if ( iommu_debug )
1669             printk(VTDPREFIX "d%d:PCI: unmap %04x:%02x:%02x.%u\n",
1670                    domain->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1671         ret = domain_context_unmap_one(domain, iommu, bus, devfn);
1672         if ( ret )
1673             break;
1674 
1675         tmp_bus = bus;
1676         tmp_devfn = devfn;
1677         if ( find_upstream_bridge(seg, &tmp_bus, &tmp_devfn, &secbus) < 1 )
1678             break;
1679 
1680         /* PCIe to PCI/PCIx bridge */
1681         if ( pdev_type(seg, tmp_bus, tmp_devfn) == DEV_TYPE_PCIe2PCI_BRIDGE )
1682         {
1683             ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
1684             if ( ret )
1685                 return ret;
1686 
1687             ret = domain_context_unmap_one(domain, iommu, secbus, 0);
1688         }
1689         else /* Legacy PCI bridge */
1690             ret = domain_context_unmap_one(domain, iommu, tmp_bus, tmp_devfn);
1691 
1692         break;
1693 
1694     default:
1695         dprintk(XENLOG_ERR VTDPREFIX, "d%d:unknown(%u): %04x:%02x:%02x.%u\n",
1696                 domain->domain_id, pdev->type,
1697                 seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1698         ret = -EINVAL;
1699         goto out;
1700     }
1701 
1702     /*
1703      * If no other device under the same IOMMU is owned by this domain,
1704      * clear the IOMMU bit in iommu_bitmap and the domain id in domid_bitmap.
1705      */
1706     for_each_pdev ( domain, pdev )
1707     {
1708         if ( pdev->seg == seg && pdev->bus == bus && pdev->devfn == devfn )
1709             continue;
1710 
1711         drhd = acpi_find_matched_drhd_unit(pdev);
1712         if ( drhd && drhd->iommu == iommu )
1713         {
1714             found = 1;
1715             break;
1716         }
1717     }
1718 
1719     if ( found == 0 )
1720     {
1721         int iommu_domid;
1722 
1723         clear_bit(iommu->index, &dom_iommu(domain)->arch.iommu_bitmap);
1724 
1725         iommu_domid = domain_iommu_domid(domain, iommu);
1726         if ( iommu_domid == -1 )
1727         {
1728             ret = -EINVAL;
1729             goto out;
1730         }
1731 
1732         clear_bit(iommu_domid, iommu->domid_bitmap);
1733         iommu->domid_map[iommu_domid] = 0;
1734     }
1735 
1736 out:
1737     return ret;
1738 }
1739 
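/*
 * Tear down a domain's VT-d state: drop its tracked RMRR ranges and, unless
 * the page tables are shared with EPT, free its IOMMU page tables.
 */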
1740 static void iommu_domain_teardown(struct domain *d)
1741 {
1742     struct domain_iommu *hd = dom_iommu(d);
1743     struct mapped_rmrr *mrmrr, *tmp;
1744 
1745     if ( list_empty(&acpi_drhd_units) )
1746         return;
1747 
1748     list_for_each_entry_safe ( mrmrr, tmp, &hd->arch.mapped_rmrrs, list )
1749     {
1750         list_del(&mrmrr->list);
1751         xfree(mrmrr);
1752     }
1753 
1754     if ( iommu_use_hap_pt(d) )
1755         return;
1756 
1757     spin_lock(&hd->arch.mapping_lock);
1758     iommu_free_pagetable(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw));
1759     hd->arch.pgd_maddr = 0;
1760     spin_unlock(&hd->arch.mapping_lock);
1761 }
1762 
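/*
 * Install a 4K gfn -> mfn mapping in the domain's VT-d page tables and flush
 * the IOTLB (unless the caller defers flushing via iommu_dont_flush_iotlb).
 * No-op when the tables are shared with EPT or when the hardware domain runs
 * in passthrough mode.
 */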
1763 static int __must_check intel_iommu_map_page(struct domain *d,
1764                                              unsigned long gfn,
1765                                              unsigned long mfn,
1766                                              unsigned int flags)
1767 {
1768     struct domain_iommu *hd = dom_iommu(d);
1769     struct dma_pte *page = NULL, *pte = NULL, old, new = { 0 };
1770     u64 pg_maddr;
1771     int rc = 0;
1772 
1773     /* Do nothing if VT-d shares EPT page table */
1774     if ( iommu_use_hap_pt(d) )
1775         return 0;
1776 
1777     /* Do nothing for the hardware domain when passthrough mode is in use. */
1778     if ( iommu_passthrough && is_hardware_domain(d) )
1779         return 0;
1780 
1781     spin_lock(&hd->arch.mapping_lock);
1782 
1783     pg_maddr = addr_to_dma_page_maddr(d, (paddr_t)gfn << PAGE_SHIFT_4K, 1);
1784     if ( pg_maddr == 0 )
1785     {
1786         spin_unlock(&hd->arch.mapping_lock);
1787         return -ENOMEM;
1788     }
1789     page = (struct dma_pte *)map_vtd_domain_page(pg_maddr);
1790     pte = page + (gfn & LEVEL_MASK);
1791     old = *pte;
1792     dma_set_pte_addr(new, (paddr_t)mfn << PAGE_SHIFT_4K);
1793     dma_set_pte_prot(new,
1794                      ((flags & IOMMUF_readable) ? DMA_PTE_READ  : 0) |
1795                      ((flags & IOMMUF_writable) ? DMA_PTE_WRITE : 0));
1796 
1797     /* Set the SNP bit in the leaf PTE if Snoop Control is available */
1798     if ( iommu_snoop )
1799         dma_set_pte_snp(new);
1800 
1801     if ( old.val == new.val )
1802     {
1803         spin_unlock(&hd->arch.mapping_lock);
1804         unmap_vtd_domain_page(page);
1805         return 0;
1806     }
1807     *pte = new;
1808 
1809     iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
1810     spin_unlock(&hd->arch.mapping_lock);
1811     unmap_vtd_domain_page(page);
1812 
1813     if ( !this_cpu(iommu_dont_flush_iotlb) )
1814         rc = iommu_flush_iotlb(d, gfn, dma_pte_present(old), 1);
1815 
1816     return rc;
1817 }
1818 
1819 static int __must_check intel_iommu_unmap_page(struct domain *d,
1820                                                unsigned long gfn)
1821 {
1822     /* Do nothing for the hardware domain when passthrough mode is in use. */
1823     if ( iommu_passthrough && is_hardware_domain(d) )
1824         return 0;
1825 
1826     return dma_pte_clear_one(d, (paddr_t)gfn << PAGE_SHIFT_4K);
1827 }
1828 
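/*
 * Flush the cached PTE and issue a page-selective IOTLB flush on every IOMMU
 * in use by the domain.  A failed flush crashes the domain unless it is the
 * hardware domain.
 */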
1829 int iommu_pte_flush(struct domain *d, u64 gfn, u64 *pte,
1830                     int order, int present)
1831 {
1832     struct acpi_drhd_unit *drhd;
1833     struct iommu *iommu = NULL;
1834     struct domain_iommu *hd = dom_iommu(d);
1835     bool_t flush_dev_iotlb;
1836     int iommu_domid;
1837     int rc = 0;
1838 
1839     iommu_flush_cache_entry(pte, sizeof(struct dma_pte));
1840 
1841     for_each_drhd_unit ( drhd )
1842     {
1843         iommu = drhd->iommu;
1844         if ( !test_bit(iommu->index, &hd->arch.iommu_bitmap) )
1845             continue;
1846 
1847         flush_dev_iotlb = !!find_ats_dev_drhd(iommu);
1848         iommu_domid = domain_iommu_domid(d, iommu);
1849         if ( iommu_domid == -1 )
1850             continue;
1851 
1852         rc = iommu_flush_iotlb_psi(iommu, iommu_domid,
1853                                    (paddr_t)gfn << PAGE_SHIFT_4K,
1854                                    order, !present, flush_dev_iotlb);
1855         if ( rc > 0 )
1856         {
1857             iommu_flush_write_buffer(iommu);
1858             rc = 0;
1859         }
1860     }
1861 
1862     if ( unlikely(rc) )
1863     {
1864         if ( !d->is_shutting_down && printk_ratelimit() )
1865             printk(XENLOG_ERR VTDPREFIX
1866                    " d%d: IOMMU pages flush failed: %d\n",
1867                    d->domain_id, rc);
1868 
1869         if ( !is_hardware_domain(d) )
1870             domain_crash(d);
1871     }
1872 
1873     return rc;
1874 }
1875 
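/*
 * Check that VT-d's superpage support matches the 2MB/1GB page sizes EPT
 * will use - a prerequisite for sharing EPT page tables with the IOMMU.
 */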
1876 static int __init vtd_ept_page_compatible(struct iommu *iommu)
1877 {
1878     u64 ept_cap, vtd_cap = iommu->cap;
1879 
1880     /* EPT is not initialised yet, so we must check the capability in
1881      * the MSR explicitly rather than use cpu_has_vmx_ept_*() */
1882     if ( rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP, ept_cap) != 0 )
1883         return 0;
1884 
1885     return (ept_has_2mb(ept_cap) && opt_hap_2mb) == cap_sps_2mb(vtd_cap) &&
1886            (ept_has_1gb(ept_cap) && opt_hap_1gb) == cap_sps_1gb(vtd_cap);
1887 }
1888 
1889 /*
1890  * Set the VT-d page table directory to the EPT table when sharing is allowed.
1891  */
1892 static void iommu_set_pgd(struct domain *d)
1893 {
1894     mfn_t pgd_mfn;
1895 
1896     pgd_mfn = pagetable_get_mfn(p2m_get_pagetable(p2m_get_hostp2m(d)));
1897     dom_iommu(d)->arch.pgd_maddr =
1898         pagetable_get_paddr(pagetable_from_mfn(pgd_mfn));
1899 }
1900 
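/*
 * Add (map != 0) or remove a reference-counted identity mapping of the given
 * RMRR region in the domain's p2m.  The mapping is only torn down once the
 * last reference is dropped.
 */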
1901 static int rmrr_identity_mapping(struct domain *d, bool_t map,
1902                                  const struct acpi_rmrr_unit *rmrr,
1903                                  u32 flag)
1904 {
1905     unsigned long base_pfn = rmrr->base_address >> PAGE_SHIFT_4K;
1906     unsigned long end_pfn = PAGE_ALIGN_4K(rmrr->end_address) >> PAGE_SHIFT_4K;
1907     struct mapped_rmrr *mrmrr;
1908     struct domain_iommu *hd = dom_iommu(d);
1909 
1910     ASSERT(pcidevs_locked());
1911     ASSERT(rmrr->base_address < rmrr->end_address);
1912 
1913     /*
1914      * No need to acquire hd->arch.mapping_lock: Both insertion and removal
1915      * get done while holding pcidevs_lock.
1916      */
1917     list_for_each_entry( mrmrr, &hd->arch.mapped_rmrrs, list )
1918     {
1919         if ( mrmrr->base == rmrr->base_address &&
1920              mrmrr->end == rmrr->end_address )
1921         {
1922             int ret = 0;
1923 
1924             if ( map )
1925             {
1926                 ++mrmrr->count;
1927                 return 0;
1928             }
1929 
1930             if ( --mrmrr->count )
1931                 return 0;
1932 
1933             while ( base_pfn < end_pfn )
1934             {
1935                 if ( clear_identity_p2m_entry(d, base_pfn) )
1936                     ret = -ENXIO;
1937                 base_pfn++;
1938             }
1939 
1940             list_del(&mrmrr->list);
1941             xfree(mrmrr);
1942             return ret;
1943         }
1944     }
1945 
1946     if ( !map )
1947         return -ENOENT;
1948 
1949     while ( base_pfn < end_pfn )
1950     {
1951         int err = set_identity_p2m_entry(d, base_pfn, p2m_access_rw, flag);
1952 
1953         if ( err )
1954             return err;
1955         base_pfn++;
1956     }
1957 
1958     mrmrr = xmalloc(struct mapped_rmrr);
1959     if ( !mrmrr )
1960         return -ENOMEM;
1961     mrmrr->base = rmrr->base_address;
1962     mrmrr->end = rmrr->end_address;
1963     mrmrr->count = 1;
1964     list_add_tail(&mrmrr->list, &hd->arch.mapped_rmrrs);
1965 
1966     return 0;
1967 }
1968 
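/*
 * Hook for a newly added device: map it into its owning domain's context and
 * set up any RMRR identity mappings it requires.
 */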
1969 static int intel_iommu_add_device(u8 devfn, struct pci_dev *pdev)
1970 {
1971     struct acpi_rmrr_unit *rmrr;
1972     u16 bdf;
1973     int ret, i;
1974 
1975     ASSERT(pcidevs_locked());
1976 
1977     if ( !pdev->domain )
1978         return -EINVAL;
1979 
1980     ret = domain_context_mapping(pdev->domain, devfn, pdev);
1981     if ( ret )
1982     {
1983         dprintk(XENLOG_ERR VTDPREFIX, "d%d: context mapping failed\n",
1984                 pdev->domain->domain_id);
1985         return ret;
1986     }
1987 
1988     for_each_rmrr_device ( rmrr, bdf, i )
1989     {
1990         if ( rmrr->segment == pdev->seg &&
1991              PCI_BUS(bdf) == pdev->bus &&
1992              PCI_DEVFN2(bdf) == devfn )
1993         {
1994             /*
1995              * iommu_add_device() is only called for the hardware
1996              * domain (see xen/drivers/passthrough/pci.c:pci_add_device()).
1997              * Since RMRRs are always reserved in the e820 map for the hardware
1998              * domain, there shouldn't be a conflict.
1999              */
2000             ret = rmrr_identity_mapping(pdev->domain, 1, rmrr, 0);
2001             if ( ret )
2002                 dprintk(XENLOG_ERR VTDPREFIX, "d%d: RMRR mapping failed\n",
2003                         pdev->domain->domain_id);
2004         }
2005     }
2006 
2007     return 0;
2008 }
2009 
2010 static int intel_iommu_enable_device(struct pci_dev *pdev)
2011 {
2012     struct acpi_drhd_unit *drhd = acpi_find_matched_drhd_unit(pdev);
2013     int ret = drhd ? ats_device(pdev, drhd) : -ENODEV;
2014 
2015     pci_vtd_quirk(pdev);
2016 
2017     if ( ret <= 0 )
2018         return ret;
2019 
2020     ret = enable_ats_device(pdev, &drhd->iommu->ats_devices);
2021 
2022     return ret >= 0 ? 0 : ret;
2023 }
2024 
2025 static int intel_iommu_remove_device(u8 devfn, struct pci_dev *pdev)
2026 {
2027     struct acpi_rmrr_unit *rmrr;
2028     u16 bdf;
2029     int i;
2030 
2031     if ( !pdev->domain )
2032         return -EINVAL;
2033 
2034     for_each_rmrr_device ( rmrr, bdf, i )
2035     {
2036         if ( rmrr->segment != pdev->seg ||
2037              PCI_BUS(bdf) != pdev->bus ||
2038              PCI_DEVFN2(bdf) != devfn )
2039             continue;
2040 
2041         /*
2042          * The flag argument does not matter when clearing these mappings,
2043          * but it is always safe and strict to pass 0 here.
2044          */
2045         rmrr_identity_mapping(pdev->domain, 0, rmrr, 0);
2046     }
2047 
2048     return domain_context_unmap(pdev->domain, devfn, pdev);
2049 }
2050 
2051 static int __hwdom_init setup_hwdom_device(u8 devfn, struct pci_dev *pdev)
2052 {
2053     return domain_context_mapping(pdev->domain, devfn, pdev);
2054 }
2055 
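/*
 * Clear any latched fault state on an IOMMU: write back the primary fault
 * recording register and clear the fault status bits.
 */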
2056 void clear_fault_bits(struct iommu *iommu)
2057 {
2058     u64 val;
2059     unsigned long flags;
2060 
2061     spin_lock_irqsave(&iommu->register_lock, flags);
2062     val = dmar_readq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8);
2063     dmar_writeq(iommu->reg, cap_fault_reg_offset(iommu->cap) + 8, val);
2064     dmar_writel(iommu->reg, DMAR_FSTS_REG, DMA_FSTS_FAULTS);
2065     spin_unlock_irqrestore(&iommu->register_lock, flags);
2066 }
2067 
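/*
 * Bind the IOMMU's fault-event interrupt to CPUs of the NUMA node the unit
 * belongs to (per its ACPI RHSA entry), falling back to all online CPUs.
 */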
2068 static void adjust_irq_affinity(struct acpi_drhd_unit *drhd)
2069 {
2070     const struct acpi_rhsa_unit *rhsa = drhd_to_rhsa(drhd);
2071     unsigned int node = rhsa ? pxm_to_node(rhsa->proximity_domain)
2072                              : NUMA_NO_NODE;
2073     const cpumask_t *cpumask = &cpu_online_map;
2074 
2075     if ( node < MAX_NUMNODES && node_online(node) &&
2076          cpumask_intersects(&node_to_cpumask(node), cpumask) )
2077         cpumask = &node_to_cpumask(node);
2078     dma_msi_set_affinity(irq_to_desc(drhd->iommu->msi.irq), cpumask);
2079 }
2080 
2081 int adjust_vtd_irq_affinities(void)
2082 {
2083     struct acpi_drhd_unit *drhd;
2084 
2085     if ( !iommu_enabled )
2086         return 0;
2087 
2088     for_each_drhd_unit ( drhd )
2089         adjust_irq_affinity(drhd);
2090 
2091     return 0;
2092 }
2093 __initcall(adjust_vtd_irq_affinities);
2094 
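/*
 * Program every VT-d engine: interrupt affinity and fault reporting, queued
 * invalidation, interrupt remapping, and root entries, followed by a global
 * flush.  Used both at boot and on resume.
 */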
2095 static int __must_check init_vtd_hw(void)
2096 {
2097     struct acpi_drhd_unit *drhd;
2098     struct iommu *iommu;
2099     struct iommu_flush *flush = NULL;
2100     int ret;
2101     unsigned long flags;
2102     u32 sts;
2103 
2104     /*
2105      * Basic VT-d HW init: set VT-d interrupt, clear VT-d faults.
2106      */
2107     for_each_drhd_unit ( drhd )
2108     {
2109         adjust_irq_affinity(drhd);
2110 
2111         iommu = drhd->iommu;
2112 
2113         clear_fault_bits(iommu);
2114 
2115         spin_lock_irqsave(&iommu->register_lock, flags);
2116         sts = dmar_readl(iommu->reg, DMAR_FECTL_REG);
2117         sts &= ~DMA_FECTL_IM;
2118         dmar_writel(iommu->reg, DMAR_FECTL_REG, sts);
2119         spin_unlock_irqrestore(&iommu->register_lock, flags);
2120     }
2121 
2122     /*
2123      * Enable queued invalidation
2124      */
2125     for_each_drhd_unit ( drhd )
2126     {
2127         iommu = drhd->iommu;
2128         /*
2129          * If queued invalidation cannot be enabled, fall back to
2130          * register-based invalidation.
2131          */
2132         if ( enable_qinval(iommu) != 0 )
2133         {
2134             flush = iommu_get_flush(iommu);
2135             flush->context = flush_context_reg;
2136             flush->iotlb = flush_iotlb_reg;
2137         }
2138     }
2139 
2140     /*
2141      * Enable interrupt remapping
2142      */
2143     if ( iommu_intremap )
2144     {
2145         int apic;
2146         for ( apic = 0; apic < nr_ioapics; apic++ )
2147         {
2148             if ( ioapic_to_iommu(IO_APIC_ID(apic)) == NULL )
2149             {
2150                 iommu_intremap = 0;
2151                 dprintk(XENLOG_ERR VTDPREFIX,
2152                     "ioapic_to_iommu: ioapic %#x (id: %#x) is NULL! "
2153                     "Will not try to enable Interrupt Remapping.\n",
2154                     apic, IO_APIC_ID(apic));
2155                 break;
2156             }
2157         }
2158     }
2159     if ( iommu_intremap )
2160     {
2161         for_each_drhd_unit ( drhd )
2162         {
2163             iommu = drhd->iommu;
2164             if ( enable_intremap(iommu, 0) != 0 )
2165             {
2166                 iommu_intremap = 0;
2167                 dprintk(XENLOG_WARNING VTDPREFIX,
2168                         "Interrupt Remapping not enabled\n");
2169 
2170                 break;
2171             }
2172         }
2173         if ( !iommu_intremap )
2174             for_each_drhd_unit ( drhd )
2175                 disable_intremap(drhd->iommu);
2176     }
2177 
2178     /*
2179      * Set root entries for each VT-d engine.  After setting the root
2180      * entry, we must globally invalidate the context cache and then
2181      * globally invalidate the IOTLB.
2182      */
2183     for_each_drhd_unit ( drhd )
2184     {
2185         iommu = drhd->iommu;
2186         ret = iommu_set_root_entry(iommu);
2187         if ( ret )
2188         {
2189             dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: set root entry failed\n");
2190             return -EIO;
2191         }
2192     }
2193 
2194     return iommu_flush_all();
2195 }
2196 
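/* Establish identity mappings for all RMRR regions in the hardware domain. */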
2197 static void __hwdom_init setup_hwdom_rmrr(struct domain *d)
2198 {
2199     struct acpi_rmrr_unit *rmrr;
2200     u16 bdf;
2201     int ret, i;
2202 
2203     pcidevs_lock();
2204     for_each_rmrr_device ( rmrr, bdf, i )
2205     {
2206         /*
2207          * Here we are adding a device to the hardware domain.
2208          * Since RMRRs are always reserved in the e820 map for the hardware
2209          * domain, there shouldn't be a conflict, so it is always safe and
2210          * strict to pass 0.
2211          */
2212         ret = rmrr_identity_mapping(d, 1, rmrr, 0);
2213         if ( ret )
2214             dprintk(XENLOG_ERR VTDPREFIX,
2215                      "IOMMU: mapping reserved region failed\n");
2216     }
2217     pcidevs_unlock();
2218 }
2219 
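/*
 * Boot-time VT-d initialisation: determine which optional features all units
 * support, set up fault interrupts, scan PCI devices and program the
 * hardware.  Any failure disables the IOMMU entirely.
 */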
2220 int __init intel_vtd_setup(void)
2221 {
2222     struct acpi_drhd_unit *drhd;
2223     struct iommu *iommu;
2224     int ret;
2225 
2226     if ( list_empty(&acpi_drhd_units) )
2227     {
2228         ret = -ENODEV;
2229         goto error;
2230     }
2231 
2232     if ( unlikely(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_MSI) )
2233     {
2234         ret = -EPERM;
2235         goto error;
2236     }
2237 
2238     platform_quirks_init();
2239     if ( !iommu_enable )
2240     {
2241         ret = -ENODEV;
2242         goto error;
2243     }
2244 
2245     /* We enable the following features only if they are supported by all VT-d
2246      * engines: Snoop Control, DMA passthrough, Queued Invalidation, Interrupt
2247      * Remapping, and Posted Interrupt
2248      */
2249     for_each_drhd_unit ( drhd )
2250     {
2251         iommu = drhd->iommu;
2252 
2253         printk("Intel VT-d iommu %"PRIu32" supported page sizes: 4kB",
2254                iommu->index);
2255         if ( cap_sps_2mb(iommu->cap) )
2256             printk(", 2MB");
2257 
2258         if ( cap_sps_1gb(iommu->cap) )
2259             printk(", 1GB");
2260 
2261         printk(".\n");
2262 
2263         if ( iommu_snoop && !ecap_snp_ctl(iommu->ecap) )
2264             iommu_snoop = 0;
2265 
2266         if ( iommu_passthrough && !ecap_pass_thru(iommu->ecap) )
2267             iommu_passthrough = 0;
2268 
2269         if ( iommu_qinval && !ecap_queued_inval(iommu->ecap) )
2270             iommu_qinval = 0;
2271 
2272         if ( iommu_intremap && !ecap_intr_remap(iommu->ecap) )
2273             iommu_intremap = 0;
2274 
2275         /*
2276          * We cannot use posted interrupt if X86_FEATURE_CX16 is
2277          * not supported, since we count on this feature to
2278          * atomically update 16-byte IRTE in posted format.
2279          */
2280         if ( !cap_intr_post(iommu->cap) || !cpu_has_cx16 )
2281             iommu_intpost = 0;
2282 
2283         if ( !vtd_ept_page_compatible(iommu) )
2284             iommu_hap_pt_share = 0;
2285 
2286         ret = iommu_set_interrupt(drhd);
2287         if ( ret )
2288         {
2289             dprintk(XENLOG_ERR VTDPREFIX, "IOMMU: interrupt setup failed\n");
2290             goto error;
2291         }
2292     }
2293 
2294     softirq_tasklet_init(&vtd_fault_tasklet, do_iommu_page_fault, 0);
2295 
2296     if ( !iommu_qinval && iommu_intremap )
2297     {
2298         iommu_intremap = 0;
2299         dprintk(XENLOG_WARNING VTDPREFIX, "Interrupt Remapping disabled "
2300             "since Queued Invalidation isn't supported or enabled.\n");
2301     }
2302 
2303 #define P(p,s) printk("Intel VT-d %s %senabled.\n", s, (p)? "" : "not ")
2304     P(iommu_snoop, "Snoop Control");
2305     P(iommu_passthrough, "Dom0 DMA Passthrough");
2306     P(iommu_qinval, "Queued Invalidation");
2307     P(iommu_intremap, "Interrupt Remapping");
2308     P(iommu_intpost, "Posted Interrupt");
2309     P(iommu_hap_pt_share, "Shared EPT tables");
2310 #undef P
2311 
2312     ret = scan_pci_devices();
2313     if ( ret )
2314         goto error;
2315 
2316     ret = init_vtd_hw();
2317     if ( ret )
2318         goto error;
2319 
2320     register_keyhandler('V', vtd_dump_iommu_info, "dump iommu info", 1);
2321 
2322     return 0;
2323 
2324  error:
2325     iommu_enabled = 0;
2326     iommu_snoop = 0;
2327     iommu_passthrough = 0;
2328     iommu_qinval = 0;
2329     iommu_intremap = 0;
2330     iommu_intpost = 0;
2331     return ret;
2332 }
2333 
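/*
 * Move a device from @source to @target: tear down the old context mapping
 * (and, for non-hardware domains, the RMRR mappings), establish the new one,
 * and keep the VMX posted-interrupt hooks in sync with device ownership.
 */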
2334 static int reassign_device_ownership(
2335     struct domain *source,
2336     struct domain *target,
2337     u8 devfn, struct pci_dev *pdev)
2338 {
2339     int ret;
2340 
2341     /*
2342      * Devices assigned to untrusted domains (here assumed to be any domU)
2343      * can attempt to send arbitrary LAPIC/MSI messages. We are unprotected
2344      * by the root complex unless interrupt remapping is enabled.
2345      */
2346     if ( (target != hardware_domain) && !iommu_intremap )
2347         untrusted_msi = true;
2348 
2349     /*
2350      * If the device belongs to the hardware domain and it has an RMRR, don't
2351      * remove it from the hardware domain, because the BIOS may use the RMRR
2352      * at boot time.
2353      */
2354     if ( !is_hardware_domain(source) )
2355     {
2356         const struct acpi_rmrr_unit *rmrr;
2357         u16 bdf;
2358         unsigned int i;
2359 
2360         for_each_rmrr_device( rmrr, bdf, i )
2361             if ( rmrr->segment == pdev->seg &&
2362                  PCI_BUS(bdf) == pdev->bus &&
2363                  PCI_DEVFN2(bdf) == devfn )
2364             {
2365                 /*
2366                  * The RMRR flag is always ignored when removing a device,
2367                  * but it is always safe and strict to pass 0.
2368                  */
2369                 ret = rmrr_identity_mapping(source, 0, rmrr, 0);
2370                 if ( ret != -ENOENT )
2371                     return ret;
2372             }
2373     }
2374 
2375     ret = domain_context_unmap(source, devfn, pdev);
2376     if ( ret )
2377         return ret;
2378 
2379     if ( !has_arch_pdevs(target) )
2380         vmx_pi_hooks_assign(target);
2381 
2382     ret = domain_context_mapping(target, devfn, pdev);
2383     if ( ret )
2384     {
2385         if ( !has_arch_pdevs(target) )
2386             vmx_pi_hooks_deassign(target);
2387 
2388         return ret;
2389     }
2390 
2391     if ( devfn == pdev->devfn )
2392     {
2393         list_move(&pdev->domain_list, &target->arch.pdev_list);
2394         pdev->domain = target;
2395     }
2396 
2397     if ( !has_arch_pdevs(source) )
2398         vmx_pi_hooks_deassign(source);
2399 
2400     return ret;
2401 }
2402 
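/*
 * Assign a device to a domain: reject (or merely warn about, under the
 * relaxed RDM policy) shared RMRRs, reassign ownership from the hardware
 * domain, and establish the device's RMRR identity mappings.
 */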
2403 static int intel_iommu_assign_device(
2404     struct domain *d, u8 devfn, struct pci_dev *pdev, u32 flag)
2405 {
2406     struct acpi_rmrr_unit *rmrr;
2407     int ret = 0, i;
2408     u16 bdf, seg;
2409     u8 bus;
2410 
2411     if ( list_empty(&acpi_drhd_units) )
2412         return -ENODEV;
2413 
2414     seg = pdev->seg;
2415     bus = pdev->bus;
2416     /*
2417      * In rare cases a given RMRR is shared by multiple devices, which
2418      * obviously puts the security of the system at risk. We therefore
2419      * prevent this sort of device assignment by default, but it can be
2420      * permitted if the user sets
2421      *      "pci = [ 'sbdf, rdm_policy=relaxed' ]"
2422      *
2423      * TODO: in the future we can introduce a group device assignment
2424      * interface to make sure devices sharing an RMRR are assigned to the
2425      * same domain together.
2426      */
2427     for_each_rmrr_device( rmrr, bdf, i )
2428     {
2429         if ( rmrr->segment == seg &&
2430              PCI_BUS(bdf) == bus &&
2431              PCI_DEVFN2(bdf) == devfn &&
2432              rmrr->scope.devices_cnt > 1 )
2433         {
2434             bool_t relaxed = !!(flag & XEN_DOMCTL_DEV_RDM_RELAXED);
2435 
2436             printk(XENLOG_GUEST "%s" VTDPREFIX
2437                    " It's %s to assign %04x:%02x:%02x.%u"
2438                    " with shared RMRR at %"PRIx64" for Dom%d.\n",
2439                    relaxed ? XENLOG_WARNING : XENLOG_ERR,
2440                    relaxed ? "risky" : "disallowed",
2441                    seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
2442                    rmrr->base_address, d->domain_id);
2443             if ( !relaxed )
2444                 return -EPERM;
2445         }
2446     }
2447 
2448     ret = reassign_device_ownership(hardware_domain, d, devfn, pdev);
2449     if ( ret )
2450         return ret;
2451 
2452     /* Setup rmrr identity mapping */
2453     for_each_rmrr_device( rmrr, bdf, i )
2454     {
2455         if ( rmrr->segment == seg &&
2456              PCI_BUS(bdf) == bus &&
2457              PCI_DEVFN2(bdf) == devfn )
2458         {
2459             ret = rmrr_identity_mapping(d, 1, rmrr, flag);
2460             if ( ret )
2461             {
2462                 reassign_device_ownership(d, hardware_domain, devfn, pdev);
2463                 printk(XENLOG_G_ERR VTDPREFIX
2464                        " cannot map reserved region (%"PRIx64",%"PRIx64"] for Dom%d (%d)\n",
2465                        rmrr->base_address, rmrr->end_address,
2466                        d->domain_id, ret);
2467                 break;
2468             }
2469         }
2470     }
2471 
2472     return ret;
2473 }
2474 
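/*
 * Report the requester ID seen by the IOMMU - possibly that of an upstream
 * bridge - as the device's group id; devices sharing it must be assigned
 * together.
 */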
2475 static int intel_iommu_group_id(u16 seg, u8 bus, u8 devfn)
2476 {
2477     u8 secbus;
2478     if ( find_upstream_bridge(seg, &bus, &devfn, &secbus) < 0 )
2479         return -1;
2480     else
2481         return PCI_BDF2(bus, devfn);
2482 }
2483 
2484 static u32 iommu_state[MAX_IOMMUS][MAX_IOMMU_REGS];
2485 
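/*
 * Save the fault-event registers of every IOMMU and, unless force_iommu is
 * set, disable translation (and, if interrupt remapping is not in use,
 * queued invalidation) before suspend.
 */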
2486 static int __must_check vtd_suspend(void)
2487 {
2488     struct acpi_drhd_unit *drhd;
2489     struct iommu *iommu;
2490     u32    i;
2491     int rc;
2492 
2493     if ( !iommu_enabled )
2494         return 0;
2495 
2496     rc = iommu_flush_all();
2497     if ( unlikely(rc) )
2498     {
2499         printk(XENLOG_WARNING VTDPREFIX
2500                " suspend: IOMMU flush all failed: %d\n", rc);
2501 
2502         return rc;
2503     }
2504 
2505     for_each_drhd_unit ( drhd )
2506     {
2507         iommu = drhd->iommu;
2508         i = iommu->index;
2509 
2510         iommu_state[i][DMAR_FECTL_REG] =
2511             (u32) dmar_readl(iommu->reg, DMAR_FECTL_REG);
2512         iommu_state[i][DMAR_FEDATA_REG] =
2513             (u32) dmar_readl(iommu->reg, DMAR_FEDATA_REG);
2514         iommu_state[i][DMAR_FEADDR_REG] =
2515             (u32) dmar_readl(iommu->reg, DMAR_FEADDR_REG);
2516         iommu_state[i][DMAR_FEUADDR_REG] =
2517             (u32) dmar_readl(iommu->reg, DMAR_FEUADDR_REG);
2518 
2519         /* don't disable VT-d engine when force_iommu is set. */
2520         if ( force_iommu )
2521             continue;
2522 
2523         iommu_disable_translation(iommu);
2524 
2525         /* If interrupt remapping is enabled, queued invalidation
2526          * will be disabled when interrupt remapping is disabled
2527          * during local APIC suspend.
2528          */
2529         if ( !iommu_intremap && iommu_qinval )
2530             disable_qinval(iommu);
2531     }
2532 
2533     return 0;
2534 }
2535 
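/*
 * On crash, flush everything and disable translation, interrupt remapping
 * and queued invalidation on all IOMMUs.
 */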
2536 static void vtd_crash_shutdown(void)
2537 {
2538     struct acpi_drhd_unit *drhd;
2539     struct iommu *iommu;
2540 
2541     if ( !iommu_enabled )
2542         return;
2543 
2544     if ( iommu_flush_all() )
2545         printk(XENLOG_WARNING VTDPREFIX
2546                " crash shutdown: IOMMU flush all failed\n");
2547 
2548     for_each_drhd_unit ( drhd )
2549     {
2550         iommu = drhd->iommu;
2551         iommu_disable_translation(iommu);
2552         disable_intremap(drhd->iommu);
2553         disable_qinval(drhd->iommu);
2554     }
2555 }
2556 
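/*
 * Reprogram the VT-d hardware via init_vtd_hw(), restore the saved
 * fault-event registers and re-enable translation.
 */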
2557 static void vtd_resume(void)
2558 {
2559     struct acpi_drhd_unit *drhd;
2560     struct iommu *iommu;
2561     u32 i;
2562     unsigned long flags;
2563 
2564     if ( !iommu_enabled )
2565         return;
2566 
2567     if ( init_vtd_hw() != 0  && force_iommu )
2568          panic("IOMMU setup failed, crash Xen for security purpose");
2569 
2570     for_each_drhd_unit ( drhd )
2571     {
2572         iommu = drhd->iommu;
2573         i = iommu->index;
2574 
2575         spin_lock_irqsave(&iommu->register_lock, flags);
2576         dmar_writel(iommu->reg, DMAR_FECTL_REG,
2577                     (u32) iommu_state[i][DMAR_FECTL_REG]);
2578         dmar_writel(iommu->reg, DMAR_FEDATA_REG,
2579                     (u32) iommu_state[i][DMAR_FEDATA_REG]);
2580         dmar_writel(iommu->reg, DMAR_FEADDR_REG,
2581                     (u32) iommu_state[i][DMAR_FEADDR_REG]);
2582         dmar_writel(iommu->reg, DMAR_FEUADDR_REG,
2583                     (u32) iommu_state[i][DMAR_FEUADDR_REG]);
2584         spin_unlock_irqrestore(&iommu->register_lock, flags);
2585 
2586         iommu_enable_translation(drhd);
2587     }
2588 }
2589 
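/* Recursively dump the gfn -> mfn mappings of one VT-d page table level. */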
2590 static void vtd_dump_p2m_table_level(paddr_t pt_maddr, int level, paddr_t gpa,
2591                                      int indent)
2592 {
2593     paddr_t address;
2594     int i;
2595     struct dma_pte *pt_vaddr, *pte;
2596     int next_level;
2597 
2598     if ( level < 1 )
2599         return;
2600 
2601     pt_vaddr = map_vtd_domain_page(pt_maddr);
2602     if ( pt_vaddr == NULL )
2603     {
2604         printk("Failed to map VT-D domain page %"PRIpaddr"\n", pt_maddr);
2605         return;
2606     }
2607 
2608     next_level = level - 1;
2609     for ( i = 0; i < PTE_NUM; i++ )
2610     {
2611         if ( !(i % 2) )
2612             process_pending_softirqs();
2613 
2614         pte = &pt_vaddr[i];
2615         if ( !dma_pte_present(*pte) )
2616             continue;
2617 
2618         address = gpa + offset_level_address(i, level);
2619         if ( next_level >= 1 )
2620             vtd_dump_p2m_table_level(dma_pte_addr(*pte), next_level,
2621                                      address, indent + 1);
2622         else
2623             printk("%*sgfn: %08lx mfn: %08lx\n",
2624                    indent, "",
2625                    (unsigned long)(address >> PAGE_SHIFT_4K),
2626                    (unsigned long)(dma_pte_addr(*pte) >> PAGE_SHIFT_4K));
2627     }
2628 
2629     unmap_vtd_domain_page(pt_vaddr);
2630 }
2631 
2632 static void vtd_dump_p2m_table(struct domain *d)
2633 {
2634     const struct domain_iommu *hd;
2635 
2636     if ( list_empty(&acpi_drhd_units) )
2637         return;
2638 
2639     hd = dom_iommu(d);
2640     printk("p2m table has %d levels\n", agaw_to_level(hd->arch.agaw));
2641     vtd_dump_p2m_table_level(hd->arch.pgd_maddr, agaw_to_level(hd->arch.agaw), 0, 0);
2642 }
2643 
2644 const struct iommu_ops intel_iommu_ops = {
2645     .init = intel_iommu_domain_init,
2646     .hwdom_init = intel_iommu_hwdom_init,
2647     .add_device = intel_iommu_add_device,
2648     .enable_device = intel_iommu_enable_device,
2649     .remove_device = intel_iommu_remove_device,
2650     .assign_device  = intel_iommu_assign_device,
2651     .teardown = iommu_domain_teardown,
2652     .map_page = intel_iommu_map_page,
2653     .unmap_page = intel_iommu_unmap_page,
2654     .free_page_table = iommu_free_page_table,
2655     .reassign_device = reassign_device_ownership,
2656     .get_device_group_id = intel_iommu_group_id,
2657     .update_ire_from_apic = io_apic_write_remap_rte,
2658     .update_ire_from_msi = msi_msg_write_remap_rte,
2659     .read_apic_from_ire = io_apic_read_remap_rte,
2660     .read_msi_from_ire = msi_msg_read_remap_rte,
2661     .setup_hpet_msi = intel_setup_hpet_msi,
2662     .suspend = vtd_suspend,
2663     .resume = vtd_resume,
2664     .share_p2m = iommu_set_pgd,
2665     .crash_shutdown = vtd_crash_shutdown,
2666     .iotlb_flush = iommu_flush_iotlb_pages,
2667     .iotlb_flush_all = iommu_flush_iotlb_all,
2668     .get_reserved_device_memory = intel_iommu_get_reserved_device_memory,
2669     .dump_p2m_table = vtd_dump_p2m_table,
2670 };
2671 
2672 /*
2673  * Local variables:
2674  * mode: C
2675  * c-file-style: "BSD"
2676  * c-basic-offset: 4
2677  * tab-width: 4
2678  * indent-tabs-mode: nil
2679  * End:
2680  */
2681