/*
 * Copyright (c) 2006, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
 * Place - Suite 330, Boston, MA 02111-1307 USA.
 *
 * Copyright (C) 2006-2008 Intel Corporation
 * Author: Ashok Raj <ashok.raj@intel.com>
 * Author: Shaohua Li <shaohua.li@intel.com>
 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 * Author: Fenghua Yu <fenghua.yu@intel.com>
 */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/export.h>
28 #include <linux/slab.h>
29 #include <linux/irq.h>
30 #include <linux/interrupt.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/iommu.h>
39 #include <linux/intel-iommu.h>
40 #include <linux/syscore_ops.h>
41 #include <linux/tboot.h>
42 #include <linux/dmi.h>
43 #include <linux/pci-ats.h>
44 #include <linux/memblock.h>
45 #include <asm/cacheflush.h>
46 #include <asm/iommu.h>
47
48 #define ROOT_SIZE VTD_PAGE_SIZE
49 #define CONTEXT_SIZE VTD_PAGE_SIZE
50
51 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
52 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
53 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
54
55 #define IOAPIC_RANGE_START (0xfee00000)
56 #define IOAPIC_RANGE_END (0xfeefffff)
57 #define IOVA_START_ADDR (0x1000)
58
59 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
60
61 #define MAX_AGAW_WIDTH 64
62
63 #define __DOMAIN_MAX_PFN(gaw) ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
64 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
65
66 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
67 to match. That way, we can use 'unsigned long' for PFNs with impunity. */
68 #define DOMAIN_MAX_PFN(gaw) ((unsigned long) min_t(uint64_t, \
69 __DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
70 #define DOMAIN_MAX_ADDR(gaw) (((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
71
72 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
73 #define DMA_32BIT_PFN IOVA_PFN(DMA_BIT_MASK(32))
74 #define DMA_64BIT_PFN IOVA_PFN(DMA_BIT_MASK(64))
75
76 /* page table handling */
77 #define LEVEL_STRIDE (9)
78 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
79
/*
 * This bitmap is used to advertise the page sizes our hardware supports
 * to the IOMMU core, which will then use this information to split
 * physically contiguous memory regions it is mapping into page sizes
 * that we support.
 *
 * Traditionally the IOMMU core just handed us the mappings directly,
 * after making sure the size is an order of a 4KiB page and that the
 * mapping has natural alignment.
 *
 * To retain this behavior, we currently advertise that we support
 * all page sizes that are an order of 4KiB.
 *
 * If at some point we'd like to utilize the IOMMU core's new behavior,
 * we could change this to advertise the real page sizes we support.
 */
#define INTEL_IOMMU_PGSIZES	(~0xFFFUL)
97
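/*
 * agaw ("adjusted guest address width") encodes the depth of the page
 * table: level = agaw + 2 and width = 30 + agaw * 9 bits.  For example,
 * agaw 2 corresponds to a 4-level table covering a 48-bit address space.
 */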
static inline int agaw_to_level(int agaw)
99 {
100 return agaw + 2;
101 }
102
static inline int agaw_to_width(int agaw)
104 {
105 return 30 + agaw * LEVEL_STRIDE;
106 }
107
static inline int width_to_agaw(int width)
109 {
110 return (width - 30) / LEVEL_STRIDE;
111 }
112
static inline unsigned int level_to_offset_bits(int level)
114 {
115 return (level - 1) * LEVEL_STRIDE;
116 }
117
static inline int pfn_level_offset(unsigned long pfn, int level)
119 {
120 return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
121 }
122
static inline unsigned long level_mask(int level)
124 {
125 return -1UL << level_to_offset_bits(level);
126 }
127
static inline unsigned long level_size(int level)
129 {
130 return 1UL << level_to_offset_bits(level);
131 }
132
static inline unsigned long align_to_level(unsigned long pfn, int level)
134 {
135 return (pfn + level_size(level) - 1) & level_mask(level);
136 }
137
static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
139 {
140 return 1 << ((lvl - 1) * LEVEL_STRIDE);
141 }
142
/* VT-d pages must never be larger than MM pages. Otherwise things
   are never going to work. */
static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
146 {
147 return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
148 }
149
static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
151 {
152 return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
153 }
static inline unsigned long page_to_dma_pfn(struct page *pg)
155 {
156 return mm_to_dma_pfn(page_to_pfn(pg));
157 }
static inline unsigned long virt_to_dma_pfn(void *p)
159 {
160 return page_to_dma_pfn(virt_to_page(p));
161 }
162
163 /* global iommu list, set NULL for ignored DMAR units */
164 static struct intel_iommu **g_iommus;
165
166 static void __init check_tylersburg_isoch(void);
167 static int rwbf_quirk;
168
169 /*
170 * set to 1 to panic kernel if can't successfully enable VT-d
171 * (used when kernel is launched w/ TXT)
172 */
173 static int force_on = 0;
174
175 /*
176 * 0: Present
177 * 1-11: Reserved
178 * 12-63: Context Ptr (12 - (haw-1))
179 * 64-127: Reserved
180 */
181 struct root_entry {
182 u64 val;
183 u64 rsvd1;
184 };
185 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
static inline bool root_present(struct root_entry *root)
187 {
188 return (root->val & 1);
189 }
static inline void set_root_present(struct root_entry *root)
191 {
192 root->val |= 1;
193 }
static inline void set_root_value(struct root_entry *root, unsigned long value)
195 {
196 root->val |= value & VTD_PAGE_MASK;
197 }
198
static inline struct context_entry *
get_context_addr_from_root(struct root_entry *root)
201 {
202 return (struct context_entry *)
203 (root_present(root)?phys_to_virt(
204 root->val & VTD_PAGE_MASK) :
205 NULL);
206 }
207
208 /*
209 * low 64 bits:
210 * 0: present
211 * 1: fault processing disable
212 * 2-3: translation type
213 * 12-63: address space root
214 * high 64 bits:
215 * 0-2: address width
216 * 3-6: aval
217 * 8-23: domain id
218 */
219 struct context_entry {
220 u64 lo;
221 u64 hi;
222 };
223
static inline bool context_present(struct context_entry *context)
225 {
226 return (context->lo & 1);
227 }
static inline void context_set_present(struct context_entry *context)
229 {
230 context->lo |= 1;
231 }
232
static inline void context_set_fault_enable(struct context_entry *context)
234 {
235 context->lo &= (((u64)-1) << 2) | 1;
236 }
237
static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
240 {
241 context->lo &= (((u64)-1) << 4) | 3;
242 context->lo |= (value & 3) << 2;
243 }
244
static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
247 {
248 context->lo |= value & VTD_PAGE_MASK;
249 }
250
static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
253 {
254 context->hi |= value & 7;
255 }
256
static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
259 {
260 context->hi |= (value & ((1 << 16) - 1)) << 8;
261 }
262
static inline void context_clear_entry(struct context_entry *context)
264 {
265 context->lo = 0;
266 context->hi = 0;
267 }
268
/*
 * 0: readable
 * 1: writable
 * 2-6: reserved
 * 7: super page
 * 8-10: available
 * 11: snoop behavior
 * 12-63: Host physical address
 */
278 struct dma_pte {
279 u64 val;
280 };
281
static inline void dma_clear_pte(struct dma_pte *pte)
283 {
284 pte->val = 0;
285 }
286
static inline void dma_set_pte_readable(struct dma_pte *pte)
288 {
289 pte->val |= DMA_PTE_READ;
290 }
291
static inline void dma_set_pte_writable(struct dma_pte *pte)
293 {
294 pte->val |= DMA_PTE_WRITE;
295 }
296
static inline void dma_set_pte_snp(struct dma_pte *pte)
298 {
299 pte->val |= DMA_PTE_SNP;
300 }
301
static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
303 {
304 pte->val = (pte->val & ~3) | (prot & 3);
305 }
306
static inline u64 dma_pte_addr(struct dma_pte *pte)
308 {
309 #ifdef CONFIG_64BIT
310 return pte->val & VTD_PAGE_MASK;
311 #else
312 /* Must have a full atomic 64-bit read */
313 return __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
314 #endif
315 }
316
static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
318 {
319 pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
320 }
321
static inline bool dma_pte_present(struct dma_pte *pte)
323 {
324 return (pte->val & 3) != 0;
325 }
326
static inline bool dma_pte_superpage(struct dma_pte *pte)
328 {
329 return (pte->val & (1 << 7));
330 }
331
static inline int first_pte_in_page(struct dma_pte *pte)
333 {
334 return !((unsigned long)pte & ~VTD_PAGE_MASK);
335 }
336
/*
 * This domain is a static identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
343 static struct dmar_domain *si_domain;
344 static int hw_pass_through = 1;
345
346 /* devices under the same p2p bridge are owned in one domain */
347 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
348
/* domain represents a virtual machine; more than one device
 * across iommus may be owned in one domain, e.g. kvm guest.
 */
#define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)
353
/* si_domain contains multiple devices */
#define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
356
357 /* define the limit of IOMMUs supported in each domain */
358 #ifdef CONFIG_X86
359 # define IOMMU_UNITS_SUPPORTED MAX_IO_APICS
360 #else
361 # define IOMMU_UNITS_SUPPORTED 64
362 #endif
363
364 struct dmar_domain {
365 int id; /* domain id */
366 int nid; /* node id */
367 DECLARE_BITMAP(iommu_bmp, IOMMU_UNITS_SUPPORTED);
368 /* bitmap of iommus this domain uses*/
369
370 struct list_head devices; /* all devices' list */
371 struct iova_domain iovad; /* iova's that belong to this domain */
372
373 struct dma_pte *pgd; /* virtual address */
374 int gaw; /* max guest address width */
375
376 /* adjusted guest address width, 0 is level 2 30-bit */
377 int agaw;
378
379 int flags; /* flags to find out type of domain */
380
381 int iommu_coherency;/* indicate coherency of iommu access */
382 int iommu_snooping; /* indicate snooping control feature*/
383 int iommu_count; /* reference count of iommu */
384 int iommu_superpage;/* Level of superpages supported:
385 0 == 4KiB (no superpages), 1 == 2MiB,
386 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */
387 spinlock_t iommu_lock; /* protect iommu set in domain */
388 u64 max_addr; /* maximum mapped address */
389 };
390
391 /* PCI domain-device relationship */
392 struct device_domain_info {
393 struct list_head link; /* link to domain siblings */
394 struct list_head global; /* link to global list */
395 int segment; /* PCI domain */
396 u8 bus; /* PCI bus number */
397 u8 devfn; /* PCI devfn number */
398 struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
399 struct intel_iommu *iommu; /* IOMMU used by this device */
400 struct dmar_domain *domain; /* pointer to domain */
401 };
402
403 static void flush_unmaps_timeout(unsigned long data);
404
405 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
406
407 #define HIGH_WATER_MARK 250
408 struct deferred_flush_tables {
409 int next;
410 struct iova *iova[HIGH_WATER_MARK];
411 struct dmar_domain *domain[HIGH_WATER_MARK];
412 };
413
414 static struct deferred_flush_tables *deferred_flush;
415
416 /* bitmap for indexing intel_iommus */
417 static int g_num_of_iommus;
418
419 static DEFINE_SPINLOCK(async_umap_flush_lock);
420 static LIST_HEAD(unmaps_to_do);
421
422 static int timer_on;
423 static long list_size;
424
425 static void domain_remove_dev_info(struct dmar_domain *domain);
426
427 #ifdef CONFIG_INTEL_IOMMU_DEFAULT_ON
428 int dmar_disabled = 0;
429 #else
430 int dmar_disabled = 1;
431 #endif /*CONFIG_INTEL_IOMMU_DEFAULT_ON*/
432
433 int intel_iommu_enabled = 0;
434 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
435
436 static int dmar_map_gfx = 1;
437 static int dmar_forcedac;
438 static int intel_iommu_strict;
439 static int intel_iommu_superpage = 1;
440
441 int intel_iommu_gfx_mapped;
442 EXPORT_SYMBOL_GPL(intel_iommu_gfx_mapped);
443
444 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
445 static DEFINE_SPINLOCK(device_domain_lock);
446 static LIST_HEAD(device_domain_list);
447
448 static struct iommu_ops intel_iommu_ops;
449
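/* Parse the intel_iommu= boot options, e.g. "intel_iommu=on,strict". */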
static int __init intel_iommu_setup(char *str)
451 {
452 if (!str)
453 return -EINVAL;
454 while (*str) {
455 if (!strncmp(str, "on", 2)) {
456 dmar_disabled = 0;
457 printk(KERN_INFO "Intel-IOMMU: enabled\n");
458 } else if (!strncmp(str, "off", 3)) {
459 dmar_disabled = 1;
460 printk(KERN_INFO "Intel-IOMMU: disabled\n");
461 } else if (!strncmp(str, "igfx_off", 8)) {
462 dmar_map_gfx = 0;
463 printk(KERN_INFO
464 "Intel-IOMMU: disable GFX device mapping\n");
465 } else if (!strncmp(str, "forcedac", 8)) {
466 printk(KERN_INFO
467 "Intel-IOMMU: Forcing DAC for PCI devices\n");
468 dmar_forcedac = 1;
469 } else if (!strncmp(str, "strict", 6)) {
470 printk(KERN_INFO
471 "Intel-IOMMU: disable batched IOTLB flush\n");
472 intel_iommu_strict = 1;
473 } else if (!strncmp(str, "sp_off", 6)) {
474 printk(KERN_INFO
475 "Intel-IOMMU: disable supported super page\n");
476 intel_iommu_superpage = 0;
477 }
478
479 str += strcspn(str, ",");
480 while (*str == ',')
481 str++;
482 }
483 return 0;
484 }
485 __setup("intel_iommu=", intel_iommu_setup);
486
487 static struct kmem_cache *iommu_domain_cache;
488 static struct kmem_cache *iommu_devinfo_cache;
489 static struct kmem_cache *iommu_iova_cache;
490
static inline void *alloc_pgtable_page(int node)
492 {
493 struct page *page;
494 void *vaddr = NULL;
495
496 page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
497 if (page)
498 vaddr = page_address(page);
499 return vaddr;
500 }
501
static inline void free_pgtable_page(void *vaddr)
503 {
504 free_page((unsigned long)vaddr);
505 }
506
static inline void *alloc_domain_mem(void)
508 {
509 return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
510 }
511
static void free_domain_mem(void *vaddr)
513 {
514 kmem_cache_free(iommu_domain_cache, vaddr);
515 }
516
static inline void *alloc_devinfo_mem(void)
518 {
519 return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
520 }
521
static inline void free_devinfo_mem(void *vaddr)
523 {
524 kmem_cache_free(iommu_devinfo_cache, vaddr);
525 }
526
struct iova *alloc_iova_mem(void)
528 {
529 return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
530 }
531
void free_iova_mem(struct iova *iova)
533 {
534 kmem_cache_free(iommu_iova_cache, iova);
535 }
536
537
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
539 {
540 unsigned long sagaw;
541 int agaw = -1;
542
543 sagaw = cap_sagaw(iommu->cap);
544 for (agaw = width_to_agaw(max_gaw);
545 agaw >= 0; agaw--) {
546 if (test_bit(agaw, &sagaw))
547 break;
548 }
549
550 return agaw;
551 }
552
553 /*
554 * Calculate max SAGAW for each iommu.
555 */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
557 {
558 return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
559 }
560
/*
 * Calculate agaw for each iommu.
 * "SAGAW" may be different across iommus; use a default agaw, and
 * fall back to a smaller supported agaw for iommus that don't support
 * the default agaw.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
567 {
568 return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
569 }
570
/* This function only returns a single iommu in a domain */
static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
573 {
574 int iommu_id;
575
576 /* si_domain and vm domain should not get here. */
577 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
578 BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
579
580 iommu_id = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
581 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
582 return NULL;
583
584 return g_iommus[iommu_id];
585 }
586
static void domain_update_iommu_coherency(struct dmar_domain *domain)
588 {
589 int i;
590
591 i = find_first_bit(domain->iommu_bmp, g_num_of_iommus);
592
593 domain->iommu_coherency = i < g_num_of_iommus ? 1 : 0;
594
595 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
596 if (!ecap_coherent(g_iommus[i]->ecap)) {
597 domain->iommu_coherency = 0;
598 break;
599 }
600 }
601 }
602
static void domain_update_iommu_snooping(struct dmar_domain *domain)
604 {
605 int i;
606
607 domain->iommu_snooping = 1;
608
609 for_each_set_bit(i, domain->iommu_bmp, g_num_of_iommus) {
610 if (!ecap_sc_support(g_iommus[i]->ecap)) {
611 domain->iommu_snooping = 0;
612 break;
613 }
614 }
615 }
616
static void domain_update_iommu_superpage(struct dmar_domain *domain)
618 {
619 struct dmar_drhd_unit *drhd;
620 struct intel_iommu *iommu = NULL;
621 int mask = 0xf;
622
623 if (!intel_iommu_superpage) {
624 domain->iommu_superpage = 0;
625 return;
626 }
627
628 /* set iommu_superpage to the smallest common denominator */
629 for_each_active_iommu(iommu, drhd) {
630 mask &= cap_super_page_val(iommu->cap);
631 if (!mask) {
632 break;
633 }
634 }
635 domain->iommu_superpage = fls(mask);
636 }
637
638 /* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
640 {
641 domain_update_iommu_coherency(domain);
642 domain_update_iommu_snooping(domain);
643 domain_update_iommu_superpage(domain);
644 }
645
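/*
 * Find the DMAR unit covering a PCI device: either the device (or a
 * bridge above it) is listed in the unit's device scope, or the unit
 * is an include-all unit for the segment.
 */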
static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
647 {
648 struct dmar_drhd_unit *drhd = NULL;
649 int i;
650
651 for_each_drhd_unit(drhd) {
652 if (drhd->ignored)
653 continue;
654 if (segment != drhd->segment)
655 continue;
656
657 for (i = 0; i < drhd->devices_cnt; i++) {
658 if (drhd->devices[i] &&
659 drhd->devices[i]->bus->number == bus &&
660 drhd->devices[i]->devfn == devfn)
661 return drhd->iommu;
662 if (drhd->devices[i] &&
663 drhd->devices[i]->subordinate &&
664 drhd->devices[i]->subordinate->number <= bus &&
665 drhd->devices[i]->subordinate->subordinate >= bus)
666 return drhd->iommu;
667 }
668
669 if (drhd->include_all)
670 return drhd->iommu;
671 }
672
673 return NULL;
674 }
675
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
678 {
679 if (!domain->iommu_coherency)
680 clflush_cache_range(addr, size);
681 }
682
683 /* Gets context entry for a given bus and devfn */
static struct context_entry *device_to_context_entry(struct intel_iommu *iommu,
						     u8 bus, u8 devfn)
686 {
687 struct root_entry *root;
688 struct context_entry *context;
689 unsigned long phy_addr;
690 unsigned long flags;
691
692 spin_lock_irqsave(&iommu->lock, flags);
693 root = &iommu->root_entry[bus];
694 context = get_context_addr_from_root(root);
695 if (!context) {
696 context = (struct context_entry *)
697 alloc_pgtable_page(iommu->node);
698 if (!context) {
699 spin_unlock_irqrestore(&iommu->lock, flags);
700 return NULL;
701 }
702 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
703 phy_addr = virt_to_phys((void *)context);
704 set_root_value(root, phy_addr);
705 set_root_present(root);
706 __iommu_flush_cache(iommu, root, sizeof(*root));
707 }
708 spin_unlock_irqrestore(&iommu->lock, flags);
709 return &context[devfn];
710 }
711
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
713 {
714 struct root_entry *root;
715 struct context_entry *context;
716 int ret;
717 unsigned long flags;
718
719 spin_lock_irqsave(&iommu->lock, flags);
720 root = &iommu->root_entry[bus];
721 context = get_context_addr_from_root(root);
722 if (!context) {
723 ret = 0;
724 goto out;
725 }
726 ret = context_present(&context[devfn]);
727 out:
728 spin_unlock_irqrestore(&iommu->lock, flags);
729 return ret;
730 }
731
static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
733 {
734 struct root_entry *root;
735 struct context_entry *context;
736 unsigned long flags;
737
738 spin_lock_irqsave(&iommu->lock, flags);
739 root = &iommu->root_entry[bus];
740 context = get_context_addr_from_root(root);
741 if (context) {
742 context_clear_entry(&context[devfn]);
743 __iommu_flush_cache(iommu, &context[devfn], \
744 sizeof(*context));
745 }
746 spin_unlock_irqrestore(&iommu->lock, flags);
747 }
748
static void free_context_table(struct intel_iommu *iommu)
750 {
751 struct root_entry *root;
752 int i;
753 unsigned long flags;
754 struct context_entry *context;
755
756 spin_lock_irqsave(&iommu->lock, flags);
757 if (!iommu->root_entry) {
758 goto out;
759 }
760 for (i = 0; i < ROOT_ENTRY_NR; i++) {
761 root = &iommu->root_entry[i];
762 context = get_context_addr_from_root(root);
763 if (context)
764 free_pgtable_page(context);
765 }
766 free_pgtable_page(iommu->root_entry);
767 iommu->root_entry = NULL;
768 out:
769 spin_unlock_irqrestore(&iommu->lock, flags);
770 }
771
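/*
 * Walk the page table down to the entry covering @pfn at @target_level,
 * allocating intermediate page-table pages as needed.  A target_level of
 * 0 returns the leaf entry, stopping early at a superpage or a
 * non-present entry.
 */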
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int target_level)
774 {
775 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
776 struct dma_pte *parent, *pte = NULL;
777 int level = agaw_to_level(domain->agaw);
778 int offset;
779
780 BUG_ON(!domain->pgd);
781
782 if (addr_width < BITS_PER_LONG && pfn >> addr_width)
783 /* Address beyond IOMMU's addressing capabilities. */
784 return NULL;
785
786 parent = domain->pgd;
787
788 while (level > 0) {
789 void *tmp_page;
790
791 offset = pfn_level_offset(pfn, level);
792 pte = &parent[offset];
793 if (!target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
794 break;
795 if (level == target_level)
796 break;
797
798 if (!dma_pte_present(pte)) {
799 uint64_t pteval;
800
801 tmp_page = alloc_pgtable_page(domain->nid);
802
803 if (!tmp_page)
804 return NULL;
805
806 domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
807 pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
808 if (cmpxchg64(&pte->val, 0ULL, pteval)) {
809 /* Someone else set it while we were thinking; use theirs. */
810 free_pgtable_page(tmp_page);
811 } else {
812 dma_pte_addr(pte);
813 domain_flush_cache(domain, pte, sizeof(*pte));
814 }
815 }
816 parent = phys_to_virt(dma_pte_addr(pte));
817 level--;
818 }
819
820 return pte;
821 }
822
823
824 /* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
828 {
829 struct dma_pte *parent, *pte = NULL;
830 int total = agaw_to_level(domain->agaw);
831 int offset;
832
833 parent = domain->pgd;
834 while (level <= total) {
835 offset = pfn_level_offset(pfn, total);
836 pte = &parent[offset];
837 if (level == total)
838 return pte;
839
840 if (!dma_pte_present(pte)) {
841 *large_page = total;
842 break;
843 }
844
845 if (pte->val & DMA_PTE_LARGE_PAGE) {
846 *large_page = total;
847 return pte;
848 }
849
850 parent = phys_to_virt(dma_pte_addr(pte));
851 total--;
852 }
853 return NULL;
854 }
855
856 /* clear last level pte, a tlb flush should be followed */
static int dma_pte_clear_range(struct dmar_domain *domain,
			       unsigned long start_pfn,
			       unsigned long last_pfn)
860 {
861 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
862 unsigned int large_page = 1;
863 struct dma_pte *first_pte, *pte;
864 int order;
865
866 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
867 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
868 BUG_ON(start_pfn > last_pfn);
869
870 /* we don't need lock here; nobody else touches the iova range */
871 do {
872 large_page = 1;
873 first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
874 if (!pte) {
875 start_pfn = align_to_level(start_pfn + 1, large_page + 1);
876 continue;
877 }
878 do {
879 dma_clear_pte(pte);
880 start_pfn += lvl_to_nr_pages(large_page);
881 pte++;
882 } while (start_pfn <= last_pfn && !first_pte_in_page(pte));
883
884 domain_flush_cache(domain, first_pte,
885 (void *)pte - (void *)first_pte);
886
887 } while (start_pfn && start_pfn <= last_pfn);
888
889 order = (large_page - 1) * 9;
890 return order;
891 }
892
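/*
 * Recursively free page-table pages whose whole range falls inside
 * [start_pfn, last_pfn].  The leaf PTEs are expected to have been
 * cleared by dma_pte_clear_range() already.
 */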
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       struct dma_pte *pte, unsigned long pfn,
			       unsigned long start_pfn, unsigned long last_pfn)
896 {
897 pfn = max(start_pfn, pfn);
898 pte = &pte[pfn_level_offset(pfn, level)];
899
900 do {
901 unsigned long level_pfn;
902 struct dma_pte *level_pte;
903
904 if (!dma_pte_present(pte) || dma_pte_superpage(pte))
905 goto next;
906
907 level_pfn = pfn & level_mask(level - 1);
908 level_pte = phys_to_virt(dma_pte_addr(pte));
909
910 if (level > 2)
911 dma_pte_free_level(domain, level - 1, level_pte,
912 level_pfn, start_pfn, last_pfn);
913
914 /* If range covers entire pagetable, free it */
915 if (!(start_pfn > level_pfn ||
916 last_pfn < level_pfn + level_size(level) - 1)) {
917 dma_clear_pte(pte);
918 domain_flush_cache(domain, pte, sizeof(*pte));
919 free_pgtable_page(level_pte);
920 }
921 next:
922 pfn += level_size(level);
923 } while (!first_pte_in_page(++pte) && pfn <= last_pfn);
924 }
925
926 /* free page table pages. last level pte should already be cleared */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn)
930 {
931 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
932
933 BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
934 BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
935 BUG_ON(start_pfn > last_pfn);
936
937 /* We don't need lock here; nobody else touches the iova range */
938 dma_pte_free_level(domain, agaw_to_level(domain->agaw),
939 domain->pgd, 0, start_pfn, last_pfn);
940
941 /* free pgd */
942 if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
943 free_pgtable_page(domain->pgd);
944 domain->pgd = NULL;
945 }
946 }
947
948 /* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
950 {
951 struct root_entry *root;
952 unsigned long flags;
953
954 root = (struct root_entry *)alloc_pgtable_page(iommu->node);
955 if (!root)
956 return -ENOMEM;
957
958 __iommu_flush_cache(iommu, root, ROOT_SIZE);
959
960 spin_lock_irqsave(&iommu->lock, flags);
961 iommu->root_entry = root;
962 spin_unlock_irqrestore(&iommu->lock, flags);
963
964 return 0;
965 }
966
static void iommu_set_root_entry(struct intel_iommu *iommu)
968 {
969 void *addr;
970 u32 sts;
971 unsigned long flag;
972
973 addr = iommu->root_entry;
974
975 raw_spin_lock_irqsave(&iommu->register_lock, flag);
976 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
977
978 writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
979
980 /* Make sure hardware complete it */
981 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
982 readl, (sts & DMA_GSTS_RTPS), sts);
983
984 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
985 }
986
static void iommu_flush_write_buffer(struct intel_iommu *iommu)
988 {
989 u32 val;
990 unsigned long flag;
991
992 if (!rwbf_quirk && !cap_rwbf(iommu->cap))
993 return;
994
995 raw_spin_lock_irqsave(&iommu->register_lock, flag);
996 writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
997
998 /* Make sure hardware complete it */
999 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1000 readl, (!(val & DMA_GSTS_WBFS)), val);
1001
1002 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1003 }
1004
1005 /* return value determine if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
1009 {
1010 u64 val = 0;
1011 unsigned long flag;
1012
1013 switch (type) {
1014 case DMA_CCMD_GLOBAL_INVL:
1015 val = DMA_CCMD_GLOBAL_INVL;
1016 break;
1017 case DMA_CCMD_DOMAIN_INVL:
1018 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1019 break;
1020 case DMA_CCMD_DEVICE_INVL:
1021 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1022 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1023 break;
1024 default:
1025 BUG();
1026 }
1027 val |= DMA_CCMD_ICC;
1028
1029 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1030 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1031
1032 /* Make sure hardware complete it */
1033 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1034 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1035
1036 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1037 }
1038
1039 /* return value determine if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
1042 {
1043 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1044 u64 val = 0, val_iva = 0;
1045 unsigned long flag;
1046
1047 switch (type) {
1048 case DMA_TLB_GLOBAL_FLUSH:
1049 /* global flush doesn't need set IVA_REG */
1050 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1051 break;
1052 case DMA_TLB_DSI_FLUSH:
1053 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1054 break;
1055 case DMA_TLB_PSI_FLUSH:
1056 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1057 /* Note: always flush non-leaf currently */
1058 val_iva = size_order | addr;
1059 break;
1060 default:
1061 BUG();
1062 }
1063 /* Note: set drain read/write */
1064 #if 0
1065 /*
1066 * This is probably to be super secure.. Looks like we can
1067 * ignore it without any impact.
1068 */
1069 if (cap_read_drain(iommu->cap))
1070 val |= DMA_TLB_READ_DRAIN;
1071 #endif
1072 if (cap_write_drain(iommu->cap))
1073 val |= DMA_TLB_WRITE_DRAIN;
1074
1075 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1076 /* Note: Only uses first TLB reg currently */
1077 if (val_iva)
1078 dmar_writeq(iommu->reg + tlb_offset, val_iva);
1079 dmar_writeq(iommu->reg + tlb_offset + 8, val);
1080
1081 /* Make sure hardware complete it */
1082 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1083 dmar_readq, (!(val & DMA_TLB_IVT)), val);
1084
1085 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1086
1087 /* check IOTLB invalidation granularity */
1088 if (DMA_TLB_IAIG(val) == 0)
1089 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
1090 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1091 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
1092 (unsigned long long)DMA_TLB_IIRG(type),
1093 (unsigned long long)DMA_TLB_IAIG(val));
1094 }
1095
static struct device_domain_info *iommu_support_dev_iotlb(
	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
1098 {
1099 int found = 0;
1100 unsigned long flags;
1101 struct device_domain_info *info;
1102 struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
1103
1104 if (!ecap_dev_iotlb_support(iommu->ecap))
1105 return NULL;
1106
1107 if (!iommu->qi)
1108 return NULL;
1109
1110 spin_lock_irqsave(&device_domain_lock, flags);
1111 list_for_each_entry(info, &domain->devices, link)
1112 if (info->bus == bus && info->devfn == devfn) {
1113 found = 1;
1114 break;
1115 }
1116 spin_unlock_irqrestore(&device_domain_lock, flags);
1117
1118 if (!found || !info->dev)
1119 return NULL;
1120
1121 if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1122 return NULL;
1123
1124 if (!dmar_find_matched_atsr_unit(info->dev))
1125 return NULL;
1126
1127 info->iommu = iommu;
1128
1129 return info;
1130 }
1131
static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1133 {
1134 if (!info)
1135 return;
1136
1137 pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1138 }
1139
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1141 {
1142 if (!info->dev || !pci_ats_enabled(info->dev))
1143 return;
1144
1145 pci_disable_ats(info->dev);
1146 }
1147
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
1150 {
1151 u16 sid, qdep;
1152 unsigned long flags;
1153 struct device_domain_info *info;
1154
1155 spin_lock_irqsave(&device_domain_lock, flags);
1156 list_for_each_entry(info, &domain->devices, link) {
1157 if (!info->dev || !pci_ats_enabled(info->dev))
1158 continue;
1159
1160 sid = info->bus << 8 | info->devfn;
1161 qdep = pci_ats_queue_depth(info->dev);
1162 qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1163 }
1164 spin_unlock_irqrestore(&device_domain_lock, flags);
1165 }
1166
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
				  unsigned long pfn, unsigned int pages, int map)
1169 {
1170 unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1171 uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1172
1173 BUG_ON(pages == 0);
1174
1175 /*
1176 * Fallback to domain selective flush if no PSI support or the size is
1177 * too big.
1178 * PSI requires page size to be 2 ^ x, and the base address is naturally
1179 * aligned to the size
1180 */
1181 if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1182 iommu->flush.flush_iotlb(iommu, did, 0, 0,
1183 DMA_TLB_DSI_FLUSH);
1184 else
1185 iommu->flush.flush_iotlb(iommu, did, addr, mask,
1186 DMA_TLB_PSI_FLUSH);
1187
1188 /*
1189 * In caching mode, changes of pages from non-present to present require
1190 * flush. However, device IOTLB doesn't need to be flushed in this case.
1191 */
1192 if (!cap_caching_mode(iommu->cap) || !map)
1193 iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1194 }
1195
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1197 {
1198 u32 pmen;
1199 unsigned long flags;
1200
1201 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1202 pmen = readl(iommu->reg + DMAR_PMEN_REG);
1203 pmen &= ~DMA_PMEN_EPM;
1204 writel(pmen, iommu->reg + DMAR_PMEN_REG);
1205
1206 /* wait for the protected region status bit to clear */
1207 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1208 readl, !(pmen & DMA_PMEN_PRS), pmen);
1209
1210 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1211 }
1212
static int iommu_enable_translation(struct intel_iommu *iommu)
1214 {
1215 u32 sts;
1216 unsigned long flags;
1217
1218 raw_spin_lock_irqsave(&iommu->register_lock, flags);
1219 iommu->gcmd |= DMA_GCMD_TE;
1220 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1221
1222 /* Make sure hardware complete it */
1223 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1224 readl, (sts & DMA_GSTS_TES), sts);
1225
1226 raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1227 return 0;
1228 }
1229
static int iommu_disable_translation(struct intel_iommu *iommu)
1231 {
1232 u32 sts;
1233 unsigned long flag;
1234
1235 raw_spin_lock_irqsave(&iommu->register_lock, flag);
1236 iommu->gcmd &= ~DMA_GCMD_TE;
1237 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1238
1239 /* Make sure hardware complete it */
1240 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1241 readl, (!(sts & DMA_GSTS_TES)), sts);
1242
1243 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1244 return 0;
1245 }
1246
1247
static int iommu_init_domains(struct intel_iommu *iommu)
1249 {
1250 unsigned long ndomains;
1251 unsigned long nlongs;
1252
1253 ndomains = cap_ndoms(iommu->cap);
1254 pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1255 ndomains);
1256 nlongs = BITS_TO_LONGS(ndomains);
1257
1258 spin_lock_init(&iommu->lock);
1259
1260 /* TBD: there might be 64K domains,
1261 * consider other allocation for future chip
1262 */
1263 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1264 if (!iommu->domain_ids) {
1265 printk(KERN_ERR "Allocating domain id array failed\n");
1266 return -ENOMEM;
1267 }
1268 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1269 GFP_KERNEL);
1270 if (!iommu->domains) {
1271 printk(KERN_ERR "Allocating domain array failed\n");
1272 return -ENOMEM;
1273 }
1274
1275 /*
1276 * if Caching mode is set, then invalid translations are tagged
1277 * with domainid 0. Hence we need to pre-allocate it.
1278 */
1279 if (cap_caching_mode(iommu->cap))
1280 set_bit(0, iommu->domain_ids);
1281 return 0;
1282 }
1283
1284
1285 static void domain_exit(struct dmar_domain *domain);
1286 static void vm_domain_exit(struct dmar_domain *domain);
1287
void free_dmar_iommu(struct intel_iommu *iommu)
1289 {
1290 struct dmar_domain *domain;
1291 int i;
1292 unsigned long flags;
1293
1294 if ((iommu->domains) && (iommu->domain_ids)) {
1295 for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1296 domain = iommu->domains[i];
1297 clear_bit(i, iommu->domain_ids);
1298
1299 spin_lock_irqsave(&domain->iommu_lock, flags);
1300 if (--domain->iommu_count == 0) {
1301 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1302 vm_domain_exit(domain);
1303 else
1304 domain_exit(domain);
1305 }
1306 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1307 }
1308 }
1309
1310 if (iommu->gcmd & DMA_GCMD_TE)
1311 iommu_disable_translation(iommu);
1312
1313 if (iommu->irq) {
1314 irq_set_handler_data(iommu->irq, NULL);
1315 /* This will mask the irq */
1316 free_irq(iommu->irq, iommu);
1317 destroy_irq(iommu->irq);
1318 }
1319
1320 kfree(iommu->domains);
1321 kfree(iommu->domain_ids);
1322
1323 g_iommus[iommu->seq_id] = NULL;
1324
1325 /* if all iommus are freed, free g_iommus */
1326 for (i = 0; i < g_num_of_iommus; i++) {
1327 if (g_iommus[i])
1328 break;
1329 }
1330
1331 if (i == g_num_of_iommus)
1332 kfree(g_iommus);
1333
1334 /* free context mapping */
1335 free_context_table(iommu);
1336 }
1337
static struct dmar_domain *alloc_domain(void)
1339 {
1340 struct dmar_domain *domain;
1341
1342 domain = alloc_domain_mem();
1343 if (!domain)
1344 return NULL;
1345
1346 domain->nid = -1;
1347 memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
1348 domain->flags = 0;
1349
1350 return domain;
1351 }
1352
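/* Allocate a free domain id on @iommu and bind @domain to it. */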
static int iommu_attach_domain(struct dmar_domain *domain,
			       struct intel_iommu *iommu)
1355 {
1356 int num;
1357 unsigned long ndomains;
1358 unsigned long flags;
1359
1360 ndomains = cap_ndoms(iommu->cap);
1361
1362 spin_lock_irqsave(&iommu->lock, flags);
1363
1364 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1365 if (num >= ndomains) {
1366 spin_unlock_irqrestore(&iommu->lock, flags);
1367 printk(KERN_ERR "IOMMU: no free domain ids\n");
1368 return -ENOMEM;
1369 }
1370
1371 domain->id = num;
1372 set_bit(num, iommu->domain_ids);
1373 set_bit(iommu->seq_id, domain->iommu_bmp);
1374 iommu->domains[num] = domain;
1375 spin_unlock_irqrestore(&iommu->lock, flags);
1376
1377 return 0;
1378 }
1379
static void iommu_detach_domain(struct dmar_domain *domain,
				struct intel_iommu *iommu)
1382 {
1383 unsigned long flags;
1384 int num, ndomains;
1385 int found = 0;
1386
1387 spin_lock_irqsave(&iommu->lock, flags);
1388 ndomains = cap_ndoms(iommu->cap);
1389 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1390 if (iommu->domains[num] == domain) {
1391 found = 1;
1392 break;
1393 }
1394 }
1395
1396 if (found) {
1397 clear_bit(num, iommu->domain_ids);
1398 clear_bit(iommu->seq_id, domain->iommu_bmp);
1399 iommu->domains[num] = NULL;
1400 }
1401 spin_unlock_irqrestore(&iommu->lock, flags);
1402 }
1403
1404 static struct iova_domain reserved_iova_list;
1405 static struct lock_class_key reserved_rbtree_key;
1406
static int dmar_init_reserved_ranges(void)
1408 {
1409 struct pci_dev *pdev = NULL;
1410 struct iova *iova;
1411 int i;
1412
1413 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1414
1415 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1416 &reserved_rbtree_key);
1417
1418 /* IOAPIC ranges shouldn't be accessed by DMA */
1419 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1420 IOVA_PFN(IOAPIC_RANGE_END));
1421 if (!iova) {
1422 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1423 return -ENODEV;
1424 }
1425
1426 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1427 for_each_pci_dev(pdev) {
1428 struct resource *r;
1429
1430 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1431 r = &pdev->resource[i];
1432 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1433 continue;
1434 iova = reserve_iova(&reserved_iova_list,
1435 IOVA_PFN(r->start),
1436 IOVA_PFN(r->end));
1437 if (!iova) {
1438 printk(KERN_ERR "Reserve iova failed\n");
1439 return -ENODEV;
1440 }
1441 }
1442 }
1443 return 0;
1444 }
1445
static void domain_reserve_special_ranges(struct dmar_domain *domain)
1447 {
1448 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1449 }
1450
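/*
 * Round a guest address width up to the next width the page-table format
 * can cover, i.e. one where (gaw - 12) is a multiple of 9.  For example,
 * a gaw of 36 becomes 39, while 39 and 48 are kept as-is.
 */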
static inline int guestwidth_to_adjustwidth(int gaw)
1452 {
1453 int agaw;
1454 int r = (gaw - 12) % 9;
1455
1456 if (r == 0)
1457 agaw = gaw;
1458 else
1459 agaw = gaw + 9 - r;
1460 if (agaw > 64)
1461 agaw = 64;
1462 return agaw;
1463 }
1464
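/*
 * Initialize a newly attached domain: reserve the special IOVA ranges,
 * pick an agaw the hardware supports for the requested guest width and
 * allocate the top-level page directory.
 */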
static int domain_init(struct dmar_domain *domain, int guest_width)
1466 {
1467 struct intel_iommu *iommu;
1468 int adjust_width, agaw;
1469 unsigned long sagaw;
1470
1471 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1472 spin_lock_init(&domain->iommu_lock);
1473
1474 domain_reserve_special_ranges(domain);
1475
1476 /* calculate AGAW */
1477 iommu = domain_get_iommu(domain);
1478 if (guest_width > cap_mgaw(iommu->cap))
1479 guest_width = cap_mgaw(iommu->cap);
1480 domain->gaw = guest_width;
1481 adjust_width = guestwidth_to_adjustwidth(guest_width);
1482 agaw = width_to_agaw(adjust_width);
1483 sagaw = cap_sagaw(iommu->cap);
1484 if (!test_bit(agaw, &sagaw)) {
1485 /* hardware doesn't support it, choose a bigger one */
1486 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1487 agaw = find_next_bit(&sagaw, 5, agaw);
1488 if (agaw >= 5)
1489 return -ENODEV;
1490 }
1491 domain->agaw = agaw;
1492 INIT_LIST_HEAD(&domain->devices);
1493
1494 if (ecap_coherent(iommu->ecap))
1495 domain->iommu_coherency = 1;
1496 else
1497 domain->iommu_coherency = 0;
1498
1499 if (ecap_sc_support(iommu->ecap))
1500 domain->iommu_snooping = 1;
1501 else
1502 domain->iommu_snooping = 0;
1503
1504 domain->iommu_superpage = fls(cap_super_page_val(iommu->cap));
1505 domain->iommu_count = 1;
1506 domain->nid = iommu->node;
1507
1508 /* always allocate the top pgd */
1509 domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1510 if (!domain->pgd)
1511 return -ENOMEM;
1512 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1513 return 0;
1514 }
1515
static void domain_exit(struct dmar_domain *domain)
1517 {
1518 struct dmar_drhd_unit *drhd;
1519 struct intel_iommu *iommu;
1520
	/* Domain 0 is reserved, so don't process it */
1522 if (!domain)
1523 return;
1524
1525 /* Flush any lazy unmaps that may reference this domain */
1526 if (!intel_iommu_strict)
1527 flush_unmaps_timeout(0);
1528
1529 domain_remove_dev_info(domain);
1530 /* destroy iovas */
1531 put_iova_domain(&domain->iovad);
1532
1533 /* clear ptes */
1534 dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1535
1536 /* free page tables */
1537 dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1538
1539 for_each_active_iommu(iommu, drhd)
1540 if (test_bit(iommu->seq_id, domain->iommu_bmp))
1541 iommu_detach_domain(domain, iommu);
1542
1543 free_domain_mem(domain);
1544 }
1545
static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
				      u8 bus, u8 devfn, int translation)
1548 {
1549 struct context_entry *context;
1550 unsigned long flags;
1551 struct intel_iommu *iommu;
1552 struct dma_pte *pgd;
1553 unsigned long num;
1554 unsigned long ndomains;
1555 int id;
1556 int agaw;
1557 struct device_domain_info *info = NULL;
1558
1559 pr_debug("Set context mapping for %02x:%02x.%d\n",
1560 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1561
1562 BUG_ON(!domain->pgd);
1563 BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1564 translation != CONTEXT_TT_MULTI_LEVEL);
1565
1566 iommu = device_to_iommu(segment, bus, devfn);
1567 if (!iommu)
1568 return -ENODEV;
1569
1570 context = device_to_context_entry(iommu, bus, devfn);
1571 if (!context)
1572 return -ENOMEM;
1573 spin_lock_irqsave(&iommu->lock, flags);
1574 if (context_present(context)) {
1575 spin_unlock_irqrestore(&iommu->lock, flags);
1576 return 0;
1577 }
1578
1579 id = domain->id;
1580 pgd = domain->pgd;
1581
1582 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1583 domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1584 int found = 0;
1585
1586 /* find an available domain id for this device in iommu */
1587 ndomains = cap_ndoms(iommu->cap);
1588 for_each_set_bit(num, iommu->domain_ids, ndomains) {
1589 if (iommu->domains[num] == domain) {
1590 id = num;
1591 found = 1;
1592 break;
1593 }
1594 }
1595
1596 if (found == 0) {
1597 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1598 if (num >= ndomains) {
1599 spin_unlock_irqrestore(&iommu->lock, flags);
1600 printk(KERN_ERR "IOMMU: no free domain ids\n");
1601 return -EFAULT;
1602 }
1603
1604 set_bit(num, iommu->domain_ids);
1605 iommu->domains[num] = domain;
1606 id = num;
1607 }
1608
1609 /* Skip top levels of page tables for
1610 * iommu which has less agaw than default.
1611 * Unnecessary for PT mode.
1612 */
1613 if (translation != CONTEXT_TT_PASS_THROUGH) {
1614 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1615 pgd = phys_to_virt(dma_pte_addr(pgd));
1616 if (!dma_pte_present(pgd)) {
1617 spin_unlock_irqrestore(&iommu->lock, flags);
1618 return -ENOMEM;
1619 }
1620 }
1621 }
1622 }
1623
1624 context_set_domain_id(context, id);
1625
1626 if (translation != CONTEXT_TT_PASS_THROUGH) {
1627 info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1628 translation = info ? CONTEXT_TT_DEV_IOTLB :
1629 CONTEXT_TT_MULTI_LEVEL;
1630 }
1631 /*
1632 * In pass through mode, AW must be programmed to indicate the largest
1633 * AGAW value supported by hardware. And ASR is ignored by hardware.
1634 */
1635 if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1636 context_set_address_width(context, iommu->msagaw);
1637 else {
1638 context_set_address_root(context, virt_to_phys(pgd));
1639 context_set_address_width(context, iommu->agaw);
1640 }
1641
1642 context_set_translation_type(context, translation);
1643 context_set_fault_enable(context);
1644 context_set_present(context);
1645 domain_flush_cache(domain, context, sizeof(*context));
1646
	/*
	 * It's a non-present to present mapping. If hardware doesn't cache
	 * non-present entries we only need to flush the write-buffer. If it
	 * _does_ cache non-present entries, then it does so in the special
	 * domain #0, which we have to flush:
	 */
1653 if (cap_caching_mode(iommu->cap)) {
1654 iommu->flush.flush_context(iommu, 0,
1655 (((u16)bus) << 8) | devfn,
1656 DMA_CCMD_MASK_NOBIT,
1657 DMA_CCMD_DEVICE_INVL);
1658 iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1659 } else {
1660 iommu_flush_write_buffer(iommu);
1661 }
1662 iommu_enable_dev_iotlb(info);
1663 spin_unlock_irqrestore(&iommu->lock, flags);
1664
1665 spin_lock_irqsave(&domain->iommu_lock, flags);
1666 if (!test_and_set_bit(iommu->seq_id, domain->iommu_bmp)) {
1667 domain->iommu_count++;
1668 if (domain->iommu_count == 1)
1669 domain->nid = iommu->node;
1670 domain_update_iommu_cap(domain);
1671 }
1672 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1673 return 0;
1674 }
1675
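/*
 * Install context entries for the device and, if it sits behind a
 * PCIe-to-PCI bridge, for the bridges on the path as well, so that DMA
 * originating on the legacy segment is translated with this domain.
 */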
static int
domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
		       int translation)
1679 {
1680 int ret;
1681 struct pci_dev *tmp, *parent;
1682
1683 ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1684 pdev->bus->number, pdev->devfn,
1685 translation);
1686 if (ret)
1687 return ret;
1688
1689 /* dependent device mapping */
1690 tmp = pci_find_upstream_pcie_bridge(pdev);
1691 if (!tmp)
1692 return 0;
1693 /* Secondary interface's bus number and devfn 0 */
1694 parent = pdev->bus->self;
1695 while (parent != tmp) {
1696 ret = domain_context_mapping_one(domain,
1697 pci_domain_nr(parent->bus),
1698 parent->bus->number,
1699 parent->devfn, translation);
1700 if (ret)
1701 return ret;
1702 parent = parent->bus->self;
1703 }
1704 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1705 return domain_context_mapping_one(domain,
1706 pci_domain_nr(tmp->subordinate),
1707 tmp->subordinate->number, 0,
1708 translation);
1709 else /* this is a legacy PCI bridge */
1710 return domain_context_mapping_one(domain,
1711 pci_domain_nr(tmp->bus),
1712 tmp->bus->number,
1713 tmp->devfn,
1714 translation);
1715 }
1716
static int domain_context_mapped(struct pci_dev *pdev)
1718 {
1719 int ret;
1720 struct pci_dev *tmp, *parent;
1721 struct intel_iommu *iommu;
1722
1723 iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1724 pdev->devfn);
1725 if (!iommu)
1726 return -ENODEV;
1727
1728 ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1729 if (!ret)
1730 return ret;
1731 /* dependent device mapping */
1732 tmp = pci_find_upstream_pcie_bridge(pdev);
1733 if (!tmp)
1734 return ret;
1735 /* Secondary interface's bus number and devfn 0 */
1736 parent = pdev->bus->self;
1737 while (parent != tmp) {
1738 ret = device_context_mapped(iommu, parent->bus->number,
1739 parent->devfn);
1740 if (!ret)
1741 return ret;
1742 parent = parent->bus->self;
1743 }
1744 if (pci_is_pcie(tmp))
1745 return device_context_mapped(iommu, tmp->subordinate->number,
1746 0);
1747 else
1748 return device_context_mapped(iommu, tmp->bus->number,
1749 tmp->devfn);
1750 }
1751
1752 /* Returns a number of VTD pages, but aligned to MM page size */
static inline unsigned long aligned_nrpages(unsigned long host_addr,
					    size_t size)
1755 {
1756 host_addr &= ~PAGE_MASK;
1757 return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1758 }
1759
1760 /* Return largest possible superpage level for a given mapping */
static inline int hardware_largepage_caps(struct dmar_domain *domain,
					  unsigned long iov_pfn,
					  unsigned long phy_pfn,
					  unsigned long pages)
1765 {
1766 int support, level = 1;
1767 unsigned long pfnmerge;
1768
1769 support = domain->iommu_superpage;
1770
1771 /* To use a large page, the virtual *and* physical addresses
1772 must be aligned to 2MiB/1GiB/etc. Lower bits set in either
1773 of them will mean we have to use smaller pages. So just
1774 merge them and check both at once. */
1775 pfnmerge = iov_pfn | phy_pfn;
1776
1777 while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
1778 pages >>= VTD_STRIDE_SHIFT;
1779 if (!pages)
1780 break;
1781 pfnmerge >>= VTD_STRIDE_SHIFT;
1782 level++;
1783 support--;
1784 }
1785 return level;
1786 }
1787
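/*
 * Map @nr_pages VT-d pages starting at @iov_pfn.  The caller provides
 * either a scatterlist (@sg != NULL) or a contiguous range starting at
 * @phys_pfn; superpage PTEs are used where size and alignment allow.
 */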
static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
			    struct scatterlist *sg, unsigned long phys_pfn,
			    unsigned long nr_pages, int prot)
1791 {
1792 struct dma_pte *first_pte = NULL, *pte = NULL;
1793 phys_addr_t uninitialized_var(pteval);
1794 int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1795 unsigned long sg_res;
1796 unsigned int largepage_lvl = 0;
1797 unsigned long lvl_pages = 0;
1798
1799 BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1800
1801 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1802 return -EINVAL;
1803
1804 prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1805
1806 if (sg)
1807 sg_res = 0;
1808 else {
1809 sg_res = nr_pages + 1;
1810 pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1811 }
1812
1813 while (nr_pages > 0) {
1814 uint64_t tmp;
1815
1816 if (!sg_res) {
1817 sg_res = aligned_nrpages(sg->offset, sg->length);
1818 sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1819 sg->dma_length = sg->length;
1820 pteval = page_to_phys(sg_page(sg)) | prot;
1821 phys_pfn = pteval >> VTD_PAGE_SHIFT;
1822 }
1823
1824 if (!pte) {
1825 largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res);
1826
1827 first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl);
1828 if (!pte)
1829 return -ENOMEM;
1830 /* It is a large page */
1831 if (largepage_lvl > 1) {
1832 pteval |= DMA_PTE_LARGE_PAGE;
1833 /* Ensure that old small page tables are removed to make room
1834 for superpage, if they exist. */
1835 dma_pte_clear_range(domain, iov_pfn,
1836 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1837 dma_pte_free_pagetable(domain, iov_pfn,
1838 iov_pfn + lvl_to_nr_pages(largepage_lvl) - 1);
1839 } else {
1840 pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
1841 }
1842
1843 }
1844 /* We don't need a lock here; nobody else
1845 * touches this iova range
1846 */
1847 tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1848 if (tmp) {
1849 static int dumps = 5;
1850 printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1851 iov_pfn, tmp, (unsigned long long)pteval);
1852 if (dumps) {
1853 dumps--;
1854 debug_dma_dump_mappings(NULL);
1855 }
1856 WARN_ON(1);
1857 }
1858
1859 lvl_pages = lvl_to_nr_pages(largepage_lvl);
1860
1861 BUG_ON(nr_pages < lvl_pages);
1862 BUG_ON(sg_res < lvl_pages);
1863
1864 nr_pages -= lvl_pages;
1865 iov_pfn += lvl_pages;
1866 phys_pfn += lvl_pages;
1867 pteval += lvl_pages * VTD_PAGE_SIZE;
1868 sg_res -= lvl_pages;
1869
1870 /* If the next PTE would be the first in a new page, then we
1871 need to flush the cache on the entries we've just written.
1872 And then we'll need to recalculate 'pte', so clear it and
1873 let it get set again in the if (!pte) block above.
1874
1875 If we're done (!nr_pages) we need to flush the cache too.
1876
1877 Also if we've been setting superpages, we may need to
1878 recalculate 'pte' and switch back to smaller pages for the
1879 end of the mapping, if the trailing size is not enough to
1880 use another superpage (i.e. sg_res < lvl_pages). */
1881 pte++;
1882 if (!nr_pages || first_pte_in_page(pte) ||
1883 (largepage_lvl > 1 && sg_res < lvl_pages)) {
1884 domain_flush_cache(domain, first_pte,
1885 (void *)pte - (void *)first_pte);
1886 pte = NULL;
1887 }
1888
1889 if (!sg_res && nr_pages)
1890 sg = sg_next(sg);
1891 }
1892 return 0;
1893 }
1894
1895 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1896 struct scatterlist *sg, unsigned long nr_pages,
1897 int prot)
1898 {
1899 return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1900 }
1901
1902 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1903 unsigned long phys_pfn, unsigned long nr_pages,
1904 int prot)
1905 {
1906 return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1907 }
1908
1909 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1910 {
1911 if (!iommu)
1912 return;
1913
1914 clear_context_table(iommu, bus, devfn);
1915 iommu->flush.flush_context(iommu, 0, 0, 0,
1916 DMA_CCMD_GLOBAL_INVL);
1917 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1918 }
1919
1920 static void domain_remove_dev_info(struct dmar_domain *domain)
1921 {
1922 struct device_domain_info *info;
1923 unsigned long flags;
1924 struct intel_iommu *iommu;
1925
1926 spin_lock_irqsave(&device_domain_lock, flags);
1927 while (!list_empty(&domain->devices)) {
1928 info = list_entry(domain->devices.next,
1929 struct device_domain_info, link);
1930 list_del(&info->link);
1931 list_del(&info->global);
1932 if (info->dev)
1933 info->dev->dev.archdata.iommu = NULL;
1934 spin_unlock_irqrestore(&device_domain_lock, flags);
1935
1936 iommu_disable_dev_iotlb(info);
1937 iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1938 iommu_detach_dev(iommu, info->bus, info->devfn);
1939 free_devinfo_mem(info);
1940
1941 spin_lock_irqsave(&device_domain_lock, flags);
1942 }
1943 spin_unlock_irqrestore(&device_domain_lock, flags);
1944 }
1945
1946 /*
1947 * find_domain
1948 * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1949 */
1950 static struct dmar_domain *
1951 find_domain(struct pci_dev *pdev)
1952 {
1953 struct device_domain_info *info;
1954
1955 /* No lock here, assumes no domain exit in normal case */
1956 info = pdev->dev.archdata.iommu;
1957 if (info)
1958 return info->domain;
1959 return NULL;
1960 }
1961
1962 /* get a domain for the device; the returned domain is initialized */
1963 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1964 {
1965 struct dmar_domain *domain, *found = NULL;
1966 struct intel_iommu *iommu;
1967 struct dmar_drhd_unit *drhd;
1968 struct device_domain_info *info, *tmp;
1969 struct pci_dev *dev_tmp;
1970 unsigned long flags;
1971 int bus = 0, devfn = 0;
1972 int segment;
1973 int ret;
1974
1975 domain = find_domain(pdev);
1976 if (domain)
1977 return domain;
1978
1979 segment = pci_domain_nr(pdev->bus);
1980
1981 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1982 if (dev_tmp) {
1983 if (pci_is_pcie(dev_tmp)) {
1984 bus = dev_tmp->subordinate->number;
1985 devfn = 0;
1986 } else {
1987 bus = dev_tmp->bus->number;
1988 devfn = dev_tmp->devfn;
1989 }
1990 spin_lock_irqsave(&device_domain_lock, flags);
1991 list_for_each_entry(info, &device_domain_list, global) {
1992 if (info->segment == segment &&
1993 info->bus == bus && info->devfn == devfn) {
1994 found = info->domain;
1995 break;
1996 }
1997 }
1998 spin_unlock_irqrestore(&device_domain_lock, flags);
1999 /* pcie-pci bridge already has a domain, use it */
2000 if (found) {
2001 domain = found;
2002 goto found_domain;
2003 }
2004 }
2005
2006 domain = alloc_domain();
2007 if (!domain)
2008 goto error;
2009
2010 /* Allocate new domain for the device */
2011 drhd = dmar_find_matched_drhd_unit(pdev);
2012 if (!drhd) {
2013 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
2014 pci_name(pdev));
2015 return NULL;
2016 }
2017 iommu = drhd->iommu;
2018
2019 ret = iommu_attach_domain(domain, iommu);
2020 if (ret) {
2021 free_domain_mem(domain);
2022 goto error;
2023 }
2024
2025 if (domain_init(domain, gaw)) {
2026 domain_exit(domain);
2027 goto error;
2028 }
2029
2030 /* register pcie-to-pci device */
2031 if (dev_tmp) {
2032 info = alloc_devinfo_mem();
2033 if (!info) {
2034 domain_exit(domain);
2035 goto error;
2036 }
2037 info->segment = segment;
2038 info->bus = bus;
2039 info->devfn = devfn;
2040 info->dev = NULL;
2041 info->domain = domain;
2042 /* This domain is shared by devices under p2p bridge */
2043 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
2044
2045 /* pcie-to-pci bridge already has a domain, use it */
2046 found = NULL;
2047 spin_lock_irqsave(&device_domain_lock, flags);
2048 list_for_each_entry(tmp, &device_domain_list, global) {
2049 if (tmp->segment == segment &&
2050 tmp->bus == bus && tmp->devfn == devfn) {
2051 found = tmp->domain;
2052 break;
2053 }
2054 }
2055 if (found) {
2056 spin_unlock_irqrestore(&device_domain_lock, flags);
2057 free_devinfo_mem(info);
2058 domain_exit(domain);
2059 domain = found;
2060 } else {
2061 list_add(&info->link, &domain->devices);
2062 list_add(&info->global, &device_domain_list);
2063 spin_unlock_irqrestore(&device_domain_lock, flags);
2064 }
2065 }
2066
2067 found_domain:
2068 info = alloc_devinfo_mem();
2069 if (!info)
2070 goto error;
2071 info->segment = segment;
2072 info->bus = pdev->bus->number;
2073 info->devfn = pdev->devfn;
2074 info->dev = pdev;
2075 info->domain = domain;
2076 spin_lock_irqsave(&device_domain_lock, flags);
2077 /* somebody else may have beaten us to it */
2078 found = find_domain(pdev);
2079 if (found != NULL) {
2080 spin_unlock_irqrestore(&device_domain_lock, flags);
2081 if (found != domain) {
2082 domain_exit(domain);
2083 domain = found;
2084 }
2085 free_devinfo_mem(info);
2086 return domain;
2087 }
2088 list_add(&info->link, &domain->devices);
2089 list_add(&info->global, &device_domain_list);
2090 pdev->dev.archdata.iommu = info;
2091 spin_unlock_irqrestore(&device_domain_lock, flags);
2092 return domain;
2093 error:
2094 /* recheck it here; another thread may have set it meanwhile */
2095 return find_domain(pdev);
2096 }
2097
2098 static int iommu_identity_mapping;
2099 #define IDENTMAP_ALL 1
2100 #define IDENTMAP_GFX 2
2101 #define IDENTMAP_AZALIA 4
2102
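/*
 * Reserve the iova range covering [start, end] and install a 1:1
 * (virtual == physical) mapping for it.  Any PTEs already covering the
 * range are cleared first, since an RMRR may overlap ordinary RAM.
 */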
2103 static int iommu_domain_identity_map(struct dmar_domain *domain,
2104 unsigned long long start,
2105 unsigned long long end)
2106 {
2107 unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
2108 unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
2109
2110 if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
2111 dma_to_mm_pfn(last_vpfn))) {
2112 printk(KERN_ERR "IOMMU: reserve iova failed\n");
2113 return -ENOMEM;
2114 }
2115
2116 pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
2117 start, end, domain->id);
2118 /*
2119 * The RMRR range might overlap with a physical memory range,
2120 * so clear it first
2121 */
2122 dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2123
2124 return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
2125 last_vpfn - first_vpfn + 1,
2126 DMA_PTE_READ|DMA_PTE_WRITE);
2127 }
2128
2129 static int iommu_prepare_identity_map(struct pci_dev *pdev,
2130 unsigned long long start,
2131 unsigned long long end)
2132 {
2133 struct dmar_domain *domain;
2134 int ret;
2135
2136 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
2137 if (!domain)
2138 return -ENOMEM;
2139
2140 /* For _hardware_ passthrough, don't bother. But for software
2141 passthrough, we do it anyway -- it may indicate a memory
2142 range which is reserved in E820 and which therefore didn't
2143 get set up in si_domain to start with */
2144 if (domain == si_domain && hw_pass_through) {
2145 printk(KERN_INFO "Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
2146 pci_name(pdev), start, end);
2147 return 0;
2148 }
2149
2150 printk(KERN_INFO
2151 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
2152 pci_name(pdev), start, end);
2153
2154 if (end < start) {
2155 WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
2156 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2157 dmi_get_system_info(DMI_BIOS_VENDOR),
2158 dmi_get_system_info(DMI_BIOS_VERSION),
2159 dmi_get_system_info(DMI_PRODUCT_VERSION));
2160 ret = -EIO;
2161 goto error;
2162 }
2163
2164 if (end >> agaw_to_width(domain->agaw)) {
2165 WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
2166 "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
2167 agaw_to_width(domain->agaw),
2168 dmi_get_system_info(DMI_BIOS_VENDOR),
2169 dmi_get_system_info(DMI_BIOS_VERSION),
2170 dmi_get_system_info(DMI_PRODUCT_VERSION));
2171 ret = -EIO;
2172 goto error;
2173 }
2174
2175 ret = iommu_domain_identity_map(domain, start, end);
2176 if (ret)
2177 goto error;
2178
2179 /* context entry init */
2180 ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2181 if (ret)
2182 goto error;
2183
2184 return 0;
2185
2186 error:
2187 domain_exit(domain);
2188 return ret;
2189 }
2190
2191 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2192 struct pci_dev *pdev)
2193 {
2194 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2195 return 0;
2196 return iommu_prepare_identity_map(pdev, rmrr->base_address,
2197 rmrr->end_address);
2198 }
2199
2200 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
2201 static inline void iommu_prepare_isa(void)
2202 {
2203 struct pci_dev *pdev;
2204 int ret;
2205
2206 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2207 if (!pdev)
2208 return;
2209
2210 printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2211 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1);
2212
2213 if (ret)
2214 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2215 "floppy might not work\n");
2216
2217 }
2218 #else
2219 static inline void iommu_prepare_isa(void)
2220 {
2221 return;
2222 }
2223 #endif /* !CONFIG_INTEL_IOMMU_FLOPPY_WA */
2224
2225 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2226
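/*
 * Build the static identity (si) domain: attach it to every active IOMMU
 * and, unless hardware pass-through is in use (hw != 0), map each usable
 * RAM range of every online node 1:1.
 */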
2227 static int __init si_domain_init(int hw)
2228 {
2229 struct dmar_drhd_unit *drhd;
2230 struct intel_iommu *iommu;
2231 int nid, ret = 0;
2232
2233 si_domain = alloc_domain();
2234 if (!si_domain)
2235 return -EFAULT;
2236
2237 pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2238
2239 for_each_active_iommu(iommu, drhd) {
2240 ret = iommu_attach_domain(si_domain, iommu);
2241 if (ret) {
2242 domain_exit(si_domain);
2243 return -EFAULT;
2244 }
2245 }
2246
2247 if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2248 domain_exit(si_domain);
2249 return -EFAULT;
2250 }
2251
2252 si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2253
2254 if (hw)
2255 return 0;
2256
2257 for_each_online_node(nid) {
2258 unsigned long start_pfn, end_pfn;
2259 int i;
2260
2261 for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2262 ret = iommu_domain_identity_map(si_domain,
2263 PFN_PHYS(start_pfn), PFN_PHYS(end_pfn));
2264 if (ret)
2265 return ret;
2266 }
2267 }
2268
2269 return 0;
2270 }
2271
2272 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2273 struct pci_dev *pdev);
2274 static int identity_mapping(struct pci_dev *pdev)
2275 {
2276 struct device_domain_info *info;
2277
2278 if (likely(!iommu_identity_mapping))
2279 return 0;
2280
2281 info = pdev->dev.archdata.iommu;
2282 if (info && info != DUMMY_DEVICE_DOMAIN_INFO)
2283 return (info->domain == si_domain);
2284
2285 return 0;
2286 }
2287
2288 static int domain_add_dev_info(struct dmar_domain *domain,
2289 struct pci_dev *pdev,
2290 int translation)
2291 {
2292 struct device_domain_info *info;
2293 unsigned long flags;
2294 int ret;
2295
2296 info = alloc_devinfo_mem();
2297 if (!info)
2298 return -ENOMEM;
2299
2300 info->segment = pci_domain_nr(pdev->bus);
2301 info->bus = pdev->bus->number;
2302 info->devfn = pdev->devfn;
2303 info->dev = pdev;
2304 info->domain = domain;
2305
2306 spin_lock_irqsave(&device_domain_lock, flags);
2307 list_add(&info->link, &domain->devices);
2308 list_add(&info->global, &device_domain_list);
2309 pdev->dev.archdata.iommu = info;
2310 spin_unlock_irqrestore(&device_domain_lock, flags);
2311
2312 ret = domain_context_mapping(domain, pdev, translation);
2313 if (ret) {
2314 spin_lock_irqsave(&device_domain_lock, flags);
2315 list_del(&info->link);
2316 list_del(&info->global);
2317 pdev->dev.archdata.iommu = NULL;
2318 spin_unlock_irqrestore(&device_domain_lock, flags);
2319 free_devinfo_mem(info);
2320 return ret;
2321 }
2322
2323 return 0;
2324 }
2325
2326 static bool device_has_rmrr(struct pci_dev *dev)
2327 {
2328 struct dmar_rmrr_unit *rmrr;
2329 int i;
2330
2331 for_each_rmrr_units(rmrr) {
2332 for (i = 0; i < rmrr->devices_cnt; i++) {
2333 /*
2334 * Return TRUE if this RMRR contains the device that
2335 * is passed in.
2336 */
2337 if (rmrr->devices[i] == dev)
2338 return true;
2339 }
2340 }
2341 return false;
2342 }
2343
2344 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2345 {
2346
2347 /*
2348 * We want to prevent any device associated with an RMRR from
2349 * getting placed into the SI Domain. This is done because
2350 * problems exist when devices are moved in and out of domains
2351 * and their respective RMRR info is lost. We exempt USB devices
2352 * from this process due to their usage of RMRRs that are known
2353 * to not be needed after BIOS hand-off to OS.
2354 */
2355 if (device_has_rmrr(pdev) &&
2356 (pdev->class >> 8) != PCI_CLASS_SERIAL_USB)
2357 return 0;
2358
2359 if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2360 return 1;
2361
2362 if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2363 return 1;
2364
2365 if (!(iommu_identity_mapping & IDENTMAP_ALL))
2366 return 0;
2367
2368 /*
2369 * We want to start off with all devices in the 1:1 domain, and
2370 * take them out later if we find they can't access all of memory.
2371 *
2372 * However, we can't do this for PCI devices behind bridges,
2373 * because all PCI devices behind the same bridge will end up
2374 * with the same source-id on their transactions.
2375 *
2376 * Practically speaking, we can't change things around for these
2377 * devices at run-time, because we can't be sure there'll be no
2378 * DMA transactions in flight for any of their siblings.
2379 *
2380 * So PCI devices (unless they're on the root bus) as well as
2381 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2382 * the 1:1 domain, just in _case_ one of their siblings turns out
2383 * not to be able to map all of memory.
2384 */
2385 if (!pci_is_pcie(pdev)) {
2386 if (!pci_is_root_bus(pdev->bus))
2387 return 0;
2388 if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2389 return 0;
2390 } else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2391 return 0;
2392
2393 /*
2394 * At boot time, we don't yet know if devices will be 64-bit capable.
2395 * Assume that they will -- if they turn out not to be, then we can
2396 * take them out of the 1:1 domain later.
2397 */
2398 if (!startup) {
2399 /*
2400 * If the device's dma_mask is less than the system's memory
2401 * size then this is not a candidate for identity mapping.
2402 */
2403 u64 dma_mask = pdev->dma_mask;
2404
2405 if (pdev->dev.coherent_dma_mask &&
2406 pdev->dev.coherent_dma_mask < dma_mask)
2407 dma_mask = pdev->dev.coherent_dma_mask;
2408
2409 return dma_mask >= dma_get_required_mask(&pdev->dev);
2410 }
2411
2412 return 1;
2413 }
2414
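/*
 * Create the si_domain and pre-populate it with every PCI device that
 * iommu_should_identity_map() accepts, using pass-through context entries
 * when the hardware supports them and multi-level translation otherwise.
 */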
2415 static int __init iommu_prepare_static_identity_mapping(int hw)
2416 {
2417 struct pci_dev *pdev = NULL;
2418 int ret;
2419
2420 ret = si_domain_init(hw);
2421 if (ret)
2422 return -EFAULT;
2423
2424 for_each_pci_dev(pdev) {
2425 if (iommu_should_identity_map(pdev, 1)) {
2426 ret = domain_add_dev_info(si_domain, pdev,
2427 hw ? CONTEXT_TT_PASS_THROUGH :
2428 CONTEXT_TT_MULTI_LEVEL);
2429 if (ret) {
2430 /* device not associated with an iommu */
2431 if (ret == -ENODEV)
2432 continue;
2433 return ret;
2434 }
2435 pr_info("IOMMU: %s identity mapping for device %s\n",
2436 hw ? "hardware" : "software", pci_name(pdev));
2437 }
2438 }
2439
2440 return 0;
2441 }
2442
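/*
 * Boot-time initialisation: count the DRHD units, allocate the global
 * IOMMU and deferred-flush arrays, set up per-IOMMU domain-ID tables and
 * root entries, pick queued vs. register-based invalidation, establish
 * the identity/RMRR/ISA mappings and finally enable translation on every
 * unit.
 */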
2443 static int __init init_dmars(void)
2444 {
2445 struct dmar_drhd_unit *drhd;
2446 struct dmar_rmrr_unit *rmrr;
2447 struct pci_dev *pdev;
2448 struct intel_iommu *iommu;
2449 int i, ret;
2450
2451 /*
2452 * for each drhd
2453 * allocate root
2454 * initialize and program root entry to not present
2455 * endfor
2456 */
2457 for_each_drhd_unit(drhd) {
2458 /*
2459 * lock not needed as this is only incremented in the single-
2460 * threaded kernel __init code path; all other accesses are
2461 * read only
2462 */
2463 if (g_num_of_iommus < IOMMU_UNITS_SUPPORTED) {
2464 g_num_of_iommus++;
2465 continue;
2466 }
2467 printk_once(KERN_ERR "intel-iommu: exceeded %d IOMMUs\n",
2468 IOMMU_UNITS_SUPPORTED);
2469 }
2470
2471 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2472 GFP_KERNEL);
2473 if (!g_iommus) {
2474 printk(KERN_ERR "Allocating global iommu array failed\n");
2475 ret = -ENOMEM;
2476 goto error;
2477 }
2478
2479 deferred_flush = kzalloc(g_num_of_iommus *
2480 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2481 if (!deferred_flush) {
2482 ret = -ENOMEM;
2483 goto error;
2484 }
2485
2486 for_each_drhd_unit(drhd) {
2487 if (drhd->ignored)
2488 continue;
2489
2490 iommu = drhd->iommu;
2491 g_iommus[iommu->seq_id] = iommu;
2492
2493 ret = iommu_init_domains(iommu);
2494 if (ret)
2495 goto error;
2496
2497 /*
2498 * TBD:
2499 * we could share the same root & context tables
2500 * among all IOMMUs; need to split it later.
2501 */
2502 ret = iommu_alloc_root_entry(iommu);
2503 if (ret) {
2504 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2505 goto error;
2506 }
2507 if (!ecap_pass_through(iommu->ecap))
2508 hw_pass_through = 0;
2509 }
2510
2511 /*
2512 * Start from a sane iommu hardware state.
2513 */
2514 for_each_drhd_unit(drhd) {
2515 if (drhd->ignored)
2516 continue;
2517
2518 iommu = drhd->iommu;
2519
2520 /*
2521 * If the queued invalidation is already initialized by us
2522 * (for example, while enabling interrupt-remapping) then
2523 * we got the things already rolling from a sane state.
2524 */
2525 if (iommu->qi)
2526 continue;
2527
2528 /*
2529 * Clear any previous faults.
2530 */
2531 dmar_fault(-1, iommu);
2532 /*
2533 * Disable queued invalidation if supported and already enabled
2534 * before OS handover.
2535 */
2536 dmar_disable_qi(iommu);
2537 }
2538
2539 for_each_drhd_unit(drhd) {
2540 if (drhd->ignored)
2541 continue;
2542
2543 iommu = drhd->iommu;
2544
2545 if (dmar_enable_qi(iommu)) {
2546 /*
2547 * Queued Invalidate not enabled, use Register Based
2548 * Invalidate
2549 */
2550 iommu->flush.flush_context = __iommu_flush_context;
2551 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2552 printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2553 "invalidation\n",
2554 iommu->seq_id,
2555 (unsigned long long)drhd->reg_base_addr);
2556 } else {
2557 iommu->flush.flush_context = qi_flush_context;
2558 iommu->flush.flush_iotlb = qi_flush_iotlb;
2559 printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2560 "invalidation\n",
2561 iommu->seq_id,
2562 (unsigned long long)drhd->reg_base_addr);
2563 }
2564 }
2565
2566 if (iommu_pass_through)
2567 iommu_identity_mapping |= IDENTMAP_ALL;
2568
2569 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
2570 iommu_identity_mapping |= IDENTMAP_GFX;
2571 #endif
2572
2573 check_tylersburg_isoch();
2574
2575 /*
2576 * If pass-through is not set or not enabled, set up context entries for
2577 * identity mappings for rmrr, gfx, and isa; this may fall back to static
2578 * identity mapping if iommu_identity_mapping is set.
2579 */
2580 if (iommu_identity_mapping) {
2581 ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2582 if (ret) {
2583 printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2584 goto error;
2585 }
2586 }
2587 /*
2588 * For each rmrr
2589 * for each dev attached to rmrr
2590 * do
2591 * locate drhd for dev, alloc domain for dev
2592 * allocate free domain
2593 * allocate page table entries for rmrr
2594 * if context not allocated for bus
2595 * allocate and init context
2596 * set present in root table for this bus
2597 * init context with domain, translation etc
2598 * endfor
2599 * endfor
2600 */
2601 printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2602 for_each_rmrr_units(rmrr) {
2603 for (i = 0; i < rmrr->devices_cnt; i++) {
2604 pdev = rmrr->devices[i];
2605 /*
2606 * some BIOSes list non-existent devices in the
2607 * DMAR table.
2608 */
2609 if (!pdev)
2610 continue;
2611 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2612 if (ret)
2613 printk(KERN_ERR
2614 "IOMMU: mapping reserved region failed\n");
2615 }
2616 }
2617
2618 iommu_prepare_isa();
2619
2620 /*
2621 * for each drhd
2622 * enable fault log
2623 * global invalidate context cache
2624 * global invalidate iotlb
2625 * enable translation
2626 */
2627 for_each_drhd_unit(drhd) {
2628 if (drhd->ignored) {
2629 /*
2630 * we always have to disable PMRs or DMA may fail on
2631 * this device
2632 */
2633 if (force_on)
2634 iommu_disable_protect_mem_regions(drhd->iommu);
2635 continue;
2636 }
2637 iommu = drhd->iommu;
2638
2639 iommu_flush_write_buffer(iommu);
2640
2641 ret = dmar_set_interrupt(iommu);
2642 if (ret)
2643 goto error;
2644
2645 iommu_set_root_entry(iommu);
2646
2647 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2648 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2649
2650 ret = iommu_enable_translation(iommu);
2651 if (ret)
2652 goto error;
2653
2654 iommu_disable_protect_mem_regions(iommu);
2655 }
2656
2657 return 0;
2658 error:
2659 for_each_drhd_unit(drhd) {
2660 if (drhd->ignored)
2661 continue;
2662 iommu = drhd->iommu;
2663 free_iommu(iommu);
2664 }
2665 kfree(g_iommus);
2666 return ret;
2667 }
2668
2669 /* This takes a number of _MM_ pages, not VTD pages */
2670 static struct iova *intel_alloc_iova(struct device *dev,
2671 struct dmar_domain *domain,
2672 unsigned long nrpages, uint64_t dma_mask)
2673 {
2674 struct pci_dev *pdev = to_pci_dev(dev);
2675 struct iova *iova = NULL;
2676
2677 /* Restrict dma_mask to the width that the iommu can handle */
2678 dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2679
2680 if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2681 /*
2682 * First try to allocate an io virtual address in
2683 * DMA_BIT_MASK(32) and if that fails then try allocating
2684 * from higher range
2685 */
2686 iova = alloc_iova(&domain->iovad, nrpages,
2687 IOVA_PFN(DMA_BIT_MASK(32)), 1);
2688 if (iova)
2689 return iova;
2690 }
2691 iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2692 if (unlikely(!iova)) {
2693 printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2694 nrpages, pci_name(pdev));
2695 return NULL;
2696 }
2697
2698 return iova;
2699 }
2700
2701 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2702 {
2703 struct dmar_domain *domain;
2704 int ret;
2705
2706 domain = get_domain_for_dev(pdev,
2707 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2708 if (!domain) {
2709 printk(KERN_ERR
2710 "Allocating domain for %s failed", pci_name(pdev));
2711 return NULL;
2712 }
2713
2714 /* make sure context mapping is ok */
2715 if (unlikely(!domain_context_mapped(pdev))) {
2716 ret = domain_context_mapping(domain, pdev,
2717 CONTEXT_TT_MULTI_LEVEL);
2718 if (ret) {
2719 printk(KERN_ERR
2720 "Domain context map for %s failed",
2721 pci_name(pdev));
2722 return NULL;
2723 }
2724 }
2725
2726 return domain;
2727 }
2728
2729 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2730 {
2731 struct device_domain_info *info;
2732
2733 /* No lock here, assumes no domain exit in normal case */
2734 info = dev->dev.archdata.iommu;
2735 if (likely(info))
2736 return info->domain;
2737
2738 return __get_valid_domain_for_dev(dev);
2739 }
2740
2741 static int iommu_dummy(struct pci_dev *pdev)
2742 {
2743 return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2744 }
2745
2746 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2747 static int iommu_no_mapping(struct device *dev)
2748 {
2749 struct pci_dev *pdev;
2750 int found;
2751
2752 if (unlikely(dev->bus != &pci_bus_type))
2753 return 1;
2754
2755 pdev = to_pci_dev(dev);
2756 if (iommu_dummy(pdev))
2757 return 1;
2758
2759 if (!iommu_identity_mapping)
2760 return 0;
2761
2762 found = identity_mapping(pdev);
2763 if (found) {
2764 if (iommu_should_identity_map(pdev, 0))
2765 return 1;
2766 else {
2767 /*
2768 * a 32 bit DMA device is removed from si_domain and falls
2769 * back to non-identity mapping.
2770 */
2771 domain_remove_one_dev_info(si_domain, pdev);
2772 printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2773 pci_name(pdev));
2774 return 0;
2775 }
2776 } else {
2777 /*
2778 * If a 64 bit DMA device has been detached from a vm, the device
2779 * is put back into si_domain for identity mapping.
2780 */
2781 if (iommu_should_identity_map(pdev, 0)) {
2782 int ret;
2783 ret = domain_add_dev_info(si_domain, pdev,
2784 hw_pass_through ?
2785 CONTEXT_TT_PASS_THROUGH :
2786 CONTEXT_TT_MULTI_LEVEL);
2787 if (!ret) {
2788 printk(KERN_INFO "64bit %s uses identity mapping\n",
2789 pci_name(pdev));
2790 return 1;
2791 }
2792 }
2793 }
2794
2795 return 0;
2796 }
2797
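/*
 * Map a physically contiguous buffer for DMA: look up (or create) the
 * device's domain, allocate an iova range (preferring addresses below
 * 4GiB unless forcedac is set), install the PTEs and flush the IOTLB if
 * the IOMMU is in caching mode.  Returns the bus address, or 0 on
 * failure; identity-mapped devices simply get @paddr back.
 */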
2798 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2799 size_t size, int dir, u64 dma_mask)
2800 {
2801 struct pci_dev *pdev = to_pci_dev(hwdev);
2802 struct dmar_domain *domain;
2803 phys_addr_t start_paddr;
2804 struct iova *iova;
2805 int prot = 0;
2806 int ret;
2807 struct intel_iommu *iommu;
2808 unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2809
2810 BUG_ON(dir == DMA_NONE);
2811
2812 if (iommu_no_mapping(hwdev))
2813 return paddr;
2814
2815 domain = get_valid_domain_for_dev(pdev);
2816 if (!domain)
2817 return 0;
2818
2819 iommu = domain_get_iommu(domain);
2820 size = aligned_nrpages(paddr, size);
2821
2822 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask);
2823 if (!iova)
2824 goto error;
2825
2826 /*
2827 * Check if DMAR supports zero-length reads on write-only
2828 * mappings.
2829 */
2830 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2831 !cap_zlr(iommu->cap))
2832 prot |= DMA_PTE_READ;
2833 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2834 prot |= DMA_PTE_WRITE;
2835 /*
2836 * paddr ~ paddr + size might cover partial pages; we map whole
2837 * pages. Note: if two parts of one page are mapped separately,
2838 * we might have two guest addresses mapping to the same host
2839 * paddr, but this is not a big problem
2840 */
2841 ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2842 mm_to_dma_pfn(paddr_pfn), size, prot);
2843 if (ret)
2844 goto error;
2845
2846 /* it's a non-present to present mapping. Only flush if in caching mode */
2847 if (cap_caching_mode(iommu->cap))
2848 iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2849 else
2850 iommu_flush_write_buffer(iommu);
2851
2852 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2853 start_paddr += paddr & ~PAGE_MASK;
2854 return start_paddr;
2855
2856 error:
2857 if (iova)
2858 __free_iova(&domain->iovad, iova);
2859 printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2860 pci_name(pdev), size, (unsigned long long)paddr, dir);
2861 return 0;
2862 }
2863
2864 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2865 unsigned long offset, size_t size,
2866 enum dma_data_direction dir,
2867 struct dma_attrs *attrs)
2868 {
2869 return __intel_map_single(dev, page_to_phys(page) + offset, size,
2870 dir, to_pci_dev(dev)->dma_mask);
2871 }
2872
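/*
 * Non-strict unmap path: freed iovas are queued per-IOMMU in
 * deferred_flush[] and released in batches, either from the unmap_timer
 * or once HIGH_WATER_MARK entries have accumulated, so that a single
 * IOTLB flush covers many unmaps.
 */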
2873 static void flush_unmaps(void)
2874 {
2875 int i, j;
2876
2877 timer_on = 0;
2878
2879 /* just flush them all */
2880 for (i = 0; i < g_num_of_iommus; i++) {
2881 struct intel_iommu *iommu = g_iommus[i];
2882 if (!iommu)
2883 continue;
2884
2885 if (!deferred_flush[i].next)
2886 continue;
2887
2888 /* In caching mode, global flushes make emulation expensive */
2889 if (!cap_caching_mode(iommu->cap))
2890 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2891 DMA_TLB_GLOBAL_FLUSH);
2892 for (j = 0; j < deferred_flush[i].next; j++) {
2893 unsigned long mask;
2894 struct iova *iova = deferred_flush[i].iova[j];
2895 struct dmar_domain *domain = deferred_flush[i].domain[j];
2896
2897 /* On real hardware multiple invalidations are expensive */
2898 if (cap_caching_mode(iommu->cap))
2899 iommu_flush_iotlb_psi(iommu, domain->id,
2900 iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2901 else {
2902 mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2903 iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2904 (uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2905 }
2906 __free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2907 }
2908 deferred_flush[i].next = 0;
2909 }
2910
2911 list_size = 0;
2912 }
2913
2914 static void flush_unmaps_timeout(unsigned long data)
2915 {
2916 unsigned long flags;
2917
2918 spin_lock_irqsave(&async_umap_flush_lock, flags);
2919 flush_unmaps();
2920 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2921 }
2922
2923 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2924 {
2925 unsigned long flags;
2926 int next, iommu_id;
2927 struct intel_iommu *iommu;
2928
2929 spin_lock_irqsave(&async_umap_flush_lock, flags);
2930 if (list_size == HIGH_WATER_MARK)
2931 flush_unmaps();
2932
2933 iommu = domain_get_iommu(dom);
2934 iommu_id = iommu->seq_id;
2935
2936 next = deferred_flush[iommu_id].next;
2937 deferred_flush[iommu_id].domain[next] = dom;
2938 deferred_flush[iommu_id].iova[next] = iova;
2939 deferred_flush[iommu_id].next++;
2940
2941 if (!timer_on) {
2942 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2943 timer_on = 1;
2944 }
2945 list_size++;
2946 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2947 }
2948
2949 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2950 size_t size, enum dma_data_direction dir,
2951 struct dma_attrs *attrs)
2952 {
2953 struct pci_dev *pdev = to_pci_dev(dev);
2954 struct dmar_domain *domain;
2955 unsigned long start_pfn, last_pfn;
2956 struct iova *iova;
2957 struct intel_iommu *iommu;
2958
2959 if (iommu_no_mapping(dev))
2960 return;
2961
2962 domain = find_domain(pdev);
2963 BUG_ON(!domain);
2964
2965 iommu = domain_get_iommu(domain);
2966
2967 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2968 if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2969 (unsigned long long)dev_addr))
2970 return;
2971
2972 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2973 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2974
2975 pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2976 pci_name(pdev), start_pfn, last_pfn);
2977
2978 /* clear the whole page */
2979 dma_pte_clear_range(domain, start_pfn, last_pfn);
2980
2981 /* free page tables */
2982 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2983
2984 if (intel_iommu_strict) {
2985 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2986 last_pfn - start_pfn + 1, 0);
2987 /* free iova */
2988 __free_iova(&domain->iovad, iova);
2989 } else {
2990 add_unmap(domain, iova);
2991 /*
2992 * queue up the release of the unmap to save roughly 1/6 of the
2993 * cpu time used up by the iotlb flush operation...
2994 */
2995 }
2996 }
2997
2998 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2999 dma_addr_t *dma_handle, gfp_t flags,
3000 struct dma_attrs *attrs)
3001 {
3002 void *vaddr;
3003 int order;
3004
3005 size = PAGE_ALIGN(size);
3006 order = get_order(size);
3007
3008 if (!iommu_no_mapping(hwdev))
3009 flags &= ~(GFP_DMA | GFP_DMA32);
3010 else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
3011 if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
3012 flags |= GFP_DMA;
3013 else
3014 flags |= GFP_DMA32;
3015 }
3016
3017 vaddr = (void *)__get_free_pages(flags, order);
3018 if (!vaddr)
3019 return NULL;
3020 memset(vaddr, 0, size);
3021
3022 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
3023 DMA_BIDIRECTIONAL,
3024 hwdev->coherent_dma_mask);
3025 if (*dma_handle)
3026 return vaddr;
3027 free_pages((unsigned long)vaddr, order);
3028 return NULL;
3029 }
3030
3031 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
3032 dma_addr_t dma_handle, struct dma_attrs *attrs)
3033 {
3034 int order;
3035
3036 size = PAGE_ALIGN(size);
3037 order = get_order(size);
3038
3039 intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
3040 free_pages((unsigned long)vaddr, order);
3041 }
3042
3043 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
3044 int nelems, enum dma_data_direction dir,
3045 struct dma_attrs *attrs)
3046 {
3047 struct pci_dev *pdev = to_pci_dev(hwdev);
3048 struct dmar_domain *domain;
3049 unsigned long start_pfn, last_pfn;
3050 struct iova *iova;
3051 struct intel_iommu *iommu;
3052
3053 if (iommu_no_mapping(hwdev))
3054 return;
3055
3056 domain = find_domain(pdev);
3057 BUG_ON(!domain);
3058
3059 iommu = domain_get_iommu(domain);
3060
3061 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
3062 if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
3063 (unsigned long long)sglist[0].dma_address))
3064 return;
3065
3066 start_pfn = mm_to_dma_pfn(iova->pfn_lo);
3067 last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
3068
3069 /* clear the whole page */
3070 dma_pte_clear_range(domain, start_pfn, last_pfn);
3071
3072 /* free page tables */
3073 dma_pte_free_pagetable(domain, start_pfn, last_pfn);
3074
3075 if (intel_iommu_strict) {
3076 iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
3077 last_pfn - start_pfn + 1, 0);
3078 /* free iova */
3079 __free_iova(&domain->iovad, iova);
3080 } else {
3081 add_unmap(domain, iova);
3082 /*
3083 * queue up the release of the unmap to save roughly 1/6 of the
3084 * cpu time used up by the iotlb flush operation...
3085 */
3086 }
3087 }
3088
3089 static int intel_nontranslate_map_sg(struct device *hddev,
3090 struct scatterlist *sglist, int nelems, int dir)
3091 {
3092 int i;
3093 struct scatterlist *sg;
3094
3095 for_each_sg(sglist, sg, nelems, i) {
3096 BUG_ON(!sg_page(sg));
3097 sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
3098 sg->dma_length = sg->length;
3099 }
3100 return nelems;
3101 }
3102
3103 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
3104 enum dma_data_direction dir, struct dma_attrs *attrs)
3105 {
3106 int i;
3107 struct pci_dev *pdev = to_pci_dev(hwdev);
3108 struct dmar_domain *domain;
3109 size_t size = 0;
3110 int prot = 0;
3111 struct iova *iova = NULL;
3112 int ret;
3113 struct scatterlist *sg;
3114 unsigned long start_vpfn;
3115 struct intel_iommu *iommu;
3116
3117 BUG_ON(dir == DMA_NONE);
3118 if (iommu_no_mapping(hwdev))
3119 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
3120
3121 domain = get_valid_domain_for_dev(pdev);
3122 if (!domain)
3123 return 0;
3124
3125 iommu = domain_get_iommu(domain);
3126
3127 for_each_sg(sglist, sg, nelems, i)
3128 size += aligned_nrpages(sg->offset, sg->length);
3129
3130 iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
3131 pdev->dma_mask);
3132 if (!iova) {
3133 sglist->dma_length = 0;
3134 return 0;
3135 }
3136
3137 /*
3138 * Check if DMAR supports zero-length reads on write-only
3139 * mappings.
3140 */
3141 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
3142 !cap_zlr(iommu->cap))
3143 prot |= DMA_PTE_READ;
3144 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
3145 prot |= DMA_PTE_WRITE;
3146
3147 start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
3148
3149 ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
3150 if (unlikely(ret)) {
3151 /* clear the page */
3152 dma_pte_clear_range(domain, start_vpfn,
3153 start_vpfn + size - 1);
3154 /* free page tables */
3155 dma_pte_free_pagetable(domain, start_vpfn,
3156 start_vpfn + size - 1);
3157 /* free iova */
3158 __free_iova(&domain->iovad, iova);
3159 return 0;
3160 }
3161
3162 /* it's a non-present to present mapping. Only flush if in caching mode */
3163 if (cap_caching_mode(iommu->cap))
3164 iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
3165 else
3166 iommu_flush_write_buffer(iommu);
3167
3168 return nelems;
3169 }
3170
3171 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
3172 {
3173 return !dma_addr;
3174 }
3175
3176 struct dma_map_ops intel_dma_ops = {
3177 .alloc = intel_alloc_coherent,
3178 .free = intel_free_coherent,
3179 .map_sg = intel_map_sg,
3180 .unmap_sg = intel_unmap_sg,
3181 .map_page = intel_map_page,
3182 .unmap_page = intel_unmap_page,
3183 .mapping_error = intel_mapping_error,
3184 };
3185
3186 static inline int iommu_domain_cache_init(void)
3187 {
3188 int ret = 0;
3189
3190 iommu_domain_cache = kmem_cache_create("iommu_domain",
3191 sizeof(struct dmar_domain),
3192 0,
3193 SLAB_HWCACHE_ALIGN,
3194
3195 NULL);
3196 if (!iommu_domain_cache) {
3197 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
3198 ret = -ENOMEM;
3199 }
3200
3201 return ret;
3202 }
3203
3204 static inline int iommu_devinfo_cache_init(void)
3205 {
3206 int ret = 0;
3207
3208 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
3209 sizeof(struct device_domain_info),
3210 0,
3211 SLAB_HWCACHE_ALIGN,
3212 NULL);
3213 if (!iommu_devinfo_cache) {
3214 printk(KERN_ERR "Couldn't create devinfo cache\n");
3215 ret = -ENOMEM;
3216 }
3217
3218 return ret;
3219 }
3220
3221 static inline int iommu_iova_cache_init(void)
3222 {
3223 int ret = 0;
3224
3225 iommu_iova_cache = kmem_cache_create("iommu_iova",
3226 sizeof(struct iova),
3227 0,
3228 SLAB_HWCACHE_ALIGN,
3229 NULL);
3230 if (!iommu_iova_cache) {
3231 printk(KERN_ERR "Couldn't create iova cache\n");
3232 ret = -ENOMEM;
3233 }
3234
3235 return ret;
3236 }
3237
3238 static int __init iommu_init_mempool(void)
3239 {
3240 int ret;
3241 ret = iommu_iova_cache_init();
3242 if (ret)
3243 return ret;
3244
3245 ret = iommu_domain_cache_init();
3246 if (ret)
3247 goto domain_error;
3248
3249 ret = iommu_devinfo_cache_init();
3250 if (!ret)
3251 return ret;
3252
3253 kmem_cache_destroy(iommu_domain_cache);
3254 domain_error:
3255 kmem_cache_destroy(iommu_iova_cache);
3256
3257 return -ENOMEM;
3258 }
3259
3260 static void __init iommu_exit_mempool(void)
3261 {
3262 kmem_cache_destroy(iommu_devinfo_cache);
3263 kmem_cache_destroy(iommu_domain_cache);
3264 kmem_cache_destroy(iommu_iova_cache);
3265
3266 }
3267
3268 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3269 {
3270 struct dmar_drhd_unit *drhd;
3271 u32 vtbar;
3272 int rc;
3273
3274 /* We know that this device on this chipset has its own IOMMU.
3275 * If we find it under a different IOMMU, then the BIOS is lying
3276 * to us. Hope that the IOMMU for this device is actually
3277 * disabled, and it needs no translation...
3278 */
3279 rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3280 if (rc) {
3281 /* "can't" happen */
3282 dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3283 return;
3284 }
3285 vtbar &= 0xffff0000;
3286
3287 /* we know that this iommu should be at offset 0xa000 from vtbar */
3288 drhd = dmar_find_matched_drhd_unit(pdev);
3289 if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3290 TAINT_FIRMWARE_WORKAROUND,
3291 "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3292 pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3293 }
3294 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3295
3296 static void __init init_no_remapping_devices(void)
3297 {
3298 struct dmar_drhd_unit *drhd;
3299
3300 for_each_drhd_unit(drhd) {
3301 if (!drhd->include_all) {
3302 int i;
3303 for (i = 0; i < drhd->devices_cnt; i++)
3304 if (drhd->devices[i] != NULL)
3305 break;
3306 /* ignore DMAR unit if no pci devices exist */
3307 if (i == drhd->devices_cnt)
3308 drhd->ignored = 1;
3309 }
3310 }
3311
3312 for_each_drhd_unit(drhd) {
3313 int i;
3314 if (drhd->ignored || drhd->include_all)
3315 continue;
3316
3317 for (i = 0; i < drhd->devices_cnt; i++)
3318 if (drhd->devices[i] &&
3319 !IS_GFX_DEVICE(drhd->devices[i]))
3320 break;
3321
3322 if (i < drhd->devices_cnt)
3323 continue;
3324
3325 /* This IOMMU has *only* gfx devices. Either bypass it or
3326 set the gfx_mapped flag, as appropriate */
3327 if (dmar_map_gfx) {
3328 intel_iommu_gfx_mapped = 1;
3329 } else {
3330 drhd->ignored = 1;
3331 for (i = 0; i < drhd->devices_cnt; i++) {
3332 if (!drhd->devices[i])
3333 continue;
3334 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3335 }
3336 }
3337 }
3338 }
3339
3340 #ifdef CONFIG_SUSPEND
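/*
 * System sleep support: iommu_suspend() saves the fault-event registers
 * after disabling translation; on resume, init_iommu_hw() reprograms the
 * root entries and re-enables translation before those registers are
 * restored.
 */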
3341 static int init_iommu_hw(void)
3342 {
3343 struct dmar_drhd_unit *drhd;
3344 struct intel_iommu *iommu = NULL;
3345
3346 for_each_active_iommu(iommu, drhd)
3347 if (iommu->qi)
3348 dmar_reenable_qi(iommu);
3349
3350 for_each_iommu(iommu, drhd) {
3351 if (drhd->ignored) {
3352 /*
3353 * we always have to disable PMRs or DMA may fail on
3354 * this device
3355 */
3356 if (force_on)
3357 iommu_disable_protect_mem_regions(iommu);
3358 continue;
3359 }
3360
3361 iommu_flush_write_buffer(iommu);
3362
3363 iommu_set_root_entry(iommu);
3364
3365 iommu->flush.flush_context(iommu, 0, 0, 0,
3366 DMA_CCMD_GLOBAL_INVL);
3367 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3368 DMA_TLB_GLOBAL_FLUSH);
3369 if (iommu_enable_translation(iommu))
3370 return 1;
3371 iommu_disable_protect_mem_regions(iommu);
3372 }
3373
3374 return 0;
3375 }
3376
3377 static void iommu_flush_all(void)
3378 {
3379 struct dmar_drhd_unit *drhd;
3380 struct intel_iommu *iommu;
3381
3382 for_each_active_iommu(iommu, drhd) {
3383 iommu->flush.flush_context(iommu, 0, 0, 0,
3384 DMA_CCMD_GLOBAL_INVL);
3385 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3386 DMA_TLB_GLOBAL_FLUSH);
3387 }
3388 }
3389
3390 static int iommu_suspend(void)
3391 {
3392 struct dmar_drhd_unit *drhd;
3393 struct intel_iommu *iommu = NULL;
3394 unsigned long flag;
3395
3396 for_each_active_iommu(iommu, drhd) {
3397 iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3398 GFP_ATOMIC);
3399 if (!iommu->iommu_state)
3400 goto nomem;
3401 }
3402
3403 iommu_flush_all();
3404
3405 for_each_active_iommu(iommu, drhd) {
3406 iommu_disable_translation(iommu);
3407
3408 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3409
3410 iommu->iommu_state[SR_DMAR_FECTL_REG] =
3411 readl(iommu->reg + DMAR_FECTL_REG);
3412 iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3413 readl(iommu->reg + DMAR_FEDATA_REG);
3414 iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3415 readl(iommu->reg + DMAR_FEADDR_REG);
3416 iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3417 readl(iommu->reg + DMAR_FEUADDR_REG);
3418
3419 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3420 }
3421 return 0;
3422
3423 nomem:
3424 for_each_active_iommu(iommu, drhd)
3425 kfree(iommu->iommu_state);
3426
3427 return -ENOMEM;
3428 }
3429
3430 static void iommu_resume(void)
3431 {
3432 struct dmar_drhd_unit *drhd;
3433 struct intel_iommu *iommu = NULL;
3434 unsigned long flag;
3435
3436 if (init_iommu_hw()) {
3437 if (force_on)
3438 panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3439 else
3440 WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3441 return;
3442 }
3443
3444 for_each_active_iommu(iommu, drhd) {
3445
3446 raw_spin_lock_irqsave(&iommu->register_lock, flag);
3447
3448 writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3449 iommu->reg + DMAR_FECTL_REG);
3450 writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3451 iommu->reg + DMAR_FEDATA_REG);
3452 writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3453 iommu->reg + DMAR_FEADDR_REG);
3454 writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3455 iommu->reg + DMAR_FEUADDR_REG);
3456
3457 raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3458 }
3459
3460 for_each_active_iommu(iommu, drhd)
3461 kfree(iommu->iommu_state);
3462 }
3463
3464 static struct syscore_ops iommu_syscore_ops = {
3465 .resume = iommu_resume,
3466 .suspend = iommu_suspend,
3467 };
3468
3469 static void __init init_iommu_pm_ops(void)
3470 {
3471 register_syscore_ops(&iommu_syscore_ops);
3472 }
3473
3474 #else
3475 static inline void init_iommu_pm_ops(void) {}
3476 #endif /* CONFIG_SUSPEND */
3477
3478 LIST_HEAD(dmar_rmrr_units);
3479
3480 static void __init dmar_register_rmrr_unit(struct dmar_rmrr_unit *rmrr)
3481 {
3482 list_add(&rmrr->list, &dmar_rmrr_units);
3483 }
3484
3485
3486 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header)
3487 {
3488 struct acpi_dmar_reserved_memory *rmrr;
3489 struct dmar_rmrr_unit *rmrru;
3490
3491 rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3492 if (!rmrru)
3493 return -ENOMEM;
3494
3495 rmrru->hdr = header;
3496 rmrr = (struct acpi_dmar_reserved_memory *)header;
3497 rmrru->base_address = rmrr->base_address;
3498 rmrru->end_address = rmrr->end_address;
3499
3500 dmar_register_rmrr_unit(rmrru);
3501 return 0;
3502 }
3503
3504 static int __init
3505 rmrr_parse_dev(struct dmar_rmrr_unit *rmrru)
3506 {
3507 struct acpi_dmar_reserved_memory *rmrr;
3508 int ret;
3509
3510 rmrr = (struct acpi_dmar_reserved_memory *) rmrru->hdr;
3511 ret = dmar_parse_dev_scope((void *)(rmrr + 1),
3512 ((void *)rmrr) + rmrr->header.length,
3513 &rmrru->devices_cnt, &rmrru->devices, rmrr->segment);
3514
3515 if (ret || (rmrru->devices_cnt == 0)) {
3516 list_del(&rmrru->list);
3517 kfree(rmrru);
3518 }
3519 return ret;
3520 }
3521
3522 static LIST_HEAD(dmar_atsr_units);
3523
3524 int __init dmar_parse_one_atsr(struct acpi_dmar_header *hdr)
3525 {
3526 struct acpi_dmar_atsr *atsr;
3527 struct dmar_atsr_unit *atsru;
3528
3529 atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3530 atsru = kzalloc(sizeof(*atsru), GFP_KERNEL);
3531 if (!atsru)
3532 return -ENOMEM;
3533
3534 atsru->hdr = hdr;
3535 atsru->include_all = atsr->flags & 0x1;
3536
3537 list_add(&atsru->list, &dmar_atsr_units);
3538
3539 return 0;
3540 }
3541
3542 static int __init atsr_parse_dev(struct dmar_atsr_unit *atsru)
3543 {
3544 int rc;
3545 struct acpi_dmar_atsr *atsr;
3546
3547 if (atsru->include_all)
3548 return 0;
3549
3550 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3551 rc = dmar_parse_dev_scope((void *)(atsr + 1),
3552 (void *)atsr + atsr->header.length,
3553 &atsru->devices_cnt, &atsru->devices,
3554 atsr->segment);
3555 if (rc || !atsru->devices_cnt) {
3556 list_del(&atsru->list);
3557 kfree(atsru);
3558 }
3559
3560 return rc;
3561 }
3562
3563 int dmar_find_matched_atsr_unit(struct pci_dev *dev)
3564 {
3565 int i;
3566 struct pci_bus *bus;
3567 struct acpi_dmar_atsr *atsr;
3568 struct dmar_atsr_unit *atsru;
3569
3570 dev = pci_physfn(dev);
3571
3572 list_for_each_entry(atsru, &dmar_atsr_units, list) {
3573 atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3574 if (atsr->segment == pci_domain_nr(dev->bus))
3575 goto found;
3576 }
3577
3578 return 0;
3579
3580 found:
3581 for (bus = dev->bus; bus; bus = bus->parent) {
3582 struct pci_dev *bridge = bus->self;
3583
3584 if (!bridge || !pci_is_pcie(bridge) ||
3585 bridge->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
3586 return 0;
3587
3588 if (bridge->pcie_type == PCI_EXP_TYPE_ROOT_PORT) {
3589 for (i = 0; i < atsru->devices_cnt; i++)
3590 if (atsru->devices[i] == bridge)
3591 return 1;
3592 break;
3593 }
3594 }
3595
3596 if (atsru->include_all)
3597 return 1;
3598
3599 return 0;
3600 }
3601
3602 int __init dmar_parse_rmrr_atsr_dev(void)
3603 {
3604 struct dmar_rmrr_unit *rmrr, *rmrr_n;
3605 struct dmar_atsr_unit *atsr, *atsr_n;
3606 int ret = 0;
3607
3608 list_for_each_entry_safe(rmrr, rmrr_n, &dmar_rmrr_units, list) {
3609 ret = rmrr_parse_dev(rmrr);
3610 if (ret)
3611 return ret;
3612 }
3613
3614 list_for_each_entry_safe(atsr, atsr_n, &dmar_atsr_units, list) {
3615 ret = atsr_parse_dev(atsr);
3616 if (ret)
3617 return ret;
3618 }
3619
3620 return ret;
3621 }
3622
3623 /*
3624 * Here we only respond to a device being unbound from its driver.
3625 *
3626 * A newly added device is not attached to its DMAR domain here yet. That
3627 * will happen when the device is first mapped to an iova.
3628 */
3629 static int device_notifier(struct notifier_block *nb,
3630 unsigned long action, void *data)
3631 {
3632 struct device *dev = data;
3633 struct pci_dev *pdev = to_pci_dev(dev);
3634 struct dmar_domain *domain;
3635
3636 if (iommu_no_mapping(dev))
3637 return 0;
3638
3639 domain = find_domain(pdev);
3640 if (!domain)
3641 return 0;
3642
3643 if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3644 domain_remove_one_dev_info(domain, pdev);
3645
3646 if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3647 !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3648 list_empty(&domain->devices))
3649 domain_exit(domain);
3650 }
3651
3652 return 0;
3653 }
3654
3655 static struct notifier_block device_nb = {
3656 .notifier_call = device_notifier,
3657 };
3658
3659 int __init intel_iommu_init(void)
3660 {
3661 int ret = 0;
3662
3663 /* VT-d is required for a TXT/tboot launch, so enforce that */
3664 force_on = tboot_force_iommu();
3665
3666 if (dmar_table_init()) {
3667 if (force_on)
3668 panic("tboot: Failed to initialize DMAR table\n");
3669 return -ENODEV;
3670 }
3671
3672 if (dmar_dev_scope_init() < 0) {
3673 if (force_on)
3674 panic("tboot: Failed to initialize DMAR device scope\n");
3675 return -ENODEV;
3676 }
3677
3678 if (no_iommu || dmar_disabled)
3679 return -ENODEV;
3680
3681 if (iommu_init_mempool()) {
3682 if (force_on)
3683 panic("tboot: Failed to initialize iommu memory\n");
3684 return -ENODEV;
3685 }
3686
3687 if (list_empty(&dmar_rmrr_units))
3688 printk(KERN_INFO "DMAR: No RMRR found\n");
3689
3690 if (list_empty(&dmar_atsr_units))
3691 printk(KERN_INFO "DMAR: No ATSR found\n");
3692
3693 if (dmar_init_reserved_ranges()) {
3694 if (force_on)
3695 panic("tboot: Failed to reserve iommu ranges\n");
3696 return -ENODEV;
3697 }
3698
3699 init_no_remapping_devices();
3700
3701 ret = init_dmars();
3702 if (ret) {
3703 if (force_on)
3704 panic("tboot: Failed to initialize DMARs\n");
3705 printk(KERN_ERR "IOMMU: dmar init failed\n");
3706 put_iova_domain(&reserved_iova_list);
3707 iommu_exit_mempool();
3708 return ret;
3709 }
3710 printk(KERN_INFO
3711 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3712
3713 init_timer(&unmap_timer);
3714 #ifdef CONFIG_SWIOTLB
3715 swiotlb = 0;
3716 #endif
3717 dma_ops = &intel_dma_ops;
3718
3719 init_iommu_pm_ops();
3720
3721 bus_set_iommu(&pci_bus_type, &intel_iommu_ops);
3722
3723 bus_register_notifier(&pci_bus_type, &device_nb);
3724
3725 intel_iommu_enabled = 1;
3726
3727 return 0;
3728 }
3729
3730 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3731 struct pci_dev *pdev)
3732 {
3733 struct pci_dev *tmp, *parent;
3734
3735 if (!iommu || !pdev)
3736 return;
3737
3738 /* dependent device detach */
3739 tmp = pci_find_upstream_pcie_bridge(pdev);
3740 /* Secondary interface's bus number and devfn 0 */
3741 if (tmp) {
3742 parent = pdev->bus->self;
3743 while (parent != tmp) {
3744 iommu_detach_dev(iommu, parent->bus->number,
3745 parent->devfn);
3746 parent = parent->bus->self;
3747 }
3748 if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3749 iommu_detach_dev(iommu,
3750 tmp->subordinate->number, 0);
3751 else /* this is a legacy PCI bridge */
3752 iommu_detach_dev(iommu, tmp->bus->number,
3753 tmp->devfn);
3754 }
3755 }
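
/*
 * Illustrative example (device addresses are hypothetical, not from the
 * original source): for an endpoint at 06:00.0 sitting behind a
 * PCIe-to-PCI bridge at 00:1e.0, the walk above detaches the context of
 * every intermediate bridge between the endpoint's parent bus and the
 * upstream bridge, then detaches the upstream bridge itself using its
 * secondary bus number and devfn 0 (PCIe-to-PCI case) or its own
 * bus/devfn (legacy PCI bridge case).
 */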

static void domain_remove_one_dev_info(struct dmar_domain *domain,
				       struct pci_dev *pdev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int found = 0;
	struct list_head *entry, *tmp;

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_safe(entry, tmp, &domain->devices) {
		info = list_entry(entry, struct device_domain_info, link);
		if (info->segment == pci_domain_nr(pdev->bus) &&
		    info->bus == pdev->bus->number &&
		    info->devfn == pdev->devfn) {
			list_del(&info->link);
			list_del(&info->global);
			if (info->dev)
				info->dev->dev.archdata.iommu = NULL;
			spin_unlock_irqrestore(&device_domain_lock, flags);

			iommu_disable_dev_iotlb(info);
			iommu_detach_dev(iommu, info->bus, info->devfn);
			iommu_detach_dependent_devices(iommu, pdev);
			free_devinfo_mem(info);

			spin_lock_irqsave(&device_domain_lock, flags);

			if (found)
				break;
			else
				continue;
		}

		/* if there are no other devices under the same iommu
		 * owned by this domain, clear this iommu in iommu_bmp
		 * and update the iommu count and coherency
		 */
		if (iommu == device_to_iommu(info->segment, info->bus,
					     info->devfn))
			found = 1;
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);

	if (found == 0) {
		unsigned long tmp_flags;
		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
		clear_bit(iommu->seq_id, domain->iommu_bmp);
		domain->iommu_count--;
		domain_update_iommu_cap(domain);
		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);

		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) {
			spin_lock_irqsave(&iommu->lock, tmp_flags);
			clear_bit(domain->id, iommu->domain_ids);
			iommu->domains[domain->id] = NULL;
			spin_unlock_irqrestore(&iommu->lock, tmp_flags);
		}
	}
}

static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags1, flags2;

	spin_lock_irqsave(&device_domain_lock, flags1);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
				  struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;

		spin_unlock_irqrestore(&device_domain_lock, flags1);

		iommu_disable_dev_iotlb(info);
		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);
		iommu_detach_dependent_devices(iommu, info->dev);

		/* clear this iommu in iommu_bmp, update iommu count
		 * and capabilities
		 */
		spin_lock_irqsave(&domain->iommu_lock, flags2);
		if (test_and_clear_bit(iommu->seq_id,
				       domain->iommu_bmp)) {
			domain->iommu_count--;
			domain_update_iommu_cap(domain);
		}
		spin_unlock_irqrestore(&domain->iommu_lock, flags2);

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags1);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags1);
}

/* domain id allocator for virtual machine domains; never set in a context entry */
static unsigned long vm_domid;

static struct dmar_domain *iommu_alloc_vm_domain(void)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	domain->id = vm_domid++;
	domain->nid = -1;
	memset(domain->iommu_bmp, 0, sizeof(domain->iommu_bmp));
	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;

	return domain;
}

static int md_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	INIT_LIST_HEAD(&domain->devices);

	domain->iommu_count = 0;
	domain->iommu_coherency = 0;
	domain->iommu_snooping = 0;
	domain->iommu_superpage = 0;
	domain->max_addr = 0;
	domain->nid = -1;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
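
/*
 * Worked example (illustrative, assuming the 30 + 9*level width progression
 * used by the AGAW helpers earlier in this file): with the default guest
 * width of 48 bits, guestwidth_to_adjustwidth(48) returns 48, since 48 is
 * already a supported width, and width_to_agaw(48) then yields an adjusted
 * AGAW of 2, i.e. a 4-level page table.
 */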

static void iommu_free_vm_domain(struct dmar_domain *domain)
{
	unsigned long flags;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	unsigned long i;
	unsigned long ndomains;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;

		ndomains = cap_ndoms(iommu->cap);
		for_each_set_bit(i, iommu->domain_ids, ndomains) {
			if (iommu->domains[i] == domain) {
				spin_lock_irqsave(&iommu->lock, flags);
				clear_bit(i, iommu->domain_ids);
				iommu->domains[i] = NULL;
				spin_unlock_irqrestore(&iommu->lock, flags);
				break;
			}
		}
	}
}

static void vm_domain_exit(struct dmar_domain *domain)
{
	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	vm_domain_remove_all_dev_info(domain);
	/* destroy iovas */
	put_iova_domain(&domain->iovad);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	/* free page tables */
	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));

	iommu_free_vm_domain(domain);
	free_domain_mem(domain);
}

static int intel_iommu_domain_init(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain;

	dmar_domain = iommu_alloc_vm_domain();
	if (!dmar_domain) {
		printk(KERN_ERR
			"intel_iommu_domain_init: dmar_domain == NULL\n");
		return -ENOMEM;
	}
	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
			"intel_iommu_domain_init() failed\n");
		vm_domain_exit(dmar_domain);
		return -ENOMEM;
	}
	domain_update_iommu_cap(dmar_domain);
	domain->priv = dmar_domain;

	return 0;
}

static void intel_iommu_domain_destroy(struct iommu_domain *domain)
{
	struct dmar_domain *dmar_domain = domain->priv;

	domain->priv = NULL;
	vm_domain_exit(dmar_domain);
}

static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
				domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		printk(KERN_ERR "%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
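	/*
	 * Illustrative example (not in the original code): if the domain was
	 * built with a 5-level table (agaw 3, 57-bit) but this IOMMU only
	 * supports 4 levels (agaw 2, 48-bit), the loop below replaces the
	 * pgd with the lower-level table it points to and frees the unused
	 * top level, repeating until the widths match.
	 */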
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
}

static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	domain_remove_one_dev_info(dmar_domain, pdev);
}

static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   size_t size, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int prot = 0;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round size up to the next multiple of PAGE_SIZE if it, combined
	   with the low bits of hpa, would carry us onto the next page */
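	/*
	 * Worked example (illustrative): mapping hpa 0x1234 with size 0x2000
	 * touches bytes 0x1234..0x3233, i.e. three 4KiB pages, so the page
	 * count is rounded up to 3 rather than 2 before calling
	 * domain_pfn_mapping() below.
	 */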
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}

static size_t intel_iommu_unmap(struct iommu_domain *domain,
				unsigned long iova, size_t size)
{
	struct dmar_domain *dmar_domain = domain->priv;
	int order;

	order = dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
				    (iova + size - 1) >> VTD_PAGE_SHIFT);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return PAGE_SIZE << order;
}

static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    unsigned long iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return intr_remapping_enabled;

	return 0;
}

/*
 * Group numbers are arbitrary.  Devices with the same group number
 * indicate that the iommu cannot differentiate between them.  To avoid
 * tracking used groups we just use the seg|bus|devfn of the lowest
 * level at which we're able to differentiate devices.
 */
static int intel_iommu_device_group(struct device *dev, unsigned int *groupid)
{
	struct pci_dev *pdev = to_pci_dev(dev);
	struct pci_dev *bridge;
	union {
		struct {
			u8 devfn;
			u8 bus;
			u16 segment;
		} pci;
		u32 group;
	} id;

	if (iommu_no_mapping(dev))
		return -ENODEV;

	id.pci.segment = pci_domain_nr(pdev->bus);
	id.pci.bus = pdev->bus->number;
	id.pci.devfn = pdev->devfn;

	if (!device_to_iommu(id.pci.segment, id.pci.bus, id.pci.devfn))
		return -ENODEV;

	bridge = pci_find_upstream_pcie_bridge(pdev);
	if (bridge) {
		if (pci_is_pcie(bridge)) {
			id.pci.bus = bridge->subordinate->number;
			id.pci.devfn = 0;
		} else {
			id.pci.bus = bridge->bus->number;
			id.pci.devfn = bridge->devfn;
		}
	}

	if (!pdev->is_virtfn && iommu_group_mf)
		id.pci.devfn = PCI_DEVFN(PCI_SLOT(id.pci.devfn), 0);

	*groupid = id.group;

	return 0;
}
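
/*
 * Worked example (illustrative): on a little-endian machine the union above
 * packs devfn into bits 7:0, bus into bits 15:8 and the segment into bits
 * 31:16, so a device at 0000:03:00.1 (segment 0, bus 0x03, devfn 0x01)
 * ends up with group id 0x00000301.
 */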

static struct iommu_ops intel_iommu_ops = {
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy	= intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map,
	.unmap		= intel_iommu_unmap,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.domain_has_cap	= intel_iommu_domain_has_cap,
	.device_group	= intel_iommu_device_group,
	.pgsize_bitmap	= INTEL_IOMMU_PGSIZES,
};

static void __devinit quirk_iommu_g4x_gfx(struct pci_dev *dev)
{
	/* G4x/GM45 integrated gfx dmar support is totally busted. */
	printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
	dmar_map_gfx = 0;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_g4x_gfx);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_g4x_gfx);

static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it. Same seems to hold for the desktop versions.
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);

#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
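
/*
 * Illustrative decoding (derived from the masks above, not from chipset
 * documentation): bits 11:8 of the GGC word carry the graphics memory / VT
 * configuration.  Encodings 0x1 and 0x3 (1M/2M) leave GGC_MEMORY_VT_ENABLED
 * clear, while 0x9, 0xa and 0xb (2M/3M/4M with VT) all have that bit set,
 * which is the condition quirk_calpella_no_shadow_gtt() below tests.
 */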

static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	} else if (dmar_map_gfx) {
		/* we have to ensure the gfx device is idle before we flush */
		printk(KERN_INFO "DMAR: Disabling batched IOTLB flush on Ironlake\n");
		intel_iommu_strict = 1;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}
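
/*
 * Illustrative readings of the register above (derived from the checks in
 * check_tylersburg_isoch(), not from chipset documentation): bit 0 set means
 * Azalia DMA is routed to the non-isoch DMAR unit and nothing needs doing.
 * With bit 0 clear, a masked value (vtisochctrl & 0x1c) of 0x10 is the
 * recommended 16 TLB entries, zero is the broken-BIOS case that forces
 * identity mapping for Azalia, and anything else only triggers the warning.
 */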