1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23 
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <linux/syscore_ops.h>
40 #include <linux/tboot.h>
41 #include <linux/dmi.h>
42 #include <asm/cacheflush.h>
43 #include <asm/iommu.h>
44 #include "pci.h"
45 
46 #define ROOT_SIZE		VTD_PAGE_SIZE
47 #define CONTEXT_SIZE		VTD_PAGE_SIZE
48 
49 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
50 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
51 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
52 
53 #define IOAPIC_RANGE_START	(0xfee00000)
54 #define IOAPIC_RANGE_END	(0xfeefffff)
55 #define IOVA_START_ADDR		(0x1000)
56 
57 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
58 
59 #define MAX_AGAW_WIDTH 64
60 
61 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << (gaw-VTD_PAGE_SHIFT)) - 1)
62 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << gaw) - 1)
63 
64 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
65    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
66 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
67 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
68 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
69 
70 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
71 #define DMA_32BIT_PFN		IOVA_PFN(DMA_BIT_MASK(32))
72 #define DMA_64BIT_PFN		IOVA_PFN(DMA_BIT_MASK(64))
73 
74 /* page table handling */
75 #define LEVEL_STRIDE		(9)
76 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
77 
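/*
 * The adjusted guest address width (agaw) encodes the page-table depth:
 * agaw 0 is a 2-level table covering 30 bits, and each additional level
 * adds LEVEL_STRIDE (9) bits.  For example, a 48-bit address width gives
 * agaw = (48 - 30) / 9 = 2, i.e. a 4-level page table.
 */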
78 static inline int agaw_to_level(int agaw)
79 {
80 	return agaw + 2;
81 }
82 
83 static inline int agaw_to_width(int agaw)
84 {
85 	return 30 + agaw * LEVEL_STRIDE;
86 }
87 
88 static inline int width_to_agaw(int width)
89 {
90 	return (width - 30) / LEVEL_STRIDE;
91 }
92 
93 static inline unsigned int level_to_offset_bits(int level)
94 {
95 	return (level - 1) * LEVEL_STRIDE;
96 }
97 
98 static inline int pfn_level_offset(unsigned long pfn, int level)
99 {
100 	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
101 }
102 
103 static inline unsigned long level_mask(int level)
104 {
105 	return -1UL << level_to_offset_bits(level);
106 }
107 
108 static inline unsigned long level_size(int level)
109 {
110 	return 1UL << level_to_offset_bits(level);
111 }
112 
113 static inline unsigned long align_to_level(unsigned long pfn, int level)
114 {
115 	return (pfn + level_size(level) - 1) & level_mask(level);
116 }
117 
118 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
119    are never going to work. */
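/* When PAGE_SHIFT == VTD_PAGE_SHIFT == 12 the conversions below are the
   identity; with larger MM pages (e.g. 64KiB) one MM pfn covers
   2^(PAGE_SHIFT - VTD_PAGE_SHIFT) VT-d pfns. */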
120 static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn)
121 {
122 	return dma_pfn >> (PAGE_SHIFT - VTD_PAGE_SHIFT);
123 }
124 
125 static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
126 {
127 	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
128 }
129 static inline unsigned long page_to_dma_pfn(struct page *pg)
130 {
131 	return mm_to_dma_pfn(page_to_pfn(pg));
132 }
133 static inline unsigned long virt_to_dma_pfn(void *p)
134 {
135 	return page_to_dma_pfn(virt_to_page(p));
136 }
137 
138 /* global iommu list, set NULL for ignored DMAR units */
139 static struct intel_iommu **g_iommus;
140 
141 static void __init check_tylersburg_isoch(void);
142 static int rwbf_quirk;
143 
144 /*
145  * 0: Present
146  * 1-11: Reserved
147  * 12-63: Context Ptr (12 - (haw-1))
148  * 64-127: Reserved
149  */
150 struct root_entry {
151 	u64	val;
152 	u64	rsvd1;
153 };
154 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
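/*
 * A root table is one 4KiB page of 16-byte root entries, i.e. 256 of
 * them, indexed by PCI bus number; each present entry points to the
 * context table for that bus (see device_to_context_entry()).
 */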
155 static inline bool root_present(struct root_entry *root)
156 {
157 	return (root->val & 1);
158 }
159 static inline void set_root_present(struct root_entry *root)
160 {
161 	root->val |= 1;
162 }
163 static inline void set_root_value(struct root_entry *root, unsigned long value)
164 {
165 	root->val |= value & VTD_PAGE_MASK;
166 }
167 
168 static inline struct context_entry *
169 get_context_addr_from_root(struct root_entry *root)
170 {
171 	return (struct context_entry *)
172 		(root_present(root)?phys_to_virt(
173 		root->val & VTD_PAGE_MASK) :
174 		NULL);
175 }
176 
177 /*
178  * low 64 bits:
179  * 0: present
180  * 1: fault processing disable
181  * 2-3: translation type
182  * 12-63: address space root
183  * high 64 bits:
184  * 0-2: address width
185  * 3-6: aval
186  * 8-23: domain id
187  */
188 struct context_entry {
189 	u64 lo;
190 	u64 hi;
191 };
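/*
 * A context table holds 256 of these 16-byte entries, indexed by devfn,
 * so one (bus, devfn) pair selects exactly one context entry through the
 * root table.
 */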
192 
193 static inline bool context_present(struct context_entry *context)
194 {
195 	return (context->lo & 1);
196 }
197 static inline void context_set_present(struct context_entry *context)
198 {
199 	context->lo |= 1;
200 }
201 
202 static inline void context_set_fault_enable(struct context_entry *context)
203 {
204 	context->lo &= (((u64)-1) << 2) | 1;
205 }
206 
207 static inline void context_set_translation_type(struct context_entry *context,
208 						unsigned long value)
209 {
210 	context->lo &= (((u64)-1) << 4) | 3;
211 	context->lo |= (value & 3) << 2;
212 }
213 
214 static inline void context_set_address_root(struct context_entry *context,
215 					    unsigned long value)
216 {
217 	context->lo |= value & VTD_PAGE_MASK;
218 }
219 
220 static inline void context_set_address_width(struct context_entry *context,
221 					     unsigned long value)
222 {
223 	context->hi |= value & 7;
224 }
225 
226 static inline void context_set_domain_id(struct context_entry *context,
227 					 unsigned long value)
228 {
229 	context->hi |= (value & ((1 << 16) - 1)) << 8;
230 }
231 
232 static inline void context_clear_entry(struct context_entry *context)
233 {
234 	context->lo = 0;
235 	context->hi = 0;
236 }
237 
238 /*
239  * 0: readable
240  * 1: writable
241  * 2-6: reserved
242  * 7: super page
243  * 8-10: available
244  * 11: snoop behavior
245  * 12-63: Host physical address
246  */
247 struct dma_pte {
248 	u64 val;
249 };
250 
251 static inline void dma_clear_pte(struct dma_pte *pte)
252 {
253 	pte->val = 0;
254 }
255 
256 static inline void dma_set_pte_readable(struct dma_pte *pte)
257 {
258 	pte->val |= DMA_PTE_READ;
259 }
260 
261 static inline void dma_set_pte_writable(struct dma_pte *pte)
262 {
263 	pte->val |= DMA_PTE_WRITE;
264 }
265 
266 static inline void dma_set_pte_snp(struct dma_pte *pte)
267 {
268 	pte->val |= DMA_PTE_SNP;
269 }
270 
271 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
272 {
273 	pte->val = (pte->val & ~3) | (prot & 3);
274 }
275 
276 static inline u64 dma_pte_addr(struct dma_pte *pte)
277 {
278 #ifdef CONFIG_64BIT
279 	return pte->val & VTD_PAGE_MASK;
280 #else
281 	/* Must have a full atomic 64-bit read */
282 	return  __cmpxchg64(&pte->val, 0ULL, 0ULL) & VTD_PAGE_MASK;
283 #endif
284 }
285 
286 static inline void dma_set_pte_pfn(struct dma_pte *pte, unsigned long pfn)
287 {
288 	pte->val |= (uint64_t)pfn << VTD_PAGE_SHIFT;
289 }
290 
291 static inline bool dma_pte_present(struct dma_pte *pte)
292 {
293 	return (pte->val & 3) != 0;
294 }
295 
296 static inline int first_pte_in_page(struct dma_pte *pte)
297 {
298 	return !((unsigned long)pte & ~VTD_PAGE_MASK);
299 }
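/*
 * first_pte_in_page() is true when the pte's address is VT-d page
 * aligned, i.e. it is entry 0 of its 512-entry page-table page; the
 * mapping and clearing loops below use it to detect crossing into the
 * next page-table page.
 */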
300 
301 /*
302  * This domain is a statically identity mapping domain.
303  *	1. This domain creates a static 1:1 mapping to all usable memory.
304  * 	2. It maps to each iommu if successful.
305  *	3. Each iommu maps to this domain if successful.
306  */
307 static struct dmar_domain *si_domain;
308 static int hw_pass_through = 1;
309 
310 /* devices under the same p2p bridge are owned in one domain */
311 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
312 
313 /* domain represents a virtual machine; more than one device
314  * across iommus may be owned in one domain, e.g. kvm guest.
315  */
316 #define DOMAIN_FLAG_VIRTUAL_MACHINE	(1 << 1)
317 
318 /* si_domain contains multiple devices */
319 #define DOMAIN_FLAG_STATIC_IDENTITY	(1 << 2)
320 
321 struct dmar_domain {
322 	int	id;			/* domain id */
323 	int	nid;			/* node id */
324 	unsigned long iommu_bmp;	/* bitmap of iommus this domain uses*/
325 
326 	struct list_head devices; 	/* all devices' list */
327 	struct iova_domain iovad;	/* iova's that belong to this domain */
328 
329 	struct dma_pte	*pgd;		/* virtual address */
330 	int		gaw;		/* max guest address width */
331 
332 	/* adjusted guest address width, 0 is level 2 30-bit */
333 	int		agaw;
334 
335 	int		flags;		/* flags to find out type of domain */
336 
337 	int		iommu_coherency;/* indicate coherency of iommu access */
338 	int		iommu_snooping; /* indicate snooping control feature*/
339 	int		iommu_count;	/* reference count of iommu */
340 	spinlock_t	iommu_lock;	/* protect iommu set in domain */
341 	u64		max_addr;	/* maximum mapped address */
342 };
343 
344 /* PCI domain-device relationship */
345 struct device_domain_info {
346 	struct list_head link;	/* link to domain siblings */
347 	struct list_head global; /* link to global list */
348 	int segment;		/* PCI domain */
349 	u8 bus;			/* PCI bus number */
350 	u8 devfn;		/* PCI devfn number */
351 	struct pci_dev *dev; /* it's NULL for PCIe-to-PCI bridge */
352 	struct intel_iommu *iommu; /* IOMMU used by this device */
353 	struct dmar_domain *domain; /* pointer to domain */
354 };
355 
356 static void flush_unmaps_timeout(unsigned long data);
357 
358 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
359 
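/*
 * Lazy unmapping state: freed IOVAs are queued in the deferred_flush
 * tables and their IOTLB entries are invalidated in batches when
 * unmap_timer fires or the queue reaches HIGH_WATER_MARK, rather than
 * on every unmap (unless intel_iommu=strict is used).
 */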
360 #define HIGH_WATER_MARK 250
361 struct deferred_flush_tables {
362 	int next;
363 	struct iova *iova[HIGH_WATER_MARK];
364 	struct dmar_domain *domain[HIGH_WATER_MARK];
365 };
366 
367 static struct deferred_flush_tables *deferred_flush;
368 
369 /* bitmap for indexing intel_iommus */
370 static int g_num_of_iommus;
371 
372 static DEFINE_SPINLOCK(async_umap_flush_lock);
373 static LIST_HEAD(unmaps_to_do);
374 
375 static int timer_on;
376 static long list_size;
377 
378 static void domain_remove_dev_info(struct dmar_domain *domain);
379 
380 #ifdef CONFIG_DMAR_DEFAULT_ON
381 int dmar_disabled = 0;
382 #else
383 int dmar_disabled = 1;
384 #endif /*CONFIG_DMAR_DEFAULT_ON*/
385 
386 static int dmar_map_gfx = 1;
387 static int dmar_forcedac;
388 static int intel_iommu_strict;
389 
390 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
391 static DEFINE_SPINLOCK(device_domain_lock);
392 static LIST_HEAD(device_domain_list);
393 
394 static struct iommu_ops intel_iommu_ops;
395 
396 static int __init intel_iommu_setup(char *str)
397 {
398 	if (!str)
399 		return -EINVAL;
400 	while (*str) {
401 		if (!strncmp(str, "on", 2)) {
402 			dmar_disabled = 0;
403 			printk(KERN_INFO "Intel-IOMMU: enabled\n");
404 		} else if (!strncmp(str, "off", 3)) {
405 			dmar_disabled = 1;
406 			printk(KERN_INFO "Intel-IOMMU: disabled\n");
407 		} else if (!strncmp(str, "igfx_off", 8)) {
408 			dmar_map_gfx = 0;
409 			printk(KERN_INFO
410 				"Intel-IOMMU: disable GFX device mapping\n");
411 		} else if (!strncmp(str, "forcedac", 8)) {
412 			printk(KERN_INFO
413 				"Intel-IOMMU: Forcing DAC for PCI devices\n");
414 			dmar_forcedac = 1;
415 		} else if (!strncmp(str, "strict", 6)) {
416 			printk(KERN_INFO
417 				"Intel-IOMMU: disable batched IOTLB flush\n");
418 			intel_iommu_strict = 1;
419 		}
420 
421 		str += strcspn(str, ",");
422 		while (*str == ',')
423 			str++;
424 	}
425 	return 0;
426 }
427 __setup("intel_iommu=", intel_iommu_setup);
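/*
 * Example: booting with "intel_iommu=on,strict" enables DMA remapping
 * and disables batched IOTLB flushing; options are comma separated, as
 * parsed above.
 */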
428 
429 static struct kmem_cache *iommu_domain_cache;
430 static struct kmem_cache *iommu_devinfo_cache;
431 static struct kmem_cache *iommu_iova_cache;
432 
433 static inline void *alloc_pgtable_page(int node)
434 {
435 	struct page *page;
436 	void *vaddr = NULL;
437 
438 	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
439 	if (page)
440 		vaddr = page_address(page);
441 	return vaddr;
442 }
443 
444 static inline void free_pgtable_page(void *vaddr)
445 {
446 	free_page((unsigned long)vaddr);
447 }
448 
449 static inline void *alloc_domain_mem(void)
450 {
451 	return kmem_cache_alloc(iommu_domain_cache, GFP_ATOMIC);
452 }
453 
454 static void free_domain_mem(void *vaddr)
455 {
456 	kmem_cache_free(iommu_domain_cache, vaddr);
457 }
458 
459 static inline void * alloc_devinfo_mem(void)
460 {
461 	return kmem_cache_alloc(iommu_devinfo_cache, GFP_ATOMIC);
462 }
463 
464 static inline void free_devinfo_mem(void *vaddr)
465 {
466 	kmem_cache_free(iommu_devinfo_cache, vaddr);
467 }
468 
469 struct iova *alloc_iova_mem(void)
470 {
471 	return kmem_cache_alloc(iommu_iova_cache, GFP_ATOMIC);
472 }
473 
474 void free_iova_mem(struct iova *iova)
475 {
476 	kmem_cache_free(iommu_iova_cache, iova);
477 }
478 
479 
480 static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
481 {
482 	unsigned long sagaw;
483 	int agaw = -1;
484 
485 	sagaw = cap_sagaw(iommu->cap);
486 	for (agaw = width_to_agaw(max_gaw);
487 	     agaw >= 0; agaw--) {
488 		if (test_bit(agaw, &sagaw))
489 			break;
490 	}
491 
492 	return agaw;
493 }
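/*
 * Example: with max_gaw = 48 the search starts at agaw 2 (a 4-level
 * table) and walks downwards until cap_sagaw() reports a width the
 * hardware supports, returning -1 if none is supported.
 */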
494 
495 /*
496  * Calculate max SAGAW for each iommu.
497  */
498 int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
499 {
500 	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
501 }
502 
503 /*
504  * calculate agaw for each iommu.
505  * "SAGAW" may differ across iommus, so use a default agaw and fall
506  * back to a smaller supported agaw for iommus that don't support the default.
507  */
508 int iommu_calculate_agaw(struct intel_iommu *iommu)
509 {
510 	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
511 }
512 
513 /* This function only returns the single iommu in a domain */
514 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
515 {
516 	int iommu_id;
517 
518 	/* si_domain and vm domain should not get here. */
519 	BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
520 	BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY);
521 
522 	iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
523 	if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
524 		return NULL;
525 
526 	return g_iommus[iommu_id];
527 }
528 
529 static void domain_update_iommu_coherency(struct dmar_domain *domain)
530 {
531 	int i;
532 
533 	domain->iommu_coherency = 1;
534 
535 	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
536 		if (!ecap_coherent(g_iommus[i]->ecap)) {
537 			domain->iommu_coherency = 0;
538 			break;
539 		}
540 	}
541 }
542 
543 static void domain_update_iommu_snooping(struct dmar_domain *domain)
544 {
545 	int i;
546 
547 	domain->iommu_snooping = 1;
548 
549 	for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) {
550 		if (!ecap_sc_support(g_iommus[i]->ecap)) {
551 			domain->iommu_snooping = 0;
552 			break;
553 		}
554 	}
555 }
556 
557 /* Some capabilities may be different across iommus */
558 static void domain_update_iommu_cap(struct dmar_domain *domain)
559 {
560 	domain_update_iommu_coherency(domain);
561 	domain_update_iommu_snooping(domain);
562 }
563 
564 static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn)
565 {
566 	struct dmar_drhd_unit *drhd = NULL;
567 	int i;
568 
569 	for_each_drhd_unit(drhd) {
570 		if (drhd->ignored)
571 			continue;
572 		if (segment != drhd->segment)
573 			continue;
574 
575 		for (i = 0; i < drhd->devices_cnt; i++) {
576 			if (drhd->devices[i] &&
577 			    drhd->devices[i]->bus->number == bus &&
578 			    drhd->devices[i]->devfn == devfn)
579 				return drhd->iommu;
580 			if (drhd->devices[i] &&
581 			    drhd->devices[i]->subordinate &&
582 			    drhd->devices[i]->subordinate->number <= bus &&
583 			    drhd->devices[i]->subordinate->subordinate >= bus)
584 				return drhd->iommu;
585 		}
586 
587 		if (drhd->include_all)
588 			return drhd->iommu;
589 	}
590 
591 	return NULL;
592 }
593 
594 static void domain_flush_cache(struct dmar_domain *domain,
595 			       void *addr, int size)
596 {
597 	if (!domain->iommu_coherency)
598 		clflush_cache_range(addr, size);
599 }
600 
601 /* Gets context entry for a given bus and devfn */
602 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
603 		u8 bus, u8 devfn)
604 {
605 	struct root_entry *root;
606 	struct context_entry *context;
607 	unsigned long phy_addr;
608 	unsigned long flags;
609 
610 	spin_lock_irqsave(&iommu->lock, flags);
611 	root = &iommu->root_entry[bus];
612 	context = get_context_addr_from_root(root);
613 	if (!context) {
614 		context = (struct context_entry *)
615 				alloc_pgtable_page(iommu->node);
616 		if (!context) {
617 			spin_unlock_irqrestore(&iommu->lock, flags);
618 			return NULL;
619 		}
620 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
621 		phy_addr = virt_to_phys((void *)context);
622 		set_root_value(root, phy_addr);
623 		set_root_present(root);
624 		__iommu_flush_cache(iommu, root, sizeof(*root));
625 	}
626 	spin_unlock_irqrestore(&iommu->lock, flags);
627 	return &context[devfn];
628 }
629 
630 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
631 {
632 	struct root_entry *root;
633 	struct context_entry *context;
634 	int ret;
635 	unsigned long flags;
636 
637 	spin_lock_irqsave(&iommu->lock, flags);
638 	root = &iommu->root_entry[bus];
639 	context = get_context_addr_from_root(root);
640 	if (!context) {
641 		ret = 0;
642 		goto out;
643 	}
644 	ret = context_present(&context[devfn]);
645 out:
646 	spin_unlock_irqrestore(&iommu->lock, flags);
647 	return ret;
648 }
649 
650 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
651 {
652 	struct root_entry *root;
653 	struct context_entry *context;
654 	unsigned long flags;
655 
656 	spin_lock_irqsave(&iommu->lock, flags);
657 	root = &iommu->root_entry[bus];
658 	context = get_context_addr_from_root(root);
659 	if (context) {
660 		context_clear_entry(&context[devfn]);
661 		__iommu_flush_cache(iommu, &context[devfn], \
662 			sizeof(*context));
663 	}
664 	spin_unlock_irqrestore(&iommu->lock, flags);
665 }
666 
667 static void free_context_table(struct intel_iommu *iommu)
668 {
669 	struct root_entry *root;
670 	int i;
671 	unsigned long flags;
672 	struct context_entry *context;
673 
674 	spin_lock_irqsave(&iommu->lock, flags);
675 	if (!iommu->root_entry) {
676 		goto out;
677 	}
678 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
679 		root = &iommu->root_entry[i];
680 		context = get_context_addr_from_root(root);
681 		if (context)
682 			free_pgtable_page(context);
683 	}
684 	free_pgtable_page(iommu->root_entry);
685 	iommu->root_entry = NULL;
686 out:
687 	spin_unlock_irqrestore(&iommu->lock, flags);
688 }
689 
690 static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
691 				      unsigned long pfn)
692 {
693 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
694 	struct dma_pte *parent, *pte = NULL;
695 	int level = agaw_to_level(domain->agaw);
696 	int offset;
697 
698 	BUG_ON(!domain->pgd);
699 	BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width);
700 	parent = domain->pgd;
701 
702 	while (level > 0) {
703 		void *tmp_page;
704 
705 		offset = pfn_level_offset(pfn, level);
706 		pte = &parent[offset];
707 		if (level == 1)
708 			break;
709 
710 		if (!dma_pte_present(pte)) {
711 			uint64_t pteval;
712 
713 			tmp_page = alloc_pgtable_page(domain->nid);
714 
715 			if (!tmp_page)
716 				return NULL;
717 
718 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
719 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
720 			if (cmpxchg64(&pte->val, 0ULL, pteval)) {
721 				/* Someone else set it while we were thinking; use theirs. */
722 				free_pgtable_page(tmp_page);
723 			} else {
724 				dma_pte_addr(pte);
725 				domain_flush_cache(domain, pte, sizeof(*pte));
726 			}
727 		}
728 		parent = phys_to_virt(dma_pte_addr(pte));
729 		level--;
730 	}
731 
732 	return pte;
733 }
734 
735 /* return address's pte at a specific level */
736 static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
737 					 unsigned long pfn,
738 					 int level)
739 {
740 	struct dma_pte *parent, *pte = NULL;
741 	int total = agaw_to_level(domain->agaw);
742 	int offset;
743 
744 	parent = domain->pgd;
745 	while (level <= total) {
746 		offset = pfn_level_offset(pfn, total);
747 		pte = &parent[offset];
748 		if (level == total)
749 			return pte;
750 
751 		if (!dma_pte_present(pte))
752 			break;
753 		parent = phys_to_virt(dma_pte_addr(pte));
754 		total--;
755 	}
756 	return NULL;
757 }
758 
759 /* clear last level pte; a tlb flush should follow */
760 static void dma_pte_clear_range(struct dmar_domain *domain,
761 				unsigned long start_pfn,
762 				unsigned long last_pfn)
763 {
764 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
765 	struct dma_pte *first_pte, *pte;
766 
767 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
768 	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
769 	BUG_ON(start_pfn > last_pfn);
770 
771 	/* we don't need lock here; nobody else touches the iova range */
772 	do {
773 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1);
774 		if (!pte) {
775 			start_pfn = align_to_level(start_pfn + 1, 2);
776 			continue;
777 		}
778 		do {
779 			dma_clear_pte(pte);
780 			start_pfn++;
781 			pte++;
782 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
783 
784 		domain_flush_cache(domain, first_pte,
785 				   (void *)pte - (void *)first_pte);
786 
787 	} while (start_pfn && start_pfn <= last_pfn);
788 }
789 
790 /* free page table pages. last level pte should already be cleared */
791 static void dma_pte_free_pagetable(struct dmar_domain *domain,
792 				   unsigned long start_pfn,
793 				   unsigned long last_pfn)
794 {
795 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
796 	struct dma_pte *first_pte, *pte;
797 	int total = agaw_to_level(domain->agaw);
798 	int level;
799 	unsigned long tmp;
800 
801 	BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width);
802 	BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width);
803 	BUG_ON(start_pfn > last_pfn);
804 
805 	/* We don't need lock here; nobody else touches the iova range */
806 	level = 2;
807 	while (level <= total) {
808 		tmp = align_to_level(start_pfn, level);
809 
810 		/* If we can't even clear one PTE at this level, we're done */
811 		if (tmp + level_size(level) - 1 > last_pfn)
812 			return;
813 
814 		do {
815 			first_pte = pte = dma_pfn_level_pte(domain, tmp, level);
816 			if (!pte) {
817 				tmp = align_to_level(tmp + 1, level + 1);
818 				continue;
819 			}
820 			do {
821 				if (dma_pte_present(pte)) {
822 					free_pgtable_page(phys_to_virt(dma_pte_addr(pte)));
823 					dma_clear_pte(pte);
824 				}
825 				pte++;
826 				tmp += level_size(level);
827 			} while (!first_pte_in_page(pte) &&
828 				 tmp + level_size(level) - 1 <= last_pfn);
829 
830 			domain_flush_cache(domain, first_pte,
831 					   (void *)pte - (void *)first_pte);
832 
833 		} while (tmp && tmp + level_size(level) - 1 <= last_pfn);
834 		level++;
835 	}
836 	/* free pgd */
837 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
838 		free_pgtable_page(domain->pgd);
839 		domain->pgd = NULL;
840 	}
841 }
842 
843 /* iommu handling */
844 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
845 {
846 	struct root_entry *root;
847 	unsigned long flags;
848 
849 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
850 	if (!root)
851 		return -ENOMEM;
852 
853 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
854 
855 	spin_lock_irqsave(&iommu->lock, flags);
856 	iommu->root_entry = root;
857 	spin_unlock_irqrestore(&iommu->lock, flags);
858 
859 	return 0;
860 }
861 
862 static void iommu_set_root_entry(struct intel_iommu *iommu)
863 {
864 	void *addr;
865 	u32 sts;
866 	unsigned long flag;
867 
868 	addr = iommu->root_entry;
869 
870 	spin_lock_irqsave(&iommu->register_lock, flag);
871 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
872 
873 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
874 
875 	/* Make sure hardware completes it */
876 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
877 		      readl, (sts & DMA_GSTS_RTPS), sts);
878 
879 	spin_unlock_irqrestore(&iommu->register_lock, flag);
880 }
881 
882 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
883 {
884 	u32 val;
885 	unsigned long flag;
886 
887 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
888 		return;
889 
890 	spin_lock_irqsave(&iommu->register_lock, flag);
891 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
892 
893 	/* Make sure hardware completes it */
894 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
895 		      readl, (!(val & DMA_GSTS_WBFS)), val);
896 
897 	spin_unlock_irqrestore(&iommu->register_lock, flag);
898 }
899 
900 /* return value determines whether we need a write buffer flush */
901 static void __iommu_flush_context(struct intel_iommu *iommu,
902 				  u16 did, u16 source_id, u8 function_mask,
903 				  u64 type)
904 {
905 	u64 val = 0;
906 	unsigned long flag;
907 
908 	switch (type) {
909 	case DMA_CCMD_GLOBAL_INVL:
910 		val = DMA_CCMD_GLOBAL_INVL;
911 		break;
912 	case DMA_CCMD_DOMAIN_INVL:
913 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
914 		break;
915 	case DMA_CCMD_DEVICE_INVL:
916 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
917 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
918 		break;
919 	default:
920 		BUG();
921 	}
922 	val |= DMA_CCMD_ICC;
923 
924 	spin_lock_irqsave(&iommu->register_lock, flag);
925 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
926 
927 	/* Make sure hardware completes it */
928 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
929 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
930 
931 	spin_unlock_irqrestore(&iommu->register_lock, flag);
932 }
933 
934 /* return value determines whether we need a write buffer flush */
935 static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
936 				u64 addr, unsigned int size_order, u64 type)
937 {
938 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
939 	u64 val = 0, val_iva = 0;
940 	unsigned long flag;
941 
942 	switch (type) {
943 	case DMA_TLB_GLOBAL_FLUSH:
944 		/* global flush doesn't need to set IVA_REG */
945 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
946 		break;
947 	case DMA_TLB_DSI_FLUSH:
948 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
949 		break;
950 	case DMA_TLB_PSI_FLUSH:
951 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
952 		/* Note: always flush non-leaf currently */
953 		val_iva = size_order | addr;
954 		break;
955 	default:
956 		BUG();
957 	}
958 	/* Note: set drain read/write */
959 #if 0
960 	/*
961 	 * This is probably to be super secure.. Looks like we can
962 	 * ignore it without any impact.
963 	 */
964 	if (cap_read_drain(iommu->cap))
965 		val |= DMA_TLB_READ_DRAIN;
966 #endif
967 	if (cap_write_drain(iommu->cap))
968 		val |= DMA_TLB_WRITE_DRAIN;
969 
970 	spin_lock_irqsave(&iommu->register_lock, flag);
971 	/* Note: Only uses first TLB reg currently */
972 	if (val_iva)
973 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
974 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
975 
976 	/* Make sure hardware completes it */
977 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
978 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
979 
980 	spin_unlock_irqrestore(&iommu->register_lock, flag);
981 
982 	/* check IOTLB invalidation granularity */
983 	if (DMA_TLB_IAIG(val) == 0)
984 		printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
985 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
986 		pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
987 			(unsigned long long)DMA_TLB_IIRG(type),
988 			(unsigned long long)DMA_TLB_IAIG(val));
989 }
990 
991 static struct device_domain_info *iommu_support_dev_iotlb(
992 	struct dmar_domain *domain, int segment, u8 bus, u8 devfn)
993 {
994 	int found = 0;
995 	unsigned long flags;
996 	struct device_domain_info *info;
997 	struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn);
998 
999 	if (!ecap_dev_iotlb_support(iommu->ecap))
1000 		return NULL;
1001 
1002 	if (!iommu->qi)
1003 		return NULL;
1004 
1005 	spin_lock_irqsave(&device_domain_lock, flags);
1006 	list_for_each_entry(info, &domain->devices, link)
1007 		if (info->bus == bus && info->devfn == devfn) {
1008 			found = 1;
1009 			break;
1010 		}
1011 	spin_unlock_irqrestore(&device_domain_lock, flags);
1012 
1013 	if (!found || !info->dev)
1014 		return NULL;
1015 
1016 	if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS))
1017 		return NULL;
1018 
1019 	if (!dmar_find_matched_atsr_unit(info->dev))
1020 		return NULL;
1021 
1022 	info->iommu = iommu;
1023 
1024 	return info;
1025 }
1026 
1027 static void iommu_enable_dev_iotlb(struct device_domain_info *info)
1028 {
1029 	if (!info)
1030 		return;
1031 
1032 	pci_enable_ats(info->dev, VTD_PAGE_SHIFT);
1033 }
1034 
1035 static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1036 {
1037 	if (!info->dev || !pci_ats_enabled(info->dev))
1038 		return;
1039 
1040 	pci_disable_ats(info->dev);
1041 }
1042 
1043 static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
1044 				  u64 addr, unsigned mask)
1045 {
1046 	u16 sid, qdep;
1047 	unsigned long flags;
1048 	struct device_domain_info *info;
1049 
1050 	spin_lock_irqsave(&device_domain_lock, flags);
1051 	list_for_each_entry(info, &domain->devices, link) {
1052 		if (!info->dev || !pci_ats_enabled(info->dev))
1053 			continue;
1054 
1055 		sid = info->bus << 8 | info->devfn;
1056 		qdep = pci_ats_queue_depth(info->dev);
1057 		qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask);
1058 	}
1059 	spin_unlock_irqrestore(&device_domain_lock, flags);
1060 }
1061 
1062 static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
1063 				  unsigned long pfn, unsigned int pages, int map)
1064 {
1065 	unsigned int mask = ilog2(__roundup_pow_of_two(pages));
1066 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1067 
1068 	BUG_ON(pages == 0);
1069 
1070 	/*
1071 	 * Fallback to domain selective flush if no PSI support or the size is
1072 	 * too big.
1073 	 * PSI requires page size to be 2 ^ x, and the base address is naturally
1074 	 * aligned to the size
1075 	 */
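	/* e.g. flushing 3 pages rounds up to mask = 2, i.e. a naturally
	   aligned 4-page invalidation. */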
1076 	if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap))
1077 		iommu->flush.flush_iotlb(iommu, did, 0, 0,
1078 						DMA_TLB_DSI_FLUSH);
1079 	else
1080 		iommu->flush.flush_iotlb(iommu, did, addr, mask,
1081 						DMA_TLB_PSI_FLUSH);
1082 
1083 	/*
1084 	 * In caching mode, changes of pages from non-present to present require
1085 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1086 	 */
1087 	if (!cap_caching_mode(iommu->cap) || !map)
1088 		iommu_flush_dev_iotlb(iommu->domains[did], addr, mask);
1089 }
1090 
1091 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1092 {
1093 	u32 pmen;
1094 	unsigned long flags;
1095 
1096 	spin_lock_irqsave(&iommu->register_lock, flags);
1097 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1098 	pmen &= ~DMA_PMEN_EPM;
1099 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1100 
1101 	/* wait for the protected region status bit to clear */
1102 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1103 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1104 
1105 	spin_unlock_irqrestore(&iommu->register_lock, flags);
1106 }
1107 
1108 static int iommu_enable_translation(struct intel_iommu *iommu)
1109 {
1110 	u32 sts;
1111 	unsigned long flags;
1112 
1113 	spin_lock_irqsave(&iommu->register_lock, flags);
1114 	iommu->gcmd |= DMA_GCMD_TE;
1115 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1116 
1117 	/* Make sure hardware completes it */
1118 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1119 		      readl, (sts & DMA_GSTS_TES), sts);
1120 
1121 	spin_unlock_irqrestore(&iommu->register_lock, flags);
1122 	return 0;
1123 }
1124 
1125 static int iommu_disable_translation(struct intel_iommu *iommu)
1126 {
1127 	u32 sts;
1128 	unsigned long flag;
1129 
1130 	spin_lock_irqsave(&iommu->register_lock, flag);
1131 	iommu->gcmd &= ~DMA_GCMD_TE;
1132 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1133 
1134 	/* Make sure hardware completes it */
1135 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1136 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1137 
1138 	spin_unlock_irqrestore(&iommu->register_lock, flag);
1139 	return 0;
1140 }
1141 
1142 
1143 static int iommu_init_domains(struct intel_iommu *iommu)
1144 {
1145 	unsigned long ndomains;
1146 	unsigned long nlongs;
1147 
1148 	ndomains = cap_ndoms(iommu->cap);
1149 	pr_debug("IOMMU %d: Number of Domains supported <%ld>\n", iommu->seq_id,
1150 			ndomains);
1151 	nlongs = BITS_TO_LONGS(ndomains);
1152 
1153 	spin_lock_init(&iommu->lock);
1154 
1155 	/* TBD: there might be 64K domains,
1156 	 * consider other allocation for future chip
1157 	 */
1158 	iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1159 	if (!iommu->domain_ids) {
1160 		printk(KERN_ERR "Allocating domain id array failed\n");
1161 		return -ENOMEM;
1162 	}
1163 	iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1164 			GFP_KERNEL);
1165 	if (!iommu->domains) {
1166 		printk(KERN_ERR "Allocating domain array failed\n");
1167 		return -ENOMEM;
1168 	}
1169 
1170 	/*
1171 	 * if Caching mode is set, then invalid translations are tagged
1172 	 * with domainid 0. Hence we need to pre-allocate it.
1173 	 */
1174 	if (cap_caching_mode(iommu->cap))
1175 		set_bit(0, iommu->domain_ids);
1176 	return 0;
1177 }
1178 
1179 
1180 static void domain_exit(struct dmar_domain *domain);
1181 static void vm_domain_exit(struct dmar_domain *domain);
1182 
1183 void free_dmar_iommu(struct intel_iommu *iommu)
1184 {
1185 	struct dmar_domain *domain;
1186 	int i;
1187 	unsigned long flags;
1188 
1189 	if ((iommu->domains) && (iommu->domain_ids)) {
1190 		for_each_set_bit(i, iommu->domain_ids, cap_ndoms(iommu->cap)) {
1191 			domain = iommu->domains[i];
1192 			clear_bit(i, iommu->domain_ids);
1193 
1194 			spin_lock_irqsave(&domain->iommu_lock, flags);
1195 			if (--domain->iommu_count == 0) {
1196 				if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1197 					vm_domain_exit(domain);
1198 				else
1199 					domain_exit(domain);
1200 			}
1201 			spin_unlock_irqrestore(&domain->iommu_lock, flags);
1202 		}
1203 	}
1204 
1205 	if (iommu->gcmd & DMA_GCMD_TE)
1206 		iommu_disable_translation(iommu);
1207 
1208 	if (iommu->irq) {
1209 		irq_set_handler_data(iommu->irq, NULL);
1210 		/* This will mask the irq */
1211 		free_irq(iommu->irq, iommu);
1212 		destroy_irq(iommu->irq);
1213 	}
1214 
1215 	kfree(iommu->domains);
1216 	kfree(iommu->domain_ids);
1217 
1218 	g_iommus[iommu->seq_id] = NULL;
1219 
1220 	/* if all iommus are freed, free g_iommus */
1221 	for (i = 0; i < g_num_of_iommus; i++) {
1222 		if (g_iommus[i])
1223 			break;
1224 	}
1225 
1226 	if (i == g_num_of_iommus)
1227 		kfree(g_iommus);
1228 
1229 	/* free context mapping */
1230 	free_context_table(iommu);
1231 }
1232 
1233 static struct dmar_domain *alloc_domain(void)
1234 {
1235 	struct dmar_domain *domain;
1236 
1237 	domain = alloc_domain_mem();
1238 	if (!domain)
1239 		return NULL;
1240 
1241 	domain->nid = -1;
1242 	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1243 	domain->flags = 0;
1244 
1245 	return domain;
1246 }
1247 
1248 static int iommu_attach_domain(struct dmar_domain *domain,
1249 			       struct intel_iommu *iommu)
1250 {
1251 	int num;
1252 	unsigned long ndomains;
1253 	unsigned long flags;
1254 
1255 	ndomains = cap_ndoms(iommu->cap);
1256 
1257 	spin_lock_irqsave(&iommu->lock, flags);
1258 
1259 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1260 	if (num >= ndomains) {
1261 		spin_unlock_irqrestore(&iommu->lock, flags);
1262 		printk(KERN_ERR "IOMMU: no free domain ids\n");
1263 		return -ENOMEM;
1264 	}
1265 
1266 	domain->id = num;
1267 	set_bit(num, iommu->domain_ids);
1268 	set_bit(iommu->seq_id, &domain->iommu_bmp);
1269 	iommu->domains[num] = domain;
1270 	spin_unlock_irqrestore(&iommu->lock, flags);
1271 
1272 	return 0;
1273 }
1274 
1275 static void iommu_detach_domain(struct dmar_domain *domain,
1276 				struct intel_iommu *iommu)
1277 {
1278 	unsigned long flags;
1279 	int num, ndomains;
1280 	int found = 0;
1281 
1282 	spin_lock_irqsave(&iommu->lock, flags);
1283 	ndomains = cap_ndoms(iommu->cap);
1284 	for_each_set_bit(num, iommu->domain_ids, ndomains) {
1285 		if (iommu->domains[num] == domain) {
1286 			found = 1;
1287 			break;
1288 		}
1289 	}
1290 
1291 	if (found) {
1292 		clear_bit(num, iommu->domain_ids);
1293 		clear_bit(iommu->seq_id, &domain->iommu_bmp);
1294 		iommu->domains[num] = NULL;
1295 	}
1296 	spin_unlock_irqrestore(&iommu->lock, flags);
1297 }
1298 
1299 static struct iova_domain reserved_iova_list;
1300 static struct lock_class_key reserved_rbtree_key;
1301 
1302 static int dmar_init_reserved_ranges(void)
1303 {
1304 	struct pci_dev *pdev = NULL;
1305 	struct iova *iova;
1306 	int i;
1307 
1308 	init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1309 
1310 	lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1311 		&reserved_rbtree_key);
1312 
1313 	/* IOAPIC ranges shouldn't be accessed by DMA */
1314 	iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1315 		IOVA_PFN(IOAPIC_RANGE_END));
1316 	if (!iova) {
1317 		printk(KERN_ERR "Reserve IOAPIC range failed\n");
1318 		return -ENODEV;
1319 	}
1320 
1321 	/* Reserve all PCI MMIO to avoid peer-to-peer access */
1322 	for_each_pci_dev(pdev) {
1323 		struct resource *r;
1324 
1325 		for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1326 			r = &pdev->resource[i];
1327 			if (!r->flags || !(r->flags & IORESOURCE_MEM))
1328 				continue;
1329 			iova = reserve_iova(&reserved_iova_list,
1330 					    IOVA_PFN(r->start),
1331 					    IOVA_PFN(r->end));
1332 			if (!iova) {
1333 				printk(KERN_ERR "Reserve iova failed\n");
1334 				return -ENODEV;
1335 			}
1336 		}
1337 	}
1338 	return 0;
1339 }
1340 
1341 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1342 {
1343 	copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1344 }
1345 
1346 static inline int guestwidth_to_adjustwidth(int gaw)
1347 {
1348 	int agaw;
1349 	int r = (gaw - 12) % 9;
1350 
1351 	if (r == 0)
1352 		agaw = gaw;
1353 	else
1354 		agaw = gaw + 9 - r;
1355 	if (agaw > 64)
1356 		agaw = 64;
1357 	return agaw;
1358 }
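/*
 * Example: a guest width of 48 bits is already 12 plus a multiple of 9,
 * so it is returned unchanged, while 36 bits is rounded up to the next
 * page-table boundary, 39 bits.
 */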
1359 
1360 static int domain_init(struct dmar_domain *domain, int guest_width)
1361 {
1362 	struct intel_iommu *iommu;
1363 	int adjust_width, agaw;
1364 	unsigned long sagaw;
1365 
1366 	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1367 	spin_lock_init(&domain->iommu_lock);
1368 
1369 	domain_reserve_special_ranges(domain);
1370 
1371 	/* calculate AGAW */
1372 	iommu = domain_get_iommu(domain);
1373 	if (guest_width > cap_mgaw(iommu->cap))
1374 		guest_width = cap_mgaw(iommu->cap);
1375 	domain->gaw = guest_width;
1376 	adjust_width = guestwidth_to_adjustwidth(guest_width);
1377 	agaw = width_to_agaw(adjust_width);
1378 	sagaw = cap_sagaw(iommu->cap);
1379 	if (!test_bit(agaw, &sagaw)) {
1380 		/* hardware doesn't support it, choose a bigger one */
1381 		pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1382 		agaw = find_next_bit(&sagaw, 5, agaw);
1383 		if (agaw >= 5)
1384 			return -ENODEV;
1385 	}
1386 	domain->agaw = agaw;
1387 	INIT_LIST_HEAD(&domain->devices);
1388 
1389 	if (ecap_coherent(iommu->ecap))
1390 		domain->iommu_coherency = 1;
1391 	else
1392 		domain->iommu_coherency = 0;
1393 
1394 	if (ecap_sc_support(iommu->ecap))
1395 		domain->iommu_snooping = 1;
1396 	else
1397 		domain->iommu_snooping = 0;
1398 
1399 	domain->iommu_count = 1;
1400 	domain->nid = iommu->node;
1401 
1402 	/* always allocate the top pgd */
1403 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
1404 	if (!domain->pgd)
1405 		return -ENOMEM;
1406 	__iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1407 	return 0;
1408 }
1409 
1410 static void domain_exit(struct dmar_domain *domain)
1411 {
1412 	struct dmar_drhd_unit *drhd;
1413 	struct intel_iommu *iommu;
1414 
1415 	/* Domain 0 is reserved, so don't process it */
1416 	if (!domain)
1417 		return;
1418 
1419 	domain_remove_dev_info(domain);
1420 	/* destroy iovas */
1421 	put_iova_domain(&domain->iovad);
1422 
1423 	/* clear ptes */
1424 	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1425 
1426 	/* free page tables */
1427 	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
1428 
1429 	for_each_active_iommu(iommu, drhd)
1430 		if (test_bit(iommu->seq_id, &domain->iommu_bmp))
1431 			iommu_detach_domain(domain, iommu);
1432 
1433 	free_domain_mem(domain);
1434 }
1435 
1436 static int domain_context_mapping_one(struct dmar_domain *domain, int segment,
1437 				 u8 bus, u8 devfn, int translation)
1438 {
1439 	struct context_entry *context;
1440 	unsigned long flags;
1441 	struct intel_iommu *iommu;
1442 	struct dma_pte *pgd;
1443 	unsigned long num;
1444 	unsigned long ndomains;
1445 	int id;
1446 	int agaw;
1447 	struct device_domain_info *info = NULL;
1448 
1449 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1450 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1451 
1452 	BUG_ON(!domain->pgd);
1453 	BUG_ON(translation != CONTEXT_TT_PASS_THROUGH &&
1454 	       translation != CONTEXT_TT_MULTI_LEVEL);
1455 
1456 	iommu = device_to_iommu(segment, bus, devfn);
1457 	if (!iommu)
1458 		return -ENODEV;
1459 
1460 	context = device_to_context_entry(iommu, bus, devfn);
1461 	if (!context)
1462 		return -ENOMEM;
1463 	spin_lock_irqsave(&iommu->lock, flags);
1464 	if (context_present(context)) {
1465 		spin_unlock_irqrestore(&iommu->lock, flags);
1466 		return 0;
1467 	}
1468 
1469 	id = domain->id;
1470 	pgd = domain->pgd;
1471 
1472 	if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
1473 	    domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) {
1474 		int found = 0;
1475 
1476 		/* find an available domain id for this device in iommu */
1477 		ndomains = cap_ndoms(iommu->cap);
1478 		for_each_set_bit(num, iommu->domain_ids, ndomains) {
1479 			if (iommu->domains[num] == domain) {
1480 				id = num;
1481 				found = 1;
1482 				break;
1483 			}
1484 		}
1485 
1486 		if (found == 0) {
1487 			num = find_first_zero_bit(iommu->domain_ids, ndomains);
1488 			if (num >= ndomains) {
1489 				spin_unlock_irqrestore(&iommu->lock, flags);
1490 				printk(KERN_ERR "IOMMU: no free domain ids\n");
1491 				return -EFAULT;
1492 			}
1493 
1494 			set_bit(num, iommu->domain_ids);
1495 			iommu->domains[num] = domain;
1496 			id = num;
1497 		}
1498 
1499 		/* Skip top levels of page tables for an
1500 		 * iommu which has a smaller agaw than the default.
1501 		 * Unnecessary for PT mode.
1502 		 */
1503 		if (translation != CONTEXT_TT_PASS_THROUGH) {
1504 			for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1505 				pgd = phys_to_virt(dma_pte_addr(pgd));
1506 				if (!dma_pte_present(pgd)) {
1507 					spin_unlock_irqrestore(&iommu->lock, flags);
1508 					return -ENOMEM;
1509 				}
1510 			}
1511 		}
1512 	}
1513 
1514 	context_set_domain_id(context, id);
1515 
1516 	if (translation != CONTEXT_TT_PASS_THROUGH) {
1517 		info = iommu_support_dev_iotlb(domain, segment, bus, devfn);
1518 		translation = info ? CONTEXT_TT_DEV_IOTLB :
1519 				     CONTEXT_TT_MULTI_LEVEL;
1520 	}
1521 	/*
1522 	 * In pass through mode, AW must be programmed to indicate the largest
1523 	 * AGAW value supported by hardware. And ASR is ignored by hardware.
1524 	 */
1525 	if (unlikely(translation == CONTEXT_TT_PASS_THROUGH))
1526 		context_set_address_width(context, iommu->msagaw);
1527 	else {
1528 		context_set_address_root(context, virt_to_phys(pgd));
1529 		context_set_address_width(context, iommu->agaw);
1530 	}
1531 
1532 	context_set_translation_type(context, translation);
1533 	context_set_fault_enable(context);
1534 	context_set_present(context);
1535 	domain_flush_cache(domain, context, sizeof(*context));
1536 
1537 	/*
1538 	 * It's a non-present to present mapping. If hardware doesn't cache
1539 	 * non-present entries we only need to flush the write-buffer. If it
1540 	 * _does_ cache non-present entries, then it does so in the special
1541 	 * domain #0, which we have to flush:
1542 	 */
1543 	if (cap_caching_mode(iommu->cap)) {
1544 		iommu->flush.flush_context(iommu, 0,
1545 					   (((u16)bus) << 8) | devfn,
1546 					   DMA_CCMD_MASK_NOBIT,
1547 					   DMA_CCMD_DEVICE_INVL);
1548 		iommu->flush.flush_iotlb(iommu, domain->id, 0, 0, DMA_TLB_DSI_FLUSH);
1549 	} else {
1550 		iommu_flush_write_buffer(iommu);
1551 	}
1552 	iommu_enable_dev_iotlb(info);
1553 	spin_unlock_irqrestore(&iommu->lock, flags);
1554 
1555 	spin_lock_irqsave(&domain->iommu_lock, flags);
1556 	if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1557 		domain->iommu_count++;
1558 		if (domain->iommu_count == 1)
1559 			domain->nid = iommu->node;
1560 		domain_update_iommu_cap(domain);
1561 	}
1562 	spin_unlock_irqrestore(&domain->iommu_lock, flags);
1563 	return 0;
1564 }
1565 
1566 static int
1567 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev,
1568 			int translation)
1569 {
1570 	int ret;
1571 	struct pci_dev *tmp, *parent;
1572 
1573 	ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus),
1574 					 pdev->bus->number, pdev->devfn,
1575 					 translation);
1576 	if (ret)
1577 		return ret;
1578 
1579 	/* dependent device mapping */
1580 	tmp = pci_find_upstream_pcie_bridge(pdev);
1581 	if (!tmp)
1582 		return 0;
1583 	/* Secondary interface's bus number and devfn 0 */
1584 	parent = pdev->bus->self;
1585 	while (parent != tmp) {
1586 		ret = domain_context_mapping_one(domain,
1587 						 pci_domain_nr(parent->bus),
1588 						 parent->bus->number,
1589 						 parent->devfn, translation);
1590 		if (ret)
1591 			return ret;
1592 		parent = parent->bus->self;
1593 	}
1594 	if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
1595 		return domain_context_mapping_one(domain,
1596 					pci_domain_nr(tmp->subordinate),
1597 					tmp->subordinate->number, 0,
1598 					translation);
1599 	else /* this is a legacy PCI bridge */
1600 		return domain_context_mapping_one(domain,
1601 						  pci_domain_nr(tmp->bus),
1602 						  tmp->bus->number,
1603 						  tmp->devfn,
1604 						  translation);
1605 }
1606 
1607 static int domain_context_mapped(struct pci_dev *pdev)
1608 {
1609 	int ret;
1610 	struct pci_dev *tmp, *parent;
1611 	struct intel_iommu *iommu;
1612 
1613 	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
1614 				pdev->devfn);
1615 	if (!iommu)
1616 		return -ENODEV;
1617 
1618 	ret = device_context_mapped(iommu, pdev->bus->number, pdev->devfn);
1619 	if (!ret)
1620 		return ret;
1621 	/* dependent device mapping */
1622 	tmp = pci_find_upstream_pcie_bridge(pdev);
1623 	if (!tmp)
1624 		return ret;
1625 	/* Secondary interface's bus number and devfn 0 */
1626 	parent = pdev->bus->self;
1627 	while (parent != tmp) {
1628 		ret = device_context_mapped(iommu, parent->bus->number,
1629 					    parent->devfn);
1630 		if (!ret)
1631 			return ret;
1632 		parent = parent->bus->self;
1633 	}
1634 	if (pci_is_pcie(tmp))
1635 		return device_context_mapped(iommu, tmp->subordinate->number,
1636 					     0);
1637 	else
1638 		return device_context_mapped(iommu, tmp->bus->number,
1639 					     tmp->devfn);
1640 }
1641 
1642 /* Returns a number of VTD pages, but aligned to MM page size */
1643 static inline unsigned long aligned_nrpages(unsigned long host_addr,
1644 					    size_t size)
1645 {
1646 	host_addr &= ~PAGE_MASK;
1647 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
1648 }
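/*
 * Example (4KiB MM pages): host_addr 0x1ffc with size 8 straddles a page
 * boundary, so this returns 2 VT-d pages even though only 8 bytes are
 * being mapped.
 */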
1649 
1650 static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1651 			    struct scatterlist *sg, unsigned long phys_pfn,
1652 			    unsigned long nr_pages, int prot)
1653 {
1654 	struct dma_pte *first_pte = NULL, *pte = NULL;
1655 	phys_addr_t uninitialized_var(pteval);
1656 	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;
1657 	unsigned long sg_res;
1658 
1659 	BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width);
1660 
1661 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1662 		return -EINVAL;
1663 
1664 	prot &= DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP;
1665 
1666 	if (sg)
1667 		sg_res = 0;
1668 	else {
1669 		sg_res = nr_pages + 1;
1670 		pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot;
1671 	}
1672 
1673 	while (nr_pages--) {
1674 		uint64_t tmp;
1675 
1676 		if (!sg_res) {
1677 			sg_res = aligned_nrpages(sg->offset, sg->length);
1678 			sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset;
1679 			sg->dma_length = sg->length;
1680 			pteval = page_to_phys(sg_page(sg)) | prot;
1681 		}
1682 		if (!pte) {
1683 			first_pte = pte = pfn_to_dma_pte(domain, iov_pfn);
1684 			if (!pte)
1685 				return -ENOMEM;
1686 		}
1687 		/* We don't need lock here, nobody else
1688 		 * touches the iova range
1689 		 */
1690 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
1691 		if (tmp) {
1692 			static int dumps = 5;
1693 			printk(KERN_CRIT "ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
1694 			       iov_pfn, tmp, (unsigned long long)pteval);
1695 			if (dumps) {
1696 				dumps--;
1697 				debug_dma_dump_mappings(NULL);
1698 			}
1699 			WARN_ON(1);
1700 		}
1701 		pte++;
1702 		if (!nr_pages || first_pte_in_page(pte)) {
1703 			domain_flush_cache(domain, first_pte,
1704 					   (void *)pte - (void *)first_pte);
1705 			pte = NULL;
1706 		}
1707 		iov_pfn++;
1708 		pteval += VTD_PAGE_SIZE;
1709 		sg_res--;
1710 		if (!sg_res)
1711 			sg = sg_next(sg);
1712 	}
1713 	return 0;
1714 }
1715 
1716 static inline int domain_sg_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1717 				    struct scatterlist *sg, unsigned long nr_pages,
1718 				    int prot)
1719 {
1720 	return __domain_mapping(domain, iov_pfn, sg, 0, nr_pages, prot);
1721 }
1722 
1723 static inline int domain_pfn_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
1724 				     unsigned long phys_pfn, unsigned long nr_pages,
1725 				     int prot)
1726 {
1727 	return __domain_mapping(domain, iov_pfn, NULL, phys_pfn, nr_pages, prot);
1728 }
1729 
1730 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1731 {
1732 	if (!iommu)
1733 		return;
1734 
1735 	clear_context_table(iommu, bus, devfn);
1736 	iommu->flush.flush_context(iommu, 0, 0, 0,
1737 					   DMA_CCMD_GLOBAL_INVL);
1738 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1739 }
1740 
1741 static void domain_remove_dev_info(struct dmar_domain *domain)
1742 {
1743 	struct device_domain_info *info;
1744 	unsigned long flags;
1745 	struct intel_iommu *iommu;
1746 
1747 	spin_lock_irqsave(&device_domain_lock, flags);
1748 	while (!list_empty(&domain->devices)) {
1749 		info = list_entry(domain->devices.next,
1750 			struct device_domain_info, link);
1751 		list_del(&info->link);
1752 		list_del(&info->global);
1753 		if (info->dev)
1754 			info->dev->dev.archdata.iommu = NULL;
1755 		spin_unlock_irqrestore(&device_domain_lock, flags);
1756 
1757 		iommu_disable_dev_iotlb(info);
1758 		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
1759 		iommu_detach_dev(iommu, info->bus, info->devfn);
1760 		free_devinfo_mem(info);
1761 
1762 		spin_lock_irqsave(&device_domain_lock, flags);
1763 	}
1764 	spin_unlock_irqrestore(&device_domain_lock, flags);
1765 }
1766 
1767 /*
1768  * find_domain
1769  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1770  */
1771 static struct dmar_domain *
1772 find_domain(struct pci_dev *pdev)
1773 {
1774 	struct device_domain_info *info;
1775 
1776 	/* No lock here, assumes no domain exit in normal case */
1777 	info = pdev->dev.archdata.iommu;
1778 	if (info)
1779 		return info->domain;
1780 	return NULL;
1781 }
1782 
1783 /* domain is initialized */
1784 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1785 {
1786 	struct dmar_domain *domain, *found = NULL;
1787 	struct intel_iommu *iommu;
1788 	struct dmar_drhd_unit *drhd;
1789 	struct device_domain_info *info, *tmp;
1790 	struct pci_dev *dev_tmp;
1791 	unsigned long flags;
1792 	int bus = 0, devfn = 0;
1793 	int segment;
1794 	int ret;
1795 
1796 	domain = find_domain(pdev);
1797 	if (domain)
1798 		return domain;
1799 
1800 	segment = pci_domain_nr(pdev->bus);
1801 
1802 	dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1803 	if (dev_tmp) {
1804 		if (pci_is_pcie(dev_tmp)) {
1805 			bus = dev_tmp->subordinate->number;
1806 			devfn = 0;
1807 		} else {
1808 			bus = dev_tmp->bus->number;
1809 			devfn = dev_tmp->devfn;
1810 		}
1811 		spin_lock_irqsave(&device_domain_lock, flags);
1812 		list_for_each_entry(info, &device_domain_list, global) {
1813 			if (info->segment == segment &&
1814 			    info->bus == bus && info->devfn == devfn) {
1815 				found = info->domain;
1816 				break;
1817 			}
1818 		}
1819 		spin_unlock_irqrestore(&device_domain_lock, flags);
1820 		/* pcie-pci bridge already has a domain, use it */
1821 		if (found) {
1822 			domain = found;
1823 			goto found_domain;
1824 		}
1825 	}
1826 
1827 	domain = alloc_domain();
1828 	if (!domain)
1829 		goto error;
1830 
1831 	/* Allocate new domain for the device */
1832 	drhd = dmar_find_matched_drhd_unit(pdev);
1833 	if (!drhd) {
1834 		printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1835 			pci_name(pdev));
1836 		return NULL;
1837 	}
1838 	iommu = drhd->iommu;
1839 
1840 	ret = iommu_attach_domain(domain, iommu);
1841 	if (ret) {
1842 		free_domain_mem(domain);
1843 		goto error;
1844 	}
1845 
1846 	if (domain_init(domain, gaw)) {
1847 		domain_exit(domain);
1848 		goto error;
1849 	}
1850 
1851 	/* register pcie-to-pci device */
1852 	if (dev_tmp) {
1853 		info = alloc_devinfo_mem();
1854 		if (!info) {
1855 			domain_exit(domain);
1856 			goto error;
1857 		}
1858 		info->segment = segment;
1859 		info->bus = bus;
1860 		info->devfn = devfn;
1861 		info->dev = NULL;
1862 		info->domain = domain;
1863 		/* This domain is shared by devices under p2p bridge */
1864 		domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1865 
1866 		/* pcie-to-pci bridge already has a domain, use it */
1867 		found = NULL;
1868 		spin_lock_irqsave(&device_domain_lock, flags);
1869 		list_for_each_entry(tmp, &device_domain_list, global) {
1870 			if (tmp->segment == segment &&
1871 			    tmp->bus == bus && tmp->devfn == devfn) {
1872 				found = tmp->domain;
1873 				break;
1874 			}
1875 		}
1876 		if (found) {
1877 			spin_unlock_irqrestore(&device_domain_lock, flags);
1878 			free_devinfo_mem(info);
1879 			domain_exit(domain);
1880 			domain = found;
1881 		} else {
1882 			list_add(&info->link, &domain->devices);
1883 			list_add(&info->global, &device_domain_list);
1884 			spin_unlock_irqrestore(&device_domain_lock, flags);
1885 		}
1886 	}
1887 
1888 found_domain:
1889 	info = alloc_devinfo_mem();
1890 	if (!info)
1891 		goto error;
1892 	info->segment = segment;
1893 	info->bus = pdev->bus->number;
1894 	info->devfn = pdev->devfn;
1895 	info->dev = pdev;
1896 	info->domain = domain;
1897 	spin_lock_irqsave(&device_domain_lock, flags);
1898 	/* somebody else may have beaten us to it */
1899 	found = find_domain(pdev);
1900 	if (found != NULL) {
1901 		spin_unlock_irqrestore(&device_domain_lock, flags);
1902 		if (found != domain) {
1903 			domain_exit(domain);
1904 			domain = found;
1905 		}
1906 		free_devinfo_mem(info);
1907 		return domain;
1908 	}
1909 	list_add(&info->link, &domain->devices);
1910 	list_add(&info->global, &device_domain_list);
1911 	pdev->dev.archdata.iommu = info;
1912 	spin_unlock_irqrestore(&device_domain_lock, flags);
1913 	return domain;
1914 error:
1915 	/* recheck it here, maybe others set it */
1916 	return find_domain(pdev);
1917 }
1918 
1919 static int iommu_identity_mapping;
1920 #define IDENTMAP_ALL		1
1921 #define IDENTMAP_GFX		2
1922 #define IDENTMAP_AZALIA		4
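/*
 * Note (added for clarity): iommu_identity_mapping is a bitmask.  For
 * example, requesting hardware/software pass-through sets IDENTMAP_ALL in
 * init_dmars(), and the broken-gfx workaround ORs in IDENTMAP_GFX on top of
 * it, so several IDENTMAP_* bits can be set at once.
 */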
1923 
1924 static int iommu_domain_identity_map(struct dmar_domain *domain,
1925 				     unsigned long long start,
1926 				     unsigned long long end)
1927 {
1928 	unsigned long first_vpfn = start >> VTD_PAGE_SHIFT;
1929 	unsigned long last_vpfn = end >> VTD_PAGE_SHIFT;
1930 
1931 	if (!reserve_iova(&domain->iovad, dma_to_mm_pfn(first_vpfn),
1932 			  dma_to_mm_pfn(last_vpfn))) {
1933 		printk(KERN_ERR "IOMMU: reserve iova failed\n");
1934 		return -ENOMEM;
1935 	}
1936 
1937 	pr_debug("Mapping reserved region %llx-%llx for domain %d\n",
1938 		 start, end, domain->id);
1939 	/*
1940 	 * RMRR range might have overlap with physical memory range,
1941 	 * clear it first
1942 	 */
1943 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
1944 
1945 	return domain_pfn_mapping(domain, first_vpfn, first_vpfn,
1946 				  last_vpfn - first_vpfn + 1,
1947 				  DMA_PTE_READ|DMA_PTE_WRITE);
1948 }
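/*
 * Worked example (illustrative): an RMRR covering 0xe0000-0xfffff gives
 * first_vpfn = 0xe0 and last_vpfn = 0xff, so the 32 page frames 0xe0-0xff
 * are reserved in the domain's iova allocator and domain_pfn_mapping()
 * installs PTEs whose value is simply (vpfn << VTD_PAGE_SHIFT) plus the
 * read/write bits -- i.e. DMA address == physical address for the range.
 */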
1949 
1950 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1951 				      unsigned long long start,
1952 				      unsigned long long end)
1953 {
1954 	struct dmar_domain *domain;
1955 	int ret;
1956 
1957 	domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1958 	if (!domain)
1959 		return -ENOMEM;
1960 
1961 	/* For _hardware_ passthrough, don't bother. But for software
1962 	   passthrough, we do it anyway -- it may indicate a memory
1963 	   range which is reserved in E820, and so didn't get set
1964 	   up in si_domain to start with */
1965 	if (domain == si_domain && hw_pass_through) {
1966 		printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n",
1967 		       pci_name(pdev), start, end);
1968 		return 0;
1969 	}
1970 
1971 	printk(KERN_INFO
1972 	       "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1973 	       pci_name(pdev), start, end);
1974 
1975 	if (end < start) {
1976 		WARN(1, "Your BIOS is broken; RMRR ends before it starts!\n"
1977 			"BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1978 			dmi_get_system_info(DMI_BIOS_VENDOR),
1979 			dmi_get_system_info(DMI_BIOS_VERSION),
1980 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
1981 		ret = -EIO;
1982 		goto error;
1983 	}
1984 
1985 	if (end >> agaw_to_width(domain->agaw)) {
1986 		WARN(1, "Your BIOS is broken; RMRR exceeds permitted address width (%d bits)\n"
1987 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
1988 		     agaw_to_width(domain->agaw),
1989 		     dmi_get_system_info(DMI_BIOS_VENDOR),
1990 		     dmi_get_system_info(DMI_BIOS_VERSION),
1991 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
1992 		ret = -EIO;
1993 		goto error;
1994 	}
1995 
1996 	ret = iommu_domain_identity_map(domain, start, end);
1997 	if (ret)
1998 		goto error;
1999 
2000 	/* context entry init */
2001 	ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL);
2002 	if (ret)
2003 		goto error;
2004 
2005 	return 0;
2006 
2007  error:
2008 	domain_exit(domain);
2009 	return ret;
2010 }
2011 
2012 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
2013 	struct pci_dev *pdev)
2014 {
2015 	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2016 		return 0;
2017 	return iommu_prepare_identity_map(pdev, rmrr->base_address,
2018 		rmrr->end_address + 1);
2019 }
2020 
2021 #ifdef CONFIG_DMAR_FLOPPY_WA
2022 static inline void iommu_prepare_isa(void)
2023 {
2024 	struct pci_dev *pdev;
2025 	int ret;
2026 
2027 	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
2028 	if (!pdev)
2029 		return;
2030 
2031 	printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n");
2032 	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
2033 
2034 	if (ret)
2035 		printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; "
2036 		       "floppy might not work\n");
2037 
2038 }
2039 #else
2040 static inline void iommu_prepare_isa(void)
2041 {
2042 	return;
2043 }
2044 #endif /* !CONFIG_DMAR_FLOPPY_WA */
2045 
2046 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2047 
2048 static int __init si_domain_work_fn(unsigned long start_pfn,
2049 				    unsigned long end_pfn, void *datax)
2050 {
2051 	int *ret = datax;
2052 
2053 	*ret = iommu_domain_identity_map(si_domain,
2054 					 (uint64_t)start_pfn << PAGE_SHIFT,
2055 					 (uint64_t)end_pfn << PAGE_SHIFT);
2056 	return *ret;
2057 
2058 }
2059 
2060 static int __init si_domain_init(int hw)
2061 {
2062 	struct dmar_drhd_unit *drhd;
2063 	struct intel_iommu *iommu;
2064 	int nid, ret = 0;
2065 
2066 	si_domain = alloc_domain();
2067 	if (!si_domain)
2068 		return -EFAULT;
2069 
2070 	pr_debug("Identity mapping domain is domain %d\n", si_domain->id);
2071 
2072 	for_each_active_iommu(iommu, drhd) {
2073 		ret = iommu_attach_domain(si_domain, iommu);
2074 		if (ret) {
2075 			domain_exit(si_domain);
2076 			return -EFAULT;
2077 		}
2078 	}
2079 
2080 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2081 		domain_exit(si_domain);
2082 		return -EFAULT;
2083 	}
2084 
2085 	si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY;
2086 
2087 	if (hw)
2088 		return 0;
2089 
2090 	for_each_online_node(nid) {
2091 		work_with_active_regions(nid, si_domain_work_fn, &ret);
2092 		if (ret)
2093 			return ret;
2094 	}
2095 
2096 	return 0;
2097 }
2098 
2099 static void domain_remove_one_dev_info(struct dmar_domain *domain,
2100 					  struct pci_dev *pdev);
2101 static int identity_mapping(struct pci_dev *pdev)
2102 {
2103 	struct device_domain_info *info;
2104 
2105 	if (likely(!iommu_identity_mapping))
2106 		return 0;
2107 
2108 
2109 	list_for_each_entry(info, &si_domain->devices, link)
2110 		if (info->dev == pdev)
2111 			return 1;
2112 	return 0;
2113 }
2114 
2115 static int domain_add_dev_info(struct dmar_domain *domain,
2116 			       struct pci_dev *pdev,
2117 			       int translation)
2118 {
2119 	struct device_domain_info *info;
2120 	unsigned long flags;
2121 	int ret;
2122 
2123 	info = alloc_devinfo_mem();
2124 	if (!info)
2125 		return -ENOMEM;
2126 
2127 	ret = domain_context_mapping(domain, pdev, translation);
2128 	if (ret) {
2129 		free_devinfo_mem(info);
2130 		return ret;
2131 	}
2132 
2133 	info->segment = pci_domain_nr(pdev->bus);
2134 	info->bus = pdev->bus->number;
2135 	info->devfn = pdev->devfn;
2136 	info->dev = pdev;
2137 	info->domain = domain;
2138 
2139 	spin_lock_irqsave(&device_domain_lock, flags);
2140 	list_add(&info->link, &domain->devices);
2141 	list_add(&info->global, &device_domain_list);
2142 	pdev->dev.archdata.iommu = info;
2143 	spin_unlock_irqrestore(&device_domain_lock, flags);
2144 
2145 	return 0;
2146 }
2147 
2148 static int iommu_should_identity_map(struct pci_dev *pdev, int startup)
2149 {
2150 	if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2151 		return 1;
2152 
2153 	if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2154 		return 1;
2155 
2156 	if (!(iommu_identity_mapping & IDENTMAP_ALL))
2157 		return 0;
2158 
2159 	/*
2160 	 * We want to start off with all devices in the 1:1 domain, and
2161 	 * take them out later if we find they can't access all of memory.
2162 	 *
2163 	 * However, we can't do this for PCI devices behind bridges,
2164 	 * because all PCI devices behind the same bridge will end up
2165 	 * with the same source-id on their transactions.
2166 	 *
2167 	 * Practically speaking, we can't change things around for these
2168 	 * devices at run-time, because we can't be sure there'll be no
2169 	 * DMA transactions in flight for any of their siblings.
2170 	 *
2171 	 * So PCI devices (unless they're on the root bus) as well as
2172 	 * their parent PCI-PCI or PCIe-PCI bridges must be left _out_ of
2173 	 * the 1:1 domain, just in _case_ one of their siblings turns out
2174 	 * not to be able to map all of memory.
2175 	 */
2176 	if (!pci_is_pcie(pdev)) {
2177 		if (!pci_is_root_bus(pdev->bus))
2178 			return 0;
2179 		if (pdev->class >> 8 == PCI_CLASS_BRIDGE_PCI)
2180 			return 0;
2181 	} else if (pdev->pcie_type == PCI_EXP_TYPE_PCI_BRIDGE)
2182 		return 0;
2183 
2184 	/*
2185 	 * At boot time, we don't yet know if devices will be 64-bit capable.
2186 	 * Assume that they will -- if they turn out not to be, then we can
2187 	 * take them out of the 1:1 domain later.
2188 	 */
2189 	if (!startup)
2190 		return pdev->dma_mask > DMA_BIT_MASK(32);
2191 
2192 	return 1;
2193 }
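/*
 * Example (illustrative, assuming IDENTMAP_ALL is set, e.g. via pass-through):
 * a PCIe endpoint is identity mapped at boot (startup == 1 returns 1 here),
 * a conventional-PCI device behind a PCIe-to-PCI bridge is left out because
 * it shares a source-id with its siblings, and a device that later turns out
 * to advertise only a 32-bit dma_mask is evicted from the 1:1 domain on its
 * first map request (the startup == 0 path).
 */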
2194 
2195 static int __init iommu_prepare_static_identity_mapping(int hw)
2196 {
2197 	struct pci_dev *pdev = NULL;
2198 	int ret;
2199 
2200 	ret = si_domain_init(hw);
2201 	if (ret)
2202 		return -EFAULT;
2203 
2204 	for_each_pci_dev(pdev) {
2205 		if (iommu_should_identity_map(pdev, 1)) {
2206 			printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n",
2207 			       hw ? "hardware" : "software", pci_name(pdev));
2208 
2209 			ret = domain_add_dev_info(si_domain, pdev,
2210 						     hw ? CONTEXT_TT_PASS_THROUGH :
2211 						     CONTEXT_TT_MULTI_LEVEL);
2212 			if (ret)
2213 				return ret;
2214 		}
2215 	}
2216 
2217 	return 0;
2218 }
2219 
2220 static int __init init_dmars(int force_on)
2221 {
2222 	struct dmar_drhd_unit *drhd;
2223 	struct dmar_rmrr_unit *rmrr;
2224 	struct pci_dev *pdev;
2225 	struct intel_iommu *iommu;
2226 	int i, ret;
2227 
2228 	/*
2229 	 * for each drhd
2230 	 *    allocate root
2231 	 *    initialize and program root entry to not present
2232 	 * endfor
2233 	 */
2234 	for_each_drhd_unit(drhd) {
2235 		g_num_of_iommus++;
2236 		/*
2237 		 * lock not needed as this is only incremented in the single
2238 		 * threaded kernel __init code path; all other accesses are
2239 		 * read only
2240 		 */
2241 	}
2242 
2243 	g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2244 			GFP_KERNEL);
2245 	if (!g_iommus) {
2246 		printk(KERN_ERR "Allocating global iommu array failed\n");
2247 		ret = -ENOMEM;
2248 		goto error;
2249 	}
2250 
2251 	deferred_flush = kzalloc(g_num_of_iommus *
2252 		sizeof(struct deferred_flush_tables), GFP_KERNEL);
2253 	if (!deferred_flush) {
2254 		ret = -ENOMEM;
2255 		goto error;
2256 	}
2257 
2258 	for_each_drhd_unit(drhd) {
2259 		if (drhd->ignored)
2260 			continue;
2261 
2262 		iommu = drhd->iommu;
2263 		g_iommus[iommu->seq_id] = iommu;
2264 
2265 		ret = iommu_init_domains(iommu);
2266 		if (ret)
2267 			goto error;
2268 
2269 		/*
2270 		 * TBD:
2271 		 * we could share the same root & context tables
2272 		 * among all IOMMUs. Need to split it later.
2273 		 */
2274 		ret = iommu_alloc_root_entry(iommu);
2275 		if (ret) {
2276 			printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2277 			goto error;
2278 		}
2279 		if (!ecap_pass_through(iommu->ecap))
2280 			hw_pass_through = 0;
2281 	}
2282 
2283 	/*
2284 	 * Start from a sane IOMMU hardware state.
2285 	 */
2286 	for_each_drhd_unit(drhd) {
2287 		if (drhd->ignored)
2288 			continue;
2289 
2290 		iommu = drhd->iommu;
2291 
2292 		/*
2293 		 * If the queued invalidation is already initialized by us
2294 		 * (for example, while enabling interrupt-remapping) then
2295 		 * we got the things already rolling from a sane state.
2296 		 */
2297 		if (iommu->qi)
2298 			continue;
2299 
2300 		/*
2301 		 * Clear any previous faults.
2302 		 */
2303 		dmar_fault(-1, iommu);
2304 		/*
2305 		 * Disable queued invalidation if supported and already enabled
2306 		 * before OS handover.
2307 		 */
2308 		dmar_disable_qi(iommu);
2309 	}
2310 
2311 	for_each_drhd_unit(drhd) {
2312 		if (drhd->ignored)
2313 			continue;
2314 
2315 		iommu = drhd->iommu;
2316 
2317 		if (dmar_enable_qi(iommu)) {
2318 			/*
2319 			 * Queued Invalidate not enabled, use Register Based
2320 			 * Invalidate
2321 			 */
2322 			iommu->flush.flush_context = __iommu_flush_context;
2323 			iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2324 			printk(KERN_INFO "IOMMU %d 0x%Lx: using Register based "
2325 			       "invalidation\n",
2326 				iommu->seq_id,
2327 			       (unsigned long long)drhd->reg_base_addr);
2328 		} else {
2329 			iommu->flush.flush_context = qi_flush_context;
2330 			iommu->flush.flush_iotlb = qi_flush_iotlb;
2331 			printk(KERN_INFO "IOMMU %d 0x%Lx: using Queued "
2332 			       "invalidation\n",
2333 				iommu->seq_id,
2334 			       (unsigned long long)drhd->reg_base_addr);
2335 		}
2336 	}
2337 
2338 	if (iommu_pass_through)
2339 		iommu_identity_mapping |= IDENTMAP_ALL;
2340 
2341 #ifdef CONFIG_DMAR_BROKEN_GFX_WA
2342 	iommu_identity_mapping |= IDENTMAP_GFX;
2343 #endif
2344 
2345 	check_tylersburg_isoch();
2346 
2347 	/*
2348 	 * If pass-through is not set or not enabled, set up context entries for
2349 	 * identity mappings of the RMRR, graphics and ISA devices; this may fall
2350 	 * back to the static identity mapping if iommu_identity_mapping is set.
2351 	 */
2352 	if (iommu_identity_mapping) {
2353 		ret = iommu_prepare_static_identity_mapping(hw_pass_through);
2354 		if (ret) {
2355 			printk(KERN_CRIT "Failed to setup IOMMU pass-through\n");
2356 			goto error;
2357 		}
2358 	}
2359 	/*
2360 	 * For each rmrr
2361 	 *   for each dev attached to rmrr
2362 	 *   do
2363 	 *     locate drhd for dev, alloc domain for dev
2364 	 *     allocate free domain
2365 	 *     allocate page table entries for rmrr
2366 	 *     if context not allocated for bus
2367 	 *           allocate and init context
2368 	 *           set present in root table for this bus
2369 	 *     init context with domain, translation etc
2370 	 *    endfor
2371 	 * endfor
2372 	 */
2373 	printk(KERN_INFO "IOMMU: Setting RMRR:\n");
2374 	for_each_rmrr_units(rmrr) {
2375 		for (i = 0; i < rmrr->devices_cnt; i++) {
2376 			pdev = rmrr->devices[i];
2377 			/*
2378 			 * some BIOSes list non-existent devices in the
2379 			 * DMAR table.
2380 			 */
2381 			if (!pdev)
2382 				continue;
2383 			ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2384 			if (ret)
2385 				printk(KERN_ERR
2386 				       "IOMMU: mapping reserved region failed\n");
2387 		}
2388 	}
2389 
2390 	iommu_prepare_isa();
2391 
2392 	/*
2393 	 * for each drhd
2394 	 *   enable fault log
2395 	 *   global invalidate context cache
2396 	 *   global invalidate iotlb
2397 	 *   enable translation
2398 	 */
2399 	for_each_drhd_unit(drhd) {
2400 		if (drhd->ignored) {
2401 			/*
2402 			 * we always have to disable PMRs or DMA may fail on
2403 			 * this device
2404 			 */
2405 			if (force_on)
2406 				iommu_disable_protect_mem_regions(drhd->iommu);
2407 			continue;
2408 		}
2409 		iommu = drhd->iommu;
2410 
2411 		iommu_flush_write_buffer(iommu);
2412 
2413 		ret = dmar_set_interrupt(iommu);
2414 		if (ret)
2415 			goto error;
2416 
2417 		iommu_set_root_entry(iommu);
2418 
2419 		iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
2420 		iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
2421 
2422 		ret = iommu_enable_translation(iommu);
2423 		if (ret)
2424 			goto error;
2425 
2426 		iommu_disable_protect_mem_regions(iommu);
2427 	}
2428 
2429 	return 0;
2430 error:
2431 	for_each_drhd_unit(drhd) {
2432 		if (drhd->ignored)
2433 			continue;
2434 		iommu = drhd->iommu;
2435 		free_iommu(iommu);
2436 	}
2437 	kfree(g_iommus);
2438 	return ret;
2439 }
2440 
2441 /* This takes a number of _MM_ pages, not VTD pages */
2442 static struct iova *intel_alloc_iova(struct device *dev,
2443 				     struct dmar_domain *domain,
2444 				     unsigned long nrpages, uint64_t dma_mask)
2445 {
2446 	struct pci_dev *pdev = to_pci_dev(dev);
2447 	struct iova *iova = NULL;
2448 
2449 	/* Restrict dma_mask to the width that the iommu can handle */
2450 	dma_mask = min_t(uint64_t, DOMAIN_MAX_ADDR(domain->gaw), dma_mask);
2451 
2452 	if (!dmar_forcedac && dma_mask > DMA_BIT_MASK(32)) {
2453 		/*
2454 		 * First try to allocate an io virtual address in
2455 		 * DMA_BIT_MASK(32) and if that fails then try allocating
2456 		 * from higher range
2457 		 */
2458 		iova = alloc_iova(&domain->iovad, nrpages,
2459 				  IOVA_PFN(DMA_BIT_MASK(32)), 1);
2460 		if (iova)
2461 			return iova;
2462 	}
2463 	iova = alloc_iova(&domain->iovad, nrpages, IOVA_PFN(dma_mask), 1);
2464 	if (unlikely(!iova)) {
2465 		printk(KERN_ERR "Allocating %ld-page iova for %s failed",
2466 		       nrpages, pci_name(pdev));
2467 		return NULL;
2468 	}
2469 
2470 	return iova;
2471 }
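/*
 * Example (illustrative): a device advertising a 64-bit dma_mask first gets
 * its iova carved out of the space below 4GiB (IOVA_PFN(DMA_BIT_MASK(32)))
 * unless the forcedac option was requested; only if that region is exhausted
 * does the allocation retry against the full, domain-clamped mask.
 */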
2472 
2473 static struct dmar_domain *__get_valid_domain_for_dev(struct pci_dev *pdev)
2474 {
2475 	struct dmar_domain *domain;
2476 	int ret;
2477 
2478 	domain = get_domain_for_dev(pdev,
2479 			DEFAULT_DOMAIN_ADDRESS_WIDTH);
2480 	if (!domain) {
2481 		printk(KERN_ERR
2482 			"Allocating domain for %s failed", pci_name(pdev));
2483 		return NULL;
2484 	}
2485 
2486 	/* make sure context mapping is ok */
2487 	if (unlikely(!domain_context_mapped(pdev))) {
2488 		ret = domain_context_mapping(domain, pdev,
2489 					     CONTEXT_TT_MULTI_LEVEL);
2490 		if (ret) {
2491 			printk(KERN_ERR
2492 				"Domain context map for %s failed",
2493 				pci_name(pdev));
2494 			return NULL;
2495 		}
2496 	}
2497 
2498 	return domain;
2499 }
2500 
2501 static inline struct dmar_domain *get_valid_domain_for_dev(struct pci_dev *dev)
2502 {
2503 	struct device_domain_info *info;
2504 
2505 	/* No lock here, assumes no domain exit in normal case */
2506 	info = dev->dev.archdata.iommu;
2507 	if (likely(info))
2508 		return info->domain;
2509 
2510 	return __get_valid_domain_for_dev(dev);
2511 }
2512 
2513 static int iommu_dummy(struct pci_dev *pdev)
2514 {
2515 	return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO;
2516 }
2517 
2518 /* Check if the pdev needs to go through the non-identity map and unmap process. */
2519 static int iommu_no_mapping(struct device *dev)
2520 {
2521 	struct pci_dev *pdev;
2522 	int found;
2523 
2524 	if (unlikely(dev->bus != &pci_bus_type))
2525 		return 1;
2526 
2527 	pdev = to_pci_dev(dev);
2528 	if (iommu_dummy(pdev))
2529 		return 1;
2530 
2531 	if (!iommu_identity_mapping)
2532 		return 0;
2533 
2534 	found = identity_mapping(pdev);
2535 	if (found) {
2536 		if (iommu_should_identity_map(pdev, 0))
2537 			return 1;
2538 		else {
2539 			/*
2540 			 * The 32-bit device is removed from si_domain and falls
2541 			 * back to non-identity mapping.
2542 			 */
2543 			domain_remove_one_dev_info(si_domain, pdev);
2544 			printk(KERN_INFO "32bit %s uses non-identity mapping\n",
2545 			       pci_name(pdev));
2546 			return 0;
2547 		}
2548 	} else {
2549 		/*
2550 		 * A 64-bit DMA device that was detached from a VM is put
2551 		 * into si_domain for identity mapping.
2552 		 */
2553 		if (iommu_should_identity_map(pdev, 0)) {
2554 			int ret;
2555 			ret = domain_add_dev_info(si_domain, pdev,
2556 						  hw_pass_through ?
2557 						  CONTEXT_TT_PASS_THROUGH :
2558 						  CONTEXT_TT_MULTI_LEVEL);
2559 			if (!ret) {
2560 				printk(KERN_INFO "64bit %s uses identity mapping\n",
2561 				       pci_name(pdev));
2562 				return 1;
2563 			}
2564 		}
2565 	}
2566 
2567 	return 0;
2568 }
2569 
2570 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2571 				     size_t size, int dir, u64 dma_mask)
2572 {
2573 	struct pci_dev *pdev = to_pci_dev(hwdev);
2574 	struct dmar_domain *domain;
2575 	phys_addr_t start_paddr;
2576 	struct iova *iova;
2577 	int prot = 0;
2578 	int ret;
2579 	struct intel_iommu *iommu;
2580 	unsigned long paddr_pfn = paddr >> PAGE_SHIFT;
2581 
2582 	BUG_ON(dir == DMA_NONE);
2583 
2584 	if (iommu_no_mapping(hwdev))
2585 		return paddr;
2586 
2587 	domain = get_valid_domain_for_dev(pdev);
2588 	if (!domain)
2589 		return 0;
2590 
2591 	iommu = domain_get_iommu(domain);
2592 	size = aligned_nrpages(paddr, size);
2593 
2594 	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2595 				pdev->dma_mask);
2596 	if (!iova)
2597 		goto error;
2598 
2599 	/*
2600 	 * Check if DMAR supports zero-length reads on write only
2601 	 * mappings..
2602 	 */
2603 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2604 			!cap_zlr(iommu->cap))
2605 		prot |= DMA_PTE_READ;
2606 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2607 		prot |= DMA_PTE_WRITE;
2608 	/*
2609 	 * paddr .. paddr + size may cover only part of a page, but we must map
2610 	 * whole pages.  Note: if two parts of one page are mapped separately,
2611 	 * we might end up with two guest addresses mapping to the same host
2612 	 * paddr, but this is not a big problem
2613 	 */
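	/*
	 * Worked example (illustrative): paddr = 0x12345678, size = 0x100
	 * gives aligned_nrpages() == 1, so one 4KiB page at host pfn 0x12345
	 * is mapped and the returned handle is (iova->pfn_lo << PAGE_SHIFT)
	 * plus the 0x678 offset into that page.
	 */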
2614 	ret = domain_pfn_mapping(domain, mm_to_dma_pfn(iova->pfn_lo),
2615 				 mm_to_dma_pfn(paddr_pfn), size, prot);
2616 	if (ret)
2617 		goto error;
2618 
2619 	/* it's a non-present to present mapping. Only flush if caching mode */
2620 	if (cap_caching_mode(iommu->cap))
2621 		iommu_flush_iotlb_psi(iommu, domain->id, mm_to_dma_pfn(iova->pfn_lo), size, 1);
2622 	else
2623 		iommu_flush_write_buffer(iommu);
2624 
2625 	start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2626 	start_paddr += paddr & ~PAGE_MASK;
2627 	return start_paddr;
2628 
2629 error:
2630 	if (iova)
2631 		__free_iova(&domain->iovad, iova);
2632 	printk(KERN_ERR "Device %s request: %zx@%llx dir %d --- failed\n",
2633 		pci_name(pdev), size, (unsigned long long)paddr, dir);
2634 	return 0;
2635 }
2636 
2637 static dma_addr_t intel_map_page(struct device *dev, struct page *page,
2638 				 unsigned long offset, size_t size,
2639 				 enum dma_data_direction dir,
2640 				 struct dma_attrs *attrs)
2641 {
2642 	return __intel_map_single(dev, page_to_phys(page) + offset, size,
2643 				  dir, to_pci_dev(dev)->dma_mask);
2644 }
2645 
2646 static void flush_unmaps(void)
2647 {
2648 	int i, j;
2649 
2650 	timer_on = 0;
2651 
2652 	/* just flush them all */
2653 	for (i = 0; i < g_num_of_iommus; i++) {
2654 		struct intel_iommu *iommu = g_iommus[i];
2655 		if (!iommu)
2656 			continue;
2657 
2658 		if (!deferred_flush[i].next)
2659 			continue;
2660 
2661 		/* In caching mode, global flushes make emulation expensive */
2662 		if (!cap_caching_mode(iommu->cap))
2663 			iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2664 					 DMA_TLB_GLOBAL_FLUSH);
2665 		for (j = 0; j < deferred_flush[i].next; j++) {
2666 			unsigned long mask;
2667 			struct iova *iova = deferred_flush[i].iova[j];
2668 			struct dmar_domain *domain = deferred_flush[i].domain[j];
2669 
2670 			/* On real hardware multiple invalidations are expensive */
2671 			if (cap_caching_mode(iommu->cap))
2672 				iommu_flush_iotlb_psi(iommu, domain->id,
2673 				iova->pfn_lo, iova->pfn_hi - iova->pfn_lo + 1, 0);
2674 			else {
2675 				mask = ilog2(mm_to_dma_pfn(iova->pfn_hi - iova->pfn_lo + 1));
2676 				iommu_flush_dev_iotlb(deferred_flush[i].domain[j],
2677 						(uint64_t)iova->pfn_lo << PAGE_SHIFT, mask);
2678 			}
2679 			__free_iova(&deferred_flush[i].domain[j]->iovad, iova);
2680 		}
2681 		deferred_flush[i].next = 0;
2682 	}
2683 
2684 	list_size = 0;
2685 }
2686 
2687 static void flush_unmaps_timeout(unsigned long data)
2688 {
2689 	unsigned long flags;
2690 
2691 	spin_lock_irqsave(&async_umap_flush_lock, flags);
2692 	flush_unmaps();
2693 	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2694 }
2695 
2696 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2697 {
2698 	unsigned long flags;
2699 	int next, iommu_id;
2700 	struct intel_iommu *iommu;
2701 
2702 	spin_lock_irqsave(&async_umap_flush_lock, flags);
2703 	if (list_size == HIGH_WATER_MARK)
2704 		flush_unmaps();
2705 
2706 	iommu = domain_get_iommu(dom);
2707 	iommu_id = iommu->seq_id;
2708 
2709 	next = deferred_flush[iommu_id].next;
2710 	deferred_flush[iommu_id].domain[next] = dom;
2711 	deferred_flush[iommu_id].iova[next] = iova;
2712 	deferred_flush[iommu_id].next++;
2713 
2714 	if (!timer_on) {
2715 		mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2716 		timer_on = 1;
2717 	}
2718 	list_size++;
2719 	spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2720 }
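/*
 * Note (added for clarity): in non-strict mode unmaps are only queued here,
 * per IOMMU, and the iovas are released by flush_unmaps() -- either when
 * HIGH_WATER_MARK entries have piled up or when the 10ms unmap_timer fires --
 * so there is a short window in which a device could still DMA into a buffer
 * the driver believes is unmapped.
 */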
2721 
2722 static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr,
2723 			     size_t size, enum dma_data_direction dir,
2724 			     struct dma_attrs *attrs)
2725 {
2726 	struct pci_dev *pdev = to_pci_dev(dev);
2727 	struct dmar_domain *domain;
2728 	unsigned long start_pfn, last_pfn;
2729 	struct iova *iova;
2730 	struct intel_iommu *iommu;
2731 
2732 	if (iommu_no_mapping(dev))
2733 		return;
2734 
2735 	domain = find_domain(pdev);
2736 	BUG_ON(!domain);
2737 
2738 	iommu = domain_get_iommu(domain);
2739 
2740 	iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2741 	if (WARN_ONCE(!iova, "Driver unmaps unmatched page at PFN %llx\n",
2742 		      (unsigned long long)dev_addr))
2743 		return;
2744 
2745 	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2746 	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2747 
2748 	pr_debug("Device %s unmapping: pfn %lx-%lx\n",
2749 		 pci_name(pdev), start_pfn, last_pfn);
2750 
2751 	/*  clear the whole page */
2752 	dma_pte_clear_range(domain, start_pfn, last_pfn);
2753 
2754 	/* free page tables */
2755 	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2756 
2757 	if (intel_iommu_strict) {
2758 		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2759 				      last_pfn - start_pfn + 1, 0);
2760 		/* free iova */
2761 		__free_iova(&domain->iovad, iova);
2762 	} else {
2763 		add_unmap(domain, iova);
2764 		/*
2765 		 * queue up the release of the mapping to save roughly 1/6th of
2766 		 * the cpu time used up by the iotlb flush operation...
2767 		 */
2768 	}
2769 }
2770 
2771 static void *intel_alloc_coherent(struct device *hwdev, size_t size,
2772 				  dma_addr_t *dma_handle, gfp_t flags)
2773 {
2774 	void *vaddr;
2775 	int order;
2776 
2777 	size = PAGE_ALIGN(size);
2778 	order = get_order(size);
2779 
2780 	if (!iommu_no_mapping(hwdev))
2781 		flags &= ~(GFP_DMA | GFP_DMA32);
2782 	else if (hwdev->coherent_dma_mask < dma_get_required_mask(hwdev)) {
2783 		if (hwdev->coherent_dma_mask < DMA_BIT_MASK(32))
2784 			flags |= GFP_DMA;
2785 		else
2786 			flags |= GFP_DMA32;
2787 	}
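	/*
	 * Example (illustrative): a device that bypasses the IOMMU and has a
	 * 24-bit coherent_dma_mask gets GFP_DMA pages here, one limited to
	 * 32 bits gets GFP_DMA32, while a translated device can take pages
	 * from anywhere since the IOMMU remaps them below its mask.
	 */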
2788 
2789 	vaddr = (void *)__get_free_pages(flags, order);
2790 	if (!vaddr)
2791 		return NULL;
2792 	memset(vaddr, 0, size);
2793 
2794 	*dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2795 					 DMA_BIDIRECTIONAL,
2796 					 hwdev->coherent_dma_mask);
2797 	if (*dma_handle)
2798 		return vaddr;
2799 	free_pages((unsigned long)vaddr, order);
2800 	return NULL;
2801 }
2802 
2803 static void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2804 				dma_addr_t dma_handle)
2805 {
2806 	int order;
2807 
2808 	size = PAGE_ALIGN(size);
2809 	order = get_order(size);
2810 
2811 	intel_unmap_page(hwdev, dma_handle, size, DMA_BIDIRECTIONAL, NULL);
2812 	free_pages((unsigned long)vaddr, order);
2813 }
2814 
2815 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2816 			   int nelems, enum dma_data_direction dir,
2817 			   struct dma_attrs *attrs)
2818 {
2819 	struct pci_dev *pdev = to_pci_dev(hwdev);
2820 	struct dmar_domain *domain;
2821 	unsigned long start_pfn, last_pfn;
2822 	struct iova *iova;
2823 	struct intel_iommu *iommu;
2824 
2825 	if (iommu_no_mapping(hwdev))
2826 		return;
2827 
2828 	domain = find_domain(pdev);
2829 	BUG_ON(!domain);
2830 
2831 	iommu = domain_get_iommu(domain);
2832 
2833 	iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2834 	if (WARN_ONCE(!iova, "Driver unmaps unmatched sglist at PFN %llx\n",
2835 		      (unsigned long long)sglist[0].dma_address))
2836 		return;
2837 
2838 	start_pfn = mm_to_dma_pfn(iova->pfn_lo);
2839 	last_pfn = mm_to_dma_pfn(iova->pfn_hi + 1) - 1;
2840 
2841 	/*  clear the whole page */
2842 	dma_pte_clear_range(domain, start_pfn, last_pfn);
2843 
2844 	/* free page tables */
2845 	dma_pte_free_pagetable(domain, start_pfn, last_pfn);
2846 
2847 	if (intel_iommu_strict) {
2848 		iommu_flush_iotlb_psi(iommu, domain->id, start_pfn,
2849 				      last_pfn - start_pfn + 1, 0);
2850 		/* free iova */
2851 		__free_iova(&domain->iovad, iova);
2852 	} else {
2853 		add_unmap(domain, iova);
2854 		/*
2855 		 * queue up the release of the mapping to save roughly 1/6th of
2856 		 * the cpu time used up by the iotlb flush operation...
2857 		 */
2858 	}
2859 }
2860 
2861 static int intel_nontranslate_map_sg(struct device *hddev,
2862 	struct scatterlist *sglist, int nelems, int dir)
2863 {
2864 	int i;
2865 	struct scatterlist *sg;
2866 
2867 	for_each_sg(sglist, sg, nelems, i) {
2868 		BUG_ON(!sg_page(sg));
2869 		sg->dma_address = page_to_phys(sg_page(sg)) + sg->offset;
2870 		sg->dma_length = sg->length;
2871 	}
2872 	return nelems;
2873 }
2874 
2875 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2876 			enum dma_data_direction dir, struct dma_attrs *attrs)
2877 {
2878 	int i;
2879 	struct pci_dev *pdev = to_pci_dev(hwdev);
2880 	struct dmar_domain *domain;
2881 	size_t size = 0;
2882 	int prot = 0;
2883 	struct iova *iova = NULL;
2884 	int ret;
2885 	struct scatterlist *sg;
2886 	unsigned long start_vpfn;
2887 	struct intel_iommu *iommu;
2888 
2889 	BUG_ON(dir == DMA_NONE);
2890 	if (iommu_no_mapping(hwdev))
2891 		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2892 
2893 	domain = get_valid_domain_for_dev(pdev);
2894 	if (!domain)
2895 		return 0;
2896 
2897 	iommu = domain_get_iommu(domain);
2898 
2899 	for_each_sg(sglist, sg, nelems, i)
2900 		size += aligned_nrpages(sg->offset, sg->length);
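	/*
	 * Example (illustrative): an sg list with a 3-page and a 5-page
	 * segment yields size == 8, so one 8-page iova block is allocated
	 * below and __domain_mapping() later walks the list, laying the two
	 * segments back to back inside it.
	 */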
2901 
2902 	iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size),
2903 				pdev->dma_mask);
2904 	if (!iova) {
2905 		sglist->dma_length = 0;
2906 		return 0;
2907 	}
2908 
2909 	/*
2910 	 * Check if DMAR supports zero-length reads on write only
2911 	 * mappings..
2912 	 */
2913 	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2914 			!cap_zlr(iommu->cap))
2915 		prot |= DMA_PTE_READ;
2916 	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2917 		prot |= DMA_PTE_WRITE;
2918 
2919 	start_vpfn = mm_to_dma_pfn(iova->pfn_lo);
2920 
2921 	ret = domain_sg_mapping(domain, start_vpfn, sglist, size, prot);
2922 	if (unlikely(ret)) {
2923 		/*  clear the page */
2924 		dma_pte_clear_range(domain, start_vpfn,
2925 				    start_vpfn + size - 1);
2926 		/* free page tables */
2927 		dma_pte_free_pagetable(domain, start_vpfn,
2928 				       start_vpfn + size - 1);
2929 		/* free iova */
2930 		__free_iova(&domain->iovad, iova);
2931 		return 0;
2932 	}
2933 
2934 	/* it's a non-present to present mapping. Only flush if caching mode */
2935 	if (cap_caching_mode(iommu->cap))
2936 		iommu_flush_iotlb_psi(iommu, domain->id, start_vpfn, size, 1);
2937 	else
2938 		iommu_flush_write_buffer(iommu);
2939 
2940 	return nelems;
2941 }
2942 
2943 static int intel_mapping_error(struct device *dev, dma_addr_t dma_addr)
2944 {
2945 	return !dma_addr;
2946 }
2947 
2948 struct dma_map_ops intel_dma_ops = {
2949 	.alloc_coherent = intel_alloc_coherent,
2950 	.free_coherent = intel_free_coherent,
2951 	.map_sg = intel_map_sg,
2952 	.unmap_sg = intel_unmap_sg,
2953 	.map_page = intel_map_page,
2954 	.unmap_page = intel_unmap_page,
2955 	.mapping_error = intel_mapping_error,
2956 };
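/*
 * Note (added for clarity): these ops become the system-wide dma_ops in
 * intel_iommu_init() below, replacing swiotlb when DMA remapping is enabled,
 * so driver calls such as dma_map_page()/dma_map_sg() end up in the intel_*
 * routines above.
 */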
2957 
2958 static inline int iommu_domain_cache_init(void)
2959 {
2960 	int ret = 0;
2961 
2962 	iommu_domain_cache = kmem_cache_create("iommu_domain",
2963 					 sizeof(struct dmar_domain),
2964 					 0,
2965 					 SLAB_HWCACHE_ALIGN,
2966 					 NULL);
2968 	if (!iommu_domain_cache) {
2969 		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2970 		ret = -ENOMEM;
2971 	}
2972 
2973 	return ret;
2974 }
2975 
2976 static inline int iommu_devinfo_cache_init(void)
2977 {
2978 	int ret = 0;
2979 
2980 	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2981 					 sizeof(struct device_domain_info),
2982 					 0,
2983 					 SLAB_HWCACHE_ALIGN,
2984 					 NULL);
2985 	if (!iommu_devinfo_cache) {
2986 		printk(KERN_ERR "Couldn't create devinfo cache\n");
2987 		ret = -ENOMEM;
2988 	}
2989 
2990 	return ret;
2991 }
2992 
2993 static inline int iommu_iova_cache_init(void)
2994 {
2995 	int ret = 0;
2996 
2997 	iommu_iova_cache = kmem_cache_create("iommu_iova",
2998 					 sizeof(struct iova),
2999 					 0,
3000 					 SLAB_HWCACHE_ALIGN,
3001 					 NULL);
3002 	if (!iommu_iova_cache) {
3003 		printk(KERN_ERR "Couldn't create iova cache\n");
3004 		ret = -ENOMEM;
3005 	}
3006 
3007 	return ret;
3008 }
3009 
3010 static int __init iommu_init_mempool(void)
3011 {
3012 	int ret;
3013 	ret = iommu_iova_cache_init();
3014 	if (ret)
3015 		return ret;
3016 
3017 	ret = iommu_domain_cache_init();
3018 	if (ret)
3019 		goto domain_error;
3020 
3021 	ret = iommu_devinfo_cache_init();
3022 	if (!ret)
3023 		return ret;
3024 
3025 	kmem_cache_destroy(iommu_domain_cache);
3026 domain_error:
3027 	kmem_cache_destroy(iommu_iova_cache);
3028 
3029 	return -ENOMEM;
3030 }
3031 
3032 static void __init iommu_exit_mempool(void)
3033 {
3034 	kmem_cache_destroy(iommu_devinfo_cache);
3035 	kmem_cache_destroy(iommu_domain_cache);
3036 	kmem_cache_destroy(iommu_iova_cache);
3037 
3038 }
3039 
3040 static void quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
3041 {
3042 	struct dmar_drhd_unit *drhd;
3043 	u32 vtbar;
3044 	int rc;
3045 
3046 	/* We know that this device on this chipset has its own IOMMU.
3047 	 * If we find it under a different IOMMU, then the BIOS is lying
3048 	 * to us. Hope that the IOMMU for this device is actually
3049 	 * disabled, and it needs no translation...
3050 	 */
3051 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
3052 	if (rc) {
3053 		/* "can't" happen */
3054 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
3055 		return;
3056 	}
3057 	vtbar &= 0xffff0000;
3058 
3059 	/* we know that this iommu should be at offset 0xa000 from vtbar */
3060 	drhd = dmar_find_matched_drhd_unit(pdev);
3061 	if (WARN_TAINT_ONCE(!drhd || drhd->reg_base_addr - vtbar != 0xa000,
3062 			    TAINT_FIRMWARE_WORKAROUND,
3063 			    "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n"))
3064 		pdev->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3065 }
3066 DECLARE_PCI_FIXUP_ENABLE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_IOAT_SNB, quirk_ioat_snb_local_iommu);
3067 
3068 static void __init init_no_remapping_devices(void)
3069 {
3070 	struct dmar_drhd_unit *drhd;
3071 
3072 	for_each_drhd_unit(drhd) {
3073 		if (!drhd->include_all) {
3074 			int i;
3075 			for (i = 0; i < drhd->devices_cnt; i++)
3076 				if (drhd->devices[i] != NULL)
3077 					break;
3078 			/* ignore DMAR unit if no pci devices exist */
3079 			if (i == drhd->devices_cnt)
3080 				drhd->ignored = 1;
3081 		}
3082 	}
3083 
3084 	if (dmar_map_gfx)
3085 		return;
3086 
3087 	for_each_drhd_unit(drhd) {
3088 		int i;
3089 		if (drhd->ignored || drhd->include_all)
3090 			continue;
3091 
3092 		for (i = 0; i < drhd->devices_cnt; i++)
3093 			if (drhd->devices[i] &&
3094 				!IS_GFX_DEVICE(drhd->devices[i]))
3095 				break;
3096 
3097 		if (i < drhd->devices_cnt)
3098 			continue;
3099 
3100 		/* bypass IOMMU if it is just for gfx devices */
3101 		drhd->ignored = 1;
3102 		for (i = 0; i < drhd->devices_cnt; i++) {
3103 			if (!drhd->devices[i])
3104 				continue;
3105 			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
3106 		}
3107 	}
3108 }
3109 
3110 #ifdef CONFIG_SUSPEND
3111 static int init_iommu_hw(void)
3112 {
3113 	struct dmar_drhd_unit *drhd;
3114 	struct intel_iommu *iommu = NULL;
3115 
3116 	for_each_active_iommu(iommu, drhd)
3117 		if (iommu->qi)
3118 			dmar_reenable_qi(iommu);
3119 
3120 	for_each_active_iommu(iommu, drhd) {
3121 		iommu_flush_write_buffer(iommu);
3122 
3123 		iommu_set_root_entry(iommu);
3124 
3125 		iommu->flush.flush_context(iommu, 0, 0, 0,
3126 					   DMA_CCMD_GLOBAL_INVL);
3127 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3128 					 DMA_TLB_GLOBAL_FLUSH);
3129 		iommu_enable_translation(iommu);
3130 		iommu_disable_protect_mem_regions(iommu);
3131 	}
3132 
3133 	return 0;
3134 }
3135 
3136 static void iommu_flush_all(void)
3137 {
3138 	struct dmar_drhd_unit *drhd;
3139 	struct intel_iommu *iommu;
3140 
3141 	for_each_active_iommu(iommu, drhd) {
3142 		iommu->flush.flush_context(iommu, 0, 0, 0,
3143 					   DMA_CCMD_GLOBAL_INVL);
3144 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3145 					 DMA_TLB_GLOBAL_FLUSH);
3146 	}
3147 }
3148 
3149 static int iommu_suspend(void)
3150 {
3151 	struct dmar_drhd_unit *drhd;
3152 	struct intel_iommu *iommu = NULL;
3153 	unsigned long flag;
3154 
3155 	for_each_active_iommu(iommu, drhd) {
3156 		iommu->iommu_state = kzalloc(sizeof(u32) * MAX_SR_DMAR_REGS,
3157 						 GFP_ATOMIC);
3158 		if (!iommu->iommu_state)
3159 			goto nomem;
3160 	}
3161 
3162 	iommu_flush_all();
3163 
3164 	for_each_active_iommu(iommu, drhd) {
3165 		iommu_disable_translation(iommu);
3166 
3167 		spin_lock_irqsave(&iommu->register_lock, flag);
3168 
3169 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3170 			readl(iommu->reg + DMAR_FECTL_REG);
3171 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3172 			readl(iommu->reg + DMAR_FEDATA_REG);
3173 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3174 			readl(iommu->reg + DMAR_FEADDR_REG);
3175 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3176 			readl(iommu->reg + DMAR_FEUADDR_REG);
3177 
3178 		spin_unlock_irqrestore(&iommu->register_lock, flag);
3179 	}
3180 	return 0;
3181 
3182 nomem:
3183 	for_each_active_iommu(iommu, drhd)
3184 		kfree(iommu->iommu_state);
3185 
3186 	return -ENOMEM;
3187 }
3188 
3189 static void iommu_resume(void)
3190 {
3191 	struct dmar_drhd_unit *drhd;
3192 	struct intel_iommu *iommu = NULL;
3193 	unsigned long flag;
3194 
3195 	if (init_iommu_hw()) {
3196 		WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3197 		return;
3198 	}
3199 
3200 	for_each_active_iommu(iommu, drhd) {
3201 
3202 		spin_lock_irqsave(&iommu->register_lock, flag);
3203 
3204 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3205 			iommu->reg + DMAR_FECTL_REG);
3206 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3207 			iommu->reg + DMAR_FEDATA_REG);
3208 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3209 			iommu->reg + DMAR_FEADDR_REG);
3210 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3211 			iommu->reg + DMAR_FEUADDR_REG);
3212 
3213 		spin_unlock_irqrestore(&iommu->register_lock, flag);
3214 	}
3215 
3216 	for_each_active_iommu(iommu, drhd)
3217 		kfree(iommu->iommu_state);
3218 }
3219 
3220 static struct syscore_ops iommu_syscore_ops = {
3221 	.resume		= iommu_resume,
3222 	.suspend	= iommu_suspend,
3223 };
3224 
3225 static void __init init_iommu_pm_ops(void)
3226 {
3227 	register_syscore_ops(&iommu_syscore_ops);
3228 }
3229 
3230 #else
3231 static inline void init_iommu_pm_ops(void) {}
3232 #endif	/* CONFIG_SUSPEND */
3233 
3234 /*
3235  * Here we only respond to the unbinding of a device from its driver.
3236  *
3237  * A newly added device is not attached to its DMAR domain here yet. That
3238  * happens when the device is first mapped to an iova.
3239  */
3240 static int device_notifier(struct notifier_block *nb,
3241 				  unsigned long action, void *data)
3242 {
3243 	struct device *dev = data;
3244 	struct pci_dev *pdev = to_pci_dev(dev);
3245 	struct dmar_domain *domain;
3246 
3247 	if (iommu_no_mapping(dev))
3248 		return 0;
3249 
3250 	domain = find_domain(pdev);
3251 	if (!domain)
3252 		return 0;
3253 
3254 	if (action == BUS_NOTIFY_UNBOUND_DRIVER && !iommu_pass_through) {
3255 		domain_remove_one_dev_info(domain, pdev);
3256 
3257 		if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) &&
3258 		    !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) &&
3259 		    list_empty(&domain->devices))
3260 			domain_exit(domain);
3261 	}
3262 
3263 	return 0;
3264 }
3265 
3266 static struct notifier_block device_nb = {
3267 	.notifier_call = device_notifier,
3268 };
3269 
3270 int __init intel_iommu_init(void)
3271 {
3272 	int ret = 0;
3273 	int force_on = 0;
3274 
3275 	/* VT-d is required for a TXT/tboot launch, so enforce that */
3276 	force_on = tboot_force_iommu();
3277 
3278 	if (dmar_table_init()) {
3279 		if (force_on)
3280 			panic("tboot: Failed to initialize DMAR table\n");
3281 		return 	-ENODEV;
3282 	}
3283 
3284 	if (dmar_dev_scope_init()) {
3285 		if (force_on)
3286 			panic("tboot: Failed to initialize DMAR device scope\n");
3287 		return 	-ENODEV;
3288 	}
3289 
3290 	/*
3291 	 * Check the need for DMA-remapping initialization now.
3292 	 * Above initialization will also be used by Interrupt-remapping.
3293 	 */
3294 	if (no_iommu || dmar_disabled)
3295 		return -ENODEV;
3296 
3297 	if (iommu_init_mempool()) {
3298 		if (force_on)
3299 			panic("tboot: Failed to initialize iommu memory\n");
3300 		return 	-ENODEV;
3301 	}
3302 
3303 	if (dmar_init_reserved_ranges()) {
3304 		if (force_on)
3305 			panic("tboot: Failed to reserve iommu ranges\n");
3306 		return 	-ENODEV;
3307 	}
3308 
3309 	init_no_remapping_devices();
3310 
3311 	ret = init_dmars(force_on);
3312 	if (ret) {
3313 		if (force_on)
3314 			panic("tboot: Failed to initialize DMARs\n");
3315 		printk(KERN_ERR "IOMMU: dmar init failed\n");
3316 		put_iova_domain(&reserved_iova_list);
3317 		iommu_exit_mempool();
3318 		return ret;
3319 	}
3320 	printk(KERN_INFO
3321 	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
3322 
3323 	init_timer(&unmap_timer);
3324 #ifdef CONFIG_SWIOTLB
3325 	swiotlb = 0;
3326 #endif
3327 	dma_ops = &intel_dma_ops;
3328 
3329 	init_iommu_pm_ops();
3330 
3331 	register_iommu(&intel_iommu_ops);
3332 
3333 	bus_register_notifier(&pci_bus_type, &device_nb);
3334 
3335 	return 0;
3336 }
3337 
3338 static void iommu_detach_dependent_devices(struct intel_iommu *iommu,
3339 					   struct pci_dev *pdev)
3340 {
3341 	struct pci_dev *tmp, *parent;
3342 
3343 	if (!iommu || !pdev)
3344 		return;
3345 
3346 	/* dependent device detach */
3347 	tmp = pci_find_upstream_pcie_bridge(pdev);
3348 	/* Secondary interface's bus number and devfn 0 */
3349 	if (tmp) {
3350 		parent = pdev->bus->self;
3351 		while (parent != tmp) {
3352 			iommu_detach_dev(iommu, parent->bus->number,
3353 					 parent->devfn);
3354 			parent = parent->bus->self;
3355 		}
3356 		if (pci_is_pcie(tmp)) /* this is a PCIe-to-PCI bridge */
3357 			iommu_detach_dev(iommu,
3358 				tmp->subordinate->number, 0);
3359 		else /* this is a legacy PCI bridge */
3360 			iommu_detach_dev(iommu, tmp->bus->number,
3361 					 tmp->devfn);
3362 	}
3363 }
3364 
3365 static void domain_remove_one_dev_info(struct dmar_domain *domain,
3366 					  struct pci_dev *pdev)
3367 {
3368 	struct device_domain_info *info;
3369 	struct intel_iommu *iommu;
3370 	unsigned long flags;
3371 	int found = 0;
3372 	struct list_head *entry, *tmp;
3373 
3374 	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
3375 				pdev->devfn);
3376 	if (!iommu)
3377 		return;
3378 
3379 	spin_lock_irqsave(&device_domain_lock, flags);
3380 	list_for_each_safe(entry, tmp, &domain->devices) {
3381 		info = list_entry(entry, struct device_domain_info, link);
3382 		/* No need to compare PCI domain; it has to be the same */
3383 		if (info->bus == pdev->bus->number &&
3384 		    info->devfn == pdev->devfn) {
3385 			list_del(&info->link);
3386 			list_del(&info->global);
3387 			if (info->dev)
3388 				info->dev->dev.archdata.iommu = NULL;
3389 			spin_unlock_irqrestore(&device_domain_lock, flags);
3390 
3391 			iommu_disable_dev_iotlb(info);
3392 			iommu_detach_dev(iommu, info->bus, info->devfn);
3393 			iommu_detach_dependent_devices(iommu, pdev);
3394 			free_devinfo_mem(info);
3395 
3396 			spin_lock_irqsave(&device_domain_lock, flags);
3397 
3398 			if (found)
3399 				break;
3400 			else
3401 				continue;
3402 		}
3403 
3404 		/* if there are no other devices under the same iommu
3405 		 * owned by this domain, clear this iommu in iommu_bmp,
3406 		 * and update the iommu count and coherency
3407 		 */
3408 		if (iommu == device_to_iommu(info->segment, info->bus,
3409 					    info->devfn))
3410 			found = 1;
3411 	}
3412 
3413 	if (found == 0) {
3414 		unsigned long tmp_flags;
3415 		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
3416 		clear_bit(iommu->seq_id, &domain->iommu_bmp);
3417 		domain->iommu_count--;
3418 		domain_update_iommu_cap(domain);
3419 		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
3420 
3421 		spin_lock_irqsave(&iommu->lock, tmp_flags);
3422 		clear_bit(domain->id, iommu->domain_ids);
3423 		iommu->domains[domain->id] = NULL;
3424 		spin_unlock_irqrestore(&iommu->lock, tmp_flags);
3425 	}
3426 
3427 	spin_unlock_irqrestore(&device_domain_lock, flags);
3428 }
3429 
3430 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
3431 {
3432 	struct device_domain_info *info;
3433 	struct intel_iommu *iommu;
3434 	unsigned long flags1, flags2;
3435 
3436 	spin_lock_irqsave(&device_domain_lock, flags1);
3437 	while (!list_empty(&domain->devices)) {
3438 		info = list_entry(domain->devices.next,
3439 			struct device_domain_info, link);
3440 		list_del(&info->link);
3441 		list_del(&info->global);
3442 		if (info->dev)
3443 			info->dev->dev.archdata.iommu = NULL;
3444 
3445 		spin_unlock_irqrestore(&device_domain_lock, flags1);
3446 
3447 		iommu_disable_dev_iotlb(info);
3448 		iommu = device_to_iommu(info->segment, info->bus, info->devfn);
3449 		iommu_detach_dev(iommu, info->bus, info->devfn);
3450 		iommu_detach_dependent_devices(iommu, info->dev);
3451 
3452 		/* clear this iommu in iommu_bmp, update iommu count
3453 		 * and capabilities
3454 		 */
3455 		spin_lock_irqsave(&domain->iommu_lock, flags2);
3456 		if (test_and_clear_bit(iommu->seq_id,
3457 				       &domain->iommu_bmp)) {
3458 			domain->iommu_count--;
3459 			domain_update_iommu_cap(domain);
3460 		}
3461 		spin_unlock_irqrestore(&domain->iommu_lock, flags2);
3462 
3463 		free_devinfo_mem(info);
3464 		spin_lock_irqsave(&device_domain_lock, flags1);
3465 	}
3466 	spin_unlock_irqrestore(&device_domain_lock, flags1);
3467 }
3468 
3469 /* domain id for virtual machine, it won't be set in context */
3470 static unsigned long vm_domid;
3471 
3472 static struct dmar_domain *iommu_alloc_vm_domain(void)
3473 {
3474 	struct dmar_domain *domain;
3475 
3476 	domain = alloc_domain_mem();
3477 	if (!domain)
3478 		return NULL;
3479 
3480 	domain->id = vm_domid++;
3481 	domain->nid = -1;
3482 	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
3483 	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
3484 
3485 	return domain;
3486 }
3487 
3488 static int md_domain_init(struct dmar_domain *domain, int guest_width)
3489 {
3490 	int adjust_width;
3491 
3492 	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
3493 	spin_lock_init(&domain->iommu_lock);
3494 
3495 	domain_reserve_special_ranges(domain);
3496 
3497 	/* calculate AGAW */
3498 	domain->gaw = guest_width;
3499 	adjust_width = guestwidth_to_adjustwidth(guest_width);
3500 	domain->agaw = width_to_agaw(adjust_width);
3501 
3502 	INIT_LIST_HEAD(&domain->devices);
3503 
3504 	domain->iommu_count = 0;
3505 	domain->iommu_coherency = 0;
3506 	domain->iommu_snooping = 0;
3507 	domain->max_addr = 0;
3508 	domain->nid = -1;
3509 
3510 	/* always allocate the top pgd */
3511 	domain->pgd = (struct dma_pte *)alloc_pgtable_page(domain->nid);
3512 	if (!domain->pgd)
3513 		return -ENOMEM;
3514 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
3515 	return 0;
3516 }
3517 
3518 static void iommu_free_vm_domain(struct dmar_domain *domain)
3519 {
3520 	unsigned long flags;
3521 	struct dmar_drhd_unit *drhd;
3522 	struct intel_iommu *iommu;
3523 	unsigned long i;
3524 	unsigned long ndomains;
3525 
3526 	for_each_drhd_unit(drhd) {
3527 		if (drhd->ignored)
3528 			continue;
3529 		iommu = drhd->iommu;
3530 
3531 		ndomains = cap_ndoms(iommu->cap);
3532 		for_each_set_bit(i, iommu->domain_ids, ndomains) {
3533 			if (iommu->domains[i] == domain) {
3534 				spin_lock_irqsave(&iommu->lock, flags);
3535 				clear_bit(i, iommu->domain_ids);
3536 				iommu->domains[i] = NULL;
3537 				spin_unlock_irqrestore(&iommu->lock, flags);
3538 				break;
3539 			}
3540 		}
3541 	}
3542 }
3543 
3544 static void vm_domain_exit(struct dmar_domain *domain)
3545 {
3546 	/* Domain 0 is reserved, so don't process it */
3547 	if (!domain)
3548 		return;
3549 
3550 	vm_domain_remove_all_dev_info(domain);
3551 	/* destroy iovas */
3552 	put_iova_domain(&domain->iovad);
3553 
3554 	/* clear ptes */
3555 	dma_pte_clear_range(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3556 
3557 	/* free page tables */
3558 	dma_pte_free_pagetable(domain, 0, DOMAIN_MAX_PFN(domain->gaw));
3559 
3560 	iommu_free_vm_domain(domain);
3561 	free_domain_mem(domain);
3562 }
3563 
3564 static int intel_iommu_domain_init(struct iommu_domain *domain)
3565 {
3566 	struct dmar_domain *dmar_domain;
3567 
3568 	dmar_domain = iommu_alloc_vm_domain();
3569 	if (!dmar_domain) {
3570 		printk(KERN_ERR
3571 			"intel_iommu_domain_init: dmar_domain == NULL\n");
3572 		return -ENOMEM;
3573 	}
3574 	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
3575 		printk(KERN_ERR
3576 			"intel_iommu_domain_init() failed\n");
3577 		vm_domain_exit(dmar_domain);
3578 		return -ENOMEM;
3579 	}
3580 	domain->priv = dmar_domain;
3581 
3582 	return 0;
3583 }
3584 
intel_iommu_domain_destroy(struct iommu_domain * domain)3585 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3586 {
3587 	struct dmar_domain *dmar_domain = domain->priv;
3588 
3589 	domain->priv = NULL;
3590 	vm_domain_exit(dmar_domain);
3591 }
3592 
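/*
 * Attach a PCI device to a VM domain.  Any mapping the device already
 * has is torn down first, the target IOMMU's supported address width is
 * checked against what the domain has already mapped, the domain's page
 * table is trimmed to the number of levels that IOMMU can walk, and
 * finally the device info/context entry is installed.
 */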
static int intel_iommu_attach_device(struct iommu_domain *domain,
				     struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);
	struct intel_iommu *iommu;
	int addr_width;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE ||
			    dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)
				domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pci_domain_nr(pdev->bus), pdev->bus->number,
				pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	if (addr_width > cap_mgaw(iommu->cap))
		addr_width = cap_mgaw(iommu->cap);

	if (dmar_domain->max_addr > (1LL << addr_width)) {
		printk(KERN_ERR "%s: iommu width (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, addr_width, dmar_domain->max_addr);
		return -EFAULT;
	}
	dmar_domain->gaw = addr_width;

	/*
	 * Knock out extra levels of page tables if necessary
	 */
	while (iommu->agaw < dmar_domain->agaw) {
		struct dma_pte *pte;

		pte = dmar_domain->pgd;
		if (dma_pte_present(pte)) {
			dmar_domain->pgd = (struct dma_pte *)
				phys_to_virt(dma_pte_addr(pte));
			free_pgtable_page(pte);
		}
		dmar_domain->agaw--;
	}

	return domain_add_dev_info(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL);
}

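/* Detach one device from a VM domain; the inverse of the attach above. */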
static void intel_iommu_detach_device(struct iommu_domain *domain,
				      struct device *dev)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct pci_dev *pdev = to_pci_dev(dev);

	domain_remove_one_dev_info(dmar_domain, pdev);
}

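/*
 * Map a physically contiguous range for the caller.  The generic API at
 * this point passes sizes as a page order, so the length is
 * PAGE_SIZE << gfp_order.  The generic IOMMU_* protection flags are
 * translated into VT-d PTE bits, with snooping honoured only when the
 * hardware advertises it, and the range is checked against the domain's
 * guest address width before any PTEs are written.
 */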
static int intel_iommu_map(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t hpa,
			   int gfp_order, int iommu_prot)
{
	struct dmar_domain *dmar_domain = domain->priv;
	u64 max_addr;
	int prot = 0;
	size_t size;
	int ret;

	if (iommu_prot & IOMMU_READ)
		prot |= DMA_PTE_READ;
	if (iommu_prot & IOMMU_WRITE)
		prot |= DMA_PTE_WRITE;
	if ((iommu_prot & IOMMU_CACHE) && dmar_domain->iommu_snooping)
		prot |= DMA_PTE_SNP;

	size     = PAGE_SIZE << gfp_order;
	max_addr = iova + size;
	if (dmar_domain->max_addr < max_addr) {
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu width (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, dmar_domain->gaw, max_addr);
			return -EFAULT;
		}
		dmar_domain->max_addr = max_addr;
	}
	/* Round up size to next multiple of PAGE_SIZE, if it and
	   the low bits of hpa would take us onto the next page */
	size = aligned_nrpages(hpa, size);
	ret = domain_pfn_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
				 hpa >> VTD_PAGE_SHIFT, size, prot);
	return ret;
}

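/*
 * Unmap a page-order-sized range by clearing its PTEs.  The page-table
 * pages themselves are left in place, and max_addr is only rolled back
 * when the unmapped range was the highest mapping in the domain.
 */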
static int intel_iommu_unmap(struct iommu_domain *domain,
			     unsigned long iova, int gfp_order)
{
	struct dmar_domain *dmar_domain = domain->priv;
	size_t size = PAGE_SIZE << gfp_order;

	dma_pte_clear_range(dmar_domain, iova >> VTD_PAGE_SHIFT,
			    (iova + size - 1) >> VTD_PAGE_SHIFT);

	if (dmar_domain->max_addr == iova + size)
		dmar_domain->max_addr = iova;

	return gfp_order;
}

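/*
 * Translate an IOVA to the host physical address recorded in its PTE,
 * returning 0 if nothing is mapped there.  Only the page-aligned
 * address from the PTE is returned; the low page-offset bits of the
 * IOVA are not added back in here.
 */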
static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
					    unsigned long iova)
{
	struct dmar_domain *dmar_domain = domain->priv;
	struct dma_pte *pte;
	u64 phys = 0;

	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}

static int intel_iommu_domain_has_cap(struct iommu_domain *domain,
				      unsigned long cap)
{
	struct dmar_domain *dmar_domain = domain->priv;

	if (cap == IOMMU_CAP_CACHE_COHERENCY)
		return dmar_domain->iommu_snooping;
	if (cap == IOMMU_CAP_INTR_REMAP)
		return intr_remapping_enabled;

	return 0;
}

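/*
 * Hook the callbacks above into the generic IOMMU API.  This ops table
 * is registered with the core IOMMU layer during intel_iommu_init()
 * (elsewhere in this file); after that, consumers use the generic
 * helpers (iommu_domain_alloc(), iommu_attach_device(), iommu_map(),
 * ...) and never call the intel_* functions directly.  A rough sketch
 * of a consumer, with error handling omitted:
 *
 *	struct iommu_domain *dom = iommu_domain_alloc(...);
 *	iommu_attach_device(dom, &pdev->dev);
 *	iommu_map(dom, iova, phys, order, IOMMU_READ | IOMMU_WRITE);
 *
 * (Sketch only; the exact iommu_domain_alloc() signature depends on the
 * kernel version.)
 */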
static struct iommu_ops intel_iommu_ops = {
	.domain_init	= intel_iommu_domain_init,
	.domain_destroy = intel_iommu_domain_destroy,
	.attach_dev	= intel_iommu_attach_device,
	.detach_dev	= intel_iommu_detach_device,
	.map		= intel_iommu_map,
	.unmap		= intel_iommu_unmap,
	.iova_to_phys	= intel_iommu_iova_to_phys,
	.domain_has_cap = intel_iommu_domain_has_cap,
};

static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
{
	/*
	 * Mobile 4 Series Chipset neglects to set RWBF capability,
	 * but needs it:
	 */
	printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
	rwbf_quirk = 1;

	/* https://bugzilla.redhat.com/show_bug.cgi?id=538163 */
	if (dev->revision == 0x07) {
		printk(KERN_INFO "DMAR: Disabling IOMMU for graphics on this chipset\n");
		dmar_map_gfx = 0;
	}
}

DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);

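/*
 * GGC is the graphics control register in the host bridge's PCI config
 * space on these (Ironlake-era "Calpella") platforms; the encodings
 * below describe how much stolen memory the BIOS set aside for the
 * graphics GTT and whether a VT-enabled ("shadow GTT") allocation was
 * made.  If the BIOS made no such allocation, the quirk below disables
 * the IOMMU for graphics rather than risk broken graphics DMA.
 */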
#define GGC 0x52
#define GGC_MEMORY_SIZE_MASK	(0xf << 8)
#define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
#define GGC_MEMORY_SIZE_1M	(0x1 << 8)
#define GGC_MEMORY_SIZE_2M	(0x3 << 8)
#define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
#define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
#define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
#define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)

static void __devinit quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
{
	unsigned short ggc;

	if (pci_read_config_word(dev, GGC, &ggc))
		return;

	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
		printk(KERN_INFO "DMAR: BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
		dmar_map_gfx = 0;
	}
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);

/* On Tylersburg chipsets, some BIOSes have been known to enable the
   ISOCH DMAR unit for the Azalia sound device, but not give it any
   TLB entries, which causes it to deadlock. Check for that.  We do
   this in a function called from init_dmars(), instead of in a PCI
   quirk, because we don't want to print the obnoxious "BIOS broken"
   message if VT-d is actually disabled.
*/
static void __init check_tylersburg_isoch(void)
{
	struct pci_dev *pdev;
	uint32_t vtisochctrl;

	/* If there's no Azalia in the system anyway, forget it. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
	if (!pdev)
		return;
	pci_dev_put(pdev);

	/* System Management Registers. Might be hidden, in which case
	   we can't do the sanity check. But that's OK, because the
	   known-broken BIOSes _don't_ actually hide it, so far. */
	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
	if (!pdev)
		return;

	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
		pci_dev_put(pdev);
		return;
	}

	pci_dev_put(pdev);

	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
	if (vtisochctrl & 1)
		return;

	/* Drop all bits other than the number of TLB entries */
	vtisochctrl &= 0x1c;

	/* If we have the recommended number of TLB entries (16), fine. */
	if (vtisochctrl == 0x10)
		return;

	/* Zero TLB entries? You get to ride the short bus to school. */
	if (!vtisochctrl) {
		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
		     dmi_get_system_info(DMI_BIOS_VENDOR),
		     dmi_get_system_info(DMI_BIOS_VERSION),
		     dmi_get_system_info(DMI_PRODUCT_VERSION));
		iommu_identity_mapping |= IDENTMAP_AZALIA;
		return;
	}

	printk(KERN_WARNING "DMAR: Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
	       vtisochctrl);
}