1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Copyright © 2006-2014 Intel Corporation.
4  *
5  * Authors: David Woodhouse <dwmw2@infradead.org>,
6  *          Ashok Raj <ashok.raj@intel.com>,
7  *          Shaohua Li <shaohua.li@intel.com>,
8  *          Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>,
9  *          Fenghua Yu <fenghua.yu@intel.com>
10  *          Joerg Roedel <jroedel@suse.de>
11  */
12 
13 #define pr_fmt(fmt)     "DMAR: " fmt
14 #define dev_fmt(fmt)    pr_fmt(fmt)
15 
16 #include <linux/crash_dump.h>
17 #include <linux/dma-direct.h>
18 #include <linux/dmi.h>
19 #include <linux/intel-svm.h>
20 #include <linux/memory.h>
21 #include <linux/pci.h>
22 #include <linux/pci-ats.h>
23 #include <linux/spinlock.h>
24 #include <linux/syscore_ops.h>
25 #include <linux/tboot.h>
26 
27 #include "iommu.h"
28 #include "../dma-iommu.h"
29 #include "../irq_remapping.h"
30 #include "../iommu-sva-lib.h"
31 #include "pasid.h"
32 #include "cap_audit.h"
33 
34 #define ROOT_SIZE		VTD_PAGE_SIZE
35 #define CONTEXT_SIZE		VTD_PAGE_SIZE
36 
37 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
38 #define IS_USB_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_SERIAL_USB)
39 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
40 #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e)
41 
42 #define IOAPIC_RANGE_START	(0xfee00000)
43 #define IOAPIC_RANGE_END	(0xfeefffff)
44 #define IOVA_START_ADDR		(0x1000)
45 
46 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 57
47 
48 #define MAX_AGAW_WIDTH 64
49 #define MAX_AGAW_PFN_WIDTH	(MAX_AGAW_WIDTH - VTD_PAGE_SHIFT)
50 
51 #define __DOMAIN_MAX_PFN(gaw)  ((((uint64_t)1) << ((gaw) - VTD_PAGE_SHIFT)) - 1)
52 #define __DOMAIN_MAX_ADDR(gaw) ((((uint64_t)1) << (gaw)) - 1)
53 
54 /* We limit DOMAIN_MAX_PFN to fit in an unsigned long, and DOMAIN_MAX_ADDR
55    to match. That way, we can use 'unsigned long' for PFNs with impunity. */
56 #define DOMAIN_MAX_PFN(gaw)	((unsigned long) min_t(uint64_t, \
57 				__DOMAIN_MAX_PFN(gaw), (unsigned long)-1))
58 #define DOMAIN_MAX_ADDR(gaw)	(((uint64_t)__DOMAIN_MAX_PFN(gaw)) << VTD_PAGE_SHIFT)
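
/*
 * Worked example (illustrative only): with gaw == 48 and VTD_PAGE_SHIFT == 12,
 * __DOMAIN_MAX_PFN(48) == (1ULL << 36) - 1, which easily fits in an unsigned
 * long, and DOMAIN_MAX_ADDR(48) == (1ULL << 48) - VTD_PAGE_SIZE, i.e. the base
 * of the last 4KiB page below 256TiB.
 */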
59 
60 /* IO virtual address start page frame number */
61 #define IOVA_START_PFN		(1)
62 
63 #define IOVA_PFN(addr)		((addr) >> PAGE_SHIFT)
64 
65 /* page table handling */
66 #define LEVEL_STRIDE		(9)
67 #define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)
68 
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}
73 
static inline int agaw_to_width(int agaw)
{
	return min_t(int, 30 + agaw * LEVEL_STRIDE, MAX_AGAW_WIDTH);
}
78 
static inline int width_to_agaw(int width)
{
	return DIV_ROUND_UP(width - 30, LEVEL_STRIDE);
}
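
/*
 * Illustrative mapping between the quantities above: agaw 1 is a 39-bit,
 * 3-level table; agaw 2 is 48-bit, 4-level; agaw 3 is 57-bit, 5-level
 * (agaw_to_level() == agaw + 2 and agaw_to_width() == 30 + 9 * agaw).
 */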
83 
static inline unsigned int level_to_offset_bits(int level)
{
	return (level - 1) * LEVEL_STRIDE;
}
88 
static inline int pfn_level_offset(u64 pfn, int level)
{
	return (pfn >> level_to_offset_bits(level)) & LEVEL_MASK;
}
93 
static inline u64 level_mask(int level)
{
	return -1ULL << level_to_offset_bits(level);
}
98 
static inline u64 level_size(int level)
{
	return 1ULL << level_to_offset_bits(level);
}
103 
static inline u64 align_to_level(u64 pfn, int level)
{
	return (pfn + level_size(level) - 1) & level_mask(level);
}
108 
static inline unsigned long lvl_to_nr_pages(unsigned int lvl)
{
	return 1UL << min_t(int, (lvl - 1) * LEVEL_STRIDE, MAX_AGAW_PFN_WIDTH);
}
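
/*
 * For example, lvl_to_nr_pages(1) == 1 (a 4KiB leaf), lvl_to_nr_pages(2) ==
 * 512 (a 2MiB superpage) and lvl_to_nr_pages(3) == 512 * 512 (a 1GiB
 * superpage), capped at MAX_AGAW_PFN_WIDTH.
 */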
113 
114 /* VT-d pages must always be _smaller_ than MM pages. Otherwise things
115    are never going to work. */
static inline unsigned long mm_to_dma_pfn(unsigned long mm_pfn)
{
	return mm_pfn << (PAGE_SHIFT - VTD_PAGE_SHIFT);
}
static inline unsigned long page_to_dma_pfn(struct page *pg)
{
	return mm_to_dma_pfn(page_to_pfn(pg));
}
static inline unsigned long virt_to_dma_pfn(void *p)
{
	return page_to_dma_pfn(virt_to_page(p));
}
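
/*
 * Note (illustrative): on x86 with 4KiB pages, PAGE_SHIFT == VTD_PAGE_SHIFT
 * == 12, so the three helpers above reduce to plain pfn conversions; the
 * shift only matters when MM pages are larger than VT-d pages.
 */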
128 
129 static void __init check_tylersburg_isoch(void);
130 static int rwbf_quirk;
131 
132 /*
133  * set to 1 to panic kernel if can't successfully enable VT-d
134  * (used when kernel is launched w/ TXT)
135  */
136 static int force_on = 0;
137 static int intel_iommu_tboot_noforce;
138 static int no_platform_optin;
139 
140 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
141 
142 /*
143  * Take a root_entry and return the Lower Context Table Pointer (LCTP)
144  * if marked present.
145  */
static phys_addr_t root_entry_lctp(struct root_entry *re)
{
	if (!(re->lo & 1))
		return 0;

	return re->lo & VTD_PAGE_MASK;
}
153 
154 /*
155  * Take a root_entry and return the Upper Context Table Pointer (UCTP)
156  * if marked present.
157  */
static phys_addr_t root_entry_uctp(struct root_entry *re)
{
	if (!(re->hi & 1))
		return 0;

	return re->hi & VTD_PAGE_MASK;
}
165 
static inline void context_set_present(struct context_entry *context)
{
	context->lo |= 1;
}
170 
static inline void context_set_fault_enable(struct context_entry *context)
{
	context->lo &= (((u64)-1) << 2) | 1;
}
175 
static inline void context_set_translation_type(struct context_entry *context,
						unsigned long value)
{
	context->lo &= (((u64)-1) << 4) | 3;
	context->lo |= (value & 3) << 2;
}
182 
static inline void context_set_address_root(struct context_entry *context,
					    unsigned long value)
{
	context->lo &= ~VTD_PAGE_MASK;
	context->lo |= value & VTD_PAGE_MASK;
}
189 
static inline void context_set_address_width(struct context_entry *context,
					     unsigned long value)
{
	context->hi |= value & 7;
}
195 
static inline void context_set_domain_id(struct context_entry *context,
					 unsigned long value)
{
	context->hi |= (value & ((1 << 16) - 1)) << 8;
}
201 
static inline void context_set_pasid(struct context_entry *context)
{
	context->lo |= CONTEXT_PASIDE;
}
206 
static inline int context_domain_id(struct context_entry *c)
{
	return((c->hi >> 8) & 0xffff);
}
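
/*
 * Summary of the legacy-mode context-entry layout implied by the helpers
 * above: lo[0] is the present bit, lo[1] the fault-processing-disable bit
 * cleared by context_set_fault_enable(), lo[3:2] the translation type and
 * lo[63:12] the page-table root; hi[2:0] holds the address width and
 * hi[23:8] the 16-bit domain id returned by context_domain_id().
 */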
211 
static inline void context_clear_entry(struct context_entry *context)
{
	context->lo = 0;
	context->hi = 0;
}
217 
static inline bool context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	if (!iommu->copied_tables)
		return false;

	return test_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}
225 
static inline void
set_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	set_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}
231 
static inline void
clear_context_copied(struct intel_iommu *iommu, u8 bus, u8 devfn)
{
	clear_bit(((long)bus << 8) | devfn, iommu->copied_tables);
}
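
/*
 * The copied_tables bitmap is indexed by the 16-bit source id
 * ((bus << 8) | devfn); e.g. bus 0x3a, devfn 0x10 maps to bit 0x3a10, so a
 * single 64K-bit (8KiB) bitmap tracks every possible BDF behind this IOMMU.
 */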
237 
/*
 * This domain is a statically identity mapping domain.
 *	1. This domain creates a static 1:1 mapping to all usable memory.
 *	2. It maps to each iommu if successful.
 *	3. Each iommu maps to this domain if successful.
 */
244 static struct dmar_domain *si_domain;
245 static int hw_pass_through = 1;
246 
247 struct dmar_rmrr_unit {
248 	struct list_head list;		/* list of rmrr units	*/
249 	struct acpi_dmar_header *hdr;	/* ACPI header		*/
250 	u64	base_address;		/* reserved base address*/
251 	u64	end_address;		/* reserved end address */
252 	struct dmar_dev_scope *devices;	/* target devices */
253 	int	devices_cnt;		/* target device count */
254 };
255 
256 struct dmar_atsr_unit {
257 	struct list_head list;		/* list of ATSR units */
258 	struct acpi_dmar_header *hdr;	/* ACPI header */
259 	struct dmar_dev_scope *devices;	/* target devices */
260 	int devices_cnt;		/* target device count */
261 	u8 include_all:1;		/* include all ports */
262 };
263 
264 struct dmar_satc_unit {
265 	struct list_head list;		/* list of SATC units */
266 	struct acpi_dmar_header *hdr;	/* ACPI header */
267 	struct dmar_dev_scope *devices;	/* target devices */
268 	struct intel_iommu *iommu;	/* the corresponding iommu */
269 	int devices_cnt;		/* target device count */
270 	u8 atc_required:1;		/* ATS is required */
271 };
272 
273 static LIST_HEAD(dmar_atsr_units);
274 static LIST_HEAD(dmar_rmrr_units);
275 static LIST_HEAD(dmar_satc_units);
276 
277 #define for_each_rmrr_units(rmrr) \
278 	list_for_each_entry(rmrr, &dmar_rmrr_units, list)
279 
280 static void dmar_remove_one_dev_info(struct device *dev);
281 
282 int dmar_disabled = !IS_ENABLED(CONFIG_INTEL_IOMMU_DEFAULT_ON);
283 int intel_iommu_sm = IS_ENABLED(CONFIG_INTEL_IOMMU_SCALABLE_MODE_DEFAULT_ON);
284 
285 int intel_iommu_enabled = 0;
286 EXPORT_SYMBOL_GPL(intel_iommu_enabled);
287 
288 static int dmar_map_gfx = 1;
289 static int intel_iommu_superpage = 1;
290 static int iommu_identity_mapping;
291 static int iommu_skip_te_disable;
292 
293 #define IDENTMAP_GFX		2
294 #define IDENTMAP_AZALIA		4
295 
296 const struct iommu_ops intel_iommu_ops;
297 
static bool translation_pre_enabled(struct intel_iommu *iommu)
{
	return (iommu->flags & VTD_FLAG_TRANS_PRE_ENABLED);
}
302 
static void clear_translation_pre_enabled(struct intel_iommu *iommu)
{
	iommu->flags &= ~VTD_FLAG_TRANS_PRE_ENABLED;
}
307 
static void init_translation_status(struct intel_iommu *iommu)
{
	u32 gsts;

	gsts = readl(iommu->reg + DMAR_GSTS_REG);
	if (gsts & DMA_GSTS_TES)
		iommu->flags |= VTD_FLAG_TRANS_PRE_ENABLED;
}
316 
static int __init intel_iommu_setup(char *str)
318 {
319 	if (!str)
320 		return -EINVAL;
321 
322 	while (*str) {
323 		if (!strncmp(str, "on", 2)) {
324 			dmar_disabled = 0;
325 			pr_info("IOMMU enabled\n");
326 		} else if (!strncmp(str, "off", 3)) {
327 			dmar_disabled = 1;
328 			no_platform_optin = 1;
329 			pr_info("IOMMU disabled\n");
330 		} else if (!strncmp(str, "igfx_off", 8)) {
331 			dmar_map_gfx = 0;
332 			pr_info("Disable GFX device mapping\n");
333 		} else if (!strncmp(str, "forcedac", 8)) {
334 			pr_warn("intel_iommu=forcedac deprecated; use iommu.forcedac instead\n");
335 			iommu_dma_forcedac = true;
336 		} else if (!strncmp(str, "strict", 6)) {
337 			pr_warn("intel_iommu=strict deprecated; use iommu.strict=1 instead\n");
338 			iommu_set_dma_strict();
339 		} else if (!strncmp(str, "sp_off", 6)) {
			pr_info("Disable supported super pages\n");
341 			intel_iommu_superpage = 0;
342 		} else if (!strncmp(str, "sm_on", 5)) {
			pr_info("Enable scalable mode if the hardware supports it\n");
344 			intel_iommu_sm = 1;
345 		} else if (!strncmp(str, "sm_off", 6)) {
346 			pr_info("Scalable mode is disallowed\n");
347 			intel_iommu_sm = 0;
348 		} else if (!strncmp(str, "tboot_noforce", 13)) {
349 			pr_info("Intel-IOMMU: not forcing on after tboot. This could expose security risk for tboot\n");
350 			intel_iommu_tboot_noforce = 1;
351 		} else {
352 			pr_notice("Unknown option - '%s'\n", str);
353 		}
354 
355 		str += strcspn(str, ",");
356 		while (*str == ',')
357 			str++;
358 	}
359 
360 	return 1;
361 }
362 __setup("intel_iommu=", intel_iommu_setup);
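
/*
 * Example usage (illustrative): booting with "intel_iommu=on,sm_on" makes the
 * loop above consume the comma-separated tokens one at a time, clearing
 * dmar_disabled for "on" and setting intel_iommu_sm for "sm_on"; an
 * unrecognised token only produces the pr_notice() message.
 */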
363 
void *alloc_pgtable_page(int node)
{
	struct page *page;
	void *vaddr = NULL;

	page = alloc_pages_node(node, GFP_ATOMIC | __GFP_ZERO, 0);
	if (page)
		vaddr = page_address(page);
	return vaddr;
}
374 
void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
379 
static inline int domain_type_is_si(struct dmar_domain *domain)
{
	return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
}
384 
static inline bool domain_use_first_level(struct dmar_domain *domain)
{
	return domain->flags & DOMAIN_FLAG_USE_FIRST_LEVEL;
}
389 
static inline int domain_pfn_supported(struct dmar_domain *domain,
				       unsigned long pfn)
{
	int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT;

	return !(addr_width < BITS_PER_LONG && pfn >> addr_width);
}
397 
398 /*
399  * Calculate the Supported Adjusted Guest Address Widths of an IOMMU.
400  * Refer to 11.4.2 of the VT-d spec for the encoding of each bit of
401  * the returned SAGAW.
402  */
static unsigned long __iommu_calculate_sagaw(struct intel_iommu *iommu)
404 {
405 	unsigned long fl_sagaw, sl_sagaw;
406 
407 	fl_sagaw = BIT(2) | (cap_fl5lp_support(iommu->cap) ? BIT(3) : 0);
408 	sl_sagaw = cap_sagaw(iommu->cap);
409 
410 	/* Second level only. */
411 	if (!sm_supported(iommu) || !ecap_flts(iommu->ecap))
412 		return sl_sagaw;
413 
414 	/* First level only. */
415 	if (!ecap_slts(iommu->ecap))
416 		return fl_sagaw;
417 
418 	return fl_sagaw & sl_sagaw;
419 }
420 
static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw)
422 {
423 	unsigned long sagaw;
424 	int agaw;
425 
426 	sagaw = __iommu_calculate_sagaw(iommu);
427 	for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) {
428 		if (test_bit(agaw, &sagaw))
429 			break;
430 	}
431 
432 	return agaw;
433 }
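
/*
 * Worked example (illustrative): SAGAW bit 2 indicates a 4-level (48-bit)
 * table and bit 3 a 5-level (57-bit) table. With sagaw == BIT(2) and
 * max_gaw == 57, width_to_agaw(57) == 3, so the loop above starts at agaw 3,
 * finds bit 3 clear, and settles on agaw 2, i.e. a 48-bit, 4-level table.
 */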
434 
435 /*
436  * Calculate max SAGAW for each iommu.
437  */
int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH);
}
442 
/*
 * Calculate the agaw for each iommu.
 * "SAGAW" may differ across iommus, so use a default agaw and fall back
 * to a smaller supported agaw for iommus that don't support the default.
 */
int iommu_calculate_agaw(struct intel_iommu *iommu)
{
	return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH);
}
452 
static inline bool iommu_paging_structure_coherency(struct intel_iommu *iommu)
{
	return sm_supported(iommu) ?
			ecap_smpwc(iommu->ecap) : ecap_coherent(iommu->ecap);
}
458 
static void domain_update_iommu_coherency(struct dmar_domain *domain)
460 {
461 	struct iommu_domain_info *info;
462 	struct dmar_drhd_unit *drhd;
463 	struct intel_iommu *iommu;
464 	bool found = false;
465 	unsigned long i;
466 
467 	domain->iommu_coherency = true;
468 	xa_for_each(&domain->iommu_array, i, info) {
469 		found = true;
470 		if (!iommu_paging_structure_coherency(info->iommu)) {
471 			domain->iommu_coherency = false;
472 			break;
473 		}
474 	}
475 	if (found)
476 		return;
477 
478 	/* No hardware attached; use lowest common denominator */
479 	rcu_read_lock();
480 	for_each_active_iommu(iommu, drhd) {
481 		if (!iommu_paging_structure_coherency(iommu)) {
482 			domain->iommu_coherency = false;
483 			break;
484 		}
485 	}
486 	rcu_read_unlock();
487 }
488 
static int domain_update_iommu_superpage(struct dmar_domain *domain,
					 struct intel_iommu *skip)
491 {
492 	struct dmar_drhd_unit *drhd;
493 	struct intel_iommu *iommu;
494 	int mask = 0x3;
495 
496 	if (!intel_iommu_superpage)
497 		return 0;
498 
499 	/* set iommu_superpage to the smallest common denominator */
500 	rcu_read_lock();
501 	for_each_active_iommu(iommu, drhd) {
502 		if (iommu != skip) {
503 			if (domain && domain_use_first_level(domain)) {
504 				if (!cap_fl1gp_support(iommu->cap))
505 					mask = 0x1;
506 			} else {
507 				mask &= cap_super_page_val(iommu->cap);
508 			}
509 
510 			if (!mask)
511 				break;
512 		}
513 	}
514 	rcu_read_unlock();
515 
516 	return fls(mask);
517 }
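
/*
 * For example, if every IOMMU reports both 2MiB and 1GiB superpage support,
 * mask stays 0x3 and fls() returns 2; if one of them only supports 2MiB,
 * mask drops to 0x1 and the domain is limited to 2MiB superpages (a return
 * value of 0 disables superpages entirely).
 */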
518 
static int domain_update_device_node(struct dmar_domain *domain)
520 {
521 	struct device_domain_info *info;
522 	int nid = NUMA_NO_NODE;
523 	unsigned long flags;
524 
525 	spin_lock_irqsave(&domain->lock, flags);
526 	list_for_each_entry(info, &domain->devices, link) {
		/*
		 * There could possibly be multiple device numa nodes, as devices
		 * within the same domain may sit behind different IOMMUs. There
		 * isn't a perfect answer in such a situation, so we use a
		 * first-come, first-served policy.
		 */
533 		nid = dev_to_node(info->dev);
534 		if (nid != NUMA_NO_NODE)
535 			break;
536 	}
537 	spin_unlock_irqrestore(&domain->lock, flags);
538 
539 	return nid;
540 }
541 
542 static void domain_update_iotlb(struct dmar_domain *domain);
543 
544 /* Return the super pagesize bitmap if supported. */
static unsigned long domain_super_pgsize_bitmap(struct dmar_domain *domain)
546 {
547 	unsigned long bitmap = 0;
548 
549 	/*
550 	 * 1-level super page supports page size of 2MiB, 2-level super page
551 	 * supports page size of both 2MiB and 1GiB.
552 	 */
553 	if (domain->iommu_superpage == 1)
554 		bitmap |= SZ_2M;
555 	else if (domain->iommu_superpage == 2)
556 		bitmap |= SZ_2M | SZ_1G;
557 
558 	return bitmap;
559 }
560 
561 /* Some capabilities may be different across iommus */
static void domain_update_iommu_cap(struct dmar_domain *domain)
563 {
564 	domain_update_iommu_coherency(domain);
565 	domain->iommu_superpage = domain_update_iommu_superpage(domain, NULL);
566 
567 	/*
568 	 * If RHSA is missing, we should default to the device numa domain
569 	 * as fall back.
570 	 */
571 	if (domain->nid == NUMA_NO_NODE)
572 		domain->nid = domain_update_device_node(domain);
573 
574 	/*
575 	 * First-level translation restricts the input-address to a
576 	 * canonical address (i.e., address bits 63:N have the same
577 	 * value as address bit [N-1], where N is 48-bits with 4-level
578 	 * paging and 57-bits with 5-level paging). Hence, skip bit
579 	 * [N-1].
580 	 */
581 	if (domain_use_first_level(domain))
582 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw - 1);
583 	else
584 		domain->domain.geometry.aperture_end = __DOMAIN_MAX_ADDR(domain->gaw);
585 
586 	domain->domain.pgsize_bitmap |= domain_super_pgsize_bitmap(domain);
587 	domain_update_iotlb(domain);
588 }
589 
struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
					 u8 devfn, int alloc)
592 {
593 	struct root_entry *root = &iommu->root_entry[bus];
594 	struct context_entry *context;
595 	u64 *entry;
596 
	/*
	 * Unless the caller requested to allocate a new entry, returning
	 * a copied context entry makes no sense.
	 */
601 	if (!alloc && context_copied(iommu, bus, devfn))
602 		return NULL;
603 
604 	entry = &root->lo;
605 	if (sm_supported(iommu)) {
606 		if (devfn >= 0x80) {
607 			devfn -= 0x80;
608 			entry = &root->hi;
609 		}
610 		devfn *= 2;
611 	}
612 	if (*entry & 1)
613 		context = phys_to_virt(*entry & VTD_PAGE_MASK);
614 	else {
615 		unsigned long phy_addr;
616 		if (!alloc)
617 			return NULL;
618 
619 		context = alloc_pgtable_page(iommu->node);
620 		if (!context)
621 			return NULL;
622 
623 		__iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
624 		phy_addr = virt_to_phys((void *)context);
625 		*entry = phy_addr | 1;
626 		__iommu_flush_cache(iommu, entry, sizeof(*entry));
627 	}
628 	return &context[devfn];
629 }
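
/*
 * Note on the indexing above (per the scalable-mode layout): a scalable-mode
 * context entry is 256 bits, twice the size of struct context_entry, so each
 * half of the root entry covers 128 devfns. E.g. devfn 0x85 is looked up via
 * root->hi at index (0x85 - 0x80) * 2 == 0x0a, the entry occupying
 * context[0x0a] and context[0x0b].
 */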
630 
631 /**
632  * is_downstream_to_pci_bridge - test if a device belongs to the PCI
633  *				 sub-hierarchy of a candidate PCI-PCI bridge
634  * @dev: candidate PCI device belonging to @bridge PCI sub-hierarchy
635  * @bridge: the candidate PCI-PCI bridge
636  *
637  * Return: true if @dev belongs to @bridge PCI sub-hierarchy, else false.
638  */
static bool
is_downstream_to_pci_bridge(struct device *dev, struct device *bridge)
641 {
642 	struct pci_dev *pdev, *pbridge;
643 
644 	if (!dev_is_pci(dev) || !dev_is_pci(bridge))
645 		return false;
646 
647 	pdev = to_pci_dev(dev);
648 	pbridge = to_pci_dev(bridge);
649 
650 	if (pbridge->subordinate &&
651 	    pbridge->subordinate->number <= pdev->bus->number &&
652 	    pbridge->subordinate->busn_res.end >= pdev->bus->number)
653 		return true;
654 
655 	return false;
656 }
657 
static bool quirk_ioat_snb_local_iommu(struct pci_dev *pdev)
659 {
660 	struct dmar_drhd_unit *drhd;
661 	u32 vtbar;
662 	int rc;
663 
664 	/* We know that this device on this chipset has its own IOMMU.
665 	 * If we find it under a different IOMMU, then the BIOS is lying
666 	 * to us. Hope that the IOMMU for this device is actually
667 	 * disabled, and it needs no translation...
668 	 */
669 	rc = pci_bus_read_config_dword(pdev->bus, PCI_DEVFN(0, 0), 0xb0, &vtbar);
670 	if (rc) {
671 		/* "can't" happen */
672 		dev_info(&pdev->dev, "failed to run vt-d quirk\n");
673 		return false;
674 	}
675 	vtbar &= 0xffff0000;
676 
	/* we know that this iommu should be at offset 0xa000 from vtbar */
678 	drhd = dmar_find_matched_drhd_unit(pdev);
679 	if (!drhd || drhd->reg_base_addr - vtbar != 0xa000) {
680 		pr_warn_once(FW_BUG "BIOS assigned incorrect VT-d unit for Intel(R) QuickData Technology device\n");
681 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
682 		return true;
683 	}
684 
685 	return false;
686 }
687 
static bool iommu_is_dummy(struct intel_iommu *iommu, struct device *dev)
689 {
690 	if (!iommu || iommu->drhd->ignored)
691 		return true;
692 
693 	if (dev_is_pci(dev)) {
694 		struct pci_dev *pdev = to_pci_dev(dev);
695 
696 		if (pdev->vendor == PCI_VENDOR_ID_INTEL &&
697 		    pdev->device == PCI_DEVICE_ID_INTEL_IOAT_SNB &&
698 		    quirk_ioat_snb_local_iommu(pdev))
699 			return true;
700 	}
701 
702 	return false;
703 }
704 
struct intel_iommu *device_to_iommu(struct device *dev, u8 *bus, u8 *devfn)
706 {
707 	struct dmar_drhd_unit *drhd = NULL;
708 	struct pci_dev *pdev = NULL;
709 	struct intel_iommu *iommu;
710 	struct device *tmp;
711 	u16 segment = 0;
712 	int i;
713 
714 	if (!dev)
715 		return NULL;
716 
717 	if (dev_is_pci(dev)) {
718 		struct pci_dev *pf_pdev;
719 
720 		pdev = pci_real_dma_dev(to_pci_dev(dev));
721 
722 		/* VFs aren't listed in scope tables; we need to look up
723 		 * the PF instead to find the IOMMU. */
724 		pf_pdev = pci_physfn(pdev);
725 		dev = &pf_pdev->dev;
726 		segment = pci_domain_nr(pdev->bus);
727 	} else if (has_acpi_companion(dev))
728 		dev = &ACPI_COMPANION(dev)->dev;
729 
730 	rcu_read_lock();
731 	for_each_iommu(iommu, drhd) {
732 		if (pdev && segment != drhd->segment)
733 			continue;
734 
735 		for_each_active_dev_scope(drhd->devices,
736 					  drhd->devices_cnt, i, tmp) {
737 			if (tmp == dev) {
738 				/* For a VF use its original BDF# not that of the PF
739 				 * which we used for the IOMMU lookup. Strictly speaking
740 				 * we could do this for all PCI devices; we only need to
741 				 * get the BDF# from the scope table for ACPI matches. */
742 				if (pdev && pdev->is_virtfn)
743 					goto got_pdev;
744 
745 				if (bus && devfn) {
746 					*bus = drhd->devices[i].bus;
747 					*devfn = drhd->devices[i].devfn;
748 				}
749 				goto out;
750 			}
751 
752 			if (is_downstream_to_pci_bridge(dev, tmp))
753 				goto got_pdev;
754 		}
755 
756 		if (pdev && drhd->include_all) {
757 got_pdev:
758 			if (bus && devfn) {
759 				*bus = pdev->bus->number;
760 				*devfn = pdev->devfn;
761 			}
762 			goto out;
763 		}
764 	}
765 	iommu = NULL;
766 out:
767 	if (iommu_is_dummy(iommu, dev))
768 		iommu = NULL;
769 
770 	rcu_read_unlock();
771 
772 	return iommu;
773 }
774 
static void domain_flush_cache(struct dmar_domain *domain,
			       void *addr, int size)
{
	if (!domain->iommu_coherency)
		clflush_cache_range(addr, size);
}
781 
static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
783 {
784 	struct context_entry *context;
785 	int ret = 0;
786 
787 	spin_lock(&iommu->lock);
788 	context = iommu_context_addr(iommu, bus, devfn, 0);
789 	if (context)
790 		ret = context_present(context);
791 	spin_unlock(&iommu->lock);
792 	return ret;
793 }
794 
static void free_context_table(struct intel_iommu *iommu)
796 {
797 	struct context_entry *context;
798 	int i;
799 
800 	if (!iommu->root_entry)
801 		return;
802 
803 	for (i = 0; i < ROOT_ENTRY_NR; i++) {
804 		context = iommu_context_addr(iommu, i, 0, 0);
805 		if (context)
806 			free_pgtable_page(context);
807 
808 		if (!sm_supported(iommu))
809 			continue;
810 
811 		context = iommu_context_addr(iommu, i, 0x80, 0);
812 		if (context)
813 			free_pgtable_page(context);
814 	}
815 
816 	free_pgtable_page(iommu->root_entry);
817 	iommu->root_entry = NULL;
818 }
819 
820 #ifdef CONFIG_DMAR_DEBUG
static void pgtable_walk(struct intel_iommu *iommu, unsigned long pfn,
			 u8 bus, u8 devfn, struct dma_pte *parent, int level)
823 {
824 	struct dma_pte *pte;
825 	int offset;
826 
827 	while (1) {
828 		offset = pfn_level_offset(pfn, level);
829 		pte = &parent[offset];
830 		if (!pte || (dma_pte_superpage(pte) || !dma_pte_present(pte))) {
831 			pr_info("PTE not present at level %d\n", level);
832 			break;
833 		}
834 
835 		pr_info("pte level: %d, pte value: 0x%016llx\n", level, pte->val);
836 
837 		if (level == 1)
838 			break;
839 
840 		parent = phys_to_virt(dma_pte_addr(pte));
841 		level--;
842 	}
843 }
844 
void dmar_fault_dump_ptes(struct intel_iommu *iommu, u16 source_id,
			  unsigned long long addr, u32 pasid)
847 {
848 	struct pasid_dir_entry *dir, *pde;
849 	struct pasid_entry *entries, *pte;
850 	struct context_entry *ctx_entry;
851 	struct root_entry *rt_entry;
852 	int i, dir_index, index, level;
853 	u8 devfn = source_id & 0xff;
854 	u8 bus = source_id >> 8;
855 	struct dma_pte *pgtable;
856 
857 	pr_info("Dump %s table entries for IOVA 0x%llx\n", iommu->name, addr);
858 
859 	/* root entry dump */
860 	rt_entry = &iommu->root_entry[bus];
861 	if (!rt_entry) {
862 		pr_info("root table entry is not present\n");
863 		return;
864 	}
865 
866 	if (sm_supported(iommu))
867 		pr_info("scalable mode root entry: hi 0x%016llx, low 0x%016llx\n",
868 			rt_entry->hi, rt_entry->lo);
869 	else
		pr_info("root entry: 0x%016llx\n", rt_entry->lo);
871 
872 	/* context entry dump */
873 	ctx_entry = iommu_context_addr(iommu, bus, devfn, 0);
874 	if (!ctx_entry) {
875 		pr_info("context table entry is not present\n");
876 		return;
877 	}
878 
879 	pr_info("context entry: hi 0x%016llx, low 0x%016llx\n",
880 		ctx_entry->hi, ctx_entry->lo);
881 
882 	/* legacy mode does not require PASID entries */
883 	if (!sm_supported(iommu)) {
884 		level = agaw_to_level(ctx_entry->hi & 7);
885 		pgtable = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
886 		goto pgtable_walk;
887 	}
888 
889 	/* get the pointer to pasid directory entry */
890 	dir = phys_to_virt(ctx_entry->lo & VTD_PAGE_MASK);
891 	if (!dir) {
892 		pr_info("pasid directory entry is not present\n");
893 		return;
894 	}
895 	/* For request-without-pasid, get the pasid from context entry */
896 	if (intel_iommu_sm && pasid == INVALID_IOASID)
897 		pasid = PASID_RID2PASID;
898 
899 	dir_index = pasid >> PASID_PDE_SHIFT;
900 	pde = &dir[dir_index];
901 	pr_info("pasid dir entry: 0x%016llx\n", pde->val);
902 
903 	/* get the pointer to the pasid table entry */
904 	entries = get_pasid_table_from_pde(pde);
905 	if (!entries) {
906 		pr_info("pasid table entry is not present\n");
907 		return;
908 	}
909 	index = pasid & PASID_PTE_MASK;
910 	pte = &entries[index];
911 	for (i = 0; i < ARRAY_SIZE(pte->val); i++)
912 		pr_info("pasid table entry[%d]: 0x%016llx\n", i, pte->val[i]);
913 
914 	if (pasid_pte_get_pgtt(pte) == PASID_ENTRY_PGTT_FL_ONLY) {
915 		level = pte->val[2] & BIT_ULL(2) ? 5 : 4;
916 		pgtable = phys_to_virt(pte->val[2] & VTD_PAGE_MASK);
917 	} else {
918 		level = agaw_to_level((pte->val[0] >> 2) & 0x7);
919 		pgtable = phys_to_virt(pte->val[0] & VTD_PAGE_MASK);
920 	}
921 
922 pgtable_walk:
923 	pgtable_walk(iommu, addr >> VTD_PAGE_SHIFT, bus, devfn, pgtable, level);
924 }
925 #endif
926 
static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
				      unsigned long pfn, int *target_level)
929 {
930 	struct dma_pte *parent, *pte;
931 	int level = agaw_to_level(domain->agaw);
932 	int offset;
933 
934 	BUG_ON(!domain->pgd);
935 
936 	if (!domain_pfn_supported(domain, pfn))
937 		/* Address beyond IOMMU's addressing capabilities. */
938 		return NULL;
939 
940 	parent = domain->pgd;
941 
942 	while (1) {
943 		void *tmp_page;
944 
945 		offset = pfn_level_offset(pfn, level);
946 		pte = &parent[offset];
947 		if (!*target_level && (dma_pte_superpage(pte) || !dma_pte_present(pte)))
948 			break;
949 		if (level == *target_level)
950 			break;
951 
952 		if (!dma_pte_present(pte)) {
953 			uint64_t pteval;
954 
955 			tmp_page = alloc_pgtable_page(domain->nid);
956 
957 			if (!tmp_page)
958 				return NULL;
959 
960 			domain_flush_cache(domain, tmp_page, VTD_PAGE_SIZE);
961 			pteval = ((uint64_t)virt_to_dma_pfn(tmp_page) << VTD_PAGE_SHIFT) | DMA_PTE_READ | DMA_PTE_WRITE;
962 			if (domain_use_first_level(domain))
963 				pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
964 
965 			if (cmpxchg64(&pte->val, 0ULL, pteval))
966 				/* Someone else set it while we were thinking; use theirs. */
967 				free_pgtable_page(tmp_page);
968 			else
969 				domain_flush_cache(domain, pte, sizeof(*pte));
970 		}
971 		if (level == 1)
972 			break;
973 
974 		parent = phys_to_virt(dma_pte_addr(pte));
975 		level--;
976 	}
977 
978 	if (!*target_level)
979 		*target_level = level;
980 
981 	return pte;
982 }
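
/*
 * Illustrative walk for a 4-level table (agaw 2): pfn bits [35:27] index
 * level 4, [26:18] level 3, [17:9] level 2 and [8:0] level 1. A caller that
 * passes *target_level == 2 stops one level above the 4KiB leaves, e.g. to
 * install a 2MiB superpage PTE there.
 */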
983 
984 /* return address's pte at specific level */
static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain,
					 unsigned long pfn,
					 int level, int *large_page)
988 {
989 	struct dma_pte *parent, *pte;
990 	int total = agaw_to_level(domain->agaw);
991 	int offset;
992 
993 	parent = domain->pgd;
994 	while (level <= total) {
995 		offset = pfn_level_offset(pfn, total);
996 		pte = &parent[offset];
997 		if (level == total)
998 			return pte;
999 
1000 		if (!dma_pte_present(pte)) {
1001 			*large_page = total;
1002 			break;
1003 		}
1004 
1005 		if (dma_pte_superpage(pte)) {
1006 			*large_page = total;
1007 			return pte;
1008 		}
1009 
1010 		parent = phys_to_virt(dma_pte_addr(pte));
1011 		total--;
1012 	}
1013 	return NULL;
1014 }
1015 
1016 /* clear last level pte, a tlb flush should be followed */
static void dma_pte_clear_range(struct dmar_domain *domain,
				unsigned long start_pfn,
				unsigned long last_pfn)
1020 {
1021 	unsigned int large_page;
1022 	struct dma_pte *first_pte, *pte;
1023 
1024 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1025 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1026 	BUG_ON(start_pfn > last_pfn);
1027 
1028 	/* we don't need lock here; nobody else touches the iova range */
1029 	do {
1030 		large_page = 1;
1031 		first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page);
1032 		if (!pte) {
1033 			start_pfn = align_to_level(start_pfn + 1, large_page + 1);
1034 			continue;
1035 		}
1036 		do {
1037 			dma_clear_pte(pte);
1038 			start_pfn += lvl_to_nr_pages(large_page);
1039 			pte++;
1040 		} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
1041 
1042 		domain_flush_cache(domain, first_pte,
1043 				   (void *)pte - (void *)first_pte);
1044 
1045 	} while (start_pfn && start_pfn <= last_pfn);
1046 }
1047 
static void dma_pte_free_level(struct dmar_domain *domain, int level,
			       int retain_level, struct dma_pte *pte,
			       unsigned long pfn, unsigned long start_pfn,
			       unsigned long last_pfn)
1052 {
1053 	pfn = max(start_pfn, pfn);
1054 	pte = &pte[pfn_level_offset(pfn, level)];
1055 
1056 	do {
1057 		unsigned long level_pfn;
1058 		struct dma_pte *level_pte;
1059 
1060 		if (!dma_pte_present(pte) || dma_pte_superpage(pte))
1061 			goto next;
1062 
1063 		level_pfn = pfn & level_mask(level);
1064 		level_pte = phys_to_virt(dma_pte_addr(pte));
1065 
1066 		if (level > 2) {
1067 			dma_pte_free_level(domain, level - 1, retain_level,
1068 					   level_pte, level_pfn, start_pfn,
1069 					   last_pfn);
1070 		}
1071 
1072 		/*
1073 		 * Free the page table if we're below the level we want to
1074 		 * retain and the range covers the entire table.
1075 		 */
1076 		if (level < retain_level && !(start_pfn > level_pfn ||
1077 		      last_pfn < level_pfn + level_size(level) - 1)) {
1078 			dma_clear_pte(pte);
1079 			domain_flush_cache(domain, pte, sizeof(*pte));
1080 			free_pgtable_page(level_pte);
1081 		}
1082 next:
1083 		pfn += level_size(level);
1084 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1085 }
1086 
1087 /*
1088  * clear last level (leaf) ptes and free page table pages below the
1089  * level we wish to keep intact.
1090  */
static void dma_pte_free_pagetable(struct dmar_domain *domain,
				   unsigned long start_pfn,
				   unsigned long last_pfn,
				   int retain_level)
1095 {
1096 	dma_pte_clear_range(domain, start_pfn, last_pfn);
1097 
1098 	/* We don't need lock here; nobody else touches the iova range */
1099 	dma_pte_free_level(domain, agaw_to_level(domain->agaw), retain_level,
1100 			   domain->pgd, 0, start_pfn, last_pfn);
1101 
1102 	/* free pgd */
1103 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1104 		free_pgtable_page(domain->pgd);
1105 		domain->pgd = NULL;
1106 	}
1107 }
1108 
1109 /* When a page at a given level is being unlinked from its parent, we don't
1110    need to *modify* it at all. All we need to do is make a list of all the
1111    pages which can be freed just as soon as we've flushed the IOTLB and we
1112    know the hardware page-walk will no longer touch them.
1113    The 'pte' argument is the *parent* PTE, pointing to the page that is to
1114    be freed. */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
				    int level, struct dma_pte *pte,
				    struct list_head *freelist)
1118 {
1119 	struct page *pg;
1120 
1121 	pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
1122 	list_add_tail(&pg->lru, freelist);
1123 
1124 	if (level == 1)
1125 		return;
1126 
1127 	pte = page_address(pg);
1128 	do {
1129 		if (dma_pte_present(pte) && !dma_pte_superpage(pte))
1130 			dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1131 		pte++;
1132 	} while (!first_pte_in_page(pte));
1133 }
1134 
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
				struct dma_pte *pte, unsigned long pfn,
				unsigned long start_pfn, unsigned long last_pfn,
				struct list_head *freelist)
1139 {
1140 	struct dma_pte *first_pte = NULL, *last_pte = NULL;
1141 
1142 	pfn = max(start_pfn, pfn);
1143 	pte = &pte[pfn_level_offset(pfn, level)];
1144 
1145 	do {
1146 		unsigned long level_pfn = pfn & level_mask(level);
1147 
1148 		if (!dma_pte_present(pte))
1149 			goto next;
1150 
1151 		/* If range covers entire pagetable, free it */
1152 		if (start_pfn <= level_pfn &&
1153 		    last_pfn >= level_pfn + level_size(level) - 1) {
			/* These subordinate page tables are going away entirely. Don't
			   bother to clear them; we're just going to *free* them. */
1156 			if (level > 1 && !dma_pte_superpage(pte))
1157 				dma_pte_list_pagetables(domain, level - 1, pte, freelist);
1158 
1159 			dma_clear_pte(pte);
1160 			if (!first_pte)
1161 				first_pte = pte;
1162 			last_pte = pte;
1163 		} else if (level > 1) {
1164 			/* Recurse down into a level that isn't *entirely* obsolete */
1165 			dma_pte_clear_level(domain, level - 1,
1166 					    phys_to_virt(dma_pte_addr(pte)),
1167 					    level_pfn, start_pfn, last_pfn,
1168 					    freelist);
1169 		}
1170 next:
1171 		pfn = level_pfn + level_size(level);
1172 	} while (!first_pte_in_page(++pte) && pfn <= last_pfn);
1173 
1174 	if (first_pte)
1175 		domain_flush_cache(domain, first_pte,
1176 				   (void *)++last_pte - (void *)first_pte);
1177 }
1178 
1179 /* We can't just free the pages because the IOMMU may still be walking
1180    the page tables, and may have cached the intermediate levels. The
1181    pages can only be freed after the IOTLB flush has been done. */
static void domain_unmap(struct dmar_domain *domain, unsigned long start_pfn,
			 unsigned long last_pfn, struct list_head *freelist)
1184 {
1185 	BUG_ON(!domain_pfn_supported(domain, start_pfn));
1186 	BUG_ON(!domain_pfn_supported(domain, last_pfn));
1187 	BUG_ON(start_pfn > last_pfn);
1188 
1189 	/* we don't need lock here; nobody else touches the iova range */
1190 	dma_pte_clear_level(domain, agaw_to_level(domain->agaw),
1191 			    domain->pgd, 0, start_pfn, last_pfn, freelist);
1192 
1193 	/* free pgd */
1194 	if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
1195 		struct page *pgd_page = virt_to_page(domain->pgd);
1196 		list_add_tail(&pgd_page->lru, freelist);
1197 		domain->pgd = NULL;
1198 	}
1199 }
1200 
1201 /* iommu handling */
static int iommu_alloc_root_entry(struct intel_iommu *iommu)
1203 {
1204 	struct root_entry *root;
1205 
1206 	root = (struct root_entry *)alloc_pgtable_page(iommu->node);
1207 	if (!root) {
1208 		pr_err("Allocating root entry for %s failed\n",
1209 			iommu->name);
1210 		return -ENOMEM;
1211 	}
1212 
1213 	__iommu_flush_cache(iommu, root, ROOT_SIZE);
1214 	iommu->root_entry = root;
1215 
1216 	return 0;
1217 }
1218 
static void iommu_set_root_entry(struct intel_iommu *iommu)
1220 {
1221 	u64 addr;
1222 	u32 sts;
1223 	unsigned long flag;
1224 
1225 	addr = virt_to_phys(iommu->root_entry);
1226 	if (sm_supported(iommu))
1227 		addr |= DMA_RTADDR_SMT;
1228 
1229 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1230 	dmar_writeq(iommu->reg + DMAR_RTADDR_REG, addr);
1231 
1232 	writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG);
1233 
1234 	/* Make sure hardware complete it */
1235 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1236 		      readl, (sts & DMA_GSTS_RTPS), sts);
1237 
1238 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1239 
1240 	/*
1241 	 * Hardware invalidates all DMA remapping hardware translation
1242 	 * caches as part of SRTP flow.
1243 	 */
1244 	if (cap_esrtps(iommu->cap))
1245 		return;
1246 
1247 	iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL);
1248 	if (sm_supported(iommu))
1249 		qi_flush_pasid_cache(iommu, 0, QI_PC_GLOBAL, 0);
1250 	iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH);
1251 }
1252 
void iommu_flush_write_buffer(struct intel_iommu *iommu)
1254 {
1255 	u32 val;
1256 	unsigned long flag;
1257 
1258 	if (!rwbf_quirk && !cap_rwbf(iommu->cap))
1259 		return;
1260 
1261 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1262 	writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG);
1263 
1264 	/* Make sure hardware complete it */
1265 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1266 		      readl, (!(val & DMA_GSTS_WBFS)), val);
1267 
1268 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1269 }
1270 
/* The return value determines if we need a write buffer flush */
static void __iommu_flush_context(struct intel_iommu *iommu,
				  u16 did, u16 source_id, u8 function_mask,
				  u64 type)
1275 {
1276 	u64 val = 0;
1277 	unsigned long flag;
1278 
1279 	switch (type) {
1280 	case DMA_CCMD_GLOBAL_INVL:
1281 		val = DMA_CCMD_GLOBAL_INVL;
1282 		break;
1283 	case DMA_CCMD_DOMAIN_INVL:
1284 		val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
1285 		break;
1286 	case DMA_CCMD_DEVICE_INVL:
1287 		val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
1288 			| DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
1289 		break;
1290 	default:
1291 		BUG();
1292 	}
1293 	val |= DMA_CCMD_ICC;
1294 
1295 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1296 	dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
1297 
1298 	/* Make sure hardware complete it */
1299 	IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
1300 		dmar_readq, (!(val & DMA_CCMD_ICC)), val);
1301 
1302 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1303 }
1304 
/* The return value determines if we need a write buffer flush */
static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
				u64 addr, unsigned int size_order, u64 type)
1308 {
1309 	int tlb_offset = ecap_iotlb_offset(iommu->ecap);
1310 	u64 val = 0, val_iva = 0;
1311 	unsigned long flag;
1312 
1313 	switch (type) {
1314 	case DMA_TLB_GLOBAL_FLUSH:
		/* global flush doesn't need to set IVA_REG */
1316 		val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
1317 		break;
1318 	case DMA_TLB_DSI_FLUSH:
1319 		val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1320 		break;
1321 	case DMA_TLB_PSI_FLUSH:
1322 		val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
1323 		/* IH bit is passed in as part of address */
1324 		val_iva = size_order | addr;
1325 		break;
1326 	default:
1327 		BUG();
1328 	}
1329 	/* Note: set drain read/write */
1330 #if 0
	/*
	 * This is probably meant to be extra safe. It looks like we can
	 * ignore it without any impact.
	 */
1335 	if (cap_read_drain(iommu->cap))
1336 		val |= DMA_TLB_READ_DRAIN;
1337 #endif
1338 	if (cap_write_drain(iommu->cap))
1339 		val |= DMA_TLB_WRITE_DRAIN;
1340 
1341 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1342 	/* Note: Only uses first TLB reg currently */
1343 	if (val_iva)
1344 		dmar_writeq(iommu->reg + tlb_offset, val_iva);
1345 	dmar_writeq(iommu->reg + tlb_offset + 8, val);
1346 
1347 	/* Make sure hardware complete it */
1348 	IOMMU_WAIT_OP(iommu, tlb_offset + 8,
1349 		dmar_readq, (!(val & DMA_TLB_IVT)), val);
1350 
1351 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1352 
1353 	/* check IOTLB invalidation granularity */
1354 	if (DMA_TLB_IAIG(val) == 0)
1355 		pr_err("Flush IOTLB failed\n");
1356 	if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
1357 		pr_debug("TLB flush request %Lx, actual %Lx\n",
1358 			(unsigned long long)DMA_TLB_IIRG(type),
1359 			(unsigned long long)DMA_TLB_IAIG(val));
1360 }
1361 
static struct device_domain_info *
domain_lookup_dev_info(struct dmar_domain *domain,
		       struct intel_iommu *iommu, u8 bus, u8 devfn)
1365 {
1366 	struct device_domain_info *info;
1367 	unsigned long flags;
1368 
1369 	spin_lock_irqsave(&domain->lock, flags);
1370 	list_for_each_entry(info, &domain->devices, link) {
1371 		if (info->iommu == iommu && info->bus == bus &&
1372 		    info->devfn == devfn) {
1373 			spin_unlock_irqrestore(&domain->lock, flags);
1374 			return info;
1375 		}
1376 	}
1377 	spin_unlock_irqrestore(&domain->lock, flags);
1378 
1379 	return NULL;
1380 }
1381 
static void domain_update_iotlb(struct dmar_domain *domain)
1383 {
1384 	struct device_domain_info *info;
1385 	bool has_iotlb_device = false;
1386 	unsigned long flags;
1387 
1388 	spin_lock_irqsave(&domain->lock, flags);
1389 	list_for_each_entry(info, &domain->devices, link) {
1390 		if (info->ats_enabled) {
1391 			has_iotlb_device = true;
1392 			break;
1393 		}
1394 	}
1395 	domain->has_iotlb_device = has_iotlb_device;
1396 	spin_unlock_irqrestore(&domain->lock, flags);
1397 }
1398 
1399 /*
1400  * The extra devTLB flush quirk impacts those QAT devices with PCI device
1401  * IDs ranging from 0x4940 to 0x4943. It is exempted from risky_device()
1402  * check because it applies only to the built-in QAT devices and it doesn't
1403  * grant additional privileges.
1404  */
1405 #define BUGGY_QAT_DEVID_MASK 0x4940
static bool dev_needs_extra_dtlb_flush(struct pci_dev *pdev)
1407 {
1408 	if (pdev->vendor != PCI_VENDOR_ID_INTEL)
1409 		return false;
1410 
1411 	if ((pdev->device & 0xfffc) != BUGGY_QAT_DEVID_MASK)
1412 		return false;
1413 
1414 	return true;
1415 }
1416 
static void iommu_enable_pci_caps(struct device_domain_info *info)
1418 {
1419 	struct pci_dev *pdev;
1420 
1421 	if (!info || !dev_is_pci(info->dev))
1422 		return;
1423 
1424 	pdev = to_pci_dev(info->dev);
1425 	/* For IOMMU that supports device IOTLB throttling (DIT), we assign
1426 	 * PFSID to the invalidation desc of a VF such that IOMMU HW can gauge
1427 	 * queue depth at PF level. If DIT is not set, PFSID will be treated as
1428 	 * reserved, which should be set to 0.
1429 	 */
1430 	if (!ecap_dit(info->iommu->ecap))
1431 		info->pfsid = 0;
1432 	else {
1433 		struct pci_dev *pf_pdev;
1434 
1435 		/* pdev will be returned if device is not a vf */
1436 		pf_pdev = pci_physfn(pdev);
1437 		info->pfsid = pci_dev_id(pf_pdev);
1438 	}
1439 
1440 	/* The PCIe spec, in its wisdom, declares that the behaviour of
1441 	   the device if you enable PASID support after ATS support is
1442 	   undefined. So always enable PASID support on devices which
1443 	   have it, even if we can't yet know if we're ever going to
1444 	   use it. */
1445 	if (info->pasid_supported && !pci_enable_pasid(pdev, info->pasid_supported & ~1))
1446 		info->pasid_enabled = 1;
1447 
1448 	if (info->pri_supported &&
1449 	    (info->pasid_enabled ? pci_prg_resp_pasid_required(pdev) : 1)  &&
1450 	    !pci_reset_pri(pdev) && !pci_enable_pri(pdev, PRQ_DEPTH))
1451 		info->pri_enabled = 1;
1452 
1453 	if (info->ats_supported && pci_ats_page_aligned(pdev) &&
1454 	    !pci_enable_ats(pdev, VTD_PAGE_SHIFT)) {
1455 		info->ats_enabled = 1;
1456 		domain_update_iotlb(info->domain);
1457 		info->ats_qdep = pci_ats_queue_depth(pdev);
1458 	}
1459 }
1460 
static void iommu_disable_dev_iotlb(struct device_domain_info *info)
1462 {
1463 	struct pci_dev *pdev;
1464 
1465 	if (!dev_is_pci(info->dev))
1466 		return;
1467 
1468 	pdev = to_pci_dev(info->dev);
1469 
1470 	if (info->ats_enabled) {
1471 		pci_disable_ats(pdev);
1472 		info->ats_enabled = 0;
1473 		domain_update_iotlb(info->domain);
1474 	}
1475 
1476 	if (info->pri_enabled) {
1477 		pci_disable_pri(pdev);
1478 		info->pri_enabled = 0;
1479 	}
1480 
1481 	if (info->pasid_enabled) {
1482 		pci_disable_pasid(pdev);
1483 		info->pasid_enabled = 0;
1484 	}
1485 }
1486 
static void __iommu_flush_dev_iotlb(struct device_domain_info *info,
				    u64 addr, unsigned int mask)
1489 {
1490 	u16 sid, qdep;
1491 
1492 	if (!info || !info->ats_enabled)
1493 		return;
1494 
1495 	sid = info->bus << 8 | info->devfn;
1496 	qdep = info->ats_qdep;
1497 	qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
1498 			   qdep, addr, mask);
1499 	quirk_extra_dev_tlb_flush(info, addr, mask, PASID_RID2PASID, qdep);
1500 }
1501 
static void iommu_flush_dev_iotlb(struct dmar_domain *domain,
				  u64 addr, unsigned mask)
1504 {
1505 	struct device_domain_info *info;
1506 	unsigned long flags;
1507 
1508 	if (!domain->has_iotlb_device)
1509 		return;
1510 
1511 	spin_lock_irqsave(&domain->lock, flags);
1512 	list_for_each_entry(info, &domain->devices, link)
1513 		__iommu_flush_dev_iotlb(info, addr, mask);
1514 	spin_unlock_irqrestore(&domain->lock, flags);
1515 }
1516 
static void iommu_flush_iotlb_psi(struct intel_iommu *iommu,
				  struct dmar_domain *domain,
				  unsigned long pfn, unsigned int pages,
				  int ih, int map)
1521 {
1522 	unsigned int aligned_pages = __roundup_pow_of_two(pages);
1523 	unsigned int mask = ilog2(aligned_pages);
1524 	uint64_t addr = (uint64_t)pfn << VTD_PAGE_SHIFT;
1525 	u16 did = domain_id_iommu(domain, iommu);
1526 
1527 	BUG_ON(pages == 0);
1528 
1529 	if (ih)
1530 		ih = 1 << 6;
1531 
1532 	if (domain_use_first_level(domain)) {
1533 		qi_flush_piotlb(iommu, did, PASID_RID2PASID, addr, pages, ih);
1534 	} else {
1535 		unsigned long bitmask = aligned_pages - 1;
1536 
1537 		/*
1538 		 * PSI masks the low order bits of the base address. If the
1539 		 * address isn't aligned to the mask, then compute a mask value
1540 		 * needed to ensure the target range is flushed.
1541 		 */
1542 		if (unlikely(bitmask & pfn)) {
1543 			unsigned long end_pfn = pfn + pages - 1, shared_bits;
1544 
1545 			/*
1546 			 * Since end_pfn <= pfn + bitmask, the only way bits
1547 			 * higher than bitmask can differ in pfn and end_pfn is
1548 			 * by carrying. This means after masking out bitmask,
1549 			 * high bits starting with the first set bit in
1550 			 * shared_bits are all equal in both pfn and end_pfn.
1551 			 */
1552 			shared_bits = ~(pfn ^ end_pfn) & ~bitmask;
1553 			mask = shared_bits ? __ffs(shared_bits) : BITS_PER_LONG;
1554 		}
1555 
1556 		/*
1557 		 * Fallback to domain selective flush if no PSI support or
1558 		 * the size is too big.
1559 		 */
1560 		if (!cap_pgsel_inv(iommu->cap) ||
1561 		    mask > cap_max_amask_val(iommu->cap))
1562 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1563 							DMA_TLB_DSI_FLUSH);
1564 		else
1565 			iommu->flush.flush_iotlb(iommu, did, addr | ih, mask,
1566 							DMA_TLB_PSI_FLUSH);
1567 	}
1568 
1569 	/*
1570 	 * In caching mode, changes of pages from non-present to present require
1571 	 * flush. However, device IOTLB doesn't need to be flushed in this case.
1572 	 */
1573 	if (!cap_caching_mode(iommu->cap) || !map)
1574 		iommu_flush_dev_iotlb(domain, addr, mask);
1575 }
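
/*
 * Worked example of the unaligned-PSI fixup above (illustrative): pfn 0x101
 * and pages 4 give aligned_pages 4 and bitmask 3; end_pfn is 0x104, so
 * shared_bits == ~(0x101 ^ 0x104) & ~3 has __ffs() == 3 and the PSI uses
 * mask 3, flushing pfns 0x100-0x107, which covers the requested range.
 */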
1576 
1577 /* Notification for newly created mappings */
static inline void __mapping_notify_one(struct intel_iommu *iommu,
					struct dmar_domain *domain,
					unsigned long pfn, unsigned int pages)
1581 {
1582 	/*
1583 	 * It's a non-present to present mapping. Only flush if caching mode
1584 	 * and second level.
1585 	 */
1586 	if (cap_caching_mode(iommu->cap) && !domain_use_first_level(domain))
1587 		iommu_flush_iotlb_psi(iommu, domain, pfn, pages, 0, 1);
1588 	else
1589 		iommu_flush_write_buffer(iommu);
1590 }
1591 
static void intel_flush_iotlb_all(struct iommu_domain *domain)
1593 {
1594 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
1595 	struct iommu_domain_info *info;
1596 	unsigned long idx;
1597 
1598 	xa_for_each(&dmar_domain->iommu_array, idx, info) {
1599 		struct intel_iommu *iommu = info->iommu;
1600 		u16 did = domain_id_iommu(dmar_domain, iommu);
1601 
1602 		if (domain_use_first_level(dmar_domain))
1603 			qi_flush_piotlb(iommu, did, PASID_RID2PASID, 0, -1, 0);
1604 		else
1605 			iommu->flush.flush_iotlb(iommu, did, 0, 0,
1606 						 DMA_TLB_DSI_FLUSH);
1607 
1608 		if (!cap_caching_mode(iommu->cap))
1609 			iommu_flush_dev_iotlb(dmar_domain, 0, MAX_AGAW_PFN_WIDTH);
1610 	}
1611 }
1612 
static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
1614 {
1615 	u32 pmen;
1616 	unsigned long flags;
1617 
1618 	if (!cap_plmr(iommu->cap) && !cap_phmr(iommu->cap))
1619 		return;
1620 
1621 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1622 	pmen = readl(iommu->reg + DMAR_PMEN_REG);
1623 	pmen &= ~DMA_PMEN_EPM;
1624 	writel(pmen, iommu->reg + DMAR_PMEN_REG);
1625 
1626 	/* wait for the protected region status bit to clear */
1627 	IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
1628 		readl, !(pmen & DMA_PMEN_PRS), pmen);
1629 
1630 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1631 }
1632 
static void iommu_enable_translation(struct intel_iommu *iommu)
1634 {
1635 	u32 sts;
1636 	unsigned long flags;
1637 
1638 	raw_spin_lock_irqsave(&iommu->register_lock, flags);
1639 	iommu->gcmd |= DMA_GCMD_TE;
1640 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1641 
1642 	/* Make sure hardware complete it */
1643 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1644 		      readl, (sts & DMA_GSTS_TES), sts);
1645 
1646 	raw_spin_unlock_irqrestore(&iommu->register_lock, flags);
1647 }
1648 
static void iommu_disable_translation(struct intel_iommu *iommu)
1650 {
1651 	u32 sts;
1652 	unsigned long flag;
1653 
1654 	if (iommu_skip_te_disable && iommu->drhd->gfx_dedicated &&
1655 	    (cap_read_drain(iommu->cap) || cap_write_drain(iommu->cap)))
1656 		return;
1657 
1658 	raw_spin_lock_irqsave(&iommu->register_lock, flag);
1659 	iommu->gcmd &= ~DMA_GCMD_TE;
1660 	writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
1661 
1662 	/* Make sure hardware complete it */
1663 	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1664 		      readl, (!(sts & DMA_GSTS_TES)), sts);
1665 
1666 	raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
1667 }
1668 
static int iommu_init_domains(struct intel_iommu *iommu)
1670 {
1671 	u32 ndomains;
1672 
1673 	ndomains = cap_ndoms(iommu->cap);
1674 	pr_debug("%s: Number of Domains supported <%d>\n",
1675 		 iommu->name, ndomains);
1676 
1677 	spin_lock_init(&iommu->lock);
1678 
1679 	iommu->domain_ids = bitmap_zalloc(ndomains, GFP_KERNEL);
1680 	if (!iommu->domain_ids)
1681 		return -ENOMEM;
1682 
1683 	/*
1684 	 * If Caching mode is set, then invalid translations are tagged
1685 	 * with domain-id 0, hence we need to pre-allocate it. We also
1686 	 * use domain-id 0 as a marker for non-allocated domain-id, so
1687 	 * make sure it is not used for a real domain.
1688 	 */
1689 	set_bit(0, iommu->domain_ids);
1690 
1691 	/*
1692 	 * VT-d spec rev 3.0 (section 6.2.3.1) requires that each PASID
1693 	 * entry for first-level or pass-through translation modes should
1694 	 * be programmed with a domain id different from those used for
1695 	 * second-level or nested translation. We reserve a domain id for
1696 	 * this purpose.
1697 	 */
1698 	if (sm_supported(iommu))
1699 		set_bit(FLPT_DEFAULT_DID, iommu->domain_ids);
1700 
1701 	return 0;
1702 }
1703 
1704 static void disable_dmar_iommu(struct intel_iommu *iommu)
1705 {
1706 	if (!iommu->domain_ids)
1707 		return;
1708 
1709 	/*
1710 	 * All iommu domains must have been detached from the devices,
1711 	 * hence there should be no domain IDs in use.
1712 	 */
1713 	if (WARN_ON(bitmap_weight(iommu->domain_ids, cap_ndoms(iommu->cap))
1714 		    > NUM_RESERVED_DID))
1715 		return;
1716 
1717 	if (iommu->gcmd & DMA_GCMD_TE)
1718 		iommu_disable_translation(iommu);
1719 }
1720 
1721 static void free_dmar_iommu(struct intel_iommu *iommu)
1722 {
1723 	if (iommu->domain_ids) {
1724 		bitmap_free(iommu->domain_ids);
1725 		iommu->domain_ids = NULL;
1726 	}
1727 
1728 	if (iommu->copied_tables) {
1729 		bitmap_free(iommu->copied_tables);
1730 		iommu->copied_tables = NULL;
1731 	}
1732 
1733 	/* free context mapping */
1734 	free_context_table(iommu);
1735 
1736 #ifdef CONFIG_INTEL_IOMMU_SVM
1737 	if (pasid_supported(iommu)) {
1738 		if (ecap_prs(iommu->ecap))
1739 			intel_svm_finish_prq(iommu);
1740 	}
1741 	if (vccap_pasid(iommu->vccap))
1742 		ioasid_unregister_allocator(&iommu->pasid_allocator);
1743 
1744 #endif
1745 }
1746 
1747 /*
1748  * Check and return whether first level is used by default for
1749  * DMA translation.
1750  */
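/*
 * A quick summary of the decision below, derived from the checks in
 * first_level_by_default() itself (not from the VT-d spec directly):
 *
 *   scalable mode off                           -> second level
 *   only one of FL/SL sane across all IOMMUs    -> whichever one is sane
 *   both sane, type == IOMMU_DOMAIN_UNMANAGED   -> second level
 *   both sane, any other domain type            -> first level
 */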
1751 static bool first_level_by_default(unsigned int type)
1752 {
1753 	/* Only SL is available in legacy mode */
1754 	if (!scalable_mode_support())
1755 		return false;
1756 
1757 	/* Only one level (either FL or SL) is available, just use it */
1758 	if (intel_cap_flts_sanity() ^ intel_cap_slts_sanity())
1759 		return intel_cap_flts_sanity();
1760 
1761 	/* Both levels are available, decide it based on domain type */
1762 	return type != IOMMU_DOMAIN_UNMANAGED;
1763 }
1764 
1765 static struct dmar_domain *alloc_domain(unsigned int type)
1766 {
1767 	struct dmar_domain *domain;
1768 
1769 	domain = kzalloc(sizeof(*domain), GFP_KERNEL);
1770 	if (!domain)
1771 		return NULL;
1772 
1773 	domain->nid = NUMA_NO_NODE;
1774 	if (first_level_by_default(type))
1775 		domain->flags |= DOMAIN_FLAG_USE_FIRST_LEVEL;
1776 	domain->has_iotlb_device = false;
1777 	INIT_LIST_HEAD(&domain->devices);
1778 	spin_lock_init(&domain->lock);
1779 	xa_init(&domain->iommu_array);
1780 
1781 	return domain;
1782 }
1783 
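/*
 * Bind a domain to an IOMMU unit. If the domain is already attached to
 * this IOMMU, only the per-IOMMU refcount is bumped. Otherwise a free
 * domain ID is allocated from iommu->domain_ids and published in
 * domain->iommu_array via xa_cmpxchg(); if another thread raced us and
 * published an entry for the same IOMMU first, the allocation is rolled
 * back and an error is returned.
 */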
1784 static int domain_attach_iommu(struct dmar_domain *domain,
1785 			       struct intel_iommu *iommu)
1786 {
1787 	struct iommu_domain_info *info, *curr;
1788 	unsigned long ndomains;
1789 	int num, ret = -ENOSPC;
1790 
1791 	info = kzalloc(sizeof(*info), GFP_KERNEL);
1792 	if (!info)
1793 		return -ENOMEM;
1794 
1795 	spin_lock(&iommu->lock);
1796 	curr = xa_load(&domain->iommu_array, iommu->seq_id);
1797 	if (curr) {
1798 		curr->refcnt++;
1799 		spin_unlock(&iommu->lock);
1800 		kfree(info);
1801 		return 0;
1802 	}
1803 
1804 	ndomains = cap_ndoms(iommu->cap);
1805 	num = find_first_zero_bit(iommu->domain_ids, ndomains);
1806 	if (num >= ndomains) {
1807 		pr_err("%s: No free domain ids\n", iommu->name);
1808 		goto err_unlock;
1809 	}
1810 
1811 	set_bit(num, iommu->domain_ids);
1812 	info->refcnt	= 1;
1813 	info->did	= num;
1814 	info->iommu	= iommu;
1815 	curr = xa_cmpxchg(&domain->iommu_array, iommu->seq_id,
1816 			  NULL, info, GFP_ATOMIC);
1817 	if (curr) {
1818 		ret = xa_err(curr) ? : -EBUSY;
1819 		goto err_clear;
1820 	}
1821 	domain_update_iommu_cap(domain);
1822 
1823 	spin_unlock(&iommu->lock);
1824 	return 0;
1825 
1826 err_clear:
1827 	clear_bit(info->did, iommu->domain_ids);
1828 err_unlock:
1829 	spin_unlock(&iommu->lock);
1830 	kfree(info);
1831 	return ret;
1832 }
1833 
1834 static void domain_detach_iommu(struct dmar_domain *domain,
1835 				struct intel_iommu *iommu)
1836 {
1837 	struct iommu_domain_info *info;
1838 
1839 	spin_lock(&iommu->lock);
1840 	info = xa_load(&domain->iommu_array, iommu->seq_id);
1841 	if (--info->refcnt == 0) {
1842 		clear_bit(info->did, iommu->domain_ids);
1843 		xa_erase(&domain->iommu_array, iommu->seq_id);
1844 		domain->nid = NUMA_NO_NODE;
1845 		domain_update_iommu_cap(domain);
1846 		kfree(info);
1847 	}
1848 	spin_unlock(&iommu->lock);
1849 }
1850 
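/*
 * Worked examples for the rounding below (12 bits for a 4KiB page plus
 * 9 bits per page-table level):
 *
 *   gaw = 48: r = (48 - 12) % 9 = 0 -> agaw = 48
 *   gaw = 36: r = (36 - 12) % 9 = 6 -> agaw = 36 + 9 - 6 = 39
 *
 * i.e. the guest address width is rounded up to the next width that a
 * whole number of page-table levels can cover, capped at 64 bits.
 */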
1851 static inline int guestwidth_to_adjustwidth(int gaw)
1852 {
1853 	int agaw;
1854 	int r = (gaw - 12) % 9;
1855 
1856 	if (r == 0)
1857 		agaw = gaw;
1858 	else
1859 		agaw = gaw + 9 - r;
1860 	if (agaw > 64)
1861 		agaw = 64;
1862 	return agaw;
1863 }
1864 
1865 static void domain_exit(struct dmar_domain *domain)
1866 {
1867 	if (domain->pgd) {
1868 		LIST_HEAD(freelist);
1869 
1870 		domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
1871 		put_pages_list(&freelist);
1872 	}
1873 
1874 	if (WARN_ON(!list_empty(&domain->devices)))
1875 		return;
1876 
1877 	kfree(domain);
1878 }
1879 
1880 /*
1881  * Get the PASID directory size for scalable mode context entry.
1882  * Value of X in the PDTS field of a scalable mode context entry
1883  * indicates a PASID directory with 2^(X + 7) entries.
1884  */
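/*
 * Illustrative example (assuming PASID_PDE_SHIFT is 6, i.e. 64 PASID
 * table entries per directory entry): with table->max_pasid = 0x100000
 * (a full 20-bit PASID space), max_pde = 0x4000, whose first set bit is
 * bit 14, so pds = 14 - 7 = 7 and the directory holds 2^(7 + 7) = 16384
 * entries -- exactly 0x100000 / 64.
 */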
1885 static inline unsigned long context_get_sm_pds(struct pasid_table *table)
1886 {
1887 	unsigned long pds, max_pde;
1888 
1889 	max_pde = table->max_pasid >> PASID_PDE_SHIFT;
1890 	pds = find_first_bit(&max_pde, MAX_NR_PASID_BITS);
1891 	if (pds < 7)
1892 		return 0;
1893 
1894 	return pds - 7;
1895 }
1896 
1897 /*
1898  * Set the RID_PASID field of a scalable mode context entry. The
1899  * IOMMU hardware will use the PASID value set in this field for
1900  * DMA translations of DMA requests without PASID.
1901  */
1902 static inline void
1903 context_set_sm_rid2pasid(struct context_entry *context, unsigned long pasid)
1904 {
1905 	context->hi |= pasid & ((1 << 20) - 1);
1906 }
1907 
1908 /*
1909  * Set the DTE(Device-TLB Enable) field of a scalable mode context
1910  * entry.
1911  */
1912 static inline void context_set_sm_dte(struct context_entry *context)
1913 {
1914 	context->lo |= (1 << 2);
1915 }
1916 
1917 /*
1918  * Set the PRE(Page Request Enable) field of a scalable mode context
1919  * entry.
1920  */
1921 static inline void context_set_sm_pre(struct context_entry *context)
1922 {
1923 	context->lo |= (1 << 4);
1924 }
1925 
1926 /* Convert value to context PASID directory size field coding. */
1927 #define context_pdts(pds)	(((pds) & 0x7) << 9)
1928 
1929 static int domain_context_mapping_one(struct dmar_domain *domain,
1930 				      struct intel_iommu *iommu,
1931 				      struct pasid_table *table,
1932 				      u8 bus, u8 devfn)
1933 {
1934 	struct device_domain_info *info =
1935 			domain_lookup_dev_info(domain, iommu, bus, devfn);
1936 	u16 did = domain_id_iommu(domain, iommu);
1937 	int translation = CONTEXT_TT_MULTI_LEVEL;
1938 	struct context_entry *context;
1939 	int ret;
1940 
1941 	WARN_ON(did == 0);
1942 
1943 	if (hw_pass_through && domain_type_is_si(domain))
1944 		translation = CONTEXT_TT_PASS_THROUGH;
1945 
1946 	pr_debug("Set context mapping for %02x:%02x.%d\n",
1947 		bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1948 
1949 	BUG_ON(!domain->pgd);
1950 
1951 	spin_lock(&iommu->lock);
1952 	ret = -ENOMEM;
1953 	context = iommu_context_addr(iommu, bus, devfn, 1);
1954 	if (!context)
1955 		goto out_unlock;
1956 
1957 	ret = 0;
1958 	if (context_present(context) && !context_copied(iommu, bus, devfn))
1959 		goto out_unlock;
1960 
1961 	/*
1962 	 * For kdump cases, old valid entries may be cached due to the
1963 	 * in-flight DMA and copied pgtable, but there is no unmapping
1964 	 * behaviour for them, thus we need an explicit cache flush for
1965 	 * the newly-mapped device. For kdump, at this point, the device
1966 	 * is supposed to have finished reset at its driver probe stage, so no
1967 	 * in-flight DMA will exist, and we don't need to worry about it
1968 	 * any further.
1969 	 */
1970 	if (context_copied(iommu, bus, devfn)) {
1971 		u16 did_old = context_domain_id(context);
1972 
1973 		if (did_old < cap_ndoms(iommu->cap)) {
1974 			iommu->flush.flush_context(iommu, did_old,
1975 						   (((u16)bus) << 8) | devfn,
1976 						   DMA_CCMD_MASK_NOBIT,
1977 						   DMA_CCMD_DEVICE_INVL);
1978 			iommu->flush.flush_iotlb(iommu, did_old, 0, 0,
1979 						 DMA_TLB_DSI_FLUSH);
1980 		}
1981 
1982 		clear_context_copied(iommu, bus, devfn);
1983 	}
1984 
1985 	context_clear_entry(context);
1986 
1987 	if (sm_supported(iommu)) {
1988 		unsigned long pds;
1989 
1990 		WARN_ON(!table);
1991 
1992 		/* Setup the PASID DIR pointer: */
1993 		pds = context_get_sm_pds(table);
1994 		context->lo = (u64)virt_to_phys(table->table) |
1995 				context_pdts(pds);
1996 
1997 		/* Setup the RID_PASID field: */
1998 		context_set_sm_rid2pasid(context, PASID_RID2PASID);
1999 
2000 		/*
2001 		 * Setup the Device-TLB enable bit and Page request
2002 		 * Enable bit:
2003 		 */
2004 		if (info && info->ats_supported)
2005 			context_set_sm_dte(context);
2006 		if (info && info->pri_supported)
2007 			context_set_sm_pre(context);
2008 		if (info && info->pasid_supported)
2009 			context_set_pasid(context);
2010 	} else {
2011 		struct dma_pte *pgd = domain->pgd;
2012 		int agaw;
2013 
2014 		context_set_domain_id(context, did);
2015 
2016 		if (translation != CONTEXT_TT_PASS_THROUGH) {
2017 			/*
2018 			 * Skip top levels of page tables for an IOMMU that has
2019 			 * a smaller agaw than the default. Unnecessary for PT mode.
2020 			 */
2021 			for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2022 				ret = -ENOMEM;
2023 				pgd = phys_to_virt(dma_pte_addr(pgd));
2024 				if (!dma_pte_present(pgd))
2025 					goto out_unlock;
2026 			}
2027 
2028 			if (info && info->ats_supported)
2029 				translation = CONTEXT_TT_DEV_IOTLB;
2030 			else
2031 				translation = CONTEXT_TT_MULTI_LEVEL;
2032 
2033 			context_set_address_root(context, virt_to_phys(pgd));
2034 			context_set_address_width(context, agaw);
2035 		} else {
2036 			/*
2037 			 * In pass through mode, AW must be programmed to
2038 			 * indicate the largest AGAW value supported by
2039 			 * hardware. And ASR is ignored by hardware.
2040 			 */
2041 			context_set_address_width(context, iommu->msagaw);
2042 		}
2043 
2044 		context_set_translation_type(context, translation);
2045 	}
2046 
2047 	context_set_fault_enable(context);
2048 	context_set_present(context);
2049 	if (!ecap_coherent(iommu->ecap))
2050 		clflush_cache_range(context, sizeof(*context));
2051 
2052 	/*
2053 	 * It's a non-present to present mapping. If hardware doesn't cache
2054 	 * non-present entries we only need to flush the write-buffer. If it
2055 	 * _does_ cache non-present entries, then it does so in the special
2056 	 * domain #0, which we have to flush:
2057 	 */
2058 	if (cap_caching_mode(iommu->cap)) {
2059 		iommu->flush.flush_context(iommu, 0,
2060 					   (((u16)bus) << 8) | devfn,
2061 					   DMA_CCMD_MASK_NOBIT,
2062 					   DMA_CCMD_DEVICE_INVL);
2063 		iommu->flush.flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH);
2064 	} else {
2065 		iommu_flush_write_buffer(iommu);
2066 	}
2067 	iommu_enable_pci_caps(info);
2068 
2069 	ret = 0;
2070 
2071 out_unlock:
2072 	spin_unlock(&iommu->lock);
2073 
2074 	return ret;
2075 }
2076 
2077 struct domain_context_mapping_data {
2078 	struct dmar_domain *domain;
2079 	struct intel_iommu *iommu;
2080 	struct pasid_table *table;
2081 };
2082 
2083 static int domain_context_mapping_cb(struct pci_dev *pdev,
2084 				     u16 alias, void *opaque)
2085 {
2086 	struct domain_context_mapping_data *data = opaque;
2087 
2088 	return domain_context_mapping_one(data->domain, data->iommu,
2089 					  data->table, PCI_BUS_NUM(alias),
2090 					  alias & 0xff);
2091 }
2092 
2093 static int
2094 domain_context_mapping(struct dmar_domain *domain, struct device *dev)
2095 {
2096 	struct domain_context_mapping_data data;
2097 	struct pasid_table *table;
2098 	struct intel_iommu *iommu;
2099 	u8 bus, devfn;
2100 
2101 	iommu = device_to_iommu(dev, &bus, &devfn);
2102 	if (!iommu)
2103 		return -ENODEV;
2104 
2105 	table = intel_pasid_get_table(dev);
2106 
2107 	if (!dev_is_pci(dev))
2108 		return domain_context_mapping_one(domain, iommu, table,
2109 						  bus, devfn);
2110 
2111 	data.domain = domain;
2112 	data.iommu = iommu;
2113 	data.table = table;
2114 
2115 	return pci_for_each_dma_alias(to_pci_dev(dev),
2116 				      &domain_context_mapping_cb, &data);
2117 }
2118 
2119 static int domain_context_mapped_cb(struct pci_dev *pdev,
2120 				    u16 alias, void *opaque)
2121 {
2122 	struct intel_iommu *iommu = opaque;
2123 
2124 	return !device_context_mapped(iommu, PCI_BUS_NUM(alias), alias & 0xff);
2125 }
2126 
2127 static int domain_context_mapped(struct device *dev)
2128 {
2129 	struct intel_iommu *iommu;
2130 	u8 bus, devfn;
2131 
2132 	iommu = device_to_iommu(dev, &bus, &devfn);
2133 	if (!iommu)
2134 		return -ENODEV;
2135 
2136 	if (!dev_is_pci(dev))
2137 		return device_context_mapped(iommu, bus, devfn);
2138 
2139 	return !pci_for_each_dma_alias(to_pci_dev(dev),
2140 				       domain_context_mapped_cb, iommu);
2141 }
2142 
2143 /* Returns a number of VTD pages, but aligned to MM page size */
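/*
 * Example (assuming 4KiB MM pages and 4KiB VT-d pages): host_addr =
 * 0x1003, size = 0x2000 -> the in-page offset 0x3 is kept, then
 * PAGE_ALIGN(0x3 + 0x2000) = 0x3000, so three VT-d pages are needed
 * even though the length alone would fit in two.
 */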
2144 static inline unsigned long aligned_nrpages(unsigned long host_addr,
2145 					    size_t size)
2146 {
2147 	host_addr &= ~PAGE_MASK;
2148 	return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT;
2149 }
2150 
2151 /* Return largest possible superpage level for a given mapping */
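/*
 * Example (assuming VTD_STRIDE_SHIFT is 9): if both iov_pfn and phy_pfn
 * have their low 9 bits clear (2MiB aligned), at least 512 pages are
 * being mapped, and the domain advertises one level of superpage
 * support, the loop below returns level 2, i.e. a 2MiB page.
 */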
2152 static inline int hardware_largepage_caps(struct dmar_domain *domain,
2153 					  unsigned long iov_pfn,
2154 					  unsigned long phy_pfn,
2155 					  unsigned long pages)
2156 {
2157 	int support, level = 1;
2158 	unsigned long pfnmerge;
2159 
2160 	support = domain->iommu_superpage;
2161 
2162 	/* To use a large page, the virtual *and* physical addresses
2163 	   must be aligned to 2MiB/1GiB/etc. Lower bits set in either
2164 	   of them will mean we have to use smaller pages. So just
2165 	   merge them and check both at once. */
2166 	pfnmerge = iov_pfn | phy_pfn;
2167 
2168 	while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) {
2169 		pages >>= VTD_STRIDE_SHIFT;
2170 		if (!pages)
2171 			break;
2172 		pfnmerge >>= VTD_STRIDE_SHIFT;
2173 		level++;
2174 		support--;
2175 	}
2176 	return level;
2177 }
2178 
2179 /*
2180  * Ensure that old small page tables are removed to make room for superpage(s).
2181  * We're going to add new large pages, so make sure we don't remove their parent
2182  * tables. The IOTLB/devTLBs should be flushed if any PDE/PTEs are cleared.
2183  */
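/*
 * For example, when a range previously mapped with 4KiB pages is
 * remapped as 2MiB superpages, the old lowest-level tables under each
 * new superpage PDE are freed (hence "level + 1" below) and the IOTLB
 * is flushed on every IOMMU the domain is attached to.
 */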
2184 static void switch_to_super_page(struct dmar_domain *domain,
2185 				 unsigned long start_pfn,
2186 				 unsigned long end_pfn, int level)
2187 {
2188 	unsigned long lvl_pages = lvl_to_nr_pages(level);
2189 	struct iommu_domain_info *info;
2190 	struct dma_pte *pte = NULL;
2191 	unsigned long i;
2192 
2193 	while (start_pfn <= end_pfn) {
2194 		if (!pte)
2195 			pte = pfn_to_dma_pte(domain, start_pfn, &level);
2196 
2197 		if (dma_pte_present(pte)) {
2198 			dma_pte_free_pagetable(domain, start_pfn,
2199 					       start_pfn + lvl_pages - 1,
2200 					       level + 1);
2201 
2202 			xa_for_each(&domain->iommu_array, i, info)
2203 				iommu_flush_iotlb_psi(info->iommu, domain,
2204 						      start_pfn, lvl_pages,
2205 						      0, 0);
2206 		}
2207 
2208 		pte++;
2209 		start_pfn += lvl_pages;
2210 		if (first_pte_in_page(pte))
2211 			pte = NULL;
2212 	}
2213 }
2214 
2215 static int
2216 __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
2217 		 unsigned long phys_pfn, unsigned long nr_pages, int prot)
2218 {
2219 	struct dma_pte *first_pte = NULL, *pte = NULL;
2220 	unsigned int largepage_lvl = 0;
2221 	unsigned long lvl_pages = 0;
2222 	phys_addr_t pteval;
2223 	u64 attr;
2224 
2225 	BUG_ON(!domain_pfn_supported(domain, iov_pfn + nr_pages - 1));
2226 
2227 	if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
2228 		return -EINVAL;
2229 
2230 	attr = prot & (DMA_PTE_READ | DMA_PTE_WRITE | DMA_PTE_SNP);
2231 	attr |= DMA_FL_PTE_PRESENT;
2232 	if (domain_use_first_level(domain)) {
2233 		attr |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
2234 		if (prot & DMA_PTE_WRITE)
2235 			attr |= DMA_FL_PTE_DIRTY;
2236 	}
2237 
2238 	pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | attr;
2239 
2240 	while (nr_pages > 0) {
2241 		uint64_t tmp;
2242 
2243 		if (!pte) {
2244 			largepage_lvl = hardware_largepage_caps(domain, iov_pfn,
2245 					phys_pfn, nr_pages);
2246 
2247 			pte = pfn_to_dma_pte(domain, iov_pfn, &largepage_lvl);
2248 			if (!pte)
2249 				return -ENOMEM;
2250 			first_pte = pte;
2251 
2252 			lvl_pages = lvl_to_nr_pages(largepage_lvl);
2253 
2254 			/* It is a large page */
2255 			if (largepage_lvl > 1) {
2256 				unsigned long end_pfn;
2257 				unsigned long pages_to_remove;
2258 
2259 				pteval |= DMA_PTE_LARGE_PAGE;
2260 				pages_to_remove = min_t(unsigned long, nr_pages,
2261 							nr_pte_to_next_page(pte) * lvl_pages);
2262 				end_pfn = iov_pfn + pages_to_remove - 1;
2263 				switch_to_super_page(domain, iov_pfn, end_pfn, largepage_lvl);
2264 			} else {
2265 				pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE;
2266 			}
2267 
2268 		}
2269 		/* We don't need a lock here; nobody else
2270 		 * touches the IOVA range.
2271 		 */
2272 		tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
2273 		if (tmp) {
2274 			static int dumps = 5;
2275 			pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
2276 				iov_pfn, tmp, (unsigned long long)pteval);
2277 			if (dumps) {
2278 				dumps--;
2279 				debug_dma_dump_mappings(NULL);
2280 			}
2281 			WARN_ON(1);
2282 		}
2283 
2284 		nr_pages -= lvl_pages;
2285 		iov_pfn += lvl_pages;
2286 		phys_pfn += lvl_pages;
2287 		pteval += lvl_pages * VTD_PAGE_SIZE;
2288 
2289 		/* If the next PTE would be the first in a new page, then we
2290 		 * need to flush the cache on the entries we've just written.
2291 		 * And then we'll need to recalculate 'pte', so clear it and
2292 		 * let it get set again in the if (!pte) block above.
2293 		 *
2294 		 * If we're done (!nr_pages) we need to flush the cache too.
2295 		 *
2296 		 * Also if we've been setting superpages, we may need to
2297 		 * recalculate 'pte' and switch back to smaller pages for the
2298 		 * end of the mapping, if the trailing size is not enough to
2299 		 * use another superpage (i.e. nr_pages < lvl_pages).
2300 		 */
2301 		pte++;
2302 		if (!nr_pages || first_pte_in_page(pte) ||
2303 		    (largepage_lvl > 1 && nr_pages < lvl_pages)) {
2304 			domain_flush_cache(domain, first_pte,
2305 					   (void *)pte - (void *)first_pte);
2306 			pte = NULL;
2307 		}
2308 	}
2309 
2310 	return 0;
2311 }
2312 
2313 static void domain_context_clear_one(struct device_domain_info *info, u8 bus, u8 devfn)
2314 {
2315 	struct intel_iommu *iommu = info->iommu;
2316 	struct context_entry *context;
2317 	u16 did_old;
2318 
2319 	if (!iommu)
2320 		return;
2321 
2322 	spin_lock(&iommu->lock);
2323 	context = iommu_context_addr(iommu, bus, devfn, 0);
2324 	if (!context) {
2325 		spin_unlock(&iommu->lock);
2326 		return;
2327 	}
2328 
2329 	if (sm_supported(iommu)) {
2330 		if (hw_pass_through && domain_type_is_si(info->domain))
2331 			did_old = FLPT_DEFAULT_DID;
2332 		else
2333 			did_old = domain_id_iommu(info->domain, iommu);
2334 	} else {
2335 		did_old = context_domain_id(context);
2336 	}
2337 
2338 	context_clear_entry(context);
2339 	__iommu_flush_cache(iommu, context, sizeof(*context));
2340 	spin_unlock(&iommu->lock);
2341 	iommu->flush.flush_context(iommu,
2342 				   did_old,
2343 				   (((u16)bus) << 8) | devfn,
2344 				   DMA_CCMD_MASK_NOBIT,
2345 				   DMA_CCMD_DEVICE_INVL);
2346 
2347 	if (sm_supported(iommu))
2348 		qi_flush_pasid_cache(iommu, did_old, QI_PC_ALL_PASIDS, 0);
2349 
2350 	iommu->flush.flush_iotlb(iommu,
2351 				 did_old,
2352 				 0,
2353 				 0,
2354 				 DMA_TLB_DSI_FLUSH);
2355 
2356 	__iommu_flush_dev_iotlb(info, 0, MAX_AGAW_PFN_WIDTH);
2357 }
2358 
2359 static int domain_setup_first_level(struct intel_iommu *iommu,
2360 				    struct dmar_domain *domain,
2361 				    struct device *dev,
2362 				    u32 pasid)
2363 {
2364 	struct dma_pte *pgd = domain->pgd;
2365 	int agaw, level;
2366 	int flags = 0;
2367 
2368 	/*
2369 	 * Skip top levels of page tables for an IOMMU that has
2370 	 * a smaller agaw than the default. Unnecessary for PT mode.
2371 	 */
2372 	for (agaw = domain->agaw; agaw > iommu->agaw; agaw--) {
2373 		pgd = phys_to_virt(dma_pte_addr(pgd));
2374 		if (!dma_pte_present(pgd))
2375 			return -ENOMEM;
2376 	}
2377 
2378 	level = agaw_to_level(agaw);
2379 	if (level != 4 && level != 5)
2380 		return -EINVAL;
2381 
2382 	if (pasid != PASID_RID2PASID)
2383 		flags |= PASID_FLAG_SUPERVISOR_MODE;
2384 	if (level == 5)
2385 		flags |= PASID_FLAG_FL5LP;
2386 
2387 	if (domain->force_snooping)
2388 		flags |= PASID_FLAG_PAGE_SNOOP;
2389 
2390 	return intel_pasid_setup_first_level(iommu, dev, (pgd_t *)pgd, pasid,
2391 					     domain_id_iommu(domain, iommu),
2392 					     flags);
2393 }
2394 
2395 static bool dev_is_real_dma_subdevice(struct device *dev)
2396 {
2397 	return dev && dev_is_pci(dev) &&
2398 	       pci_real_dma_dev(to_pci_dev(dev)) != to_pci_dev(dev);
2399 }
2400 
2401 static int iommu_domain_identity_map(struct dmar_domain *domain,
2402 				     unsigned long first_vpfn,
2403 				     unsigned long last_vpfn)
2404 {
2405 	/*
2406 	 * The RMRR range might overlap with the physical memory range;
2407 	 * clear it first.
2408 	 */
2409 	dma_pte_clear_range(domain, first_vpfn, last_vpfn);
2410 
2411 	return __domain_mapping(domain, first_vpfn,
2412 				first_vpfn, last_vpfn - first_vpfn + 1,
2413 				DMA_PTE_READ|DMA_PTE_WRITE);
2414 }
2415 
2416 static int md_domain_init(struct dmar_domain *domain, int guest_width);
2417 
2418 static int __init si_domain_init(int hw)
2419 {
2420 	struct dmar_rmrr_unit *rmrr;
2421 	struct device *dev;
2422 	int i, nid, ret;
2423 
2424 	si_domain = alloc_domain(IOMMU_DOMAIN_IDENTITY);
2425 	if (!si_domain)
2426 		return -EFAULT;
2427 
2428 	if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2429 		domain_exit(si_domain);
2430 		si_domain = NULL;
2431 		return -EFAULT;
2432 	}
2433 
2434 	if (hw)
2435 		return 0;
2436 
2437 	for_each_online_node(nid) {
2438 		unsigned long start_pfn, end_pfn;
2439 		int i;
2440 
2441 		for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
2442 			ret = iommu_domain_identity_map(si_domain,
2443 					mm_to_dma_pfn(start_pfn),
2444 					mm_to_dma_pfn(end_pfn));
2445 			if (ret)
2446 				return ret;
2447 		}
2448 	}
2449 
2450 	/*
2451 	 * Identity map the RMRRs so that devices with RMRRs could also use
2452 	 * the si_domain.
2453 	 */
2454 	for_each_rmrr_units(rmrr) {
2455 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
2456 					  i, dev) {
2457 			unsigned long long start = rmrr->base_address;
2458 			unsigned long long end = rmrr->end_address;
2459 
2460 			if (WARN_ON(end < start ||
2461 				    end >> agaw_to_width(si_domain->agaw)))
2462 				continue;
2463 
2464 			ret = iommu_domain_identity_map(si_domain,
2465 					mm_to_dma_pfn(start >> PAGE_SHIFT),
2466 					mm_to_dma_pfn(end >> PAGE_SHIFT));
2467 			if (ret)
2468 				return ret;
2469 		}
2470 	}
2471 
2472 	return 0;
2473 }
2474 
2475 static int domain_add_dev_info(struct dmar_domain *domain, struct device *dev)
2476 {
2477 	struct device_domain_info *info = dev_iommu_priv_get(dev);
2478 	struct intel_iommu *iommu;
2479 	unsigned long flags;
2480 	u8 bus, devfn;
2481 	int ret;
2482 
2483 	iommu = device_to_iommu(dev, &bus, &devfn);
2484 	if (!iommu)
2485 		return -ENODEV;
2486 
2487 	ret = domain_attach_iommu(domain, iommu);
2488 	if (ret)
2489 		return ret;
2490 	info->domain = domain;
2491 	spin_lock_irqsave(&domain->lock, flags);
2492 	list_add(&info->link, &domain->devices);
2493 	spin_unlock_irqrestore(&domain->lock, flags);
2494 
2495 	/* PASID table is mandatory for a PCI device in scalable mode. */
2496 	if (sm_supported(iommu) && !dev_is_real_dma_subdevice(dev)) {
2497 		ret = intel_pasid_alloc_table(dev);
2498 		if (ret) {
2499 			dev_err(dev, "PASID table allocation failed\n");
2500 			dmar_remove_one_dev_info(dev);
2501 			return ret;
2502 		}
2503 
2504 		/* Setup the PASID entry for requests without PASID: */
2505 		if (hw_pass_through && domain_type_is_si(domain))
2506 			ret = intel_pasid_setup_pass_through(iommu, domain,
2507 					dev, PASID_RID2PASID);
2508 		else if (domain_use_first_level(domain))
2509 			ret = domain_setup_first_level(iommu, domain, dev,
2510 					PASID_RID2PASID);
2511 		else
2512 			ret = intel_pasid_setup_second_level(iommu, domain,
2513 					dev, PASID_RID2PASID);
2514 		if (ret) {
2515 			dev_err(dev, "Setup RID2PASID failed\n");
2516 			dmar_remove_one_dev_info(dev);
2517 			return ret;
2518 		}
2519 	}
2520 
2521 	ret = domain_context_mapping(domain, dev);
2522 	if (ret) {
2523 		dev_err(dev, "Domain context map failed\n");
2524 		dmar_remove_one_dev_info(dev);
2525 		return ret;
2526 	}
2527 
2528 	return 0;
2529 }
2530 
2531 static bool device_has_rmrr(struct device *dev)
2532 {
2533 	struct dmar_rmrr_unit *rmrr;
2534 	struct device *tmp;
2535 	int i;
2536 
2537 	rcu_read_lock();
2538 	for_each_rmrr_units(rmrr) {
2539 		/*
2540 		 * Return TRUE if this RMRR contains the device that
2541 		 * is passed in.
2542 		 */
2543 		for_each_active_dev_scope(rmrr->devices,
2544 					  rmrr->devices_cnt, i, tmp)
2545 			if (tmp == dev ||
2546 			    is_downstream_to_pci_bridge(dev, tmp)) {
2547 				rcu_read_unlock();
2548 				return true;
2549 			}
2550 	}
2551 	rcu_read_unlock();
2552 	return false;
2553 }
2554 
2555 /**
2556  * device_rmrr_is_relaxable - Test whether the RMRR of this device
2557  * is relaxable (i.e. is allowed to be left unenforced under some conditions)
2558  * @dev: device handle
2559  *
2560  * We assume that PCI USB devices with RMRRs have them largely
2561  * for historical reasons and that the RMRR space is not actively used post
2562  * boot.  This exclusion may change if vendors begin to abuse it.
2563  *
2564  * The same exception is made for graphics devices, with the requirement that
2565  * any use of the RMRR regions will be torn down before assigning the device
2566  * to a guest.
2567  *
2568  * Return: true if the RMRR is relaxable, false otherwise
2569  */
2570 static bool device_rmrr_is_relaxable(struct device *dev)
2571 {
2572 	struct pci_dev *pdev;
2573 
2574 	if (!dev_is_pci(dev))
2575 		return false;
2576 
2577 	pdev = to_pci_dev(dev);
2578 	if (IS_USB_DEVICE(pdev) || IS_GFX_DEVICE(pdev))
2579 		return true;
2580 	else
2581 		return false;
2582 }
2583 
2584 /*
2585  * There are a couple cases where we need to restrict the functionality of
2586  * devices associated with RMRRs.  The first is when evaluating a device for
2587  * identity mapping because problems exist when devices are moved in and out
2588  * of domains and their respective RMRR information is lost.  This means that
2589  * a device with associated RMRRs will never be in a "passthrough" domain.
2590  * The second is use of the device through the IOMMU API.  This interface
2591  * expects to have full control of the IOVA space for the device.  We cannot
2592  * satisfy both the requirement that RMRR access is maintained and have an
2593  * unencumbered IOVA space.  We also have no ability to quiesce the device's
2594  * use of the RMRR space or even inform the IOMMU API user of the restriction.
2595  * We therefore prevent devices associated with an RMRR from participating in
2596  * the IOMMU API, which eliminates them from device assignment.
2597  *
2598  * In both cases, devices which have relaxable RMRRs are not affected by this
2599  * restriction. See device_rmrr_is_relaxable comment.
2600  */
2601 static bool device_is_rmrr_locked(struct device *dev)
2602 {
2603 	if (!device_has_rmrr(dev))
2604 		return false;
2605 
2606 	if (device_rmrr_is_relaxable(dev))
2607 		return false;
2608 
2609 	return true;
2610 }
2611 
2612 /*
2613  * Return the required default domain type for a specific device.
2614  *
2615  * @dev: the device in question
2617  *
2618  * Returns:
2619  *  - IOMMU_DOMAIN_DMA: device requires a dynamic mapping domain
2620  *  - IOMMU_DOMAIN_IDENTITY: device requires an identity mapping domain
2621  *  - 0: both identity and dynamic domains work for this device
2622  */
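/*
 * For instance, when graphics devices are excluded from remapping
 * (IDENTMAP_GFX, see init_dmars()), a GFX device returns
 * IOMMU_DOMAIN_IDENTITY here; likewise an Azalia HD-audio controller
 * matching IS_AZALIA() when IDENTMAP_AZALIA is set. Every other device
 * returns 0 and may use either an identity or a DMA domain.
 */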
2623 static int device_def_domain_type(struct device *dev)
2624 {
2625 	if (dev_is_pci(dev)) {
2626 		struct pci_dev *pdev = to_pci_dev(dev);
2627 
2628 		if ((iommu_identity_mapping & IDENTMAP_AZALIA) && IS_AZALIA(pdev))
2629 			return IOMMU_DOMAIN_IDENTITY;
2630 
2631 		if ((iommu_identity_mapping & IDENTMAP_GFX) && IS_GFX_DEVICE(pdev))
2632 			return IOMMU_DOMAIN_IDENTITY;
2633 	}
2634 
2635 	return 0;
2636 }
2637 
2638 static void intel_iommu_init_qi(struct intel_iommu *iommu)
2639 {
2640 	/*
2641 	 * Start from a sane IOMMU hardware state.
2642 	 * If the queued invalidation is already initialized by us
2643 	 * (for example, while enabling interrupt-remapping) then
2644 	 * we already have things rolling from a sane state.
2645 	 */
2646 	if (!iommu->qi) {
2647 		/*
2648 		 * Clear any previous faults.
2649 		 */
2650 		dmar_fault(-1, iommu);
2651 		/*
2652 		 * Disable queued invalidation if supported and already enabled
2653 		 * before OS handover.
2654 		 */
2655 		dmar_disable_qi(iommu);
2656 	}
2657 
2658 	if (dmar_enable_qi(iommu)) {
2659 		/*
2660 		 * Queued Invalidate not enabled, use Register Based Invalidate
2661 		 */
2662 		iommu->flush.flush_context = __iommu_flush_context;
2663 		iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2664 		pr_info("%s: Using Register based invalidation\n",
2665 			iommu->name);
2666 	} else {
2667 		iommu->flush.flush_context = qi_flush_context;
2668 		iommu->flush.flush_iotlb = qi_flush_iotlb;
2669 		pr_info("%s: Using Queued invalidation\n", iommu->name);
2670 	}
2671 }
2672 
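/*
 * Copy one bus worth of context entries from the old kernel's tables.
 * In extended/scalable mode each root entry carries two context-table
 * pointers: the lower pointer (LCTP) covers devfn 0x00-0x7f and the
 * upper pointer (UCTP) covers devfn 0x80-0xff, which is why tbl_idx is
 * bus * 2 and a second table may be stored at tbl_idx + 1.
 */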
2673 static int copy_context_table(struct intel_iommu *iommu,
2674 			      struct root_entry *old_re,
2675 			      struct context_entry **tbl,
2676 			      int bus, bool ext)
2677 {
2678 	int tbl_idx, pos = 0, idx, devfn, ret = 0, did;
2679 	struct context_entry *new_ce = NULL, ce;
2680 	struct context_entry *old_ce = NULL;
2681 	struct root_entry re;
2682 	phys_addr_t old_ce_phys;
2683 
2684 	tbl_idx = ext ? bus * 2 : bus;
2685 	memcpy(&re, old_re, sizeof(re));
2686 
2687 	for (devfn = 0; devfn < 256; devfn++) {
2688 		/* First calculate the correct index */
2689 		idx = (ext ? devfn * 2 : devfn) % 256;
2690 
2691 		if (idx == 0) {
2692 			/* First save what we may have and clean up */
2693 			if (new_ce) {
2694 				tbl[tbl_idx] = new_ce;
2695 				__iommu_flush_cache(iommu, new_ce,
2696 						    VTD_PAGE_SIZE);
2697 				pos = 1;
2698 			}
2699 
2700 			if (old_ce)
2701 				memunmap(old_ce);
2702 
2703 			ret = 0;
2704 			if (devfn < 0x80)
2705 				old_ce_phys = root_entry_lctp(&re);
2706 			else
2707 				old_ce_phys = root_entry_uctp(&re);
2708 
2709 			if (!old_ce_phys) {
2710 				if (ext && devfn == 0) {
2711 					/* No LCTP, try UCTP */
2712 					devfn = 0x7f;
2713 					continue;
2714 				} else {
2715 					goto out;
2716 				}
2717 			}
2718 
2719 			ret = -ENOMEM;
2720 			old_ce = memremap(old_ce_phys, PAGE_SIZE,
2721 					MEMREMAP_WB);
2722 			if (!old_ce)
2723 				goto out;
2724 
2725 			new_ce = alloc_pgtable_page(iommu->node);
2726 			if (!new_ce)
2727 				goto out_unmap;
2728 
2729 			ret = 0;
2730 		}
2731 
2732 		/* Now copy the context entry */
2733 		memcpy(&ce, old_ce + idx, sizeof(ce));
2734 
2735 		if (!context_present(&ce))
2736 			continue;
2737 
2738 		did = context_domain_id(&ce);
2739 		if (did >= 0 && did < cap_ndoms(iommu->cap))
2740 			set_bit(did, iommu->domain_ids);
2741 
2742 		set_context_copied(iommu, bus, devfn);
2743 		new_ce[idx] = ce;
2744 	}
2745 
2746 	tbl[tbl_idx + pos] = new_ce;
2747 
2748 	__iommu_flush_cache(iommu, new_ce, VTD_PAGE_SIZE);
2749 
2750 out_unmap:
2751 	memunmap(old_ce);
2752 
2753 out:
2754 	return ret;
2755 }
2756 
2757 static int copy_translation_tables(struct intel_iommu *iommu)
2758 {
2759 	struct context_entry **ctxt_tbls;
2760 	struct root_entry *old_rt;
2761 	phys_addr_t old_rt_phys;
2762 	int ctxt_table_entries;
2763 	u64 rtaddr_reg;
2764 	int bus, ret;
2765 	bool new_ext, ext;
2766 
2767 	rtaddr_reg = dmar_readq(iommu->reg + DMAR_RTADDR_REG);
2768 	ext        = !!(rtaddr_reg & DMA_RTADDR_SMT);
2769 	new_ext    = !!sm_supported(iommu);
2770 
2771 	/*
2772 	 * The RTT bit can only be changed when translation is disabled,
2773 	 * but disabling translation means to open a window for data
2774 	 * corruption. So bail out and don't copy anything if we would
2775 	 * have to change the bit.
2776 	 */
2777 	if (new_ext != ext)
2778 		return -EINVAL;
2779 
2780 	iommu->copied_tables = bitmap_zalloc(BIT_ULL(16), GFP_KERNEL);
2781 	if (!iommu->copied_tables)
2782 		return -ENOMEM;
2783 
2784 	old_rt_phys = rtaddr_reg & VTD_PAGE_MASK;
2785 	if (!old_rt_phys)
2786 		return -EINVAL;
2787 
2788 	old_rt = memremap(old_rt_phys, PAGE_SIZE, MEMREMAP_WB);
2789 	if (!old_rt)
2790 		return -ENOMEM;
2791 
2792 	/* This is too big for the stack - allocate it from slab */
2793 	ctxt_table_entries = ext ? 512 : 256;
2794 	ret = -ENOMEM;
2795 	ctxt_tbls = kcalloc(ctxt_table_entries, sizeof(void *), GFP_KERNEL);
2796 	if (!ctxt_tbls)
2797 		goto out_unmap;
2798 
2799 	for (bus = 0; bus < 256; bus++) {
2800 		ret = copy_context_table(iommu, &old_rt[bus],
2801 					 ctxt_tbls, bus, ext);
2802 		if (ret) {
2803 			pr_err("%s: Failed to copy context table for bus %d\n",
2804 				iommu->name, bus);
2805 			continue;
2806 		}
2807 	}
2808 
2809 	spin_lock(&iommu->lock);
2810 
2811 	/* Context tables are copied, now write them to the root_entry table */
2812 	for (bus = 0; bus < 256; bus++) {
2813 		int idx = ext ? bus * 2 : bus;
2814 		u64 val;
2815 
2816 		if (ctxt_tbls[idx]) {
2817 			val = virt_to_phys(ctxt_tbls[idx]) | 1;
2818 			iommu->root_entry[bus].lo = val;
2819 		}
2820 
2821 		if (!ext || !ctxt_tbls[idx + 1])
2822 			continue;
2823 
2824 		val = virt_to_phys(ctxt_tbls[idx + 1]) | 1;
2825 		iommu->root_entry[bus].hi = val;
2826 	}
2827 
2828 	spin_unlock(&iommu->lock);
2829 
2830 	kfree(ctxt_tbls);
2831 
2832 	__iommu_flush_cache(iommu, iommu->root_entry, PAGE_SIZE);
2833 
2834 	ret = 0;
2835 
2836 out_unmap:
2837 	memunmap(old_rt);
2838 
2839 	return ret;
2840 }
2841 
2842 #ifdef CONFIG_INTEL_IOMMU_SVM
2843 static ioasid_t intel_vcmd_ioasid_alloc(ioasid_t min, ioasid_t max, void *data)
2844 {
2845 	struct intel_iommu *iommu = data;
2846 	ioasid_t ioasid;
2847 
2848 	if (!iommu)
2849 		return INVALID_IOASID;
2850 	/*
2851 	 * VT-d virtual command interface always uses the full 20 bit
2852 	 * PASID range. The host can partition the guest PASID range based on
2853 	 * policies, but that is out of the guest's control.
2854 	 */
2855 	if (min < PASID_MIN || max > intel_pasid_max_id)
2856 		return INVALID_IOASID;
2857 
2858 	if (vcmd_alloc_pasid(iommu, &ioasid))
2859 		return INVALID_IOASID;
2860 
2861 	return ioasid;
2862 }
2863 
2864 static void intel_vcmd_ioasid_free(ioasid_t ioasid, void *data)
2865 {
2866 	struct intel_iommu *iommu = data;
2867 
2868 	if (!iommu)
2869 		return;
2870 	/*
2871 	 * Sanity checking of the ioasid owner is done at the upper layer,
2872 	 * e.g. VFIO. We can only free the PASID when all the devices are unbound.
2873 	 */
2874 	if (ioasid_find(NULL, ioasid, NULL)) {
2875 		pr_alert("Cannot free active IOASID %d\n", ioasid);
2876 		return;
2877 	}
2878 	vcmd_free_pasid(iommu, ioasid);
2879 }
2880 
2881 static void register_pasid_allocator(struct intel_iommu *iommu)
2882 {
2883 	/*
2884 	 * If we are running in the host, there is no need for a custom
2885 	 * allocator since PASIDs are allocated by the host system-wide.
2886 	 */
2887 	if (!cap_caching_mode(iommu->cap))
2888 		return;
2889 
2890 	if (!sm_supported(iommu)) {
2891 		pr_warn("VT-d Scalable Mode not enabled, no PASID allocation\n");
2892 		return;
2893 	}
2894 
2895 	/*
2896 	 * Register a custom PASID allocator if we are running in a guest;
2897 	 * guest PASIDs must be obtained via the virtual command interface.
2898 	 * There can be multiple vIOMMUs in each guest but only one allocator
2899 	 * is active. All vIOMMU allocators will eventually be calling the same
2900 	 * host allocator.
2901 	 */
2902 	if (!vccap_pasid(iommu->vccap))
2903 		return;
2904 
2905 	pr_info("Register custom PASID allocator\n");
2906 	iommu->pasid_allocator.alloc = intel_vcmd_ioasid_alloc;
2907 	iommu->pasid_allocator.free = intel_vcmd_ioasid_free;
2908 	iommu->pasid_allocator.pdata = (void *)iommu;
2909 	if (ioasid_register_allocator(&iommu->pasid_allocator)) {
2910 		pr_warn("Custom PASID allocator failed, scalable mode disabled\n");
2911 		/*
2912 		 * Disable scalable mode on this IOMMU if there
2913 		 * is no custom allocator. Mixing SM-capable vIOMMUs
2914 		 * and non-SM vIOMMUs is not supported.
2915 		 */
2916 		intel_iommu_sm = 0;
2917 	}
2918 }
2919 #endif
2920 
2921 static int __init init_dmars(void)
2922 {
2923 	struct dmar_drhd_unit *drhd;
2924 	struct intel_iommu *iommu;
2925 	int ret;
2926 
2927 	ret = intel_cap_audit(CAP_AUDIT_STATIC_DMAR, NULL);
2928 	if (ret)
2929 		goto free_iommu;
2930 
2931 	for_each_iommu(iommu, drhd) {
2932 		if (drhd->ignored) {
2933 			iommu_disable_translation(iommu);
2934 			continue;
2935 		}
2936 
2937 		/*
2938 		 * Find the max PASID size of all IOMMUs in the system.
2939 		 * We need to ensure the system PASID table is no bigger
2940 		 * than the smallest supported size.
2941 		 */
2942 		if (pasid_supported(iommu)) {
2943 			u32 temp = 2 << ecap_pss(iommu->ecap);
2944 
2945 			intel_pasid_max_id = min_t(u32, temp,
2946 						   intel_pasid_max_id);
2947 		}
2948 
2949 		intel_iommu_init_qi(iommu);
2950 
2951 		ret = iommu_init_domains(iommu);
2952 		if (ret)
2953 			goto free_iommu;
2954 
2955 		init_translation_status(iommu);
2956 
2957 		if (translation_pre_enabled(iommu) && !is_kdump_kernel()) {
2958 			iommu_disable_translation(iommu);
2959 			clear_translation_pre_enabled(iommu);
2960 			pr_warn("Translation was enabled for %s but we are not in kdump mode\n",
2961 				iommu->name);
2962 		}
2963 
2964 		/*
2965 		 * TBD:
2966 		 * we could share the same root & context tables
2967 		 * among all IOMMUs. Need to split it later.
2968 		 */
2969 		ret = iommu_alloc_root_entry(iommu);
2970 		if (ret)
2971 			goto free_iommu;
2972 
2973 		if (translation_pre_enabled(iommu)) {
2974 			pr_info("Translation already enabled - trying to copy translation structures\n");
2975 
2976 			ret = copy_translation_tables(iommu);
2977 			if (ret) {
2978 				/*
2979 				 * We found the IOMMU with translation
2980 				 * enabled - but failed to copy over the
2981 				 * old root-entry table. Try to proceed
2982 				 * by disabling translation now and
2983 				 * allocating a clean root-entry table.
2984 				 * This might cause DMAR faults, but
2985 				 * probably the dump will still succeed.
2986 				 */
2987 				pr_err("Failed to copy translation tables from previous kernel for %s\n",
2988 				       iommu->name);
2989 				iommu_disable_translation(iommu);
2990 				clear_translation_pre_enabled(iommu);
2991 			} else {
2992 				pr_info("Copied translation tables from previous kernel for %s\n",
2993 					iommu->name);
2994 			}
2995 		}
2996 
2997 		if (!ecap_pass_through(iommu->ecap))
2998 			hw_pass_through = 0;
2999 		intel_svm_check(iommu);
3000 	}
3001 
3002 	/*
3003 	 * Now that qi is enabled on all iommus, set the root entry and flush
3004 	 * caches. This is required on some Intel X58 chipsets, otherwise the
3005 	 * flush_context function will loop forever and the boot hangs.
3006 	 */
3007 	for_each_active_iommu(iommu, drhd) {
3008 		iommu_flush_write_buffer(iommu);
3009 #ifdef CONFIG_INTEL_IOMMU_SVM
3010 		register_pasid_allocator(iommu);
3011 #endif
3012 		iommu_set_root_entry(iommu);
3013 	}
3014 
3015 #ifdef CONFIG_INTEL_IOMMU_BROKEN_GFX_WA
3016 	dmar_map_gfx = 0;
3017 #endif
3018 
3019 	if (!dmar_map_gfx)
3020 		iommu_identity_mapping |= IDENTMAP_GFX;
3021 
3022 	check_tylersburg_isoch();
3023 
3024 	ret = si_domain_init(hw_pass_through);
3025 	if (ret)
3026 		goto free_iommu;
3027 
3028 	/*
3029 	 * for each drhd
3030 	 *   enable fault log
3031 	 *   global invalidate context cache
3032 	 *   global invalidate iotlb
3033 	 *   enable translation
3034 	 */
3035 	for_each_iommu(iommu, drhd) {
3036 		if (drhd->ignored) {
3037 			/*
3038 			 * we always have to disable PMRs or DMA may fail on
3039 			 * this device
3040 			 */
3041 			if (force_on)
3042 				iommu_disable_protect_mem_regions(iommu);
3043 			continue;
3044 		}
3045 
3046 		iommu_flush_write_buffer(iommu);
3047 
3048 #ifdef CONFIG_INTEL_IOMMU_SVM
3049 		if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3050 			/*
3051 			 * Calling dmar_alloc_hwirq() with dmar_global_lock held
3052 			 * could cause a lock race condition.
3053 			 */
3054 			up_write(&dmar_global_lock);
3055 			ret = intel_svm_enable_prq(iommu);
3056 			down_write(&dmar_global_lock);
3057 			if (ret)
3058 				goto free_iommu;
3059 		}
3060 #endif
3061 		ret = dmar_set_interrupt(iommu);
3062 		if (ret)
3063 			goto free_iommu;
3064 	}
3065 
3066 	return 0;
3067 
3068 free_iommu:
3069 	for_each_active_iommu(iommu, drhd) {
3070 		disable_dmar_iommu(iommu);
3071 		free_dmar_iommu(iommu);
3072 	}
3073 	if (si_domain) {
3074 		domain_exit(si_domain);
3075 		si_domain = NULL;
3076 	}
3077 
3078 	return ret;
3079 }
3080 
3081 static void __init init_no_remapping_devices(void)
3082 {
3083 	struct dmar_drhd_unit *drhd;
3084 	struct device *dev;
3085 	int i;
3086 
3087 	for_each_drhd_unit(drhd) {
3088 		if (!drhd->include_all) {
3089 			for_each_active_dev_scope(drhd->devices,
3090 						  drhd->devices_cnt, i, dev)
3091 				break;
3092 			/* ignore DMAR unit if no devices exist */
3093 			if (i == drhd->devices_cnt)
3094 				drhd->ignored = 1;
3095 		}
3096 	}
3097 
3098 	for_each_active_drhd_unit(drhd) {
3099 		if (drhd->include_all)
3100 			continue;
3101 
3102 		for_each_active_dev_scope(drhd->devices,
3103 					  drhd->devices_cnt, i, dev)
3104 			if (!dev_is_pci(dev) || !IS_GFX_DEVICE(to_pci_dev(dev)))
3105 				break;
3106 		if (i < drhd->devices_cnt)
3107 			continue;
3108 
3109 		/* This IOMMU has *only* gfx devices. Either bypass it or
3110 		   set the gfx_dedicated flag, as appropriate */
3111 		drhd->gfx_dedicated = 1;
3112 		if (!dmar_map_gfx)
3113 			drhd->ignored = 1;
3114 	}
3115 }
3116 
3117 #ifdef CONFIG_SUSPEND
3118 static int init_iommu_hw(void)
3119 {
3120 	struct dmar_drhd_unit *drhd;
3121 	struct intel_iommu *iommu = NULL;
3122 
3123 	for_each_active_iommu(iommu, drhd)
3124 		if (iommu->qi)
3125 			dmar_reenable_qi(iommu);
3126 
3127 	for_each_iommu(iommu, drhd) {
3128 		if (drhd->ignored) {
3129 			/*
3130 			 * we always have to disable PMRs or DMA may fail on
3131 			 * this device
3132 			 */
3133 			if (force_on)
3134 				iommu_disable_protect_mem_regions(iommu);
3135 			continue;
3136 		}
3137 
3138 		iommu_flush_write_buffer(iommu);
3139 		iommu_set_root_entry(iommu);
3140 		iommu_enable_translation(iommu);
3141 		iommu_disable_protect_mem_regions(iommu);
3142 	}
3143 
3144 	return 0;
3145 }
3146 
3147 static void iommu_flush_all(void)
3148 {
3149 	struct dmar_drhd_unit *drhd;
3150 	struct intel_iommu *iommu;
3151 
3152 	for_each_active_iommu(iommu, drhd) {
3153 		iommu->flush.flush_context(iommu, 0, 0, 0,
3154 					   DMA_CCMD_GLOBAL_INVL);
3155 		iommu->flush.flush_iotlb(iommu, 0, 0, 0,
3156 					 DMA_TLB_GLOBAL_FLUSH);
3157 	}
3158 }
3159 
3160 static int iommu_suspend(void)
3161 {
3162 	struct dmar_drhd_unit *drhd;
3163 	struct intel_iommu *iommu = NULL;
3164 	unsigned long flag;
3165 
3166 	for_each_active_iommu(iommu, drhd) {
3167 		iommu->iommu_state = kcalloc(MAX_SR_DMAR_REGS, sizeof(u32),
3168 					     GFP_KERNEL);
3169 		if (!iommu->iommu_state)
3170 			goto nomem;
3171 	}
3172 
3173 	iommu_flush_all();
3174 
3175 	for_each_active_iommu(iommu, drhd) {
3176 		iommu_disable_translation(iommu);
3177 
3178 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3179 
3180 		iommu->iommu_state[SR_DMAR_FECTL_REG] =
3181 			readl(iommu->reg + DMAR_FECTL_REG);
3182 		iommu->iommu_state[SR_DMAR_FEDATA_REG] =
3183 			readl(iommu->reg + DMAR_FEDATA_REG);
3184 		iommu->iommu_state[SR_DMAR_FEADDR_REG] =
3185 			readl(iommu->reg + DMAR_FEADDR_REG);
3186 		iommu->iommu_state[SR_DMAR_FEUADDR_REG] =
3187 			readl(iommu->reg + DMAR_FEUADDR_REG);
3188 
3189 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3190 	}
3191 	return 0;
3192 
3193 nomem:
3194 	for_each_active_iommu(iommu, drhd)
3195 		kfree(iommu->iommu_state);
3196 
3197 	return -ENOMEM;
3198 }
3199 
3200 static void iommu_resume(void)
3201 {
3202 	struct dmar_drhd_unit *drhd;
3203 	struct intel_iommu *iommu = NULL;
3204 	unsigned long flag;
3205 
3206 	if (init_iommu_hw()) {
3207 		if (force_on)
3208 			panic("tboot: IOMMU setup failed, DMAR can not resume!\n");
3209 		else
3210 			WARN(1, "IOMMU setup failed, DMAR can not resume!\n");
3211 		return;
3212 	}
3213 
3214 	for_each_active_iommu(iommu, drhd) {
3215 
3216 		raw_spin_lock_irqsave(&iommu->register_lock, flag);
3217 
3218 		writel(iommu->iommu_state[SR_DMAR_FECTL_REG],
3219 			iommu->reg + DMAR_FECTL_REG);
3220 		writel(iommu->iommu_state[SR_DMAR_FEDATA_REG],
3221 			iommu->reg + DMAR_FEDATA_REG);
3222 		writel(iommu->iommu_state[SR_DMAR_FEADDR_REG],
3223 			iommu->reg + DMAR_FEADDR_REG);
3224 		writel(iommu->iommu_state[SR_DMAR_FEUADDR_REG],
3225 			iommu->reg + DMAR_FEUADDR_REG);
3226 
3227 		raw_spin_unlock_irqrestore(&iommu->register_lock, flag);
3228 	}
3229 
3230 	for_each_active_iommu(iommu, drhd)
3231 		kfree(iommu->iommu_state);
3232 }
3233 
3234 static struct syscore_ops iommu_syscore_ops = {
3235 	.resume		= iommu_resume,
3236 	.suspend	= iommu_suspend,
3237 };
3238 
3239 static void __init init_iommu_pm_ops(void)
3240 {
3241 	register_syscore_ops(&iommu_syscore_ops);
3242 }
3243 
3244 #else
3245 static inline void init_iommu_pm_ops(void) {}
3246 #endif	/* CONFIG_SUSPEND */
3247 
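/*
 * Example of what the check below accepts: an RMRR with base_address =
 * 0x000e0000 and end_address = 0x000effff passes (both the base and
 * end + 1 are page aligned and the range is non-empty), whereas a range
 * whose end_address is at or below its base_address is rejected.
 */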
3248 static int __init rmrr_sanity_check(struct acpi_dmar_reserved_memory *rmrr)
3249 {
3250 	if (!IS_ALIGNED(rmrr->base_address, PAGE_SIZE) ||
3251 	    !IS_ALIGNED(rmrr->end_address + 1, PAGE_SIZE) ||
3252 	    rmrr->end_address <= rmrr->base_address ||
3253 	    arch_rmrr_sanity_check(rmrr))
3254 		return -EINVAL;
3255 
3256 	return 0;
3257 }
3258 
3259 int __init dmar_parse_one_rmrr(struct acpi_dmar_header *header, void *arg)
3260 {
3261 	struct acpi_dmar_reserved_memory *rmrr;
3262 	struct dmar_rmrr_unit *rmrru;
3263 
3264 	rmrr = (struct acpi_dmar_reserved_memory *)header;
3265 	if (rmrr_sanity_check(rmrr)) {
3266 		pr_warn(FW_BUG
3267 			   "Your BIOS is broken; bad RMRR [%#018Lx-%#018Lx]\n"
3268 			   "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
3269 			   rmrr->base_address, rmrr->end_address,
3270 			   dmi_get_system_info(DMI_BIOS_VENDOR),
3271 			   dmi_get_system_info(DMI_BIOS_VERSION),
3272 			   dmi_get_system_info(DMI_PRODUCT_VERSION));
3273 		add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
3274 	}
3275 
3276 	rmrru = kzalloc(sizeof(*rmrru), GFP_KERNEL);
3277 	if (!rmrru)
3278 		goto out;
3279 
3280 	rmrru->hdr = header;
3281 
3282 	rmrru->base_address = rmrr->base_address;
3283 	rmrru->end_address = rmrr->end_address;
3284 
3285 	rmrru->devices = dmar_alloc_dev_scope((void *)(rmrr + 1),
3286 				((void *)rmrr) + rmrr->header.length,
3287 				&rmrru->devices_cnt);
3288 	if (rmrru->devices_cnt && rmrru->devices == NULL)
3289 		goto free_rmrru;
3290 
3291 	list_add(&rmrru->list, &dmar_rmrr_units);
3292 
3293 	return 0;
3294 free_rmrru:
3295 	kfree(rmrru);
3296 out:
3297 	return -ENOMEM;
3298 }
3299 
3300 static struct dmar_atsr_unit *dmar_find_atsr(struct acpi_dmar_atsr *atsr)
3301 {
3302 	struct dmar_atsr_unit *atsru;
3303 	struct acpi_dmar_atsr *tmp;
3304 
3305 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list,
3306 				dmar_rcu_check()) {
3307 		tmp = (struct acpi_dmar_atsr *)atsru->hdr;
3308 		if (atsr->segment != tmp->segment)
3309 			continue;
3310 		if (atsr->header.length != tmp->header.length)
3311 			continue;
3312 		if (memcmp(atsr, tmp, atsr->header.length) == 0)
3313 			return atsru;
3314 	}
3315 
3316 	return NULL;
3317 }
3318 
3319 int dmar_parse_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3320 {
3321 	struct acpi_dmar_atsr *atsr;
3322 	struct dmar_atsr_unit *atsru;
3323 
3324 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3325 		return 0;
3326 
3327 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3328 	atsru = dmar_find_atsr(atsr);
3329 	if (atsru)
3330 		return 0;
3331 
3332 	atsru = kzalloc(sizeof(*atsru) + hdr->length, GFP_KERNEL);
3333 	if (!atsru)
3334 		return -ENOMEM;
3335 
3336 	/*
3337 	 * If memory is allocated from slab by ACPI _DSM method, we need to
3338 	 * copy the memory content because the memory buffer will be freed
3339 	 * on return.
3340 	 */
3341 	atsru->hdr = (void *)(atsru + 1);
3342 	memcpy(atsru->hdr, hdr, hdr->length);
3343 	atsru->include_all = atsr->flags & 0x1;
3344 	if (!atsru->include_all) {
3345 		atsru->devices = dmar_alloc_dev_scope((void *)(atsr + 1),
3346 				(void *)atsr + atsr->header.length,
3347 				&atsru->devices_cnt);
3348 		if (atsru->devices_cnt && atsru->devices == NULL) {
3349 			kfree(atsru);
3350 			return -ENOMEM;
3351 		}
3352 	}
3353 
3354 	list_add_rcu(&atsru->list, &dmar_atsr_units);
3355 
3356 	return 0;
3357 }
3358 
3359 static void intel_iommu_free_atsr(struct dmar_atsr_unit *atsru)
3360 {
3361 	dmar_free_dev_scope(&atsru->devices, &atsru->devices_cnt);
3362 	kfree(atsru);
3363 }
3364 
3365 int dmar_release_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3366 {
3367 	struct acpi_dmar_atsr *atsr;
3368 	struct dmar_atsr_unit *atsru;
3369 
3370 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3371 	atsru = dmar_find_atsr(atsr);
3372 	if (atsru) {
3373 		list_del_rcu(&atsru->list);
3374 		synchronize_rcu();
3375 		intel_iommu_free_atsr(atsru);
3376 	}
3377 
3378 	return 0;
3379 }
3380 
3381 int dmar_check_one_atsr(struct acpi_dmar_header *hdr, void *arg)
3382 {
3383 	int i;
3384 	struct device *dev;
3385 	struct acpi_dmar_atsr *atsr;
3386 	struct dmar_atsr_unit *atsru;
3387 
3388 	atsr = container_of(hdr, struct acpi_dmar_atsr, header);
3389 	atsru = dmar_find_atsr(atsr);
3390 	if (!atsru)
3391 		return 0;
3392 
3393 	if (!atsru->include_all && atsru->devices && atsru->devices_cnt) {
3394 		for_each_active_dev_scope(atsru->devices, atsru->devices_cnt,
3395 					  i, dev)
3396 			return -EBUSY;
3397 	}
3398 
3399 	return 0;
3400 }
3401 
3402 static struct dmar_satc_unit *dmar_find_satc(struct acpi_dmar_satc *satc)
3403 {
3404 	struct dmar_satc_unit *satcu;
3405 	struct acpi_dmar_satc *tmp;
3406 
3407 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list,
3408 				dmar_rcu_check()) {
3409 		tmp = (struct acpi_dmar_satc *)satcu->hdr;
3410 		if (satc->segment != tmp->segment)
3411 			continue;
3412 		if (satc->header.length != tmp->header.length)
3413 			continue;
3414 		if (memcmp(satc, tmp, satc->header.length) == 0)
3415 			return satcu;
3416 	}
3417 
3418 	return NULL;
3419 }
3420 
3421 int dmar_parse_one_satc(struct acpi_dmar_header *hdr, void *arg)
3422 {
3423 	struct acpi_dmar_satc *satc;
3424 	struct dmar_satc_unit *satcu;
3425 
3426 	if (system_state >= SYSTEM_RUNNING && !intel_iommu_enabled)
3427 		return 0;
3428 
3429 	satc = container_of(hdr, struct acpi_dmar_satc, header);
3430 	satcu = dmar_find_satc(satc);
3431 	if (satcu)
3432 		return 0;
3433 
3434 	satcu = kzalloc(sizeof(*satcu) + hdr->length, GFP_KERNEL);
3435 	if (!satcu)
3436 		return -ENOMEM;
3437 
3438 	satcu->hdr = (void *)(satcu + 1);
3439 	memcpy(satcu->hdr, hdr, hdr->length);
3440 	satcu->atc_required = satc->flags & 0x1;
3441 	satcu->devices = dmar_alloc_dev_scope((void *)(satc + 1),
3442 					      (void *)satc + satc->header.length,
3443 					      &satcu->devices_cnt);
3444 	if (satcu->devices_cnt && !satcu->devices) {
3445 		kfree(satcu);
3446 		return -ENOMEM;
3447 	}
3448 	list_add_rcu(&satcu->list, &dmar_satc_units);
3449 
3450 	return 0;
3451 }
3452 
3453 static int intel_iommu_add(struct dmar_drhd_unit *dmaru)
3454 {
3455 	int sp, ret;
3456 	struct intel_iommu *iommu = dmaru->iommu;
3457 
3458 	ret = intel_cap_audit(CAP_AUDIT_HOTPLUG_DMAR, iommu);
3459 	if (ret)
3460 		goto out;
3461 
3462 	if (hw_pass_through && !ecap_pass_through(iommu->ecap)) {
3463 		pr_warn("%s: Doesn't support hardware pass through.\n",
3464 			iommu->name);
3465 		return -ENXIO;
3466 	}
3467 
3468 	sp = domain_update_iommu_superpage(NULL, iommu) - 1;
3469 	if (sp >= 0 && !(cap_super_page_val(iommu->cap) & (1 << sp))) {
3470 		pr_warn("%s: Doesn't support large page.\n",
3471 			iommu->name);
3472 		return -ENXIO;
3473 	}
3474 
3475 	/*
3476 	 * Disable translation if already enabled prior to OS handover.
3477 	 */
3478 	if (iommu->gcmd & DMA_GCMD_TE)
3479 		iommu_disable_translation(iommu);
3480 
3481 	ret = iommu_init_domains(iommu);
3482 	if (ret == 0)
3483 		ret = iommu_alloc_root_entry(iommu);
3484 	if (ret)
3485 		goto out;
3486 
3487 	intel_svm_check(iommu);
3488 
3489 	if (dmaru->ignored) {
3490 		/*
3491 		 * we always have to disable PMRs or DMA may fail on this device
3492 		 */
3493 		if (force_on)
3494 			iommu_disable_protect_mem_regions(iommu);
3495 		return 0;
3496 	}
3497 
3498 	intel_iommu_init_qi(iommu);
3499 	iommu_flush_write_buffer(iommu);
3500 
3501 #ifdef CONFIG_INTEL_IOMMU_SVM
3502 	if (pasid_supported(iommu) && ecap_prs(iommu->ecap)) {
3503 		ret = intel_svm_enable_prq(iommu);
3504 		if (ret)
3505 			goto disable_iommu;
3506 	}
3507 #endif
3508 	ret = dmar_set_interrupt(iommu);
3509 	if (ret)
3510 		goto disable_iommu;
3511 
3512 	iommu_set_root_entry(iommu);
3513 	iommu_enable_translation(iommu);
3514 
3515 	iommu_disable_protect_mem_regions(iommu);
3516 	return 0;
3517 
3518 disable_iommu:
3519 	disable_dmar_iommu(iommu);
3520 out:
3521 	free_dmar_iommu(iommu);
3522 	return ret;
3523 }
3524 
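/*
 * DMAR unit hotplug handler: on insertion, bring up and enable the new
 * IOMMU via intel_iommu_add(); on removal, disable it and release its
 * domain and root-entry resources. Does nothing until the Intel IOMMU
 * has been enabled.
 */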
3525 int dmar_iommu_hotplug(struct dmar_drhd_unit *dmaru, bool insert)
3526 {
3527 	int ret = 0;
3528 	struct intel_iommu *iommu = dmaru->iommu;
3529 
3530 	if (!intel_iommu_enabled)
3531 		return 0;
3532 	if (iommu == NULL)
3533 		return -EINVAL;
3534 
3535 	if (insert) {
3536 		ret = intel_iommu_add(dmaru);
3537 	} else {
3538 		disable_dmar_iommu(iommu);
3539 		free_dmar_iommu(iommu);
3540 	}
3541 
3542 	return ret;
3543 }
3544 
3545 static void intel_iommu_free_dmars(void)
3546 {
3547 	struct dmar_rmrr_unit *rmrru, *rmrr_n;
3548 	struct dmar_atsr_unit *atsru, *atsr_n;
3549 	struct dmar_satc_unit *satcu, *satc_n;
3550 
3551 	list_for_each_entry_safe(rmrru, rmrr_n, &dmar_rmrr_units, list) {
3552 		list_del(&rmrru->list);
3553 		dmar_free_dev_scope(&rmrru->devices, &rmrru->devices_cnt);
3554 		kfree(rmrru);
3555 	}
3556 
3557 	list_for_each_entry_safe(atsru, atsr_n, &dmar_atsr_units, list) {
3558 		list_del(&atsru->list);
3559 		intel_iommu_free_atsr(atsru);
3560 	}
3561 	list_for_each_entry_safe(satcu, satc_n, &dmar_satc_units, list) {
3562 		list_del(&satcu->list);
3563 		dmar_free_dev_scope(&satcu->devices, &satcu->devices_cnt);
3564 		kfree(satcu);
3565 	}
3566 }
3567 
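/*
 * Find the SATC unit whose device scope contains @dev (the physical
 * function is used for virtual functions). Returns NULL if the device
 * is not listed in any SATC table.
 */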
3568 static struct dmar_satc_unit *dmar_find_matched_satc_unit(struct pci_dev *dev)
3569 {
3570 	struct dmar_satc_unit *satcu;
3571 	struct acpi_dmar_satc *satc;
3572 	struct device *tmp;
3573 	int i;
3574 
3575 	dev = pci_physfn(dev);
3576 	rcu_read_lock();
3577 
3578 	list_for_each_entry_rcu(satcu, &dmar_satc_units, list) {
3579 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3580 		if (satc->segment != pci_domain_nr(dev->bus))
3581 			continue;
3582 		for_each_dev_scope(satcu->devices, satcu->devices_cnt, i, tmp)
3583 			if (to_pci_dev(tmp) == dev)
3584 				goto out;
3585 	}
3586 	satcu = NULL;
3587 out:
3588 	rcu_read_unlock();
3589 	return satcu;
3590 }
3591 
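/*
 * Return 1 if the OS may enable ATS for @dev, 0 otherwise. Devices
 * listed in a SATC table are handled first (see the comment below);
 * everything else must either be root-complex integrated or sit below
 * a PCIe root port covered by an ATSR (or an include-all ATSR).
 */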
3592 static int dmar_ats_supported(struct pci_dev *dev, struct intel_iommu *iommu)
3593 {
3594 	int i, ret = 1;
3595 	struct pci_bus *bus;
3596 	struct pci_dev *bridge = NULL;
3597 	struct device *tmp;
3598 	struct acpi_dmar_atsr *atsr;
3599 	struct dmar_atsr_unit *atsru;
3600 	struct dmar_satc_unit *satcu;
3601 
3602 	dev = pci_physfn(dev);
3603 	satcu = dmar_find_matched_satc_unit(dev);
3604 	if (satcu)
3605 		/*
3606 		 * This device supports ATS as it is listed in the SATC table.
3607 		 * When the IOMMU is in legacy mode, the hardware enables ATS
3608 		 * automatically for any device that requires it, so the OS
3609 		 * must not also enable ATS for this device, to avoid duplicate
3610 		 * TLB invalidations.
3611 		 */
3612 		return !(satcu->atc_required && !sm_supported(iommu));
3613 
3614 	for (bus = dev->bus; bus; bus = bus->parent) {
3615 		bridge = bus->self;
3616 		/* If it's an integrated device, allow ATS */
3617 		if (!bridge)
3618 			return 1;
3619 		/* Connected via non-PCIe: no ATS */
3620 		if (!pci_is_pcie(bridge) ||
3621 		    pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE)
3622 			return 0;
3623 		/* If we found the root port, look it up in the ATSR */
3624 		if (pci_pcie_type(bridge) == PCI_EXP_TYPE_ROOT_PORT)
3625 			break;
3626 	}
3627 
3628 	rcu_read_lock();
3629 	list_for_each_entry_rcu(atsru, &dmar_atsr_units, list) {
3630 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3631 		if (atsr->segment != pci_domain_nr(dev->bus))
3632 			continue;
3633 
3634 		for_each_dev_scope(atsru->devices, atsru->devices_cnt, i, tmp)
3635 			if (tmp == &bridge->dev)
3636 				goto out;
3637 
3638 		if (atsru->include_all)
3639 			goto out;
3640 	}
3641 	ret = 0;
3642 out:
3643 	rcu_read_unlock();
3644 
3645 	return ret;
3646 }
3647 
3648 int dmar_iommu_notify_scope_dev(struct dmar_pci_notify_info *info)
3649 {
3650 	int ret;
3651 	struct dmar_rmrr_unit *rmrru;
3652 	struct dmar_atsr_unit *atsru;
3653 	struct dmar_satc_unit *satcu;
3654 	struct acpi_dmar_atsr *atsr;
3655 	struct acpi_dmar_reserved_memory *rmrr;
3656 	struct acpi_dmar_satc *satc;
3657 
3658 	if (!intel_iommu_enabled && system_state >= SYSTEM_RUNNING)
3659 		return 0;
3660 
3661 	list_for_each_entry(rmrru, &dmar_rmrr_units, list) {
3662 		rmrr = container_of(rmrru->hdr,
3663 				    struct acpi_dmar_reserved_memory, header);
3664 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3665 			ret = dmar_insert_dev_scope(info, (void *)(rmrr + 1),
3666 				((void *)rmrr) + rmrr->header.length,
3667 				rmrr->segment, rmrru->devices,
3668 				rmrru->devices_cnt);
3669 			if (ret < 0)
3670 				return ret;
3671 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3672 			dmar_remove_dev_scope(info, rmrr->segment,
3673 				rmrru->devices, rmrru->devices_cnt);
3674 		}
3675 	}
3676 
3677 	list_for_each_entry(atsru, &dmar_atsr_units, list) {
3678 		if (atsru->include_all)
3679 			continue;
3680 
3681 		atsr = container_of(atsru->hdr, struct acpi_dmar_atsr, header);
3682 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3683 			ret = dmar_insert_dev_scope(info, (void *)(atsr + 1),
3684 					(void *)atsr + atsr->header.length,
3685 					atsr->segment, atsru->devices,
3686 					atsru->devices_cnt);
3687 			if (ret > 0)
3688 				break;
3689 			else if (ret < 0)
3690 				return ret;
3691 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3692 			if (dmar_remove_dev_scope(info, atsr->segment,
3693 					atsru->devices, atsru->devices_cnt))
3694 				break;
3695 		}
3696 	}
3697 	list_for_each_entry(satcu, &dmar_satc_units, list) {
3698 		satc = container_of(satcu->hdr, struct acpi_dmar_satc, header);
3699 		if (info->event == BUS_NOTIFY_ADD_DEVICE) {
3700 			ret = dmar_insert_dev_scope(info, (void *)(satc + 1),
3701 					(void *)satc + satc->header.length,
3702 					satc->segment, satcu->devices,
3703 					satcu->devices_cnt);
3704 			if (ret > 0)
3705 				break;
3706 			else if (ret < 0)
3707 				return ret;
3708 		} else if (info->event == BUS_NOTIFY_REMOVED_DEVICE) {
3709 			if (dmar_remove_dev_scope(info, satc->segment,
3710 					satcu->devices, satcu->devices_cnt))
3711 				break;
3712 		}
3713 	}
3714 
3715 	return 0;
3716 }
3717 
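/*
 * Memory hotplug notifier: extend the static identity map (si_domain)
 * when a memory block goes online, and unmap it again (flushing the
 * IOTLB on all active IOMMUs) when the block goes offline or the online
 * operation is cancelled.
 */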
3718 static int intel_iommu_memory_notifier(struct notifier_block *nb,
3719 				       unsigned long val, void *v)
3720 {
3721 	struct memory_notify *mhp = v;
3722 	unsigned long start_vpfn = mm_to_dma_pfn(mhp->start_pfn);
3723 	unsigned long last_vpfn = mm_to_dma_pfn(mhp->start_pfn +
3724 			mhp->nr_pages - 1);
3725 
3726 	switch (val) {
3727 	case MEM_GOING_ONLINE:
3728 		if (iommu_domain_identity_map(si_domain,
3729 					      start_vpfn, last_vpfn)) {
3730 			pr_warn("Failed to build identity map for [%lx-%lx]\n",
3731 				start_vpfn, last_vpfn);
3732 			return NOTIFY_BAD;
3733 		}
3734 		break;
3735 
3736 	case MEM_OFFLINE:
3737 	case MEM_CANCEL_ONLINE:
3738 		{
3739 			struct dmar_drhd_unit *drhd;
3740 			struct intel_iommu *iommu;
3741 			LIST_HEAD(freelist);
3742 
3743 			domain_unmap(si_domain, start_vpfn, last_vpfn, &freelist);
3744 
3745 			rcu_read_lock();
3746 			for_each_active_iommu(iommu, drhd)
3747 				iommu_flush_iotlb_psi(iommu, si_domain,
3748 					start_vpfn, mhp->nr_pages,
3749 					list_empty(&freelist), 0);
3750 			rcu_read_unlock();
3751 			put_pages_list(&freelist);
3752 		}
3753 		break;
3754 	}
3755 
3756 	return NOTIFY_OK;
3757 }
3758 
3759 static struct notifier_block intel_iommu_memory_nb = {
3760 	.notifier_call = intel_iommu_memory_notifier,
3761 	.priority = 0
3762 };
3763 
3764 static void intel_disable_iommus(void)
3765 {
3766 	struct intel_iommu *iommu = NULL;
3767 	struct dmar_drhd_unit *drhd;
3768 
3769 	for_each_iommu(iommu, drhd)
3770 		iommu_disable_translation(iommu);
3771 }
3772 
3773 void intel_iommu_shutdown(void)
3774 {
3775 	struct dmar_drhd_unit *drhd;
3776 	struct intel_iommu *iommu = NULL;
3777 
3778 	if (no_iommu || dmar_disabled)
3779 		return;
3780 
3781 	down_write(&dmar_global_lock);
3782 
3783 	/* Disable PMRs explicitly here. */
3784 	for_each_iommu(iommu, drhd)
3785 		iommu_disable_protect_mem_regions(iommu);
3786 
3787 	/* Make sure the IOMMUs are switched off */
3788 	intel_disable_iommus();
3789 
3790 	up_write(&dmar_global_lock);
3791 }
3792 
3793 static inline struct intel_iommu *dev_to_intel_iommu(struct device *dev)
3794 {
3795 	struct iommu_device *iommu_dev = dev_to_iommu_device(dev);
3796 
3797 	return container_of(iommu_dev, struct intel_iommu, iommu);
3798 }
3799 
3800 static ssize_t version_show(struct device *dev,
3801 			    struct device_attribute *attr, char *buf)
3802 {
3803 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3804 	u32 ver = readl(iommu->reg + DMAR_VER_REG);
3805 	return sprintf(buf, "%d:%d\n",
3806 		       DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver));
3807 }
3808 static DEVICE_ATTR_RO(version);
3809 
3810 static ssize_t address_show(struct device *dev,
3811 			    struct device_attribute *attr, char *buf)
3812 {
3813 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3814 	return sprintf(buf, "%llx\n", iommu->reg_phys);
3815 }
3816 static DEVICE_ATTR_RO(address);
3817 
3818 static ssize_t cap_show(struct device *dev,
3819 			struct device_attribute *attr, char *buf)
3820 {
3821 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3822 	return sprintf(buf, "%llx\n", iommu->cap);
3823 }
3824 static DEVICE_ATTR_RO(cap);
3825 
3826 static ssize_t ecap_show(struct device *dev,
3827 			 struct device_attribute *attr, char *buf)
3828 {
3829 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3830 	return sprintf(buf, "%llx\n", iommu->ecap);
3831 }
3832 static DEVICE_ATTR_RO(ecap);
3833 
3834 static ssize_t domains_supported_show(struct device *dev,
3835 				      struct device_attribute *attr, char *buf)
3836 {
3837 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3838 	return sprintf(buf, "%ld\n", cap_ndoms(iommu->cap));
3839 }
3840 static DEVICE_ATTR_RO(domains_supported);
3841 
3842 static ssize_t domains_used_show(struct device *dev,
3843 				 struct device_attribute *attr, char *buf)
3844 {
3845 	struct intel_iommu *iommu = dev_to_intel_iommu(dev);
3846 	return sprintf(buf, "%d\n", bitmap_weight(iommu->domain_ids,
3847 						  cap_ndoms(iommu->cap)));
3848 }
3849 static DEVICE_ATTR_RO(domains_used);
3850 
3851 static struct attribute *intel_iommu_attrs[] = {
3852 	&dev_attr_version.attr,
3853 	&dev_attr_address.attr,
3854 	&dev_attr_cap.attr,
3855 	&dev_attr_ecap.attr,
3856 	&dev_attr_domains_supported.attr,
3857 	&dev_attr_domains_used.attr,
3858 	NULL,
3859 };
3860 
3861 static struct attribute_group intel_iommu_group = {
3862 	.name = "intel-iommu",
3863 	.attrs = intel_iommu_attrs,
3864 };
3865 
3866 const struct attribute_group *intel_iommu_groups[] = {
3867 	&intel_iommu_group,
3868 	NULL,
3869 };
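/*
 * The attribute group above is registered for each DMAR unit via
 * iommu_device_sysfs_add() in intel_iommu_init(). On a typical system
 * the files show up under the iommu class, e.g. (paths illustrative):
 *
 *   /sys/class/iommu/dmar0/intel-iommu/version
 *   /sys/class/iommu/dmar0/intel-iommu/cap
 *   /sys/class/iommu/dmar0/intel-iommu/domains_used
 */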
3870 
3871 static inline bool has_external_pci(void)
3872 {
3873 	struct pci_dev *pdev = NULL;
3874 
3875 	for_each_pci_dev(pdev)
3876 		if (pdev->external_facing) {
3877 			pci_dev_put(pdev);
3878 			return true;
3879 		}
3880 
3881 	return false;
3882 }
3883 
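/*
 * Enforce IOMMU usage when the firmware's DMAR table sets the platform
 * opt-in flag and at least one external-facing PCI port is present,
 * even if the IOMMU was disabled on the kernel command line. Returns 1
 * when the IOMMU is forced on, 0 otherwise.
 */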
3884 static int __init platform_optin_force_iommu(void)
3885 {
3886 	if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
3887 		return 0;
3888 
3889 	if (no_iommu || dmar_disabled)
3890 		pr_info("Intel-IOMMU force enabled due to platform opt in\n");
3891 
3892 	/*
3893 	 * If Intel-IOMMU is disabled by default, we will apply identity
3894 	 * map for all devices except those marked as being untrusted.
3895 	 */
3896 	if (dmar_disabled)
3897 		iommu_set_default_passthrough(false);
3898 
3899 	dmar_disabled = 0;
3900 	no_iommu = 0;
3901 
3902 	return 1;
3903 }
3904 
3905 static int __init probe_acpi_namespace_devices(void)
3906 {
3907 	struct dmar_drhd_unit *drhd;
3908 	/* To avoid a -Wunused-but-set-variable warning. */
3909 	struct intel_iommu *iommu __maybe_unused;
3910 	struct device *dev;
3911 	int i, ret = 0;
3912 
3913 	for_each_active_iommu(iommu, drhd) {
3914 		for_each_active_dev_scope(drhd->devices,
3915 					  drhd->devices_cnt, i, dev) {
3916 			struct acpi_device_physical_node *pn;
3917 			struct iommu_group *group;
3918 			struct acpi_device *adev;
3919 
3920 			if (dev->bus != &acpi_bus_type)
3921 				continue;
3922 
3923 			adev = to_acpi_device(dev);
3924 			mutex_lock(&adev->physical_node_lock);
3925 			list_for_each_entry(pn,
3926 					    &adev->physical_node_list, node) {
3927 				group = iommu_group_get(pn->dev);
3928 				if (group) {
3929 					iommu_group_put(group);
3930 					continue;
3931 				}
3932 
3933 				ret = iommu_probe_device(pn->dev);
3934 				if (ret)
3935 					break;
3936 			}
3937 			mutex_unlock(&adev->physical_node_lock);
3938 
3939 			if (ret)
3940 				return ret;
3941 		}
3942 	}
3943 
3944 	return 0;
3945 }
3946 
3947 static __init int tboot_force_iommu(void)
3948 {
3949 	if (!tboot_enabled())
3950 		return 0;
3951 
3952 	if (no_iommu || dmar_disabled)
3953 		pr_warn("Forcing Intel-IOMMU to enabled\n");
3954 
3955 	dmar_disabled = 0;
3956 	no_iommu = 0;
3957 
3958 	return 1;
3959 }
3960 
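/*
 * Boot-time entry point: parse the DMAR table and device scopes, set up
 * all DMAR units through init_dmars(), register each IOMMU with the
 * IOMMU core and sysfs, wire up memory-hotplug and suspend/resume
 * handling, and finally enable DMA remapping on every unit that is
 * neither ignored nor already pre-enabled.
 */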
3961 int __init intel_iommu_init(void)
3962 {
3963 	int ret = -ENODEV;
3964 	struct dmar_drhd_unit *drhd;
3965 	struct intel_iommu *iommu;
3966 
3967 	/*
3968 	 * Intel IOMMU is required for a TXT/tboot launch or platform
3969 	 * opt in, so enforce that.
3970 	 */
3971 	force_on = (!intel_iommu_tboot_noforce && tboot_force_iommu()) ||
3972 		    platform_optin_force_iommu();
3973 
3974 	down_write(&dmar_global_lock);
3975 	if (dmar_table_init()) {
3976 		if (force_on)
3977 			panic("tboot: Failed to initialize DMAR table\n");
3978 		goto out_free_dmar;
3979 	}
3980 
3981 	if (dmar_dev_scope_init() < 0) {
3982 		if (force_on)
3983 			panic("tboot: Failed to initialize DMAR device scope\n");
3984 		goto out_free_dmar;
3985 	}
3986 
3987 	up_write(&dmar_global_lock);
3988 
3989 	/*
3990 	 * The bus notifier takes the dmar_global_lock, so lockdep will
3991 	 * complain later when we register it under the lock.
3992 	 */
3993 	dmar_register_bus_notifier();
3994 
3995 	down_write(&dmar_global_lock);
3996 
3997 	if (!no_iommu)
3998 		intel_iommu_debugfs_init();
3999 
4000 	if (no_iommu || dmar_disabled) {
4001 		/*
4002 		 * We exit the function here to ensure the IOMMU's remapping and
4003 		 * mempool aren't set up, which means that the IOMMU's PMRs
4004 		 * won't be disabled via the call to init_dmars(). So disable
4005 		 * them explicitly here. The PMRs were set up by tboot prior to
4006 		 * calling SENTER, but the kernel is expected to reset/tear
4007 		 * them down.
4008 		 */
4009 		if (intel_iommu_tboot_noforce) {
4010 			for_each_iommu(iommu, drhd)
4011 				iommu_disable_protect_mem_regions(iommu);
4012 		}
4013 
4014 		/*
4015 		 * Make sure the IOMMUs are switched off, even when we
4016 		 * boot into a kexec kernel and the previous kernel left
4017 		 * them enabled
4018 		 */
4019 		intel_disable_iommus();
4020 		goto out_free_dmar;
4021 	}
4022 
4023 	if (list_empty(&dmar_rmrr_units))
4024 		pr_info("No RMRR found\n");
4025 
4026 	if (list_empty(&dmar_atsr_units))
4027 		pr_info("No ATSR found\n");
4028 
4029 	if (list_empty(&dmar_satc_units))
4030 		pr_info("No SATC found\n");
4031 
4032 	init_no_remapping_devices();
4033 
4034 	ret = init_dmars();
4035 	if (ret) {
4036 		if (force_on)
4037 			panic("tboot: Failed to initialize DMARs\n");
4038 		pr_err("Initialization failed\n");
4039 		goto out_free_dmar;
4040 	}
4041 	up_write(&dmar_global_lock);
4042 
4043 	init_iommu_pm_ops();
4044 
4045 	down_read(&dmar_global_lock);
4046 	for_each_active_iommu(iommu, drhd) {
4047 		/*
4048 		 * The flush queue implementation does not perform
4049 		 * page-selective invalidations that are required for efficient
4050 		 * TLB flushes in virtual environments.  The benefit of batching
4051 		 * is likely to be much lower than the overhead of synchronizing
4052 		 * the virtual and physical IOMMU page-tables.
4053 		 */
4054 		if (cap_caching_mode(iommu->cap)) {
4055 			pr_info_once("IOMMU batching disallowed due to virtualization\n");
4056 			iommu_set_dma_strict();
4057 		}
4058 		iommu_device_sysfs_add(&iommu->iommu, NULL,
4059 				       intel_iommu_groups,
4060 				       "%s", iommu->name);
4061 		iommu_device_register(&iommu->iommu, &intel_iommu_ops, NULL);
4062 	}
4063 	up_read(&dmar_global_lock);
4064 
4065 	if (si_domain && !hw_pass_through)
4066 		register_memory_notifier(&intel_iommu_memory_nb);
4067 
4068 	down_read(&dmar_global_lock);
4069 	if (probe_acpi_namespace_devices())
4070 		pr_warn("ACPI name space devices didn't probe correctly\n");
4071 
4072 	/* Finally, we enable the DMA remapping hardware. */
4073 	for_each_iommu(iommu, drhd) {
4074 		if (!drhd->ignored && !translation_pre_enabled(iommu))
4075 			iommu_enable_translation(iommu);
4076 
4077 		iommu_disable_protect_mem_regions(iommu);
4078 	}
4079 	up_read(&dmar_global_lock);
4080 
4081 	pr_info("Intel(R) Virtualization Technology for Directed I/O\n");
4082 
4083 	intel_iommu_enabled = 1;
4084 
4085 	return 0;
4086 
4087 out_free_dmar:
4088 	intel_iommu_free_dmars();
4089 	up_write(&dmar_global_lock);
4090 	return ret;
4091 }
4092 
4093 static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *opaque)
4094 {
4095 	struct device_domain_info *info = opaque;
4096 
4097 	domain_context_clear_one(info, PCI_BUS_NUM(alias), alias & 0xff);
4098 	return 0;
4099 }
4100 
4101 /*
4102  * NB - intel-iommu lacks any sort of reference counting for the users of
4103  * dependent devices.  If multiple endpoints have intersecting dependent
4104  * devices, unbinding the driver from any one of them will possibly leave
4105  * the others unable to operate.
4106  */
4107 static void domain_context_clear(struct device_domain_info *info)
4108 {
4109 	if (!info->iommu || !info->dev || !dev_is_pci(info->dev))
4110 		return;
4111 
4112 	pci_for_each_dma_alias(to_pci_dev(info->dev),
4113 			       &domain_context_clear_one_cb, info);
4114 }
4115 
4116 static void dmar_remove_one_dev_info(struct device *dev)
4117 {
4118 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4119 	struct dmar_domain *domain = info->domain;
4120 	struct intel_iommu *iommu = info->iommu;
4121 	unsigned long flags;
4122 
4123 	if (!dev_is_real_dma_subdevice(info->dev)) {
4124 		if (dev_is_pci(info->dev) && sm_supported(iommu))
4125 			intel_pasid_tear_down_entry(iommu, info->dev,
4126 					PASID_RID2PASID, false);
4127 
4128 		iommu_disable_dev_iotlb(info);
4129 		domain_context_clear(info);
4130 		intel_pasid_free_table(info->dev);
4131 	}
4132 
4133 	spin_lock_irqsave(&domain->lock, flags);
4134 	list_del(&info->link);
4135 	spin_unlock_irqrestore(&domain->lock, flags);
4136 
4137 	domain_detach_iommu(domain, iommu);
4138 	info->domain = NULL;
4139 }
4140 
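/*
 * Initialize a freshly allocated dmar_domain for use via the IOMMU core:
 * derive the AGAW from the requested guest address width, reset the
 * per-domain attributes and allocate the top-level page-table page.
 */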
4141 static int md_domain_init(struct dmar_domain *domain, int guest_width)
4142 {
4143 	int adjust_width;
4144 
4145 	/* calculate AGAW */
4146 	domain->gaw = guest_width;
4147 	adjust_width = guestwidth_to_adjustwidth(guest_width);
4148 	domain->agaw = width_to_agaw(adjust_width);
4149 
4150 	domain->iommu_coherency = false;
4151 	domain->iommu_superpage = 0;
4152 	domain->max_addr = 0;
4153 
4154 	/* always allocate the top pgd */
4155 	domain->pgd = alloc_pgtable_page(domain->nid);
4156 	if (!domain->pgd)
4157 		return -ENOMEM;
4158 	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
4159 	return 0;
4160 }
4161 
4162 static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
4163 {
4164 	struct dmar_domain *dmar_domain;
4165 	struct iommu_domain *domain;
4166 
4167 	switch (type) {
4168 	case IOMMU_DOMAIN_DMA:
4169 	case IOMMU_DOMAIN_DMA_FQ:
4170 	case IOMMU_DOMAIN_UNMANAGED:
4171 		dmar_domain = alloc_domain(type);
4172 		if (!dmar_domain) {
4173 			pr_err("Can't allocate dmar_domain\n");
4174 			return NULL;
4175 		}
4176 		if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
4177 			pr_err("Domain initialization failed\n");
4178 			domain_exit(dmar_domain);
4179 			return NULL;
4180 		}
4181 
4182 		domain = &dmar_domain->domain;
4183 		domain->geometry.aperture_start = 0;
4184 		domain->geometry.aperture_end   =
4185 				__DOMAIN_MAX_ADDR(dmar_domain->gaw);
4186 		domain->geometry.force_aperture = true;
4187 
4188 		return domain;
4189 	case IOMMU_DOMAIN_IDENTITY:
4190 		return &si_domain->domain;
4191 	default:
4192 		return NULL;
4193 	}
4194 
4195 	return NULL;
4196 }
4197 
4198 static void intel_iommu_domain_free(struct iommu_domain *domain)
4199 {
4200 	if (domain != &si_domain->domain)
4201 		domain_exit(to_dmar_domain(domain));
4202 }
4203 
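/*
 * Validate that @dev's IOMMU can back @domain before an attach: snoop
 * control must be available if the domain already enforces snooping,
 * and the IOMMU's address width must cover the domain's highest mapped
 * address. Extra top-level page-table levels are stripped when the
 * IOMMU supports a smaller AGAW than the domain was created with.
 */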
4204 static int prepare_domain_attach_device(struct iommu_domain *domain,
4205 					struct device *dev)
4206 {
4207 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4208 	struct intel_iommu *iommu;
4209 	int addr_width;
4210 
4211 	iommu = device_to_iommu(dev, NULL, NULL);
4212 	if (!iommu)
4213 		return -ENODEV;
4214 
4215 	if (dmar_domain->force_snooping && !ecap_sc_support(iommu->ecap))
4216 		return -EOPNOTSUPP;
4217 
4218 	/* check if this iommu agaw is sufficient for max mapped address */
4219 	addr_width = agaw_to_width(iommu->agaw);
4220 	if (addr_width > cap_mgaw(iommu->cap))
4221 		addr_width = cap_mgaw(iommu->cap);
4222 
4223 	if (dmar_domain->max_addr > (1LL << addr_width)) {
4224 		dev_err(dev, "%s: iommu width (%d) is not "
4225 		        "sufficient for the mapped address (%llx)\n",
4226 		        __func__, addr_width, dmar_domain->max_addr);
4227 		return -EFAULT;
4228 	}
4229 	dmar_domain->gaw = addr_width;
4230 
4231 	/*
4232 	 * Knock out extra levels of page tables if necessary
4233 	 */
4234 	while (iommu->agaw < dmar_domain->agaw) {
4235 		struct dma_pte *pte;
4236 
4237 		pte = dmar_domain->pgd;
4238 		if (dma_pte_present(pte)) {
4239 			dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
4240 			free_pgtable_page(pte);
4241 		}
4242 		dmar_domain->agaw--;
4243 	}
4244 
4245 	return 0;
4246 }
4247 
4248 static int intel_iommu_attach_device(struct iommu_domain *domain,
4249 				     struct device *dev)
4250 {
4251 	int ret;
4252 
4253 	if (domain->type == IOMMU_DOMAIN_UNMANAGED &&
4254 	    device_is_rmrr_locked(dev)) {
4255 		dev_warn(dev, "Device is ineligible for IOMMU domain attach due to platform RMRR requirement.  Contact your platform vendor.\n");
4256 		return -EPERM;
4257 	}
4258 
4259 	/* normally dev is not mapped */
4260 	if (unlikely(domain_context_mapped(dev))) {
4261 		struct device_domain_info *info = dev_iommu_priv_get(dev);
4262 
4263 		if (info->domain)
4264 			dmar_remove_one_dev_info(dev);
4265 	}
4266 
4267 	ret = prepare_domain_attach_device(domain, dev);
4268 	if (ret)
4269 		return ret;
4270 
4271 	return domain_add_dev_info(to_dmar_domain(domain), dev);
4272 }
4273 
4274 static void intel_iommu_detach_device(struct iommu_domain *domain,
4275 				      struct device *dev)
4276 {
4277 	dmar_remove_one_dev_info(dev);
4278 }
4279 
4280 static int intel_iommu_map(struct iommu_domain *domain,
4281 			   unsigned long iova, phys_addr_t hpa,
4282 			   size_t size, int iommu_prot, gfp_t gfp)
4283 {
4284 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4285 	u64 max_addr;
4286 	int prot = 0;
4287 
4288 	if (iommu_prot & IOMMU_READ)
4289 		prot |= DMA_PTE_READ;
4290 	if (iommu_prot & IOMMU_WRITE)
4291 		prot |= DMA_PTE_WRITE;
4292 	if (dmar_domain->set_pte_snp)
4293 		prot |= DMA_PTE_SNP;
4294 
4295 	max_addr = iova + size;
4296 	if (dmar_domain->max_addr < max_addr) {
4297 		u64 end;
4298 
4299 		/* check if minimum agaw is sufficient for mapped address */
4300 		end = __DOMAIN_MAX_ADDR(dmar_domain->gaw) + 1;
4301 		if (end < max_addr) {
4302 			pr_err("%s: iommu width (%d) is not "
4303 			       "sufficient for the mapped address (%llx)\n",
4304 			       __func__, dmar_domain->gaw, max_addr);
4305 			return -EFAULT;
4306 		}
4307 		dmar_domain->max_addr = max_addr;
4308 	}
4309 	/* Round size up to the next multiple of PAGE_SIZE if it, together
4310 	   with the low bits of hpa, would take us onto the next page. */
4311 	size = aligned_nrpages(hpa, size);
4312 	return __domain_mapping(dmar_domain, iova >> VTD_PAGE_SHIFT,
4313 				hpa >> VTD_PAGE_SHIFT, size, prot);
4314 }
4315 
4316 static int intel_iommu_map_pages(struct iommu_domain *domain,
4317 				 unsigned long iova, phys_addr_t paddr,
4318 				 size_t pgsize, size_t pgcount,
4319 				 int prot, gfp_t gfp, size_t *mapped)
4320 {
4321 	unsigned long pgshift = __ffs(pgsize);
4322 	size_t size = pgcount << pgshift;
4323 	int ret;
4324 
4325 	if (pgsize != SZ_4K && pgsize != SZ_2M && pgsize != SZ_1G)
4326 		return -EINVAL;
4327 
4328 	if (!IS_ALIGNED(iova | paddr, pgsize))
4329 		return -EINVAL;
4330 
4331 	ret = intel_iommu_map(domain, iova, paddr, size, prot, gfp);
4332 	if (!ret && mapped)
4333 		*mapped = size;
4334 
4335 	return ret;
4336 }
4337 
4338 static size_t intel_iommu_unmap(struct iommu_domain *domain,
4339 				unsigned long iova, size_t size,
4340 				struct iommu_iotlb_gather *gather)
4341 {
4342 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4343 	unsigned long start_pfn, last_pfn;
4344 	int level = 0;
4345 
4346 	/* Cope with the horrid API, which requires us to unmap more than the
4347 	   size argument if the IOVA happens to be part of a large-page mapping. */
4348 	BUG_ON(!pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level));
4349 
4350 	if (size < VTD_PAGE_SIZE << level_to_offset_bits(level))
4351 		size = VTD_PAGE_SIZE << level_to_offset_bits(level);
4352 
4353 	start_pfn = iova >> VTD_PAGE_SHIFT;
4354 	last_pfn = (iova + size - 1) >> VTD_PAGE_SHIFT;
4355 
4356 	domain_unmap(dmar_domain, start_pfn, last_pfn, &gather->freelist);
4357 
4358 	if (dmar_domain->max_addr == iova + size)
4359 		dmar_domain->max_addr = iova;
4360 
4361 	iommu_iotlb_gather_add_page(domain, gather, iova, size);
4362 
4363 	return size;
4364 }
4365 
4366 static size_t intel_iommu_unmap_pages(struct iommu_domain *domain,
4367 				      unsigned long iova,
4368 				      size_t pgsize, size_t pgcount,
4369 				      struct iommu_iotlb_gather *gather)
4370 {
4371 	unsigned long pgshift = __ffs(pgsize);
4372 	size_t size = pgcount << pgshift;
4373 
4374 	return intel_iommu_unmap(domain, iova, size, gather);
4375 }
4376 
4377 static void intel_iommu_tlb_sync(struct iommu_domain *domain,
4378 				 struct iommu_iotlb_gather *gather)
4379 {
4380 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4381 	unsigned long iova_pfn = IOVA_PFN(gather->start);
4382 	size_t size = gather->end - gather->start;
4383 	struct iommu_domain_info *info;
4384 	unsigned long start_pfn;
4385 	unsigned long nrpages;
4386 	unsigned long i;
4387 
4388 	nrpages = aligned_nrpages(gather->start, size);
4389 	start_pfn = mm_to_dma_pfn(iova_pfn);
4390 
4391 	xa_for_each(&dmar_domain->iommu_array, i, info)
4392 		iommu_flush_iotlb_psi(info->iommu, dmar_domain,
4393 				      start_pfn, nrpages,
4394 				      list_empty(&gather->freelist), 0);
4395 
4396 	put_pages_list(&gather->freelist);
4397 }
4398 
4399 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
4400 					    dma_addr_t iova)
4401 {
4402 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4403 	struct dma_pte *pte;
4404 	int level = 0;
4405 	u64 phys = 0;
4406 
4407 	pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, &level);
4408 	if (pte && dma_pte_present(pte))
4409 		phys = dma_pte_addr(pte) +
4410 			(iova & (BIT_MASK(level_to_offset_bits(level) +
4411 						VTD_PAGE_SHIFT) - 1));
4412 
4413 	return phys;
4414 }
4415 
4416 static bool domain_support_force_snooping(struct dmar_domain *domain)
4417 {
4418 	struct device_domain_info *info;
4419 	bool support = true;
4420 
4421 	assert_spin_locked(&domain->lock);
4422 	list_for_each_entry(info, &domain->devices, link) {
4423 		if (!ecap_sc_support(info->iommu->ecap)) {
4424 			support = false;
4425 			break;
4426 		}
4427 	}
4428 
4429 	return support;
4430 }
4431 
4432 static void domain_set_force_snooping(struct dmar_domain *domain)
4433 {
4434 	struct device_domain_info *info;
4435 
4436 	assert_spin_locked(&domain->lock);
4437 	/*
4438 	 * Second level page table supports per-PTE snoop control. The
4439 	 * iommu_map() interface will handle this by setting SNP bit.
4440 	 */
4441 	if (!domain_use_first_level(domain)) {
4442 		domain->set_pte_snp = true;
4443 		return;
4444 	}
4445 
4446 	list_for_each_entry(info, &domain->devices, link)
4447 		intel_pasid_setup_page_snoop_control(info->iommu, info->dev,
4448 						     PASID_RID2PASID);
4449 }
4450 
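/*
 * Force DMA snooping for every mapping in the domain. This only
 * succeeds if all IOMMUs that have devices in the domain support snoop
 * control; once force_snooping is set, later calls return true
 * immediately.
 */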
4451 static bool intel_iommu_enforce_cache_coherency(struct iommu_domain *domain)
4452 {
4453 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4454 	unsigned long flags;
4455 
4456 	if (dmar_domain->force_snooping)
4457 		return true;
4458 
4459 	spin_lock_irqsave(&dmar_domain->lock, flags);
4460 	if (!domain_support_force_snooping(dmar_domain)) {
4461 		spin_unlock_irqrestore(&dmar_domain->lock, flags);
4462 		return false;
4463 	}
4464 
4465 	domain_set_force_snooping(dmar_domain);
4466 	dmar_domain->force_snooping = true;
4467 	spin_unlock_irqrestore(&dmar_domain->lock, flags);
4468 
4469 	return true;
4470 }
4471 
4472 static bool intel_iommu_capable(struct device *dev, enum iommu_cap cap)
4473 {
4474 	if (cap == IOMMU_CAP_CACHE_COHERENCY)
4475 		return true;
4476 	if (cap == IOMMU_CAP_INTR_REMAP)
4477 		return irq_remapping_enabled == 1;
4478 	if (cap == IOMMU_CAP_PRE_BOOT_PROTECTION)
4479 		return dmar_platform_optin();
4480 
4481 	return false;
4482 }
4483 
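/*
 * Per-device probe callback: allocate the device_domain_info, record
 * the owning IOMMU and source id (segment/bus/devfn), and detect ATS,
 * PASID and PRI capabilities for later use.
 */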
4484 static struct iommu_device *intel_iommu_probe_device(struct device *dev)
4485 {
4486 	struct pci_dev *pdev = dev_is_pci(dev) ? to_pci_dev(dev) : NULL;
4487 	struct device_domain_info *info;
4488 	struct intel_iommu *iommu;
4489 	u8 bus, devfn;
4490 
4491 	iommu = device_to_iommu(dev, &bus, &devfn);
4492 	if (!iommu || !iommu->iommu.ops)
4493 		return ERR_PTR(-ENODEV);
4494 
4495 	info = kzalloc(sizeof(*info), GFP_KERNEL);
4496 	if (!info)
4497 		return ERR_PTR(-ENOMEM);
4498 
4499 	if (dev_is_real_dma_subdevice(dev)) {
4500 		info->bus = pdev->bus->number;
4501 		info->devfn = pdev->devfn;
4502 		info->segment = pci_domain_nr(pdev->bus);
4503 	} else {
4504 		info->bus = bus;
4505 		info->devfn = devfn;
4506 		info->segment = iommu->segment;
4507 	}
4508 
4509 	info->dev = dev;
4510 	info->iommu = iommu;
4511 	if (dev_is_pci(dev)) {
4512 		if (ecap_dev_iotlb_support(iommu->ecap) &&
4513 		    pci_ats_supported(pdev) &&
4514 		    dmar_ats_supported(pdev, iommu)) {
4515 			info->ats_supported = 1;
4516 			info->dtlb_extra_inval = dev_needs_extra_dtlb_flush(pdev);
4517 		}
4518 		if (sm_supported(iommu)) {
4519 			if (pasid_supported(iommu)) {
4520 				int features = pci_pasid_features(pdev);
4521 
4522 				if (features >= 0)
4523 					info->pasid_supported = features | 1;
4524 			}
4525 
4526 			if (info->ats_supported && ecap_prs(iommu->ecap) &&
4527 			    pci_pri_supported(pdev))
4528 				info->pri_supported = 1;
4529 		}
4530 	}
4531 
4532 	dev_iommu_priv_set(dev, info);
4533 
4534 	return &iommu->iommu;
4535 }
4536 
4537 static void intel_iommu_release_device(struct device *dev)
4538 {
4539 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4540 
4541 	dmar_remove_one_dev_info(dev);
4542 	dev_iommu_priv_set(dev, NULL);
4543 	kfree(info);
4544 	set_dma_ops(dev, NULL);
4545 }
4546 
4547 static void intel_iommu_probe_finalize(struct device *dev)
4548 {
4549 	set_dma_ops(dev, NULL);
4550 	iommu_setup_dma_ops(dev, 0, U64_MAX);
4551 }
4552 
4553 static void intel_iommu_get_resv_regions(struct device *device,
4554 					 struct list_head *head)
4555 {
4556 	int prot = DMA_PTE_READ | DMA_PTE_WRITE;
4557 	struct iommu_resv_region *reg;
4558 	struct dmar_rmrr_unit *rmrr;
4559 	struct device *i_dev;
4560 	int i;
4561 
4562 	rcu_read_lock();
4563 	for_each_rmrr_units(rmrr) {
4564 		for_each_active_dev_scope(rmrr->devices, rmrr->devices_cnt,
4565 					  i, i_dev) {
4566 			struct iommu_resv_region *resv;
4567 			enum iommu_resv_type type;
4568 			size_t length;
4569 
4570 			if (i_dev != device &&
4571 			    !is_downstream_to_pci_bridge(device, i_dev))
4572 				continue;
4573 
4574 			length = rmrr->end_address - rmrr->base_address + 1;
4575 
4576 			type = device_rmrr_is_relaxable(device) ?
4577 				IOMMU_RESV_DIRECT_RELAXABLE : IOMMU_RESV_DIRECT;
4578 
4579 			resv = iommu_alloc_resv_region(rmrr->base_address,
4580 						       length, prot, type,
4581 						       GFP_ATOMIC);
4582 			if (!resv)
4583 				break;
4584 
4585 			list_add_tail(&resv->list, head);
4586 		}
4587 	}
4588 	rcu_read_unlock();
4589 
4590 #ifdef CONFIG_INTEL_IOMMU_FLOPPY_WA
4591 	if (dev_is_pci(device)) {
4592 		struct pci_dev *pdev = to_pci_dev(device);
4593 
4594 		if ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) {
4595 			reg = iommu_alloc_resv_region(0, 1UL << 24, prot,
4596 					IOMMU_RESV_DIRECT_RELAXABLE,
4597 					GFP_KERNEL);
4598 			if (reg)
4599 				list_add_tail(&reg->list, head);
4600 		}
4601 	}
4602 #endif /* CONFIG_INTEL_IOMMU_FLOPPY_WA */
4603 
4604 	reg = iommu_alloc_resv_region(IOAPIC_RANGE_START,
4605 				      IOAPIC_RANGE_END - IOAPIC_RANGE_START + 1,
4606 				      0, IOMMU_RESV_MSI, GFP_KERNEL);
4607 	if (!reg)
4608 		return;
4609 	list_add_tail(&reg->list, head);
4610 }
4611 
4612 static struct iommu_group *intel_iommu_device_group(struct device *dev)
4613 {
4614 	if (dev_is_pci(dev))
4615 		return pci_device_group(dev);
4616 	return generic_device_group(dev);
4617 }
4618 
4619 static int intel_iommu_enable_sva(struct device *dev)
4620 {
4621 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4622 	struct intel_iommu *iommu;
4623 	int ret;
4624 
4625 	if (!info || dmar_disabled)
4626 		return -EINVAL;
4627 
4628 	iommu = info->iommu;
4629 	if (!iommu)
4630 		return -EINVAL;
4631 
4632 	if (!(iommu->flags & VTD_FLAG_SVM_CAPABLE))
4633 		return -ENODEV;
4634 
4635 	if (!info->pasid_enabled || !info->pri_enabled || !info->ats_enabled)
4636 		return -EINVAL;
4637 
4638 	ret = iopf_queue_add_device(iommu->iopf_queue, dev);
4639 	if (!ret)
4640 		ret = iommu_register_device_fault_handler(dev, iommu_queue_iopf, dev);
4641 
4642 	return ret;
4643 }
4644 
4645 static int intel_iommu_disable_sva(struct device *dev)
4646 {
4647 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4648 	struct intel_iommu *iommu = info->iommu;
4649 	int ret;
4650 
4651 	ret = iommu_unregister_device_fault_handler(dev);
4652 	if (!ret)
4653 		ret = iopf_queue_remove_device(iommu->iopf_queue, dev);
4654 
4655 	return ret;
4656 }
4657 
4658 static int intel_iommu_enable_iopf(struct device *dev)
4659 {
4660 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4661 
4662 	if (info && info->pri_supported)
4663 		return 0;
4664 
4665 	return -ENODEV;
4666 }
4667 
4668 static int
4669 intel_iommu_dev_enable_feat(struct device *dev, enum iommu_dev_features feat)
4670 {
4671 	switch (feat) {
4672 	case IOMMU_DEV_FEAT_IOPF:
4673 		return intel_iommu_enable_iopf(dev);
4674 
4675 	case IOMMU_DEV_FEAT_SVA:
4676 		return intel_iommu_enable_sva(dev);
4677 
4678 	default:
4679 		return -ENODEV;
4680 	}
4681 }
4682 
4683 static int
4684 intel_iommu_dev_disable_feat(struct device *dev, enum iommu_dev_features feat)
4685 {
4686 	switch (feat) {
4687 	case IOMMU_DEV_FEAT_IOPF:
4688 		return 0;
4689 
4690 	case IOMMU_DEV_FEAT_SVA:
4691 		return intel_iommu_disable_sva(dev);
4692 
4693 	default:
4694 		return -ENODEV;
4695 	}
4696 }
4697 
4698 static bool intel_iommu_is_attach_deferred(struct device *dev)
4699 {
4700 	struct device_domain_info *info = dev_iommu_priv_get(dev);
4701 
4702 	return translation_pre_enabled(info->iommu) && !info->domain;
4703 }
4704 
4705 /*
4706  * Check that the device does not live on an external facing PCI port that is
4707  * marked as untrusted. Such devices should not be able to apply quirks and
4708  * thus not be able to bypass the IOMMU restrictions.
4709  */
4710 static bool risky_device(struct pci_dev *pdev)
4711 {
4712 	if (pdev->untrusted) {
4713 		pci_info(pdev,
4714 			 "Skipping IOMMU quirk for dev [%04X:%04X] on untrusted PCI link\n",
4715 			 pdev->vendor, pdev->device);
4716 		pci_info(pdev, "Please check with your BIOS/Platform vendor about this\n");
4717 		return true;
4718 	}
4719 	return false;
4720 }
4721 
4722 static void intel_iommu_iotlb_sync_map(struct iommu_domain *domain,
4723 				       unsigned long iova, size_t size)
4724 {
4725 	struct dmar_domain *dmar_domain = to_dmar_domain(domain);
4726 	unsigned long pages = aligned_nrpages(iova, size);
4727 	unsigned long pfn = iova >> VTD_PAGE_SHIFT;
4728 	struct iommu_domain_info *info;
4729 	unsigned long i;
4730 
4731 	xa_for_each(&dmar_domain->iommu_array, i, info)
4732 		__mapping_notify_one(info->iommu, dmar_domain, pfn, pages);
4733 }
4734 
4735 const struct iommu_ops intel_iommu_ops = {
4736 	.capable		= intel_iommu_capable,
4737 	.domain_alloc		= intel_iommu_domain_alloc,
4738 	.probe_device		= intel_iommu_probe_device,
4739 	.probe_finalize		= intel_iommu_probe_finalize,
4740 	.release_device		= intel_iommu_release_device,
4741 	.get_resv_regions	= intel_iommu_get_resv_regions,
4742 	.device_group		= intel_iommu_device_group,
4743 	.dev_enable_feat	= intel_iommu_dev_enable_feat,
4744 	.dev_disable_feat	= intel_iommu_dev_disable_feat,
4745 	.is_attach_deferred	= intel_iommu_is_attach_deferred,
4746 	.def_domain_type	= device_def_domain_type,
4747 	.pgsize_bitmap		= SZ_4K,
4748 #ifdef CONFIG_INTEL_IOMMU_SVM
4749 	.sva_bind		= intel_svm_bind,
4750 	.sva_unbind		= intel_svm_unbind,
4751 	.sva_get_pasid		= intel_svm_get_pasid,
4752 	.page_response		= intel_svm_page_response,
4753 #endif
4754 	.default_domain_ops = &(const struct iommu_domain_ops) {
4755 		.attach_dev		= intel_iommu_attach_device,
4756 		.detach_dev		= intel_iommu_detach_device,
4757 		.map_pages		= intel_iommu_map_pages,
4758 		.unmap_pages		= intel_iommu_unmap_pages,
4759 		.iotlb_sync_map		= intel_iommu_iotlb_sync_map,
4760 		.flush_iotlb_all        = intel_flush_iotlb_all,
4761 		.iotlb_sync		= intel_iommu_tlb_sync,
4762 		.iova_to_phys		= intel_iommu_iova_to_phys,
4763 		.free			= intel_iommu_domain_free,
4764 		.enforce_cache_coherency = intel_iommu_enforce_cache_coherency,
4765 	}
4766 };
4767 
4768 static void quirk_iommu_igfx(struct pci_dev *dev)
4769 {
4770 	if (risky_device(dev))
4771 		return;
4772 
4773 	pci_info(dev, "Disabling IOMMU for graphics on this chipset\n");
4774 	dmar_map_gfx = 0;
4775 }
4776 
4777 /* G4x/GM45 integrated gfx dmar support is totally busted. */
4778 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_igfx);
4779 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_igfx);
4780 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_igfx);
4781 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_igfx);
4782 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_igfx);
4783 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_igfx);
4784 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_igfx);
4785 
4786 /* Broadwell igfx malfunctions with dmar */
4787 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1606, quirk_iommu_igfx);
4788 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160B, quirk_iommu_igfx);
4789 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160E, quirk_iommu_igfx);
4790 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1602, quirk_iommu_igfx);
4791 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160A, quirk_iommu_igfx);
4792 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x160D, quirk_iommu_igfx);
4793 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1616, quirk_iommu_igfx);
4794 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161B, quirk_iommu_igfx);
4795 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161E, quirk_iommu_igfx);
4796 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1612, quirk_iommu_igfx);
4797 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161A, quirk_iommu_igfx);
4798 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x161D, quirk_iommu_igfx);
4799 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1626, quirk_iommu_igfx);
4800 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162B, quirk_iommu_igfx);
4801 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162E, quirk_iommu_igfx);
4802 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1622, quirk_iommu_igfx);
4803 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162A, quirk_iommu_igfx);
4804 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x162D, quirk_iommu_igfx);
4805 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1636, quirk_iommu_igfx);
4806 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163B, quirk_iommu_igfx);
4807 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163E, quirk_iommu_igfx);
4808 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x1632, quirk_iommu_igfx);
4809 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163A, quirk_iommu_igfx);
4810 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x163D, quirk_iommu_igfx);
4811 
4812 static void quirk_iommu_rwbf(struct pci_dev *dev)
4813 {
4814 	if (risky_device(dev))
4815 		return;
4816 
4817 	/*
4818 	 * Mobile 4 Series Chipset neglects to set RWBF capability,
4819 	 * but needs it. Same seems to hold for the desktop versions.
4820 	 */
4821 	pci_info(dev, "Forcing write-buffer flush capability\n");
4822 	rwbf_quirk = 1;
4823 }
4824 
4825 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);
4826 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e00, quirk_iommu_rwbf);
4827 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e10, quirk_iommu_rwbf);
4828 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e20, quirk_iommu_rwbf);
4829 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e30, quirk_iommu_rwbf);
4830 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e40, quirk_iommu_rwbf);
4831 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2e90, quirk_iommu_rwbf);
4832 
4833 #define GGC 0x52
4834 #define GGC_MEMORY_SIZE_MASK	(0xf << 8)
4835 #define GGC_MEMORY_SIZE_NONE	(0x0 << 8)
4836 #define GGC_MEMORY_SIZE_1M	(0x1 << 8)
4837 #define GGC_MEMORY_SIZE_2M	(0x3 << 8)
4838 #define GGC_MEMORY_VT_ENABLED	(0x8 << 8)
4839 #define GGC_MEMORY_SIZE_2M_VT	(0x9 << 8)
4840 #define GGC_MEMORY_SIZE_3M_VT	(0xa << 8)
4841 #define GGC_MEMORY_SIZE_4M_VT	(0xb << 8)
4842 
4843 static void quirk_calpella_no_shadow_gtt(struct pci_dev *dev)
4844 {
4845 	unsigned short ggc;
4846 
4847 	if (risky_device(dev))
4848 		return;
4849 
4850 	if (pci_read_config_word(dev, GGC, &ggc))
4851 		return;
4852 
4853 	if (!(ggc & GGC_MEMORY_VT_ENABLED)) {
4854 		pci_info(dev, "BIOS has allocated no shadow GTT; disabling IOMMU for graphics\n");
4855 		dmar_map_gfx = 0;
4856 	} else if (dmar_map_gfx) {
4857 		/* we have to ensure the gfx device is idle before we flush */
4858 		pci_info(dev, "Disabling batched IOTLB flush on Ironlake\n");
4859 		iommu_set_dma_strict();
4860 	}
4861 }
4862 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0040, quirk_calpella_no_shadow_gtt);
4863 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0044, quirk_calpella_no_shadow_gtt);
4864 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x0062, quirk_calpella_no_shadow_gtt);
4865 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x006a, quirk_calpella_no_shadow_gtt);
4866 
4867 static void quirk_igfx_skip_te_disable(struct pci_dev *dev)
4868 {
4869 	unsigned short ver;
4870 
4871 	if (!IS_GFX_DEVICE(dev))
4872 		return;
4873 
4874 	ver = (dev->device >> 8) & 0xff;
4875 	if (ver != 0x45 && ver != 0x46 && ver != 0x4c &&
4876 	    ver != 0x4e && ver != 0x8a && ver != 0x98 &&
4877 	    ver != 0x9a && ver != 0xa7)
4878 		return;
4879 
4880 	if (risky_device(dev))
4881 		return;
4882 
4883 	pci_info(dev, "Skip IOMMU disabling for graphics\n");
4884 	iommu_skip_te_disable = 1;
4885 }
4886 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, quirk_igfx_skip_te_disable);
4887 
4888 /* On Tylersburg chipsets, some BIOSes have been known to enable the
4889    ISOCH DMAR unit for the Azalia sound device, but not give it any
4890    TLB entries, which causes it to deadlock. Check for that.  We do
4891    this in a function called from init_dmars(), instead of in a PCI
4892    quirk, because we don't want to print the obnoxious "BIOS broken"
4893    message if VT-d is actually disabled.
4894 */
4895 static void __init check_tylersburg_isoch(void)
4896 {
4897 	struct pci_dev *pdev;
4898 	uint32_t vtisochctrl;
4899 
4900 	/* If there's no Azalia in the system anyway, forget it. */
4901 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x3a3e, NULL);
4902 	if (!pdev)
4903 		return;
4904 
4905 	if (risky_device(pdev)) {
4906 		pci_dev_put(pdev);
4907 		return;
4908 	}
4909 
4910 	pci_dev_put(pdev);
4911 
4912 	/* System Management Registers. Might be hidden, in which case
4913 	   we can't do the sanity check. But that's OK, because the
4914 	   known-broken BIOSes _don't_ actually hide it, so far. */
4915 	pdev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x342e, NULL);
4916 	if (!pdev)
4917 		return;
4918 
4919 	if (risky_device(pdev)) {
4920 		pci_dev_put(pdev);
4921 		return;
4922 	}
4923 
4924 	if (pci_read_config_dword(pdev, 0x188, &vtisochctrl)) {
4925 		pci_dev_put(pdev);
4926 		return;
4927 	}
4928 
4929 	pci_dev_put(pdev);
4930 
4931 	/* If Azalia DMA is routed to the non-isoch DMAR unit, fine. */
4932 	if (vtisochctrl & 1)
4933 		return;
4934 
4935 	/* Drop all bits other than the number of TLB entries */
4936 	vtisochctrl &= 0x1c;
4937 
4938 	/* If we have the recommended number of TLB entries (16), fine. */
4939 	if (vtisochctrl == 0x10)
4940 		return;
4941 
4942 	/* Zero TLB entries? You get to ride the short bus to school. */
4943 	if (!vtisochctrl) {
4944 		WARN(1, "Your BIOS is broken; DMA routed to ISOCH DMAR unit but no TLB space.\n"
4945 		     "BIOS vendor: %s; Ver: %s; Product Version: %s\n",
4946 		     dmi_get_system_info(DMI_BIOS_VENDOR),
4947 		     dmi_get_system_info(DMI_BIOS_VERSION),
4948 		     dmi_get_system_info(DMI_PRODUCT_VERSION));
4949 		iommu_identity_mapping |= IDENTMAP_AZALIA;
4950 		return;
4951 	}
4952 
4953 	pr_warn("Recommended TLB entries for ISOCH unit is 16; your BIOS set %d\n",
4954 	       vtisochctrl);
4955 }
4956 
4957 /*
4958  * Here we deal with a device TLB defect where the device may inadvertently
4959  * issue an ATS invalidation completion before posted writes that were initiated
4960  * with a translated address and used translations matching the invalidation
4961  * address range, thereby violating the invalidation completion ordering.
4962  * Therefore, any use case that cannot guarantee DMA is stopped before unmap is
4963  * vulnerable to this defect. In other words, any dTLB invalidation that is not
4964  * initiated under the control of the trusted/privileged host device driver must
4965  * use this quirk.
4966  * Device TLBs are invalidated under the following six conditions:
4967  * 1. Device driver does DMA API unmap IOVA
4968  * 2. Device driver unbind a PASID from a process, sva_unbind_device()
4969  * 3. PASID is torn down, after PASID cache is flushed. e.g. process
4970  *    exit_mmap() due to crash
4971  * 4. Under SVA usage, called by mmu_notifier.invalidate_range() where
4972  *    VM has to free pages that were unmapped
4973  * 5. Userspace driver unmaps a DMA buffer
4974  * 6. Cache invalidation in vSVA usage (upcoming)
4975  *
4976  * For #1 and #2, device drivers are responsible for stopping DMA traffic
4977  * before unmap/unbind. For #3, the iommu driver is called via mmu_notifier to
4978  * invalidate the TLB the same way as a normal user unmap, which will use this quirk.
4979  * The dTLB invalidation after PASID cache flush does not need this quirk.
4980  *
4981  * As a reminder, #6 will *NEED* this quirk as we enable nested translation.
4982  */
4983 void quirk_extra_dev_tlb_flush(struct device_domain_info *info,
4984 			       unsigned long address, unsigned long mask,
4985 			       u32 pasid, u16 qdep)
4986 {
4987 	u16 sid;
4988 
4989 	if (likely(!info->dtlb_extra_inval))
4990 		return;
4991 
4992 	sid = PCI_DEVID(info->bus, info->devfn);
4993 	if (pasid == PASID_RID2PASID) {
4994 		qi_flush_dev_iotlb(info->iommu, sid, info->pfsid,
4995 				   qdep, address, mask);
4996 	} else {
4997 		qi_flush_dev_iotlb_pasid(info->iommu, sid, info->pfsid,
4998 					 pasid, qdep, address, mask);
4999 	}
5000 }
5001