diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 17:19:47 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2021-04-28 17:19:47 -0700 |
commit | 238da4d004856ac5f832899f6f3fa27c0102381f (patch) | |
tree | 5f71d17600ee4e16a3176116059a350949f5aa5e /drivers/vfio/pci | |
parent | 35655ceb31b56cd1cb52635a725dfcdb9662d7b7 (diff) | |
parent | adaeb718d46f6b42a3fc1dffd4f946f26b33779a (diff) |
Merge tag 'vfio-v5.13-rc1' of git://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson:
- Embed struct vfio_device into vfio driver structures (Jason
Gunthorpe)
- Make vfio_mdev type safe (Jason Gunthorpe)
- Remove vfio-pci NVLink2 extensions for POWER9 (Christoph Hellwig)
- Update vfio-pci IGD extensions for OpRegion 2.1+ (Fred Gao)
- Various spelling/blank line fixes (Zhen Lei, Zhou Wang, Bhaskar
Chowdhury)
- Simplify unpin_pages error handling (Shenming Lu)
- Fix i915 mdev Kconfig dependency (Arnd Bergmann)
- Remove unused structure member (Keqian Zhu)
* tag 'vfio-v5.13-rc1' of git://github.com/awilliam/linux-vfio: (43 commits)
vfio/gvt: fix DRM_I915_GVT dependency on VFIO_MDEV
vfio/iommu_type1: Remove unused pinned_page_dirty_scope in vfio_iommu
vfio/mdev: Correct the function signatures for the mdev_type_attributes
vfio/mdev: Remove kobj from mdev_parent_ops->create()
vfio/gvt: Use mdev_get_type_group_id()
vfio/gvt: Make DRM_I915_GVT depend on VFIO_MDEV
vfio/mbochs: Use mdev_get_type_group_id()
vfio/mdpy: Use mdev_get_type_group_id()
vfio/mtty: Use mdev_get_type_group_id()
vfio/mdev: Add mdev/mtype_get_type_group_id()
vfio/mdev: Remove duplicate storage of parent in mdev_device
vfio/mdev: Add missing error handling to dev_set_name()
vfio/mdev: Reorganize mdev_device_create()
vfio/mdev: Add missing reference counting to mdev_type
vfio/mdev: Expose mdev_get/put_parent to mdev_private.h
vfio/mdev: Use struct mdev_type in struct mdev_device
vfio/mdev: Simplify driver registration
vfio/mdev: Add missing typesafety around mdev_device
vfio/mdev: Do not allow a mdev_type to have a NULL parent pointer
vfio/mdev: Fix missing static's on MDEV_TYPE_ATTR's
...
Diffstat (limited to 'drivers/vfio/pci')
-rw-r--r-- | drivers/vfio/pci/Kconfig | 6 | ||||
-rw-r--r-- | drivers/vfio/pci/Makefile | 1 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci.c | 274 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_config.c | 2 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_igd.c | 53 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_nvlink2.c | 490 | ||||
-rw-r--r-- | drivers/vfio/pci/vfio_pci_private.h | 15 |
7 files changed, 195 insertions, 646 deletions
diff --git a/drivers/vfio/pci/Kconfig b/drivers/vfio/pci/Kconfig index 4abddbebd4b2..53ce78d7d07b 100644 --- a/drivers/vfio/pci/Kconfig +++ b/drivers/vfio/pci/Kconfig @@ -39,9 +39,3 @@ config VFIO_PCI_IGD and LPC bridge config space. To enable Intel IGD assignment through vfio-pci, say Y. - -config VFIO_PCI_NVLINK2 - def_bool y - depends on VFIO_PCI && PPC_POWERNV && SPAPR_TCE_IOMMU - help - VFIO PCI support for P9 Witherspoon machine with NVIDIA V100 GPUs diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile index eff97a7cd9f1..3ff42093962f 100644 --- a/drivers/vfio/pci/Makefile +++ b/drivers/vfio/pci/Makefile @@ -2,7 +2,6 @@ vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o vfio-pci-$(CONFIG_VFIO_PCI_IGD) += vfio_pci_igd.o -vfio-pci-$(CONFIG_VFIO_PCI_NVLINK2) += vfio_pci_nvlink2.o vfio-pci-$(CONFIG_S390) += vfio_pci_zdev.o obj-$(CONFIG_VFIO_PCI) += vfio-pci.o diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c index 5023e23db3bc..bd7c482c948a 100644 --- a/drivers/vfio/pci/vfio_pci.c +++ b/drivers/vfio/pci/vfio_pci.c @@ -378,7 +378,6 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) if (!vfio_vga_disabled() && vfio_pci_is_vga(pdev)) vdev->has_vga = true; - if (vfio_pci_is_vga(pdev) && pdev->vendor == PCI_VENDOR_ID_INTEL && IS_ENABLED(CONFIG_VFIO_PCI_IGD)) { @@ -389,24 +388,6 @@ static int vfio_pci_enable(struct vfio_pci_device *vdev) } } - if (pdev->vendor == PCI_VENDOR_ID_NVIDIA && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_nvdia_v100_nvlink2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 RAM region\n"); - goto disable_exit; - } - } - - if (pdev->vendor == PCI_VENDOR_ID_IBM && - IS_ENABLED(CONFIG_VFIO_PCI_NVLINK2)) { - ret = vfio_pci_ibm_npu2_init(vdev); - if (ret && ret != -ENODEV) { - pci_warn(pdev, "Failed to setup NVIDIA NV2 ATSD region\n"); - goto disable_exit; - } - } - vfio_pci_probe_mmaps(vdev); return 0; @@ -517,30 +498,29 @@ out: static struct pci_driver vfio_pci_driver; -static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev, - struct vfio_device **pf_dev) +static struct vfio_pci_device *get_pf_vdev(struct vfio_pci_device *vdev) { struct pci_dev *physfn = pci_physfn(vdev->pdev); + struct vfio_device *pf_dev; if (!vdev->pdev->is_virtfn) return NULL; - *pf_dev = vfio_device_get_from_dev(&physfn->dev); - if (!*pf_dev) + pf_dev = vfio_device_get_from_dev(&physfn->dev); + if (!pf_dev) return NULL; if (pci_dev_driver(physfn) != &vfio_pci_driver) { - vfio_device_put(*pf_dev); + vfio_device_put(pf_dev); return NULL; } - return vfio_device_data(*pf_dev); + return container_of(pf_dev, struct vfio_pci_device, vdev); } static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) { - struct vfio_device *pf_dev; - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); + struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); if (!pf_vdev) return; @@ -550,12 +530,13 @@ static void vfio_pci_vf_token_user_add(struct vfio_pci_device *vdev, int val) WARN_ON(pf_vdev->vf_token->users < 0); mutex_unlock(&pf_vdev->vf_token->lock); - vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); } -static void vfio_pci_release(void *device_data) +static void vfio_pci_release(struct vfio_device *core_vdev) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); mutex_lock(&vdev->reflck->lock); @@ -581,9 +562,10 @@ static void vfio_pci_release(void *device_data) module_put(THIS_MODULE); } -static int vfio_pci_open(void *device_data) +static int vfio_pci_open(struct vfio_device *core_vdev) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); int ret = 0; if (!try_module_get(THIS_MODULE)) @@ -792,15 +774,16 @@ int vfio_pci_register_dev_region(struct vfio_pci_device *vdev, } struct vfio_devices { - struct vfio_device **devices; + struct vfio_pci_device **devices; int cur_index; int max_index; }; -static long vfio_pci_ioctl(void *device_data, +static long vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int cmd, unsigned long arg) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); unsigned long minsz; if (cmd == VFIO_DEVICE_GET_INFO) { @@ -1280,9 +1263,7 @@ reset_info_exit: goto hot_reset_release; for (; mem_idx < devs.cur_index; mem_idx++) { - struct vfio_pci_device *tmp; - - tmp = vfio_device_data(devs.devices[mem_idx]); + struct vfio_pci_device *tmp = devs.devices[mem_idx]; ret = down_write_trylock(&tmp->memory_lock); if (!ret) { @@ -1297,17 +1278,13 @@ reset_info_exit: hot_reset_release: for (i = 0; i < devs.cur_index; i++) { - struct vfio_device *device; - struct vfio_pci_device *tmp; - - device = devs.devices[i]; - tmp = vfio_device_data(device); + struct vfio_pci_device *tmp = devs.devices[i]; if (i < mem_idx) up_write(&tmp->memory_lock); else mutex_unlock(&tmp->vma_lock); - vfio_device_put(device); + vfio_device_put(&tmp->vdev); } kfree(devs.devices); @@ -1402,11 +1379,10 @@ hot_reset_release: return -ENOTTY; } -static ssize_t vfio_pci_rw(void *device_data, char __user *buf, +static ssize_t vfio_pci_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - struct vfio_pci_device *vdev = device_data; if (index >= VFIO_PCI_NUM_REGIONS + vdev->num_regions) return -EINVAL; @@ -1434,22 +1410,28 @@ static ssize_t vfio_pci_rw(void *device_data, char __user *buf, return -EINVAL; } -static ssize_t vfio_pci_read(void *device_data, char __user *buf, +static ssize_t vfio_pci_read(struct vfio_device *core_vdev, char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); + if (!count) return 0; - return vfio_pci_rw(device_data, buf, count, ppos, false); + return vfio_pci_rw(vdev, buf, count, ppos, false); } -static ssize_t vfio_pci_write(void *device_data, const char __user *buf, +static ssize_t vfio_pci_write(struct vfio_device *core_vdev, const char __user *buf, size_t count, loff_t *ppos) { + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); + if (!count) return 0; - return vfio_pci_rw(device_data, (char __user *)buf, count, ppos, true); + return vfio_pci_rw(vdev, (char __user *)buf, count, ppos, true); } /* Return 1 on zap and vma_lock acquired, 0 on contention (only with @try) */ @@ -1646,9 +1628,10 @@ static const struct vm_operations_struct vfio_pci_mmap_ops = { .fault = vfio_pci_mmap_fault, }; -static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) +static int vfio_pci_mmap(struct vfio_device *core_vdev, struct vm_area_struct *vma) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); struct pci_dev *pdev = vdev->pdev; unsigned int index; u64 phys_len, req_len, pgoff, req_start; @@ -1716,9 +1699,10 @@ static int vfio_pci_mmap(void *device_data, struct vm_area_struct *vma) return 0; } -static void vfio_pci_request(void *device_data, unsigned int count) +static void vfio_pci_request(struct vfio_device *core_vdev, unsigned int count) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); struct pci_dev *pdev = vdev->pdev; mutex_lock(&vdev->igate); @@ -1769,8 +1753,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, return 0; /* No VF token provided or required */ if (vdev->pdev->is_virtfn) { - struct vfio_device *pf_dev; - struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev, &pf_dev); + struct vfio_pci_device *pf_vdev = get_pf_vdev(vdev); bool match; if (!pf_vdev) { @@ -1783,7 +1766,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, } if (!vf_token) { - vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); pci_info_ratelimited(vdev->pdev, "VF token required to access device\n"); return -EACCES; @@ -1793,7 +1776,7 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, match = uuid_equal(uuid, &pf_vdev->vf_token->uuid); mutex_unlock(&pf_vdev->vf_token->lock); - vfio_device_put(pf_dev); + vfio_device_put(&pf_vdev->vdev); if (!match) { pci_info_ratelimited(vdev->pdev, @@ -1832,9 +1815,10 @@ static int vfio_pci_validate_vf_token(struct vfio_pci_device *vdev, #define VF_TOKEN_ARG "vf_token=" -static int vfio_pci_match(void *device_data, char *buf) +static int vfio_pci_match(struct vfio_device *core_vdev, char *buf) { - struct vfio_pci_device *vdev = device_data; + struct vfio_pci_device *vdev = + container_of(core_vdev, struct vfio_pci_device, vdev); bool vf_token = false; uuid_t uuid; int ret; @@ -1924,6 +1908,68 @@ static int vfio_pci_bus_notifier(struct notifier_block *nb, return 0; } +static int vfio_pci_vf_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!pdev->is_physfn) + return 0; + + vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); + if (!vdev->vf_token) + return -ENOMEM; + + mutex_init(&vdev->vf_token->lock); + uuid_gen(&vdev->vf_token->uuid); + + vdev->nb.notifier_call = vfio_pci_bus_notifier; + ret = bus_register_notifier(&pci_bus_type, &vdev->nb); + if (ret) { + kfree(vdev->vf_token); + return ret; + } + return 0; +} + +static void vfio_pci_vf_uninit(struct vfio_pci_device *vdev) +{ + if (!vdev->vf_token) + return; + + bus_unregister_notifier(&pci_bus_type, &vdev->nb); + WARN_ON(vdev->vf_token->users); + mutex_destroy(&vdev->vf_token->lock); + kfree(vdev->vf_token); +} + +static int vfio_pci_vga_init(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + int ret; + + if (!vfio_pci_is_vga(pdev)) + return 0; + + ret = vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); + if (ret) + return ret; + vga_set_legacy_decoding(pdev, vfio_pci_set_vga_decode(vdev, false)); + return 0; +} + +static void vfio_pci_vga_uninit(struct vfio_pci_device *vdev) +{ + struct pci_dev *pdev = vdev->pdev; + + if (!vfio_pci_is_vga(pdev)) + return; + vga_client_register(pdev, NULL, NULL, NULL); + vga_set_legacy_decoding(pdev, VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | + VGA_RSRC_LEGACY_IO | + VGA_RSRC_LEGACY_MEM); +} + static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct vfio_pci_device *vdev; @@ -1959,6 +2005,7 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto out_group_put; } + vfio_init_group_dev(&vdev->vdev, &pdev->dev, &vfio_pci_ops); vdev->pdev = pdev; vdev->irq_type = VFIO_PCI_NUM_IRQS; mutex_init(&vdev->igate); @@ -1970,35 +2017,15 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&vdev->vma_list); init_rwsem(&vdev->memory_lock); - ret = vfio_add_group_dev(&pdev->dev, &vfio_pci_ops, vdev); + ret = vfio_pci_reflck_attach(vdev); if (ret) goto out_free; - - ret = vfio_pci_reflck_attach(vdev); + ret = vfio_pci_vf_init(vdev); if (ret) - goto out_del_group_dev; - - if (pdev->is_physfn) { - vdev->vf_token = kzalloc(sizeof(*vdev->vf_token), GFP_KERNEL); - if (!vdev->vf_token) { - ret = -ENOMEM; - goto out_reflck; - } - - mutex_init(&vdev->vf_token->lock); - uuid_gen(&vdev->vf_token->uuid); - - vdev->nb.notifier_call = vfio_pci_bus_notifier; - ret = bus_register_notifier(&pci_bus_type, &vdev->nb); - if (ret) - goto out_vf_token; - } - - if (vfio_pci_is_vga(pdev)) { - vga_client_register(pdev, vdev, NULL, vfio_pci_set_vga_decode); - vga_set_legacy_decoding(pdev, - vfio_pci_set_vga_decode(vdev, false)); - } + goto out_reflck; + ret = vfio_pci_vga_init(vdev); + if (ret) + goto out_vf; vfio_pci_probe_power_state(vdev); @@ -2016,15 +2043,21 @@ static int vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) vfio_pci_set_power_state(vdev, PCI_D3hot); } - return ret; + ret = vfio_register_group_dev(&vdev->vdev); + if (ret) + goto out_power; + dev_set_drvdata(&pdev->dev, vdev); + return 0; -out_vf_token: - kfree(vdev->vf_token); +out_power: + if (!disable_idle_d3) + vfio_pci_set_power_state(vdev, PCI_D0); +out_vf: + vfio_pci_vf_uninit(vdev); out_reflck: vfio_pci_reflck_put(vdev->reflck); -out_del_group_dev: - vfio_del_group_dev(&pdev->dev); out_free: + kfree(vdev->pm_save); kfree(vdev); out_group_put: vfio_iommu_group_put(group, &pdev->dev); @@ -2033,41 +2066,25 @@ out_group_put: static void vfio_pci_remove(struct pci_dev *pdev) { - struct vfio_pci_device *vdev; + struct vfio_pci_device *vdev = dev_get_drvdata(&pdev->dev); pci_disable_sriov(pdev); - vdev = vfio_del_group_dev(&pdev->dev); - if (!vdev) - return; - - if (vdev->vf_token) { - WARN_ON(vdev->vf_token->users); - mutex_destroy(&vdev->vf_token->lock); - kfree(vdev->vf_token); - } - - if (vdev->nb.notifier_call) - bus_unregister_notifier(&pci_bus_type, &vdev->nb); + vfio_unregister_group_dev(&vdev->vdev); + vfio_pci_vf_uninit(vdev); vfio_pci_reflck_put(vdev->reflck); + vfio_pci_vga_uninit(vdev); vfio_iommu_group_put(pdev->dev.iommu_group, &pdev->dev); - kfree(vdev->region); - mutex_destroy(&vdev->ioeventfds_lock); if (!disable_idle_d3) vfio_pci_set_power_state(vdev, PCI_D0); + mutex_destroy(&vdev->ioeventfds_lock); + kfree(vdev->region); kfree(vdev->pm_save); kfree(vdev); - - if (vfio_pci_is_vga(pdev)) { - vga_client_register(pdev, NULL, NULL, NULL); - vga_set_legacy_decoding(pdev, - VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM | - VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM); - } } static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, @@ -2080,11 +2097,7 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, if (device == NULL) return PCI_ERS_RESULT_DISCONNECT; - vdev = vfio_device_data(device); - if (vdev == NULL) { - vfio_device_put(device); - return PCI_ERS_RESULT_DISCONNECT; - } + vdev = container_of(device, struct vfio_pci_device, vdev); mutex_lock(&vdev->igate); @@ -2100,7 +2113,6 @@ static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev, static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) { - struct vfio_pci_device *vdev; struct vfio_device *device; int ret = 0; @@ -2113,12 +2125,6 @@ static int vfio_pci_sriov_configure(struct pci_dev *pdev, int nr_virtfn) if (!device) return -ENODEV; - vdev = vfio_device_data(device); - if (!vdev) { - vfio_device_put(device); - return -ENODEV; - } - if (nr_virtfn == 0) pci_disable_sriov(pdev); else @@ -2178,7 +2184,7 @@ static int vfio_pci_reflck_find(struct pci_dev *pdev, void *data) return 0; } - vdev = vfio_device_data(device); + vdev = container_of(device, struct vfio_pci_device, vdev); if (vdev->reflck) { vfio_pci_reflck_get(vdev->reflck); @@ -2240,7 +2246,7 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) return -EBUSY; } - vdev = vfio_device_data(device); + vdev = container_of(device, struct vfio_pci_device, vdev); /* Fault if the device is not unused */ if (vdev->refcnt) { @@ -2248,7 +2254,7 @@ static int vfio_pci_get_unused_devs(struct pci_dev *pdev, void *data) return -EBUSY; } - devs->devices[devs->cur_index++] = device; + devs->devices[devs->cur_index++] = vdev; return 0; } @@ -2270,7 +2276,7 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) return -EBUSY; } - vdev = vfio_device_data(device); + vdev = container_of(device, struct vfio_pci_device, vdev); /* * Locking multiple devices is prone to deadlock, runaway and @@ -2281,7 +2287,7 @@ static int vfio_pci_try_zap_and_vma_lock_cb(struct pci_dev *pdev, void *data) return -EBUSY; } - devs->devices[devs->cur_index++] = device; + devs->devices[devs->cur_index++] = vdev; return 0; } @@ -2329,7 +2335,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) /* Does at least one need a reset? */ for (i = 0; i < devs.cur_index; i++) { - tmp = vfio_device_data(devs.devices[i]); + tmp = devs.devices[i]; if (tmp->needs_reset) { ret = pci_reset_bus(vdev->pdev); break; @@ -2338,7 +2344,7 @@ static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev) put_devs: for (i = 0; i < devs.cur_index; i++) { - tmp = vfio_device_data(devs.devices[i]); + tmp = devs.devices[i]; /* * If reset was successful, affected devices no longer need @@ -2354,7 +2360,7 @@ put_devs: vfio_pci_set_power_state(tmp, PCI_D3hot); } - vfio_device_put(devs.devices[i]); + vfio_device_put(&tmp->vdev); } kfree(devs.devices); @@ -2411,7 +2417,7 @@ static int __init vfio_pci_init(void) { int ret; - /* Allocate shared config space permision data used by all devices */ + /* Allocate shared config space permission data used by all devices */ ret = vfio_pci_init_perm_bits(); if (ret) return ret; diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index a402adee8a21..d57f037f65b8 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -101,7 +101,7 @@ static const u16 pci_ext_cap_length[PCI_EXT_CAP_ID_MAX + 1] = { /* * Read/Write Permission Bits - one bit for each bit in capability * Any field can be read if it exists, but what is read depends on - * whether the field is 'virtualized', or just pass thru to the + * whether the field is 'virtualized', or just pass through to the * hardware. Any virtualized field is also virtualized for writes. * Writes are only permitted if they have a 1 bit here. */ diff --git a/drivers/vfio/pci/vfio_pci_igd.c b/drivers/vfio/pci/vfio_pci_igd.c index e66dfb0178ed..228df565e9bc 100644 --- a/drivers/vfio/pci/vfio_pci_igd.c +++ b/drivers/vfio/pci/vfio_pci_igd.c @@ -21,6 +21,10 @@ #define OPREGION_SIZE (8 * 1024) #define OPREGION_PCI_ADDR 0xfc +#define OPREGION_RVDA 0x3ba +#define OPREGION_RVDS 0x3c2 +#define OPREGION_VERSION 0x16 + static size_t vfio_pci_igd_rw(struct vfio_pci_device *vdev, char __user *buf, size_t count, loff_t *ppos, bool iswrite) { @@ -58,6 +62,7 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) u32 addr, size; void *base; int ret; + u16 version; ret = pci_read_config_dword(vdev->pdev, OPREGION_PCI_ADDR, &addr); if (ret) @@ -83,6 +88,54 @@ static int vfio_pci_igd_opregion_init(struct vfio_pci_device *vdev) size *= 1024; /* In KB */ + /* + * Support opregion v2.1+ + * When VBT data exceeds 6KB size and cannot be within mailbox #4, then + * the Extended VBT region next to opregion is used to hold the VBT data. + * RVDA (Relative Address of VBT Data from Opregion Base) and RVDS + * (Raw VBT Data Size) from opregion structure member are used to hold the + * address from region base and size of VBT data. RVDA/RVDS are not + * defined before opregion 2.0. + * + * opregion 2.1+: RVDA is unsigned, relative offset from + * opregion base, and should point to the end of opregion. + * otherwise, exposing to userspace to allow read access to everything between + * the OpRegion and VBT is not safe. + * RVDS is defined as size in bytes. + * + * opregion 2.0: rvda is the physical VBT address. + * Since rvda is HPA it cannot be directly used in guest. + * And it should not be practically available for end user,so it is not supported. + */ + version = le16_to_cpu(*(__le16 *)(base + OPREGION_VERSION)); + if (version >= 0x0200) { + u64 rvda; + u32 rvds; + + rvda = le64_to_cpu(*(__le64 *)(base + OPREGION_RVDA)); + rvds = le32_to_cpu(*(__le32 *)(base + OPREGION_RVDS)); + if (rvda && rvds) { + /* no support for opregion v2.0 with physical VBT address */ + if (version == 0x0200) { + memunmap(base); + pci_err(vdev->pdev, + "IGD assignment does not support opregion v2.0 with an extended VBT region\n"); + return -EINVAL; + } + + if (rvda != size) { + memunmap(base); + pci_err(vdev->pdev, + "Extended VBT does not follow opregion on version 0x%04x\n", + version); + return -EINVAL; + } + + /* region size for opregion v2.0+: opregion and VBT size. */ + size += rvds; + } + } + if (size != OPREGION_SIZE) { memunmap(base); base = memremap(addr, size, MEMREMAP_WB); diff --git a/drivers/vfio/pci/vfio_pci_nvlink2.c b/drivers/vfio/pci/vfio_pci_nvlink2.c deleted file mode 100644 index 9adcf6a8f888..000000000000 --- a/drivers/vfio/pci/vfio_pci_nvlink2.c +++ /dev/null @@ -1,490 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * VFIO PCI NVIDIA Whitherspoon GPU support a.k.a. NVLink2. - * - * Copyright (C) 2018 IBM Corp. All rights reserved. - * Author: Alexey Kardashevskiy <aik@ozlabs.ru> - * - * Register an on-GPU RAM region for cacheable access. - * - * Derived from original vfio_pci_igd.c: - * Copyright (C) 2016 Red Hat, Inc. All rights reserved. - * Author: Alex Williamson <alex.williamson@redhat.com> - */ - -#include <linux/io.h> -#include <linux/pci.h> -#include <linux/uaccess.h> -#include <linux/vfio.h> -#include <linux/sched/mm.h> -#include <linux/mmu_context.h> -#include <asm/kvm_ppc.h> -#include "vfio_pci_private.h" - -#define CREATE_TRACE_POINTS -#include "trace.h" - -EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap_fault); -EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_nvgpu_mmap); -EXPORT_TRACEPOINT_SYMBOL_GPL(vfio_pci_npu2_mmap); - -struct vfio_pci_nvgpu_data { - unsigned long gpu_hpa; /* GPU RAM physical address */ - unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ - unsigned long useraddr; /* GPU RAM userspace address */ - unsigned long size; /* Size of the GPU RAM window (usually 128GB) */ - struct mm_struct *mm; - struct mm_iommu_table_group_mem_t *mem; /* Pre-registered RAM descr. */ - struct pci_dev *gpdev; - struct notifier_block group_notifier; -}; - -static size_t vfio_pci_nvgpu_rw(struct vfio_pci_device *vdev, - char __user *buf, size_t count, loff_t *ppos, bool iswrite) -{ - unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; - struct vfio_pci_nvgpu_data *data = vdev->region[i].data; - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - loff_t posaligned = pos & PAGE_MASK, posoff = pos & ~PAGE_MASK; - size_t sizealigned; - void __iomem *ptr; - - if (pos >= vdev->region[i].size) - return -EINVAL; - - count = min(count, (size_t)(vdev->region[i].size - pos)); - - /* - * We map only a bit of GPU RAM for a short time instead of mapping it - * for the guest lifetime as: - * - * 1) we do not know GPU RAM size, only aperture which is 4-8 times - * bigger than actual RAM size (16/32GB RAM vs. 128GB aperture); - * 2) mapping GPU RAM allows CPU to prefetch and if this happens - * before NVLink bridge is reset (which fences GPU RAM), - * hardware management interrupts (HMI) might happen, this - * will freeze NVLink bridge. - * - * This is not fast path anyway. - */ - sizealigned = ALIGN(posoff + count, PAGE_SIZE); - ptr = ioremap_cache(data->gpu_hpa + posaligned, sizealigned); - if (!ptr) - return -EFAULT; - - if (iswrite) { - if (copy_from_user(ptr + posoff, buf, count)) - count = -EFAULT; - else - *ppos += count; - } else { - if (copy_to_user(buf, ptr + posoff, count)) - count = -EFAULT; - else - *ppos += count; - } - - iounmap(ptr); - - return count; -} - -static void vfio_pci_nvgpu_release(struct vfio_pci_device *vdev, - struct vfio_pci_region *region) -{ - struct vfio_pci_nvgpu_data *data = region->data; - long ret; - - /* If there were any mappings at all... */ - if (data->mm) { - if (data->mem) { - ret = mm_iommu_put(data->mm, data->mem); - WARN_ON(ret); - } - - mmdrop(data->mm); - } - - vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, - &data->group_notifier); - - pnv_npu2_unmap_lpar_dev(data->gpdev); - - kfree(data); -} - -static vm_fault_t vfio_pci_nvgpu_mmap_fault(struct vm_fault *vmf) -{ - vm_fault_t ret; - struct vm_area_struct *vma = vmf->vma; - struct vfio_pci_region *region = vma->vm_private_data; - struct vfio_pci_nvgpu_data *data = region->data; - unsigned long vmf_off = (vmf->address - vma->vm_start) >> PAGE_SHIFT; - unsigned long nv2pg = data->gpu_hpa >> PAGE_SHIFT; - unsigned long vm_pgoff = vma->vm_pgoff & - ((1U << (VFIO_PCI_OFFSET_SHIFT - PAGE_SHIFT)) - 1); - unsigned long pfn = nv2pg + vm_pgoff + vmf_off; - - ret = vmf_insert_pfn(vma, vmf->address, pfn); - trace_vfio_pci_nvgpu_mmap_fault(data->gpdev, pfn << PAGE_SHIFT, - vmf->address, ret); - - return ret; -} - -static const struct vm_operations_struct vfio_pci_nvgpu_mmap_vmops = { - .fault = vfio_pci_nvgpu_mmap_fault, -}; - -static int vfio_pci_nvgpu_mmap(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, struct vm_area_struct *vma) -{ - int ret; - struct vfio_pci_nvgpu_data *data = region->data; - - if (data->useraddr) - return -EPERM; - - if (vma->vm_end - vma->vm_start > data->size) - return -EINVAL; - - vma->vm_private_data = region; - vma->vm_flags |= VM_PFNMAP; - vma->vm_ops = &vfio_pci_nvgpu_mmap_vmops; - - /* - * Calling mm_iommu_newdev() here once as the region is not - * registered yet and therefore right initialization will happen now. - * Other places will use mm_iommu_find() which returns - * registered @mem and does not go gup(). - */ - data->useraddr = vma->vm_start; - data->mm = current->mm; - - mmgrab(data->mm); - ret = (int) mm_iommu_newdev(data->mm, data->useraddr, - vma_pages(vma), data->gpu_hpa, &data->mem); - - trace_vfio_pci_nvgpu_mmap(vdev->pdev, data->gpu_hpa, data->useraddr, - vma->vm_end - vma->vm_start, ret); - - return ret; -} - -static int vfio_pci_nvgpu_add_capability(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, struct vfio_info_cap *caps) -{ - struct vfio_pci_nvgpu_data *data = region->data; - struct vfio_region_info_cap_nvlink2_ssatgt cap = { - .header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, - .header.version = 1, - .tgt = data->gpu_tgt - }; - - return vfio_info_add_capability(caps, &cap.header, sizeof(cap)); -} - -static const struct vfio_pci_regops vfio_pci_nvgpu_regops = { - .rw = vfio_pci_nvgpu_rw, - .release = vfio_pci_nvgpu_release, - .mmap = vfio_pci_nvgpu_mmap, - .add_capability = vfio_pci_nvgpu_add_capability, -}; - -static int vfio_pci_nvgpu_group_notifier(struct notifier_block *nb, - unsigned long action, void *opaque) -{ - struct kvm *kvm = opaque; - struct vfio_pci_nvgpu_data *data = container_of(nb, - struct vfio_pci_nvgpu_data, - group_notifier); - - if (action == VFIO_GROUP_NOTIFY_SET_KVM && kvm && - pnv_npu2_map_lpar_dev(data->gpdev, - kvm->arch.lpid, MSR_DR | MSR_PR)) - return NOTIFY_BAD; - - return NOTIFY_OK; -} - -int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) -{ - int ret; - u64 reg[2]; - u64 tgt = 0; - struct device_node *npu_node, *mem_node; - struct pci_dev *npu_dev; - struct vfio_pci_nvgpu_data *data; - uint32_t mem_phandle = 0; - unsigned long events = VFIO_GROUP_NOTIFY_SET_KVM; - - /* - * PCI config space does not tell us about NVLink presense but - * platform does, use this. - */ - npu_dev = pnv_pci_get_npu_dev(vdev->pdev, 0); - if (!npu_dev) - return -ENODEV; - - npu_node = pci_device_to_OF_node(npu_dev); - if (!npu_node) - return -EINVAL; - - if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) - return -ENODEV; - - mem_node = of_find_node_by_phandle(mem_phandle); - if (!mem_node) - return -EINVAL; - - if (of_property_read_variable_u64_array(mem_node, "reg", reg, - ARRAY_SIZE(reg), ARRAY_SIZE(reg)) != - ARRAY_SIZE(reg)) - return -EINVAL; - - if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { - dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); - return -EFAULT; - } - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->gpu_hpa = reg[0]; - data->gpu_tgt = tgt; - data->size = reg[1]; - - dev_dbg(&vdev->pdev->dev, "%lx..%lx\n", data->gpu_hpa, - data->gpu_hpa + data->size - 1); - - data->gpdev = vdev->pdev; - data->group_notifier.notifier_call = vfio_pci_nvgpu_group_notifier; - - ret = vfio_register_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, - &events, &data->group_notifier); - if (ret) - goto free_exit; - - /* - * We have just set KVM, we do not need the listener anymore. - * Also, keeping it registered means that if more than one GPU is - * assigned, we will get several similar notifiers notifying about - * the same device again which does not help with anything. - */ - vfio_unregister_notifier(&data->gpdev->dev, VFIO_GROUP_NOTIFY, - &data->group_notifier); - - ret = vfio_pci_register_dev_region(vdev, - PCI_VENDOR_ID_NVIDIA | VFIO_REGION_TYPE_PCI_VENDOR_TYPE, - VFIO_REGION_SUBTYPE_NVIDIA_NVLINK2_RAM, - &vfio_pci_nvgpu_regops, - data->size, - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP, - data); - if (ret) - goto free_exit; - - return 0; -free_exit: - kfree(data); - - return ret; -} - -/* - * IBM NPU2 bridge - */ -struct vfio_pci_npu2_data { - void *base; /* ATSD register virtual address, for emulated access */ - unsigned long mmio_atsd; /* ATSD physical address */ - unsigned long gpu_tgt; /* TGT address of corresponding GPU RAM */ - unsigned int link_speed; /* The link speed from DT's ibm,nvlink-speed */ -}; - -static size_t vfio_pci_npu2_rw(struct vfio_pci_device *vdev, - char __user *buf, size_t count, loff_t *ppos, bool iswrite) -{ - unsigned int i = VFIO_PCI_OFFSET_TO_INDEX(*ppos) - VFIO_PCI_NUM_REGIONS; - struct vfio_pci_npu2_data *data = vdev->region[i].data; - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - - if (pos >= vdev->region[i].size) - return -EINVAL; - - count = min(count, (size_t)(vdev->region[i].size - pos)); - - if (iswrite) { - if (copy_from_user(data->base + pos, buf, count)) - return -EFAULT; - } else { - if (copy_to_user(buf, data->base + pos, count)) - return -EFAULT; - } - *ppos += count; - - return count; -} - -static int vfio_pci_npu2_mmap(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, struct vm_area_struct *vma) -{ - int ret; - struct vfio_pci_npu2_data *data = region->data; - unsigned long req_len = vma->vm_end - vma->vm_start; - - if (req_len != PAGE_SIZE) - return -EINVAL; - - vma->vm_flags |= VM_PFNMAP; - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - ret = remap_pfn_range(vma, vma->vm_start, data->mmio_atsd >> PAGE_SHIFT, - req_len, vma->vm_page_prot); - trace_vfio_pci_npu2_mmap(vdev->pdev, data->mmio_atsd, vma->vm_start, - vma->vm_end - vma->vm_start, ret); - - return ret; -} - -static void vfio_pci_npu2_release(struct vfio_pci_device *vdev, - struct vfio_pci_region *region) -{ - struct vfio_pci_npu2_data *data = region->data; - - memunmap(data->base); - kfree(data); -} - -static int vfio_pci_npu2_add_capability(struct vfio_pci_device *vdev, - struct vfio_pci_region *region, struct vfio_info_cap *caps) -{ - struct vfio_pci_npu2_data *data = region->data; - struct vfio_region_info_cap_nvlink2_ssatgt captgt = { - .header.id = VFIO_REGION_INFO_CAP_NVLINK2_SSATGT, - .header.version = 1, - .tgt = data->gpu_tgt - }; - struct vfio_region_info_cap_nvlink2_lnkspd capspd = { - .header.id = VFIO_REGION_INFO_CAP_NVLINK2_LNKSPD, - .header.version = 1, - .link_speed = data->link_speed - }; - int ret; - - ret = vfio_info_add_capability(caps, &captgt.header, sizeof(captgt)); - if (ret) - return ret; - - return vfio_info_add_capability(caps, &capspd.header, sizeof(capspd)); -} - -static const struct vfio_pci_regops vfio_pci_npu2_regops = { - .rw = vfio_pci_npu2_rw, - .mmap = vfio_pci_npu2_mmap, - .release = vfio_pci_npu2_release, - .add_capability = vfio_pci_npu2_add_capability, -}; - -int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) -{ - int ret; - struct vfio_pci_npu2_data *data; - struct device_node *nvlink_dn; - u32 nvlink_index = 0, mem_phandle = 0; - struct pci_dev *npdev = vdev->pdev; - struct device_node *npu_node = pci_device_to_OF_node(npdev); - struct pci_controller *hose = pci_bus_to_host(npdev->bus); - u64 mmio_atsd = 0; - u64 tgt = 0; - u32 link_speed = 0xff; - - /* - * PCI config space does not tell us about NVLink presense but - * platform does, use this. - */ - if (!pnv_pci_get_gpu_dev(vdev->pdev)) - return -ENODEV; - - if (of_property_read_u32(npu_node, "memory-region", &mem_phandle)) - return -ENODEV; - - /* - * NPU2 normally has 8 ATSD registers (for concurrency) and 6 links - * so we can allocate one register per link, using nvlink index as - * a key. - * There is always at least one ATSD register so as long as at least - * NVLink bridge #0 is passed to the guest, ATSD will be available. - */ - nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); - if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", - &nvlink_index))) - return -ENODEV; - - if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", nvlink_index, - &mmio_atsd)) { - if (of_property_read_u64_index(hose->dn, "ibm,mmio-atsd", 0, - &mmio_atsd)) { - dev_warn(&vdev->pdev->dev, "No available ATSD found\n"); - mmio_atsd = 0; - } else { - dev_warn(&vdev->pdev->dev, - "Using fallback ibm,mmio-atsd[0] for ATSD.\n"); - } - } - - if (of_property_read_u64(npu_node, "ibm,device-tgt-addr", &tgt)) { - dev_warn(&vdev->pdev->dev, "No ibm,device-tgt-addr found\n"); - return -EFAULT; - } - - if (of_property_read_u32(npu_node, "ibm,nvlink-speed", &link_speed)) { - dev_warn(&vdev->pdev->dev, "No ibm,nvlink-speed found\n"); - return -EFAULT; - } - - data = kzalloc(sizeof(*data), GFP_KERNEL); - if (!data) - return -ENOMEM; - - data->mmio_atsd = mmio_atsd; - data->gpu_tgt = tgt; - data->link_speed = link_speed; - if (data->mmio_atsd) { - data->base = memremap(data->mmio_atsd, SZ_64K, MEMREMAP_WT); - if (!data->base) { - ret = -ENOMEM; - goto free_exit; - } - } - - /* - * We want to expose the capability even if this specific NVLink - * did not get its own ATSD register because capabilities - * belong to VFIO regions and normally there will be ATSD register - * assigned to the NVLink bridge. - */ - ret = vfio_pci_register_dev_region(vdev, - PCI_VENDOR_ID_IBM | - VFIO_REGION_TYPE_PCI_VENDOR_TYPE, - VFIO_REGION_SUBTYPE_IBM_NVLINK2_ATSD, - &vfio_pci_npu2_regops, - data->mmio_atsd ? PAGE_SIZE : 0, - VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE | - VFIO_REGION_INFO_FLAG_MMAP, - data); - if (ret) - goto free_exit; - - return 0; - -free_exit: - if (data->base) - memunmap(data->base); - kfree(data); - - return ret; -} diff --git a/drivers/vfio/pci/vfio_pci_private.h b/drivers/vfio/pci/vfio_pci_private.h index 9cd1882a05af..5a36272cecbf 100644 --- a/drivers/vfio/pci/vfio_pci_private.h +++ b/drivers/vfio/pci/vfio_pci_private.h @@ -100,6 +100,7 @@ struct vfio_pci_mmap_vma { }; struct vfio_pci_device { + struct vfio_device vdev; struct pci_dev *pdev; void __iomem *barmap[PCI_STD_NUM_BARS]; bool bar_mmap_supported[PCI_STD_NUM_BARS]; @@ -199,20 +200,6 @@ static inline int vfio_pci_igd_init(struct vfio_pci_device *vdev) return -ENODEV; } #endif -#ifdef CONFIG_VFIO_PCI_NVLINK2 -extern int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev); -extern int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev); -#else -static inline int vfio_pci_nvdia_v100_nvlink2_init(struct vfio_pci_device *vdev) -{ - return -ENODEV; -} - -static inline int vfio_pci_ibm_npu2_init(struct vfio_pci_device *vdev) -{ - return -ENODEV; -} -#endif #ifdef CONFIG_S390 extern int vfio_pci_info_zdev_add_caps(struct vfio_pci_device *vdev, |