diff options
Diffstat (limited to 'target/linux/brcm2708/patches-4.19/950-0531-PCI-brcmstb-Add-dma-range-mapping-for-inbound-traffi.patch')
-rw-r--r-- | target/linux/brcm2708/patches-4.19/950-0531-PCI-brcmstb-Add-dma-range-mapping-for-inbound-traffi.patch | 569 |
1 files changed, 569 insertions, 0 deletions
diff --git a/target/linux/brcm2708/patches-4.19/950-0531-PCI-brcmstb-Add-dma-range-mapping-for-inbound-traffi.patch b/target/linux/brcm2708/patches-4.19/950-0531-PCI-brcmstb-Add-dma-range-mapping-for-inbound-traffi.patch new file mode 100644 index 0000000000..86f3384488 --- /dev/null +++ b/target/linux/brcm2708/patches-4.19/950-0531-PCI-brcmstb-Add-dma-range-mapping-for-inbound-traffi.patch @@ -0,0 +1,569 @@ +From 155b86245f5c14317f41a0677ed78601fbe1f28b Mon Sep 17 00:00:00 2001 +From: Phil Elwell <phil@raspberrypi.org> +Date: Tue, 19 Feb 2019 22:06:59 +0000 +Subject: [PATCH 531/725] PCI: brcmstb: Add dma-range mapping for inbound + traffic + +The Broadcom STB PCIe host controller is intimately related to the +memory subsystem. This close relationship adds complexity to how cpu +system memory is mapped to PCIe memory. Ideally, this mapping is an +identity mapping, or an identity mapping off by a constant. Not so in +this case. + +Consider the Broadcom reference board BCM97445LCC_4X8 which has 6 GB +of system memory. Here is how the PCIe controller maps the +system memory to PCIe memory: + + memc0-a@[ 0....3fffffff] <=> pci@[ 0....3fffffff] + memc0-b@[100000000...13fffffff] <=> pci@[ 40000000....7fffffff] + memc1-a@[ 40000000....7fffffff] <=> pci@[ 80000000....bfffffff] + memc1-b@[300000000...33fffffff] <=> pci@[ c0000000....ffffffff] + memc2-a@[ 80000000....bfffffff] <=> pci@[100000000...13fffffff] + memc2-b@[c00000000...c3fffffff] <=> pci@[140000000...17fffffff] + +Although there are some "gaps" that can be added between the +individual mappings by software, the permutation of memory regions for +the most part is fixed by HW. The solution of having something close +to an identity mapping is not possible. + +The idea behind this HW design is that the same PCIe module can +act as an RC or EP, and if it acts as an EP it concatenates all +of system memory into a BAR so anything can be accessed. Unfortunately, +when the PCIe block is in the role of an RC it also presents this +"BAR" to downstream PCIe devices, rather than offering an identity map +between its system memory and PCIe space. + +Suppose that an endpoint driver allocs some DMA memory. Suppose this +memory is located at 0x6000_0000, which is in the middle of memc1-a. +The driver wants a dma_addr_t value that it can pass on to the EP to +use. Without doing any custom mapping, the EP will use this value for +DMA: the driver will get a dma_addr_t equal to 0x6000_0000. But this +won't work; the device needs a dma_addr_t that reflects the PCIe space +address, namely 0xa000_0000. + +So, essentially the solution to this problem must modify the +dma_addr_t returned by the DMA routines routines. There are two +ways (I know of) of doing this: + +(a) overriding/redefining the dma_to_phys() and phys_to_dma() calls +that are used by the dma_ops routines. This is the approach of + + arch/mips/cavium-octeon/dma-octeon.c + +In ARM and ARM64 these two routines are defiend in asm/dma-mapping.h +as static inline functions. + +(b) Subscribe to a notifier that notifies when a device is added to a +bus. When this happens, set_dma_ops() can be called for the device. +This method is mentioned in: + + http://lxr.free-electrons.com/source/drivers/of/platform.c?v=3.16#L152 + +where it says as a comment + + "In case if platform code need to use own special DMA + configuration, it can use Platform bus notifier and + handle BUS_NOTIFY_ADD_DEVICE event to fix up DMA + configuration." + +Solution (b) is what this commit does. It uses its own set of +dma_ops which are wrappers around the arch_dma_ops. The +wrappers translate the dma addresses before/after invoking +the arch_dma_ops, as appropriate. + +Signed-off-by: Jim Quinlan <jim2101024@gmail.com> +--- + drivers/pci/controller/pcie-brcmstb.c | 420 +++++++++++++++++++++++++- + 1 file changed, 411 insertions(+), 9 deletions(-) + +--- a/drivers/pci/controller/pcie-brcmstb.c ++++ b/drivers/pci/controller/pcie-brcmstb.c +@@ -4,6 +4,7 @@ + #include <linux/clk.h> + #include <linux/compiler.h> + #include <linux/delay.h> ++#include <linux/dma-mapping.h> + #include <linux/init.h> + #include <linux/interrupt.h> + #include <linux/io.h> +@@ -319,11 +320,307 @@ static struct pci_ops brcm_pcie_ops = { + ((val & ~reg##_##field##_MASK) | \ + (reg##_##field##_MASK & (field_val << reg##_##field##_SHIFT))) + ++static const struct dma_map_ops *arch_dma_ops; ++static const struct dma_map_ops *brcm_dma_ops_ptr; ++static struct of_pci_range *dma_ranges; ++static int num_dma_ranges; ++ + static phys_addr_t scb_size[BRCM_MAX_SCB]; + static int num_memc; + static int num_pcie; + static DEFINE_MUTEX(brcm_pcie_lock); + ++static dma_addr_t brcm_to_pci(dma_addr_t addr) ++{ ++ struct of_pci_range *p; ++ ++ if (!num_dma_ranges) ++ return addr; ++ ++ for (p = dma_ranges; p < &dma_ranges[num_dma_ranges]; p++) ++ if (addr >= p->cpu_addr && addr < (p->cpu_addr + p->size)) ++ return addr - p->cpu_addr + p->pci_addr; ++ ++ return addr; ++} ++ ++static dma_addr_t brcm_to_cpu(dma_addr_t addr) ++{ ++ struct of_pci_range *p; ++ ++ if (!num_dma_ranges) ++ return addr; ++ ++ for (p = dma_ranges; p < &dma_ranges[num_dma_ranges]; p++) ++ if (addr >= p->pci_addr && addr < (p->pci_addr + p->size)) ++ return addr - p->pci_addr + p->cpu_addr; ++ ++ return addr; ++} ++ ++static void *brcm_alloc(struct device *dev, size_t size, dma_addr_t *handle, ++ gfp_t gfp, unsigned long attrs) ++{ ++ void *ret; ++ ++ ret = arch_dma_ops->alloc(dev, size, handle, gfp, attrs); ++ if (ret) ++ *handle = brcm_to_pci(*handle); ++ return ret; ++} ++ ++static void brcm_free(struct device *dev, size_t size, void *cpu_addr, ++ dma_addr_t handle, unsigned long attrs) ++{ ++ handle = brcm_to_cpu(handle); ++ arch_dma_ops->free(dev, size, cpu_addr, handle, attrs); ++} ++ ++static int brcm_mmap(struct device *dev, struct vm_area_struct *vma, ++ void *cpu_addr, dma_addr_t dma_addr, size_t size, ++ unsigned long attrs) ++{ ++ dma_addr = brcm_to_cpu(dma_addr); ++ return arch_dma_ops->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); ++} ++ ++static int brcm_get_sgtable(struct device *dev, struct sg_table *sgt, ++ void *cpu_addr, dma_addr_t handle, size_t size, ++ unsigned long attrs) ++{ ++ handle = brcm_to_cpu(handle); ++ return arch_dma_ops->get_sgtable(dev, sgt, cpu_addr, handle, size, ++ attrs); ++} ++ ++static dma_addr_t brcm_map_page(struct device *dev, struct page *page, ++ unsigned long offset, size_t size, ++ enum dma_data_direction dir, ++ unsigned long attrs) ++{ ++ return brcm_to_pci(arch_dma_ops->map_page(dev, page, offset, size, ++ dir, attrs)); ++} ++ ++static void brcm_unmap_page(struct device *dev, dma_addr_t handle, ++ size_t size, enum dma_data_direction dir, ++ unsigned long attrs) ++{ ++ handle = brcm_to_cpu(handle); ++ arch_dma_ops->unmap_page(dev, handle, size, dir, attrs); ++} ++ ++static int brcm_map_sg(struct device *dev, struct scatterlist *sgl, ++ int nents, enum dma_data_direction dir, ++ unsigned long attrs) ++{ ++ int i, j; ++ struct scatterlist *sg; ++ ++ for_each_sg(sgl, sg, nents, i) { ++#ifdef CONFIG_NEED_SG_DMA_LENGTH ++ sg->dma_length = sg->length; ++#endif ++ sg->dma_address = ++ brcm_dma_ops_ptr->map_page(dev, sg_page(sg), sg->offset, ++ sg->length, dir, attrs); ++ if (dma_mapping_error(dev, sg->dma_address)) ++ goto bad_mapping; ++ } ++ return nents; ++ ++bad_mapping: ++ for_each_sg(sgl, sg, i, j) ++ brcm_dma_ops_ptr->unmap_page(dev, sg_dma_address(sg), ++ sg_dma_len(sg), dir, attrs); ++ return 0; ++} ++ ++static void brcm_unmap_sg(struct device *dev, ++ struct scatterlist *sgl, int nents, ++ enum dma_data_direction dir, ++ unsigned long attrs) ++{ ++ int i; ++ struct scatterlist *sg; ++ ++ for_each_sg(sgl, sg, nents, i) ++ brcm_dma_ops_ptr->unmap_page(dev, sg_dma_address(sg), ++ sg_dma_len(sg), dir, attrs); ++} ++ ++static void brcm_sync_single_for_cpu(struct device *dev, ++ dma_addr_t handle, size_t size, ++ enum dma_data_direction dir) ++{ ++ handle = brcm_to_cpu(handle); ++ arch_dma_ops->sync_single_for_cpu(dev, handle, size, dir); ++} ++ ++static void brcm_sync_single_for_device(struct device *dev, ++ dma_addr_t handle, size_t size, ++ enum dma_data_direction dir) ++{ ++ handle = brcm_to_cpu(handle); ++ arch_dma_ops->sync_single_for_device(dev, handle, size, dir); ++} ++ ++static dma_addr_t brcm_map_resource(struct device *dev, phys_addr_t phys, ++ size_t size, ++ enum dma_data_direction dir, ++ unsigned long attrs) ++{ ++ if (arch_dma_ops->map_resource) ++ return brcm_to_pci(arch_dma_ops->map_resource ++ (dev, phys, size, dir, attrs)); ++ return brcm_to_pci((dma_addr_t)phys); ++} ++ ++static void brcm_unmap_resource(struct device *dev, dma_addr_t handle, ++ size_t size, enum dma_data_direction dir, ++ unsigned long attrs) ++{ ++ if (arch_dma_ops->unmap_resource) ++ arch_dma_ops->unmap_resource(dev, brcm_to_cpu(handle), size, ++ dir, attrs); ++} ++ ++void brcm_sync_sg_for_cpu(struct device *dev, struct scatterlist *sgl, ++ int nents, enum dma_data_direction dir) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ for_each_sg(sgl, sg, nents, i) ++ brcm_dma_ops_ptr->sync_single_for_cpu(dev, sg_dma_address(sg), ++ sg->length, dir); ++} ++ ++void brcm_sync_sg_for_device(struct device *dev, struct scatterlist *sgl, ++ int nents, enum dma_data_direction dir) ++{ ++ struct scatterlist *sg; ++ int i; ++ ++ for_each_sg(sgl, sg, nents, i) ++ brcm_dma_ops_ptr->sync_single_for_device(dev, ++ sg_dma_address(sg), ++ sg->length, dir); ++} ++ ++static int brcm_mapping_error(struct device *dev, dma_addr_t dma_addr) ++{ ++ return arch_dma_ops->mapping_error(dev, dma_addr); ++} ++ ++static int brcm_dma_supported(struct device *dev, u64 mask) ++{ ++ if (num_dma_ranges) { ++ /* ++ * It is our translated addresses that the EP will "see", so ++ * we check all of the ranges for the largest possible value. ++ */ ++ int i; ++ ++ for (i = 0; i < num_dma_ranges; i++) ++ if (dma_ranges[i].pci_addr + dma_ranges[i].size - 1 ++ > mask) ++ return 0; ++ return 1; ++ } ++ ++ return arch_dma_ops->dma_supported(dev, mask); ++} ++ ++#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK ++u64 brcm_get_required_mask)(struct device *dev) ++{ ++ return arch_dma_ops->get_required_mask(dev); ++} ++#endif ++ ++static const struct dma_map_ops brcm_dma_ops = { ++ .alloc = brcm_alloc, ++ .free = brcm_free, ++ .mmap = brcm_mmap, ++ .get_sgtable = brcm_get_sgtable, ++ .map_page = brcm_map_page, ++ .unmap_page = brcm_unmap_page, ++ .map_sg = brcm_map_sg, ++ .unmap_sg = brcm_unmap_sg, ++ .map_resource = brcm_map_resource, ++ .unmap_resource = brcm_unmap_resource, ++ .sync_single_for_cpu = brcm_sync_single_for_cpu, ++ .sync_single_for_device = brcm_sync_single_for_device, ++ .sync_sg_for_cpu = brcm_sync_sg_for_cpu, ++ .sync_sg_for_device = brcm_sync_sg_for_device, ++ .mapping_error = brcm_mapping_error, ++ .dma_supported = brcm_dma_supported, ++#ifdef ARCH_HAS_DMA_GET_REQUIRED_MASK ++ .get_required_mask = brcm_get_required_mask, ++#endif ++}; ++ ++static void brcm_set_dma_ops(struct device *dev) ++{ ++ int ret; ++ ++ if (IS_ENABLED(CONFIG_ARM64)) { ++ /* ++ * We are going to invoke get_dma_ops(). That ++ * function, at this point in time, invokes ++ * get_arch_dma_ops(), and for ARM64 that function ++ * returns a pointer to dummy_dma_ops. So then we'd ++ * like to call arch_setup_dma_ops(), but that isn't ++ * exported. Instead, we call of_dma_configure(), ++ * which is exported, and this calls ++ * arch_setup_dma_ops(). Once we do this the call to ++ * get_dma_ops() will work properly because ++ * dev->dma_ops will be set. ++ */ ++ ret = of_dma_configure(dev, dev->of_node, true); ++ if (ret) { ++ dev_err(dev, "of_dma_configure() failed: %d\n", ret); ++ return; ++ } ++ } ++ ++ arch_dma_ops = get_dma_ops(dev); ++ if (!arch_dma_ops) { ++ dev_err(dev, "failed to get arch_dma_ops\n"); ++ return; ++ } ++ ++ set_dma_ops(dev, &brcm_dma_ops); ++} ++ ++static int brcmstb_platform_notifier(struct notifier_block *nb, ++ unsigned long event, void *__dev) ++{ ++ struct device *dev = __dev; ++ ++ brcm_dma_ops_ptr = &brcm_dma_ops; ++ if (event != BUS_NOTIFY_ADD_DEVICE) ++ return NOTIFY_DONE; ++ ++ brcm_set_dma_ops(dev); ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block brcmstb_platform_nb = { ++ .notifier_call = brcmstb_platform_notifier, ++}; ++ ++static int brcm_register_notifier(void) ++{ ++ return bus_register_notifier(&pci_bus_type, &brcmstb_platform_nb); ++} ++ ++static int brcm_unregister_notifier(void) ++{ ++ return bus_unregister_notifier(&pci_bus_type, &brcmstb_platform_nb); ++} ++ + static u32 rd_fld(void __iomem *p, u32 mask, int shift) + { + return (bcm_readl(p) & mask) >> shift; +@@ -597,9 +894,71 @@ static inline void brcm_pcie_perst_set(s + WR_FLD_RB(pcie->base, PCIE_MISC_PCIE_CTRL, PCIE_PERSTB, !val); + } + ++static int pci_dma_range_parser_init(struct of_pci_range_parser *parser, ++ struct device_node *node) ++{ ++ const int na = 3, ns = 2; ++ int rlen; ++ ++ parser->node = node; ++ parser->pna = of_n_addr_cells(node); ++ parser->np = parser->pna + na + ns; ++ ++ parser->range = of_get_property(node, "dma-ranges", &rlen); ++ if (!parser->range) ++ return -ENOENT; ++ ++ parser->end = parser->range + rlen / sizeof(__be32); ++ ++ return 0; ++} ++ ++static int brcm_pcie_parse_map_dma_ranges(struct brcm_pcie *pcie) ++{ ++ int i; ++ struct of_pci_range_parser parser; ++ struct device_node *dn = pcie->dn; ++ ++ /* ++ * Parse dma-ranges property if present. If there are multiple ++ * PCIe controllers, we only have to parse from one of them since ++ * the others will have an identical mapping. ++ */ ++ if (!pci_dma_range_parser_init(&parser, dn)) { ++ unsigned int max_ranges ++ = (parser.end - parser.range) / parser.np; ++ ++ dma_ranges = kcalloc(max_ranges, sizeof(struct of_pci_range), ++ GFP_KERNEL); ++ if (!dma_ranges) ++ return -ENOMEM; ++ ++ for (i = 0; of_pci_range_parser_one(&parser, dma_ranges + i); ++ i++) ++ num_dma_ranges++; ++ } ++ ++ for (i = 0, num_memc = 0; i < BRCM_MAX_SCB; i++) { ++ u64 size = brcmstb_memory_memc_size(i); ++ ++ if (size == (u64)-1) { ++ dev_err(pcie->dev, "cannot get memc%d size", i); ++ return -EINVAL; ++ } else if (size) { ++ scb_size[i] = roundup_pow_of_two_64(size); ++ num_memc++; ++ } else { ++ break; ++ } ++ } ++ ++ return 0; ++} ++ + static int brcm_pcie_add_controller(struct brcm_pcie *pcie) + { + int i, ret = 0; ++ struct device *dev = pcie->dev; + + mutex_lock(&brcm_pcie_lock); + if (num_pcie > 0) { +@@ -607,12 +966,21 @@ static int brcm_pcie_add_controller(stru + goto done; + } + ++ ret = brcm_register_notifier(); ++ if (ret) { ++ dev_err(dev, "failed to register pci bus notifier\n"); ++ goto done; ++ } ++ ret = brcm_pcie_parse_map_dma_ranges(pcie); ++ if (ret) ++ goto done; ++ + /* Determine num_memc and their sizes */ + for (i = 0, num_memc = 0; i < BRCM_MAX_SCB; i++) { + u64 size = brcmstb_memory_memc_size(i); + + if (size == (u64)-1) { +- dev_err(pcie->dev, "cannot get memc%d size\n", i); ++ dev_err(dev, "cannot get memc%d size\n", i); + ret = -EINVAL; + goto done; + } else if (size) { +@@ -636,8 +1004,16 @@ done: + static void brcm_pcie_remove_controller(struct brcm_pcie *pcie) + { + mutex_lock(&brcm_pcie_lock); +- if (--num_pcie == 0) +- num_memc = 0; ++ if (--num_pcie > 0) ++ goto out; ++ ++ if (brcm_unregister_notifier()) ++ dev_err(pcie->dev, "failed to unregister pci bus notifier\n"); ++ kfree(dma_ranges); ++ dma_ranges = NULL; ++ num_dma_ranges = 0; ++ num_memc = 0; ++out: + mutex_unlock(&brcm_pcie_lock); + } + +@@ -757,6 +1133,38 @@ static int brcm_pcie_setup(struct brcm_p + */ + rc_bar2_offset = 0; + ++ if (dma_ranges) { ++ /* ++ * The best-case scenario is to place the inbound ++ * region in the first 4GB of pci-space, as some ++ * legacy devices can only address 32bits. ++ * We would also like to put the MSI under 4GB ++ * as well, since some devices require a 32bit ++ * MSI target address. ++ */ ++ if (total_mem_size <= 0xc0000000ULL && ++ rc_bar2_size <= 0x100000000ULL) { ++ rc_bar2_offset = 0; ++ } else { ++ /* ++ * The system memory is 4GB or larger so we ++ * cannot start the inbound region at location ++ * 0 (since we have to allow some space for ++ * outbound memory @ 3GB). So instead we ++ * start it at the 1x multiple of its size ++ */ ++ rc_bar2_offset = rc_bar2_size; ++ } ++ ++ } else { ++ /* ++ * Set simple configuration based on memory sizes ++ * only. We always start the viewport at address 0, ++ * and set the MSI target address accordingly. ++ */ ++ rc_bar2_offset = 0; ++ } ++ + tmp = lower_32_bits(rc_bar2_offset); + tmp = INSERT_FIELD(tmp, PCIE_MISC_RC_BAR2_CONFIG_LO, SIZE, + encode_ibar_size(rc_bar2_size)); +@@ -967,7 +1375,6 @@ static int brcm_pcie_probe(struct platfo + struct brcm_pcie *pcie; + struct resource *res; + void __iomem *base; +- u32 tmp; + struct pci_host_bridge *bridge; + struct pci_bus *child; + +@@ -984,11 +1391,6 @@ static int brcm_pcie_probe(struct platfo + return -EINVAL; + } + +- if (of_property_read_u32(dn, "dma-ranges", &tmp) == 0) { +- dev_err(&pdev->dev, "cannot yet handle dma-ranges\n"); +- return -EINVAL; +- } +- + data = of_id->data; + pcie->reg_offsets = data->offsets; + pcie->reg_field_info = data->reg_field_info; |