/* * IOMMU mmap management and range allocation functions. * Based almost entirely upon the powerpc iommu allocator. */ #include #include #include #include #include #include #include static unsigned long iommu_large_alloc = 15; static DEFINE_PER_CPU(unsigned int, iommu_hash_common); static inline bool need_flush(struct iommu_map_table *iommu) { return ((iommu->flags & IOMMU_NEED_FLUSH) != 0); } static inline void set_flush(struct iommu_map_table *iommu) { iommu->flags |= IOMMU_NEED_FLUSH; } static inline void clear_flush(struct iommu_map_table *iommu) { iommu->flags &= ~IOMMU_NEED_FLUSH; } static void setup_iommu_pool_hash(void) { unsigned int i; static bool do_once; if (do_once) return; do_once = true; for_each_possible_cpu(i) per_cpu(iommu_hash_common, i) = hash_32(i, IOMMU_POOL_HASHBITS); } /* * Initialize iommu_pool entries for the iommu_map_table. `num_entries' * is the number of table entries. If `large_pool' is set to true, * the top 1/4 of the table will be set aside for pool allocations * of more than iommu_large_alloc pages. */ void iommu_tbl_pool_init(struct iommu_map_table *iommu, unsigned long num_entries, u32 table_shift, void (*lazy_flush)(struct iommu_map_table *), bool large_pool, u32 npools, bool skip_span_boundary_check) { unsigned int start, i; struct iommu_pool *p = &(iommu->large_pool); setup_iommu_pool_hash(); if (npools == 0) iommu->nr_pools = IOMMU_NR_POOLS; else iommu->nr_pools = npools; BUG_ON(npools > IOMMU_NR_POOLS); iommu->table_shift = table_shift; iommu->lazy_flush = lazy_flush; start = 0; if (skip_span_boundary_check) iommu->flags |= IOMMU_NO_SPAN_BOUND; if (large_pool) iommu->flags |= IOMMU_HAS_LARGE_POOL; if (!large_pool) iommu->poolsize = num_entries/iommu->nr_pools; else iommu->poolsize = (num_entries * 3 / 4)/iommu->nr_pools; for (i = 0; i < iommu->nr_pools; i++) { spin_lock_init(&(iommu->pools[i].lock)); iommu->pools[i].start = start; iommu->pools[i].hint = start; start += iommu->poolsize; /* start for next pool */ iommu->pools[i].end = start - 1; } if (!large_pool) return; /* initialize large_pool */ spin_lock_init(&(p->lock)); p->start = start; p->hint = p->start; p->end = num_entries; } EXPORT_SYMBOL(iommu_tbl_pool_init); unsigned long iommu_tbl_range_alloc(struct device *dev, struct iommu_map_table *iommu, unsigned long npages, unsigned long *handle, unsigned long mask, unsigned int align_order) { unsigned int pool_hash = __this_cpu_read(iommu_hash_common); unsigned long n, end, start, limit, boundary_size; struct iommu_pool *pool; int pass = 0; unsigned int pool_nr; unsigned int npools = iommu->nr_pools; unsigned long flags; bool large_pool = ((iommu->flags & IOMMU_HAS_LARGE_POOL) != 0); bool largealloc = (large_pool && npages > iommu_large_alloc); unsigned long shift; unsigned long align_mask = 0; if (align_order > 0) align_mask = ~0ul >> (BITS_PER_LONG - align_order); /* Sanity check */ if (unlikely(npages == 0)) { WARN_ON_ONCE(1); return IOMMU_ERROR_CODE; } if (largealloc) { pool = &(iommu->large_pool); pool_nr = 0; /* to keep compiler happy */ } else { /* pick out pool_nr */ pool_nr = pool_hash & (npools - 1); pool = &(iommu->pools[pool_nr]); } spin_lock_irqsave(&pool->lock, flags); again: if (pass == 0 && handle && *handle && (*handle >= pool->start) && (*handle < pool->end)) start = *handle; else start = pool->hint; limit = pool->end; /* The case below can happen if we have a small segment appended * to a large, or when the previous alloc was at the very end of * the available space. If so, go back to the beginning. If a * flush is needed, it will get done based on the return value * from iommu_area_alloc() below. */ if (start >= limit) start = pool->start; shift = iommu->table_map_base >> iommu->table_shift; if (limit + shift > mask) { limit = mask - shift + 1; /* If we're constrained on address range, first try * at the masked hint to avoid O(n) search complexity, * but on second pass, start at 0 in pool 0. */ if ((start & mask) >= limit || pass > 0) { spin_unlock(&(pool->lock)); pool = &(iommu->pools[0]); spin_lock(&(pool->lock)); start = pool->start; } else { start &= mask; } } if (dev) boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 1 << iommu->table_shift); else boundary_size = ALIGN(1ULL << 32, 1 << iommu->table_shift); boundary_size = boundary_size >> iommu->table_shift; /* * if the skip_span_boundary_check had been set during init, we set * things up so that iommu_is_span_boundary() merely checks if the * (index + npages) < num_tsb_entries */ if ((iommu->flags & IOMMU_NO_SPAN_BOUND) != 0) { shift = 0; boundary_size = iommu->poolsize * iommu->nr_pools; } n = iommu_area_alloc(iommu->map, limit, start, npages, shift, boundary_size, align_mask); if (n == -1) { if (likely(pass == 0)) { /* First failure, rescan from the beginning. */ pool->hint = pool->start; set_flush(iommu); pass++; goto again; } else if (!largealloc && pass <= iommu->nr_pools) { spin_unlock(&(pool->lock)); pool_nr = (pool_nr + 1) & (iommu->nr_pools - 1); pool = &(iommu->pools[pool_nr]); spin_lock(&(pool->lock)); pool->hint = pool->start; set_flush(iommu); pass++; goto again; } else { /* give up */ n = IOMMU_ERROR_CODE; goto bail; } } if (iommu->lazy_flush && (n < pool->hint || need_flush(iommu))) { clear_flush(iommu); iommu->lazy_flush(iommu); } end = n + npages; pool->hint = end; /* Update handle for SG allocations */ if (handle) *handle = end; bail: spin_unlock_irqrestore(&(pool->lock), flags); return n; } EXPORT_SYMBOL(iommu_tbl_range_alloc); static struct iommu_pool *get_pool(struct iommu_map_table *tbl, unsigned long entry) { struct iommu_pool *p; unsigned long largepool_start = tbl->large_pool.start; bool large_pool = ((tbl->flags & IOMMU_HAS_LARGE_POOL) != 0); /* The large pool is the last pool at the top of the table */ if (large_pool && entry >= largepool_start) { p = &tbl->large_pool; } else { unsigned int pool_nr = entry / tbl->poolsize; BUG_ON(pool_nr >= tbl->nr_pools); p = &tbl->pools[pool_nr]; } return p; } /* Caller supplies the index of the entry into the iommu map table * itself when the mapping from dma_addr to the entry is not the * default addr->entry mapping below. */ void iommu_tbl_range_free(struct iommu_map_table *iommu, u64 dma_addr, unsigned long npages, unsigned long entry) { struct iommu_pool *pool; unsigned long flags; unsigned long shift = iommu->table_shift; if (entry == IOMMU_ERROR_CODE) /* use default addr->entry mapping */ entry = (dma_addr - iommu->table_map_base) >> shift; pool = get_pool(iommu, entry); spin_lock_irqsave(&(pool->lock), flags); bitmap_clear(iommu->map, entry, npages); spin_unlock_irqrestore(&(pool->lock), flags); } EXPORT_SYMBOL(iommu_tbl_range_free); YueHaibing <yuehaibing@huawei.com> for tracking the dst->pending_confirm problem. Sockets can obtain cached output route. Such routes can be to known nexthop (rt_gateway=IP) or to be used simultaneously for different nexthop IPs by different subnet prefixes (nh->nh_scope = RT_SCOPE_HOST, rt_gateway=0). At first look, there are more problems: - dst_confirm() sets flag on dst and not on dst->path, as result, indication is lost when XFRM is used - DNAT can change the nexthop, so the really used nexthop is not confirmed So, the following solution is to avoid using dst->pending_confirm. The current dst_confirm() usage is as follows: Protocols confirming dst on received packets: - TCP (1 dst per socket) - SCTP (1 dst per transport) - CXGB* Protocols supporting sendmsg with MSG_CONFIRM [ | MSG_PROBE ] to confirm neighbour: - UDP IPv4/IPv6 - ICMPv4 PING - RAW IPv4/IPv6 - L2TP/IPv6 MSG_CONFIRM for other purposes (fix not needed): - CAN Sending without locking the socket: - UDP (when no cork) - RAW (when hdrincl=1) Redirects from old to new GW: - rt6_do_redirect The patchset includes the following changes: 1. sock: add sk_dst_pending_confirm flag - used only by TCP with patch 4 to remember the received indication in sk->sk_dst_pending_confirm 2. net: add dst_pending_confirm flag to skbuff - skb->dst_pending_confirm will be used by all protocols in following patches, via skb_{set,get}_dst_pending_confirm 3. sctp: add dst_pending_confirm flag - SCTP uses per-transport dsts and can not use sk->sk_dst_pending_confirm like TCP 4. tcp: replace dst_confirm with sk_dst_confirm 5. net: add confirm_neigh method to dst_ops - IPv4 and IPv6 provision for slow neigh lookups for MSG_PROBE users. I decided to use neigh lookup only for this case because on MSG_PROBE the skb may pass MTU checks but it does not reach the neigh confirmation code. This patch will be used from patch 6. - xfrm_confirm_neigh: we use the last tunnel address, if present. When there are only transports, the original dest address is used. 6. net: use dst_confirm_neigh for UDP, RAW, ICMP, L2TP - dst_confirm conversion for UDP, RAW, ICMP and L2TP/IPv6 - these protocols use MSG_CONFIRM propagated by ip*_append_data to skb->dst_pending_confirm. sk->sk_dst_pending_confirm is not used because some sending paths do not lock the socket. For MSG_PROBE we use the slow lookup (dst_confirm_neigh). - there are also 2 cases that need the slow lookup: __ip6_rt_update_pmtu and rt6_do_redirect. I hope &ipv6_hdr(skb)->saddr is the correct nexthop address to use here. 7. net: pending_confirm is not used anymore - I failed to understand the CXGB* code, I see dst_confirm() calls but I'm not sure dst_neigh_output() was called. For now I just removed the dst->pending_confirm flag and left all dst_confirm() calls there. Any better idea? - Now may be old function neigh_output() should be restored instead of dst_neigh_output? ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/sctp/sm_sideeffect.c')
-rw-r--r--net/sctp/sm_sideeffect.c2
1 files changed, 1 insertions, 1 deletions
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c