/* * linux/fs/lockd/clntlock.c * * Lock handling for the client side NLM implementation * * Copyright (C) 1996, Olaf Kirch */ #include #include #include #include #include #include #include #include #include #define NLMDBG_FACILITY NLMDBG_CLIENT /* * Local function prototypes */ static int reclaimer(void *ptr); /* * The following functions handle blocking and granting from the * client perspective. */ /* * This is the representation of a blocked client lock. */ struct nlm_wait { struct list_head b_list; /* linked list */ wait_queue_head_t b_wait; /* where to wait on */ struct nlm_host * b_host; struct file_lock * b_lock; /* local file lock */ unsigned short b_reclaim; /* got to reclaim lock */ __be32 b_status; /* grant callback status */ }; static LIST_HEAD(nlm_blocked); static DEFINE_SPINLOCK(nlm_blocked_lock); /** * nlmclnt_init - Set up per-NFS mount point lockd data structures * @nlm_init: pointer to arguments structure * * Returns pointer to an appropriate nlm_host struct, * or an ERR_PTR value. */ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) { struct nlm_host *host; u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; int status; status = lockd_up(nlm_init->net); if (status < 0) return ERR_PTR(status); host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, nlm_init->hostname, nlm_init->noresvport, nlm_init->net); if (host == NULL) goto out_nohost; if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) goto out_nobind; return host; out_nobind: nlmclnt_release_host(host); out_nohost: lockd_down(nlm_init->net); return ERR_PTR(-ENOLCK); } EXPORT_SYMBOL_GPL(nlmclnt_init); /** * nlmclnt_done - Release resources allocated by nlmclnt_init() * @host: nlm_host structure reserved by nlmclnt_init() * */ void nlmclnt_done(struct nlm_host *host) { struct net *net = host->net; nlmclnt_release_host(host); lockd_down(net); } EXPORT_SYMBOL_GPL(nlmclnt_done); /* * Queue up a lock for blocking so that the GRANTED request can see it */ struct nlm_wait *nlmclnt_prepare_block(struct nlm_host *host, struct file_lock *fl) { struct nlm_wait *block; block = kmalloc(sizeof(*block), GFP_KERNEL); if (block != NULL) { block->b_host = host; block->b_lock = fl; init_waitqueue_head(&block->b_wait); block->b_status = nlm_lck_blocked; spin_lock(&nlm_blocked_lock); list_add(&block->b_list, &nlm_blocked); spin_unlock(&nlm_blocked_lock); } return block; } void nlmclnt_finish_block(struct nlm_wait *block) { if (block == NULL) return; spin_lock(&nlm_blocked_lock); list_del(&block->b_list); spin_unlock(&nlm_blocked_lock); kfree(block); } /* * Block on a lock */ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) { long ret; /* A borken server might ask us to block even if we didn't * request it. Just say no! */ if (block == NULL) return -EAGAIN; /* Go to sleep waiting for GRANT callback. Some servers seem * to lose callbacks, however, so we're going to poll from * time to time just to make sure. * * For now, the retry frequency is pretty high; normally * a 1 minute timeout would do. See the comment before * nlmclnt_lock for an explanation. */ ret = wait_event_interruptible_timeout(block->b_wait, block->b_status != nlm_lck_blocked, timeout); if (ret < 0) return -ERESTARTSYS; /* Reset the lock status after a server reboot so we resend */ if (block->b_status == nlm_lck_denied_grace_period) block->b_status = nlm_lck_blocked; req->a_res.status = block->b_status; return 0; } /* * The server lockd has called us back to tell us the lock was granted */ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) { const struct file_lock *fl = &lock->fl; const struct nfs_fh *fh = &lock->fh; struct nlm_wait *block; __be32 res = nlm_lck_denied; /* * Look up blocked request based on arguments. * Warning: must not use cookie to match it! */ spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { struct file_lock *fl_blocked = block->b_lock; if (fl_blocked->fl_start != fl->fl_start) continue; if (fl_blocked->fl_end != fl->fl_end) continue; /* * Careful! The NLM server will return the 32-bit "pid" that * we put on the wire: in this case the lockowner "pid". */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; if (!rpc_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(file_inode(fl_blocked->fl_file)) ,fh) != 0) continue; /* Alright, we found a lock. Set the return status * and wake up the caller */ block->b_status = nlm_granted; wake_up(&block->b_wait); res = nlm_granted; } spin_unlock(&nlm_blocked_lock); return res; } /* * The following procedures deal with the recovery of locks after a * server crash. */ /* * Reclaim all locks on server host. We do this by spawning a separate * reclaimer thread. */ void nlmclnt_recovery(struct nlm_host *host) { struct task_struct *task; if (!host->h_reclaiming++) { nlm_get_host(host); task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name); if (IS_ERR(task)) printk(KERN_ERR "lockd: unable to spawn reclaimer " "thread. Locks for %s won't be reclaimed! " "(%ld)\n", host->h_name, PTR_ERR(task)); } } static int reclaimer(void *ptr) { struct nlm_host *host = (struct nlm_host *) ptr; struct nlm_wait *block; struct nlm_rqst *req; struct file_lock *fl, *next; u32 nsmstate; struct net *net = host->net; req = kmalloc(sizeof(*req), GFP_KERNEL); if (!req) { printk(KERN_ERR "lockd: reclaimer unable to alloc memory." " Locks for %s won't be reclaimed!\n", host->h_name); return 0; } allow_signal(SIGKILL); down_write(&host->h_rwsem); lockd_up(net); /* note: this cannot fail as lockd is already running */ dprintk("lockd: reclaiming locks for host %s\n", host->h_name); restart: nsmstate = host->h_nsmstate; /* Force a portmap getport - the peer's lockd will * most likely end up on a different port. */ host->h_nextrebind = jiffies; nlm_rebind_host(host); /* First, reclaim all locks that have been granted. */ list_splice_init(&host->h_granted, &host->h_reclaim); list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) { list_del_init(&fl->fl_u.nfs_fl.list); /* * sending this thread a SIGKILL will result in any unreclaimed * locks being removed from the h_granted list. This means that * the kernel will not attempt to reclaim them again if a new * reclaimer thread is spawned for this host. */ if (signalled()) continue; if (nlmclnt_reclaim(host, fl, req) != 0) continue; list_add_tail(&fl->fl_u.nfs_fl.list, &host->h_granted); if (host->h_nsmstate != nsmstate) { /* Argh! The server rebooted again! */ goto restart; } } host->h_reclaiming = 0; up_write(&host->h_rwsem); dprintk("NLM: done reclaiming locks for host %s\n", host->h_name); /* Now, wake up all processes that sleep on a blocked lock */ spin_lock(&nlm_blocked_lock); list_for_each_entry(block, &nlm_blocked, b_list) { if (block->b_host == host) { block->b_status = nlm_lck_denied_grace_period; wake_up(&block->b_wait); } } spin_unlock(&nlm_blocked_lock); /* Release host handle after use */ nlmclnt_release_host(host); lockd_down(net); kfree(req); return 0; } span>net: ipv6: Add support to dump multipath routes via RTA_MULTIPATH attributeDavid Ahern2-17/+105 IPv6 returns multipath routes as a series of individual routes making their display and handling by userspace different and more complicated than IPv4, putting the burden on the user to see that a route is part of a multipath route and internally creating a multipath route if desired (e.g., libnl does this as of commit 29b71371e764). This patch addresses this difference, allowing multipath routes to be returned using the RTA_MULTIPATH attribute. The end result is that IPv6 multipath routes can be treated and displayed in a format similar to IPv4: $ ip -6 ro ls vrf red 2001:db8:1::/120 dev eth1 proto kernel metric 256 pref medium 2001:db8:2::/120 dev eth2 proto kernel metric 256 pref medium 2001:db8:200::/120 metric 1024 nexthop via 2001:db8:1::2 dev eth1 weight 1 nexthop via 2001:db8:2::2 dev eth2 weight 1 Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-04net: ipv6: Allow shorthand delete of all nexthops in multipath routeDavid Ahern1-2/+36 IPv4 allows multipath routes to be deleted using just the prefix and length. For example: $ ip ro ls vrf red unreachable default metric 8192 1.1.1.0/24 nexthop via 10.100.1.254 dev eth1 weight 1 nexthop via 10.11.200.2 dev eth11.200 weight 1 10.11.200.0/24 dev eth11.200 proto kernel scope link src 10.11.200.3 10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.3 $ ip ro del 1.1.1.0/24 vrf red $ ip ro ls vrf red unreachable default metric 8192 10.11.200.0/24 dev eth11.200 proto kernel scope link src 10.11.200.3 10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.3 The same notation does not work with IPv6 because of how multipath routes are implemented for IPv6. For IPv6 only the first nexthop of a multipath route is deleted if the request contains only a prefix and length. This leads to unnecessary complexity in userspace dealing with IPv6 multipath routes. This patch allows all nexthops to be deleted without specifying each one in the delete request. Internally, this is done by walking the sibling list of the route matching the specifications given (prefix, length, metric, protocol, etc). $ ip -6 ro ls vrf red 2001:db8:1::/120 dev eth1 proto kernel metric 256 pref medium 2001:db8:2::/120 dev eth2 proto kernel metric 256 pref medium 2001:db8:200::/120 via 2001:db8:1::2 dev eth1 metric 1024 pref medium 2001:db8:200::/120 via 2001:db8:2::2 dev eth2 metric 1024 pref medium ... $ ip -6 ro del vrf red 2001:db8:200::/120 $ ip -6 ro ls vrf red 2001:db8:1::/120 dev eth1 proto kernel metric 256 pref medium 2001:db8:2::/120 dev eth2 proto kernel metric 256 pref medium ... Because IPv6 allows individual nexthops to be deleted without deleting the entire route, the ip6_route_multipath_del and non-multipath code path (ip6_route_del) have to be discriminated so that all nexthops are only deleted for the latter case. This is done by making the existing fc_type in fib6_config a u16 and then adding a new u16 field with fc_delete_all_nh as the first bit. Suggested-by: Dinesh Dutt <ddutt@cumulusnetworks.com> Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-nextDavid S. Miller7-39/+28 Pablo Neira Ayuso says: ==================== Netfilter updates for net-next The following patchset contains Netfilter updates for your net-next tree, they are: 1) Stash ctinfo 3-bit field into pointer to nf_conntrack object from sk_buff so we only access one single cacheline in the conntrack hotpath. Patchset from Florian Westphal. 2) Don't leak pointer to internal structures when exporting x_tables ruleset back to userspace, from Willem DeBruijn. This includes new helper functions to copy data to userspace such as xt_data_to_user() as well as conversions of our ip_tables, ip6_tables and arp_tables clients to use it. Not surprinsingly, ebtables requires an ad-hoc update. There is also a new field in x_tables extensions to indicate the amount of bytes that we copy to userspace. 3) Add nf_log_all_netns sysctl: This new knob allows you to enable logging via nf_log infrastructure for all existing netnamespaces. Given the effort to provide pernet syslog has been discontinued, let's provide a way to restore logging using netfilter kernel logging facilities in trusted environments. Patch from Michal Kubecek. 4) Validate SCTP checksum from conntrack helper, from Davide Caratti. 5) Merge UDPlite conntrack and NAT helpers into UDP, this was mostly a copy&paste from the original helper, from Florian Westphal. 6) Reset netfilter state when duplicating packets, also from Florian. 7) Remove unnecessary check for broadcast in IPv6 in pkttype match and nft_meta, from Liping Zhang. 8) Add missing code to deal with loopback packets from nft_meta when used by the netdev family, also from Liping. 9) Several cleanups on nf_tables, one to remove unnecessary check from the netlink control plane path to add table, set and stateful objects and code consolidation when unregister chain hooks, from Gao Feng. 10) Fix harmless reference counter underflow in IPVS that, however, results in problems with the introduction of the new refcount_t type, from David Windsor. 11) Enable LIBCRC32C from nf_ct_sctp instead of nf_nat_sctp, from Davide Caratti. 12) Missing documentation on nf_tables uapi header, from Liping Zhang. 13) Use rb_entry() helper in xt_connlimit, from Geliang Tang. ==================== Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: ipv6: Set protocol to kernel for local routesDavid Ahern1-0/+1 IPv6 stack does not set the protocol for local routes, so those routes show up with proto "none": $ ip -6 ro ls table local local ::1 dev lo proto none metric 0 pref medium local 2100:3:: dev lo proto none metric 0 pref medium local 2100:3::4 dev lo proto none metric 0 pref medium local fe80:: dev lo proto none metric 0 pref medium ... Set rt6i_protocol to RTPROT_KERNEL for consistency with IPv4. Now routes show up with proto "kernel": $ ip -6 ro ls table local local ::1 dev lo proto kernel metric 0 pref medium local 2100:3:: dev lo proto kernel metric 0 pref medium local 2100:3::4 dev lo proto kernel metric 0 pref medium local fe80:: dev lo proto kernel metric 0 pref medium ... Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03tcp: add tcp_mss_clamp() helperEric Dumazet1-4/+1 Small cleanup factorizing code doing the TCP_MAXSEG clamping. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-02Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/netDavid S. Miller2-2/+2 All merge conflicts were simple overlapping changes. Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-02netfilter: allow logging from non-init namespacesMichal Kubeček1-1/+1 Commit 69b34fb996b2 ("netfilter: xt_LOG: add net namespace support for xt_LOG") disabled logging packets using the LOG target from non-init namespaces. The motivation was to prevent containers from flooding kernel log of the host. The plan was to keep it that way until syslog namespace implementation allows containers to log in a safe way. However, the work on syslog namespace seems to have hit a dead end somewhere in 2013 and there are users who want to use xt_LOG in all network namespaces. This patch allows to do so by setting /proc/sys/net/netfilter/nf_log_all_netns to a nonzero value. This sysctl is only accessible from init_net so that one cannot switch the behaviour from inside a container. Signed-off-by: Michal Kubecek <mkubecek@suse.cz> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> 2017-02-02netfilter: merge ctinfo into nfct pointer storage areaFlorian Westphal1-1/+1 After this change conntrack operations (lookup, creation, matching from ruleset) only access one instead of two sk_buff cache lines. This works for normal conntracks because those are allocated from a slab that guarantees hw cacheline or 8byte alignment (whatever is larger) so the 3 bits needed for ctinfo won't overlap with nf_conn addresses. Template allocation now does manual address alignment (see previous change) on arches that don't have sufficent kmalloc min alignment. Some spots intentionally use skb->_nfct instead of skb_nfct() helpers, this is to avoid undoing the skb_nfct() use when we remove untracked conntrack object in the future. Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> 2017-02-02netfilter: add and use nf_ct_set helperFlorian Westphal3-8/+4 Add a helper to assign a nf_conn entry and the ctinfo bits to an sk_buff. This avoids changing code in followup patch that merges skb->nfct and skb->nfctinfo into skb->_nfct. Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> 2017-02-02skbuff: add and use skb_nfct helperFlorian Westphal3-8/+8 Followup patch renames skb->nfct and changes its type so add a helper to avoid intrusive rename change later. Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> 2017-02-02netfilter: reset netfilter state when duplicating packetFlorian Westphal1-1/+1 We should also toss nf_bridge_info, if any -- packet is leaving via ip_local_out, also, this skb isn't bridged -- it is a locally generated copy. Also this avoids the need to touch this later when skb->nfct is replaced with 'unsigned long _nfct' in followup patch. Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> 2017-02-02netfilter: conntrack: no need to pass ctinfo to error handlerFlorian Westphal1-6/+6 It is never accessed for reading and the only places that write to it are the icmp(6) handlers, which also set skb->nfct (and skb->nfctinfo). The conntrack core specifically checks for attached skb->nfct after ->error() invocation and returns early in this case. Signed-off-by: Florian Westphal <fw@strlen.de> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>