/* * netsniff-ng - the packet sniffing beast * Copyright 2009, 2010 Daniel Borkmann. * Copyright 2014, 2015 Tobias Klauser. * Subject to the GPL, version 2. */ #include #include #include #include #include #include #include #include #include #include #include "xmalloc.h" #include "die.h" #include "ring_rx.h" #include "built_in.h" /* * tpacket v3 data structures and constants are not available for older kernel * versions which only support tpacket v2, thus we need protect access to them. */ #ifdef HAVE_TPACKET3 static inline bool is_tpacket_v3(int sock) { return get_sockopt_tpacket(sock) == TPACKET_V3; } static inline size_t get_ring_layout_size(struct ring *ring, bool v3) { return v3 ? sizeof(ring->layout3) : sizeof(ring->layout); } static inline void setup_rx_ring_layout_v3(struct ring *ring) { /* Pass out, if this will ever change and we do crap on it! */ build_bug_on(offsetof(struct tpacket_req, tp_frame_nr) != offsetof(struct tpacket_req3, tp_frame_nr) && sizeof(struct tpacket_req) != offsetof(struct tpacket_req3, tp_retire_blk_tov)); ring->layout3.tp_retire_blk_tov = 100; /* 0: let kernel decide */ ring->layout3.tp_sizeof_priv = 0; ring->layout3.tp_feature_req_word = 0; } static inline int rx_ring_get_num(struct ring *ring, bool v3) { return v3 ? ring->layout3.tp_block_nr : ring->layout.tp_frame_nr; } static inline size_t rx_ring_get_size(struct ring *ring, bool v3) { return v3 ? ring->layout3.tp_block_size : ring->layout.tp_frame_size; } int get_rx_net_stats(int sock, uint64_t *packets, uint64_t *drops, bool v3) { int ret; union { struct tpacket_stats k2; struct tpacket_stats_v3 k3; } stats; socklen_t slen = v3 ? 
sizeof(stats.k3) : sizeof(stats.k2); memset(&stats, 0, sizeof(stats)); ret = getsockopt(sock, SOL_PACKET, PACKET_STATISTICS, &stats, &slen); if (ret == 0) { *packets = stats.k3.tp_packets; *drops = stats.k3.tp_drops; } return ret; } #else static inline bool is_tpacket_v3(int sock __maybe_unused) { return false; } static inline size_t get_ring_layout_size(struct ring *ring, bool v3 __maybe_unused) { return sizeof(ring->layout); } static inline void setup_rx_ring_layout_v3(struct ring *ring __maybe_unused) { } static inline size_t rx_ring_get_num(struct ring *ring, bool v3 __maybe_unused) { return ring->layout.tp_frame_nr; } static inline size_t rx_ring_get_size(struct ring *ring, bool v3 __maybe_unused) { return ring->layout.tp_frame_size; } int get_rx_net_stats(int sock, uint64_t *packets, uint64_t *drops, bool v3 __maybe_unused) { int ret; struct tpacket_stats stats; socklen_t slen = sizeof(stats); memset(&stats, 0, sizeof(stats)); ret = getsockopt(sock, SOL_PACKET, PACKET_STATISTICS, &stats, &slen); if (ret == 0) { *packets = stats.tp_packets; *drops = stats.tp_drops; } return ret; } #endif /* HAVE_TPACKET3 */ void destroy_rx_ring(int sock, struct ring *ring) { int ret; bool v3 = is_tpacket_v3(sock); munmap(ring->mm_space, ring->mm_len); ring->mm_len = 0; xfree(ring->frames); /* In general, this is freed during close(2) anyway. 
*/ if (v3) return; fmemset(&ring->layout, 0, sizeof(ring->layout)); ret = setsockopt(sock, SOL_PACKET, PACKET_RX_RING, &ring->layout, sizeof(ring->layout)); if (unlikely(ret)) panic("Cannot destroy the RX_RING: %s!\n", strerror(errno)); } static void setup_rx_ring_layout(int sock, struct ring *ring, size_t size, bool jumbo_support, bool v3) { setup_ring_layout_generic(sock, ring, size, jumbo_support); if (v3) { setup_rx_ring_layout_v3(ring); set_sockopt_tpacket_v3(sock); } else { set_sockopt_tpacket_v2(sock); } ring_verify_layout(ring); } static void create_rx_ring(int sock, struct ring *ring, bool verbose) { int ret; bool v3 = is_tpacket_v3(sock); size_t layout_size = get_ring_layout_size(ring, v3); retry: ret = setsockopt(sock, SOL_PACKET, PACKET_RX_RING, &ring->raw, layout_size); if (errno == ENOMEM && ring->layout.tp_block_nr > 1) { shrink_ring_layout_generic(ring); goto retry; } if (ret < 0) panic("Cannot allocate RX_RING!\n"); ring->mm_len = (size_t) ring->layout.tp_block_size * ring->layout.tp_block_nr; if (verbose) { if (!v3) { printf("RX,V2: %.2Lf MiB, %u Frames, each %u Byte allocated\n", (long double) ring->mm_len / (1 << 20), ring->layout.tp_frame_nr, ring->layout.tp_frame_size); } else { printf("RX,V3: %.2Lf MiB, %u Blocks, each %u Byte allocated\n", (long double) ring->mm_len / (1 << 20), ring->layout.tp_block_nr, ring->layout.tp_block_size); } } } static void alloc_rx_ring_frames(int sock, struct ring *ring) { bool v3 = is_tpacket_v3(sock); alloc_ring_frames_generic(ring, rx_ring_get_num(ring, v3), rx_ring_get_size(ring, v3)); } void join_fanout_group(int sock, uint32_t fanout_group, uint32_t fanout_type) { uint32_t fanout_opt = 0; int ret; if (fanout_group == 0) return; fanout_opt = (fanout_group & 0xffff) | (fanout_type << 16); ret = setsockopt(sock, SOL_PACKET, PACKET_FANOUT, &fanout_opt, sizeof(fanout_opt)); if (ret < 0) panic("Cannot set fanout ring mode!\n"); } void ring_rx_setup(struct ring *ring, int sock, size_t size, int ifindex, struct 
pollfd *poll, bool v3, bool jumbo_support, bool verbose, uint32_t fanout_group, uint32_t fanout_type) { fmemset(ring, 0, sizeof(*ring)); setup_rx_ring_layout(sock, ring, size, jumbo_support, v3); create_rx_ring(sock, ring, verbose); mmap_ring_generic(sock, ring); alloc_rx_ring_frames(sock, ring); bind_ring_generic(sock, ring, ifindex, false); join_fanout_group(sock, fanout_group, fanout_type); prepare_polling(sock, poll); } io parent92d21ac74a9e3c09b0b01c764e530657e4c85c49 (diff)
libceph: apply new_state before new_up_client on incrementals
Currently, osd_weight and osd_state fields are updated in the encoding
order. This is wrong, because an incremental map may look like e.g.

    new_up_client: { osd=6, addr=... } # set osd_state and addr
    new_state: { osd=6, xorstate=EXISTS } # clear osd_state

Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After
applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on
with the new_state update, we flip EXISTS and leave osd6 in a weird
"!EXISTS but UP" state. A non-existent OSD is considered down by the
mapping code

    2087 for (i = 0; i < pg->pg_temp.len; i++) {
    2088         if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
    2089                 if (ceph_can_shift_osds(pi))
    2090                         continue;
    2091
    2092                 temp->osds[temp->size++] = CRUSH_ITEM_NONE;

and so requests get directed to the second OSD in the set instead of the
first, resulting in OSD-side errors like:

    [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680

and hung rbds on the client:

    [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0)
    [ 493.566805] rbd: rbd0: result -6 xferred 400000
    [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688

The fix is to decouple application from the decoding and:

- apply new_weight first
- apply new_state before new_up_client
- twiddle osd_state flags if marking in
- clear out some of the state if osd is destroyed

Fixes: http://tracker.ceph.com/issues/14901

Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd
Cc: stable@vger.kernel.org # 3.15+
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Diffstat (limited to 'Documentation/devicetree/bindings/iio')