/* * netsniff-ng - the packet sniffing beast * Copyright 2009, 2010 Daniel Borkmann. * Copyright 2009, 2010 Emmanuel Roullit. * Subject to the GPL, version 2. */ #include #include #include #include #include #include #include #include #include #include "die.h" #include "xmalloc.h" #include "ring_tx.h" #include "built_in.h" static void set_packet_loss_discard(int sock) { int ret, discard = 1; ret = setsockopt(sock, SOL_PACKET, PACKET_LOSS, (void *) &discard, sizeof(discard)); if (ret < 0) panic("setsockopt: cannot set packet loss"); } void destroy_tx_ring(int sock, struct ring *ring) { int ret; munmap(ring->mm_space, ring->mm_len); ring->mm_len = 0; fmemset(&ring->layout, 0, sizeof(ring->layout)); ret = setsockopt(sock, SOL_PACKET, PACKET_TX_RING, &ring->layout, sizeof(ring->layout)); if (unlikely(ret)) panic("Cannot destroy the TX_RING: %s!\n", strerror(errno)); xfree(ring->frames); } static void setup_tx_ring_layout(int sock, struct ring *ring, size_t size, bool jumbo_support) { fmemset(&ring->layout, 0, sizeof(ring->layout)); ring->layout.tp_block_size = (jumbo_support ? RUNTIME_PAGE_SIZE << 4 : RUNTIME_PAGE_SIZE << 2); ring->layout.tp_frame_size = (jumbo_support ? TPACKET_ALIGNMENT << 12 : TPACKET_ALIGNMENT << 7); ring->layout.tp_block_nr = size / ring->layout.tp_block_size; ring->layout.tp_frame_nr = ring->layout.tp_block_size / ring->layout.tp_frame_size * ring->layout.tp_block_nr; set_sockopt_tpacket_v2(sock); ring_verify_layout(ring); } static void create_tx_ring(int sock, struct ring *ring, bool verbose) { int ret; retry: ret = setsockopt(sock, SOL_PACKET, PACKET_TX_RING, &ring->layout, sizeof(ring->layout)); if (errno == ENOMEM && ring->layout.tp_block_nr > 1) { ring->layout.tp_block_nr >>= 1; ring->layout.tp_frame_nr = ring->layout.tp_block_size / ring->layout.tp_frame_size * ring->layout.tp_block_nr; goto retry; } if (ret < 0) panic("Cannot allocate TX_RING!\n"); ring->mm_len = (size_t) ring->layout.tp_block_size * ring->layout.tp_block_nr; if (verbose) { printf("TX,V2: %.2Lf MiB, %u Frames, each %u Byte allocated\n", (long double) ring->mm_len / (1 << 20), ring->layout.tp_frame_nr, ring->layout.tp_frame_size); } } static void alloc_tx_ring_frames(int sock __maybe_unused, struct ring *ring) { alloc_ring_frames_generic(ring, ring->layout.tp_frame_nr, ring->layout.tp_frame_size); } void ring_tx_setup(struct ring *ring, int sock, size_t size, int ifindex, bool jumbo_support, bool verbose) { fmemset(ring, 0, sizeof(*ring)); set_packet_loss_discard(sock); setup_tx_ring_layout(sock, ring, size, jumbo_support); create_tx_ring(sock, ring, verbose); mmap_ring_generic(sock, ring); alloc_tx_ring_frames(sock, ring); bind_ring_generic(sock, ring, ifindex, true); } lass='label'>context:space:mode:
authorIlya Dryomov <idryomov@gmail.com>2016-07-19 03:50:28 +0200
committerIlya Dryomov <idryomov@gmail.com>2016-07-22 15:17:40 +0200
commit930c532869774ebf8af9efe9484c597f896a7d46 (patch)
tree68072d2a5c42dbf347514b8737f1abd0df044966
parent92d21ac74a9e3c09b0b01c764e530657e4c85c49 (diff)
libceph: apply new_state before new_up_client on incrementals
Currently, osd_weight and osd_state fields are updated in the encoding order. This is wrong, because an incremental map may look like e.g. new_up_client: { osd=6, addr=... } # set osd_state and addr new_state: { osd=6, xorstate=EXISTS } # clear osd_state Suppose osd6's current osd_state is EXISTS (i.e. osd6 is down). After applying new_up_client, osd_state is changed to EXISTS | UP. Carrying on with the new_state update, we flip EXISTS and leave osd6 in a weird "!EXISTS but UP" state. A non-existent OSD is considered down by the mapping code 2087 for (i = 0; i < pg->pg_temp.len; i++) { 2088 if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { 2089 if (ceph_can_shift_osds(pi)) 2090 continue; 2091 2092 temp->osds[temp->size++] = CRUSH_ITEM_NONE; and so requests get directed to the second OSD in the set instead of the first, resulting in OSD-side errors like: [WRN] : client.4239 192.168.122.21:0/2444980242 misdirected client.4239.1:2827 pg 2.5df899f2 to osd.4 not [1,4,6] in e680/680 and hung rbds on the client: [ 493.566367] rbd: rbd0: write 400000 at 11cc00000 (0) [ 493.566805] rbd: rbd0: result -6 xferred 400000 [ 493.567011] blk_update_request: I/O error, dev rbd0, sector 9330688 The fix is to decouple application from the decoding and: - apply new_weight first - apply new_state before new_up_client - twiddle osd_state flags if marking in - clear out some of the state if osd is destroyed Fixes: http://tracker.ceph.com/issues/14901 Cc: stable@vger.kernel.org # 3.15+: 6dd74e44dc1d: libceph: set 'exists' flag for newly up osd Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Ilya Dryomov <idryomov@gmail.com> Reviewed-by: Josh Durgin <jdurgin@redhat.com>