From d8cdc6ab87550de9c93b1f6763ea6015f292d7fb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 31 May 2013 15:07:15 +0200 Subject: ring: netsniff-ng: migrate capture only to TPACKET_V3 Let's migrate capturing to TPACKET_V3, since it will bring better performance due to fewer page cache misses caused by a higher density of packets, since now they are contiguously placed in the ring buffer. It is said that TPACKET_V3 brings the following benefits: *) ~15 - 20% reduction in CPU-usage *) ~20% increase in packet capture rate *) ~2x increase in packet density *) Port aggregation analysis *) Non-static frame size to capture entire packet payload Signed-off-by: Daniel Borkmann --- dissector.h | 29 +++++++---- netsniff-ng.c | 154 ++++++++++++++++++++++++++++++---------------------------- pcap_io.h | 101 +++++++++++++++++++++++--------------- ring.h | 7 +++ ring_rx.c | 2 +- xutils.c | 9 ++-- xutils.h | 2 +- 7 files changed, 176 insertions(+), 128 deletions(-) diff --git a/dissector.h b/dissector.h index d211e06..2c2c128 100644 --- a/dissector.h +++ b/dissector.h @@ -9,6 +9,7 @@ #include #include +#include #include "ring.h" #include "tprintf.h" @@ -45,31 +46,41 @@ static inline const char *__show_ts_source(uint32_t status) return ""; } -static inline void show_frame_hdr(struct frame_map *hdr, int mode) +static inline void __show_frame_hdr(struct sockaddr_ll *s_ll, + void *raw, int mode, bool v3) { char tmp[IFNAMSIZ]; + union tpacket_uhdr hdr; if (mode == PRINT_NONE) return; + hdr.raw = raw; + switch (mode) { case PRINT_LESS: tprintf("%s %s %u", - packet_types[hdr->s_ll.sll_pkttype] ? : "?", - if_indextoname(hdr->s_ll.sll_ifindex, tmp) ? : "?", - hdr->tp_h.tp_len); + packet_types[s_ll->sll_pkttype] ? : "?", + if_indextoname(s_ll->sll_ifindex, tmp) ? : "?", + v3 ? hdr.h3->tp_len : hdr.h2->tp_len); break; default: tprintf("%s %s %u %us.%uns %s\n", - packet_types[hdr->s_ll.sll_pkttype] ? : "?", - if_indextoname(hdr->s_ll.sll_ifindex, tmp) ?
: "?", - hdr->tp_h.tp_len, hdr->tp_h.tp_sec, - hdr->tp_h.tp_nsec, - __show_ts_source(hdr->tp_h.tp_status)); + packet_types[s_ll->sll_pkttype] ? : "?", + if_indextoname(s_ll->sll_ifindex, tmp) ? : "?", + v3 ? hdr.h3->tp_len : hdr.h2->tp_len, + v3 ? hdr.h3->tp_sec : hdr.h2->tp_sec, + v3 ? hdr.h3->tp_nsec : hdr.h2->tp_nsec, + v3 ? "" : __show_ts_source(hdr.h2->tp_status)); break; } } +static inline void show_frame_hdr(struct frame_map *hdr, int mode) +{ + __show_frame_hdr(&hdr->s_ll, &hdr->tp_h, mode, false); +} + extern void dissector_init_all(int fnttype); extern void dissector_entry_point(uint8_t *packet, size_t len, int linktype, int mode); extern void dissector_cleanup_all(void); diff --git a/netsniff-ng.c b/netsniff-ng.c index e5e91ed..4211386 100644 --- a/netsniff-ng.c +++ b/netsniff-ng.c @@ -475,7 +475,7 @@ static void receive_to_xmit(struct ctx *ctx) timer_purge(); - sock_print_net_stats(rx_sock, 0); + sock_print_net_stats(rx_sock); bpf_release(&bpf_ops); @@ -788,10 +788,9 @@ static int begin_single_pcap_file(struct ctx *ctx) return fd; } -static void print_pcap_file_stats(int sock, struct ctx *ctx, unsigned long skipped) +static void print_pcap_file_stats(int sock, struct ctx *ctx) { int ret; - unsigned long good, bad; struct tpacket_stats kstats; socklen_t slen = sizeof(kstats); @@ -802,27 +801,89 @@ static void print_pcap_file_stats(int sock, struct ctx *ctx, unsigned long skipp panic("Cannot get packet statistics!\n"); if (ctx->print_mode == PRINT_NONE) { - good = kstats.tp_packets - kstats.tp_drops - skipped; - bad = kstats.tp_drops + skipped; - - printf(".(+%lu/-%lu)", good, bad); + printf(".(+%u/-%u)", kstats.tp_packets - kstats.tp_drops, + kstats.tp_drops); fflush(stdout); } } -static void recv_only_or_dump(struct ctx *ctx) +static void walk_t3_block(struct block_desc *pbd, struct ctx *ctx, + int sock, int fd) { uint8_t *packet; + int num_pkts = pbd->h1.num_pkts, i, ret; + unsigned long frame_count = 0; + struct tpacket3_hdr *hdr; + pcap_pkthdr_t 
phdr; + struct sockaddr_ll *sll; + + hdr = (void *) ((uint8_t *) pbd + pbd->h1.offset_to_first_pkt); + sll = (void *) ((uint8_t *) hdr + TPACKET_ALIGN(sizeof(*hdr))); + + for (i = 0; i < num_pkts && likely(sigint == 0); ++i) { + __label__ next; + packet = ((uint8_t *) hdr + hdr->tp_mac); + frame_count++; + + if (ctx->packet_type != -1) + if (ctx->packet_type != sll->sll_pkttype) + goto next; + + if (dump_to_pcap(ctx)) { + tpacket3_hdr_to_pcap_pkthdr(hdr, sll, &phdr, ctx->magic); + + ret = __pcap_io->write_pcap(fd, &phdr, ctx->magic, packet, + pcap_get_length(&phdr, ctx->magic)); + if (unlikely(ret != pcap_get_total_length(&phdr, ctx->magic))) + panic("Write error to pcap!\n"); + } + + __show_frame_hdr(sll, hdr, ctx->print_mode, true); + + dissector_entry_point(packet, hdr->tp_snaplen, ctx->link_type, + ctx->print_mode); + next: + + hdr = (void *) ((uint8_t *) hdr + hdr->tp_next_offset); + sll = (void *) ((uint8_t *) hdr + TPACKET_ALIGN(sizeof(*hdr))); + + if (frame_count_max != 0) { + if (frame_count >= frame_count_max) { + sigint = 1; + break; + } + } + + if (dump_to_pcap(ctx)) { + if (ctx->dump_mode == DUMP_INTERVAL_SIZE) { + interval += hdr->tp_snaplen; + if (interval > ctx->dump_interval) { + next_dump = true; + interval = 0; + } + } + + if (next_dump) { + fd = next_multi_pcap_file(ctx, fd); + next_dump = false; + + if (unlikely(ctx->verbose)) + print_pcap_file_stats(sock, ctx); + } + } + } +} + +static void recv_only_or_dump(struct ctx *ctx) +{ short ifflags = 0; int sock, irq, ifindex, fd = 0, ret; unsigned int size, it = 0; - unsigned long frame_count = 0, skipped = 0; struct ring rx_ring; struct pollfd rx_poll; - struct frame_map *hdr; struct sock_fprog bpf_ops; struct timeval start, end, diff; - pcap_pkthdr_t phdr; + struct block_desc *pbd; sock = pf_socket(); @@ -851,7 +912,7 @@ static void recv_only_or_dump(struct ctx *ctx) set_sockopt_hwtimestamp(sock, ctx->device_in); - setup_rx_ring_layout(sock, &rx_ring, size, ctx->jumbo, false); + 
setup_rx_ring_layout(sock, &rx_ring, size, ctx->jumbo, true); create_rx_ring(sock, &rx_ring, ctx->verbose); mmap_rx_ring(sock, &rx_ring); alloc_rx_ring_frames(sock, &rx_ring); @@ -903,72 +964,15 @@ static void recv_only_or_dump(struct ctx *ctx) bug_on(gettimeofday(&start, NULL)); while (likely(sigint == 0)) { - while (user_may_pull_from_rx(rx_ring.frames[it].iov_base)) { - __label__ next; + while (user_may_pull_from_rx_block((pbd = (void *) + rx_ring.frames[it].iov_base))) { + walk_t3_block(pbd, ctx, sock, fd); - hdr = rx_ring.frames[it].iov_base; - packet = ((uint8_t *) hdr) + hdr->tp_h.tp_mac; - frame_count++; - - if (ctx->packet_type != -1) - if (ctx->packet_type != hdr->s_ll.sll_pkttype) - goto next; - - if (unlikely(ring_frame_size(&rx_ring) < hdr->tp_h.tp_snaplen)) { - skipped++; - goto next; - } - - if (dump_to_pcap(ctx)) { - tpacket_hdr_to_pcap_pkthdr(&hdr->tp_h, &hdr->s_ll, &phdr, ctx->magic); - - ret = __pcap_io->write_pcap(fd, &phdr, ctx->magic, packet, - pcap_get_length(&phdr, ctx->magic)); - if (unlikely(ret != pcap_get_total_length(&phdr, ctx->magic))) - panic("Write error to pcap!\n"); - } - - show_frame_hdr(hdr, ctx->print_mode); - - dissector_entry_point(packet, hdr->tp_h.tp_snaplen, - ctx->link_type, ctx->print_mode); - - if (frame_count_max != 0) { - if (frame_count >= frame_count_max) { - sigint = 1; - break; - } - } - - next: - - kernel_may_pull_from_rx(&hdr->tp_h); - - it++; - if (it >= rx_ring.layout.tp_frame_nr) - it = 0; + kernel_may_pull_from_rx_block(pbd); + it = (it + 1) % rx_ring.layout3.tp_block_nr; if (unlikely(sigint == 1)) break; - - if (dump_to_pcap(ctx)) { - if (ctx->dump_mode == DUMP_INTERVAL_SIZE) { - interval += hdr->tp_h.tp_snaplen; - - if (interval > ctx->dump_interval) { - next_dump = true; - interval = 0; - } - } - - if (next_dump) { - fd = next_multi_pcap_file(ctx, fd); - next_dump = false; - - if (ctx->verbose) - print_pcap_file_stats(sock, ctx, skipped); - } - } } poll(&rx_poll, 1, -1); @@ -978,7 +982,7 @@ static void 
recv_only_or_dump(struct ctx *ctx) timersub(&end, &start, &diff); if (!(ctx->dump_dir && ctx->print_mode == PRINT_NONE)) { - sock_print_net_stats(sock, skipped); + sock_print_net_stats(sock); printf("\r%12lu sec, %lu usec in total\n", diff.tv_sec, diff.tv_usec); diff --git a/pcap_io.h b/pcap_io.h index 64689af..98f16cf 100644 --- a/pcap_io.h +++ b/pcap_io.h @@ -274,66 +274,67 @@ static inline u32 pcap_get_total_length(pcap_pkthdr_t *phdr, enum pcap_type type } } -static inline void tpacket_hdr_to_pcap_pkthdr(struct tpacket2_hdr *thdr, - struct sockaddr_ll *sll, - pcap_pkthdr_t *phdr, - enum pcap_type type) +static inline void +__tpacket_hdr_to_pcap_pkthdr(uint32_t sec, uint32_t nsec, uint32_t snaplen, + uint32_t len, uint32_t status, + struct sockaddr_ll *sll, pcap_pkthdr_t *phdr, + enum pcap_type type) { switch (type) { case DEFAULT: - phdr->ppo.ts.tv_sec = thdr->tp_sec; - phdr->ppo.ts.tv_usec = thdr->tp_nsec / 1000; - phdr->ppo.caplen = thdr->tp_snaplen; - phdr->ppo.len = thdr->tp_len; + phdr->ppo.ts.tv_sec = sec; + phdr->ppo.ts.tv_usec = nsec / 1000; + phdr->ppo.caplen = snaplen; + phdr->ppo.len = len; break; case DEFAULT_SWAPPED: - phdr->ppo.ts.tv_sec = ___constant_swab32(thdr->tp_sec); - phdr->ppo.ts.tv_usec = ___constant_swab32(thdr->tp_nsec / 1000); - phdr->ppo.caplen = ___constant_swab32(thdr->tp_snaplen); - phdr->ppo.len = ___constant_swab32(thdr->tp_len); + phdr->ppo.ts.tv_sec = ___constant_swab32(sec); + phdr->ppo.ts.tv_usec = ___constant_swab32(nsec / 1000); + phdr->ppo.caplen = ___constant_swab32(snaplen); + phdr->ppo.len = ___constant_swab32(len); break; case NSEC: - phdr->ppn.ts.tv_sec = thdr->tp_sec; - phdr->ppn.ts.tv_nsec = thdr->tp_nsec; - phdr->ppn.caplen = thdr->tp_snaplen; - phdr->ppn.len = thdr->tp_len; + phdr->ppn.ts.tv_sec = sec; + phdr->ppn.ts.tv_nsec = nsec; + phdr->ppn.caplen = snaplen; + phdr->ppn.len = len; break; case NSEC_SWAPPED: - phdr->ppn.ts.tv_sec = ___constant_swab32(thdr->tp_sec); - phdr->ppn.ts.tv_nsec = 
___constant_swab32(thdr->tp_nsec); - phdr->ppn.caplen = ___constant_swab32(thdr->tp_snaplen); - phdr->ppn.len = ___constant_swab32(thdr->tp_len); + phdr->ppn.ts.tv_sec = ___constant_swab32(sec); + phdr->ppn.ts.tv_nsec = ___constant_swab32(nsec); + phdr->ppn.caplen = ___constant_swab32(snaplen); + phdr->ppn.len = ___constant_swab32(len); break; case KUZNETZOV: - phdr->ppk.ts.tv_sec = thdr->tp_sec; - phdr->ppk.ts.tv_usec = thdr->tp_nsec / 1000; - phdr->ppk.caplen = thdr->tp_snaplen; - phdr->ppk.len = thdr->tp_len; + phdr->ppk.ts.tv_sec = sec; + phdr->ppk.ts.tv_usec = nsec / 1000; + phdr->ppk.caplen = snaplen; + phdr->ppk.len = len; phdr->ppk.ifindex = sll->sll_ifindex; phdr->ppk.protocol = sll->sll_protocol; phdr->ppk.pkttype = sll->sll_pkttype; break; case KUZNETZOV_SWAPPED: - phdr->ppk.ts.tv_sec = ___constant_swab32(thdr->tp_sec); - phdr->ppk.ts.tv_usec = ___constant_swab32(thdr->tp_nsec / 1000); - phdr->ppk.caplen = ___constant_swab32(thdr->tp_snaplen); - phdr->ppk.len = ___constant_swab32(thdr->tp_len); + phdr->ppk.ts.tv_sec = ___constant_swab32(sec); + phdr->ppk.ts.tv_usec = ___constant_swab32(nsec / 1000); + phdr->ppk.caplen = ___constant_swab32(snaplen); + phdr->ppk.len = ___constant_swab32(len); phdr->ppk.ifindex = ___constant_swab32(sll->sll_ifindex); phdr->ppk.protocol = ___constant_swab16(sll->sll_protocol); phdr->ppk.pkttype = sll->sll_pkttype; break; case BORKMANN: - phdr->ppb.ts.tv_sec = thdr->tp_sec; - phdr->ppb.ts.tv_nsec = thdr->tp_nsec; - phdr->ppb.caplen = thdr->tp_snaplen; - phdr->ppb.len = thdr->tp_len; - phdr->ppb.tsource = tp_to_pcap_tsource(thdr->tp_status); + phdr->ppb.ts.tv_sec = sec; + phdr->ppb.ts.tv_nsec = nsec; + phdr->ppb.caplen = snaplen; + phdr->ppb.len = len; + phdr->ppb.tsource = tp_to_pcap_tsource(status); phdr->ppb.ifindex = (u16) sll->sll_ifindex; phdr->ppb.protocol = sll->sll_protocol; phdr->ppb.hatype = sll->sll_hatype; @@ -341,11 +342,11 @@ static inline void tpacket_hdr_to_pcap_pkthdr(struct tpacket2_hdr *thdr, break; case 
BORKMANN_SWAPPED: - phdr->ppb.ts.tv_sec = ___constant_swab32(thdr->tp_sec); - phdr->ppb.ts.tv_nsec = ___constant_swab32(thdr->tp_nsec); - phdr->ppb.caplen = ___constant_swab32(thdr->tp_snaplen); - phdr->ppb.len = ___constant_swab32(thdr->tp_len); - phdr->ppb.tsource = ___constant_swab16(tp_to_pcap_tsource(thdr->tp_status)); + phdr->ppb.ts.tv_sec = ___constant_swab32(sec); + phdr->ppb.ts.tv_nsec = ___constant_swab32(nsec); + phdr->ppb.caplen = ___constant_swab32(snaplen); + phdr->ppb.len = ___constant_swab32(len); + phdr->ppb.tsource = ___constant_swab16(tp_to_pcap_tsource(status)); phdr->ppb.ifindex = ___constant_swab16((u16) sll->sll_ifindex); phdr->ppb.protocol = ___constant_swab16(sll->sll_protocol); phdr->ppb.hatype = sll->sll_hatype; @@ -357,6 +358,30 @@ static inline void tpacket_hdr_to_pcap_pkthdr(struct tpacket2_hdr *thdr, } } +/* We need to do this crap here since member offsets are not interleaved, + * so hopefully the compiler does its job here. ;-) + */ + +static inline void tpacket_hdr_to_pcap_pkthdr(struct tpacket2_hdr *thdr, + struct sockaddr_ll *sll, + pcap_pkthdr_t *phdr, + enum pcap_type type) +{ + __tpacket_hdr_to_pcap_pkthdr(thdr->tp_sec, thdr->tp_nsec, + thdr->tp_snaplen, thdr->tp_len, + thdr->tp_status, sll, phdr, type); +} + +static inline void tpacket3_hdr_to_pcap_pkthdr(struct tpacket3_hdr *thdr, + struct sockaddr_ll *sll, + pcap_pkthdr_t *phdr, + enum pcap_type type) +{ + __tpacket_hdr_to_pcap_pkthdr(thdr->tp_sec, thdr->tp_nsec, + thdr->tp_snaplen, thdr->tp_len, + 0, sll, phdr, type); +} + static inline void pcap_pkthdr_to_tpacket_hdr(pcap_pkthdr_t *phdr, enum pcap_type type, struct tpacket2_hdr *thdr, diff --git a/ring.h b/ring.h index be04cf0..8bfe1eb 100644 --- a/ring.h +++ b/ring.h @@ -26,6 +26,13 @@ #include "built_in.h" #include "die.h" +union tpacket_uhdr { + struct tpacket_hdr *h1; + struct tpacket2_hdr *h2; + struct tpacket3_hdr *h3; + void *raw; +}; + struct frame_map { struct tpacket2_hdr tp_h __aligned_tpacket; struct
sockaddr_ll s_ll __align_tpacket(sizeof(struct tpacket2_hdr)); diff --git a/ring_rx.c b/ring_rx.c index 0d1f828..ae8ce0a 100644 --- a/ring_rx.c +++ b/ring_rx.c @@ -59,7 +59,7 @@ void setup_rx_ring_layout(int sock, struct ring *ring, unsigned int size, sizeof(struct tpacket_req) != offsetof(struct tpacket_req3, tp_retire_blk_tov)); - ring->layout3.tp_retire_blk_tov = 0; + ring->layout3.tp_retire_blk_tov = 100; /* 0: let kernel decide */ ring->layout3.tp_sizeof_priv = 0; ring->layout3.tp_feature_req_word = 0; diff --git a/xutils.c b/xutils.c index 60b598c..5421d7d 100644 --- a/xutils.c +++ b/xutils.c @@ -716,7 +716,7 @@ int device_bind_irq_to_cpu(int irq, int cpu) return (ret > 0 ? 0 : ret); } -void sock_print_net_stats(int sock, unsigned long skipped) +void sock_print_net_stats(int sock) { int ret; struct tpacket_stats kstats; @@ -730,10 +730,11 @@ void sock_print_net_stats(int sock, unsigned long skipped) uint64_t drops = kstats.tp_drops; printf("\r%12ld packets incoming\n", packets); - printf("\r%12ld packets passed filter\n", packets - drops - skipped); - printf("\r%12ld packets failed filter (out of space)\n", drops + skipped); + printf("\r%12ld packets passed filter\n", packets - drops); + printf("\r%12ld packets failed filter (out of space)\n", drops); if (kstats.tp_packets > 0) - printf("\r%12.4lf%\% packet droprate\n", (1.0 * drops / packets) * 100.0); + printf("\r%12.4lf%\% packet droprate\n", + (1.0 * drops / packets) * 100.0); } } diff --git a/xutils.h b/xutils.h index 38c8da4..6e72b5c 100644 --- a/xutils.h +++ b/xutils.h @@ -39,7 +39,7 @@ extern int device_address(const char *ifname, int af, struct sockaddr_storage *s extern int device_irq_number(const char *ifname); extern int device_set_irq_affinity_list(int irq, unsigned long from, unsigned long to); extern int device_bind_irq_to_cpu(int irq, int cpu); -extern void sock_print_net_stats(int sock, unsigned long skipped); +extern void sock_print_net_stats(int sock); extern int device_ifindex(const char 
*ifname); extern short device_get_flags(const char *ifname); extern void device_set_flags(const char *ifname, const short flags); -- cgit v1.2.3-54-g00ecf