/*
 * Copyright (C) 2016 Red Hat, Inc.
 * Author: Michael S. Tsirkin <mst@redhat.com>
 * This work is licensed under the terms of the GNU GPL, version 2.
 *
 * Simple descriptor-based ring. virtio 0.9 compatible event index is used for
 * signalling, unconditionally.
 */
#define _GNU_SOURCE
#include "main.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

/* Next - Where next entry will be written.
 * Prev - "Next" value when event triggered previously.
 * Event - Peer requested event after writing this entry.
 */
static inline bool need_event(unsigned short event,
			      unsigned short next,
			      unsigned short prev)
{
	return (unsigned short)(next - event - 1) < (unsigned short)(next - prev);
}

/* Design:
 * Guest adds descriptors with unique index values and DESC_HW in flags.
 * Host overwrites used descriptors with correct len, index, and DESC_HW clear.
 * Flags are always set last.
 */
#define DESC_HW 0x1

struct desc {
	unsigned short flags;
	unsigned short index;
	unsigned len;
	unsigned long long addr;
};

/* how much padding is needed to avoid false cache sharing */
#define HOST_GUEST_PADDING 0x80

/* Mostly read */
struct event {
	unsigned short kick_index;
	unsigned char reserved0[HOST_GUEST_PADDING - 2];
	unsigned short call_index;
	unsigned char reserved1[HOST_GUEST_PADDING - 2];
};

struct data {
	void *buf; /* descriptor is writeable, we can't get buf from there */
	void *data;
} *data;

struct desc *ring;
struct event *event;

struct guest {
	unsigned avail_idx;
	unsigned last_used_idx;
	unsigned num_free;
	unsigned kicked_avail_idx;
	unsigned char reserved[HOST_GUEST_PADDING - 12];
} guest;

struct host {
	/* we do not need to track last avail index
	 * unless we have more than one in flight.
	 */
	unsigned used_idx;
	unsigned called_used_idx;
	unsigned char reserved[HOST_GUEST_PADDING - 4];
} host;

/* implemented by ring */
void alloc_ring(void)
{
	int ret;
	int i;

	ret = posix_memalign((void **)&ring, 0x1000, ring_size * sizeof *ring);
	if (ret) {
		perror("Unable to allocate ring buffer.\n");
		exit(3);
	}
	event = malloc(sizeof *event);
	if (!event) {
		perror("Unable to allocate event buffer.\n");
		exit(3);
	}
	memset(event, 0, sizeof *event);
	guest.avail_idx = 0;
	guest.kicked_avail_idx = -1;
	guest.last_used_idx = 0;
	host.used_idx = 0;
	host.called_used_idx = -1;
	for (i = 0; i < ring_size; ++i) {
		struct desc desc = {
			.index = i,
		};
		ring[i] = desc;
	}
	guest.num_free = ring_size;
	data = malloc(ring_size * sizeof *data);
	if (!data) {
		perror("Unable to allocate data buffer.\n");
		exit(3);
	}
	memset(data, 0, ring_size * sizeof *data);
}

/* guest side */
int add_inbuf(unsigned len, void *buf, void *datap)
{
	unsigned head, index;

	if (!guest.num_free)
		return -1;

	guest.num_free--;
	head = (ring_size - 1) & (guest.avail_idx++);

	/* Start with a write. On MESI architectures this helps
	 * avoid a shared state with consumer that is polling this descriptor.
	 */
	ring[head].addr = (unsigned long)(void*)buf;
	ring[head].len = len;
	/* read below might bypass write above. That is OK because it's just an
	 * optimization. If this happens, we will get the cache line in a
	 * shared state which is unfortunate, but probably not worth it to
	 * add an explicit full barrier to avoid this.
	 */
	barrier();
	index = ring[head].index;
	data[index].buf = buf;
	data[index].data = datap;
	/* Barrier A (for pairing) */
	smp_release();
	ring[head].flags = DESC_HW;

	return 0;
}

void *get_buf(unsigned *lenp, void **bufp)
{
	unsigned head = (ring_size - 1) & guest.last_used_idx;
	unsigned index;
	void *datap;

	if (ring[head].flags & DESC_HW)
		return NULL;
	/* Barrier B (for pairing) */
	smp_acquire();
	*lenp = ring[head].len;
	index = ring[head].index & (ring_size - 1);
	datap = data[index].data;
	*bufp = data[index].buf;
	data[index].buf = NULL;
	data[index].data = NULL;
	guest.num_free++;
	guest.last_used_idx++;
	return datap;
}

bool used_empty()
{
	unsigned head = (ring_size - 1) & guest.last_used_idx;

	return (ring[head].flags & DESC_HW);
}

void disable_call()
{
	/* Doing nothing to disable calls might cause
	 * extra interrupts, but reduces the number of cache misses.
	 */
}

bool enable_call()
{
	event->call_index = guest.last_used_idx;
	/* Flush call index write */
	/* Barrier D (for pairing) */
	smp_mb();
	return used_empty();
}

void kick_available(void)
{
	/* Flush in previous flags write */
	/* Barrier C (for pairing) */
	smp_mb();
	if (!need_event(event->kick_index,
			guest.avail_idx,
			guest.kicked_avail_idx))
		return;

	guest.kicked_avail_idx = guest.avail_idx;
	kick();
}

/* host side */
void disable_kick()
{
	/* Doing nothing to disable kicks might cause
	 * extra interrupts, but reduces the number of cache misses.
	 */
}

bool enable_kick()
{
	event->kick_index = host.used_idx;
	/* Barrier C (for pairing) */
	smp_mb();
	return avail_empty();
}

bool avail_empty()
{
	unsigned head = (ring_size - 1) & host.used_idx;

	return !(ring[head].flags & DESC_HW);
}

bool use_buf(unsigned *lenp, void **bufp)
{
	unsigned head = (ring_size - 1) & host.used_idx;

	if (!(ring[head].flags & DESC_HW))
		return false;

	/* make sure length read below is not speculated */
	/* Barrier A (for pairing) */
	smp_acquire();

	/* simple in-order completion: we don't need
	 * to touch index at all. This also means we
	 * can just modify the descriptor in-place.
	 */
	ring[head].len--;
	/* Make sure len is valid before flags.
	 * Note: alternative is to write len and flags in one access -
	 * possible on 64 bit architectures but wmb is free on Intel anyway
	 * so I have no way to test whether it's a gain.
	 */
	/* Barrier B (for pairing) */
	smp_release();
	ring[head].flags = 0;
	host.used_idx++;
	return true;
}

void call_used(void)
{
	/* Flush in previous flags write */
	/* Barrier D (for pairing) */
	smp_mb();
	if (!need_event(event->call_index,
			host.used_idx,
			host.called_used_idx))
		return;

	host.called_used_idx = host.used_idx;
	call();
}
lude?h=nds-private-remove&amp;id=4759d386d55fef452d692bf101167914437e848e&amp;id2=238d1d0f79f619d75c2cc741d6770fb0986aef24'>diff</a>)</td></tr><tr><th>parent</th><td colspan='2' class='oid'><a href='/cgit.cgi/linux/net-next.git/commit/tools/include?h=nds-private-remove&amp;id=1db175428ee374489448361213e9c3b749d14900'>1db175428ee374489448361213e9c3b749d14900</a> (<a href='/cgit.cgi/linux/net-next.git/diff/tools/include?h=nds-private-remove&amp;id=4759d386d55fef452d692bf101167914437e848e&amp;id2=1db175428ee374489448361213e9c3b749d14900'>diff</a>)</td></tr></table>
<div class='commit-subject'>Merge branch 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm</div><div class='commit-msg'>Pull DAX updates from Dan Williams:
 "The completion of Jan's DAX work for 4.10.

  As I mentioned in the libnvdimm-for-4.10 pull request, these are some
  final fixes for the DAX dirty-cacheline-tracking invalidation work
  that was merged through the -mm, ext4, and xfs trees in -rc1. These
  patches were prepared prior to the merge window, but we waited for
  4.10-rc1 to have a stable merge base after all the prerequisites were
  merged.

  Quoting Jan on the overall changes in these patches:

     "So I'd like all these 6 patches to go for rc2. The first three
      patches fix invalidation of exceptional DAX entries (a bug which
      is there for a long time) - without these patches data loss can
      occur on power failure even though user called fsync(2). The other
      three patches change locking of DAX faults so that -&gt;iomap_begin()
      is called in a more relaxed locking context and we are safe to
      start a transaction there for ext4"

  These have received a build success notification from the kbuild
  robot, and pass the latest libnvdimm unit tests. There have not been
  any -next releases since -rc1, so they have not appeared there"

* 'libnvdimm-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm:
  ext4: Simplify DAX fault path
  dax: Call -&gt;iomap_begin without entry lock during dax fault
  dax: Finish fault completely when loading holes
  dax: Avoid page invalidation races and unnecessary radix tree traversals
  mm: Invalidate DAX radix tree entries only if appropriate
  ext2: Return BH_New buffers for zeroed blocks
</div><div class='diffstat-header'><a href='/cgit.cgi/linux/net-next.git/diff/?h=nds-private-remove&amp;id=4759d386d55fef452d692bf101167914437e848e'>Diffstat</a> (limited to 'tools/include')</div><table summary='diffstat' class='diffstat'>