path: root/tools/perf/arch/arm/tests/dwarf-unwind.c
blob: 62eff847f91c4edc4754fb08a110f7b88b9c0411 (plain)
#include <string.h>
#include <stdlib.h>		/* malloc(), calloc(), free() */
#include "perf_regs.h"
#include "thread.h"
#include "map.h"
#include "event.h"
#include "debug.h"
#include "tests/tests.h"

#define STACK_SIZE 8192

/*
 * Copy (at most STACK_SIZE bytes of) the current user stack into the
 * sample, so the DWARF unwinder has real stack contents to walk.
 */
static int sample_ustack(struct perf_sample *sample,
			 struct thread *thread, u64 *regs)
{
	struct stack_dump *stack = &sample->user_stack;
	struct map *map;
	unsigned long sp;
	u64 stack_size, *buf;

	buf = malloc(STACK_SIZE);
	if (!buf) {
		pr_debug("failed to allocate sample uregs data\n");
		return -1;
	}

	sp = (unsigned long) regs[PERF_REG_ARM_SP];

	map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
	if (!map) {
		pr_debug("failed to get stack map\n");
		free(buf);
		return -1;
	}

	stack_size = map->end - sp;
	stack_size = stack_size > STACK_SIZE ? STACK_SIZE : stack_size;

	memcpy(buf, (void *) sp, stack_size);
	stack->data = (char *) buf;
	stack->size = stack_size;
	return 0;
}

/*
 * Arch hook for the generic DWARF unwind test: record the live user
 * registers via perf_regs_load() plus a snapshot of the user stack, so
 * the test can unwind the result as if it came from a real perf sample.
 */
int test__arch_unwind_sample(struct perf_sample *sample,
			     struct thread *thread)
{
	struct regs_dump *regs = &sample->user_regs;
	u64 *buf;

	buf = calloc(1, sizeof(u64) * PERF_REGS_MAX);
	if (!buf) {
		pr_debug("failed to allocate sample uregs data\n");
		return -1;
	}

	perf_regs_load(buf);
	regs->abi  = PERF_SAMPLE_REGS_ABI;
	regs->regs = buf;
	regs->mask = PERF_REGS_MASK;

	return sample_ustack(sample, thread, buf);
}
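
For context, a rough sketch of how the generic test code that consumes this hook might drive it (the helper name run_arch_sample() and the cleanup details are illustrative assumptions, not code from this tree):

/*
 * Hypothetical caller: build a synthetic sample for the current thread,
 * then release the buffers test__arch_unwind_sample() allocated.
 */
static int run_arch_sample(struct thread *thread)
{
	struct perf_sample sample;

	memset(&sample, 0, sizeof(sample));

	if (test__arch_unwind_sample(&sample, thread)) {
		pr_debug("arch unwind sample setup failed\n");
		return -1;
	}

	/* ... hand 'sample' to the DWARF unwinder here ... */

	free(sample.user_regs.regs);	/* u64 buffer from calloc() above */
	free(sample.user_stack.data);	/* stack copy from malloc() above */
	return 0;
}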
specialized: clear a bit in a byte, and test the sign bit. Nick doesn't love the resulting name of the new primitive, but I'd rather make the name be descriptive and very clear about the limitation imposed by trying to work across all relevant architectures than make it be some generic thing that doesn't make the odd semantics explicit.

So this introduces the new architecture primitive

    clear_bit_unlock_is_negative_byte();

and adds the trivial implementation for x86. We have a generic non-optimized fallback (that just does a "clear_bit()"+"test_bit(7)" combination) which can be overridden by any architecture that can do better. According to Nick, Power has the same hiccup x86 has, for example, but some other architectures may not even care.

All these optimizations mean that my page locking stress-test (which is just executing a lot of small short-lived shell scripts: "make test" in the git source tree) no longer makes our page locking look horribly bad. Before all these optimizations, just the unlock_page() costs were just over 3% of all CPU overhead on "make test". After this, it's down to 0.66%, so just a quarter of the cost it used to be.

(The difference on NUMA is bigger, but there this micro-optimization is likely less noticeable, since the big issue on NUMA was not the accesses to 'struct page', but the waitqueue accesses that were already removed by Nick's earlier commit).

Acked-by: Nick Piggin <npiggin@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Bob Peterson <rpeterso@redhat.com>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Cc: Andrew Lutomirski <luto@kernel.org>
Cc: Andreas Gruenbacher <agruenba@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
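
As a rough illustration of the generic fallback described above, here is a minimal sketch (not the exact upstream code; it assumes the bit being cleared lives in the lowest byte of the word, so bit 7 is that byte's sign bit, and it uses the standard kernel clear_bit()/test_bit() bitops):

#ifndef clear_bit_unlock_is_negative_byte
static inline bool clear_bit_unlock_is_negative_byte(long nr,
						     volatile unsigned long *addr)
{
	/* Two separate operations: clear the requested bit... */
	clear_bit(nr, addr);
	/* ...then test the sign bit (bit 7) of the containing byte. */
	return test_bit(7, addr);
}
#endif

An architecture that can do the clear and the sign test in a single atomic byte operation (the x86 case mentioned above) overrides this with its own definition.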