[PATCH] perf record: Add snapshot mode support for perf's regular events

For aux area tracing, there is already a snapshot mode support for
intel-pt and intel-bts events. Similarly, this patch adds a snapshot
mode for perf's regular events. A user space ring buffer is allocated to
handle the tracing data from the kernel space ring buffer, and the
tracing data will only dump to perf.data when perf receives a SIGUSR2
signal.

Similarly like '-S' in aux trace snapshot mode, '-M' enables perf's
regular event's snapshot mode by defining the size (bytes) of the user
space ring buffer.

Example 1:

$ perf record -a -M 10000000
/*
* Let perf record runs for some time before finally ends, and do not
* send any SIGUSR2 signal to perf during perf's running.
*/

$ perf report

Error:
The perf.data file has no samples!
# To display the perf.data header info, please use --header/--header-only options.

As shown above, without any SIGUSR2 signal, perf record will dump no samples
to perf.data in the snapshot mode.

Example 2:

$ perf record -a -M 10000000
/*
* Let perf record runs for some time before finally ends, and send
* several times of SIGUSR2 signal to perf during perf's running.
*/

# kill -SIGUSR2 `pidof perf`
...
# kill -SIGUSR2 `pidof perf`

$ perf report
<SNIP>
# Total Lost Samples: 0
#
# Samples: 942 of event 'cycles:pp'
# Event count (approx.): 175168972
#
# Overhead Command Shared Object Symbol
# ........ ............... ....................... .........................................
#
8.20% kworker/2:0 [kernel.kallsyms] [k] default_send_IPI_mask_allbutself_phys
6.33% swapper [kernel.kallsyms] [k] intel_idle
2.64% pidof [kernel.kallsyms] [k] arch_get_unmapped_area_topdown
2.56% pidof [kernel.kallsyms] [k] unmap_region
2.26% pidof [kernel.kallsyms] [k] memcpy
2.26% pidof libc-2.19.so [.] _IO_vfscanf
2.03% pidof [kernel.kallsyms] [k] lookup_fast
1.72% pidof [kernel.kallsyms] [k] filp_close
1.62% pidof [kernel.kallsyms] [k] apparmor_file_open
1.56% pidof [kernel.kallsyms] [k] process_measurement
1.50% pidof [kernel.kallsyms] [k] find_vma
<SNIP>

As shown above, perf record will dump samples to perf.data every time
it receives a SIGUSR2 signal in the snapshot mode.

Signed-off-by: Yunlong Song <***@huawei.com>
---
tools/perf/builtin-record.c | 181 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 170 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 199fc31..75606a6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,16 @@
#include <sched.h>
#include <sys/mman.h>

+static volatile int memory_enabled;
+static volatile int memory_signalled;
+/* The maximum size of one perf_event is 65536*/
+#define MEMORY_SIZE_MIN 65537
+
+struct memory {
+ void *start;
+ u64 head, tail;
+ u64 size;
+};

struct record {
struct perf_tool tool;
@@ -51,16 +61,134 @@ struct record {
bool no_buildid;
bool no_buildid_cache;
unsigned long long samples;
+ struct memory memory;
};

-static int record__write(struct record *rec, void *bf, size_t size)
+static int buf_to_file(struct record *rec, void *buf,
+ size_t size, u64 head, u64 tail)
{
- if (perf_data_file__write(rec->session->file, bf, size) < 0) {
- pr_err("failed to write perf data, error: %m\n");
+ size_t written = 0;
+
+ if (head < tail) {
+ if (perf_data_file__write(rec->session->file,
+ buf + head, tail - head) < 0)
+ goto out;
+ written += tail - head;
+ } else if (head > tail) {
+ if (perf_data_file__write(rec->session->file,
+ buf + head, size - head) < 0)
+ goto out;
+ written += size - head;
+
+ if (perf_data_file__write(rec->session->file, buf, tail) < 0)
+ goto out;
+ written += tail;
+ }
+
+ rec->bytes_written += written;
+ return 0;
+out:
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+}
+
+static int memory_to_file(struct record *rec)
+{
+ if (buf_to_file(rec, rec->memory.start, rec->memory.size,
+ rec->memory.head, rec->memory.tail) < 0)
return -1;
+ rec->memory.head = rec->memory.tail;
+
+ return 0;
+}
+
+static ssize_t perf_memory__write(struct memory *memory, void *buf, size_t size)
+{
+ void *buf_start = buf;
+ size_t left = size, written, delta, skip;
+ union perf_event *event;
+ struct perf_event_header hdr;
+ struct record *rec = container_of(memory, struct record, memory);
+
+ while (left) {
+ skip = 0;
+ written = min(left, memory->size - memory->tail);
+ if (memory->head > memory->tail)
+ delta = memory->head - memory->tail;
+ else
+ delta = memory->size - memory->tail + memory->head;
+ if (delta <= written) {
+ do {
+ if ((memory->head + skip) <= (memory->size -
+ sizeof(struct perf_event_header)))
+ event = (union perf_event *)(memory->start +
+ memory->head + skip);
+ else {
+ size_t hdr_left;
+
+ hdr_left = sizeof(struct perf_event_header) -
+ memory->size + memory->head + skip;
+ memcpy(&hdr, memory->start + memory->head + skip,
+ sizeof(struct perf_event_header) - hdr_left);
+
+ if (hdr_left <= memory->tail)
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, memory->start, hdr_left);
+ else if (!memory->tail)
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, buf, hdr_left);
+ else {
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, memory->start, memory->tail);
+ hdr_left -= memory->tail;
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, buf, hdr_left);
+ }
+
+ event = (union perf_event *)&hdr;
+ if (rec->session->header.needs_swap)
+ perf_event_header__bswap(&event->header);
+ }
+
+ if (event->header.type != PERF_RECORD_SAMPLE) {
+ if (buf_to_file(rec, memory->start, memory->size,
+ memory->head + skip, (memory->head + skip +
+ event->header.size) % memory->size) < 0)
+ return -1;
+ }
+
+ skip += event->header.size;
+ } while (skip <= written - delta);
+ }
+
+ memcpy(memory->start + memory->tail, buf, written);
+
+ memory->head = (memory->head + skip) % memory->size;
+ memory->tail = (memory->tail + written) % memory->size;
+
+ left -= written;
+ buf += written;
+ }
+
+ BUG_ON((size_t)(buf - buf_start) != size);
+ return size;
+}
+
+static int record__write(struct record *rec, void *bf, size_t size)
+{
+ if (rec->memory.size && memory_enabled) {
+ if (perf_memory__write(&rec->memory, bf, size) < 0) {
+ pr_err("failed to write memory data, error: %m\n");
+ return -1;
+ }
+ } else {
+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+ }
+ rec->bytes_written += size;
}

- rec->bytes_written += size;
return 0;
}

@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
if (old == head)
return 0;

+ memory_enabled = 1;
+
rec->samples++;

size = head - old;
@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
md->prev = old;
perf_evlist__mmap_consume(rec->evlist, idx);
out:
+ memory_enabled = 0;
return rc;
}

@@ -426,8 +557,11 @@ static int record__mmap_read_all(struct record *rec)
* Mark the round finished in case we wrote
* at least one event.
*/
- if (bytes_written != rec->bytes_written)
+ if (bytes_written != rec->bytes_written) {
+ memory_enabled = 1;
rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
+ memory_enabled = 0;
+ }

out:
return rc;
@@ -492,7 +626,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);
signal(SIGTERM, sig_handler);
- if (rec->opts.auxtrace_snapshot_mode)
+ if (rec->opts.auxtrace_snapshot_mode || rec->memory.size)
signal(SIGUSR2, snapshot_sig_handler);
else
signal(SIGUSR2, SIG_IGN);
@@ -687,6 +821,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
}
}

+ if (memory_signalled) {
+ memory_signalled = 0;
+ if (memory_to_file(rec) < 0) {
+ err = -1;
+ goto out_child;
+ }
+ }
+
if (hits == rec->samples) {
if (done || draining)
break;
@@ -1009,6 +1151,12 @@ static struct record record = {
.mmap2 = perf_event__process_mmap2,
.ordered_events = true,
},
+ .memory = {
+ .start = NULL,
+ .head = 0,
+ .tail = 0,
+ .size = 0,
+ },
};

const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
@@ -1119,6 +1267,7 @@ struct option __record_options[] = {
OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
"options passed to clang when compiling BPF scriptlets"),
#endif
+ OPT_U64('M', "memory", &record.memory.size, "user space ring buffer memory size (bytes)"),
OPT_END()
};

@@ -1220,19 +1369,29 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
goto out_symbol_exit;
}

+ if (rec->memory.size) {
+ if (rec->memory.size < MEMORY_SIZE_MIN)
+ rec->memory.size = MEMORY_SIZE_MIN;
+ rec->memory.start = malloc(rec->memory.size);
+ }
+
err = __cmd_record(&record, argc, argv);
out_symbol_exit:
perf_evlist__delete(rec->evlist);
symbol__exit();
auxtrace_record__free(rec->itr);
+ if (rec->memory.size)
+ free(rec->memory.start);
return err;
}

static void snapshot_sig_handler(int sig __maybe_unused)
{
- if (!auxtrace_snapshot_enabled)
- return;
- auxtrace_snapshot_enabled = 0;
- auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
- auxtrace_record__snapshot_started = 1;
+ if (record.opts.auxtrace_snapshot_mode && auxtrace_snapshot_enabled) {
+ auxtrace_snapshot_enabled = 0;
+ auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
+ auxtrace_record__snapshot_started = 1;
+ }
+ if (record.memory.size && !memory_signalled)
+ memory_signalled = 1;
}

Arnaldo Carvalho de Melo

2015-11-24 14:40:02 UTC

Post by Yunlong Song
For aux area tracing, there is already a snapshot mode support for
intel-pt and intel-bts events. Similarly, this patch adds a snapshot
mode for perf's regular events. A user space ring buffer is allocated to
handle the tracing data from the kernel space ring buffer, and the
tracing data will only dump to perf.data when perf receives a SIGUSR2
signal.
Similarly like '-S' in aux trace snapshot mode, '-M' enables perf's
regular event's snapshot mode by defining the size (bytes) of the user
space ring buffer.

Looks like an interesting feature, if for no other reason, to match what
we have for aux area tracing.

But perhaps having an event in the stream itself that would signal when
to dump the snapshot would be a better approach?

One that perhaps you create with 'perf probe' or a tracepoint, both
could use the in-kernel filtering capabilities to specify when to
trigger that snapshot?

If this snapshotting could happen in the kernel, that would be even
better, i.e. no need for two buffers (kernel rb + userspace rb) for
achieving this? Anyway, see below some comments.

Post by Yunlong Song
$ perf record -a -M 10000000
/*
* Let perf record runs for some time before finally ends, and do not
* send any SIGUSR2 signal to perf during perf's running.
*/
$ perf report
The perf.data file has no samples!
# To display the perf.data header info, please use --header/--header-only options.
As shown above, without any SIGUSR2 signal, perf record will dump no samples
to perf.data in the snapshot mode.
$ perf record -a -M 10000000
/*
* Let perf record runs for some time before finally ends, and send
* several times of SIGUSR2 signal to perf during perf's running.
*/
# kill -SIGUSR2 `pidof perf`
...
# kill -SIGUSR2 `pidof perf`
$ perf report
<SNIP>
# Total Lost Samples: 0
#
# Samples: 942 of event 'cycles:pp'
# Event count (approx.): 175168972
#
# Overhead Command Shared Object Symbol
# ........ ............... ....................... .........................................
#
8.20% kworker/2:0 [kernel.kallsyms] [k] default_send_IPI_mask_allbutself_phys
6.33% swapper [kernel.kallsyms] [k] intel_idle
2.64% pidof [kernel.kallsyms] [k] arch_get_unmapped_area_topdown
2.56% pidof [kernel.kallsyms] [k] unmap_region
2.26% pidof [kernel.kallsyms] [k] memcpy
2.26% pidof libc-2.19.so [.] _IO_vfscanf
2.03% pidof [kernel.kallsyms] [k] lookup_fast
1.72% pidof [kernel.kallsyms] [k] filp_close
1.62% pidof [kernel.kallsyms] [k] apparmor_file_open
1.56% pidof [kernel.kallsyms] [k] process_measurement
1.50% pidof [kernel.kallsyms] [k] find_vma
<SNIP>
As shown above, perf record will dump samples to perf.data every time
it receives a SIGUSR2 signal in the snapshot mode.
---
tools/perf/builtin-record.c | 181 +++++++++++++++++++++++++++++++++++++++++---
1 file changed, 170 insertions(+), 11 deletions(-)
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 199fc31..75606a6 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,16 @@
#include <sched.h>
#include <sys/mman.h>
+static volatile int memory_enabled;
+static volatile int memory_signalled;

Those are really too generic names :-\

Post by Yunlong Song
+/* The maximum size of one perf_event is 65536*/
+#define MEMORY_SIZE_MIN 65537
+
+struct memory {

How about: snapshot_ring_buffer? If that is deemed too big, perhaps
snapshot_rb?

Post by Yunlong Song
+ void *start;
+ u64 head, tail;
+ u64 size;
+};
struct record {
struct perf_tool tool;
@@ -51,16 +61,134 @@ struct record {
bool no_buildid;
bool no_buildid_cache;
unsigned long long samples;
+ struct memory memory;
};
-static int record__write(struct record *rec, void *bf, size_t size)
+static int buf_to_file(struct record *rec, void *buf,
+ size_t size, u64 head, u64 tail)

Please continue following the existing convention and name this:

static int record__write_rb()

As the buffer you're writing is a ring buffer, thus needs checking about
wrap around, like you do here.

Post by Yunlong Song
{
- if (perf_data_file__write(rec->session->file, bf, size) < 0) {
- pr_err("failed to write perf data, error: %m\n");
+ size_t written = 0;
+
+ if (head < tail) {
+ if (perf_data_file__write(rec->session->file,
+ buf + head, tail - head) < 0)
+ goto out;
+ written += tail - head;
+ } else if (head > tail) {
+ if (perf_data_file__write(rec->session->file,
+ buf + head, size - head) < 0)
+ goto out;
+ written += size - head;
+
+ if (perf_data_file__write(rec->session->file, buf, tail) < 0)
+ goto out;
+ written += tail;
+ }
+
+ rec->bytes_written += written;
+ return 0;
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+}
+
+static int memory_to_file(struct record *rec)
+{
+ if (buf_to_file(rec, rec->memory.start, rec->memory.size,
+ rec->memory.head, rec->memory.tail) < 0)
return -1;
+ rec->memory.head = rec->memory.tail;
+
+ return 0;
+}
+
+static ssize_t perf_memory__write(struct memory *memory, void *buf, size_t size)
+{
+ void *buf_start = buf;
+ size_t left = size, written, delta, skip;
+ union perf_event *event;
+ struct perf_event_header hdr;
+ struct record *rec = container_of(memory, struct record, memory);
+
+ while (left) {
+ skip = 0;
+ written = min(left, memory->size - memory->tail);
+ if (memory->head > memory->tail)
+ delta = memory->head - memory->tail;
+ else
+ delta = memory->size - memory->tail + memory->head;
+ if (delta <= written) {
+ do {
+ if ((memory->head + skip) <= (memory->size -
+ sizeof(struct perf_event_header)))
+ event = (union perf_event *)(memory->start +
+ memory->head + skip);
+ else {
+ size_t hdr_left;
+
+ hdr_left = sizeof(struct perf_event_header) -
+ memory->size + memory->head + skip;
+ memcpy(&hdr, memory->start + memory->head + skip,
+ sizeof(struct perf_event_header) - hdr_left);
+
+ if (hdr_left <= memory->tail)
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, memory->start, hdr_left);
+ else if (!memory->tail)
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, buf, hdr_left);
+ else {
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, memory->start, memory->tail);
+ hdr_left -= memory->tail;
+ memcpy((void *)&hdr + sizeof(struct perf_event_header) -
+ hdr_left, buf, hdr_left);
+ }
+
+ event = (union perf_event *)&hdr;
+ if (rec->session->header.needs_swap)
+ perf_event_header__bswap(&event->header);
+ }
+
+ if (event->header.type != PERF_RECORD_SAMPLE) {
+ if (buf_to_file(rec, memory->start, memory->size,
+ memory->head + skip, (memory->head + skip +
+ event->header.size) % memory->size) < 0)
+ return -1;
+ }
+
+ skip += event->header.size;
+ } while (skip <= written - delta);
+ }
+
+ memcpy(memory->start + memory->tail, buf, written);
+
+ memory->head = (memory->head + skip) % memory->size;
+ memory->tail = (memory->tail + written) % memory->size;
+
+ left -= written;
+ buf += written;
+ }
+
+ BUG_ON((size_t)(buf - buf_start) != size);
+ return size;
+}
+
+static int record__write(struct record *rec, void *bf, size_t size)
+{
+ if (rec->memory.size && memory_enabled) {
+ if (perf_memory__write(&rec->memory, bf, size) < 0) {
+ pr_err("failed to write memory data, error: %m\n");
+ return -1;
+ }
+ } else {
+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+ }
+ rec->bytes_written += size;
}
- rec->bytes_written += size;
return 0;
}
@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
if (old == head)
return 0;
+ memory_enabled = 1;
+

This really looks hackish, using a global to change the behaviour of
some existing function, and a volatile at that, ouch, please change the
prototypes in some way to avoid globals like this.

Post by Yunlong Song
rec->samples++;
size = head - old;
@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
md->prev = old;
perf_evlist__mmap_consume(rec->evlist, idx);
+ memory_enabled = 0;
return rc;
}
@@ -426,8 +557,11 @@ static int record__mmap_read_all(struct record *rec)
* at least one event.
*/
- if (bytes_written != rec->bytes_written)
+ if (bytes_written != rec->bytes_written) {
+ memory_enabled = 1;
rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
+ memory_enabled = 0;
+ }
return rc;
@@ -492,7 +626,7 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);
signal(SIGTERM, sig_handler);
- if (rec->opts.auxtrace_snapshot_mode)
+ if (rec->opts.auxtrace_snapshot_mode || rec->memory.size)
signal(SIGUSR2, snapshot_sig_handler);
else
signal(SIGUSR2, SIG_IGN);
@@ -687,6 +821,14 @@ static int __cmd_record(struct record *rec, int argc, const char **argv)
}
}
+ if (memory_signalled) {
+ memory_signalled = 0;
+ if (memory_to_file(rec) < 0) {
+ err = -1;
+ goto out_child;
+ }
+ }
+
if (hits == rec->samples) {
if (done || draining)
break;
@@ -1009,6 +1151,12 @@ static struct record record = {
.mmap2 = perf_event__process_mmap2,
.ordered_events = true,
},
+ .memory = {
+ .start = NULL,
+ .head = 0,
+ .tail = 0,
+ .size = 0,
+ },
};
const char record_callchain_help[] = CALLCHAIN_RECORD_HELP
@@ -1119,6 +1267,7 @@ struct option __record_options[] = {
OPT_STRING(0, "clang-opt", &llvm_param.clang_opt, "clang options",
"options passed to clang when compiling BPF scriptlets"),
#endif
+ OPT_U64('M', "memory", &record.memory.size, "user space ring buffer memory size (bytes)"),
OPT_END()
};
@@ -1220,19 +1369,29 @@ int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
goto out_symbol_exit;
}
+ if (rec->memory.size) {
+ if (rec->memory.size < MEMORY_SIZE_MIN)
+ rec->memory.size = MEMORY_SIZE_MIN;
+ rec->memory.start = malloc(rec->memory.size);
+ }
+
err = __cmd_record(&record, argc, argv);
perf_evlist__delete(rec->evlist);
symbol__exit();
auxtrace_record__free(rec->itr);
+ if (rec->memory.size)
+ free(rec->memory.start);
return err;
}
static void snapshot_sig_handler(int sig __maybe_unused)
{
- if (!auxtrace_snapshot_enabled)
- return;
- auxtrace_snapshot_enabled = 0;
- auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
- auxtrace_record__snapshot_started = 1;
+ if (record.opts.auxtrace_snapshot_mode && auxtrace_snapshot_enabled) {
+ auxtrace_snapshot_enabled = 0;
+ auxtrace_snapshot_err = auxtrace_record__snapshot_start(record.itr);
+ auxtrace_record__snapshot_started = 1;
+ }
+ if (record.memory.size && !memory_signalled)
+ memory_signalled = 1;
}
--
1.8.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Yunlong Song

2015-11-25 12:50:02 UTC

Post by Arnaldo Carvalho de Melo
Looks like an interesting feature, if for no other reason, to match what
we have for aux area tracing.
But perhaps having an event in the stream itself that would signal when
to dump the snapshot would be a better approach?
One that perhaps you create with 'perf probe' or a tracepoint, both
could use the in-kernel filtering capabilities to specify when to
trigger that snapshot?
If this snapshotting could happen in the kernel, that would be even
better, i.e. no need for two buffers (kernel rb + userspace rb) for
achieving this? Anyway, see below some comments.

Hi, Arnaldo,

Thanks for your suggestions and comments. As for the suggestion of snapshot
mode implemented in kernel ring buffer rather than user space ring buffer, I
have thought about that before I made this patch, the problems are:

1. Memory Handle

If snapshot mode implemented in kernel space ring buffer, what's proper size of
that ring buffer? If the size is small (such as 512k), it may lose the older
part of target info which is really related with the reason causing that snapshot.
For example, a temporary process launches and runs for a while, and it cannot
terminate properly in the end. The real reason is that something is wrong during
the launching stage, however, we do not know this prior knowledge and has to
record all the information from the beginning to the end of this process's liftime.
By analyzing the log of perf.data, we can finally figure out that the problem exists
in the launching stage. Here if kernel space ring buffer is not big enough, we will
lose the older part of the logs.

In this case, we may have to set the size of kernel space ring buffer big enough, but
can we kmalloc or vmalloc such big size easily in kernel space in any case? There
may be some limitations to do this. Since allocating memory in the kernel is not as
simple as allocating memory in user space. From this point of view, I prefer to use
user space ring buffer to store the records. It's more flexible and feasible to handle
memory.

2. Overwrite Overhead

Without overwrite feature, snapshot mode implemented in kernel space ring buffer
will also lose target info. For example, the mmap page of kernel space ring buffer
is not full at time A. The target info is filled to this mmap page and then it is
full at time B, then it wakes up perf to mmap_read all its records. In snapshot mode,
all the records are totally dropped without writing to perf.data. Later at time C,
a signal triggers the perf to dump, but the older part of target info was lost at
time B. This is because kernel space ring buffer does not overwrite itself, and dump
all its content once a page is full.

So, we need "overwrite" support, but it needs further fixes and the corresponding patch
is not applied to perf now probably due to its executive overhead (in the critical path,
not very sure yet).

Thus instead of incurring any probable overhead in the kernel space and thus may cause
any problems, I prefer to implement the "overwrite" feature in the user space ring
buffer.

3. Metadata Handle

Even if problem 2 above is solved, i.e., we finally fix the overwrite feature of kernel space
ring buffer. What about metadata handling (mmap_event, comm_event, fork_event, exit_event,
etc.)? When it's time to overwrite this info during the wraparound procedure, this metadata
events have to be handled separately to wake up perf (trigger the poll) to write to perf.data.
This may increase complexity in the kernel space. However, as for the user space ring buffer
version, it directly writes to perf.data once it notices the metadata event, no action of waking
up perf by poll at all. Since the user space ring buffer design has to parse each event to
figure out its size to skip when overwriting, it can easily figure out the event type by the
way (which event is perf_record_sample, and which event is metadata).

However, for this problem, there is a under-discussing solution. As what Wang Nan replies, "For
such tracking events (PERF_RECORD_FORK...), we have dummy event so it is possible for us to receive
tracking events from a separated channel, therefore we don't have to parse every events to pick
those events out."

4. Trigger Semantic

As what you mentioned, snapshot mode implemented in kernel space ring buffer "can" use the in-kernel
filtering capabilities to specify when to trigger that snapshot. In my personal opinion, I regard
the in-kernel filtering capabilities as something all about controlling, or say, programming with
the tracepoints/trace events, which does not relate to the semantic of snapshot. You can use the
in-kernel filtering capabilities (such as eBPF) to enable/disable tracing, filter tracing, aggregate
tracing, but all this tracing info should be written to perf.data normally. However, we can decide
when to really do this writing action via snapshot mode. Snapshot mode is just a semantic in the
user space from a high level view. Without snapshot mode, those "in-kernel filtering capabilities"
can still work for the regular perf. In the production case, apps can send SIGUSR2 signal to perf to
easily record the most useful info which is strongly related with the apps performance problem, no
redundant info any more (only collect the trace at the time any problem happens).

Post by Arnaldo Carvalho de Melo
How about: snapshot_ring_buffer? If that is deemed too big, perhaps
snapshot_rb?
static int record__write_rb()
As the buffer you're writing is a ring buffer, thus needs checking about
wrap around, like you do here.
This really looks hackish, using a global to change the behaviour of
some existing function, and a volatile at that, ouch, please change the
prototypes in some way to avoid globals like this.

Thanks for your careful review and helpful comments, my patch stands for certain kind of design for
snapshot mode. And other people may have suggestions and different designs, let's talk and compare
different designs first. If the design of this patch is taken finally, I will rewrite it then.

--
Thanks,
Yunlong Song

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

David Ahern

2015-11-24 15:10:02 UTC

Post by Yunlong Song
+static int record__write(struct record *rec, void *bf, size_t size)
+{
+ if (rec->memory.size && memory_enabled) {
+ if (perf_memory__write(&rec->memory, bf, size) < 0) {
+ pr_err("failed to write memory data, error: %m\n");
+ return -1;
+ }
+ } else {
+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+ }
+ rec->bytes_written += size;
}
- rec->bytes_written += size;
return 0;
}
@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
if (old == head)
return 0;
+ memory_enabled = 1;
+
rec->samples++;
size = head - old;
@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec, int idx)
md->prev = old;
perf_evlist__mmap_consume(rec->evlist, idx);
+ memory_enabled = 0;
return rc;
}

So you are basically ignoring all samples until SIGUSR2 is received.
That means the resulting data file will have limited history of task
events for example. And for other events the quantity is random as to
when the mmaps were last scanned.

Your cover letter mentioned my code "just makes some count when the
signal triggers perf sched, with no sample recording and has nothing to
do with perf.data". That is not correct. If you look at the perf-daemon
code I pointed you to it processes task events as they are received and
saves the last N-events after time sorting (limited by memory or time).
When a signal is received it processes the saved events and dumps them
to stdout versus writing a perf.data file.

David
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Arnaldo Carvalho de Melo

2015-11-24 15:30:03 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.

No?

- Arnaldo

example. And for other events the quantity is random as to when the mmaps
were last scanned.
Your cover letter mentioned my code "just makes some count when the signal
triggers perf sched, with no sample recording and has nothing to do with
perf.data". That is not correct. If you look at the perf-daemon code I
pointed you to it processes task events as they are received and saves the
last N-events after time sorting (limited by memory or time). When a signal
is received it processes the saved events and dumps them to stdout versus
writing a perf.data file.
David

David Ahern

2015-11-24 15:30:04 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.
No?

perf-record does not process events, it only writes to a file. If that
is skipped then it skips all events regardless of type.

David

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Arnaldo Carvalho de Melo

2015-11-24 15:50:01 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.
No?

perf-record does not process events, it only writes to a file. If that is
skipped then it skips all events regardless of type.

perf-record without his patch? yes, but with his patch it does:

__cmd_record()
for (;;)
record__mmap_read_all()
record__write()
perf_memory__write()
event = (union perf_event *)(memory->start + memory->head + skip);
if (event->header.type != PERF_RECORD_SAMPLE) {
if (buf_to_file(rec, memory->start, memory->size,
}

I almost thought that I had been fooled by the difficulty to follow his
patch and was forgetting that 'perf record' doesn't processes events,
and hasn't done so for a very good reason: to reduce its impact on the
observed workload, but that ain't so, no?

So, when not snapshotting, what you said remains true:

static int record__write(struct record *rec, void *bf, size_t size)
{
if (rec->memory.size && memory_enabled) {
if (perf_memory__write(&rec->memory, bf, size) < 0) {
pr_err("failed to write memory data, error: %m\n");
return -1;
}
} else {
if (perf_data_file__write(rec->session->file, bf, size) < 0) {

I'll continue taking the else branch and in that case, no events are
processed, we just dump that bf into the rec->session->file and go on
with life.

- Arnaldo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

David Ahern

2015-11-24 16:20:01 UTC

Post by Arnaldo Carvalho de Melo
__cmd_record()
for (;;)
record__mmap_read_all()
record__write()
perf_memory__write()
event = (union perf_event *)(memory->start + memory->head + skip);
if (event->header.type != PERF_RECORD_SAMPLE) {
if (buf_to_file(rec, memory->start, memory->size,
}
I almost thought that I had been fooled by the difficulty to follow his
patch and was forgetting that 'perf record' doesn't processes events,
and hasn't done so for a very good reason: to reduce its impact on the
observed workload, but that ain't so, no?

exactly. And I missed the above. Thanks for pointing that out.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wangnan (F)

2015-11-25 04:00:01 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.
No?

Actually we are discussing about this problem.

For such tracking events (PERF_RECORD_FORK...), we have dummy event so
it is possible for us to receive tracking events from a separated
channel, therefore we don't have to parse every events to pick those
events out. Instead, we can process tracking events differently, then
more interesting things can be done. For example, squashing those tracking
events if it takes too much memory...

Furthermore, there's another problem being discussed: if userspace
ringbuffer
is bytes based, parsing event is unavoidable. Without parsing event we are
unable to find the new 'head' pointer when overwriting. Instead, we are
thinking about a bucket-based ringbuffer that, let perf maintain a series
of bucket, each time 'poll' return, perf copies new events to the start of
a bucket. If all bucket is occupied, we drop the oldest bucket. Bucket-based
ringbuffer watest some memory but can avoid event parsing.

And there's many other problems in this patch. For example, when SIGUSR2 is
received, we need to do something to let all perf events start dumping.
Current implementation can't ensure we receive events just before the
SIGUSR2 if we not set 'no-buffer'.

Also, output events are in one perf.data, which is not user friendly.
Our final goal is to make perf a daemonized moniter, which can run 7x24
in user's environment. Each time a glitch is detected, a framework sends
a signal to perf to get a perf.data from it perf. The framework manage
those perf.data like logrotate, help developer analysis those glitch.

We are seeking the route implementing the final monitor. This patch is
an attempt to let you know what we want and get your thought about it.
Looks like you agree out basic idea. That's good. Then we decide to
start from some small feature to support the final goal. For example:
snapshot mode for specific events:

# perf record -a -e cycles/snapshot/

And when C-c is pressed, for cycles event, only those data still in
kernel would be dump.

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

David Ahern

2015-11-25 05:10:01 UTC

Post by Wangnan (F)
Actually we are discussing about this problem.
For such tracking events (PERF_RECORD_FORK...), we have dummy event so
it is possible for us to receive tracking events from a separated
channel, therefore we don't have to parse every events to pick those
events out. Instead, we can process tracking events differently, then
more interesting things can be done. For example, squashing those tracking
events if it takes too much memory...

If you look at my daemon code I process task events (FORK, MMAP, EXIT)
to maintain task state including flushing threads when they terminate.
This is a trade-off to having the knowledge to pretty-print addresses
(address to symbol resolution) yet not grow without bounds -- be it a
file or memory.

Post by Wangnan (F)
Furthermore, there's another problem being discussed: if userspace
ringbuffer
is bytes based, parsing event is unavoidable. Without parsing event we are
unable to find the new 'head' pointer when overwriting. Instead, we are
thinking about a bucket-based ringbuffer that, let perf maintain a series
of bucket, each time 'poll' return, perf copies new events to the start of
a bucket. If all bucket is occupied, we drop the oldest bucket. Bucket-based
ringbuffer watest some memory but can avoid event parsing.
And there's many other problems in this patch. For example, when SIGUSR2 is
received, we need to do something to let all perf events start dumping.
Current implementation can't ensure we receive events just before the
SIGUSR2 if we not set 'no-buffer'.
Also, output events are in one perf.data, which is not user friendly.
Our final goal is to make perf a daemonized moniter, which can run 7x24
in user's environment. Each time a glitch is detected, a framework sends
a signal to perf to get a perf.data from it perf. The framework manage
those perf.data like logrotate, help developer analysis those glitch.

Exactly. And that's why my daemon is written the way it is. It is
intended to run 24x7x365. It retains the last N events which are dumped
when some external trigger tells it to.

Arnaldo: you asked about an event in the stream but that is not
possible. My scheduling daemon targets CPU usage prior to a significant
event (what was running, how long, where, etc). The significant event in
the motivating case was STP timeouts -- if stp daemon is not able to
send BPDUs why? What was running leading up to the timeout. The point is
something external to the perf daemon says 'hey, save the last N-events
for analysis'.

This case sounds like a generalization of my problem with the desire to
write a perf.data file instead of processing the events and dumping to a
file. It is doable. For example, synthesize task events for all threads
in memory and then write out the saved samples.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Adrian Hunter

2015-11-25 07:30:02 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.
No?

Have you considered trying to find the head by trial-and-error at the time
you make the snapshot i.e. look at the first 8 bytes (event records are 8
byte aligned) and see if it is a valid record header, if not try the next 8
bytes. When you find a real event record it should parse without error and
the subsequent events should all parse without error too, all the way to the
tail. Then you can use timestamps and compare the events byte-by-byte to
avoid overlaps between 2 snapshots.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wangnan (F)

2015-11-25 08:00:02 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.
No?

It seems not work. Now we have BPF output event, it is possible that a
BPF program output anything through that event. Even if we have a magic
in head of each event, we can't prevent BPF output event output that
magic, except we introduce some 'escape' method to prevent BPF output
event output some data pattern. So although might work in reallife,
this solution is logically incorrect. Or am I miss someting?

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Adrian Hunter

2015-11-25 08:40:02 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.
No?

When you find the head, all the events will parse correctly. It seems to me
highly unlikely that would happen if you guessed the head wrongly.
It is only incorrect if it gives the wrong result.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wangnan (F)

2015-11-25 09:00:02 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.
No?

When you find the head, all the events will parse correctly. It seems to me
highly unlikely that would happen if you guessed the head wrongly.
It is only incorrect if it gives the wrong result.

Right, so I said it might work in reallife. However, I think we
should better to try to provide some logically correct solution.
Also, 'guessing' means some sort of intelligence, or how do we
deal with guessing error? Simply drop them?

And what's your opinion on the bucket besed ring buffer? With that
design we only need to maintain a ringbuffer of pointers. It should
be much simpler. The only drawback I can image is the waste of memory
because we have to alloc buckets pessimistically. Do you think
that method have other problem I haven't considered?

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Adrian Hunter

2015-11-25 09:10:02 UTC

Post by Yunlong Song
+static int record__write(struct record *rec, void *bf, size_t size)
+{
+ if (rec->memory.size && memory_enabled) {
+ if (perf_memory__write(&rec->memory, bf, size) < 0) {
+ pr_err("failed to write memory data, error: %m\n");
+ return -1;
+ }
+ } else {
+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
+ }
+ rec->bytes_written += size;
}
- rec->bytes_written += size;
return 0;
}
@@ -86,6 +214,8 @@ static int record__mmap_read(struct record *rec, int idx)
if (old == head)
return 0;
+ memory_enabled = 1;
+
rec->samples++;
size = head - old;
@@ -113,6 +243,7 @@ static int record__mmap_read(struct record *rec,
int
idx)
md->prev = old;
perf_evlist__mmap_consume(rec->evlist, idx);
+ memory_enabled = 0;
return rc;
}

So you are basically ignoring all samples until SIGUSR2 is received. That

No, he is not, its just that his code is difficult to follow, has to be
rewritten, but he is ignoring just PERF_RECORD_SAMPLE events, so it
will..

means the resulting data file will have limited history of task events for

... have a complete history of task events, since PERF_RECORD_FORK, etc
are not being ignored.
No?

When you find the head, all the events will parse correctly. It seems to me
highly unlikely that would happen if you guessed the head wrongly.
It is only incorrect if it gives the wrong result.

It is not "intelligence" it is a linear search. If it gives more than one
answer, it is a fatal error. You can mitigate that by adding more
validation of the event records.

But it is only a suggestion.

Post by Wangnan (F)
And what's your opinion on the bucket besed ring buffer? With that
design we only need to maintain a ringbuffer of pointers. It should
be much simpler. The only drawback I can image is the waste of memory
because we have to alloc buckets pessimistically. Do you think
that method have other problem I haven't considered?

The drawback is that you have to copy all the events all the time instead of
letting the kernel ring buffer wraparound without any userspace involvement
until you make a snapshot.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Yunlong Song

2015-11-25 08:00:02 UTC

So you are basically ignoring all samples until SIGUSR2 is received. That means the resulting data file will have limited history of task events for example. And for other events the quantity is random as to when the mmaps were last scanned.
Your cover letter mentioned my code "just makes some count when the signal triggers perf sched, with no sample recording and has nothing to do with perf.data". That is not correct. If you look at the perf-daemon code I pointed you to it processes task events as they are received and saves the last N-events after time sorting (limited by memory or time). When a signal is received it processes the saved events and dumps them to stdout versus writing a perf.data file.
David

Hi, David,

Yes, I know that your sched daemon can store and print info when the signal triggers,
however, what I mean 'makes some count' is: sched daemon parses and processes the events
to extract the tracing info related with sched, rather than a general use of perf.data
like "perf script", "perf report", "perf data convert --to-ctf", etc. And what I mean
'no sample recording and has nothing to do with perf.data' is: when perf receives a signal,
sched daemon uses timehist_print_summary and timehist_pstree to record those tracing info
related with sched to a new file rather than the raw perf event records in the perf.data.
We can not use those files generated by sched daemon to enjoy the strong functions like how
perf.data can be used in "perf script", "perf report", "perf data convert --to-ctf", etc.

Sched daemon is good, but it is carefully designed for specific use of perf sched. In general
case of perf record, with snapshot mode, we still want a perf.data as before. Your sched daemon
concurrently does the work of storing and sched-parsing action for each signal trigger. To get
a general style of perf.data, the sched-parsing semantic action may have to be removed.

Peter Zijlstra

2015-11-25 09:30:02 UTC

Post by Yunlong Song
In our patch, we create and maintain a user space ring buffer to store
perf's tracing info, instead of directly writing to perf.data file as
before. In snapshot mode, only a SIGUSR2 signal can trigger perf to dump
the tracing info currently stored in the user space ring buffer to
perf.data file.

I would very much like to first fix the perf overwrite mode: see
lkml.kernel.org/r/***@twins.programming.kicks-ass.net
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wangnan (F)

2015-11-25 09:50:02 UTC

I would very much like to first fix the perf overwrite mode: see

I think they can be done in parallel. We can first do something with
tracking events and perf's output file, and wait for kernel level
overwrite mode fixed, then decide whether to implement perf's own
ringbuffer.

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Peter Zijlstra

2015-11-25 12:30:02 UTC

I would very much like to first fix the perf overwrite mode: see

That seems backwards; why would you ever want to endlessly copy the
events if you're not going to use them?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wangnan (F)

2015-11-25 13:00:02 UTC

I would very much like to first fix the perf overwrite mode: see

That seems backwards; why would you ever want to endlessly copy the
events if you're not going to use them?

I agree that we need to fixing overwrite mode. However, user
space ringbuffer can be more flexible. for example, dynamically
shrinking and expansion. It would be hard in kernel I think,
and I'm not sure how much flexibility we need. Doing things
in kernel always more difficult than in userspace.

But yes, we can do that userspace ring buffer when we really
need it. At very first we can start working on perf side and
assume overwrite mode is ready.

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2015-11-26 09:30:02 UTC

But yes, we can do that userspace ring buffer when we really need it. At very
first we can start working on perf side and assume overwrite mode is ready.

Btw., there's another trick we could use to support ftrace-alike workflows even
better: we could expose a task's active perf ring-buffers under /proc/<PID>/ and
could make it readable.

So if an overwrite-mode background tracing session is running, you don't even have
to signal it to capture the ring-buffer: just open the ring-buffer fd in procfs,
under /proc/XYZ/perf/ring-buffers/5.trace or so, and dump its current contents,
assuming the task doing that has sufficient permissions - i.e.
ptrace_may_access().

We could even pretty-print some very basic version of the records from the kernel,
via /proc/XYZ/perf/ring-buffers/5.txt, to support a tooling-less tracing modes.
This way perf based tracing could be supported even on systems that have no
writable filesystems.

I.e. in this regard perf can be made to match ftrace's tracing workflow as well -
in addition to the more traditional perf profiling workflow we all love and know!

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2015-11-26 09:50:01 UTC

Post by Ingo Molnar

But yes, we can do that userspace ring buffer when we really need it. At
very first we can start working on perf side and assume overwrite mode is
ready.

Btw., there's another trick we could use to support ftrace-alike workflows even
better: we could expose a task's active perf ring-buffers under /proc/<PID>/ and
could make it readable.
So if an overwrite-mode background tracing session is running, you don't even
have to signal it to capture the ring-buffer: just open the ring-buffer fd in
procfs, under /proc/XYZ/perf/ring-buffers/5.trace or so, and dump its current
contents, assuming the task doing that has sufficient permissions - i.e.
ptrace_may_access().
We could even pretty-print some very basic version of the records from the
kernel, via /proc/XYZ/perf/ring-buffers/5.txt, to support a tooling-less tracing
modes. This way perf based tracing could be supported even on systems that have
no writable filesystems.
I.e. in this regard perf can be made to match ftrace's tracing workflow as well
- in addition to the more traditional perf profiling workflow we all love and
know!

Also note that if we go in this direction then with some additional changes we
could also support lightweight tracing with no tooling side at all on the traced
system: a simple kernel feature with a kernel thread could be added that takes a
list of events from sysfs or debugfs and opens them system-wide and exposes
per-cpu overwrite mode ring-buffers.

Those ring-buffers can then be accessed via procfs (and/or also be exposed in
parallel via debugfs). The kernel thread never actually does anything except set
up the events - i.e. this is a very lightweight mode of always-on tracing.

Additional debugfs toggles can be added to temporarily turn tracing on/off without
closing the events - just like ftrace.

Other toggles could be added, such as: 'stop tracing when the kernel has crashed,
or if a specific event has occured or a condition has been met'.

That way we could, among other things, capture traces on embedded systems and copy
the traces to another, larger system (or NFS-mount the target system), and run
perf tooling to analyze the traces on that more powerful system.

But it all starts with making overwrite mode work well, and working with the
kernel visible ring-buffer. That can then be exposed to user-space in very
expressive ways to turn perf into a flexible system tracing subsystem as well.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2015-11-26 10:00:02 UTC

Post by Ingo Molnar
So if an overwrite-mode background tracing session is running, you don't even have
to signal it to capture the ring-buffer: just open the ring-buffer fd in procfs,
under /proc/XYZ/perf/ring-buffers/5.trace or so, and dump its current contents,
assuming the task doing that has sufficient permissions - i.e.
ptrace_may_access().
We could even pretty-print some very basic version of the records from the kernel,
via /proc/XYZ/perf/ring-buffers/5.txt, to support a tooling-less tracing modes.
This way perf based tracing could be supported even on systems that have no
writable filesystems.

With this 'cat' could be used to look at the current trace:

cat /proc/XYZ/perf/ring-buffers/5.txt

would result in 'perf script' alike output generated by the kernel:

$ cat /proc/XYZ/perf/ring-buffers/5.txt
perf 24816 63796.780079: 1 cycles:pp: ffffffff810603f8 native_write_msr_safe
perf 24816 63796.780083: 1 cycles:pp: ffffffff810603f8 native_write_msr_safe
perf 24816 63796.780086: 8 cycles:pp: ffffffff810603f8 native_write_msr_safe
perf 24816 63796.780089: 97 cycles:pp: ffffffff810603f8 native_write_msr_safe
perf 24816 63796.780092: 1237 cycles:pp: ffffffff8103450c intel_pmu_handle_irq
perf 24816 63796.780094: 13879 cycles:pp: ffffffff81204f23 setup_new_exec
sh 24816 63796.780104: 170378 cycles:pp: ffffffff811bc437 change_protection_range
sh 24816 63796.780145: 698206 cycles:pp: ffffffff813d22d7 clear_page_c_e
sh 24816 63796.780304: 1145748 cycles:pp: 7f60aca20bdb _dl_addr
sh 24817 63796.780400: 1 cycles:pp: ffffffff810603f8 native_write_msr_safe
sh 24817 63796.780403: 1 cycles:pp: ffffffff810603f8 native_write_msr_safe
sh 24817 63796.780406: 10 cycles:pp: ffffffff810603f8 native_write_msr_safe
sh 24817 63796.780409: 118 cycles:pp: ffffffff810603f8 native_write_msr_safe

... and you could use shell scripting to analyze it - just like with ftrace.

of course this would be simplified output - and you could still access or copy the
raw trace data as well, and use all the rich tooling and visualization features of
full perf.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wangnan (F)

2015-11-26 09:30:02 UTC

I would very much like to first fix the perf overwrite mode: see

That seems backwards; why would you ever want to endlessly copy the
events if you're not going to use them?

Thank you for your reply. We'll start looking Peter's patch and try
to find what should we do to make it upstream faster. Could you please
kindly give some hints?

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2015-11-26 09:30:02 UTC

I would very much like to first fix the perf overwrite mode: see

That seems backwards; why would you ever want to endlessly copy the
events if you're not going to use them?

I don't think Peter asked for much: pick up the patch he has already written and
use it, to have an even lower overhead always-enabled background tracing mode of
perf.

Resizing shouldn't be much of an issue with existing features: if events start
overflowing or some other threshold for dynamic increase of the ring-buffer is met
then the daemon should open a new set of events with a larger ring-buffer, and
close the old events once the new tracing ring-buffer is up and running.

Use event multiplexing to output all interesting events into the same single (per
CPU) ring-buffer.

Thanks,

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wangnan (F)

2015-12-02 08:30:01 UTC

Hi Peter,

I would very much like to first fix the perf overwrite mode: see

I have an idea on this problem that, is it possible to
put the size of an event at the end of it when we working
on overwrite mode? So we can backtrack every events without
too much change to ring buffer?

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wang Nan

2015-12-02 13:50:04 UTC

This is an RFC patch which is for overwrite mode ring buffer. I'd like
to discuss the correctness of this new idea for retriving as many
events as possible from overwrite mode ring buffer. If there's no
fundamental problem, I'll start perf side work.

The biggest problem for overwrite ring buffer is that it is hard to find
the start position of valid record. [1] and [2] tries to solve this
problem by introducing 'tail' and 'next_tail' into metadata page, and
update them each time the ring buffer is half full. Which adds more
instructions to event output code path, hurt performance. In addition,
even with them we still unable to recover all possible records. For
example:

data_tail head
| |
V V
+------+-------+----------+------+---+
| A | B | C | D | |
+------+-------+----------+------+---+

If a record written at head pointer and it overwrites record A:

head data_tail
| |
V V
+--+---+-------+----------+------+---+
|E |...| B | C | D | E |
+--+---+-------+----------+------+---+

Record B is still valid but we can't get it through data_tail.

This patch suggests a different solution for this problem that, by
appending the length of a record at the end of it, user program is
possible to get every possible record in a backward manner, don't
need saving tail pointer.

For example:

head
|
V
+--+---+-------+----------+------+---+
|E6|...| B 8| C 11| D 7|E..|
+--+---+-------+----------+------+---+

In this case, from the 'head' pointer provided by kernel, user program
can first see '6' by (*(head - sizeof(u16))), then it can get the start
pointer of record 'E', then it can read size and find start position
of record D, C, B in similar way.

Kernel side implementation is easy: simply adding a PERF_SAMPLE_SIZE
for the size output.

This sloution requires user program (perf) do more things. At least
following things and limitations should be considered:

1. Before reading such ring buffer, perf must ensure all events which
may output to it is already stopped, so the 'head' pointer it get
is the end of the last record.

2. We must ensure all events attached this ring buffer has
'PERF_SAMPLE_SIZE' selected.

3. There must no tracking events output to this ring buffer.

4. 2 bytes extra space is required for each record.

Further improvement can be taken:

1. If PERF_SAMPLE_SIZE is selected, we can avoid outputting the event
size in header. Which eliminate extra space cose;

2. We can find a way to append size information for tracking events
also.

[1] http://lkml.kernel.org/r/***@twins.programming.kicks-ass.net
[2] http://lkml.kernel.org/r/***@twins.programming.kicks-ass.net

Signed-off-by: Wang Nan <***@huawei.com>
Cc: Adrian Hunter <***@intel.com>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: David Ahern <***@gmail.com>
Cc: Ingo Molnar <***@kernel.org>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Yunlong Song <***@huawei.com>
---
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 6 ++++++
2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe962..c4066da 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_SIZE = 1U << 19,

- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};

/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 5854fcf..bbbacec 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5473,6 +5473,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}

+ if (sample_type & PERF_SAMPLE_SIZE)
+ perf_output_put(handle, header->size);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -5592,6 +5595,9 @@ void perf_prepare_sample(struct perf_event_header *header,

header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_SIZE)
+ header->size += sizeof(u16);
}

void perf_event_output(struct perf_event *event,

--
1.8.3.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Peter Zijlstra

2015-12-03 10:10:01 UTC

Post by Wang Nan
This sloution requires user program (perf) do more things. At least
1. Before reading such ring buffer, perf must ensure all events which
may output to it is already stopped, so the 'head' pointer it get
is the end of the last record.

Right, this is tricky, this would not allow two snapshots to happen back
to back since that would then result in a bunch of missed events.

Aside from this issue its a rather nice idea.

Post by Wang Nan
2. We must ensure all events attached this ring buffer has
'PERF_SAMPLE_SIZE' selected.

That can be easily enforced.

Post by Wang Nan
3. There must no tracking events output to this ring buffer.

That is rather unfortunate, we'd best fix that up.

Post by Wang Nan
4. 2 bytes extra space is required for each record.

8, perf records must be 8 byte aligned and sized.

Post by Wang Nan
1. If PERF_SAMPLE_SIZE is selected, we can avoid outputting the event
size in header. Which eliminate extra space cose;

That would mandate you always parse the stream backwards. Which seems
rather unfortunate. Also, no you cannot recoup the extra space, see the
alignment and size requirement.

Post by Wang Nan
2. We can find a way to append size information for tracking events
also.

The !sample records you mean? Yes those had better have them too.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wangnan (F)

2015-12-03 10:40:02 UTC

Right, this is tricky, this would not allow two snapshots to happen back
to back since that would then result in a bunch of missed events.
Aside from this issue its a rather nice idea.

Thank you for your attitude. We can start consider it seriously.

Now I'm working on perf side code to make a workable prototype.

Post by Wang Nan
2. We must ensure all events attached this ring buffer has
'PERF_SAMPLE_SIZE' selected.

That can be easily enforced.

Yes.

Post by Wang Nan
3. There must no tracking events output to this ring buffer.

That is rather unfortunate, we'd best fix that up.

Post by Wang Nan
4. 2 bytes extra space is required for each record.

8, perf records must be 8 byte aligned and sized.

So does it means we need to pad before outputing size?

Post by Wang Nan
1. If PERF_SAMPLE_SIZE is selected, we can avoid outputting the event
size in header. Which eliminate extra space cose;

That would mandate you always parse the stream backwards. Which seems
rather unfortunate. Also, no you cannot recoup the extra space, see the
alignment and size requirement.

That's good. We don't need to consider this :)

Before receiving your comment I'm thinking about modifying
DEFINE_OUTPUT_COPY() to write first sizeof(header) bytes at the
end of reserved area, so it would work automatically for every
possible events.

Post by Wang Nan
2. We can find a way to append size information for tracking events
also.

The !sample records you mean? Yes those had better have them too.

Thank you.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Wang Nan

2015-12-07 13:40:02 UTC

This patch allows following config terms and option:

# perf record --overwrite ...

Globally set following events to overwrite;

# perf record --event cycles/overwrite/ ...
# perf record --event cycles/no-overwrite/ ...

Set specific events to be overwrite or no-overwrite.

Signed-off-by: Wang Nan <***@huawei.com>
---
tools/perf/builtin-record.c | 3 +++
tools/perf/perf.h | 2 ++
tools/perf/util/evsel.c | 4 ++++
tools/perf/util/evsel.h | 3 +++
tools/perf/util/parse-events.c | 14 ++++++++++++++
tools/perf/util/parse-events.h | 4 +++-
tools/perf/util/parse-events.l | 2 ++
7 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 2230b85..ec4135c 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -1061,6 +1061,9 @@ struct option __record_options[] = {
OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
&record.opts.no_inherit_set,
"child tasks do not inherit counters"),
+ OPT_BOOLEAN_SET(0, "overwrite", &record.opts.overwrite,
+ &record.opts.overwrite_set,
+ "use overwrite mode"),
OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
OPT_CALLBACK('m', "mmap-pages", &record.opts, "pages[,pages]",
"number of mmap data pages and AUX area tracing mmap pages",
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 90129ac..43d79fb 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -58,6 +58,8 @@ struct record_opts {
bool full_auxtrace;
bool auxtrace_snapshot_mode;
bool record_switch_events;
+ bool overwrite;
+ bool overwrite_set;
unsigned int freq;
unsigned int mmap_pages;
unsigned int auxtrace_mmap_pages;
diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c
index 4dee8e3..3386437 100644
--- a/tools/perf/util/evsel.c
+++ b/tools/perf/util/evsel.c
@@ -668,6 +668,9 @@ static void apply_config_terms(struct perf_evsel *evsel,
*/
attr->inherit = term->val.inherit ? 1 : 0;
break;
+ case PERF_EVSEL__CONFIG_TERM_OVERWRITE:
+ evsel->overwrite = term->val.overwrite ? 1 : 0;
+ break;
default:
break;
}
@@ -743,6 +746,7 @@ void perf_evsel__config(struct perf_evsel *evsel, struct record_opts *opts)

attr->sample_id_all = perf_missing_features.sample_id_all ? 0 : 1;
attr->inherit = !opts->no_inherit;
+ evsel->overwrite = opts->overwrite;

perf_evsel__set_sample_bit(evsel, IP);
perf_evsel__set_sample_bit(evsel, TID);
diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h
index 51bab0f..c3b49e0 100644
--- a/tools/perf/util/evsel.h
+++ b/tools/perf/util/evsel.h
@@ -44,6 +44,7 @@ enum {
PERF_EVSEL__CONFIG_TERM_CALLGRAPH,
PERF_EVSEL__CONFIG_TERM_STACK_USER,
PERF_EVSEL__CONFIG_TERM_INHERIT,
+ PERF_EVSEL__CONFIG_TERM_OVERWRITE,
PERF_EVSEL__CONFIG_TERM_MAX,
};

@@ -57,6 +58,7 @@ struct perf_evsel_config_term {
char *callgraph;
u64 stack_user;
bool inherit;
+ bool overwrite;
} val;
};

@@ -115,6 +117,7 @@ struct perf_evsel {
bool tracking;
bool per_pkg;
bool precise_max;
+ bool overwrite;
/* parse modifier helper */
int exclude_GH;
int nr_members;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index c485b32..f20cc81 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -855,6 +855,12 @@ do { \
case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
CHECK_TYPE_VAL(NUM);
break;
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ CHECK_TYPE_VAL(NUM);
+ break;
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
+ CHECK_TYPE_VAL(NUM);
+ break;
case PARSE_EVENTS__TERM_TYPE_NAME:
CHECK_TYPE_VAL(STR);
break;
@@ -892,6 +898,8 @@ static int config_term_tracepoint(struct perf_event_attr *attr,
case PARSE_EVENTS__TERM_TYPE_STACKSIZE:
case PARSE_EVENTS__TERM_TYPE_INHERIT:
case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
return config_term_common(attr, term, err);
default:
if (err) {
@@ -961,6 +969,12 @@ do { \
case PARSE_EVENTS__TERM_TYPE_NOINHERIT:
ADD_CONFIG_TERM(INHERIT, inherit, term->val.num ? 0 : 1);
break;
+ case PARSE_EVENTS__TERM_TYPE_OVERWRITE:
+ ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 1 : 0);
+ break;
+ case PARSE_EVENTS__TERM_TYPE_NOOVERWRITE:
+ ADD_CONFIG_TERM(OVERWRITE, overwrite, term->val.num ? 0 : 1);
+ break;
default:
break;
}
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index c34615f..29cc804 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -68,7 +68,9 @@ enum {
PARSE_EVENTS__TERM_TYPE_CALLGRAPH,
PARSE_EVENTS__TERM_TYPE_STACKSIZE,
PARSE_EVENTS__TERM_TYPE_NOINHERIT,
- PARSE_EVENTS__TERM_TYPE_INHERIT
+ PARSE_EVENTS__TERM_TYPE_INHERIT,
+ PARSE_EVENTS__TERM_TYPE_NOOVERWRITE,
+ PARSE_EVENTS__TERM_TYPE_OVERWRITE,
};

struct parse_events_array {
diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l
index 27d567f..2ef6f96 100644
--- a/tools/perf/util/parse-events.l
+++ b/tools/perf/util/parse-events.l
@@ -202,6 +202,8 @@ call-graph { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CALLGRAPH); }
stack-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_STACKSIZE); }
inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_INHERIT); }
no-inherit { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOINHERIT); }
+overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_OVERWRITE); }
+no-overwrite { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_NOOVERWRITE); }
, { return ','; }
"/" { BEGIN(INITIAL); return '/'; }
{name_minus} { return str(yyscanner, PE_NAME); }

Wang Nan

2015-12-07 13:40:03 UTC

This patch set explores the idea shown in [1], which puts size of every
events at the end of them in ring buffer so user space tool like perf
can parse the ring buffer backward and find the oldest event in it.

In this version:

- Kernel side, rename PERF_SAMPLE_SIZE to PERF_SAMPLE_SIZE_AT_END,
output 8 bytes to meet the alignment requirement.

- Perf side, provide a prototype utilize this feature. With this
prototype users are allowed to capture the last events in an
overwrite ring buffer using:

# ./perf record -e dummy -e syscalls:*/overwrite/ ...

[1] http://lkml.kernel.org/g/1449063499-236703-1-git-send-email-***@huawei.com

Wang Nan (3):
perf/core: Put size of a sample at the end of it
perf tools: Enable overwrite settings
perf record: Find tail pointer through size at end of event

include/uapi/linux/perf_event.h | 3 +-
kernel/events/core.c | 9 +++++
tools/perf/builtin-record.c | 76 +++++++++++++++++++++++++++++++++++++++--
tools/perf/perf.h | 2 ++
tools/perf/util/evlist.c | 42 +++++++++++++++++------
tools/perf/util/evsel.c | 7 ++++
tools/perf/util/evsel.h | 3 ++
tools/perf/util/parse-events.c | 14 ++++++++
tools/perf/util/parse-events.h | 4 ++-
tools/perf/util/parse-events.l | 2 ++
10 files changed, 147 insertions(+), 15 deletions(-)

Wang Nan

2015-12-07 13:40:03 UTC

This is an RFC patch which is for overwrite mode ring buffer. I'd like
to discuss the correctness of this new idea for retriving as many
events as possible from overwrite mode ring buffer. If there's no
fundamental problem, I'll start perf side work.

The biggest problem for overwrite ring buffer is that it is hard to find
the start position of valid record. [1] and [2] tries to solve this
problem by introducing 'tail' and 'next_tail' into metadata page, and
update them each time the ring buffer is half full. Which adds more
instructions to event output code path, hurt performance. In addition,
even with them we still unable to recover all possible records. For
example:

data_tail head
| |
V V
+------+-------+----------+------+---+
| A | B | C | D | |
+------+-------+----------+------+---+

If a record written at head pointer and it overwrites record A:

head data_tail
| |
V V
+--+---+-------+----------+------+---+
|E |...| B | C | D | E |
+--+---+-------+----------+------+---+

Record B is still valid but we can't get it through data_tail.

This patch suggests a different solution for this problem that, by
appending the length of a record at the end of it, user program is
possible to get every possible record in a backward manner, don't
need saving tail pointer.

For example:

head
|
V
+--+---+-------+----------+------+---+
|E6|...| B 8| C 11| D 7|E..|
+--+---+-------+----------+------+---+

In this case, from the 'head' pointer provided by kernel, user program
can first see '6' by (*(head - sizeof(u16))), then it can get the start
pointer of record 'E', then it can read size and find start position
of record D, C, B in similar way.

Kernel side implementation is easy: simply adding a PERF_SAMPLE_SIZE_AT_END
for the size output.

This sloution requires user program (perf) do more things. At least
following things and limitations should be considered:

1. Before reading such ring buffer, perf must ensure all events which
may output to it is already stopped, so the 'head' pointer it get
is the end of the last record.

Further improvement can be taken:

1. We must ensure all events attached this ring buffer has
'PERF_SAMPLE_SIZE_AT_END' selected.

2. 8 bytes extra space is required for each record.

3. We can find a way to append size information for tracking events.

[1] http://lkml.kernel.org/r/***@twins.programming.kicks-ass.net
[2] http://lkml.kernel.org/r/***@twins.programming.kicks-ass.net

Signed-off-by: Wang Nan <***@huawei.com>
Cc: Adrian Hunter <***@intel.com>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: David Ahern <***@gmail.com>
Cc: Ingo Molnar <***@kernel.org>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Yunlong Song <***@huawei.com>
---
include/uapi/linux/perf_event.h | 3 ++-
kernel/events/core.c | 9 +++++++++
2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1afe962..e79606c 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_IDENTIFIER = 1U << 16,
PERF_SAMPLE_TRANSACTION = 1U << 17,
PERF_SAMPLE_REGS_INTR = 1U << 18,
+ PERF_SAMPLE_SIZE_AT_END = 1U << 19,

- PERF_SAMPLE_MAX = 1U << 19, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 20, /* non-ABI */
};

/*
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c3d61b9..cfe9336 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -5452,6 +5452,12 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}

+ /* Should be the last one */
+ if (sample_type & PERF_SAMPLE_SIZE_AT_END) {
+ perf_output_skip(handle, sizeof(u64) - sizeof(header->size));
+ perf_output_put(handle, header->size);
+ }
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;

@@ -5571,6 +5577,9 @@ void perf_prepare_sample(struct perf_event_header *header,

header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_SIZE_AT_END)
+ header->size += sizeof(u64);
}

void perf_event_output(struct perf_event *event,

Wang Nan

2015-12-07 13:40:03 UTC