[RFC PATCH 2/3] ftrace: combine some print formating

RFC RFC RFC RFC RFC!!!!

Now did I get your attention that this is a request for comment patch.

This is probably very buggy. I ran it as a back end for ftrace but only
tested the irqsoff and ftrace tracers. The selftests are busted with it.

But this is an attempt to get a unified buffering system that was
talked about at the LPC meeting.

I did not get a chance to implement all the event recording and printing
in the debugfs/tracing/buffers directory. But I got enough to do
some ftrace work with it.

Now that it boots and runs (albeit, a bit buggy), I decided to post it.
This is some idea that I had to handle this.

I tried to make it as simple as possible. Basically we have:

buffer = ring_buffer_alloc(size, flags, max_event_size, print_func, name);

We can record either the fast way of reserving a part of the buffer:

event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

Or if we already have the data together:

ring_buffer_write(buffer, event_id, length, data);

We can read by consuming or iterating:

event = ring_buffer_consume(buffer);

iter = ring_buffer_start(buffer, iter_flags);
event = ring_buffer_read(iter, &next_cpu);
ring_buffer_finish(iter);

Note, the iteration part stops recording to the buffer. This is a feature.
If you want producer/consumer, then you should be using the consumer.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 138 +++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1565 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1708 insertions(+)

Index: linux-compile.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile.git/include/linux/ring_buffer.h 2008-09-23 17:45:49.000000000 -0400
@@ -0,0 +1,138 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ unsigned long long counter;
+ short type;
+ short length;
+ char body[];
+} __attribute__((__packed__));
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+
+static inline unsigned
+ring_buffer_event_type(struct ring_buffer_event *event_handler)
+{
+ return event_handler->type;
+}
+
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event_handler)
+{
+ return event_handler->length;
+}
+
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event_handler)
+{
+ return event_handler->body;
+}
+
+static inline unsigned long long
+ring_buffer_event_counter(struct ring_buffer_event *event_handler)
+{
+ return event_handler->counter;
+}
+
+struct ring_buffer_seq;
+
+unsigned long ring_buffer_max_event_size(struct ring_buffer *buffer);
+
+typedef void (*ring_buffer_print_func) (struct ring_buffer *buffer,
+ struct ring_buffer_seq *seq,
+ struct ring_buffer_event *event);
+
+struct ring_buffer_seq *ring_buffer_seq_alloc(gfp_t flags);
+void ring_buffer_seq_free(struct ring_buffer_seq *seq);
+unsigned ring_buffer_seq_length(struct ring_buffer_seq *seq);
+void ring_buffer_seq_set_length(struct ring_buffer_seq *seq, unsigned len);
+int ring_buffer_seq_printf(struct ring_buffer_seq *seq, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+int ring_buffer_seq_puts(struct ring_buffer_seq *seq, const char *str);
+int ring_buffer_seq_putc(struct ring_buffer_seq *seq, unsigned char c);
+int ring_buffer_seq_putmem(struct ring_buffer_seq *s, void *mem, size_t len);
+int ring_buffer_seq_to_seqfile(struct seq_file *m, struct ring_buffer_seq *s);
+int ring_buffer_seq_putmem_hex(struct ring_buffer_seq *s, void *mem, size_t len);
+ssize_t ring_buffer_seq_copy_to_user(struct ring_buffer_seq *seq,
+ char __user *ubuf,
+ size_t cnt);
+int ring_buffer_seq_to_mem(struct ring_buffer_seq *s, void *mem, size_t len);
+void ring_buffer_seq_reset(struct ring_buffer_seq *s);
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned long flags,
+ unsigned long max_event_size,
+ ring_buffer_print_func print_func,
+ const char *name, ...);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int
+ring_buffer_register_event(struct ring_buffer *buffer, unsigned long length,
+ ring_buffer_print_func print_func,
+ int event_type,
+ const char *name, ...);
+void *ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ int event_type,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ void *data, unsigned long flags);
+
+int ring_buffer_rename(struct ring_buffer *buffer, char *new_name, ...);
+
+void *ring_buffer_write(struct ring_buffer *buffer,
+ int event_type,
+ unsigned long length,
+ void *event);
+
+enum ring_buffer_iter_flags {
+ RB_ITER_FL_SNAP = 1 << 0,
+};
+
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer);
+
+struct ring_buffer_iter *
+ring_buffer_start(struct ring_buffer *buffer, unsigned flags);
+void ring_buffer_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer_iter *iter, int *next_cpu);
+
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, int *next_cpu);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+
+void ring_buffer_disable(struct ring_buffer *buffer);
+void ring_buffer_enable(struct ring_buffer *buffer);
+
+void ring_buffer_snapshot(struct ring_buffer *buffer);
+void ring_buffer_snapshot_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_snapshot_one_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+ RB_FL_SNAPSHOT = 1 << 1,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-compile.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile.git/kernel/trace/ring_buffer.c 2008-09-24 00:46:40.000000000 -0400
@@ -0,0 +1,1565 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+#define MAX_NAME_SIZE 256
+#define RING_BUFFER_EVENT_DYN_START 1000
+#define RB_EVENT_HASHBITS 10
+#define RB_EVENT_HASHSIZE (1<<RB_EVENT_HASHBITS)
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;
+ void **pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ unsigned long head_page;
+ unsigned long tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+};
+
+struct ring_buffer {
+ char name[MAX_NAME_SIZE + 1];
+ ring_buffer_print_func default_func;
+ unsigned long size;
+ unsigned long next_event_type;
+ unsigned long max_event_size;
+ unsigned pages;
+ unsigned page_size;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ spinlock_t lock;
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
+ struct list_head list;
+ struct list_head events;
+
+ struct ring_buffer_per_cpu *snap_buffers[NR_CPUS];
+
+ struct hlist_head event_hash[RB_EVENT_HASHSIZE];
+
+ /* debugfs file entries */
+ struct dentry *dir_ent;
+ struct dentry *entry_dir;
+ struct dentry *text_ent;
+ struct dentry **binary_ents; /* per cpu */
+};
+
+struct ring_buffer_event_holder {
+ struct ring_buffer *buffer;
+ struct list_head list;
+ struct hlist_node hash;
+ char *name;
+ unsigned event_type;
+ unsigned length;
+ ring_buffer_print_func print_func;
+};
+
+struct ring_buffer_iter_per_cpu {
+ unsigned long head;
+ unsigned long head_page;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer *buffer;
+ struct ring_buffer_iter_per_cpu buffers[NR_CPUS];
+ int next_cpu;
+ unsigned flags;
+};
+
+struct ring_buffer_seq {
+ unsigned char buffer[PAGE_SIZE];
+ unsigned int len;
+ unsigned int readpos;
+};
+
+static struct file_operations text_fops = {
+#if 0
+ .open = text_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = text_release,
+#endif
+};
+
+static struct file_operations binary_fops = {
+#if 0
+ .open = binary_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = binary_release,
+#endif
+};
+
+/* FIXME!!! */
+unsigned long long
+ring_buffer_next_counter(int cpu)
+{
+ return sched_clock();
+}
+
+DEFINE_MUTEX(buffer_mutex);
+static LIST_HEAD(ring_buffer_list);
+static struct dentry *buffer_dent;
+#define TEMP_BUFFER_SIZE 1023
+static char temp_buffer[TEMP_BUFFER_SIZE+1];
+
+static int ring_buffer_register_debugfs(struct ring_buffer *buffer)
+{
+ struct dentry *tracing_dent;
+ struct dentry *dentry;
+ struct dentry *entry;
+ char name_buf[32];
+ int ret = -ENOMEM, i;
+
+ if (!buffer_dent) {
+ tracing_dent = tracing_init_dentry();
+ buffer_dent = debugfs_create_dir("buffers", tracing_dent);
+ if (!buffer_dent) {
+ pr_warning("Could not create debugfs directory"
+ " 'tracing/buffers'\n");
+ return ret;
+ }
+ }
+
+ buffer->binary_ents = kzalloc(sizeof(struct dentry *) * buffer->cpus,
+ GFP_KERNEL);
+ if (!buffer->binary_ents)
+ return ret;
+
+ dentry = debugfs_create_dir(buffer->name, buffer_dent);
+ if (!dentry)
+ goto free_binary_ents;
+ buffer->dir_ent = dentry;
+
+ entry = debugfs_create_file("text", 0444, dentry,
+ buffer, &text_fops);
+ if (!entry)
+ goto fail_free_dir;
+ buffer->text_ent = entry;
+
+ for (i = 0; i < buffer->cpus; i++) {
+ snprintf(name_buf, 32, "binary%d", i);
+ entry = debugfs_create_file(name_buf, 0444, dentry,
+ buffer->buffers[i], &binary_fops);
+ if (!entry)
+ goto fail_free_ents;
+ buffer->binary_ents[i] = entry;
+ }
+
+ return 0;
+
+ fail_free_ents:
+ debugfs_remove(buffer->text_ent);
+ for (i = 0; i < buffer->cpus; i++) {
+ if (buffer->binary_ents[i])
+ debugfs_remove(buffer->binary_ents[i]);
+ }
+
+ fail_free_dir:
+ kfree(buffer->binary_ents);
+ debugfs_remove(dentry);
+
+ free_binary_ents:
+ kfree(buffer->binary_ents);
+ return -1;
+}
+
+static void ring_buffer_unregister_debugfs(struct ring_buffer *buffer)
+{
+ /* fast and simple for now */
+ debugfs_remove_recursive(buffer->dir_ent);
+}
+
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int pages = buffer->pages;
+ int i;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!cpu_buffer->pages)
+ goto fail_free_buffer;
+
+ for (i = 0; i < pages; i++) {
+ cpu_buffer->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cpu_buffer->pages[i])
+ goto fail_free_pages;
+ }
+
+ return cpu_buffer;
+
+ fail_free_pages:
+ for (i = 0; i < pages; i++) {
+ if (cpu_buffer->pages[i])
+ free_page((unsigned long)cpu_buffer->pages[i]);
+ }
+ kfree(cpu_buffer->pages);
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void
+ring_buffer_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ int i;
+
+ for (i = 0; i < cpu_buffer->buffer->pages; i++) {
+ if (cpu_buffer->pages[i])
+ free_page((unsigned long)cpu_buffer->pages[i]);
+ }
+ kfree(cpu_buffer->pages);
+ kfree(cpu_buffer);
+}
+
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned long flags,
+ unsigned long max_event_size,
+ ring_buffer_print_func print_func,
+ const char *name, ...)
+{
+ struct ring_buffer *buffer, *p;
+ va_list args;
+ int order = 0;
+ int ret = 0;
+ int cpu;
+
+ /* For now, we only allow max of page size */
+ if (max_event_size > PAGE_SIZE) {
+ WARN_ON(1);
+ return NULL;
+ }
+ max_event_size = PAGE_SIZE;
+
+ mutex_lock(&buffer_mutex);
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ goto fail_unlock;
+
+ va_start(args, name);
+ vsnprintf(buffer->name, MAX_NAME_SIZE, name, args);
+ va_end(args);
+
+ buffer->name[MAX_NAME_SIZE] = 0;
+ /* FIXME; this should be better than a linear search */
+ list_for_each_entry(p, &ring_buffer_list, list) {
+ if (strcmp(p->name, buffer->name) == 0) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ if (ret)
+ goto fail_free_buffer;
+
+ buffer->page_size = 1 << order << PAGE_SHIFT;
+ buffer->next_event_type = RING_BUFFER_EVENT_DYN_START;
+ buffer->max_event_size = max_event_size;
+ INIT_LIST_HEAD(&buffer->events);
+
+ buffer->default_func = print_func;
+ buffer->pages = (size + (buffer->page_size - 1)) / buffer->page_size;
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ /* FIXME: do for only online CPUS */
+ buffer->cpus = num_possible_cpus();
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ buffer->buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ if (flags & RB_FL_SNAPSHOT) {
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ buffer->snap_buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->snap_buffers[cpu])
+ goto fail_free_snap_buffers;
+ }
+ }
+
+ ret = ring_buffer_register_debugfs(buffer);
+ if (ret)
+ goto fail_free_snap_buffers;
+
+ spin_lock_init(&buffer->lock);
+ mutex_init(&buffer->mutex);
+
+ mutex_unlock(&buffer_mutex);
+
+ return buffer;
+
+ fail_free_snap_buffers:
+ if (!(flags & RB_FL_SNAPSHOT))
+ goto fail_free_buffers;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ if (buffer->snap_buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->snap_buffers[cpu]);
+ }
+
+ fail_free_buffers:
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ if (buffer->buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+
+ fail_free_buffer:
+ kfree(buffer);
+
+ fail_unlock:
+ mutex_unlock(&buffer_mutex);
+ return NULL;
+}
+
+static struct ring_buffer_event_holder *
+__ring_buffer_find_event(struct ring_buffer *buffer, int event_type)
+{
+ struct ring_buffer_event_holder *p;
+ struct hlist_node *t;
+ unsigned long key;
+
+ key = hash_long(event_type, RB_EVENT_HASHBITS);
+
+ hlist_for_each_entry_rcu(p, t, &buffer->event_hash[key], hash) {
+ if (p->event_type == event_type)
+ return p;
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_register_event - register an event to a ring buffer
+ * @buffer: the buffer to register the event to.
+ * @length: the length of the event
+ * @print_func: The pretty print output to handle this event.
+ * @event_type: Set the event type, or 0 to have one given
+ * @name: The name of this event, to show in the debugfs.
+ *
+ * This function allows events to be registered, as well as adding
+ * a function to handle how to show this event in text format.
+ * The event_type must be less than 1000, since that is where
+ * the dynamic event types start. Event types are unique to buffers.
+ */
+int
+ring_buffer_register_event(struct ring_buffer *buffer, unsigned long length,
+ ring_buffer_print_func print_func,
+ int event_type,
+ const char *name, ...)
+{
+ struct ring_buffer_event_holder *ptr, *event;
+ struct list_head *p;
+ va_list args, args2;
+ unsigned long key;
+ int r;
+
+ if (event_type >= RING_BUFFER_EVENT_DYN_START)
+ return -EINVAL;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->print_func = print_func;
+
+ mutex_lock(&buffer->mutex);
+
+ if (!event_type)
+ event_type = buffer->next_event_type++;
+
+ ptr = __ring_buffer_find_event(buffer, event_type);
+ if (ptr) {
+ event_type = -EBUSY;
+ kfree(event);
+ goto out;
+ }
+
+ va_start(args, name);
+ va_copy(args2, args);
+ r = vsnprintf(temp_buffer, TEMP_BUFFER_SIZE, name, args);
+
+ event->name = kzalloc(r+1, GFP_KERNEL);
+ if (!event->name) {
+ va_end(args2);
+ va_end(args);
+ kfree(event);
+ return -ENOMEM;
+ }
+
+ if (unlikely(r >= TEMP_BUFFER_SIZE))
+ vsnprintf(event->name, r+1, name, args2);
+ else
+ strcpy(event->name, temp_buffer);
+
+ va_end(args2);
+ va_end(args);
+
+ list_for_each(p, &buffer->events) {
+ ptr = list_entry(p, struct ring_buffer_event_holder, list);
+ r = strcmp(event->name, ptr->name);
+ if (!r) {
+ WARN_ON(1);
+ kfree(event->name);
+ kfree(event);
+ event = NULL;
+ goto out;
+ }
+ if (r < 0)
+ break;
+ }
+
+ list_add_tail(&event->list, p);
+
+ key = hash_long(event_type, RB_EVENT_HASHBITS);
+ hlist_add_head_rcu(&event->hash, &buffer->event_hash[key]);
+
+ out:
+ mutex_unlock(&buffer->mutex);
+ return event_type;
+}
+
+static void
+ring_buffer_event_free(struct ring_buffer_event_holder *event)
+{
+ kfree(event->name);
+ kfree(event);
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ struct ring_buffer_event_holder *event_holder, *n;
+ int cpu;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+
+ list_for_each_entry_safe(event_holder, n,
+ &buffer->events, list)
+ ring_buffer_event_free(event_holder);
+
+ ring_buffer_unregister_debugfs(buffer);
+ kfree(buffer);
+}
+
+/**
+ * ring_buffer_max_event_size - return max_event_size of the buffer
+ * @buffer: the buffer to get the max event size that is allowed.
+ *
+ * Returns the max event size allowed for the given buffer.
+ */
+unsigned long
+ring_buffer_max_event_size(struct ring_buffer *buffer)
+{
+ return buffer->max_event_size;
+}
+
+static inline int
+ring_buffer_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int
+ring_buffer_null_event(struct ring_buffer_event *event)
+{
+ return !event->type && !event->counter;
+}
+
+static inline int
+ring_buffer_short_event(struct ring_buffer *buffer, unsigned long ptr)
+{
+ return ptr + RB_EVNT_HDR_SIZE > buffer->page_size;
+}
+
+static void
+ring_buffer_update_overflow(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long head_page)
+{
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < buffer->page_size; head += event->length) {
+ if (ring_buffer_short_event(buffer, head))
+ break;
+ event = cpu_buffer->pages[cpu_buffer->head_page] + head;
+ if (ring_buffer_null_event(event))
+ break;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void
+ring_buffer_inc_page(struct ring_buffer *buffer,
+ unsigned long *page)
+{
+ (*page)++;
+ if (*page >= buffer->pages)
+ *page = 0;
+}
+
+static struct ring_buffer_event *
+ring_buffer_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length)
+{
+ unsigned long head_page, tail_page, tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ if (length > buffer->page_size)
+ return NULL;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ BUG_ON(tail_page >= buffer->pages);
+ BUG_ON(head_page >= buffer->pages);
+
+ if (tail + length > buffer->page_size) {
+ unsigned long next_page = tail_page;
+
+ ring_buffer_inc_page(buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ ring_buffer_update_overflow(cpu_buffer, head_page);
+
+ ring_buffer_inc_page(buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ cpu_buffer->head = 0;
+ }
+
+ if (!ring_buffer_short_event(buffer, tail)) {
+ event = cpu_buffer->pages[tail_page] + tail;
+ /* empty event */
+ event->counter = 0;
+ event->type = 0;
+ event->length = buffer->page_size - tail;
+ }
+
+ tail = 0;
+ tail_page = next_page;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ }
+
+ BUG_ON(tail_page >= buffer->pages);
+ BUG_ON(ring_buffer_short_event(buffer, tail));
+
+ event = cpu_buffer->pages[tail_page] + tail;
+ event->length = length;
+ cpu_buffer->entries++;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @event_type: the event type to reserve
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a location on the ring buffer to copy directly to.
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+void *ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ int event_type,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&buffer->record_disabled))
+ goto no_record;
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, 8);
+ WARN_ON(length > buffer->max_event_size);
+ event = ring_buffer_reserve_next_event(cpu_buffer, length);
+ if (!event)
+ goto no_record;
+
+ event->counter = ring_buffer_next_counter(cpu_buffer->cpu);
+ event->type = event_type;
+
+ return &event->body;
+
+ no_record:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @data: The data pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer, void *data, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event =
+ container_of(data, struct ring_buffer_event, body);
+
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ cpu_buffer->tail += event->length;
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @event_type: The event type to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+void *ring_buffer_write(struct ring_buffer *buffer,
+ int event_type,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *ret = NULL;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&buffer->record_disabled))
+ goto out;
+
+ event_length = ALIGN(length + RB_EVNT_HDR_SIZE, 8);
+ event = ring_buffer_reserve_next_event(cpu_buffer, event_length);
+ if (!event)
+ goto out;
+
+ event->counter = ring_buffer_next_counter(cpu_buffer->cpu);
+ event->type = event_type;
+ memcpy(&event->body, data, length);
+
+ cpu_buffer->tail += event->length;
+
+ ret = event->body;
+ out:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+static inline int
+ring_buffer_iter_cpu_empty(struct ring_buffer_iter_per_cpu *cpu_iter,
+ struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_iter->head_page == cpu_buffer->tail_page &&
+ cpu_iter->head == cpu_buffer->tail;
+}
+
+static inline struct ring_buffer_per_cpu *
+iter_choose_buffer(struct ring_buffer_iter *iter, int cpu)
+{
+ struct ring_buffer *buffer = iter->buffer;
+
+ if (iter->flags & RB_ITER_FL_SNAP)
+ return buffer->snap_buffers[cpu];
+ else
+ return buffer->buffers[cpu];
+}
+
+static void
+ring_buffer_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ event = cpu_buffer->pages[cpu_buffer->head_page] + cpu_buffer->head;
+
+ if (ring_buffer_short_event(buffer, cpu_buffer->head) ||
+ ring_buffer_null_event(event)) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ cpu_buffer->head = 0;
+ return;
+ }
+
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + event->length > cpu_buffer->tail));
+
+ cpu_buffer->head += event->length;
+ if (ring_buffer_short_event(buffer, cpu_buffer->head)) {
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ cpu_buffer->head = 0;
+ return;
+ }
+
+ /* check for end of page padding */
+ event = cpu_buffer->pages[cpu_buffer->head_page] + cpu_buffer->head;
+ if ((ring_buffer_short_event(buffer, cpu_buffer->head) ||
+ ring_buffer_null_event(event)) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_head(cpu_buffer);
+}
+
+static void
+ring_buffer_advance_iter(struct ring_buffer_iter *iter, int cpu)
+{
+ struct ring_buffer *buffer = iter->buffer;
+ struct ring_buffer_iter_per_cpu *cpu_iter = &iter->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ cpu_buffer = iter_choose_buffer(iter, cpu);
+ event = cpu_buffer->pages[cpu_iter->head_page] + cpu_iter->head;
+
+ if (ring_buffer_short_event(buffer, cpu_iter->head) ||
+ ring_buffer_null_event(event)) {
+ BUG_ON(cpu_iter->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(buffer, &cpu_iter->head_page);
+ cpu_iter->head = 0;
+ return;
+ }
+
+ BUG_ON((cpu_iter->head_page == cpu_buffer->tail_page) &&
+ (cpu_iter->head + event->length > cpu_buffer->tail));
+
+ cpu_iter->head += event->length;
+ if (ring_buffer_short_event(buffer, cpu_iter->head)) {
+ ring_buffer_inc_page(buffer, &cpu_iter->head_page);
+ cpu_iter->head = 0;
+ return;
+ }
+
+ /* check for end of page padding */
+ event = cpu_buffer->pages[cpu_iter->head_page] + cpu_iter->head;
+ if ((ring_buffer_short_event(buffer, cpu_iter->head) ||
+ ring_buffer_null_event(event)) &&
+ (cpu_iter->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_iter(iter, cpu);
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event, *next_event = NULL;
+ int cpu, next_cpu = -1;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ continue;
+
+ event = cpu_buffer->pages[cpu_buffer->head_page] +
+ cpu_buffer->head;
+ if (unlikely(ring_buffer_short_event(buffer, cpu_buffer->head) ||
+ ring_buffer_null_event(event))) {
+ if (cpu_buffer->head_page == cpu_buffer->tail_page)
+ continue;
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ cpu_buffer->head = 0;
+
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ continue;
+ event = cpu_buffer->pages[cpu_buffer->head_page] +
+ cpu_buffer->head;
+ }
+
+ if (!next_event || event->counter < next_event->counter) {
+ next_cpu = cpu;
+ next_event = event;
+ }
+ }
+
+ if (!next_event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[next_cpu];
+ ring_buffer_advance_head(cpu_buffer);
+ cpu_buffer->entries--;
+
+ return next_event;
+}
+
+/**
+ * ring_buffer_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @iter_flags: control flags on how to read the buffer.
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * The iter_flags of RB_ITER_FL_SNAP will read the snapshot image
+ * and not the main buffer.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_start(struct ring_buffer *buffer, unsigned iter_flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+ unsigned long flags;
+ int cpu;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->buffer = buffer;
+ iter->flags = iter_flags;
+
+ WARN_ON((iter_flags & RB_ITER_FL_SNAP) &&
+ !(buffer->flags & RB_FL_SNAPSHOT));
+
+ atomic_inc(&buffer->record_disabled);
+
+ ring_buffer_lock(buffer, &flags);
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = iter_choose_buffer(iter, cpu);
+ iter->buffers[cpu].head = cpu_buffer->head;
+ iter->buffers[cpu].head_page = cpu_buffer->head_page;
+ }
+ ring_buffer_unlock(buffer, flags);
+
+ iter->next_cpu = -1;
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer = iter->buffer;
+
+ atomic_dec(&buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @iter_next_cpu: The CPU that the next event belongs on
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer_iter *iter, int *iter_next_cpu)
+{
+ struct ring_buffer *buffer = iter->buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter_per_cpu *cpu_iter;
+ struct ring_buffer_event *event, *next_event = NULL;
+ int cpu, next_cpu = -1;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = iter_choose_buffer(iter, cpu);
+ cpu_iter = &iter->buffers[cpu];
+
+ if (ring_buffer_iter_cpu_empty(cpu_iter, cpu_buffer))
+ continue;
+
+ event = cpu_buffer->pages[cpu_iter->head_page] + cpu_iter->head;
+
+ if (ring_buffer_short_event(buffer, cpu_iter->head) ||
+ ring_buffer_null_event(event)) {
+ if (cpu_iter->head_page == cpu_buffer->tail_page)
+ continue;
+ ring_buffer_inc_page(buffer, &cpu_iter->head_page);
+ cpu_iter->head = 0;
+
+ if (ring_buffer_iter_cpu_empty(cpu_iter, cpu_buffer))
+ continue;
+
+ event = cpu_buffer->pages[cpu_iter->head_page]
+ + cpu_iter->head;
+ }
+
+ if (!next_event || event->counter < next_event->counter) {
+ next_cpu = cpu;
+ next_event = event;
+ }
+ }
+
+ if (!next_event)
+ return NULL;
+
+ if (iter_next_cpu)
+ *iter_next_cpu = next_cpu;
+
+ return next_event;
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @iter_next_cpu: The CPU that the event was read from
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, int *iter_next_cpu)
+{
+ struct ring_buffer_event *event;
+ int next_cpu;
+
+ event = ring_buffer_peek(iter, &next_cpu);
+ if (!event)
+ return NULL;
+
+ ring_buffer_advance_iter(iter, next_cpu);
+
+ if (iter_next_cpu)
+ *iter_next_cpu = next_cpu;
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return buffer->page_size * buffer->pages;
+}
+
+/**
+ * ring_buffer_rename - rename the ring buffer
+ * @buffer: The ring buffer to rename
+ * @new_name: The new name to rename the ring buffer to
+ */
+int ring_buffer_rename(struct ring_buffer *buffer, char *new_name, ...)
+{
+ va_list args;
+ int ret;
+
+ mutex_lock(&buffer_mutex);
+
+ va_start(args, new_name);
+ vsnprintf(buffer->name, MAX_NAME_SIZE, new_name, args);
+ va_end(args);
+
+ buffer->dir_ent = debugfs_rename(buffer_dent, buffer->dir_ent,
+ buffer_dent, buffer->name);
+ if (!buffer->dir_ent) {
+ WARN_ON(1);
+ ret = -EBUSY;
+ }
+
+ mutex_unlock(&buffer_mutex);
+
+ return ret;
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer, int cpu)
+{
+ cpu_buffer->head_page = cpu_buffer->tail_page = 0;
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ __ring_buffer_reset_cpu(cpu_buffer, cpu);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!ring_buffer_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ cpu_buffer = buffer->buffers[cpu];
+ return ring_buffer_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_snapshot_cpu - take a snapshot of a current ring buffer cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: Take a snapshot of this CPU buffer
+ *
+ * A snapshot of the per CPU buffer is saved and the main buffer is
+ * replaced. This allows live traces to have a snap shot taken.
+ * This is very effective when needing to take maximums and still record
+ * new traces.
+ */
+void ring_buffer_snapshot_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ cpu_buffer = buffer->buffers[cpu];
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ __ring_buffer_reset_cpu(buffer->snap_buffers[cpu], cpu);
+ buffer->buffers[cpu] = buffer->snap_buffers[cpu];
+ buffer->snap_buffers[cpu] = cpu_buffer;
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_snapshot_one_cpu - take a snapshot of cpu and zero the rest
+ * @buffer: The ring buffer
+ * @cpu: Take a snapshot of this CPU buffer
+ *
+ * A snapshot of the per CPU buffer is saved and the main buffer is
+ * replaced. This allows live traces to have a snap shot taken.
+ * This is very effective when needing to take maximums and still record
+ * new traces.
+ *
+ * This function will not only snapshot a particular CPU buffer, but it
+ * will also zero the others. This facilitates reading the snapshot buffer
+ * if only one buffer is of interest.
+ */
+void ring_buffer_snapshot_one_cpu(struct ring_buffer *buffer, int snap_cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int cpu;
+
+ raw_local_irq_save(flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ __ring_buffer_reset_cpu(buffer->snap_buffers[cpu], cpu);
+ if (cpu == snap_cpu) {
+ buffer->buffers[cpu] = buffer->snap_buffers[cpu];
+ buffer->snap_buffers[cpu] = cpu_buffer;
+ }
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_snapshot - take a snapshot of the ring buffer
+ * @buffer: The ring buffer
+ *
+ * A snapshot of the entire ring buffer is saved, and can be
+ * retrieved later, even when we currently have a live trace
+ * recording.
+ */
+void ring_buffer_snapshot(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int cpu;
+
+ raw_local_irq_save(flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ __ring_buffer_reset_cpu(buffer->snap_buffers[cpu], cpu);
+ buffer->buffers[cpu] = buffer->snap_buffers[cpu];
+ buffer->snap_buffers[cpu] = cpu_buffer;
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ raw_local_irq_restore(flags);
+}
+
+struct ring_buffer_seq *
+ring_buffer_seq_alloc(gfp_t flags)
+{
+ struct ring_buffer_seq *s;
+
+ s = kzalloc(sizeof(*s), flags);
+ return s;
+}
+
+void ring_buffer_seq_free(struct ring_buffer_seq *s)
+{
+ kfree(s);
+}
+
+unsigned ring_buffer_seq_length(struct ring_buffer_seq *seq)
+{
+ return seq->len;
+}
+
+void ring_buffer_seq_set_length(struct ring_buffer_seq *seq, unsigned len)
+{
+ BUG_ON(len > PAGE_SIZE);
+ seq->len = len;
+}
+
+/**
+ * ring_buffer_seq_printf - sequence printing of buffer information
+ * @s: buffer sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * ring_buffer_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+ring_buffer_seq_printf(struct ring_buffer_seq *s, const char *fmt, ...)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ va_list ap;
+ int ret;
+
+ if (!len)
+ return 0;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len)
+ return 0;
+
+ s->len += ret;
+
+ return len;
+}
+
+/**
+ * ring_buffer_seq_puts - buffer sequence printing of simple string
+ * @s: buffer sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+int
+ring_buffer_seq_puts(struct ring_buffer_seq *s, const char *str)
+{
+ int len = strlen(str);
+
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+
+int
+ring_buffer_seq_putc(struct ring_buffer_seq *s, unsigned char c)
+{
+ if (s->len >= (PAGE_SIZE - 1))
+ return 0;
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+
+int
+ring_buffer_seq_putmem(struct ring_buffer_seq *s, void *mem, size_t len)
+{
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+
+ return len;
+}
+
+#define HEX_CHARS 17
+static const char hex2asc[] = "0123456789abcdef";
+
+int
+ring_buffer_seq_putmem_hex(struct ring_buffer_seq *s, void *mem, size_t len)
+{
+ unsigned char hex[HEX_CHARS];
+ unsigned char *data = mem;
+ unsigned char byte;
+ int i, j;
+
+ BUG_ON(len >= HEX_CHARS);
+
+#ifdef __BIG_ENDIAN
+ for (i = 0, j = 0; i < len; i++) {
+#else
+ for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+ byte = data[i];
+
+ hex[j++] = hex2asc[byte & 0x0f];
+ hex[j++] = hex2asc[byte >> 4];
+ }
+ hex[j++] = ' ';
+
+ return ring_buffer_seq_putmem(s, hex, j);
+}
+
+void
+ring_buffer_seq_reset(struct ring_buffer_seq *s)
+{
+ s->len = 0;
+ s->readpos = 0;
+}
+
+ssize_t
+ring_buffer_seq_copy_to_user(struct ring_buffer_seq *s,
+ char __user *ubuf, size_t cnt)
+{
+ int len;
+ int ret;
+
+ if (s->len <= s->readpos)
+ return -EBUSY;
+
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+ if (ret)
+ return -EFAULT;
+
+ s->readpos += len;
+
+ if (s->readpos >= s->len)
+ ring_buffer_seq_reset(s);
+
+ return cnt;
+}
+
+int
+ring_buffer_seq_copy_to_mem(struct ring_buffer_seq *s,
+ void *mem, int cnt)
+{
+ int len;
+
+ if (s->len <= s->readpos)
+ return -EBUSY;
+
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ memcpy(mem, s->buffer + s->readpos, cnt);
+
+ s->readpos += len;
+
+ if (s->readpos >= s->len)
+ ring_buffer_seq_reset(s);
+
+ return cnt;
+}
+
+int
+ring_buffer_seq_to_seqfile(struct seq_file *m, struct ring_buffer_seq *s)
+{
+ int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+ int ret;
+
+ s->buffer[len] = 0;
+ ret = seq_puts(m, s->buffer);
+ if (ret)
+ ring_buffer_seq_reset(s);
+ return ret;
+}
Index: linux-compile.git/kernel/trace/Kconfig
===================================================================
--- linux-compile.git.orig/kernel/trace/Kconfig 2008-07-27 09:26:34.000000000 -0400
+++ linux-compile.git/kernel/trace/Kconfig 2008-09-22 11:47:29.000000000 -0400
@@ -15,6 +15,10 @@ config TRACING
select DEBUG_FS
select STACKTRACE

+config RING_BUFFER
+ bool "ring buffer"
+ select DEBUG_FS
+
config FTRACE
bool "Kernel Function Tracer"
depends on HAVE_FTRACE
Index: linux-compile.git/kernel/trace/Makefile
===================================================================
--- linux-compile.git.orig/kernel/trace/Makefile 2008-07-27 09:26:34.000000000 -0400
+++ linux-compile.git/kernel/trace/Makefile 2008-09-22 11:46:46.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o

Peter Zijlstra

2008-09-24 15:10:13 UTC

plain text document attachment (ring-buffer.patch)
RFC RFC RFC RFC RFC!!!!

Plenty comments, things I like, things I don't like, specifics below ;-)

Now did I get your attention that this is a request for comment patch.
This is probably very buggy. I ran it as a back end for ftrace but only
tested the irqsoff and ftrace tracers. The selftests are busted with it.
But this is an attempt to get a unified buffering system that was
talked about at the LPC meeting.
I did not get a chance to implement all the event recording and printing
in the debugfs/tracing/buffers directory. But I got enough to do
some ftrace work with it.
Now that it boots and runs (albeit, a bit buggy), I decided to post it.
This is some idea that I had to handle this.
buffer = ring_buffer_alloc(size, flags, max_event_size, print_func, name);

Don't like that max_event_size param.
Don't like that print_func param.
Maybe like the name param.

event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

This can, in generic, not work. Due to the simple fact that we might
straddle a page boundary. Therefore I think its best to limit our self
to the write interface below, so that it can handle that.

ring_buffer_write(buffer, event_id, length, data);

Don't like the event_id, just stick to plain binary data, and leave
interpretation open to whoemever uses it.

event = ring_buffer_consume(buffer);

By the above, this would have to be per-cpu as you cannot interpret the
actual binary data, and this cannot do the fwd-merge-sort-iter thing.

iter = ring_buffer_start(buffer, iter_flags);
event = ring_buffer_read(iter, &next_cpu);
ring_buffer_finish(iter);
Note, the iteration part stops recording to the buffer. This is a feature.
If you want producer/consumer, then you should be using the consumer.

Why?

---

So the interface I would like to see is:

struct ringbuffer *ringbuffer_alloc(unsigned long size);
void ringbuffer_free(struct ringbuffer *rb);

/*
* disables preemption, cmpxchg reserves size on local cpu, memcpy
* chunks of @buf into the page buffers, enables preemption.
*/
int ringbuffer_write(struct ringbuffer *buffer, const void *buf, unsigned long size);

/*
* consumes @size data from @cpu's buffer and copies it into @buf.
* has internal synchronization for read pos, returns error when
* overwritten - but resets state to next half-buffer.
*/
int ringbuffer_read(struct ringbuffer *buffer, int cpu, void *buf, unsigned long size);

This interface is enough to abstract the fact that our buffer
is non-linear and further assumes as little as possible.

For your IRQ/preempt tracers you can possibly add another operation:

/*
* swaps the @cpu buffer of @src and @dst, returns error when dimensions
* mis-match.
*/
int ringbuffer_xchg_cpu(struct ringbuffer *src, struct ringbuffer *dst, int cpu);

---

On top of that foundation build an eventbuffer, which knows about
encoding/decoding/printing events.

This too needs to be a flexible layer - as I suspect the google guys
will want their ultra-compressed events back.

I'm not quite sure yet how to model this layer most flexible without
being a nuisance.

So obviously its also this layer that has the whole debugfs interface,
but maybe we could even push that one layer up, so as to keep that
reusable with the print methods and be encoding invariant.

---
include/linux/ring_buffer.h | 138 +++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1565 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1708 insertions(+)
Index: linux-compile.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile.git/include/linux/ring_buffer.h 2008-09-23 17:45:49.000000000 -0400
@@ -0,0 +1,138 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ unsigned long long counter;
+ short type;
+ short length;
+ char body[];
+} __attribute__((__packed__));
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+
+static inline unsigned
+ring_buffer_event_type(struct ring_buffer_event *event_handler)
+{
+ return event_handler->type;
+}
+
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event_handler)
+{
+ return event_handler->length;
+}
+
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event_handler)
+{
+ return event_handler->body;
+}
+
+static inline unsigned long long
+ring_buffer_event_counter(struct ring_buffer_event *event_handler)
+{
+ return event_handler->counter;
+}
+
+struct ring_buffer_seq;
+
+unsigned long ring_buffer_max_event_size(struct ring_buffer *buffer);
+
+typedef void (*ring_buffer_print_func) (struct ring_buffer *buffer,
+ struct ring_buffer_seq *seq,
+ struct ring_buffer_event *event);
+
+struct ring_buffer_seq *ring_buffer_seq_alloc(gfp_t flags);
+void ring_buffer_seq_free(struct ring_buffer_seq *seq);
+unsigned ring_buffer_seq_length(struct ring_buffer_seq *seq);
+void ring_buffer_seq_set_length(struct ring_buffer_seq *seq, unsigned len);
+int ring_buffer_seq_printf(struct ring_buffer_seq *seq, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+int ring_buffer_seq_puts(struct ring_buffer_seq *seq, const char *str);
+int ring_buffer_seq_putc(struct ring_buffer_seq *seq, unsigned char c);
+int ring_buffer_seq_putmem(struct ring_buffer_seq *s, void *mem, size_t len);
+int ring_buffer_seq_to_seqfile(struct seq_file *m, struct ring_buffer_seq *s);
+int ring_buffer_seq_putmem_hex(struct ring_buffer_seq *s, void *mem, size_t len);
+ssize_t ring_buffer_seq_copy_to_user(struct ring_buffer_seq *seq,
+ char __user *ubuf,
+ size_t cnt);
+int ring_buffer_seq_to_mem(struct ring_buffer_seq *s, void *mem, size_t len);
+void ring_buffer_seq_reset(struct ring_buffer_seq *s);
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned long flags,
+ unsigned long max_event_size,
+ ring_buffer_print_func print_func,
+ const char *name, ...);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int
+ring_buffer_register_event(struct ring_buffer *buffer, unsigned long length,
+ ring_buffer_print_func print_func,
+ int event_type,
+ const char *name, ...);
+void *ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ int event_type,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ void *data, unsigned long flags);
+
+int ring_buffer_rename(struct ring_buffer *buffer, char *new_name, ...);
+
+void *ring_buffer_write(struct ring_buffer *buffer,
+ int event_type,
+ unsigned long length,
+ void *event);
+
+enum ring_buffer_iter_flags {
+ RB_ITER_FL_SNAP = 1 << 0,
+};
+
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer);
+
+struct ring_buffer_iter *
+ring_buffer_start(struct ring_buffer *buffer, unsigned flags);
+void ring_buffer_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer_iter *iter, int *next_cpu);
+
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, int *next_cpu);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+
+void ring_buffer_disable(struct ring_buffer *buffer);
+void ring_buffer_enable(struct ring_buffer *buffer);
+
+void ring_buffer_snapshot(struct ring_buffer *buffer);
+void ring_buffer_snapshot_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_snapshot_one_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+ RB_FL_SNAPSHOT = 1 << 1,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-compile.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile.git/kernel/trace/ring_buffer.c 2008-09-24 00:46:40.000000000 -0400
@@ -0,0 +1,1565 @@
+/*
+ * Generic ring buffer
+ *
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+#define MAX_NAME_SIZE 256
+#define RING_BUFFER_EVENT_DYN_START 1000
+#define RB_EVENT_HASHBITS 10
+#define RB_EVENT_HASHSIZE (1<<RB_EVENT_HASHBITS)
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;

lockdep keys cannot be in dynamic storage, also mainline raw_spinlock_t
doesn't have lockdep.

+ void **pages;

You used to link these using the pageframe, what happened to that?

+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ unsigned long head_page;
+ unsigned long tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+};
+
+struct ring_buffer {
+ char name[MAX_NAME_SIZE + 1];
+ ring_buffer_print_func default_func;
+ unsigned long size;
+ unsigned long next_event_type;
+ unsigned long max_event_size;
+ unsigned pages;
+ unsigned page_size;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ spinlock_t lock;
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];

People (read SGI) prefer you dynamically allocate this array due to them
wanting distros to be able to set NR_CPUS=insane. Same goes for all
NR_CPUS usage below.

+ struct list_head list;
+ struct list_head events;
+
+ struct ring_buffer_per_cpu *snap_buffers[NR_CPUS];
+
+ struct hlist_head event_hash[RB_EVENT_HASHSIZE];
+
+ /* debugfs file entries */
+ struct dentry *dir_ent;
+ struct dentry *entry_dir;
+ struct dentry *text_ent;
+ struct dentry **binary_ents; /* per cpu */
+};
+
+struct ring_buffer_event_holder {
+ struct ring_buffer *buffer;
+ struct list_head list;
+ struct hlist_node hash;
+ char *name;
+ unsigned event_type;
+ unsigned length;
+ ring_buffer_print_func print_func;
+};
+
+struct ring_buffer_iter_per_cpu {
+ unsigned long head;
+ unsigned long head_page;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer *buffer;
+ struct ring_buffer_iter_per_cpu buffers[NR_CPUS];
+ int next_cpu;
+ unsigned flags;
+};
+
+struct ring_buffer_seq {
+ unsigned char buffer[PAGE_SIZE];
+ unsigned int len;
+ unsigned int readpos;
+};

Why not dynamically allocate the page?

+static struct file_operations text_fops = {
+#if 0
+ .open = text_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = text_release,
+#endif
+};
+
+static struct file_operations binary_fops = {
+#if 0
+ .open = binary_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = binary_release,
+#endif
+};
+
+/* FIXME!!! */
+unsigned long long
+ring_buffer_next_counter(int cpu)
+{
+ return sched_clock();
+}

Yeah, we should ask for mathieu's event stamp counter. Non of the clocks
we currently have suffice for this goal.

+DEFINE_MUTEX(buffer_mutex);
+static LIST_HEAD(ring_buffer_list);
+static struct dentry *buffer_dent;
+#define TEMP_BUFFER_SIZE 1023
+static char temp_buffer[TEMP_BUFFER_SIZE+1];
+
+static int ring_buffer_register_debugfs(struct ring_buffer *buffer)
+{
+ struct dentry *tracing_dent;
+ struct dentry *dentry;
+ struct dentry *entry;
+ char name_buf[32];
+ int ret = -ENOMEM, i;
+
+ if (!buffer_dent) {
+ tracing_dent = tracing_init_dentry();
+ buffer_dent = debugfs_create_dir("buffers", tracing_dent);
+ if (!buffer_dent) {
+ pr_warning("Could not create debugfs directory"
+ " 'tracing/buffers'\n");
+ return ret;
+ }
+ }
+
+ buffer->binary_ents = kzalloc(sizeof(struct dentry *) * buffer->cpus,
+ GFP_KERNEL);
+ if (!buffer->binary_ents)
+ return ret;
+
+ dentry = debugfs_create_dir(buffer->name, buffer_dent);
+ if (!dentry)
+ goto free_binary_ents;
+ buffer->dir_ent = dentry;
+
+ entry = debugfs_create_file("text", 0444, dentry,
+ buffer, &text_fops);
+ if (!entry)
+ goto fail_free_dir;
+ buffer->text_ent = entry;
+
+ for (i = 0; i < buffer->cpus; i++) {
+ snprintf(name_buf, 32, "binary%d", i);
+ entry = debugfs_create_file(name_buf, 0444, dentry,
+ buffer->buffers[i], &binary_fops);
+ if (!entry)
+ goto fail_free_ents;
+ buffer->binary_ents[i] = entry;
+ }
+
+ return 0;
+
+ debugfs_remove(buffer->text_ent);
+ for (i = 0; i < buffer->cpus; i++) {
+ if (buffer->binary_ents[i])
+ debugfs_remove(buffer->binary_ents[i]);
+ }
+
+ kfree(buffer->binary_ents);
+ debugfs_remove(dentry);
+
+ kfree(buffer->binary_ents);
+ return -1;
+}
+
+static void ring_buffer_unregister_debugfs(struct ring_buffer *buffer)
+{
+ /* fast and simple for now */
+ debugfs_remove_recursive(buffer->dir_ent);
+}
+
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int pages = buffer->pages;
+ int i;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;

Ah, see, here you don't use this lockdep key ;-

+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!cpu_buffer->pages)
+ goto fail_free_buffer;
+
+ for (i = 0; i < pages; i++) {
+ cpu_buffer->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cpu_buffer->pages[i])
+ goto fail_free_pages;
+ }

Like said, I rather liked using the pageframe to link these pages
together. The simple fact is that both read and write are fwd iterative
operations so you don't need the random access array.

+ return cpu_buffer;
+
+ for (i = 0; i < pages; i++) {
+ if (cpu_buffer->pages[i])
+ free_page((unsigned long)cpu_buffer->pages[i]);
+ }
+ kfree(cpu_buffer->pages);
+
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void
+ring_buffer_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ int i;
+
+ for (i = 0; i < cpu_buffer->buffer->pages; i++) {
+ if (cpu_buffer->pages[i])
+ free_page((unsigned long)cpu_buffer->pages[i]);
+ }
+ kfree(cpu_buffer->pages);
+ kfree(cpu_buffer);
+}
+
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned long flags,
+ unsigned long max_event_size,
+ ring_buffer_print_func print_func,
+ const char *name, ...)
+{
+ struct ring_buffer *buffer, *p;
+ va_list args;
+ int order = 0;
+ int ret = 0;
+ int cpu;
+
+ /* For now, we only allow max of page size */
+ if (max_event_size > PAGE_SIZE) {
+ WARN_ON(1);
+ return NULL;
+ }
+ max_event_size = PAGE_SIZE;
+
+ mutex_lock(&buffer_mutex);
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ goto fail_unlock;
+
+ va_start(args, name);
+ vsnprintf(buffer->name, MAX_NAME_SIZE, name, args);
+ va_end(args);
+
+ buffer->name[MAX_NAME_SIZE] = 0;
+ /* FIXME; this should be better than a linear search */
+ list_for_each_entry(p, &ring_buffer_list, list) {
+ if (strcmp(p->name, buffer->name) == 0) {
+ ret = -EBUSY;
+ break;
+ }
+ }
+ if (ret)
+ goto fail_free_buffer;
+
+ buffer->page_size = 1 << order << PAGE_SHIFT;
+ buffer->next_event_type = RING_BUFFER_EVENT_DYN_START;
+ buffer->max_event_size = max_event_size;
+ INIT_LIST_HEAD(&buffer->events);
+
+ buffer->default_func = print_func;
+ buffer->pages = (size + (buffer->page_size - 1)) / buffer->page_size;
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ /* FIXME: do for only online CPUS */
+ buffer->cpus = num_possible_cpus();
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ buffer->buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ if (flags & RB_FL_SNAPSHOT) {
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ buffer->snap_buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->snap_buffers[cpu])
+ goto fail_free_snap_buffers;
+ }
+ }

Right, like said above, I don't think you need the snapshot stuff in
here, if you provide this per cpu buffer xchg method.

+ ret = ring_buffer_register_debugfs(buffer);
+ if (ret)
+ goto fail_free_snap_buffers;
+
+ spin_lock_init(&buffer->lock);
+ mutex_init(&buffer->mutex);
+
+ mutex_unlock(&buffer_mutex);
+
+ return buffer;
+
+ if (!(flags & RB_FL_SNAPSHOT))
+ goto fail_free_buffers;
+
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ if (buffer->snap_buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->snap_buffers[cpu]);
+ }
+
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ if (buffer->buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+
+ kfree(buffer);
+
+ mutex_unlock(&buffer_mutex);
+ return NULL;
+}
+
+static struct ring_buffer_event_holder *
+__ring_buffer_find_event(struct ring_buffer *buffer, int event_type)
+{
+ struct ring_buffer_event_holder *p;
+ struct hlist_node *t;
+ unsigned long key;
+
+ key = hash_long(event_type, RB_EVENT_HASHBITS);
+
+ hlist_for_each_entry_rcu(p, t, &buffer->event_hash[key], hash) {
+ if (p->event_type == event_type)
+ return p;
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_register_event - register an event to a ring buffer
+ *
+ * This function allows events to be registered, as well as adding
+ * a function to handle how to show this event in text format.
+ * The event_type must be less than 1000, since that is where
+ * the dynamic event types start. Event types are unique to buffers.
+ */
+int
+ring_buffer_register_event(struct ring_buffer *buffer, unsigned long length,
+ ring_buffer_print_func print_func,
+ int event_type,
+ const char *name, ...)
+{
+ struct ring_buffer_event_holder *ptr, *event;
+ struct list_head *p;
+ va_list args, args2;
+ unsigned long key;
+ int r;
+
+ if (event_type >= RING_BUFFER_EVENT_DYN_START)
+ return -EINVAL;
+
+ event = kzalloc(sizeof(*event), GFP_KERNEL);
+ if (!event)
+ return -ENOMEM;
+
+ event->print_func = print_func;
+
+ mutex_lock(&buffer->mutex);
+
+ if (!event_type)
+ event_type = buffer->next_event_type++;
+
+ ptr = __ring_buffer_find_event(buffer, event_type);
+ if (ptr) {
+ event_type = -EBUSY;
+ kfree(event);
+ goto out;
+ }
+
+ va_start(args, name);
+ va_copy(args2, args);
+ r = vsnprintf(temp_buffer, TEMP_BUFFER_SIZE, name, args);
+
+ event->name = kzalloc(r+1, GFP_KERNEL);
+ if (!event->name) {
+ va_end(args2);
+ va_end(args);
+ kfree(event);
+ return -ENOMEM;
+ }
+
+ if (unlikely(r >= TEMP_BUFFER_SIZE))
+ vsnprintf(event->name, r+1, name, args2);
+ else
+ strcpy(event->name, temp_buffer);
+
+ va_end(args2);
+ va_end(args);
+
+ list_for_each(p, &buffer->events) {
+ ptr = list_entry(p, struct ring_buffer_event_holder, list);
+ r = strcmp(event->name, ptr->name);
+ if (!r) {
+ WARN_ON(1);
+ kfree(event->name);
+ kfree(event);
+ event = NULL;
+ goto out;
+ }
+ if (r < 0)
+ break;
+ }
+
+ list_add_tail(&event->list, p);
+
+ key = hash_long(event_type, RB_EVENT_HASHBITS);
+ hlist_add_head_rcu(&event->hash, &buffer->event_hash[key]);
+
+ mutex_unlock(&buffer->mutex);
+ return event_type;
+}
+
+static void
+ring_buffer_event_free(struct ring_buffer_event_holder *event)
+{
+ kfree(event->name);
+ kfree(event);
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ struct ring_buffer_event_holder *event_holder, *n;
+ int cpu;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+
+ list_for_each_entry_safe(event_holder, n,
+ &buffer->events, list)
+ ring_buffer_event_free(event_holder);
+
+ ring_buffer_unregister_debugfs(buffer);
+ kfree(buffer);
+}
+
+/**
+ * ring_buffer_max_event_size - return max_event_size of the buffer
+ *
+ * Returns the max event size allowed for the given buffer.
+ */
+unsigned long
+ring_buffer_max_event_size(struct ring_buffer *buffer)
+{
+ return buffer->max_event_size;
+}

Right, I think this sound die.

+static inline int
+ring_buffer_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int
+ring_buffer_null_event(struct ring_buffer_event *event)
+{
+ return !event->type && !event->counter;
+}
+
+static inline int
+ring_buffer_short_event(struct ring_buffer *buffer, unsigned long ptr)
+{
+ return ptr + RB_EVNT_HDR_SIZE > buffer->page_size;
+}
+
+static void
+ring_buffer_update_overflow(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long head_page)
+{
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < buffer->page_size; head += event->length) {
+ if (ring_buffer_short_event(buffer, head))
+ break;
+ event = cpu_buffer->pages[cpu_buffer->head_page] + head;
+ if (ring_buffer_null_event(event))
+ break;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void
+ring_buffer_inc_page(struct ring_buffer *buffer,
+ unsigned long *page)
+{
+ (*page)++;
+ if (*page >= buffer->pages)
+ *page = 0;
+}
+
+static struct ring_buffer_event *
+ring_buffer_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length)
+{
+ unsigned long head_page, tail_page, tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ if (length > buffer->page_size)
+ return NULL;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ BUG_ON(tail_page >= buffer->pages);
+ BUG_ON(head_page >= buffer->pages);
+
+ if (tail + length > buffer->page_size) {
+ unsigned long next_page = tail_page;
+
+ ring_buffer_inc_page(buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ ring_buffer_update_overflow(cpu_buffer, head_page);
+
+ ring_buffer_inc_page(buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ cpu_buffer->head = 0;
+ }
+
+ if (!ring_buffer_short_event(buffer, tail)) {
+ event = cpu_buffer->pages[tail_page] + tail;
+ /* empty event */
+ event->counter = 0;
+ event->type = 0;
+ event->length = buffer->page_size - tail;
+ }
+
+ tail = 0;
+ tail_page = next_page;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ }
+
+ BUG_ON(tail_page >= buffer->pages);
+ BUG_ON(ring_buffer_short_event(buffer, tail));
+
+ event = cpu_buffer->pages[tail_page] + tail;
+ event->length = length;
+ cpu_buffer->entries++;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ *
+ * Returns a location on the ring buffer to copy directly to.
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+void *ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ int event_type,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);

I'm a bit mystified by the need to take an actual lock on the write path
- disabling preemption should be sufficient to keep up local, or are you
synchonizing against read as well?

+ if (atomic_read(&buffer->record_disabled))
+ goto no_record;
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, 8);
+ WARN_ON(length > buffer->max_event_size);
+ event = ring_buffer_reserve_next_event(cpu_buffer, length);
+ if (!event)
+ goto no_record;
+
+ event->counter = ring_buffer_next_counter(cpu_buffer->cpu);
+ event->type = event_type;
+
+ return &event->body;
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer, void *data, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event =
+ container_of(data, struct ring_buffer_event, body);
+
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ cpu_buffer->tail += event->length;
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}

Like said, I don't think we can use the reserve/commit interface due to
not having a linear buffer.

+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+void *ring_buffer_write(struct ring_buffer *buffer,
+ int event_type,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *ret = NULL;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&buffer->record_disabled))
+ goto out;
+
+ event_length = ALIGN(length + RB_EVNT_HDR_SIZE, 8);
+ event = ring_buffer_reserve_next_event(cpu_buffer, event_length);
+ if (!event)
+ goto out;
+
+ event->counter = ring_buffer_next_counter(cpu_buffer->cpu);
+ event->type = event_type;
+ memcpy(&event->body, data, length);

I'm missing the loop over the page...

+ cpu_buffer->tail += event->length;
+
+ ret = event->body;
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}

This stuff made me go GAH for a bit,.. the only user I could quickly
locate is ring_buffer_start() (which btw. is a horribly name) which,
because it disables writing to the buffer, doesn't seem to need such
heavy handed locking.

+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}

Right - I dont think this should be part of the ringbuffer interface,
maybe on the event interface, preferable on the tracer layer.

+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}

Does it make sense to expose this as a statistics interface like:

enum ringbuffer_stats {
RB_ENTRIES,
RB_OVERFLOWS,
}

unsigned long ringbuffer_stat_cpu(struct ringbuffer *buffer, int cpu, enum ringbuffer_stats);

+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}

The summing part of the statistics interface..

unsigned long ringbuffer_stat(struct ringbuffer *buffer, enum ringbuffer_stats);

+static inline int
+ring_buffer_iter_cpu_empty(struct ring_buffer_iter_per_cpu *cpu_iter,
+ struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_iter->head_page == cpu_buffer->tail_page &&
+ cpu_iter->head == cpu_buffer->tail;
+}
+
+static inline struct ring_buffer_per_cpu *
+iter_choose_buffer(struct ring_buffer_iter *iter, int cpu)
+{
+ struct ring_buffer *buffer = iter->buffer;
+
+ if (iter->flags & RB_ITER_FL_SNAP)
+ return buffer->snap_buffers[cpu];
+ else
+ return buffer->buffers[cpu];
+}
+
+static void
+ring_buffer_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ event = cpu_buffer->pages[cpu_buffer->head_page] + cpu_buffer->head;
+
+ if (ring_buffer_short_event(buffer, cpu_buffer->head) ||
+ ring_buffer_null_event(event)) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ cpu_buffer->head = 0;
+ return;
+ }
+
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + event->length > cpu_buffer->tail));
+
+ cpu_buffer->head += event->length;
+ if (ring_buffer_short_event(buffer, cpu_buffer->head)) {
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ cpu_buffer->head = 0;
+ return;
+ }
+
+ /* check for end of page padding */
+ event = cpu_buffer->pages[cpu_buffer->head_page] + cpu_buffer->head;
+ if ((ring_buffer_short_event(buffer, cpu_buffer->head) ||
+ ring_buffer_null_event(event)) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_head(cpu_buffer);
+}
+
+static void
+ring_buffer_advance_iter(struct ring_buffer_iter *iter, int cpu)
+{
+ struct ring_buffer *buffer = iter->buffer;
+ struct ring_buffer_iter_per_cpu *cpu_iter = &iter->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ cpu_buffer = iter_choose_buffer(iter, cpu);
+ event = cpu_buffer->pages[cpu_iter->head_page] + cpu_iter->head;
+
+ if (ring_buffer_short_event(buffer, cpu_iter->head) ||
+ ring_buffer_null_event(event)) {
+ BUG_ON(cpu_iter->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(buffer, &cpu_iter->head_page);
+ cpu_iter->head = 0;
+ return;
+ }
+
+ BUG_ON((cpu_iter->head_page == cpu_buffer->tail_page) &&
+ (cpu_iter->head + event->length > cpu_buffer->tail));
+
+ cpu_iter->head += event->length;
+ if (ring_buffer_short_event(buffer, cpu_iter->head)) {
+ ring_buffer_inc_page(buffer, &cpu_iter->head_page);
+ cpu_iter->head = 0;
+ return;
+ }
+
+ /* check for end of page padding */
+ event = cpu_buffer->pages[cpu_iter->head_page] + cpu_iter->head;
+ if ((ring_buffer_short_event(buffer, cpu_iter->head) ||
+ ring_buffer_null_event(event)) &&
+ (cpu_iter->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_iter(iter, cpu);
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event, *next_event = NULL;
+ int cpu, next_cpu = -1;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ continue;
+
+ event = cpu_buffer->pages[cpu_buffer->head_page] +
+ cpu_buffer->head;
+ if (unlikely(ring_buffer_short_event(buffer, cpu_buffer->head) ||
+ ring_buffer_null_event(event))) {
+ if (cpu_buffer->head_page == cpu_buffer->tail_page)
+ continue;
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ cpu_buffer->head = 0;
+
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ continue;
+ event = cpu_buffer->pages[cpu_buffer->head_page] +
+ cpu_buffer->head;
+ }
+
+ if (!next_event || event->counter < next_event->counter) {
+ next_cpu = cpu;
+ next_event = event;
+ }
+ }
+
+ if (!next_event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[next_cpu];
+ ring_buffer_advance_head(cpu_buffer);
+ cpu_buffer->entries--;
+
+ return next_event;
+}
+
+/**
+ * ring_buffer_start - start a non consuming read of the buffer

One might think from the name it starts recording events or something
similarly daft..

+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * The iter_flags of RB_ITER_FL_SNAP will read the snapshot image
+ * and not the main buffer.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_start(struct ring_buffer *buffer, unsigned iter_flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+ unsigned long flags;
+ int cpu;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->buffer = buffer;
+ iter->flags = iter_flags;
+
+ WARN_ON((iter_flags & RB_ITER_FL_SNAP) &&
+ !(buffer->flags & RB_FL_SNAPSHOT));
+
+ atomic_inc(&buffer->record_disabled);
+
+ ring_buffer_lock(buffer, &flags);
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = iter_choose_buffer(iter, cpu);
+ iter->buffers[cpu].head = cpu_buffer->head;
+ iter->buffers[cpu].head_page = cpu_buffer->head_page;
+ }
+ ring_buffer_unlock(buffer, flags);
+
+ iter->next_cpu = -1;
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer = iter->buffer;
+
+ atomic_dec(&buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read

I don't think we actually need this, the iterator could consume and keep
it if its not the least valued one.

+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer_iter *iter, int *iter_next_cpu)
+{
+ struct ring_buffer *buffer = iter->buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter_per_cpu *cpu_iter;
+ struct ring_buffer_event *event, *next_event = NULL;
+ int cpu, next_cpu = -1;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = iter_choose_buffer(iter, cpu);
+ cpu_iter = &iter->buffers[cpu];
+
+ if (ring_buffer_iter_cpu_empty(cpu_iter, cpu_buffer))
+ continue;
+
+ event = cpu_buffer->pages[cpu_iter->head_page] + cpu_iter->head;
+
+ if (ring_buffer_short_event(buffer, cpu_iter->head) ||
+ ring_buffer_null_event(event)) {
+ if (cpu_iter->head_page == cpu_buffer->tail_page)
+ continue;
+ ring_buffer_inc_page(buffer, &cpu_iter->head_page);
+ cpu_iter->head = 0;
+
+ if (ring_buffer_iter_cpu_empty(cpu_iter, cpu_buffer))
+ continue;
+
+ event = cpu_buffer->pages[cpu_iter->head_page]
+ + cpu_iter->head;
+ }
+
+ if (!next_event || event->counter < next_event->counter) {
+ next_cpu = cpu;
+ next_event = event;
+ }
+ }
+
+ if (!next_event)
+ return NULL;
+
+ if (iter_next_cpu)
+ *iter_next_cpu = next_cpu;
+
+ return next_event;
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, int *iter_next_cpu)
+{
+ struct ring_buffer_event *event;
+ int next_cpu;
+
+ event = ring_buffer_peek(iter, &next_cpu);
+ if (!event)
+ return NULL;
+
+ ring_buffer_advance_iter(iter, next_cpu);
+
+ if (iter_next_cpu)
+ *iter_next_cpu = next_cpu;
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return buffer->page_size * buffer->pages;
+}
+
+/**
+ * ring_buffer_rename - rename the ring buffer

Do we actually need this?

+ */
+int ring_buffer_rename(struct ring_buffer *buffer, char *new_name, ...)
+{
+ va_list args;
+ int ret;
+
+ mutex_lock(&buffer_mutex);
+
+ va_start(args, new_name);
+ vsnprintf(buffer->name, MAX_NAME_SIZE, new_name, args);
+ va_end(args);
+
+ buffer->dir_ent = debugfs_rename(buffer_dent, buffer->dir_ent,
+ buffer_dent, buffer->name);
+ if (!buffer->dir_ent) {
+ WARN_ON(1);
+ ret = -EBUSY;
+ }
+
+ mutex_unlock(&buffer_mutex);
+
+ return ret;
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer, int cpu)
+{
+ cpu_buffer->head_page = cpu_buffer->tail_page = 0;
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ __ring_buffer_reset_cpu(cpu_buffer, cpu);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!ring_buffer_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ cpu_buffer = buffer->buffers[cpu];
+ return ring_buffer_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_snapshot_cpu - take a snapshot of a current ring buffer cpu buffer
+ *
+ * A snapshot of the per CPU buffer is saved and the main buffer is
+ * replaced. This allows live traces to have a snap shot taken.
+ * This is very effective when needing to take maximums and still record
+ * new traces.
+ */
+void ring_buffer_snapshot_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ cpu_buffer = buffer->buffers[cpu];
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ __ring_buffer_reset_cpu(buffer->snap_buffers[cpu], cpu);
+ buffer->buffers[cpu] = buffer->snap_buffers[cpu];
+ buffer->snap_buffers[cpu] = cpu_buffer;
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_snapshot_one_cpu - take a snapshot of cpu and zero the rest

You actually made me look for the difference between this and the last
function,.. I don't think we really need this one.

+ *
+ * A snapshot of the per CPU buffer is saved and the main buffer is
+ * replaced. This allows live traces to have a snap shot taken.
+ * This is very effective when needing to take maximums and still record
+ * new traces.
+ *
+ * This function will not only snapshot a particular CPU buffer, but it
+ * will also zero the others. This facilitates reading the snapshot buffer
+ * if only one buffer is of interest.
+ */
+void ring_buffer_snapshot_one_cpu(struct ring_buffer *buffer, int snap_cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int cpu;
+
+ raw_local_irq_save(flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ __ring_buffer_reset_cpu(buffer->snap_buffers[cpu], cpu);
+ if (cpu == snap_cpu) {
+ buffer->buffers[cpu] = buffer->snap_buffers[cpu];
+ buffer->snap_buffers[cpu] = cpu_buffer;
+ }
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_snapshot - take a snapshot of the ring buffer
+ *
+ * A snapshot of the entire ring buffer is saved, and can be
+ * retrieved later, even when we currently have a live trace
+ * recording.
+ */
+void ring_buffer_snapshot(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long flags;
+ int cpu;
+
+ raw_local_irq_save(flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ __ring_buffer_reset_cpu(buffer->snap_buffers[cpu], cpu);
+ buffer->buffers[cpu] = buffer->snap_buffers[cpu];
+ buffer->snap_buffers[cpu] = cpu_buffer;
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ raw_local_irq_restore(flags);
+}

Fancy output stuff below,. skipped that.. ;-)

+struct ring_buffer_seq *
+ring_buffer_seq_alloc(gfp_t flags)
+{
+ struct ring_buffer_seq *s;
+
+ s = kzalloc(sizeof(*s), flags);
+ return s;
+}
+
+void ring_buffer_seq_free(struct ring_buffer_seq *s)
+{
+ kfree(s);
+}
+
+unsigned ring_buffer_seq_length(struct ring_buffer_seq *seq)
+{
+ return seq->len;
+}
+
+void ring_buffer_seq_set_length(struct ring_buffer_seq *seq, unsigned len)
+{
+ BUG_ON(len > PAGE_SIZE);
+ seq->len = len;
+}
+
+/**
+ * ring_buffer_seq_printf - sequence printing of buffer information
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * ring_buffer_seq_printf is used to store strings into a special
+ * the sequencer or pulled into another buffer.
+ */
+int
+ring_buffer_seq_printf(struct ring_buffer_seq *s, const char *fmt, ...)
+{
+ int len = (PAGE_SIZE - 1) - s->len;
+ va_list ap;
+ int ret;
+
+ if (!len)
+ return 0;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ /* If we can't write it all, don't bother writing anything */
+ if (ret >= len)
+ return 0;
+
+ s->len += ret;
+
+ return len;
+}
+
+/**
+ * ring_buffer_seq_puts - buffer sequence printing of simple string
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * or other mechanism.
+ */
+int
+ring_buffer_seq_puts(struct ring_buffer_seq *s, const char *str)
+{
+ int len = strlen(str);
+
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+
+int
+ring_buffer_seq_putc(struct ring_buffer_seq *s, unsigned char c)
+{
+ if (s->len >= (PAGE_SIZE - 1))
+ return 0;
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+
+int
+ring_buffer_seq_putmem(struct ring_buffer_seq *s, void *mem, size_t len)
+{
+ if (len > ((PAGE_SIZE - 1) - s->len))
+ return 0;
+
+ memcpy(s->buffer + s->len, mem, len);
+ s->len += len;
+
+ return len;
+}
+
+#define HEX_CHARS 17
+static const char hex2asc[] = "0123456789abcdef";
+
+int
+ring_buffer_seq_putmem_hex(struct ring_buffer_seq *s, void *mem, size_t len)
+{
+ unsigned char hex[HEX_CHARS];
+ unsigned char *data = mem;
+ unsigned char byte;
+ int i, j;
+
+ BUG_ON(len >= HEX_CHARS);
+
+#ifdef __BIG_ENDIAN
+ for (i = 0, j = 0; i < len; i++) {
+#else
+ for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+ byte = data[i];
+
+ hex[j++] = hex2asc[byte & 0x0f];
+ hex[j++] = hex2asc[byte >> 4];
+ }
+ hex[j++] = ' ';
+
+ return ring_buffer_seq_putmem(s, hex, j);
+}
+
+void
+ring_buffer_seq_reset(struct ring_buffer_seq *s)
+{
+ s->len = 0;
+ s->readpos = 0;
+}
+
+ssize_t
+ring_buffer_seq_copy_to_user(struct ring_buffer_seq *s,
+ char __user *ubuf, size_t cnt)
+{
+ int len;
+ int ret;
+
+ if (s->len <= s->readpos)
+ return -EBUSY;
+
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
+ if (ret)
+ return -EFAULT;
+
+ s->readpos += len;
+
+ if (s->readpos >= s->len)
+ ring_buffer_seq_reset(s);
+
+ return cnt;
+}
+
+int
+ring_buffer_seq_copy_to_mem(struct ring_buffer_seq *s,
+ void *mem, int cnt)
+{
+ int len;
+
+ if (s->len <= s->readpos)
+ return -EBUSY;
+
+ len = s->len - s->readpos;
+ if (cnt > len)
+ cnt = len;
+ memcpy(mem, s->buffer + s->readpos, cnt);
+
+ s->readpos += len;
+
+ if (s->readpos >= s->len)
+ ring_buffer_seq_reset(s);
+
+ return cnt;
+}
+
+int
+ring_buffer_seq_to_seqfile(struct seq_file *m, struct ring_buffer_seq *s)
+{
+ int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+ int ret;
+
+ s->buffer[len] = 0;
+ ret = seq_puts(m, s->buffer);
+ if (ret)
+ ring_buffer_seq_reset(s);
+ return ret;
+}
Index: linux-compile.git/kernel/trace/Kconfig
===================================================================
--- linux-compile.git.orig/kernel/trace/Kconfig 2008-07-27 09:26:34.000000000 -0400
+++ linux-compile.git/kernel/trace/Kconfig 2008-09-22 11:47:29.000000000 -0400
@@ -15,6 +15,10 @@ config TRACING
select DEBUG_FS
select STACKTRACE
+config RING_BUFFER
+ bool "ring buffer"
+ select DEBUG_FS
+
config FTRACE
bool "Kernel Function Tracer"
depends on HAVE_FTRACE
Index: linux-compile.git/kernel/trace/Makefile
===================================================================
--- linux-compile.git.orig/kernel/trace/Makefile 2008-07-27 09:26:34.000000000 -0400
+++ linux-compile.git/kernel/trace/Makefile 2008-09-22 11:46:46.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif
obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 15:50:09 UTC

plain text document attachment (ring-buffer.patch)
RFC RFC RFC RFC RFC!!!!

Plenty comments, things I like, things I don't like, specifics below ;-)

Thanks!

Don't like that max_event_size param.

Actually, that is there so that a user reading the buffer could allocate
a buffer and read all the events. It also allows for greater than 1 page
size events to be recorded, which it currently does not do (the
max_event_size is currently ignored and set for you).

Post by Peter Zijlstra
Don't like that print_func param.

At the meeting we talked about having a way to do pretty print from the
debugfs. But you are free to pass in a NULL. I did with the ftrace work.

Post by Peter Zijlstra
Maybe like the name param.

That gives you a way to see what buffers are allocated in
/debugfs/tracing/buffers/<name>

event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

This can, in generic, not work. Due to the simple fact that we might
straddle a page boundary. Therefore I think its best to limit our self
to the write interface below, so that it can handle that.

The code does not allow straddling of pages. Actually this is not much
different than what ftrace does today. If the new entry straddles a page,
we mark the rest of the page as "not used" and start on the next page.
This is actually nicer than straddling pages, which is what logdev does,
and one of the things that makes logdev slow.

Otherwise, we always need to copy twice. Once into the tracer sturcture
and than again into the buffer.

Having implemented both concepts, I much prefer this one. It also allows
us to improve the buffering in the future.

ring_buffer_write(buffer, event_id, length, data);

Don't like the event_id, just stick to plain binary data, and leave
interpretation open to whoemever uses it.

This is up to debate. I know you don't like this extra event layer,
but seriously, all my uses with ring buffers has had some kind of event.
But then I'm sure you can argue that if you are using a single type you
can can the event.

I'm open to doing this, but I would like a consensus on this.

Martin, Thomas, Ingo, Linus?

event = ring_buffer_consume(buffer);

By the above, this would have to be per-cpu as you cannot interpret the
actual binary data, and this cannot do the fwd-merge-sort-iter thing.

Why not? It does ;-)

Why?

Because if we do not, then it is a lot more work to keep the iterator
knowing about overruns, and becomes a pain in the ass. I ended up in
ftrace trying hard to disable tracing when we are reading the trace. If we
do not, then the output always becomes corrupted. That is, the end half
of the trace does not match the beginning.

For static reads (non consuming), in my pass experience, has become the
preferred method. Note that the consuming read does not have this
limitation, because a consuming read is usually done with in flight
tracing. A static read is usually done (for best results) from running a
trace and looking at the results at a later time.

Post by Peter Zijlstra
---
struct ringbuffer *ringbuffer_alloc(unsigned long size);
void ringbuffer_free(struct ringbuffer *rb);

Again this should be up to debate. I'm not that attached to one way or
another.

Post by Peter Zijlstra
/*
* disables preemption, cmpxchg reserves size on local cpu, memcpy
*/
int ringbuffer_write(struct ringbuffer *buffer, const void *buf, unsigned long size);

Again this forces a double copy when one should be enough. This is what
logdev currently does, but it has proved inefficient.

Post by Peter Zijlstra
/*
* has internal synchronization for read pos, returns error when
* overwritten - but resets state to next half-buffer.
*/
int ringbuffer_read(struct ringbuffer *buffer, int cpu, void *buf, unsigned long size);
This interface is enough to abstract the fact that our buffer
is non-linear and further assumes as little as possible.

So you want to push the merge sort into the tracer. This also means that
the tracer must be aware of per cpu data and also add the counter.

You may be the only one that wants this. But I'll let others speak for
themselves.

Post by Peter Zijlstra
/*
* mis-match.
*/
int ringbuffer_xchg_cpu(struct ringbuffer *src, struct ringbuffer *dst, int cpu);

I can see this getting a bit confusing.

Post by Peter Zijlstra
---
On top of that foundation build an eventbuffer, which knows about
encoding/decoding/printing events.
This too needs to be a flexible layer - as I suspect the google guys
will want their ultra-compressed events back.
I'm not quite sure yet how to model this layer most flexible without
being a nuisance.
So obviously its also this layer that has the whole debugfs interface,
but maybe we could even push that one layer up, so as to keep that
reusable with the print methods and be encoding invariant.

Since I didn't even need to use the debugfs layer for this buffering,
I have no problems in making that a separate layer. But separating out
the ring buffer from the events might be a bit hard.

I could work on something if others agree too, but I do not want to spend
too much time on it if this idea is quickly NACKed by others.

Please someone speak up on this.

lockdep keys cannot be in dynamic storage, also mainline raw_spinlock_t
doesn't have lockdep.

This was copied from ftrace. I didn't know what it did so I just copied
it ;-)

+ void **pages;

You used to link these using the pageframe, what happened to that?

It would have added another day to code it. Also I was playing with the
idea that the "pages" might actually be bigger than a single page. But
that could still be done with pages. The page handling can be very tricky.

Another thing about using the page struct link list is that it is very
difficult to find the buffer pages from crash. This is something that
needs to be solved before going back to that idea.

People (read SGI) prefer you dynamically allocate this array due to them
wanting distros to be able to set NR_CPUS=insane. Same goes for all
NR_CPUS usage below.

I thought I had a comment somewhere that said /* FIXME: */. Yes I hate
this too, but it was the easiest thing to write up in this patch. But
this is just implementation, it does not change the basic ideas.

Why not dynamically allocate the page?

Well, the struct is. Yeah, I can probably make this cleaner, but
this was easier. ;-)

Yeah, we should ask for mathieu's event stamp counter. Non of the clocks
we currently have suffice for this goal.

Ah, you see the "/* FIXME!!! */" here.

Ah, see, here you don't use this lockdep key ;-

again, copied from the ftrace code. This was not my doing ;-)

Like said, I rather liked using the pageframe to link these pages
together. The simple fact is that both read and write are fwd iterative
operations so you don't need the random access array.

as stated above, we need to fix the "crash" issue.

Right, like said above, I don't think you need the snapshot stuff in
here, if you provide this per cpu buffer xchg method.

If we go with your approach, I will probably keep this interface as is,
(perhaps rename it s/ring_buffer/trace_buffer/) and implement all this
on top of your ring buffer approach. Because this snapshot idea can really
clean up ftrace a lot.

Right, I think this sound die.

I actually use this to copy the event into a buffer while consuming it.
Once you consume it, it is gone.

I'm a bit mystified by the need to take an actual lock on the write path
- disabling preemption should be sufficient to keep up local, or are you
synchonizing against read as well?

Yeah, the read is syncronized here. Since all reads will merge sort.

Like said, I don't think we can use the reserve/commit interface due to
not having a linear buffer.

Without this, we must copy twice, and I really hate to do that.

I'm missing the loop over the page...

It doesn't happen. If the item crosses page bounders, we start on the
next page. Yes this may waste some space, but makes things much cleaner.
Again, logdev does the page bounder crossing, and it causes lots of
headaches.

This is where I said if you use an iterator, it will stop the tracing. The
spinlocks are there to synchronize with the disabling.

If you don't want this, just use the consumer.

Right - I dont think this should be part of the ringbuffer interface,
maybe on the event interface, preferable on the tracer layer.

I actually disagree here. It makes things so much easier, and is something
I wish ftrace had. I even have something similar with logdev.

enum ringbuffer_stats {
RB_ENTRIES,
RB_OVERFLOWS,
}
unsigned long ringbuffer_stat_cpu(struct ringbuffer *buffer, int cpu, enum ringbuffer_stats);

This is just an implementation detail. I have no pref. I just picked one,
because I found that I needed it in the last minute and quickly wrote one
up. Then found I needed the other, and cut and paste it.

The summing part of the statistics interface..
unsigned long ringbuffer_stat(struct ringbuffer *buffer, enum ringbuffer_stats);

again, whatever ;-)

One might think from the name it starts recording events or something
similarly daft..

How about "ring_buffer_read_start"?

I don't think we actually need this, the iterator could consume and keep
it if its not the least valued one.

I use this in ftrace. This is how we get the diff in times and do the
"!" annotation in the latency trace. This peek method was really simple
to implement, and as you can see, easy to reuse code.

Do we actually need this?

I did with ftrace. Since the names of the buffer is exposed to userspace,
the only way to resize the buffer (currently) was to create a new buffer.
To get back to the old name, I had to free the old buffer, and rename
the new one. I would have freed the old one first, but if the new one
fails to allocate, we can not fall back to the old one.

Thanks for the review Peter. Perhaps we can get someone else to look too.
Or at least comment on this discussion.

-- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 10:50:13 UTC

Post by Steven Rostedt
ring_buffer_write(buffer, event_id, length, data);

Don't like the event_id, just stick to plain binary data, and leave
interpretation open to whoemever uses it.

This is up to debate. I know you don't like this extra event layer,
but seriously, all my uses with ring buffers has had some kind of
event. But then I'm sure you can argue that if you are using a single
type you can can the event.
I'm open to doing this, but I would like a consensus on this.
Martin, Thomas, Ingo, Linus?

i'd prefer Peter's simplification and not pass event_id along. Since
static types are lost anyway (which is the biggest cost and risk of any
such abstraction), we have to convert between types early on. Whether
event_id is visible in the API is no big difference.

(It might be cheaper to not pass it along even if everyone ends up using
it - as it has no semantic meaning anyway.)

pretty much the only generic tracing information is time and payload
size. ( but even a time key is debatable - there are various resolutions
needed by different usecases. Some usecases are even fine without any
timestamps at all - they just want to know the ordering of events and
that's it.)

i'd like to see some automatic type protection though, as an
off-by-default debug option: encode the record type on storing and
double-check it on extraction. So it should be possible to reliably
store/restore a typed trace buffer and notice corruption early in
testing.

because there's one thing that is far more important tracer feature than
sheer performance: robustness.

Automated type checking in debug mode would also mean we could go for
RLE encoding much more agressively. Most of the risks of a more complex,
more compressed and pointer-laden data format come from type mismatches
and the loss of compiler protection against human errors/stupidity.
(running off the end of the page, misinterpreting a pointer, a record,
etc.)

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-24 15:50:12 UTC

Thanks for creating this so quickly ;-)

Post by Steven Rostedt
event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

This can, in generic, not work. Due to the simple fact that we might
straddle a page boundary. Therefore I think its best to limit our self
to the write interface below, so that it can handle that.

I'm not sure why this is any harder to deal with in write, than it is
in reserve? We should be able to make reserve handle this just
as well?

If you use write rather than reserve, you have to copy all the data
twice for every event.

Post by Peter Zijlstra
On top of that foundation build an eventbuffer, which knows about
encoding/decoding/printing events.
This too needs to be a flexible layer -

That would be nice. However, we need to keep at least the length
and timestamp fields common so we can do parsing and the mergesort?

+struct ring_buffer_event {
+ unsigned long long counter;
+ short type;
+ short length;
+ char body[];
+} __attribute__((__packed__))

So type would move into the body here?

Post by Peter Zijlstra
as I suspect the google guys
will want their ultra-compressed events back.

Is useful when gathering GB of data across 10,000 machines ;-)
Also reduces general overhead for everyone to keep events small.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Peter Zijlstra

2008-09-24 16:20:05 UTC

Post by Martin Bligh
Thanks for creating this so quickly ;-)

Post by Steven Rostedt
event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

This can, in generic, not work. Due to the simple fact that we might
straddle a page boundary. Therefore I think its best to limit our self
to the write interface below, so that it can handle that.

I'm not sure why this is any harder to deal with in write, than it is
in reserve? We should be able to make reserve handle this just
as well?

No, imagine the mentioned case where we're straddling a page boundary.

A----| |----B
^------|

So when we reserve we get a pointer into page A, but our reserve length
will run over into page B. A write() method will know how to check for
this and break up the memcpy to copy up-to the end of A and continue
into B.

You cannot expect the reserve/commit interface users to do this
correctly - it would also require one to expose too much internals,
you'd need to be able to locate page B for starters.

Post by Martin Bligh
If you use write rather than reserve, you have to copy all the data
twice for every event.

Well, once. I'm not seeing where the second copy comes from.

Post by Peter Zijlstra
On top of that foundation build an eventbuffer, which knows about
encoding/decoding/printing events.
This too needs to be a flexible layer -

That would be nice. However, we need to keep at least the length
and timestamp fields common so we can do parsing and the mergesort?

And here I was thinking you guys bit encoded the event id into the
timestamp delta :-)

Post by Martin Bligh
+struct ring_buffer_event {
+ unsigned long long counter;
+ short type;
+ short length;
+ char body[];
+} __attribute__((__packed__))
So type would move into the body here?

All of it would, basically I have no notion of an event in the
ringbuffer API. You write $something and your read routine would need to
be smart enough to figure it out.

The trivial case is a fixed size entry, in which case you always know
how much to read. A slightly more involved but still easy to understand
example might be a 7bit encoding and using the 8th bit for continuation.

Another option is to start out with a fixed sized header that contains a
length field.

But the raw ringbuffer layer, the one concerned with fiddling the pages
and writing/reading thereto need not be aware of anything else.

Post by Peter Zijlstra
as I suspect the google guys
will want their ultra-compressed events back.

Is useful when gathering GB of data across 10,000 machines ;-)
Also reduces general overhead for everyone to keep events small.

Exactly - which is why a flexible encoding layer makes sense to me -
aside from the abstraction itself.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 16:30:23 UTC

Post by Martin Bligh
Thanks for creating this so quickly ;-)

Post by Steven Rostedt
event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

This can, in generic, not work. Due to the simple fact that we might
straddle a page boundary. Therefore I think its best to limit our self
to the write interface below, so that it can handle that.

I'm not sure why this is any harder to deal with in write, than it is
in reserve? We should be able to make reserve handle this just
as well?

No, imagine the mentioned case where we're straddling a page boundary.
A----| |----B
^------|

This would not happen. The ring buffer reserve routine will take care of
it. If the requested length will straddle a page, I add a "nop" event
into the end of A, and give the user a pointer starting at B.

A----| |-----B
^^ ^
|nop| +---- record.

Post by Peter Zijlstra
So when we reserve we get a pointer into page A, but our reserve length
will run over into page B. A write() method will know how to check for
this and break up the memcpy to copy up-to the end of A and continue
into B.
You cannot expect the reserve/commit interface users to do this
correctly - it would also require one to expose too much internals,
you'd need to be able to locate page B for starters.

Post by Martin Bligh
If you use write rather than reserve, you have to copy all the data
twice for every event.

Well, once. I'm not seeing where the second copy comes from.

I'll give you the ftrace example. In ftrace we record the task pid,
preempt count, and interrupt flags. The reserve commit way is this.

event = ring_buffer_reserve(buffer, sizeof(*event));
event->pid = current->pid;
event->pc = preempt_count();
event->flags = flags;
ring_buffer_commit(buffer, event);

Done! One copy of all this data directly into the buffer. But if we use
the write method that you propose:

struct event event;

event.pid = current->pid;
event.pc = preempt_count();
event.flags = flags;
ring_buffer_write(buffer, &event, sizeof(event));

One copy into event has been done, but we are not done yet. Inside the
write we need to do a...

memcopy(buffer->buf, data, size);

There's the second copy. No way around it.

Post by Peter Zijlstra
On top of that foundation build an eventbuffer, which knows about
encoding/decoding/printing events.
This too needs to be a flexible layer -

That would be nice. However, we need to keep at least the length
and timestamp fields common so we can do parsing and the mergesort?

And here I was thinking you guys bit encoded the event id into the
timestamp delta :-)

Post by Martin Bligh
+struct ring_buffer_event {
+ unsigned long long counter;
+ short type;
+ short length;
+ char body[];
+} __attribute__((__packed__))
So type would move into the body here?

All of it would, basically I have no notion of an event in the
ringbuffer API. You write $something and your read routine would need to
be smart enough to figure it out.

Logdev has an internal buffer that does exactly what you are proposing.
I had to give it some minor smarts to improve preformance, and stability.
If you don't even have the length then it is over when you wrap. The
There's no way to know where the next record starts. It gets ugly with
callbacks.

Giving the ring buffer at least the length as a minimum, is what is
needed. And as I have that nop id as well, I would need to put that
either into a type field or counter (I currently use both for these
patches).

Post by Peter Zijlstra
The trivial case is a fixed size entry, in which case you always know
how much to read. A slightly more involved but still easy to understand
example might be a 7bit encoding and using the 8th bit for continuation.

ftrace is proving to be a pain with this regard.

Post by Peter Zijlstra
Another option is to start out with a fixed sized header that contains a
length field.

As a minimum, sure.

Post by Peter Zijlstra
But the raw ringbuffer layer, the one concerned with fiddling the pages
and writing/reading thereto need not be aware of anything else.

As I have learned from both ftrace and logdev, giving the ring buffer a
little intelligence, has paid off a lot. I will say, I have not hit that
minimum with my patches. But I will argue that you are going beneath that
threshold.

Post by Peter Zijlstra
as I suspect the google guys
will want their ultra-compressed events back.

Is useful when gathering GB of data across 10,000 machines ;-)
Also reduces general overhead for everyone to keep events small.

Exactly - which is why a flexible encoding layer makes sense to me -
aside from the abstraction itself.

But being too minimal and flexible, will actually be counter productive.
That is, you will not be able to enhance it due to the lack of abilities.

That is, the top layers will now be limited.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-24 16:40:11 UTC

I would suggest just not allowing page straddling.

Yeah, it would limit event size to less than a page, but seriously, do
people really want more than that? If you have huge events, I suspect it
would be a hell of a lot better to support some kind of indirection
scheme than to force the ring buffer to handle insane cases.

Most people will want the events to be as _small_ as humanly possible. The
normal event size should hopefully be in the 8-16 bytes, and I think the
RFC patch is already broken because it allocates that insane 64-bit event
counter for things. Who the hell wants a 64-bit event counter that much?
That's broken.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 16:40:14 UTC

I would suggest just not allowing page straddling.
Yeah, it would limit event size to less than a page, but seriously, do
people really want more than that? If you have huge events, I suspect it
would be a hell of a lot better to support some kind of indirection
scheme than to force the ring buffer to handle insane cases.
Most people will want the events to be as _small_ as humanly possible. The
normal event size should hopefully be in the 8-16 bytes, and I think the
RFC patch is already broken because it allocates that insane 64-bit event
counter for things. Who the hell wants a 64-bit event counter that much?
That's broken.

The event counter is just the timestamp (quick patch, simple to fix). The
term "counter" was bad. It should have been timestamp, which one would
want a 64bit timestamp. Or at least a way to figure it out. Yes, we can
store a special event called "timestamp" and have a smaller counter. But
for simplicity, the 64 bit was easy. The event id was just 16 bits, which
I think is way more than enough.

The current method has a 16 bit length as well, and prevents crossing of
page boundaries.

Other than that, I would love to have you review more of this patch.

Note, I plan on hacking the "max_event_size", and just have that be the
standard "PAGE_SIZE". If you need a preallocated storage to store events,
one could just use PAGE_SIZE and fit any event they want into it.

Thanks,

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-24 17:00:24 UTC

Post by Linus Torvalds
Most people will want the events to be as _small_ as humanly possible. The
normal event size should hopefully be in the 8-16 bytes, and I think the
RFC patch is already broken because it allocates that insane 64-bit event
counter for things. Who the hell wants a 64-bit event counter that much?
That's broken.

Yup, is just a confusing name. we can definitely make this a smaller field
by doing an offset time from the last event, but we agreed on 64 bits to
keep version 1 simple ;-)

I think in retrospect the timestamp events we used with wall time stuck
in them were a mistake, as NTP will make them difficult. We should have
just recorded wall time at the start of the buffer, and done offsets from
there.

Without relayfs subbuffers, the offset thing gets trickier, as you'd have
to update the "start time" constantly once you'd filled the buffer and
were shifting the start pointer. OTOH, I guess it's only 1 cacheline.

M.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-24 17:40:08 UTC

Post by Steven Rostedt
The event counter is just the timestamp (quick patch, simple to fix). The
term "counter" was bad. It should have been timestamp, which one would
want a 64bit timestamp.

One definitely would _not_ want the full 64-bit timestamp.

There are two cases:

- lots and lots of events

just do a 32 bit "timestamp delta" to the previous packet (where the
first packet in the queue would be a delta from a value in the queue
header - and we can obviously make that value be the TSC so that the
first delta is always zero, but it may also make sense to make it a
real delta, and the queue header would contain some good
synchronization point value).

- occasional events

Oops. The delta wouldn't fit. So create a new "timestamp update" packet
with a 64-bit thing when doing the reservation. There's obviously no
cost issue (since this would only happen for things where there was a
multi-second delay - or at least an appreciable fraction of a delay -
between events)

This definitely is worth doing. If we have small trace objects (and many
things really do have pretty small traces), using just a 32-bit TSC not
only saves 4 bytes per trace event, but it makes it quite reasonable to
keep the trace data 4-byte-aligned rather than requiring 8-byte alignment.

Of course, if the traces end up being horribly bloated, none of that will
matter. But I really would hope that you we keep the header itself to just
8 bytes (and being 2 4-byte entities), so that small payloads are
reasonable. And that looks doable, if you have a 16-bit "type" and a
16-bit "size" field.

One thing I'd like to do is to also architecturally reserve a few of the
types for internal queue management stuff. Things like "padding" objects
(or a "end-of-ringbuffer" object), and the TSC overflow object, and a
"time sync" object (or heartbeat). So maybe the type would have the high
bit set as a "reserved for internal ringbuffer use" or whatever.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-24 18:10:08 UTC

I would suggest just not allowing page straddling.
Yeah, it would limit event size to less than a page, but seriously, do
people really want more than that? If you have huge events, I suspect it
would be a hell of a lot better to support some kind of indirection
scheme than to force the ring buffer to handle insane cases.
Most people will want the events to be as _small_ as humanly possible. The
normal event size should hopefully be in the 8-16 bytes, and I think the
RFC patch is already broken because it allocates that insane 64-bit event
counter for things. Who the hell wants a 64-bit event counter that much?
That's broken.
Linus

Hi Linus,

I agree that the standard "high event rate" use-case, when events are as
small as possible, would fit perfectly in 4kB sub-subbfers. However,
I see a few use-cases where having the ability to write across page
boundaries would be useful. Those will likely be low event-rate
situations where it is useful to take a bigger snapshot of a problematic
condition, but still to have it synchronized with the rest of the trace
data. e.g. :

- Writing a whole video frame into the trace upon video card glitch.
- Writing a jumbo frame (up to 9000 bytes) into the buffer when a
network card error is detected or when some iptables rules (LOG, TRACE
?) are reached.
- Dumping a kernel stack (potentially 8KB) in a single event when a
kernel OOPS is reached.
- Dumping a userspace process stack into the trace upon SIGILL, SIGSEGV
and friends.

That's only what I come up with from the top of my head, and I am sure
we'll find very ingenious users who will find plenty of other use-cases
where 4kB events won't be enough.

(It reminds me of someone saying "640K ought to be enough for anybody.")
;-)

If the write abstraction supports page straddling, I think it would be a
huge gain in simplicity for such users because they would not have to
break their payload in various events and have to create another event
layer on top of all that which would identify events uniquely with a
"cookie" or to protect writing events into the buffers with another
layer of locking.

Besides, there are other memory backends where the buffers can be put
that do not depend on the page size, namely video card memory. It can be
very useful to collect data that survives reboots. Given that this
memory will likely consist of contiguous pages, I see no need to limit
the maximum event size to a page on such support. Therefore, I think the
support for page-crossing should be placed in the "write" abstraction
(which would be specific to the type of memory used to back the
buffers) rather that the reserve/commit layer (which can simply
do reserve/commit in terms of offset from the buffer start, without
having to know the gory buffer implementation details (e.g. : array of
pages, linear mapping at boot time, video card memory...).

So, given the relative simplicity of doing a write() abstraction layer
which would deal with page-crossing writes compared to the complexity
that users would have to deal with when splitting up their large events,
I would recommend to abstract page straddling when writing to a page
array.

I think having the ability to break the buffers into sub-buffers is
still required, because it's good to have the ability to seek quickly in
such data, and the way to do this is to separate the buffer in
fixed-size sub-buffers which each contains many variable-sized events.
But I would recommend to make the sub-buffer size configurable by the
tracers so we can support events bigger than 4kB when needed.

Mathieu

--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-24 21:00:17 UTC

[...] Those will likely be low event-rate
situations where it is useful to take a bigger snapshot of a problematic
condition, but still to have it synchronized with the rest of the trace
- Writing a whole video frame into the trace upon video card glitch.
- Writing a jumbo frame (up to 9000 bytes) into the buffer when a
network card error is detected or when some iptables rules (LOG, TRACE
?) are reached.
- Dumping a kernel stack (potentially 8KB) in a single event when a
kernel OOPS is reached.
- Dumping a userspace process stack into the trace upon SIGILL, SIGSEGV
and friends.

But these are _all_ things that would be much better off with a "allocate
a separate buffer, and just add a pointer to the trace".

Why? If for no other reason than the fact that we don't even want to spend
lots of time to (atomically) have to copy the big data into the trace
buffer!

Just allocate the buffer and fill it in (maybe it's pre-allocated already,
like when a network packet event happens!) and do all of that
independently of the low-level trace code. And then add the trace with the
pointer.

We want the low-level trace code to be useful for things like interrupt
events etc, which makes it a _disaster_ to try to add huge buffers
directly to the ring buffer. You also don't want to allocate a
multi-megabyte ring buffer for some odd case that happens rarely, when you
can allocate the big memory users dynamically.

So limiting a trace entry to 4kB does not mean that you can't add more
than 4kB to the trace - it just means that you need to have a "data
indirection" trace type. Nothing more, nothing less.

[ And btw - you'd need that *anyway* for other reasons. You also don't
want to have any length fields have to be 32-bit lengths etc - the
length field of the trace buffer entry should be something really small
like 8 or 16 bits, or even be implicit in the type for some basic event
types, so that a trace event doesn't necessarily waste any bits at ALL
on the length field ]

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-24 17:00:16 UTC

Post by Martin Bligh
I'm not sure why this is any harder to deal with in write, than it is
in reserve? We should be able to make reserve handle this just
as well?

No, imagine the mentioned case where we're straddling a page boundary.
A----| |----B
^------|
So when we reserve we get a pointer into page A, but our reserve length
will run over into page B. A write() method will know how to check for
this and break up the memcpy to copy up-to the end of A and continue
into B.
You cannot expect the reserve/commit interface users to do this
correctly - it would also require one to expose too much internals,
you'd need to be able to locate page B for starters.

Can't the reserve interface just put a padding event into page A,
or otherwise mark it, and return the start of page B?

Post by Martin Bligh
If you use write rather than reserve, you have to copy all the data
twice for every event.

Well, once. I'm not seeing where the second copy comes from.

Depends how you count ;-) One more time than you would have to
with reserve - the temporarily packed structure doesn't exist.

Post by Peter Zijlstra
On top of that foundation build an eventbuffer, which knows about
encoding/decoding/printing events.
This too needs to be a flexible layer -

That would be nice. However, we need to keep at least the length
and timestamp fields common so we can do parsing and the mergesort?

And here I was thinking you guys bit encoded the event id into the
timestamp delta :-)

+/* header plus 32-bits of event data */
+struct ktrace_entry {
+ u32 event_type:5, tsc_shifted:27;
+ u32 data;
+};

was our basic data type. So ... sort of ;-)

Post by Martin Bligh
So type would move into the body here?

All of it would, basically I have no notion of an event in the
ringbuffer API. You write $something and your read routine would need to
be smart enough to figure it out.

If you don't have timestamps, you need domain-specific context to merge
the per-cpu buffers back together. As long as these are common format
amongst all the event-level alternatives, I guess it doesn't matter.

Post by Peter Zijlstra
Another option is to start out with a fixed sized header that contains a
length field.

That's what we discussed at KS/plumbers, and seems like the simplest
option by far to start with.

Post by Peter Zijlstra
But the raw ringbuffer layer, the one concerned with fiddling the pages
and writing/reading thereto need not be aware of anything else.

When you loop around the ringbuffer, you need to shift the starting "read"
pointer up to the next event, don't you? How do you do that to start on
a whole event without knowing the event size?

Post by Peter Zijlstra
Exactly - which is why a flexible encoding layer makes sense to me -
aside from the abstraction itself.

I like the abstraction, yes ;-) Just not convinced how much we can put in it.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-24 17:40:10 UTC

Post by Martin Bligh
Can't the reserve interface just put a padding event into page A,
or otherwise mark it, and return the start of page B?

Yes, I think having a "padding" entry that just gets skipped on read would
simplify things. Use that to fill up the end of the page.

Post by Peter Zijlstra
And here I was thinking you guys bit encoded the event id into the
timestamp delta :-)

+/* header plus 32-bits of event data */
+struct ktrace_entry {
+ u32 event_type:5, tsc_shifted:27;
+ u32 data;
+};
was our basic data type. So ... sort of ;-)

Why "tsc_shifted"?

I think 27 bits is probably fine, but not by removing precision. Instead
of shifting it so it will fit (and dropping low bits as uninteresting), do
it by encoding it as a delta against the previous thing. 27 bits would
still be sufficient for any high-performance thing that has tons and tons
of packets, and if you only have a few trace events you can afford to have
the "TSC overflow" event type (and if you want it that dense, you could
just make 'data' be the high bits, for a total of 59 bits rather than 64
bits of TSC.

59 bits of cycle counters is perfectly fine unless you are talking trace
events over a year or so (I didn't do the math, but let's assume a 4GHz
TSC as a reasonable thing even going forward - even _if_ CPU's get faster
than that, the TSC is unlikely to tick faster since it's just not worth it
from a power standpoint).

Ok, I did the math. 1<<27 seconds (assuming the low 32 bits are just
fractions) is something like 4+ years. I _really_ don't think we need more
than that (or even close to that) in TSC timestamps for tracing within one
single buffer.

Once you go to the next ring buffer, you'd get a new time-base anyway.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-24 18:00:10 UTC

Post by Linus Torvalds
Why "tsc_shifted"?
I think 27 bits is probably fine, but not by removing precision. Instead
of shifting it so it will fit (and dropping low bits as uninteresting), do
it by encoding it as a delta against the previous thing. 27 bits would
still be sufficient for any high-performance thing that has tons and tons
of packets, and if you only have a few trace events you can afford to have
the "TSC overflow" event type (and if you want it that dense, you could
just make 'data' be the high bits, for a total of 59 bits rather than 64
bits of TSC.
59 bits of cycle counters is perfectly fine unless you are talking trace
events over a year or so (I didn't do the math, but let's assume a 4GHz
TSC as a reasonable thing even going forward - even _if_ CPU's get faster
than that, the TSC is unlikely to tick faster since it's just not worth it
from a power standpoint).
Ok, I did the math. 1<<27 seconds (assuming the low 32 bits are just
fractions) is something like 4+ years. I _really_ don't think we need more
than that (or even close to that) in TSC timestamps for tracing within one
single buffer.

Mmm. Either I'm confused, or we're talking about different things.

If we just record the TSC unshifted, in 27 bits, at 4GHz, that gives us
about 1/30 of a second? So we either shift, use > 27 bits, or record
at least 30 events a second, none of which I like much ...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-24 18:10:09 UTC

Post by Martin Bligh
If we just record the TSC unshifted, in 27 bits, at 4GHz, that gives us
about 1/30 of a second? So we either shift, use > 27 bits, or record
at least 30 events a second, none of which I like much ...

If we use 32 bits instead of 27, then the timestamp events are only
about once per second, which is probably fine for overhead ... ?

I think we're OK losing 5 bits of precision, that's only 32 cycles,
given all the CPU reordering stuff we've talked about here,
not-quite-synced TSCs, etc. I suspect you thought we were
shifting by much more than this, in reality it was 5-10 bits,
with timestamp events inbetween, though we put wall time in
them, which I think was a mistake.

(note: I'm not suggesting we have to use this compact a format,
at least by default)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-24 20:50:14 UTC

Post by Martin Bligh
If we use 32 bits instead of 27, then the timestamp events are only
about once per second, which is probably fine for overhead ... ?

You'd have them ONCE IN A BLUE MOON.

If there is nothing going on, you don't need the timestamps at all.

Yes, some people will _want_ a heartbeat, of course, and there might be
serialization points where we want to serialize some external clock to the
TSC, but that's a separate issue from generating a full TSC value. We may
well decide that once a second we want a whole packet with TSC _and_
HR-timer sync information, for example. But that's a separate issue.

And if there is a lot of things going on (eg you're tracing things like a
block device performance issue), you have events _much_ closer to each
other than 1/30th of a second, and you again _never_ need a full 59-bit
timestamp. Because the delta would be cumulative, and as long as you are
generating events, you're also updating the base TSC.

The only case you'll see lots of those timestamps (where "lots" is
guaranteed to be less than 30 in one second) would be if you're tracing
something that literally does a couple of events per second. But then you
sure as hell don't need to worry about performance _or_ memory use. If you
have some trace that gives you five hits per second (and spread out, to
boot!), you'll generate twice the number of trace entries because each
entry would always be preceded by an extended thing, but hey, do we really
worry about five trace events per second?

And quite frankly, most of the tracing I've ever done really does fall
into the "either nothing" or "a flood of events" thing. The "5-25 events
per second at regular intervals" case really doesn't sound very common at
all - not that I would worry about it if it was since it's going to be a
really simple case..

And btw, this would be an issue only on really fast CPU's anyway. If
you're in the mobile space, your TSC will be clocking at less than that,
so the slower the machine, the less the overhead. Again, that's exactly
what you want - the overhead is not some fixed thing that is detemined by
the fastest possible TSC and hurts slower machines too, it's relative to
the speed of the machine itself.

(Of course, with cpufreq and fixed TSC's, a 2GHz CPU will have a 2GHz TSC
even when it's running at just 1GHz, but the worry is more about trying to
trace on an embedded board with some pitiful 500MHz thing that is pretty
overworked anyway).

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-24 21:00:17 UTC

Post by Martin Bligh
If we use 32 bits instead of 27, then the timestamp events are only
about once per second, which is probably fine for overhead ... ?

You'd have them ONCE IN A BLUE MOON.
If there is nothing going on, you don't need the timestamps at all.

Yeah, you're right - we can just mark it dirty, and 'pre-log' the timestamp
events when someone calls a reserve and we haven't logged anything
for more time than we can store. Did not think of that. Was only 5 bits
for us, not an extra 37, but still, is much better.

Is a 5-bit event id generic enough though?
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 21:10:12 UTC

Post by Martin Bligh
If we use 32 bits instead of 27, then the timestamp events are only
about once per second, which is probably fine for overhead ... ?

You'd have them ONCE IN A BLUE MOON.
If there is nothing going on, you don't need the timestamps at all.

Actually, I was keeping the event id completely out of the ring buffer and
let a higher layer deal with that. For padding, I just made the length
field zero.

For overflows of the timestamp, we can reserve the -1 timestamp as a
trigger to read the tsc again and put the full 64 bits into the record.

Just an idea.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-24 20:40:10 UTC

No, we don't shift (we don't want to lose precision), and we don't use
more than 27 bits by default.

the TSC at each entry should be a _delta_. It's the difference from the
last one. And if you get less than 30 events per second, and you need a
bigger difference, you insert an extra "sync" tracepoint that has a 59-bit
thing (27 bits _plus_ the extra 'data').

Yes, it adds 8 bytes (assuming that minimal format), but it does so only
for any trace event that is more than 1/30th of a second from its previous
one. IOW, think of this not in bytes, but in bytes-per-second. It adds at
most 8*30=240 bytes per second, but what it _saves_ is that when you have
tens of thousands of events, it shaves 4 bytes FOR EACH EVENT.

See?

Also, quite often, the clock won't be running at 4GHz even if the CPU
might. Intel already doesn't make the TSC be the nominal frequency, and
other architectures with TSC's have long had the TSC be something like a
"divide-by-16" clock rather than every single cycle because it's more
power-efficient.

So there is often a built-in shift, and I doubt we'll see 10GHz TSC's even
if we see 10GHz CPU's (which many people consider unlikely anyway, but
I'm not going to bet against technology).

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-24 21:00:19 UTC

No, we don't shift (we don't want to lose precision), and we don't use
more than 27 bits by default.

The reason why Martin did use only a 27 bits TSC in ktrace was that they
were statically limited to 32 event types. I doubt this will suffice for
general purpose kernel tracing. For simplicity, I would just start with
a header made of the 32 TSC LSBs, 16 bits for events ID and 16 bits for
event size in the buffer header. We can always create extra-compact
schemes later on which can be tied to specific buffers. I actually have
one in LTTng.

Post by Linus Torvalds
the TSC at each entry should be a _delta_. It's the difference from the
last one. And if you get less than 30 events per second, and you need a
bigger difference, you insert an extra "sync" tracepoint that has a 59-bit
thing (27 bits _plus_ the extra 'data').

I agree that, in the end, we will end up with "delta" information given
by the timestamp, but there is a way to encode that very simply without
having to compute any time delta between events : we just have to keep
the bits we are interested to save (say, the 32 LSBs) and write that as
a time value. Then, whenever we have to write this value, we either have
a heartbeat system making sure we always detect 32 bit overflows by
writing an event at least once per 32-bit overflow or by adding the full
64-bits timestamp as a prefix to the event when this occurs (as you
proposed). Note that the latter proposal imply extra computation at the
tracing site, which could have some performance impact.

There are a few reasons why I would prefer to stay away from enconding
time deltas and use direct LSB tsc representation in the event headers.
First, deltas make it hard to deal with missing information (lost
events, lost buffers); it those cases, you simply don't know what the
delta is. OTOH, if you encode directly the LSBs read from the cycle
counter, you can more easily deal with such lack of information (lost
events) and lost subbuffers by writing an extended 64-bits event header
when needed.

The benefit of using the bigger event header when required over using
heartbeat, even if it makes the tracing fastpath a bit slower, is that
is won't impact systems using dynamic ticks. Heartbeats are generally
bad at that because their require the system to be woken up
periodically.

Mathieu

Post by Linus Torvalds
Yes, it adds 8 bytes (assuming that minimal format), but it does so only
for any trace event that is more than 1/30th of a second from its previous
one. IOW, think of this not in bytes, but in bytes-per-second. It adds at
most 8*30=240 bytes per second, but what it _saves_ is that when you have
tens of thousands of events, it shaves 4 bytes FOR EACH EVENT.
See?
Also, quite often, the clock won't be running at 4GHz even if the CPU
might. Intel already doesn't make the TSC be the nominal frequency, and
other architectures with TSC's have long had the TSC be something like a
"divide-by-16" clock rather than every single cycle because it's more
power-efficient.
So there is often a built-in shift, and I doubt we'll see 10GHz TSC's even
if we see 10GHz CPU's (which many people consider unlikely anyway, but
I'm not going to bet against technology).
Linus

Linus Torvalds

2008-09-24 22:40:10 UTC

Post by Mathieu Desnoyers
The reason why Martin did use only a 27 bits TSC in ktrace was that they
were statically limited to 32 event types.

Well, I actually think we could do the same - for the "internal" types.

So why not do something like 4-5 bits for the basic type information, and
then oen of those cases is a "freeform" thing, and the others are reserved
for other uses.

So a trace entry header could easily look something like

struct trace_entry {
u32 tsc_delta:27,
type:5;
u32 data;
u64 array[];
}

and then depending on the that 5-bit type, the "data" field in the header
means different things, and the size of the trace_entry also is different.

So it could be something like

- case 0: EnfOfPage marker
(data is ignored)
size = 8

- case 1: TSCExtend marker
data = extended TSC (bits 28..59)
size = 8

- case 2: TimeStamp marker
data = tv_nsec
array[0] = tv_sec
size = 16

- case 3: LargeBinaryBlob marker
data = 32-bit length of binary data
array[0] = 64-bit pointer to binary blob
array[1] = 64-bit pointer to "free" function
size = 24

- case 4: SmallBinaryBlob marker
data = inline length in bytes, must be < 4096
array[0..(len+7)/8] = inline data, padded
size = (len+15) & ~7

- case 5: AsciiFormat marker
data = number of arguments
array[0] = 64-bit pointer to static const format string
array[1..arg] = argument values
size = 8*(2+arg)

...

ie we use a few bits for "trace _internal_ type fields", and then for a
few of those types we have internal meanings, and other types just means
that the user can fill in the data itself.

IOW, you _could_ have an interface like

ascii_marker_2(ringbuffer,
"Reading sector %lu-%lu",
sector, sector+nsec);

and what it would create would be a fairly small trace packet that looks
something like

.type = 5,
.tsc_delta = ...,
.data = 2,
.array[0] = (const char *) "Reading sector %lu-%lu\n"
.array[1] = xx,
.array[2] = yy

and you would not actually print it out as ASCII until somebody read it
from the kernel (and any "binary" interface would get the string as a
string, not as a pointer, because the pointer is obviously meaningless
outside the kernel.

Also note how you'd literally just have a single copy of the string,
because the rule would be that a trace user must use a static string, not
some generated one that can go away (module unloading would need to be
aware of any trace buffer entries, of course - perhaps by just disallowing
unloading while trace buffers are active).

And note! Everything above is meant as an example of something that
_could_ work. I do like the notion of putting pointers to strings in the
markers, rather than having some odd magic numerical meaning that user
space has to just magically know that "event type 56 for ring buffer type
171 means that there are two words that mean 'sector' and 'end-sector'
respectively".

But it's still meant more as an RFC. But I think it could work.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-24 22:50:09 UTC

Post by Linus Torvalds
- case 1: TSCExtend marker
data = extended TSC (bits 28..59)
size = 8
- case 2: TimeStamp marker
data = tv_nsec
array[0] = tv_sec
size = 16

Btw, in case it wasn't clear, those two are totally different things.

The "case 1" thing is the thing that gets inserted automatically by the
trace code when it's needed because the 27-bit TSC is too limited.

The "case 2" thing is to allow us to occasionally synchronize with some
global known wall-time clock like the HPET + xtime. IOW, it would be
something that on demand creates a mapping from wall clock to TSC for that
particular CPU.

I guess I should perhaps have put the TSC frequency in there in that "case
2" thing too. Maybe that should be in "data" (in kHz) and tv_sec/tv_nsec
should be in array[0..1], and the time sync packet would be 24 bytes.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-25 17:30:13 UTC

Post by Mathieu Desnoyers
The reason why Martin did use only a 27 bits TSC in ktrace was that they
were statically limited to 32 event types.

We could use a page header instead to contain the "unused_size"
information. It does not need to be an event per se. Putting this
information in the page header makes it easy for a consumer to just read
the amount of bytes needed, excluding the padding (turns up to be useful
for network streaming of trace data). Also, it frees up one event ID for
other uses. I think the event ID real estate is pretty important,
because every event ID we don't keep for "internal uses" could be used
to encode standard tracer event IDs.

Post by Linus Torvalds
- case 1: TSCExtend marker
data = extended TSC (bits 28..59)
size = 8

I would prefer to put the extended timestamp within the event header
instead of creating a separate entry for this for atomicity concerns
(what happens if a long interrupt executes between the TSCExtend marker
event and the event expecting to be written right next to it ?). If we
choose to have such "full tsc" event headers, we would have to reserve 1
selection bit. It leaves us 4 bits for event IDs. If we remove
heartbeats, we need even less "internal" IDs.

Given that we can allocate event IDs per buffer, if in general we assume
that buffer users will have few event IDs, I think we can find a way to
minimize the number of "internal" event IDs and keep possibly all the 4
bits (16 IDs) for real tracer events. One way to achieve it is to encode
the extra typing information within a table (dumped in a separate buffer)
indexed by event ID.

Post by Linus Torvalds
- case 2: TimeStamp marker
data = tv_nsec
array[0] = tv_sec
size = 16

This one could even be a standard event put in a single buffer. There is
no need to repeat it in various buffers all over the place.

Post by Linus Torvalds
- case 3: LargeBinaryBlob marker
data = 32-bit length of binary data
array[0] = 64-bit pointer to binary blob
array[1] = 64-bit pointer to "free" function
size = 24
- case 4: SmallBinaryBlob marker
data = inline length in bytes, must be < 4096
array[0..(len+7)/8] = inline data, padded
size = (len+15) & ~7
- case 5: AsciiFormat marker
data = number of arguments
array[0] = 64-bit pointer to static const format string
array[1..arg] = argument values
size = 8*(2+arg)
...
ie we use a few bits for "trace _internal_ type fields", and then for a
few of those types we have internal meanings, and other types just means
that the user can fill in the data itself.
IOW, you _could_ have an interface like
ascii_marker_2(ringbuffer,
"Reading sector %lu-%lu",
sector, sector+nsec);
and what it would create would be a fairly small trace packet that looks
something like
.type = 5,
.tsc_delta = ...,
.data = 2,
.array[0] = (const char *) "Reading sector %lu-%lu\n"
.array[1] = xx,
.array[2] = yy

I agree that exporting semantic information is important. Moreover, I
think this should also be made available when the trace is exported in
binary format to userspace. The markers currently in mainline has been
designed to do this efficiently. With small adaptation of the markers,
one could do a :

trace_mark(block, read,
"Reading sector sec_from to sec_to",
"sec_from %lu sec_to %lu",
sector, sector + nsec);

The nice part about the markers is that it keeps tables for the
description "Reading sector sec_from to sec_to" and the typing
information "sec_from %lu sec_to %lu" in a separate section. Therefore,
we only need to dump this information once (at trace start or when the
module containing this specific marker is loaded). It automatically
deals with module load/unload and does not require to write their format
string in the trace buffers.

The strings can be looked up by a userspace pretty-printer by dumping a
table mapping event IDs to marker names, which in turn map to
description and event types. This implies creating a small "metadata"
buffer along with each data transfer buffer to export these tables. This
metadata buffer must be read first to get the event typing information,
and then the userspace program is all set to pretty-print the binary
information.

An efficient ID -> format string mapping could also be kept around in
the kernel (or built on demand) to simplify the task for an in-kernel
pretty-printer.

Mathieu

Post by Linus Torvalds
and you would not actually print it out as ASCII until somebody read it
from the kernel (and any "binary" interface would get the string as a
string, not as a pointer, because the pointer is obviously meaningless
outside the kernel.
Also note how you'd literally just have a single copy of the string,
because the rule would be that a trace user must use a static string, not
some generated one that can go away (module unloading would need to be
aware of any trace buffer entries, of course - perhaps by just disallowing
unloading while trace buffers are active).
And note! Everything above is meant as an example of something that
_could_ work. I do like the notion of putting pointers to strings in the
markers, rather than having some odd magic numerical meaning that user
space has to just magically know that "event type 56 for ring buffer type
171 means that there are two words that mean 'sector' and 'end-sector'
respectively".
But it's still meant more as an RFC. But I think it could work.
Linus

Linus Torvalds

2008-09-25 17:40:14 UTC

Post by Mathieu Desnoyers
We could use a page header instead to contain the "unused_size"
information.

Absolutely. There's no one way to do this.

Post by Mathieu Desnoyers
I would prefer to put the extended timestamp within the event header
instead of creating a separate entry for this for atomicity concerns
(what happens if a long interrupt executes between the TSCExtend marker
event and the event expecting to be written right next to it ?).

The log entries should be reserved with interrupts disabled anyway, and
they are per-CPU, so there are no atomicity issues.

For NMI's, things get more exciting. I'd really prefer NMI's to go to a
separate ring buffer entirely, because otherwise consistency gets really
hard. Using lockless algorithms for a variable-sized pool of pages is a
disaster waiting to happen.

I don't think we can currently necessarily reasonably trace NMI's, but
it's something to keep in mind as required support eventually.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-25 17:50:12 UTC

Post by Mathieu Desnoyers
We could use a page header instead to contain the "unused_size"
information.

Absolutely. There's no one way to do this.

The log entries should be reserved with interrupts disabled anyway, and
they are per-CPU, so there are no atomicity issues.

I actually do use a lockless algorithm in LTTng and don't have to
disable interrupts around tracing. This is how I get the kind of
performance the Google folks expect. I would recommend to stay with
interrupt disable + per-cpu spinlock (slow and heavy locking) for v1,
but to keep in mind that we might want to go for a more lightweight
locking scheme in v2.

Post by Linus Torvalds
For NMI's, things get more exciting. I'd really prefer NMI's to go to a
separate ring buffer entirely, because otherwise consistency gets really
hard. Using lockless algorithms for a variable-sized pool of pages is a
disaster waiting to happen.

LTTng does it, no disaster happened in the past 2-3 years. :)

I guess we could manage to deal with NMI tracing specfically using the
in_nmi() helpers.

Post by Linus Torvalds
I don't think we can currently necessarily reasonably trace NMI's, but
it's something to keep in mind as required support eventually.

NMI tracing is a nice-to-have (and lttng does provide it), but the core
thing is really performance; disabling interrupts happens to be fairly
slow on many architectures.

Mathieu

Post by Linus Torvalds
Linus

Mathieu Desnoyers

2008-09-25 16:50:42 UTC

No, we don't shift (we don't want to lose precision), and we don't use
more than 27 bits by default.
the TSC at each entry should be a _delta_. It's the difference from the
last one. And if you get less than 30 events per second, and you need a
bigger difference, you insert an extra "sync" tracepoint that has a 59-bit
thing (27 bits _plus_ the extra 'data').
Yes, it adds 8 bytes (assuming that minimal format), but it does so only
for any trace event that is more than 1/30th of a second from its previous
one. IOW, think of this not in bytes, but in bytes-per-second. It adds at
most 8*30=240 bytes per second, but what it _saves_ is that when you have
tens of thousands of events, it shaves 4 bytes FOR EACH EVENT.
See?
Also, quite often, the clock won't be running at 4GHz even if the CPU
might. Intel already doesn't make the TSC be the nominal frequency, and
other architectures with TSC's have long had the TSC be something like a
"divide-by-16" clock rather than every single cycle because it's more
power-efficient.
So there is often a built-in shift, and I doubt we'll see 10GHz TSC's even
if we see 10GHz CPU's (which many people consider unlikely anyway, but
I'm not going to bet against technology).
Linus

I remembered other concerns about 27 vs 32 bits TSC decision, which are
rather important. First, if we have a 27 bits TSC, with overflow every
33ms at 4GHz, we assume the kernel will _never_ have an interrupt
latency longer than this for correct heartbeat behavior. The sad thing
is that it is not uncommon to see such interrupt latencies once in a
while. It's especially bad if the correctness of the timestamps gathered
by the one tool that would be helpful to debug such interrupt latency
problem is broken by the thing it try to instrument.

Given that heartbeat events have this particular downside, we may think
writing an extended TSC value in the event header will solve the
problem, but it actually creates a new kind of problem when we try to
relax buffer locking. The basic idea is that it's easy to compute the
delta from the previous timestamp taken on the same CPU when disabling
interrupts because we are serializing all operations for this specific
core. However, a more lightweight locking (without interrupt disabling)
involves using a loop such as:

reserve space (preempt is disabled) :
unsigned long old, new;

do {
old = read_offset();
timestamp = get_cycles();
new = old + header_size() + event_size();
} while ((local_cmpxchg(&buf->offset, old, new) != old);
write event header
write event payload
commit

So we can make sure the timestamps will never go backward in a given
buffer (this is insured by the fact that this cmpxchg ties together
timestamp read and buffer space reservation).

If we want to implement detection of TSC rollover at the tracing site,
we would have to remember the last TSC value read, and this is where it
becomes racy. We cannot read the previous event slot, because we have no
guarantee it's been written to (only reserve operation is ordered. write
and commit are unordered). We would therefore have to keep a copy of the
previous TSC value, but given we don't do any locking, it would only be
safe to only keep data that can be read/writte to atomically, namely
only 32 bits on x86_32. Also note that the cmpxchg loop may fail, so we
cannot update the "last_tsc_msb" data structure inside the loop. It
could cause a nested IRQ to think the previous event written had a
higher timestamp than what is actually in the trace.

So we could do :

per buffer, per cpu : u32 last_tsc_msb;

reserve space (preempt is disabled) :
unsigned long old, new;
u64 timestamp;
int full_tsc = 0;

do {
old = read_offset();
timestamp = get_cycles();
new = old + event_size();
if ((u32)(timestamp >> (64 - 27)) - buf->last_tsc_msb) {
full_tsc = 1;
new += header_size_full_tsc();
} else
new += header_size();
} while ((local_cmpxchg(&buf->offset, old, new) != old);

buf->last_tsc_msb = (u32)(timestamp >> (64 - 27));

if (full_tsc)
write full tsc event header
else
write event header
write event payload
commit()

The incorrect race would be not to write a full tsc when needed (an
overflow would go undetected). A correct race would be to write a full
TSC even if unneeded. This would just consume a little more space, but
given the unlikeliness of the race, we don't care.

The worse case scenario is to have an interrupt coming right between a
successful cmpxchg reserving the space to write a full tsc and the
last_tsc_msb write. This interrupt would think it has to write a full
tsc when it in fact the previous slot already does contain the full tsc.
Duplicated full TSCs, that's the "correct race".

Then, the IRQ will update the last_tsc_msb value before the actual
thread that has been interrupted. This will cause the last_tsc_msb value
to go slightly backward when updated by the thread. But it's not a
problem, because letting the last_tsc_msb decrement slightly could only
result in the "correct race", which is to cause following events to
write a full TSC event if not strictly needed.

So I think we could manage to completely remove the heartbeat, which
would make life much easier in dynticks system and would make the time
source much more robust wrt long interrupt latencies.

Mathieu

Linus Torvalds

2008-09-25 17:00:21 UTC

Post by Mathieu Desnoyers
I remembered other concerns about 27 vs 32 bits TSC decision, which are
rather important. First, if we have a 27 bits TSC, with overflow every
33ms at 4GHz, we assume the kernel will _never_ have an interrupt
latency longer than this for correct heartbeat behavior.

We do no such thing.

Guys, the heartbeat is a _separate_ thing from overflow handling.

You don't handle overflow by having a heartbeat that beats fifty times a
second just to insert events, just so that the TSC delta would always fit
in 27 bits. That would work, but be stupid. It would mean that you fill up
your event buffer with uninteresting crud just because nothing happens.

Yes, many people want to have a heartbeat (a "Mark" event) every once in a
while, but what I suggest is independent of heartbeats, even if it _could_
be implemented that way. What I suggest is simply that when you insert an
event, you always read the full 64 bits of TSC (on x86 - others will do
other things), and then you insert the delta against the last one.

After all, you cannot read just 27 bits of the TSC anyway. You _have_ to
read the whole 64 bits, and then you subtract the pervious trace event TSC
(that you have in the per-CPU trace buffer header) from that. You now have
a delta value.

And if the delta doesn't fit in 27 bits, you generate a 59-bit TSC event!

None of this has _anything_ to do with interrupt latency. There is no
dependency on a heartbeat, or any dependency on always inserting a trace
event at least 30 times a second. There's no worry about conversions, and
these are all trivial single assembly instructions to do (or a couple, on
a 32-bit architecture that needs to do a sub/sbb pair and test two
different registers to see if the result fits in 27 bits).

The only issue is that if you insert trace events more seldom, you'll
always get the extra TSC event as well, inserted automatically in front of
the event you explicitly inserted. The tracer doesn't need to know.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-25 17:10:09 UTC

Note: RFC v2 implements this.

-- Steve

Post by Linus Torvalds
None of this has _anything_ to do with interrupt latency. There is no
dependency on a heartbeat, or any dependency on always inserting a trace
event at least 30 times a second. There's no worry about conversions, and
these are all trivial single assembly instructions to do (or a couple, on
a 32-bit architecture that needs to do a sub/sbb pair and test two
different registers to see if the result fits in 27 bits).
The only issue is that if you insert trace events more seldom, you'll
always get the extra TSC event as well, inserted automatically in front of
the event you explicitly inserted. The tracer doesn't need to know.

Steven Rostedt

2008-09-24 18:00:14 UTC

Post by Martin Bligh
Can't the reserve interface just put a padding event into page A,
or otherwise mark it, and return the start of page B?

Yes, I think having a "padding" entry that just gets skipped on read would
simplify things. Use that to fill up the end of the page.

Yep, that is what the RFC patch did.

Post by Peter Zijlstra
And here I was thinking you guys bit encoded the event id into the
timestamp delta :-)

+/* header plus 32-bits of event data */
+struct ktrace_entry {
+ u32 event_type:5, tsc_shifted:27;
+ u32 data;
+};
was our basic data type. So ... sort of ;-)

Why "tsc_shifted"?
I think 27 bits is probably fine, but not by removing precision. Instead
of shifting it so it will fit (and dropping low bits as uninteresting), do
it by encoding it as a delta against the previous thing. 27 bits would
still be sufficient for any high-performance thing that has tons and tons
of packets, and if you only have a few trace events you can afford to have
the "TSC overflow" event type (and if you want it that dense, you could
just make 'data' be the high bits, for a total of 59 bits rather than 64
bits of TSC.
59 bits of cycle counters is perfectly fine unless you are talking trace
events over a year or so (I didn't do the math, but let's assume a 4GHz
TSC as a reasonable thing even going forward - even _if_ CPU's get faster
than that, the TSC is unlikely to tick faster since it's just not worth it
from a power standpoint).
Ok, I did the math. 1<<27 seconds (assuming the low 32 bits are just
fractions) is something like 4+ years. I _really_ don't think we need more
than that (or even close to that) in TSC timestamps for tracing within one
single buffer.
Once you go to the next ring buffer, you'd get a new time-base anyway.

Right now I have a list of pages that make up the ring buffer. Are you
saying that the first entry in the page should be a timestamp?

Anyway, after talking with Peter Zijlstra, I'm working on RFC-v2, which
splits up the ring buffer a bit more. I'm removing all the debugfs crud,
and I even will remove the merge sort from the ring buffer.

I will now have a ring_buffer API, which will do basic recording. It will
have two modes when allocated. Fixed sized entry mode where you can just
put whatever you want in (I'm still aligning everything by 8 bytes, just
since memory is cheap). Or you can have variable length mode that will
make the following event header:

struct {
unsigned char length;
unsigned char buff[];
};

The length will be shifted 3 since we are 8 byte aligned anyway, making
the largest entry 2046 bytes (2045 bytes of data since 1 byte is already
taken for the length field). If the next entry is not large enough to fit
on the page, I will enter a zero length and that will tell the tracer that
the entry is padding to the end of the page.

For fixed sized entries, a simple calculation of whether an entry can fit
on a page will determine if there is an entry or padding.

Then I will add a trace_buffer API that will add the counting and merge
sort on top of this interface. If you don't care about the
trace_buffering, one could simply use the ring_buffering and be done with
it.

Note, I am still keeping the reserve and commit interface for now.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-24 20:30:07 UTC

Post by Steven Rostedt
Right now I have a list of pages that make up the ring buffer. Are you
saying that the first entry in the page should be a timestamp?

I think the most straightforward model would be that the "head" of the
ring buffer (regardless of size in pages) would have that timestamp.
Making them per-page is an option, of course, I have no strong opinions
either way. The per-page one could have advantages (ie it would give a
nice upper limit for just how many entries you have to walk in order to
convert an entry into a full timestamp), but I certainly don't think
that's a big decision, more of a detail.

But if we start out with having the full TSC in each entry, that's easily
going to be painful to fix later. If we start out with a delta system,
changing the details of where the base is gotten is likely to be exactly
that - just a detail.

So I'd like the thing to have small headers, and be designed from the
start to have small headers.

Post by Steven Rostedt
I will now have a ring_buffer API, which will do basic recording. It will
have two modes when allocated. Fixed sized entry mode where you can just
put whatever you want in (I'm still aligning everything by 8 bytes, just
since memory is cheap). Or you can have variable length mode that will
struct {
unsigned char length;
unsigned char buff[];
};

So the only reason I'm not thrilled with this is that I really think that
timestamping should be inherent, and at the lowest level.

Without timestamping, what's the real point? EVERYBODY eventually wants a
timestamp. We added it even to the kernel printk()'s. People want them for
network packets to user space. X wants it for all its events. It's one of
those things that people never do from the beginning, but that everybody
eventually wants anyway.

So I certainly don't mind layering, but I *do* mind it if it then means
that some people will use a broken model and not have timestamps. So I
think the timestamping code should just be there - without it, a trace
buffer is pointless.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

David Miller

2008-09-24 20:40:09 UTC

From: Linus Torvalds <***@linux-foundation.org>
Date: Wed, 24 Sep 2008 13:23:47 -0700 (PDT)

Post by Linus Torvalds
So I'd like the thing to have small headers, and be designed from the
start to have small headers.

Small headers are good, but I'd suggest making sure there is an
"offset" or similar field in there.

Not that I want to encourage frequent changes to header layout, but if
you do need to add something, then this offset field allows you to do
so while keeping existing analysis tools working. They will just
ignore the new information in the headers, but they will still be able
to get at the data bits using the offset.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 20:50:12 UTC

Post by David Miller
Date: Wed, 24 Sep 2008 13:23:47 -0700 (PDT)

Post by Linus Torvalds
So I'd like the thing to have small headers, and be designed from the
start to have small headers.

Right now I have a "length" field when fixed_length is not specified. This
length is currently offest by 8 leaving all 8 bytes aligned.

We could change this to be variable? Would that help you?

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-24 21:00:16 UTC

Post by David Miller

Post by Linus Torvalds
So I'd like the thing to have small headers, and be designed from the
start to have small headers.

One thing we said we could do is compile the "decompaction" tools
along with the kernel, in the kernel tree. Then if we change the in-kernel
format, you don't break all the userspace tools. We used:

struct ktrace_time {
u32 seconds;
u32 nanoseconds;
};

struct ktrace_event {
struct ktrace_time time;
u16 cpu;
u16 type;
u32 pid;
u32 data;
u32 elapsed_time; /* ns for interrupt, otherwise us */
u32 expanded_length;
}

The format is much easier to parse for userspace tools, though much
less compact. A simple C tool can turn in-kernel format into userspace
format:

1. Merge the per-cpu buffers into a single stream
2. put cpu ids in
3. Work out which pid was running, from the last context switch
4. Put in elapsed times (keeping track of the start of a system call
and recording the delta at end of system call (or interrupt, etc))
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Frank Ch. Eigler

2008-09-24 21:30:18 UTC

Hi -

Post by Martin Bligh
[...]
One thing we said we could do is compile the "decompaction" tools
along with the kernel, in the kernel tree. Then if we change the in-kernel
format, you don't break all the userspace tools.

If the common tracing idea still includes kernel-supplied ASCII as an
alternative to binary (so that one could grep some debugfs file), then
it would be nice to have some common code for decoding the trace
records offline.

If we can associate a simple specific struct type ("struct
ftrace_event") with each trace id type in an object-code-resident
table, we could automate some of this. The kernel build process could
sniff dwarf data (a la acme's struct-layout-printing dwarves) at or
before modpost time to generate a snippet of C code for each such
event struct. That C code could be the generic ASCII-printing
callback for use by debugfs. A variant could be a userspace-usable
version. Given the same id-to-struct-name mapping, Sytemtap could
expose event records field by field, by name.

- FChE
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 21:40:07 UTC

Post by Frank Ch. Eigler
Hi -

If the common tracing idea still includes kernel-supplied ASCII as an
alternative to binary (so that one could grep some debugfs file), then
it would be nice to have some common code for decoding the trace
records offline.
If we can associate a simple specific struct type ("struct
ftrace_event") with each trace id type in an object-code-resident
table, we could automate some of this. The kernel build process could
sniff dwarf data (a la acme's struct-layout-printing dwarves) at or
before modpost time to generate a snippet of C code for each such
event struct. That C code could be the generic ASCII-printing
callback for use by debugfs. A variant could be a userspace-usable
version. Given the same id-to-struct-name mapping, Sytemtap could
expose event records field by field, by name.

Hi Frank,

I think we are going a step below this. That is, the ring buffer itself
will not be expecting to expose anything to the user interface.

That will need to be done in a higher layer. Right now we just want a way
to stabilize the ring buffer infrastructure. Then we can add a tracing
infrastructure on top that can do the above work.

I'm working on having a ring_buffer.c that will do the bare minimum, and a
trace_buffer.c that will be a layer on top that will add more
functionality. What you are asking for may apply there.

Thanks,

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 20:50:07 UTC

So the only reason I'm not thrilled with this is that I really think that
timestamping should be inherent, and at the lowest level.

OK, then how about this?

Each page will start with a time stamp (I'm still aligning everything by 8
bytes, just because it simplifies things). Then we can have a 3 byte
(24 bit) counter offset? Then we can have a header that looks like:

struct {
unsigned char time[3];
unsigned char length;
unsigned char buff[];
};

This still allows me to have the 2048 byte size buffer.

Or is 24 bits for time too small? The offest will be from the previous
entry, and not the beginning of the page.

If one defines a fixed size entry, we could just use the full 32 bits for
the timestamp, since the length will be ignored in that case, and will
become part of the buffer.

Hence,

struct {
unsigned int time;
unsigned char length;
unsigend char buff[];
};

Post by Linus Torvalds
Without timestamping, what's the real point? EVERYBODY eventually wants a
timestamp. We added it even to the kernel printk()'s. People want them for
network packets to user space. X wants it for all its events. It's one of
those things that people never do from the beginning, but that everybody
eventually wants anyway.

OK, I'll hack something up like this.

Post by Linus Torvalds
So I certainly don't mind layering, but I *do* mind it if it then means
that some people will use a broken model and not have timestamps. So I
think the timestamping code should just be there - without it, a trace
buffer is pointless.

OK, the bottom layer will have some kind of timestamps. Now we only need
to agree on what the header will look like.

Thanks,

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-24 21:10:11 UTC

Post by Steven Rostedt
OK, then how about this?
Each page will start with a time stamp (I'm still aligning everything by 8
bytes, just because it simplifies things). Then we can have a 3 byte
struct {
unsigned char time[3];
unsigned char length;
unsigned char buff[];
};
This still allows me to have the 2048 byte size buffer.
Or is 24 bits for time too small? The offest will be from the previous
entry, and not the beginning of the page.
If one defines a fixed size entry, we could just use the full 32 bits for
the timestamp, since the length will be ignored in that case, and will
become part of the buffer.
Hence,
struct {
unsigned int time;
unsigned char length;
unsigend char buff[];
};

How about we just steal 5 bits from the timestamp to indicate event
lengths up to 32 bytes, and if it's 0, that means there's a length
field following? Also that'd mean you could use a longer length field
and get beyond 256 bytes to 4096, without impacting most events.

struct {
u32 length:5, time_delta:27;
u16 length;
u8 buf[];
};

struct {
u32 length:5, time_delta:27; /* where length == 0 */
u8 buf[];
};

Obviously we could less than 5 bits, even just 1 for a flag ...
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 21:20:08 UTC

Post by Martin Bligh
How about we just steal 5 bits from the timestamp to indicate event
lengths up to 32 bytes, and if it's 0, that means there's a length
field following? Also that'd mean you could use a longer length field
and get beyond 256 bytes to 4096, without impacting most events.
struct {
u32 length:5, time_delta:27;

I think you mean this is where length == 0 ;-)

Post by Martin Bligh
u16 length;
u8 buf[];
};
struct {
u32 length:5, time_delta:27; /* where length == 0 */
u8 buf[];
};
Obviously we could less than 5 bits, even just 1 for a flag ...

OK then. Since I like the idea of aligning the buffer to 8 bytes, we can
always shift the length field by 3. So...

For records 256 bytes or less, we only have:

struct {
u32 length:5, time_delta: 27;
u8 buf[];
};

For 257 bytes or more we have:

struct {
u32 length:5 (=0), time_delta: 27;
u16 large_length;
u8 buf[];
};

This is what you want?

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-24 22:00:18 UTC

BTW, if we declare the ring buffer to have fixed length entries, do we
still want to record the length?

Hmm, probably should. It would make the code easier. If it is fixed length
under 256 bytes, we still need to record the timestamp, and the length is
only 5 bits. It could probably store something else, but we can leave that
for version two ;-)

Heck, for now, I'll remove the work I did to add the fix length option,
and just use this default. If you have fixed length entries, it will
just make it easier to code the above layers.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Peter Zijlstra

2008-09-25 10:50:14 UTC

I rather like this idea, as it gives small entries (the common case) the
least overhead but does allow for larger ones.

By also putting the time in there you can do the merge sort iterator,
Linus was right that everybody wants this anyway.

As for delta encoding the time, we could make the tick log the absolute
time packet, that's at least 100Hz and it already has to compute the
full gtod thing anyway.

I don't much like Linus' idea of bringing type information back into the
primitive header (sorry Linus ;-)). I'd much rather keep that
abstraction in the next layer.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-25 14:40:11 UTC

Post by Peter Zijlstra
I rather like this idea, as it gives small entries (the common case) the
least overhead but does allow for larger ones.
By also putting the time in there you can do the merge sort iterator,
Linus was right that everybody wants this anyway.
As for delta encoding the time, we could make the tick log the absolute
time packet, that's at least 100Hz and it already has to compute the
full gtod thing anyway.
I don't much like Linus' idea of bringing type information back into the
primitive header (sorry Linus ;-)). I'd much rather keep that
abstraction in the next layer.

There is part of the type stuff that belongs in the lower layer, it seems -
the padding events for the up-to-end-of-page buffering, and the timestamp
extensions. It seems wrong to split those across two layers.

But perhaps we can keep a couple of bits for this, and three of the bits
to represent the length of the data payload (maybe in 4 byte multiples
rather than bytes?) That'd let up to 28 bytes as a payload in a short event.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Peter Zijlstra

2008-09-25 15:00:17 UTC

Hmm, you've got a point there, then it would be 3 package types:

- regular
- full time
- nop

Which can be encoded using 2 bits

Post by Martin Bligh
But perhaps we can keep a couple of bits for this, and three of the bits
to represent the length of the data payload (maybe in 4 byte multiples
rather than bytes?) That'd let up to 28 bytes as a payload in a short event.

Right - if you use raw tsc you're dependent on clock speed, if we'd
normalize that on ns instead you'd need at least:

l(10000000)/l(2)
23.25349666421153643532

bits to handle HZ=100, leaving us with 32-2-24 = 6 bits for size.

Sounds doable (unless I mis-counted on the 0's).

Also, I agree on the 4byte alignment, rather than the 8byte Steve seems
to favour.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 15:20:12 UTC

Post by Peter Zijlstra
Right - if you use raw tsc you're dependent on clock speed, if we'd
normalize that on ns instead you'd need at least: [...]

Please don't normalize to ns.

It's really quite hard, and it's rather _expensive_ on many CPU's. It
involves a non-constant 64-bit divide, after all. I bet it can be
optimized to be a multiply-by-inverse instead, but it would be a 128-bit
(or maybe just 96-bit?) multiply, and the code would be nasty, and likely
rather more expensive than the TSC reading itself.

Sure, you have to normalize at _some_ point, and normalizing early might
make some things simpler, but the main thing that would become easier is
people messing about in the raw log buffer on their own directly, which
would hopefully be something that we'd discourage _anyway_ (ie we should
try to use helper functions for people to do things like "get the next
event data", not only because the headers are going to be odd due to
trying to pack things together, but because maybe we can more easily
extend on them later that way when nobody accesses the headers by hand).

And I don't think normalizing later is in any way more fundamentally hard.
It just means that you do part of the expensive things after you have
gathered the trace, rather than during.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-25 15:30:14 UTC

Post by Peter Zijlstra
Right - if you use raw tsc you're dependent on clock speed, if we'd
normalize that on ns instead you'd need at least: [...]

Please don't normalize to ns.
It's really quite hard, and it's rather _expensive_ on many CPU's. It
involves a non-constant 64-bit divide, after all. I bet it can be
optimized to be a multiply-by-inverse instead, but it would be a 128-bit
(or maybe just 96-bit?) multiply, and the code would be nasty, and likely
rather more expensive than the TSC reading itself.
Sure, you have to normalize at _some_ point, and normalizing early might
make some things simpler, but the main thing that would become easier is
people messing about in the raw log buffer on their own directly, which
would hopefully be something that we'd discourage _anyway_ (ie we should
try to use helper functions for people to do things like "get the next
event data", not only because the headers are going to be odd due to
trying to pack things together, but because maybe we can more easily
extend on them later that way when nobody accesses the headers by hand).
And I don't think normalizing later is in any way more fundamentally hard.
It just means that you do part of the expensive things after you have
gathered the trace, rather than during.

Agree with you on doing the expensive stuff later. If we wanted to get
something that'd pack down to a couple fewer bits, and approximate ns,
we could always >> 1 if you were > 2GHz, and >> 2 if you where > 4GHz,
etc. which is at least cheap.

But do we really need more than 3 bits for size anyway? 28 bytes
would fit most events such as system calls, interrupts, page faults,
all the common stuff.. Longer than that starts to be expensive
in memory consumption anyway, and if you're logging 32 bytes
or more ... 2 extra bytes for a length field is a small overhead to pay.

M.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 15:40:14 UTC

Post by Linus Torvalds
And I don't think normalizing later is in any way more fundamentally
hard. It just means that you do part of the expensive things after
you have gathered the trace, rather than during.

... which is exactly what sched_clock() does, combined with a
multiplication. (which is about as expensive as normal linear
arithmetics on most CPUs - i.e. in the 1 cycle range)

Normalizing has the advantage that we dont have to worry about it ever
again. Not about a changing scale due to cpufreq, slowing down or
speeding up TSCs due to C2/C3. We have so much TSC breakage all across
the spectrum that post-processing it is a nightmare in practice. Plus we
want sched_clock() to be fast anyway.

in the distant future we not only will have constant-TSC but it wont
stop in C2/C3 either at a whim (which they do right now, messing up
timestamps). At that stage fast time readout it will be so sane that CPU
makers should really provide a nanosec readout - it's easy to do a
simple multiplicator and hide the few cycles multiplicator latency to
RDTSC (this is continuous time after all so it's easy for the hw).

Hm?

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-25 16:30:20 UTC

... which is exactly what sched_clock() does, combined with a
multiplication. (which is about as expensive as normal linear
arithmetics on most CPUs - i.e. in the 1 cycle range)
Normalizing has the advantage that we dont have to worry about it ever
again. Not about a changing scale due to cpufreq, slowing down or
speeding up TSCs due to C2/C3. We have so much TSC breakage all across
the spectrum that post-processing it is a nightmare in practice. Plus we
want sched_clock() to be fast anyway.
in the distant future we not only will have constant-TSC but it wont
stop in C2/C3 either at a whim (which they do right now, messing up
timestamps). At that stage fast time readout it will be so sane that CPU
makers should really provide a nanosec readout - it's easy to do a
simple multiplicator and hide the few cycles multiplicator latency to
RDTSC (this is continuous time after all so it's easy for the hw).
Hm?
Ingo

Hi Ingo,

The problem with sched_clock is that it gives a 1 HZ timestamp accuracy
for events happening across different CPUs. Within this 1 HZ range, it
uses the TSC and clip when it reaches a max. Good enough for scheduler
or for tracing events on a single CPU, but I think it is not exactly
what we need to reorder events happening across CPUs.

Mathieu

Steven Rostedt

2008-09-25 16:40:04 UTC

Post by Mathieu Desnoyers
The problem with sched_clock is that it gives a 1 HZ timestamp accuracy
for events happening across different CPUs. Within this 1 HZ range, it
uses the TSC and clip when it reaches a max. Good enough for scheduler
or for tracing events on a single CPU, but I think it is not exactly
what we need to reorder events happening across CPUs.

Hmm,

sched_clock gives ns accuracy unless the tsc is disabled. And in that
case, we don't have any CPU clock :-/

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-25 17:30:15 UTC

Hmm,
sched_clock gives ns accuracy unless the tsc is disabled. And in that
case, we don't have any CPU clock :-/

Even on architectures with non-synchronized TSCs ?

Mathieu

Post by Steven Rostedt
-- Steve

Steven Rostedt

2008-09-25 17:40:12 UTC

Post by Mathieu Desnoyers

Hmm,
sched_clock gives ns accuracy unless the tsc is disabled. And in that
case, we don't have any CPU clock :-/

Even on architectures with non-synchronized TSCs ?

Yep, even on those ;-)

-- Steve
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 16:50:38 UTC

Post by Ingo Molnar
... which is exactly what sched_clock() does, combined with a
multiplication. (which is about as expensive as normal linear
arithmetics on most CPUs - i.e. in the 1 cycle range)

First off, that's simply not true.

Yes, it happens to be true on modern x86-64 CPU's. But in very few other
places. Doing even just 64-bit multiples is _expensive_. It's not even
_near_ single-cycle.

Post by Ingo Molnar
Normalizing has the advantage that we dont have to worry about it ever
again. Not about a changing scale due to cpufreq, slowing down or
speeding up TSCs due to C2/C3. We have so much TSC breakage all across
the spectrum that post-processing it is a nightmare in practice.

Total and utter bullshit, all of it.

Have you forgotten all the oopses due to divide-by-zero because
sched_clock() was called early? All that early code that we might well
want to trace through?

Not only that, but have you forgotten about FTRACE and -pg? Which means
that every single C function calls into tracing code, and that can
basically only be disabled on a per-file basis?

As for C2/C3 - that's just an argument for *not* doing anything at trace
time. What do you think happens when you try to trace through those
things? You're much better off trying to sort out the problems later, when
you don't hold critical locks and are possibly deep down in some buggy
ACPI code, and you're trying to trace it exactly _because_ it is buggy.

The thing is, the trace timestamp generation should be at least capable of
being just a couple of versions of assembly language. If you cannot write
it in asm, you lose. You cannot (and MUST NOT) use things like a
virtualized TSC by mistake. If the CPU doesn't natively support 'rdtsc' in
hardware on x86, for example, you have to have another function altogether
for the trace timestamp.

And no way in hell do we want to call complex indirection chains that take
us all over the map and have fragile dependencies that we have already hit
several times wrt things like cpufreq.

WE ARE MUCH BETTER OFF WITH EVEN _INCORRECT_ TIME THAN WE ARE WITH FRAGILE
TRACE INFRASTUCTURE.

Post by Ingo Molnar
Plus we want sched_clock() to be fast anyway.

Yeah. And we want system calls to be _really_ fast, because they are even
more critical than the scheduler. So maybe we can use a "gettime()" system
call.

IOW, your argument is a non-argument. No way in HELL do we want to mix up
sched_clock() in tracing. Quite the reverse. We want to have the ability
to trace _into_ sched_clock() and never even have to think about it!

TSC is not pefect, but (a) it's getting better (as you yourself point
out), and in fact most other architectures already have the better
version. And (b) it's the kind of simplicity that we absolutely want.

Do you realize, for example, that a lot of architectures really only have
a 32-bit TSC, and they have to emulate a 64-bit one (in addition to
conveting it to nanoseconds using divides) for the sched_clock()? They'd
almost certainly be much better off able to just use their native one
directly.

Yeah, it would probably cause some code duplication, but the low-leel
trace infrastructure really is special. It can't afford to call other
subsystems helper functions, because people want to trace _those_.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-25 17:00:23 UTC

Post by Linus Torvalds
Not only that, but have you forgotten about FTRACE and -pg? Which means
that every single C function calls into tracing code, and that can
basically only be disabled on a per-file basis?

Slight correction. You can annotate the function with "notrace" and
that function will not be traced. So the "only be disabled on a per-file
basis" statement is false.

Post by Linus Torvalds
Yeah, it would probably cause some code duplication, but the low-leel
trace infrastructure really is special. It can't afford to call other
subsystems helper functions, because people want to trace _those_.

Currently my code calls "ring_buffer_time_stamp" to get the time stamp,
whatever it will be. Currently it is using sched_clock, but since I have
it as a wrapper, it shouldn't be too hard to modify later.

I'll also add a "ring_buffer_time_stamp_normalize(ts)" function to be
called on reading of the trace. This will normalize whatever time stamp
that we use back to ns for the users. For now it will just return "ts"
since sched_clock is already normalize.

IOW, for my current work, I don't care what timing we use. Others do, and
I will try to make the tracing infrastructure let it be easy to change
what is used. For now, I'm concentrating on the infrastructure itself.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 17:20:07 UTC

Post by Steven Rostedt
Slight correction. You can annotate the function with "notrace" and
that function will not be traced. So the "only be disabled on a per-file
basis" statement is false.

Ok. It's still true that we absolutely don't want to add random notrace
markers to code just because it's shared with the scheduler. And
"sched_clock()" is not a single function with just a few well-defined
places, nor are all versions of it at all appropriate for tracing (the
non-TSC ones are a total joke - it works for scheduling, but not tracing.
Same goes for the virtualized versions).

Post by Steven Rostedt
Currently my code calls "ring_buffer_time_stamp" to get the time stamp,
whatever it will be. Currently it is using sched_clock, but since I have
it as a wrapper, it shouldn't be too hard to modify later.

Yes. The code looked fine, and had a FIXME. I have no objection to using
it as a known buggy approximation for TSC in order to not force every
architecture to immediately write one when the patch is discussed. But I
literally would expect that on x86, we'd basically just have a function
that does "rdtsc" for the common case, along with possibly a generic
fallback that does "xadd" in the absense of any other reasonable
alternative.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 20:00:26 UTC

Post by Steven Rostedt
Slight correction. You can annotate the function with "notrace" and
that function will not be traced. So the "only be disabled on a
per-file basis" statement is false.

Ok. It's still true that we absolutely don't want to add random
notrace markers to code just because it's shared with the scheduler.

firstly, for the sake of full disclosure, the very first versions of the
latency tracer (which, through hundreds of revisions, morphed into
ftrace), used raw TSC timestamps.

I stuck to that simple design for a _long_ time because i shared your
exact views about robustness and simplicity. But it was pure utter
nightmare to get the timings right after the fact, and i got a _lot_ of
complaints about the quality of timings, and i could never _trust_ the
timings myself for certain types of analysis.

So i eventually went to the scheduler clock and never looked back.

So i've been there, i've done that. In fact i briefly tried to use the
_GTOD_ clock for tracing - that was utter nightmare as well, because the
scale and breath of the GTOD code is staggering.

cpu_clock() proved to be a good middle ground. I'm not a believer in any
of this stuff, i just react to how things behave in practice, as none of
this is really easy to get right from the theoretical angle.

... so we can certainly go back to the TSC again, i'll be the last one
to complain about extra tracing performance and it will certainly make
it less fragile. Maybe i was wrong in declaring that a dead end and it
will work out fine. [ Worst-case we'll go back to the sched_clock()
again, that will always be an easy option. ]

... and regarding sched.o's notrace marking, we have had functional -pg
tracing of sched.o for ages, and we could enable it right now. We can
trace inside the runqueue lock just fine. Note: we indeed have commit
c349e0a0 that proves that this stuff can lock up.

But most notrace markings are not for robustness reasons at all. They
have two purposes:

1) correctness. sched_clock() itself should be recursion free. (Even
_that_ has a recursion check btw., so normally it shouldnt lock up -
it just wont produce very nice traces.)

2) we use notrace and -no-pg to filter out very high-frequency and
mostly uninteresting debug function calls. lockdep.o is an example,
but sched.o was done due to that too. (and because sched.o had
sched_clock() bits - but that's now all in a separate file.)

They are marked notrace globally for performance and information density
reasons - if you've ever seen lockdep graph walking in traces you'll
know what i mean. But ftrace is robust enough to trace inside those
places. We just dont want to by default.

Post by Linus Torvalds
And "sched_clock()" is not a single function with just a few
well-defined places, nor are all versions of it at all appropriate for
tracing (the non-TSC ones are a total joke - it works for scheduling,
but not tracing. Same goes for the virtualized versions).

that's true. I'd not mind (at all) having an assembly version of
cpu_clock() or so. Because, believe me, people absolutely depend on
accurate and dependable timestamps, and i depend on them quite often
when i use traces.

There are two reasons for that:

1) they want to understand the exact timings on a single CPU, often
across multiple idle points. How do you propose we solve the
TSC-stops-in-idle problem?

2) they want to understand timings and ordering of events on
multiple CPUs. For example there's a suspected race condition and i'd
like to see the order of events. With TSC post-processing i can tell
you that it's almost impossible to reach the current ~1-10 usec
cross-CPU timing accuracy.

Note that these usecases have proven to be _FAR_ more important in
practice than 'to stream gobs of data to user-space', so we have to give
a design answer to them.

If sched_clock() is broken then the kernel wont work anyway. And i never
wanted to trace inside sched_clock() itself. If you want to, we can
track TSCs and approximate time after the fact, but i can predict it to
you right now that it's pretty stupid to do and we'll eventually have to
fix it. Chances are that we'll end up with a parallel tracer clock
implementation that will resemble sched_clock() in essence.

So i think the better approach is to introduce a trace_clock() that is
based on the TSC and gives nanosec timestamps, and to make sched_clock()
use _that_. I really think it's wrong to have two kinds of clocks in the
system that share so many goals.

anyway ... we can do TSC based timestamps too, it will just be far less
useful for the things _i_ used the tracer for in the past.

So ... i think that i understand 100% of your arguments (i've been
there, i've done that), but i think that you understand only about 50%
of my arguments. So we definitely need to talk more ;)

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 20:20:13 UTC

Post by Ingo Molnar
firstly, for the sake of full disclosure, the very first versions of
the latency tracer (which, through hundreds of revisions, morphed into
ftrace), used raw TSC timestamps.
I stuck to that simple design for a _long_ time because i shared your
exact views about robustness and simplicity. But it was pure utter
nightmare to get the timings right after the fact, and i got a _lot_
of complaints about the quality of timings, and i could never _trust_
the timings myself for certain types of analysis.
So i eventually went to the scheduler clock and never looked back.
So i've been there, i've done that. In fact i briefly tried to use the
_GTOD_ clock for tracing - that was utter nightmare as well, because
the scale and breath of the GTOD code is staggering.

heh, and i even have a link for a latency tracing patch for 2005 that is
still alive that proves it:

http://people.redhat.com/mingo/latency-tracing-patches/patches/latency-tracing.patch

(dont look at the quality of that code too much)

It has this line for timestamp generation:

+ timestamp = get_cycles();

i.e. we used the raw TSC, we used RDTSC straight away, and we used that
for _years_, literally.

So i can tell you my direct experience with it: i had far more problems
with the tracer due to inexact timings and traces that i could not
depend on, than i had problems with sched_clock() locking up or
crashing.

Far more people complained about the accuracy of timings than about
performance or about the ability (or inability) to stream gigs of
tracing data to user-space.

It was a very striking difference:

- every second person who used the tracer observed that the timings
looked odd at places.

- only every 6 months has someone asked whether he could save
gigabytes of trace data.

For years i maintained a tracer with TSC timestamps, and for years i
maintained another tracer that used sched_clock(). Exact timings are a
feature most people are willing to spend extra cycles on.

You seem to dismiss that angle by calling my arguments bullshit, but i
dont know on what basis you dismiss it. Sure, a feature and extra
complexity _always_ has a robustness cost. If your argument is that we
should move cpu_clock() to assembly to make it more dependable - i'm all
for it.

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-25 20:30:20 UTC

heh, and i even have a link for a latency tracing patch for 2005 that is
http://people.redhat.com/mingo/latency-tracing-patches/patches/latency-tracing.patch
(dont look at the quality of that code too much)
+ timestamp = get_cycles();
i.e. we used the raw TSC, we used RDTSC straight away, and we used that
for _years_, literally.
So i can tell you my direct experience with it: i had far more problems
with the tracer due to inexact timings and traces that i could not
depend on, than i had problems with sched_clock() locking up or
crashing.
Far more people complained about the accuracy of timings than about
performance or about the ability (or inability) to stream gigs of
tracing data to user-space.
- every second person who used the tracer observed that the timings
looked odd at places.
- only every 6 months has someone asked whether he could save
gigabytes of trace data.
For years i maintained a tracer with TSC timestamps, and for years i
maintained another tracer that used sched_clock(). Exact timings are a
feature most people are willing to spend extra cycles on.
You seem to dismiss that angle by calling my arguments bullshit, but i
dont know on what basis you dismiss it. Sure, a feature and extra
complexity _always_ has a robustness cost. If your argument is that we
should move cpu_clock() to assembly to make it more dependable - i'm all
for it.
Ingo

Hi Ingo,

I completely agree with both Linus and you that accuracy utterly
matters. I currently provide a time source meant to meant the tracing
requirements and support architectures lacking synchronized TSC (or tsc
at all) in my lttng tree. Feel free to have a look. I've had statisfied
users relying on these time sources for about 3 years.

See the lttng-timestamp-* commits in
git://git.kernel.org/pub/scm/linux/kernel/git/compudj/linux-2.6-lttng.git

The one in question here (x86) is here. You'll see that everything fits
in a small header and can thus be inlined in the callers.

http://git.kernel.org/?p=linux/kernel/git/compudj/linux-2.6-lttng.git;a=blob;f=include/asm-x86/ltt.h;h=96ef292729a15d93af020ce5526669d220a1d795;hb=5fced7ecdac8ce65298ddbad191ce9fe998cfe9a

Mathieu

Linus Torvalds

2008-09-25 20:40:15 UTC

Post by Ingo Molnar
You seem to dismiss that angle by calling my arguments bullshit, but i
dont know on what basis you dismiss it. Sure, a feature and extra
complexity _always_ has a robustness cost. If your argument is that we
should move cpu_clock() to assembly to make it more dependable - i'm all
for it.

Umm. cpu_clock() isn't even cross-cpu synchronized, and has actually
thrown away all the information that can make it so, afaik. At least the
comments say "never more than 2 jiffies difference"). You do realize that
if you want to order events across CPU's, we're not talking about
"jiffies" here, we're talking about 50-100 CPU _cycles_.

You also ignore the early trace issues, and have apparently not used it
for FTRACE. You also ignore the fact that without TSC, it goes into the
same "crap mode" that is appropriate for the scheduler, but totally
useless for tracing.

IOW, you say that I call your arguments BS without telling you why, but
that's just because you apparently cut out all the things I _did_ tell you
why about!

The fact is, people who do tracing will want better clocks - and have
gotten with other infrastructure - than you have apparently cared about.
You've worried about scheduler tracing, and you seem to want to just have
everybody use a simple but known-bad approach that was good enough for
you.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 20:40:16 UTC

Post by Linus Torvalds
You also ignore the early trace issues, and have apparently not used it
for FTRACE. You also ignore the fact that without TSC, it goes into the
same "crap mode" that is appropriate for the scheduler, but totally
useless for tracing.

Oh, and I didn't notice (because Steven pointed out "notrace" and I didn't
see any of them), that in order to get things to work you had just added

CFLAGS_REMOVE_lockdep.o = -pg
CFLAGS_REMOVE_lockdep_proc.o = -pg
CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_sched.o = -mno-spe -pg

all ovr the place, which was part of my argument against this crap in the
first place.

Yes, by using all that common infrastructure, you can share some code, but
you will always hit that case that now you have serious issues with
actually marking it. Now the tracer has to have recursion detection if you
ever want to trace any function that might be used for the clock - and
quite frankly, especially with virtualization, it's not AT ALL obvious
what those are all the time..

That is exactly one of the examples I gave for _not_ doing this. Go back
and read my previous emails. Rather than talking about how I call your
arguments BS without saying why.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-25 20:50:22 UTC

Post by Linus Torvalds
CFLAGS_REMOVE_lockdep.o = -pg
CFLAGS_REMOVE_lockdep_proc.o = -pg
CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg

The above I added because they just made the function tracer output
quite ugly and bloated. The lockdep got messing when it was debugging
the locks used in the tracer that was tracing lockdep. The above had
nothing to do with what kind of clock we used. We added recursion
protection, but it was still producing ugly output.

-- Steve

Post by Linus Torvalds
CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_sched.o = -mno-spe -pg
all ovr the place, which was part of my argument against this crap in the
first place.

Steven Rostedt

2008-09-25 21:10:08 UTC

Post by Linus Torvalds
CFLAGS_REMOVE_lockdep.o = -pg
CFLAGS_REMOVE_lockdep_proc.o = -pg
CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
CFLAGS_REMOVE_sched.o = -mno-spe -pg

You'll also find in the lib Makefile:

ifdef CONFIG_FTRACE
ORIG_CFLAGS := $(KBUILD_CFLAGS)
KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
endif

Which removes all -pg flags from all in the lib directory. The reason is
that a lot of archs (well, I know PPC for sure) use these functions on
early boot up, where simply calling mcount will produce a page fault.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 21:20:12 UTC

ftrace has the same robustness design as lockdep has: as little
external infrastructure dependencies as possible. And lockdep has
recursion checks too, and excessive amounts of paranoia all around the
place.
Ftrace has the same robustness philosophy too, and yes, despite that
we judged cpu_clock() to be worth the risk, because accurate and fast
timestamps are a feature and we didnt want to duplicate.

and note that there's another pragmatic argument: often we notice
cpu_clock() bugs by looking at traces. I.e. people fixing trace
timestamps _fix the scheduler_. Sometimes it is very hard to notice
scheduling artifacts that happen due to small inaccuracies in
cpu_clock().

so there's continuous coupling between precise scheduling and good trace
timestamps. I'd be willing to pay a lot more for that than the few
(rather obvious...) robustness problems we had with sched_clock() in the
past.

anyway ... i'm not _that_ attached to the idea, we can certainly go back
to the original ftrace method of saving raw TSC timestamps and
postprocessing. I think users will quickly force us back to a more
dependable clock, and if not then you were right and i was wrong ;-)

In fact even when we used sched_clock() there were some artifacts: as
you pointed it out we dont want to do per event cross-CPU
synchronization by default as that is very expensive. Some people wanted
GTOD clock for tracing and we very briefly tried that - but that was an
utter maintenance nightmare in practice.

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 21:20:15 UTC

Post by Linus Torvalds
You also ignore the early trace issues, and have apparently not used
it for FTRACE. You also ignore the fact that without TSC, it goes
into the same "crap mode" that is appropriate for the scheduler, but
totally useless for tracing.

Really, we traced all these files for ages. I can restore it if it's
worthwile - but lockdep totally kills the usability of function traces,
it inserts thousands of uninteresting events over and over again.

Note commit c349e0a0, there i added -no-pg for robustness reasons (we
locked up) - back then the scheduler clock was within sched.c. Now it is
all separated out cleanly in kernel/sched_clock.o and i think we can add
-pg to sched.o again and trace it.

Post by Linus Torvalds
Yes, by using all that common infrastructure, you can share some code,
but you will always hit that case that now you have serious issues
with actually marking it. Now the tracer has to have recursion
detection if you ever want to trace any function that might be used
for the clock - and quite frankly, especially with virtualization,
it's not AT ALL obvious what those are all the time..

yes, that's true. And that's why we absolutely want to have recursion
detection anyway - even given ftrace's _totally_ conservative design
it's _very_ easy to accidentally recurse somewhere.

ftrace has the same robustness design as lockdep has: as little external
infrastructure dependencies as possible. And lockdep has recursion
checks too, and excessive amounts of paranoia all around the place.

Ftrace has the same robustness philosophy too, and yes, despite that we
judged cpu_clock() to be worth the risk, because accurate and fast
timestamps are a feature and we didnt want to duplicate.

If then i'd rather move towards simplifying sched_clock() even more -
that's important for the scheduler too. The scheduler clock has _very_
similar requirements to the tracer clock.

Post by Linus Torvalds
That is exactly one of the examples I gave for _not_ doing this. Go
back and read my previous emails. Rather than talking about how I call
your arguments BS without saying why.

we lived with these kinds of complications for years literally, and i'm
not that stupid to not simplify a debugging framework when i can do it -
and i'd not mind risk reduction. Paravirt is hugely misdesigned piece of
PITA, for basically every piece of infrastructure that we have: locking,
GTOD, lockdep, and yes, the tracer too.

So ... if you could suggest a method of how we could get good timestamps
in a post-processed method with TSC timestamps, that would be great.
That would solve all this discussion and i'm not going to argue against
saving raw TSC timestamps, as they _are_ simpler and more robust.

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 21:50:15 UTC

Post by Linus Torvalds
You also ignore the early trace issues, and have apparently not used
it for FTRACE. You also ignore the fact that without TSC, it goes
into the same "crap mode" that is appropriate for the scheduler, but
totally useless for tracing.

to prove it, i just applied this patch:

Index: linux/kernel/Makefile
===================================================================
--- linux.orig/kernel/Makefile
+++ linux/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -mno-spe -pg
endif

obj-$(CONFIG_PROFILING) += profile.o

and sched.o was fully traced again. For example schedule() to idle is 38
function calls:

$ cd /debug/tracing
$ echo ftrace > current_tracer
$ cat trace

# tracer: ftrace
#
# TASK-PID CPU# TIMESTAMP FUNCTION
# | | | | |
[...]
ssh-2734 [001] 52.291772: schedule <-schedule_timeout
ssh-2734 [001] 52.291772: hrtick_clear <-schedule
ssh-2734 [001] 52.291774: _spin_lock <-schedule
ssh-2734 [001] 52.291775: deactivate_task <-schedule
ssh-2734 [001] 52.291776: dequeue_task <-deactivate_task
ssh-2734 [001] 52.291777: dequeue_task_fair <-dequeue_task
ssh-2734 [001] 52.291778: update_curr <-dequeue_task_fair
ssh-2734 [001] 52.291779: calc_delta_mine <-update_curr
ssh-2734 [001] 52.291780: hrtick_start_fair <-dequeue_task_fair
ssh-2734 [001] 52.291782: find_busiest_group <-schedule
ssh-2734 [001] 52.291783: idle_cpu <-find_busiest_group
ssh-2734 [001] 52.291784: target_load <-find_busiest_group
ssh-2734 [001] 52.291785: weighted_cpuload <-target_load
ssh-2734 [001] 52.291786: weighted_cpuload <-find_busiest_group
ssh-2734 [001] 52.291787: cpu_avg_load_per_task <-find_busiest_group
ssh-2734 [001] 52.291788: source_load <-find_busiest_group
ssh-2734 [001] 52.291789: weighted_cpuload <-source_load
ssh-2734 [001] 52.291790: weighted_cpuload <-find_busiest_group
ssh-2734 [001] 52.291791: cpu_avg_load_per_task <-find_busiest_group
ssh-2734 [001] 52.291792: msecs_to_jiffies <-schedule
ssh-2734 [001] 52.291792: msecs_to_jiffies <-schedule
ssh-2734 [001] 52.291793: put_prev_task_fair <-schedule
ssh-2734 [001] 52.291795: pick_next_task_fair <-schedule
ssh-2734 [001] 52.291796: pick_next_task_rt <-schedule
ssh-2734 [001] 52.291796: pick_next_task_fair <-schedule
ssh-2734 [001] 52.291797: pick_next_task_idle <-schedule
ssh-2734 [001] 52.291798: _spin_trylock <-tracing_record_cmdline
ssh-2734 [001] 52.291800: _spin_unlock <-tracing_record_cmdline
ssh-2734 [001] 52.291802: __switch_to <-thread_return
<idle>-0 [001] 52.291804: finish_task_switch <-thread_return
<idle>-0 [001] 52.291805: _spin_unlock_irq <-finish_task_switch
<idle>-0 [001] 52.291806: tick_nohz_stop_sched_tick <-cpu_idle
<idle>-0 [001] 52.291807: ktime_get <-tick_nohz_stop_sched_tick
<idle>-0 [001] 52.291807: ktime_get_ts <-ktime_get
<idle>-0 [001] 52.291808: getnstimeofday <-ktime_get_ts
<idle>-0 [001] 52.291808: acpi_pm_read <-getnstimeofday
<idle>-0 [001] 52.291810: set_normalized_timespec <-ktime_get_ts
<idle>-0 [001] 52.291811: get_next_timer_interrupt <-tick_nohz_stop_sched_tick

it worked just fine in my first blind attempt.

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 22:00:25 UTC

Post by Ingo Molnar
<idle>-0 [001] 52.291807: ktime_get <-tick_nohz_stop_sched_tick
<idle>-0 [001] 52.291807: ktime_get_ts <-ktime_get
<idle>-0 [001] 52.291808: getnstimeofday <-ktime_get_ts
<idle>-0 [001] 52.291808: acpi_pm_read <-getnstimeofday
<idle>-0 [001] 52.291810: set_normalized_timespec <-ktime_get_ts
<idle>-0 [001] 52.291811: get_next_timer_interrupt <-tick_nohz_stop_sched_tick
it worked just fine in my first blind attempt.

here is the observations from my second blind attempt - this time to
show that we can trace even lockdep internals just fine.

I applied the patch attached below: it removes all the -pg removals from
kernel/Makefile and restores the notrace markings in kernel/lockdep.c
that commit 1d09daa5 ("ftrace: use Makefile to remove tracing from
lockdep") removed.

then i enabled the whole lockdep machinery (to insert as much extra code
as possible):

CONFIG_DEBUG_SPINLOCK=y
CONFIG_DEBUG_MUTEXES=y
CONFIG_DEBUG_LOCK_ALLOC=y
CONFIG_PROVE_LOCKING=y
CONFIG_LOCKDEP=y
CONFIG_LOCK_STAT=y
CONFIG_DEBUG_LOCKDEP=y
CONFIG_TRACE_IRQFLAGS=y

and enabled ftrace function tracing:

CONFIG_FTRACE=y
CONFIG_DYNAMIC_FTRACE=y
CONFIG_FTRACE_MCOUNT_RECORD=y

[ mcount-record is a new ftrace feature for v2.6.28 ]

and rebuilt and rebooted into the kernel, and enabled ftrace.

It worked just fine this time too, and i was able to trace inside
lockdep and lockstat internals:

<idle>-0 [001] 71.120828: _spin_lock <-tick_do_update_jiffies64
<idle>-0 [001] 71.120829: __lock_acquire <-lock_acquire
<idle>-0 [001] 71.120829: lock_acquired <-_spin_lock
<idle>-0 [001] 71.120829: print_lock_contention_bug <-lock_acquired
<idle>-0 [001] 71.120830: do_timer <-tick_do_update_jiffies64

this wasnt done since May this year (since commit 1d09daa5).

It's absolutely not unrobust. Yes, more code will always regress more
than no code, but it's a cost/benefit balance not a design must-have.
And to prove me wrong i'm sure we'll have some really bad cpu_clock()
regression within the next 24 hours ;)

Why did we do commit 1d09daa5, 6ec56232 and c349e0a0 which marked all
these .o's notrace? Partly to address a real (meanwhile fixed)
regression wrt. sched_clock() [and this supports your point], partly to
remove extra unnecessary trace entries from debug infrastructure, and
partly out of pure paranoia: i didnt want to see _any_ ftrace lockup in
mainline, i wanted it to have the kind of almost perfect track record
that lockdep gathered.

Maybe that was the wrong thing to do as we hide certain trace entries -
we could certainly apply the patch below. Hm?

Ingo

------------->
Index: linux/kernel/Makefile
===================================================================
--- linux.orig/kernel/Makefile
+++ linux/kernel/Makefile
@@ -15,13 +15,11 @@ CFLAGS_REMOVE_sched.o = -mno-spe

ifdef CONFIG_FTRACE
# Do not trace debug files and internal ftrace files
-CFLAGS_REMOVE_lockdep.o = -pg
CFLAGS_REMOVE_lockdep_proc.o = -pg
CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_sched.o = -mno-spe -pg
endif

obj-$(CONFIG_PROFILING) += profile.o
Index: linux/kernel/lockdep.c
===================================================================
--- linux.orig/kernel/lockdep.c
+++ linux/kernel/lockdep.c
@@ -283,14 +283,14 @@ static struct list_head chainhash_table[
((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
(key2))

-void lockdep_off(void)
+notrace void lockdep_off(void)
{
current->lockdep_recursion++;
}

EXPORT_SYMBOL(lockdep_off);

-void lockdep_on(void)
+notrace void lockdep_on(void)
{
current->lockdep_recursion--;
}
@@ -1136,7 +1136,7 @@ find_usage_forwards(struct lock_class *s
* Return 1 otherwise and keep <backwards_match> unchanged.
* Return 0 on error.
*/
-static noinline int
+static noinline notrace int
find_usage_backwards(struct lock_class *source, unsigned int depth)
{
struct lock_list *entry;
@@ -1748,7 +1748,7 @@ static inline int validate_chain(struct
* We are building curr_chain_key incrementally, so double-check
* it from scratch, to make sure that it's done correctly:
*/
-static void check_chain_key(struct task_struct *curr)
+static notrace void check_chain_key(struct task_struct *curr)
{
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
@@ -2122,7 +2122,7 @@ static int mark_lock_irq(struct task_str
/*
* Mark all held locks with a usage bit:
*/
-static int
+static notrace int
mark_held_locks(struct task_struct *curr, int hardirq)
{
enum lock_usage_bit usage_bit;
@@ -2169,7 +2169,7 @@ void early_boot_irqs_on(void)
/*
* Hardirqs will be enabled:
*/
-void trace_hardirqs_on_caller(unsigned long a0)
+void notrace trace_hardirqs_on_caller(unsigned long a0)
{
struct task_struct *curr = current;
unsigned long ip;
@@ -2215,7 +2215,7 @@ void trace_hardirqs_on_caller(unsigned l
}
EXPORT_SYMBOL(trace_hardirqs_on_caller);

-void trace_hardirqs_on(void)
+void notrace trace_hardirqs_on(void)
{
trace_hardirqs_on_caller(CALLER_ADDR0);
}
@@ -2224,7 +2224,7 @@ EXPORT_SYMBOL(trace_hardirqs_on);
/*
* Hardirqs were disabled:
*/
-void trace_hardirqs_off_caller(unsigned long a0)
+void notrace trace_hardirqs_off_caller(unsigned long a0)
{
struct task_struct *curr = current;

@@ -2249,7 +2249,7 @@ void trace_hardirqs_off_caller(unsigned
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);

-void trace_hardirqs_off(void)
+void notrace trace_hardirqs_off(void)
{
trace_hardirqs_off_caller(CALLER_ADDR0);
}
@@ -2415,7 +2415,7 @@ static inline int separate_irq_context(s
/*
* Mark a lock with a usage bit, and validate the state transition:
*/
-static int mark_lock(struct task_struct *curr, struct held_lock *this,
+static notrace int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
unsigned int new_mask = 1 << new_bit, ret = 1;
@@ -2869,7 +2869,7 @@ __lock_release(struct lockdep_map *lock,
/*
* Check whether we follow the irq-flags state precisely:
*/
-static void check_flags(unsigned long flags)
+static notrace void check_flags(unsigned long flags)
{
#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
defined(CONFIG_TRACE_IRQFLAGS)
@@ -2927,7 +2927,7 @@ EXPORT_SYMBOL_GPL(lock_set_subclass);
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
*/
-void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+notrace void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
int trylock, int read, int check,
struct lockdep_map *nest_lock, unsigned long ip)
{
@@ -2948,7 +2948,7 @@ void lock_acquire(struct lockdep_map *lo

EXPORT_SYMBOL_GPL(lock_acquire);

-void lock_release(struct lockdep_map *lock, int nested,
+notrace void lock_release(struct lockdep_map *lock, int nested,
unsigned long ip)
{
unsigned long flags;
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 22:10:15 UTC

Now do the same on a CPU that doesn't have TSC. And notice how useless the
timestamps are.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 22:20:10 UTC

Post by Linus Torvalds
Now do the same on a CPU that doesn't have TSC. And notice how useless
the timestamps are.

i do not understand this argument of yours. (really)

1) is your point that we might lock up?

2) or perhaps that the timestamps update only once every jiffy, and are
in essence useless because they show the same value again and again?

the latter is true, and that's why we were pushed hard in the past by
tracer users towards using GTOD timestamps. Everyone's favorite
suggestion was: "why dont you use gettimeofday internally in the
tracer???".

We resisted that because GTOD timestamps are totally crazy IMO:

- it is 1-2 orders of magnitude more code than cpu_clock() and
all sched_clock() variants altogether.

- it's also pretty fragile code that uses non-trivial locking
internally.

- pmtimer takes like 6000-10000 cycles to read. hpet ditto. Not to talk
about the PIT. Same on other architectures.

[ ... and as usual, only Sparc64 is sane in this field. ]

for a some time we had a runtime option in the latency tracer that
allowed the GTOD clock to be used (default-off) - but even that one was
too much and too fragile so we removed it - it never got upstream.

Fortunately this is not a big issue as almost everything on this planet
that runs Linux and has a kernel developer or user sitting in front of
it has a TSC - and if it doesnt have a TSC it doesnt have any other
high-precision time source to begin with. So worst-case sched_clock()
falls back to a sucky jiffies approximation:

unsigned long long __attribute__((weak)) sched_clock(void)
{
return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
}

3) ... or perhaps is it your point more highlevel, that we shouldnt be
dealing with timestamps in a central manner _at all_ in the tracer, and
we should make them purely optional?

I indeed _had_ a few cases (bugs i debugged) where i was not interested
at all in the timestamps, just in their relative ordering. For that we
had a switch in the latency tracer that turned on (expensive!) central
synchronization [a shared global atomic counter] between traced events.
After some struggling it died a quick and peaceful death.

In that sense the global counter was a kind of 'time' though.

4) ... or if you have some other point which you already mentioned
before then i totally missed it and apologize. :-/

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 23:40:09 UTC

Post by Ingo Molnar
i do not understand this argument of yours. (really)
1) is your point that we might lock up?

Have you at all followed the discussion about the people who asked for
cross-CPU ordering? They wanted not timestamps at all, but global atomic
counter updates. Which is very reasonable, if timestamps don't work (and
jiffies certainly doesn't, especially in a NOHZ environment).

IOW, my whole argument is that what tracers want is _not_ the same thing
as what "sched_clock()" wants.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 21:00:15 UTC

Post by Ingo Molnar
You seem to dismiss that angle by calling my arguments bullshit, but
i dont know on what basis you dismiss it. Sure, a feature and extra
complexity _always_ has a robustness cost. If your argument is that
we should move cpu_clock() to assembly to make it more dependable -
i'm all for it.

Steve got the _worst-case_ cpu_clock() difference down to 60 usecs not
so long ago. It might have regressed since then, it's really hard to do
it without cross-CPU synchronization.

( But it's not impossible, as Steve has proven it, because physical time
goes on linearly on each CPU so we have a chance to do it: by
accurately correlating the GTOD timestamps we get at to-idle/from-idle
times to the TSC. )

And note that i'm not only talking about cross-CPU synchronization, i'm
also talking about _single CPU_ timestamps. How do you get it right with
TSCs via a pure postprocessing method? A very large body of modern CPUs
will halt the TSC when they go into idle. (about 70% of the installed
base or so)

Note, we absolutely cannot do accurate timings in a pure
TSC-post-processing environment: unless you want to trace _every_
to-idle and from-idle event, which can easily be tens of thousands of
extra events per seconds.

What we could do perhaps is a hybrid method:

- save a GTOD+TSC pair at important events, such as to-idle and
from-idle, and in the periodic sched_tick(). [ perhaps also save it
when we change cpufreq. ]

- save the (last_GTOD, _relative_-TSC) pair in the trace entry

with that we have a chance to do good post-processed correlation - at
the cost of having 12-16 bytes of timestamp, per trace entry.

Or we could upscale the GTOD to 'TSC time', at go-idle and from-idle.
Which is rather complicated with cpufreq - which frequency do we want to
upscale to if we have a box with three available frequencies? We could
ignore cpufreq altogether - but then there goes dependable tracing on
another range of boxes.

Post by Linus Torvalds
You also ignore the early trace issues, and have apparently not used
it for FTRACE. [...]

i very much used early code tracing with ftrace in the past. In fact
once i debugged and early boot hang that happened so early before
_PRINTK_ was not functional yet (!).

So, to solve this bug, i hacked ftrace to use early_printk(), to print
out the last 10,000 functions executed before the hang - and that's how
i found the reason for the hang - i captured a huge trace via a serial
console. It was dead slow to capture, but it worked and sched_clock()
worked just fine in that kind of usecase as well.

[ Note that we added tracing/fastboot recently (for v2.6.28), to enable
the tracing of early boot code timings. Havent had a problem with it
yet on x86. ]

Post by Linus Torvalds
[...] You also ignore the fact that without TSC, it goes into the same
"crap mode" that is appropriate for the scheduler, but totally useless
for tracing.

i havent used a TSC-less CPU in 10 years, i'm not sure i get this point
of yours. (and IIRC the division by zero was exactly on such CPUs where
we divided by cpu_khz - that's why it could even regress.)

note that sched_clock() will use the TSC whenever it is there physically
- even if GTOD does not use it anymore.

Post by Linus Torvalds
IOW, you say that I call your arguments BS without telling you why,
but that's just because you apparently cut out all the things I _did_
tell you why about!
The fact is, people who do tracing will want better clocks - and have
gotten with other infrastructure - than you have apparently cared
about. You've worried about scheduler tracing, and you seem to want to
just have everybody use a simple but known-bad approach that was good
enough for you.

i wrote my first -pg/mcount based tracer about 11 years ago, to learn
more about the kernel. I traced everything with it. I then used it to
find performance bottlenecks in the kernel, and i used it to learn
kernel internals - when i saw a function in the trace that i did not
recognize, i read the source code.

Scheduler tracing came much later into the picture - the -pg tracer was
written well _before_ it was used for latency tracing purposes. But it
is indeed a pretty popular use of it. (but by no means the only one)

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-25 21:20:07 UTC

Post by Ingo Molnar
You seem to dismiss that angle by calling my arguments bullshit, but
i dont know on what basis you dismiss it. Sure, a feature and extra
complexity _always_ has a robustness cost. If your argument is that
we should move cpu_clock() to assembly to make it more dependable -
i'm all for it.

Steve got the _worst-case_ cpu_clock() difference down to 60 usecs not
so long ago. It might have regressed since then, it's really hard to do
it without cross-CPU synchronization.
( But it's not impossible, as Steve has proven it, because physical time
goes on linearly on each CPU so we have a chance to do it: by
accurately correlating the GTOD timestamps we get at to-idle/from-idle
times to the TSC. )
And note that i'm not only talking about cross-CPU synchronization, i'm
also talking about _single CPU_ timestamps. How do you get it right with
TSCs via a pure postprocessing method? A very large body of modern CPUs
will halt the TSC when they go into idle. (about 70% of the installed
base or so)
Note, we absolutely cannot do accurate timings in a pure
TSC-post-processing environment: unless you want to trace _every_
to-idle and from-idle event, which can easily be tens of thousands of
extra events per seconds.
- save a GTOD+TSC pair at important events, such as to-idle and
from-idle, and in the periodic sched_tick(). [ perhaps also save it
when we change cpufreq. ]
- save the (last_GTOD, _relative_-TSC) pair in the trace entry
with that we have a chance to do good post-processed correlation - at
the cost of having 12-16 bytes of timestamp, per trace entry.
Or we could upscale the GTOD to 'TSC time', at go-idle and from-idle.
Which is rather complicated with cpufreq - which frequency do we want to
upscale to if we have a box with three available frequencies? We could
ignore cpufreq altogether - but then there goes dependable tracing on
another range of boxes.

The "full timestamp" records should include:

* absolute tsc
* absolute monotonic timestamp
* new tsc freqency

If you then make sure that all the cpufreq/idle/suspend-resume code
emits appropriate records when changing the tsc frequency, then you
should always be able to fully regenerate an absolute timestamp.

If you generate the monotonic timestamp with a good clocksource, then
you should be able to correlate the timestamps between cpus.

Oddly enough, this is identical to the Xen clocksource's use of the tsc ;)

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Martin Bligh

2008-09-25 21:20:15 UTC

Post by Ingo Molnar
- save a GTOD+TSC pair at important events, such as to-idle and
from-idle, and in the periodic sched_tick(). [ perhaps also save it
when we change cpufreq. ]

We did GTOD, but I think it's a bad idea, because NTP moves it.

Post by Ingo Molnar
- save the (last_GTOD, _relative_-TSC) pair in the trace entry
with that we have a chance to do good post-processed correlation - at
the cost of having 12-16 bytes of timestamp, per trace entry.
Or we could upscale the GTOD to 'TSC time', at go-idle and from-idle.
Which is rather complicated with cpufreq - which frequency do we want to
upscale to if we have a box with three available frequencies? We could
ignore cpufreq altogether - but then there goes dependable tracing on
another range of boxes.

Simple solution: turn off cpufreq whilst tracing is on ;-)

Harder: Keep a timebase and frequency divisor on a per-cpu basis
and calculate your offsets from there. This brings you down to
HPET resolution though
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 20:30:15 UTC

Post by Ingo Molnar
cpu_clock() proved to be a good middle ground. I'm not a believer in
any of this stuff, i just react to how things behave in practice, as
none of this is really easy to get right from the theoretical angle.

[...]

Post by Ingo Molnar
If sched_clock() is broken then the kernel wont work anyway. And i never
wanted to trace inside sched_clock() itself. [...]

note, CONFIG_LOCKSTAT - which hooks deep inside lockdep, uses
cpu_clock() too for timings, for similar reasons. Lockdep and lockstat
both have a very robust design and a very good track record to prove it.

you'd be correct in pointing out that we _do_ have a relatively high
regression count in cpu_clock()/sched_clock(), but the reason for that
is that its implementation balances on the narrow edge of doability. It
implements a very unstable set of requirements: "absolutely fast" pitted
against "as accurate as possible".

That is two conflicting requirements and it is a very fine line to walk
and everyone tries to have their own variant of it and their own
balance. We try as hard as possible to use the TSC even in face of
C2/C3, cpufreq, unstable TSCs, etc. The moment we go too much towards
performance we regress accuracy and hurt the scheduler's quality and
vice versa.

and note that my years long experience in the tracing field show that it
has a _very_ similar need for accuracy versus performance, so it was a
good match for ftrace.

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-25 21:10:16 UTC

Post by Linus Torvalds
As for C2/C3 - that's just an argument for *not* doing anything at trace
time. What do you think happens when you try to trace through those
things? You're much better off trying to sort out the problems later, when
you don't hold critical locks and are possibly deep down in some buggy
ACPI code, and you're trying to trace it exactly _because_ it is buggy.

That suggests that frequency changes should be recorded at a lower layer
as well, along with full timestamps and deltas, so that a log parser can
correctly generate the timestamps without having to parse higher-level
trace records. Maybe the simplest thing to do is make the full
timestamp records also include frequency and offset information (to deal
with discontinuities caused by things like a vcpu switching between
physical cpus with unsynced tscs).

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 22:00:20 UTC

Post by Jeremy Fitzhardinge
That suggests that frequency changes should be recorded at a lower layer
as well

Yes and no.

The reason I say "and no" is that it's not technically really possible to
atomically give the exact TSC at which the frequency change took place. We
just don't have the information, and I doubt we will ever have it.

As such, there is no point in trying to make it a low-level special op,
because we'd _still_ end up being totally equivalent with just doing as
regular trace-event, with a regular TSC field, and then just fill the data
field with the new frequency.

But yes, I do think we'd need to have that as a trace packet type. I
thought I even said so in my RFC for packet types. Ahh, it was in the

Post by Jeremy Fitzhardinge
I guess I should perhaps have put the TSC frequency in there in that "case
2" thing too. Maybe that should be in "data" (in kHz) and tv_sec/tv_nsec
should be in array[0..1], and the time sync packet would be 24 bytes.

but yes, we obviously need the frequency in order to calculate some kind
of wall-clock time (it doesn't _have_ to be in the same packet type as the
thing that tries to sync with a real clock, but it makes sense for it to
be there.

That said, if people think they can do a good job of ns conversion, I'll
stop arguing. Quite frankly, I think people are wrong about that, and
quite frankly, I think that anybody who looks even for one second at those
"alternate" sched_clock() implementations should realize that they aren't
suitable, but whatever. I'm not writing the code, I can only try to
convince people to not add the insane call-chains we have now.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 22:30:12 UTC

Post by Linus Torvalds
That said, if people think they can do a good job of ns conversion,
I'll stop arguing. Quite frankly, I think people are wrong about that,
and quite frankly, I think that anybody who looks even for one second
at those "alternate" sched_clock() implementations should realize that
they aren't suitable, but whatever. I'm not writing the code, I can
only try to convince people to not add the insane call-chains we have
now.

hm, i'd really hope hw makers see the light and actually make the hw do
it all. Signs are that they are cozying up to these ideas.

Good and fast timestamps are important, and it is _infinitely_ more easy
to do it in hw than in sw.

Firstly they need a low-frequency (10khz-100khz) shared clock line
across all CPUs. A single line - and since it's low frequency it could
be overlaid on some existing data line and filtered out. That works
across NUMA nodes as well and physics allows it to be nanosec accurate
up to dozens of meters or so. Then they need some really cheap way to
realize what absolute value the clock counts, and read it out every now
and then in the CPU, and approximate it inbetween, and have a secondary
stage cheap few-transitors long-latency multiplicator that keeps passing
on the nanosec-ish value to a register/MSR that can be read out by the
instruction.

This trivially works fine even if the CPU is turned off. It uses nary
any power as it's low freq, and can be spread across larger system
designs too. In fact it would be a totally exciting new capability for
things like analysis of SMP events. PEBS/BTS could be extended to save
this kind of timestamp, and suddenly one could see _very_ accurately
what happens between CPUs, without expensive bus snooping kit.

and CPUs wont go beyond the '~1nsec' event granularity for quite some
time anyway - so nanoseconds is not a time scale that gets obsoleted
quickly.

[ i guess this proves it that everyone has his pipe dream ;-) ]

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-25 22:50:09 UTC

Post by Ingo Molnar
Firstly they need a low-frequency (10khz-100khz) shared clock line
across all CPUs. A single line - and since it's low frequency it could
be overlaid on some existing data line and filtered out. That works
across NUMA nodes as well and physics allows it to be nanosec accurate
up to dozens of meters or so.

Can this possibly be true? I mean, light travels only one foot every
nanosecond. Can it really keep nanosecond accuracy up to dozens of meters
away? If you send the same signal to CPU1 that is 1 foot away, as well as
send it to CPU2 that is 2 feet away. CPU2 will get that signal at least 1
nanosec after CPU1 receives it.

Of course if the hardware is smart enough to know this topology, then it
could account for these delays in traffic.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-25 23:10:09 UTC

Can this possibly be true? I mean, light travels only one foot every
nanosecond. Can it really keep nanosecond accuracy up to dozens of meters
away?

Sure. NTP keeps machines within 1ms (or better) of each other even
though the network latency is much higher and jittery.

J

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 23:30:13 UTC

Can this possibly be true? I mean, light travels only one foot every
nanosecond. Can it really keep nanosecond accuracy up to dozens of meters
away?

Sure. NTP keeps machines within 1ms (or better) of each other even
though the network latency is much higher and jittery.

yes. And there are radio telescope arrays that are synced up to do delta
interferometry, over thousands of kilometers. Syncing up time over a few
dozen meters is no challenge - and the reason for that ease is that
physical time is neatly and uniformly broadcasted by nature in a pretty
dependable way, at around 300 thousand kilometers per second.

the challenge is to make it cheap enough for commodity hw. I.e. no extra
CPU pins or lines in critical parts of the board, no extra power, low
transistor count, no impact on any critical path, short and reliable
clock readout after powerup, etc. But that is quite possible too IMO,
and the payback is very real.

[ OTOH, this is a world that still ships FreeDOS on many whitebox PC
instead of putting Linux on it, so dont expect logic to prevail
in all cases ;-) ]

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-25 22:50:12 UTC

Post by Linus Torvalds
The reason I say "and no" is that it's not technically really possible to
atomically give the exact TSC at which the frequency change took place. We
just don't have the information, and I doubt we will ever have it.

Well, you don't need the tsc at the precise moment of the frequency
change. You just need to emit the current tsc+frequency+wallclock time
before you emit any more delta records after the frequency change. You
can't fetch all those values instantaneously, but you can get close.

Post by Linus Torvalds
As such, there is no point in trying to make it a low-level special op,
because we'd _still_ end up being totally equivalent with just doing as
regular trace-event, with a regular TSC field, and then just fill the data
field with the new frequency.
But yes, I do think we'd need to have that as a trace packet type. I
thought I even said so in my RFC for packet types. Ahh, it was in the

Post by Linus Torvalds
I guess I should perhaps have put the TSC frequency in there in that "case
2" thing too. Maybe that should be in "data" (in kHz) and tv_sec/tv_nsec
should be in array[0..1], and the time sync packet would be 24 bytes.

Yeah. If you ever mention wallclock time in the event stream, you have
to tie it to your local timebase (tsc+frequency) to make the whole thing
fit together.

Post by Linus Torvalds
That said, if people think they can do a good job of ns conversion, I'll
stop arguing. Quite frankly, I think people are wrong about that, and
quite frankly, I think that anybody who looks even for one second at those
"alternate" sched_clock() implementations should realize that they aren't
suitable, but whatever. I'm not writing the code, I can only try to
convince people to not add the insane call-chains we have now.

Yeah. Unfortunately, in the virtual case - unless you're virtualizing
the tsc itself, which is horrible - you can't really control or measure
how the tsc is going to behave, because its all under the hypervisor's
control. A "cpu" could be migrated between different physical cpus, the
whole machine could be migrated between hosts, or suspended, etc, making
it very hard to use the naked tsc. In that case the only real option is
to use a hypervisor-supplied timebase (which for Xen and KVM is a
tsc-based scheme exactly like we've been discussing, except the
hypervisor provides the tsc timing parameters).

asm/x86/kernel/pvclock.c does the tsc to ns conversion with just adds
and multiplies, but unfortunately it can't be expressed in C because it
uses the extra precision the x86 gives for multiplies.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Ingo Molnar

2008-09-25 23:00:15 UTC

Post by Linus Torvalds
The reason I say "and no" is that it's not technically really
possible to atomically give the exact TSC at which the frequency
change took place. We just don't have the information, and I doubt
we will ever have it.

Well, you don't need the tsc at the precise moment of the frequency
change. You just need to emit the current tsc+frequency+wallclock
time before you emit any more delta records after the frequency
change. You can't fetch all those values instantaneously, but you can
get close.

hm, i'm not sure you've thought through this delta record idea.

Take a system that goes idle thousands of times a second. (it's easy -
just some networking workload)

Now take a tracer that just wants to emit a trace entry every now and
then. Once or twice a second or so.

Note that suddenly you have thousands of totally artificial 'delta' time
records between two real events, and have to post-process all your way
up between these events to get to the real timeline.

... it is totally impractical and costly.

and then i havent even mentioned some other big complications:

- the numeric errors that mount up over thousands of delta events

- the memory overhead over thousands of entries

- the fact that cpufreq and PLL changes are rarely atomic and that the
TSC can flip-flop between two frequencies.

TSC based delta time post-processing is just not practical. Micro-time
has to be synced up to GTOD in some manner - OR you might declare that
the TSC _has_ to never stop and has to be constant. (no modern x86 CPU
meets that criterium at the moment though.)

... and the moment you accept the fact that the GTOD _has_ to be mixed
into it, all the rest follows pretty much automatically: either you
store the (GTOD,freq,TSC) triple and post-process that absolute
timestamp, and you accept the in-memory cost of that and do
post-processing, or you compress that triple in situ and store the
result only.

[ You will then also want some fall-back implementation for CPUs that
have no TSCs, and for architectures that have no default
implementation - something jiffies based. And you want some special
hooks for paravirt, as always. ]

I.e. you will end up having something quite close to
cpu_clock()/sched_clock().

_Or_ if you manage to get any better than that for tracing then please
tell us about it because we want to apply those concepts to
cpu_clock()/sched_clock() too :-)

Ingo
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-26 01:20:09 UTC

Post by Ingo Molnar
hm, i'm not sure you've thought through this delta record idea.
Take a system that goes idle thousands of times a second. (it's easy -
just some networking workload)
Now take a tracer that just wants to emit a trace entry every now and
then. Once or twice a second or so.
Note that suddenly you have thousands of totally artificial 'delta' time
records between two real events, and have to post-process all your way
up between these events to get to the real timeline.
... it is totally impractical and costly.

No, as I said: "You just need to emit the current
tsc+frequency+wallclock time before you emit any more delta records
after the frequency change."

When an event which affects the tsc occurs, like a frequency change or
pause, set a flag. When you're next about to emit a delta, check the
flag and emit new timing parameters (or instead).

Post by Ingo Molnar
- the numeric errors that mount up over thousands of delta events
- the memory overhead over thousands of entries

No, you only need to emit records as needed.

Post by Ingo Molnar
- the fact that cpufreq and PLL changes are rarely atomic and that the
TSC can flip-flop between two frequencies.

You need to know the frequency at the time you sample the tsc, and you
need to know when the frequency changes. If you don't, you can't use
the tsc for time, regardless of whether you process it immediately or
post-process it.

Post by Ingo Molnar
... and the moment you accept the fact that the GTOD _has_ to be mixed
into it, all the rest follows pretty much automatically: either you
store the (GTOD,freq,TSC) triple and post-process that absolute
timestamp, and you accept the in-memory cost of that and do
post-processing, or you compress that triple in situ and store the
result only.

Right. You store (GTOD,freq,tsc) every time you need that information,
and then interpolate with the tsc while you know its monotonic.

Unless your tsc is completely screwed, the (GTOD,freq,tsc) triple is
going to be stored at a fairly low frequency, and won't fill your event
buffer very much (though it might be a large proportion of your recorded
events if you're only recording stuff at a low frequency).

Post by Ingo Molnar
[ You will then also want some fall-back implementation for CPUs that
have no TSCs, and for architectures that have no default
implementation - something jiffies based

Well, whatever the best timer the platform has. And maybe its already
processed into real time, in which case you just emit raw deltas and
never worry about updating the timing parameters.

Post by Ingo Molnar
. And you want some special hooks for paravirt, as always. ]

Yeah. The scheme relies on a cpu's tsc remaining the cpu's tsc.

Post by Ingo Molnar
I.e. you will end up having something quite close to
cpu_clock()/sched_clock().
_Or_ if you manage to get any better than that for tracing then please
tell us about it because we want to apply those concepts to
cpu_clock()/sched_clock() too :-)

Well, its what Xen does already for time. It works well.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-26 01:30:14 UTC

Post by Jeremy Fitzhardinge
No, as I said: "You just need to emit the current
tsc+frequency+wallclock time before you emit any more delta records
after the frequency change."
When an event which affects the tsc occurs, like a frequency change or
pause, set a flag. When you're next about to emit a delta, check the
flag and emit new timing parameters (or instead).

This is where I get confused. When do we add this to the trace buffer?
Every record (currently) records the delta time from the last event.
If a frequency changes, do we need to record the tuple at that moment?

If so, we have an issue. We can be in the middle of tracing the GTOD
and we can not recurse back into the GTOD for fear of deadlocking on
the GTOD spinlocks.

Do we add this data to the buffers when it happens? This means every
allocated ring buffer (and yes you can have more than one), will need to
register a call back so that it can record this information into the
buffer.

Or am I just not getting it?

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-26 01:50:13 UTC

No, you just need to save away a consistent (tsc,freq,GTOD) triple at
some point in time; it doesn't matter when it is, so long as the tsc
frequency hasn't changed since then. Once you have that, you can insert
it into your trace buffer, and then use it as the base for subsequent
deltas.

And as I mentioned in the other mail, if you don't mind backtracking
when you process the log, you can even record (tsc,freq) (tsc,GTOD)
separately (but in that order).

Post by Steven Rostedt
If so, we have an issue. We can be in the middle of tracing the GTOD
and we can not recurse back into the GTOD for fear of deadlocking on
the GTOD spinlocks.

Right, you don't need to read the GTOD when you actually write a record.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-25 23:10:15 UTC

Even in my last version I added a "TIME_STAMP" type that can be used in
the future to add some kind of synchronization into the trace, that reads
GTOD or whatnot.

But as you can see, I've been trying to implement these various ideas,
since the devil is in the details and the code is the details.

How do you get this GTOD read in the ring buffer? If the ring buffer does
it without any knowledge from the tracer, it may be doing it a
inappropriate times. This would also imply that the GTOD infrastructure
itself is reentrent safe. Imagine tracing the GTOD code when the buffer
decides it is about time to add the GTOD timestamp into the buffer. Can
the GTOD handle this recursion. If the GTOD has spinlocks, probably not.

Perhaps we can add a ring_buffer_write_safe() method that would prevent
the trace from doing these. Or we can add a way for the tracer to trigger
a time event (ring_buffer_tick());

Post by Jeremy Fitzhardinge
asm/x86/kernel/pvclock.c does the tsc to ns conversion with just adds
and multiplies, but unfortunately it can't be expressed in C because it
uses the extra precision the x86 gives for multiplies.

If we do end up making a new clock API, I imagine that each arch will
define their own.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-26 01:30:14 UTC

Post by Steven Rostedt
How do you get this GTOD read in the ring buffer? If the ring buffer does
it without any knowledge from the tracer, it may be doing it a
inappropriate times. This would also imply that the GTOD infrastructure
itself is reentrent safe. Imagine tracing the GTOD code when the buffer
decides it is about time to add the GTOD timestamp into the buffer. Can
the GTOD handle this recursion. If the GTOD has spinlocks, probably not.

It doesn't need to read the GTOD synchronously when writing the trace
record. When a tsc event occurs, it needs to store the
GTOD/tsc/frequency somewhere at that point. When you next write a trace
record, if that structure has been updated, you write it into the trace
before emitting the next delta timestamp.

In fact you can read the GTOD at almost any time between tsc frequency
updates, so you can defer it to whenever is convenient. At the time the
tsc changes frequency, you emit: (tsc,frequency), and a bit later
(before it changes again) you emit (tsc,GTOD). That allows you to
retroactively compute GTODs for all timestamps. Obviously it would be
good to get them as close as possible together.

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-26 01:40:09 UTC

It doesn't need to read the GTOD synchronously when writing the trace
record. When a tsc event occurs, it needs to store the
GTOD/tsc/frequency somewhere at that point. When you next write a trace
record, if that structure has been updated, you write it into the trace
before emitting the next delta timestamp.
In fact you can read the GTOD at almost any time between tsc frequency
updates, so you can defer it to whenever is convenient. At the time the
tsc changes frequency, you emit: (tsc,frequency), and a bit later
(before it changes again) you emit (tsc,GTOD). That allows you to
retroactively compute GTODs for all timestamps. Obviously it would be
good to get them as close as possible together.

OK, let me rephrase my question.

How and where do we record this? Do we keep this information in some
global variable that we must compare to every time we add a new item in
the trace?

Do we have the buffer register a call back to record this information?

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-26 02:10:06 UTC

Post by Steven Rostedt
OK, let me rephrase my question.
How and where do we record this? Do we keep this information in some
global variable that we must compare to every time we add a new item in
the trace?
Do we have the buffer register a call back to record this information?

Something like (total pseudocode):

struct tsc_time_parameters {
int version; /* even - values OK; odd - values being updated */
u64 tsc;
u32 tsc_freq;
u64 gtod;
};

DEFINE_PERCPU(struct tsc_time_parameters, tsc_params);

/* To be called after a tsc frequency change, before any new
trace records are being emitted, in a context where we can call get_GTOD() */
void update_tsc_params(void)
{
struct tsc_time_parameters *p = __get_percpu_var(tsc_params);

p->version |= 1;
wmb();

p->tsc = get_tsc();
p->tsc_freq = get_tsc_freq();
p->gtod = get_GTOD();

wmb();
p->version++;
wmb();
}

DEFINE_PERCPU(unsigned, current_tsc_version);
DEFINE_PERCPU(u64, prev_tsc);

/* may be called in any context */
u64 get_trace_timestamp_delta(void)
{
const struct tsc_time_parameters *p = &__get_percpu_var(tsc_params);
unsigned *current_version = &__get_cpu_var(current_tsc_version);
u64 prev = __get_cpu_var(prev_tsc);
u64 now, ret;

/* check the current tsc_params version against the last one we emitted;
if the version is odd, then we interrupted the parameters as they were
being updated, so just emit a new delta with the old parameters */
if (unlikely(*current_version != p->version && !(p->version & 1))) {
/* XXX probably need a loop to deal with p->version changing under our feet */
emit_tsc_freq_record(p);
prev = p->tsc;
__get_cpu_var(current_tsc_version) = p->version;
}

now = read_tsc();
ret = now - prev;
__get_cpu_var(prev_tsc) = now;

return ret;
}

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-26 02:30:18 UTC

[
"When people ask me what language my mother tongue is,
I simply reply 'C'" - Steven Rostedt
]

This is exactly why I have that saying ;-)

Good enough.

Post by Jeremy Fitzhardinge
struct tsc_time_parameters {
int version; /* even - values OK; odd - values being updated */
u64 tsc;
u32 tsc_freq;
u64 gtod;
};
DEFINE_PERCPU(struct tsc_time_parameters, tsc_params);

These are all global I presume (No "static" in front)

Post by Jeremy Fitzhardinge
/* To be called after a tsc frequency change, before any new
trace records are being emitted, in a context where we can call get_GTOD() */
void update_tsc_params(void)

So this needs to be called by the cpu freq code?

Post by Jeremy Fitzhardinge
{
struct tsc_time_parameters *p = __get_percpu_var(tsc_params);
p->version |= 1;
wmb();
p->tsc = get_tsc();
p->tsc_freq = get_tsc_freq();
p->gtod = get_GTOD();
wmb();
p->version++;
wmb();
}
DEFINE_PERCPU(unsigned, current_tsc_version);
DEFINE_PERCPU(u64, prev_tsc);
/* may be called in any context */
u64 get_trace_timestamp_delta(void)
{
const struct tsc_time_parameters *p = &__get_percpu_var(tsc_params);
unsigned *current_version = &__get_cpu_var(current_tsc_version);
u64 prev = __get_cpu_var(prev_tsc);
u64 now, ret;
/* check the current tsc_params version against the last one we emitted;
if the version is odd, then we interrupted the parameters as they were
being updated, so just emit a new delta with the old parameters */
if (unlikely(*current_version != p->version && !(p->version & 1))) {
/* XXX probably need a loop to deal with p->version changing under our feet */
emit_tsc_freq_record(p);

I take it the above is your record to the tracer?

Post by Jeremy Fitzhardinge
prev = p->tsc;
__get_cpu_var(current_tsc_version) = p->version;
}
now = read_tsc();

We probably wont to check here that p didn't change again.
and try again if it did.

Post by Jeremy Fitzhardinge
ret = now - prev;
__get_cpu_var(prev_tsc) = now;
return ret;
}

Hmm, the beginning of each patch will need to record the global tsc, as
well as this information. Simply because in overwrite mode, we do not want
to lose it if the producer is faster than te consumer.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Jeremy Fitzhardinge

2008-09-26 05:40:11 UTC

Post by Steven Rostedt
[
"When people ask me what language my mother tongue is,
I simply reply 'C'" - Steven Rostedt
]
This is exactly why I have that saying ;-)

Good enough.

These are all global I presume (No "static" in front)

No, they could probably be static, depending where everything ends up.
It would only need to get accessed from a couple of places.

Post by Jeremy Fitzhardinge
/* To be called after a tsc frequency change, before any new
trace records are being emitted, in a context where we can call get_GTOD() */
void update_tsc_params(void)

So this needs to be called by the cpu freq code?

Yes, and any other place the tsc might get affected, like going into a
C-state which stops the tsc, and things like suspend/resume.

I take it the above is your record to the tracer?

Yeah. No doubt it needs a few more parameters.

Post by Jeremy Fitzhardinge
prev = p->tsc;
__get_cpu_var(current_tsc_version) = p->version;
}
now = read_tsc();

We probably wont to check here that p didn't change again.
and try again if it did.

Yeah, you may want to put the whole thing in a loop to make sure that
the version is consistent. You might end up emitting multiple redundant
tsc parameters, but that should be very rare.

Post by Jeremy Fitzhardinge
ret = now - prev;
__get_cpu_var(prev_tsc) = now;
return ret;
}

Each patch? Page?

J
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-25 17:10:17 UTC

Post by Peter Zijlstra
Right - if you use raw tsc you're dependent on clock speed, if we'd
normalize that on ns instead you'd need at least: [...]

Please don't normalize to ns.
It's really quite hard, and it's rather _expensive_ on many CPU's. It
involves a non-constant 64-bit divide, after all. I bet it can be
optimized to be a multiply-by-inverse instead, but it would be a 128-bit
(or maybe just 96-bit?) multiply, and the code would be nasty, and likely
rather more expensive than the TSC reading itself.
Sure, you have to normalize at _some_ point, and normalizing early might
make some things simpler, but the main thing that would become easier is
people messing about in the raw log buffer on their own directly, which
would hopefully be something that we'd discourage _anyway_ (ie we should
try to use helper functions for people to do things like "get the next
event data", not only because the headers are going to be odd due to
trying to pack things together, but because maybe we can more easily
extend on them later that way when nobody accesses the headers by hand).
And I don't think normalizing later is in any way more fundamentally hard.
It just means that you do part of the expensive things after you have
gathered the trace, rather than during.

I've been just using sched_clock() which already normalizes to ns. But I
use a wrapper (ring_buffer_time_stamp) so we can decide on how to keep
track later.

If we do not normalize, then we must come up yet another generic way to read
the CPU clock for all archs. And then we also need to come up with another
generic way to normalize it later for output.

If I'm missing that this already exists, then I'll go and use it, but I do
not think that tracing is worthy enough to implement this timing
infrastructure just to get faster traces.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 17:30:14 UTC

Post by Steven Rostedt
If we do not normalize, then we must come up yet another generic way to read
the CPU clock for all archs. And then we also need to come up with another
generic way to normalize it later for output.

Why would any of this be "generic"?

Quite the reverse. It should be as trace-buffer specific as possible, so
that we do *not* share any code or any constraints with other people.

Just do rdtsc at first, and make it depend on x86. If the thing is made
simple enough, it will be a couple of lines of code for architectures to
read their own timestamp counters.

And since the normalization is then no longer in the critical part, _that_
can be architecture-independent, but obviously still trace-specific. You
need to know the frequency, and that involves having frequency events in
the trace if it changes, but if you don't see any frequency events you
just take "current frequency".

And doing it at trace parse time, we can some day enable a boot trace that
actually WORKS. Have you looked at the timestamp events we get from
"sched_clock()" in early bootup? They show up in the kernel logs when you
have CONFIG_PRINTK_TIME. And they are totally and utterly broken and
_useless_ for the early stages right now. And they shouldn't have to be
that way.

Yeah, we'll never be able to trace stuff that happens really early
(tracing will obviously always need kernel page tables and some really
basic stuf working), but we should be able to trace through things like
TSC calibration for boot time analysis. It wasn't that long ago that we
had the whole discussion about TSC calibration taking 200ms. Or the early
ACPI code. And get meaningful data.

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-25 17:50:09 UTC

Why would any of this be "generic"?

generic as in, could be implement in architecture dependent ways but with
a "generic" interface. IOW, I don't want the trace to be dependent on
any arch. ftrace already runs on x86, ppc, sparc64, mips, arm, sh, and
more.

Post by Linus Torvalds
Quite the reverse. It should be as trace-buffer specific as possible, so
that we do *not* share any code or any constraints with other people.
Just do rdtsc at first, and make it depend on x86. If the thing is made
simple enough, it will be a couple of lines of code for architectures to
read their own timestamp counters.

I could do a HAVE_RING_BUFFER_TIMESTAMP config option for archs that
implement it, and just use something dumb for those that don't. For now
I'll keep to sched_clock, just because it makes it easy for me. But with
the wrappers, it should be easy to change later.

Post by Linus Torvalds
And since the normalization is then no longer in the critical part, _that_
can be architecture-independent, but obviously still trace-specific. You
need to know the frequency, and that involves having frequency events in
the trace if it changes, but if you don't see any frequency events you
just take "current frequency".

The one thing that seemed to me most apparent from talking to people
at LPC, is that they want a simple ring buffer API. If every tracer that
uses this must come up with its own time keeping management, I don't think
this will be used at all (except by those that are maintaining tracers
now).

Post by Linus Torvalds
And doing it at trace parse time, we can some day enable a boot trace that
actually WORKS. Have you looked at the timestamp events we get from
"sched_clock()" in early bootup? They show up in the kernel logs when you
have CONFIG_PRINTK_TIME. And they are totally and utterly broken and
_useless_ for the early stages right now. And they shouldn't have to be
that way.
Yeah, we'll never be able to trace stuff that happens really early
(tracing will obviously always need kernel page tables and some really
basic stuf working), but we should be able to trace through things like
TSC calibration for boot time analysis. It wasn't that long ago that we
had the whole discussion about TSC calibration taking 200ms. Or the early
ACPI code. And get meaningful data.

My logdev code has a define option to use bootmem for its buffers, and it
also uses an atomic counter to try to keep things in order. Heck, at early
boot, the events happen in order anyway, since it is still a single CPU
system.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Linus Torvalds

2008-09-25 18:20:22 UTC

Post by Steven Rostedt
The one thing that seemed to me most apparent from talking to people
at LPC, is that they want a simple ring buffer API. If every tracer that
uses this must come up with its own time keeping management, I don't think
this will be used at all (except by those that are maintaining tracers
now).

No, no.

The timestamp code is all in the ring buffer code. That was why I refused
to have the layering without it.

And hell no, nobody should *ever* read the "tsc_delta" fields etc. Those
are entirely internal to the buffering. If any user _ever_ reads or writes
those on its own, it's a bug, plain and simple.

So when you read trace events, you should get the event data and the
timestamp from the trace buffer routines. Nobody should ever even _see_
the internal trace buffer implementation!

Linus
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Steven Rostedt

2008-09-25 15:40:13 UTC

Post by Peter Zijlstra
Also, I agree on the 4byte alignment, rather than the 8byte Steve seems
to favour.

Pretty much all CPUs word align on a 8 byte bounder (until we get those
128bit boxes running), but not all can word align on 4 bytes. I was hoping
to make the buffer output somewhat the same across archs.

Otherwise, this is going to be quite a complex mess IMHO.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-24 16:20:11 UTC

Post by Martin Bligh
Thanks for creating this so quickly ;-)

Post by Steven Rostedt
event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

This can, in generic, not work. Due to the simple fact that we might
straddle a page boundary. Therefore I think its best to limit our self
to the write interface below, so that it can handle that.

I'm not sure why this is any harder to deal with in write, than it is
in reserve? We should be able to make reserve handle this just
as well?
If you use write rather than reserve, you have to copy all the data
twice for every event.

I think we all agree that a supplementary copy is no wanted, but I think
this question is orthogonal to having a write wrapper. The way we can do
both is by using reserve/commit to deal with space reservation, and a
write() to perform the actual data write into the buffers once space has
been reserved.

Reserve/commit would allocate a variable-sized "slot" into the buffer.
We reserve X amount of bytes, and it returns the offset from the
buffer start where the allocated slot is. This reserve/commit mechanism
deals with synchronization (cli/spinlock or cmpxchg_local scheme...).

We can then use this offset to see in which page(s) we have to write.
This offset + len can in fact cross multiple page boundaries.

Doing this elegantly could involve a page array that would represent the
buffer data :

struct page **buffer;

And be given as parameter to the read() and write() methods, which would
deal with page-crossing.

e.g.

/*
* Perform an aligned write of the input data into the buffer.
*
* buffer : page pointer array
* woffset : offset in the page pointer array where write starts from
* data : input data
* len : length of data to copy
*
* Note : if a NULL buffer is passed, no copy is performed, but the
* alignment and offset calculation is done. Useful to calculate the
* size to reserve.
*
* return : length written
*/
size_t write(struct page **buffer, size_t woffset, void *data, size_t len);

Therefore, we could have code which writes in the buffers, without extra
copy, and without using vmap, in multiple writes for a single event,
which would deal with data alignment, e.g. :

size_t woffset, evsize = 0;

evsize += write(NULL, evsize, &var1, sizeof(var1));
evsize += write(NULL, evsize, &var2, sizeof(var2));
evsize += write(NULL, evsize, &var3, sizeof(var3));

woffset = reserve(..., evsize);

woffset += write(buffer, woffset, &var1, sizeof(var1));
woffset += write(buffer, woffset, &var2, sizeof(var2));
woffset += write(buffer, woffset, &var3, sizeof(var3));

commit(..., evsize);

Does that make sense ?

Mathieu

Steven Rostedt

2008-09-24 16:40:16 UTC

Post by Mathieu Desnoyers
/*
* Perform an aligned write of the input data into the buffer.
*
* buffer : page pointer array
* woffset : offset in the page pointer array where write starts from
* data : input data
* len : length of data to copy
*
* Note : if a NULL buffer is passed, no copy is performed, but the
* alignment and offset calculation is done. Useful to calculate the
* size to reserve.
*
* return : length written
*/
size_t write(struct page **buffer, size_t woffset, void *data, size_t len);
Therefore, we could have code which writes in the buffers, without extra
copy, and without using vmap, in multiple writes for a single event,
size_t woffset, evsize = 0;
evsize += write(NULL, evsize, &var1, sizeof(var1));
evsize += write(NULL, evsize, &var2, sizeof(var2));
evsize += write(NULL, evsize, &var3, sizeof(var3));
woffset = reserve(..., evsize);
woffset += write(buffer, woffset, &var1, sizeof(var1));
woffset += write(buffer, woffset, &var2, sizeof(var2));
woffset += write(buffer, woffset, &var3, sizeof(var3));
commit(..., evsize);
Does that make sense ?

Mathieu,

I'm starting to think that you are just too smart for your own good ;-)
No it does not make sense. Well, it does not after looking at it for 10
seconds.

Which brings up my point, the interface must be simple, and not cause
people to spend minutes trying to figure out what the trace is doing.

-- Steve

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Peter Zijlstra

2008-09-24 16:50:08 UTC

Post by Mathieu Desnoyers

Post by Martin Bligh
Thanks for creating this so quickly ;-)

Post by Steven Rostedt
event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

This can, in generic, not work. Due to the simple fact that we might
straddle a page boundary. Therefore I think its best to limit our self
to the write interface below, so that it can handle that.

I'm not sure why this is any harder to deal with in write, than it is
in reserve? We should be able to make reserve handle this just
as well?
If you use write rather than reserve, you have to copy all the data
twice for every event.

I think we all agree that a supplementary copy is no wanted, but I think
this question is orthogonal to having a write wrapper.
This reserve/commit mechanism
deals with synchronization (cli/spinlock or cmpxchg_local scheme...).

Right

Post by Mathieu Desnoyers
We can then use this offset to see in which page(s) we have to write.
This offset + len can in fact cross multiple page boundaries.

Sure

Post by Mathieu Desnoyers
Doing this elegantly could involve a page array that would represent the
struct page **buffer;

I really don't like the page array, but we can do without..

Post by Mathieu Desnoyers
And be given as parameter to the read() and write() methods, which would
deal with page-crossing.
e.g.
size_t write(struct page **buffer, size_t woffset, void *data, size_t len);
Therefore, we could have code which writes in the buffers, without extra
copy, and without using vmap, in multiple writes for a single event,
size_t woffset, evsize = 0;
evsize += write(NULL, evsize, &var1, sizeof(var1));
evsize += write(NULL, evsize, &var2, sizeof(var2));
evsize += write(NULL, evsize, &var3, sizeof(var3));
woffset = reserve(..., evsize);
woffset += write(buffer, woffset, &var1, sizeof(var1));
woffset += write(buffer, woffset, &var2, sizeof(var2));
woffset += write(buffer, woffset, &var3, sizeof(var3));
commit(..., evsize);
Does that make sense ?

Yes, we can do the sub-write, how about:

struct ringbuffer_write_state
ringbuffer_write_start(struct ringbuffer *buffer, unsigned long size);

int ringbuffer_write(struct ringbuffer_write_state *state,
const void *buf, unsigned long size);

void ringbuffer_write_finish(struct ringbuffer_write_state *state);

That way write_start() can do the reserve and set a local write
iterator. write() can then do whatever, either the direct copy of break
it up - will error on overflowing the reserved size. write_finish() will
clean up (sti, preempt_enable etc..)

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to ***@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Mathieu Desnoyers

2008-09-24 17:00:23 UTC

Post by Mathieu Desnoyers

Post by Martin Bligh
Thanks for creating this so quickly ;-)

Post by Steven Rostedt
event = ring_buffer_lock_reserve(buffer, event_id, length, &flags);
event->data = record_this_data;
ring_buffer_unlock_commit(buffer, event, flags);

This can, in generic, not work. Due to the simple fact that we might
straddle a page boundary. Therefore I think its best to limit our self
to the write interface below, so that it can handle that.

I'm not sure why this is any harder to deal with in write, than it is
in reserve? We should be able to make reserve handle this just
as well?
If you use write rather than reserve, you have to copy all the data
twice for every event.

Right

Post by Mathieu Desnoyers
We can then use this offset to see in which page(s) we have to write.
This offset + len can in fact cross multiple page boundaries.

Sure

Post by Mathieu Desnoyers
Doing this elegantly could involve a page array that would represent the
struct page **buffer;

I really don't like the page array, but we can do without..

struct ringbuffer_write_state
ringbuffer_write_start(struct ringbuffer *buffer, unsigned long size);
int ringbuffer_write(struct ringbuffer_write_state *state,
const void *buf, unsigned long size);
void ringbuffer_write_finish(struct ringbuffer_write_state *state);
That way write_start() can do the reserve and set a local write
iterator. write() can then do whatever, either the direct copy of break
it up - will error on overflowing the reserved size. write_finish() will
clean up (sti, preempt_enable etc..)

Yup, that looks neat. I don't know if it's worth separating data
alignment concerns from this part of the infrastructure so it stays
simple. OTOH, embedding automatic alignment of data elements would be
easy to do here, e.g. :

struct ringbuffer_write_state
ringbuffer_write_start(struct ringbuffer *buffer, unsigned long size);

int ringbuffer_write(struct ringbuffer_write_state *state,
const void *buf,
unsigned long size,
unsigned long alignment);

#define ringbuffer_compute_size(size, alignment) \
ringbuffer_write(NULL, NULL, size, alignment)

void ringbuffer_write_finish(struct ringbuffer_write_state *state);

So ringbuffer_compute_size could be useds to compute the total slot size
needed to write the event before doing the ringbuffer_write_start(). It
would be good to keep ringbuffer_write() mostly as a static inline so
the compiler could statically compile in much of these operations.

Mathieu

Steven Rostedt

2008-09-24 05:20:08 UTC