Discussion:
[RFC PATCH 1/2 v3] Unified trace buffer
(too old to reply)
Steven Rostedt
2008-09-25 18:51:55 UTC
Permalink
This is probably very buggy. I ran it as a back end for ftrace but only
tested the irqsoff and ftrace tracers. The selftests are busted with it.

But this is an attempt to get a unified buffering system that was
talked about at the LPC meeting.

Now that it boots and runs (albeit, a bit buggy), I decided to post it.
This is some idea that I had to handle this.

I tried to make it as simple as possible.

I'm not going to explain all the stuff I'm doing here, since this code
is under a lot of flux (RFC, POC work), and I don't want to keep updating
this change log. When we finally agree on something, I'll make this
change log worthy.

If you want to know what this patch does, the code below explains it :-p

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 175 ++++++
kernel/trace/Kconfig | 3
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1218 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1397 insertions(+)

Index: linux-compile.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile.git/include/linux/ring_buffer.h 2008-09-25 13:59:09.000000000 -0400
@@ -0,0 +1,175 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+} __attribute__((__packed__));
+
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
+ * array is ignored
+ * size is variable depending on
+ */
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
+ /* FIXME: RB_TYPE_TIME_STAMP not implemented */
+ RB_TYPE_TIME_STAMP, /* Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ */
+
+ RB_TYPE_DATA, /* Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4] holds data
+ */
+};
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
+
+enum {
+ RB_LEN_TIME_EXTENT = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RB_TYPE_TIME_EXTENT:
+ return RB_LEN_TIME_EXTENT;
+
+ case RB_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RB_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ *
+ * Note, if the length of the event is more than 256 bytes, the
+ * length field is stored in the body. We need to return
+ * after the length field in that case.
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+void *ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ void *data, unsigned long flags);
+void *ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_disable(struct ring_buffer *buffer);
+void ring_buffer_enable(struct ring_buffer *buffer);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-compile.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-compile.git/kernel/trace/ring_buffer.c 2008-09-25 14:30:12.000000000 -0400
@@ -0,0 +1,1218 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+#define sdr_print(x, y...) printk("%s:%d " x "\n", __FUNCTION__, __LINE__, y)
+
+/* FIXME!!! */
+unsigned long long
+ring_buffer_time_stamp(int cpu)
+{
+ /* mult -1 to test normalize */
+ return sched_clock() * -1;
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+ *ts *= -1;
+}
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST ~TS_MASK
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ * Plue the time stamp delta of (-1) is a special flag.
+ */
+static inline int
+test_time_stamp(unsigned long long delta)
+{
+ if ((delta + 1) & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+struct buffer_page {
+ u64 time_stamp;
+ unsigned char body[];
+};
+
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(u64))
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct buffer_page **pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ unsigned long head_page;
+ unsigned long tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 last_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ spinlock_t lock;
+ struct mutex mutex;
+
+ /* FIXME: this should be online CPUS */
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ unsigned long head_page;
+ u64 read_stamp;
+};
+
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int pages = buffer->pages;
+ int i;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!cpu_buffer->pages)
+ goto fail_free_buffer;
+
+ for (i = 0; i < pages; i++) {
+ cpu_buffer->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cpu_buffer->pages[i])
+ goto fail_free_pages;
+ }
+
+ return cpu_buffer;
+
+ fail_free_pages:
+ for (i = 0; i < pages; i++) {
+ if (cpu_buffer->pages[i])
+ free_page((unsigned long)cpu_buffer->pages[i]);
+ }
+ kfree(cpu_buffer->pages);
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void
+ring_buffer_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ int i;
+
+ for (i = 0; i < cpu_buffer->buffer->pages; i++) {
+ if (cpu_buffer->pages[i])
+ free_page((unsigned long)cpu_buffer->pages[i]);
+ }
+ kfree(cpu_buffer->pages);
+ kfree(cpu_buffer);
+}
+
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int cpu;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ /* FIXME: do for only online CPUS */
+ buffer->cpus = num_possible_cpus();
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ buffer->buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ spin_lock_init(&buffer->lock);
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ if (buffer->buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ /* FIXME: */
+ return -1;
+}
+
+static inline int
+ring_buffer_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int
+ring_buffer_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RB_TYPE_PADDING;
+}
+
+static inline void *
+rb_page_body(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long page, unsigned index)
+{
+ return cpu_buffer->pages[page]->body + index;
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_body(cpu_buffer,cpu_buffer->head_page,
+ cpu_buffer->head);
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_iter_head_event(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ return rb_page_body(cpu_buffer, iter->head_page,
+ iter->head);
+}
+
+static void
+ring_buffer_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < BUF_PAGE_SIZE;
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_body(cpu_buffer, cpu_buffer->head_page, head);
+ if (ring_buffer_null_event(event))
+ break;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void
+ring_buffer_inc_page(struct ring_buffer *buffer,
+ unsigned long *page)
+{
+ (*page)++;
+ if (*page >= buffer->pages)
+ *page = 0;
+}
+
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+ struct buffer_page *bpage;
+
+ bpage = cpu_buffer->pages[cpu_buffer->tail_page];
+ bpage->time_stamp = *ts;
+}
+
+static void
+rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *bpage;
+
+ cpu_buffer->head = 0;
+ bpage = cpu_buffer->pages[cpu_buffer->head_page];
+ cpu_buffer->read_stamp = bpage->time_stamp;
+}
+
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ struct buffer_page *bpage;
+
+ iter->head = 0;
+ bpage = cpu_buffer->pages[iter->head_page];
+ iter->read_stamp = bpage->time_stamp;
+}
+
+/**
+ * ring_buffer_update_event - update event type and data
+ * @event: the even to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+ring_buffer_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+ /* ignore fixed size types */
+ case RB_TYPE_PADDING:
+ break;
+
+ case RB_TYPE_TIME_EXTENT:
+ event->len =
+ (RB_LEN_TIME_EXTENT + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
+
+static struct ring_buffer_event *
+__ring_buffer_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ unsigned long head_page, tail_page, tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ BUG_ON(tail_page >= buffer->pages);
+ BUG_ON(head_page >= buffer->pages);
+
+ if (tail + length > BUF_PAGE_SIZE) {
+ unsigned long next_page = tail_page;
+
+ ring_buffer_inc_page(buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ ring_buffer_update_overflow(cpu_buffer);
+
+ ring_buffer_inc_page(buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ rb_reset_read_page(cpu_buffer);
+ }
+
+ if (tail != BUF_PAGE_SIZE) {
+ event = rb_page_body(cpu_buffer, tail_page, tail);
+ /* page padding */
+ event->type = RB_TYPE_PADDING;
+ }
+
+ tail = 0;
+ tail_page = next_page;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ rb_add_stamp(cpu_buffer, ts);
+ }
+
+ BUG_ON(tail_page >= buffer->pages);
+ BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+ event = rb_page_body(cpu_buffer, tail_page, tail);
+ ring_buffer_update_event(event, type, length);
+ cpu_buffer->entries++;
+
+ return event;
+}
+
+static struct ring_buffer_event *
+ring_buffer_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ unsigned long long ts, delta;
+ struct ring_buffer_event *event;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->last_stamp;
+
+ if (test_time_stamp(delta)) {
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __ring_buffer_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (!cpu_buffer->tail) {
+ /*
+ * new page, dont commit this and add the
+ * time stamp to the page instead.
+ */
+ rb_add_stamp(cpu_buffer, &ts);
+ } else {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ }
+
+ cpu_buffer->last_stamp = ts;
+ delta = 0;
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __ring_buffer_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ event->time_delta = delta;
+ cpu_buffer->last_stamp = ts;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a location on the ring buffer to copy directly to.
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+void *ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto no_record;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ return NULL;
+
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, length);
+ if (!event)
+ goto no_record;
+
+ return ring_buffer_event_data(event);
+
+ no_record:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @data: The data pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer, void *data, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ u32 *array = data;
+ int cpu = raw_smp_processor_id();
+
+ /*
+ * If the data was larger than max small size, the array[0] will
+ * hold the length, which must be less than PAGE_SIZE.
+ * Since the type field is in the MSB, and must not be zero
+ * we can test that to see if this entry is a large entry
+ * or not.
+ */
+ array--;
+ if (*array < PAGE_SIZE)
+ array--; /* this is large data */
+ event = (struct ring_buffer_event *)array;
+
+ cpu_buffer = buffer->buffers[cpu];
+ cpu_buffer->tail += ring_buffer_event_length(event);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @event_type: The event type to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+void *ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *ret = NULL;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ ret = ring_buffer_event_data(event);
+
+ memcpy(ret, data, length);
+ cpu_buffer->tail += event_length;
+
+ out:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ iter->head_page = 0;
+ iter->head = 0;
+}
+
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->tail_page &&
+ iter->head == cpu_buffer->tail;
+}
+
+static void
+ring_buffer_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ event = ring_buffer_head_event(cpu_buffer);
+ /*
+ * Check if we are at the end of the buffer.
+ * For fixed length, we need to check if we can fit
+ * another entry on the page.
+ * Otherwise we need to see if the end is a null
+ * pointer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + length > cpu_buffer->tail));
+
+ cpu_buffer->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_head_event(cpu_buffer);
+ if (ring_buffer_null_event(event) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_head(cpu_buffer);
+}
+
+static void
+ring_buffer_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ /*
+ * Check if we are at the end of the buffer.
+ * For fixed length, we need to check if we can fit
+ * another entry on the page.
+ * Otherwise we need to see if the end is a null
+ * pointer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(iter->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+ (iter->head + length > cpu_buffer->tail));
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_iter_head_event(iter);
+ if (ring_buffer_null_event(event) &&
+ (iter->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @iter_next_cpu: The CPU that the next event belongs on
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ u64 delta;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_head_event(cpu_buffer);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @iter_next_cpu: The CPU that the next event belongs on
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ u64 delta;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ring_buffer_advance_head(cpu_buffer);
+
+ return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @iter_flags: control flags on how to read the buffer.
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * The iter_flags of RB_ITER_FL_SNAP will read the snapshot image
+ * and not the main buffer.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ iter->head = cpu_buffer->head;
+ iter->head_page = cpu_buffer->head_page;
+ rb_reset_iter_read_page(iter);
+ __raw_spin_unlock(&cpu_buffer->lock);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @cpu: The cpu buffer to read from.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ ring_buffer_advance_iter(iter);
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return PAGE_SIZE * buffer->pages;
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page = cpu_buffer->tail_page = 0;
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ unsigned long flags;
+ int cpu;
+
+ ring_buffer_lock(buffer, &flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ __ring_buffer_reset_cpu(buffer->buffers[cpu]);
+
+ ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!ring_buffer_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ cpu_buffer = buffer->buffers[cpu];
+ return ring_buffer_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
Index: linux-compile.git/kernel/trace/Kconfig
===================================================================
--- linux-compile.git.orig/kernel/trace/Kconfig 2008-09-24 13:21:18.000000000 -0400
+++ linux-compile.git/kernel/trace/Kconfig 2008-09-24 19:31:01.000000000 -0400
@@ -15,6 +15,9 @@ config TRACING
select DEBUG_FS
select STACKTRACE

+config RING_BUFFER
+ bool "ring buffer"
+
config FTRACE
bool "Kernel Function Tracer"
depends on HAVE_FTRACE
Index: linux-compile.git/kernel/trace/Makefile
===================================================================
--- linux-compile.git.orig/kernel/trace/Makefile 2008-09-24 13:21:18.000000000 -0400
+++ linux-compile.git/kernel/trace/Makefile 2008-09-24 19:31:01.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o

--
Steven Rostedt
2008-09-25 18:51:56 UTC
Permalink
Note: This patch is a proof of concept, and breaks a lot of
functionality of ftrace.

This patch simply makes ftrace work with the developmental ring buffer.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
kernel/trace/trace.c | 776 ++++++++------------------------------
kernel/trace/trace.h | 22 -
kernel/trace/trace_functions.c | 2
kernel/trace/trace_irqsoff.c | 6
kernel/trace/trace_mmiotrace.c | 10
kernel/trace/trace_sched_switch.c | 2
kernel/trace/trace_sched_wakeup.c | 2
7 files changed, 195 insertions(+), 625 deletions(-)

Index: linux-compile.git/kernel/trace/trace.c
===================================================================
--- linux-compile.git.orig/kernel/trace/trace.c 2008-09-25 12:34:11.000000000 -0400
+++ linux-compile.git/kernel/trace/trace.c 2008-09-25 12:34:23.000000000 -0400
@@ -31,25 +31,24 @@
#include <linux/writeback.h>

#include <linux/stacktrace.h>
+#include <linux/ring_buffer.h>

#include "trace.h"

+#define sdr_print(x, y...) printk("%s:%d " x "\n", __FUNCTION__, __LINE__, y)
+
+#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
+
unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
unsigned long __read_mostly tracing_thresh;

-static unsigned long __read_mostly tracing_nr_buffers;
static cpumask_t __read_mostly tracing_buffer_mask;

#define for_each_tracing_cpu(cpu) \
for_each_cpu_mask(cpu, tracing_buffer_mask)

-static int trace_alloc_page(void);
-static int trace_free_page(void);
-
static int tracing_disabled = 1;

-static unsigned long tracing_pages_allocated;
-
long
ns2usecs(cycle_t nsec)
{
@@ -100,11 +99,11 @@ static int tracer_enabled = 1;
int ftrace_function_enabled;

/*
- * trace_nr_entries is the number of entries that is allocated
- * for a buffer. Note, the number of entries is always rounded
- * to ENTRIES_PER_PAGE.
+ * trace_buf_size is the size in bytes that is allocated
+ * for a buffer. Note, the number of bytes is always rounded
+ * to page size.
*/
-static unsigned long trace_nr_entries = 65536UL;
+static unsigned long trace_buf_size = 65536UL;

/* trace_types holds a link list of available tracers. */
static struct tracer *trace_types __read_mostly;
@@ -139,8 +138,8 @@ static notrace void no_trace_init(struct

ftrace_function_enabled = 0;
if(tr->ctrl)
- for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
+ for_each_tracing_cpu(cpu)
+ tracing_reset(tr, cpu);
tracer_enabled = 0;
}

@@ -167,23 +166,21 @@ void trace_wake_up(void)
wake_up(&trace_wait);
}

-#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
-
-static int __init set_nr_entries(char *str)
+static int __init set_buf_size(char *str)
{
- unsigned long nr_entries;
+ unsigned long buf_size;
int ret;

if (!str)
return 0;
- ret = strict_strtoul(str, 0, &nr_entries);
+ ret = strict_strtoul(str, 0, &buf_size);
/* nr_entries can not be zero */
- if (ret < 0 || nr_entries == 0)
+ if (ret < 0 || buf_size == 0)
return 0;
- trace_nr_entries = nr_entries;
+ trace_buf_size = buf_size;
return 1;
}
-__setup("trace_entries=", set_nr_entries);
+__setup("trace_buf_size=", set_buf_size);

unsigned long nsecs_to_usecs(unsigned long nsecs)
{
@@ -266,54 +263,6 @@ __update_max_tr(struct trace_array *tr,
tracing_record_cmdline(current);
}

-#define CHECK_COND(cond) \
- if (unlikely(cond)) { \
- tracing_disabled = 1; \
- WARN_ON(1); \
- return -1; \
- }
-
-/**
- * check_pages - integrity check of trace buffers
- *
- * As a safty measure we check to make sure the data pages have not
- * been corrupted.
- */
-int check_pages(struct trace_array_cpu *data)
-{
- struct page *page, *tmp;
-
- CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
- CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
-
- list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
- CHECK_COND(page->lru.next->prev != &page->lru);
- CHECK_COND(page->lru.prev->next != &page->lru);
- }
-
- return 0;
-}
-
-/**
- * head_page - page address of the first page in per_cpu buffer.
- *
- * head_page returns the page address of the first page in
- * a per_cpu buffer. This also preforms various consistency
- * checks to make sure the buffer has not been corrupted.
- */
-void *head_page(struct trace_array_cpu *data)
-{
- struct page *page;
-
- if (list_empty(&data->trace_pages))
- return NULL;
-
- page = list_entry(data->trace_pages.next, struct page, lru);
- BUG_ON(&page->lru == &data->trace_pages);
-
- return page_address(page);
-}
-
/**
* trace_seq_printf - sequence printing of trace information
* @s: trace sequence descriptor
@@ -460,34 +409,6 @@ trace_print_seq(struct seq_file *m, stru
trace_seq_reset(s);
}

-/*
- * flip the trace buffers between two trace descriptors.
- * This usually is the buffers between the global_trace and
- * the max_tr to record a snapshot of a current trace.
- *
- * The ftrace_max_lock must be held.
- */
-static void
-flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
-{
- struct list_head flip_pages;
-
- INIT_LIST_HEAD(&flip_pages);
-
- memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
- sizeof(struct trace_array_cpu) -
- offsetof(struct trace_array_cpu, trace_head_idx));
-
- check_pages(tr1);
- check_pages(tr2);
- list_splice_init(&tr1->trace_pages, &flip_pages);
- list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
- list_splice_init(&flip_pages, &tr2->trace_pages);
- BUG_ON(!list_empty(&flip_pages));
- check_pages(tr1);
- check_pages(tr2);
-}
-
/**
* update_max_tr - snapshot all trace buffers from global_trace to max_tr
* @tr: tracer
@@ -500,17 +421,15 @@ flip_trace(struct trace_array_cpu *tr1,
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct trace_array_cpu *data;
- int i;
+ struct ring_buffer *buf = tr->buffer;

WARN_ON_ONCE(!irqs_disabled());
__raw_spin_lock(&ftrace_max_lock);
- /* clear out all the previous traces */
- for_each_tracing_cpu(i) {
- data = tr->data[i];
- flip_trace(max_tr.data[i], data);
- tracing_reset(data);
- }
+
+ tr->buffer = max_tr.buffer;
+ max_tr.buffer = buf;
+
+ ring_buffer_reset(tr->buffer);

__update_max_tr(tr, tsk, cpu);
__raw_spin_unlock(&ftrace_max_lock);
@@ -527,16 +446,15 @@ update_max_tr(struct trace_array *tr, st
void
update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct trace_array_cpu *data = tr->data[cpu];
- int i;
+ int ret;

WARN_ON_ONCE(!irqs_disabled());
__raw_spin_lock(&ftrace_max_lock);
- for_each_tracing_cpu(i)
- tracing_reset(max_tr.data[i]);

- flip_trace(max_tr.data[cpu], data);
- tracing_reset(data);
+ ring_buffer_reset(max_tr.buffer);
+ ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
+
+ WARN_ON_ONCE(ret);

__update_max_tr(tr, tsk, cpu);
__raw_spin_unlock(&ftrace_max_lock);
@@ -573,7 +491,6 @@ int register_tracer(struct tracer *type)
#ifdef CONFIG_FTRACE_STARTUP_TEST
if (type->selftest) {
struct tracer *saved_tracer = current_trace;
- struct trace_array_cpu *data;
struct trace_array *tr = &global_trace;
int saved_ctrl = tr->ctrl;
int i;
@@ -585,10 +502,7 @@ int register_tracer(struct tracer *type)
* If we fail, we do not register this tracer.
*/
for_each_tracing_cpu(i) {
- data = tr->data[i];
- if (!head_page(data))
- continue;
- tracing_reset(data);
+ tracing_reset(tr, i);
}
current_trace = type;
tr->ctrl = 0;
@@ -604,10 +518,7 @@ int register_tracer(struct tracer *type)
}
/* Only reset on passing, to avoid touching corrupted buffers */
for_each_tracing_cpu(i) {
- data = tr->data[i];
- if (!head_page(data))
- continue;
- tracing_reset(data);
+ tracing_reset(tr, i);
}
printk(KERN_CONT "PASSED\n");
}
@@ -653,13 +564,9 @@ void unregister_tracer(struct tracer *ty
mutex_unlock(&trace_types_lock);
}

-void tracing_reset(struct trace_array_cpu *data)
+void tracing_reset(struct trace_array *tr, int cpu)
{
- data->trace_idx = 0;
- data->overrun = 0;
- data->trace_head = data->trace_tail = head_page(data);
- data->trace_head_idx = 0;
- data->trace_tail_idx = 0;
+ ring_buffer_reset_cpu(tr->buffer, cpu);
}

#define SAVED_CMDLINES 128
@@ -745,70 +652,6 @@ void tracing_record_cmdline(struct task_
trace_save_cmdline(tsk);
}

-static inline struct list_head *
-trace_next_list(struct trace_array_cpu *data, struct list_head *next)
-{
- /*
- * Roundrobin - but skip the head (which is not a real page):
- */
- next = next->next;
- if (unlikely(next == &data->trace_pages))
- next = next->next;
- BUG_ON(next == &data->trace_pages);
-
- return next;
-}
-
-static inline void *
-trace_next_page(struct trace_array_cpu *data, void *addr)
-{
- struct list_head *next;
- struct page *page;
-
- page = virt_to_page(addr);
-
- next = trace_next_list(data, &page->lru);
- page = list_entry(next, struct page, lru);
-
- return page_address(page);
-}
-
-static inline struct trace_entry *
-tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
-{
- unsigned long idx, idx_next;
- struct trace_entry *entry;
-
- data->trace_idx++;
- idx = data->trace_head_idx;
- idx_next = idx + 1;
-
- BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
-
- entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
-
- if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
- data->trace_head = trace_next_page(data, data->trace_head);
- idx_next = 0;
- }
-
- if (data->trace_head == data->trace_tail &&
- idx_next == data->trace_tail_idx) {
- /* overrun */
- data->overrun++;
- data->trace_tail_idx++;
- if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
- data->trace_tail =
- trace_next_page(data, data->trace_tail);
- data->trace_tail_idx = 0;
- }
- }
-
- data->trace_head_idx = idx_next;
-
- return entry;
-}
-
static inline void
tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
{
@@ -819,7 +662,6 @@ tracing_generic_entry_update(struct trac

entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->t = ftrace_now(raw_smp_processor_id());
entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
@@ -833,15 +675,14 @@ trace_function(struct trace_array *tr, s
struct trace_entry *entry;
unsigned long irq_flags;

- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
- entry = tracing_get_trace_entry(tr, data);
+ entry = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
+ if (!entry)
+ return;
tracing_generic_entry_update(entry, flags);
entry->type = TRACE_FN;
entry->fn.ip = ip;
entry->fn.parent_ip = parent_ip;
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
+ ring_buffer_unlock_commit(tr->buffer, entry, irq_flags);
}

void
@@ -859,16 +700,13 @@ void __trace_mmiotrace_rw(struct trace_a
struct trace_entry *entry;
unsigned long irq_flags;

- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
-
- entry = tracing_get_trace_entry(tr, data);
+ entry = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
+ if (!entry)
+ return;
tracing_generic_entry_update(entry, 0);
entry->type = TRACE_MMIO_RW;
entry->mmiorw = *rw;
-
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
+ ring_buffer_unlock_commit(tr->buffer, entry, irq_flags);

trace_wake_up();
}
@@ -879,16 +717,13 @@ void __trace_mmiotrace_map(struct trace_
struct trace_entry *entry;
unsigned long irq_flags;

- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
-
- entry = tracing_get_trace_entry(tr, data);
+ entry = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
+ if (!entry)
+ return;
tracing_generic_entry_update(entry, 0);
entry->type = TRACE_MMIO_MAP;
entry->mmiomap = *map;
-
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
+ ring_buffer_unlock_commit(tr->buffer, entry, irq_flags);

trace_wake_up();
}
@@ -901,11 +736,14 @@ void __trace_stack(struct trace_array *t
{
struct trace_entry *entry;
struct stack_trace trace;
+ unsigned long irq_flags;

if (!(trace_flags & TRACE_ITER_STACKTRACE))
return;

- entry = tracing_get_trace_entry(tr, data);
+ entry = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
+ if (!entry)
+ return;
tracing_generic_entry_update(entry, flags);
entry->type = TRACE_STACK;

@@ -917,6 +755,7 @@ void __trace_stack(struct trace_array *t
trace.entries = entry->stack.caller;

save_stack_trace(&trace);
+ ring_buffer_unlock_commit(tr->buffer, entry, irq_flags);
}

void
@@ -928,17 +767,16 @@ __trace_special(void *__tr, void *__data
struct trace_entry *entry;
unsigned long irq_flags;

- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
- entry = tracing_get_trace_entry(tr, data);
+ entry = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
+ if (!entry)
+ return;
tracing_generic_entry_update(entry, 0);
entry->type = TRACE_SPECIAL;
entry->special.arg1 = arg1;
entry->special.arg2 = arg2;
entry->special.arg3 = arg3;
+ ring_buffer_unlock_commit(tr->buffer, entry, irq_flags);
__trace_stack(tr, data, irq_flags, 4);
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);

trace_wake_up();
}
@@ -953,9 +791,9 @@ tracing_sched_switch_trace(struct trace_
struct trace_entry *entry;
unsigned long irq_flags;

- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
- entry = tracing_get_trace_entry(tr, data);
+ entry = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
+ if (!entry)
+ return;
tracing_generic_entry_update(entry, flags);
entry->type = TRACE_CTX;
entry->ctx.prev_pid = prev->pid;
@@ -964,9 +802,8 @@ tracing_sched_switch_trace(struct trace_
entry->ctx.next_pid = next->pid;
entry->ctx.next_prio = next->prio;
entry->ctx.next_state = next->state;
+ ring_buffer_unlock_commit(tr->buffer, entry, irq_flags);
__trace_stack(tr, data, flags, 5);
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);
}

void
@@ -979,9 +816,9 @@ tracing_sched_wakeup_trace(struct trace_
struct trace_entry *entry;
unsigned long irq_flags;

- raw_local_irq_save(irq_flags);
- __raw_spin_lock(&data->lock);
- entry = tracing_get_trace_entry(tr, data);
+ entry = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
+ if (!entry)
+ return;
tracing_generic_entry_update(entry, flags);
entry->type = TRACE_WAKE;
entry->ctx.prev_pid = curr->pid;
@@ -990,9 +827,8 @@ tracing_sched_wakeup_trace(struct trace_
entry->ctx.next_pid = wakee->pid;
entry->ctx.next_prio = wakee->prio;
entry->ctx.next_state = wakee->state;
+ ring_buffer_unlock_commit(tr->buffer, entry, irq_flags);
__trace_stack(tr, data, flags, 6);
- __raw_spin_unlock(&data->lock);
- raw_local_irq_restore(irq_flags);

trace_wake_up();
}
@@ -1074,105 +910,66 @@ enum trace_file_type {
};

static struct trace_entry *
-trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
- struct trace_iterator *iter, int cpu)
-{
- struct page *page;
- struct trace_entry *array;
-
- if (iter->next_idx[cpu] >= tr->entries ||
- iter->next_idx[cpu] >= data->trace_idx ||
- (data->trace_head == data->trace_tail &&
- data->trace_head_idx == data->trace_tail_idx))
- return NULL;
-
- if (!iter->next_page[cpu]) {
- /* Initialize the iterator for this cpu trace buffer */
- WARN_ON(!data->trace_tail);
- page = virt_to_page(data->trace_tail);
- iter->next_page[cpu] = &page->lru;
- iter->next_page_idx[cpu] = data->trace_tail_idx;
- }
-
- page = list_entry(iter->next_page[cpu], struct page, lru);
- BUG_ON(&data->trace_pages == &page->lru);
-
- array = page_address(page);
-
- WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
- return &array[iter->next_page_idx[cpu]];
-}
-
-static struct trace_entry *
-find_next_entry(struct trace_iterator *iter, int *ent_cpu)
+find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
{
- struct trace_array *tr = iter->tr;
+ struct ring_buffer *buffer = iter->tr->buffer;
+ struct ring_buffer_event *event;
struct trace_entry *ent, *next = NULL;
+ u64 next_ts = 0, ts;
int next_cpu = -1;
int cpu;

for_each_tracing_cpu(cpu) {
- if (!head_page(tr->data[cpu]))
+ struct ring_buffer_iter *buf_iter;
+
+ if (ring_buffer_empty_cpu(buffer, cpu))
continue;
- ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
+
+ buf_iter = iter->buffer_iter[cpu];
+ event = ring_buffer_iter_peek(buf_iter, &ts);
+ ent = event ? ring_buffer_event_data(event) : NULL;
+
/*
* Pick the entry with the smallest timestamp:
*/
- if (ent && (!next || ent->t < next->t)) {
+ if (ent && (!next || ts < next_ts)) {
next = ent;
next_cpu = cpu;
+ next_ts = ts;
}
}

if (ent_cpu)
*ent_cpu = next_cpu;

+ if (ent_ts)
+ *ent_ts = next_ts;
+
return next;
}

static void trace_iterator_increment(struct trace_iterator *iter)
{
iter->idx++;
- iter->next_idx[iter->cpu]++;
- iter->next_page_idx[iter->cpu]++;
-
- if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
- struct trace_array_cpu *data = iter->tr->data[iter->cpu];
-
- iter->next_page_idx[iter->cpu] = 0;
- iter->next_page[iter->cpu] =
- trace_next_list(data, iter->next_page[iter->cpu]);
- }
+ ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
}

static void trace_consume(struct trace_iterator *iter)
{
- struct trace_array_cpu *data = iter->tr->data[iter->cpu];
-
- data->trace_tail_idx++;
- if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
- data->trace_tail = trace_next_page(data, data->trace_tail);
- data->trace_tail_idx = 0;
- }
-
- /* Check if we empty it, then reset the index */
- if (data->trace_head == data->trace_tail &&
- data->trace_head_idx == data->trace_tail_idx)
- data->trace_idx = 0;
+ ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
}

static void *find_next_entry_inc(struct trace_iterator *iter)
{
struct trace_entry *next;
int next_cpu = -1;
+ u64 ts;

- next = find_next_entry(iter, &next_cpu);
-
- iter->prev_ent = iter->ent;
- iter->prev_cpu = iter->cpu;
+ next = find_next_entry(iter, &next_cpu, &ts);

iter->ent = next;
iter->cpu = next_cpu;
+ iter->ts = ts;

if (next)
trace_iterator_increment(iter);
@@ -1210,7 +1007,7 @@ static void *s_start(struct seq_file *m,
struct trace_iterator *iter = m->private;
void *p = NULL;
loff_t l = 0;
- int i;
+ int cpu;

mutex_lock(&trace_types_lock);

@@ -1229,12 +1026,9 @@ static void *s_start(struct seq_file *m,
iter->ent = NULL;
iter->cpu = 0;
iter->idx = -1;
- iter->prev_ent = NULL;
- iter->prev_cpu = -1;

- for_each_tracing_cpu(i) {
- iter->next_idx[i] = 0;
- iter->next_page[i] = NULL;
+ for_each_tracing_cpu(cpu) {
+ ring_buffer_iter_reset(iter->buffer_iter[cpu]);
}

for (p = iter; p && l < *pos; p = s_next(m, p, &l))
@@ -1357,21 +1151,12 @@ print_trace_header(struct seq_file *m, s
struct tracer *type = current_trace;
unsigned long total = 0;
unsigned long entries = 0;
- int cpu;
const char *name = "preemption";

if (type)
name = type->name;

- for_each_tracing_cpu(cpu) {
- if (head_page(tr->data[cpu])) {
- total += tr->data[cpu]->trace_idx;
- if (tr->data[cpu]->trace_idx > tr->entries)
- entries += tr->entries;
- else
- entries += tr->data[cpu]->trace_idx;
- }
- }
+ entries = ring_buffer_entries(iter->tr->buffer);

seq_printf(m, "%s latency trace v1.1.5 on %s\n",
name, UTS_RELEASE);
@@ -1457,7 +1242,7 @@ lat_print_generic(struct trace_seq *s, s
unsigned long preempt_mark_thresh = 100;

static void
-lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
+lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
unsigned long rel_usecs)
{
trace_seq_printf(s, " %4lldus", abs_usecs);
@@ -1476,20 +1261,22 @@ print_lat_fmt(struct trace_iterator *ite
{
struct trace_seq *s = &iter->seq;
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
- struct trace_entry *next_entry = find_next_entry(iter, NULL);
+ struct trace_entry *next_entry;
unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
struct trace_entry *entry = iter->ent;
unsigned long abs_usecs;
unsigned long rel_usecs;
+ u64 next_ts;
char *comm;
int S, T;
int i;
unsigned state;

+ next_entry = find_next_entry(iter, NULL, &next_ts);
if (!next_entry)
- next_entry = entry;
- rel_usecs = ns2usecs(next_entry->t - entry->t);
- abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
+ next_ts = iter->ts;
+ rel_usecs = ns2usecs(next_ts - iter->ts);
+ abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);

if (verbose) {
comm = trace_find_cmdline(entry->pid);
@@ -1498,7 +1285,7 @@ print_lat_fmt(struct trace_iterator *ite
comm,
entry->pid, cpu, entry->flags,
entry->preempt_count, trace_idx,
- ns2usecs(entry->t),
+ ns2usecs(iter->ts),
abs_usecs/1000,
abs_usecs % 1000, rel_usecs/1000,
rel_usecs % 1000);
@@ -1569,7 +1356,7 @@ static int print_trace_fmt(struct trace_

comm = trace_find_cmdline(iter->ent->pid);

- t = ns2usecs(entry->t);
+ t = ns2usecs(iter->ts);
usec_rem = do_div(t, 1000000ULL);
secs = (unsigned long)t;

@@ -1660,7 +1447,7 @@ static int print_raw_fmt(struct trace_it
entry = iter->ent;

ret = trace_seq_printf(s, "%d %d %llu ",
- entry->pid, iter->cpu, entry->t);
+ entry->pid, iter->cpu, iter->ts);
if (!ret)
return 0;

@@ -1725,7 +1512,7 @@ static int print_hex_fmt(struct trace_it

SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
- SEQ_PUT_HEX_FIELD_RET(s, entry->t);
+ SEQ_PUT_HEX_FIELD_RET(s, iter->ts);

switch (entry->type) {
case TRACE_FN:
@@ -1769,7 +1556,7 @@ static int print_bin_fmt(struct trace_it

SEQ_PUT_FIELD_RET(s, entry->pid);
SEQ_PUT_FIELD_RET(s, entry->cpu);
- SEQ_PUT_FIELD_RET(s, entry->t);
+ SEQ_PUT_FIELD_RET(s, iter->ts);

switch (entry->type) {
case TRACE_FN:
@@ -1796,16 +1583,10 @@ static int print_bin_fmt(struct trace_it

static int trace_empty(struct trace_iterator *iter)
{
- struct trace_array_cpu *data;
int cpu;

for_each_tracing_cpu(cpu) {
- data = iter->tr->data[cpu];
-
- if (head_page(data) && data->trace_idx &&
- (data->trace_tail != data->trace_head ||
- data->trace_tail_idx != data->trace_head_idx))
- return 0;
+ ring_buffer_iter_empty(iter->buffer_iter[cpu]);
}
return 1;
}
@@ -1869,6 +1650,8 @@ static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, int *ret)
{
struct trace_iterator *iter;
+ struct seq_file *m;
+ int cpu;

if (tracing_disabled) {
*ret = -ENODEV;
@@ -1889,28 +1672,43 @@ __tracing_open(struct inode *inode, stru
iter->trace = current_trace;
iter->pos = -1;

+ for_each_tracing_cpu(cpu) {
+ iter->buffer_iter[cpu] =
+ ring_buffer_read_start(iter->tr->buffer, cpu);
+ if (!iter->buffer_iter[cpu])
+ goto fail_buffer;
+ }
+
/* TODO stop tracer */
*ret = seq_open(file, &tracer_seq_ops);
- if (!*ret) {
- struct seq_file *m = file->private_data;
- m->private = iter;
+ if (*ret)
+ goto fail_buffer;

- /* stop the trace while dumping */
- if (iter->tr->ctrl) {
- tracer_enabled = 0;
- ftrace_function_enabled = 0;
- }
+ m = file->private_data;
+ m->private = iter;

- if (iter->trace && iter->trace->open)
- iter->trace->open(iter);
- } else {
- kfree(iter);
- iter = NULL;
+ /* stop the trace while dumping */
+ if (iter->tr->ctrl) {
+ tracer_enabled = 0;
+ ftrace_function_enabled = 0;
}
+
+ if (iter->trace && iter->trace->open)
+ iter->trace->open(iter);
+
mutex_unlock(&trace_types_lock);

out:
return iter;
+
+ fail_buffer:
+ for_each_tracing_cpu(cpu) {
+ if (iter->buffer_iter[cpu])
+ ring_buffer_read_finish(iter->buffer_iter[cpu]);
+ }
+ mutex_unlock(&trace_types_lock);
+
+ return ERR_PTR(-ENOMEM);
}

int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -1926,8 +1724,14 @@ int tracing_release(struct inode *inode,
{
struct seq_file *m = (struct seq_file *)file->private_data;
struct trace_iterator *iter = m->private;
+ int cpu;

mutex_lock(&trace_types_lock);
+ for_each_tracing_cpu(cpu) {
+ if (iter->buffer_iter[cpu])
+ ring_buffer_read_finish(iter->buffer_iter[cpu]);
+ }
+
if (iter->trace && iter->trace->close)
iter->trace->close(iter);

@@ -2500,13 +2304,10 @@ tracing_read_pipe(struct file *filp, cha
size_t cnt, loff_t *ppos)
{
struct trace_iterator *iter = filp->private_data;
- struct trace_array_cpu *data;
- static cpumask_t mask;
unsigned long flags;
#ifdef CONFIG_FTRACE
int ftrace_save;
#endif
- int cpu;
ssize_t sret;

/* return any leftover data */
@@ -2595,32 +2396,13 @@ tracing_read_pipe(struct file *filp, cha
* and then release the locks again.
*/

- cpus_clear(mask);
- local_irq_save(flags);
+ local_irq_disable();
#ifdef CONFIG_FTRACE
ftrace_save = ftrace_enabled;
ftrace_enabled = 0;
#endif
smp_wmb();
- for_each_tracing_cpu(cpu) {
- data = iter->tr->data[cpu];
-
- if (!head_page(data) || !data->trace_idx)
- continue;
-
- atomic_inc(&data->disabled);
- cpu_set(cpu, mask);
- }
-
- for_each_cpu_mask(cpu, mask) {
- data = iter->tr->data[cpu];
- __raw_spin_lock(&data->lock);
-
- if (data->overrun > iter->last_overrun[cpu])
- iter->overrun[cpu] +=
- data->overrun - iter->last_overrun[cpu];
- iter->last_overrun[cpu] = data->overrun;
- }
+ ring_buffer_lock(iter->tr->buffer, &flags);

while (find_next_entry_inc(iter) != NULL) {
int ret;
@@ -2639,19 +2421,11 @@ tracing_read_pipe(struct file *filp, cha
break;
}

- for_each_cpu_mask(cpu, mask) {
- data = iter->tr->data[cpu];
- __raw_spin_unlock(&data->lock);
- }
-
- for_each_cpu_mask(cpu, mask) {
- data = iter->tr->data[cpu];
- atomic_dec(&data->disabled);
- }
+ ring_buffer_unlock(iter->tr->buffer, flags);
#ifdef CONFIG_FTRACE
ftrace_enabled = ftrace_save;
#endif
- local_irq_restore(flags);
+ local_irq_enable();

/* Now copy what we have to the user */
sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -2684,7 +2458,7 @@ tracing_entries_write(struct file *filp,
{
unsigned long val;
char buf[64];
- int i, ret;
+ int ret;

if (cnt >= sizeof(buf))
return -EINVAL;
@@ -2711,52 +2485,31 @@ tracing_entries_write(struct file *filp,
goto out;
}

- if (val > global_trace.entries) {
- long pages_requested;
- unsigned long freeable_pages;
-
- /* make sure we have enough memory before mapping */
- pages_requested =
- (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
-
- /* account for each buffer (and max_tr) */
- pages_requested *= tracing_nr_buffers * 2;
-
- /* Check for overflow */
- if (pages_requested < 0) {
- cnt = -ENOMEM;
+ if (val != global_trace.entries) {
+ ret = ring_buffer_resize(global_trace.buffer, val);
+ if (ret < 0) {
+ cnt = ret;
goto out;
}

- freeable_pages = determine_dirtyable_memory();
-
- /* we only allow to request 1/4 of useable memory */
- if (pages_requested >
- ((freeable_pages + tracing_pages_allocated) / 4)) {
- cnt = -ENOMEM;
- goto out;
- }
-
- while (global_trace.entries < val) {
- if (trace_alloc_page()) {
- cnt = -ENOMEM;
- goto out;
+ ret = ring_buffer_resize(max_tr.buffer, val);
+ if (ret < 0) {
+ int r;
+ cnt = ret;
+ r = ring_buffer_resize(global_trace.buffer,
+ global_trace.entries);
+ if (r < 0) {
+ /* AARGH! We are left with different
+ * size max buffer!!!! */
+ WARN_ON(1);
+ tracing_disabled = 1;
}
- /* double check that we don't go over the known pages */
- if (tracing_pages_allocated > pages_requested)
- break;
+ goto out;
}

- } else {
- /* include the number of entries in val (inc of page entries) */
- while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
- trace_free_page();
+ global_trace.entries = val;
}

- /* check integrity */
- for_each_tracing_cpu(i)
- check_pages(global_trace.data[i]);
-
filp->f_pos += cnt;

/* If check pages failed, return ENOMEM */
@@ -2930,190 +2683,41 @@ static __init void tracer_init_debugfs(v
#endif
}

-static int trace_alloc_page(void)
+__init static int tracer_alloc_buffers(void)
{
struct trace_array_cpu *data;
- struct page *page, *tmp;
- LIST_HEAD(pages);
- void *array;
- unsigned pages_allocated = 0;
int i;

- /* first allocate a page for each CPU */
- for_each_tracing_cpu(i) {
- array = (void *)__get_free_page(GFP_KERNEL);
- if (array == NULL) {
- printk(KERN_ERR "tracer: failed to allocate page"
- "for trace buffer!\n");
- goto free_pages;
- }
-
- pages_allocated++;
- page = virt_to_page(array);
- list_add(&page->lru, &pages);
+ /* TODO: make the number of buffers hot pluggable with CPUS */
+ tracing_buffer_mask = cpu_possible_map;

-/* Only allocate if we are actually using the max trace */
-#ifdef CONFIG_TRACER_MAX_TRACE
- array = (void *)__get_free_page(GFP_KERNEL);
- if (array == NULL) {
- printk(KERN_ERR "tracer: failed to allocate page"
- "for trace buffer!\n");
- goto free_pages;
- }
- pages_allocated++;
- page = virt_to_page(array);
- list_add(&page->lru, &pages);
-#endif
+ global_trace.buffer = ring_buffer_alloc(trace_buf_size,
+ TRACE_BUFFER_FLAGS);
+ if (!global_trace.buffer) {
+ printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
+ WARN_ON(1);
+ return 0;
}
-
- /* Now that we successfully allocate a page per CPU, add them */
- for_each_tracing_cpu(i) {
- data = global_trace.data[i];
- page = list_entry(pages.next, struct page, lru);
- list_del_init(&page->lru);
- list_add_tail(&page->lru, &data->trace_pages);
- ClearPageLRU(page);
+ global_trace.entries = ring_buffer_size(global_trace.buffer);

#ifdef CONFIG_TRACER_MAX_TRACE
- data = max_tr.data[i];
- page = list_entry(pages.next, struct page, lru);
- list_del_init(&page->lru);
- list_add_tail(&page->lru, &data->trace_pages);
- SetPageLRU(page);
-#endif
- }
- tracing_pages_allocated += pages_allocated;
- global_trace.entries += ENTRIES_PER_PAGE;
-
- return 0;
-
- free_pages:
- list_for_each_entry_safe(page, tmp, &pages, lru) {
- list_del_init(&page->lru);
- __free_page(page);
+ max_tr.buffer = ring_buffer_alloc(trace_buf_size,
+ TRACE_BUFFER_FLAGS);
+ if (!max_tr.buffer) {
+ printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
+ WARN_ON(1);
+ ring_buffer_free(global_trace.buffer);
+ return 0;
}
- return -ENOMEM;
-}
-
-static int trace_free_page(void)
-{
- struct trace_array_cpu *data;
- struct page *page;
- struct list_head *p;
- int i;
- int ret = 0;
-
- /* free one page from each buffer */
- for_each_tracing_cpu(i) {
- data = global_trace.data[i];
- p = data->trace_pages.next;
- if (p == &data->trace_pages) {
- /* should never happen */
- WARN_ON(1);
- tracing_disabled = 1;
- ret = -1;
- break;
- }
- page = list_entry(p, struct page, lru);
- ClearPageLRU(page);
- list_del(&page->lru);
- tracing_pages_allocated--;
- tracing_pages_allocated--;
- __free_page(page);
-
- tracing_reset(data);
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- data = max_tr.data[i];
- p = data->trace_pages.next;
- if (p == &data->trace_pages) {
- /* should never happen */
- WARN_ON(1);
- tracing_disabled = 1;
- ret = -1;
- break;
- }
- page = list_entry(p, struct page, lru);
- ClearPageLRU(page);
- list_del(&page->lru);
- __free_page(page);
-
- tracing_reset(data);
+ max_tr.entries = ring_buffer_size(max_tr.buffer);
+ WARN_ON(max_tr.entries != global_trace.entries);
#endif
- }
- global_trace.entries -= ENTRIES_PER_PAGE;
-
- return ret;
-}
-
-__init static int tracer_alloc_buffers(void)
-{
- struct trace_array_cpu *data;
- void *array;
- struct page *page;
- int pages = 0;
- int ret = -ENOMEM;
- int i;
-
- /* TODO: make the number of buffers hot pluggable with CPUS */
- tracing_nr_buffers = num_possible_cpus();
- tracing_buffer_mask = cpu_possible_map;

/* Allocate the first page for all buffers */
for_each_tracing_cpu(i) {
data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
max_tr.data[i] = &per_cpu(max_data, i);
-
- array = (void *)__get_free_page(GFP_KERNEL);
- if (array == NULL) {
- printk(KERN_ERR "tracer: failed to allocate page"
- "for trace buffer!\n");
- goto free_buffers;
- }
-
- /* set the array to the list */
- INIT_LIST_HEAD(&data->trace_pages);
- page = virt_to_page(array);
- list_add(&page->lru, &data->trace_pages);
- /* use the LRU flag to differentiate the two buffers */
- ClearPageLRU(page);
-
- data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
- max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-
-/* Only allocate if we are actually using the max trace */
-#ifdef CONFIG_TRACER_MAX_TRACE
- array = (void *)__get_free_page(GFP_KERNEL);
- if (array == NULL) {
- printk(KERN_ERR "tracer: failed to allocate page"
- "for trace buffer!\n");
- goto free_buffers;
- }
-
- INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
- page = virt_to_page(array);
- list_add(&page->lru, &max_tr.data[i]->trace_pages);
- SetPageLRU(page);
-#endif
- }
-
- /*
- * Since we allocate by orders of pages, we may be able to
- * round up a bit.
- */
- global_trace.entries = ENTRIES_PER_PAGE;
- pages++;
-
- while (global_trace.entries < trace_nr_entries) {
- if (trace_alloc_page())
- break;
- pages++;
}
- max_tr.entries = global_trace.entries;
-
- pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
- pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
- pr_info(" actual entries %ld\n", global_trace.entries);

tracer_init_debugfs();

@@ -3127,31 +2731,5 @@ __init static int tracer_alloc_buffers(v
tracing_disabled = 0;

return 0;
-
- free_buffers:
- for (i-- ; i >= 0; i--) {
- struct page *page, *tmp;
- struct trace_array_cpu *data = global_trace.data[i];
-
- if (data) {
- list_for_each_entry_safe(page, tmp,
- &data->trace_pages, lru) {
- list_del_init(&page->lru);
- __free_page(page);
- }
- }
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- data = max_tr.data[i];
- if (data) {
- list_for_each_entry_safe(page, tmp,
- &data->trace_pages, lru) {
- list_del_init(&page->lru);
- __free_page(page);
- }
- }
-#endif
- }
- return ret;
}
fs_initcall(tracer_alloc_buffers);
Index: linux-compile.git/kernel/trace/trace.h
===================================================================
--- linux-compile.git.orig/kernel/trace/trace.h 2008-09-25 12:34:11.000000000 -0400
+++ linux-compile.git/kernel/trace/trace.h 2008-09-25 12:34:23.000000000 -0400
@@ -6,6 +6,7 @@
#include <linux/sched.h>
#include <linux/clocksource.h>
#include <linux/mmiotrace.h>
+#include <linux/ring_buffer.h>

enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -72,7 +73,6 @@ struct trace_entry {
char flags;
char preempt_count;
int pid;
- cycle_t t;
union {
struct ftrace_entry fn;
struct ctx_switch_entry ctx;
@@ -91,16 +91,9 @@ struct trace_entry {
* the trace, etc.)
*/
struct trace_array_cpu {
- struct list_head trace_pages;
atomic_t disabled;
- raw_spinlock_t lock;
- struct lock_class_key lock_key;

/* these fields get copied into max-trace: */
- unsigned trace_head_idx;
- unsigned trace_tail_idx;
- void *trace_head; /* producer */
- void *trace_tail; /* consumer */
unsigned long trace_idx;
unsigned long overrun;
unsigned long saved_latency;
@@ -124,6 +117,7 @@ struct trace_iterator;
* They have on/off state as well:
*/
struct trace_array {
+ struct ring_buffer *buffer;
unsigned long entries;
long ctrl;
int cpu;
@@ -171,26 +165,20 @@ struct trace_iterator {
struct trace_array *tr;
struct tracer *trace;
void *private;
- long last_overrun[NR_CPUS];
- long overrun[NR_CPUS];
+ struct ring_buffer_iter *buffer_iter[NR_CPUS];

/* The below is zeroed out in pipe_read */
struct trace_seq seq;
struct trace_entry *ent;
int cpu;
-
- struct trace_entry *prev_ent;
- int prev_cpu;
+ u64 ts;

unsigned long iter_flags;
loff_t pos;
- unsigned long next_idx[NR_CPUS];
- struct list_head *next_page[NR_CPUS];
- unsigned next_page_idx[NR_CPUS];
long idx;
};

-void tracing_reset(struct trace_array_cpu *data);
+void tracing_reset(struct trace_array *tr, int cpu);
int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *tracing_init_dentry(void);
void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
Index: linux-compile.git/kernel/trace/trace_functions.c
===================================================================
--- linux-compile.git.orig/kernel/trace/trace_functions.c 2008-09-25 12:34:11.000000000 -0400
+++ linux-compile.git/kernel/trace/trace_functions.c 2008-09-25 12:34:23.000000000 -0400
@@ -23,7 +23,7 @@ static void function_reset(struct trace_
tr->time_start = ftrace_now(tr->cpu);

for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
+ tracing_reset(tr, cpu);
}

static void start_function_trace(struct trace_array *tr)
Index: linux-compile.git/kernel/trace/trace_irqsoff.c
===================================================================
--- linux-compile.git.orig/kernel/trace/trace_irqsoff.c 2008-09-25 12:34:11.000000000 -0400
+++ linux-compile.git/kernel/trace/trace_irqsoff.c 2008-09-25 12:34:23.000000000 -0400
@@ -173,7 +173,7 @@ out_unlock:
out:
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
- tracing_reset(data);
+ tracing_reset(tr, cpu);
trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
}

@@ -203,7 +203,7 @@ start_critical_timing(unsigned long ip,
data->critical_sequence = max_sequence;
data->preempt_timestamp = ftrace_now(cpu);
data->critical_start = parent_ip ? : ip;
- tracing_reset(data);
+ tracing_reset(tr, cpu);

local_save_flags(flags);

@@ -234,7 +234,7 @@ stop_critical_timing(unsigned long ip, u

data = tr->data[cpu];

- if (unlikely(!data) || unlikely(!head_page(data)) ||
+ if (unlikely(!data) ||
!data->critical_start || atomic_read(&data->disabled))
return;

Index: linux-compile.git/kernel/trace/trace_mmiotrace.c
===================================================================
--- linux-compile.git.orig/kernel/trace/trace_mmiotrace.c 2008-09-25 12:34:11.000000000 -0400
+++ linux-compile.git/kernel/trace/trace_mmiotrace.c 2008-09-25 12:34:23.000000000 -0400
@@ -27,7 +27,7 @@ static void mmio_reset_data(struct trace
tr->time_start = ftrace_now(tr->cpu);

for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
+ tracing_reset(tr, cpu);
}

static void mmio_trace_init(struct trace_array *tr)
@@ -130,10 +130,14 @@ static unsigned long count_overruns(stru
{
int cpu;
unsigned long cnt = 0;
+/* FIXME: */
+#if 0
for_each_online_cpu(cpu) {
cnt += iter->overrun[cpu];
iter->overrun[cpu] = 0;
}
+#endif
+ (void)cpu;
return cnt;
}

@@ -176,7 +180,7 @@ static int mmio_print_rw(struct trace_it
struct trace_entry *entry = iter->ent;
struct mmiotrace_rw *rw = &entry->mmiorw;
struct trace_seq *s = &iter->seq;
- unsigned long long t = ns2usecs(entry->t);
+ unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, 1000000ULL);
unsigned secs = (unsigned long)t;
int ret = 1;
@@ -218,7 +222,7 @@ static int mmio_print_map(struct trace_i
struct trace_entry *entry = iter->ent;
struct mmiotrace_map *m = &entry->mmiomap;
struct trace_seq *s = &iter->seq;
- unsigned long long t = ns2usecs(entry->t);
+ unsigned long long t = ns2usecs(iter->ts);
unsigned long usec_rem = do_div(t, 1000000ULL);
unsigned secs = (unsigned long)t;
int ret = 1;
Index: linux-compile.git/kernel/trace/trace_sched_switch.c
===================================================================
--- linux-compile.git.orig/kernel/trace/trace_sched_switch.c 2008-09-25 12:34:11.000000000 -0400
+++ linux-compile.git/kernel/trace/trace_sched_switch.c 2008-09-25 12:34:23.000000000 -0400
@@ -133,7 +133,7 @@ static void sched_switch_reset(struct tr
tr->time_start = ftrace_now(tr->cpu);

for_each_online_cpu(cpu)
- tracing_reset(tr->data[cpu]);
+ tracing_reset(tr, cpu);
}

static int tracing_sched_register(void)
Index: linux-compile.git/kernel/trace/trace_sched_wakeup.c
===================================================================
--- linux-compile.git.orig/kernel/trace/trace_sched_wakeup.c 2008-09-25 12:34:11.000000000 -0400
+++ linux-compile.git/kernel/trace/trace_sched_wakeup.c 2008-09-25 12:34:23.000000000 -0400
@@ -216,7 +216,7 @@ static void __wakeup_reset(struct trace_

for_each_possible_cpu(cpu) {
data = tr->data[cpu];
- tracing_reset(data);
+ tracing_reset(tr, cpu);
}

wakeup_cpu = -1;

--
Steven Rostedt
2008-09-26 01:02:44 UTC
Permalink
This version has been cleaned up a bit. I've been running it as
a back end to ftrace, and it has been handling pretty well.

I did not implement the GTOD sync part and will leave that for later.
But this is the basic design that I like and will be the basis
of my future work.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 178 ++++++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1252 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1435 insertions(+)

Index: linux-trace.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/include/linux/ring_buffer.h 2008-09-25 20:36:12.000000000 -0400
@@ -0,0 +1,178 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+} __attribute__((__packed__));
+
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
+ * array is ignored
+ * size is variable depending on
+ */
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
+ /* FIXME: RB_TYPE_TIME_STAMP not implemented */
+ RB_TYPE_TIME_STAMP, /* Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ */
+
+ RB_TYPE_DATA, /* Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4] holds data
+ */
+};
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
+
+enum {
+ RB_LEN_TIME_EXTENT = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RB_TYPE_TIME_EXTENT:
+ return RB_LEN_TIME_EXTENT;
+
+ case RB_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RB_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-trace.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/kernel/trace/ring_buffer.c 2008-09-25 20:35:44.000000000 -0400
@@ -0,0 +1,1252 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ /* mult -1 to test normalize */
+ return sched_clock() * -1;
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+ *ts *= -1;
+}
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST ~TS_MASK
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int
+test_time_stamp(unsigned long long delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+struct buffer_page {
+ u64 time_stamp;
+ unsigned char body[];
+};
+
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(u64))
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct buffer_page **pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ unsigned long head_page;
+ unsigned long tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ spinlock_t lock;
+
+ /* FIXME: this should be online CPUS */
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ unsigned long head_page;
+ u64 read_stamp;
+};
+
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int pages = buffer->pages;
+ int i;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
+ if (!cpu_buffer->pages)
+ goto fail_free_buffer;
+
+ for (i = 0; i < pages; i++) {
+ cpu_buffer->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!cpu_buffer->pages[i])
+ goto fail_free_pages;
+ }
+
+ return cpu_buffer;
+
+ fail_free_pages:
+ for (i = 0; i < pages; i++) {
+ if (cpu_buffer->pages[i])
+ free_page((unsigned long)cpu_buffer->pages[i]);
+ }
+ kfree(cpu_buffer->pages);
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void
+ring_buffer_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ int i;
+
+ for (i = 0; i < cpu_buffer->buffer->pages; i++) {
+ if (cpu_buffer->pages[i])
+ free_page((unsigned long)cpu_buffer->pages[i]);
+ }
+ kfree(cpu_buffer->pages);
+ kfree(cpu_buffer);
+}
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int cpu;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = (size + (PAGE_SIZE - 1)) / PAGE_SIZE;
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ /* FIXME: do for only online CPUS */
+ buffer->cpus = num_possible_cpus();
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ buffer->buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ spin_lock_init(&buffer->lock);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ if (buffer->buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * Returns -1 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ /* FIXME: */
+ return -1;
+}
+
+static inline int
+ring_buffer_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int
+ring_buffer_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RB_TYPE_PADDING;
+}
+
+static inline void *
+rb_page_body(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long page, unsigned index)
+{
+ return cpu_buffer->pages[page]->body + index;
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_body(cpu_buffer,cpu_buffer->head_page,
+ cpu_buffer->head);
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_iter_head_event(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ return rb_page_body(cpu_buffer, iter->head_page,
+ iter->head);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ */
+static void
+ring_buffer_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < BUF_PAGE_SIZE;
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_body(cpu_buffer, cpu_buffer->head_page, head);
+ if (ring_buffer_null_event(event))
+ break;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void
+ring_buffer_inc_page(struct ring_buffer *buffer,
+ unsigned long *page)
+{
+ (*page)++;
+ if (*page >= buffer->pages)
+ *page = 0;
+}
+
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+ struct buffer_page *bpage;
+
+ bpage = cpu_buffer->pages[cpu_buffer->tail_page];
+ bpage->time_stamp = *ts;
+}
+
+static void
+rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *bpage;
+
+ cpu_buffer->head = 0;
+ bpage = cpu_buffer->pages[cpu_buffer->head_page];
+ cpu_buffer->read_stamp = bpage->time_stamp;
+}
+
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+ struct buffer_page *bpage;
+
+ iter->head = 0;
+ bpage = cpu_buffer->pages[iter->head_page];
+ iter->read_stamp = bpage->time_stamp;
+}
+
+/**
+ * ring_buffer_update_event - update event type and data
+ * @event: the even to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+ring_buffer_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RB_TYPE_PADDING:
+ break;
+
+ case RB_TYPE_TIME_EXTENT:
+ event->len =
+ (RB_LEN_TIME_EXTENT + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
+
+static struct ring_buffer_event *
+__ring_buffer_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ unsigned long head_page, tail_page, tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ BUG_ON(tail_page >= buffer->pages);
+ BUG_ON(head_page >= buffer->pages);
+
+ if (tail + length > BUF_PAGE_SIZE) {
+ unsigned long next_page = tail_page;
+
+ ring_buffer_inc_page(buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ ring_buffer_update_overflow(cpu_buffer);
+
+ ring_buffer_inc_page(buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ rb_reset_read_page(cpu_buffer);
+ }
+
+ if (tail != BUF_PAGE_SIZE) {
+ event = rb_page_body(cpu_buffer, tail_page, tail);
+ /* page padding */
+ event->type = RB_TYPE_PADDING;
+ }
+
+ tail = 0;
+ tail_page = next_page;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ rb_add_stamp(cpu_buffer, ts);
+ }
+
+ BUG_ON(tail_page >= buffer->pages);
+ BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+ event = rb_page_body(cpu_buffer, tail_page, tail);
+ ring_buffer_update_event(event, type, length);
+ cpu_buffer->entries++;
+
+ return event;
+}
+
+static struct ring_buffer_event *
+ring_buffer_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ unsigned long long ts, delta;
+ struct ring_buffer_event *event;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __ring_buffer_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (!cpu_buffer->tail) {
+ /*
+ * new page, dont commit this and add the
+ * time stamp to the page instead.
+ */
+ rb_add_stamp(cpu_buffer, &ts);
+ } else {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ }
+
+ cpu_buffer->write_stamp = ts;
+ delta = 0;
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __ring_buffer_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ event->time_delta = delta;
+ cpu_buffer->write_stamp = ts;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto no_record;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ return NULL;
+
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, length);
+ if (!event)
+ goto no_record;
+
+ return event;
+
+ no_record:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+ cpu_buffer->tail += ring_buffer_event_length(event);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *body;
+ int ret = 0;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ body = ring_buffer_event_data(event);
+
+ memcpy(body, data, length);
+ cpu_buffer->tail += event_length;
+
+ out:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ iter->head_page = cpu_buffer->head_page;
+ iter->head = cpu_buffer->head;
+ rb_reset_iter_read_page(iter);
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->tail_page &&
+ iter->head == cpu_buffer->tail;
+}
+
+static void
+ring_buffer_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ event = ring_buffer_head_event(cpu_buffer);
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + length > cpu_buffer->tail));
+
+ cpu_buffer->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_head_event(cpu_buffer);
+ if (ring_buffer_null_event(event) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_head(cpu_buffer);
+}
+
+static void
+ring_buffer_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(iter->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+ (iter->head + length > cpu_buffer->tail));
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_iter_head_event(iter);
+ if (ring_buffer_null_event(event) &&
+ (iter->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ u64 delta;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_head_event(cpu_buffer);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ /* Internal data, OK to advance */
+ ring_buffer_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ ring_buffer_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ u64 delta;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ /* Internal data, OK to advance */
+ ring_buffer_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ ring_buffer_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ring_buffer_advance_head(cpu_buffer);
+
+ return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ iter->head = cpu_buffer->head;
+ iter->head_page = cpu_buffer->head_page;
+ rb_reset_iter_read_page(iter);
+ __raw_spin_unlock(&cpu_buffer->lock);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ ring_buffer_advance_iter(iter);
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page = cpu_buffer->tail_page = 0;
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ unsigned long flags;
+ int cpu;
+
+ ring_buffer_lock(buffer, &flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ __ring_buffer_reset_cpu(buffer->buffers[cpu]);
+
+ ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!ring_buffer_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return ring_buffer_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
Index: linux-trace.git/kernel/trace/Kconfig
===================================================================
--- linux-trace.git.orig/kernel/trace/Kconfig 2008-09-25 18:26:10.000000000 -0400
+++ linux-trace.git/kernel/trace/Kconfig 2008-09-25 18:30:51.000000000 -0400
@@ -10,10 +10,14 @@ config HAVE_DYNAMIC_FTRACE
config TRACER_MAX_TRACE
bool

+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
select STACKTRACE
+ select RING_BUFFER

config FTRACE
bool "Kernel Function Tracer"
Index: linux-trace.git/kernel/trace/Makefile
===================================================================
--- linux-trace.git.orig/kernel/trace/Makefile 2008-09-25 18:26:10.000000000 -0400
+++ linux-trace.git/kernel/trace/Makefile 2008-09-25 18:29:07.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
Masami Hiramatsu
2008-09-26 01:52:45 UTC
Permalink
Hi Steven,
Post by Steven Rostedt
This version has been cleaned up a bit. I've been running it as
a back end to ftrace, and it has been handling pretty well.
Thank you for your great work.
It seems good to me(especially, encapsulating events :)).

I have one request of enhancement.
Post by Steven Rostedt
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
[...]
Post by Steven Rostedt
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
Here, you are using a slab object for page managing array,
the largest object size is 128KB(x86-64), so it can contain
16K pages = 64MB.

As I had improved relayfs, in some rare case(on 64bit arch),
we'd like to use larger buffer than 64MB.

http://sourceware.org/ml/systemtap/2008-q2/msg00103.html

So, I think similar hack can be applicable.

Would it be acceptable for the next version?

Thank you,
--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America) Inc.
Software Solutions Division

e-mail: ***@redhat.com
Steven Rostedt
2008-09-26 02:11:39 UTC
Permalink
Post by Masami Hiramatsu
Hi Steven,
Post by Steven Rostedt
This version has been cleaned up a bit. I've been running it as
a back end to ftrace, and it has been handling pretty well.
Thank you for your great work.
It seems good to me(especially, encapsulating events :)).
Thanks!
Post by Masami Hiramatsu
I have one request of enhancement.
Post by Steven Rostedt
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
[...]
Post by Steven Rostedt
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
Here, you are using a slab object for page managing array,
the largest object size is 128KB(x86-64), so it can contain
16K pages = 64MB.
As I had improved relayfs, in some rare case(on 64bit arch),
we'd like to use larger buffer than 64MB.
http://sourceware.org/ml/systemtap/2008-q2/msg00103.html
So, I think similar hack can be applicable.
Would it be acceptable for the next version?
I would like to avoid using vmalloc as much as possible, but I do see the
limitation here. Here's my compromise.

Instead of using vmalloc if the page array is greater than one page,
how about using vmalloc if the page array is greater than
KMALLOC_MAX_SIZE?

This would let us keep the vmap area free unless we have no choice.

-- Steve
Masami Hiramatsu
2008-09-26 02:47:20 UTC
Permalink
Post by Steven Rostedt
Post by Masami Hiramatsu
Hi Steven,
Post by Steven Rostedt
This version has been cleaned up a bit. I've been running it as
a back end to ftrace, and it has been handling pretty well.
Thank you for your great work.
It seems good to me(especially, encapsulating events :)).
Thanks!
Post by Masami Hiramatsu
I have one request of enhancement.
Post by Steven Rostedt
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
[...]
Post by Steven Rostedt
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
Here, you are using a slab object for page managing array,
the largest object size is 128KB(x86-64), so it can contain
16K pages = 64MB.
As I had improved relayfs, in some rare case(on 64bit arch),
we'd like to use larger buffer than 64MB.
http://sourceware.org/ml/systemtap/2008-q2/msg00103.html
So, I think similar hack can be applicable.
Would it be acceptable for the next version?
I would like to avoid using vmalloc as much as possible, but I do see the
limitation here. Here's my compromise.
Instead of using vmalloc if the page array is greater than one page,
how about using vmalloc if the page array is greater than
KMALLOC_MAX_SIZE?
This would let us keep the vmap area free unless we have no choice.
Hmm, that's a good idea.
In most cases, per-cpu buffer may be less than 64MB,
so I think it is reasonable.

Thank you,
Post by Steven Rostedt
-- Steve
--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America) Inc.
Software Solutions Division

e-mail: ***@redhat.com
Mathieu Desnoyers
2008-09-26 03:20:45 UTC
Permalink
Post by Steven Rostedt
Post by Masami Hiramatsu
Hi Steven,
Post by Steven Rostedt
This version has been cleaned up a bit. I've been running it as
a back end to ftrace, and it has been handling pretty well.
Thank you for your great work.
It seems good to me(especially, encapsulating events :)).
Thanks!
Post by Masami Hiramatsu
I have one request of enhancement.
Post by Steven Rostedt
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
[...]
Post by Steven Rostedt
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
Here, you are using a slab object for page managing array,
the largest object size is 128KB(x86-64), so it can contain
16K pages = 64MB.
As I had improved relayfs, in some rare case(on 64bit arch),
we'd like to use larger buffer than 64MB.
http://sourceware.org/ml/systemtap/2008-q2/msg00103.html
So, I think similar hack can be applicable.
Would it be acceptable for the next version?
I would like to avoid using vmalloc as much as possible, but I do see the
limitation here. Here's my compromise.
Instead of using vmalloc if the page array is greater than one page,
how about using vmalloc if the page array is greater than
KMALLOC_MAX_SIZE?
This would let us keep the vmap area free unless we have no choice.
-- Steve
You could also fallback on a 2-level page array when buffer size is >
64MB. The cost is mainly a supplementary pointer dereference, but one
more should not make sure a big difference overall.

Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Peter Zijlstra
2008-09-26 07:18:27 UTC
Permalink
Post by Mathieu Desnoyers
Post by Steven Rostedt
Post by Masami Hiramatsu
Hi Steven,
Post by Steven Rostedt
This version has been cleaned up a bit. I've been running it as
a back end to ftrace, and it has been handling pretty well.
Thank you for your great work.
It seems good to me(especially, encapsulating events :)).
Thanks!
Post by Masami Hiramatsu
I have one request of enhancement.
Post by Steven Rostedt
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
[...]
Post by Steven Rostedt
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
Here, you are using a slab object for page managing array,
the largest object size is 128KB(x86-64), so it can contain
16K pages = 64MB.
As I had improved relayfs, in some rare case(on 64bit arch),
we'd like to use larger buffer than 64MB.
http://sourceware.org/ml/systemtap/2008-q2/msg00103.html
So, I think similar hack can be applicable.
Would it be acceptable for the next version?
I would like to avoid using vmalloc as much as possible, but I do see the
limitation here. Here's my compromise.
Instead of using vmalloc if the page array is greater than one page,
how about using vmalloc if the page array is greater than
KMALLOC_MAX_SIZE?
This would let us keep the vmap area free unless we have no choice.
-- Steve
You could also fallback on a 2-level page array when buffer size is >
64MB. The cost is mainly a supplementary pointer dereference, but one
more should not make sure a big difference overall.
I'm still not sure why we don't just link the pages using the page
frames, we don't need the random access, do we?
Steven Rostedt
2008-09-26 10:45:28 UTC
Permalink
Post by Peter Zijlstra
Post by Mathieu Desnoyers
You could also fallback on a 2-level page array when buffer size is >
64MB. The cost is mainly a supplementary pointer dereference, but one
more should not make sure a big difference overall.
I'm still not sure why we don't just link the pages using the page
frames, we don't need the random access, do we?
Yeah we can go back to that (as ftrace does).

1) It can be very error prone. I will need to encapsulate the logic more.

2) I'm still not sure if crash can handle it.


I was going to reply to Masami with this answer, but it makes things more
complex. For v1 (non RFC v1) I wanted to start simple. v2 can have this
enhancement.

-- Steve
Peter Zijlstra
2008-09-26 11:00:54 UTC
Permalink
Post by Steven Rostedt
Post by Peter Zijlstra
Post by Mathieu Desnoyers
You could also fallback on a 2-level page array when buffer size is >
64MB. The cost is mainly a supplementary pointer dereference, but one
more should not make sure a big difference overall.
I'm still not sure why we don't just link the pages using the page
frames, we don't need the random access, do we?
Yeah we can go back to that (as ftrace does).
1) It can be very error prone. I will need to encapsulate the logic more.
Sure.
Post by Steven Rostedt
2) I'm still not sure if crash can handle it.
It ought to, and if it can't it should be fixed. Having easy access to
the pageframes is vital to debugging VM issues. So I'd not bother about
this issue too much.
Post by Steven Rostedt
I was going to reply to Masami with this answer, but it makes things more
complex. For v1 (non RFC v1) I wanted to start simple. v2 can have this
enhancement.
Right - I just object to having anything vmalloc.
Masami Hiramatsu
2008-09-26 16:57:25 UTC
Permalink
Post by Peter Zijlstra
Post by Steven Rostedt
Post by Peter Zijlstra
Post by Mathieu Desnoyers
You could also fallback on a 2-level page array when buffer size is >
64MB. The cost is mainly a supplementary pointer dereference, but one
more should not make sure a big difference overall.
I'm still not sure why we don't just link the pages using the page
frames, we don't need the random access, do we?
Yeah we can go back to that (as ftrace does).
1) It can be very error prone. I will need to encapsulate the logic more.
Sure.
Post by Steven Rostedt
2) I'm still not sure if crash can handle it.
It ought to, and if it can't it should be fixed. Having easy access to
the pageframes is vital to debugging VM issues. So I'd not bother about
this issue too much.
Post by Steven Rostedt
I was going to reply to Masami with this answer, but it makes things more
complex. For v1 (non RFC v1) I wanted to start simple. v2 can have this
enhancement.
Right - I just object to having anything vmalloc.
I just requested that the expansion of buffer size limitation too. :)

I don't stick with vmalloc. If that (page frame chain?) can
achieve better performance, I agree that trace buffer uses it.

Thank you,
--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America) Inc.
Software Solutions Division

e-mail: ***@redhat.com
Steven Rostedt
2008-09-26 17:14:23 UTC
Permalink
Post by Masami Hiramatsu
Post by Peter Zijlstra
Post by Steven Rostedt
I was going to reply to Masami with this answer, but it makes things more
complex. For v1 (non RFC v1) I wanted to start simple. v2 can have this
enhancement.
Right - I just object to having anything vmalloc.
I just requested that the expansion of buffer size limitation too. :)
I don't stick with vmalloc. If that (page frame chain?) can
achieve better performance, I agree that trace buffer uses it.
v5 is out with this implementation. It may or may not be better
performance, but the difference is most likely negligible.

Anyway, I'm happing with this last release, and hopefully it can get into
2.6.28. This would mean I can start basing ftrace on top of it.

-- Steve
Steven Rostedt
2008-09-26 10:47:27 UTC
Permalink
Post by Peter Zijlstra
Post by Mathieu Desnoyers
You could also fallback on a 2-level page array when buffer size is >
64MB. The cost is mainly a supplementary pointer dereference, but one
more should not make sure a big difference overall.
I'm still not sure why we don't just link the pages using the page
frames, we don't need the random access, do we?
Hmm, but this does make changing the buffer size much easier. I'll think
about it and perhaps try it out. If I can tidy it up nicer than the
ftrace code, then I may include it for v1.

-- Steve
Mathieu Desnoyers
2008-09-26 16:04:56 UTC
Permalink
Post by Peter Zijlstra
Post by Mathieu Desnoyers
Post by Steven Rostedt
Post by Masami Hiramatsu
Hi Steven,
Post by Steven Rostedt
This version has been cleaned up a bit. I've been running it as
a back end to ftrace, and it has been handling pretty well.
Thank you for your great work.
It seems good to me(especially, encapsulating events :)).
Thanks!
Post by Masami Hiramatsu
I have one request of enhancement.
Post by Steven Rostedt
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
[...]
Post by Steven Rostedt
+ cpu_buffer->pages = kzalloc_node(ALIGN(sizeof(void *) * pages,
+ cache_line_size()), GFP_KERNEL,
+ cpu_to_node(cpu));
Here, you are using a slab object for page managing array,
the largest object size is 128KB(x86-64), so it can contain
16K pages = 64MB.
As I had improved relayfs, in some rare case(on 64bit arch),
we'd like to use larger buffer than 64MB.
http://sourceware.org/ml/systemtap/2008-q2/msg00103.html
So, I think similar hack can be applicable.
Would it be acceptable for the next version?
I would like to avoid using vmalloc as much as possible, but I do see the
limitation here. Here's my compromise.
Instead of using vmalloc if the page array is greater than one page,
how about using vmalloc if the page array is greater than
KMALLOC_MAX_SIZE?
This would let us keep the vmap area free unless we have no choice.
-- Steve
You could also fallback on a 2-level page array when buffer size is >
64MB. The cost is mainly a supplementary pointer dereference, but one
more should not make sure a big difference overall.
I'm still not sure why we don't just link the pages using the page
frames, we don't need the random access, do we?
Yes, that's a brilliant idea :)

Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-09-26 17:11:57 UTC
Permalink
[
Note the removal of the RFC in the subject.
I am happy with this version. It handles everything I need
for ftrace.

New since last version:

- Fixed timing bug. I did not add the deltas properly when
reading the buffer.

- Removed "-1" time stamp normalize test. This made the
clock go backwards!

- Removed page pointer array and replaced it with the ftrace
page struct link list trick. Since this is my second time
writing this code (first with ftrace), it is actually much
cleaner than the ftrace code.

- Implemented buffer resizing. By using the page link list trick,
this became much simpler.

Note, the GOTD part is still not implemented, but can be done
later without affecting this interface.

]

This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.

The events recorded into the buffer have the following structure:

struct ring_buffer_event {
u32 type:2, len:3, time_delta:27;
u32 array[];
};

The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.

There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).

RB_TYPE_PADDING: this type is used to note extra space at the end
of a buffer page.

RB_TYPE_TIME_EXTENT: This type is used when the time between events
is greater than the 27 bit delta can hold. We add another
32 bits, and record that in its own event (8 byte size).

RB_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
help keep the buffer timestamps in sync.

RB_TYPE_DATA: The event actually holds user data.

The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.

Example, data size of 7 bytes:

type = RB_TYPE_DATA
len = 2
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0..1]: <7 bytes of data> <1 byte empty>

This event is saved in 12 bytes of the buffer.

An event with 82 bytes of data:

type = RB_TYPE_DATA
len = 0
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0]: 84 (Note the alignment)
array[1..14]: <82 bytes of data> <2 bytes empty>

The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.

Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.

ring_buffer_event_length(event): get the length of the event.
This is the size of the memory used to record this
event, and not the size of the data pay load.

ring_buffer_time_delta(event): get the time delta of the event
This returns the delta time stamp since the last event.
Note: Even though this is in the header, there should
be no reason to access this directly, accept
for debugging.

ring_buffer_event_data(event): get the data from the event
This is the function to use to get the actual data
from the event. Note, it is only a pointer to the
data inside the buffer. This data must be copied to
another location otherwise you risk it being written
over in the buffer.

ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.

ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.

ring_buffer_free: free the ring buffer.

ring_buffer_resize: resize the buffer. Changes the size of each cpu
buffer. Note, it is up to the caller to provide that
the buffer is not being used while this is happening.
This requirement may go away but do not count on it.

ring_buffer_lock_reserve: locks the ring buffer and allocates an
entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
the buffer.

ring_buffer_write: writes some data into the ring buffer.

ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.

ring_buffer_read_start: Start an iterator of a cpu buffer.
For now, this disables the cpu buffer, until you issue
a finish. This is just because we do not want the iterator
to be overwritten. This restriction may change in the future.
But note, this is used for static reading of a buffer which
is usually done "after" a trace. Live readings would want
to use the ring_buffer_consume above, which will not
disable the ring buffer.

ring_buffer_read_finish: Finishes the read iterator and reenables
the ring buffer.

ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
of the cpu buffer.

ring_buffer_size: returns the size in bytes of each cpu buffer.
Note, the real size is this times the number of CPUs.

ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty

ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
cpu buffer of another buffer. This is handy when you
want to take a snap shot of a running trace on just one
cpu. Having a backup buffer, to swap with facilitates this.
Ftrace max latencies use this.

ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.

ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enabl_cpu: enable a single cpu buffer.

ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.

ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
into nanosecs.

I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure. But this can be done at a later
time without affecting the ring buffer interface.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 178 +++++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1491 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1674 insertions(+)

Index: linux-trace.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/include/linux/ring_buffer.h 2008-09-25 21:29:16.000000000 -0400
@@ -0,0 +1,178 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+} __attribute__((__packed__));
+
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
+ * array is ignored
+ * size is variable depending on
+ */
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
+ /* FIXME: RB_TYPE_TIME_STAMP not implemented */
+ RB_TYPE_TIME_STAMP, /* Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ */
+
+ RB_TYPE_DATA, /* Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4] holds data
+ */
+};
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
+
+enum {
+ RB_LEN_TIME_EXTENT = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RB_TYPE_TIME_EXTENT:
+ return RB_LEN_TIME_EXTENT;
+
+ case RB_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RB_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-trace.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/kernel/trace/ring_buffer.c 2008-09-26 12:13:02.000000000 -0400
@@ -0,0 +1,1491 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ return sched_clock();
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+}
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST ~TS_MASK
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int
+test_time_stamp(unsigned long long delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+struct buffer_page {
+ u64 time_stamp;
+ unsigned char body[];
+};
+
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(u64))
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ struct page *head_page;
+ struct page *tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ /* FIXME: this should be online CPUS */
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct page *head_page;
+ u64 read_stamp;
+};
+
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
+
+/**
+ * check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct page *page, *tmp;
+
+ CHECK_COND(cpu_buffer, head->next->prev != head);
+ CHECK_COND(cpu_buffer, head->prev->next != head);
+
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ CHECK_COND(cpu_buffer, page->lru.next->prev != &page->lru);
+ CHECK_COND(cpu_buffer, page->lru.prev->next != &page->lru);
+ }
+
+ return 0;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ LIST_HEAD(pages);
+ struct page *page, *tmp;
+ unsigned long addr;
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = virt_to_page(addr);
+ list_add(&page->lru, &pages);
+ }
+
+ list_splice(&pages, head);
+
+ check_pages(cpu_buffer);
+
+ return 0;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ return -ENOMEM;
+}
+
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_buffer;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct page, lru);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct page, lru);
+
+ return cpu_buffer;
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void
+ring_buffer_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct page *page, *tmp;
+
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ kfree(cpu_buffer);
+}
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int cpu;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = (size + (BUF_PAGE_SIZE - 1)) / BUF_PAGE_SIZE;
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ /* FIXME: do for only online CPUS */
+ buffer->cpus = num_possible_cpus();
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ buffer->buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ if (buffer->buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
+
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(pages));
+ p = pages->next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ list_add_tail(&page->lru, &cpu_buffer->pages);
+ }
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns -1 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long buffer_size;
+ LIST_HEAD(pages);
+ unsigned long addr;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct page *page, *tmp;
+ int i, cpu;
+
+ size = (size + (BUF_PAGE_SIZE-1)) / BUF_PAGE_SIZE;
+ size *= BUF_PAGE_SIZE;
+ buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+ /* we need a minimum of two pages */
+ if (size < BUF_PAGE_SIZE * 2)
+ size = BUF_PAGE_SIZE * 2;
+
+ if (size == buffer_size)
+ return size;
+
+ mutex_lock(&buffer->mutex);
+
+ nr_pages = (size + (BUF_PAGE_SIZE-1)) / BUF_PAGE_SIZE;
+
+ if (size < buffer_size) {
+
+ /* easy case, just free pages */
+ BUG_ON(nr_pages >= buffer->pages);
+
+ rm_pages = buffer->pages - nr_pages;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_remove_pages(cpu_buffer, rm_pages);
+ }
+ goto out;
+ }
+
+ /*
+ * This is a bit more difficult. We only want to add pages
+ * when we can allocate enough for all CPUs. We do this
+ * by allocating all the pages and storing them on a local
+ * link list. If we succeed in our allocation, then we
+ * add these pages to the cpu_buffers. Otherwise we just free
+ * them all and return -ENOMEM;
+ */
+ BUG_ON(nr_pages <= buffer->pages);
+ new_pages = nr_pages - buffer->pages;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ for (i = 0; i < new_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = virt_to_page(addr);
+ list_add(&page->lru, &pages);
+ }
+ }
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_insert_pages(cpu_buffer, &pages, new_pages);
+ }
+
+ BUG_ON(!list_empty(&pages));
+
+ out:
+ buffer->pages = nr_pages;
+ mutex_unlock(&buffer->mutex);
+
+ return size;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ return -ENOMEM;
+}
+
+static inline int
+ring_buffer_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int
+ring_buffer_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RB_TYPE_PADDING;
+}
+
+static inline void *
+rb_page_index(struct page *page, unsigned index)
+{
+ struct buffer_page *bpage;
+
+ bpage = page_address(page);
+ return bpage->body + index;
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head);
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return rb_page_index(iter->head_page,
+ iter->head);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ */
+static void
+ring_buffer_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < BUF_PAGE_SIZE;
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_index(cpu_buffer->head_page, head);
+ if (ring_buffer_null_event(event))
+ break;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void
+ring_buffer_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct page **page)
+{
+ struct list_head *p = (*page)->lru.next;
+
+ if (p == &cpu_buffer->pages)
+ p = p->next;
+
+ *page = list_entry(p, struct page, lru);
+}
+
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+ struct buffer_page *bpage;
+
+ bpage = page_address(cpu_buffer->tail_page);
+ bpage->time_stamp = *ts;
+}
+
+static void
+rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *bpage;
+
+ cpu_buffer->head = 0;
+ bpage = page_address(cpu_buffer->head_page);
+ cpu_buffer->read_stamp = bpage->time_stamp;
+}
+
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+ struct buffer_page *bpage;
+
+ iter->head = 0;
+ bpage = page_address(iter->head_page);
+ iter->read_stamp = bpage->time_stamp;
+}
+
+/**
+ * ring_buffer_update_event - update event type and data
+ * @event: the even to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+ring_buffer_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RB_TYPE_PADDING:
+ break;
+
+ case RB_TYPE_TIME_EXTENT:
+ event->len =
+ (RB_LEN_TIME_EXTENT + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
+
+static struct ring_buffer_event *
+__ring_buffer_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct page *head_page, *tail_page;
+ unsigned long tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ if (tail + length > BUF_PAGE_SIZE) {
+ struct page *next_page = tail_page;
+
+ ring_buffer_inc_page(cpu_buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ ring_buffer_update_overflow(cpu_buffer);
+
+ ring_buffer_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ rb_reset_read_page(cpu_buffer);
+ }
+
+ if (tail != BUF_PAGE_SIZE) {
+ event = rb_page_index(tail_page, tail);
+ /* page padding */
+ event->type = RB_TYPE_PADDING;
+ }
+
+ tail = 0;
+ tail_page = next_page;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ rb_add_stamp(cpu_buffer, ts);
+ }
+
+ BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+ event = rb_page_index(tail_page, tail);
+ ring_buffer_update_event(event, type, length);
+ cpu_buffer->entries++;
+
+ return event;
+}
+
+static struct ring_buffer_event *
+ring_buffer_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ unsigned long long ts, delta;
+ struct ring_buffer_event *event;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __ring_buffer_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (!cpu_buffer->tail) {
+ /*
+ * new page, dont commit this and add the
+ * time stamp to the page instead.
+ */
+ rb_add_stamp(cpu_buffer, &ts);
+ } else {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ }
+
+ cpu_buffer->write_stamp = ts;
+ delta = 0;
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __ring_buffer_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ event->time_delta = delta;
+ cpu_buffer->write_stamp = ts;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto no_record;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ return NULL;
+
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, length);
+ if (!event)
+ goto no_record;
+
+ return event;
+
+ no_record:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+ cpu_buffer->tail += ring_buffer_event_length(event);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *body;
+ int ret = 0;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ body = ring_buffer_event_data(event);
+
+ memcpy(body, data, length);
+ cpu_buffer->tail += event_length;
+
+ out:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ iter->head_page = cpu_buffer->head_page;
+ iter->head = cpu_buffer->head;
+ rb_reset_iter_read_page(iter);
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->tail_page &&
+ iter->head == cpu_buffer->tail;
+}
+
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ cpu_buffer->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ iter->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+ring_buffer_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ event = ring_buffer_head_event(cpu_buffer);
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + length > cpu_buffer->tail));
+
+ rb_update_read_stamp(cpu_buffer, event);
+
+ cpu_buffer->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_head_event(cpu_buffer);
+ if (ring_buffer_null_event(event) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_head(cpu_buffer);
+}
+
+static void
+ring_buffer_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(iter->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+ (iter->head + length > cpu_buffer->tail));
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_iter_head_event(iter);
+ if (ring_buffer_null_event(event) &&
+ (iter->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_head_event(cpu_buffer);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ ring_buffer_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ ring_buffer_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ ring_buffer_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ ring_buffer_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ring_buffer_advance_head(cpu_buffer);
+
+ return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ iter->head = cpu_buffer->head;
+ iter->head_page = cpu_buffer->head_page;
+ rb_reset_iter_read_page(iter);
+ __raw_spin_unlock(&cpu_buffer->lock);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ ring_buffer_advance_iter(iter);
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct page, lru);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct page, lru);
+
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ unsigned long flags;
+ int cpu;
+
+ ring_buffer_lock(buffer, &flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ __ring_buffer_reset_cpu(buffer->buffers[cpu]);
+
+ ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!ring_buffer_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return ring_buffer_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
Index: linux-trace.git/kernel/trace/Kconfig
===================================================================
--- linux-trace.git.orig/kernel/trace/Kconfig 2008-09-25 21:28:29.000000000 -0400
+++ linux-trace.git/kernel/trace/Kconfig 2008-09-25 21:29:16.000000000 -0400
@@ -10,10 +10,14 @@ config HAVE_DYNAMIC_FTRACE
config TRACER_MAX_TRACE
bool

+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
select STACKTRACE
+ select RING_BUFFER

config FTRACE
bool "Kernel Function Tracer"
Index: linux-trace.git/kernel/trace/Makefile
===================================================================
--- linux-trace.git.orig/kernel/trace/Makefile 2008-09-25 21:28:29.000000000 -0400
+++ linux-trace.git/kernel/trace/Makefile 2008-09-25 21:29:16.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
Linus Torvalds
2008-09-26 17:37:54 UTC
Permalink
Why do you need __packed__ here? With or without it the layout is the
Indeed. And on some architectures 'packed' will actually mean that the
compiler may think that it's unaligned, and then generate much worse code
to access the fields. So if you align things anyway (and you do), then
'packed' is the wrong thing to do.

Linus
Steven Rostedt
2008-09-26 17:46:45 UTC
Permalink
Why do you need __packed__ here? With or without it the layout is the
From just being paranoid.
Indeed. And on some architectures 'packed' will actually mean that the
compiler may think that it's unaligned, and then generate much worse code
to access the fields. So if you align things anyway (and you do), then
'packed' is the wrong thing to do.
OK, I'm making v6 now with various cleanups. I'll nuke it on that one.

-- Steve
Ingo Molnar
2008-09-27 17:02:25 UTC
Permalink
Post by Steven Rostedt
Post by Linus Torvalds
Indeed. And on some architectures 'packed' will actually mean that
the compiler may think that it's unaligned, and then generate much
worse code to access the fields. So if you align things anyway (and
you do), then 'packed' is the wrong thing to do.
OK, I'm making v6 now with various cleanups. I'll nuke it on that one.
btw., now that it's getting into shape, could you please fix the ftrace
Post by Steven Rostedt
Subject: [RFC PATCH 2/2 v3] ftrace: make work with new ring buffer
Note: This patch is a proof of concept, and breaks a lot of
functionality of ftrace.
This patch simply makes ftrace work with the developmental ring
buffer.
... to not have known bugs, so that we could try it in tip/ftrace and
make sure it works well in practice?

it's a ton of changes already, it would be nice to get to some stable
known-working state and do delta patches from that point on, and keep
its 'works well' quality.

Ingo
Steven Rostedt
2008-09-27 17:18:01 UTC
Permalink
Post by Ingo Molnar
Post by Steven Rostedt
Subject: [RFC PATCH 2/2 v3] ftrace: make work with new ring buffer
Note: This patch is a proof of concept, and breaks a lot of
functionality of ftrace.
This patch simply makes ftrace work with the developmental ring
buffer.
... to not have known bugs, so that we could try it in tip/ftrace and
make sure it works well in practice?
it's a ton of changes already, it would be nice to get to some stable
known-working state and do delta patches from that point on, and keep
its 'works well' quality.
OK, the patch that I was using was against Linus's tree. I'll port it over
to linux-tip on Monday and get it past the "proof of concept" stage.
Actually, the verison I have on my desk works pretty well. The main issues
to solve is that some other tracers and the self test stick their noses
into the buffering system, which would need to be fixed.

There's also some bugs in the status numbers printed in the latency_trace
header. But I have not hit any bugs with the buffering itself.

I'll clean all this up and send out a patch on Monday. My wife is
mandating that I do not do anymore work over the weekend ;-)

-- Steve
Arnaldo Carvalho de Melo
2008-09-26 17:31:30 UTC
Permalink
Post by Steven Rostedt
[
Note the removal of the RFC in the subject.
I am happy with this version. It handles everything I need
for ftrace.
- Fixed timing bug. I did not add the deltas properly when
reading the buffer.
- Removed "-1" time stamp normalize test. This made the
clock go backwards!
- Removed page pointer array and replaced it with the ftrace
page struct link list trick. Since this is my second time
writing this code (first with ftrace), it is actually much
cleaner than the ftrace code.
- Implemented buffer resizing. By using the page link list trick,
this became much simpler.
Note, the GOTD part is still not implemented, but can be done
later without affecting this interface.
]
This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.
struct ring_buffer_event {
u32 type:2, len:3, time_delta:27;
u32 array[];
};
The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.
There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).
RB_TYPE_PADDING: this type is used to note extra space at the end
of a buffer page.
RB_TYPE_TIME_EXTENT: This type is used when the time between events
is greater than the 27 bit delta can hold. We add another
32 bits, and record that in its own event (8 byte size).
RB_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
help keep the buffer timestamps in sync.
RB_TYPE_DATA: The event actually holds user data.
The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.
type = RB_TYPE_DATA
len = 2
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0..1]: <7 bytes of data> <1 byte empty>
This event is saved in 12 bytes of the buffer.
type = RB_TYPE_DATA
len = 0
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0]: 84 (Note the alignment)
array[1..14]: <82 bytes of data> <2 bytes empty>
The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.
Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.
ring_buffer_event_length(event): get the length of the event.
This is the size of the memory used to record this
event, and not the size of the data pay load.
ring_buffer_time_delta(event): get the time delta of the event
This returns the delta time stamp since the last event.
Note: Even though this is in the header, there should
be no reason to access this directly, accept
for debugging.
ring_buffer_event_data(event): get the data from the event
This is the function to use to get the actual data
from the event. Note, it is only a pointer to the
data inside the buffer. This data must be copied to
another location otherwise you risk it being written
over in the buffer.
ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.
ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.
ring_buffer_free: free the ring buffer.
ring_buffer_resize: resize the buffer. Changes the size of each cpu
buffer. Note, it is up to the caller to provide that
the buffer is not being used while this is happening.
This requirement may go away but do not count on it.
ring_buffer_lock_reserve: locks the ring buffer and allocates an
entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
the buffer.
ring_buffer_write: writes some data into the ring buffer.
ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.
ring_buffer_read_start: Start an iterator of a cpu buffer.
For now, this disables the cpu buffer, until you issue
a finish. This is just because we do not want the iterator
to be overwritten. This restriction may change in the future.
But note, this is used for static reading of a buffer which
is usually done "after" a trace. Live readings would want
to use the ring_buffer_consume above, which will not
disable the ring buffer.
ring_buffer_read_finish: Finishes the read iterator and reenables
the ring buffer.
ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
of the cpu buffer.
ring_buffer_size: returns the size in bytes of each cpu buffer.
Note, the real size is this times the number of CPUs.
ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty
ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
cpu buffer of another buffer. This is handy when you
want to take a snap shot of a running trace on just one
cpu. Having a backup buffer, to swap with facilitates this.
Ftrace max latencies use this.
ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.
ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enabl_cpu: enable a single cpu buffer.
ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.
ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
into nanosecs.
I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure. But this can be done at a later
time without affecting the ring buffer interface.
---
include/linux/ring_buffer.h | 178 +++++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1491 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1674 insertions(+)
Index: linux-trace.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/include/linux/ring_buffer.h 2008-09-25 21:29:16.000000000 -0400
@@ -0,0 +1,178 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+} __attribute__((__packed__));
Why do you need __packed__ here? With or without it the layout is the
same:

[***@doppio examples]$ pahole packed
struct ring_buffer_event {
u32 type:2; /* 0:30 4 */
u32 len:3; /* 0:27 4 */
u32 time_delta:27; /* 0: 0 4 */
u32 array[0]; /* 4 0 */

/* size: 4, cachelines: 1, members: 4 */
/* last cacheline: 4 bytes */
};

- Arnaldo
Steven Rostedt
2008-09-26 18:05:44 UTC
Permalink
[
Changes since v5:

- removed packed attribute from event structure.

- added parenthesis around ~TS_MASK

- fixed some comments in header

- fixed ret value on ring_buffer_write on errors.

- added check_pages when modifying the size of cpu buffers
]

This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.

The events recorded into the buffer have the following structure:

struct ring_buffer_event {
u32 type:2, len:3, time_delta:27;
u32 array[];
};

The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.

There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).

RB_TYPE_PADDING: this type is used to note extra space at the end
of a buffer page.

RB_TYPE_TIME_EXTENT: This type is used when the time between events
is greater than the 27 bit delta can hold. We add another
32 bits, and record that in its own event (8 byte size).

RB_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
help keep the buffer timestamps in sync.

RB_TYPE_DATA: The event actually holds user data.

The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.

Example, data size of 7 bytes:

type = RB_TYPE_DATA
len = 2
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0..1]: <7 bytes of data> <1 byte empty>

This event is saved in 12 bytes of the buffer.

An event with 82 bytes of data:

type = RB_TYPE_DATA
len = 0
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0]: 84 (Note the alignment)
array[1..14]: <82 bytes of data> <2 bytes empty>

The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.

Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.

ring_buffer_event_length(event): get the length of the event.
This is the size of the memory used to record this
event, and not the size of the data pay load.

ring_buffer_time_delta(event): get the time delta of the event
This returns the delta time stamp since the last event.
Note: Even though this is in the header, there should
be no reason to access this directly, accept
for debugging.

ring_buffer_event_data(event): get the data from the event
This is the function to use to get the actual data
from the event. Note, it is only a pointer to the
data inside the buffer. This data must be copied to
another location otherwise you risk it being written
over in the buffer.

ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.

ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.

ring_buffer_free: free the ring buffer.

ring_buffer_resize: resize the buffer. Changes the size of each cpu
buffer. Note, it is up to the caller to provide that
the buffer is not being used while this is happening.
This requirement may go away but do not count on it.

ring_buffer_lock_reserve: locks the ring buffer and allocates an
entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
the buffer.

ring_buffer_write: writes some data into the ring buffer.

ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.

ring_buffer_read_start: Start an iterator of a cpu buffer.
For now, this disables the cpu buffer, until you issue
a finish. This is just because we do not want the iterator
to be overwritten. This restriction may change in the future.
But note, this is used for static reading of a buffer which
is usually done "after" a trace. Live readings would want
to use the ring_buffer_consume above, which will not
disable the ring buffer.

ring_buffer_read_finish: Finishes the read iterator and reenables
the ring buffer.

ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
of the cpu buffer.

ring_buffer_size: returns the size in bytes of each cpu buffer.
Note, the real size is this times the number of CPUs.

ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty

ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
cpu buffer of another buffer. This is handy when you
want to take a snap shot of a running trace on just one
cpu. Having a backup buffer, to swap with facilitates this.
Ftrace max latencies use this.

ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.

ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enabl_cpu: enable a single cpu buffer.

ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.

ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
into nanosecs.

I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure. But this can be done at a later
time without affecting the ring buffer interface.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 179 +++++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1496 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1680 insertions(+)

Index: linux-trace.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/include/linux/ring_buffer.h 2008-09-26 13:44:33.000000000 -0400
@@ -0,0 +1,179 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+};
+
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
+ * array is ignored
+ * size is variable depending on
+ * how much padding is needed
+ */
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
+ /* FIXME: RB_TYPE_TIME_STAMP not implemented */
+ RB_TYPE_TIME_STAMP, /* Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ */
+
+ RB_TYPE_DATA, /* Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4-1] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4] holds data
+ */
+};
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
+
+enum {
+ RB_LEN_TIME_EXTENT = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RB_TYPE_TIME_EXTENT:
+ return RB_LEN_TIME_EXTENT;
+
+ case RB_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RB_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-trace.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/kernel/trace/ring_buffer.c 2008-09-26 13:53:52.000000000 -0400
@@ -0,0 +1,1496 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ return sched_clock();
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+}
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int
+test_time_stamp(unsigned long long delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+struct buffer_page {
+ u64 time_stamp;
+ unsigned char body[];
+};
+
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(u64))
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ struct page *head_page;
+ struct page *tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ /* FIXME: this should be online CPUS */
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct page *head_page;
+ u64 read_stamp;
+};
+
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
+
+/**
+ * check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct page *page, *tmp;
+
+ CHECK_COND(cpu_buffer, head->next->prev != head);
+ CHECK_COND(cpu_buffer, head->prev->next != head);
+
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ CHECK_COND(cpu_buffer, page->lru.next->prev != &page->lru);
+ CHECK_COND(cpu_buffer, page->lru.prev->next != &page->lru);
+ }
+
+ return 0;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ LIST_HEAD(pages);
+ struct page *page, *tmp;
+ unsigned long addr;
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = virt_to_page(addr);
+ list_add(&page->lru, &pages);
+ }
+
+ list_splice(&pages, head);
+
+ check_pages(cpu_buffer);
+
+ return 0;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ return -ENOMEM;
+}
+
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_buffer;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct page, lru);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct page, lru);
+
+ return cpu_buffer;
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void
+ring_buffer_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct page *page, *tmp;
+
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ kfree(cpu_buffer);
+}
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int cpu;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = (size + (BUF_PAGE_SIZE - 1)) / BUF_PAGE_SIZE;
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ /* FIXME: do for only online CPUS */
+ buffer->cpus = num_possible_cpus();
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ buffer->buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_possible_cpu(cpu) {
+ if (cpu >= buffer->cpus)
+ continue;
+ if (buffer->buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
+
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(pages));
+ p = pages->next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ list_add_tail(&page->lru, &cpu_buffer->pages);
+ }
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns -1 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long buffer_size;
+ LIST_HEAD(pages);
+ unsigned long addr;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct page *page, *tmp;
+ int i, cpu;
+
+ size = (size + (BUF_PAGE_SIZE-1)) / BUF_PAGE_SIZE;
+ size *= BUF_PAGE_SIZE;
+ buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+ /* we need a minimum of two pages */
+ if (size < BUF_PAGE_SIZE * 2)
+ size = BUF_PAGE_SIZE * 2;
+
+ if (size == buffer_size)
+ return size;
+
+ mutex_lock(&buffer->mutex);
+
+ nr_pages = (size + (BUF_PAGE_SIZE-1)) / BUF_PAGE_SIZE;
+
+ if (size < buffer_size) {
+
+ /* easy case, just free pages */
+ BUG_ON(nr_pages >= buffer->pages);
+
+ rm_pages = buffer->pages - nr_pages;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_remove_pages(cpu_buffer, rm_pages);
+ }
+ goto out;
+ }
+
+ /*
+ * This is a bit more difficult. We only want to add pages
+ * when we can allocate enough for all CPUs. We do this
+ * by allocating all the pages and storing them on a local
+ * link list. If we succeed in our allocation, then we
+ * add these pages to the cpu_buffers. Otherwise we just free
+ * them all and return -ENOMEM;
+ */
+ BUG_ON(nr_pages <= buffer->pages);
+ new_pages = nr_pages - buffer->pages;
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ for (i = 0; i < new_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = virt_to_page(addr);
+ list_add(&page->lru, &pages);
+ }
+ }
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_insert_pages(cpu_buffer, &pages, new_pages);
+ }
+
+ BUG_ON(!list_empty(&pages));
+
+ out:
+ buffer->pages = nr_pages;
+ mutex_unlock(&buffer->mutex);
+
+ return size;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ return -ENOMEM;
+}
+
+static inline int
+ring_buffer_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int
+ring_buffer_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RB_TYPE_PADDING;
+}
+
+static inline void *
+rb_page_index(struct page *page, unsigned index)
+{
+ struct buffer_page *bpage;
+
+ bpage = page_address(page);
+ return bpage->body + index;
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head);
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return rb_page_index(iter->head_page,
+ iter->head);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ */
+static void
+ring_buffer_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < BUF_PAGE_SIZE;
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_index(cpu_buffer->head_page, head);
+ if (ring_buffer_null_event(event))
+ break;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void
+ring_buffer_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct page **page)
+{
+ struct list_head *p = (*page)->lru.next;
+
+ if (p == &cpu_buffer->pages)
+ p = p->next;
+
+ *page = list_entry(p, struct page, lru);
+}
+
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+ struct buffer_page *bpage;
+
+ bpage = page_address(cpu_buffer->tail_page);
+ bpage->time_stamp = *ts;
+}
+
+static void
+rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct buffer_page *bpage;
+
+ cpu_buffer->head = 0;
+ bpage = page_address(cpu_buffer->head_page);
+ cpu_buffer->read_stamp = bpage->time_stamp;
+}
+
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+ struct buffer_page *bpage;
+
+ iter->head = 0;
+ bpage = page_address(iter->head_page);
+ iter->read_stamp = bpage->time_stamp;
+}
+
+/**
+ * ring_buffer_update_event - update event type and data
+ * @event: the even to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+ring_buffer_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RB_TYPE_PADDING:
+ break;
+
+ case RB_TYPE_TIME_EXTENT:
+ event->len =
+ (RB_LEN_TIME_EXTENT + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
+
+static struct ring_buffer_event *
+__ring_buffer_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct page *head_page, *tail_page;
+ unsigned long tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ if (tail + length > BUF_PAGE_SIZE) {
+ struct page *next_page = tail_page;
+
+ ring_buffer_inc_page(cpu_buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ ring_buffer_update_overflow(cpu_buffer);
+
+ ring_buffer_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ rb_reset_read_page(cpu_buffer);
+ }
+
+ if (tail != BUF_PAGE_SIZE) {
+ event = rb_page_index(tail_page, tail);
+ /* page padding */
+ event->type = RB_TYPE_PADDING;
+ }
+
+ tail = 0;
+ tail_page = next_page;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ rb_add_stamp(cpu_buffer, ts);
+ }
+
+ BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+ event = rb_page_index(tail_page, tail);
+ ring_buffer_update_event(event, type, length);
+ cpu_buffer->entries++;
+
+ return event;
+}
+
+static struct ring_buffer_event *
+ring_buffer_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ unsigned long long ts, delta;
+ struct ring_buffer_event *event;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __ring_buffer_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (!cpu_buffer->tail) {
+ /*
+ * new page, dont commit this and add the
+ * time stamp to the page instead.
+ */
+ rb_add_stamp(cpu_buffer, &ts);
+ } else {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ }
+
+ cpu_buffer->write_stamp = ts;
+ delta = 0;
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __ring_buffer_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ event->time_delta = delta;
+ cpu_buffer->write_stamp = ts;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto no_record;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ return NULL;
+
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, length);
+ if (!event)
+ goto no_record;
+
+ return event;
+
+ no_record:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+ cpu_buffer->tail += ring_buffer_event_length(event);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *body;
+ int ret = -EBUSY;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ body = ring_buffer_event_data(event);
+
+ memcpy(body, data, length);
+ cpu_buffer->tail += event_length;
+
+ ret = 0;
+ out:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ iter->head_page = cpu_buffer->head_page;
+ iter->head = cpu_buffer->head;
+ rb_reset_iter_read_page(iter);
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->tail_page &&
+ iter->head == cpu_buffer->tail;
+}
+
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ cpu_buffer->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ iter->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+ring_buffer_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ event = ring_buffer_head_event(cpu_buffer);
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + length > cpu_buffer->tail));
+
+ rb_update_read_stamp(cpu_buffer, event);
+
+ cpu_buffer->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_head_event(cpu_buffer);
+ if (ring_buffer_null_event(event) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_head(cpu_buffer);
+}
+
+static void
+ring_buffer_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(iter->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+ (iter->head + length > cpu_buffer->tail));
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_iter_head_event(iter);
+ if (ring_buffer_null_event(event) &&
+ (iter->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_head_event(cpu_buffer);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ ring_buffer_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ ring_buffer_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ ring_buffer_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ ring_buffer_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ring_buffer_advance_head(cpu_buffer);
+
+ return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ iter->head = cpu_buffer->head;
+ iter->head_page = cpu_buffer->head_page;
+ rb_reset_iter_read_page(iter);
+ __raw_spin_unlock(&cpu_buffer->lock);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ ring_buffer_advance_iter(iter);
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct page, lru);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct page, lru);
+
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ unsigned long flags;
+ int cpu;
+
+ ring_buffer_lock(buffer, &flags);
+
+ for (cpu = 0; cpu < buffer->cpus; cpu++)
+ __ring_buffer_reset_cpu(buffer->buffers[cpu]);
+
+ ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for (cpu = 0; cpu < buffer->cpus; cpu++) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!ring_buffer_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return ring_buffer_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
Index: linux-trace.git/kernel/trace/Kconfig
===================================================================
--- linux-trace.git.orig/kernel/trace/Kconfig 2008-09-25 21:28:29.000000000 -0400
+++ linux-trace.git/kernel/trace/Kconfig 2008-09-25 21:29:16.000000000 -0400
@@ -10,10 +10,14 @@ config HAVE_DYNAMIC_FTRACE
config TRACER_MAX_TRACE
bool

+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
select STACKTRACE
+ select RING_BUFFER

config FTRACE
bool "Kernel Function Tracer"
Index: linux-trace.git/kernel/trace/Makefile
===================================================================
--- linux-trace.git.orig/kernel/trace/Makefile 2008-09-25 21:28:29.000000000 -0400
+++ linux-trace.git/kernel/trace/Makefile 2008-09-25 21:29:16.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
Richard Holden
2008-09-26 18:30:34 UTC
Permalink
Post by Steven Rostedt
ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.
Forgive me if I've gotten this wrong but the terminology seems backwards
Here, I would think we only throw away new data if the producer catches up
with the consumer, if the consumer catches up with the producer we're
reading data as fast as it's being written.
Post by Steven Rostedt
ring_buffer_write: writes some data into the ring buffer.
ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.
Here too, I would think that consuming data would modify the tail pointer.
Just trying to understand the terminology before I look at the code so I'm
sorry if I have just completely misunderstood.

-Richard Holden
Steven Rostedt
2008-09-26 18:39:55 UTC
Permalink
Post by Richard Holden
Post by Steven Rostedt
ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.
Forgive me if I've gotten this wrong but the terminology seems backwards
Here, I would think we only throw away new data if the producer catches up
with the consumer, if the consumer catches up with the producer we're
reading data as fast as it's being written.
Argh! Yes. I'm the one that is backwards ;-)

Yeah, that is what I meant. Don't you know? You are suppose to understand
what I mean, not what I say :)
Post by Richard Holden
Post by Steven Rostedt
ring_buffer_write: writes some data into the ring buffer.
ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.
Here too, I would think that consuming data would modify the tail pointer.
I always get confused with the translation of what the head/tail to
producer/consumer.

Here I have the producer adding to the tail, and the consumer reading from
the head. Perhaps this is backwards? I could change it.

s/head/foobar/g
s/tail/head/g
s/foobar/tail/g

That could do it.
Post by Richard Holden
Just trying to understand the terminology before I look at the code so I'm
sorry if I have just completely misunderstood.
Sure, thanks.

-- Steve
Peter Zijlstra
2008-09-26 18:59:17 UTC
Permalink
Post by Steven Rostedt
+struct buffer_page {
+ u64 time_stamp;
+ unsigned char body[];
+};
+
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(u64))
Since you're already using the page frame, you can stick this per page
timestamp in there as well, and get the full page for data.

You can either use a struct page overlay like slob does, or add a u64 in
the union that contains struct {private, mapping}.
Martin Bligh
2008-09-26 19:46:23 UTC
Permalink
Post by Peter Zijlstra
Post by Steven Rostedt
+struct buffer_page {
+ u64 time_stamp;
+ unsigned char body[];
+};
+
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(u64))
Since you're already using the page frame, you can stick this per page
timestamp in there as well, and get the full page for data.
You can either use a struct page overlay like slob does, or add a u64 in
the union that contains struct {private, mapping}.
What did you guys think of Mathieu's idea of sticking the buffer length
in the header here, rather than using padding events? Seemed cleaner
to me.
Steven Rostedt
2008-09-26 19:52:14 UTC
Permalink
Post by Martin Bligh
Post by Peter Zijlstra
Post by Steven Rostedt
+struct buffer_page {
+ u64 time_stamp;
+ unsigned char body[];
+};
+
+#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(u64))
Since you're already using the page frame, you can stick this per page
timestamp in there as well, and get the full page for data.
You can either use a struct page overlay like slob does, or add a u64 in
the union that contains struct {private, mapping}.
What did you guys think of Mathieu's idea of sticking the buffer length
in the header here, rather than using padding events? Seemed cleaner
to me.
Actually I like the padding. This way when I move the event pointer
forward, I only need to compare it to a constant (PAGE_SIZE), or test to
see if the event is padding. Placing this into the buffer page, I will
have to always compare it to that pointer.

But I guess I could change it to that if needed. That doesn't affect the
API, as it is only internal.

I'm almost done with v7, perhaps I might try that with v8 to see if I like
it better.

-- Steve
Steven Rostedt
2008-09-26 21:37:07 UTC
Permalink
Post by Martin Bligh
What did you guys think of Mathieu's idea of sticking the buffer length
in the header here, rather than using padding events? Seemed cleaner
to me.
OK, I just implemented the size field in the page struct. Seems to work
well. I'm still keeping the padded event, so in the future, if we ever
map these pages to userspace or files, these holes will have a type.

Will post later today, need to actually enter a real life for a bit.

-- Steve
Peter Zijlstra
2008-09-26 19:14:33 UTC
Permalink
Post by Steven Rostedt
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ /* FIXME: this should be online CPUS */
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
actually nr_possible makes sense, and you might consider always
allocating buffers (and keeping them for offlined cpus) to avoid massive
allocations/frees cpu-hotplug events.

Mike Travis has been going over the kernel removing constructs like
this, and replacing them with dynamically allocated arrays of
nr_possible.
Post by Steven Rostedt
+};
Mike Travis
2008-09-26 22:28:04 UTC
Permalink
Post by Peter Zijlstra
Post by Steven Rostedt
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ /* FIXME: this should be online CPUS */
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
actually nr_possible makes sense, and you might consider always
allocating buffers (and keeping them for offlined cpus) to avoid massive
allocations/frees cpu-hotplug events.
Mike Travis has been going over the kernel removing constructs like
this, and replacing them with dynamically allocated arrays of
nr_possible.
Post by Steven Rostedt
+};
The other thing to consider is using a percpu variable.

Cheers,
Mike
Steven Rostedt
2008-09-26 23:56:52 UTC
Permalink
Post by Mike Travis
Post by Peter Zijlstra
Post by Steven Rostedt
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ /* FIXME: this should be online CPUS */
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
actually nr_possible makes sense, and you might consider always
allocating buffers (and keeping them for offlined cpus) to avoid massive
allocations/frees cpu-hotplug events.
Mike Travis has been going over the kernel removing constructs like
this, and replacing them with dynamically allocated arrays of
nr_possible.
Post by Steven Rostedt
+};
The other thing to consider is using a percpu variable.
This structure is allocated on request.

-- Steve
Mike Travis
2008-09-27 00:05:26 UTC
Permalink
Post by Steven Rostedt
Post by Mike Travis
Post by Peter Zijlstra
Post by Steven Rostedt
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ /* FIXME: this should be online CPUS */
+ struct ring_buffer_per_cpu *buffers[NR_CPUS];
actually nr_possible makes sense, and you might consider always
allocating buffers (and keeping them for offlined cpus) to avoid massive
allocations/frees cpu-hotplug events.
Mike Travis has been going over the kernel removing constructs like
this, and replacing them with dynamically allocated arrays of
nr_possible.
Post by Steven Rostedt
+};
The other thing to consider is using a percpu variable.
This structure is allocated on request.
-- Steve
Ahh, then it would need the yet to be added cpu_alloc() from Christoph.

Your best bet then is to allocate based on nr_cpu_ids.

Cheers,
Mike
Steven Rostedt
2008-09-27 00:18:14 UTC
Permalink
Post by Mike Travis
Post by Steven Rostedt
Post by Mike Travis
The other thing to consider is using a percpu variable.
This structure is allocated on request.
-- Steve
Ahh, then it would need the yet to be added cpu_alloc() from Christoph.
We can always change this later.
Post by Mike Travis
Your best bet then is to allocate based on nr_cpu_ids.
Actually in this case I chose num_possible_cpus(). Reason being is that
later I may add an interface to allow the user to select which CPUs they
want to trace, and this will only allocate a subset of CPU buffers.
(not going to implement that in the first release).

But to lay the ground work, I set a buffers->cpumask to be that of all the
cpus with buffers allocated. For now that mask is set to cpu_possible_map.
Since num_possible_cpus() is defined as cpus_weight_nr(cpu_possible_map)
I figured that was the better choice.

-- Steve
Mike Travis
2008-09-27 00:46:27 UTC
Permalink
Post by Steven Rostedt
Post by Mike Travis
Post by Steven Rostedt
Post by Mike Travis
The other thing to consider is using a percpu variable.
This structure is allocated on request.
-- Steve
Ahh, then it would need the yet to be added cpu_alloc() from Christoph.
We can always change this later.
Post by Mike Travis
Your best bet then is to allocate based on nr_cpu_ids.
Actually in this case I chose num_possible_cpus(). Reason being is that
later I may add an interface to allow the user to select which CPUs they
want to trace, and this will only allocate a subset of CPU buffers.
(not going to implement that in the first release).
But to lay the ground work, I set a buffers->cpumask to be that of all the
cpus with buffers allocated. For now that mask is set to cpu_possible_map.
Since num_possible_cpus() is defined as cpus_weight_nr(cpu_possible_map)
I figured that was the better choice.
-- Steve
One problem though, it's *theoretically* possible for num_possible to be
less than nr_cpu_ids and a cpu index may extend past the end of your
allocated array. This would happen if the cpu indices are allocated
some other way than as each cpu is discovered. For example, a system
might want a group of cpus in one section (say by node, or socket) and
then a hole in the cpu_possible_map until the next group. nr_cpu_ids
is guaranteed to be the highest possible cpu + 1.

Cheers,
Mike
Steven Rostedt
2008-09-27 00:52:18 UTC
Permalink
Post by Mike Travis
Post by Steven Rostedt
But to lay the ground work, I set a buffers->cpumask to be that of all the
cpus with buffers allocated. For now that mask is set to cpu_possible_map.
Since num_possible_cpus() is defined as cpus_weight_nr(cpu_possible_map)
I figured that was the better choice.
-- Steve
One problem though, it's *theoretically* possible for num_possible to be
less than nr_cpu_ids and a cpu index may extend past the end of your
allocated array. This would happen if the cpu indices are allocated
some other way than as each cpu is discovered. For example, a system
might want a group of cpus in one section (say by node, or socket) and
then a hole in the cpu_possible_map until the next group. nr_cpu_ids
is guaranteed to be the highest possible cpu + 1.
Thanks for the explanation. I'll change buffer->cpus to be set to
nr_cpu_ids.

-- Steve
Peter Zijlstra
2008-09-26 19:17:13 UTC
Permalink
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
Arjan, any preferences wrt kerneloops.org?
Arjan van de Ven
2008-09-26 23:16:54 UTC
Permalink
On Fri, 26 Sep 2008 21:17:13 +0200
Post by Peter Zijlstra
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
Arjan, any preferences wrt kerneloops.org?
this works; if you also want to print something use WARN() instead
--
Arjan van de Ven Intel Open Source Technology Centre
For development, discussion and tips for power savings,
visit http://www.lesswatts.org
Peter Zijlstra
2008-09-26 20:08:50 UTC
Permalink
Post by Steven Rostedt
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
You probably want synchronize_sched() here (and similar other places) to
ensure any active writer on the corresponding cpu is actually stopped.

Which suggests you want to use something like ring_buffer_lock_cpu() and
implement that as above.
Post by Steven Rostedt
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
Masami Hiramatsu
2008-09-26 21:14:19 UTC
Permalink
Post by Peter Zijlstra
Post by Steven Rostedt
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
You probably want synchronize_sched() here (and similar other places) to
ensure any active writer on the corresponding cpu is actually stopped.
Would it really be done in the buffer layer?
I think it should be done by each tracer, because buffer layer
can't ensure truly active writers have stopped.

Thank you,
Post by Peter Zijlstra
Which suggests you want to use something like ring_buffer_lock_cpu() and
implement that as above.
Post by Steven Rostedt
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
--
Masami Hiramatsu

Software Engineer
Hitachi Computer Products (America) Inc.
Software Solutions Division

e-mail: ***@redhat.com
Steven Rostedt
2008-09-26 21:26:15 UTC
Permalink
Post by Masami Hiramatsu
Post by Peter Zijlstra
Post by Steven Rostedt
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
You probably want synchronize_sched() here (and similar other places) to
ensure any active writer on the corresponding cpu is actually stopped.
Would it really be done in the buffer layer?
I think it should be done by each tracer, because buffer layer
can't ensure truly active writers have stopped.
Actually it can ;-)

Since all writes to the buffer at least disable preemption, by issuing a
synchronize_sched, we can guarantee that after disabling the record, all
activity will be done.

-- Steve
Steven Rostedt
2008-09-26 21:13:14 UTC
Permalink
[
Changes since v6:

- Added shift debug test to test both normalization of
timestamp, but also the large time deltas. ftrace records too quickly
to get large deltas :-/

- Fixed some minor issues with keeping track of time.

- used slob hack to put more information in the page struct and now
have the full buffer page free for data. Thanks to Peter Zijlstra
for suggesting the idea.

- have the buffer use a cpu mask (initialized to cpu_possible_map)
to allocate for cpu usage.

- fixed entries counting.

- use DIV_ROUND_UP macro (also suggested by Peter)
]

This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.

The events recorded into the buffer have the following structure:

struct ring_buffer_event {
u32 type:2, len:3, time_delta:27;
u32 array[];
};

The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.

There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).

RB_TYPE_PADDING: this type is used to note extra space at the end
of a buffer page.

RB_TYPE_TIME_EXTENT: This type is used when the time between events
is greater than the 27 bit delta can hold. We add another
32 bits, and record that in its own event (8 byte size).

RB_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
help keep the buffer timestamps in sync.

RB_TYPE_DATA: The event actually holds user data.

The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.

Example, data size of 7 bytes:

type = RB_TYPE_DATA
len = 2
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0..1]: <7 bytes of data> <1 byte empty>

This event is saved in 12 bytes of the buffer.

An event with 82 bytes of data:

type = RB_TYPE_DATA
len = 0
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0]: 84 (Note the alignment)
array[1..14]: <82 bytes of data> <2 bytes empty>

The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.

Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.

ring_buffer_event_length(event): get the length of the event.
This is the size of the memory used to record this
event, and not the size of the data pay load.

ring_buffer_time_delta(event): get the time delta of the event
This returns the delta time stamp since the last event.
Note: Even though this is in the header, there should
be no reason to access this directly, accept
for debugging.

ring_buffer_event_data(event): get the data from the event
This is the function to use to get the actual data
from the event. Note, it is only a pointer to the
data inside the buffer. This data must be copied to
another location otherwise you risk it being written
over in the buffer.

ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.

ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.

ring_buffer_free: free the ring buffer.

ring_buffer_resize: resize the buffer. Changes the size of each cpu
buffer. Note, it is up to the caller to provide that
the buffer is not being used while this is happening.
This requirement may go away but do not count on it.

ring_buffer_lock_reserve: locks the ring buffer and allocates an
entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
the buffer.

ring_buffer_write: writes some data into the ring buffer.

ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.

ring_buffer_read_start: Start an iterator of a cpu buffer.
For now, this disables the cpu buffer, until you issue
a finish. This is just because we do not want the iterator
to be overwritten. This restriction may change in the future.
But note, this is used for static reading of a buffer which
is usually done "after" a trace. Live readings would want
to use the ring_buffer_consume above, which will not
disable the ring buffer.

ring_buffer_read_finish: Finishes the read iterator and reenables
the ring buffer.

ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
of the cpu buffer.

ring_buffer_size: returns the size in bytes of each cpu buffer.
Note, the real size is this times the number of CPUs.

ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty

ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
cpu buffer of another buffer. This is handy when you
want to take a snap shot of a running trace on just one
cpu. Having a backup buffer, to swap with facilitates this.
Ftrace max latencies use this.

ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.

ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enabl_cpu: enable a single cpu buffer.

ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.

ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
into nanosecs.

I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure. But this can be done at a later
time without affecting the ring buffer interface.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 179 +++++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1525 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1709 insertions(+)

Index: linux-trace.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/include/linux/ring_buffer.h 2008-09-26 14:16:54.000000000 -0400
@@ -0,0 +1,179 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+};
+
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
+ * array is ignored
+ * size is variable depending on
+ * how much padding is needed
+ */
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
+ /* FIXME: RB_TYPE_TIME_STAMP not implemented */
+ RB_TYPE_TIME_STAMP, /* Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ */
+
+ RB_TYPE_DATA, /* Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4-1] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4] holds data
+ */
+};
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
+
+enum {
+ RB_LEN_TIME_EXTENT = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RB_TYPE_TIME_EXTENT:
+ return RB_LEN_TIME_EXTENT;
+
+ case RB_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RB_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-trace.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/kernel/trace/ring_buffer.c 2008-09-26 17:01:53.000000000 -0400
@@ -0,0 +1,1525 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+#include "trace.h"
+
+#define DEBUG_SHIFT 15
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return sched_clock() << DEBUG_SHIFT;
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+ /* Just stupid testing the normalize function and deltas */
+ *ts >>= DEBUG_SHIFT;
+}
+
+#define for_each_buffer_cpu(buffer, cpu) \
+ for_each_cpu_mask(cpu, buffer->cpumask)
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * This hack stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ */
+struct buffer_page {
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ u64 time_stamp; /* page time stamp */
+ struct list_head list; /* linked list of free pages */
+ };
+ struct page page;
+ };
+};
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int
+test_time_stamp(unsigned long long delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+#define BUF_PAGE_SIZE PAGE_SIZE
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ struct buffer_page *head_page;
+ struct buffer_page *tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
+};
+
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
+
+/**
+ * check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct page *page, *tmp;
+
+ CHECK_COND(cpu_buffer, head->next->prev != head);
+ CHECK_COND(cpu_buffer, head->prev->next != head);
+
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ CHECK_COND(cpu_buffer, page->lru.next->prev != &page->lru);
+ CHECK_COND(cpu_buffer, page->lru.prev->next != &page->lru);
+ }
+
+ return 0;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ LIST_HEAD(pages);
+ struct page *page, *tmp;
+ unsigned long addr;
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = virt_to_page(addr);
+ list_add(&page->lru, &pages);
+ }
+
+ list_splice(&pages, head);
+
+ check_pages(cpu_buffer);
+
+ return 0;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ return -ENOMEM;
+}
+
+static struct ring_buffer_per_cpu *
+ring_buffer_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_buffer;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+ return cpu_buffer;
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void
+ring_buffer_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct page *page, *tmp;
+
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ kfree(cpu_buffer);
+}
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int bsize;
+ int cpu;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ buffer->cpumask = cpu_possible_map;
+ buffer->cpus = num_possible_cpus();
+
+ bsize = sizeof(void*) * nr_cpu_ids;
+ buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer->buffers)
+ goto fail_free_buffer;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ buffer->buffers[cpu] =
+ ring_buffer_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_buffer_cpu(buffer, cpu) {
+ if (buffer->buffers[cpu])
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+ kfree(buffer->buffers);
+
+ fail_free_buffer:
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for_each_buffer_cpu(buffer, cpu)
+ ring_buffer_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
+
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(pages));
+ p = pages->next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ list_add_tail(&page->lru, &cpu_buffer->pages);
+ }
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns -1 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long buffer_size;
+ LIST_HEAD(pages);
+ unsigned long addr;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct page *page, *tmp;
+ int i, cpu;
+
+ size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size *= BUF_PAGE_SIZE;
+ buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+ /* we need a minimum of two pages */
+ if (size < BUF_PAGE_SIZE * 2)
+ size = BUF_PAGE_SIZE * 2;
+
+ if (size == buffer_size)
+ return size;
+
+ mutex_lock(&buffer->mutex);
+
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+
+ if (size < buffer_size) {
+
+ /* easy case, just free pages */
+ BUG_ON(nr_pages >= buffer->pages);
+
+ rm_pages = buffer->pages - nr_pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_remove_pages(cpu_buffer, rm_pages);
+ }
+ goto out;
+ }
+
+ /*
+ * This is a bit more difficult. We only want to add pages
+ * when we can allocate enough for all CPUs. We do this
+ * by allocating all the pages and storing them on a local
+ * link list. If we succeed in our allocation, then we
+ * add these pages to the cpu_buffers. Otherwise we just free
+ * them all and return -ENOMEM;
+ */
+ BUG_ON(nr_pages <= buffer->pages);
+ new_pages = nr_pages - buffer->pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ for (i = 0; i < new_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = virt_to_page(addr);
+ list_add(&page->lru, &pages);
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_insert_pages(cpu_buffer, &pages, new_pages);
+ }
+
+ BUG_ON(!list_empty(&pages));
+
+ out:
+ buffer->pages = nr_pages;
+ mutex_unlock(&buffer->mutex);
+
+ return size;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ return -ENOMEM;
+}
+
+static inline int
+ring_buffer_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int
+ring_buffer_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RB_TYPE_PADDING;
+}
+
+static inline void *
+rb_page_index(struct buffer_page *page, unsigned index)
+{
+ void *addr;
+
+ addr = page_address(&page->page);
+ return addr + index;
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head);
+}
+
+static inline struct ring_buffer_event *
+ring_buffer_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return rb_page_index(iter->head_page,
+ iter->head);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ */
+static void
+ring_buffer_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < BUF_PAGE_SIZE;
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_index(cpu_buffer->head_page, head);
+ if (ring_buffer_null_event(event))
+ break;
+ /* Only count data entries */
+ if (event->type != RB_TYPE_DATA)
+ continue;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void
+ring_buffer_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **page)
+{
+ struct list_head *p = (*page)->list.next;
+
+ if (p == &cpu_buffer->pages)
+ p = p->next;
+
+ *page = list_entry(p, struct buffer_page, list);
+}
+
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+ cpu_buffer->tail_page->time_stamp = *ts;
+ cpu_buffer->write_stamp = *ts;
+}
+
+static void
+rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->read_stamp = cpu_buffer->head_page->time_stamp;
+ cpu_buffer->head = 0;
+}
+
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+ iter->read_stamp = iter->head_page->time_stamp;
+ iter->head = 0;
+}
+
+/**
+ * ring_buffer_update_event - update event type and data
+ * @event: the even to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+ring_buffer_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RB_TYPE_PADDING:
+ break;
+
+ case RB_TYPE_TIME_EXTENT:
+ event->len =
+ (RB_LEN_TIME_EXTENT + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
+
+static struct ring_buffer_event *
+__ring_buffer_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct buffer_page *head_page, *tail_page;
+ unsigned long tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ if (tail + length > BUF_PAGE_SIZE) {
+ struct buffer_page *next_page = tail_page;
+
+ ring_buffer_inc_page(cpu_buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ ring_buffer_update_overflow(cpu_buffer);
+
+ ring_buffer_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ rb_reset_read_page(cpu_buffer);
+ }
+
+ if (tail != BUF_PAGE_SIZE) {
+ event = rb_page_index(tail_page, tail);
+ /* page padding */
+ event->type = RB_TYPE_PADDING;
+ }
+
+ tail = 0;
+ tail_page = next_page;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ rb_add_stamp(cpu_buffer, ts);
+ }
+
+ BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+ event = rb_page_index(tail_page, tail);
+ ring_buffer_update_event(event, type, length);
+
+ return event;
+}
+
+static struct ring_buffer_event *
+ring_buffer_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ unsigned long long ts, delta;
+ struct ring_buffer_event *event;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __ring_buffer_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (cpu_buffer->tail) {
+ /* Still on same page, update timestamp */
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ /* commit the time event */
+ cpu_buffer->tail +=
+ ring_buffer_event_length(event);
+ cpu_buffer->write_stamp = ts;
+ }
+ delta = 0;
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __ring_buffer_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ event->time_delta = delta;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto no_record;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ return NULL;
+
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, length);
+ if (!event)
+ goto no_record;
+
+ return event;
+
+ no_record:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+static void
+__ring_buffer_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ cpu_buffer->tail += ring_buffer_event_length(event);
+ cpu_buffer->write_stamp += event->time_delta;
+ cpu_buffer->entries++;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+ __ring_buffer_commit(cpu_buffer, event);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *body;
+ int ret = -EBUSY;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = ring_buffer_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ body = ring_buffer_event_data(event);
+
+ memcpy(body, data, length);
+
+ __ring_buffer_commit(cpu_buffer, event);
+
+ ret = 0;
+ out:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+ if (!cpu_isset(cpu, buffer->cpumask))
+ continue;
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ iter->head_page = cpu_buffer->head_page;
+ iter->head = cpu_buffer->head;
+ rb_reset_iter_read_page(iter);
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->tail_page &&
+ iter->head == cpu_buffer->tail;
+}
+
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ cpu_buffer->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ iter->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+ring_buffer_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ event = ring_buffer_head_event(cpu_buffer);
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ return;
+ }
+
+ if (event->type == RB_TYPE_DATA)
+ cpu_buffer->entries--;
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + length > cpu_buffer->tail));
+
+ rb_update_read_stamp(cpu_buffer, event);
+
+ cpu_buffer->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_head_event(cpu_buffer);
+ if (ring_buffer_null_event(event) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_head(cpu_buffer);
+}
+
+static void
+ring_buffer_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (ring_buffer_null_event(event)) {
+ BUG_ON(iter->head_page == cpu_buffer->tail_page);
+ ring_buffer_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ return;
+ }
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+ (iter->head + length > cpu_buffer->tail));
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ event = ring_buffer_iter_head_event(iter);
+ if (ring_buffer_null_event(event) &&
+ (iter->head_page != cpu_buffer->tail_page))
+ ring_buffer_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_head_event(cpu_buffer);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ ring_buffer_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ ring_buffer_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (ring_buffer_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = ring_buffer_iter_head_event(iter);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ ring_buffer_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ ring_buffer_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ ring_buffer_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ ring_buffer_advance_head(cpu_buffer);
+
+ return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ iter->head = cpu_buffer->head;
+ iter->head_page = cpu_buffer->head_page;
+ rb_reset_iter_read_page(iter);
+ __raw_spin_unlock(&cpu_buffer->lock);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ ring_buffer_advance_iter(iter);
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+__ring_buffer_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ __ring_buffer_reset_cpu(cpu_buffer);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ unsigned long flags;
+ int cpu;
+
+ ring_buffer_lock(buffer, &flags);
+
+ for_each_buffer_cpu(buffer, cpu)
+ __ring_buffer_reset_cpu(buffer->buffers[cpu]);
+
+ ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!ring_buffer_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return ring_buffer_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
Index: linux-trace.git/kernel/trace/Kconfig
===================================================================
--- linux-trace.git.orig/kernel/trace/Kconfig 2008-09-26 14:16:45.000000000 -0400
+++ linux-trace.git/kernel/trace/Kconfig 2008-09-26 14:16:54.000000000 -0400
@@ -10,10 +10,14 @@ config HAVE_DYNAMIC_FTRACE
config TRACER_MAX_TRACE
bool

+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
select STACKTRACE
+ select RING_BUFFER

config FTRACE
bool "Kernel Function Tracer"
Index: linux-trace.git/kernel/trace/Makefile
===================================================================
--- linux-trace.git.orig/kernel/trace/Makefile 2008-09-26 14:16:45.000000000 -0400
+++ linux-trace.git/kernel/trace/Makefile 2008-09-26 14:16:54.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
Steven Rostedt
2008-09-27 02:02:29 UTC
Permalink
[
Changes since v7:

- added the size of data in the page into the page frame.
Suggested by Martin Bligh and Mathieu Desnoyers

- Converted all static functions to be named with a rb_ prefix.
This may conflict with rbtree functions in the future, but if
this does happen, we will need to rename the functions in this
file. The rb_ prefixed functions here are all static, so it only
affects this code. Thanks to Arnaldo Carvalho de Melo.

- Added some synchronized_sched() where record_disabled is
incremented. There are other places that expect the caller
to handle it. Suggested by Peter Zijlstra.

- Use nr_cpu_ids for max cpu. Thanks to Mike Travis.
]

This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.

The events recorded into the buffer have the following structure:

struct ring_buffer_event {
u32 type:2, len:3, time_delta:27;
u32 array[];
};

The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.

There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).

RB_TYPE_PADDING: this type is used to note extra space at the end
of a buffer page.

RB_TYPE_TIME_EXTENT: This type is used when the time between events
is greater than the 27 bit delta can hold. We add another
32 bits, and record that in its own event (8 byte size).

RB_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
help keep the buffer timestamps in sync.

RB_TYPE_DATA: The event actually holds user data.

The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.

Example, data size of 7 bytes:

type = RB_TYPE_DATA
len = 2
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0..1]: <7 bytes of data> <1 byte empty>

This event is saved in 12 bytes of the buffer.

An event with 82 bytes of data:

type = RB_TYPE_DATA
len = 0
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0]: 84 (Note the alignment)
array[1..14]: <82 bytes of data> <2 bytes empty>

The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.

Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.

ring_buffer_event_length(event): get the length of the event.
This is the size of the memory used to record this
event, and not the size of the data pay load.

ring_buffer_time_delta(event): get the time delta of the event
This returns the delta time stamp since the last event.
Note: Even though this is in the header, there should
be no reason to access this directly, accept
for debugging.

ring_buffer_event_data(event): get the data from the event
This is the function to use to get the actual data
from the event. Note, it is only a pointer to the
data inside the buffer. This data must be copied to
another location otherwise you risk it being written
over in the buffer.

ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.

ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.

ring_buffer_free: free the ring buffer.

ring_buffer_resize: resize the buffer. Changes the size of each cpu
buffer. Note, it is up to the caller to provide that
the buffer is not being used while this is happening.
This requirement may go away but do not count on it.

ring_buffer_lock_reserve: locks the ring buffer and allocates an
entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
the buffer.

ring_buffer_write: writes some data into the ring buffer.

ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.

ring_buffer_read_start: Start an iterator of a cpu buffer.
For now, this disables the cpu buffer, until you issue
a finish. This is just because we do not want the iterator
to be overwritten. This restriction may change in the future.
But note, this is used for static reading of a buffer which
is usually done "after" a trace. Live readings would want
to use the ring_buffer_consume above, which will not
disable the ring buffer.

ring_buffer_read_finish: Finishes the read iterator and reenables
the ring buffer.

ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
of the cpu buffer.

ring_buffer_size: returns the size in bytes of each cpu buffer.
Note, the real size is this times the number of CPUs.

ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty

ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
cpu buffer of another buffer. This is handy when you
want to take a snap shot of a running trace on just one
cpu. Having a backup buffer, to swap with facilitates this.
Ftrace max latencies use this.

ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.

ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enabl_cpu: enable a single cpu buffer.

ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.

ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
into nanosecs.

I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure. But this can be done at a later
time without affecting the ring buffer interface.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 179 ++++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1584 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1768 insertions(+)

Index: linux-trace.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/include/linux/ring_buffer.h 2008-09-26 14:16:54.000000000 -0400
@@ -0,0 +1,179 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+};
+
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
+ * array is ignored
+ * size is variable depending on
+ * how much padding is needed
+ */
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
+ /* FIXME: RB_TYPE_TIME_STAMP not implemented */
+ RB_TYPE_TIME_STAMP, /* Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ */
+
+ RB_TYPE_DATA, /* Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4-1] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4] holds data
+ */
+};
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
+
+enum {
+ RB_LEN_TIME_EXTENT = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RB_TYPE_TIME_EXTENT:
+ return RB_LEN_TIME_EXTENT;
+
+ case RB_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RB_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-trace.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/kernel/trace/ring_buffer.c 2008-09-26 21:55:29.000000000 -0400
@@ -0,0 +1,1584 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h> /* used for sched_clock() (for now) */
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+/* This needs to be somewhere else */
+#ifdef CONFIG_SMP
+# define __raw_assert_spin_is_locked(lock) \
+ BUG_ON(!__raw_spin_is_locked(lock))
+#else
+# define __raw_assert_spin_is_locked(lock) do { } while (0)
+#endif
+
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return sched_clock() << DEBUG_SHIFT;
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+ /* Just stupid testing the normalize function and deltas */
+ *ts >>= DEBUG_SHIFT;
+}
+
+#define for_each_buffer_cpu(buffer, cpu) \
+ for_each_cpu_mask(cpu, buffer->cpumask)
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * This hack stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ */
+struct buffer_page {
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ u64 time_stamp; /* page time stamp */
+ unsigned size; /* size of page data */
+ struct list_head list; /* linked list of free pages */
+ };
+ struct page page;
+ };
+};
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int
+test_time_stamp(unsigned long long delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+#define BUF_PAGE_SIZE PAGE_SIZE
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ struct buffer_page *head_page;
+ struct buffer_page *tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
+};
+
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
+
+/**
+ * check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct page *page, *tmp;
+
+ CHECK_COND(cpu_buffer, head->next->prev != head);
+ CHECK_COND(cpu_buffer, head->prev->next != head);
+
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ CHECK_COND(cpu_buffer, page->lru.next->prev != &page->lru);
+ CHECK_COND(cpu_buffer, page->lru.prev->next != &page->lru);
+ }
+
+ return 0;
+}
+
+static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page->size;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ LIST_HEAD(pages);
+ struct page *page, *tmp;
+ unsigned long addr;
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = virt_to_page(addr);
+ list_add(&page->lru, &pages);
+ }
+
+ list_splice(&pages, head);
+
+ rb_check_pages(cpu_buffer);
+
+ return 0;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ return -ENOMEM;
+}
+
+static struct ring_buffer_per_cpu *
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_buffer;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+ return cpu_buffer;
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct page *page, *tmp;
+
+ list_for_each_entry_safe(page, tmp, head, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ kfree(cpu_buffer);
+}
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int bsize;
+ int cpu;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ buffer->cpumask = cpu_possible_map;
+ buffer->cpus = nr_cpu_ids;
+
+ bsize = sizeof(void *) * nr_cpu_ids;
+ buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer->buffers)
+ goto fail_free_buffer;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_buffer_cpu(buffer, cpu) {
+ if (buffer->buffers[cpu])
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+ kfree(buffer->buffers);
+
+ fail_free_buffer:
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
+
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(pages));
+ p = pages->next;
+ page = list_entry(p, struct page, lru);
+ list_del_init(&page->lru);
+ list_add_tail(&page->lru, &cpu_buffer->pages);
+ }
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns -1 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long buffer_size;
+ LIST_HEAD(pages);
+ unsigned long addr;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct page *page, *tmp;
+ int i, cpu;
+
+ size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size *= BUF_PAGE_SIZE;
+ buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+ /* we need a minimum of two pages */
+ if (size < BUF_PAGE_SIZE * 2)
+ size = BUF_PAGE_SIZE * 2;
+
+ if (size == buffer_size)
+ return size;
+
+ mutex_lock(&buffer->mutex);
+
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+
+ if (size < buffer_size) {
+
+ /* easy case, just free pages */
+ BUG_ON(nr_pages >= buffer->pages);
+
+ rm_pages = buffer->pages - nr_pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_remove_pages(cpu_buffer, rm_pages);
+ }
+ goto out;
+ }
+
+ /*
+ * This is a bit more difficult. We only want to add pages
+ * when we can allocate enough for all CPUs. We do this
+ * by allocating all the pages and storing them on a local
+ * link list. If we succeed in our allocation, then we
+ * add these pages to the cpu_buffers. Otherwise we just free
+ * them all and return -ENOMEM;
+ */
+ BUG_ON(nr_pages <= buffer->pages);
+ new_pages = nr_pages - buffer->pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ for (i = 0; i < new_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = virt_to_page(addr);
+ list_add(&page->lru, &pages);
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_insert_pages(cpu_buffer, &pages, new_pages);
+ }
+
+ BUG_ON(!list_empty(&pages));
+
+ out:
+ buffer->pages = nr_pages;
+ mutex_unlock(&buffer->mutex);
+
+ return size;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, lru) {
+ list_del_init(&page->lru);
+ __free_page(page);
+ }
+ return -ENOMEM;
+}
+
+static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RB_TYPE_PADDING;
+}
+
+static inline void *rb_page_index(struct buffer_page *page, unsigned index)
+{
+ void *addr;
+
+ addr = page_address(&page->page);
+ return addr + index;
+}
+
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head);
+}
+
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return rb_page_index(iter->head_page,
+ iter->head);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ */
+static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < rb_head_size(cpu_buffer);
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_index(cpu_buffer->head_page, head);
+ BUG_ON(rb_null_event(event));
+ /* Only count data entries */
+ if (event->type != RB_TYPE_DATA)
+ continue;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **page)
+{
+ struct list_head *p = (*page)->list.next;
+
+ if (p == &cpu_buffer->pages)
+ p = p->next;
+
+ *page = list_entry(p, struct buffer_page, list);
+}
+
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+ cpu_buffer->tail_page->time_stamp = *ts;
+ cpu_buffer->write_stamp = *ts;
+}
+
+static void rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->read_stamp = cpu_buffer->head_page->time_stamp;
+ cpu_buffer->head = 0;
+}
+
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+ iter->read_stamp = iter->head_page->time_stamp;
+ iter->head = 0;
+}
+
+/**
+ * ring_buffer_update_event - update event type and data
+ * @event: the even to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+rb_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RB_TYPE_PADDING:
+ break;
+
+ case RB_TYPE_TIME_EXTENT:
+ event->len =
+ (RB_LEN_TIME_EXTENT + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
+
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct buffer_page *head_page, *tail_page;
+ unsigned long tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ if (tail + length > BUF_PAGE_SIZE) {
+ struct buffer_page *next_page = tail_page;
+
+ rb_inc_page(cpu_buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ rb_update_overflow(cpu_buffer);
+
+ rb_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ rb_reset_read_page(cpu_buffer);
+ }
+
+ if (tail != BUF_PAGE_SIZE) {
+ event = rb_page_index(tail_page, tail);
+ /* page padding */
+ event->type = RB_TYPE_PADDING;
+ }
+
+ tail_page->size = tail;
+ tail_page = next_page;
+ tail_page->size = 0;
+ tail = 0;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ rb_add_stamp(cpu_buffer, ts);
+ }
+
+ BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+ event = rb_page_index(tail_page, tail);
+ rb_update_event(event, type, length);
+
+ return event;
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ unsigned long long ts, delta;
+ struct ring_buffer_event *event;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __rb_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (cpu_buffer->tail) {
+ /* Still on same page, update timestamp */
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ /* commit the time event */
+ cpu_buffer->tail +=
+ ring_buffer_event_length(event);
+ cpu_buffer->write_stamp = ts;
+ }
+ delta = 0;
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ event->time_delta = delta;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ goto out_irq;
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto no_record;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ return NULL;
+
+ event = rb_reserve_next_event(cpu_buffer, RB_TYPE_DATA, length);
+ if (!event)
+ goto no_record;
+
+ return event;
+
+ no_record:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ out_irq:
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ cpu_buffer->tail += ring_buffer_event_length(event);
+ cpu_buffer->tail_page->size = cpu_buffer->tail;
+ cpu_buffer->write_stamp += event->time_delta;
+ cpu_buffer->entries++;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ __raw_assert_spin_is_locked(&cpu_buffer->lock);
+
+ rb_commit(cpu_buffer, event);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *body;
+ int ret = -EBUSY;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ goto out_irq;
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = rb_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ body = ring_buffer_event_data(event);
+
+ memcpy(body, data, length);
+
+ rb_commit(cpu_buffer, event);
+
+ ret = 0;
+ out:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ out_irq:
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+ if (!cpu_isset(cpu, buffer->cpumask))
+ continue;
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ iter->head_page = cpu_buffer->head_page;
+ iter->head = cpu_buffer->head;
+ rb_reset_iter_read_page(iter);
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->tail_page &&
+ iter->head == cpu_buffer->tail;
+}
+
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ cpu_buffer->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ iter->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void rb_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (cpu_buffer->head >= cpu_buffer->head_page->size) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ return;
+ }
+
+ event = rb_head_event(cpu_buffer);
+
+ if (event->type == RB_TYPE_DATA)
+ cpu_buffer->entries--;
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + length > cpu_buffer->tail));
+
+ rb_update_read_stamp(cpu_buffer, event);
+
+ cpu_buffer->head += length;
+
+ /* check for end of page */
+ if ((cpu_buffer->head >= cpu_buffer->head_page->size) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ rb_advance_head(cpu_buffer);
+}
+
+static void rb_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (iter->head >= iter->head_page->size) {
+ BUG_ON(iter->head_page == cpu_buffer->tail_page);
+ rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ return;
+ }
+
+ event = rb_iter_head_event(iter);
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+ (iter->head + length > cpu_buffer->tail));
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ if ((iter->head >= iter->head_page->size) &&
+ (iter->head_page != cpu_buffer->tail_page))
+ rb_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = rb_head_event(cpu_buffer);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ rb_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = rb_iter_head_event(iter);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ rb_advance_head(cpu_buffer);
+
+ return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ iter->head = cpu_buffer->head;
+ iter->head_page = cpu_buffer->head_page;
+ rb_reset_iter_read_page(iter);
+ __raw_spin_unlock(&cpu_buffer->lock);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ rb_advance_iter(iter);
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ rb_reset_cpu(cpu_buffer);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset - reset a ring buffer
+ * @buffer: The ring buffer to reset all cpu buffers
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ unsigned long flags;
+ int cpu;
+
+ ring_buffer_lock(buffer, &flags);
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_reset_cpu(buffer->buffers[cpu]);
+
+ ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!rb_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 1;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return rb_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ if (!cpu_isset(cpu, buffer_a->cpumask) ||
+ !cpu_isset(cpu, buffer_b->cpumask))
+ return -EINVAL;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ /*
+ * We can't do a synchronize_sched here because this
+ * function can be called in atomic context.
+ * Normally this will be called from the same CPU as cpu.
+ * If not it's up to the caller to protect this.
+ */
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
Index: linux-trace.git/kernel/trace/Kconfig
===================================================================
--- linux-trace.git.orig/kernel/trace/Kconfig 2008-09-26 14:16:45.000000000 -0400
+++ linux-trace.git/kernel/trace/Kconfig 2008-09-26 14:16:54.000000000 -0400
@@ -10,10 +10,14 @@ config HAVE_DYNAMIC_FTRACE
config TRACER_MAX_TRACE
bool

+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
select STACKTRACE
+ select RING_BUFFER

config FTRACE
bool "Kernel Function Tracer"
Index: linux-trace.git/kernel/trace/Makefile
===================================================================
--- linux-trace.git.orig/kernel/trace/Makefile 2008-09-26 14:16:45.000000000 -0400
+++ linux-trace.git/kernel/trace/Makefile 2008-09-26 14:16:54.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
Steven Rostedt
2008-09-27 06:06:10 UTC
Permalink
[
Changes since version 8:

Two major bug fixes!

- Had mix of referencing the pages link list with both
page->lru and buffer_page->list. Perhaps they luckily
were lined up. But I have no idea why this didn't totally
crash my box.

- Missed a write stamp update that would cause funny times
]

From: Steven Rostedt <***@redhat.com>
Subject: Unified trace buffer

This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.

The events recorded into the buffer have the following structure:

struct ring_buffer_event {
u32 type:2, len:3, time_delta:27;
u32 array[];
};

The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.

There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).

RB_TYPE_PADDING: this type is used to note extra space at the end
of a buffer page.

RB_TYPE_TIME_EXTENT: This type is used when the time between events
is greater than the 27 bit delta can hold. We add another
32 bits, and record that in its own event (8 byte size).

RB_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
help keep the buffer timestamps in sync.

RB_TYPE_DATA: The event actually holds user data.

The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.

Example, data size of 7 bytes:

type = RB_TYPE_DATA
len = 2
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0..1]: <7 bytes of data> <1 byte empty>

This event is saved in 12 bytes of the buffer.

An event with 82 bytes of data:

type = RB_TYPE_DATA
len = 0
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0]: 84 (Note the alignment)
array[1..14]: <82 bytes of data> <2 bytes empty>

The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.

Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.

ring_buffer_event_length(event): get the length of the event.
This is the size of the memory used to record this
event, and not the size of the data pay load.

ring_buffer_time_delta(event): get the time delta of the event
This returns the delta time stamp since the last event.
Note: Even though this is in the header, there should
be no reason to access this directly, accept
for debugging.

ring_buffer_event_data(event): get the data from the event
This is the function to use to get the actual data
from the event. Note, it is only a pointer to the
data inside the buffer. This data must be copied to
another location otherwise you risk it being written
over in the buffer.

ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.

ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.

ring_buffer_free: free the ring buffer.

ring_buffer_resize: resize the buffer. Changes the size of each cpu
buffer. Note, it is up to the caller to provide that
the buffer is not being used while this is happening.
This requirement may go away but do not count on it.

ring_buffer_lock_reserve: locks the ring buffer and allocates an
entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
the buffer.

ring_buffer_write: writes some data into the ring buffer.

ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.

ring_buffer_read_start: Start an iterator of a cpu buffer.
For now, this disables the cpu buffer, until you issue
a finish. This is just because we do not want the iterator
to be overwritten. This restriction may change in the future.
But note, this is used for static reading of a buffer which
is usually done "after" a trace. Live readings would want
to use the ring_buffer_consume above, which will not
disable the ring buffer.

ring_buffer_read_finish: Finishes the read iterator and reenables
the ring buffer.

ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
of the cpu buffer.

ring_buffer_size: returns the size in bytes of each cpu buffer.
Note, the real size is this times the number of CPUs.

ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty

ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
cpu buffer of another buffer. This is handy when you
want to take a snap shot of a running trace on just one
cpu. Having a backup buffer, to swap with facilitates this.
Ftrace max latencies use this.

ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.

ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enabl_cpu: enable a single cpu buffer.

ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.

ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
into nanosecs.

I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure. But this can be done at a later
time without affecting the ring buffer interface.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 179 ++++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1594 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1778 insertions(+)

Index: linux-trace.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/include/linux/ring_buffer.h 2008-09-27 01:59:06.000000000 -0400
@@ -0,0 +1,179 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use the inline items below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+};
+
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
+ * array is ignored
+ * size is variable depending on
+ * how much padding is needed
+ */
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
+ /* FIXME: RB_TYPE_TIME_STAMP not implemented */
+ RB_TYPE_TIME_STAMP, /* Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ */
+
+ RB_TYPE_DATA, /* Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4-1] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4] holds data
+ */
+};
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
+
+enum {
+ RB_LEN_TIME_EXTENT = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RB_TYPE_TIME_EXTENT:
+ return RB_LEN_TIME_EXTENT;
+
+ case RB_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RB_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-trace.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/kernel/trace/ring_buffer.c 2008-09-27 02:02:11.000000000 -0400
@@ -0,0 +1,1594 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h> /* used for sched_clock() (for now) */
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+/* This needs to be somewhere else */
+#ifdef CONFIG_SMP
+# define __raw_assert_spin_is_locked(lock) \
+ BUG_ON(!__raw_spin_is_locked(lock))
+#else
+# define __raw_assert_spin_is_locked(lock) do { } while (0)
+#endif
+
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return sched_clock() << DEBUG_SHIFT;
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+ /* Just stupid testing the normalize function and deltas */
+ *ts >>= DEBUG_SHIFT;
+}
+
+#define for_each_buffer_cpu(buffer, cpu) \
+ for_each_cpu_mask(cpu, buffer->cpumask)
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * This hack stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ */
+struct buffer_page {
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ u64 time_stamp; /* page time stamp */
+ unsigned size; /* size of page data */
+ struct list_head list; /* linked list of free pages */
+ };
+ struct page page;
+ };
+};
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int test_time_stamp(u64 delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+#define BUF_PAGE_SIZE PAGE_SIZE
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ struct buffer_page *head_page;
+ struct buffer_page *tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
+};
+
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
+
+/**
+ * check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+
+ CHECK_COND(cpu_buffer, head->next->prev != head);
+ CHECK_COND(cpu_buffer, head->prev->next != head);
+
+ list_for_each_entry_safe(page, tmp, head, list) {
+ CHECK_COND(cpu_buffer, page->list.next->prev != &page->list);
+ CHECK_COND(cpu_buffer, page->list.prev->next != &page->list);
+ }
+
+ return 0;
+}
+
+static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page->size;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ LIST_HEAD(pages);
+ struct buffer_page *page, *tmp;
+ unsigned long addr;
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = (struct buffer_page *)virt_to_page(addr);
+ list_add(&page->list, &pages);
+ }
+
+ list_splice(&pages, head);
+
+ rb_check_pages(cpu_buffer);
+
+ return 0;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, list) {
+ list_del_init(&page->list);
+ __free_page(&page->page);
+ }
+ return -ENOMEM;
+}
+
+static struct ring_buffer_per_cpu *
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_buffer;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+ return cpu_buffer;
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+
+ list_for_each_entry_safe(page, tmp, head, list) {
+ list_del_init(&page->list);
+ __free_page(&page->page);
+ }
+ kfree(cpu_buffer);
+}
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int bsize;
+ int cpu;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ buffer->cpumask = cpu_possible_map;
+ buffer->cpus = nr_cpu_ids;
+
+ bsize = sizeof(void *) * nr_cpu_ids;
+ buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer->buffers)
+ goto fail_free_buffer;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_buffer_cpu(buffer, cpu) {
+ if (buffer->buffers[cpu])
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+ kfree(buffer->buffers);
+
+ fail_free_buffer:
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct buffer_page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct buffer_page, list);
+ list_del_init(&page->list);
+ __free_page(&page->page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
+
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct buffer_page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(pages));
+ p = pages->next;
+ page = list_entry(p, struct buffer_page, list);
+ list_del_init(&page->list);
+ list_add_tail(&page->list, &cpu_buffer->pages);
+ }
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns -1 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long buffer_size;
+ LIST_HEAD(pages);
+ unsigned long addr;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct buffer_page *page, *tmp;
+ int i, cpu;
+
+ size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size *= BUF_PAGE_SIZE;
+ buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+ /* we need a minimum of two pages */
+ if (size < BUF_PAGE_SIZE * 2)
+ size = BUF_PAGE_SIZE * 2;
+
+ if (size == buffer_size)
+ return size;
+
+ mutex_lock(&buffer->mutex);
+
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+
+ if (size < buffer_size) {
+
+ /* easy case, just free pages */
+ BUG_ON(nr_pages >= buffer->pages);
+
+ rm_pages = buffer->pages - nr_pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_remove_pages(cpu_buffer, rm_pages);
+ }
+ goto out;
+ }
+
+ /*
+ * This is a bit more difficult. We only want to add pages
+ * when we can allocate enough for all CPUs. We do this
+ * by allocating all the pages and storing them on a local
+ * link list. If we succeed in our allocation, then we
+ * add these pages to the cpu_buffers. Otherwise we just free
+ * them all and return -ENOMEM;
+ */
+ BUG_ON(nr_pages <= buffer->pages);
+ new_pages = nr_pages - buffer->pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ for (i = 0; i < new_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = (struct buffer_page *)virt_to_page(addr);
+ list_add(&page->list, &pages);
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_insert_pages(cpu_buffer, &pages, new_pages);
+ }
+
+ BUG_ON(!list_empty(&pages));
+
+ out:
+ buffer->pages = nr_pages;
+ mutex_unlock(&buffer->mutex);
+
+ return size;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, list) {
+ list_del_init(&page->list);
+ __free_page(&page->page);
+ }
+ return -ENOMEM;
+}
+
+static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RB_TYPE_PADDING;
+}
+
+static inline void *rb_page_index(struct buffer_page *page, unsigned index)
+{
+ void *addr;
+
+ addr = page_address(&page->page);
+ return addr + index;
+}
+
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head);
+}
+
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return rb_page_index(iter->head_page,
+ iter->head);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ */
+static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < rb_head_size(cpu_buffer);
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_index(cpu_buffer->head_page, head);
+ BUG_ON(rb_null_event(event));
+ /* Only count data entries */
+ if (event->type != RB_TYPE_DATA)
+ continue;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **page)
+{
+ struct list_head *p = (*page)->list.next;
+
+ if (p == &cpu_buffer->pages)
+ p = p->next;
+
+ *page = list_entry(p, struct buffer_page, list);
+}
+
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+ cpu_buffer->tail_page->time_stamp = *ts;
+ cpu_buffer->write_stamp = *ts;
+}
+
+static void rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->read_stamp = cpu_buffer->head_page->time_stamp;
+ cpu_buffer->head = 0;
+}
+
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+ iter->read_stamp = iter->head_page->time_stamp;
+ iter->head = 0;
+}
+
+/**
+ * ring_buffer_update_event - update event type and data
+ * @event: the even to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+rb_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RB_TYPE_PADDING:
+ break;
+
+ case RB_TYPE_TIME_EXTENT:
+ event->len =
+ (RB_LEN_TIME_EXTENT + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RB_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
+
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct buffer_page *head_page, *tail_page;
+ unsigned long tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ if (tail + length > BUF_PAGE_SIZE) {
+ struct buffer_page *next_page = tail_page;
+
+ rb_inc_page(cpu_buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ rb_update_overflow(cpu_buffer);
+
+ rb_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ rb_reset_read_page(cpu_buffer);
+ }
+
+ if (tail != BUF_PAGE_SIZE) {
+ event = rb_page_index(tail_page, tail);
+ /* page padding */
+ event->type = RB_TYPE_PADDING;
+ }
+
+ tail_page->size = tail;
+ tail_page = next_page;
+ tail_page->size = 0;
+ tail = 0;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ rb_add_stamp(cpu_buffer, ts);
+ }
+
+ BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+ event = rb_page_index(tail_page, tail);
+ rb_update_event(event, type, length);
+
+ return event;
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ u64 ts, delta;
+ struct ring_buffer_event *event;
+ static int once;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ if (unlikely(delta > (1ULL << 59) && !once++)) {
+ printk(KERN_WARNING "Delta way too big! %llu"
+ " ts=%llu write stamp = %llu\n",
+ delta, ts, cpu_buffer->write_stamp);
+ WARN_ON(1);
+ }
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __rb_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (cpu_buffer->tail) {
+ /* Still on same page, update timestamp */
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ /* commit the time event */
+ cpu_buffer->tail +=
+ ring_buffer_event_length(event);
+ cpu_buffer->write_stamp = ts;
+ delta = 0;
+ }
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ /* If the reserve went to the next page, our delta is zero */
+ if (!cpu_buffer->tail)
+ delta = 0;
+
+ event->time_delta = delta;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ goto out_irq;
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto no_record;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ return NULL;
+
+ event = rb_reserve_next_event(cpu_buffer, RB_TYPE_DATA, length);
+ if (!event)
+ goto no_record;
+
+ return event;
+
+ no_record:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ out_irq:
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ cpu_buffer->tail += ring_buffer_event_length(event);
+ cpu_buffer->tail_page->size = cpu_buffer->tail;
+ cpu_buffer->write_stamp += event->time_delta;
+ cpu_buffer->entries++;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ __raw_assert_spin_is_locked(&cpu_buffer->lock);
+
+ rb_commit(cpu_buffer, event);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *body;
+ int ret = -EBUSY;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ goto out_irq;
+
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = rb_reserve_next_event(cpu_buffer,
+ RB_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ body = ring_buffer_event_data(event);
+
+ memcpy(body, data, length);
+
+ rb_commit(cpu_buffer, event);
+
+ ret = 0;
+ out:
+ __raw_spin_unlock(&cpu_buffer->lock);
+ out_irq:
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+ if (!cpu_isset(cpu, buffer->cpumask))
+ continue;
+ cpu_buffer = buffer->buffers[cpu];
+ __raw_spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ iter->head_page = cpu_buffer->head_page;
+ iter->head = cpu_buffer->head;
+ rb_reset_iter_read_page(iter);
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->tail_page &&
+ iter->head == cpu_buffer->tail;
+}
+
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ cpu_buffer->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ return;
+
+ case RB_TYPE_TIME_EXTENT:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ return;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RB_TYPE_DATA:
+ iter->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void rb_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (cpu_buffer->head >= cpu_buffer->head_page->size) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ return;
+ }
+
+ event = rb_head_event(cpu_buffer);
+
+ if (event->type == RB_TYPE_DATA)
+ cpu_buffer->entries--;
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + length > cpu_buffer->tail));
+
+ rb_update_read_stamp(cpu_buffer, event);
+
+ cpu_buffer->head += length;
+
+ /* check for end of page */
+ if ((cpu_buffer->head >= cpu_buffer->head_page->size) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ rb_advance_head(cpu_buffer);
+}
+
+static void rb_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (iter->head >= iter->head_page->size) {
+ BUG_ON(iter->head_page == cpu_buffer->tail_page);
+ rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ return;
+ }
+
+ event = rb_iter_head_event(iter);
+
+ length = ring_buffer_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+ (iter->head + length > cpu_buffer->tail));
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ if ((iter->head >= iter->head_page->size) &&
+ (iter->head_page != cpu_buffer->tail_page))
+ rb_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = rb_head_event(cpu_buffer);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ rb_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_head(cpu_buffer);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = rb_iter_head_event(iter);
+
+ switch (event->type) {
+ case RB_TYPE_PADDING:
+ rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ goto again;
+
+ case RB_TYPE_TIME_EXTENT:
+ /* Internal data, OK to advance */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RB_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ rb_advance_head(cpu_buffer);
+
+ return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ __raw_spin_lock(&cpu_buffer->lock);
+ iter->head = cpu_buffer->head;
+ iter->head_page = cpu_buffer->head_page;
+ rb_reset_iter_read_page(iter);
+ __raw_spin_unlock(&cpu_buffer->lock);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ rb_advance_iter(iter);
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ raw_local_irq_save(flags);
+ __raw_spin_lock(&cpu_buffer->lock);
+
+ rb_reset_cpu(cpu_buffer);
+
+ __raw_spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset - reset a ring buffer
+ * @buffer: The ring buffer to reset all cpu buffers
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ unsigned long flags;
+ int cpu;
+
+ ring_buffer_lock(buffer, &flags);
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_reset_cpu(buffer->buffers[cpu]);
+
+ ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!rb_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 1;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return rb_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ if (!cpu_isset(cpu, buffer_a->cpumask) ||
+ !cpu_isset(cpu, buffer_b->cpumask))
+ return -EINVAL;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ /*
+ * We can't do a synchronize_sched here because this
+ * function can be called in atomic context.
+ * Normally this will be called from the same CPU as cpu.
+ * If not it's up to the caller to protect this.
+ */
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
Index: linux-trace.git/kernel/trace/Kconfig
===================================================================
--- linux-trace.git.orig/kernel/trace/Kconfig 2008-09-27 01:58:49.000000000 -0400
+++ linux-trace.git/kernel/trace/Kconfig 2008-09-27 01:59:06.000000000 -0400
@@ -10,10 +10,14 @@ config HAVE_DYNAMIC_FTRACE
config TRACER_MAX_TRACE
bool

+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
select STACKTRACE
+ select RING_BUFFER

config FTRACE
bool "Kernel Function Tracer"
Index: linux-trace.git/kernel/trace/Makefile
===================================================================
--- linux-trace.git.orig/kernel/trace/Makefile 2008-09-27 01:58:49.000000000 -0400
+++ linux-trace.git/kernel/trace/Makefile 2008-09-27 01:59:06.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
Ingo Molnar
2008-09-27 18:39:12 UTC
Permalink
Post by Steven Rostedt
Index: linux-trace.git/include/linux/ring_buffer.h
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
RB_ clashes with red-black tree namespace. (on the thought level)
Post by Steven Rostedt
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
no need to put numeric literals into parenthesis.
Post by Steven Rostedt
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ /* undefined */
+ return -1;
+
+ return RB_LEN_TIME_EXTENT;
+
+ return RB_LEN_TIME_STAMP;
+
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ BUG();
+ }
+ /* not hit */
+ return 0;
too large, please uninline.
Post by Steven Rostedt
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
ditto.
Post by Steven Rostedt
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return sched_clock() << DEBUG_SHIFT;
[ duly noted ;-) ]
Post by Steven Rostedt
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
needs extra newline above.
Post by Steven Rostedt
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
hm, should not be raw, at least initially. I am 95% sure we'll see
lockups, we always did when we iterated ftrace's buffer implementation
;-)
Post by Steven Rostedt
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
please use consistent vertical whitespaces. Above, in the struct
ring_buffer definition, you can add another tab to most of the vars -
that will also make the '**buffers' line look nice.

same for all structs across this file. In my experience, a 50% vertical
break works best - the one you used here in 'struct ring_buffer_iter'.
Post by Steven Rostedt
+};
+
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
please name it RINGBUFFER_BUG_ON() / RINGBUFFER_WARN_ON(), so that we
dont have to memorize another set of debug names. [ See
DEBUG_LOCKS_WARN_ON() in include/linux/debug_locks.h ]
Post by Steven Rostedt
+static int
+rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ LIST_HEAD(pages);
+ struct buffer_page *page, *tmp;
+ unsigned long addr;
+ unsigned i;
please apply ftrace's standard reverse christmas tree style and move the
'pages' line down two lines.
Post by Steven Rostedt
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long buffer_size;
+ LIST_HEAD(pages);
+ unsigned long addr;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct buffer_page *page, *tmp;
+ int i, cpu;
ditto.
Post by Steven Rostedt
+static inline void *rb_page_index(struct buffer_page *page, unsigned index)
+{
+ void *addr;
+
+ addr = page_address(&page->page);
'addr' initialization can move to the definition line - you save two
lines.
Post by Steven Rostedt
+ return addr + index;
+}
+
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head);
can all move to the same return line.
Post by Steven Rostedt
+}
+
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return rb_page_index(iter->head_page,
+ iter->head);
ditto.
Post by Steven Rostedt
+ for (head = 0; head < rb_head_size(cpu_buffer);
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_index(cpu_buffer->head_page, head);
+ BUG_ON(rb_null_event(event));
( optional:when there's a multi-line loop then i generally try to insert
an extra newline when starting the body - to make sure the iterator
and the body stands apart visually. Matter of taste. )
Post by Steven Rostedt
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ u64 ts, delta;
+ struct ring_buffer_event *event;
+ static int once;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ if (unlikely(delta > (1ULL << 59) && !once++)) {
+ printk(KERN_WARNING "Delta way too big! %llu"
+ " ts=%llu write stamp = %llu\n",
+ delta, ts, cpu_buffer->write_stamp);
+ WARN_ON(1);
+ }
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __rb_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (cpu_buffer->tail) {
+ /* Still on same page, update timestamp */
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ /* commit the time event */
+ cpu_buffer->tail +=
+ ring_buffer_event_length(event);
+ cpu_buffer->write_stamp = ts;
+ delta = 0;
+ }
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ /* If the reserve went to the next page, our delta is zero */
+ if (!cpu_buffer->tail)
+ delta = 0;
+
+ event->time_delta = delta;
+
+ return event;
+}
this function is too long, please split it up. The first condition's
body could go into a separate function i guess.
Post by Steven Rostedt
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
please use standard comment style:

/*
* Comment
*/

Ingo
Steven Rostedt
2008-09-27 19:24:54 UTC
Permalink
Hi Ingo,

Thanks for the review!
Post by Ingo Molnar
Post by Steven Rostedt
Index: linux-trace.git/include/linux/ring_buffer.h
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
RB_ clashes with red-black tree namespace. (on the thought level)
Yeah, Linus pointed this out with the rb_ static function names. But since
the functions are static I kept them as is. But here we have global names.

Would RNGBF_ be OK, or do you have any other ideas?
Post by Ingo Molnar
Post by Steven Rostedt
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA (28)
no need to put numeric literals into parenthesis.
Ah, I think I had it more complex and changed it to a literal without
removing the parenthesis.
Post by Ingo Molnar
Post by Steven Rostedt
+static inline unsigned
+ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ /* undefined */
+ return -1;
+
+ return RB_LEN_TIME_EXTENT;
+
+ return RB_LEN_TIME_STAMP;
+
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ BUG();
+ }
+ /* not hit */
+ return 0;
too large, please uninline.
I calculated this on x86_64 to add 78 bytes. Is that still too big?
Post by Ingo Molnar
Post by Steven Rostedt
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
ditto.
No biggy. I thought this would be nicer as inline. But I have no problem
changing this.
Post by Ingo Molnar
Post by Steven Rostedt
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return sched_clock() << DEBUG_SHIFT;
[ duly noted ;-) ]
Post by Steven Rostedt
+}
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
needs extra newline above.
Yeah, I kept them bounded just to stress the "FIXME" part ;-)
Post by Ingo Molnar
Post by Steven Rostedt
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ raw_spinlock_t lock;
hm, should not be raw, at least initially. I am 95% sure we'll see
lockups, we always did when we iterated ftrace's buffer implementation
;-)
It was to prevent lockdep from checking the locks from inside. We had
issues with ftroce and lockdep in the past, because ftrace would trace the
internals of lockdep, and lockdep would then recurse back into itself to
trace. If lockdep itself can get away with not using raw_spinlocks, then
this will be OK to make back to spinlock.
Post by Ingo Molnar
Post by Steven Rostedt
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
please use consistent vertical whitespaces. Above, in the struct
ring_buffer definition, you can add another tab to most of the vars -
that will also make the '**buffers' line look nice.
OK, will fix.
Post by Ingo Molnar
same for all structs across this file. In my experience, a 50% vertical
break works best - the one you used here in 'struct ring_buffer_iter'.
Post by Steven Rostedt
+};
+
+#define CHECK_COND(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
please name it RINGBUFFER_BUG_ON() / RINGBUFFER_WARN_ON(), so that we
dont have to memorize another set of debug names. [ See
DEBUG_LOCKS_WARN_ON() in include/linux/debug_locks.h ]
OK, this was a direct copy from what was used in ftrace.
Post by Ingo Molnar
Post by Steven Rostedt
+static int
+rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ LIST_HEAD(pages);
+ struct buffer_page *page, *tmp;
+ unsigned long addr;
+ unsigned i;
please apply ftrace's standard reverse christmas tree style and move the
'pages' line down two lines.
Heh, this was directly from a bug I had and laziness ;-)
I originally just had struct list_head pages (and no *tmp), which kept the
christmas tree format. But later found that you need to initialize list
heads (duh!), and never moved it.
Post by Ingo Molnar
Post by Steven Rostedt
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long buffer_size;
+ LIST_HEAD(pages);
+ unsigned long addr;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct buffer_page *page, *tmp;
+ int i, cpu;
ditto.
Same reason.
Post by Ingo Molnar
Post by Steven Rostedt
+static inline void *rb_page_index(struct buffer_page *page, unsigned index)
+{
+ void *addr;
+
+ addr = page_address(&page->page);
'addr' initialization can move to the definition line - you save two
lines.
Will fix.
Post by Ingo Molnar
Post by Steven Rostedt
+ return addr + index;
+}
+
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head);
can all move to the same return line.
Ah, this was caused by my s/ring_buffer_page_index/rb_page_index/ run.
Post by Ingo Molnar
Post by Steven Rostedt
+}
+
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return rb_page_index(iter->head_page,
+ iter->head);
ditto.
Will fix.
Post by Ingo Molnar
Post by Steven Rostedt
+ for (head = 0; head < rb_head_size(cpu_buffer);
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_index(cpu_buffer->head_page, head);
+ BUG_ON(rb_null_event(event));
( optional:when there's a multi-line loop then i generally try to insert
an extra newline when starting the body - to make sure the iterator
and the body stands apart visually. Matter of taste. )
Will fix, I have no preference.
Post by Ingo Molnar
Post by Steven Rostedt
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ u64 ts, delta;
+ struct ring_buffer_event *event;
+ static int once;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ if (unlikely(delta > (1ULL << 59) && !once++)) {
+ printk(KERN_WARNING "Delta way too big! %llu"
+ " ts=%llu write stamp = %llu\n",
+ delta, ts, cpu_buffer->write_stamp);
+ WARN_ON(1);
+ }
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __rb_reserve_next(cpu_buffer,
+ RB_TYPE_TIME_EXTENT,
+ RB_LEN_TIME_EXTENT,
+ &ts);
+ if (!event)
+ return NULL;
+
+ /* check to see if we went to the next page */
+ if (cpu_buffer->tail) {
+ /* Still on same page, update timestamp */
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ /* commit the time event */
+ cpu_buffer->tail +=
+ ring_buffer_event_length(event);
+ cpu_buffer->write_stamp = ts;
+ delta = 0;
+ }
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ /* If the reserve went to the next page, our delta is zero */
+ if (!cpu_buffer->tail)
+ delta = 0;
+
+ event->time_delta = delta;
+
+ return event;
+}
this function is too long, please split it up. The first condition's
body could go into a separate function i guess.
Will fix.
Post by Ingo Molnar
Post by Steven Rostedt
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
/*
* Comment
*/
Hmm, this is interesting. I kind of like this because it is not really a
standard comment. It is a comment about the definitions of the enum. I
believe if they are above:

/*
* Comment
*/
RB_ENUM_TYPE,

It is not as readable. But if we do:

RB_ENUM_TYPE, /*
* Comment
*/

The comment is not at the same line as the enum, which also looks
unpleasing.

We can't could do:

/*
RB_ENUM_TYPE, * Comment
*/
/*
RB_ENUM_TYPE2, * Comment
*/

Because the ENUM is also in the comment :-p


I chose this way because we have:

RB_ENUM_TYPE, /* Comment
* More comment
*/
RB_ENUM_TYPE2, /* Comment
*/

Since I find this the nices way to describe enums. That last */ is
good to space the comments apart, otherwise we have:

RB_ENUM_TYPE, /* Comment
* More comment */
RB_ENUM_TYPE2, /* Comment */

That is not as easy to see the separation of one description of enums with
the other.

-- Steve
Ingo Molnar
2008-09-27 19:41:05 UTC
Permalink
Post by Steven Rostedt
Post by Ingo Molnar
Post by Steven Rostedt
Index: linux-trace.git/include/linux/ring_buffer.h
+enum {
+ RB_TYPE_PADDING, /* Left over page padding
RB_ clashes with red-black tree namespace. (on the thought level)
Yeah, Linus pointed this out with the rb_ static function names. But since
the functions are static I kept them as is. But here we have global names.
Would RNGBF_ be OK, or do you have any other ideas?
that's even worse i think :-/ And this isnt bikeshed-painting really,
the RNGBF_ name hurts my eyes and RB_ is definitely confusing to read.
(as the rbtree constants are in capitals as well and similarly named)

RING_TYPE_PADDING

or:

RINGBUF_TYPE_PADDING

yes, it's longer, but still, saner.
Post by Steven Rostedt
Post by Ingo Molnar
too large, please uninline.
I calculated this on x86_64 to add 78 bytes. Is that still too big?
yes, way too big. Sometimes we make savings from a 10 bytes function
already. (but it's always case dependent - if a function has a lot of
parameters then uninlining can hurt)

the only exception would be if there's normally only a single
instantiation per tracer, and if it's in the absolute tracing hotpath.
Post by Steven Rostedt
Post by Ingo Molnar
hm, should not be raw, at least initially. I am 95% sure we'll see
lockups, we always did when we iterated ftrace's buffer
implementation ;-)
It was to prevent lockdep from checking the locks from inside. We had
issues with ftroce and lockdep in the past, because ftrace would trace
the internals of lockdep, and lockdep would then recurse back into
itself to trace. If lockdep itself can get away with not using
raw_spinlocks, then this will be OK to make back to spinlock.
would be nice to make sure that ftrace's recursion checks work as
intended - and the same goes for lockdep's recursion checks. Yes, we had
problems in this area, and it would be nice to make sure it all works
fine. (or fix it if it doesnt)
Post by Steven Rostedt
Post by Ingo Molnar
Post by Steven Rostedt
+ for (head = 0; head < rb_head_size(cpu_buffer);
+ head += ring_buffer_event_length(event)) {
+ event = rb_page_index(cpu_buffer->head_page, head);
+ BUG_ON(rb_null_event(event));
( optional:when there's a multi-line loop then i generally try to insert
an extra newline when starting the body - to make sure the iterator
and the body stands apart visually. Matter of taste. )
Will fix, I have no preference.
clarification: multi-line loop _condition_. It's pretty rare (this is
such a case) but sometimes unavoidable - and then the newline helps
visually.
Post by Steven Rostedt
Post by Ingo Molnar
Post by Steven Rostedt
+ RB_TYPE_TIME_EXTENT, /* Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
/*
* Comment
*/
Hmm, this is interesting. I kind of like this because it is not really a
standard comment. It is a comment about the definitions of the enum. I
/*
* Comment
*/
RB_ENUM_TYPE,
RB_ENUM_TYPE, /*
* Comment
*/
The comment is not at the same line as the enum, which also looks
unpleasing.
RB_ENUM_TYPE, /* Comment
*/
So i suggested to fix it to:

+ RB_TYPE_TIME_EXTENT, /*
+ * Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */

ok? I.e. "comment" should have the same visual properties as other
comments.

I fully agree with moving it next to the enum, i sometimes use that
style too, it's a nice touch and more readable in this case than
comment-ahead. (which we use for statements)

Ingo
Steven Rostedt
2008-09-27 19:54:07 UTC
Permalink
Post by Ingo Molnar
that's even worse i think :-/ And this isnt bikeshed-painting really,
the RNGBF_ name hurts my eyes and RB_ is definitely confusing to read.
(as the rbtree constants are in capitals as well and similarly named)
RING_TYPE_PADDING
RINGBUF_TYPE_PADDING
yes, it's longer, but still, saner.
I don't mind the extra typing, it is just a bit more difficult to keep in
the 80 character line limit.
Post by Ingo Molnar
Post by Steven Rostedt
Post by Ingo Molnar
too large, please uninline.
I calculated this on x86_64 to add 78 bytes. Is that still too big?
yes, way too big. Sometimes we make savings from a 10 bytes function
already. (but it's always case dependent - if a function has a lot of
parameters then uninlining can hurt)
the only exception would be if there's normally only a single
instantiation per tracer, and if it's in the absolute tracing hotpath.
It is a hot path in the internals. Perhaps I'll make an inline function
in the interal code "rb_event_length" and have the other users call.

unsigned ring_buffer_event(struct ring_buffer_event *event)
{
return rb_event_length(event);
}
Post by Ingo Molnar
Post by Steven Rostedt
RB_ENUM_TYPE, /*
* Comment
*/
The comment is not at the same line as the enum, which also looks
unpleasing.
RB_ENUM_TYPE, /* Comment
*/
+ RB_TYPE_TIME_EXTENT, /*
+ * Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
ok? I.e. "comment" should have the same visual properties as other
comments.
I fully agree with moving it next to the enum, i sometimes use that
style too, it's a nice touch and more readable in this case than
comment-ahead. (which we use for statements)
But then we have:

RB_TYPE_PADDING, /*
* Left over page padding
* array is ignored
* size is variable depending on
* how much padding is needed
*/
RB_TYPE_TIME_EXTENT, /*
* Extent the time delta
* array[0] = time delta (28 .. 59)
* size = 8 bytes
*/

Where it is not as easy to see which comment is with which enum.
Especially when you have many enums. That's why I like the method I used
with:

RB_TYPE_PADDING, /* Left over page padding
* array is ignored
* size is variable depending on
* how much padding is needed
*/
RB_TYPE_TIME_EXTENT, /* Extent the time delta
* array[0] = time delta (28 .. 59)
* size = 8 bytes
*/

Where it is very easy to notice which comment goes with which enum.

-- Steve
Ingo Molnar
2008-09-27 20:00:28 UTC
Permalink
Post by Steven Rostedt
Post by Ingo Molnar
RINGBUF_TYPE_PADDING
yes, it's longer, but still, saner.
I don't mind the extra typing, it is just a bit more difficult to keep
in the 80 character line limit.
that's really not a hard limit, but yeah.

generally, with clean and simple functions it's easy to keep it.
Post by Steven Rostedt
Post by Ingo Molnar
yes, way too big. Sometimes we make savings from a 10 bytes function
already. (but it's always case dependent - if a function has a lot
of parameters then uninlining can hurt)
the only exception would be if there's normally only a single
instantiation per tracer, and if it's in the absolute tracing hotpath.
It is a hot path in the internals. Perhaps I'll make an inline
function in the interal code "rb_event_length" and have the other
users call.
unsigned ring_buffer_event(struct ring_buffer_event *event)
{
return rb_event_length(event);
}
yeah, sounds sane.
Post by Steven Rostedt
Post by Ingo Molnar
Post by Steven Rostedt
RB_ENUM_TYPE, /*
* Comment
*/
The comment is not at the same line as the enum, which also looks
unpleasing.
RB_ENUM_TYPE, /* Comment
*/
+ RB_TYPE_TIME_EXTENT, /*
+ * Extent the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ */
ok? I.e. "comment" should have the same visual properties as other
comments.
I fully agree with moving it next to the enum, i sometimes use that
style too, it's a nice touch and more readable in this case than
comment-ahead. (which we use for statements)
RB_TYPE_PADDING, /*
* Left over page padding
* array is ignored
* size is variable depending on
* how much padding is needed
*/
RB_TYPE_TIME_EXTENT, /*
* Extent the time delta
* array[0] = time delta (28 .. 59)
* size = 8 bytes
*/
Where it is not as easy to see which comment is with which enum.
Especially when you have many enums. That's why I like the method I
RB_TYPE_PADDING, /* Left over page padding
* array is ignored
* size is variable depending on
* how much padding is needed
*/
RB_TYPE_TIME_EXTENT, /* Extent the time delta
* array[0] = time delta (28 .. 59)
* size = 8 bytes
*/
Where it is very easy to notice which comment goes with which enum.
RB_TYPE_PADDING, /*
* Left over page padding
* array is ignored
* size is variable depending on
* how much padding is needed
*/
RB_TYPE_TIME_EXTENT, /*
* Extent the time delta
* array[0] = time delta (28 .. 59)
* size = 8 bytes
*/
/*
* Left over page padding. 'array' is ignored,
* 'size' is variable depending on how much padding is needed.
*/
RB_TYPE_PADDING,
/*
* Extent the time delta,
* array[0] = time delta (28 .. 59), size = 8 bytes
*/
RB_TYPE_TIME_EXTENT,
oh, btw., that's a spelling mistake: s/extend/extend ?

Ingo
Steven Rostedt
2008-09-29 15:05:39 UTC
Permalink
Post by Steven Rostedt
RB_ENUM_TYPE, /*
* Comment
*/
The comment is not at the same line as the enum, which also looks
unpleasing.
RB_ENUM_TYPE, /* Comment
*/
OK, I did a quick survey of what others did in include/linux to handle
multi line comments for enums. I ignored the single line comments since
that is pretty standard. Here's what I found:

Those that do:

enum myenum {
ENUM_PING_PONG, /* Bounce a ball back and forth
till you have a winner. */
ENUM_HONEY_CONE, /* Soft and sweet a yummy for
the tummy. */
};

include/linux/atmdev.h
include/linux/fd.h
include/linux/hil.h
include/linux/if_pppol2tp.h
include/linux/ivtv.h
include/linux/libata.h
include/linux/mmzone.h
include/linux/reiserfs_fs.h
include/linux/reiserfs_fs_sb.h
include/linux/rtnetlink.h
include/linux/scc.h
include/linux/videodev2.h

Those that do:

enum myenum {
ENUM_PING_PONG, /* Bounce a ball back and forth */
/* till you have a winner. */
ENUM_HONEY_CONE, /* Soft and sweet a yummy for */
/* the tummy. */
};

include/linux/atmsvc.h
include/linux/pktcdvd.h


Those that do (what I did):

enum myenum {
ENUM_PING_PONG, /* Bounce a ball back and forth
* till you have a winner.
*/
ENUM_HONEY_CONE, /* Soft and sweet a yummy for
* the tummy.
*/
};

include/linux/buffer_head.h (with space between the two enums)
include/linux/personality.h


Those that do:

enum myenum {
/*
* Bounce a ball back and forth
* till you have a winner.
*/
ENUM_PING_PONG,
/*
* Soft and sweet a yummy for
* the tummy.
*/
ENUM_HONEY_CONE,
};

include/linux/cgroup.h
include/linux/cn_proc.h
include/linux/exportfs.h
include/linux/fb.h
include/linux/hil_mlc.h
include/linux/pci.h
include/linux/reiserfs_fs_i.h


And finally Doc book style:

/**
* enum myenum
* @ENUM_PING_PONG: Bounce a ball back and forth
* till you have a winner.
* @ENUM_HONEY_CONE: Soft and sweet a yummy for
* the tummy.
*/
enum myenum {
ENUM_PING_PONG,
ENUM_HONEY_CONE,
};

Note I did not see any enum users that did what you asked:

enum myenum {
ENUM_PING_PONG, /*
* Bounce a ball back and forth
* till you have a winner.
*/
ENUM_HONEY_CONE, /*
* Soft and sweet a yummy for
* the tummy.
*/
};

So by adding that, I will be adding yet another format.

Actually I think the docbook style is the most appropriate for me. I'll go
with that one.

Thanks,

-- Steve
Martin Bligh
2008-09-27 20:07:14 UTC
Permalink
Post by Steven Rostedt
Post by Ingo Molnar
that's even worse i think :-/ And this isnt bikeshed-painting really,
the RNGBF_ name hurts my eyes and RB_ is definitely confusing to read.
(as the rbtree constants are in capitals as well and similarly named)
RING_TYPE_PADDING
RINGBUF_TYPE_PADDING
yes, it's longer, but still, saner.
I don't mind the extra typing, it is just a bit more difficult to keep in
the 80 character line limit.
Would using tb_ (trace buffer) rather than rb_ help ?
Ingo Molnar
2008-09-27 20:34:27 UTC
Permalink
Post by Martin Bligh
Post by Steven Rostedt
Post by Ingo Molnar
that's even worse i think :-/ And this isnt bikeshed-painting really,
the RNGBF_ name hurts my eyes and RB_ is definitely confusing to read.
(as the rbtree constants are in capitals as well and similarly named)
RING_TYPE_PADDING
RINGBUF_TYPE_PADDING
yes, it's longer, but still, saner.
I don't mind the extra typing, it is just a bit more difficult to keep in
the 80 character line limit.
Would using tb_ (trace buffer) rather than rb_ help ?
excellent idea ...

Ingo
Steven Rostedt
2008-09-29 16:10:08 UTC
Permalink
[
This is the final version of this patch. From now on, I will be sending
changes on top of this patch.

Changes since v9:

All suggestions from Ingo Molnar.

- Changed comment of enum to DocBook style.

- Replaced the RB_TYPE_ enums with RINGBUF_TYPE_ prefixes to avoid
name collision with rbtree. Note, I did not use the TB_ extension
because I envision a "trace_buffer" layer on top of this layer
in the future.

- Moved ring_buffer_event_{length,data} into the .c file and added
internal inlines. External uses will need to call the function.

- Broke out rb_add_time_stamp function from rb_reserve_next_event.

- made the cpu_buffer->lock back to a normal spin lock.

- The rest are style changes.

]

This is a unified tracing buffer that implements a ring buffer that
hopefully everyone will eventually be able to use.

The events recorded into the buffer have the following structure:

struct ring_buffer_event {
u32 type:2, len:3, time_delta:27;
u32 array[];
};

The minimum size of an event is 8 bytes. All events are 4 byte
aligned inside the buffer.

There are 4 types (all internal use for the ring buffer, only
the data type is exported to the interface users).

RINGBUF_TYPE_PADDING: this type is used to note extra space at the end
of a buffer page.

RINGBUF_TYPE_TIME_EXTENT: This type is used when the time between events
is greater than the 27 bit delta can hold. We add another
32 bits, and record that in its own event (8 byte size).

RINGBUF_TYPE_TIME_STAMP: (Not implemented yet). This will hold data to
help keep the buffer timestamps in sync.

RINGBUF_TYPE_DATA: The event actually holds user data.

The "len" field is only three bits. Since the data must be
4 byte aligned, this field is shifted left by 2, giving a
max length of 28 bytes. If the data load is greater than 28
bytes, the first array field holds the full length of the
data load and the len field is set to zero.

Example, data size of 7 bytes:

type = RINGBUF_TYPE_DATA
len = 2
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0..1]: <7 bytes of data> <1 byte empty>

This event is saved in 12 bytes of the buffer.

An event with 82 bytes of data:

type = RINGBUF_TYPE_DATA
len = 0
time_delta: <time-stamp> - <prev_event-time-stamp>
array[0]: 84 (Note the alignment)
array[1..14]: <82 bytes of data> <2 bytes empty>

The above event is saved in 92 bytes (if my math is correct).
82 bytes of data, 2 bytes empty, 4 byte header, 4 byte length.

Do not reference the above event struct directly. Use the following
functions to gain access to the event table, since the
ring_buffer_event structure may change in the future.

ring_buffer_event_length(event): get the length of the event.
This is the size of the memory used to record this
event, and not the size of the data pay load.

ring_buffer_time_delta(event): get the time delta of the event
This returns the delta time stamp since the last event.
Note: Even though this is in the header, there should
be no reason to access this directly, accept
for debugging.

ring_buffer_event_data(event): get the data from the event
This is the function to use to get the actual data
from the event. Note, it is only a pointer to the
data inside the buffer. This data must be copied to
another location otherwise you risk it being written
over in the buffer.

ring_buffer_lock: A way to lock the entire buffer.
ring_buffer_unlock: unlock the buffer.

ring_buffer_alloc: create a new ring buffer. Can choose between
overwrite or consumer/producer mode. Overwrite will
overwrite old data, where as consumer producer will
throw away new data if the consumer catches up with the
producer. The consumer/producer is the default.

ring_buffer_free: free the ring buffer.

ring_buffer_resize: resize the buffer. Changes the size of each cpu
buffer. Note, it is up to the caller to provide that
the buffer is not being used while this is happening.
This requirement may go away but do not count on it.

ring_buffer_lock_reserve: locks the ring buffer and allocates an
entry on the buffer to write to.
ring_buffer_unlock_commit: unlocks the ring buffer and commits it to
the buffer.

ring_buffer_write: writes some data into the ring buffer.

ring_buffer_peek: Look at a next item in the cpu buffer.
ring_buffer_consume: get the next item in the cpu buffer and
consume it. That is, this function increments the head
pointer.

ring_buffer_read_start: Start an iterator of a cpu buffer.
For now, this disables the cpu buffer, until you issue
a finish. This is just because we do not want the iterator
to be overwritten. This restriction may change in the future.
But note, this is used for static reading of a buffer which
is usually done "after" a trace. Live readings would want
to use the ring_buffer_consume above, which will not
disable the ring buffer.

ring_buffer_read_finish: Finishes the read iterator and reenables
the ring buffer.

ring_buffer_iter_peek: Look at the next item in the cpu iterator.
ring_buffer_read: Read the iterator and increment it.
ring_buffer_iter_reset: Reset the iterator to point to the beginning
of the cpu buffer.
ring_buffer_iter_empty: Returns true if the iterator is at the end
of the cpu buffer.

ring_buffer_size: returns the size in bytes of each cpu buffer.
Note, the real size is this times the number of CPUs.

ring_buffer_reset_cpu: Sets the cpu buffer to empty
ring_buffer_reset: sets all cpu buffers to empty

ring_buffer_swap_cpu: swaps a cpu buffer from one buffer with a
cpu buffer of another buffer. This is handy when you
want to take a snap shot of a running trace on just one
cpu. Having a backup buffer, to swap with facilitates this.
Ftrace max latencies use this.

ring_buffer_empty: Returns true if the ring buffer is empty.
ring_buffer_empty_cpu: Returns true if the cpu buffer is empty.

ring_buffer_record_disable: disable all cpu buffers (read only)
ring_buffer_record_disable_cpu: disable a single cpu buffer (read only)
ring_buffer_record_enable: enable all cpu buffers.
ring_buffer_record_enabl_cpu: enable a single cpu buffer.

ring_buffer_entries: The number of entries in a ring buffer.
ring_buffer_overruns: The number of entries removed due to writing wrap.

ring_buffer_time_stamp: Get the time stamp used by the ring buffer
ring_buffer_normalize_time_stamp: normalize the ring buffer time stamp
into nanosecs.

I still need to implement the GTOD feature. But we need support from
the cpu frequency infrastructure. But this can be done at a later
time without affecting the ring buffer interface.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
include/linux/ring_buffer.h | 130 +++
kernel/trace/Kconfig | 4
kernel/trace/Makefile | 1
kernel/trace/ring_buffer.c | 1672 ++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 1807 insertions(+)

Index: linux-trace.git/include/linux/ring_buffer.h
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/include/linux/ring_buffer.h 2008-09-29 11:12:32.000000000 -0400
@@ -0,0 +1,130 @@
+#ifndef _LINUX_RING_BUFFER_H
+#define _LINUX_RING_BUFFER_H
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+
+struct ring_buffer;
+struct ring_buffer_iter;
+
+/*
+ * Don't reference this struct directly, use functions below.
+ */
+struct ring_buffer_event {
+ u32 type:2, len:3, time_delta:27;
+ u32 array[];
+};
+
+/**
+ * enum ring_buffer_type - internal ring buffer types
+ *
+ * @RINGBUF_TYPE_PADDING: Left over page padding
+ * array is ignored
+ * size is variable depending on how much
+ * padding is needed
+ *
+ * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta
+ * array[0] = time delta (28 .. 59)
+ * size = 8 bytes
+ *
+ * @RINGBUF_TYPE_TIME_STAMP: Sync time stamp with external clock
+ * array[0] = tv_nsec
+ * array[1] = tv_sec
+ * size = 16 bytes
+ *
+ * @RINGBUF_TYPE_DATA: Data record
+ * If len is zero:
+ * array[0] holds the actual length
+ * array[1..(length+3)/4-1] holds data
+ * else
+ * length = len << 2
+ * array[0..(length+3)/4] holds data
+ */
+enum ring_buffer_type {
+ RINGBUF_TYPE_PADDING,
+ RINGBUF_TYPE_TIME_EXTEND,
+ /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
+ RINGBUF_TYPE_TIME_STAMP,
+ RINGBUF_TYPE_DATA,
+};
+
+unsigned ring_buffer_event_length(struct ring_buffer_event *event);
+void *ring_buffer_event_data(struct ring_buffer_event *event);
+
+/**
+ * ring_buffer_event_time_delta - return the delta timestamp of the event
+ * @event: the event to get the delta timestamp of
+ *
+ * The delta timestamp is the 27 bit timestamp since the last event.
+ */
+static inline unsigned
+ring_buffer_event_time_delta(struct ring_buffer_event *event)
+{
+ return event->time_delta;
+}
+
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags);
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags);
+
+/*
+ * size is in bytes for each per CPU buffer.
+ */
+struct ring_buffer *
+ring_buffer_alloc(unsigned long size, unsigned flags);
+void ring_buffer_free(struct ring_buffer *buffer);
+
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
+
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags);
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags);
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length, void *data);
+
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts);
+
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu);
+void ring_buffer_read_finish(struct ring_buffer_iter *iter);
+
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts);
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts);
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter);
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
+
+unsigned long ring_buffer_size(struct ring_buffer *buffer);
+
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_reset(struct ring_buffer *buffer);
+
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu);
+
+int ring_buffer_empty(struct ring_buffer *buffer);
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu);
+
+void ring_buffer_record_disable(struct ring_buffer *buffer);
+void ring_buffer_record_enable(struct ring_buffer *buffer);
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu);
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu);
+
+unsigned long ring_buffer_entries(struct ring_buffer *buffer);
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
+
+u64 ring_buffer_time_stamp(int cpu);
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+
+enum ring_buffer_flags {
+ RB_FL_OVERWRITE = 1 << 0,
+};
+
+#endif /* _LINUX_RING_BUFFER_H */
Index: linux-trace.git/kernel/trace/ring_buffer.c
===================================================================
--- /dev/null 1970-01-01 00:00:00.000000000 +0000
+++ linux-trace.git/kernel/trace/ring_buffer.c 2008-09-29 11:37:43.000000000 -0400
@@ -0,0 +1,1672 @@
+/*
+ * Generic ring buffer
+ *
+ * Copyright (C) 2008 Steven Rostedt <***@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/spinlock.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/mutex.h>
+#include <linux/sched.h> /* used for sched_clock() (for now) */
+#include <linux/init.h>
+#include <linux/hash.h>
+#include <linux/list.h>
+#include <linux/fs.h>
+
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+/* FIXME!!! */
+u64 ring_buffer_time_stamp(int cpu)
+{
+ /* shift to debug/test normalization and TIME_EXTENTS */
+ return sched_clock() << DEBUG_SHIFT;
+}
+
+void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
+{
+ /* Just stupid testing the normalize function and deltas */
+ *ts >>= DEBUG_SHIFT;
+}
+
+#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_ALIGNMENT_SHIFT 2
+#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
+#define RB_MAX_SMALL_DATA 28
+
+enum {
+ RB_LEN_TIME_EXTEND = 8,
+ RB_LEN_TIME_STAMP = 16,
+};
+
+/* inline for ring buffer fast paths */
+static inline unsigned
+rb_event_length(struct ring_buffer_event *event)
+{
+ unsigned length;
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ /* undefined */
+ return -1;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ return RB_LEN_TIME_EXTEND;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ return RB_LEN_TIME_STAMP;
+
+ case RINGBUF_TYPE_DATA:
+ if (event->len)
+ length = event->len << RB_ALIGNMENT_SHIFT;
+ else
+ length = event->array[0];
+ return length + RB_EVNT_HDR_SIZE;
+ default:
+ BUG();
+ }
+ /* not hit */
+ return 0;
+}
+
+/**
+ * ring_buffer_event_length - return the length of the event
+ * @event: the event to get the length of
+ */
+unsigned ring_buffer_event_length(struct ring_buffer_event *event)
+{
+ return rb_event_length(event);
+}
+
+/* inline for ring buffer fast paths */
+static inline void *
+rb_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RINGBUF_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
+}
+
+/**
+ * ring_buffer_event_data - return the data of the event
+ * @event: the event to get the data from
+ */
+void *ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ return rb_event_data(event);
+}
+
+#define for_each_buffer_cpu(buffer, cpu) \
+ for_each_cpu_mask(cpu, buffer->cpumask)
+
+#define TS_SHIFT 27
+#define TS_MASK ((1ULL << TS_SHIFT) - 1)
+#define TS_DELTA_TEST (~TS_MASK)
+
+/*
+ * This hack stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ */
+struct buffer_page {
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ u64 time_stamp; /* page time stamp */
+ unsigned size; /* size of page data */
+ struct list_head list; /* list of free pages */
+ };
+ struct page page;
+ };
+};
+
+/*
+ * We need to fit the time_stamp delta into 27 bits.
+ */
+static inline int test_time_stamp(u64 delta)
+{
+ if (delta & TS_DELTA_TEST)
+ return 1;
+ return 0;
+}
+
+#define BUF_PAGE_SIZE PAGE_SIZE
+
+/*
+ * head_page == tail_page && head == tail then buffer is empty.
+ */
+struct ring_buffer_per_cpu {
+ int cpu;
+ struct ring_buffer *buffer;
+ spinlock_t lock;
+ struct lock_class_key lock_key;
+ struct list_head pages;
+ unsigned long head; /* read from head */
+ unsigned long tail; /* write to tail */
+ struct buffer_page *head_page;
+ struct buffer_page *tail_page;
+ unsigned long overrun;
+ unsigned long entries;
+ u64 write_stamp;
+ u64 read_stamp;
+ atomic_t record_disabled;
+};
+
+struct ring_buffer {
+ unsigned long size;
+ unsigned pages;
+ unsigned flags;
+ int cpus;
+ cpumask_t cpumask;
+ atomic_t record_disabled;
+
+ struct mutex mutex;
+
+ struct ring_buffer_per_cpu **buffers;
+};
+
+struct ring_buffer_iter {
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long head;
+ struct buffer_page *head_page;
+ u64 read_stamp;
+};
+
+#define RB_WARN_ON(buffer, cond) \
+ if (unlikely(cond)) { \
+ atomic_inc(&buffer->record_disabled); \
+ WARN_ON(1); \
+ return -1; \
+ }
+
+/**
+ * check_pages - integrity check of buffer pages
+ * @cpu_buffer: CPU buffer with pages to test
+ *
+ * As a safty measure we check to make sure the data pages have not
+ * been corrupted.
+ */
+static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+
+ RB_WARN_ON(cpu_buffer, head->next->prev != head);
+ RB_WARN_ON(cpu_buffer, head->prev->next != head);
+
+ list_for_each_entry_safe(page, tmp, head, list) {
+ RB_WARN_ON(cpu_buffer, page->list.next->prev != &page->list);
+ RB_WARN_ON(cpu_buffer, page->list.prev->next != &page->list);
+ }
+
+ return 0;
+}
+
+static unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page->size;
+}
+
+static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned nr_pages)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+ unsigned long addr;
+ LIST_HEAD(pages);
+ unsigned i;
+
+ for (i = 0; i < nr_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = (struct buffer_page *)virt_to_page(addr);
+ list_add(&page->list, &pages);
+ }
+
+ list_splice(&pages, head);
+
+ rb_check_pages(cpu_buffer);
+
+ return 0;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, list) {
+ list_del_init(&page->list);
+ __free_page(&page->page);
+ }
+ return -ENOMEM;
+}
+
+static struct ring_buffer_per_cpu *
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int ret;
+
+ cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!cpu_buffer)
+ return NULL;
+
+ cpu_buffer->cpu = cpu;
+ cpu_buffer->buffer = buffer;
+ spin_lock_init(&cpu_buffer->lock);
+ INIT_LIST_HEAD(&cpu_buffer->pages);
+
+ ret = rb_allocate_pages(cpu_buffer, buffer->pages);
+ if (ret < 0)
+ goto fail_free_buffer;
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+ return cpu_buffer;
+
+ fail_free_buffer:
+ kfree(cpu_buffer);
+ return NULL;
+}
+
+static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct list_head *head = &cpu_buffer->pages;
+ struct buffer_page *page, *tmp;
+
+ list_for_each_entry_safe(page, tmp, head, list) {
+ list_del_init(&page->list);
+ __free_page(&page->page);
+ }
+ kfree(cpu_buffer);
+}
+
+/**
+ * ring_buffer_alloc - allocate a new ring_buffer
+ * @size: the size in bytes that is needed.
+ * @flags: attributes to set for the ring buffer.
+ *
+ * Currently the only flag that is available is the RB_FL_OVERWRITE
+ * flag. This flag means that the buffer will overwrite old data
+ * when the buffer wraps. If this flag is not set, the buffer will
+ * drop data when the tail hits the head.
+ */
+struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+{
+ struct ring_buffer *buffer;
+ int bsize;
+ int cpu;
+
+ /* keep it in its own cache line */
+ buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer)
+ return NULL;
+
+ buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ buffer->flags = flags;
+
+ /* need at least two pages */
+ if (buffer->pages == 1)
+ buffer->pages++;
+
+ buffer->cpumask = cpu_possible_map;
+ buffer->cpus = nr_cpu_ids;
+
+ bsize = sizeof(void *) * nr_cpu_ids;
+ buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
+ GFP_KERNEL);
+ if (!buffer->buffers)
+ goto fail_free_buffer;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ buffer->buffers[cpu] =
+ rb_allocate_cpu_buffer(buffer, cpu);
+ if (!buffer->buffers[cpu])
+ goto fail_free_buffers;
+ }
+
+ mutex_init(&buffer->mutex);
+
+ return buffer;
+
+ fail_free_buffers:
+ for_each_buffer_cpu(buffer, cpu) {
+ if (buffer->buffers[cpu])
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+ }
+ kfree(buffer->buffers);
+
+ fail_free_buffer:
+ kfree(buffer);
+ return NULL;
+}
+
+/**
+ * ring_buffer_free - free a ring buffer.
+ * @buffer: the buffer to free.
+ */
+void
+ring_buffer_free(struct ring_buffer *buffer)
+{
+ int cpu;
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_free_cpu_buffer(buffer->buffers[cpu]);
+
+ kfree(buffer);
+}
+
+static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
+
+static void
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
+{
+ struct buffer_page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(&cpu_buffer->pages));
+ p = cpu_buffer->pages.next;
+ page = list_entry(p, struct buffer_page, list);
+ list_del_init(&page->list);
+ __free_page(&page->page);
+ }
+ BUG_ON(list_empty(&cpu_buffer->pages));
+
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+
+}
+
+static void
+rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct buffer_page *page;
+ struct list_head *p;
+ unsigned i;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ for (i = 0; i < nr_pages; i++) {
+ BUG_ON(list_empty(pages));
+ p = pages->next;
+ page = list_entry(p, struct buffer_page, list);
+ list_del_init(&page->list);
+ list_add_tail(&page->list, &cpu_buffer->pages);
+ }
+ rb_reset_cpu(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_resize - resize the ring buffer
+ * @buffer: the buffer to resize.
+ * @size: the new size.
+ *
+ * The tracer is responsible for making sure that the buffer is
+ * not being used while changing the size.
+ * Note: We may be able to change the above requirement by using
+ * RCU synchronizations.
+ *
+ * Minimum size is 2 * BUF_PAGE_SIZE.
+ *
+ * Returns -1 on failure.
+ */
+int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned nr_pages, rm_pages, new_pages;
+ struct buffer_page *page, *tmp;
+ unsigned long buffer_size;
+ unsigned long addr;
+ LIST_HEAD(pages);
+ int i, cpu;
+
+ size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ size *= BUF_PAGE_SIZE;
+ buffer_size = buffer->pages * BUF_PAGE_SIZE;
+
+ /* we need a minimum of two pages */
+ if (size < BUF_PAGE_SIZE * 2)
+ size = BUF_PAGE_SIZE * 2;
+
+ if (size == buffer_size)
+ return size;
+
+ mutex_lock(&buffer->mutex);
+
+ nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+
+ if (size < buffer_size) {
+
+ /* easy case, just free pages */
+ BUG_ON(nr_pages >= buffer->pages);
+
+ rm_pages = buffer->pages - nr_pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_remove_pages(cpu_buffer, rm_pages);
+ }
+ goto out;
+ }
+
+ /*
+ * This is a bit more difficult. We only want to add pages
+ * when we can allocate enough for all CPUs. We do this
+ * by allocating all the pages and storing them on a local
+ * link list. If we succeed in our allocation, then we
+ * add these pages to the cpu_buffers. Otherwise we just free
+ * them all and return -ENOMEM;
+ */
+ BUG_ON(nr_pages <= buffer->pages);
+ new_pages = nr_pages - buffer->pages;
+
+ for_each_buffer_cpu(buffer, cpu) {
+ for (i = 0; i < new_pages; i++) {
+ addr = __get_free_page(GFP_KERNEL);
+ if (!addr)
+ goto free_pages;
+ page = (struct buffer_page *)virt_to_page(addr);
+ list_add(&page->list, &pages);
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ rb_insert_pages(cpu_buffer, &pages, new_pages);
+ }
+
+ BUG_ON(!list_empty(&pages));
+
+ out:
+ buffer->pages = nr_pages;
+ mutex_unlock(&buffer->mutex);
+
+ return size;
+
+ free_pages:
+ list_for_each_entry_safe(page, tmp, &pages, list) {
+ list_del_init(&page->list);
+ __free_page(&page->page);
+ }
+ return -ENOMEM;
+}
+
+static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return cpu_buffer->head_page == cpu_buffer->tail_page &&
+ cpu_buffer->head == cpu_buffer->tail;
+}
+
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+ return event->type == RINGBUF_TYPE_PADDING;
+}
+
+static inline void *rb_page_index(struct buffer_page *page, unsigned index)
+{
+ void *addr = page_address(&page->page);
+
+ return addr + index;
+}
+
+static inline struct ring_buffer_event *
+rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ return rb_page_index(cpu_buffer->head_page,
+ cpu_buffer->head);
+}
+
+static inline struct ring_buffer_event *
+rb_iter_head_event(struct ring_buffer_iter *iter)
+{
+ return rb_page_index(iter->head_page,
+ iter->head);
+}
+
+/*
+ * When the tail hits the head and the buffer is in overwrite mode,
+ * the head jumps to the next page and all content on the previous
+ * page is discarded. But before doing so, we update the overrun
+ * variable of the buffer.
+ */
+static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned long head;
+
+ for (head = 0; head < rb_head_size(cpu_buffer);
+ head += rb_event_length(event)) {
+
+ event = rb_page_index(cpu_buffer->head_page, head);
+ BUG_ON(rb_null_event(event));
+ /* Only count data entries */
+ if (event->type != RINGBUF_TYPE_DATA)
+ continue;
+ cpu_buffer->overrun++;
+ cpu_buffer->entries--;
+ }
+}
+
+static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
+ struct buffer_page **page)
+{
+ struct list_head *p = (*page)->list.next;
+
+ if (p == &cpu_buffer->pages)
+ p = p->next;
+
+ *page = list_entry(p, struct buffer_page, list);
+}
+
+static inline void
+rb_add_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
+{
+ cpu_buffer->tail_page->time_stamp = *ts;
+ cpu_buffer->write_stamp = *ts;
+}
+
+static void rb_reset_read_page(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->read_stamp = cpu_buffer->head_page->time_stamp;
+ cpu_buffer->head = 0;
+}
+
+static void
+rb_reset_iter_read_page(struct ring_buffer_iter *iter)
+{
+ iter->read_stamp = iter->head_page->time_stamp;
+ iter->head = 0;
+}
+
+/**
+ * ring_buffer_update_event - update event type and data
+ * @event: the even to update
+ * @type: the type of event
+ * @length: the size of the event field in the ring buffer
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static inline void
+rb_update_event(struct ring_buffer_event *event,
+ unsigned type, unsigned length)
+{
+ event->type = type;
+
+ switch (type) {
+
+ case RINGBUF_TYPE_PADDING:
+ break;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ event->len =
+ (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ event->len =
+ (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+
+ case RINGBUF_TYPE_DATA:
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA) {
+ event->len = 0;
+ event->array[0] = length;
+ } else
+ event->len =
+ (length + (RB_ALIGNMENT-1))
+ >> RB_ALIGNMENT_SHIFT;
+ break;
+ default:
+ BUG();
+ }
+}
+
+static inline unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
+
+ /* zero length can cause confusions */
+ if (!length)
+ length = 1;
+
+ if (length > RB_MAX_SMALL_DATA)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ALIGNMENT);
+
+ return length;
+}
+
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length, u64 *ts)
+{
+ struct buffer_page *head_page, *tail_page;
+ unsigned long tail;
+ struct ring_buffer *buffer = cpu_buffer->buffer;
+ struct ring_buffer_event *event;
+
+ tail_page = cpu_buffer->tail_page;
+ head_page = cpu_buffer->head_page;
+ tail = cpu_buffer->tail;
+
+ if (tail + length > BUF_PAGE_SIZE) {
+ struct buffer_page *next_page = tail_page;
+
+ rb_inc_page(cpu_buffer, &next_page);
+
+ if (next_page == head_page) {
+ if (!(buffer->flags & RB_FL_OVERWRITE))
+ return NULL;
+
+ /* count overflows */
+ rb_update_overflow(cpu_buffer);
+
+ rb_inc_page(cpu_buffer, &head_page);
+ cpu_buffer->head_page = head_page;
+ rb_reset_read_page(cpu_buffer);
+ }
+
+ if (tail != BUF_PAGE_SIZE) {
+ event = rb_page_index(tail_page, tail);
+ /* page padding */
+ event->type = RINGBUF_TYPE_PADDING;
+ }
+
+ tail_page->size = tail;
+ tail_page = next_page;
+ tail_page->size = 0;
+ tail = 0;
+ cpu_buffer->tail_page = tail_page;
+ cpu_buffer->tail = tail;
+ rb_add_stamp(cpu_buffer, ts);
+ }
+
+ BUG_ON(tail + length > BUF_PAGE_SIZE);
+
+ event = rb_page_index(tail_page, tail);
+ rb_update_event(event, type, length);
+
+ return event;
+}
+
+static int
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ u64 *ts, u64 *delta)
+{
+ struct ring_buffer_event *event;
+ static int once;
+
+ if (unlikely(*delta > (1ULL << 59) && !once++)) {
+ printk(KERN_WARNING "Delta way too big! %llu"
+ " ts=%llu write stamp = %llu\n",
+ *delta, *ts, cpu_buffer->write_stamp);
+ WARN_ON(1);
+ }
+
+ /*
+ * The delta is too big, we to add a
+ * new timestamp.
+ */
+ event = __rb_reserve_next(cpu_buffer,
+ RINGBUF_TYPE_TIME_EXTEND,
+ RB_LEN_TIME_EXTEND,
+ ts);
+ if (!event)
+ return -1;
+
+ /* check to see if we went to the next page */
+ if (cpu_buffer->tail) {
+ /* Still on same page, update timestamp */
+ event->time_delta = *delta & TS_MASK;
+ event->array[0] = *delta >> TS_SHIFT;
+ /* commit the time event */
+ cpu_buffer->tail +=
+ rb_event_length(event);
+ cpu_buffer->write_stamp = *ts;
+ *delta = 0;
+ }
+
+ return 0;
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned type, unsigned long length)
+{
+ struct ring_buffer_event *event;
+ u64 ts, delta;
+
+ ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+
+ if (cpu_buffer->tail) {
+ delta = ts - cpu_buffer->write_stamp;
+
+ if (test_time_stamp(delta)) {
+ int ret;
+
+ ret = rb_add_time_stamp(cpu_buffer, &ts, &delta);
+ if (ret < 0)
+ return NULL;
+ }
+ } else {
+ rb_add_stamp(cpu_buffer, &ts);
+ delta = 0;
+ }
+
+ event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+ if (!event)
+ return NULL;
+
+ /* If the reserve went to the next page, our delta is zero */
+ if (!cpu_buffer->tail)
+ delta = 0;
+
+ event->time_delta = delta;
+
+ return event;
+}
+
+/**
+ * ring_buffer_lock_reserve - reserve a part of the buffer
+ * @buffer: the ring buffer to reserve from
+ * @length: the length of the data to reserve (excluding event header)
+ * @flags: a pointer to save the interrupt flags
+ *
+ * Returns a reseverd event on the ring buffer to copy directly to.
+ * The user of this interface will need to get the body to write into
+ * and can use the ring_buffer_event_data() interface.
+ *
+ * The length is the length of the data needed, not the event length
+ * which also includes the event header.
+ *
+ * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
+ * If NULL is returned, then nothing has been allocated or locked.
+ */
+struct ring_buffer_event *
+ring_buffer_lock_reserve(struct ring_buffer *buffer,
+ unsigned long length,
+ unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return NULL;
+
+ raw_local_irq_save(*flags);
+ cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ goto out_irq;
+
+ cpu_buffer = buffer->buffers[cpu];
+ spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto no_record;
+
+ length = rb_calculate_event_length(length);
+ if (length > BUF_PAGE_SIZE)
+ return NULL;
+
+ event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+ if (!event)
+ goto no_record;
+
+ return event;
+
+ no_record:
+ spin_unlock(&cpu_buffer->lock);
+ out_irq:
+ local_irq_restore(*flags);
+ return NULL;
+}
+
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ cpu_buffer->tail += rb_event_length(event);
+ cpu_buffer->tail_page->size = cpu_buffer->tail;
+ cpu_buffer->write_stamp += event->time_delta;
+ cpu_buffer->entries++;
+}
+
+/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ * @flags: the interrupt flags received from ring_buffer_lock_reserve.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event,
+ unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ assert_spin_locked(&cpu_buffer->lock);
+
+ rb_commit(cpu_buffer, event);
+
+ spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+
+ return 0;
+}
+
+/**
+ * ring_buffer_write - write data to the buffer without reserving
+ * @buffer: The ring buffer to write to.
+ * @length: The length of the data being written (excluding the event header)
+ * @data: The data to write to the buffer.
+ *
+ * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
+ * one function. If you already have the data to write to the buffer, it
+ * may be easier to simply call this function.
+ *
+ * Note, like ring_buffer_lock_reserve, the length is the length of the data
+ * and not the length of the event which would hold the header.
+ */
+int ring_buffer_write(struct ring_buffer *buffer,
+ unsigned long length,
+ void *data)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned long event_length, flags;
+ void *body;
+ int ret = -EBUSY;
+ int cpu;
+
+ if (atomic_read(&buffer->record_disabled))
+ return -EBUSY;
+
+ local_irq_save(flags);
+ cpu = raw_smp_processor_id();
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ goto out_irq;
+
+ cpu_buffer = buffer->buffers[cpu];
+ spin_lock(&cpu_buffer->lock);
+
+ if (atomic_read(&cpu_buffer->record_disabled))
+ goto out;
+
+ event_length = rb_calculate_event_length(length);
+ event = rb_reserve_next_event(cpu_buffer,
+ RINGBUF_TYPE_DATA, event_length);
+ if (!event)
+ goto out;
+
+ body = rb_event_data(event);
+
+ memcpy(body, data, length);
+
+ rb_commit(cpu_buffer, event);
+
+ ret = 0;
+ out:
+ spin_unlock(&cpu_buffer->lock);
+ out_irq:
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+/**
+ * ring_buffer_lock - lock the ring buffer
+ * @buffer: The ring buffer to lock
+ * @flags: The place to store the interrupt flags
+ *
+ * This locks all the per CPU buffers.
+ *
+ * Must be unlocked by ring_buffer_unlock.
+ */
+void ring_buffer_lock(struct ring_buffer *buffer, unsigned long *flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ local_irq_save(*flags);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ spin_lock(&cpu_buffer->lock);
+ }
+}
+
+/**
+ * ring_buffer_unlock - unlock a locked buffer
+ * @buffer: The locked buffer to unlock
+ * @flags: The interrupt flags received by ring_buffer_lock
+ */
+void ring_buffer_unlock(struct ring_buffer *buffer, unsigned long flags)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ for (cpu = buffer->cpus - 1; cpu >= 0; cpu--) {
+ if (!cpu_isset(cpu, buffer->cpumask))
+ continue;
+ cpu_buffer = buffer->buffers[cpu];
+ spin_unlock(&cpu_buffer->lock);
+ }
+
+ local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_record_disable - stop all writes into the buffer
+ * @buffer: The ring buffer to stop writes to.
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable(struct ring_buffer *buffer)
+{
+ atomic_inc(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable(struct ring_buffer *buffer)
+{
+ atomic_dec(&buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
+ * @buffer: The ring buffer to stop writes to.
+ * @cpu: The CPU buffer to stop
+ *
+ * This prevents all writes to the buffer. Any attempt to write
+ * to the buffer after this will fail and return NULL.
+ *
+ * The caller should call synchronize_sched() after this.
+ */
+void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_inc(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_record_enable_cpu - enable writes to the buffer
+ * @buffer: The ring buffer to enable writes
+ * @cpu: The CPU to enable.
+ *
+ * Note, multiple disables will need the same number of enables
+ * to truely enable the writing (much like preempt_disable).
+ */
+void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ cpu_buffer = buffer->buffers[cpu];
+ atomic_dec(&cpu_buffer->record_disabled);
+}
+
+/**
+ * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the entries from.
+ */
+unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 0;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return cpu_buffer->overrun;
+}
+
+/**
+ * ring_buffer_entries - get the number of entries in a buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of entries in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_entries(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long entries = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ entries += cpu_buffer->entries;
+ }
+
+ return entries;
+}
+
+/**
+ * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * @buffer: The ring buffer
+ *
+ * Returns the total number of overruns in the ring buffer
+ * (all CPU entries)
+ */
+unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ unsigned long overruns = 0;
+ int cpu;
+
+ /* if you care about this being correct, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ overruns += cpu_buffer->overrun;
+ }
+
+ return overruns;
+}
+
+/**
+ * ring_buffer_iter_reset - reset an iterator
+ * @iter: The iterator to reset
+ *
+ * Resets the iterator, so that it will start from the beginning
+ * again.
+ */
+void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ iter->head_page = cpu_buffer->head_page;
+ iter->head = cpu_buffer->head;
+ rb_reset_iter_read_page(iter);
+}
+
+/**
+ * ring_buffer_iter_empty - check if an iterator has no more to read
+ * @iter: The iterator to check
+ */
+int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ cpu_buffer = iter->cpu_buffer;
+
+ return iter->head_page == cpu_buffer->tail_page &&
+ iter->head == cpu_buffer->tail;
+}
+
+static void
+rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ return;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->read_stamp += delta;
+ return;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RINGBUF_TYPE_DATA:
+ cpu_buffer->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void
+rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ return;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ iter->read_stamp += delta;
+ return;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ return;
+
+ case RINGBUF_TYPE_DATA:
+ iter->read_stamp += event->time_delta;
+ return;
+
+ default:
+ BUG();
+ }
+ return;
+}
+
+static void rb_advance_head(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (cpu_buffer->head >= cpu_buffer->head_page->size) {
+ BUG_ON(cpu_buffer->head_page == cpu_buffer->tail_page);
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ return;
+ }
+
+ event = rb_head_event(cpu_buffer);
+
+ if (event->type == RINGBUF_TYPE_DATA)
+ cpu_buffer->entries--;
+
+ length = rb_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((cpu_buffer->head_page == cpu_buffer->tail_page) &&
+ (cpu_buffer->head + length > cpu_buffer->tail));
+
+ rb_update_read_stamp(cpu_buffer, event);
+
+ cpu_buffer->head += length;
+
+ /* check for end of page */
+ if ((cpu_buffer->head >= cpu_buffer->head_page->size) &&
+ (cpu_buffer->head_page != cpu_buffer->tail_page))
+ rb_advance_head(cpu_buffer);
+}
+
+static void rb_advance_iter(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+ unsigned length;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ /*
+ * Check if we are at the end of the buffer.
+ */
+ if (iter->head >= iter->head_page->size) {
+ BUG_ON(iter->head_page == cpu_buffer->tail_page);
+ rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ return;
+ }
+
+ event = rb_iter_head_event(iter);
+
+ length = rb_event_length(event);
+
+ /*
+ * This should not be called to advance the header if we are
+ * at the tail of the buffer.
+ */
+ BUG_ON((iter->head_page == cpu_buffer->tail_page) &&
+ (iter->head + length > cpu_buffer->tail));
+
+ rb_update_iter_read_stamp(iter, event);
+
+ iter->head += length;
+
+ /* check for end of page padding */
+ if ((iter->head >= iter->head_page->size) &&
+ (iter->head_page != cpu_buffer->tail_page))
+ rb_advance_iter(iter);
+}
+
+/**
+ * ring_buffer_peek - peek at the next event to be read
+ * @buffer: The ring buffer to read
+ * @cpu: The cpu to peak at
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not consume the data.
+ */
+struct ring_buffer_event *
+ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ again:
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = rb_head_event(cpu_buffer);
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
+ rb_reset_read_page(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* Internal data, OK to advance */
+ rb_advance_head(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_head(cpu_buffer);
+ goto again;
+
+ case RINGBUF_TYPE_DATA:
+ if (ts) {
+ *ts = cpu_buffer->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_iter_peek - peek at the next event to be read
+ * @iter: The ring buffer iterator
+ * @ts: The timestamp counter of this event.
+ *
+ * This will return the event that will be read next, but does
+ * not increment the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer *buffer;
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (ring_buffer_iter_empty(iter))
+ return NULL;
+
+ cpu_buffer = iter->cpu_buffer;
+ buffer = cpu_buffer->buffer;
+
+ again:
+ if (rb_per_cpu_empty(cpu_buffer))
+ return NULL;
+
+ event = rb_iter_head_event(iter);
+
+ switch (event->type) {
+ case RINGBUF_TYPE_PADDING:
+ rb_inc_page(cpu_buffer, &iter->head_page);
+ rb_reset_iter_read_page(iter);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_EXTEND:
+ /* Internal data, OK to advance */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_TIME_STAMP:
+ /* FIXME: not implemented */
+ rb_advance_iter(iter);
+ goto again;
+
+ case RINGBUF_TYPE_DATA:
+ if (ts) {
+ *ts = iter->read_stamp + event->time_delta;
+ ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+ }
+ return event;
+
+ default:
+ BUG();
+ }
+
+ return NULL;
+}
+
+/**
+ * ring_buffer_consume - return an event and consume it
+ * @buffer: The ring buffer to get the next event from
+ *
+ * Returns the next event in the ring buffer, and that event is consumed.
+ * Meaning, that sequential reads will keep returning a different event,
+ * and eventually empty the ring buffer if the producer is slower.
+ */
+struct ring_buffer_event *
+ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_event *event;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ event = ring_buffer_peek(buffer, cpu, ts);
+ if (!event)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+ rb_advance_head(cpu_buffer);
+
+ return event;
+}
+
+/**
+ * ring_buffer_read_start - start a non consuming read of the buffer
+ * @buffer: The ring buffer to read from
+ * @cpu: The cpu buffer to iterate over
+ *
+ * This starts up an iteration through the buffer. It also disables
+ * the recording to the buffer until the reading is finished.
+ * This prevents the reading from being corrupted. This is not
+ * a consuming read, so a producer is not expected.
+ *
+ * Must be paired with ring_buffer_finish.
+ */
+struct ring_buffer_iter *
+ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct ring_buffer_iter *iter;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return NULL;
+
+ iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ iter->cpu_buffer = cpu_buffer;
+
+ atomic_inc(&cpu_buffer->record_disabled);
+ synchronize_sched();
+
+ spin_lock(&cpu_buffer->lock);
+ iter->head = cpu_buffer->head;
+ iter->head_page = cpu_buffer->head_page;
+ rb_reset_iter_read_page(iter);
+ spin_unlock(&cpu_buffer->lock);
+
+ return iter;
+}
+
+/**
+ * ring_buffer_finish - finish reading the iterator of the buffer
+ * @iter: The iterator retrieved by ring_buffer_start
+ *
+ * This re-enables the recording to the buffer, and frees the
+ * iterator.
+ */
+void
+ring_buffer_read_finish(struct ring_buffer_iter *iter)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+
+ atomic_dec(&cpu_buffer->record_disabled);
+ kfree(iter);
+}
+
+/**
+ * ring_buffer_read - read the next item in the ring buffer by the iterator
+ * @iter: The ring buffer iterator
+ * @ts: The time stamp of the event read.
+ *
+ * This reads the next event in the ring buffer and increments the iterator.
+ */
+struct ring_buffer_event *
+ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
+{
+ struct ring_buffer_event *event;
+
+ event = ring_buffer_iter_peek(iter, ts);
+ if (!event)
+ return NULL;
+
+ rb_advance_iter(iter);
+
+ return event;
+}
+
+/**
+ * ring_buffer_size - return the size of the ring buffer (in bytes)
+ * @buffer: The ring buffer.
+ */
+unsigned long ring_buffer_size(struct ring_buffer *buffer)
+{
+ return BUF_PAGE_SIZE * buffer->pages;
+}
+
+static void
+rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+ cpu_buffer->tail_page
+ = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
+
+ cpu_buffer->head = cpu_buffer->tail = 0;
+ cpu_buffer->overrun = 0;
+ cpu_buffer->entries = 0;
+}
+
+/**
+ * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
+ * @buffer: The ring buffer to reset a per cpu buffer of
+ * @cpu: The CPU buffer to be reset
+ */
+void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ unsigned long flags;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return;
+
+ raw_local_irq_save(flags);
+ spin_lock(&cpu_buffer->lock);
+
+ rb_reset_cpu(cpu_buffer);
+
+ spin_unlock(&cpu_buffer->lock);
+ raw_local_irq_restore(flags);
+}
+
+/**
+ * ring_buffer_reset - reset a ring buffer
+ * @buffer: The ring buffer to reset all cpu buffers
+ */
+void ring_buffer_reset(struct ring_buffer *buffer)
+{
+ unsigned long flags;
+ int cpu;
+
+ ring_buffer_lock(buffer, &flags);
+
+ for_each_buffer_cpu(buffer, cpu)
+ rb_reset_cpu(buffer->buffers[cpu]);
+
+ ring_buffer_unlock(buffer, flags);
+}
+
+/**
+ * rind_buffer_empty - is the ring buffer empty?
+ * @buffer: The ring buffer to test
+ */
+int ring_buffer_empty(struct ring_buffer *buffer)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu;
+
+ /* yes this is racy, but if you don't like the race, lock the buffer */
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+ if (!rb_per_cpu_empty(cpu_buffer))
+ return 0;
+ }
+ return 1;
+}
+
+/**
+ * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
+ * @buffer: The ring buffer
+ * @cpu: The CPU buffer to test
+ */
+int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+
+ if (!cpu_isset(cpu, buffer->cpumask))
+ return 1;
+
+ cpu_buffer = buffer->buffers[cpu];
+ return rb_per_cpu_empty(cpu_buffer);
+}
+
+/**
+ * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
+ * @buffer_a: One buffer to swap with
+ * @buffer_b: The other buffer to swap with
+ *
+ * This function is useful for tracers that want to take a "snapshot"
+ * of a CPU buffer and has another back up buffer lying around.
+ * it is expected that the tracer handles the cpu buffer not being
+ * used at the moment.
+ */
+int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
+ struct ring_buffer *buffer_b, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer_a;
+ struct ring_buffer_per_cpu *cpu_buffer_b;
+
+ if (!cpu_isset(cpu, buffer_a->cpumask) ||
+ !cpu_isset(cpu, buffer_b->cpumask))
+ return -EINVAL;
+
+ /* At least make sure the two buffers are somewhat the same */
+ if (buffer_a->size != buffer_b->size ||
+ buffer_a->pages != buffer_b->pages)
+ return -EINVAL;
+
+ cpu_buffer_a = buffer_a->buffers[cpu];
+ cpu_buffer_b = buffer_b->buffers[cpu];
+
+ /*
+ * We can't do a synchronize_sched here because this
+ * function can be called in atomic context.
+ * Normally this will be called from the same CPU as cpu.
+ * If not it's up to the caller to protect this.
+ */
+ atomic_inc(&cpu_buffer_a->record_disabled);
+ atomic_inc(&cpu_buffer_b->record_disabled);
+
+ buffer_a->buffers[cpu] = cpu_buffer_b;
+ buffer_b->buffers[cpu] = cpu_buffer_a;
+
+ cpu_buffer_b->buffer = buffer_a;
+ cpu_buffer_a->buffer = buffer_b;
+
+ atomic_dec(&cpu_buffer_a->record_disabled);
+ atomic_dec(&cpu_buffer_b->record_disabled);
+
+ return 0;
+}
+
Index: linux-trace.git/kernel/trace/Kconfig
===================================================================
--- linux-trace.git.orig/kernel/trace/Kconfig 2008-09-27 01:58:49.000000000 -0400
+++ linux-trace.git/kernel/trace/Kconfig 2008-09-27 01:59:06.000000000 -0400
@@ -10,10 +10,14 @@ config HAVE_DYNAMIC_FTRACE
config TRACER_MAX_TRACE
bool

+config RING_BUFFER
+ bool
+
config TRACING
bool
select DEBUG_FS
select STACKTRACE
+ select RING_BUFFER

config FTRACE
bool "Kernel Function Tracer"
Index: linux-trace.git/kernel/trace/Makefile
===================================================================
--- linux-trace.git.orig/kernel/trace/Makefile 2008-09-27 01:58:49.000000000 -0400
+++ linux-trace.git/kernel/trace/Makefile 2008-09-27 01:59:06.000000000 -0400
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
endif

obj-$(CONFIG_FTRACE) += libftrace.o
+obj-$(CONFIG_RING_BUFFER) += ring_buffer.o

obj-$(CONFIG_TRACING) += trace.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
Steven Rostedt
2008-09-29 16:11:53 UTC
Permalink
Ingo,

I will add this patch to my linux-tip and then I will start porting ftrace
over to it in an incremental fashion.

-- Steve
Mathieu Desnoyers
2008-09-29 23:35:40 UTC
Permalink
* Steven Rostedt (***@goodmis.org) wrote:
[...]
Post by Steven Rostedt
+/*
+ * This hack stolen from mm/slob.c.
+ * We can store per page timing information in the page frame of the page.
+ * Thanks to Peter Zijlstra for suggesting this idea.
+ */
+struct buffer_page {
+ union {
+ struct {
+ unsigned long flags; /* mandatory */
+ atomic_t _count; /* mandatory */
+ u64 time_stamp; /* page time stamp */
+ unsigned size; /* size of page data */
+ struct list_head list; /* list of free pages */
+ };
+ struct page page;
+ };
+};
+
Hi Steven,

You should have a look at mm/slob.c free_slob_page(). I think your page
free will generate a "bad_page" call due to mapping != NULL and mapcount
!= 0. I just ran into this in my own code. :)

Regards,

Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-09-30 00:01:11 UTC
Permalink
Post by Mathieu Desnoyers
You should have a look at mm/slob.c free_slob_page(). I think your page
free will generate a "bad_page" call due to mapping != NULL and mapcount
!= 0. I just ran into this in my own code. :)
Hi Mathieu!

Thanks! I must have been lucky some how not to trigger this :-/

I'll add an update patch for this.

-- Steve
Mathieu Desnoyers
2008-09-30 00:03:07 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
You should have a look at mm/slob.c free_slob_page(). I think your page
free will generate a "bad_page" call due to mapping != NULL and mapcount
!= 0. I just ran into this in my own code. :)
Hi Mathieu!
Thanks! I must have been lucky some how not to trigger this :-/
My guess is that you never free your buffers in your test cases. I don't
know if it was expected; probably not if your code is built into the
kernel.

Mathieu
Post by Steven Rostedt
I'll add an update patch for this.
-- Steve
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-09-30 00:12:02 UTC
Permalink
Post by Mathieu Desnoyers
Post by Steven Rostedt
Thanks! I must have been lucky some how not to trigger this :-/
My guess is that you never free your buffers in your test cases. I don't
know if it was expected; probably not if your code is built into the
kernel.
Actually my resize does free the buffers and I did test this. I probably
never ran the trace when testing the freeing which means those pointers
could have luckily not have been changed.

-- Steve
Mathieu Desnoyers
2008-09-30 03:46:03 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
Post by Steven Rostedt
Thanks! I must have been lucky some how not to trigger this :-/
My guess is that you never free your buffers in your test cases. I don't
know if it was expected; probably not if your code is built into the
kernel.
Actually my resize does free the buffers and I did test this. I probably
never ran the trace when testing the freeing which means those pointers
could have luckily not have been changed.
-- Steve
I also got some corruption of the offset field in the struct page I use.
I think it might be related to the fact that I don't set the PG_private
bit (slob does set it when the page is in its free pages list). However,
given I'd like to pass the buffer pages to disk I/O and for network
socket and still keep the ability to re-use it when the I/O has been
performed, I wonder where I should put my

struct list_head list; /* linked list of buf pages */
size_t offset; /* page offset in the buffer */

fields ? Any ideas ?

They are currently in :

struct buf_page {
union {
struct {
unsigned long flags; /* mandatory */
atomic_t _count; /* mandatory */
union { /* mandatory */
atomic_t _mapcount;
struct {
u16 inuse;
u16 objects;
};
};
struct list_head list; /* linked list of buf pages */
size_t offset; /* page offset in the buffer */
};
struct page page;
};
};

Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-09-30 04:00:11 UTC
Permalink
Post by Mathieu Desnoyers
I also got some corruption of the offset field in the struct page I use.
I think it might be related to the fact that I don't set the PG_private
bit (slob does set it when the page is in its free pages list). However,
given I'd like to pass the buffer pages to disk I/O and for network
Ah, I believe the disk IO uses the page frame. That might be a bit more
difficult to pass the data to disk and still keep information on the
page frame.

-- Steve
Jonathan Corbet
2008-09-30 15:20:01 UTC
Permalink
On Tue, 30 Sep 2008 00:00:11 -0400 (EDT)
Post by Steven Rostedt
Ah, I believe the disk IO uses the page frame. That might be a bit more
difficult to pass the data to disk and still keep information on the
page frame.
Perhaps I'm speaking out of turn, but I have to wonder: am I the only one
who gets uncomfortable looking at these hacks to overload struct page? It
seems fragile as all hell; woe to he who tries to make a change to struct
page someday and has to track all of this stuff down.

Are the savings gained by using struct page this way really worth the
added complexity?

jon
Peter Zijlstra
2008-09-30 15:54:32 UTC
Permalink
Post by Jonathan Corbet
On Tue, 30 Sep 2008 00:00:11 -0400 (EDT)
Post by Steven Rostedt
Ah, I believe the disk IO uses the page frame. That might be a bit more
difficult to pass the data to disk and still keep information on the
page frame.
Perhaps I'm speaking out of turn, but I have to wonder: am I the only one
who gets uncomfortable looking at these hacks to overload struct page? It
seems fragile as all hell; woe to he who tries to make a change to struct
page someday and has to track all of this stuff down.
Are the savings gained by using struct page this way really worth the
added complexity?
Its not that complex IMHO, the thing that is ugly are those struct page
overloads, what we could do is try and sanitize the regular struct page
and pull all these things in.

Because the only reason people are doing these overloads is because
struct page in mm_types.h is becomming an unreadable mess.

Trouble is, looking at it I see no easy way out,
Linus Torvalds
2008-09-30 16:38:18 UTC
Permalink
Post by Peter Zijlstra
Its not that complex IMHO, the thing that is ugly are those struct page
overloads, what we could do is try and sanitize the regular struct page
and pull all these things in.
That's not the scary part. The scary part is that somebody may well want
to access the trace buffer pages in complex ways.

If you mmap them, for example, you can use VM_PFNMAP to make sure that
nobody should ever look at the "struct page", but if you want to do things
like direct-to-disk IO on the trace pages (either with splice() or with
some kind of in-kernel IO logic), then you're officially screwed.
Post by Peter Zijlstra
Because the only reason people are doing these overloads is because
struct page in mm_types.h is becomming an unreadable mess.
The "unreadable mess" has exactly the same issues, though: people need to
realize that when you overload fields in the page structure, you can then
NEVER EVER use those pages for any other thing.

For the internal VM code, that's ok. The VM knows that a page is either an
anonymous page or a file mapping etc, and the overloading wrt mm_types.h
is explicit. The same goes for SL*B, although it does the overloading
differently.

Trace buffers are different, though. Do people realize that doing the
overloading means that you never EVER can use those buffers for anything
else? Do people realize that it means that splice() and friends are out of
the question?
Post by Peter Zijlstra
Trouble is, looking at it I see no easy way out,
Quite frankly, we could just put it at the head of the page itself. Having
a "whole page" for the trace data is not possible anyway, since the trace
header itself will always eat 8 bytes.

And I do think it would potentially be a better model. Or at least safer.

Linus
Steven Rostedt
2008-09-30 16:48:18 UTC
Permalink
Post by Linus Torvalds
Trace buffers are different, though. Do people realize that doing the
overloading means that you never EVER can use those buffers for anything
else? Do people realize that it means that splice() and friends are out of
the question?
Post by Peter Zijlstra
Trouble is, looking at it I see no easy way out,
Quite frankly, we could just put it at the head of the page itself. Having
a "whole page" for the trace data is not possible anyway, since the trace
header itself will always eat 8 bytes.
And I do think it would potentially be a better model. Or at least safer.
Actually, looking at the code, there is no reason I need to keep this in
the frame buffer itself. I've also encapsulated the accesses to the
incrementing of the pointers so it would be trivial to try other
approaches.

The problem we had with the big array struct is that we can want large
buffers and to do that with pointers means we would need to either come up
with a large allocator or use vmap.

But I just realized that I could also just make a link list of page
pointers and do the exact same thing without having to worry about page
frames. Again, the way I coded this up, it is quite trivial to replace
the handling of the pages with other schemes.

-- Steve
Linus Torvalds
2008-09-30 17:01:01 UTC
Permalink
Post by Steven Rostedt
But I just realized that I could also just make a link list of page
pointers and do the exact same thing without having to worry about page
frames. Again, the way I coded this up, it is quite trivial to replace
the handling of the pages with other schemes.
That might be the best option.

Yes, doing it in the 'struct page' itself is obviously going to save us
some memory over having specially allocated page headers, but it's not
like we'd expect to have _that_ many of these, and having a separate
structure is actually good in that it also would make it simpler/clearer
when/if you want to add larger pages (or other non-page allocations) into
the mix.

For example, if somebody really wants bigger areas, they can allocate them
with vmalloc and/or multi-page allocations, and then add them as easily to
the list of pages as if it was a normal page. Doing the same with playing
tricks on 'struct page' would be pretty damn painful.

Linus
Steven Rostedt
2008-10-01 15:14:54 UTC
Permalink
The current method of overlaying the page frame as the buffer page pointer
can be very dangerous and limits our ability to do other things with
a page from the buffer, like send it off to disk.

This patch allocates the buffer_page instead of overlaying the page's
page frame. The use of the buffer_page has hardly changed due to this.

Signed-off-by: Steven Rostedt <***@redhat.com>
---
kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
1 file changed, 32 insertions(+), 22 deletions(-)

Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-01 09:37:23.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-01 11:03:16.000000000 -0400
@@ -115,16 +115,10 @@ void *ring_buffer_event_data(struct ring
* Thanks to Peter Zijlstra for suggesting this idea.
*/
struct buffer_page {
- union {
- struct {
- unsigned long flags; /* mandatory */
- atomic_t _count; /* mandatory */
- u64 time_stamp; /* page time stamp */
- unsigned size; /* size of page data */
- struct list_head list; /* list of free pages */
- };
- struct page page;
- };
+ u64 time_stamp; /* page time stamp */
+ unsigned size; /* size of page data */
+ struct list_head list; /* list of free pages */
+ void *page; /* Actual data page */
};

/*
@@ -133,9 +127,9 @@ struct buffer_page {
*/
static inline void free_buffer_page(struct buffer_page *bpage)
{
- reset_page_mapcount(&bpage->page);
- bpage->page.mapping = NULL;
- __free_page(&bpage->page);
+ if (bpage->page)
+ __free_page(bpage->page);
+ kfree(bpage);
}

/*
@@ -237,11 +231,16 @@ static int rb_allocate_pages(struct ring
unsigned i;

for (i = 0; i < nr_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
+
addr = __get_free_page(GFP_KERNEL);
if (!addr)
goto free_pages;
- page = (struct buffer_page *)virt_to_page(addr);
- list_add(&page->list, &pages);
+ page->page = (void *)addr;
}

list_splice(&pages, head);
@@ -262,6 +261,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *page;
unsigned long addr;
int ret;

@@ -275,10 +275,17 @@ rb_allocate_cpu_buffer(struct ring_buffe
spin_lock_init(&cpu_buffer->lock);
INIT_LIST_HEAD(&cpu_buffer->pages);

+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto fail_free_buffer;
+
+ cpu_buffer->reader_page = page;
addr = __get_free_page(GFP_KERNEL);
if (!addr)
- goto fail_free_buffer;
- cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr);
+ goto fail_free_reader;
+ page->page = (void *)addr;
+
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
cpu_buffer->reader_page->size = 0;

@@ -523,11 +530,16 @@ int ring_buffer_resize(struct ring_buffe

for_each_buffer_cpu(buffer, cpu) {
for (i = 0; i < new_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page),
+ cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
addr = __get_free_page(GFP_KERNEL);
if (!addr)
goto free_pages;
- page = (struct buffer_page *)virt_to_page(addr);
- list_add(&page->list, &pages);
+ page->page = (void *)addr;
}
}

@@ -567,9 +579,7 @@ static inline int rb_null_event(struct r

static inline void *rb_page_index(struct buffer_page *page, unsigned index)
{
- void *addr = page_address(&page->page);
-
- return addr + index;
+ return page->page + index;
}

static inline struct ring_buffer_event *
Mathieu Desnoyers
2008-10-01 17:36:53 UTC
Permalink
Post by Steven Rostedt
The current method of overlaying the page frame as the buffer page pointer
can be very dangerous and limits our ability to do other things with
a page from the buffer, like send it off to disk.
This patch allocates the buffer_page instead of overlaying the page's
page frame. The use of the buffer_page has hardly changed due to this.
---
kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
1 file changed, 32 insertions(+), 22 deletions(-)
Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-01 09:37:23.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-01 11:03:16.000000000 -0400
@@ -115,16 +115,10 @@ void *ring_buffer_event_data(struct ring
* Thanks to Peter Zijlstra for suggesting this idea.
*/
struct buffer_page {
- union {
- struct {
- unsigned long flags; /* mandatory */
- atomic_t _count; /* mandatory */
- u64 time_stamp; /* page time stamp */
- unsigned size; /* size of page data */
- struct list_head list; /* list of free pages */
- };
- struct page page;
- };
+ u64 time_stamp; /* page time stamp */
+ unsigned size; /* size of page data */
+ struct list_head list; /* list of free pages */
+ void *page; /* Actual data page */
};
/*
@@ -133,9 +127,9 @@ struct buffer_page {
*/
static inline void free_buffer_page(struct buffer_page *bpage)
{
- reset_page_mapcount(&bpage->page);
- bpage->page.mapping = NULL;
- __free_page(&bpage->page);
+ if (bpage->page)
+ __free_page(bpage->page);
+ kfree(bpage);
}
/*
@@ -237,11 +231,16 @@ static int rb_allocate_pages(struct ring
unsigned i;
for (i = 0; i < nr_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
+
addr = __get_free_page(GFP_KERNEL);
if (!addr)
goto free_pages;
- page = (struct buffer_page *)virt_to_page(addr);
- list_add(&page->list, &pages);
+ page->page = (void *)addr;
}
list_splice(&pages, head);
@@ -262,6 +261,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *page;
unsigned long addr;
int ret;
@@ -275,10 +275,17 @@ rb_allocate_cpu_buffer(struct ring_buffe
spin_lock_init(&cpu_buffer->lock);
INIT_LIST_HEAD(&cpu_buffer->pages);
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
Hi Steven,

I understand that you want to allocate these struct buffer_page in
memory local to a given cpu node, which is great, but why do you feel
you need to align them on cache_line_size() ?

Hrm.. you put the timestamp in there, so I guess you're concerned about
having a writer on one CPU, a reader on another, and the fact that you
will have cache line bouncing because of that.

Note that if you put the timestamp and the unused bytes in a tiny header
at the beginning of the page, you

1 - make this information directly accessible for disk, network I/O
without any other abstraction layer.
2 - won't have to do such alignment on the struct buffer_page, because
it will only be read once it's been allocated.

My 2 cents ;)

Mathieu
Post by Steven Rostedt
+ if (!page)
+ goto fail_free_buffer;
+
+ cpu_buffer->reader_page = page;
addr = __get_free_page(GFP_KERNEL);
if (!addr)
- goto fail_free_buffer;
- cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr);
+ goto fail_free_reader;
+ page->page = (void *)addr;
+
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
cpu_buffer->reader_page->size = 0;
@@ -523,11 +530,16 @@ int ring_buffer_resize(struct ring_buffe
for_each_buffer_cpu(buffer, cpu) {
for (i = 0; i < new_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page),
+ cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
addr = __get_free_page(GFP_KERNEL);
if (!addr)
goto free_pages;
- page = (struct buffer_page *)virt_to_page(addr);
- list_add(&page->list, &pages);
+ page->page = (void *)addr;
}
}
@@ -567,9 +579,7 @@ static inline int rb_null_event(struct r
static inline void *rb_page_index(struct buffer_page *page, unsigned index)
{
- void *addr = page_address(&page->page);
-
- return addr + index;
+ return page->page + index;
}
static inline struct ring_buffer_event *
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-10-01 17:49:59 UTC
Permalink
Post by Mathieu Desnoyers
I understand that you want to allocate these struct buffer_page in
memory local to a given cpu node, which is great, but why do you feel
you need to align them on cache_line_size() ?
Hrm.. you put the timestamp in there, so I guess you're concerned about
having a writer on one CPU, a reader on another, and the fact that you
will have cache line bouncing because of that.
Note that if you put the timestamp and the unused bytes in a tiny header
at the beginning of the page, you
1 - make this information directly accessible for disk, network I/O
without any other abstraction layer.
2 - won't have to do such alignment on the struct buffer_page, because
it will only be read once it's been allocated.
That was the approach I actually started with. But someone (I think
Peter) asked me to remove it.

Who knows, perhaps I can put it back. It's not that hard to do. This is
why I used BUF_PAGE_SIZE to determine the size of the buffer page.
Right now it BUF_PAGE_SIZE == PAGE_SIZE, but if we do add a header than
it will be BUF_PAGE_SIZE == PAGE_SIZE - sizeof(header)

-- Steve
Mathieu Desnoyers
2008-10-01 18:21:19 UTC
Permalink
Post by Steven Rostedt
The current method of overlaying the page frame as the buffer page pointer
can be very dangerous and limits our ability to do other things with
a page from the buffer, like send it off to disk.
This patch allocates the buffer_page instead of overlaying the page's
page frame. The use of the buffer_page has hardly changed due to this.
---
kernel/trace/ring_buffer.c | 54 ++++++++++++++++++++++++++-------------------
1 file changed, 32 insertions(+), 22 deletions(-)
Index: linux-tip.git/kernel/trace/ring_buffer.c
===================================================================
--- linux-tip.git.orig/kernel/trace/ring_buffer.c 2008-10-01 09:37:23.000000000 -0400
+++ linux-tip.git/kernel/trace/ring_buffer.c 2008-10-01 11:03:16.000000000 -0400
@@ -115,16 +115,10 @@ void *ring_buffer_event_data(struct ring
* Thanks to Peter Zijlstra for suggesting this idea.
*/
struct buffer_page {
- union {
- struct {
- unsigned long flags; /* mandatory */
- atomic_t _count; /* mandatory */
- u64 time_stamp; /* page time stamp */
- unsigned size; /* size of page data */
- struct list_head list; /* list of free pages */
- };
- struct page page;
- };
+ u64 time_stamp; /* page time stamp */
+ unsigned size; /* size of page data */
+ struct list_head list; /* list of free pages */
+ void *page; /* Actual data page */
};
/*
@@ -133,9 +127,9 @@ struct buffer_page {
*/
static inline void free_buffer_page(struct buffer_page *bpage)
{
- reset_page_mapcount(&bpage->page);
- bpage->page.mapping = NULL;
- __free_page(&bpage->page);
+ if (bpage->page)
+ __free_page(bpage->page);
+ kfree(bpage);
}
/*
@@ -237,11 +231,16 @@ static int rb_allocate_pages(struct ring
unsigned i;
for (i = 0; i < nr_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
+
addr = __get_free_page(GFP_KERNEL);
You could probably use alloc_pages_node instead here...

Mathieu
Post by Steven Rostedt
if (!addr)
goto free_pages;
- page = (struct buffer_page *)virt_to_page(addr);
- list_add(&page->list, &pages);
+ page->page = (void *)addr;
}
list_splice(&pages, head);
@@ -262,6 +261,7 @@ static struct ring_buffer_per_cpu *
rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *page;
unsigned long addr;
int ret;
@@ -275,10 +275,17 @@ rb_allocate_cpu_buffer(struct ring_buffe
spin_lock_init(&cpu_buffer->lock);
INIT_LIST_HEAD(&cpu_buffer->pages);
+ page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto fail_free_buffer;
+
+ cpu_buffer->reader_page = page;
addr = __get_free_page(GFP_KERNEL);
if (!addr)
- goto fail_free_buffer;
- cpu_buffer->reader_page = (struct buffer_page *)virt_to_page(addr);
+ goto fail_free_reader;
+ page->page = (void *)addr;
+
INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
cpu_buffer->reader_page->size = 0;
@@ -523,11 +530,16 @@ int ring_buffer_resize(struct ring_buffe
for_each_buffer_cpu(buffer, cpu) {
for (i = 0; i < new_pages; i++) {
+ page = kzalloc_node(ALIGN(sizeof(*page),
+ cache_line_size()),
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (!page)
+ goto free_pages;
+ list_add(&page->list, &pages);
addr = __get_free_page(GFP_KERNEL);
if (!addr)
goto free_pages;
- page = (struct buffer_page *)virt_to_page(addr);
- list_add(&page->list, &pages);
+ page->page = (void *)addr;
}
}
@@ -567,9 +579,7 @@ static inline int rb_null_event(struct r
static inline void *rb_page_index(struct buffer_page *page, unsigned index)
{
- void *addr = page_address(&page->page);
-
- return addr + index;
+ return page->page + index;
}
static inline struct ring_buffer_event *
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Mathieu Desnoyers
2008-10-03 15:56:05 UTC
Permalink
---
include/asm-x86/topology.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
Index: linux-2.6-lttng/include/asm-x86/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/topology.h 2008-10-03 00:37:05.000000000 -0400
+++ linux-2.6-lttng/include/asm-x86/topology.h 2008-10-03 00:45:52.000000000 -0400
@@ -182,9 +182,9 @@ extern int __node_distance(int, int);
#else /* !CONFIG_NUMA */
-#define numa_node_id() 0
-#define cpu_to_node(cpu) 0
-#define early_cpu_to_node(cpu) 0
+#define numa_node_id() 0
+#define cpu_to_node(cpu) ((void)(cpu),0)
+#define early_cpu_to_node(cpu) cpu_to_node(cpu)
static inline int cpu_to_node(int cpu)
{
return 0;
}
static inline int early_cpu_to_node(int cpu)
{
return 0;
}
This way you also get typechecks.
That's how I did it first, but then I looked at asm-generic/topology.h
and have seen it uses #defines. Should we change them too ?

Mathieu
-- Steve
static inline const cpumask_t *_node_to_cpumask_ptr(int node)
{
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-10-03 16:26:31 UTC
Permalink
Post by Mathieu Desnoyers
That's how I did it first, but then I looked at asm-generic/topology.h
and have seen it uses #defines. Should we change them too ?
The old way of doing this is with defines. But all new code should be
static inline functions when feasible. This way we can get typechecking
on the parameters even when the configuration is disabled.

Even if the rest of the file uses defines, the new code should be
static inlines. Eventually, even the old defines will be converted.

-- Steve
Mathieu Desnoyers
2008-10-03 17:21:43 UTC
Permalink
Post by Steven Rostedt
Post by Mathieu Desnoyers
That's how I did it first, but then I looked at asm-generic/topology.h
and have seen it uses #defines. Should we change them too ?
The old way of doing this is with defines. But all new code should be
static inline functions when feasible. This way we can get typechecking
on the parameters even when the configuration is disabled.
Even if the rest of the file uses defines, the new code should be
static inlines. Eventually, even the old defines will be converted.
-- Steve
Argh, I think topology.h is utterly broken :-(

Have you noticed the subtile interaction between the

include/asm-x86/topology.h :

#define numa_node_id() 0
#define cpu_to_node(cpu) 0
#define early_cpu_to_node(cpu) 0
...
#include <asm-generic/topology.h>


and
include/asm-generic/topology.h :
#ifndef cpu_to_node
#define cpu_to_node(cpu) ((void)(cpu),0)
#endif

If any architecture decide for some reason to use a static inline rather
than a define, as currently done with node_to_first_cpu :

include/asm-x86/topology.h :
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}
...
#include <asm-generic/topology.h>

include/asm-generic/topology.h :
#ifndef node_to_first_cpu
#define node_to_first_cpu(node) ((void)(node),0)
#endif

(which will override the static inline !)

It results in an override of the arch-specific version. Nice eh ?

Mathieu
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Steven Rostedt
2008-10-03 17:54:49 UTC
Permalink
Post by Mathieu Desnoyers
Post by Steven Rostedt
Post by Mathieu Desnoyers
That's how I did it first, but then I looked at asm-generic/topology.h
and have seen it uses #defines. Should we change them too ?
The old way of doing this is with defines. But all new code should be
static inline functions when feasible. This way we can get typechecking
on the parameters even when the configuration is disabled.
Even if the rest of the file uses defines, the new code should be
static inlines. Eventually, even the old defines will be converted.
-- Steve
Argh, I think topology.h is utterly broken :-(
Have you noticed the subtile interaction between the
#define numa_node_id() 0
#define cpu_to_node(cpu) 0
#define early_cpu_to_node(cpu) 0
...
#include <asm-generic/topology.h>
and
#ifndef cpu_to_node
#define cpu_to_node(cpu) ((void)(cpu),0)
#endif
If any architecture decide for some reason to use a static inline rather
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}
...
#include <asm-generic/topology.h>
#ifndef node_to_first_cpu
#define node_to_first_cpu(node) ((void)(node),0)
#endif
(which will override the static inline !)
It results in an override of the arch-specific version. Nice eh ?
Seems that they expect cpu_to_node to be a macro if NUMA is not
configured.

Actually, since the asm-generic/topology.h does have the cpu shown
(although not in inline format), the solution here is to simply remove
the

#define cpu_to_node() 0

And we can still make the early_cpu_to_node a static inline since it is
not referenced in the generic code.

-- Steve
Mathieu Desnoyers
2008-10-03 18:53:33 UTC
Permalink
Post by Steven Rostedt
Seems that they expect cpu_to_node to be a macro if NUMA is not
configured.
Actually, since the asm-generic/topology.h does have the cpu shown
(although not in inline format), the solution here is to simply remove
the
#define cpu_to_node() 0
And we can still make the early_cpu_to_node a static inline since it is
not referenced in the generic code.
-- Steve
Or we take a deep breath and clean this up ?

Ingo, I build tested this on x86_64 (with and without NUMA), x86_32,
powerpc, arm and mips. I applies to both -tip and 2.6.27-rc8. Could it
be pulled into -tip for further testing ?

Note that checkpatch.pl spills a warning telling me to modify include/asm-*/
files (unexisting in my tree) rather than arch/*/include/asm/. Any idea
why ?

Thanks,

Mathieu


topology.h define mess fix

Original goal : Declare NUMA-less cpu_to_node with a check that the cpu
parameter exists so people without NUMA test configs (namely Steven Rostedt and
myself who ran into this error both in the same day with different
implementations) stop doing this trivial mistake.

End result :

Argh, I think topology.h is utterly broken :-(

Have you noticed the subtile interaction between the

include/asm-x86/topology.h :

#define numa_node_id() 0
#define cpu_to_node(cpu) 0
#define early_cpu_to_node(cpu) 0
...
#include <asm-generic/topology.h>


and
include/asm-generic/topology.h :
#ifndef cpu_to_node
#define cpu_to_node(cpu) ((void)(cpu),0)
#endif

If any architecture decide for some reason to use a static inline rather
than a define, as currently done with node_to_first_cpu :

include/asm-x86/topology.h :
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}
...
#include <asm-generic/topology.h>

include/asm-generic/topology.h :
#ifndef node_to_first_cpu
#define node_to_first_cpu(node) ((void)(node),0)
#endif

(which will override the static inline !)

It results in an override of the arch-specific version. Nice eh ?

This patch fixes this issue by declaring static inlines in
asm-generic/topology.h and by requiring a _complete_ override of the
topology functions when an architecture needs to override them. An
architecture overriding the topology functions should not include
asm-generic/topology.h anymore.

- alpha needs careful checking, as it did not implement parent_node nor
node_to_first_cpu previously.
- Major cross-architecture built test is required.

Signed-off-by: Mathieu Desnoyers <***@polymtl.ca>
CC: Steven Rostedt <***@goodmis.org>
CC: Linus Torvalds <***@linux-foundation.org>
CC: Peter Zijlstra <***@infradead.org>
CC: Andrew Morton <***@linux-foundation.org>
CC: Ingo Molnar <***@elte.hu>
CC: ***@twiddle.net
CC: ***@intel.com
CC: ***@samba.org
CC: ***@kernel.crashing.org
CC: ***@linux-sh.org
---
arch/alpha/include/asm/topology.h | 38 +++++++++++++++++++
arch/ia64/include/asm/topology.h | 16 ++++----
arch/powerpc/include/asm/topology.h | 12 +++++-
arch/sh/include/asm/topology.h | 11 -----
include/asm-generic/topology.h | 70 ++++++++++++++++++++----------------
include/asm-x86/topology.h | 66 +++++++++++++++++++++++++--------
6 files changed, 144 insertions(+), 69 deletions(-)

Index: linux-2.6-lttng/include/asm-x86/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/include/asm-x86/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -38,6 +38,8 @@
/* Node not present */
#define NUMA_NO_NODE (-1)

+struct pci_bus;
+
#ifdef CONFIG_NUMA
#include <linux/cpumask.h>
#include <asm/mpspec.h>
@@ -116,7 +118,6 @@ static inline cpumask_t node_to_cpumask(

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

-/* Replace default node_to_cpumask_ptr with optimized version */
#define node_to_cpumask_ptr(v, node) \
const cpumask_t *v = _node_to_cpumask_ptr(node)

@@ -129,8 +130,14 @@ static inline cpumask_t node_to_cpumask(
* Returns the number of the node containing Node 'node'. This
* architecture is flat, so it is a pretty simple function!
*/
-#define parent_node(node) (node)
+static inline int parent_node(int node)
+{
+ return node;
+}

+/*
+ * Leave those as defines so we don't have to include linux/pci.h.
+ */
#define pcibus_to_node(bus) __pcibus_to_node(bus)
#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)

@@ -180,42 +187,67 @@ extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)
#endif

+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+
#else /* !CONFIG_NUMA */

-#define numa_node_id() 0
-#define cpu_to_node(cpu) 0
-#define early_cpu_to_node(cpu) 0
+static inline int numa_node_id(void)
+{
+ return 0;
+}

-static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+/*
+ * We override asm-generic/topology.h.
+ */
+static inline int cpu_to_node(int cpu)
{
- return &cpu_online_map;
+ return 0;
}
+
+static inline int parent_node(int node)
+{
+ return 0;
+}
+
static inline cpumask_t node_to_cpumask(int node)
{
return cpu_online_map;
}
+
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}

+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}
+
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ return &cpu_online_map;
+}
+
/* Replace default node_to_cpumask_ptr with optimized version */
#define node_to_cpumask_ptr(v, node) \
const cpumask_t *v = _node_to_cpumask_ptr(node)

#define node_to_cpumask_ptr_next(v, node) \
v = _node_to_cpumask_ptr(node)
-#endif
-
-#include <asm-generic/topology.h>

-#ifdef CONFIG_NUMA
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{
- node_to_cpumask_ptr(mask, node);
- return first_cpu(*mask);
-}
#endif

extern cpumask_t cpu_coregroup_map(int cpu);
Index: linux-2.6-lttng/arch/alpha/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/alpha/include/asm/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/arch/alpha/include/asm/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -41,7 +41,43 @@ static inline cpumask_t node_to_cpumask(

#define pcibus_to_cpumask(bus) (cpu_online_map)

+struct pci_bus;
+
+static inline int parent_node(int node)
+{
+ return node;
+}
+
+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}
+
+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+
+#else
+
+#include <asm-generic/topology.h>
+
#endif /* !CONFIG_NUMA */
-# include <asm-generic/topology.h>

#endif /* _ASM_ALPHA_TOPOLOGY_H */
Index: linux-2.6-lttng/arch/ia64/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/ia64/include/asm/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/arch/ia64/include/asm/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -104,6 +104,15 @@ void build_cpu_to_node_map(void);
.nr_balance_failed = 0, \
}

+#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
+ CPU_MASK_ALL : \
+ node_to_cpumask(pcibus_to_node(bus)) \
+ )
+
+#else
+
+#include <asm-generic/topology.h>
+
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
@@ -116,11 +125,4 @@ void build_cpu_to_node_map(void);

extern void arch_fix_phys_package_id(int num, u32 slot);

-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-
-#include <asm-generic/topology.h>
-
#endif /* _ASM_IA64_TOPOLOGY_H */
Index: linux-2.6-lttng/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/powerpc/include/asm/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/arch/powerpc/include/asm/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -77,6 +77,14 @@ extern void __init dump_numa_cpu_topolog
extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid);

+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
#else

static inline int of_node_to_nid(struct device_node *device)
@@ -96,10 +104,10 @@ static inline void sysfs_remove_device_f
{
}

-#endif /* CONFIG_NUMA */
-
#include <asm-generic/topology.h>

+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_SMP
#include <asm/cputable.h>
#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
Index: linux-2.6-lttng/arch/sh/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/sh/include/asm/topology.h 2008-10-03 14:41:05.000000000 -0400
+++ linux-2.6-lttng/arch/sh/include/asm/topology.h 2008-10-03 14:41:12.000000000 -0400
@@ -29,17 +29,6 @@
.nr_balance_failed = 0, \
}

-#define cpu_to_node(cpu) ((void)(cpu),0)
-#define parent_node(node) ((void)(node),0)
-
-#define node_to_cpumask(node) ((void)node, cpu_online_map)
-#define node_to_first_cpu(node) ((void)(node),0)
-
-#define pcibus_to_node(bus) ((void)(bus), -1)
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
#endif

#include <asm-generic/topology.h>
Index: linux-2.6-lttng/include/asm-generic/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-generic/topology.h 2008-10-03 14:41:13.000000000 -0400
+++ linux-2.6-lttng/include/asm-generic/topology.h 2008-10-03 14:41:16.000000000 -0400
@@ -27,44 +27,52 @@
#ifndef _ASM_GENERIC_TOPOLOGY_H
#define _ASM_GENERIC_TOPOLOGY_H

-#ifndef CONFIG_NUMA
-
-/* Other architectures wishing to use this simple topology API should fill
- in the below functions as appropriate in their own <asm/topology.h> file. */
-#ifndef cpu_to_node
-#define cpu_to_node(cpu) ((void)(cpu),0)
-#endif
-#ifndef parent_node
-#define parent_node(node) ((void)(node),0)
-#endif
-#ifndef node_to_cpumask
-#define node_to_cpumask(node) ((void)node, cpu_online_map)
-#endif
-#ifndef node_to_first_cpu
-#define node_to_first_cpu(node) ((void)(node),0)
-#endif
-#ifndef pcibus_to_node
-#define pcibus_to_node(bus) ((void)(bus), -1)
-#endif
-
-#ifndef pcibus_to_cpumask
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-#endif
-
-#endif /* CONFIG_NUMA */
+/*
+ * Other architectures wishing to use this simple topology API should fill
+ * in the below functions as appropriate in their own <asm/topology.h> file,
+ * and _don't_ include asm-generic/topology.h.
+ */
+
+struct pci_bus;
+
+static inline int cpu_to_node(int cpu)
+{
+ return 0;
+}
+
+static inline int parent_node(int node)
+{
+ return 0;
+}
+
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return cpu_online_map;
+}
+
+static inline int node_to_first_cpu(int node)
+{
+ return 0;
+}
+
+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}

/* returns pointer to cpumask for specified node */
-#ifndef node_to_cpumask_ptr
-
#define node_to_cpumask_ptr(v, node) \
cpumask_t _##v = node_to_cpumask(node); \
const cpumask_t *v = &_##v

#define node_to_cpumask_ptr_next(v, node) \
_##v = node_to_cpumask(node)
-#endif

#endif /* _ASM_GENERIC_TOPOLOGY_H */
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Luck, Tony
2008-10-03 20:14:02 UTC
Permalink
Post by Mathieu Desnoyers
- Major cross-architecture built test is required.
Some problems on ia64. With defconfig build (which has
CONFIG_NUMA=y) I see this:

kernel/sched.c: In function 'find_next_best_node':
kernel/sched.c:6920: error: implicit declaration of function 'node_to_cpumask_ptr'
kernel/sched.c:6920: error: '__tmp__' undeclared (first use in this function)
kernel/sched.c:6920: error: (Each undeclared identifier is reported only once
kernel/sched.c:6920: error: for each function it appears in.)
kernel/sched.c: In function 'sched_domain_node_span':
kernel/sched.c:6952: error: 'nodemask' undeclared (first use in this function)
kernel/sched.c:6953: warning: ISO C90 forbids mixed declarations and code
kernel/sched.c:6964: error: implicit declaration of function 'node_to_cpumask_ptr_next'
kernel/sched.c: In function '__build_sched_domains':
kernel/sched.c:7510: error: 'pnodemask' undeclared (first use in this function)

On an "allnoconfig" build (which curiously also has CONFIG_NUMA=y :-) I see

mm/page_alloc.c: In function 'find_next_best_node':
mm/page_alloc.c:2086: error: implicit declaration of function 'node_to_cpumask_ptr'
mm/page_alloc.c:2086: error: 'tmp' undeclared (first use in this function)
mm/page_alloc.c:2086: error: (Each undeclared identifier is reported only once
mm/page_alloc.c:2086: error: for each function it appears in.)
mm/page_alloc.c:2107: error: implicit declaration of function 'node_to_cpumask_ptr_next'

There are most probably more errors ... but this is where the build stopped.

-Tony
Mathieu Desnoyers
2008-10-03 22:47:25 UTC
Permalink
Post by Luck, Tony
Post by Mathieu Desnoyers
- Major cross-architecture built test is required.
Some problems on ia64. With defconfig build (which has
[...]

Ah, I did not select config "generic" for ia64, and thus did not get
CONFIG_NUMA. Here is a v2 which fixes this.

Thanks for testing this.

Mathieu


topology.h define mess fix v2

Update : build fix for ia64 CONFIG_NUMA.

Original goal : Declare NUMA-less cpu_to_node with a check that the cpu
parameter exists so people without NUMA test configs (namely Steven Rostedt and
myself who ran into this error both in the same day with different
implementations) stop doing this trivial mistake.

End result :

Argh, I think topology.h is utterly broken :-(

Have you noticed the subtile interaction between the

include/asm-x86/topology.h :

#define numa_node_id() 0
#define cpu_to_node(cpu) 0
#define early_cpu_to_node(cpu) 0
...
#include <asm-generic/topology.h>


and
include/asm-generic/topology.h :
#ifndef cpu_to_node
#define cpu_to_node(cpu) ((void)(cpu),0)
#endif

If any architecture decide for some reason to use a static inline rather
than a define, as currently done with node_to_first_cpu :

include/asm-x86/topology.h :
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}
...
#include <asm-generic/topology.h>

include/asm-generic/topology.h :
#ifndef node_to_first_cpu
#define node_to_first_cpu(node) ((void)(node),0)
#endif

(which will override the static inline !)

It results in an override of the arch-specific version. Nice eh ?

This patch fixes this issue by declaring static inlines in
asm-generic/topology.h and by requiring a _complete_ override of the
topology functions when an architecture needs to override them. An
architecture overriding the topology functions should not include
asm-generic/topology.h anymore.

- alpha needs careful checking, as it did not implement parent_node nor
node_to_first_cpu previously.
- Major cross-architecture built test is required.

Signed-off-by: Mathieu Desnoyers <***@polymtl.ca>
CC: Steven Rostedt <***@goodmis.org>
CC: Linus Torvalds <***@linux-foundation.org>
CC: Peter Zijlstra <***@infradead.org>
CC: Andrew Morton <***@linux-foundation.org>
CC: Ingo Molnar <***@elte.hu>
CC: ***@twiddle.net
CC: ***@intel.com
CC: ***@samba.org
CC: ***@kernel.crashing.org
CC: ***@linux-sh.org
---
arch/alpha/include/asm/topology.h | 38 +++++++++++++++++++
arch/ia64/include/asm/topology.h | 24 ++++++++----
arch/powerpc/include/asm/topology.h | 12 +++++-
arch/sh/include/asm/topology.h | 11 -----
include/asm-generic/topology.h | 70 ++++++++++++++++++++----------------
include/asm-x86/topology.h | 66 +++++++++++++++++++++++++--------
6 files changed, 152 insertions(+), 69 deletions(-)

Index: linux-2.6-lttng/include/asm-x86/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-x86/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/include/asm-x86/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -38,6 +38,8 @@
/* Node not present */
#define NUMA_NO_NODE (-1)

+struct pci_bus;
+
#ifdef CONFIG_NUMA
#include <linux/cpumask.h>
#include <asm/mpspec.h>
@@ -116,7 +118,6 @@ static inline cpumask_t node_to_cpumask(

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

-/* Replace default node_to_cpumask_ptr with optimized version */
#define node_to_cpumask_ptr(v, node) \
const cpumask_t *v = _node_to_cpumask_ptr(node)

@@ -129,8 +130,14 @@ static inline cpumask_t node_to_cpumask(
* Returns the number of the node containing Node 'node'. This
* architecture is flat, so it is a pretty simple function!
*/
-#define parent_node(node) (node)
+static inline int parent_node(int node)
+{
+ return node;
+}

+/*
+ * Leave those as defines so we don't have to include linux/pci.h.
+ */
#define pcibus_to_node(bus) __pcibus_to_node(bus)
#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)

@@ -180,42 +187,67 @@ extern int __node_distance(int, int);
#define node_distance(a, b) __node_distance(a, b)
#endif

+/* Returns the number of the first CPU on Node 'node'. */
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+
#else /* !CONFIG_NUMA */

-#define numa_node_id() 0
-#define cpu_to_node(cpu) 0
-#define early_cpu_to_node(cpu) 0
+static inline int numa_node_id(void)
+{
+ return 0;
+}

-static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+/*
+ * We override asm-generic/topology.h.
+ */
+static inline int cpu_to_node(int cpu)
{
- return &cpu_online_map;
+ return 0;
}
+
+static inline int parent_node(int node)
+{
+ return 0;
+}
+
static inline cpumask_t node_to_cpumask(int node)
{
return cpu_online_map;
}
+
static inline int node_to_first_cpu(int node)
{
return first_cpu(cpu_online_map);
}

+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}
+
+static inline const cpumask_t *_node_to_cpumask_ptr(int node)
+{
+ return &cpu_online_map;
+}
+
/* Replace default node_to_cpumask_ptr with optimized version */
#define node_to_cpumask_ptr(v, node) \
const cpumask_t *v = _node_to_cpumask_ptr(node)

#define node_to_cpumask_ptr_next(v, node) \
v = _node_to_cpumask_ptr(node)
-#endif
-
-#include <asm-generic/topology.h>

-#ifdef CONFIG_NUMA
-/* Returns the number of the first CPU on Node 'node'. */
-static inline int node_to_first_cpu(int node)
-{
- node_to_cpumask_ptr(mask, node);
- return first_cpu(*mask);
-}
#endif

extern cpumask_t cpu_coregroup_map(int cpu);
Index: linux-2.6-lttng/arch/alpha/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/alpha/include/asm/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/arch/alpha/include/asm/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -41,7 +41,43 @@ static inline cpumask_t node_to_cpumask(

#define pcibus_to_cpumask(bus) (cpu_online_map)

+struct pci_bus;
+
+static inline int parent_node(int node)
+{
+ return node;
+}
+
+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}
+
+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
+static inline int node_to_first_cpu(int node)
+{
+ node_to_cpumask_ptr(mask, node);
+ return first_cpu(*mask);
+}
+
+#else
+
+#include <asm-generic/topology.h>
+
#endif /* !CONFIG_NUMA */
-# include <asm-generic/topology.h>

#endif /* _ASM_ALPHA_TOPOLOGY_H */
Index: linux-2.6-lttng/arch/ia64/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/ia64/include/asm/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/arch/ia64/include/asm/topology.h 2008-10-03 18:36:47.000000000 -0400
@@ -104,6 +104,23 @@ void build_cpu_to_node_map(void);
.nr_balance_failed = 0, \
}

+#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
+ CPU_MASK_ALL : \
+ node_to_cpumask(pcibus_to_node(bus)) \
+ )
+
+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
+#else
+
+#include <asm-generic/topology.h>
+
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
@@ -116,11 +133,4 @@ void build_cpu_to_node_map(void);

extern void arch_fix_phys_package_id(int num, u32 slot);

-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-
-#include <asm-generic/topology.h>
-
#endif /* _ASM_IA64_TOPOLOGY_H */
Index: linux-2.6-lttng/arch/powerpc/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/powerpc/include/asm/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/arch/powerpc/include/asm/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -77,6 +77,14 @@ extern void __init dump_numa_cpu_topolog
extern int sysfs_add_device_to_node(struct sys_device *dev, int nid);
extern void sysfs_remove_device_from_node(struct sys_device *dev, int nid);

+/* returns pointer to cpumask for specified node */
+#define node_to_cpumask_ptr(v, node) \
+ cpumask_t _##v = node_to_cpumask(node); \
+ const cpumask_t *v = &_##v
+
+#define node_to_cpumask_ptr_next(v, node) \
+ _##v = node_to_cpumask(node)
+
#else

static inline int of_node_to_nid(struct device_node *device)
@@ -96,10 +104,10 @@ static inline void sysfs_remove_device_f
{
}

-#endif /* CONFIG_NUMA */
-
#include <asm-generic/topology.h>

+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_SMP
#include <asm/cputable.h>
#define smt_capable() (cpu_has_feature(CPU_FTR_SMT))
Index: linux-2.6-lttng/arch/sh/include/asm/topology.h
===================================================================
--- linux-2.6-lttng.orig/arch/sh/include/asm/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/arch/sh/include/asm/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -29,17 +29,6 @@
.nr_balance_failed = 0, \
}

-#define cpu_to_node(cpu) ((void)(cpu),0)
-#define parent_node(node) ((void)(node),0)
-
-#define node_to_cpumask(node) ((void)node, cpu_online_map)
-#define node_to_first_cpu(node) ((void)(node),0)
-
-#define pcibus_to_node(bus) ((void)(bus), -1)
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
#endif

#include <asm-generic/topology.h>
Index: linux-2.6-lttng/include/asm-generic/topology.h
===================================================================
--- linux-2.6-lttng.orig/include/asm-generic/topology.h 2008-10-03 17:58:00.000000000 -0400
+++ linux-2.6-lttng/include/asm-generic/topology.h 2008-10-03 17:59:12.000000000 -0400
@@ -27,44 +27,52 @@
#ifndef _ASM_GENERIC_TOPOLOGY_H
#define _ASM_GENERIC_TOPOLOGY_H

-#ifndef CONFIG_NUMA
-
-/* Other architectures wishing to use this simple topology API should fill
- in the below functions as appropriate in their own <asm/topology.h> file. */
-#ifndef cpu_to_node
-#define cpu_to_node(cpu) ((void)(cpu),0)
-#endif
-#ifndef parent_node
-#define parent_node(node) ((void)(node),0)
-#endif
-#ifndef node_to_cpumask
-#define node_to_cpumask(node) ((void)node, cpu_online_map)
-#endif
-#ifndef node_to_first_cpu
-#define node_to_first_cpu(node) ((void)(node),0)
-#endif
-#ifndef pcibus_to_node
-#define pcibus_to_node(bus) ((void)(bus), -1)
-#endif
-
-#ifndef pcibus_to_cpumask
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-#endif
-
-#endif /* CONFIG_NUMA */
+/*
+ * Other architectures wishing to use this simple topology API should fill
+ * in the below functions as appropriate in their own <asm/topology.h> file,
+ * and _don't_ include asm-generic/topology.h.
+ */
+
+struct pci_bus;
+
+static inline int cpu_to_node(int cpu)
+{
+ return 0;
+}
+
+static inline int parent_node(int node)
+{
+ return 0;
+}
+
+static inline cpumask_t node_to_cpumask(int node)
+{
+ return cpu_online_map;
+}
+
+static inline int node_to_first_cpu(int node)
+{
+ return 0;
+}
+
+static inline int pcibus_to_node(struct pci_bus *bus)
+{
+ return -1;
+}
+
+static inline cpumask_t pcibus_to_cpumask(struct pci_bus *bus)
+{
+ return pcibus_to_node(bus) == -1 ?
+ CPU_MASK_ALL :
+ node_to_cpumask(pcibus_to_node(bus));
+}

/* returns pointer to cpumask for specified node */
-#ifndef node_to_cpumask_ptr
-
#define node_to_cpumask_ptr(v, node) \
cpumask_t _##v = node_to_cpumask(node); \
const cpumask_t *v = &_##v

#define node_to_cpumask_ptr_next(v, node) \
_##v = node_to_cpumask(node)
-#endif

#endif /* _ASM_GENERIC_TOPOLOGY_H */
--
Mathieu Desnoyers
OpenPGP key fingerprint: 8CD5 52C3 8E3C 4140 715F BA06 3F25 A8FE 3BAE 9A68
Peter Zijlstra
2008-09-30 17:00:43 UTC
Permalink
Post by Steven Rostedt
Post by Linus Torvalds
Trace buffers are different, though. Do people realize that doing the
overloading means that you never EVER can use those buffers for anything
else? Do people realize that it means that splice() and friends are out of
the question?
Post by Peter Zijlstra
Trouble is, looking at it I see no easy way out,
Quite frankly, we could just put it at the head of the page itself. Having
a "whole page" for the trace data is not possible anyway, since the trace
header itself will always eat 8 bytes.
And I do think it would potentially be a better model. Or at least safer.
Actually, looking at the code, there is no reason I need to keep this in
the frame buffer itself. I've also encapsulated the accesses to the
incrementing of the pointers so it would be trivial to try other
approaches.
The problem we had with the big array struct is that we can want large
buffers and to do that with pointers means we would need to either come up
with a large allocator or use vmap.
But I just realized that I could also just make a link list of page
pointers and do the exact same thing without having to worry about page
frames. Again, the way I coded this up, it is quite trivial to replace
the handling of the pages with other schemes.
The list_head in the page frame should be available regardless of
splice() stuffs.
Steven Rostedt
2008-09-30 17:41:23 UTC
Permalink
Post by Peter Zijlstra
Post by Steven Rostedt
Actually, looking at the code, there is no reason I need to keep this in
the frame buffer itself. I've also encapsulated the accesses to the
incrementing of the pointers so it would be trivial to try other
approaches.
The problem we had with the big array struct is that we can want large
buffers and to do that with pointers means we would need to either come up
with a large allocator or use vmap.
But I just realized that I could also just make a link list of page
pointers and do the exact same thing without having to worry about page
frames. Again, the way I coded this up, it is quite trivial to replace
the handling of the pages with other schemes.
The list_head in the page frame should be available regardless of
splice() stuffs.
Regardless, there's more info we want to store for each page than the list
head. Especially when we start converting this to lockless. I rather get
out of the overlaying of the page frames, its nice to save the space, but
really scares the hell out of me. I can just imagine this blowing up if we
redo the paging, and I dislike this transparent coupling between the
tracer buffer and the pages.

-- Steve
Peter Zijlstra
2008-09-30 17:49:12 UTC
Permalink
Post by Steven Rostedt
Post by Peter Zijlstra
Post by Steven Rostedt
Actually, looking at the code, there is no reason I need to keep this in
the frame buffer itself. I've also encapsulated the accesses to the
incrementing of the pointers so it would be trivial to try other
approaches.
The problem we had with the big array struct is that we can want large
buffers and to do that with pointers means we would need to either come up
with a large allocator or use vmap.
But I just realized that I could also just make a link list of page
pointers and do the exact same thing without having to worry about page
frames. Again, the way I coded this up, it is quite trivial to replace
the handling of the pages with other schemes.
The list_head in the page frame should be available regardless of
splice() stuffs.
Regardless, there's more info we want to store for each page than the list
head. Especially when we start converting this to lockless. I rather get
out of the overlaying of the page frames, its nice to save the space, but
really scares the hell out of me. I can just imagine this blowing up if we
redo the paging, and I dislike this transparent coupling between the
tracer buffer and the pages.
The problem with storing the page link information inside the page is
that it doesnt transfer to another address space, so if you do indeed
mmap these pages, then the link information is bogus.

Of course, in such a situation you could ignore these headers, but
somehow that doesn't sound too apealing.
Steven Rostedt
2008-09-30 17:56:36 UTC
Permalink
Post by Peter Zijlstra
The problem with storing the page link information inside the page is
that it doesnt transfer to another address space, so if you do indeed
mmap these pages, then the link information is bogus.
Of course, in such a situation you could ignore these headers, but
somehow that doesn't sound too apealing.
No that's not what I'm proposing. I'm proposing to allocate a page_header
structure for every page we alloc, and make a link list of them.
In other words:


struct ring_buffer_per_cpu {
[...]
struct list_head pages;
[...]
};

struct buffer_page {
[...];
void *page;
struct list_head list;
[...];
};

In ring_buffer_allocate_cpu:

struct buffer_page *bpage;
struct unsigned long addr;

[...]

for every page() {
bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
addr = get_free_page();
bpage->page = (void *)addr;
list_add(&bpage->list, &cpu_buffer->pages);
}


Obviously need to add the error checking, but you get the idea. Here I do
not need to change any of the later logic, because we are still dealing
with the buffer_page. I only need to update way to index the page which is
already encapsulated in its own function.

-- Steve
Steven Rostedt
2008-09-30 18:02:33 UTC
Permalink
Post by Steven Rostedt
struct buffer_page *bpage;
struct unsigned long addr;
Of course we would not be declaring a "struct unsigned long" ;-)

-- Steve
Arnaldo Carvalho de Melo
2008-09-26 22:31:29 UTC
Permalink
Post by Steven Rostedt
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
Nitpick: Why cast to void *?

And sometimes you use the rb_ prefix, in other cases you use the longer
for ring_buffer_, is the ring_ namespace already used? Or can we make it
use rb_ consistently to shorten the names?

- Arnaldo
Steven Rostedt
2008-09-26 23:58:58 UTC
Permalink
Post by Arnaldo Carvalho de Melo
Post by Steven Rostedt
+ */
+static inline void *
+ring_buffer_event_data(struct ring_buffer_event *event)
+{
+ BUG_ON(event->type != RB_TYPE_DATA);
+ /* If length is in len field, then array[0] has the data */
+ if (event->len)
+ return (void *)&event->array[0];
+ /* Otherwise length is in array[0] and array[1] has the data */
+ return (void *)&event->array[1];
Nitpick: Why cast to void *?
5 day hacking marathon, I cast everything ;-)
Post by Arnaldo Carvalho de Melo
And sometimes you use the rb_ prefix, in other cases you use the longer
for ring_buffer_, is the ring_ namespace already used? Or can we make it
use rb_ consistently to shorten the names?
I started using the rb_ because I was constantly breaking the 80 character
line limit with ring_buffer ;-) OK, for v8, I'll rename all static
internal functions to rb_ and keep the global ones ring_buffer_

Thanks,

-- Steve
Linus Torvalds
2008-09-27 00:13:36 UTC
Permalink
Post by Steven Rostedt
I started using the rb_ because I was constantly breaking the 80 character
line limit with ring_buffer ;-) OK, for v8, I'll rename all static
internal functions to rb_ and keep the global ones ring_buffer_
It would probably be better to use something else than 'rb_', because that
prefix is already used by the red-black trees, and exported as such (eg
"rb_next()" etc).

But at least as long as it's static, it's probably not _too_ noticeable if
the rest of the names don't overlap. We _do_ include <linux/rbtree.h>
almost everywhere, since we use those things in the VM, in timers etc, so
it comes in through pretty much all headers.

Linus
Steven Rostedt
2008-09-27 00:23:27 UTC
Permalink
Post by Linus Torvalds
Post by Steven Rostedt
I started using the rb_ because I was constantly breaking the 80 character
line limit with ring_buffer ;-) OK, for v8, I'll rename all static
internal functions to rb_ and keep the global ones ring_buffer_
It would probably be better to use something else than 'rb_', because that
prefix is already used by the red-black trees, and exported as such (eg
"rb_next()" etc).
Good point.
Post by Linus Torvalds
But at least as long as it's static, it's probably not _too_ noticeable if
the rest of the names don't overlap. We _do_ include <linux/rbtree.h>
almost everywhere, since we use those things in the VM, in timers etc, so
it comes in through pretty much all headers.
Well, I just compiled it and it didn't have any name collisions, but that
doesn't mean that this wont change in the future.

What would you suggest? buffer_ ? ringbuf_ ?

-- Steve
Steven Rostedt
2008-09-27 00:28:17 UTC
Permalink
Post by Steven Rostedt
Post by Linus Torvalds
But at least as long as it's static, it's probably not _too_ noticeable if
the rest of the names don't overlap. We _do_ include <linux/rbtree.h>
almost everywhere, since we use those things in the VM, in timers etc, so
it comes in through pretty much all headers.
Well, I just compiled it and it didn't have any name collisions, but that
doesn't mean that this wont change in the future.
For kicks I just added #include <linux/rbtree.h> and it still passed. I
don't think we'll be adding new functions to rbtree.h, so it may be
fine to stay with the rb_ prefix.

-- Steve
Continue reading on narkive:
Loading...