1 #include <linux/types.h>
2 #include <linux/ring_buffer.h>
3 #include <linux/wait.h>
4 #include <linux/poll.h>
5 #include <linux/cpumask.h>
7 struct _stp_data_entry
{
12 static struct ring_buffer
*__stp_ring_buffer
= NULL
;
/* _stp_poll_wait is a waitqueue for tasks blocked on
 * _stp_data_poll_trace() */
static DECLARE_WAIT_QUEUE_HEAD(_stp_poll_wait);
19 * Trace iterator - used by printout routines who present trace
20 * results to users and which routines might sleep, etc:
22 struct _stp_ring_buffer_data
{
26 static struct _stp_ring_buffer_data _stp_rb_data
;
28 static cpumask_var_t _stp_trace_reader_cpumask
;
30 static void __stp_free_ring_buffer(void)
32 free_cpumask_var(_stp_trace_reader_cpumask
);
33 if (__stp_ring_buffer
)
34 ring_buffer_free(__stp_ring_buffer
);
35 __stp_ring_buffer
= NULL
;
38 static int __stp_alloc_ring_buffer(void)
41 unsigned long buffer_size
= _stp_bufsize
;
43 if (!alloc_cpumask_var(&_stp_trace_reader_cpumask
, GFP_KERNEL
))
45 cpumask_clear(_stp_trace_reader_cpumask
);
47 if (buffer_size
== 0) {
48 dbug_trans(1, "using default buffer size...\n");
49 buffer_size
= _stp_nsubbufs
* _stp_subbuf_size
;
51 /* The number passed to ring_buffer_alloc() is per cpu. Our
52 * 'buffer_size' is a total number of bytes to allocate. So,
53 * we need to divide buffer_size by the number of cpus. */
54 buffer_size
/= num_online_cpus();
55 dbug_trans(1, "%lu\n", buffer_size
);
56 __stp_ring_buffer
= ring_buffer_alloc(buffer_size
, 0);
57 if (!__stp_ring_buffer
)
60 dbug_trans(1, "size = %lu\n", ring_buffer_size(__stp_ring_buffer
));
64 __stp_free_ring_buffer();
68 static int _stp_data_open_trace(struct inode
*inode
, struct file
*file
)
70 long cpu_file
= (long) inode
->i_private
;
72 /* We only allow for one reader per cpu */
73 dbug_trans(1, "trace attach\n");
75 if (!cpumask_test_cpu(cpu_file
, _stp_trace_reader_cpumask
))
76 cpumask_set_cpu(cpu_file
, _stp_trace_reader_cpumask
);
78 dbug_trans(1, "returning EBUSY\n");
82 if (!cpumask_empty(_stp_trace_reader_cpumask
)) {
83 dbug_trans(1, "returning EBUSY\n");
86 cpumask_setall(_stp_trace_reader_cpumask
);
88 file
->private_data
= inode
->i_private
;
92 static int _stp_data_release_trace(struct inode
*inode
, struct file
*file
)
94 long cpu_file
= (long) inode
->i_private
;
95 dbug_trans(1, "trace detach\n");
97 cpumask_clear_cpu(cpu_file
, _stp_trace_reader_cpumask
);
99 cpumask_clear(_stp_trace_reader_cpumask
);
106 _stp_event_to_user(struct ring_buffer_event
*event
, char __user
*ubuf
,
110 struct _stp_data_entry
*entry
;
112 dbug_trans(1, "event(%p), ubuf(%p), cnt(%lu)\n", event
, ubuf
, cnt
);
113 if (event
== NULL
|| ubuf
== NULL
)
116 entry
= (struct _stp_data_entry
*)ring_buffer_event_data(event
);
120 /* We don't do partial entries - just fail. */
121 if (entry
->len
> cnt
)
124 if (cnt
> entry
->len
)
126 ret
= copy_to_user(ubuf
, entry
->buf
, cnt
);
133 static ssize_t
tracing_wait_pipe(struct file
*filp
)
135 while (ring_buffer_empty(__stp_ring_buffer
)) {
137 if ((filp
->f_flags
& O_NONBLOCK
)) {
138 dbug_trans(1, "returning -EAGAIN\n");
143 * This is a make-shift waitqueue. The reason we don't use
144 * an actual wait queue is because:
145 * 1) we only ever have one waiter
146 * 2) the tracing, traces all functions, we don't want
147 * the overhead of calling wake_up and friends
148 * (and tracing them too)
149 * Anyway, this is really very primitive wakeup.
151 set_current_state(TASK_INTERRUPTIBLE
);
153 /* sleep for 100 msecs, and try again. */
154 schedule_timeout(HZ
/10);
156 if (signal_pending(current
)) {
157 dbug_trans(1, "returning -EINTR\n");
162 dbug_trans(1, "returning 1\n");
166 static struct ring_buffer_event
*
167 peek_next_event(int cpu
, u64
*ts
)
169 return ring_buffer_peek(__stp_ring_buffer
, cpu
, ts
);
172 /* Find the next real event */
173 static struct ring_buffer_event
*
174 _stp_find_next_event(long cpu_file
)
176 struct ring_buffer_event
*event
;
180 * If we are in a per_cpu trace file, don't bother by iterating over
181 * all cpus and peek directly.
183 if (ring_buffer_empty_cpu(__stp_ring_buffer
, (int)cpu_file
))
185 event
= peek_next_event(cpu_file
, &_stp_rb_data
.ts
);
186 _stp_rb_data
.cpu
= cpu_file
;
190 struct ring_buffer_event
*next
= NULL
;
195 for_each_possible_cpu(cpu
) {
197 if (ring_buffer_empty_cpu(__stp_ring_buffer
, cpu
))
200 event
= peek_next_event(cpu
, &ts
);
203 * Pick the event with the smallest timestamp:
205 if (event
&& (!next
|| ts
< next_ts
)) {
212 _stp_rb_data
.cpu
= next_cpu
;
213 _stp_rb_data
.ts
= next_ts
;
224 _stp_data_read_trace(struct file
*filp
, char __user
*ubuf
,
225 size_t cnt
, loff_t
*ppos
)
228 struct ring_buffer_event
*event
;
229 long cpu_file
= (long) filp
->private_data
;
231 dbug_trans(1, "%lu\n", (unsigned long)cnt
);
233 sret
= tracing_wait_pipe(filp
);
234 dbug_trans(1, "tracing_wait_pipe returned %ld\n", sret
);
238 /* stop when tracing is finished */
239 if (ring_buffer_empty(__stp_ring_buffer
)) {
244 if (cnt
>= PAGE_SIZE
)
247 dbug_trans(1, "sret = %lu\n", (unsigned long)sret
);
249 while ((event
= _stp_find_next_event(cpu_file
)) != NULL
) {
252 len
= _stp_event_to_user(event
, ubuf
, cnt
);
256 ring_buffer_consume(__stp_ring_buffer
, _stp_rb_data
.cpu
,
270 _stp_data_poll_trace(struct file
*filp
, poll_table
*poll_table
)
272 dbug_trans(1, "entry\n");
273 if (! ring_buffer_empty(__stp_ring_buffer
))
274 return POLLIN
| POLLRDNORM
;
275 poll_wait(filp
, &_stp_poll_wait
, poll_table
);
276 if (! ring_buffer_empty(__stp_ring_buffer
))
277 return POLLIN
| POLLRDNORM
;
279 dbug_trans(1, "exit\n");
283 static struct file_operations __stp_data_fops
= {
284 .owner
= THIS_MODULE
,
285 .open
= _stp_data_open_trace
,
286 .release
= _stp_data_release_trace
,
287 .poll
= _stp_data_poll_trace
,
288 .read
= _stp_data_read_trace
,
290 .splice_read
= tracing_splice_read_pipe
,
/*
 * Here's how __STP_MAX_RESERVE_SIZE is figured.  The value of
 * BUF_PAGE_SIZE was gotten from the kernel's ring_buffer code.  It
 * is divided by 4, so we waste a maximum of 1/4 of the buffer (in
 * the case of a small reservation).
 */
#define __STP_MAX_RESERVE_SIZE ((/*BUF_PAGE_SIZE*/ 4080 / 4)		\
				- sizeof(struct _stp_data_entry)	\
				- sizeof(struct ring_buffer_event))
305 * This function prepares the cpu buffer to write a sample.
307 * Struct op_entry is used during operations on the ring buffer while
308 * struct op_sample contains the data that is stored in the ring
309 * buffer. Struct entry can be uninitialized. The function reserves a
310 * data array that is specified by size. Use
311 * op_cpu_buffer_write_commit() after preparing the sample. In case of
312 * errors a null pointer is returned, otherwise the pointer to the
317 _stp_data_write_reserve(size_t size_request
, void **entry
)
319 struct ring_buffer_event
*event
;
320 struct _stp_data_entry
*sde
;
325 if (size_request
> __STP_MAX_RESERVE_SIZE
) {
326 size_request
= __STP_MAX_RESERVE_SIZE
;
329 event
= ring_buffer_lock_reserve(__stp_ring_buffer
,
330 sizeof(struct _stp_data_entry
) + size_request
,
332 if (unlikely(! event
)) {
333 dbug_trans(1, "event = NULL (%p)?\n", event
);
338 sde
= (struct _stp_data_entry
*)ring_buffer_event_data(event
);
339 sde
->len
= size_request
;
345 static unsigned char *_stp_data_entry_data(void *entry
)
347 struct ring_buffer_event
*event
= entry
;
348 struct _stp_data_entry
*sde
;
353 sde
= (struct _stp_data_entry
*)ring_buffer_event_data(event
);
357 static int _stp_data_write_commit(void *entry
)
360 struct ring_buffer_event
*event
= (struct ring_buffer_event
*)entry
;
362 if (unlikely(! entry
)) {
363 dbug_trans(1, "entry = NULL, returning -EINVAL\n");
367 ret
= ring_buffer_unlock_commit(__stp_ring_buffer
, event
, 0);
368 dbug_trans(1, "after commit, empty returns %d\n",
369 ring_buffer_empty(__stp_ring_buffer
));
371 wake_up_interruptible(&_stp_poll_wait
);
376 static struct dentry
*__stp_entry
[NR_CPUS
] = { NULL
};
378 static int _stp_transport_data_fs_init(void)
384 dbug_trans(1, "entry...\n");
385 rc
= __stp_alloc_ring_buffer();
390 for_each_online_cpu(cpu
) {
391 char cpu_file
[9]; /* 5(trace) + 3(XXX) + 1(\0) = 9 */
393 if (cpu
> 999 || cpu
< 0) {
394 _stp_transport_data_fs_close();
397 sprintf(cpu_file
, "trace%ld", cpu
);
398 __stp_entry
[cpu
] = debugfs_create_file(cpu_file
, 0600,
399 _stp_get_module_dir(),
403 if (!__stp_entry
[cpu
]) {
404 pr_warning("Could not create debugfs 'trace' entry\n");
405 __stp_free_ring_buffer();
408 else if (IS_ERR(__stp_entry
[cpu
])) {
409 rc
= PTR_ERR(__stp_entry
[cpu
]);
410 pr_warning("Could not create debugfs 'trace' entry\n");
411 __stp_free_ring_buffer();
415 __stp_entry
[cpu
]->d_inode
->i_uid
= _stp_uid
;
416 __stp_entry
[cpu
]->d_inode
->i_gid
= _stp_gid
;
424 dbug_trans(1, "returning 0...\n");
/* Start hook: the ring-buffer transport needs no per-run startup. */
static void _stp_transport_data_fs_start(void)
{
}
/* Stop hook: nothing to quiesce for the ring-buffer transport. */
static void _stp_transport_data_fs_stop(void)
{
}
438 static void _stp_transport_data_fs_close(void)
442 for_each_possible_cpu(cpu
) {
443 if (__stp_entry
[cpu
])
444 debugfs_remove(__stp_entry
[cpu
]);
445 __stp_entry
[cpu
] = NULL
;
448 __stp_free_ring_buffer();