From: fche Date: Wed, 27 Feb 2008 23:11:19 +0000 (+0000) Subject: PR5697: include tutorial & language reference guide X-Git-Tag: release-0.6.2~20 X-Git-Url: https://sourceware.org/git/?a=commitdiff_plain;h=a20617af34e3dbeba682cfa6bf6366f3fc0f8e14;p=systemtap.git PR5697: include tutorial & language reference guide --- diff --git a/doc/ChangeLog b/doc/ChangeLog new file mode 100644 index 000000000..d9e609d28 --- /dev/null +++ b/doc/ChangeLog @@ -0,0 +1,7 @@ +2008-02-27 Frank Ch. Eigler + + * langref.tex, tutorial.tex: Copied over & aggregated + from former comfy digs under /cvs/doc. + * tutorial/*: Samples scripts from tutorial. + * Makefile.am: New build instructions. + * Makefile.in: New generated file. diff --git a/doc/Makefile.am b/doc/Makefile.am new file mode 100644 index 000000000..bf80fbd30 --- /dev/null +++ b/doc/Makefile.am @@ -0,0 +1,28 @@ +# Makefile.am --- automake input file for systemtap docs +## process this file with automake to produce Makefile.in + +if BUILD_DOCS +all-local: tutorial.pdf langref.pdf + +clean-local: + rm -f *.pdf *.out *.log *.aux *.toc *.lot *.idx *.glo +endif + +SUFFIXES = ps pdf dvi ps tex + +.ps.pdf: + ps2pdf -r600 $< + +.dvi.ps: + dvips -t letter -o $@ $< + +.tex.dvi: + pwd=`pwd`; cd $(srcdir); \ + latex -output-directory=$$pwd $<; \ + touch $*.glo \ + makeindex $*.glo -s nomencl.ist -o $*.gls \ + latex -output-directory=$$pwd $<; \ + latex -output-directory=$$pwd $<; \ + latex -output-directory=$$pwd $< + +EXTRA_DIST = tutorial.tex langref.tex tutorial diff --git a/doc/Makefile.in b/doc/Makefile.in new file mode 100644 index 000000000..27a4ab8bc --- /dev/null +++ b/doc/Makefile.in @@ -0,0 +1,336 @@ +# Makefile.in generated by automake 1.10 from Makefile.am. +# @configure_input@ + +# Copyright (C) 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, +# 2003, 2004, 2005, 2006 Free Software Foundation, Inc. 
+# This Makefile.in is free software; the Free Software Foundation +# gives unlimited permission to copy and/or distribute it, +# with or without modifications, as long as this notice is preserved. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY, to the extent permitted by law; without +# even the implied warranty of MERCHANTABILITY or FITNESS FOR A +# PARTICULAR PURPOSE. + +@SET_MAKE@ + +# Makefile.am --- automake input file for systemtap docs +VPATH = @srcdir@ +pkgdatadir = $(datadir)/@PACKAGE@ +pkglibdir = $(libdir)/@PACKAGE@ +pkgincludedir = $(includedir)/@PACKAGE@ +am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd +install_sh_DATA = $(install_sh) -c -m 644 +install_sh_PROGRAM = $(install_sh) -c +install_sh_SCRIPT = $(install_sh) -c +INSTALL_HEADER = $(INSTALL_DATA) +transform = $(program_transform_name) +NORMAL_INSTALL = : +PRE_INSTALL = : +POST_INSTALL = : +NORMAL_UNINSTALL = : +PRE_UNINSTALL = : +POST_UNINSTALL = : +subdir = doc +DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in +ACLOCAL_M4 = $(top_srcdir)/aclocal.m4 +am__aclocal_m4_deps = $(top_srcdir)/configure.ac +am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \ + $(ACLOCAL_M4) +mkinstalldirs = $(install_sh) -d +CONFIG_HEADER = $(top_builddir)/config.h +CONFIG_CLEAN_FILES = +SOURCES = +DIST_SOURCES = +DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST) +ACLOCAL = @ACLOCAL@ +AMTAR = @AMTAR@ +AUTOCONF = @AUTOCONF@ +AUTOHEADER = @AUTOHEADER@ +AUTOMAKE = @AUTOMAKE@ +AWK = @AWK@ +CC = @CC@ +CCDEPMODE = @CCDEPMODE@ +CFLAGS = @CFLAGS@ +CPP = @CPP@ +CPPFLAGS = @CPPFLAGS@ +CXX = @CXX@ +CXXDEPMODE = @CXXDEPMODE@ +CXXFLAGS = @CXXFLAGS@ +CYGPATH_W = @CYGPATH_W@ +DATE = @DATE@ +DEFS = @DEFS@ +DEPDIR = @DEPDIR@ +ECHO_C = @ECHO_C@ +ECHO_N = @ECHO_N@ +ECHO_T = @ECHO_T@ +EGREP = @EGREP@ +EXEEXT = @EXEEXT@ +GREP = @GREP@ +INSTALL = @INSTALL@ +INSTALL_DATA = @INSTALL_DATA@ +INSTALL_PROGRAM = @INSTALL_PROGRAM@ 
+INSTALL_SCRIPT = @INSTALL_SCRIPT@ +INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@ +LDFLAGS = @LDFLAGS@ +LIBOBJS = @LIBOBJS@ +LIBS = @LIBS@ +LN_S = @LN_S@ +LTLIBOBJS = @LTLIBOBJS@ +MAINT = @MAINT@ +MAKEINFO = @MAKEINFO@ +MKDIR_P = @MKDIR_P@ +OBJEXT = @OBJEXT@ +PACKAGE = @PACKAGE@ +PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@ +PACKAGE_NAME = @PACKAGE_NAME@ +PACKAGE_STRING = @PACKAGE_STRING@ +PACKAGE_TARNAME = @PACKAGE_TARNAME@ +PACKAGE_VERSION = @PACKAGE_VERSION@ +PATH_SEPARATOR = @PATH_SEPARATOR@ +PROCFLAGS = @PROCFLAGS@ +RANLIB = @RANLIB@ +SET_MAKE = @SET_MAKE@ +SHELL = @SHELL@ +STRIP = @STRIP@ +U = @U@ +VERSION = @VERSION@ +abs_builddir = @abs_builddir@ +abs_srcdir = @abs_srcdir@ +abs_top_builddir = @abs_top_builddir@ +abs_top_srcdir = @abs_top_srcdir@ +ac_ct_CC = @ac_ct_CC@ +ac_ct_CXX = @ac_ct_CXX@ +am__include = @am__include@ +am__leading_dot = @am__leading_dot@ +am__quote = @am__quote@ +am__tar = @am__tar@ +am__untar = @am__untar@ +bindir = @bindir@ +build_alias = @build_alias@ +builddir = @builddir@ +cap_LIBS = @cap_LIBS@ +datadir = @datadir@ +datarootdir = @datarootdir@ +docdir = @docdir@ +dvidir = @dvidir@ +elfutils_abs_srcdir = @elfutils_abs_srcdir@ +exec_prefix = @exec_prefix@ +have_dvips = @have_dvips@ +have_latex = @have_latex@ +have_ps2pdf = @have_ps2pdf@ +host_alias = @host_alias@ +htmldir = @htmldir@ +includedir = @includedir@ +infodir = @infodir@ +install_sh = @install_sh@ +libdir = @libdir@ +libexecdir = @libexecdir@ +localedir = @localedir@ +localstatedir = @localstatedir@ +mandir = @mandir@ +mkdir_p = @mkdir_p@ +oldincludedir = @oldincludedir@ +pdfdir = @pdfdir@ +prefix = @prefix@ +program_transform_name = @program_transform_name@ +psdir = @psdir@ +sbindir = @sbindir@ +sharedstatedir = @sharedstatedir@ +sqlite3_LIBS = @sqlite3_LIBS@ +srcdir = @srcdir@ +stap_LIBS = @stap_LIBS@ +staplog_CPPFLAGS = @staplog_CPPFLAGS@ +subdirs = @subdirs@ +sysconfdir = @sysconfdir@ +target_alias = @target_alias@ +top_builddir = @top_builddir@ +top_srcdir = @top_srcdir@ 
+SUFFIXES = ps pdf dvi ps tex +EXTRA_DIST = tutorial.tex langref.tex tutorial +all: all-am + +.SUFFIXES: +.SUFFIXES: ps pdf dvi ps tex .dvi .pdf .ps .tex +$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(am__configure_deps) + @for dep in $?; do \ + case '$(am__configure_deps)' in \ + *$$dep*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh \ + && exit 0; \ + exit 1;; \ + esac; \ + done; \ + echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu doc/Makefile'; \ + cd $(top_srcdir) && \ + $(AUTOMAKE) --gnu doc/Makefile +.PRECIOUS: Makefile +Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status + @case '$?' in \ + *config.status*) \ + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \ + *) \ + echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe)'; \ + cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__depfiles_maybe);; \ + esac; + +$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh + +$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps) + cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh +tags: TAGS +TAGS: + +ctags: CTAGS +CTAGS: + + +distdir: $(DISTFILES) + @srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \ + list='$(DISTFILES)'; \ + dist_files=`for file in $$list; do echo $$file; done | \ + sed -e "s|^$$srcdirstrip/||;t" \ + -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \ + case $$dist_files in \ + */*) $(MKDIR_P) `echo "$$dist_files" | \ + sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \ + sort -u` ;; \ + esac; \ + for file in $$dist_files; do \ + if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \ + if test -d $$d/$$file; then \ + 
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \ + if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \ + cp -pR $(srcdir)/$$file $(distdir)$$dir || exit 1; \ + fi; \ + cp -pR $$d/$$file $(distdir)$$dir || exit 1; \ + else \ + test -f $(distdir)/$$file \ + || cp -p $$d/$$file $(distdir)/$$file \ + || exit 1; \ + fi; \ + done +check-am: all-am +check: check-am +@BUILD_DOCS_FALSE@all-local: +all-am: Makefile all-local +installdirs: +install: install-am +install-exec: install-exec-am +install-data: install-data-am +uninstall: uninstall-am + +install-am: all-am + @$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am + +installcheck: installcheck-am +install-strip: + $(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \ + install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \ + `test -z '$(STRIP)' || \ + echo "INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'"` install +mostlyclean-generic: + +clean-generic: + +distclean-generic: + -test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES) + +maintainer-clean-generic: + @echo "This command is intended for maintainers to use" + @echo "it deletes files that may require special tools to rebuild." 
+@BUILD_DOCS_FALSE@clean-local: +clean: clean-am + +clean-am: clean-generic clean-local mostlyclean-am + +distclean: distclean-am + -rm -f Makefile +distclean-am: clean-am distclean-generic + +dvi: dvi-am + +dvi-am: + +html: html-am + +info: info-am + +info-am: + +install-data-am: + +install-dvi: install-dvi-am + +install-exec-am: + +install-html: install-html-am + +install-info: install-info-am + +install-man: + +install-pdf: install-pdf-am + +install-ps: install-ps-am + +installcheck-am: + +maintainer-clean: maintainer-clean-am + -rm -f Makefile +maintainer-clean-am: distclean-am maintainer-clean-generic + +mostlyclean: mostlyclean-am + +mostlyclean-am: mostlyclean-generic + +pdf: pdf-am + +pdf-am: + +ps: ps-am + +ps-am: + +uninstall-am: + +.MAKE: install-am install-strip + +.PHONY: all all-am all-local check check-am clean clean-generic \ + clean-local distclean distclean-generic distdir dvi dvi-am \ + html html-am info info-am install install-am install-data \ + install-data-am install-dvi install-dvi-am install-exec \ + install-exec-am install-html install-html-am install-info \ + install-info-am install-man install-pdf install-pdf-am \ + install-ps install-ps-am install-strip installcheck \ + installcheck-am installdirs maintainer-clean \ + maintainer-clean-generic mostlyclean mostlyclean-generic pdf \ + pdf-am ps ps-am uninstall uninstall-am + + +@BUILD_DOCS_TRUE@all-local: tutorial.pdf langref.pdf + +@BUILD_DOCS_TRUE@clean-local: +@BUILD_DOCS_TRUE@ rm -f *.pdf *.out *.log *.aux *.toc *.lot *.idx *.glo + +.ps.pdf: + ps2pdf -r600 $< + +.dvi.ps: + dvips -t letter -o $@ $< + +.tex.dvi: + pwd=`pwd`; cd $(srcdir); \ + latex -output-directory=$$pwd $<; \ + touch $*.glo \ + makeindex $*.glo -s nomencl.ist -o $*.gls \ + latex -output-directory=$$pwd $<; \ + latex -output-directory=$$pwd $<; \ + latex -output-directory=$$pwd $< +# Tell versions [3.59,3.63) of GNU make to not export all variables. +# Otherwise a system limit (for SysV at least) may be exceeded. 
+.NOEXPORT: diff --git a/doc/langref.tex b/doc/langref.tex new file mode 100644 index 000000000..5b91d01d0 --- /dev/null +++ b/doc/langref.tex @@ -0,0 +1,3285 @@ +% SystemTap Language Reference +\documentclass[twoside,english]{article} +\usepackage{geometry} +\geometry{verbose,letterpaper,tmargin=1.5in,bmargin=1.5in,lmargin=1in,rmargin=1in} +\usepackage{fancyhdr} +\pagestyle{fancy} +\usepackage{array} +\usepackage{varioref} +\usepackage{float} +\usepackage{makeidx} +\usepackage{verbatim} +\usepackage{url} +\makeindex + +\makeatletter + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% LyX specific LaTeX commands. +\newcommand{\noun}[1]{\textsc{#1}} +%% Bold symbol macro for standard LaTeX users +%\providecommand{\boldsymbol}[1]{\mbox{\boldmath $#1$}} + +%% Because html converters don't know tabularnewline +\providecommand{\tabularnewline}{\\} + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% User specified LaTeX commands. +\setlength{\parindent}{0pt} +%\setlength{\parskip}{3pt plus 2pt minus 1pt} +\setlength{\parskip}{5pt} + +% +% this makes list spacing much better. 
+% +\newenvironment{my_itemize}{ +\begin{itemize} + \setlength{\itemsep}{1pt} + \setlength{\parskip}{0pt} + \setlength{\parsep}{0pt}}{\end{itemize} +} + +\newenvironment{vindent} +{\begin{list}{}{\setlength{\listparindent}{6pt}} +\item[]} +{\end{list}} + +\usepackage{babel} +\makeatother +\begin{document} + +\title{SystemTap Language Reference} + +\maketitle +\newpage{} +This document was derived from other documents contributed to the SystemTap project by employees of Red Hat, IBM and Intel.\newline + +Copyright \copyright\space 2007 Red Hat Inc.\newline +Copyright \copyright\space 2007 IBM Corp.\newline +Copyright \copyright\space 2007 Intel Corporation.\newline + +Permission is granted to copy, distribute and/or modify this document +under the terms of the GNU Free Documentation License, Version 1.2 +or any later version published by the Free Software Foundation; +with no Invariant Sections, no Front-Cover Texts, and no Back-Cover Texts.\newline + +The GNU Free Documentation License is available from +\url{http://www.gnu.org/licenses/fdl.html} or by writing to +the Free Software Foundation, Inc., 51 Franklin Street, +Fifth Floor, Boston, MA 02110-1301, USA. +\newpage{} +\tableofcontents{} +\listoftables +\newpage{} + +\section{SystemTap overview\label{sec:SystemTap-Overview}} + +\subsection{About this guide} + +This guide is a comprehensive reference of SystemTap's language constructs +and syntax. The contents borrow heavily from existing SystemTap documentation +found in manual pages and the tutorial. The presentation of information here +provides the reader with a single place to find language syntax and recommended +usage. In order to successfully use this guide, you should be familiar with +the general theory and operation of SystemTap. If you are new to SystemTap, +you will find the tutorial to be an excellent place to start learning. For +detailed information about tapsets, see the manual pages provided with the +distribution. 
For information about the entire collection of SystemTap reference +material, see Section~\ref{sec:For-Further-Reference}. + +\subsection{Reasons to use SystemTap} + +SystemTap provides infrastructure to simplify the gathering of information +about a running Linux kernel so that it may be further analyzed. This analysis +assists in identifying the underlying cause of a performance or functional +problem. SystemTap was designed to eliminate the need for a developer to +go through the tedious instrument, recompile, install, and reboot sequence +normally required to collect this kind of data. To do this, it provides a +simple command-line interface and scripting language for writing kernel instrumentation. +With SystemTap, developers, system administrators, and users can easily write +scripts that gather and manipulate kernel data that is not otherwise available +using standard Linux tools. Users of SystemTap will find it to be a significant +improvement over older methods. + +\subsection{Event-action language} +\index{language} +SystemTap's language is strictly typed, declaration free, procedural, and +inspired by dtrace and awk. Source code points or events in the kernel are +associated with handlers, which are subroutines that are executed synchronously. +These probes are conceptually similar to \char`\"{}breakpoint command lists\char`\"{} +in the GDB debugger. + +There are two main outermost constructs: probes and functions. Within these, +statements and expressions use C-like operator syntax and precedence. + +\subsection{Sample SystemTap scripts} +\index{example scripts} +Following are some example scripts that illustrate the basic operation of +SystemTap. For more examples, see the examples/small\_demos/ directory in +the source directory, the SystemTap wiki at \url{http://sourceware.org/systemtap/wiki/HomePage}, +or the SystemTap War Stories page at \url{http://sourceware.org/systemtap/wiki/WarStories}.
+ +\subsubsection{Basic SystemTap syntax and control structures} + +The following code examples demonstrate SystemTap syntax and control structures. + +\begin{vindent} +\begin{verbatim} +global odds, evens + +probe begin { + # "no" and "ne" are local integers + for (i = 0; i < 10; i++) { + if (i % 2) odds [no++] = i + else evens [ne++] = i + } + + delete odds[2] + delete evens[3] + exit() +} + +probe end { + foreach (x+ in odds) + printf ("odds[%d] = %d\n", x, odds[x]) + + foreach (x in evens-) + printf ("evens[%d] = %d\n", x, evens[x]) +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +odds[0] = 1 +odds[1] = 3 +odds[3] = 7 +odds[4] = 9 +evens[4] = 8 +evens[2] = 4 +evens[1] = 2 +evens[0] = 0 +\end{verbatim} +\end{vindent} +Note that all variable types are inferred, and that all locals and globals +are initialized. + +\subsubsection{Primes between 0 and 49} + +\begin{vindent} +\begin{verbatim} +function isprime (x) { + if (x < 2) return 0 + for (i = 2; i < x; i++) { + if (x % i == 0) return 0 + if (i * i > x) break + } + return 1 +} + +probe begin { + for (i = 0; i < 50; i++) + if (isprime (i)) printf("%d\n", i) + exit() +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +2 +3 +5 +7 +11 +13 +17 +19 +23 +29 +31 +37 +41 +43 +47 +\end{verbatim} +\end{vindent} + +\subsubsection{Recursive functions} +\index{recursion} +\begin{vindent} +\begin{verbatim} +function fibonacci(i) { + if (i < 1) error ("bad number") + if (i == 1) return 1 + if (i == 2) return 2 + return fibonacci (i-1) + fibonacci (i-2) +} + +probe begin { + printf ("11th fibonacci number: %d", fibonacci (11)) + exit () +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +11th fibonacci number: 144 +\end{verbatim} +\end{vindent} +Any larger number input to the function may exceed the MAXACTION or MAXNESTING +limits, which will be caught at run time and result in an error.
For more +about limits see Section~\ref{sub:SystemTap-safety}. +\newpage{} +\subsection{The stap command} +\index{stap} +The stap program is the front-end to the SystemTap tool. It accepts probing +instructions written in its scripting language, translates those instructions +into C code, compiles this C code, and loads the resulting kernel module +into a running Linux kernel to perform the requested system trace or probe +functions. You can supply the script in a named file, from standard input, +or from the command line. The program runs until it is interrupted by the +user or a sufficient number of soft errors, or if the script voluntarily +invokes the exit() function. + +The stap command does the following: + +\begin{itemize} +\item Translates the script +\item Generates and compiles a kernel module +\item Inserts the module; output to stap's stdout +\item CTRL-C unloads the module and terminates stap +\end{itemize} +For a full list of options to the stap command, see the stap(1) manual page. + +\subsection{Safety and security\label{sub:SystemTap-safety}} +\index{limits} +SystemTap is an administrative tool. It exposes kernel internal data structures +and potentially private user information. It requires root privileges to +actually run the kernel objects it builds using the \textbf{sudo} command, +applied to the \textbf{staprun} program. + +staprun is a part of the SystemTap package, dedicated to module loading and +unloading and kernel-to-user data transfer. Since staprun does not perform +any additional security checks on the kernel objects it is given, do not +give elevated privileges via sudo to untrusted users. + +The translator asserts certain safety constraints. \index{constraints}It +ensures that no handler routine can run for too long, allocate memory, perform +unsafe operations, or unintentionally interfere with the kernel. Use of script +global variables is locked to protect against manipulation by concurrent +probe handlers. 
Use of \emph{guru mode} constructs such as embedded C (see +Section~\ref{sub:Embedded-C}) can violate these constraints, leading to +a kernel crash or data corruption. + +The resource use limits are set by macros in the generated C code. These +may be overridden with the -D flag. The following list describes a selection +of these macros: + +\textbf{MAXNESTING} -- The maximum number of recursive function call levels. The default is 10. + +\textbf{MAXSTRINGLEN} -- The maximum length of strings. The default is 128. + +\textbf{MAXTRYLOCK} -- The maximum number of iterations to wait for locks on global variables before +declaring possible deadlock and skipping the probe. The default is 1000. + +\textbf{MAXACTION} -- The maximum number of statements to execute during any single probe hit. The default is 1000. + +\textbf{MAXMAPENTRIES} -- The maximum number of rows in an array if the array size is not specified +explicitly when declared. The default is 2048. + +\textbf{MAXERRORS} -- The maximum number of soft errors before an exit is triggered. The default is 0. + +\textbf{MAXSKIPPED} -- The maximum number of skipped reentrant probes before an exit is triggered. The default is 100. + +\textbf{MINSTACKSPACE} -- The minimum number of free kernel stack bytes required in order to run a +probe handler. This number should be large enough for the probe handler's +own needs, plus a safety margin. The default is 1024. + +If something goes wrong with stap or staprun after a probe has started running, +you may safely kill both user processes, and remove the active probe kernel +module with the rmmod command. Any pending trace messages may be lost. + +\section{Types of SystemTap scripts\label{sec:Types-of-SystemTap}} + +\subsection{Probe scripts} + +Probe scripts are analogous to programs; these scripts identify probe points +and associated handlers. + +\subsection{Tapset scripts} + +Tapset scripts are libraries of probe aliases and auxiliary functions. 
+ +The /usr/share/systemtap/tapset directory contains tapset scripts. While +these scripts look like regular SystemTap scripts, they cannot be run directly. + +\section{Components of a SystemTap script} + +The main construct in the scripting language identifies probes. Probes associate +abstract events with a statement block, or probe handler, that is to be executed +when any of those events occur. + +The following example shows how to trace entry and exit from a function using +two probes. + +\begin{vindent} +\begin{verbatim} +probe kernel.function("sys_mkdir") { log ("enter") } +probe kernel.function("sys_mkdir").return { log ("exit") } +\end{verbatim} +\end{vindent} + +To list the probe-able functions in the kernel, use the last-pass option +to the translator. The output needs to be filtered because each inlined function +instance is listed separately. The following statement is an example. + +\begin{vindent} +\begin{verbatim} +# stap -p2 -e 'probe kernel.function("*") {}' | sort | uniq +\end{verbatim} +\end{vindent} + +\subsection{Probe definitions} + +The general syntax is as follows. + +\begin{vindent} +\begin{verbatim} +probe PROBEPOINT [, PROBEPOINT] { [STMT ...] } +\end{verbatim} +\end{vindent} +Events are specified in a special syntax called \emph{probe points}. There +are several varieties of probe points defined by the translator, and tapset +scripts may define others using aliases. The provided probe points are listed +in the stapprobes(5) man pages. + +The probe handler is interpreted relative to the context of each event. For +events associated with kernel code, this context may include variables defined +in the source code at that location. These \emph{target variables}\index{target variables} +are presented to the script as variables whose names are prefixed with a +dollar sign (\$). They may be accessed only if the compiler used to compile +the kernel preserved them, despite optimization. 
This is the same constraint +imposed by a debugger when working with optimized code. Other events may +have very little context. + + +\subsection{Probe aliases\label{sub:Probe-aliases}} +\index{probe aliases} +The general syntax is as follows. + +\begin{vindent} +\begin{verbatim} +probe <alias> = <probepoint> { <prologue_stmts> } +probe <alias> += <probepoint> { <epilogue_stmts> } +\end{verbatim} +\end{vindent} +New probe points may be defined using \emph{aliases}. A probe point alias +looks similar to probe definitions, but instead of activating a probe at +the given point, it defines a new probe point name as an alias to an existing +one. New probe aliases may refer to one or more existing probe aliases. The +following is an example. + +\begin{vindent} +\begin{verbatim} +probe socket.sendmsg = kernel.function ("sock_sendmsg") { ... } +probe socket.do_write = kernel.function ("do_sock_write") { ... } +probe socket.send = socket.sendmsg, socket.do_write { ... } +\end{verbatim} +\end{vindent} +There are two types of aliases, the prologue style and the epilogue style +which are identified by the equal sign (\texttt{\textbf{=}}) and \char`\"{}\texttt{\textbf{+=}}\char`\"{} +respectively. + +A probe that names the new probe point will create an actual probe, with +the handler of the alias \emph{pre-pended}. + +This pre-pending behavior serves several purposes. It allows the alias definition +to pre-process the context of the probe before passing control to the handler +specified by the user. This has several possible uses, demonstrated as follows.
+ +\begin{vindent} +\begin{verbatim} +# Skip probe unless given condition is met: +if ($flag1 != $flag2) next + +# Supply values describing probes: +name = "foo" + +# Extract the target variable to a plain local variable: +var = $var +\end{verbatim} +\end{vindent} + +\subsubsection{Prologue-style aliases (=)} +\index{prologue-style aliases} +\index{=} +For a prologue style alias, the statement block that follows an alias definition +is implicitly added as a prologue to any probe that refers to the alias. +The following is an example. + +\begin{vindent} +\begin{verbatim} +# Defines a new probe point syscall.read, which expands to +# kernel.function("sys_read"), with the given statement as +# a prologue. +# +probe syscall.read = kernel.function("sys_read") { + fildes = $fd +} +\end{verbatim} +\end{vindent} + +\subsubsection{Epilogue-style aliases (+=)} +\index{epilogue-style aliases} +\index{+=} +The statement block that follows an alias definition is implicitly added +as an epilogue to any probe that refers to the alias. The following is an +example: + +\begin{vindent} +\begin{verbatim} +# Defines a new probe point with the given statement as an +# epilogue. +# +probe syscall.read += kernel.function("sys_read") { + fildes = $fd +} +\end{verbatim} +\end{vindent} + +\subsubsection{Probe alias usage} + +Another probe definition may use a previously defined alias. The following +is an example. + +\begin{vindent} +\begin{verbatim} +probe syscall.read { + printf("reading fd=%d\n", fildes) +} +\end{verbatim} +\end{vindent} + +\subsubsection{Unused alias variables} +\index{unused variables} +An unused alias variable is a variable defined in a probe alias, usually +as one of a group of \texttt{var = \$var} assignments, which is not actually +used by the script probe that instantiates the alias. These variables are +discarded. 
+ +\subsection{Variables\label{sub:Variables}} +\index{variables} +Identifiers for variables and functions are alphanumeric sequences, and may +include the underscore (\_) and the dollar sign (\$) characters. They may +not start with a plain digit. Each variable is by default local to the probe +or function statement block where it is mentioned, and therefore its scope +and lifetime is limited to a particular probe or function invocation. Scalar +variables are implicitly typed as either string or integer. Associative arrays +also have a string or integer value, and a tuple of strings or integers serves +as a key. Arrays must be declared as global. Local arrays\index{local arrays} +are not allowed. + +The translator performs \emph{type inference} on all identifiers, including +array indexes and function parameters. Inconsistent type-related use of identifiers +results in an error. + +Variables may be declared global. Global variables are shared among all probes +and remain instantiated as long as the SystemTap session. There is one namespace +for all global variables, regardless of the script file in which they are +found. Because of possible concurrency limits, such as multiple probe handlers, +each global variable used by a probe is automatically read- or write-locked +while the handler is running. A global declaration may be written at the +outermost level anywhere in a script file, not just within a block of code. +The following declaration marks \texttt{var1} and \texttt{var2} as global. +The translator will infer a value type for each, and if the variable is used +as an array, its key types. + +\begin{vindent} +\begin{verbatim} +global var1[=<value>], var2[=<value>] +\end{verbatim} +\end{vindent} + +\subsection{Auxiliary functions\label{sub:Auxiliary-functions}} +\index{auxiliary functions} +General syntax: + +\begin{vindent} +\begin{verbatim} +function <name>[:<type>] ( <arg1>[:<type>], ... ) { <stmts> } +\end{verbatim} +\end{vindent} +SystemTap scripts may define subroutines to factor out common work.
Functions +may take any number of scalar arguments, and must return a single scalar +value. Scalars in this context are integers or strings. For more information +on scalars, see Section~\ref{sub:Variables} and Section~\ref{sub:Data-types}. +The following is an example function declaration. + +\begin{vindent} +\begin{verbatim} +function thisfn (arg1, arg2) { + return arg1 + arg2 +} +\end{verbatim} +\end{vindent} +Note the general absence of type declarations, which are inferred by the +translator. If desired, a function definition may include explicit type declarations +for its return value, its arguments, or both. This is helpful for embedded-C +functions. In the following example, the type inference engine need only +infer the type of arg2, a string. + +\begin{vindent} +\begin{verbatim} +function thatfn:string(arg1:long, arg2) { + return sprintf("%d%s", arg1, arg2) +} +\end{verbatim} +\end{vindent} +Functions may call others or themselves recursively, up to a fixed nesting +limit. See Section~\ref{sub:SystemTap-safety}. + + +\subsection{Embedded C\label{sub:Embedded-C}} +\index{embedded C} +SystemTap supports a \emph{guru\index{guru mode} mode} where script safety +features such as code and data memory reference protection are removed. Guru +mode is set by passing the {}``-g'' flag to the stap command. When in guru +mode, the translator accepts embedded code enclosed between {}``\%\{'' +and {}``\%\}'' markers in the script file. Embedded code is transcribed +verbatim, without analysis, in sequence, into generated C code. At the outermost +level of a script, guru mode may be useful to add \#include instructions, +or any auxiliary definitions for use by other embedded code. + + +\subsection{Embedded C functions} + +General syntax: + +\begin{vindent} +\begin{verbatim} +function <name>:<type> ( <arg1>:<type>, ... ) %{ <C_stmts> %} +\end{verbatim} +\end{vindent} +Embedded code is permitted in a function body.
In that case, the script language +body is replaced entirely by a piece of C code enclosed between \%\{ and +\%\} markers. The enclosed code may do anything reasonable and safe as allowed +by the parser. + +There are a number of undocumented but complex safety constraints on concurrency, +resource consumption and runtime limits that are applied to code written +in the SystemTap language. These constraints are not applied to embedded +C code, so use such code with caution as it is used verbatim. Be especially +careful when dereferencing pointers. Use the kread() macro to dereference +any pointers that could potentially be invalid or dangerous. If you are unsure, +err on the side of caution and use kread(). The kread() macro is one of the +safety mechanisms used in code generated by embedded C. It protects against +pointer accesses that could crash the system. + +For example, to access the pointer chain \texttt{name = skb->dev->name} in +embedded C, use the following code. + +\begin{vindent} +\begin{verbatim} +struct net_device *dev; +char *name; +dev = kread(&(skb->dev)); +name = kread(&(dev->name)); +\end{verbatim} +\end{vindent} +The memory locations reserved for input and output values are provided to +a function using a macro named \texttt{THIS}\index{THIS}. The following +are examples. + +\begin{vindent} +\begin{verbatim} +function add_one (val) %{ + THIS->__retvalue = THIS->val + 1; +%} +function add_one_str (val) %{ + strlcpy (THIS->__retvalue, THIS->val, MAXSTRINGLEN); + strlcat (THIS->__retvalue, "one", MAXSTRINGLEN); +%} +\end{verbatim} +\end{vindent} +The function argument and return value types must be inferred by the translator +from the call sites in order for this method to work. You should examine +C code generated for ordinary script language functions to write compatible +embedded-C. Note that all SystemTap functions and probes run with interrupts +disabled, thus you cannot call functions that might sleep from within embedded +C.
+ +\section{Probe points\label{sec:Probe-Points}} +\index{probe points} +\subsection{General syntax} +\index{probe syntax} +The general probe point syntax is a dotted-symbol sequence. This divides +the event namespace into parts, analogous to the style of the Domain Name +System. Each component identifier is parameterized by a string or number +literal, with a syntax analogous to a function call. + +The following are all syntactically valid probe points. + +\begin{vindent} +\begin{verbatim} +kernel.function("foo") +kernel.function("foo").return +module("ext3").function("ext3_*") +kernel.function("no_such_function") ? +syscall.* +end +timer.ms(5000) +\end{verbatim} +\end{vindent} +Probes may be broadly classified into \emph{synchronous}\index{synchronous} +or \emph{asynchronous}.\index{asynchronous} A synchronous event occurs when +any processor executes an instruction matched by the specification. This +gives these probes a reference point (instruction address) from which more +contextual data may be available. Other families of probe points refer to +asynchronous events such as timers, where no fixed reference point is related. +Each probe point specification may match multiple locations, such as by using +wildcards or aliases, and all are probed. A probe declaration may contain +several specifications separated by commas, which are all probed. + +\subsubsection{Prefixes} +\index{prefixes} +Prefixes specify the probe target, such as \textbf{kernel}, \textbf{module}, +\textbf{timer}, and so on. + +\subsubsection{Suffixes} +\index{suffixes} +Suffixes further qualify the point to probe, such as \textbf{.return} for the +exit point of a probed function. The absence of a suffix implies the function +entry point. + +\subsubsection{Wildcarded file names, function names} +\index{wildcards} +A component may include an asterisk ({*}) character, which expands to other +matching probe points. An example follows.
+ +\begin{vindent} +\begin{verbatim} +kernel.syscall.* +kernel.function("sys_*") +\end{verbatim} +\end{vindent} + +\subsubsection{Optional probe points\label{sub:Optional-probe-points}} +\index{?} +A probe point may be followed by a question mark (?) character, to indicate +that it is optional, and that no error should result if it fails to expand. +This effect passes down through all levels of alias or wildcard expansion. + +The following is the general syntax. + +\begin{vindent} +\begin{verbatim} +kernel.function("no_such_function") ? +\end{verbatim} +\end{vindent} + +\subsection{Built-in probe point types (DWARF probes)} +\index{built-in probes} +\index{dwarf probes} +This family of probe points uses symbolic debugging information for the target +kernel or module, as may be found in executables that have not +been stripped, or in the separate \textbf{debuginfo} packages. They allow +logical placement of probes into the execution path of the target +by specifying a set of points in the source or object code. When a matching +statement executes on any processor, the probe handler is run in that context. + +Points in a kernel are identified by module, source file, line number, function +name or some combination of these. + +Here is a list of probe point specifications currently supported: + +\begin{vindent} +\begin{verbatim} +kernel.function(PATTERN) +kernel.function(PATTERN).call +kernel.function(PATTERN).return +kernel.function(PATTERN).return.maxactive(VALUE) +kernel.function(PATTERN).inline +module(MPATTERN).function(PATTERN) +module(MPATTERN).function(PATTERN).call +module(MPATTERN).function(PATTERN).return.maxactive(VALUE) +module(MPATTERN).function(PATTERN).inline +kernel.statement(PATTERN) +kernel.statement(ADDRESS).absolute +module(MPATTERN).statement(PATTERN) +\end{verbatim} +\end{vindent} + +The \textbf{.function} variant places a probe near the beginning of the named +function, so that parameters are available as context variables.
+ +The \textbf{.return} variant places a probe at the moment of return from the named +function, so the return value is available as the \$return context variable. +The entry parameters are also available, though the function may have changed +their values. Return probes may be further qualified with \textbf{.maxactive}, +which specifies how many instances of the specified function can be probed simultaneously. +You can leave off \textbf{.maxactive} in most cases, as the default should be sufficient. +However, if you notice an excessive number of skipped probes, try setting \textbf{.maxactive} +to incrementally higher values to see if the number of skipped probes decreases. + +The \textbf{.inline} modifier for \textbf{.function} filters the results to include only +instances of inlined functions. The \textbf{.call} modifier selects the opposite subset. +Inline functions do not have an identifiable return point, so \textbf{.return} +is not supported on \textbf{.inline} probes. + +The \textbf{.statement} variant places a probe at the exact spot, exposing those local +variables that are visible there. + +In the above probe descriptions, MPATTERN stands for a string literal +that identifies the loaded kernel module of interest. It may include asterisk +({*}), square brackets \char`\"{}{[}]\char`\"{}, and question mark (?) wildcards. +PATTERN stands for a string literal that identifies a point in the program. +It is composed of three parts: + +\begin{enumerate} +\item The first part is the name of a function, as would appear in the nm program's +output. This part may use the asterisk and question mark wildcard operators +to match multiple names. +\item The second part is optional, and begins with the at sign (@) character. +It is followed by the path to the source file containing the function, +which may include a wildcard pattern, such as mm/slab{*}.
+In most cases, the path should be relative to the top of the +linux source directory, although an absolute path may be necessary for some kernels. +If a relative pathname doesn't work, try absolute. +\item The third part is optional if the file name part was given. It identifies +the line number in the source file, preceded by a colon. +\end{enumerate} +Alternately, specify PATTERN as a numeric constant to indicate a relative +module address or an absolute kernel address. + +Some of the source-level variables, such as function parameters, locals, +or globals visible in the compilation unit, are visible to probe handlers. +Refer to these variables by prefixing their name with a dollar sign within +the scripts. In addition, a special syntax allows limited traversal of structures, +pointers, and arrays. + +\texttt{\$var} refers to an in-scope variable var. If it is a type similar +to an integer, it will be cast to a 64-bit integer for script use. Pointers +similar to a string (char {*}) are copied to SystemTap string values by the +kernel\_string() or user\_string() functions. + +\texttt{\$var->field} traverses a structure's field. The indirection operator +may be repeated to follow additional levels of pointers. + +\texttt{\$var{[}N]} indexes into an array. The index is given with a literal +number. + +\subsubsection{kernel.function, module().function} +\index{kernel.function} +\index{module().function} +The \textbf{.function} variant places a probe near the beginning of the named function, +so that parameters are available as context variables.
+ +General syntax: + +\begin{vindent} +\begin{verbatim} +kernel.function("func[@file]") +module("modname").function("func[@file]") +\end{verbatim} +\end{vindent} +Examples: + +\begin{vindent} +\begin{verbatim} +# Refers to all kernel functions with "init" or "exit" +# in the name: +kernel.function("*init*"), kernel.function("*exit*") + +# Refers to any functions within the "kernel/sched.c" +# file that span line 240: +kernel.function("*@kernel/sched.c:240") + +# Refers to all functions in the ext3 module: +module("ext3").function("*") +\end{verbatim} +\end{vindent} + +\subsubsection{kernel.statement, module().statement} +\index{kernel.statement} +\index{module().statement} +The \textbf{.statement} variant places a probe at the exact spot, exposing those local +variables that are visible there. + +General syntax: + +\begin{vindent} +\begin{verbatim} +kernel.statement("func@file:linenumber") +module("modname").statement("func@file:linenumber") +\end{verbatim} +\end{vindent} +Example: + +\begin{vindent} +\begin{verbatim} +# Refers to the statement at line 2917 within the +# kernel/sched.c file: +kernel.statement("*@kernel/sched.c:2917") +\end{verbatim} +\end{vindent} + +\begin{comment} +\subsection{Marker probes} + +This family of probe points connects to static probe markers inserted into +the kernel or a module. These markers are special macro calls in the kernel +that make probing faster and more reliable than with DWARF-based probes. +DWARF debugging information is not required to use probe markers. + +Marker probe points begin with a kernel or module(\char`\"{}\emph{name}\char`\"{}) +prefix, the same as DWARF probes. This prefix identifies the source of the +symbol table used for finding markers. The suffix names the marker itself: +mark(\char`\"{}\emph{name}\char`\"{}). The marker name string, which may +contain wildcard characters, is matched against the names given to the marker +macros when the kernel or module was compiled.
+ +The handler associated with a marker probe reads any optional parameters +specified at the macro call site named \$arg1 through \$argNN, where NN is +the number of parameters supplied by the macro. Number and string parameters +are passed in a type-safe manner. +\end{comment} + +\subsection{Timer probes} +\index{timer probes} +You can use intervals defined by the standard kernel jiffies\index{jiffies} +timer to trigger probe handlers asynchronously. A \emph{jiffy} is a kernel-defined +unit of time typically between 1 and 60 msec. Two probe point variants are +supported by the translator: + +\begin{vindent} +\begin{verbatim} +timer.jiffies(N) +timer.jiffies(N).randomize(M) +\end{verbatim} +\end{vindent} +The probe handler runs every N jiffies. If the \texttt{randomize}\index{randomize} +component is given, a linearly distributed random value in the range {[}-M +\ldots{} +M] is added to N every time the handler executes. N is restricted +to a reasonable range (1 to approximately 1,000,000), and M is restricted +to be less than N. There are no target variables provided in either context. +Probes can be run concurrently on multiple processors. + +Intervals may be specified in units of time. There are two probe point variants +similar to the jiffies timer: + +\begin{vindent} +\begin{verbatim} +timer.ms(N) +timer.ms(N).randomize(M) +\end{verbatim} +\end{vindent} +Here, N and M are specified in milliseconds\index{milliseconds}, but the +full options for units are seconds (s or sec), milliseconds (ms or msec), +microseconds (us or usec), nanoseconds (ns or nsec), and hertz (hz). Randomization +is not supported for hertz timers. + +The resolution of the timers depends on the target kernel. For kernels prior +to 2.6.17, timers are limited to jiffies resolution, so intervals are rounded +up to the nearest jiffies interval. After 2.6.17, the implementation uses +hrtimers for tighter precision, though the resulting resolution will be dependent +upon architecture. 
In either case, if the randomize component is given, then +the random value will be added to the interval before any rounding occurs. + +Profiling timers are available to provide probes that execute on all CPUs +at each system tick. This probe takes no parameters, as follows. + +\begin{vindent} +\begin{verbatim} +timer.profile +\end{verbatim} +\end{vindent} +Full context information of the interrupted process is available, making +this probe suitable for implementing a time-based sampling profiler. + +The following is an example of timer usage. + +\begin{vindent} +\begin{verbatim} +# Refers to a periodic interrupt, every 1000 jiffies: +timer.jiffies(1000) + +# Fires every 5 seconds: +timer.sec(5) + +# Refers to a periodic interrupt, every 1000 +/- 200 jiffies: +timer.jiffies(1000).randomize(200) +\end{verbatim} +\end{vindent} + +\subsection{Return probes} +\index{return probes} +The \texttt{.return} variant places a probe at the moment of return from +the named function, so that the return value is available as the \$return +context variable. The entry parameters are also accessible in the context +of the return probe, though their values may have been changed by the function. +Inline functions do not have an identifiable return point, so \texttt{.return} +is not supported on \texttt{.inline} probes. + + +\subsection{Special probe points} + +The probe points \texttt{begin} and \texttt{end} are defined by the translator +to refer to the time of session startup and shutdown. There are no target +variables available in either context. + + +\subsubsection{begin} +\index{begin} +The \texttt{begin} probe is the start of the SystemTap session. All \texttt{begin} +probe handlers are run during the startup of the session. All global variables +must be declared prior to this point. + + +\subsubsection{end} +\index{end} +The \texttt{end} probe is the end of the SystemTap session. 
All \texttt{end} +probes are run during the normal shutdown of a session, such as in the aftermath +of an \texttt{exit} function call, or an interruption from the user. In the +case of a shutdown triggered by error, \texttt{end} probes are not run. + + +\subsubsection{begin and end probe sequence} +\index{sequence} +\texttt{begin} and \texttt{end} probes are specified with an optional sequence +number that controls the order in which they are run. If no sequence number +is provided, the sequence number defaults to zero and probes are run in the +order that they occur in the script file. Sequence numbers may be either +positive or negative, and are especially useful for tapset writers who want +to do initialization in a \texttt{begin} probe. The following are examples. + +\begin{vindent} +\begin{verbatim} +# In a tapset file: +probe begin(-1000) { ... } + +# In a user script: +probe begin { ... } +\end{verbatim} +\end{vindent} +The user script \texttt{begin} probe defaults to sequence number zero, so +the tapset \texttt{begin} probe will run first. + + +\subsubsection{never} +\index{never} +The \texttt{never} probe point is defined by the translator to mean \emph{never}. +Its statements are analyzed for symbol and type correctness, but its probe +handler is never run. This probe point may be useful in conjunction with +optional probes. See Section~\ref{sub:Optional-probe-points}. + + +\begin{comment} % Comment out until perfmon code is reactivated +\subsection{Probes to monitor performance} + +The perfmon family of probe points is used to access the performance monitoring +hardware available in modern processors. These probe points require perfmon2 +support in the kernel to access the hardware. + +Performance monitor hardware points have a \texttt{perfmon} prefix. The suffix +names the event being counted, for example \texttt{counter(event)}.
The event +names are specific to the processor implementation, except for generic cycle +and instructions events, which are available on all processors. The probe +\texttt{perfmon.counter(event)} starts a counter on the processor which counts +the number of events that occur on that processor. For more details about +the performance monitoring events available on a specific processor, see +the help text returned by typing the perfmon2 command \texttt{pfmon -l.} + +\subsubsection{\$counter} + +\$counter is a handle used in the body of a probe for operations involving +the counter associated with the probe. + +\subsubsection{read\_counter} + +read\_counter is a function passed to the handle for a perfmon probe. It +returns the current count for the event. +\end{comment} + +\section{Language elements\label{sec:Language-Elements}} + + +\subsection{Identifiers} +\index{identifiers} +\emph{Identifiers} are used to name variables and functions. They are an +alphanumeric sequence that may include the underscore (\_) and dollar sign +(\$) characters. They have the same syntax as C identifiers, except that +the dollar sign is also a legal character. Identifiers that begin with a +dollar sign are interpreted as references to variables in the target software, +rather than to SystemTap script variables. Identifiers may not start with +a plain digit. + + +\subsection{Data types\label{sub:Data-types}} +\index{data types} +The SystemTap language includes a small number of data types, but no type +declarations. A variable's type is inferred\index{inference} from its use. +To support this inference, the translator enforces consistent typing of function +arguments and return values, array indices and values. There are no implicit +type conversions between strings and numbers. Inconsistent type-related use +of identifiers signals an error. + + +\subsubsection{Numbers} +\index{numbers} +Numbers are 64-bit signed integers. 
The parser will also accept (and wrap +around) values above positive $2^{63}$. + + +\subsubsection{Literals} +\index{literals} +Literals are either strings or integers. Literals can be expressed as decimal, +octal, or hexadecimal, using C notation. Type suffixes (e.g., \emph{L} or +\emph{U}) are not used. + + +\subsubsection{Integers\label{sub:Integers}} +\index{integers} +Integers are decimal, hexadecimal, or octal, and use the same notation as +in C. Integers are 64-bit signed quantities, although the parser also accepts +(and wraps around) values above positive $2^{63}$. + + +\subsubsection{Strings\label{sub:Strings}} +\index{strings} +Strings are enclosed in quotation marks ({}``string''), and pass through +standard C escape codes with backslashes. Strings are limited in length to +MAXSTRINGLEN. For more information about this and other limits, see Section~\ref{sub:SystemTap-safety}. + + +\subsubsection{Associative arrays} + +See Section~\ref{sec:Associative-Arrays} + + +\subsubsection{Statistics} + +See Section~\ref{sec:Statistics} + + +\subsection{Semicolons} +\index{;} +The semicolon is the null statement, or do nothing statement. It is optional, +and useful as a separator between statements to improve detection of syntax +errors and to reduce ambiguities in grammar. + + +\subsection{Comments} +\index{comments} +Three forms of comments are supported, as follows. + +\begin{vindent} +\begin{verbatim} +# ... shell style, to the end of line +// ... C++ style, to the end of line +/* ... C style ... */ +\end{verbatim} +\end{vindent} + +\subsection{Whitespace} +\index{whitespace} +As in C, spaces, tabs, returns, newlines, and comments are treated as whitespace. +Whitespace is ignored by the parser. + + +\subsection{Expressions} +\index{expressions} +SystemTap supports a number of operators that use the same general syntax, +semantics, and precedence as in C and awk. Arithmetic is performed per C +rules for signed integers. 
If the parser detects division by zero or an overflow, +it generates an error. The following subsections list these operators. + + +\subsubsection{Binary numeric operators} +\index{binary} +\texttt{{*} / \% + - >\,{}> <\,{}< \& \textasciicircum{} +| \&\& ||} + + +\subsubsection{Binary string operators} +\index{binary} +\texttt{\textbf{.}} (string concatenation) + + +\subsubsection{Numeric assignment operators} +\index{numeric} +\texttt{= {*}= /= \%= += -= >\,{}>= <\,{}<= +\&= \textasciicircum{}= |=} + + +\subsubsection{String assignment operators} + +\texttt{= .=} + + +\subsubsection{Unary numeric operators} +\index{unary} +\texttt{+ - ! \textasciitilde{} ++ -{}-} + + +\subsubsection{Binary numeric or string comparison operators} +\index{comparison} +\texttt{< > <= >= == !=} + + +\subsubsection{Ternary operator\label{sub:Ternary-operator}} +\index{?} +\texttt{cond ? exp1 : exp2} + + +\subsubsection{Grouping operator} +\index{grouping} +\texttt{( exp )} + + +\subsubsection{Function call} +\index{fn} +General syntax: + +\texttt{fn ({[} arg1, arg2, ... ])} + + +\subsubsection{\$ptr-\textgreater member} +\index{pointer} +\texttt{ptr} is a kernel pointer available in a probed context. + + +\subsubsection{\textless value\textgreater\ in \textless array\_name\textgreater} +\index{index} +This expression evaluates to true if the array contains an element with the +specified index. + + +\subsubsection{{[} \textless value\textgreater, ... ] in \textless array\_name\textgreater} + +The number of index values must match the number of indexes previously specified. + + +\subsection{Literals passed in from the stap command line\label{sub:Literals-passed-in}} +\index{literals} +\emph{Literals} are either strings enclosed in double quotes ('' '') or +integers. For information about integers, see Section~\ref{sub:Integers}. +For information about strings, see Section~\ref{sub:Strings}. + +Script arguments at the end of a command line are expanded as literals. 
You +can use these in all contexts where literals are accepted. A reference to +a nonexistent argument number is an error. + + +\subsubsection{\$1 \ldots{} \$\textless NN\textgreater\ for integers} +\index{\$} +Use \texttt{\$1 \ldots{} \$\textless NN\textgreater} for casting as a numeric literal. + + +\subsubsection{@1 \ldots{} @\textless NN\textgreater\ for strings} + +Use \texttt{@1 \ldots{} @\textless NN\textgreater} for casting as a string literal. + + +\subsubsection{Examples} + +For example, if the following script named example.stp + +\begin{vindent} +\begin{verbatim} +probe begin { printf("%d, %s\n", $1, @2) } +\end{verbatim} +\end{vindent} +is invoked as follows + +\begin{vindent} +\begin{verbatim} +# stap example.stp 10 mystring +\end{verbatim} +\end{vindent} +then 10 is substituted for \$1 and \char`\"{}mystring\char`\"{} for @2. The +output will be + +\begin{vindent} +\begin{verbatim} +10, mystring +\end{verbatim} +\end{vindent} + +\subsection{Conditional compilation} + + +\subsubsection{Conditions} +\index{conditions} +One of the steps of parsing is a simple conditional preprocessing stage. +The general form of this is similar to the ternary operator (Section~\ref{sub:Ternary-operator}). + +\begin{vindent} +\begin{verbatim} +%( CONDITION %? TRUE-TOKENS %) +%( CONDITION %? TRUE-TOKENS %: FALSE-TOKENS %) +\end{verbatim} +\end{vindent} +The CONDITION is a limited expression whose format is determined by its first +keyword. The following is the general syntax. + +\begin{vindent} +\begin{verbatim} +%( <condition> %? <code>
[ %: <code> ] %) +\end{verbatim} +\end{vindent} + +\subsubsection{Conditions based on kernel version: kernel\_v, kernel\_vr} +\index{kernel version} +\index{kernel\_vr} +\index{kernel\_v} +If the first part of a conditional expression is the identifier \texttt{kernel\_v} +or \texttt{kernel\_vr}, the second part must be one of six standard numeric +comparison operators {}``\textless'', {}``\textless ='', {}``=='', {}``!='', {}``\textgreater'', +or {}``\textgreater ='', +and the third part must be a string literal that contains an RPM-style version-release +value. The condition returns true if the version of the target kernel (as +optionally overridden by the \textbf{-r} option) matches the given version +string. The comparison is performed by the glibc function strverscmp. + +\texttt{kernel\_v} refers to the kernel version number only, such as {}``2.6.13''. + +\texttt{kernel\_vr} refers to the kernel version number including the release +code suffix, such as {}``2.6.13-1.322FC3smp''. + + +\subsubsection{Conditions based on architecture: arch} +\index{arch} +If the first part of the conditional expression is the identifier \texttt{arch} +which refers to the processor architecture, then the second part is a string +comparison operator ''=='' or ''!='', and the third part is a string +literal for matching it. This comparison is a simple string equality or inequality. +The currently supported architecture strings are i386, i686, x86\_64, ia64, +s390x and ppc64. + + +\subsubsection{True and False Tokens} +\index{tokens} +TRUE-TOKENS and FALSE-TOKENS are zero or more general parser tokens, possibly +including nested preprocessor conditionals, that are pasted into the input +stream if the condition is true or false. For example, the following code +induces a parse error unless the target kernel version is newer than 2.6.5. + +\begin{vindent} +\begin{verbatim} +%( kernel_v <= "2.6.5" %?
**ERROR** %) # invalid token sequence +\end{verbatim} +\end{vindent} +The following code adapts to hypothetical kernel version drift. + +\begin{vindent} +\begin{verbatim} +probe kernel.function ( + %( kernel_v <= "2.6.12" %? "__mm_do_fault" %: + %( kernel_vr == "2.6.13-1.8273FC3smp" %? "do_page_fault" %: UNSUPPORTED %) + %)) { /* ... */ } + +%( arch == "ia64" %? + probe syscall.vliw = kernel.function("vliw_widget") {} +%) +\end{verbatim} +\end{vindent} + +\section{Statement types\label{sec:Statement-Types}} + +Statements enable procedural control flow within functions and probe handlers. +The total number of statements executed in response to any single probe event +is limited to MAXACTION, which defaults to 1000. See Section~\ref{sub:SystemTap-safety}. + + +\subsection{break and continue} +\index{break} +\index{continue} +Use \texttt{break} or \texttt{continue} to exit or iterate the innermost +nesting loop statement, such as within a \texttt{while, for,} or \texttt{foreach} +statement. The syntax and semantics are the same as those used in C. + + +\subsection{delete} +\index{delete} +\texttt{delete} removes an element. + +The following statement removes from ARRAY the element specified by the index +tuple. The value will no longer be available, and subsequent iterations will +not report the element. It is not an error to delete an element that does +not exist. + +\begin{vindent} +\begin{verbatim} +delete ARRAY[INDEX1, INDEX2, ...] +\end{verbatim} +\end{vindent} +The following syntax removes all elements from ARRAY: + +\begin{vindent} +\begin{verbatim} +delete ARRAY +\end{verbatim} +\end{vindent} +The following statement removes the value of SCALAR. Integers and strings +are cleared to zero and null (\char`\"{}\char`\"{}) respectively, while statistics +are reset to their initial empty state. 
+ +\begin{vindent} +\begin{verbatim} +delete SCALAR +\end{verbatim} +\end{vindent} + +\subsection{do} +\index{do} +The \texttt{do} statement has the same syntax and semantics as in C. + +\begin{vindent} +\begin{verbatim} +do STMT while (EXP) +\end{verbatim} +\end{vindent} + +\subsection{EXP (expression)} +\index{expression} +An \texttt{expression} executes a string- or integer-valued expression and +discards the value. + + +\subsection{for} +\index{for} +General syntax: +\begin{vindent} +\begin{verbatim} +for (EXP1; EXP2; EXP3) STMT +\end{verbatim} +\end{vindent} +The \texttt{for} statement is similar to the \texttt{for} statement in C. +The \texttt{for} expression executes EXP1 as initialization. While EXP2 is +non-zero, it executes STMT, then the iteration expression EXP3. + +\subsection{foreach\label{sub:foreach}} +\index{foreach} +General syntax: +\begin{vindent} +\begin{verbatim} +foreach (VAR in ARRAY) STMT +\end{verbatim} +\end{vindent} +The \texttt{foreach} statement loops over each element of a named global array, assigning +the current key to VAR. The array must not be modified within the statement. +If you add a single plus (+) or minus (-) operator after the VAR or the ARRAY +identifier, the iteration order will be sorted by the ascending or descending +index or value. + +The following statement behaves the same as the first example, except it +is used when an array is indexed with a tuple of keys. Use a sorting suffix +on at most one VAR or ARRAY identifier. + +\begin{vindent} +\begin{verbatim} +foreach ([VAR1, VAR2, ...] in ARRAY) STMT +\end{verbatim} +\end{vindent} +The following statement is the same as the first example, except that the +\texttt{limit} keyword limits the number of loop iterations to EXP times. +EXP is evaluated once at the beginning of the loop. 
+ +\begin{vindent} +\begin{verbatim} +foreach (VAR in ARRAY limit EXP) STMT +\end{verbatim} +\end{vindent} + +\subsection{if} +\index{if} +General syntax: + +\begin{vindent} +\begin{verbatim} +if (EXP) STMT1 [ else STMT2 ] +\end{verbatim} +\end{vindent} +The \texttt{if} statement compares an integer-valued EXP to zero. It executes +the first STMT if non-zero, or the second STMT if zero. + +The \texttt{if} command has the same syntax and semantics as used in C. + + +\subsection{next} +\index{next} +The \texttt{next} statement returns immediately from the enclosing probe +handler. + + +\subsection{; (null statement)} +\index{;} +\index{null statement} +General syntax: + +\begin{vindent} +\begin{verbatim} +statement1 +; +statement2 +\end{verbatim} +\end{vindent} +The semicolon represents the null statement, or do nothing. It is useful +as an optional separator between statements to improve syntax error detection +and to handle certain grammar ambiguities. + + +\subsection{return} +\index{return} +General syntax: + +\begin{vindent} +\begin{verbatim} +return EXP +\end{verbatim} +\end{vindent} +The \texttt{return} statement returns the EXP value from the enclosing function. +If the value of the function is not returned, then a return statement is +not needed, and the function will have a special \emph{unknown} type with +no return value. + +\subsection{\{ \} (statement block)} +\index{\{ \}} +\index{statement block} +This is the statement block with zero or more statements enclosed within +brackets. The following is the general syntax: + +\begin{vindent} +\begin{verbatim} +{ STMT1 STMT2 ... } +\end{verbatim} +\end{vindent} +The statement block executes each statement in sequence in the block. Separators +or terminators are generally not necessary between statements. The statement +block uses the same syntax and semantics as in C. 
+ + +\subsection{while} +\index{while} +General syntax: + +\begin{vindent} +\begin{verbatim} +while (EXP) STMT +\end{verbatim} +\end{vindent} +The \texttt{while} statement uses the same syntax and semantics as in C. +In the statement above, while the integer-valued EXP evaluates to non-zero, +the parser will execute STMT. + + +\section{Associative arrays\label{sec:Associative-Arrays}} +\index{associative arrays} +Associative arrays are implemented as hash tables with a maximum size set +at startup. Associative arrays are too large to be created dynamically for +individual probe handler runs, so they must be declared as global. The basic +operations for arrays are setting and looking up elements. These operations +are expressed in awk syntax: the array name followed by an opening bracket +({[}), a comma-separated list of up to five index expressions, and +a closing bracket (]). Each index expression may be a string or a number, +as long as it is consistently typed throughout the script. + + +\subsection{Examples} + +\begin{vindent} +\begin{verbatim} +# Increment the named array slot: +foo [4,"hello"] ++ + +# Update a statistic: +processusage [uid(),execname()] ++ + +# Set a timestamp reference point: +times [tid()] = get_cycles() + +# Compute a timestamp delta: +delta = get_cycles() - times [tid()] +\end{verbatim} +\end{vindent} + +\subsection{Types of values} + +Array elements may be set to a number or a string. The type must be consistent +throughout the use of the array. The first assignment to the array defines +the type of the elements. Unset array elements may be fetched and return +a null value (zero or empty string) as appropriate, but they are not seen +by a membership test. + + +\subsection{Array capacity} + +Array sizes can be specified explicitly or allowed to default to the maximum +size as defined by MAXMAPENTRIES. See Section~\ref{sub:SystemTap-safety} +for details on changing MAXMAPENTRIES.
+ +You can explicitly specify the size of an array as follows: + +\begin{vindent} +\begin{verbatim} +global ARRAY[<size>] +\end{verbatim} +\end{vindent} +If you do not specify the size parameter, then the array is created to hold +MAXMAPENTRIES number of elements. + + +\subsection{Iteration, foreach} +\index{foreach} +Like awk, SystemTap's foreach creates a loop that iterates over key tuples +of an array, not only values. The iteration may be sorted by any single key +or a value by adding an extra plus symbol (+) or minus symbol (-) to the +code. The following are examples. + +\begin{vindent} +\begin{verbatim} +# Simple loop in arbitrary sequence: +foreach ([a,b] in foo) + fuss_with(foo[a,b]) + +# Loop in increasing sequence of value: +foreach ([a,b] in foo+) { ... } + +# Loop in decreasing sequence of first key: +foreach ([a-,b] in foo) { ... } +\end{verbatim} +\end{vindent} +The \texttt{break} and \texttt{continue} statements also work inside foreach +loops. Since arrays can be large but probe handlers must execute quickly, +you should write scripts that exit iteration early, if possible. For simplicity, +SystemTap forbids any modification of an array during iteration with a foreach. + + +\section{Statistics (aggregates)\label{sec:Statistics}} +\index{aggregates} +Aggregate instances are used to collect statistics on numerical values, when +it is important to accumulate new data quickly and in large volume. These +instances operate without exclusive locks, and store only aggregated stream +statistics. Aggregates make sense only for global variables. They are stored +individually or as elements of an array. + +\subsection{The aggregation (\textless\hspace{1 sp}\textless\hspace{1 sp}\textless) operator} +\index{\textless\hspace{1 sp}\textless\hspace{1 sp}\textless} +The aggregation operator is {}``\textless\hspace{1 sp}\textless\hspace{1 sp}\textless'', +and its effect is similar to an assignment or a C++ output streaming operation.
+The left operand specifies a scalar or array-index \emph{l-value}, which +must be declared global. The right operand is a numeric expression. The meaning +is intuitive: add the given number to the set of numbers to compute their +statistics. The specific list of statistics to gather is given separately +by the extraction functions. The following is an example. + +\begin{vindent} +\begin{verbatim} +a <<< delta_timestamp +writes[execname()] <<< count +\end{verbatim} +\end{vindent} + +\subsection{Extraction functions} +\index{extraction} +For each instance of a distinct extraction function operating on a given +identifier, the translator computes a set of statistics. With each execution +of an extraction function, the aggregation is computed for that moment across +all processors. The first argument of each function is the same style of +l-value as used on the left side of the aggregation operation. + + +\subsection{Integer extractors} + +The following functions provide methods to extract information about integer +values. + + +\subsubsection{@count(s)} +\index{count} +This statement returns the number of all values accumulated into s. + + +\subsubsection{@sum(s)} +\index{sum} +This statement returns the total of all values accumulated into s. + + +\subsubsection{@min(s)} +\index{min} +This statement returns the minimum of all values accumulated into s. + + +\subsubsection{@max(s)} +\index{max} +This statement returns the maximum of all values accumulated into s. + + +\subsubsection{@avg(s)} +\index{avg} +This statement returns the average of all values accumulated into s. + + +\subsection{Histogram extractors} +\index{histograms} +The following functions provide methods to extract histogram information. +Printing a histogram with the print family of functions renders a histogram +object as a tabular "ASCII art" bar chart. 
+ +\subsubsection{@hist\_linear} +\index{hist\_linear} +The statement \texttt{@hist\_linear(v,L,H,W)} represents a linear histogram +\texttt{v}, where \emph{L} and \emph{H} represent the lower and upper end of +a range of values and \emph{W} represents the width (or size) of each bucket +within the range. The low and high values can be negative, but the overall +difference (high minus low) must be positive. The width parameter must also +be positive. + +In the output, a range of consecutive empty buckets may be replaced with a tilde +(\textasciitilde{}) character. This can be controlled on the command line +with -DHIST\_ELISION=\textless\hspace{1 sp}num\textgreater\hspace{1 sp}, +where \textless\hspace{1 sp}num\textgreater\hspace{1 sp} specifies how many +empty buckets at the top and bottom of the range to print. +The default is 2. A \textless\hspace{1 sp}num\textgreater\hspace{1 sp} of 0 +removes all empty buckets. A negative \textless\hspace{1 sp}num\textgreater\hspace{1 sp} +turns off bucket removal altogether. + +For example, if you specify -DHIST\_ELISION=3 and the histogram has 10 +consecutive empty buckets, the first 3 and last 3 empty buckets will +be printed and the middle 4 empty buckets will be represented by a +tilde (\textasciitilde{}). + +The following is an example. + +\begin{vindent} +\begin{verbatim} +global reads +probe netdev.receive { + reads <<< length +} +probe end { + print(@hist_linear(reads, 0, 10240, 200)) +} +\end{verbatim} +\end{vindent} +This generates the following output. + +\pagebreak +\begin{vindent} +\begin{verbatim} +value |-------------------------------------------------- count + 0 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 1650 + 200 | 8 + 400 | 0 + 600 | 0 + ~ + 1000 | 0 + 1200 | 0 + 1400 | 1 + 1600 | 0 + 1800 | 0 +\end{verbatim} +\end{vindent} +This shows that 1650 network reads were of a size between 0 and 200 bytes, +8 reads were between 200 and 400 bytes, and 1 read was between +1400 and 1600 bytes.
The tilde (\textasciitilde{}) character indicates +that the empty bucket at 800 was removed. +Empty buckets at the upper end were also removed. + +\subsubsection{@hist\_log} +\index{hist\_log} +The statement \texttt{@hist\_log(v)} represents a base-2 logarithmic +histogram. Empty buckets are replaced with a tilde (\textasciitilde{}) +character in the same way as \texttt{@hist\_linear()} (see above). + +The following is an example. + +\begin{vindent} +\begin{verbatim} +global reads +probe netdev.receive { + reads <<< length +} +probe end { + print(@hist_log(reads)) +} +\end{verbatim} +\end{vindent} +This generates the following output. + +\begin{vindent} +\begin{verbatim} +value |-------------------------------------------------- count + 8 | 0 + 16 | 0 + 32 | 254 + 64 | 3 + 128 | 2 + 256 | 2 + 512 | 4 + 1024 |@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ 16689 + 2048 | 0 + 4096 | 0 +\end{verbatim} +\end{vindent} + +\section{Predefined functions\label{sec:Predefined-Functions}} + +Unlike built-in functions, predefined functions are implemented in tapsets. + + +\subsection{Output functions} + +The following sections describe the functions you can use to output data. + + +\subsubsection{error} +\index{error} +General syntax: + +\begin{vindent} +\begin{verbatim} +error:unknown (msg:string) +\end{verbatim} +\end{vindent} +This function logs the given string to the error stream. It appends an implicit +end-of-line. It blocks any further execution of statements in this probe. +If the number of errors exceeds the MAXERRORS parameter, it triggers an \texttt{exit}. + + +\subsubsection{log} +\index{log} +General syntax: + +\begin{vindent} +\begin{verbatim} +log:unknown (msg:string) +log (const char *fmt, ...) +\end{verbatim} +\end{vindent} +This function logs data. \texttt{log} sends the message immediately to staprun +and to the bulk transport (relayfs) if it is being used. If the last character +given is not a newline, then one is added.
+ +This function is not as efficient as printf and should only be used for urgent +messages. + +\subsubsection{print} +\index{print} +General syntax: + +\begin{vindent} +\begin{verbatim} +print:unknown () +\end{verbatim} +\end{vindent} +This function prints a single value of any type. + + +\subsubsection{printf} +\index{printf} +General syntax: + +\begin{vindent} +\begin{verbatim} +printf:unknown (fmt:string, ) +\end{verbatim} +\end{vindent} +The printf function takes a formatting string as an argument, and a number +of values of corresponding types, and prints them all. The format must be a +literal string constant. The printf formatting directives are similar to those +of C, except that they are fully checked for type by the translator. + +The formatting string can contain tags that are defined as follows: + +\begin{vindent} +\begin{verbatim} +%[flags][width][.precision][length]specifier +\end{verbatim} +\end{vindent} +Where \texttt{specifier} is required and defines the type and the interpretation +of the value of the corresponding argument. The following table shows the +details of the specifier parameter: + +\begin{table}[H] +\caption{printf specifier values} +\begin{tabular}{|>{\raggedright}p{1in}|>{\raggedright}p{3.5in}|>{\raggedright}p{1.25in}|} +\hline +\textbf{Specifier}& +\textbf{Output}& +\textbf{Example}\tabularnewline +\hline +\hline +d or i& +Signed decimal& +392\tabularnewline +\hline +o& +Unsigned octal& +610\tabularnewline +\hline +s& +String& +sample\tabularnewline +\hline +u& +Unsigned decimal& +7235\tabularnewline +\hline +x& +Unsigned hexadecimal (lowercase letters)& +7fa\tabularnewline +\hline +X& +Unsigned hexadecimal (uppercase letters)& +7FA\tabularnewline +\hline +p& +Pointer address& +0x0000000000bc614e\tabularnewline +\hline +n& +Writes a binary value that is the total length of the string written by printf. +The field width specifies the number of bytes to write. Valid specifications +are \%n, \%1n, \%2n and \%4n. 
The default is 2.& +See below\tabularnewline +\hline +b& +Writes a binary value as text. The field width specifies the number of bytes +to write. Valid specifications are \%b, \%1b, \%2b, \%4b and \%8b. The default +width is 4 (32-bits).& +See below\tabularnewline +\hline +\%& +A \% followed by another \% character will write \% to stdout.& +\%\tabularnewline +\hline +\end{tabular} +\end{table} +The tag can also contain \texttt{flags}, \texttt{width}, \texttt{.precision} +and \texttt{modifiers} sub-specifiers, which are optional and follow these +specifications: + +\begin{table}[H] +\caption{printf flag values} +\begin{tabular}{|>{\raggedright}p{1.5in}|>{\raggedright}p{4.5in}|} +\hline +\textbf{Flags}& +\textbf{Description}\tabularnewline +\hline +\hline +- (minus sign)& +Left-justify within the given field width. Right justification is the default +(see \texttt{width} sub-specifier).\tabularnewline +\hline ++ (plus sign)& +Precede the result with a plus or minus sign even for positive numbers. By +default, only negative numbers are preceded with a minus sign.\tabularnewline +\hline +(space)& +If no sign is going to be written, a blank space is inserted before the value.\tabularnewline +\hline +\#& +Used with \texttt{o}, \texttt{x} or \texttt{X} specifiers the value is preceded +with \texttt{0}, \texttt{0x} or \texttt{0X} respectively for non-zero values.\tabularnewline +\hline +0& +Left-pads the number with zeroes instead of spaces, where padding is specified +(see \texttt{width} sub-specifier).\tabularnewline +\hline +\end{tabular} +\end{table} + +\begin{table}[H] +\caption{printf width values} +\begin{tabular}{|>{\raggedright}p{1.5in}|>{\raggedright}p{4.5in}|} +\hline +\textbf{Width}& +\textbf{Description}\tabularnewline +\hline +\hline +(number)& +Minimum number of characters to be printed. If the value to be printed is +shorter than this number, the result is padded with blank spaces. 
The value +is not truncated even if the result is larger.\tabularnewline +\hline +\end{tabular} +\end{table} + +% +\begin{table}[H] + +\caption{printf precision values} + +\begin{tabular}{|>{\raggedright}p{1.5in}|>{\raggedright}p{4.5in}|} +\hline +\textbf{Precision}& +\textbf{Description}\tabularnewline +\hline +\hline +.number& +For integer specifiers (\texttt{d, i, o, u, x, X}): \texttt{precision} specifies +the minimum number of digits to be written. If the value to be written is +shorter than this number, the result is padded with leading zeros. The value +is not truncated even if the result is longer. A precision of 0 means that +no character is written for the value 0. For s: this is the maximum number +of characters to be printed. By default all characters are printed until +the ending null character is encountered. When no \texttt{precision} is specified, +the default is 1. If the period is specified without an explicit value for +\texttt{precision}, 0 is assumed.\tabularnewline +\hline +\end{tabular} +\end{table} + +\textbf{Binary Write Examples} + +The following is an example of using the binary write functions: + +\begin{vindent} +\begin{verbatim} +probe begin { + for (i = 97; i < 110; i++) + printf("%3d: %1b%1b%1b\n", i, i, i-32, i-64) + exit() +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} + 97: aA! 
+ 98: bB" + 99: cC# +100: dD$ +101: eE% +102: fF& +103: gG' +104: hH( +105: iI) +106: jJ* +107: kK+ +108: lL, +109: mM- +\end{verbatim} +\end{vindent} +Another example: + +\begin{vindent} +\begin{verbatim} +stap -e 'probe begin{printf("%1n%b%b", 0xc0dedbad, \ +0x12345678);exit()}' | hexdump -C + +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +00000000 08 ad db de c0 78 56 34 12 |.....xV4.| +00000009 +\end{verbatim} +\end{vindent} +Another example: + +\begin{vindent} +\begin{verbatim} +probe begin{ + printf("%1b%1b%1blo %1b%1brld\n", 72,101,108,87,111) + exit() +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +Hello World +\end{verbatim} +\end{vindent} + +\subsubsection{printd} +\index{printd} +General syntax: + +\begin{vindent} +\begin{verbatim} +printd:unknown (delimiter:string, ) +\end{verbatim} +\end{vindent} +This function takes a string delimiter and two or more values of any type, then +prints the values with the delimiter interposed. The delimiter must be a +literal string constant. + +For example: +\begin{vindent} +\begin{verbatim} +printd("/", "one", "two", "three", 4, 5, 6) +\end{verbatim} +\end{vindent} +prints: +\begin{vindent} +\begin{verbatim} +one/two/three/4/5/6 +\end{verbatim} +\end{vindent} + +\subsubsection{printdln} +\index{printdln} +General syntax: + +\begin{vindent} +\begin{verbatim} +printdln:unknown () +\end{verbatim} +\end{vindent} +This function operates like \texttt{printd}, but also appends a newline. + +\subsubsection{println} +\index{println} +General syntax: + +\begin{vindent} +\begin{verbatim} +println:unknown () +\end{verbatim} +\end{vindent} +This function operates like \texttt{print}, but also appends a newline. + +\subsubsection{sprint} +\index{sprint} +General syntax: + +\begin{vindent} +\begin{verbatim} +sprint:unknown () +\end{verbatim} +\end{vindent} +This function operates like \texttt{print}, but returns the string rather +than printing it. 
+ +\subsubsection{sprintf} +\index{sprintf} +General syntax: + +\begin{vindent} +\begin{verbatim} +sprintf:unknown (fmt:string, ) +\end{verbatim} +\end{vindent} +This function operates like \texttt{printf}, but returns the formatted string +rather than printing it. + + +\subsubsection{system} +\index{system} +General syntax: + +\begin{vindent} +\begin{verbatim} +system (cmd:string) +\end{verbatim} +\end{vindent} +The system function runs a command on the system. The specified command runs +in the background once the current probe completes. + + +\subsubsection{warn} +\index{warn} +General syntax: + +\begin{vindent} +\begin{verbatim} +warn:unknown (msg:string) +\end{verbatim} +\end{vindent} +This function sends a warning message immediately to staprun. It is also +sent over the bulk transport (relayfs) if it is being used. If the last character +is not a newline, then one is added. + +\subsection{Context at the probe point} + +The following functions provide ways to access the current task context +at a probe point. Note that these may not return correct values when +a probe is hit in interrupt context. + +\subsubsection{backtrace} +\index{backtrace} +General syntax: + +\begin{vindent} +\begin{verbatim} +backtrace:string () +\end{verbatim} +\end{vindent} +Returns a string of hex addresses that are a backtrace of the +stack. The output is truncated to MAXSTRINGLEN. + +\subsubsection{caller} +\index{caller} +General syntax: + +\begin{vindent} +\begin{verbatim} +caller:string() +\end{verbatim} +\end{vindent} +Returns the address and name of the calling function. It works +only for return probes. + +\subsubsection{caller\_addr} +\index{caller\_addr} +General syntax: + +\begin{vindent} +\begin{verbatim} +caller_addr:long () +\end{verbatim} +\end{vindent} +Returns the address of the calling function. It works only +for return probes. 
+ + +\subsubsection{cpu} +\index{cpu} +General syntax: + +\begin{vindent} +\begin{verbatim} +cpu:long () +\end{verbatim} +\end{vindent} +Returns the current cpu number. + + +\subsubsection{egid} +\index{egid} +General syntax: + +\begin{vindent} +\begin{verbatim} +egid:long () +\end{verbatim} +\end{vindent} +Returns the effective group ID of the current process. + + +\subsubsection{euid} +\index{euid} +General syntax: + +\begin{vindent} +\begin{verbatim} +euid:long () +\end{verbatim} +\end{vindent} +Returns the effective user ID of the current process. + + +\subsubsection{execname} +\index{execname} +General syntax: + +\begin{vindent} +\begin{verbatim} +execname:string () +\end{verbatim} +\end{vindent} +Returns the name of the current process. + + +\subsubsection{gid} +\index{gid} +General syntax: + +\begin{vindent} +\begin{verbatim} +gid:long () +\end{verbatim} +\end{vindent} +Returns the group ID of the current process. + + +\subsubsection{is\_return} +\index{is\_return} +General syntax: + +\begin{vindent} +\begin{verbatim} +is_return:long () +\end{verbatim} +\end{vindent} +Returns 1 if the probe point is a return probe, else it returns +zero. + +\noun{Deprecated}. + + +\subsubsection{pexecname} +\index{pexecname} +General syntax: + +\begin{vindent} +\begin{verbatim} +pexecname:string () +\end{verbatim} +\end{vindent} +Returns the name of the parent process. + + +\subsubsection{pid} +\index{pid} +General syntax: + +\begin{vindent} +\begin{verbatim} +pid:long () +\end{verbatim} +\end{vindent} +Returns the process ID of the current process. + + +\subsubsection{ppid} +\index{ppid} +General syntax: + +\begin{vindent} +\begin{verbatim} +ppid:long () +\end{verbatim} +\end{vindent} +Returns the process ID of the parent process. + + +\subsubsection{tid} +\index{tid} +General syntax: + +\begin{vindent} +\begin{verbatim} +tid:long () +\end{verbatim} +\end{vindent} +Returns the ID of the current thread. 
+ + +\subsubsection{uid} +\index{uid} +General syntax: + +\begin{vindent} +\begin{verbatim} +uid:long () +\end{verbatim} +\end{vindent} +Returns the user ID of the current task. + + +\subsubsection{print\_backtrace} +\index{print\_backtrace} +General syntax: + +\begin{vindent} +\begin{verbatim} +print_backtrace:unknown () +\end{verbatim} +\end{vindent} +This function is equivalent to \texttt{print\_stack(backtrace())}, except +that deeper stack nesting is supported. The function does not return a value. + + +\subsubsection{print\_regs} +\index{print\_regs} +General syntax: + +\begin{vindent} +\begin{verbatim} +print_regs:unknown () +\end{verbatim} +\end{vindent} +This function prints a register dump. + + +\subsubsection{print\_stack} +\index{print\_stack} +General syntax: + +\begin{vindent} +\begin{verbatim} +print_stack:unknown (stk:string) +\end{verbatim} +\end{vindent} +This function performs a symbolic lookup of the addresses in the given string, +which is assumed to be the result of a prior call to \texttt{backtrace()}. +It prints one line per address. Each printed line includes the address, the +name of the function containing the address, and an estimate of its position +within that function. The function does not return a value. + + +\subsubsection{stack\_size} +\index{stack\_size} +General syntax: + +\begin{vindent} +\begin{verbatim} +stack_size:long () +\end{verbatim} +\end{vindent} +Returns the size of the stack. + + +\subsubsection{stack\_unused} +\index{stack\_unused} +General syntax: + +\begin{vindent} +\begin{verbatim} +stack_unused:long () +\end{verbatim} +\end{vindent} +Returns how many bytes are currently unused in the stack. + + +\subsubsection{stack\_used} +\index{stack\_used} +General syntax: + +\begin{vindent} +\begin{verbatim} +stack_used:long () +\end{verbatim} +\end{vindent} +Returns how many bytes are currently used in the stack. 
+ + +\subsubsection{stp\_pid} +\index{stp\_pid} +\begin{vindent} +\begin{verbatim} +stp_pid:long () +\end{verbatim} +\end{vindent} +Returns the process ID of the staprun process. + + +\subsubsection{target} +\index{target} +General syntax: + +\begin{vindent} +\begin{verbatim} +target:long () +\end{verbatim} +\end{vindent} +Returns the process ID of the target process. This is useful +in conjunction with the -x PID or -c CMD command-line options to stap. An +example of its use is to create scripts that filter on a specific process. + +\begin{verbatim} +-x PID +\end{verbatim} +target() returns the pid specified by -x. + +\begin{verbatim} +-c CMD +\end{verbatim} +target() returns the pid for the executed command specified +by -c. + +\subsection{Task data} + +These functions return data about a task. They all require a task handle as +input, such as the value returned by task\_current() or the variables +prev\_task and next\_task in the scheduler.ctxswitch probe alias. + +\subsubsection{task\_cpu} +\index{task\_cpu} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_cpu:long (task:long) +\end{verbatim} +\end{vindent} +Returns the scheduled cpu for the given task. + + +\subsubsection{task\_current} +\index{task\_current} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_current:long () +\end{verbatim} +\end{vindent} +Returns the address of the task\_struct representing +the current process. This address can be passed to the various task\_{*}() +functions to extract more task-specific data. + + +\subsubsection{task\_egid} +\index{task\_egid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_egid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the effective group ID of the given task. + + +\subsubsection{task\_execname} +\index{task\_execname} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_execname:string (task:long) +\end{verbatim} +\end{vindent} +Returns the name of the given task.
+ + +\subsubsection{task\_euid} +\index{task\_euid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_euid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the effective user ID of the given task. + + +\subsubsection{task\_gid} +\index{task\_gid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_gid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the group ID of the given task. + + +\subsubsection{task\_nice} +\index{task\_nice} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_nice:long (task:long) +\end{verbatim} +\end{vindent} +Returns the nice value of the given task. + + +\subsubsection{task\_parent} +\index{task\_parent} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_parent:long (task:long) +\end{verbatim} +\end{vindent} +Returns the address of the parent task\_struct of the given +task. This address can be passed to the various task\_{*}() functions to +extract more task-specific data. + + +\subsubsection{task\_pid} +\index{task\_pid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_pid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the process ID of the given task. + + +\subsubsection{task\_prio} +\index{task\_prio} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_prio:long (task:long) +\end{verbatim} +\end{vindent} +Returns the priority value of the given task. + + +\subsubsection{task\_state} +\index{task\_state} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_state:long (task:long) +\end{verbatim} +\end{vindent} +Returns the state of the given task. 
Possible states are: + +\begin{vindent} +\begin{verbatim} +TASK_RUNNING 0 +TASK_INTERRUPTIBLE 1 +TASK_UNINTERRUPTIBLE 2 +TASK_STOPPED 4 +TASK_TRACED 8 +EXIT_ZOMBIE 16 +EXIT_DEAD 32 +\end{verbatim} +\end{vindent} + +\subsubsection{task\_tid} +\index{task\_tid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_tid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the thread ID of the given task. + + +\subsubsection{task\_uid} +\index{task\_uid} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_uid:long (task:long) +\end{verbatim} +\end{vindent} +Returns the user ID of the given task. + + +\subsubsection{task\_open\_file\_handles} +\index{task\_open\_file\_handles} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_open_file_handles:long(task:long) +\end{verbatim} +\end{vindent} +Returns the number of open file handles for the given task. + + +\subsubsection{task\_max\_file\_handles} +\index{task\_max\_file\_handles} +General syntax: + +\begin{vindent} +\begin{verbatim} +task_max_file_handles:long(task:long) +\end{verbatim} +\end{vindent} +Returns the maximum number of file handles for the given task. + + +\subsection{Accessing string data at a probe point} + +The following functions provide methods to access string data at a probe +point. + + +\subsubsection{kernel\_string} +\index{kernel\_string} +General syntax: + +\begin{vindent} +\begin{verbatim} +kernel_string:string (addr:long) +\end{verbatim} +\end{vindent} +Copies a string from kernel space at a given address. The validation of this +address is only partial. + + +\subsubsection{user\_string\label{sub:user_string}} +\index{user\_string} +General syntax: + +\begin{vindent} +\begin{verbatim} +user_string:string (addr:long) +\end{verbatim} +\end{vindent} +This function copies a string from user space at a given address. The validation +of this address is only partial. 
In rare cases when userspace data is not +accessible, this function returns the string \texttt{\textless{}unknown\textgreater{}}. + + +\subsubsection{user\_string2} +\index{user\_string2} +General syntax: + +\begin{vindent} +\begin{verbatim} +user_string2:string (addr:long, err_msg:string) +\end{verbatim} +\end{vindent} +This function is similar to \texttt{user\_string} (Section~\ref{sub:user_string}), +but allows passing an error message as an argument to be returned if userspace +data is not available. + + +\subsubsection{user\_string\_warn} +\index{user\_string\_warn} +General syntax: + +\begin{vindent} +\begin{verbatim} +user_string_warn:string (addr:long) +\end{verbatim} +\end{vindent} +This function copies a string from userspace at a given address. It prints +a verbose error message on failure. + + +\subsubsection{user\_string\_quoted} +\index{user\_string\_quoted} +General syntax: + +\begin{vindent} +\begin{verbatim} +user_string_quoted:string (addr:long) +\end{verbatim} +\end{vindent} +This function copies a string from userspace at a given address. Any ASCII +characters that are not printable are replaced by the corresponding escape +sequence in the returned string. + + +\subsection{Initializing queue statistics} +\index{queue statistics} +The queue\_stats tapset provides functions that, when given notification +of queuing events like wait, run, or done, track averages such as queue length, +service and wait times, and utilization. Call the following three functions +from appropriate probes, in sequence. + + +\subsubsection{qs\_wait} +\index{qs\_wait} +General syntax: + +\begin{vindent} +\begin{verbatim} +qs_wait:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function records that a new request was enqueued for the given queue +name.
+ + +\subsubsection{qs\_run} +\index{qs\_run} +General syntax: + +\begin{vindent} +\begin{verbatim} +qs_run:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function records that a previously enqueued request was removed from +the given wait queue and is now being serviced. + + +\subsubsection{qs\_done} +\index{qs\_done} +General syntax: + +\begin{vindent} +\begin{verbatim} +qs_done:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function records that a request originally from the given queue has +completed being serviced. + + +\subsection{Using queue statistics} + +Functions with the qsq\_ prefix query the statistics averaged since the first +queue operation or when qsq\_start was called. Since statistics are often +fractional, a scale parameter multiplies the result to a more useful scale. +For some fractions, a scale of 100 returns percentage numbers. + + +\subsubsection{qsq\_blocked} +\index{qsq\_blocked} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_blocked:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the fraction of elapsed time during which one or more +requests were on the wait queue. 
+ + +\subsubsection{qsq\_print} +\index{qsq\_print} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_print:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function prints a line containing the following statistics for the given +queue: + +\begin{itemize} +\item queue name +\item average rate of requests per second +\item average wait queue length +\item average time on the wait queue +\item average time to service a request +\item percentage of time the wait queue was used +\item percentage of time any request was being serviced +\end{itemize} + +\subsubsection{qsq\_service\_time} +\index{qsq\_service\_time} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_service_time:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average time in microseconds required to service +a request once it is removed from the wait queue. + + +\subsubsection{qsq\_start} +\index{qsq\_start} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_start:unknown (qname:string) +\end{verbatim} +\end{vindent} +This function resets the statistics counters for the given queue, and restarts +tracking from the moment the function was called. This command is used to +create a queue. + + +\subsubsection{qsq\_throughput} +\index{qsq\_throughput} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_throughput:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average number of requests served per microsecond. + + +\subsubsection{qsq\_utilization} +\index{qsq\_utilization} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_utilization:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average time in microseconds that at least one +request was being serviced. 
+ + +\subsubsection{qsq\_wait\_queue\_length} +\index{qsq\_wait\_queue\_length} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_wait_queue_length:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average length of the wait queue. + + +\subsubsection{qsq\_wait\_time} +\index{qsq\_wait\_time} +General syntax: + +\begin{vindent} +\begin{verbatim} +qsq_wait_time:long (qname:string, scale:long) +\end{verbatim} +\end{vindent} +This function returns the average time in microseconds that it took for a +request to be serviced (qs\_wait() to qs\_done()). + + +\subsubsection{A queue example} + +What follows is an example from src/testsuite/systemtap.samples/queue\_demo.stp. +It uses the randomize feature of the timer probe to simulate queuing activity. + +\begin{vindent} +\begin{verbatim} +probe begin { + qsq_start ("block-read") + qsq_start ("block-write") +} + +probe timer.ms(3500), end { + qsq_print ("block-read") + qsq_start ("block-read") + qsq_print ("block-write") + qsq_start ("block-write") +} + +probe timer.ms(10000) { + exit () +} + +# synthesize queue work/service using three randomized "threads" for each queue.
+global tc + +function qs_doit (thread, name) { + n = tc[thread] = (tc[thread]+1) % 3 # per-thread state counter + if (n==1) qs_wait (name) + else if (n==2) qs_run (name) + else if (n==0) qs_done (name) +} + +probe timer.ms(100).randomize(100) { qs_doit (0, "block-read") } +probe timer.ms(100).randomize(100) { qs_doit (1, "block-read") } +probe timer.ms(100).randomize(100) { qs_doit (2, "block-read") } +probe timer.ms(100).randomize(100) { qs_doit (3, "block-write") } +probe timer.ms(100).randomize(100) { qs_doit (4, "block-write") } +probe timer.ms(100).randomize(100) { qs_doit (5, "block-write") } +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} +block-read: 9 ops/s, 1.090 qlen, 215749 await, 96382 svctm, 69% wait, 64% util +block-write: 9 ops/s, 0.992 qlen, 208485 await, 103150 svctm, 69% wait, 61% util +block-read: 9 ops/s, 0.968 qlen, 197411 await, 97762 svctm, 63% wait, 63% util +block-write: 8 ops/s, 0.930 qlen, 202414 await, 93870 svctm, 60% wait, 56% util +block-read: 8 ops/s, 0.774 qlen, 192957 await, 99995 svctm, 58% wait, 62% util +block-write: 9 ops/s, 0.861 qlen, 193857 await, 101573 svctm, 56% wait, 64% util +\end{verbatim} +\end{vindent} + +\subsection{Probe point identification} + +The following functions help you identify probe points. + + +\subsubsection{pp} +\index{pp} +General syntax: + +\begin{vindent} +\begin{verbatim} +pp:string () +\end{verbatim} +\end{vindent} +This function returns the probe point associated with a currently running +probe handler, including alias and wild-card expansion effects. + + +\subsubsection{probefunc} +\index{probefunc} +General syntax: + +\begin{vindent} +\begin{verbatim} +probefunc:string () +\end{verbatim} +\end{vindent} +This function returns the name of the function being probed. 
+ + +\subsubsection{probemod} +\index{probemod} +General syntax: + +\begin{vindent} +\begin{verbatim} +probemod:string () +\end{verbatim} +\end{vindent} +This function returns the name of the module containing the probe point. + + +\subsection{Formatting functions} +\index{formatting} +The following functions help you format output. + + +\subsubsection{ctime} +\index{ctime} +General syntax: + +\begin{vindent} +\begin{verbatim} +ctime:string(epochsecs:long) +\end{verbatim} +\end{vindent} +This function accepts an argument of seconds since the epoch as returned +by \texttt{gettimeofday\_s()}. It returns a date string in UTC of the form: + +\begin{vindent} +\begin{verbatim} +"Wed Jun 30 21:49:08 2006" +\end{verbatim} +\end{vindent} +This function does not adjust for timezones. The returned time is always +in GMT. Your script must manually adjust epochsecs before passing it to ctime() +if you want to print local time. + + +\subsubsection{errno\_str} +\index{errno\_str} +General syntax: + +\begin{vindent} +\begin{verbatim} +errno_str:string (err:long) +\end{verbatim} +\end{vindent} +This function returns the symbolic string associated with the given error +code, such as ENOENT for the number 2, or E\#3333 for an out-of-range value +such as 3333. + + +\subsubsection{returnstr} +\index{returnstr} +General syntax: + +\begin{vindent} +\begin{verbatim} +returnstr:string (returnp:long) +\end{verbatim} +\end{vindent} +This function is used by the syscall tapset, and returns a string. Set \texttt{returnp} +equal to 1 for decimal, or 2 for hex. + + +\subsubsection{thread\_indent} +\index{thread\_indent} +General syntax: + +\begin{vindent} +\begin{verbatim} +thread_indent:string (delta:long) +\end{verbatim} +\end{vindent} +This function returns a string with appropriate indentation for a thread. +Call it with a small positive or matching negative delta. If this is the +outermost, initial level of indentation, then the function resets the relative +timestamp base to zero.
+ +The following example uses thread\_indent() to trace the functions called +in the drivers/usb/core kernel source. It prints a relative timestamp and +the name and ID of the current process, followed by the appropriate indent +and the function name. Note that \char`\"{}swapper(0)\char`\"{} indicates +the kernel is running in interrupt context and there is no valid current +process. + +\begin{vindent} +\begin{verbatim} +probe kernel.function("*@drivers/usb/core/*") { + printf ("%s -> %s\n", thread_indent(1), probefunc()) +} +probe kernel.function("*@drivers/usb/core/*").return { + printf ("%s <- %s\n", thread_indent(-1), probefunc()) +} +\end{verbatim} +\end{vindent} +This prints: + +\begin{vindent} +\begin{verbatim} + 0 swapper(0): -> usb_hcd_irq + 8 swapper(0): <- usb_hcd_irq + 0 swapper(0): -> usb_hcd_irq +10 swapper(0): -> usb_hcd_giveback_urb +16 swapper(0): -> urb_unlink +22 swapper(0): <- urb_unlink +29 swapper(0): -> usb_free_urb +35 swapper(0): <- usb_free_urb +39 swapper(0): <- usb_hcd_giveback_urb +45 swapper(0): <- usb_hcd_irq + 0 usb-storage(1338): -> usb_submit_urb + 6 usb-storage(1338): -> usb_hcd_submit_urb +12 usb-storage(1338): -> usb_get_urb +18 usb-storage(1338): <- usb_get_urb +25 usb-storage(1338): <- usb_hcd_submit_urb +29 usb-storage(1338): <- usb_submit_urb + 0 swapper(0): -> usb_hcd_irq + 7 swapper(0): <- usb_hcd_irq +\end{verbatim} +\end{vindent} + +\subsubsection{thread\_timestamp} +\index{thread\_timestamp} + +General syntax: + +\begin{vindent} +\begin{verbatim} +thread_timestamp:long () +\end{verbatim} +\end{vindent} +This function returns an absolute timestamp value for use by the indentation +function. The default function uses \texttt{gettimeofday\_us.} + + +\subsection{String functions} +\index{string} +The following are string functions you can use. 
+ + +\subsubsection{isinstr} +\index{isinstr} +General syntax: + +\begin{vindent} +\begin{verbatim} +isinstr:long (s1:string, s2:string) +\end{verbatim} +\end{vindent} +This function returns 1 if string s1 contains string s2, otherwise zero. + + +\subsubsection{strlen} +\index{strlen} +General syntax: + +\begin{vindent} +\begin{verbatim} +strlen:long (str:string) +\end{verbatim} +\end{vindent} +This function returns the number of characters in str. + + +\subsubsection{strtol} +\index{strtol} +General syntax: + +\begin{vindent} +\begin{verbatim} +strtol:long (str:string, base:long) +\end{verbatim} +\end{vindent} +This function converts the string representation of a number to an integer. +The base parameter indicates the number base to assume for the string (e.g. +16 for hex, 8 for octal, 2 for binary). + + +\subsubsection{substr} +\index{substr} +General syntax: + +\begin{vindent} +\begin{verbatim} +substr:string (str:string, start:long, stop:long) +\end{verbatim} +\end{vindent} +This function returns the substring of \texttt{str} starting from character +position \texttt{start} and ending at character position \texttt{stop}. + + +\subsubsection{text\_str} +\index{text\_str} +General syntax: + +\begin{vindent} +\begin{verbatim} +text_str:string (input:string) +\end{verbatim} +\end{vindent} +This function accepts a string argument. Any ASCII characters in the string +that are not printable are replaced by a corresponding escape sequence in +the returned string. + + +\subsubsection{text\_strn} +\index{text\_strn} +General syntax: + +\begin{vindent} +\begin{verbatim} +text_strn:string (input:string, len:long, quoted:long) +\end{verbatim} +\end{vindent} +This function accepts a string of length \texttt{len}. Any ASCII characters +that are not printable are replaced by a corresponding escape sequence in +the returned string. If \texttt{quoted} is not null, the function adds a +backslash character to the output.
+ + +\subsubsection{tokenize} +\index{tokenize} +General syntax: + +\begin{vindent} +\begin{verbatim} +tokenize:string (input:string, delim:string) +\end{verbatim} +\end{vindent} +This function returns the next token in the given input string, where +the tokens are delimited by one of the characters in the delim string. +If the input string is non-NULL, it returns the first token. If the input string +is NULL, it returns the next token in the string passed in the previous call +to tokenize. If no delimiter is found, the entire remaining input string +is returned. It returns NULL when no more tokens are available. + + +\subsection{Timestamps} +\index{timestamps} +The following functions provide methods to extract time data. + + +\subsubsection{get\_cycles} +\index{get\_cycles} +General syntax: + +\begin{vindent} +\begin{verbatim} +get_cycles:long () +\end{verbatim} +\end{vindent} +This function returns the processor cycle counter value if available, else +it returns zero. + + +\subsubsection{gettimeofday\_ms} +\index{gettimeofday\_ms} +General syntax: + +\begin{vindent} +\begin{verbatim} +gettimeofday_ms:long () +\end{verbatim} +\end{vindent} +This function returns the number of milliseconds since the UNIX epoch. + + +\subsubsection{gettimeofday\_ns} +\index{gettimeofday\_ns} +General syntax: + +\begin{vindent} +\begin{verbatim} +gettimeofday_ns:long () +\end{verbatim} +\end{vindent} +This function returns the number of nanoseconds since the UNIX epoch. + + +\subsubsection{gettimeofday\_s} +\index{gettimeofday\_s} +General syntax: + +\begin{vindent} +\begin{verbatim} +gettimeofday_s:long () +\end{verbatim} +\end{vindent} +This function returns the number of seconds since the UNIX epoch. + + +\subsubsection{gettimeofday\_us} +\index{gettimeofday\_us} +General syntax: + +\begin{vindent} +\begin{verbatim} +gettimeofday_us:long () +\end{verbatim} +\end{vindent} +This function returns the number of microseconds since the UNIX epoch.
+ + +\subsection{Miscellaneous tapset functions} + +The following are miscellaneous functions. + + +\subsubsection{addr\_to\_node} +\index{addr\_to\_node} +General syntax: + +\begin{vindent} +\begin{verbatim} +addr_to_node:long (addr:long) +\end{verbatim} +\end{vindent} +This function accepts an address, and returns the node that the given address +belongs to in a NUMA system. + + +\subsubsection{exit} +\index{exit} +General syntax: + +\begin{vindent} +\begin{verbatim} +exit:unknown () +\end{verbatim} +\end{vindent} +This function enqueues a request to shut down the SystemTap session. It does +not unwind the current probe handler, nor block new probe handlers. The stap +daemon will respond to the request and initiate an ordered shutdown. + + +\subsubsection{system} +\index{system} +General syntax: + +\begin{vindent} +\begin{verbatim} +system (cmd:string) +\end{verbatim} +\end{vindent} +This function runs a command on the system. The command will run in the background +when the current probe completes. + + +\section{For Further Reference\label{sec:For-Further-Reference}} + +For more information, see: +\begin{itemize} +\item The SystemTap tutorial at \url{http://sourceware.org/systemtap/tutorial/} +\item The SystemTap wiki at \url{http://sourceware.org/systemtap/wiki} +\item The SystemTap documentation page at \url{http://sourceware.org/systemtap/documentation.html} +\item From an unpacked source tarball or CVS directory, the examples in the +src/examples directory, the tapsets in the src/tapset directory, and the +test scripts in the src/testsuite directory. +\item The man pages for tapsets. For a list, run the command \texttt{{}``man -k +stapprobes}''.
+\end {itemize} + +\setcounter{secnumdepth}{0} +\newpage{} +\addcontentsline{toc}{section}{Index} +\printindex{} +\end{document} diff --git a/doc/tutorial.tex b/doc/tutorial.tex new file mode 100644 index 000000000..d465bf0bf --- /dev/null +++ b/doc/tutorial.tex @@ -0,0 +1,1210 @@ +% Copyright (C) 2005-2007 Red Hat Inc. +% This file is part of systemtap, and is free software. You can +% redistribute it and/or modify it under the terms of the GNU General +% Public License (GPL); either version 2, or (at your option) any +% later version. + +\documentclass{article} +\usepackage{html} +\usepackage{graphicx} +% \usepackage{moreverb} +\usepackage{fancyvrb} +\usepackage{listings} +\usepackage{fullpage} +\usepackage{fancybox} +\usepackage[compatible]{nomencl} +% \usepackage{geometry} +% \geometry{letterpaper,text={7in,8.5in}} +\usepackage{charter} + +\newenvironment{boxedminipage}%% Boxed minipage + {\begin{makeimage}\begin{center}\begin{Sbox}\begin{minipage}}% + {\end{minipage}\end{Sbox}\fbox{\TheSbox}\end{center}\end{makeimage}} + +\begin{htmlonly} +\renewcommand{\nomenclature}[2]{} +\end{htmlonly} + +% \usepackage{draftcopy} % ugly +\bibliographystyle{plain} +\makeglossary +\parindent0.0cm +\parskip0.2cm + +\begin{document} + +\begin{center} +\LARGE {\bf Systemtap tutorial} +\end{center} + +\hfill \begin{minipage}{2.5in} +% contributors please add your names to the list +Frank Ch. Eigler {\tt \small } \\ + +\hfill \today +\end{minipage} + +\tableofcontents + +\section{Introduction} + +Systemtap is a tool that allows developers and administrators to write +and reuse simple scripts to deeply examine the activities of a live +Linux system. Data may be extracted, filtered, and summarized quickly +and safely, to enable diagnoses of complex performance or functional +problems. + +\nomenclature{script}{A simple programming language understood by systemtap.} + +The essential idea behind a systemtap script is to name {\em events}, +and to give them {\em handlers}. 
Whenever a specified event occurs, +the Linux kernel runs the handler as if it were a quick subroutine, +then resumes. There are several kinds of events, such as entering or +exiting a function, a timer expiring, or the entire systemtap session +starting or stopping. A handler is a series of script language +statements that specify the work to be done whenever the event occurs. +This work normally includes extracting data from the event context, +storing them into internal variables, or printing results. + +\nomenclature{event}{An identifiable instant in the operating system's +execution state, such as entry to a function, or expiry of a timer.} +\nomenclature{session}{A complete run of a systemtap script program.} +\nomenclature{handler}{A series of statements, written in script, which +is to be performed whenever an event occurs.} +\nomenclature{\tt .stp}{The standard file name extension for systemtap +scripts.} + +Systemtap works by translating the script to C, running the system C +compiler to create a kernel module from that. When the module is +loaded, it activates all the probed events by hooking into the kernel. +Then, as events occur on any processor, the compiled handlers run. +Eventually, the session stops, the hooks are disconnected, and the +module removed. This entire process is driven from a single +command-line program, \verb+stap+. + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat hello-world.stp +probe begin +{ + print ("hello world\n") + exit () +} + +# stap hello-world.stp +hello world +\end{verbatim} +\end{boxedminipage} +\caption{A systemtap smoke test.} +\label{fig:hello-world} +\end{figure} + +This paper assumes that you have installed systemtap and its +prerequisite kernel development tools and debugging data, so that you +can run the scripts such as the simple one in +Figure~\ref{fig:hello-world}. Log on as \verb+root+, or even better, +as a user authorized to \verb+sudo+, before running systemtap.
+ +\begin{figure}[h] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat strace-open.stp +probe syscall.open +{ + printf ("%s(%d) open (%s)\n", execname(), pid(), argstr) +} +probe timer.ms(4000) # after 4 seconds +{ + exit () +} + +# stap strace-open.stp +vmware-guestd(2206) open ("/etc/redhat-release", O_RDONLY) +hald(2360) open ("/dev/hdc", O_RDONLY|O_EXCL|O_NONBLOCK) +hald(2360) open ("/dev/hdc", O_RDONLY|O_EXCL|O_NONBLOCK) +hald(2360) open ("/dev/hdc", O_RDONLY|O_EXCL|O_NONBLOCK) +df(3433) open ("/etc/ld.so.cache", O_RDONLY) +df(3433) open ("/lib/tls/libc.so.6", O_RDONLY) +df(3433) open ("/etc/mtab", O_RDONLY) +hald(2360) open ("/dev/hdc", O_RDONLY|O_EXCL|O_NONBLOCK) +\end{verbatim} +\end{boxedminipage} +\caption{A taste of systemtap: a system-wide {\tt strace}, just for +the {\tt open} system call.} +\label{fig:strace-open} +\end{figure} +\nomenclature{strace}{A standard ptrace-based command line tool to trace system call activity of a process.} + +\section{Tracing} + +The simplest kind of probe is simply to {\em trace} an event. +\nomenclature{trace}{A compact textual record of an event occurrence.} +This is the effect of inserting strategically located \verb+print+ +statements into a program. This is often the first step of problem +solving: explore by seeing a history of what has happened. + +This style of instrumentation is the simplest. It just asks systemtap +to print something at each event. To express this in the script +language, you need to say where to probe and what to print there. + +\subsection{Where to probe} + +Systemtap supports a number of built-in events. The library of +scripts that comes with systemtap, each called a ``tapset'', may +define additional ones in terms of the built-in family. See +the \verb+stapprobes+ man page for details.
\nomenclature{tapset}{A +reusable script forming part of the automatically searched tapset +library.} All these events are named using a unified syntax that +looks like dot-separated parameterized identifiers: + +\begin{tabular}{rl} +\verb+begin+ & The startup of the systemtap session. \\ +\verb+end+ & The end of the systemtap session. \\ +\verb+kernel.function("sys_open")+ & The entry to the function named +\verb+sys_open+ in the kernel. \\ +\verb+syscall.close.return+ & The return from the \verb+close+ system +call. \\ +\verb+module("ext3").statement(0xdeadbeef)+ & The addressed instruction +in the \verb+ext3+ filesystem driver. \\ +\verb+timer.ms(200)+ & A timer that fires every 200 milliseconds. \\ +\end{tabular} + +Let's say that you would like to trace all function entries and exits +in a source file, say \verb+net/socket.c+ in the kernel. The +\verb+kernel.function+ probe point lets you express that easily, since +systemtap examines the kernel's debugging information to relate object +code to source code. It works like a debugger: if you can name or +place it, you can probe it. Use +\verb+kernel.function("*@net/socket.c")+ for the function entries, and +\verb+kernel.function("*@net/socket.c").return+ for the exits. Note +the use of wildcards in the function name part, and the subsequent +\verb+@FILENAME+ part. You can also put wildcards into the file name, +and even add a colon (\verb+:+) and a line number, if you want to +restrict the search that precisely. Since systemtap will put a +separate probe in every place that matches a probe point, a few +wildcards can expand to hundreds or thousands of probes, so be careful +what you ask for. 
\nomenclature{debug information}{Data created by the +compiler when the kernel or application was built, sometimes packaged into +{\tt debuginfo} files, for use by a symbolic debugger.} +\nomenclature{wildcard}{Presence of \verb+*+ globbing patterns in probe points.} + +Once you identify the probe points, the skeleton of the systemtap +script appears. The \verb+probe+ keyword introduces a probe point, or +a comma-separated list of them. The following \verb+{+ and \verb+}+ +braces enclose the handler for all listed probe points. +\begin{verbatim} +probe kernel.function("*@net/socket.c") { } +probe kernel.function("*@net/socket.c").return { } +\end{verbatim} +You can run this script as is, though with empty handlers there will +be no output. Put the two lines into a new file. Run +\verb+stap -v FILE+. Terminate it any time with \verb+^C+. (The +\verb+-v+ option tells systemtap to print more verbose messages during +its processing. Try the \verb+-h+ option to see more options.) + +\subsection{What to print} + +Since you are interested in each function that was entered and exited, +a line should be printed for each, containing the function name. In +order to make that list easy to read, systemtap should indent the +lines so that functions called by other traced functions are nested +deeper. To tell each single process apart from any others that may be +running concurrently, systemtap should also print the process ID in +the line. + +Systemtap provides a variety of such contextual data, ready for +formatting. They usually appear as function calls within the handler, +like you already saw in Figure~\ref{fig:strace-open}. See the +\verb+stapfuncs+ man page for those functions and more defined in the +tapset library, but here's a sampling: + +\begin{tabular}{rl} +\verb+tid()+ & The id of the current thread. \\ +\verb+pid()+ & The process (task group) id of the current thread. \\ +\verb+uid()+ & The id of the current user. 
\\ +\verb+execname()+ & The name of the current process. \\ +\verb+cpu()+ & The current cpu number. \\ +\verb+gettimeofday_s()+ & Number of seconds since epoch. \\ +\verb+get_cycles()+ & Snapshot of hardware cycle counter. \\ +\verb+pp()+ & A string describing the probe point being currently handled. \\ +\verb+probefunc()+ & If known, the name of the function in which + this probe was placed. \\ +\end{tabular} + +The values returned may be strings or numbers. The \verb+print()+ +built-in function accepts either as its sole argument. Or, you can +use the C-style \verb+printf()+ built-in, whose formatting argument +may include \verb+%s+ for a string, \verb+%d+ for a number. +\verb+printf+ and other functions take comma-separated arguments. +Don't forget a \verb+"\n"+ at the end. + +A particularly handy function in the tapset library is +\verb+thread_indent+. Given an indentation delta parameter, it stores +internally an indentation counter for each thread (\verb+tid()+), and +returns a string with some generic trace data plus an appropriate +number of indentation spaces. That generic data includes a timestamp +(number of microseconds since the most recent initial indentation), a +process name and the thread id itself. It therefore gives an idea not +only about what functions were called, but who called them, and how +long they took. Figure~\ref{fig:socket-trace} shows the finished +script. It lacks a call to the \verb+exit()+ function, so you need to +interrupt it with \verb+^C+ when you want the tracing to stop. + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat socket-trace.stp +probe kernel.function("*@net/socket.c") { + printf ("%s -> %s\n", thread_indent(1), probefunc()) +} +probe kernel.function("*@net/socket.c").return { + printf ("%s <- %s\n", thread_indent(-1), probefunc()) +} + +# stap socket-trace.stp + 0 hald(2632): -> sock_poll + 28 hald(2632): <- sock_poll +[...] 
+ 0 ftp(7223): -> sys_socketcall + 1159 ftp(7223): -> sys_socket + 2173 ftp(7223): -> __sock_create + 2286 ftp(7223): -> sock_alloc_inode + 2737 ftp(7223): <- sock_alloc_inode + 3349 ftp(7223): -> sock_alloc + 3389 ftp(7223): <- sock_alloc + 3417 ftp(7223): <- __sock_create + 4117 ftp(7223): -> sock_create + 4160 ftp(7223): <- sock_create + 4301 ftp(7223): -> sock_map_fd + 4644 ftp(7223): -> sock_map_file + 4699 ftp(7223): <- sock_map_file + 4715 ftp(7223): <- sock_map_fd + 4732 ftp(7223): <- sys_socket + 4775 ftp(7223): <- sys_socketcall +[...] +\end{verbatim} +\end{boxedminipage} +\caption{Tracing and timing functions in {\tt net/socket.c}.} +\label{fig:socket-trace} +\end{figure} + +\subsection{Exercises} + +\begin{enumerate} +\item Use the \verb+-p2+ option to systemtap to list all the kernel +functions named with the word ``nit'' in them. The probe handlers +might as well be empty. + +\item Trace some system calls (use \verb+syscall.NAME+ and \verb+.return+ +probe points), with the same \verb+thread_indent+ probe handler as in +Figure~\ref{fig:socket-trace}. Interpret the results. + +\end{enumerate} + +\section{Analysis} + +Pages of generic tracing text may give you enough information for +exploring a system. With systemtap, it is possible to analyze that +data, to filter, aggregate, transform, and summarize it. Different +probes can work together to share data. Probe handlers can use a rich +set of control constructs to describe algorithms, with a syntax taken +roughly from \verb+awk+. With these tools, systemtap scripts can +focus on a specific question and provide a compact response: no +\verb+grep+ needed. +\nomenclature{awk}{A classic UNIX stream processing language.} + +\subsection{Basic constructs} + +Most systemtap scripts include conditionals, to limit tracing or other +logic to those processes or users or {\em whatever} of interest.
The +syntax is simple: + +\begin{tabular}{rl} +\verb+if (+{\em EXPR}\verb+)+ {\em STATEMENT} [\verb+else+ {\em STATEMENT}\verb+]+ & if/else statement \\ +\verb+while (+{\em EXPR}\verb+)+ {\em STATEMENT} & while loop \\ +\verb+for (+{\em A}\verb+;+ {\em B}\verb+;+ {\em C}\verb+)+ {\em STATEMENT} & for loop \\ +\end{tabular} + +Scripts may use \verb+break+/\verb+continue+ as in C. +Probe handlers can return early using \verb+next+ as in \verb+awk+. +Blocks of statements are enclosed in \verb+{+ and \verb+}+. In +systemtap, the semicolon (\verb+;+) is accepted as a null statement +rather than as a statement terminator, so is only rarely\footnote{Use +them between consecutive expressions that place unary {\tt +},{\tt -} +or mixed pre/post {\tt ++},{\tt --} in an ambiguous manner.} +necessary. Shell-style (\verb+#+), C-style (\verb+/* */+), and +C++-style (\verb+//+) comments are all accepted. + +Expressions look like C or \verb+awk+, and support the usual +operators, precedences, and numeric literals. Strings are treated as +atomic values rather than arrays of characters. String concatenation +is done with the dot (\verb+"a" . "b"+). Some examples: + +\begin{tabular}{rl} +\verb+(uid() > 100)+ & probably an ordinary user \\ +\verb+(execname() == "sed")+ & current process is sed \\ +\verb+(cpu() == 0 && gettimeofday_s() > 1140498000)+ & after Feb. 21, 2006, on CPU 0 \\ +\verb+"hello" . " " . "world"+ & a string in three easy pieces \\ +\end{tabular} + +Variables may be used as well. Just pick a name, assign to it, and +use it in expressions. They are automatically initialized and +declared. The type of each identifier -- string vs. number -- is +automatically inferred by systemtap from the kinds of operators and +literals used on it. Any inconsistencies will be reported as errors. +Conversion between string and number types is done through explicit +function calls. 
+ +\nomenclature{type}{A designation of each identifier such as a +variable, or function, or array value or index, as containing a string +or number.} \nomenclature{string}{A \verb+\0+-terminated character +string of up to a fixed limit in length.} \nomenclature{number}{A +64-bit signed integer.} \nomenclature{type inference}{The automatic +determination of the type of each variable, function parameter, array +value and index, based on their use.} + +\begin{tabular}{rl} +\verb+foo = gettimeofday_s()+ & foo is a number \\ +\verb+bar = "/usr/bin/" . execname()+ & bar is a string \\ +\verb|c++| & c is a number \\ +\verb+s = sprint(2345)+ & s becomes the string "2345" \\ +\end{tabular} + +By default, variables are local to the probe they are used in. That +is, they are initialized, used, and disposed of at each probe handler +invocation. To share variables between probes, declare them global +anywhere in the script. Because of possible concurrency (multiple +probe handlers running on different CPUs), each global variable used +by a probe is automatically read- or write-locked while the handler is +running. \nomenclature{global variable}{A scalar, array, or aggregate that was +named in a \verb+global+ declaration, sharing that object amongst all +probe handlers and functions executed during a systemtap session.} +\nomenclature{locking}{An automated facility used by systemtap to +protect global variables against concurrent modification and/or +access.} + +\begin{figure}[h!] 
+\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat timer-jiffies.stp +global count_jiffies, count_ms +probe timer.jiffies(100) { count_jiffies ++ } +probe timer.ms(100) { count_ms ++ } +probe timer.ms(12345) +{ + hz=(1000*count_jiffies) / count_ms + printf ("jiffies:ms ratio %d:%d => CONFIG_HZ=%d\n", + count_jiffies, count_ms, hz) + exit () +} + +# stap timer-jiffies.stp +jiffies:ms ratio 30:123 => CONFIG_HZ=243 +\end{verbatim} +\end{boxedminipage} +\caption{Experimentally measuring {\tt CONFIG\_HZ}.} +\label{fig:timer-jiffies} +\end{figure} + +\subsection{Target variables} + +A class of special ``target variables'' allows access to the probe +point context. \nomenclature{target variable}{A value that may be +extracted from the kernel context of the probe point, such as a +parameter or local variable within a probed function.} In a symbolic +debugger, when you're stopped at a breakpoint, you can print values +from the program's context. In systemtap scripts, for those probe +points that match a specific executable point (rather than an +asynchronous event like a timer), you can do the same. To know which +variables are likely to be available, you will need to be familiar +with the kernel source you are probing. In addition, you will need to +check that the compiler has not optimized those values into +unreachable nonexistence. + +Let's say that you are trying to trace filesystem reads/writes to a +particular device/inode. From your knowledge of the kernel, you know +that two functions of interest could be \verb+vfs_read+ and +\verb+vfs_write+. Each takes a \verb+struct file *+ argument, inside +which there is a \verb+struct dentry *+, a \verb+struct inode *+, and +so on. Systemtap allows limited dereferencing of such pointer chains. +Two functions, \verb+user_string+ and \verb+kernel_string+, can copy +\verb+char *+ target variables into systemtap strings.
+Figure~\ref{fig:inode-watch} demonstrates one way to monitor a +particular file (identified by device number and inode number). This +example also demonstrates pasting numeric command-line arguments +(\verb+$1+ etc.) into scripts. +%$ + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat inode-watch.stp +probe kernel.function ("vfs_write"), + kernel.function ("vfs_read") +{ + dev_nr = $file->f_dentry->d_inode->i_sb->s_dev + inode_nr = $file->f_dentry->d_inode->i_ino + + if (dev_nr == ($1 << 20 | $2) # major/minor device + && inode_nr == $3) + printf ("%s(%d) %s 0x%x/%u\n", + execname(), pid(), probefunc(), dev_nr, inode_nr) +} +# stat -c '%D %i' /etc/crontab +803 988136 +# stap inode-watch.stp 8 3 988136 +crond(2419) vfs_read 0x800003/988136 +crond(2419) vfs_read 0x800003/988136 +crond(2419) vfs_read 0x800003/988136 +\end{verbatim} +% $ +\end{boxedminipage} +\caption{Watching for reads/writes to a particular file.} +\label{fig:inode-watch} +\end{figure} + +\subsection{Functions} + +Functions are conveniently packaged reusable software: it would be a +shame to have to duplicate a complex condition expression or logging +directive in every place it's used. So, systemtap lets you define +functions of your own. Like global variables, systemtap functions may +be defined anywhere in the script. They may take any number of string +or numeric arguments (by value), and may return a single string or +number. The parameter types are inferred as for ordinary variables, +and must be consistent throughout the program. Local and global +script variables are available, but target variables are {\em not}. +That's because there is no specific debugging-level context associated +with a function. +\nomenclature{function}{A clump of parametrized script statements that +may be repeatedly and recursively called from probe handlers and other +functions.} + +A function is defined with the keyword \verb+function+ followed by a +name.
Then comes a comma-separated formal argument list (just a list +of variable names). The \verb+{ }+-enclosed body consists of any list +of statements, including expressions that call functions. Recursion +is possible, up to a nesting depth limit. Figure~\ref{fig:functions} +displays function syntax. + + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# Red Hat convention +function system_uid_p (u) { return u < 500 } + +# kernel device number assembly macro +function makedev (major,minor) { return major << 20 | minor } + +function trace_common () +{ + printf("%d %s(%d)", gettimeofday_s(), execname(), pid()) + # no return value necessary +} + +function fibonacci (i) +{ + if (i < 1) return 0 + else if (i < 2) return 1 + else return fibonacci(i-1) + fibonacci(i-2) +} +\end{verbatim} +\end{boxedminipage} +\caption{Some functions of dubious utility.} +\label{fig:functions} +\end{figure} + +\subsection{Arrays} + +Often, probes will want to share data that cannot be represented as a +simple scalar value. Much data is naturally tabular in nature, +indexed by some tuple of thread numbers, processor ids, names, time, +and so on. Systemtap offers associative arrays for this purpose. +These arrays are implemented as hash tables with a maximum size that +is fixed at startup. Because they are too large to be created +dynamically for individual probe handler runs, they must be declared +as global. \nomenclature{array}{A global +\verb+[+$k_1,k_2,\ldots,k_n\verb+]+\rightarrow value$ +associative lookup table, with a string or +number for each index; the value may be a string, number, or an aggregate.} + +\begin{tabular}{rl} +\verb|global a| & declare global scalar or array variable \\ +\verb|global b[400]| & declare array, reserving space for up to 400 tuples \\ +\end{tabular} + +The basic operations for arrays are setting and looking up elements.
+These are expressed in \verb+awk+ syntax: the array name followed by +an opening \verb+[+ bracket, a comma-separated list of index +expressions, and a closing \verb+]+ bracket. Each index expression +may be string or numeric, as long as it is consistently typed +throughout the script. +\nomenclature{arity}{Number of indexes to an array, or number of parameters +to a function.} + +\begin{tabular}{rl} +\verb|foo [4,"hello"] ++ | & increment the named array slot \\ +\verb|processusage [uid(),execname()] ++| & update a statistic \\ +\verb|times [tid()] = get_cycles()| & set a timestamp reference point \\ +\verb|delta = get_cycles() - times [tid()]| & compute a timestamp delta \\ +\end{tabular} + +Array elements that have not been set {\em may} be fetched, and return +a dummy null value (zero or an empty string) as appropriate. However, +assigning a null value does not delete the element: an explicit +\verb|delete| statement is required. \nomenclature{null value}{A +default initialized value for globals and array elements: a zero or an +empty string, depending on type.} Systemtap provides syntactic sugar +for these operations, in the form of explicit membership testing and +deletion. + +\begin{tabular}{rl} +\verb|if ([4,"hello"] in foo) { }| & membership test \\ +\verb|delete times[tid()]| & deletion of a single element \\ +\verb|delete times| & deletion of all elements \\ +\end{tabular} + +One final and important operation is iteration over arrays. This uses +the keyword \verb+foreach+. Like \verb+awk+, this creates a loop that +{\em iterates over key tuples} of an array, not just {\em values}. In +addition, the iteration may be {\em sorted} by any single key or the +value by adding an extra \verb|+| or \verb|-| code. + +The \verb+break+ and \verb+continue+ statements work inside +\verb+foreach+ loops, too. Since arrays can be large but probe +handlers must not run for long, it is a good idea to exit iteration +early if possible. 
The \verb+limit+ option in the \verb+foreach+ +expression is one way. For simplicity, systemtap forbids any {\em +modification} of an array while it is being iterated using a +\verb+foreach+. + +\begin{tabular}{rl} +\verb|foreach ([a,b] in foo) { fuss_with(foo[a,b]) }| & simple loop in arbitrary sequence \\ +\verb|foreach ([a,b] in foo+ limit 5) { }| & loop in increasing sequence of value, stop after 5 \\ +\verb|foreach ([a-,b] in foo) { }| & loop in decreasing sequence of first key \\ +\end{tabular} + +\subsection{Aggregates} + +When we said above that values can only be strings or numbers, we lied +a little. There is a third type: statistics aggregates, or aggregates +for short. Instances of this type are used to collect statistics on +numerical values, where it is important to accumulate new data quickly +({\em without} exclusive locks) and in large volume (storing only +aggregated stream statistics). This type only makes sense for global +variables, and may be stored individually or as elements of an array. +\nomenclature{aggregate}{A special ``write-mostly'' data type used to +efficiently store aggregated statistical values of a potentially huge +data stream.} + +To add a value to a statistics aggregate, systemtap uses the special +operator \verb+<<<+. Think of it like C++'s \verb+<<+ output +streamer: the left hand side object accumulates the data sample given +on the right hand side. This operation is efficient (taking a shared +lock) because the aggregate values are kept separately on each +processor, and are only aggregated across processors on request. + +\begin{verbatim} +a <<< delta_timestamp +writes[execname()] <<< count +\end{verbatim} + +To read the aggregate value, special functions are available to +extract a selected statistical function. {\em The aggregate value +cannot be read by simply naming it as if it were an ordinary +variable.} These operations take an exclusive lock on the respective +globals, and should therefore be relatively rare. 
The simple ones +are: \verb+@min+, \verb+@max+, \verb+@count+, \verb+@avg+, and +\verb+@sum+, and evaluate to a single number. In addition, histograms +of the data stream may be extracted using the \verb+@hist_log+ and +\verb+@hist_linear+. These evaluate to a special sort of array that +may at present\footnote{We anticipate support for indexing and looping +using {\tt foreach} shortly.} only be printed. +\nomenclature{extractor}{A function-like expression in a script that +computes a single statistic for a given aggregate.} + +\begin{tabular}{rl} +\verb+@avg(a)+ & the average of all the values accumulated + into \verb+a+ \\ +\verb+print(@hist_linear(a,0,100,10))+ & print an ``ascii art'' linear + histogram of the same data stream, \\ + & bounds $0 \ldots 100$, bucket width is $10$ \\ +\verb|@count(writes["zsh"])| & the number of times ``zsh'' + ran the probe handler \\ +\verb+print(@hist_log(writes["zsh"]))+ & print an ``ascii art'' logarithmic + histogram of the same data stream \\ +\end{tabular} + +\subsection{Safety} +\label{sec:safety} + +The full expressivity of the scripting language raises good questions +of safety. Here is a set of Q\&A: + +\begin{description} +\item{\bf What about infinite loops? recursion?} A probe handler is +bounded in time. The C code generated by systemtap includes explicit +checks that limit the total number of statements executed to a small +number. A similar limit is imposed on the nesting depth of function +calls. When either limit is exceeded, that probe handler cleanly +aborts and signals an error. The systemtap session is normally +configured to abort as a whole at that time. + +\item{\bf What about running out of memory?} No dynamic memory +allocation whatsoever takes place during the execution of probe +handlers. Arrays, function contexts, and buffers are allocated during +initialization. These resources may run out during a session, and +generally result in errors. 
+ +\item{\bf What about locking?} If multiple probes seek conflicting +locks on the same global variables, one or more of them will time out, +and be aborted. Such events are tallied as ``skipped'' probes, and a +count is displayed at session end. A configurable number of skipped +probes can trigger an abort of the session. + +\item{\bf What about null pointers? division by zero?} The C code +generated by systemtap translates potentially dangerous operations to +routines that check their arguments at run time. These signal errors +if they are invalid. Many arithmetic and string operations silently +overflow if the results exceed representation limits. + +\item{\bf What about bugs in the translator? compiler?} While bugs +in the translator, or the runtime layer certainly exist\footnote{See +\tt http://sources.redhat.com/bugzilla}, our test suite gives some +assurance. Plus, the entire generated C code may be inspected (try +the \verb+-p3+ option). Compiler bugs are unlikely to be of any +greater concern for systemtap than for the kernel as a whole. In +other words, if it was reliable enough to build the kernel, it will +build the systemtap modules properly too. + +\item{\bf Is that the whole truth?} In practice, there are several +weak points in systemtap and the underlying kprobes system at the time +of writing. Putting probes indiscriminately into unusually sensitive +parts of the kernel (low level context switching, interrupt +dispatching) has reportedly caused crashes in the past. We are +fixing these bugs as they are found, and +constructing a probe point ``blacklist'', but it is not complete. 
+\nomenclature{blacklist}{A list of probe point patterns encoded into +the translator or the kernel, where probing is prohibited for safety +reasons.} \nomenclature{kprobes}{A breakpoint dispatching system for +dynamic kernel probes, used by systemtap to implement some families of +probe points.} + +\end{description} + + +\subsection{Exercises} +\begin{enumerate} +\item Alter the last probe in \verb+timer-jiffies.stp+ to reset the +counters and continue reporting instead of exiting. + +\item Write a script that, every ten seconds, displays the top five +most frequent users of the \verb+open+ system call during that interval. + +\item Write a script that experimentally measures the speed of the +\verb+get_cycles()+ counter on each processor. + +\item Use any suitable probe point to get an approximate profile of +process CPU usage: which processes/users use how much of each CPU. +\end{enumerate} + +\section{Tapsets} + +After writing enough analysis scripts for yourself, you may become +known as an expert to your colleagues, who will want to use your +scripts. Systemtap makes it possible to share in a controlled manner; +to build libraries of scripts that build on each other. In fact, all +of the functions (\verb+pid()+, etc.) used in the scripts above come +from tapset scripts like that. A ``tapset'' is just a script that is +designed for reuse by installation into a special directory. + +\subsection{Automatic selection} + +Systemtap attempts to resolve references to global symbols (probes, +functions, variables) that are not defined within the script by a +systematic search through the tapset library for scripts that define +those symbols. Tapset scripts are installed under the default +directory named \verb+/usr/share/systemtap/tapset+. A user may give +additional directories with the \verb+-I DIR+ option. Systemtap +searches these directories for script (\verb+.stp+) files. 
+ +The search process includes subdirectories that are specialized for a +particular kernel version and/or architecture, and ones that name only +larger kernel families. Naturally, the search is ordered from +specific to general, as shown in Figure~\ref{fig:tapset-search}. +\nomenclature{tapset search path}{A list of subdirectories searched by +systemtap for tapset scripts, allowing specialization by version +architecture.} + +\begin{figure}[h!] +\begin{boxedminipage}{6in} +\begin{verbatim} +# stap -p1 -vv -e 'probe begin { }' > /dev/null +Created temporary directory "/tmp/staplnEBh7" +Searched '/usr/share/systemtap/tapset/2.6.15/i686/*.stp', match count 0 +Searched '/usr/share/systemtap/tapset/2.6.15/*.stp', match count 0 +Searched '/usr/share/systemtap/tapset/2.6/i686/*.stp', match count 0 +Searched '/usr/share/systemtap/tapset/2.6/*.stp', match count 0 +Searched '/usr/share/systemtap/tapset/i686/*.stp', match count 1 +Searched '/usr/share/systemtap/tapset/*.stp', match count 12 +Pass 1: parsed user script and 13 library script(s) in 350usr/10sys/375real ms. +Running rm -rf /tmp/staplnEBh7 +\end{verbatim} +\end{boxedminipage} +\caption{Listing the tapset search path.} +\label{fig:tapset-search} +\end{figure} + +When a script file is found that {\em defines} one of the undefined +symbols, that {\em entire file} is added to the probing session being +analyzed. This search is repeated until no more references can become +satisfied. Systemtap signals an error if any are still unresolved. + +This mechanism enables several programming idioms. First, it allows +some global symbols to be defined only for applicable kernel +version/architecture pairs, and cause an error if their use is +attempted on an inapplicable host. Similarly, the same symbol can be +defined differently depending on kernels, in much the same way that +different kernel \verb+include/asm/ARCH/+ files contain macros that +provide a porting layer. 
+ +Another use is to separate the default parameters of a tapset routine +from its implementation. For example, consider a tapset that defines +code for relating elapsed time intervals to process scheduling +activities. The data collection code can be generic with respect to +which time unit (jiffies, wall-clock seconds, cycle counts) it can +use. It should have a default, but should not require additional +run-time checks to let a user choose another. +Figure~\ref{fig:tapset-default} shows a way. + +\begin{figure}[h!] +\begin{boxedminipage}{6in} +\begin{verbatim} +# cat tapset/time-common.stp +global __time_vars +function timer_begin (name) { __time_vars[name] = __time_value () } +function timer_end (name) { return __time_value() - __time_vars[name] } + +# cat tapset/time-default.stp +function __time_value () { return gettimeofday_us () } + +# cat tapset-time-user.stp +probe begin +{ + timer_begin ("bench") + for (i=0; i<100; i++) ; + printf ("%d cycles\n", timer_end ("bench")) + exit () +} +function __time_value () { return get_ticks () } # override for greater precision + +\end{verbatim} +\end{boxedminipage} +\caption{Providing an overrideable default.} +\label{fig:tapset-default} +\end{figure} + +A tapset that exports only {\em data} may be as useful as ones that +export functions or probe point aliases (see below). Such global +data can be computed and kept up-to-date using probes internal to the +tapset. Any outside reference to the global variable would +incidentally activate all the required probes. + +\subsection{Probe point aliases} + +\nomenclature{probe point alias}{A probe point that is defined in +terms of another probe point.} Probe point aliases allow creation of +new probe points from existing ones. This is useful if the new probe +points are named to provide a higher level of abstraction. 
For +example, the system-calls tapset defines probe point aliases of the +form \verb+syscall.open+ etc., in terms of lower level ones like +\verb+kernel.function("sys_open")+. Even if some future kernel +renames \verb+sys_open+, the aliased name can remain valid. + +A probe point alias definition looks like a normal probe. Both start +with the keyword \verb+probe+ and have a probe handler statement block +at the end. But where a normal probe just lists its probe points, an +alias creates a new name using the assignment (\verb+=+) operator. +Another probe that names the new probe point will create an actual +probe, with the handler of the alias {\em prepended}. + +This prepending behavior serves several purposes. It allows the alias +definition to ``preprocess'' the context of the probe before passing +control to the user-specified handler. This has several possible uses: +\begin{tabular}{rl} +\verb+if ($flag1 != $flag2) next+ & skip probe unless given condition is met \\ +\verb+name = "foo"+ & supply probe-describing values \\ +\verb+var = $var+ & extract target variable to plain local variable \\ %$ +\end{tabular} + +Figure~\ref{fig:probe-alias} demonstrates a probe point alias +definition as well as its use. It demonstrates how a single probe +point alias can expand to multiple probe points, even to other +aliases. It also includes probe point wildcarding. These functions +are designed to compose sensibly. + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat probe-alias.stp +probe syscallgroup.io = syscall.open, syscall.close, + syscall.read, syscall.write +{ groupname = "io" } + +probe syscallgroup.process = syscall.fork, syscall.execve +{ groupname = "process" } + +probe syscallgroup.* +{ groups [execname() . "/" . 
groupname] ++ } + +probe end +{ + foreach (eg+ in groups) + printf ("%s: %d\n", eg, groups[eg]) +} + +global groups + +# stap probe-alias.stp +05-wait_for_sys/io: 19 +10-udev.hotplug/io: 17 +20-hal.hotplug/io: 12 +X/io: 73 +apcsmart/io: 59 +[...] +make/io: 515 +make/process: 16 +[...] +xfce-mcs-manage/io: 3 +xfdesktop/io: 5 +[...] +xmms/io: 7070 +zsh/io: 78 +zsh/process: 5 +\end{verbatim} +\end{boxedminipage} +\caption{Classified system call activity.} +\label{fig:probe-alias} +\end{figure} + +\subsection{Embedded C} +\label{embedded-c} + +Sometimes, a tapset needs to provide data values from the kernel that +cannot be extracted using ordinary target variables (\verb+$var+). %$ +This may be because the values are in complicated data structures, may +require lock awareness, or are defined by layers of macros. Systemtap +provides an ``escape hatch'' to go beyond what the language can safely +offer. In certain contexts, you may embed plain raw C in tapsets, +exchanging power for the safety guarantees listed in +section~\ref{sec:safety}. End-user scripts {\em may not} include +embedded C code, unless systemtap is run with the \verb+-g+ (``guru'' +mode) option. Tapset scripts get guru mode privileges automatically. +\nomenclature{embedded C}{Special syntax permitting tapsets to include +literal C code.} + +Embedded C can be the body of a script function. Instead of enclosing +the function body statements in \verb+{+ and \verb+}+, use \verb+%{+ +and \verb+%}+. Any enclosed C code is literally transcribed into the +kernel module: it is up to you to make it safe and correct. In order +to take parameters and return a value, a pointer macro \verb+THIS+ is +available. Function parameters and a place for the return value are +available as fields of that pointer. The familiar data-gathering +functions \verb+pid()+, \verb+execname()+, and their neighbours are +all embedded C functions. Figure~\ref{fig:embedded-C} contains +another example. 
+ +Since systemtap cannot examine the C code to infer these types, an +optional\footnote{This is only necessary if the types cannot be +inferred from other sources, such as the call sites.} annotation +syntax is available to assist the type inference process. Simply +suffix parameter names and/or the function name with \verb+:string+ or +\verb+:long+ to designate the string or numeric type. In addition, +the script may include a \verb+%{+ \verb+%}+ block at the outermost +level of the script, in order to transcribe declarative code like +\verb+#include <linux/foo.h>+. These enable the embedded C functions +to refer to general kernel types. + +There are a number of safety-related constraints that should be +observed by developers of embedded C code. +\begin{enumerate} +\item Do not dereference pointers that are not known or testably valid. +\item Do not call any kernel routine that may cause a sleep or fault. +\item Consider possible undesirable recursion, where your embedded C +function calls a routine that may be the subject of a probe. If that +probe handler calls your embedded C function, you may suffer infinite +regress. Similar problems may arise with respect to non-reentrant +locks. +\item If locking of a data structure is necessary, use a +\verb+trylock+ type call to attempt to take the lock. If that fails, +give up, do not block. +\end{enumerate} + +\begin{figure}[h!] +\begin{boxedminipage}{4.5in} +\begin{verbatim} +# cat embedded-C.stp +%{ +#include <linux/utsname.h> +%} + +function utsname:string (field:long) +%{ + if (down_read_trylock (& uts_sem)) + { + const char *f = + (THIS->field == 0 ? system_utsname.sysname : + THIS->field == 1 ? system_utsname.nodename : + THIS->field == 2 ? system_utsname.release : + THIS->field == 3 ? system_utsname.version : + THIS->field == 4 ? system_utsname.machine : + THIS->field == 5 ? 
system_utsname.domainname : ""); + strlcpy (THIS->__retvalue, f, MAXSTRINGLEN); + up_read (& uts_sem); + } +%} + +probe begin +{ + printf ("%s %s\n", utsname(0), utsname(2)) + exit () +} + +# stap -g embedded-C.stp +Linux 2.6.15 +\end{verbatim} +\end{boxedminipage} +\caption{Embedded C function.} +\label{fig:embedded-C} +\end{figure} + +\subsection{Naming conventions} + +Using the tapset search mechanism just described, potentially many +script files can become selected for inclusion in a single session. +This raises the problem of name collisions, where different tapsets +accidentally use the same names for functions/globals. This can +result in errors at translate or run time. + +To control this problem, systemtap tapset developers are advised to +follow naming conventions. Here is some of the guidance. +\nomenclature{naming convention}{Guidelines for naming variables and +functions to prevent unintentional duplication.} +\begin{enumerate} +\item Pick a unique name for your tapset, and substitute it for +{\em TAPSET} below. +\item Separate identifiers meant to be used by tapset users from +those that are internal implementation artifacts. +\item Document the first set in the appropriate \verb+man+ pages. +\item Prefix the names of external identifiers with {\em TAPSET}\_ if +there is any likelihood of collision with other tapsets or end-user +scripts. +\item Prefix any probe point aliases with an appropriate prefix. +\item Prefix the names of internal identifiers with \_\_{\em TAPSET}\_. +\end{enumerate} + +\subsection{Exercises} + +\begin{enumerate} +\item Write a tapset that implements deferred and ``cancelable'' +logging. Export a function that enqueues a text string (into some +private array), returning an id token. Include a timer-based probe +that periodically flushes the array to the standard log output. +Export another function that, if the entry was not already flushed, +allows a text string to be cancelled from the queue. 
+ +\item Create a ``relative timestamp'' tapset with functions that return all +the same values as the ones in the timestamp tapset, except that they +are made relative to the start time of the script. + +\item Create a tapset that exports a global array that contains a +mapping of recently seen process ID numbers to process names. +Intercept key system calls (\verb+execve+?) to update the list +incrementally. + +\item Send your tapset ideas to the mailing list! +\end{enumerate} + +\section{Further information} + +For further information about systemtap, several sources are available. + +There are \verb+man+ pages: + +\begin{tabular}{rl} +\verb+stap+ & systemtap program usage, language summary \\ +\verb+stapfuncs+ & functions provided by tapsets \\ +\verb+stapprobes+ & probes / probe aliases provided by tapsets \\ +\verb+stapex+ & some example scripts \\ +\end{tabular} + +Then, there is the source code itself. Since systemtap is {\em free +software}, you should have available the entire source code. The +source files in the \verb+tapset/+ directory are also packaged along +with the systemtap binary. Since systemtap reads these files rather +than their documentation, they are the most reliable way to see what's +inside all the tapsets. Use the \verb+-v+ (verbose) command line +option, several times if you like, to show inner workings. +\nomenclature{free software}{Software licensed under terms such as the +GNU GPL, which aims to enforce certain specified user freedoms such +as study, modification, and sharing.} + +Finally, there is the project web site +(\verb+http://sources.redhat.com/systemtap/+) with several articles, +an archived public mailing list for users and developers +(\verb+systemtap@sources.redhat.com+), and a live CVS source +repository. Come join us! 
+ + +\appendix + +\section{Glossary} +\renewcommand{\nomname}{} +\printglossary +\begin{htmlonly} +{\em Sorry, not available in HTML.} +\end{htmlonly} + +\section{Errors} + +We explain some common systemtap error messages in this section. Most +error messages include line/character numbers with which one can +locate the precise location of error in the script code. There is +sometimes a subsequent or prior line that elaborates. + +{\large {\em error} {\tt at:} {\em filename}:{\em line}:{\em column}: {\em details}} + +\subsection{Parse errors} + +\begin{description} +\item{\bf parse error: expected {\em foo}, saw {\em bar} $\ldots$} \\ +The script contained a grammar error. A different type of construct +was expected in the given context. + +\item{\bf parse error: embedded code in unprivileged script} \\ The +script contained unsafe constructs such as embedded C (section +\ref{embedded-c}), but was run without the \verb+-g+ (guru mode) +option. Confirm that the constructs are used safely, then try +again with \verb+-g+. +\end{description} + +\subsection{Type errors} + +\begin{description} +\item{\bf semantic error: type mismatch for identifier '{\em foo}' +$\ldots$ string vs. long} \\ In this case, the identifier {\em foo} +was previously inferred as a numeric type (``long''), but at the given +point is being used as a string. Similar messages appear if an array +index or function parameter slot is used with conflicting types. + +\item{\bf semantic error: unresolved type for identifier '{\em foo}'} +\\ The identifier {\em foo} was used, for example in a \verb+print+, +but without any operations that could assign it a type. Similar +messages may appear if a symbol is misspelled by a typo. + +\item{\bf semantic error: Expecting symbol or array index expression} +\\ Something other than an assignable lvalue was on the left hand side +of an assignment. 
+\end{description} + +\subsection{Symbol errors} + +\begin{description} +\item{\bf while searching for arity {\em N} function, semantic error: +unresolved function call} \\ The script calls a function with {\em N} +arguments that does not exist. The function may exist with different +arity. + +\item{\bf semantic error: array locals not supported: $\ldots$} \\ An +array operation is present for which no matching global declaration +was found. Similar messages appear if an array is used with +inconsistent arities. + +\item{\bf semantic error: variable '{\em foo}' modified during 'foreach'} \\ +The array {\em foo} is being modified (being assigned to or deleted from) +within an active \verb+foreach+ loop. This invalid operation is also +detected within a function called from within the loop. +\end{description} + +\subsection{Probing errors } + +\begin{description} +\item{\bf semantic error: probe point mismatch at position {\em N}, +while resolving probe point {\em foo}} \\ A probe point was named that is +neither directly understood by systemtap, nor defined as an alias by a +tapset script. The divergence from the ``tree'' of probe point +namespace is at position {\em N} (starting with zero at left). + +\item{\bf semantic error: no match for probe point, while resolving +probe point {\em foo}} \\ A probe point cannot be resolved for any of +a variety of reasons. It may be a debuginfo-based probe point such as +\verb+kernel.function("foobar")+ where no \verb+foobar+ function was +found. This can occur if the script specifies a wildcard on function +names, or an invalid file name or source line number. + +\item{\bf semantic error: unresolved target-symbol expression} \\ A +target variable was referred to in a probe handler that was not +resolvable. Or, a target variable is not valid at all in a context +such as a script function. 
This variable may have been elided by an +optimizing compiler, or may not have a suitable type, or there might +just be an annoying bug somewhere. Try again with a slightly +different probe point (use \verb+statement()+ instead of +\verb+function()+) to search for a more cooperative neighbour in the +same area. + +\item{\bf semantic error: libdwfl failure $\ldots$} \\ There was a +problem processing the debugging information. It may simply be +missing, or may have some consistency / correctness problems. Later +compilers tend to produce better debugging information, so if you can +upgrade and recompile your kernel/application, it may help. + +\item{\bf semantic error: cannot find {\em foo} debuginfo} \\ Similarly, +suitable debugging information was not found. Check that your kernel +build/installation includes a matching version of debugging data. +\end{description} + +\subsection{Runtime errors} + +\begin{description} + +\item{\bf WARNING: Number of errors: {\em N}, skipped probes: {\em M}} \\ +Errors and/or skipped probes occurred during this run. +\nomenclature{skipped probe}{A probe handler that should have run but +couldn't, due to contention or temporary resource problems.} + +\item{\bf division by 0} \\ The script code performed an invalid +division. + +\item{\bf aggregate element not found} \\ A statistics extractor +function other than \verb+@count+ was invoked on an aggregate that has +not had any values accumulated yet. This is similar to a division by +zero. + +\item{\bf aggregation overflow} \\ An array containing aggregate +values contains too many distinct key tuples at this time. + +\item{\bf MAXNESTING exceeded} \\ Too many levels of function call nesting +were attempted. + +\item{\bf MAXACTION exceeded} \\ The probe handler attempted to execute +too many statements. + +\item{\bf kernel/user string copy fault at {\em 0xaddr}} \\ +The probe handler attempted to copy a string from kernel or user space +at an invalid address. 
+ +\item{\bf pointer dereference fault} \\ +There was a fault encountered during a pointer dereference operation such +as a target variable evaluation. + +\end{description} + + +\section{Acknowledgments} + +The author thanks Martin Hunt, Will Cohen, and Jim Keniston for +improvement advice for this paper. + +\end{document} diff --git a/doc/tutorial/embedded-C.stp b/doc/tutorial/embedded-C.stp new file mode 100644 index 000000000..6834d728a --- /dev/null +++ b/doc/tutorial/embedded-C.stp @@ -0,0 +1,25 @@ +%{ +#include <linux/utsname.h> +%} + +function utsname:string (field:long) +%{ + if (down_read_trylock (& uts_sem)) + { + const char *f = + (THIS->field == 0 ? system_utsname.sysname : + THIS->field == 1 ? system_utsname.nodename : + THIS->field == 2 ? system_utsname.release : + THIS->field == 3 ? system_utsname.version : + THIS->field == 4 ? system_utsname.machine : + THIS->field == 5 ? system_utsname.domainname : ""); + strlcpy (THIS->__retvalue, f, MAXSTRINGLEN); + up_read (& uts_sem); + } +%} + +probe begin +{ + printf ("%s %s\n", utsname(0), utsname(2)) + exit () +} diff --git a/doc/tutorial/functions.stp b/doc/tutorial/functions.stp new file mode 100644 index 000000000..6a825722d --- /dev/null +++ b/doc/tutorial/functions.stp @@ -0,0 +1,18 @@ +# Red Hat convention +function system_uid_p (u) { return u < 500 } + +# kernel device number assembly macro +function makedev (major,minor) { return major << 20 | minor } + +function trace_common () +{ + printf("%d %s(%d)", gettimeofday_s(), execname(), pid()) + # no return value +} + +function fibonacci (i) +{ + if (i < 1) return 0 + else if (i < 2) return 1 + else return fibonacci(i-1) + fibonacci(i-2) +} diff --git a/doc/tutorial/hello-world.stp b/doc/tutorial/hello-world.stp new file mode 100644 index 000000000..6a9037a71 --- /dev/null +++ b/doc/tutorial/hello-world.stp @@ -0,0 +1,5 @@ +probe begin +{ + print ("hello world\n") + exit () +} diff --git a/doc/tutorial/inode-watch.stp b/doc/tutorial/inode-watch.stp new file mode 100644 
index 000000000..caf04b9ad --- /dev/null +++ b/doc/tutorial/inode-watch.stp @@ -0,0 +1,13 @@ +probe kernel.function ("vfs_write"), + kernel.function ("vfs_read") +{ + dev_nr = $file->f_dentry->d_inode->i_sb->s_dev + inode_nr = $file->f_dentry->d_inode->i_ino + + if (dev_nr == ($1 << 20 | $2) # major/minor device + && inode_nr == $3) + printf ("%s(%d) %s 0x%x/%u\n", + execname(), pid(), probefunc(), dev_nr, inode_nr) +} + +# dev_name = kernel_string ($file->f_dentry->d_inode->i_sb->s_id) diff --git a/doc/tutorial/probe-alias.stp b/doc/tutorial/probe-alias.stp new file mode 100644 index 000000000..aa5feb1b6 --- /dev/null +++ b/doc/tutorial/probe-alias.stp @@ -0,0 +1,17 @@ +probe syscallgroup.io = syscall.open, syscall.close, + syscall.read, syscall.write +{ groupname = "io" } + +probe syscallgroup.process = syscall.fork, syscall.execve +{ groupname = "process" } + +probe syscallgroup.* +{ groups [execname() . "/" . groupname] ++ } + +probe end +{ + foreach (eg+ in groups) + printf ("%s: %d\n", eg, groups[eg]) +} + +global groups diff --git a/doc/tutorial/socket-trace.stp b/doc/tutorial/socket-trace.stp new file mode 100644 index 000000000..53b69ecc6 --- /dev/null +++ b/doc/tutorial/socket-trace.stp @@ -0,0 +1,6 @@ +probe kernel.function("*@net/socket.c") { + printf ("%s -> %s\n", thread_indent(1), probefunc()) +} +probe kernel.function("*@net/socket.c").return { + printf ("%s <- %s\n", thread_indent(-1), probefunc()) +} diff --git a/doc/tutorial/strace-open.stp b/doc/tutorial/strace-open.stp new file mode 100644 index 000000000..fb87cec1d --- /dev/null +++ b/doc/tutorial/strace-open.stp @@ -0,0 +1,8 @@ +probe syscall.open +{ + printf ("%s(%d) open (%s)\n", execname(), pid(), argstr) +} +probe timer.ms(4000) # after 4 seconds +{ + exit () +} diff --git a/doc/tutorial/tapset-time-user.stp b/doc/tutorial/tapset-time-user.stp new file mode 100644 index 000000000..32069b032 --- /dev/null +++ b/doc/tutorial/tapset-time-user.stp @@ -0,0 +1,8 @@ +probe begin +{ + timer_begin 
("bench") + for (i=0; i<100; i++) ; + printf ("%d cycles\n", timer_end ("bench")) + exit () +} +function __time_value () { return get_cycles () } # override diff --git a/doc/tutorial/tapset/time-common.stp b/doc/tutorial/tapset/time-common.stp new file mode 100644 index 000000000..cec5a4eaa --- /dev/null +++ b/doc/tutorial/tapset/time-common.stp @@ -0,0 +1,4 @@ +global __time_vars +function timer_begin (name) { __time_vars[name] = __time_value () } +function timer_end (name) { return __time_value() - __time_vars[name] } + diff --git a/doc/tutorial/tapset/time-default.stp b/doc/tutorial/tapset/time-default.stp new file mode 100644 index 000000000..614ff5066 --- /dev/null +++ b/doc/tutorial/tapset/time-default.stp @@ -0,0 +1,2 @@ +function __time_value () { return gettimeofday_us () } + diff --git a/doc/tutorial/timer-jiffies.stp b/doc/tutorial/timer-jiffies.stp new file mode 100644 index 000000000..d5e92e4ad --- /dev/null +++ b/doc/tutorial/timer-jiffies.stp @@ -0,0 +1,10 @@ +global count_jiffies, count_ms +probe timer.jiffies(100) { count_jiffies ++ } +probe timer.ms(100) { count_ms ++ } +probe timer.ms(12345) +{ + hz=(1000*count_jiffies) / count_ms + printf ("jiffies:ms ratio %d:%d => CONFIG_HZ=%d\n", + count_jiffies, count_ms, hz) + exit () +}