AUTOMAKE_OPTIONS = info-in-builddir foreign no-texinfo.tex
+PDFS = gprofng.pdf
+HTMLS = gprofng.html
+
# Options to extract the man page
MANCONF = -Dman
POD2MAN = pod2man --center="User Commands" \
--release="binutils-$(VERSION)" --section=1
-info_TEXINFOS = gprofng.texi
-gprofng_TEXINFOS = fdl.texi
+info_TEXINFOS = gprofng_ug.texi
+gprofng_ug_TEXINFOS = fdl.texi gp-macros.texi
TEXINFO_TEX = .
MAKEINFOHTML = $(MAKEINFO) --html --no-split
-man_MANS = gprofng.1
+man_MANS = gprofng.1 gp-archive.1 gp-collect-app.1 gp-display-html.1 gp-display-src.1 gp-display-text.1
# Build the man page from the texinfo file
# The sed command removes the no-adjust Nroff command so that
# the man output looks standard.
-gprofng.1: $(srcdir)/gprofng.texi
+$(man_MANS): $(srcdir)/gp-macros.texi
$(AM_V_GEN)touch $@
- $(AM_V_at)-$(TEXI2POD) $(MANCONF) < $(srcdir)/gprofng.texi > gprofng.pod
- $(AM_V_at)-($(POD2MAN) gprofng.pod | \
- sed -e '/^.if n .na/d' > $@.tmp && \
+ $(AM_V_at)-$(TEXI2POD) $(MANCONF) < $(srcdir)/`basename $@ .1`.texi > $@.pod
+ $(AM_V_at)-($(POD2MAN) $@.pod | sed -e '/^.if n .na/d' > $@.tmp && \
mv -f $@.tmp $@) || (rm -f $@.tmp && exit 1)
- $(AM_V_at)rm -f gprofng.pod
+ $(AM_V_at)rm -f $@.pod
+
+gprofng.1: $(srcdir)/gprofng.texi
+gp-archive.1: $(srcdir)/gp-archive.texi
+gp-collect-app.1: $(srcdir)/gp-collect-app.texi
+gp-display-html.1: $(srcdir)/gp-display-html.texi
+gp-display-src.1: $(srcdir)/gp-display-src.texi
+gp-display-text.1: $(srcdir)/gp-display-text.texi
MAINTAINERCLEANFILES = gprofng.info $(man_MANS)
+EXTRA_DIST = $(man_MANS) version.texi
info: $(man_MANS)
am__v_texidevnull_1 =
INFO_DEPS = gprofng.info
am__TEXINFO_TEX_DIR = $(srcdir)/.
-DVIS = gprofng.dvi
-PDFS = gprofng.pdf
-PSS = gprofng.ps
-HTMLS = gprofng.html
-TEXINFOS = gprofng.texi
+DVIS = gprofng_ug.dvi
+PSS = gprofng_ug.ps
+TEXINFOS = gprofng_ug.texi
TEXI2DVI = texi2dvi
TEXI2PDF = $(TEXI2DVI) --pdf --batch
AM_MAKEINFOHTMLFLAGS = $(AM_MAKEINFOFLAGS)
NROFF = nroff
MANS = $(man_MANS)
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
-am__DIST_COMMON = $(gprofng_TEXINFOS) $(srcdir)/Makefile.in \
+am__DIST_COMMON = $(gprofng_ug_TEXINFOS) $(srcdir)/Makefile.in \
$(top_srcdir)/../mkinstalldirs mdate-sh texinfo.tex
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
zlibdir = @zlibdir@
zlibinc = @zlibinc@
AUTOMAKE_OPTIONS = info-in-builddir foreign no-texinfo.tex
+PDFS = gprofng.pdf
+HTMLS = gprofng.html
# Options to extract the man page
MANCONF = -Dman
POD2MAN = pod2man --center="User Commands" \
--release="binutils-$(VERSION)" --section=1
-info_TEXINFOS = gprofng.texi
-gprofng_TEXINFOS = fdl.texi
+info_TEXINFOS = gprofng_ug.texi
+gprofng_ug_TEXINFOS = fdl.texi gp-macros.texi
TEXINFO_TEX = .
MAKEINFOHTML = $(MAKEINFO) --html --no-split
-man_MANS = gprofng.1
+man_MANS = gprofng.1 gp-archive.1 gp-collect-app.1 gp-display-html.1 gp-display-src.1 gp-display-text.1
MAINTAINERCLEANFILES = gprofng.info $(man_MANS)
+EXTRA_DIST = $(man_MANS) version.texi
all: all-am
.SUFFIXES:
clean-libtool:
-rm -rf .libs _libs
-gprofng.info: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS)
+gprofng.info: gprofng_ug.texi $(srcdir)/version.texi $(gprofng_ug_TEXINFOS)
$(AM_V_MAKEINFO)restore=: && backupdir="$(am__leading_dot)am$$$$" && \
rm -rf $$backupdir && mkdir $$backupdir && \
if ($(MAKEINFO) --version) >/dev/null 2>&1; then \
done; \
else :; fi && \
if $(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) \
- -o $@ `test -f 'gprofng.texi' || echo '$(srcdir)/'`gprofng.texi; \
+ -o $@ `test -f 'gprofng_ug.texi' || echo '$(srcdir)/'`gprofng_ug.texi; \
then \
rc=0; \
else \
fi; \
rm -rf $$backupdir; exit $$rc
-gprofng.dvi: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS)
+gprofng.dvi: gprofng_ug.texi $(srcdir)/version.texi $(gprofng_ug_TEXINFOS)
$(AM_V_TEXI2DVI)TEXINPUTS="$(am__TEXINFO_TEX_DIR)$(PATH_SEPARATOR)$$TEXINPUTS" \
MAKEINFO='$(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir)' \
$(TEXI2DVI) $(AM_V_texinfo) --build-dir=$(@:.dvi=.t2d) -o $@ $(AM_V_texidevnull) \
- `test -f 'gprofng.texi' || echo '$(srcdir)/'`gprofng.texi
+ `test -f 'gprofng_ug.texi' || echo '$(srcdir)/'`gprofng_ug.texi
-gprofng.pdf: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS)
+gprofng.pdf: gprofng_ug.texi $(srcdir)/version.texi $(gprofng_ug_TEXINFOS)
$(AM_V_TEXI2PDF)TEXINPUTS="$(am__TEXINFO_TEX_DIR)$(PATH_SEPARATOR)$$TEXINPUTS" \
MAKEINFO='$(MAKEINFO) $(AM_MAKEINFOFLAGS) $(MAKEINFOFLAGS) -I $(srcdir)' \
$(TEXI2PDF) $(AM_V_texinfo) --build-dir=$(@:.pdf=.t2p) -o $@ $(AM_V_texidevnull) \
- `test -f 'gprofng.texi' || echo '$(srcdir)/'`gprofng.texi
+ `test -f 'gprofng_ug.texi' || echo '$(srcdir)/'`gprofng_ug.texi
-gprofng.html: gprofng.texi $(srcdir)/version.texi $(gprofng_TEXINFOS)
+gprofng.html: gprofng_ug.texi $(srcdir)/version.texi $(gprofng_ug_TEXINFOS)
$(AM_V_MAKEINFO)rm -rf $(@:.html=.htp)
$(AM_V_at)if $(MAKEINFOHTML) $(AM_MAKEINFOHTMLFLAGS) $(MAKEINFOFLAGS) -I $(srcdir) \
- -o $(@:.html=.htp) `test -f 'gprofng.texi' || echo '$(srcdir)/'`gprofng.texi; \
+ -o $(@:.html=.htp) `test -f 'gprofng_ug.texi' || echo '$(srcdir)/'`gprofng_ug.texi; \
then \
rm -rf $@ && mv $(@:.html=.htp) $@; \
else \
rm -rf $(@:.html=.htp); exit 1; \
fi
$(srcdir)/version.texi: @MAINTAINER_MODE_TRUE@ $(srcdir)/stamp-vti
-$(srcdir)/stamp-vti: gprofng.texi $(top_srcdir)/configure
- @(dir=.; test -f ./gprofng.texi || dir=$(srcdir); \
- set `$(SHELL) $(srcdir)/mdate-sh $$dir/gprofng.texi`; \
+$(srcdir)/stamp-vti: gprofng_ug.texi $(top_srcdir)/configure
+ @(dir=.; test -f ./gprofng_ug.texi || dir=$(srcdir); \
+ set `$(SHELL) $(srcdir)/mdate-sh $$dir/gprofng_ug.texi`; \
echo "@set UPDATED $$1 $$2 $$3"; \
echo "@set UPDATED-MONTH $$2 $$3"; \
echo "@set EDITION $(VERSION)"; \
done
mostlyclean-aminfo:
- -rm -rf gprofng.t2d gprofng.t2p
+ -rm -rf gprofng_ug.t2d gprofng_ug.t2p
clean-aminfo:
-test -z "gprofng.dvi gprofng.pdf gprofng.ps gprofng.html" \
# Build the man page from the texinfo file
# The sed command removes the no-adjust Nroff command so that
# the man output looks standard.
-gprofng.1: $(srcdir)/gprofng.texi
+$(man_MANS): $(srcdir)/gp-macros.texi
$(AM_V_GEN)touch $@
- $(AM_V_at)-$(TEXI2POD) $(MANCONF) < $(srcdir)/gprofng.texi > gprofng.pod
- $(AM_V_at)-($(POD2MAN) gprofng.pod | \
- sed -e '/^.if n .na/d' > $@.tmp && \
+ $(AM_V_at)-$(TEXI2POD) $(MANCONF) < $(srcdir)/`basename $@ .1`.texi > $@.pod
+ $(AM_V_at)-($(POD2MAN) $@.pod | sed -e '/^.if n .na/d' > $@.tmp && \
mv -f $@.tmp $@) || (rm -f $@.tmp && exit 1)
- $(AM_V_at)rm -f gprofng.pod
+ $(AM_V_at)rm -f $@.pod
+
+gprofng.1: $(srcdir)/gprofng.texi
+gp-archive.1: $(srcdir)/gp-archive.texi
+gp-collect-app.1: $(srcdir)/gp-collect-app.texi
+gp-display-html.1: $(srcdir)/gp-display-html.texi
+gp-display-src.1: $(srcdir)/gp-display-src.texi
+gp-display-text.1: $(srcdir)/gp-display-text.texi
info: $(man_MANS)
--- /dev/null
+@c ----------------------------------------------------------------------------
+@c This is the Texinfo source file for the gp-archive man page.
+@c
+@c Author: Ruud van der Pas
+@c ----------------------------------------------------------------------------
+@ifset man
+\input texinfo @c -*-texinfo-*-
+@setfilename gprofng archive
+@settitle Archive gprofng experiment data
+@include gp-macros.texi
+@end ifset
+
+@c ----------------------------------------------------------------------------
+@c This is from the man-pages(7) man page
+@c
+@c "The list below shows conventional or suggested sections. Most manual pages
+@c should include at least the highlighted sections. Arrange a new manual
+@c page so that sections are placed in the order shown in the list."
+@c
+@c NAME
+@c SYNOPSIS
+@c CONFIGURATION [Normally only in Section 4]
+@c DESCRIPTION
+@c OPTIONS [Normally only in Sections 1, 8]
+@c EXIT STATUS [Normally only in Sections 1, 8]
+@c RETURN VALUE [Normally only in Sections 2, 3]
+@c ERRORS [Typically only in Sections 2, 3]
+@c ENVIRONMENT
+@c FILES
+@c VERSIONS [Normally only in Sections 2, 3]
+@c ATTRIBUTES [Normally only in Sections 2, 3]
+@c CONFORMING TO
+@c NOTES
+@c BUGS
+@c EXAMPLES
+@c AUTHORS [Discouraged]
+@c REPORTING BUGS [Not used in man-pages]
+@c COPYRIGHT [Not used in man-pages]
+@c SEE ALSO
+@c
+@c This is what the texi2pod.pl tool recognizes:
+@c
+@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES
+@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) {
+@c
+@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which
+@c makes sense and is adhered to for the other formats.
+@c ----------------------------------------------------------------------------
+
+@c ----------------------------------------------------------------------------
+@c NAME section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NAME}
+@c man begin NAME
+
+gprofng archive - Archive gprofng experiment data
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SYNOPSIS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SYNOPSIS}
+@c man begin SYNOPSIS
+
+@command{gprofng archive} [@var{option(s)}] @var{experiment}
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c DESCRIPTION section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{DESCRIPTION}
+@c man begin DESCRIPTION
+
+Archive the associated application binaries and source files in a gprofng
+experiment to make it self-contained and portable.
+
+By default, the binaries are archived, but the application source files
+are not. Use this tool to change this behavior and archive additional
+components afterwards.
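+
+For example, assuming an existing experiment directory with the placeholder
+name @samp{test.1.er}, a command like the following could be used to also
+archive the source files (see the @samp{-a} option below):
+
+@smallexample
+$ gprofng archive -a src test.1.er   # test.1.er is a placeholder name
+@end smallexample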
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c OPTIONS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{OPTIONS}
+@c man begin OPTIONS
+
+@table @gcctabopt
+
+@item --version
+@ifclear man
+@IndexSubentry{Options, @code{--version}}
+@end ifclear
+
+Print the version number and exit.
+
+@item --help
+@ifclear man
+@IndexSubentry{Options, @code{--help}}
+@end ifclear
+
+Print usage information and exit.
+
+@c -- @item --verbose @{on|off@}
+@c -- @ifclear man
+@c -- @IndexSubentry{Options, @code{--verbose}}
+@c -- @end ifclear
+
+@c -- Enable (on) or disable (off) verbose mode; the default is @samp{off}.
+
+@item -a @{off|on|ldobjects|src|usedldobjects|usedsrc@}
+@ifclear man
+@IndexSubentry{Options, @code{-a}}
+@end ifclear
+
+Specify archiving of binaries and other files. In addition to disabling this
+feature (off), or enabling archiving of all load objects and sources (on),
+the other options support a more refined selection.
+
+All of these options enable archiving, but the keyword controls what exactly
+is selected: all load objects (ldobjects), all source files (src), the
+load objects associated with a program counter (usedldobjects), or the source
+files associated with a program counter (usedsrc).
+The default is @samp{-a ldobjects}.
+
+@item -n
+@ifclear man
+@IndexSubentry{Options, @code{-n}}
+@end ifclear
+
+Archive the named experiment only, not any of its descendants.
+
+@item -m @var{regex}
+@ifclear man
+@IndexSubentry{Options, @code{-m}}
+@end ifclear
+
+Archive only those source, object, and debug info files whose full path name
+matches the given POSIX-compliant regular expression @var{regex}.
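+
+For example, a hypothetical invocation that restricts archiving to files
+under a placeholder directory called @file{/home/demo/project} could look
+like this:
+
+@smallexample
+$ gprofng archive -m '^/home/demo/project' test.1.er   # placeholder paths
+@end smallexample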
+
+@item -q
+@ifclear man
+@IndexSubentry{Options, @code{-q}}
+@end ifclear
+
+Do not write any warnings to stderr. Warnings are incorporated into the
+.archive file in the experiment directory. They are shown in the output
+of @command{gprofng display text}.
+
+@item -F
+@ifclear man
+@IndexSubentry{Options, @code{-F}}
+@end ifclear
+
+Force writing or rewriting of the archive. This is ignored with the
+@samp{-n} or @samp{-m} option, or if this is a subexperiment.
+
+@item -d @var{path}
+@ifclear man
+@IndexSubentry{Options, @code{-d}}
+@end ifclear
+
+The @var{path} is the absolute path to a common archive, which is a
+directory that contains archived files. If the directory does not
+exist, it will be created. Files are saved in the common archive
+directory, and a symbolic link is created in the experiment archive.
+
+@end table
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c NOTES section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NOTES}
+@c man begin NOTES
+
+Default archiving does not occur if the profiled application terminates
+prematurely, or if archiving was disabled when collecting the performance data.
+In such cases, this tool can be used to archive the information afterwards,
+but it has to be run on the same system where the profiling data was recorded.
+
+Some Java applications store shared objects in jar files. By default, such
+shared objects are not automatically archived. To archive shared objects
+contained in jar files, use the addpath directive in an .er.rc file. The
+addpath directive should give the path to the jar file, including the jar
+file itself. The .er.rc file should be saved in the user's home directory,
+or in the parent directory of the experiment directory.
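+
+As a hypothetical illustration, an .er.rc file could contain a line like the
+following, where the path to the jar file is a placeholder:
+
+@smallexample
+addpath @var{/path/to/myclasses.jar}
+@end smallexample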
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SEEALSO section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SEEALSO}
+@c man begin SEEALSO
+
+gprofng(1), gp-collect-app(1), gp-display-html(1), gp-display-src(1), gp-display-text(1)
+
+The user guide for gprofng is maintained as a Texinfo manual. If the
+@command{info} and @command{gprofng} programs are correctly installed, the
+command @command{info gprofng} should give access to this document.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c COPYRIGHT section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{COPYRIGHT}
+@c man begin COPYRIGHT
+
+Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
+
+Permission is granted to copy, distribute and/or modify this document
+under the terms of the GNU Free Documentation License, Version 1.3
+or any later version published by the Free Software Foundation;
+with no Invariant Sections, with no Front-Cover Texts, and with no
+Back-Cover Texts. A copy of the license is included in the
+section entitled ``GNU Free Documentation License''.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c If this text is used for a man page, exit. Otherwise we need to continue.
+@c ----------------------------------------------------------------------------
+
+@ifset man
+@bye
+@end ifset
--- /dev/null
+@c ----------------------------------------------------------------------------
+@c This is the Texinfo source file for the gp-collect-app man page.
+@c
+@c Author: Ruud van der Pas
+@c ----------------------------------------------------------------------------
+@ifset man
+\input texinfo @c -*-texinfo-*-
+@setfilename gprofng collect app
+@settitle Collect performance data for the target application
+@include gp-macros.texi
+@end ifset
+
+@c ----------------------------------------------------------------------------
+@c This is from the man-pages(7) man page
+@c
+@c "The list below shows conventional or suggested sections. Most manual pages
+@c should include at least the highlighted sections. Arrange a new manual
+@c page so that sections are placed in the order shown in the list."
+@c
+@c NAME
+@c SYNOPSIS
+@c CONFIGURATION [Normally only in Section 4]
+@c DESCRIPTION
+@c OPTIONS [Normally only in Sections 1, 8]
+@c EXIT STATUS [Normally only in Sections 1, 8]
+@c RETURN VALUE [Normally only in Sections 2, 3]
+@c ERRORS [Typically only in Sections 2, 3]
+@c ENVIRONMENT
+@c FILES
+@c VERSIONS [Normally only in Sections 2, 3]
+@c ATTRIBUTES [Normally only in Sections 2, 3]
+@c CONFORMING TO
+@c NOTES
+@c BUGS
+@c EXAMPLES
+@c AUTHORS [Discouraged]
+@c REPORTING BUGS [Not used in man-pages]
+@c COPYRIGHT [Not used in man-pages]
+@c SEE ALSO
+@c
+@c This is what the texi2pod.pl tool recognizes:
+@c
+@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES
+@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) {
+@c
+@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which
+@c makes sense and is adhered to for the other formats.
+@c ----------------------------------------------------------------------------
+
+@c ----------------------------------------------------------------------------
+@c NAME section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NAME}
+@c man begin NAME
+
+gprofng collect app - Collect performance data for the target program
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SYNOPSIS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SYNOPSIS}
+@c man begin SYNOPSIS
+
+@command{gprofng collect app} [@var{option(s)}] @var{target} [@var{option(s)}]
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c DESCRIPTION section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{DESCRIPTION}
+@c man begin DESCRIPTION
+
+Collect performance data on the target program. In addition to Program Counter
+(PC) sampling, hardware event counters and various tracing options are supported.
+
+For example, this command collects performance data for an executable called
+@samp{a.out} and stores the data collected in an experiment directory with
+the name @samp{example.er}.
+
+@smallexample
+$ gprofng collect app -o example.er ./a.out
+@end smallexample
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c OPTIONS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{OPTIONS}
+@c man begin OPTIONS
+
+@table @gcctabopt
+
+@item --version
+@ifclear man
+@IndexSubentry{Options, @code{--version}}
+@end ifclear
+
+Print the version number and exit.
+
+@item --help
+@ifclear man
+@IndexSubentry{Options, @code{--help}}
+@end ifclear
+
+Print usage information and exit.
+
+@c -- @item --verbose @{on|off@}
+@c -- @ifclear man
+@c -- @IndexSubentry{Options, @code{--verbose}}
+@c -- @end ifclear
+
+@c -- Enable (on) or disable (off) verbose mode; the default is @samp{off}.
+
+@item -p @{off|on|lo|hi|@var{<value>}@}
+@ifclear man
+@IndexSubentry{Options, @code{-p}}
+@end ifclear
+
+Disable (off) or enable (on) clock-profiling using a default sampling
+granularity, or enable clock-profiling implicitly by setting the sampling
+granularity (lo, hi, or a specific value in ms). By default, clock profiling
+is enabled (@samp{-p on}).
+
+@item -h @var{@{<ctr_def>...,<ctr_n_def>@}}
+@ifclear man
+@IndexSubentry{Options, @code{-h}}
+@end ifclear
+Enable hardware event counter profiling and select the counter(s).
+To see the supported counters on this system, use the @samp{-h} option
+without other arguments.
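+
+For example, the first hypothetical command below lists the counters that are
+available on the current system, and the second one profiles with a
+placeholder counter name:
+
+@smallexample
+$ gprofng collect app -h
+$ gprofng collect app -h @var{counter} -o example.er ./a.out
+@end smallexample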
+
+@item -o @var{<exp_name>}
+@ifclear man
+@IndexSubentry{Options, @code{-o}}
+@end ifclear
+
+Specify the name for the experiment directory. The name has to end with
+@samp{.er} and may contain an absolute path (e.g. @file{/tmp/experiment.er}).
+
+@item -O @var{<exp_name>}
+@ifclear man
+@IndexSubentry{Options, @code{-O}}
+@end ifclear
+
+This is the same as the @samp{-o} option, except that an existing experiment
+directory with the same name is silently overwritten.
+
+@item -C @var{<comment_string>}
+@ifclear man
+@IndexSubentry{Options, @code{-C}}
+@end ifclear
+
+Add up to 10 comment strings to the experiment. These comments appear in the
+notes section of the header and can be retrieved with the
+@command{gprofng display text} command using the @samp{-header} option.
+
+@item -j @{on|off|@var{<path>}@}
+@ifclear man
+@IndexSubentry{Options, @code{-j}}
+@end ifclear
+
+Control Java profiling when the target program is a JVM. The allowed values
+of this option are: enable (on) or disable (off) Java profiling, or use
+@samp{<path>} to select a non-default JVM.
+The default is @samp{-j on}.
+
+@table @gcctabopt
+
+@item on
+Record profiling data for the JVM, and recognize methods compiled by
+the Java HotSpot virtual machine. Also record Java call stacks. This is
+the default.
+
+@item off
+Do not record Java profiling data. Profiling data for native call stacks is
+still recorded.
+
+@item @var{<path>}
+Record profiling data for the JVM, and use the JVM installed in @var{<path>}.
+
+@end table
+
+@item -J @var{<jvm-options>}
+@ifclear man
+@IndexSubentry{Options, @code{-J}}
+@end ifclear
+
+Specify additional options to be passed to the JVM. The @var{jvm-options}
+list must be enclosed in quotation marks if it contains more than one option.
+The items in the list must be separated by spaces or tabs. Each item is
+passed as a separate option to the JVM. Note that this option implies
+@samp{-j on}.
+
+@item -t @var{<duration>}[m|s]
+@ifclear man
+@IndexSubentry{Options, @code{-t}}
+@end ifclear
+
+Collect data for the specified duration. The duration can be a single number,
+optionally followed by either @samp{m} to specify minutes, or @samp{s} to
+specify seconds, which is the default.
+
+The duration can also be two numbers separated by a minus (-) sign. If a single
+number is given, data is collected from the start of the run until the given
+time. If two numbers are given, data is collected from the first time to the
+second. If the second time is zero, data is collected until the end of the
+run. If two non-zero numbers are given, the first must be less than the second.
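+
+For example, the hypothetical commands below restrict the data collection to
+the first 30 seconds of the run, and to the window between 10 and 30 seconds,
+respectively:
+
+@smallexample
+$ gprofng collect app -t 30 ./a.out
+$ gprofng collect app -t 10-30 ./a.out
+@end smallexample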
+
+@item -n
+@ifclear man
+@IndexSubentry{Options, @code{-n}}
+@end ifclear
+
+This is used for a dry run. Several run-time settings are displayed, but the
+target is not executed and no performance data is collected.
+
+@item -F @{off|on|=@var{regex}@}
+@ifclear man
+@IndexSubentry{Options, @code{-F}}
+@end ifclear
+
+Control whether descendant processes should have their data recorded.
+To disable or enable this feature, use @samp{off} or @samp{on}. Use
+@samp{=}@var{regex} to record data only for those descendant processes whose
+executable name matches the regular expression. Only the basename of the
+executable is used, not the full path. If the @var{regex} contains spaces or
+characters interpreted by the shell, enclose it in single quotes.
+The default is @samp{-F on}.
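+
+For example, assuming the descendant processes of interest have executable
+names that start with the placeholder name @samp{worker}, a command like this
+could be used:
+
+@smallexample
+$ gprofng collect app -F '=worker.*' -o example.er ./a.out   # worker is a placeholder
+@end smallexample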
+
+@item -a @{off|on|ldobjects|src|usedldobjects|usedsrc@}
+@ifclear man
+@IndexSubentry{Options, @code{-a}}
+@end ifclear
+
+Specify archiving of binaries and other files. In addition to disabling this
+feature (off), or enabling archiving of all load objects and sources (on),
+the other options support a more refined selection.
+
+All of these options enable archiving, but the keyword controls what exactly
+is selected: all load objects (ldobjects), all source files (src), the
+load objects associated with a program counter (usedldobjects), or the source
+files associated with a program counter (usedsrc).
+The default is @samp{-a ldobjects}.
+
+@item -S @{off|on|@var{<seconds>}@}
+@ifclear man
+@IndexSubentry{Options, @code{-S}}
+@end ifclear
+
+Disable (off), or enable (on) periodic sampling of process-wide resource
+utilization. By default, a sample is recorded every second. Specify
+@var{<seconds>} to change this interval. The default is @samp{-S on}.
+
+@item -y @var{<signal>}[,r]
+@ifclear man
+@IndexSubentry{Options, @code{-y}}
+@end ifclear
+
+Control the recording of data using the signal named @var{<signal>}, referred
+to as the pause-resume signal. Whenever the given signal is delivered to the
+process, data collection switches between the paused (no data is recorded) and
+resumed (data is recorded) states.
+
+By default, data collection begins in the paused state. If the optional
+@samp{r} is given, data collection begins in the resumed state, so data is
+recorded right from the start.
+
+SIGUSR1 or SIGUSR2 are recommended for this use, but any signal that is
+not used by the target can be used.
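+
+As a hypothetical illustration, and assuming the signal may be given by its
+full name, the first command below starts the target in the paused state, and
+the second command toggles data collection for the (placeholder) process id:
+
+@smallexample
+$ gprofng collect app -y SIGUSR1 -o example.er ./a.out &
+$ kill -USR1 @var{pid}
+@end smallexample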
+
+@item -l @var{<signal>}
+@ifclear man
+@IndexSubentry{Options, @code{-l}}
+@end ifclear
+
+Specify a signal that will trigger a sample of process-wide resource utilization.
+When the named @var{<signal>} is delivered to the process, a sample is recorded.
+
+The signal can be specified using the full name, without the initial
+letters @code{SIG}, or the signal number. Note that the @command{kill}
+command can be used to deliver a signal.
+
+If both the @samp{-l} and @samp{-y} options are used, the two signals must
+be different.
+
+@item -s @var{<option>}[,@var{<API>}]
+@ifclear man
+@IndexSubentry{Options, @code{-s}}
+@end ifclear
+
+Enable synchronization wait tracing, where @var{<option>} is used to define the
+specifics of the tracing (on, off, @var{<threshold>}, or all). The API is
+selected through the setting for @var{<API>}: @samp{n} selects native/Pthreads,
+@samp{j} selects Java, and @samp{nj} selects both. The default is @samp{-s off}.
+
+@item -H @{off|on@}
+@ifclear man
+@IndexSubentry{Options, @code{-H}}
+@end ifclear
+
+Disable (off), or enable (on) heap tracing. The default is @samp{-H off}.
+
+@item -i @{off|on@}
+@ifclear man
+@IndexSubentry{Options, @code{-i}}
+@end ifclear
+
+Disable (off), or enable (on) I/O tracing. The default is @samp{-i off}.
+
+@end table
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c NOTES section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NOTES}
+@c man begin NOTES
+
+Any executable in the ELF (Executable and Linkable Format) object format can
+be used for profiling with gprofng. If debug information is available,
+gprofng can provide more details, but this is not a requirement.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SEEALSO section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SEEALSO}
+@c man begin SEEALSO
+
+gprofng(1), gp-archive(1), gp-display-html(1), gp-display-src(1), gp-display-text(1)
+
+The user guide for gprofng is maintained as a Texinfo manual. If the
+@command{info} and @command{gprofng} programs are correctly installed, the
+command @command{info gprofng} should give access to this document.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c COPYRIGHT section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{COPYRIGHT}
+@c man begin COPYRIGHT
+
+Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
+
+Permission is granted to copy, distribute and/or modify this document
+under the terms of the GNU Free Documentation License, Version 1.3
+or any later version published by the Free Software Foundation;
+with no Invariant Sections, with no Front-Cover Texts, and with no
+Back-Cover Texts. A copy of the license is included in the
+section entitled ``GNU Free Documentation License''.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c If this text is used for a man page, exit. Otherwise we need to continue.
+@c ----------------------------------------------------------------------------
+
+@ifset man
+@bye
+@end ifset
--- /dev/null
+@c ----------------------------------------------------------------------------
+@c This is the Texinfo source file for the gp-display-html man page.
+@c
+@c Author: Ruud van der Pas
+@c ----------------------------------------------------------------------------
+@ifset man
+\input texinfo @c -*-texinfo-*-
+@setfilename gprofng display html
+@settitle Generate an HTML based directory structure to browse the profiles
+@include gp-macros.texi
+@end ifset
+
+@c ----------------------------------------------------------------------------
+@c This is from the man-pages(7) man page
+@c
+@c "The list below shows conventional or suggested sections. Most manual pages
+@c should include at least the highlighted sections. Arrange a new manual
+@c page so that sections are placed in the order shown in the list."
+@c
+@c NAME
+@c SYNOPSIS
+@c CONFIGURATION [Normally only in Section 4]
+@c DESCRIPTION
+@c OPTIONS [Normally only in Sections 1, 8]
+@c EXIT STATUS [Normally only in Sections 1, 8]
+@c RETURN VALUE [Normally only in Sections 2, 3]
+@c ERRORS [Typically only in Sections 2, 3]
+@c ENVIRONMENT
+@c FILES
+@c VERSIONS [Normally only in Sections 2, 3]
+@c ATTRIBUTES [Normally only in Sections 2, 3]
+@c CONFORMING TO
+@c NOTES
+@c BUGS
+@c EXAMPLES
+@c AUTHORS [Discouraged]
+@c REPORTING BUGS [Not used in man-pages]
+@c COPYRIGHT [Not used in man-pages]
+@c SEE ALSO
+@c
+@c This is what the texi2pod.pl tool recognizes:
+@c
+@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES
+@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) {
+@c
+@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which
+@c makes sense and is adhered to for the other formats.
+@c ----------------------------------------------------------------------------
+
+@c ----------------------------------------------------------------------------
+@c NAME section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NAME}
+@c man begin NAME
+
+gprofng display html - Generate an HTML based directory structure to browse the profiles
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SYNOPSIS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SYNOPSIS}
+@c man begin SYNOPSIS
+
+@command{gprofng display html} [@var{option(s)}] @var{experiment(s)}
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c DESCRIPTION section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{DESCRIPTION}
+@c man begin DESCRIPTION
+
+Process one or more experiments to generate a directory containing the
+@file{index.html} file that may be used to browse the experiment data.
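+
+For example, assuming a placeholder experiment directory named
+@samp{example.er}, a command like the following generates the HTML files in a
+directory called @samp{mydir} (see the @samp{-o} option below):
+
+@smallexample
+$ gprofng display html -o mydir example.er   # mydir and example.er are placeholders
+@end smallexample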
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c OPTIONS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{OPTIONS}
+@c man begin OPTIONS
+
+@table @gcctabopt
+
+@item --version
+@ifclear man
+@IndexSubentry{Options, @code{--version}}
+@end ifclear
+
+Print the version number and exit.
+
+@item --help
+@ifclear man
+@IndexSubentry{Options, @code{--help}}
+@end ifclear
+
+Print usage information and exit.
+
+@item --verbose @{on|off@}
+@ifclear man
+@IndexSubentry{Options, @code{--verbose}}
+@end ifclear
+
+Enable (@samp{on}) or disable (@samp{off}) verbose mode.
+The default is @samp{off}.
+
+@item --debug @{on|s|m|l|xl|off@}
+@item -d @{on|s|m|l|xl|off@}
+@ifclear man
+@IndexSubentry{Options, @code{-d}}
+@IndexSubentry{Options, @code{--debug}}
+@end ifclear
+
+Control the printing of run-time information to assist with troubleshooting,
+or with further development of this tool. The keyword is case insensitive.
+A setting of @samp{on} gives a modest amount of information. The keywords
+@samp{s}, @samp{m}, @samp{l}, and @samp{xl} give an increasing amount of
+information, while @samp{off} disables the printing of debug information.
+The default is @samp{off}.
+
+Note that currently @samp{on}, @samp{s}, @samp{m}, and @samp{l} are
+equivalent. This is expected to change in future updates.
+
+@item --highlight-percentage @var{value}
+@item -hp @var{value}
+@ifclear man
+@IndexSubentry{Options, @code{--highlight-percentage}}
+@IndexSubentry{Options, @code{-hp}}
+@end ifclear
+
+Set a percentage value in the interval [0,100] to select and color-code
+source lines, as well as instructions, that are within this percentage of the
+maximum metric value(s). The default is 90 (%).
+
+A value of zero (@samp{-hp 0}) disables this feature.
+
+@item --output @var{dirname}
+@item -o @var{dirname}
+@ifclear man
+@IndexSubentry{Options, @code{--output}}
+@IndexSubentry{Options, @code{-o}}
+@end ifclear
+
+Use @var{dirname} as the directory name to store the HTML files in.
+The default name is @samp{display.<n>.html} with @var{<n>} the first
+positive integer number not in use. An existing directory with the
+same name is not overwritten.
+
+@item --overwrite @var{dirname}
+@item -O @var{dirname}
+@ifclear man
+@IndexSubentry{Options, @code{--overwrite}}
+@IndexSubentry{Options, @code{-O}}
+@end ifclear
+
+Use @var{dirname} as the directory name to store the HTML files in. Unlike
+the @samp{-o} option, an existing directory with the same name is overwritten.
+
+@item --quiet @{on|off@}
+@item -q @{on|off@}
+@ifclear man
+@IndexSubentry{Options, @code{--quiet}}
+@IndexSubentry{Options, @code{-q}}
+@end ifclear
+
+Control the display of all warning, debug, and verbose messages.
+If set to @samp{on}, the settings for verbose, warnings, and debug are ignored.
+By default, quiet mode is disabled (@samp{-q off}).
+
+@item --warnings @{on|off@}
+@item -w @{on|off@}
+@ifclear man
+@IndexSubentry{Options, @code{--warnings}}
+@IndexSubentry{Options, @code{-w}}
+@end ifclear
+
+Enable (@samp{on}), or disable (@samp{off}) run-time warning messages from
+the tool. By default, these warnings are enabled.
+
+@end table
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c NOTES section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NOTES}
+@c man begin NOTES
+
+When setting a directory name for the HTML files to be stored in, make sure
+that the umask is set to provide the correct access permissions.
+
+Regardless of the setting for the warning messages, any warnings are accessible
+through the main @file{index.html} page.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SEEALSO section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SEEALSO}
+@c man begin SEEALSO
+
+gprofng(1), gp-archive(1), gp-collect-app(1), gp-display-src(1), gp-display-text(1)
+
+The user guide for gprofng is maintained as a Texinfo manual. If the
+@command{info} and @command{gprofng} programs are correctly installed, the
+command @command{info gprofng} should give access to this document.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c COPYRIGHT section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{COPYRIGHT}
+@c man begin COPYRIGHT
+
+Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
+
+Permission is granted to copy, distribute and/or modify this document
+under the terms of the GNU Free Documentation License, Version 1.3
+or any later version published by the Free Software Foundation;
+with no Invariant Sections, with no Front-Cover Texts, and with no
+Back-Cover Texts. A copy of the license is included in the
+section entitled ``GNU Free Documentation License''.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c If this text is used for a man page, exit. Otherwise we need to continue.
+@c ----------------------------------------------------------------------------
+
+@ifset man
+@bye
+@end ifset
--- /dev/null
+@c ----------------------------------------------------------------------------
+@c This is the Texinfo source file for the gp-display-src man page.
+@c
+@c Author: Ruud van der Pas
+@c ----------------------------------------------------------------------------
+@ifset man
+\input texinfo @c -*-texinfo-*-
+@setfilename gprofng display src
+@settitle Display the source code, optionally interleaved with the disassembly of the target object
+@include gp-macros.texi
+@end ifset
+
+@c ----------------------------------------------------------------------------
+@c This is from the man-pages(7) man page
+@c
+@c "The list below shows conventional or suggested sections. Most manual pages
+@c should include at least the highlighted sections. Arrange a new manual
+@c page so that sections are placed in the order shown in the list."
+@c
+@c NAME
+@c SYNOPSIS
+@c CONFIGURATION [Normally only in Section 4]
+@c DESCRIPTION
+@c OPTIONS [Normally only in Sections 1, 8]
+@c EXIT STATUS [Normally only in Sections 1, 8]
+@c RETURN VALUE [Normally only in Sections 2, 3]
+@c ERRORS [Typically only in Sections 2, 3]
+@c ENVIRONMENT
+@c FILES
+@c VERSIONS [Normally only in Sections 2, 3]
+@c ATTRIBUTES [Normally only in Sections 2, 3]
+@c CONFORMING TO
+@c NOTES
+@c BUGS
+@c EXAMPLES
+@c AUTHORS [Discouraged]
+@c REPORTING BUGS [Not used in man-pages]
+@c COPYRIGHT [Not used in man-pages]
+@c SEE ALSO
+@c
+@c This is what the texi2pod.pl tool recognizes:
+@c
+@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES
+@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) {
+@c
+@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which
+@c makes sense and is adhered to for the other formats.
+@c ----------------------------------------------------------------------------
+
+@c ----------------------------------------------------------------------------
+@c NAME section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NAME}
+@c man begin NAME
+
+gprofng display src - Display the source code, optionally interleaved with the disassembly of the target object
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SYNOPSIS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SYNOPSIS}
+@c man begin SYNOPSIS
+
+@command{gprofng display src} [@var{option(s)}] @var{target_file}
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c DESCRIPTION section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{DESCRIPTION}
+@c man begin DESCRIPTION
+
+Display the source code listing, or source code interleaved with disassembly code,
+as extracted from the target file (an executable, shared object, object file, or a
+Java .class file).
+
+For example, this command displays the source code and disassembly listing for a
+function called @samp{mxv_core} that is part of object file @samp{mxv.o}:
+
+@smallexample
+$ gprofng display src -disasm mxv_core mxv.o
+@end smallexample
+
+To list the source code and disassembly for all the functions in this file,
+use the following command:
+
+@smallexample
+$ gprofng display src -disasm all -1 mxv.o
+@end smallexample
+
+The @var{target_file} is the name of an executable, a shared object, an object
+file (.o), or a Java .class file.
+
+If no options are given, the source code listing of the @var{target_file}
+is shown. This is equivalent to @samp{-source all -1}. If this information
+is not available, a message to this effect is printed.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c OPTIONS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{OPTIONS}
+@c man begin OPTIONS
+
+@table @gcctabopt
+
+@item --version
+@ifclear man
+@IndexSubentry{Options, @code{--version}}
+@end ifclear
+
+Print the version number and exit.
+
+@item --help
+@ifclear man
+@IndexSubentry{Options, @code{--help}}
+@end ifclear
+
+Print usage information and exit.
+
+@c -- @item --verbose @{on|off@}
+@c -- @ifclear man
+@c -- @IndexSubentry{Options, @code{--verbose}}
+@c -- @end ifclear
+
+@c -- Enable (on) or disable (off) verbose mode; the default is @samp{off}.
+
+@item -functions
+@ifclear man
+@IndexSubentry{Options, @code{-functions}}
+@IndexSubentry{Commands, @code{functions}}
+@end ifclear
+List all the functions from the given object.
+
+@item -source @var{item} @var{tag}
+@ifclear man
+@IndexSubentry{Options, @code{-source}}
+@IndexSubentry{Commands, @code{source}}
+@end ifclear
+Show the source code for @var{item} in @var{target_file}. The @var{tag}
+is used to differentiate between multiple occurrences of the same name.
+See the @samp{NOTES} section for the definition of @var{item} and @var{tag}.
+
+@item -disasm @var{item} @var{tag}
+@ifclear man
+@IndexSubentry{Options, @code{-disasm}}
+@IndexSubentry{Commands, @code{disasm}}
+@end ifclear
+Include the disassembly in the source listing. The default listing does not
+include the disassembly. If the source code is not available, show a listing
+of the disassembly only.
+See the @samp{NOTES} section for the definition of @var{item} and @var{tag}.
+
+@item -outfile @var{filename}
+@ifclear man
+@IndexSubentry{Options, @code{-outfile}}
+@IndexSubentry{Commands, @code{outfile}}
+@end ifclear
+Write results to file @var{filename}. A dash (-) writes to stdout; this is
+also the default. Note that this option only affects the options that appear
+to the right of it on the command line.
+
+@end table
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c NOTES section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NOTES}
+@c man begin NOTES
+
+Use @var{item} to specify the name of a function, or of a source or object
+file that was used to build the executable, or shared object.
+
+The @var{tag} is an index used to determine which item is being referred
+to when multiple functions have the same name. It is required, but will
+be ignored if not necessary to resolve the function.
+
+The @var{item} may also be specified in the form @samp{function`file`}, in
+which case the source or disassembly of the named function in the source
+context of the named file will be used.
+
+The special @var{item} and @var{tag} combination @samp{all -1} is used to
+generate the source, or disassembly, for all functions in the
+@var{target_file}.
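+
+For example, assuming the placeholder object file @samp{mxv.o} from the
+@samp{DESCRIPTION} section contains two static functions with the hypothetical
+name @samp{init_data}, the command below selects the second occurrence:
+
+@smallexample
+$ gprofng display src -source init_data 2 mxv.o   # init_data is a placeholder
+@end smallexample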
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SEEALSO section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SEEALSO}
+@c man begin SEEALSO
+
+gprofng(1), gp-archive(1), gp-collect-app(1), gp-display-html(1), gp-display-text(1)
+
+The user guide for gprofng is maintained as a Texinfo manual. If the
+@command{info} and @command{gprofng} programs are correctly installed, the
+command @command{info gprofng} should give access to this document.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c COPYRIGHT section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{COPYRIGHT}
+@c man begin COPYRIGHT
+
+Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
+
+Permission is granted to copy, distribute and/or modify this document
+under the terms of the GNU Free Documentation License, Version 1.3
+or any later version published by the Free Software Foundation;
+with no Invariant Sections, with no Front-Cover Texts, and with no
+Back-Cover Texts. A copy of the license is included in the
+section entitled ``GNU Free Documentation License''.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c If this text is used for a man page, exit. Otherwise we need to continue.
+@c ----------------------------------------------------------------------------
+
+@ifset man
+@bye
+@end ifset
--- /dev/null
+@c ----------------------------------------------------------------------------
+@c This is the Texinfo source file for the gp-display-text man page.
+@c
+@c Author: Ruud van der Pas
+@c ----------------------------------------------------------------------------
+@ifset man
+\input texinfo @c -*-texinfo-*-
+@setfilename gprofng display text
+@settitle Display the performance data in plain text format
+@include gp-macros.texi
+@end ifset
+
+@c ----------------------------------------------------------------------------
+@c This is from the man-pages(7) man page
+@c
+@c "The list below shows conventional or suggested sections. Most manual pages
+@c should include at least the highlighted sections. Arrange a new manual
+@c page so that sections are placed in the order shown in the list."
+@c
+@c NAME
+@c SYNOPSIS
+@c CONFIGURATION [Normally only in Section 4]
+@c DESCRIPTION
+@c OPTIONS [Normally only in Sections 1, 8]
+@c EXIT STATUS [Normally only in Sections 1, 8]
+@c RETURN VALUE [Normally only in Sections 2, 3]
+@c ERRORS [Typically only in Sections 2, 3]
+@c ENVIRONMENT
+@c FILES
+@c VERSIONS [Normally only in Sections 2, 3]
+@c ATTRIBUTES [Normally only in Sections 2, 3]
+@c CONFORMING TO
+@c NOTES
+@c BUGS
+@c EXAMPLES
+@c AUTHORS [Discouraged]
+@c REPORTING BUGS [Not used in man-pages]
+@c COPYRIGHT [Not used in man-pages]
+@c SEE ALSO
+@c
+@c This is what the texi2pod.pl tool recognizes:
+@c
+@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES
+@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) {
+@c
+@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which
+@c makes sense and is adhered to for the other formats.
+@c ----------------------------------------------------------------------------
+
+@c ----------------------------------------------------------------------------
+@c NAME section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NAME}
+@c man begin NAME
+
+gprofng display text - Display the performance data in plain text format
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SYNOPSIS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SYNOPSIS}
+@c man begin SYNOPSIS
+
+@command{gprofng display text} [@var{option(s)}] [@var{commands}]
+[-script @var{script-file}] @var{experiment(s)}
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c DESCRIPTION section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{DESCRIPTION}
+@c man begin DESCRIPTION
+
+Print a plain text version of the various displays supported by gprofng.
+
+The input consists of one or more experiment directories. Through commands,
+the user controls the output.
+
+There is a rich set of commands to control the display of the data. The
+@samp{NOTES} section lists the most common ones. The gprofng user guide
+lists all the commands supported.
+
+Commands specified on the command line need to be prefixed with a dash
+(@samp{-}).
+
+In this example, a function overview will be shown, followed by the source
+code listing of function @samp{my-func}, annotated with the
+performance metrics that have been recorded during the data collection
+and stored in experiment directory @samp{my-exp.er}:
+
+@smallexample
+$ gprofng display text -functions -source my-func my-exp.er
+@end smallexample
+
+Instead of, or in addition to, specifying these commands on the command line,
+commands may also be included in a file called the @var{script-file}.
+
+Note that the commands are processed and interpreted from left to right,
+@emph{so the order matters}.
+
+If this tool is invoked without options, commands, or a script file, it
+starts in interpreter mode. The user can then issue the commands interactively.
+The session is terminated with the @command{exit} command in the interpreter.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c OPTIONS section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{OPTIONS}
+@c man begin OPTIONS
+
+@table @gcctabopt
+
+@item --version
+@ifclear man
+@IndexSubentry{Options, @code{--version}}
+@end ifclear
+
+Print the version number and exit.
+
+@item --help
+@ifclear man
+@IndexSubentry{Options, @code{--help}}
+@end ifclear
+
+Print usage information and exit.
+
+@c -- @item --verbose @{on|off@}
+@c -- @ifclear man
+@c -- @IndexSubentry{Options, @code{--verbose}}
+@c -- @end ifclear
+
+@c -- Enable (on) or disable (off) verbose mode; the default is @samp{off}.
+
+@item -script @var{script-file}
+@ifclear man
+@IndexSubentry{Options, @code{-script}}
+@IndexSubentry{Commands, @code{script}}
+@end ifclear
+
+Execute the commands stored in the script file. This feature may be combined
+with commands specified at the command line.
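+
+As a hypothetical example, if a file called @file{my-script} contains the two
+lines @command{functions} and @command{source my-func}, the command below
+prints the function overview, followed by the annotated source listing of
+@samp{my-func}:
+
+@smallexample
+$ gprofng display text -script my-script my-exp.er   # my-script is a placeholder
+@end smallexample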
+
+@end table
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c NOTES section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{NOTES}
+@c man begin NOTES
+
+Many commands are supported. The more common ones are listed below, mostly
+in alphabetical order; occasionally the order of two entries is swapped where
+that is more logical.
+
+@ifset man
+There are many more commands. These are documented in the user guide.
+@end ifset
+
+@table @code
+
+@item callers-callees
+@ifclear man
+@IndexSubentry{Options, @code{-callers-callees}}
+@IndexSubentry{Commands, @code{callers-callees}}
+@end ifclear
+A callers-callees panel shows which function(s) call the target
+function (the @emph{callers}) and which functions it is calling (the
+@emph{callees}).
+This command prints the callers-callees panel for each of the functions,
+in the order specified by the function sort metric.
+
+@item calltree
+@ifclear man
+@IndexSubentry{Options, @code{-calltree}}
+@IndexSubentry{Commands, @code{calltree}}
+@end ifclear
+Display the dynamic call graph from the experiment, showing the hierarchical
+metrics at each level.
+
+@item compare @{on | off | delta | ratio@}
+@ifclear man
+@IndexSubentry{Options, @code{-compare}}
+@IndexSubentry{Commands, @code{compare}}
+@end ifclear
+By default, the results for multiple experiments are aggregated. This
+command changes this to enable the comparison of experiments for certain
+views (e.g. the function view). The first experiment specified is defined
+to be the reference. The following options are supported:
+
+@table @code
+
+@item on
+For each experiment specified on the command line, print the values for
+the metrics that have been activated for the experiment.
+
+@item off
+Disable the comparison of experiments. This is the default.
+
+@item delta
+Print the values for the reference experiment. The results for the other
+experiments are shown as a delta relative to the reference (current-reference).
+
+@item ratio
+Print the values for the reference experiment. The results for the other
+experiments are shown as a ratio relative to the reference (current/reference).
+
+@end table
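+
+For example, assuming two placeholder experiment directories named
+@samp{ref.er} and @samp{new.er}, the hypothetical command below compares the
+function views, showing the results for @samp{new.er} as a ratio relative to
+the reference experiment @samp{ref.er}:
+
+@smallexample
+$ gprofng display text -compare ratio -functions ref.er new.er   # placeholder names
+@end smallexample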
+
+@item disasm @var{function-name}
+@ifclear man
+@IndexSubentry{Options, @code{-disasm}}
+@IndexSubentry{Commands, @code{disasm}}
+@end ifclear
+List the source code and instructions for the function specified. The
+instructions are annotated with the metrics used.
+
+@item fsingle @var{function-name} [@var{n}]
+@ifclear man
+@IndexSubentry{Options, @code{-fsingle}}
+@IndexSubentry{Commands, @code{fsingle}}
+@end ifclear
+Write a summary panel for the specified function. The optional parameter
+@var{n} is needed for those cases where several functions have the same name.
+
+@item fsummary
+@ifclear man
+@IndexSubentry{Options, @code{-fsummary}}
+@IndexSubentry{Commands, @code{fsummary}}
+@end ifclear
+Write a summary panel for each function in the function list.
+
+@item functions
+@ifclear man
+@IndexSubentry{Options, @code{-functions}}
+@IndexSubentry{Commands, @code{functions}}
+@end ifclear
+Display a list of all functions executed. For each function, the metrics
+used (e.g. the CPU time) are shown.
+
+@item header
+@ifclear man
+@IndexSubentry{Options, @code{-header}}
+@IndexSubentry{Commands, @code{header}}
+@end ifclear
+Show several operational characteristics of the experiment(s) specified
+on the command line.
+
+@item limit @var{n}
+@ifclear man
+@IndexSubentry{Options, @code{-limit}}
+@IndexSubentry{Commands, @code{limit}}
+@end ifclear
+Limit the output to @var{n} lines.
+
+@item lines
+@ifclear man
+@IndexSubentry{Options, @code{-lines}}
+@IndexSubentry{Commands, @code{lines}}
+@end ifclear
+Write a list of source lines and their metrics, ordered by the current
+sort metric.
+
+@item metric_list
+@ifclear man
+@IndexSubentry{Options, @code{-metric_list}}
+@IndexSubentry{Commands, @code{metric_list}}
+@end ifclear
+Display the currently selected metrics in the function view and a list
+of all the metrics available for the target experiment(s).
+
+@item metrics @var{metric-spec}
+@ifclear man
+@IndexSubentry{Options, @code{-metrics}}
+@IndexSubentry{Commands, @code{metrics}}
+@end ifclear
+Define the metrics to be displayed in the function and callers-callees
+overviews.
+
+The @var{metric-spec} can either be the keyword @samp{default}
+to restore the default metrics selection, or a colon-separated list
+of metrics.
+
+The gprofng user guide has more details on how to define metrics.
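+
+For example, assuming @samp{e.totalcpu} is among the metrics reported by
+@command{metric_list}, the hypothetical command below restricts the function
+view for the placeholder experiment @samp{my-exp.er} to this metric:
+
+@smallexample
+$ gprofng display text -metrics e.totalcpu -functions my-exp.er
+@end smallexample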
+
+@item name @{short | long | mangled@}[:@{soname | nosoname@}]
+@ifclear man
+@IndexSubentry{Options, @code{-name}}
+@IndexSubentry{Commands, @code{name}}
+@end ifclear
+Specify whether to use the short, long, or mangled form of function names.
+Optionally, the load object that the function is part of can be included in
+the output by adding the @emph{soname} keyword. It can also be omitted
+(@emph{nosoname}), which is the default.
+
+Whether there is an actual difference between these types of names depends
+on the language.
+
+Note that there should be no (white)space to the left and right of the
+colon (@samp{:}).
+
+@item overview
+@ifclear man
+@IndexSubentry{Options, @code{-overview}}
+@IndexSubentry{Commands, @code{overview}}
+@end ifclear
+Show a summary of the recorded performance data for the experiment(s)
+specified on the command line.
+
+@item pcs
+@ifclear man
+@IndexSubentry{Options, @code{-pcs}}
+@IndexSubentry{Commands, @code{pcs}}
+@end ifclear
+Write a list of program counters (PCs) and their metrics, ordered by
+the current sort metric.
+
+@item sort @var{metric-spec}
+@ifclear man
+@IndexSubentry{Options, @code{-sort}}
+@IndexSubentry{Commands, @code{sort}}
+@end ifclear
+Sort the function list on the @var{metric-spec} given.
+
+@IndexSubentry{Sort, Reverse order}
+The data can be sorted in reverse order by prepending the metric definition
+with a minus (@samp{-}) sign.
+
+@noindent
+For example @command{sort -e.totalcpu}.
+
+@IndexSubentry{Sort, Reset to default}
+A default metric for the sort operation has been defined and since this is
+a persistent command, this default can be restored with @code{default} as
+the key (@command{sort default}).
+
+@item source @var{function-name}
+@ifclear man
+@IndexSubentry{Options, @code{-source}}
+@IndexSubentry{Commands, @code{source}}
+@end ifclear
+List the source code for the function specified, annotated with the metrics
+used.
+
+@item viewmode @{user | expert | machine@}
+@ifclear man
+@IndexSubentry{Options, @code{-viewmode}}
+@IndexSubentry{Commands, @code{viewmode}}
+@end ifclear
+This command is only relevant for Java programs. For all other languages
+supported, the viewmode setting has no effect.
+
+The following options are supported:
+
+@table @code
+
+@item user
+Show the Java call stacks for Java threads, but do not show housekeeping
+threads. The function view includes a function called @samp{<JVM-System>}.
+This represents the aggregated time from non-Java threads.
+In case the JVM software does not report a Java call stack, time is reported
+against the function @samp{<no Java callstack recorded>}.
+
+@item expert
+Show the Java call stacks for Java threads when the user Java code is executed,
+and machine call stacks when JVM code is executed, or when the JVM software
+does not report a Java call stack. Show the machine call stacks for
+housekeeping threads.
+
+@item machine
+Show the actual native call stacks for all threads. This is the view mode
+for C, C++, and Fortran.
+
+@end table
+
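+@noindent
+For example, to see the native call stacks for all threads in a Java
+experiment:
+
+@smallexample
+viewmode machine
+@end smallexample
+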
+@end table
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c SEEALSO section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{SEEALSO}
+@c man begin SEEALSO
+
+gprofng(1), gp-archive(1), gp-collect-app(1), gp-display-html(1), gp-display-src(1)
+
+The user guide for gprofng is maintained as a Texinfo manual. If the
+@command{info} and @command{gprofng} programs are correctly installed, the
+command @command{info gprofng} should give access to the manual.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c COPYRIGHT section
+@c ----------------------------------------------------------------------------
+
+@ManPageStart{COPYRIGHT}
+@c man begin COPYRIGHT
+
+Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
+
+Permission is granted to copy, distribute and/or modify this document
+under the terms of the GNU Free Documentation License, Version 1.3
+or any later version published by the Free Software Foundation;
+with no Invariant Sections, with no Front-Cover Texts, and with no
+Back-Cover Texts. A copy of the license is included in the
+section entitled ``GNU Free Documentation License''.
+
+@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c If this text is used for a man page, exit. Otherwise we need to continue.
+@c ----------------------------------------------------------------------------
+
+@ifset man
+@bye
+@end ifset
--- /dev/null
+@c -- Macro definitions -------------------------------------------------------
+@c
+@c Since only letters can be used, we use capitalization to distinguish
+@c different words.
+@c ----------------------------------------------------------------------------
+@macro CollectApp{}
+@command{gprofng collect app}
+@end macro
+
+@macro DisplayHTML{}
+@command{gprofng display html}
+@end macro
+
+@macro DisplayText{}
+@command{gprofng display text}
+@end macro
+
+@macro DisplaySRC{}
+@command{gprofng display src}
+@end macro
+
+@macro Archive{}
+@command{gprofng archive}
+@end macro
+
+@macro Driver{}
+@command{gprofng}
+@end macro
+
+@macro ProductName{}
+gprofng
+@end macro
+
+@macro ToolName{}
+@command{gprofng}
+@end macro
+
+@macro IndexSubentry{label, string}
+@c -- @cindex \label\ @subentry \string\
+@cindex \label\, \string\
+@end macro
+
+@macro vspace {lines}
+@sp \lines\
+@end macro
+
+@c -- For some reason ending this macro with @noindent does not work out well.
+
+@macro OptionHeader {lines, option, description}
+@sp \lines\
+@noindent
+@code{\option\} @ @emph{\description\}
+@c -- @sp 1
+@end macro
+
+@macro gcctabopt{body}
+@code{\body\}
+@end macro
+
+@macro ManPageStart{headername}
+@ifclear man
+@sp 1
+@noindent @b{\headername\}
+@indentedblock
+@end ifclear
+@end macro
+
+@macro ManPageEnd{}
+@ifclear man
+@end indentedblock
+@end ifclear
+@end macro
-\input texinfo @c -*-texinfo-*-
-
-@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES
-@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) {
-
@c ----------------------------------------------------------------------------
-@c This is the Texinfo source file for the GPROFNG manual.
+@c This is the Texinfo source file for the gprofng man page.
@c
@c Author: Ruud van der Pas
@c ----------------------------------------------------------------------------
+@ifset man
+\input texinfo @c -*-texinfo-*-
+@setfilename gprofng
+@settitle The next generation GNU application profiling tool
+@include gp-macros.texi
+@end ifset
-@c %**start of header
-
-@setfilename gprofng.info
-@settitle GNU gprofng
-
-@c -- Set the indent for the @example command to 1 space, not 5 ---------------
-@exampleindent 1
-
-@c %**end of header
-
-@c -- Start a new chapter on a new, odd numbered, page ------------------------
-@setchapternewpage odd
-
-@c -- Merge all index entries into the Concepts Index -------------------------
-@syncodeindex fn cp
-@syncodeindex ky cp
-@syncodeindex pg cp
-@syncodeindex vr cp
+@c @ManPageStart{NAME}
+@c @ManPageStart{SYNOPSIS}
+@c @ManPageStart{DESCRIPTION}
+@c @ManPageStart{OPTIONS}
+@c @ManPageStart{NOTES}
+@c @ManPageStart{SEEALSO}
+@c @ManPageStart{COPYRIGHT}
-@c -- Macro definitions -------------------------------------------------------
+@c ----------------------------------------------------------------------------
+@c This is from the man-pages(7) man page
+@c
+@c "The list below shows conventional or suggested sections. Most manual pages
+@c should include at least the highlighted sections. Arrange a new manual
+@c page so that sections are placed in the order shown in the list."
+@c
+@c NAME
+@c SYNOPSIS
+@c CONFIGURATION [Normally only in Section 4]
+@c DESCRIPTION
+@c OPTIONS [Normally only in Sections 1, 8]
+@c EXIT STATUS [Normally only in Sections 1, 8]
+@c RETURN VALUE [Normally only in Sections 2, 3]
+@c ERRORS [Typically only in Sections 2, 3]
+@c ENVIRONMENT
+@c FILES
+@c VERSIONS [Normally only in Sections 2, 3]
+@c ATTRIBUTES [Normally only in Sections 2, 3]
+@c CONFORMING TO
+@c NOTES
+@c BUGS
+@c EXAMPLES
+@c AUTHORS [Discouraged]
+@c REPORTING BUGS [Not used in man-pages]
+@c COPYRIGHT [Not used in man-pages]
+@c SEE ALSO
@c
-@c Since only letters can be used, we use capitalization to distinguish
-@c different words.
+@c This is what the texi2pod.pl tool recognizes:
+@c
+@c for $sect (qw(NAME SYNOPSIS TARGET DESCRIPTION OPTIONS ENVIRONMENT FILES
+@c BUGS NOTES FOOTNOTES SEEALSO AUTHOR COPYRIGHT)) {
+@c
+@c What is interesting is that it places "SEE ALSO" before "COPYRIGHT", which
+@c makes sense and is also followed for the other formats.
@c ----------------------------------------------------------------------------
-@macro CollectApp{}
-@command{gprofng collect app}
-@end macro
-
-@macro DisplayHTML{}
-@command{gprofng display html}
-@end macro
-
-@macro DisplayText{}
-@command{gprofng display text}
-@end macro
-
-@macro Driver{}
-@command{gprofng}
-@end macro
-
-@macro ProductName{}
-gprofng
-@end macro
-
-@macro ToolName{}
-@command{gprofng}
-@end macro
-
-@macro IndexSubentry{label, string}
-@c -- @cindex \label\ @subentry \string\
-@cindex \label\, \string\
-@end macro
-
-@macro gcctabopt{body}
-@code{\body\}
-@end macro
-
-@c -- Get the version information ---------------------------------------------
-@include version.texi
-
-@c -- Entry for the Info dir structure ----------------------------------------
-@ifnottex
-@dircategory Software development
-@direntry
-* gprofng: (gprofng). The next generation profiling tool for Linux
-@end direntry
-@end ifnottex
-
-@c -- Copyright stuff ---------------------------------------------------------
-@copying
-This document is the manual for @ProductName{}, last updated @value{UPDATED}.
-
-Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
-
-@c -- @quotation
-Permission is granted to copy, distribute and/or modify this document
-under the terms of the GNU Free Documentation License,
-Version 1.3 or any later version published by the Free Software
-Foundation; with no Invariant Sections, with no Front-Cover texts,
-and with no Back-Cover Texts. A copy of the license is included in the
-section entitled ``GNU Free Documentation License.''
-
-@c -- @end quotation
-@end copying
-
-@finalout
-@smallbook
-
-@c -- Define the title page ---------------------------------------------------
-@titlepage
-@title GNU gprofng
-@subtitle The next generation profiling tool for Linux
-@subtitle version @value{VERSION} (last updated @value{UPDATED})
-@author Ruud van der Pas
-@page
-@vskip 0pt plus 1filll
-@insertcopying
-@c man begin COPYRIGHT
+@c ----------------------------------------------------------------------------
+@c NAME section
+@c ----------------------------------------------------------------------------
-Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
+@ManPageStart{NAME}
+@c man begin NAME
-Permission is granted to copy, distribute and/or modify this document
-under the terms of the GNU Free Documentation License, Version 1.3
-or any later version published by the Free Software Foundation;
-with no Invariant Sections, with no Front-Cover Texts, and with no
-Back-Cover Texts. A copy of the license is included in the
-section entitled ``GNU Free Documentation License''.
+gprofng - The driver for the gprofng application profiling tool
@c man end
+@ManPageEnd{}
-@end titlepage
-
-@c -- Generate the Table of Contents ------------------------------------------
-@contents
-
-@c -- The Top node ------------------------------------------------------------
-@c Should contain a short summary, copying permissions and a master menu.
@c ----------------------------------------------------------------------------
-@ifnottex
-@node Top
-@top GNU Gprofng
-
-@insertcopying
-@end ifnottex
-
-@ifinfo
-@c -- The menu entries --------------------------------------------------------
-
-@menu
-* Introduction:: About this manual.
-* Overview:: A brief overview of @ProductName{}.
-* A Mini Tutorial:: A short tutorial covering the key features.
-* Terminology:: Various concepts and some terminology explained.
-* Other Document Formats:: How to create this document in other formats.
-* Index:: The index.
-
-@detailmenu
-
---- The Detailed Node Listing ---
-
-Introduction
-
-Overview
-
-* Main Features:: A high level overview.
-* Sampling versus Tracing:: The pros and cons of sampling versus tracing.
-* Steps Needed to Create a Profile:: How to create a profile.
-
-A Mini Tutorial
-
-* Getting Started:: The basics of profiling with @ProductName().
-* Support for Multithreading:: Commands specific to multithreaded applications.
-* Viewing Multiple Experiments:: Analyze multiple experiments.
-* Profile Hardware Event Counters:: How to use hardware event counters.
-* Java Profiling:: How to profile a Java application.
-
-Terminology
+@c SYNOPSIS section
+@c ----------------------------------------------------------------------------
-* The Program Counter:: What is a Program Counter?
-* Inclusive and Exclusive Metrics:: An explanation of inclusive and exclusive metrics.
-* Metric Definitions:: Definitions associated with metrics.
-* The Viewmode:: Select the way call stacks are presented.
-* The Selection List:: How to define a selection.
-* Load Objects and Functions:: The components in an application.
-* The Concept of a CPU in @ProductName{}:: The definition of a CPU.
-* Hardware Event Counters Explained:: What are event counters?
-* apath:: Our generic definition of a path.
+@ManPageStart{SYNOPSIS}
+@c man begin SYNOPSIS
-@c -- Index
+@command{gprofng} [@var{options}] @var{action} [@var{qualifier}] [@var{options}] @var{target} [@var{options}]
-@end detailmenu
-@end menu
-@end ifinfo
+@c man end
+@ManPageEnd{}
-@ifset man
+@c ----------------------------------------------------------------------------
+@c DESCRIPTION section
+@c ----------------------------------------------------------------------------
-@c man title gprofng the driver for the gprofng tool suite
+@ManPageStart{DESCRIPTION}
+@c man begin DESCRIPTION
-@c man begin SYNOPSIS
-gprofng [OPTION(S)] ACTION [@b{QUALIFIER}] [ARGUMENTS] TARGET
-@c man end
+This is the driver for the gprofng tools suite, which is used to gather and
+analyze performance data.
-@c man begin DESCRIPTION
-This is the driver for the GPROFNG tools suite to gather and analyze performance data.
+The driver executes the @var{action} specified. An example of an action is
+@samp{collect} to collect performance data. Depending on the action, a
+@var{qualifier} may be needed to further define the command.
+The last item is the @var{target} that the command applies to.
-The driver executes the action specified. An example of an action is @code{collect}
-to collect performance data. Depending on the action, a qualifier may be needed to
-define the command. Several qualifiers support options. The last item on the command
-is the target the command applies to.
+There are three places where options can be given. The driver itself supports
+options; these are described below. The @var{action}, possibly in combination
+with the @var{qualifier}, also supports options; a description of these can be
+found in the man page for the corresponding command. Any options needed to
+execute the target command should follow the target name.
-For example, to collect performance data for an application called @code{a.out} and
-store the results in experiment directory @code{mydata.er}, the following command may
-be used:
+For example, to collect performance data for an application called
+@command{a.out} and store the results in experiment directory @samp{mydata.er},
+the following command may be used:
@smallexample
-$ gprofng collect app -o mydata.er a.out
+$ gprofng collect app -o mydata.er a.out -t 2
@end smallexample
-In this example, the action is @code{collect}, the qualifier is @code{app}, the single
-argument is @code{-o mydata.er} and the target is @code{a.out}.
+In this example, the action is @samp{collect}, the qualifier is @samp{app}, the single
+argument to the command is @code{-o mydata.er} and the target is @command{a.out}.
+The target command is invoked with the @samp{-t 2} option.
If gprofng is executed without any additional option, action, or target, a usage
overview is printed.
@c man end
+@ManPageEnd{}
+
+@c ----------------------------------------------------------------------------
+@c OPTIONS section
+@c ----------------------------------------------------------------------------
+@ManPageStart{OPTIONS}
@c man begin OPTIONS
@table @gcctabopt
@item @var{--version}
-print the version number and exit.
+@ifclear man
+@IndexSubentry{Options, @code{--version}}
+@end ifclear
+Print the version number and exit.
@item @var{--help}
-print usage information and exit.
+@ifclear man
+@IndexSubentry{Options, @code{--help}}
+@end ifclear
+Print usage information and exit.
@end table
@c man end
+@ManPageEnd{}
-@c man begin NOTES
-
-The gprofng driver supports the following commands.
-
-@c The man pages for the commands below can be viewed using the command name with "gprofng" replaced by "gp" and the spaces replaced by a dash ("-"). For example the man page
-@c name for "gprofng collect app" is "gp-collect-app".
-
-Collect performance data:
-
-@table @code
-
-@item gprofng collect app
-collect application performance data.
-
-@end table
-
-Display the performance results:
-
-@table @code
-
-@item gprofng display text
-display the performance data in ASCII format.
-
-@item gprofng display html
-generate an HTML file from one or more experiments.
-
-@end table
-
-Miscellaneous commands:
-
-@table @code
-
-@item gprofng display src
-display source or disassembly with compiler annotations.
-
-@item gprofng archive
-include binaries and source code in an experiment directory.
-
-@end table
-
-It is also possible to invoke the lower level commands directly, but since
-these are subject to change, in particular the options, we recommend to
-use the driver.
-
-@c man end
+@c -----------------------------------------------------------------------------
+@c ENVIRONMENT SECTION
+@c -----------------------------------------------------------------------------
+@ManPageStart{ENVIRONMENT}
@c man begin ENVIRONMENT
+
The following environment variables are supported:
-@table @code
+@table @samp
@item @env{GPROFNG_MAX_CALL_STACK_DEPTH}
-set the depth of the call stack (default is 256).
+@cindex Environment variables
+Set the depth of the call stack (default is 256).
@item @env{GPROFNG_USE_JAVA_OPTIONS}
-may be set when profiling a C/C++ application that uses dlopen() to execute Java code.
+@cindex Environment variables
+May be set when profiling a C/C++ application that uses dlopen() to execute
+Java code.
-@item @env{GPROFNG_SSH_REMOTE_DISPLAY}
-use this variable to define the ssh command executed by the remote display tool.
+@c -- deferred @item @env{GPROFNG_SSH_REMOTE_DISPLAY}
+@c -- deferred Use this variable to define the ssh command executed by the remote display tool.
-@item @env{GPROFNG_SKIP_VALIDATION}
-set this variable to disable checking hardware, system, and Java versions.
+@c -- deferred @item @env{GPROFNG_SKIP_VALIDATION}
+@c -- deferred Set this variable to disable checking hardware, system, and Java versions.
@item @env{GPROFNG_ALLOW_CORE_DUMP}
-set this variable to allow a core file to be generated; otherwise an error report is created on /tmp.
+@cindex Environment variables
+Set this variable to allow a core file to be generated; otherwise an error
+report is created in @file{/tmp}.
@item @env{GPROFNG_ARCHIVE}
-use this variable to define the settings for automatic archiving upon experiment recording completion.
+@cindex Environment variables
+Use this variable to define the settings for automatic archiving upon experiment
+recording completion.
@item @env{GPROFNG_ARCHIVE_COMMON_DIR}
-set this variable to the location of the common archive.
+@cindex Environment variables
+Set this variable to the location of the common archive.
@item @env{GPROFNG_JAVA_MAX_CALL_STACK_DEPTH}
-set the depth of the Java call stack; the default is 256; set to 0 to disable capturing of call stacks.
+@cindex Environment variables
+Set the depth of the Java call stack; the default is 256; set to 0 to disable
+capturing of call stacks.
@item @env{GPROFNG_JAVA_NATIVE_MAX_CALL_STACK_DEPTH}
-set the depth of the Java native call stack; the default is 256; set to 0 to disable capturing of call stacks (JNI and assembly call stacks are not captured).
+@cindex Environment variables
+Set the depth of the Java native call stack; the default is 256; set to 0 to
+disable capturing of call stacks (JNI and assembly call stacks are not
+captured).
@end table
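+
+@noindent
+For example, the invocation below raises the maximum call stack depth for a
+single data collection run; the value of 512 is purely an illustration:
+
+@smallexample
+$ GPROFNG_MAX_CALL_STACK_DEPTH=512 gprofng collect app -o mydata.er a.out
+@end smallexample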
@c man end
+@ManPageEnd{}
-@c man begin SEEALSO
-The man pages for the various gprofng commands are not available yet, but
-the @option{--help} option supported on each of the commands lists the options
-and provides more information.
-
-For example this displays the options supported on the @command{gprofng collect app}
-command:
-
-@smallexample
-$ gprofng collect app --help
-@end smallexample
-
-The user guide is available as an Info entry for @file{gprofng}.
-@c man end
-
-@end ifset
-
-@c man begin DESCRIPTION
-@c man end
-
-@c -- A new node --------------------------------------------------------------
-@node Introduction
-@chapter Introduction
-@c ----------------------------------------------------------------------------
-The @ProductName{} tool is the next generation profiler for Linux. It consists
-of various commands to generate and display profile information.
-
-This manual starts with a tutorial how to create and interpret a profile. This
-part is highly practical and has the goal to get users up to speed as quickly
-as possible. As soon as possible, we would like to show you how to get your
-first profile on your screen.
-
-This is followed by more examples, covering many of the features. At the
-end of this tutorial, you should feel confident enough to tackle the more
-complex tasks.
-
-In a future update a more formal reference manual will be included as well.
-Since even in this tutorial we use certain terminology, we have included a
-chapter with descriptions at the end. In case you encounter unfamiliar
-wordings or terminology, please check this chapter.
-
-One word of caution. In several cases we had to somewhat tweak the screen
-output in order to make it fit. This is why the output may look somewhat
-different when you try things yourself.
-
-For now, we wish you a smooth profiling experience with @ProductName{} and
-good luck tackling performance bottlenecks.
-
-@c -- A new node --------------------------------------------------------------
-@c cccccc @node A Brief Overview of @ProductName{}
-@node Overview
-@chapter A Brief Overview of @ProductName{}
-@c ----------------------------------------------------------------------------
-
-@menu
-* Main Features:: A high level overview.
-* Sampling versus Tracing:: The pros and cons of sampling versus tracing.
-* Steps Needed to Create a Profile:: How to create a profile.
-@end menu
-
-Before we cover this tool in quite some detail, we start with a brief overview
-of what it is, and the main features. Since we know that many of you would
-like to get started rightaway, already in this first chapter we explain the
-basics of profiling with @ToolName{}.
-
-@c ----------------------------------------------------------------------------
-@c TBD Review this text. Probably be more specific on the gcc releases and
-@c processor specifics.
-@c ----------------------------------------------------------------------------
-
-@c -- A new node --------------------------------------------------------------
-@node Main Features
-@section Main Features
@c ----------------------------------------------------------------------------
-
-@noindent
-These are the main features of the @ProductName{} tool:
-
-@itemize @bullet
-
-@item
-Profiling is supported for an application written in C, C++, Java, or Scala.
-
-@c TBD Java: up to 1.8 full support, support other than for modules
-
-@item
-Shared libraries are supported. The information is presented at the instruction
-level.
-
-@item
-The following multithreading programming models are supported: Pthreads,
-OpenMP, and Java threads.
-
-@item
-This tool works with unmodified production level executables. There is no need to
-recompile the code, but if the @code{-g} option has been used when building
-the application, source line level information is available.
-
-@item
-The focus is on support for code generated with the @code{gcc} compiler, but
-there is some limited support for the @code{icc} compiler as well. Future
-improvements and enhancements will focus on @code{gcc} though.
-
-@item
-Processors from Intel, AMD, and Arm are supported, but the level of support
-depends on the architectural details. In particular, hardware event counters
-may not be supported.
-
-@item
-Several views into the data are supported. For example, a function overview
-where the time is spent, but also a source line, disassembly, call tree and
-a caller-callees overview are available.
-
-@item
-Through filters, the user can zoom in on an area of interest.
-
-@item
-Two or more profiles can be aggregated, or used in a comparison. This comparison
-can be obtained at the function, source line, and disassembly level.
-
-@item
-Through a scripting language, and customization of the metrics shown,
-the generation and creation of a profile can be fully automated and provide
-tailored output.
-
-@end itemize
-
-@c -- A new node --------------------------------------------------------------
-@node Sampling versus Tracing
-@section Sampling versus Tracing
+@c NOTES section
@c ----------------------------------------------------------------------------
-A key difference with some other profiling tools is that the main data
-collection command @CollectApp{} mostly uses
-@cindex Program Counter sampling
-@cindex PC sampling
-Program Counter (PC) sampling
-under the hood.
-
-With @emph{sampling}, the executable is stopped at regular intervals. Each time
-it is halted, key information is gathered and stored. This includes the Program
-Counter that keeps track of where the execution is. Hence the name.
-
-Together with operational
-data, this information is stored in the experiment directory and can be
-viewed in the second phase.
-
-For example, the PC information is used to derive where the program was when
-it was halted. Since the sampling interval is known, it is relatively easy to
-derive how much time was spent in the various parts of the program.
-
-The opposite technique is generally referred to as @emph{tracing}. With
-tracing, the target is instrumented with specific calls that collect the
-requested information.
-
-These are some of the pros and cons of PC sampling verus tracing:
-
-@itemize
-
-@item
-Since there is no need to recompile, existing executables can be used
-and the profile measures the behaviour of exactly the same executable that is
-used in production runs.
-
-With sampling, one inherently profiles a different executable because
-the calls to the instrumentation library may affect the compiler optimizations
-and run time behaviour.
-
-@item
-With sampling, there are very few restrictions on what can be profiled and even without
-access to the source code, a basic profile can be made.
+@ManPageStart{NOTES}
+@c man begin NOTES
-@item
-A downside of sampling is that, depending on the sampling frequency, small
-functions may be missed or not captured accurately. Although this is rare,
-this may happen and is the reason why the user has control over the sampling rate.
+The gprofng driver supports the following commands.
+@vspace{1}
-@item
-While tracing produces precise information, sampling is statistical in nature.
-As a result, small variations may occur across seemingly identical runs. We
-have not observed more than a few percent deviation though. Especially if
-the target job executed for a sufficiently long time.
+@c The man pages for the commands below can be viewed using the command name with "gprofng" replaced by "gp" and the spaces replaced by a dash ("-"). For example the man page
+@c name for "gprofng collect app" is "gp-collect-app".
-@item
-With sampling, it is not possible to get an accurate count how often
-functions are called.
+@i{Collect performance data:}
-@end itemize
+@table @code
-@c -- A new node --------------------------------------------------------------
-@node Steps Needed to Create a Profile
-@section Steps Needed to Create a Profile
-@c ----------------------------------------------------------------------------
+@item gprofng collect app
+Collect application performance data.
-Creating a profile takes two steps. First the profile data needs to be
-generated. This is followed by a viewing step to create a report from the
-information that has been gathered.
+@end table
-Every @ProductName{} command starts with @ToolName{}, the name of the driver. This is followed
-by a keyword to define the high level functionality. Depending on this
-keyword, a third qualifier may be needed to further narrow down the request.
-This combination is then followed by options that are specific to the functionality
-desired.
+@i{Display the performance results:}
-The command to gather, or ``collect'', the performance data is called
-@CollectApp{}. Aside from numerous options, this command takes the name
-of the target executable as an input parameter.
+@table @code
-Upon completion of the run, the performance data can be
-found in the newly created
-@cindex Experiment directory
-experiment directory.
+@item gprofng display text
+Display the performance data in ASCII format.
-Unless explicitly specified otherwise, a default
-name for this directory is chosen. The name is @code{test.<n>.er} where
-@code{n} is the first integer number not in use yet for such a name.
+@item gprofng display html
+Generate an HTML file from one or more experiments.
-For example, the first time @CollectApp{} is invoked, an experiment
-directory with the name @code{test.1.er} is created.
+@end table
-Upon a subsequent invocation of @CollectApp{} in the same directory,
-an experiment directory with the name @code{test.2.er} will be created,
-and so forth.
+@i{Miscellaneous commands:}
-Note that @CollectApp{} supports an option to explicitly name the experiment directory.
-Outside of the restriction that the name of this directory has to end
-with @code{.er}, any valid directory name can be used for this.
+@table @code
-Now that we have the performance data, the next step is to display it.
+@item gprofng display src
+Display source or disassembly with compiler annotations.
-@pindex @DisplayText{}
-The most commonly used command to view the performance information is
-@DisplayText{}. This is a very extensive and customizable tool that
-produces the information in ASCII format.
+@item gprofng archive
+Include binaries and source code in an experiment directory.
-@pindex @DisplayHTML{}
-Another option is to use @DisplayHTML{}. This tool generates a directory with
-files in html format. These can be viewed in a browser, allowing for easy
-navigation through the profile data.
+@end table
-@c -- A new node --------------------------------------------------------------
-@node A Mini Tutorial
-@chapter A Mini Tutorial
-@c ----------------------------------------------------------------------------
+It is also possible to invoke the lower level commands directly, but since
+these, and in particular their options, are subject to change, we recommend
+using the driver instead.
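+
+@noindent
+For example, the lower level command corresponding to
+@command{gprofng collect app} is @command{gp-collect-app}. Assuming it accepts
+the same options as the driver form, which may change between releases, the
+two invocations below are equivalent:
+
+@smallexample
+$ gprofng collect app -o mydata.er a.out
+$ gp-collect-app -o mydata.er a.out
+@end smallexample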
-In this chapter we present and discuss the main functionality of @ToolName{}.
-This will be a practical approach, using an example code to generate profile
-data and show how to get various performance reports.
-
-@menu
-* Getting Started:: The basics of profiling with @ProductName().
-* Support for Multithreading:: Commands specific to multithreaded applications.
-* Viewing Multiple Experiments:: Analyze multiple experiments.
-* Profile Hardware Event Counters:: How to use hardware event counters.
-* Java Profiling:: How to profile a Java application.
-@end menu
-
-@c -- A new node --------------------------------------------------------------
-@node Getting Started
-@section Getting Started
-@c ----------------------------------------------------------------------------
+@c man end
+@ManPageEnd{}
-The information presented here provides a good and common basis for many
-profiling tasks, but there are more features that you may want to leverage.
-
-These are covered in subsequent sections in this chapter.
-
-@menu
-* The Example Program:: A description of the example program used.
-* A First Profile:: How to get the first profile.
-* The Source Code View:: Display the metrics in the source code.
-* The Disassembly View:: Display the metrics at the instruction level.
-* Display and Define the Metrics:: An example how to customize the metrics.
-* A First Customization of the Output:: An example how to customize the output.
-* Name the Experiment Directory:: Change the name of the experiment directory.
-* Control the Number of Lines in the Output:: Change the number of lines in the tables.
-* Sorting the Performance Data:: How to set the metric to sort by.
-* Scripting:: Use a script to execute the commands.
-* A More Elaborate Example:: An example of customization.
-* The Call Tree:: Display the dynamic call tree.
-* More Information on the Experiment:: How to get additional statistics.
-* Control the Sampling Frequency:: How to control the sampling granularity.
-* Information on Load Objects:: How to get more information on load objects.
-@end menu
-
-@c -- A new node --------------------------------------------------------------
-@node The Example Program
-@subsection The Example Program
@c ----------------------------------------------------------------------------
-
-Throughout this guide we use the same example C code that implements the
-multiplication of a vector of length @math{n} by an @math{m} by @math{n}
-matrix. The result is stored in a vector of length @math{m}.
-@cindex Pthreads
-@cindex Posix Threads
-The algorithm has been parallelized using Posix Threads, or Pthreads for short.
-
-The code was built using the @code{gcc} compiler and the name of the executable
-is
-@cindex mxv-pthreads.exe
-mxv-pthreads.exe.
-
-The matrix sizes can be set through the @code{-m} and @code{-n} options. The
-number of threads is set with the @code{-t} option. To increase the duration
-of the run, the multiplication is executed repeatedly.
-
-This is an example that multiplies a @math{3000} by @math{2000} matrix with
-a vector of length @math{2000} using @math{2} threads:
-
-@smallexample
-@verbatim
-$ ./mxv-pthreads.exe -m 3000 -n 2000 -t 2
-mxv: error check passed - rows = 3000 columns = 2000 threads = 2
-$
-@end verbatim
-@end smallexample
-
-The program performs an internal check to verify the results are correct.
-The result of this check is printed, followed by the matrix sizes and the
-number of threads used.
-
-@c -- A new node --------------------------------------------------------------
-@node A First Profile
-@subsection A First Profile
+@c SEEALSO section
@c ----------------------------------------------------------------------------
-The first step is to collect the performance data. It is important to remember
-that much more information is gathered than may be shown by default. Often a
-single data collection run is sufficient to get a lot of insight.
-
-The @CollectApp{} command is used for the data collection. Nothing needs to be
-changed in the way the application is executed. The only difference is that it
-is now run under control of the tool, as shown below:
-
-@cartouche
-@smallexample
-$ gprofng collect app ./mxv.pthreads.exe -m 3000 -n 2000 -t 1
-@end smallexample
-@end cartouche
-
-This command produces the following output:
-
-@smallexample
-@verbatim
-Creating experiment database test.1.er (Process ID: 2416504) ...
-mxv: error check passed - rows = 3000 columns = 2000 threads = 1
-@end verbatim
-@end smallexample
+@ManPageStart{SEEALSO}
+@c man begin SEEALSO
-We see the message that a directory with the name @code{test.1.er}
-has been created.
-The application then completes as usual and we have our first experiment
-directory that can be analyzed.
+gp-archive(1), gp-collect-app(1), gp-display-html(1), gp-display-src(1),
+gp-display-text(1)
-The tool we use for this is called @DisplayText{}. It takes the name of
-the experiment directory as an argument.
+Each gprofng command also supports the @option{--help} option, which lists
+the supported options along with a short description of each.
-@cindex Interpreter mode
-If invoked this way, the tool starts in the interactive @emph{interpreter} mode.
-While in this environment, commands can be given and the tool responds. This is
-illustrated below:
+For example, this displays the options supported by the
+@command{gprofng collect app} command:
@smallexample
-@verbatim
-$ gprofng display text test.1.er
-Warning: History and command editing is not supported on this system.
-(gp-display-text) quit
-$
-@end verbatim
+$ gprofng collect app --help
@end smallexample
-@cindex Command line mode
-While useful in certain cases, we prefer to use this tool in command line mode,
-by specifying the commands to be issued when invoking the tool. The way to do
-this is to prepend the command with a hyphen (@code{-}) if used on the command
-line.
-
-For example,
-@IndexSubentry{Commands, @code{functions}}
-with the @code{functions} command we request a list of the functions that
-have been executed and their respective CPU times:
+The user guide for gprofng is maintained as a Texinfo manual. If the
+@command{info} and @command{gprofng} programs are correctly installed, the
+command @command{info gprofng} should give access to the manual.
-@cartouche
-@smallexample
-$ gprofng display text -functions test.1.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-$ gprofng display text -functions test.1.er
-Functions sorted by metric: Exclusive Total CPU Time
-
-Excl. Incl. Name
-Total Total
-CPU sec. CPU sec.
-2.272 2.272 <Total>
-2.160 2.160 mxv_core
-0.047 0.103 init_data
-0.030 0.043 erand48_r
-0.013 0.013 __drand48_iterate
-0.013 0.056 drand48
-0.008 0.010 _int_malloc
-0.001 0.001 brk
-0.001 0.002 sysmalloc
-0. 0.001 __default_morecore
-0. 0.113 __libc_start_main
-0. 0.010 allocate_data
-0. 2.160 collector_root
-0. 2.160 driver_mxv
-0. 0.113 main
-0. 0.010 malloc
-0. 0.001 sbrk
-@end verbatim
-@end smallexample
+@c man end
+@ManPageEnd{}
-As easy and simple as these steps are, we do have a first profile of our program!
-There are three columns. The first two contain the
-@cindex Total CPU time
-@emph{Total CPU Time},
-which
-is the sum of the user and system time. @xref{Inclusive and Exclusive Metrics}
-for an explanation of ``exclusive'' and ``inclusive'' times.
-
-The first line echoes the metric that is used to sort the output. By default, this
-is the exclusive CPU time, but the sort metric can be changed by the user.
-
-We then see three columns with the exclusive and inclusive CPU times, plus the
-name of the function.
-
-@IndexSubentry{Miscellaneous, @code{<Total>}}
-The function with the name @code{<Total>} is not a user function, but is introduced
-by @ToolName{} and is used to display the accumulated metric values. In this case,
-we see that the total CPU time of this job was @code{2.272} seconds.
-
-With @code{2.160} seconds, function @code{mxv_core} is the most time
-consuming function. It is also a leaf function.
-
-The next function in the list is @code{init_data}. Although the CPU time spent in
-this part is negligible, this is an interesting entry because the inclusive CPU
-time of @code{0.103} seconds is higher than the exclusive CPU time of @code{0.047}
-seconds. Clearly it is calling another function,
-or even more than one function.
-@xref{The Call Tree} for the details how to get more information on this.
-
-The function @code{collector_root} does not look familiar. It is one of the internal
-functions used by @CollectApp{} and can be ignored. While the inclusive time is high,
-the exclusive time is zero. This means it doesn't contribute to the performance.
-
-The question is how we know where this function originates from? There is a very useful
-command to get more details on a function. @xref{Information on Load Objects}.
-
-@c -- A new node --------------------------------------------------------------
-@node The Source Code View
-@subsection The Source Code View
@c ----------------------------------------------------------------------------
-
-In general, you would like to focus the tuning efforts on the most time
-consuming part(s) of the program. In this case that is easy, since 2.160
-seconds on a total of 2.272 seconds is spent in function @code{mxv_core}.
-That is 95% of the total and it is time to dig deeper and look
-@cindex Source level timings
-at the time distribution at the source code level.
-
-@IndexSubentry{Commands, @code{source}}
-The @code{source} command is used to accomplish this. It takes the name of the
-function, not the source filename, as an argument. This is demonstrated
-below, where the @DisplayText{} command is used to show the annotated
-source listing of function @code{mxv_core}.
-
-Please note that the source code has to be compiled with the @code{-g}
-option in order for the source code feature to work. Otherwise the
-location can not be determined.
-
-@cartouche
-@smallexample
-$ gprofng display text -source mxv_core test.1.er
-@end smallexample
-@end cartouche
-
-The slightly modified output is as follows:
-
-@smallexample
-@verbatim
-Source file: <apath>/mxv.c
-Object file: mxv-pthreads.exe (found as test.1.er/archives/...)
-Load Object: mxv-pthreads.exe (found as test.1.er/archives/...)
-
- Excl. Incl.
- Total Total
- CPU sec. CPU sec.
-
- <lines deleted>
- <Function: mxv_core>
- 0. 0. 32. void __attribute__ ((noinline))
- mxv_core (
- uint64_t row_index_start,
- uint64_t row_index_end,
- uint64_t m, uint64_t n,
- double **restrict A,
- double *restrict b,
- double *restrict c)
- 0. 0. 33. {
- 0. 0. 34. for (uint64_t i=row_index_start;
- i<=row_index_end; i++) {
- 0. 0. 35. double row_sum = 0.0;
-## 1.687 1.687 36. for (int64_t j=0; j<n; j++)
- 0.473 0.473 37. row_sum += A[i][j]*b[j];
- 0. 0. 38. c[i] = row_sum;
- 39. }
- 0. 0. 40. }
-@end verbatim
-@end smallexample
-
-The first three lines provide information on the location of the source file,
-the object file and the load object (@xref{Load Objects and Functions}).
-
-Function @code{mxv_core} is part of a source file that has other functions
-as well. These functions will be shown, but without timing information. They
-have been removed in the output shown above.
-
-This is followed by the annotated source code listing. The selected metrics
-are shown first, followed by a source line number, and the source code.
-@IndexSubentry{Miscellaneous ,@code{##}}
-The most time consuming line(s) are marked with the @code{##} symbol. In
-this way they are easier to find.
-
-What we see is that all of the time is spent in lines 36-37.
-
-@IndexSubentry{Commands, @code{lines}}
-A related command sometimes comes handy as well. It is called @code{lines}
-and displays a list of the source lines and their metrics, ordered according
-to the current sort metric (@xref{Sorting the Performance Data}).
-
-Below the command and the output. For lay-out reasons, only the top 10 is
-shown here and the last part of the text on some lines has been replaced
-by dots.
-
-@cartouche
-@smallexample
-$ gprofng display text -lines test.1.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Lines sorted by metric: Exclusive Total CPU Time
-
-Excl. Incl. Name
-Total Total
-CPU sec. CPU sec.
-2.272 2.272 <Total>
-1.687 1.687 mxv_core, line 36 in "mxv.c"
-0.473 0.473 mxv_core, line 37 in "mxv.c"
-0.032 0.088 init_data, line 72 in "manage_data.c"
-0.030 0.043 <Function: erand48_r, instructions without line numbers>
-0.013 0.013 <Function: __drand48_iterate, instructions without ...>
-0.013 0.056 <Function: drand48, instructions without line numbers>
-0.012 0.012 init_data, line 77 in "manage_data.c"
-0.008 0.010 <Function: _int_malloc, instructions without ...>
-0.003 0.003 init_data, line 71 in "manage_data.c"
-@end verbatim
-@end smallexample
-
-What this overview immediately highlights is that the next most time consuming
-source line takes 0.032 seconds only. With an inclusive time of 0.088 seconds,
-it is also clear that this branch of the code does not impact the performance.
-
-@c -- A new node --------------------------------------------------------------
-@node The Disassembly View
-@subsection The Disassembly View
+@c COPYRIGHT section
@c ----------------------------------------------------------------------------
-The source view is very useful to obtain more insight where the time is spent,
-but sometimes this is not sufficient. This is when the disassembly view comes
-in. It is activated with the
-@IndexSubentry{Commands, @code{disasm}}
-@code{disasm}
-command and as with the source view, it displays an annotated listing. In this
-@cindex Instruction level timings
-case it shows the instructions with the metrics, interleaved with the
-source lines. The
-instructions have a reference in square brackets (@code{[} and @code{]})
-to the source line they correspond to.
-
-@noindent
-This is what we get for our example:
-
-@cartouche
-@smallexample
-$ gprofng display text -disasm mxv_core test.1.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Source file: <apath>/mxv.c
-Object file: mxv-pthreads.exe (found as test.1.er/archives/...)
-Load Object: mxv-pthreads.exe (found as test.1.er/archives/...)
-
- Excl. Incl.
- Total Total
- CPU sec. CPU sec.
-
- <lines deleted>
- 32. void __attribute__ ((noinline))
- mxv_core (
- uint64_t row_index_start,
- uint64_t row_index_end,
- uint64_t m, uint64_t n,
- double **restrict A,
- double *restrict b,
- double *restrict c)
- 33. {
- <Function: mxv_core>
- 0. 0. [33] 4021ba: mov 0x8(%rsp),%r10
- 34. for (uint64_t i=row_index_start;
- i<=row_index_end; i++) {
- 0. 0. [34] 4021bf: cmp %rsi,%rdi
- 0. 0. [34] 4021c2: jbe 0x37
- 0. 0. [34] 4021c4: ret
- 35. double row_sum = 0.0;
- 36. for (int64_t j=0; j<n; j++)
- 37. row_sum += A[i][j]*b[j];
- 0. 0. [37] 4021c5: mov (%r8,%rdi,8),%rdx
- 0. 0. [36] 4021c9: mov $0x0,%eax
- 0. 0. [35] 4021ce: pxor %xmm1,%xmm1
- 0.002 0.002 [37] 4021d2: movsd (%rdx,%rax,8),%xmm0
- 0.096 0.096 [37] 4021d7: mulsd (%r9,%rax,8),%xmm0
- 0.375 0.375 [37] 4021dd: addsd %xmm0,%xmm1
-## 1.683 1.683 [36] 4021e1: add $0x1,%rax
- 0.004 0.004 [36] 4021e5: cmp %rax,%rcx
- 0. 0. [36] 4021e8: jne 0xffffffffffffffea
- 38. c[i] = row_sum;
- 0. 0. [38] 4021ea: movsd %xmm1,(%r10,%rdi,8)
- 0. 0. [34] 4021f0: add $0x1,%rdi
- 0. 0. [34] 4021f4: cmp %rdi,%rsi
- 0. 0. [34] 4021f7: jb 0xd
- 0. 0. [35] 4021f9: pxor %xmm1,%xmm1
- 0. 0. [36] 4021fd: test %rcx,%rcx
- 0. 0. [36] 402200: jne 0xffffffffffffffc5
- 0. 0. [36] 402202: jmp 0xffffffffffffffe8
- 39. }
- 40. }
- 0. 0. [40] 402204: ret
-@end verbatim
-@end smallexample
-
-For each instruction, the timing values are given and we can exactly which ones
-are the most expensive. As with the source level view, the most expensive
-instructions are market with the @code{##} symbol.
-
-As illustrated below and similar to the @code{lines} command, we can get
-an overview of the instructions executed by using the
-@IndexSubentry{Commands, @code{pcs}}
-@code{pcs}
-command.
+@ManPageStart{COPYRIGHT}
+@c man begin COPYRIGHT
-@noindent
-Below the command and the output, which again has been restricted
-to 10 lines:
+Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
-@cartouche
-@smallexample
-$ gprofng display text -pcs test.1.er
-@end smallexample
-@end cartouche
+Permission is granted to copy, distribute and/or modify this document
+under the terms of the GNU Free Documentation License, Version 1.3
+or any later version published by the Free Software Foundation;
+with no Invariant Sections, with no Front-Cover Texts, and with no
+Back-Cover Texts. A copy of the license is included in the
+section entitled ``GNU Free Documentation License''.
-@smallexample
-@verbatim
-PCs sorted by metric: Exclusive Total CPU Time
-
-Excl. Incl. Name
-Total Total
-CPU sec. CPU sec.
-2.272 2.272 <Total>
-1.683 1.683 mxv_core + 0x00000027, line 36 in "mxv.c"
-0.375 0.375 mxv_core + 0x00000023, line 37 in "mxv.c"
-0.096 0.096 mxv_core + 0x0000001D, line 37 in "mxv.c"
-0.027 0.027 init_data + 0x000000BD, line 72 in "manage_data.c"
-0.012 0.012 init_data + 0x00000117, line 77 in "manage_data.c"
-0.008 0.008 _int_malloc + 0x00000A45
-0.007 0.007 erand48_r + 0x00000062
-0.006 0.006 drand48 + 0x00000000
-0.005 0.005 __drand48_iterate + 0x00000005
-@end verbatim
-@end smallexample
+@c man end
+@ManPageEnd{}
-@c -- A new node --------------------------------------------------------------
-@node Display and Define the Metrics
-@subsection Display and Define the Metrics
@c ----------------------------------------------------------------------------
-
-The default metrics shown by @DisplayText{} are useful, but there is more
-recorded than displayed. We can customize the values shown by defining the
-metrics ourselves.
-
-@IndexSubentry{Commands, @code{metric_list}}
-There are two commands related to changing the metrics shown: @code{metric_list}
-and
-@IndexSubentry{Commands, @code{metrics}}
-@code{metrics}.
-
-The first command shows the metrics in use, plus all the metrics that have
-been stored as part of the experiment. The second command may be used to
-define the metric list.
-
-In our example we get the following values for the metrics:
-
-@IndexSubentry{Commands, @code{metric_list}}
-@cartouche
-@smallexample
-$ gprofng display text -metric_list test.1.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Current metrics: e.totalcpu:i.totalcpu:name
-Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu )
-Available metrics:
- Exclusive Total CPU Time: e.%totalcpu
- Inclusive Total CPU Time: i.%totalcpu
- Size: size
- PC Address: address
- Name: name
-@end verbatim
-@end smallexample
-
-This shows the metrics currently in use, the metric that is used to sort
-the data and all the metrics that have been recorded, but are not necessarily
-shown.
-
-@cindex Default metrics
-In this case, the default metrics are set to the exclusive and inclusive
-total CPU times, plus the name of the function, or load object.
-
-@IndexSubentry{Commands, @code{metrics}}
-The @code{metrics} command is used to define the metrics that need to be
-displayed.
-
-For example, to display the exclusive total CPU time, both as a number and a
-percentage, use the following metric definition: @code{e.%totalcpu}
-
-Since the metrics can be tailored for different views, there is a way
-to reset them to the default. This is done through the special keyword
-@code{default}.
-
-@c -- A new node --------------------------------------------------------------
-@node A First Customization of the Output
-@subsection A First Customization of the Output
+@c If this text is used for a man page, exit. Otherwise we need to continue.
@c ----------------------------------------------------------------------------
-With the information just given, we can customize the function overview.
-For sake of the example, we would like to display the name of the function
-first, followed by the exclusive CPU time, given as an absolute number and
-a percentage.
-
-Note that the commands are parsed in order of appearance. This is why we
-need to define the metrics @emph{before} requesting the function overview:
-
-@cartouche
-@smallexample
-$ gprofng display text -metrics name:e.%totalcpu -functions test.1.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Current metrics: name:e.%totalcpu
-Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
-Functions sorted by metric: Exclusive Total CPU Time
-
-Name Excl. Total
- CPU
- sec. %
- <Total> 2.272 100.00
- mxv_core 2.160 95.04
- init_data 0.047 2.06
- erand48_r 0.030 1.32
- __drand48_iterate 0.013 0.57
- drand48 0.013 0.57
- _int_malloc 0.008 0.35
- brk 0.001 0.04
- sysmalloc 0.001 0.04
- __default_morecore 0. 0.
- __libc_start_main 0. 0.
- allocate_data 0. 0.
- collector_root 0. 0.
- driver_mxv 0. 0.
- main 0. 0.
- malloc 0. 0.
- sbrk 0. 0.
-@end verbatim
-@end smallexample
-
-This was a first and simple example how to customize the output. Note that we
-did not rerun our profiling job and merely modified the display settings.
-Below we will show other and also more advanced examples of customization.
-
-
-@c -- A new node --------------------------------------------------------------
-@node Name the Experiment Directory
-@subsection Name the Experiment Directory
-@c ----------------------------------------------------------------------------
-
-When using @CollectApp{}, the default names for experiments work fine, but
-they are quite generic. It is often more convenient to select a more
-descriptive name. For example, one that reflects conditions for the experiment
-conducted.
-
-For this, the mutually exclusive @code{-o} and @code{-O} options come in handy.
-Both may be used to provide a name for the experiment directory, but the
-behaviour of @CollectApp{} is different.
-
-With the
-@IndexSubentry{Options, @code{-o}}
-@code{-o}
-option, an existing experiment directory is not overwritten. You either
-need to explicitly remove an existing directory first, or use a name that is not
-in use yet.
-
-This is in contrast with the behaviour for the
- @IndexSubentry{Options, @code{-O}}
-@code{-O}
-option. Any existing (experiment) directory with the same name is silently
-overwritten.
-
-Be aware that the name of the experiment directory has to end with @code{.er}.
-
-@c -- A new node --------------------------------------------------------------
-@node Control the Number of Lines in the Output
-@subsection Control the Number of Lines in the Output
-@c ----------------------------------------------------------------------------
-
-@IndexSubentry{Commands, @code{limit}}
-The @code{limit <n>} command can be used to control the number of lines printed
-in various overviews, including the function view, but it also takes effect
-for other display commands, like @code{lines}.
-
-The argument @code{<n>} should be a positive integer number. It sets the number
-of lines in the function view. A value of zero resets the limit to the default.
-
-Be aware that the pseudo-function @code{<Total>} counts as a regular function.
-For example @code{limit 10} displays nine user level functions.
-
-@c -- A new node --------------------------------------------------------------
-@node Sorting the Performance Data
-@subsection Sorting the Performance Data
-@c ----------------------------------------------------------------------------
-
-@IndexSubentry{Commands, @code{sort}}
-The @code{sort <key>} command sets the key to be used when sorting the
-performance data.
-
-The key is a valid metric definition, but the
-@cindex Visibility field
-visibility field
-(@xref{Metric Definitions})
-in the metric
-definition is ignored since this does not affect the outcome of the sorting
-operation.
-For example if we set the sort key to @code{e.totalcpu}, the values
-will be sorted in descending order with respect to the exclusive total
-CPU time.
-
-The data can be sorted in reverse order by prepending the metric definition
-with a minus (@code{-}) sign. For example @code{sort -e.totalcpu}.
-
-A default metric for the sort operation has been defined and since this is
-a persistent command, this default can be restored with @code{default} as
-the key.
-
-@c -- A new node --------------------------------------------------------------
-@node Scripting
-@subsection Scripting
-@c ----------------------------------------------------------------------------
-
-As is probably clear by now, the list with commands for @DisplayText{} can be
-very long. This is tedious and also error prone. Luckily, there is an easier and
-more elegant way to control the behaviour of this tool.
-
-@IndexSubentry{Commands, @code{script}}
-Through the @code{script} command, the name of a file with commands can be
-passed in. These commands are parsed and executed as if they appeared on
-the command line in the same order as encountered in the file. The commands
-in this script file can actually be mixed with commands on the command line.
-
-The difference between the commands in the script file and those used on the
-command line is that the latter require a leading dash (@code{-}) symbol.
-
-Comment lines are supported. They need to start with the @code{#} symbol.
-
-@c -- A new node --------------------------------------------------------------
-@node A More Elaborate Example
-@subsection A More Elaborate Example
-@c ----------------------------------------------------------------------------
-
-With the information presented so far, we can customize our data
-gathering and display commands.
-
-As an example, to reflect the name of the algorithm and the number of threads
-that were used in the experiment, we select @code{mxv.1.thr.er}
-as the name of the experiment directory.
-All we then need to
-do is to add the
- @IndexSubentry{Options, @code{-O}}
-@code{-O}
-option followed by this name on the command line when running @CollectApp{}:
-
-@cartouche
-@smallexample
-$ exe=mxv-pthreads.exe
-$ m=3000
-$ n=2000
-$ gprofng collect app -O mxv.1.thr.er ./$exe -m $m -n $n -t 1
-@end smallexample
-@end cartouche
-
-The commands to generate the profile are put into a file that we simply call
-@code{my-script}:
-
-@smallexample
-@verbatim
-$ cat my-script
-# This is my first gprofng script
-# Set the metrics
-metrics i.%totalcpu:e.%totalcpu:name
-# Use the exclusive time to sort
-sort e.totalcpu
-# Limit the function list to 5 lines
-limit 5
-# Show the function list
-functions
-@end verbatim
-@end smallexample
-
-This script file is then specified as input to the @DisplayText{} command
-that is used to display the performance information stored in
-@code{mxv.1.thr.er}:
-
-@cartouche
-@smallexample
-$ gprofng display text -script my-script mxv.1.thr.er
-@end smallexample
-@end cartouche
-
-The command above produces the following output:
-
-@smallexample
-@verbatim
-# This is my first gprofng script
-# Set the metrics
-Current metrics: i.%totalcpu:e.%totalcpu:name
-Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
-# Use the exclusive time to sort
-Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
-# Limit the function list to 5 lines
-Print limit set to 5
-# Show the function list
-Functions sorted by metric: Exclusive Total CPU Time
-
-Incl. Total Excl. Total Name
-CPU CPU
- sec. % sec. %
-2.272 100.00 2.272 100.00 <Total>
-2.159 95.00 2.159 95.00 mxv_core
-0.102 4.48 0.054 2.37 init_data
-0.035 1.54 0.025 1.10 erand48_r
-0.048 2.11 0.013 0.57 drand48
-@end verbatim
-@end smallexample
-
-In the first part of the output, our comment lines in the script file are
-shown. These are interleaved with an acknowledgement message for the commands.
-
-This is followed by a profile consisting of 5 lines only. For both metrics,
-the percentages plus the timings are given. The numbers are sorted with respect
-to the exclusive total CPU time.
-
-It is now immediately clear that function @code{mxv_core} is responsbile for
-95% of the CPU time and @code{init_data} takes 4.5% only.
-
-This is also where we see sampling in action. Although this is exactly the
-same job we profiled before, the timings are somewhat different, but the
-differences are very small.
-
-@c -- A new node --------------------------------------------------------------
-@node The Call Tree
-@subsection The Call Tree
-@c ----------------------------------------------------------------------------
-
-The call tree shows the dynamic hierarchy of the application by displaying the
-functions executed and their parent. It helps to find the most expensive path
-in the program.
-
-@IndexSubentry{Commands, @code{calltree}}
-This feature is enabled through the @code{calltree} command. This is how to get
-this tree for our current experiment:
-
-@cartouche
-@smallexample
-$ gprofng display text -calltree mxv.1.thr.er
-@end smallexample
-@end cartouche
-
-This displays the following structure:
-
-@smallexample
-@verbatim
-Functions Call Tree. Metric: Attributed Total CPU Time
-
-Attr. Name
-Total
-CPU sec.
-2.272 +-<Total>
-2.159 +-collector_root
-2.159 | +-driver_mxv
-2.159 | +-mxv_core
-0.114 +-__libc_start_main
-0.114 +-main
-0.102 +-init_data
-0.048 | +-drand48
-0.035 | +-erand48_r
-0.010 | +-__drand48_iterate
-0.011 +-allocate_data
-0.011 | +-malloc
-0.011 | +-_int_malloc
-0.001 | +-sysmalloc
-0.001 +-check_results
-0.001 +-malloc
-0.001 +-_int_malloc
-@end verbatim
-@end smallexample
-
-At first sight, this may not be what you expected and some explanation is in
-order.
-
-@c ----------------------------------------------------------------------------
-@c TBD: Revise this text when we have user and machine mode.
-@c ----------------------------------------------------------------------------
-First of all, function @code{collector_root} is internal to @ToolName{} and
-should be hidden from the user. This is part of a planned future enhancement.
-
-Recall that the @code{objects} and @code{fsingle} commands are very useful
-to find out more about load objects in general, but also to help identify
-an unknown entry in the function overview. @xref{Load Objects and Functions}.
-
-Another thing to note is that there are two main branches: one under
-@code{collector_root} and a second one under @code{__libc_start_main}.
-This reflects the fact that we are executing a parallel program. Even though
-we only used one thread for this run, this is still executed in a separate
-path.
-
-The main, sequential part of the program is displayed under @code{main} and
-shows the functions called and the time they took.
-
-There are two things worth noting for the call tree feature:
-
-@itemize
-
-@item
-This is a dynamic tree and since sampling is used, it most likely looks
-slightly different across seemingly identical profile runs. In case the
-run times are short, it is worth considering a higher sampling resolution
-through the
-@IndexSubentry{Options, @code{-p}}
-@code{-p}
-option, for example by using @code{-p hi} to increase the sampling rate.
-
-@item
-In case hardware event counters have been enabled
-(@xref{Profile Hardware Event Counters}), these values are also displayed
-in the call tree view.
-
-@end itemize
-
-@c -- A new node --------------------------------------------------------------
-@node More Information on the Experiment
-@subsection More Information on the Experiment
-@c ----------------------------------------------------------------------------
-
-The experiment directory contains more than performance related data. Several
-system characteristics, the actual command executed, and some global
-performance statistics can be displayed as well.
-
-@IndexSubentry{Commands, @code{header}}
-The @code{header} command displays information about the experiment(s).
-For example, this is the command to extract this data for our experiment
-directory:
-
-@cartouche
-@smallexample
-$ gprofng display text -header mxv.1.thr.er
-@end smallexample
-@end cartouche
-
-The above command prints the following information. Note that some of the
-lay-out and the information have been modified. The textual changes are
-marked with the @code{<} and @code{>} symbols.
-
-@smallexample
-@verbatim
-Experiment: mxv.1.thr.er
-No errors
-No warnings
-Archive command `gp-archive -n -a on
- --outfile <exp_dir>/archive.log <exp_dir>'
-
-Target command (64-bit): './mxv-pthreads.exe -m 3000 -n 2000 -t 1'
-Process pid 30591, ppid 30589, pgrp 30551, sid 30468
-Current working directory: <cwd>
-Collector version: `2.36.50'; experiment version 12.4 (64-bit)
-Host `<hostname>', OS `Linux <version>', page size 4096,
- architecture `x86_64'
- 16 CPUs, clock speed 1995 MHz.
- Memory: 30871514 pages @ 4096 = 120591 MB.
-Data collection parameters:
- Clock-profiling, interval = 997 microsecs.
- Periodic sampling, 1 secs.
- Follow descendant processes from: fork|exec|combo
-
-Experiment started <date and time>
-
-Experiment Ended: 2.293162658
-Data Collection Duration: 2.293162658
-@end verbatim
-@end smallexample
-
-The output above may assist in troubleshooting, or in verifying some of the
-operational conditions, and we recommend including this command when
-generating a profile.
-
-@IndexSubentry{Options, @code{-C}}
-Related to this command there is a useful option to record your own comment(s) in
-an experiment.
-To this end, use the @code{-C} option on the @CollectApp{} tool to
-specify a comment string. Up to ten comment lines can be included.
-These comments are displayed with the @code{header} command on
-the @DisplayText{} tool.
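-
-Below is a sketch of how this could look. The experiment name and the
-comment string are arbitrary and only serve as an illustration:
-
-@cartouche
-@smallexample
-$ exe=mxv-pthreads.exe
-$ gprofng collect app -C "baseline run, 1 thread" -O mxv.comment.er \
-  ./$exe -m 3000 -n 2000 -t 1
-@end smallexample
-@end cartouche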
-
-@IndexSubentry{Commands, @code{overview}}
-The @code{overview} command displays information on the experiment(s) and also
-shows a summary of the values for the metric(s) used. This is an example of
-how to use it on our newly created experiment directory:
-
-@cartouche
-@smallexample
-$ gprofng display text -overview mxv.1.thr.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Experiment(s):
-
-Experiment :mxv.1.thr.er
- Target : './mxv-pthreads.exe -m 3000 -n 2000 -t 1'
- Host : <hostname> (<ISA>, Linux <version>)
- Start Time : <date and time>
- Duration : 2.293 Seconds
-
-Metrics:
-
- Experiment Duration (Seconds): [2.293]
- Clock Profiling
- [X]Total CPU Time - totalcpu (Seconds): [*2.272]
-
-Notes: '*' indicates hot metrics, '[X]' indicates currently enabled
- metrics.
- The metrics command can be used to change selections. The
- metric_list command lists all available metrics.
-@end verbatim
-@end smallexample
-
-This command provides a dashboard overview that helps to easily identify
-where the time is spent and, in case hardware event counters are used, it
-shows their total values.
-
-@c -- A new node --------------------------------------------------------------
-@node Control the Sampling Frequency
-@subsection Control the Sampling Frequency
-@c ----------------------------------------------------------------------------
-
-So far, we have not talked about the frequency of the sampling process, but in
-some cases it is useful to change the default of 10 milliseconds.
-
-The advantage of increasing the sampling frequency is that functions that
-do not take much time per invocation are more accurately captured. The
-downside is that more data is gathered. This has an impact on the overhead
-of the collection process and more disk space is required.
-
-In general this is not an immediate concern, but with heavily threaded
-applications that run for an extended period of time, increasing the
-frequency may have a more noticeable impact.
-
-@IndexSubentry{Options, @code{-p}}
-The @code{-p} option on the @CollectApp{} tool is used to enable or disable
-clock based profiling, or to explicitly set the sampling rate.
-@cindex Sampling interval
-This option takes one of the following keywords:
-
-@table @code
-
-@item off
-Disable clock based profiling.
-
-@item on
-Enable clock based profiling with a per thread sampling interval of 10 ms. This is the default.
-
-@item lo
-Enable clock based profiling with a per thread sampling interval of 100 ms.
-
-@item hi
-Enable clock based profiling with a per thread sampling interval of 1 ms.
-
-@item <value>
-Enable clock based profiling with a per thread sampling interval of <value>.
-
-@end table
-
-One may wonder why there is an option to disable clock based profiling. This
-is because by default, it is enabled when conducting hardware event counter
-experiments (@xref{Profile Hardware Event Counters}).
-With the @code{-p off} option, this can be disabled.
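-
-For example, assuming the processor supports hardware event counters
-(@xref{Profile Hardware Event Counters}), a command along the following
-lines collects the default set of counters without clock based profiling;
-the experiment name is arbitrary:
-
-@cartouche
-@smallexample
-$ gprofng collect app -O mxv.hwc.only.er -p off -h auto \
-  ./mxv-pthreads.exe -m 3000 -n 2000 -t 1
-@end smallexample
-@end cartouche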
-
-If an explicit value is set for the sampling, the number can be an integer or a
-floating-point number.
-A suffix of @code{u} for microseconds, or @code{m} for milliseconds is supported.
-If no suffix is used, the value is assumed to be in milliseconds.
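-
-As a sketch, the command below sets a per thread sampling interval of 5
-milliseconds; the same interval could also be written as @code{5000u}:
-
-@cartouche
-@smallexample
-$ gprofng collect app -O mxv.5ms.er -p 5m \
-  ./mxv-pthreads.exe -m 3000 -n 2000 -t 1
-@end smallexample
-@end cartouche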
-
-If the value is smaller than the clock profiling minimum, a warning message is issued
-and it is set to the minimum.
-In case it is not a multiple of the clock profiling resolution, it is silently rounded
-down to the nearest multiple of the clock resolution.
-
-If the value exceeds the clock profiling maximum, is negative, or zero, an error is
-reported.
-
-@IndexSubentry{Commands, @code{header}}
-Note that the @code{header} command echoes the sampling rate used.
-
-@c -- A new node --------------------------------------------------------------
-@node Information on Load Objects
-@subsection Information on Load Objects
-@c ----------------------------------------------------------------------------
-
-It may happen that the function list contains a function that is not known to
-the user. This can easily happen with library functions, for example.
-Luckily, there are three commands that come in handy in such cases.
-
-@IndexSubentry{Commands, @code{objects}}
-@IndexSubentry{Commands, @code{fsingle}}
-@IndexSubentry{Commands, @code{fsummary}}
-These commands are @code{objects}, @code{fsingle}, and @code{fsummary}.
-They provide details on
-@cindex Load objects
-load objects (@xref{Load Objects and Functions}).
-
-The @code{objects} command lists all load objects that have been referenced
-during the performance experiment.
-Below we show the command and the result for our profile job. Like before,
-the (long) path names in the output have been shortened and replaced by the
-@IndexSubentry{Miscellaneous, @code{<apath>}}
-@code{<apath>} symbol that represents an absolute directory path.
-
-@cartouche
-@smallexample
-$ gprofng display text -objects mxv.1.thr.er
-@end smallexample
-@end cartouche
-
-The output includes the name and path of the target executable:
-
-@smallexample
-@verbatim
- <Unknown> (<Unknown>)
- <mxv-pthreads.exe> (<apath>/mxv-pthreads.exe)
- <librt-2.17.so> (/usr/lib64/librt-2.17.so)
- <libdl-2.17.so> (/usr/lib64/libdl-2.17.so)
- <libbfd-2.36.50.20210505.so> (<apath>/libbfd-2.36.50 <etc>)
- <libopcodes-2.36.50.20210505.so> (<apath>/libopcodes-2. <etc>)
- <libc-2.17.so> (/usr/lib64/libc-2.17.so)
- <libpthread-2.17.so> (/usr/lib64/libpthread-2.17.so)
- <libm-2.17.so> (/usr/lib64/libm-2.17.so)
- <libgp-collector.so> (<apath>/libgp-collector.so)
- <ld-2.17.so> (/usr/lib64/ld-2.17.so)
- <DYNAMIC_FUNCTIONS> (DYNAMIC_FUNCTIONS)
-@end verbatim
-@end smallexample
-
-@IndexSubentry{Commands, @code{fsingle}}
-The @code{fsingle} command may be used to get more details on a specific entry
-in the function view. For example, the command below provides additional
-information on the @code{collector_root} function shown in the function overview.
-
-@cartouche
-@smallexample
-$ gprofng display text -fsingle collector_root mxv.1.thr.er
-@end smallexample
-@end cartouche
-
-Below is the output from this command. It has been somewhat modified to match
-the display requirements.
-
-@smallexample
-@verbatim
-collector_root
- Exclusive Total CPU Time: 0. ( 0. %)
- Inclusive Total CPU Time: 2.159 ( 95.0%)
- Size: 401
- PC Address: 10:0x0001db60
- Source File: <apath>/dispatcher.c
- Object File: mxv.1.thr.er/archives/libgp-collector.so_HpzZ6wMR-3b
- Load Object: <apath>/libgp-collector.so
- Mangled Name:
- Aliases:
-@end verbatim
-@end smallexample
-
-In this table we not only see how much time was spent in this function, we
-also see where it originates from. In addition to this, the size and start
-address are given as well. If the source code location is known, it is also
-shown here.
-
-@IndexSubentry{Commands, @code{fsummary}}
-The related @code{fsummary} command displays the same information as
-@code{fsingle}, but for all functions in the function overview,
-including @code{<Total>}:
-
-@cartouche
-@smallexample
-$ gprofng display text -fsummary mxv.1.thr.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Functions sorted by metric: Exclusive Total CPU Time
-
-<Total>
- Exclusive Total CPU Time: 2.272 (100.0%)
- Inclusive Total CPU Time: 2.272 (100.0%)
- Size: 0
- PC Address: 1:0x00000000
- Source File: (unknown)
- Object File: (unknown)
- Load Object: <Total>
- Mangled Name:
- Aliases:
-
-mxv_core
- Exclusive Total CPU Time: 2.159 ( 95.0%)
- Inclusive Total CPU Time: 2.159 ( 95.0%)
- Size: 75
- PC Address: 2:0x000021ba
- Source File: <apath>/mxv.c
- Object File: mxv.1.thr.er/archives/mxv-pthreads.exe_hRxWdccbJPc
- Load Object: <apath>/mxv-pthreads.exe
- Mangled Name:
- Aliases:
-
- ... etc ...
-@end verbatim
-@end smallexample
-
-@c -- A new node --------------------------------------------------------------
-@node Support for Multithreading
-@section Support for Multithreading
-@c ----------------------------------------------------------------------------
-
-In this section we introduce and discuss the support for multithreading. As
-is shown below, nothing needs to be changed when collecting the performance
-data.
-
-The difference is that additional commands are available to get more
-information on the parallel environment, plus that several filters allow
-the user to zoom in on specific threads.
-
-@c -- A new node --------------------------------------------------------------
-@node Creating a Multithreading Experiment
-@subsection Creating a Multithreading Experiment
-@c ----------------------------------------------------------------------------
-
-We demonstrate the support for multithreading using the same code and settings
-as before, but this time we use 2 threads:
-
-@cartouche
-@smallexample
-$ exe=mxv-pthreads.exe
-$ m=3000
-$ n=2000
-$ gprofng collect app -O mxv.2.thr.er ./$exe -m $m -n $n -t 2
-@end smallexample
-@end cartouche
-
-First of all, note that we did not change anything, other than setting the
-number of threads to 2. Nothing special is needed to profile a multithreaded
-job when using @ToolName{}.
-
-The same is true when displaying the performance results. The same commands
-that we used before work unmodified. For example, this is all that is needed to
-get a function overview:
-
-@cartouche
-@smallexample
-$ gprofng display text -limit 10 -functions mxv.2.thr.er
-@end smallexample
-@end cartouche
-
-This produces the following familiar looking output:
-
-@smallexample
-@verbatim
-Print limit set to 10
-Functions sorted by metric: Exclusive Total CPU Time
-
-Excl. Incl. Name
-Total Total
-CPU sec. CPU sec.
-2.268 2.268 <Total>
-2.155 2.155 mxv_core
-0.044 0.103 init_data
-0.030 0.046 erand48_r
-0.016 0.016 __drand48_iterate
-0.013 0.059 drand48
-0.008 0.011 _int_malloc
-0.003 0.003 brk
-0. 0.003 __default_morecore
-0. 0.114 __libc_start_main
-@end verbatim
-@end smallexample
-
-@c -- A new node --------------------------------------------------------------
-@node Commands Specific to Multithreading
-@subsection Commands Specific to Multithreading
-@c ----------------------------------------------------------------------------
-
-The function overview shown above presents the results aggregated over all the
-threads. The interesting new element is that we can also look at the
-performance data for the individual threads.
-
-@IndexSubentry{Commands, @code{thread_list}}
-The @code{thread_list} command displays how many threads have been used:
-
-@cartouche
-@smallexample
-$ gprofng display text -thread_list mxv.2.thr.er
-@end smallexample
-@end cartouche
-
-This produces the following output, showing that three threads have
-been used:
-
-@smallexample
-@verbatim
-Exp Sel Total
-=== === =====
- 1 all 3
-@end verbatim
-@end smallexample
-
-The output confirms there is one experiment and that by default all
-threads are selected.
-
-It may seem surprising to see three threads here, since we used the
-@code{-t 2} option, but it is common for a Pthreads program to use one
-additional thread. This is typically the thread that runs from start to
-finish, handles the sequential portions of the code, and also takes
-care of managing the other threads.
-
-It is no different in our example code. At some point, the main thread
-creates and activates the two threads that perform the multiplication
-of the matrix with the vector. Upon completion of this computation,
-the main thread continues.
-
-@IndexSubentry{Commands, @code{threads}}
-The @code{threads} command is simple, yet very powerful. It shows the
-total value of the metrics for each thread. To make it easier to
-interpret the data, we modify the metrics to include percentages:
-
-@cartouche
-@smallexample
-$ gprofng display text -metrics e.%totalcpu -threads mxv.2.thr.er
-@end smallexample
-@end cartouche
-
-The command above produces the following overview:
-
-@smallexample
-@verbatim
-Current metrics: e.%totalcpu:name
-Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
-Objects sorted by metric: Exclusive Total CPU Time
-
-Excl. Total Name
-CPU
- sec. %
-2.258 100.00 <Total>
-1.075 47.59 Process 1, Thread 3
-1.070 47.37 Process 1, Thread 2
-0.114 5.03 Process 1, Thread 1
-@end verbatim
-@end smallexample
-
-The first line gives the total CPU time accumulated over the threads
-selected. This is followed by the metric value(s) for each thread.
-
-From this it is clear that the main thread is responsible for 5% of
-the total CPU time, while the other two threads take 47% each.
-
-This view is ideally suited to verify whether there are any load balancing
-issues and also to find the most time consuming thread(s).
-
-@IndexSubentry{Filters, Thread selection}
-While useful, often more information than this is needed. This is
-@IndexSubentry{Commands, @code{thread_select}}
-where the thread selection filter comes in. Through the @code{thread_select}
-command, one or more threads may be selected
-(@xref{The Selection List} for how to define the selection list).
-
-Since it is most common to use this command in a script, we do so as
-well here. Below is the script we are using:
-
-@cartouche
-@smallexample
-# Define the metrics
-metrics e.%totalcpu
-# Limit the output to 10 lines
-limit 10
-# Get the function overview for thread 1
-thread_select 1
-functions
-# Get the function overview for thread 2
-thread_select 2
-functions
-# Get the function overview for thread 3
-thread_select 3
-functions
-@end smallexample
-@end cartouche
-
-The definition of the metrics and the output limiter has been shown and
-explained before and is not discussed again here. The new command we focus on is
-@IndexSubentry{Commands, @code{thread_select}}
-@code{thread_select}.
-
-This command takes a list (@xref{The Selection List}) to select specific
-threads. In this case we simply use the individual thread numbers that we
-obtained with the @code{thread_list} command earlier.
-
-This restricts the output of the @code{functions} command to the thread
-number(s) specified. This means that the script above shows which
-function(s) each thread executes and how much CPU time they consumed.
-Both the timings and their percentages are given.
-
-This is the relevant part of the output for the first thread:
-
-@smallexample
-@verbatim
-# Get the function overview for thread 1
-Exp Sel Total
-=== === =====
- 1 1 3
-Functions sorted by metric: Exclusive Total CPU Time
-
-Excl. Total Name
-CPU
- sec. %
-0.114 100.00 <Total>
-0.051 44.74 init_data
-0.028 24.56 erand48_r
-0.017 14.91 __drand48_iterate
-0.010 8.77 _int_malloc
-0.008 7.02 drand48
-0. 0. __libc_start_main
-0. 0. allocate_data
-0. 0. main
-0. 0. malloc
-@end verbatim
-@end smallexample
-
-As usual, the comment lines are echoed. This is followed by a confirmation
-of our selection. We see that indeed thread 1 has been selected. What is
-displayed next is the function overview for this particular thread. Due to
-the @code{limit 10} command, there are ten entries in this list.
-
-Below are the overviews for threads 2 and 3 respectively. We see that all
-of the CPU time is spent in function @code{mxv_core} and that this time
-is approximately the same for both threads.
-
-@smallexample
-@verbatim
-# Get the function overview for thread 2
-Exp Sel Total
-=== === =====
- 1 2 3
-Functions sorted by metric: Exclusive Total CPU Time
-
-Excl. Total Name
-CPU
- sec. %
-1.072 100.00 <Total>
-1.072 100.00 mxv_core
-0. 0. collector_root
-0. 0. driver_mxv
-
-# Get the function overview for thread 3
-Exp Sel Total
-=== === =====
- 1 3 3
-Functions sorted by metric: Exclusive Total CPU Time
-
-Excl. Total Name
-CPU
- sec. %
-1.076 100.00 <Total>
-1.076 100.00 mxv_core
-0. 0. collector_root
-0. 0. driver_mxv
-@end verbatim
-@end smallexample
-
-When analyzing the performance of a multithreaded application, it is sometimes
-useful to know whether threads have mostly executed on the same core, say, or
-if they have wandered across multiple cores. This sort of stickiness is usually
-referred to as
-@cindex Thread affinity
-@emph{thread affinity}.
-
-Similar to the commands for the threads, there are several commands related
-to the usage of the cores, or @emph{CPUs} as they are called in @ToolName{}
-(@xref{The Concept of a CPU in @ProductName{}}).
-
-In order to have some more interesting data to look at, we created a new
-experiment, this time using 8 threads:
-
-@cartouche
-@smallexample
-$ exe=mxv-pthreads.exe
-$ m=3000
-$ n=2000
-$ gprofng collect app -O mxv.8.thr.er ./$exe -m $m -n $n -t 8
-@end smallexample
-@end cartouche
-
-@IndexSubentry{Commands, @code{cpu_list}}
-Similar to the @code{thread_list} command, the @code{cpu_list} command
-displays how many CPUs have been used.
-@IndexSubentry{Commands, @code{cpus}}
-The equivalent of the @code{threads} command is the @code{cpus}
-command, which shows the CPU numbers that were used and how much time was
-spent on each of them. Both are demonstrated below.
-
-@cartouche
-@smallexample
-$ gprofng display text -metrics e.%totalcpu -cpu_list -cpus mxv.8.thr.er
-@end smallexample
-@end cartouche
-
-This command produces the following output:
-
-@smallexample
-@verbatim
-Current metrics: e.%totalcpu:name
-Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
-Exp Sel Total
-=== === =====
- 1 all 10
-Objects sorted by metric: Exclusive Total CPU Time
-
-Excl. Total Name
-CPU
- sec. %
-2.310 100.00 <Total>
-0.286 12.39 CPU 7
-0.284 12.30 CPU 13
-0.282 12.21 CPU 5
-0.280 12.13 CPU 14
-0.266 11.52 CPU 9
-0.265 11.48 CPU 2
-0.264 11.44 CPU 11
-0.194 8.42 CPU 0
-0.114 4.92 CPU 1
-0.074 3.19 CPU 15
-@end verbatim
-@end smallexample
-
-@c ----------------------------------------------------------------------------
-@c TBD - Ruud
-@c I'd like to improve this and have a way to see where a thread has executed.
-@c ----------------------------------------------------------------------------
-
-What we see in this table is that a total of 10 CPUs have been used. This is
-followed by a list with all the CPU numbers that have been used during the
-run. For each CPU it is shown how much time was spent on it.
-
-While the table with thread times shown earlier may point at a load imbalance
-in the application, this overview has a different purpose.
-
-For example, we see that 10 CPUs have been used, but we know that the
-application uses 9 threads only.
-This means that at least one thread has executed on more than one CPU. In
-itself this is not something to worry about, but it warrants a deeper
-investigation.
-
-In all fairness, we first performed a pre-analysis to find out
-which thread(s) have been running on more than one CPU. We found this
-to be thread 7. It has executed on CPUs 0 and 15.
-
-With this knowledge, we wrote the script shown below. It zooms in on
-the behaviour of thread 7.
-
-@cartouche
-@smallexample
-# Define the metrics
-metrics e.%totalcpu
-# Limit the output to 10 lines
-limit 10
-functions
-# Get the function overview for CPU 0
-cpu_select 0
-functions
-# Get the function overview for CPU 15
-cpu_select 15
-functions
-@end smallexample
-@end cartouche
-
-From the threads overview shown earlier, we know that thread 7 has
-used @code{0.268} seconds of CPU time.
-
-By selecting CPUs 0 and 15, respectively, we get the following
-function overviews:
-
-@smallexample
-@verbatim
-# Get the function overview for CPU 0
-Exp Sel Total
-=== === =====
- 1 0 10
-Functions sorted by metric: Exclusive Total CPU Time
-
-Excl. Total Name
-CPU
- sec. %
-0.194 100.00 <Total>
-0.194 100.00 mxv_core
-0. 0. collector_root
-0. 0. driver_mxv
-
-# Get the function overview for CPU 15
-Exp Sel Total
-=== === =====
- 1 15 10
-Functions sorted by metric: Exclusive Total CPU Time
-
-Excl. Total Name
-CPU
- sec. %
-0.074 100.00 <Total>
-0.074 100.00 mxv_core
-0. 0. collector_root
-0. 0. driver_mxv
-@end verbatim
-@end smallexample
-
-This shows that thread 7 spent @code{0.194} seconds on CPU 0 and
-@code{0.074} seconds on CPU 15.
-
-@c -- A new node --------------------------------------------------------------
-@node Viewing Multiple Experiments
-@section Viewing Multiple Experiments
-@c ----------------------------------------------------------------------------
-
-One thing we did not cover so far is that @ToolName{} fully supports the analysis
-of multiple experiments. The @DisplayText{} tool accepts a list of experiments.
-The data can either be aggregated across the experiments, or used in a
-comparison.
-
-The @code{experiment_list} command, covered below, shows which experiments
-have been loaded.
-
-@c -- A new node --------------------------------------------------------------
-@node Aggregation of Experiments
-@subsection Aggregation of Experiments
-@c ----------------------------------------------------------------------------
-
-By default, the data for multiple experiments is aggregated and the display
-commands show these combined results.
-
-For example, we can aggregate the data for our single and dual thread
-experiments. Below is the script we used for this:
-
-@cartouche
-@smallexample
-# Define the metrics
-metrics e.%totalcpu
-# Limit the output to 10 lines
-limit 10
-# Get the list with experiments
-experiment_list
-# Get the function overview
-functions
-@end smallexample
-@end cartouche
-
-@IndexSubentry{Commands, @code{experiment_list}}
-With the exception of the @code{experiment_list} command, all commands
-used have been discussed earlier.
-
-The @code{experiment_list} command provides a list of the experiments
-that have been loaded. This is used to verify that we are looking at the
-experiments we intend to aggregate.
-
-@cartouche
-@smallexample
-$ gprofng display text -script my-script-agg mxv.1.thr.er mxv.2.thr.er
-@end smallexample
-@end cartouche
-
-With the command above, we get the following output:
-
-@smallexample
-@verbatim
-# Define the metrics
-Current metrics: e.%totalcpu:name
-Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
-# Limit the output to 10 lines
-Print limit set to 10
-# Get the list with experiments
-ID Sel PID Experiment
-== === ===== ============
- 1 yes 30591 mxv.1.thr.er
- 2 yes 11629 mxv.2.thr.er
-# Get the function overview
-Functions sorted by metric: Exclusive Total CPU Time
-
-Excl. Total Name
-CPU
- sec. %
-4.533 100.00 <Total>
-4.306 94.99 mxv_core
-0.105 2.31 init_data
-0.053 1.17 erand48_r
-0.027 0.59 __drand48_iterate
-0.021 0.46 _int_malloc
-0.021 0.46 drand48
-0.001 0.02 sysmalloc
-0. 0. __libc_start_main
-0. 0. allocate_data
-@end verbatim
-@end smallexample
-
-The first five lines should look familiar. The five lines that follow echo
-the comment line in the script and show the overview of the experiments.
-This confirms that two experiments have been loaded and that both are active.
-
-This is followed by the function overview. The timings have been summed
-up and the percentages are adjusted accordingly. For example, the total
-accumulated time is indeed 2.272 + 2.261 = 4.533 seconds.
-
-@c -- A new node --------------------------------------------------------------
-@node Comparison of Experiments
-@subsection Comparison of Experiments
-@c ----------------------------------------------------------------------------
-
-The support for multiple experiments really shines in comparison mode. This
-feature is enabled through the command
-@IndexSubentry{Commands, @code{compare on/off}}
-@code{compare on}
-and is disabled
-by setting
-@code{compare off}.
-
-@cindex Compare experiments
-In comparison mode, the data for the various experiments is shown side by
-side, as illustrated below where we compare the results for the multithreaded
-experiments using one and two threads respectively:
-
-@cartouche
-@smallexample
-$ gprofng display text -compare on -functions mxv.1.thr.er mxv.2.thr.er
-@end smallexample
-@end cartouche
-
-@noindent
-This produces the following output:
-
-@smallexample
-@verbatim
-Functions sorted by metric: Exclusive Total CPU Time
-
-mxv.1.thr.er mxv.2.thr.er mxv.1.thr.er mxv.2.thr.er
-Excl. Total Excl. Total Incl. Total Incl. Total Name
-CPU CPU CPU CPU
- sec. sec. sec. sec.
-2.272 2.261 2.272 2.261 <Total>
-2.159 2.148 2.159 2.148 mxv_core
-0.054 0.051 0.102 0.104 init_data
-0.025 0.028 0.035 0.045 erand48_r
-0.013 0.008 0.048 0.053 drand48
-0.011 0.010 0.012 0.010 _int_malloc
-0.010 0.017 0.010 0.017 __drand48_iterate
-0.001 0. 0.001 0. sysmalloc
-0. 0. 0.114 0.114 __libc_start_main
-0. 0. 0.011 0.010 allocate_data
-0. 0. 0.001 0. check_results
-0. 0. 2.159 2.148 collector_root
-0. 0. 2.159 2.148 driver_mxv
-0. 0. 0.114 0.114 main
-0. 0. 0.012 0.010 malloc
-@end verbatim
-@end smallexample
-
-This table already makes it much easier to compare (two) profiles, but
-there is more that we can do here.
-
-By default, in comparison mode, all measured values are shown. Often
-profiling is about comparing performance data. It is therefore
-more useful to look at differences, or ratios, using one experiment as
-a reference.
-
-The values shown are then relative to this reference. For example, if a ratio
-is below one, it means the value in the reference experiment was higher.
-
-@IndexSubentry{Commands, @code{compare on/off}}
-This feature is supported through the @code{compare} command. In addition to
-@code{on} and @code{off}, this command also supports
-@IndexSubentry{Commands, @code{compare delta}}
-@code{delta}, or
-@IndexSubentry{Commands, @code{compare ratio}}
-@code{ratio}.
-
-Usage of one of these two keywords enables the comparison feature and shows
-either the difference, or the ratio, relative to the reference data.
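-
-Since display commands may also be given on the command line with a leading
-dash, a quick sketch, assuming the @code{ratio} keyword is accepted there in
-the same way as @code{on}, looks as follows:
-
-@cartouche
-@smallexample
-$ gprofng display text -compare ratio -functions mxv.1.thr.er mxv.2.thr.er
-@end smallexample
-@end cartouche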
-
-In the example below, we use the same two experiments used in the comparison
-above, but as before, the number of lines is restricted to 10 and we focus on
-the exclusive timings plus percentages. For the comparison part we are
-interested in the differences.
-
-This is the script that produces such an overview:
-
-@cartouche
-@smallexample
-# Define the metrics
-metrics e.%totalcpu
-# Limit the output to 10 lines
-limit 10
-# Set the comparison mode to differences
-compare delta
-# Get the function overview
-functions
-@end smallexample
-@end cartouche
-
-Assuming this script file is called @code{my-script-comp}, this is how we
-get the table displayed on our screen:
-
-@cartouche
-@smallexample
-$ gprofng display text -script my-script-comp mxv.1.thr.er mxv.2.thr.er
-@end smallexample
-@end cartouche
-
-Leaving out some of the lines that we have seen before, we get the
-following table:
-
-@smallexample
-@verbatim
-mxv.1.thr.er mxv.2.thr.er
-Excl. Total Excl. Total Name
-CPU CPU
- sec. % delta %
-2.272 100.00 -0.011 100.00 <Total>
-2.159 95.00 -0.011 94.97 mxv_core
-0.054 2.37 -0.003 2.25 init_data
-0.025 1.10 +0.003 1.23 erand48_r
-0.013 0.57 -0.005 0.35 drand48
-0.011 0.48 -0.001 0.44 _int_malloc
-0.010 0.44 +0.007 0.75 __drand48_iterate
-0.001 0.04 -0.001 0. sysmalloc
-0. 0. +0. 0. __libc_start_main
-0. 0. +0. 0. allocate_data
-@end verbatim
-@end smallexample
-
-It is now easy to see that the CPU times for the most time consuming
-functions in this code are practically the same.
-
-While in this case we used the delta as a comparison, the ratio could have
-been used just as well.
-
-Note that the comparison feature is supported at the function, source, and
-disassembly level. There is no practical limit on the number of experiments
-that can be used in a comparison.
-
-
-
-@c -- A new node --------------------------------------------------------------
-@node Profile Hardware Event Counters
-@section Profile Hardware Event Counters
-@c ----------------------------------------------------------------------------
-
-Many processors provide a set of hardware event counters and @ToolName{}
-provides support for this feature.
-@xref{Hardware Event Counters Explained}, for those readers who are not
-familiar with such counters and would like to learn more.
-
-In this section we explain how to get the details on the event counter
-support for the processor used in the experiment(s), and show several
-examples.
-
-@c -- A new node --------------------------------------------------------------
-@node Getting Information on the Counters Supported
-@subsection Getting Information on the Counters Supported
-@c ----------------------------------------------------------------------------
-
-The first step is to check if the processor used for the experiments is
-supported by @ToolName{}.
-
-@IndexSubentry{Options, @code{-h}}
-The @code{-h} option on @CollectApp{} will show the event counter
-information:
-
-@cartouche
-@smallexample
-$ gprofng collect app -h
-@end smallexample
-@end cartouche
-
-In case the counters are supported, a list with the events is printed.
-Otherwise, a warning message will be issued.
-
-For example, below we show this command and the output on an Intel Xeon
-Platinum 8167M (aka ``Skylake'') processor. The output has been split
-into several sections and each section is commented upon separately.
-
-@smallexample
-@verbatim
-Run "gprofng collect app --help" for a usage message.
-
-Specifying HW counters on `Intel Arch PerfMon v2 on Family 6 Model 85'
-(cpuver=2499):
-
- -h {auto|lo|on|hi}
- turn on default set of HW counters at the specified rate
- -h <ctr_def> [-h <ctr_def>]...
- -h <ctr_def>[,<ctr_def>]...
- specify HW counter profiling for up to 4 HW counters
-@end verbatim
-@end smallexample
-
-The first line shows how to get a usage overview. This is followed by
-some information on the target processor.
-
-The next five lines explain in what ways the @code{-h} option can be
-used to define the events to be monitored.
-
-The first version shown above enables a default set of counters. This
-default depends on the processor this command is executed on. The
-keyword following the @code{-h} option defines the sampling rate:
-
-@table @code
-
-@item auto
-Match the sample rate used by clock profiling. If the latter is disabled,
-use a per thread sampling rate of approximately 100 samples per second.
-This setting is the default and is preferred.
-
-@item on
-Use a per thread sampling rate of approximately 100 samples per second.
-
-@item lo
-Use a per thread sampling rate of approximately 10 samples per second.
-
-@item hi
-Use a per thread sampling rate of approximately 1000 samples per second.
-
-@end table
-
-The second and third variants define the events to be monitored. Note
-that the number of simultaneous events supported is printed. In this
-case we can monitor four events in a single profiling job.
-
-It is a matter of preference whether you like to use the @code{-h}
-option for each event, or use it once, followed by a comma separated
-list.
-
-There is one slight catch though. The counter definition below has a
-mandatory comma (@code{,}) between the event and the rate. While a
-default can be used for the rate, the comma cannot be omitted.
-This may result in a somewhat awkward counter definition in case
-the default sampling rate is used.
-
-For example, the following two commands are equivalent. Note
-the double comma in the second command. This is not a typo.
-
-@cartouche
-@smallexample
-$ gprofng collect app -h cycles -h insts ...
-$ gprofng collect app -h cycles,,insts ...
-@end smallexample
-@end cartouche
-
-In the first command this comma is not needed, because a
-comma (``@code{,}'') immediately followed by white space may
-be omitted.
-
-This is why we prefer this syntax and in the remainder we will
-use the first version of this command.
-
-@IndexSubentry{Hardware event counters, counter definition}
-The counter definition takes an event name, plus optionally one or
-more attributes, followed by a comma, and optionally the sampling rate.
-The output section below shows the formal definition.
-
-@cartouche
-@smallexample
- <ctr_def> == <ctr>[[~<attr>=<val>]...],[<rate>]
-@end smallexample
-@end cartouche
-
-The printed help then explains this syntax. Below we have summarized
-and expanded this output:
-
-@table @code
-
-@item <ctr>
-The counter name must be selected from the available counters listed
-as part of the output printed with the @code{-h} option.
-On most systems, if a counter is not listed, it may still be specified
-by its numeric value.
-
-@item ~<attr>=<val>
-This is an optional attribute that depends on the processor. The list
-of supported attributes is printed in the output. Examples of
-attributes are ``user'', or ``system''. The value can be given in decimal
-or hexadecimal format.
-Multiple attributes may be specified, and each must be preceded
-by a @code{~}. An example with an attribute is shown below, after this table.
-
-@item <rate>
-
-The sampling rate is one of the following:
-
-@table @code
-
-@item auto
-This is the default and matches the rate used by clock profiling.
-If clock profiling is disabled, use @code{on}.
-
-@item on
-Set the per thread maximum sampling rate to ~100 samples/second
-
-@item lo
-Set the per thread maximum sampling rate to ~10 samples/second
-
-@item hi
-Set the per thread maximum sampling rate to ~1000 samples/second
-
-@item <interval>
-Define the sampling interval.
-@xref{Control the Sampling Frequency}, for how to define this.
-
-@end table
-
-@end table
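-
-To illustrate the syntax above, the sketch below combines the @code{cycles}
-alias with a ``user'' attribute and the high sampling rate. Whether this
-attribute is actually available depends on the processor and is shown in the
-output of the @code{-h} help option; the experiment name is arbitrary:
-
-@cartouche
-@smallexample
-$ gprofng collect app -h cycles~user=1,hi -O mxv.hwc.attr.er \
-  ./mxv-pthreads.exe -m 3000 -n 2000 -t 1
-@end smallexample
-@end cartouche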
-
-After the section with the formal definition of events and counters, a
-processor specific list is displayed. This part starts with an overview
-of the default set of counters and the aliased names supported
-@emph{on this specific processor}.
-
-@smallexample
-@verbatim
-Default set of HW counters:
-
- -h cycles,,insts,,llm
-
-Aliases for most useful HW counters:
-
- alias raw name type units regs description
-
- cycles unhalted-core-cycles CPU-cycles 0123 CPU Cycles
- insts instruction-retired events 0123 Instructions Executed
- llm llc-misses events 0123 Last-Level Cache Misses
- br_msp branch-misses-retired events 0123 Branch Mispredict
- br_ins branch-instruction-retired events 0123 Branch Instructions
-@end verbatim
-@end smallexample
-
-The definitions given above may or may not be available on other processors,
-but we try to maximize the overlap across alias sets.
-
-The table above shows the default set of counters defined for this processor,
-and the aliases. For each alias the full ``raw'' name is given, plus the
-unit of the number returned by the counter (CPU cycles, or a raw count),
-the hardware counter the event is allowed to be mapped onto, and a short
-description.
-
-The last part of the output contains all the events that can be monitored:
-
-@smallexample
-@verbatim
-Raw HW counters:
-
- name type units regs description
-
- unhalted-core-cycles CPU-cycles 0123
- unhalted-reference-cycles events 0123
- instruction-retired events 0123
- llc-reference events 0123
- llc-misses events 0123
- branch-instruction-retired events 0123
- branch-misses-retired events 0123
- ld_blocks.store_forward events 0123
- ld_blocks.no_sr events 0123
- ld_blocks_partial.address_alias events 0123
- dtlb_load_misses.miss_causes_a_walk events 0123
- dtlb_load_misses.walk_completed_4k events 0123
-
- <many lines deleted>
-
- l2_lines_out.silent events 0123
- l2_lines_out.non_silent events 0123
- l2_lines_out.useless_hwpf events 0123
- sq_misc.split_lock events 0123
-
-See Chapter 19 of the "Intel 64 and IA-32 Architectures Software
-Developer's Manual Volume 3B: System Programming Guide"
-@end verbatim
-@end smallexample
-
-As can be seen, these names are not always easy to correlate to a specific
-event of interest. The processor manual should provide more clarity on this.
-
-@c -- A new node --------------------------------------------------------------
-@node Examples Using Hardware Event Counters
-@subsection Examples Using Hardware Event Counters
-@c ----------------------------------------------------------------------------
-
-The previous section may give the impression that these counters are hard to
-use, but as we will show now, in practice it is quite simple.
-
-With the information from the @code{-h} option, we can easily set up our first
-event counter experiment.
-
-We start by using the default set of counters defined for our processor and we
-use 2 threads:
-
-@cartouche
-@smallexample
-$ exe=mxv-pthreads.exe
-$ m=3000
-$ n=2000
-$ exp=mxv.hwc.def.2.thr.er
-$ gprofng collect app -O $exp -h auto ./$exe -m $m -n $n -t 2
-@end smallexample
-@end cartouche
-
-@IndexSubentry{Options, @code{-h}}
-@IndexSubentry{Hardware event counters, @code{auto} option}
-The new option here is @code{-h auto}. The @code{auto} keyword enables
-hardware event counter profiling and selects the default set of counters
-defined for this processor.
-
-As before, we can display the information, but there is one practical hurdle
-to overcome. Unless we want to view all metrics recorded, we need to know
-the names of the events that have been enabled. This is tedious and also not
-portable in case we would like to repeat this experiment on another processor.
-
-@IndexSubentry{Hardware event counters, @code{hwc} metric}
-This is where the special @code{hwc} metric comes in very handy. It
-automatically expands to the active set of events used.
-
-With this, it is very easy to display the event counter values. Note that
-although the regular clock based profiling was enabled, we only want to see
-the counter values. We also request to see the percentages and limit the
-output to the first 5 lines:
-
-@cartouche
-@smallexample
-$ exp=mxv.hwc.def.2.thr.er
-$ gprofng display text -metrics e.%hwc -limit 5 -functions $exp
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Current metrics: e.%cycles:e+%insts:e+%llm:name
-Current Sort Metric: Exclusive CPU Cycles ( e.%cycles )
-Print limit set to 5
-Functions sorted by metric: Exclusive CPU Cycles
-
-Excl. CPU Excl. Instructions Excl. Last-Level Name
-Cycles Executed Cache Misses
- sec. % % %
-2.691 100.00 7906475309 100.00 122658983 100.00 <Total>
-2.598 96.54 7432724378 94.01 121745696 99.26 mxv_core
-0.035 1.31 188860269 2.39 70084 0.06 erand48_r
-0.026 0.95 73623396 0.93 763116 0.62 init_data
-0.018 0.66 76824434 0.97 40040 0.03 drand48
-@end verbatim
-@end smallexample
-
-As we have seen before, the first few lines echo the settings.
-This includes a list with the hardware event counters used by
-default.
-
-The table that follows makes it very easy to get an overview where the
-time is spent and how many of the target events have occurred.
-
-As before, we can drill down deeper and see the same metrics at the source
-line and instruction level. Other than using @code{hwc} in the metrics
-definitions, nothing has changed compared to the previous examples:
-
-@cartouche
-@smallexample
-$ exp=mxv.hwc.def.2.thr.er
-$ gprofng display text -metrics e.hwc -source mxv_core $exp
-@end smallexample
-@end cartouche
-
-This is the relevant part of the output. Since the lines get very long,
-we have somewhat modified the lay-out:
-
-@smallexample
-@verbatim
- Excl. CPU Excl. Excl.
- Cycles Instructions Last-Level
- sec. Executed Cache Misses
- <Function: mxv_core>
- 0. 0 0 32. void __attribute__ ((noinline))
- mxv_core(...)
- 0. 0 0 33. {
- 0. 0 0 34. for (uint64_t i=...) {
- 0. 0 0 35. double row_sum = 0.0;
-## 1.872 7291879319 88150571 36. for (int64_t j=0; j<n; j++)
- 0.725 140845059 33595125 37. row_sum += A[i][j]*b[j];
- 0. 0 0 38. c[i] = row_sum;
- 39. }
- 0. 0 0 40. }
-@end verbatim
-@end smallexample
-
-In a similar way we can display the event counter values at the instruction
-level. Again, we have modified the lay-out due to page width limitations:
-
-@cartouche
-@smallexample
-$ exp=mxv.hwc.def.2.thr.er
-$ gprofng display text -metrics e.hwc -disasm mxv_core $exp
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
- Excl. CPU Excl. Excl.
- Cycles Instructions Last-Level
- sec. Executed Cache Misses
- <Function: mxv_core>
- 0. 0 0 [33] 4021ba: mov 0x8(%rsp),%r10
- 34. for (uint64_t i=...) {
- 0. 0 0 [34] 4021bf: cmp %rsi,%rdi
- 0. 0 0 [34] 4021c2: jbe 0x37
- 0. 0 0 [34] 4021c4: ret
- 35. double row_sum = 0.0;
- 36. for (int64_t j=0; j<n; j++)
- 37. row_sum += A[i][j]*b[j];
- 0. 0 0 [37] 4021c5: mov (%r8,%rdi,8),%rdx
- 0. 0 0 [36] 4021c9: mov $0x0,%eax
- 0. 0 0 [35] 4021ce: pxor %xmm1,%xmm1
- 0.002 12804230 321394 [37] 4021d2: movsd (%rdx,%rax,8),%xmm0
- 0.141 60819025 3866677 [37] 4021d7: mulsd (%r9,%rax,8),%xmm0
- 0.582 67221804 29407054 [37] 4021dd: addsd %xmm0,%xmm1
-## 1.871 7279075109 87989870 [36] 4021e1: add $0x1,%rax
- 0.002 12804210 80351 [36] 4021e5: cmp %rax,%rcx
- 0. 0 0 [36] 4021e8: jne 0xffffffffffffffea
- 38. c[i] = row_sum;
- 0. 0 0 [38] 4021ea: movsd %xmm1,(%r10,%rdi,8)
- 0. 0 0 [34] 4021f0: add $0x1,%rdi
- 0. 0 0 [34] 4021f4: cmp %rdi,%rsi
- 0. 0 0 [34] 4021f7: jb 0xd
- 0. 0 0 [35] 4021f9: pxor %xmm1,%xmm1
- 0. 0 0 [36] 4021fd: test %rcx,%rcx
- 0. 0 80350 [36] 402200: jne 0xffffffffffffffc5
- 0. 0 0 [36] 402202: jmp 0xffffffffffffffe8
- 39. }
- 40. }
- 0. 0 0 [40] 402204: ret
-@end verbatim
-@end smallexample
-
-So far we have used the default settings for the event counters. It is
-quite straightforward to select specific counters. For the sake of the
-example, let's assume we would like to count how many branch instructions
-were executed, and how many retired memory load instructions missed in
-the L1 cache. We also want to count these events with a high resolution.
-
-This is the command to do so:
-
-@cartouche
-@smallexample
-$ exe=mxv-pthreads.exe
-$ m=3000
-$ n=2000
-$ exp=mxv.hwc.sel.2.thr.er
-$ hwc1=br_ins,hi
-$ hwc2=mem_load_retired.l1_miss,hi
-$ gprofng collect app -O $exp -h $hwc1 -h $hwc2 $exe -m $m -n $n -t 2
-@end smallexample
-@end cartouche
-
-As before, we get a table with the event counts. Due to the very
-long name for the second counter, we have somewhat modified the
-output.
-
-@cartouche
-@smallexample
-$ gprofng display text -limit 10 -functions mxv.hwc.sel.2.thr.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Functions sorted by metric: Exclusive Total CPU Time
-Excl. Incl. Excl. Branch Excl. Name
-Total Total Instructions mem_load_retired.l1_miss
-CPU sec. CPU sec. Events
-2.597 2.597 1305305319 4021340 <Total>
-2.481 2.481 1233233242 3982327 mxv_core
-0.040 0.107 19019012 9003 init_data
-0.028 0.052 23023048 15006 erand48_r
-0.024 0.024 19019008 9004 __drand48_iterate
-0.015 0.067 11011009 2998 drand48
-0.008 0.010 0 3002 _int_malloc
-0.001 0.001 0 0 brk
-0.001 0.002 0 0 sysmalloc
-0. 0.001 0 0 __default_morecore
-@end verbatim
-@end smallexample
-
-@IndexSubentry{Commands, @code{compare ratio}}
-When using event counters, the values could be very large and it is not easy
-to compare the numbers. As we will show next, the @code{ratio} feature is
-very useful when comparing such profiles.
-
-To demonstrate this, we have set up another event counter experiment where
-we would like to compare the number of last level cache misses and the number
-of branch instructions executed when using a single thread, or two threads.
-
-These are the commands used to generate the experiment directories:
-
-@cartouche
-@smallexample
-$ exe=./mxv-pthreads.exe
-$ m=3000
-$ n=2000
-$ exp1=mxv.hwc.comp.1.thr.er
-$ exp2=mxv.hwc.comp.2.thr.er
-$ gprofng collect app -O $exp1 -h llm -h br_ins $exe -m $m -n $n -t 1
-$ gprofng collect app -O $exp2 -h llm -h br_ins $exe -m $m -n $n -t 2
-@end smallexample
-@end cartouche
-
-The following script has been used to get the tables. Due to lay-out
-restrictions, we have to create two tables, one for each counter.
-
-@cartouche
-@smallexample
-# Limit the output to 5 lines
-limit 5
-# Define the metrics
-metrics name:e.llm
-# Set the comparison to ratio
-compare ratio
-functions
-# Define the metrics
-metrics name:e.br_ins
-# Set the comparison to ratio
-compare ratio
-functions
-@end smallexample
-@end cartouche
-
-Note that we print the name of the function first, followed by the counter
-data.
-The new element is that we set the comparison mode to @code{ratio}. This
-divides the data in a column by its counterpart in the reference experiment.
-
-This is the command using this script and the two experiment directories as
-input:
-
-@cartouche
-@smallexample
-$ gprofng display text -script my-script-comp-counters \
- mxv.hwc.comp.1.thr.er \
- mxv.hwc.comp.2.thr.er
-@end smallexample
-@end cartouche
-
-By design, we get two tables, one for each counter:
-
-@smallexample
-@verbatim
-Functions sorted by metric: Exclusive Last-Level Cache Misses
-
- mxv.hwc.comp.1.thr.er mxv.hwc.comp.2.thr.er
-Name Excl. Last-Level Excl. Last-Level
- Cache Misses Cache Misses
- ratio
- <Total> 122709276 x 0.788
- mxv_core 121796001 x 0.787
- init_data 723064 x 1.055
- erand48_r 100111 x 0.500
- drand48 60065 x 1.167
-
-Functions sorted by metric: Exclusive Branch Instructions
-
- mxv.hwc.comp.1.thr.er mxv.hwc.comp.2.thr.er
-Name Excl. Branch Excl. Branch
- Instructions Instructions
- ratio
- <Total> 1307307316 x 0.997
- mxv_core 1235235239 x 0.997
- erand48_r 23023033 x 0.957
- drand48 20020009 x 0.600
- __drand48_iterate 17017028 x 0.882
-@end verbatim
-@end smallexample
-
-A ratio less than one in the second column means that this counter
-value was smaller than the value from the reference experiment shown
-in the first column.
-
-This kind of presentation of the results makes it much easier to
-quickly interpret the data.
-
-We conclude this section with thread-level event counter overviews,
-but before we go into this, there is an important metric we need to
-mention.
-
-@IndexSubentry{Hardware event counters, IPC}
-In case it is known how many instructions and CPU cycles have been executed,
-the value for the IPC (``Instructions Per Clockcycle'') can be computed.
-@xref{Hardware Event Counters Explained}.
-This is a derived metric that gives an indication of how well the processor
-is utilized. The inverse of the IPC is called CPI.
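-
-As a simple illustration with made-up numbers, a function that executes
-3,000 million instructions in 2,000 million CPU cycles has the following
-IPC and CPI values:
-
-@smallexample
-IPC = instructions / cycles = 3000 / 2000 = 1.50
-CPI = cycles / instructions = 2000 / 3000 = 0.67 (approximately)
-@end smallexample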
-
-The @DisplayText{} command automatically computes the IPC and CPI values
-if an experiment contains the event counter values for the instructions
-and CPU cycles executed. These are part of the metric list and can be
-displayed, just like any other metric.
-
-@IndexSubentry{Commands, @code{metric_list}}
-This can be verified through the @code{metric_list} command. If we go
-back to our earlier experiment with the default event counters, we get
-the following result.
-
-@cartouche
-@smallexample
-$ gprofng display text -metric_list mxv.hwc.def.2.thr.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Current metrics: e.totalcpu:i.totalcpu:e.cycles:e+insts:e+llm:name
-Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu )
-Available metrics:
- Exclusive Total CPU Time: e.%totalcpu
- Inclusive Total CPU Time: i.%totalcpu
- Exclusive CPU Cycles: e.+%cycles
- Inclusive CPU Cycles: i.+%cycles
- Exclusive Instructions Executed: e+%insts
- Inclusive Instructions Executed: i+%insts
-Exclusive Last-Level Cache Misses: e+%llm
-Inclusive Last-Level Cache Misses: i+%llm
- Exclusive Instructions Per Cycle: e+IPC
- Inclusive Instructions Per Cycle: i+IPC
- Exclusive Cycles Per Instruction: e+CPI
- Inclusive Cycles Per Instruction: i+CPI
- Size: size
- PC Address: address
- Name: name
-@end verbatim
-@end smallexample
-
-Among the other metrics, we see the new metrics for the IPC and CPI
-listed.
-
-In the script below, we use this information and add the IPC and CPI
-to the metrics to be displayed. We also use the thread filter to
-display these values for the individual threads.
-
-This is the complete script we have used. Other than a different selection
-of the metrics, there are no new features.
-
-@cartouche
-@smallexample
-# Define the metrics
-metrics e.insts:e.%cycles:e.IPC:e.CPI
-# Sort with respect to cycles
-sort e.cycles
-# Limit the output to 5 lines
-limit 5
-# Get the function overview for all threads
-functions
-# Get the function overview for thread 1
-thread_select 1
-functions
-# Get the function overview for thread 2
-thread_select 2
-functions
-# Get the function overview for thread 3
-thread_select 3
-functions
-@end smallexample
-@end cartouche
-
-In the metrics definition on the second line, we explicitly request the
-counter values for the instructions (@code{e.insts}) and CPU cycles
-(@code{e.cycles}) executed. These names can be found in the output from the
-@code{metric_list} command above.
-In addition to these metrics, we also request the IPC and CPI to be shown.
-
-As before, we used the @code{limit} command to control the number of
-functions displayed. We then request an overview for all the threads,
-followed by three sets of two commands to select a thread and display the
-function overview.
-
-The script above is used as follows:
-
-@cartouche
-@smallexample
-$ gprofng display text -script my-script-ipc mxv.hwc.def.2.thr.er
-@end smallexample
-@end cartouche
-
-This script produces four tables. We list them separately below,
-and have left out the additional output.
-
-The first table shows the accumulated values across the three
-threads that have been active.
-
-@smallexample
-@verbatim
-Functions sorted by metric: Exclusive CPU Cycles
-
-Excl. Excl. CPU Excl. Excl. Name
-Instructions Cycles IPC CPI
-Executed sec. %
-7906475309 2.691 100.00 1.473 0.679 <Total>
-7432724378 2.598 96.54 1.434 0.697 mxv_core
- 188860269 0.035 1.31 2.682 0.373 erand48_r
- 73623396 0.026 0.95 1.438 0.696 init_data
- 76824434 0.018 0.66 2.182 0.458 drand48
-@end verbatim
-@end smallexample
-
-This shows that the IPC of this program is completely dominated
-by function @code{mxv_core}, which has a fairly low IPC value
-of 1.43.
-
-The next table is for thread 1 and shows the values for the
-main thread.
-
-@smallexample
-@verbatim
-Exp Sel Total
-=== === =====
- 1 1 3
-Functions sorted by metric: Exclusive CPU Cycles
-
-Excl. Excl. CPU Excl. Excl. Name
-Instructions Cycles IPC CPI
-Executed sec. %
-473750931 0.093 100.00 2.552 0.392 <Total>
-188860269 0.035 37.93 2.682 0.373 erand48_r
- 73623396 0.026 27.59 1.438 0.696 init_data
- 76824434 0.018 18.97 2.182 0.458 drand48
-134442832 0.013 13.79 5.250 0.190 __drand48_iterate
-@end verbatim
-@end smallexample
-
-Although this thread hardly uses any CPU cycles, the overall IPC
-of 2.55 is not all that bad.
-
-Last, we show the tables for threads 2 and 3:
-
-@smallexample
-@verbatim
-Exp Sel Total
-=== === =====
- 1 2 3
-Functions sorted by metric: Exclusive CPU Cycles
-
-Excl. Excl. CPU Excl. Excl. Name
-Instructions Cycles IPC CPI
-Executed sec. %
-3716362189 1.298 100.00 1.435 0.697 <Total>
-3716362189 1.298 100.00 1.435 0.697 mxv_core
- 0 0. 0. 0. 0. collector_root
- 0 0. 0. 0. 0. driver_mxv
-
-Exp Sel Total
-=== === =====
- 1 3 3
-Functions sorted by metric: Exclusive CPU Cycles
-
-Excl. Excl. CPU Excl. Excl. Name
-Instructions Cycles IPC CPI
-Executed sec. %
-3716362189 1.300 100.00 1.433 0.698 <Total>
-3716362189 1.300 100.00 1.433 0.698 mxv_core
- 0 0. 0. 0. 0. collector_root
- 0 0. 0. 0. 0. driver_mxv
-@end verbatim
-@end smallexample
-
-It is seen that both execute the same number of instructions and
-take about the same number of CPU cycles. As a result, the IPC is
-the same for both threads.
-
-@c -- A new node --------------------------------------------------------------
-@c TBD @node Additional Features
-@c TBD @section Additional Features
-@c ----------------------------------------------------------------------------
-
-@c -- A new node --------------------------------------------------------------
-@c TBD @node More Filtering Capabilities
-@c TBD @subsection More Filtering Capabilities
-@c ----------------------------------------------------------------------------
-
-@c TBD Cover @code{samples} and @code{seconds}
-
-@c -- A new node --------------------------------------------------------------
-@node Java Profiling
-@section Java Profiling
-@c ----------------------------------------------------------------------------
-
-@IndexSubentry{Java profiling, @code{-j on/off}}
-The @CollectApp{} command supports Java profiling. The @code{-j on} option
-can be used for this, but since this feature is enabled by default, there is
-no need to set this explicitly. Java profiling may be disabled through the
-@code{-j off} option.
-
-The program is compiled as usual and the experiment directory is created
-in the same way as we have seen before. The only difference with a C/C++
-application is that the program has to be explicitly executed by @command{java}.
-
-For example, this is how to generate the experiment data for a Java
-program that has the source code stored in file @code{Pi.java}:
-
-@cartouche
-@smallexample
-$ javac Pi.java
-$ gprofng collect app -j on -O pi.demo.er java Pi < pi.in
-@end smallexample
-@end cartouche
-
-Regarding which @command{java} executable is selected to generate the data, @ToolName{}
-first looks for the JDK in the path set in either the
-@IndexSubentry{Java profiling, @code{JDK_HOME}}
-@code{JDK_HOME} environment variable, or in the
-@IndexSubentry{Java profiling, @code{JAVA_PATH}}
-@code{JAVA_PATH} environment variable. If neither of these variables is
-set, it checks for a JDK in the search path (set in the @code{PATH}
-environment variable). If there is no JDK in this path, it checks for
-the java executable in @code{/usr/java/bin/java}.
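-
-For example, assuming the JDK of interest is installed under the purely
-illustrative location @code{/opt/jdk}, this is one way to make sure it is
-selected:
-
-@cartouche
-@smallexample
-$ export JDK_HOME=/opt/jdk
-$ gprofng collect app -O pi.demo.er java Pi < pi.in
-@end smallexample
-@end cartouche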
-
-In case additional options need to be passed on to the JVM, the
-@IndexSubentry{Java profiling, @code{-J <string>}}
-@code{-J <string>} option can be used. The string with the
-option(s) has to be delimited by quotation marks in case
-there is more than one argument.
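-
-For example, to pass a larger maximum Java heap size of 4 GB on to the JVM,
-the @code{-Xmx4G} option could be forwarded as follows (the heap size is just
-an illustration):
-
-@cartouche
-@smallexample
-$ gprofng collect app -J "-Xmx4G" -O pi.demo.er java Pi < pi.in
-@end smallexample
-@end cartouche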
-
-The @DisplayText{} command may be used to view the performance data. There is
-no need for any special options and the same commands as previously discussed
-are supported.
-
-@IndexSubentry{Commands, @code{viewmode}}
-@IndexSubentry{Java profiling, different view modes}
-The @code{viewmode} command (@pxref{The Viewmode})
-is very useful to examine the call stacks.
-
-For example, this is how one can see the native call stacks. For
-lay-out purposes we have restricted the list to the first five entries:
-
-@cartouche
-@smallexample
-$ gprofng display text -limit 5 -viewmode machine -calltree pi.demo.er
-@end smallexample
-@end cartouche
-
-@smallexample
-@verbatim
-Print limit set to 5
-Viewmode set to machine
-Functions Call Tree. Metric: Attributed Total CPU Time
-
-Attr. Name
-Total
-CPU sec.
-1.381 +-<Total>
-1.171 +-Pi.calculatePi(double)
-0.110 +-collector_root
-0.110 | +-JavaMain
-0.070 | +-jni_CallStaticVoidMethod
-@end verbatim
-@end smallexample
-
-@noindent
-Note that the selection of the viewmode is echoed in the output.
-
-@c -- A new node --------------------------------------------------------------
-@c TBD @node Summary of Options and Commands
-@c TBD @chapter Summary of Options and Commands
-@c ----------------------------------------------------------------------------
-
-@c -- A new node --------------------------------------------------------------
-@node Terminology
-@chapter Terminology
-
-Throughout this manual, certain terminology specific to profiling tools,
-or @ToolName{}, or even to this document only, is used. In this chapter we
-explain this terminology in detail.
-
-@menu
-* The Program Counter:: What is a Program Counter?
-* Inclusive and Exclusive Metrics:: An explanation of inclusive and exclusive metrics.
-* Metric Definitions:: Definitions associated with metrics.
-* The Viewmode:: Select the way call stacks are presented.
-* The Selection List:: How to define a selection.
-* Load Objects and Functions:: The components in an application.
-* The Concept of a CPU in @ProductName{}:: The definition of a CPU.
-* Hardware Event Counters Explained:: What are event counters?
-* apath:: Our generic definition of a path.
-@end menu
-
-@c ----------------------------------------------------------------------------
-@node The Program Counter
-@section The Program Counter
-@c ----------------------------------------------------------------------------
-
-@cindex PC
-@cindex Program Counter
-The @emph{Program Counter}, or PC for short, keeps track of where program
-execution is. The address of the next instruction to be executed is stored
-in a special-purpose register in the processor, or core.
-
-@cindex Instruction pointer
-The PC is sometimes also referred to as the @emph{instruction pointer}, but
-we will use Program Counter or PC throughout this document.
-
-@c ----------------------------------------------------------------------------
-@node Inclusive and Exclusive Metrics
-@section Inclusive and Exclusive Metrics
-@c ----------------------------------------------------------------------------
-
-These two concepts occur quite often in the remainder of this document and,
-for lack of a better place, they are explained here.
-
-@cindex Inclusive metric
-The @emph{inclusive} value for a metric includes all values that are part of
-the dynamic extent of the target function. For example, if function @code{A}
-calls functions @code{B} and @code{C}, the inclusive CPU time for @code{A}
-includes the CPU time spent in @code{B} and @code{C}.
-
-@cindex Exclusive metric
-In contrast with this, the @emph{exclusive} value for a metric is computed
-by excluding the metric values used by the functions that are called. In our
-imaginary example, the exclusive CPU time for function @code{A} is the time
-spent outside the calls to functions @code{B} and @code{C}.
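-
-As a simple, made-up numerical illustration: if @code{A} spends 2 seconds in
-its own code, and the calls to @code{B} and @code{C} take 3 seconds and
-1 second respectively, the inclusive CPU time for @code{A} is
-2 + 3 + 1 = 6 seconds, while its exclusive CPU time is 2 seconds.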
-
-@cindex Leaf function
-In the case of a @emph{leaf function}, the inclusive and exclusive values for
-the metric are the same since, by definition, it does not call any other
-function(s).
-
-Why do we use these two different values? The inclusive metric shows the most
-expensive path, in terms of this metric, in the application. For example, if
-the metric is cache misses, the function with the highest inclusive metric
-tells you where most of the cache misses come from.
-
-Within this branch of the application, the exclusive metric points to the
-functions that contribute most and helps to identify which part(s) to consider
-for further analysis.
-
-@c ----------------------------------------------------------------------------
-@node Metric Definitions
-@section Metric Definitions
-@c ----------------------------------------------------------------------------
-The metrics to be shown are highly customizable. In this section we explain
-the definitions associated with metrics.
-
-@IndexSubentry{Commands, @code{metrics}}
-The @code{metrics} command takes a colon (:) separated list of special
-keywords. Each keyword consists of the following three fields:
-@code{<flavor>}@code{<visibility>}@code{<metric_name>}.
-
-@cindex Flavor field
-@cindex Visibility field
-@cindex Metric name field
-The @emph{<flavor>} field is either an @code{e} for ``exclusive'', or @code{i}
-for ``inclusive''. The @code{<metric_name>} field is the name of the metric
-request. The @emph{<visibility>} field consists of one or more characters
-from the following table; an example is given after the table:
-
-@table @code
-
-@item .
-Show the metric as time. This applies to timing metrics and hardware event counters
-that measure cycles. Interpret as @code{+} for other metrics.
-
-@item %
-Show the metric as a percentage of the total value for this metric.
-
-@item +
-Show the metric as an absolute value. For hardware event counters this is
-the event count. Interpret as @code{.} for timing metrics.
-
-@item |
-Do not show any metric value. Cannot be used with other visibility characters.
-
-@end table
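-
-As an illustration of this syntax, the hypothetical request below asks for the
-exclusive total CPU time as both a time and a percentage, plus the inclusive
-total CPU time as a time only, when displaying the function overview:
-
-@cartouche
-@smallexample
-$ gprofng display text -metrics e.%totalcpu:i.totalcpu -functions test.1.er
-@end smallexample
-@end cartouche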
-
-@c ----------------------------------------------------------------------------
-@node The Viewmode
-@section The Viewmode
-
-@cindex Viewmode
-@IndexSubentry{Commands, @code{viewmode}}
-
-There are different ways to view a call stack in Java. In @ToolName{}, this
-is called the @emph{viewmode} and the setting is controlled through a command
-with the same name.
-
-The @code{viewmode} command takes one of the following keywords:
-
-@table @code
-
-@item user
-This is the default and shows the Java call stacks for Java threads.
-No call stacks for any housekeeping threads are shown. The function
-list contains a function
-@IndexSubentry{Java profiling, @code{<JVM-System>}}
-@code{<JVM-System>} that represents the aggregated time from non-Java
-threads.
-When the JVM software does not report a Java call stack, time is reported
-against the function
-@IndexSubentry{Java profiling, @code{<no Java callstack recorded>}}
-@code{<no Java callstack recorded>}.
-
-
-@item expert
-Show the Java call stacks for Java threads when the Java code from the
-user is executed and machine call stacks when JVM code is executed, or
-when the JVM software does not report a Java call stack.
-Show the machine call stacks for housekeeping threads.
-
-@item machine
-Show the actual native call stacks for all threads.
-
-@end table
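-
-As an illustration, and assuming a Java experiment such as the one used in the
-Java profiling example, the sketch below selects the expert viewmode before
-displaying the function overview:
-
-@cartouche
-@smallexample
-$ gprofng display text -viewmode expert -functions pi.demo.er
-@end smallexample
-@end cartouche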
-
-@c ----------------------------------------------------------------------------
-@c ----------------------------------------------------------------------------
-@node The Selection List
-@section The Selection List
-@c ----------------------------------------------------------------------------
-
-@cindex Selection list
-@cindex List specification
-Several commands allow the user to specify a subset of a list, for example,
-to select specific threads from all the threads that were used when
-conducting the experiment(s).
-
-Such a selection list (or ``list'' in the remainder of this section) can be a
-single number, a contiguous range of numbers with the start and end numbers
-separated by a hyphen (@code{-}), a comma-separated list of numbers and
-ranges, or the @code{all} keyword. Lists must not contain spaces.
-
-Each list can optionally be preceded by an experiment list with a similar
-format, separated from the list by a colon (:).
-If no experiment list is included, the list applies to all experiments.
-
-Multiple lists can be concatenated by separating the individual lists
-by a plus sign.
-
-These are some examples of various filters using a list:
-
-@table @code
-
-@item thread_select 1
-Select thread 1 from all experiments.
-
-@item thread_select all:1
-Select thread 1 from all experiments.
-
-@item thread_select 1:1+2:2
-Select thread 1 from experiment 1 and thread 2 from experiment 2.
-
-@item cpu_select all:1,3,5
-Select cores 1, 3, and 5 from all experiments.
-
-@item cpu_select 1,2:all
-Select all cores from experiments 1 and 2, as listed by the @code{exp_list} command.
-
-@end table
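-
-Such a selection can be combined with other commands. As a sketch, and re-using
-the experiment directory from the multithreading examples, the command below
-first selects thread 2 and then displays the function overview for that
-thread only:
-
-@cartouche
-@smallexample
-$ gprofng display text -thread_select 2 -functions mxv.hwc.def.2.thr.er
-@end smallexample
-@end cartouche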
-
-@c ----------------------------------------------------------------------------
-@node Load Objects and Functions
-@section Load Objects and Functions
-@c ----------------------------------------------------------------------------
-
-An application consists of various components. The source code files are
-compiled into object files. These are then glued together at link time to form
-the executable.
-During execution, the program may also dynamically load objects.
-
-@cindex Load object
-A @emph{load object} is defined to be an executable or a shared object. A shared
-library is an example of a load object in @ToolName{}.
-
-Each load object contains a text section with the instructions generated by the
-compiler, a data section for data, and various symbol tables.
-All load objects must contain an
-@cindex ELF
-ELF
-symbol table, which gives the names and addresses of all the globally known
-functions in that object.
-
-Load objects compiled with the @code{-g} option contain additional symbolic information
-that can augment the ELF symbol table and provide information about functions that
-are not global, additional information about object modules from which the functions
-came, and line number information relating addresses to source lines.
-
-The term
-@cindex Function
-@emph{function}
-is used to describe a set of instructions that represent a high-level operation
-described in the source code. The term also covers methods as used in C++ and in
-the Java programming language.
-
-In the @ToolName{} context, functions are provided in source code format.
-Normally their names appear in the symbol table representing a set of addresses.
-@cindex Program Counter
-@cindex PC
-If the Program Counter (PC) is within that set, the program is executing within that function.
-
-In principle, any address within the text segment of a load object can be mapped to a
-function. Exactly the same mapping is used for the leaf PC and all the other PCs on the
-call stack.
-
-Most of the functions correspond directly to the source model of the program, but
-there are exceptions. This topic is, however, outside the scope of this guide.
-
-@c ----------------------------------------------------------------------------
-@node The Concept of a CPU in @ProductName{}
-@section The Concept of a CPU in @ProductName{}
-@c ----------------------------------------------------------------------------
-
-@cindex CPU
-In @ProductName{}, there is the concept of a CPU. Admittedly, this is not the
-best word to describe what is meant here and may be replaced in the future.
-
-The word CPU is used in many of the displays.
-In the context of @ProductName{}, it is meant to denote a part of the
-processor that is capable of executing instructions and that has its own state,
-such as the program counter.
-
-For example, on a contemporary processor, a CPU could be a core. In case
-hardware threads are supported within a core, it could be one of those
-hardware threads.
-
-@c ----------------------------------------------------------------------------
-@node Hardware Event Counters Explained
-@section Hardware Event Counters Explained
-@c ----------------------------------------------------------------------------
-
-@IndexSubentry{Hardware event counters, description}
-For quite a number of years now, many microprocessors have supported hardware
-event counters.
-
-On the hardware side, this means that in the processor there are one or more
-registers dedicated to count certain activities, or ``events''.
-Examples of such events are the number of instructions executed, or the number
-of cache misses at level 2 in the memory hierarchy.
-
-While there is a limited set of such registers, the user can map events onto
-them. In case more than one register is available, this allows for the
-simultaneous measurement of various events.
-
-A simple, yet powerful, example is to simultaneously count the number of CPU
-cycles and the number of instructions executed. These two numbers can then be
-used to compute the
-@cindex IPC
-@emph{IPC} value. IPC stands for ``Instructions Per Clockcycle'' and each processor
-has a maximum. For example, if this maximum number is 2, it means the
-processor is capable of executing two instructions every clock cycle.
-
-Whether this is actually achieved, depends on several factors, including the
-instruction characteristics.
-However, in case the IPC value is well below this maximum in a time critical
-part of the application and this cannot be easily explained, further
-investigation is probably warranted.
-
-@cindex CPI
-A related metric is called @emph{CPI}, or ``Clockcycles Per Instruction''.
-It is the inverse of the IPC and can be compared against the theoretical
-value(s) of the target instruction(s). A significant difference may point
-at a bottleneck.
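-
-As a sketch of how such counter data might be gathered, and assuming that the
-@code{-h auto} option of @CollectApp{} selects a default set of counters that
-includes cycles and instructions on the system at hand, an experiment could be
-recorded as follows:
-
-@cartouche
-@smallexample
-$ gprofng collect app -h auto -O mxv.hwc.def.2.thr.er ./mxv-pthreads \
-  -m 8000 -n 4000 -t 2
-@end smallexample
-@end cartouche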
-
-One thing to keep in mind is that the value returned by a counter can either
-be the number of times the event occurred, or a CPU cycle count. In the case
-of the latter, it is possible to convert this number to time.
-
-@IndexSubentry{Hardware event counters, variable CPU frequency}
-This is often easier to interpret than a simple count, but there is one
-caveat to keep in mind. The CPU frequency may not have been constant while
-the experiment was recorded and this impacts the time reported.
-
-These event counters, or ``counters'' for short, provide great insight into
-what happens deep inside the processor. In case higher level information does
-not provide the insight needed, the counters provide the information to get
-to the bottom of a performance problem.
-
-There are some things to consider though.
-
-@itemize @bullet
-
-@item
-The event definitions and names vary across processors and it may even happen
-that some events change with an update.
-Unfortunately, and luckily this is rare, there are sometimes bugs that cause
-the wrong count to be returned.
-
-@IndexSubentry{Hardware event counters, alias name}
-In @ToolName{}, some of the processor-specific event names have an alias
-name. For example, @code{insts} measures the instructions executed.
-These aliases not only make it easier to identify the functionality, but also
-provide portability of certain events across processors.
-
-@item
-Another complexity is that there are typically many events one can monitor.
-There may be up to hundreds of events available and it could require several
-experiments to zoom in on the root cause of a performance problem.
-
-@item
-There may be restrictions regarding the mapping of event(s) onto the
-counters. For example, certain events may be restricted to specific
-counters only. As a result, one may have to conduct additional experiments
-to cover all the events of interest.
-
-@item
-The names of the events may also not be easy to interpret. In such cases,
-the description can be found in the architecture manual for the processor.
-
-@end itemize
-
-Despite these drawbacks, hardware event counters are extremely useful and
-may even turn out to be indispensable.
-
-@c ----------------------------------------------------------------------------
-@node apath
-@section What is <apath>?
-@c ----------------------------------------------------------------------------
-
-In most cases, @ToolName{} shows the absolute pathnames of directories. These
-tend to be rather long, causing display issues in this document.
-
-Instead of wrapping these long pathnames over multiple lines, we decided to
-represent them by the @code{<apath>} symbol, which stands for ``an absolute
-pathname''.
-
-Note that different occurrences of @code{<apath>} may represent different
-absolute pathnames.
-
-@c -- A new node --------------------------------------------------------------
-@node Other Document Formats
-@chapter Other Document Formats
-@c ----------------------------------------------------------------------------
-
-This document is written in Texinfo and the source text is made available as
-part of the binutils distribution. The file name is @code{gprofng.texi} and
-can be found in subdirectory @code{doc} under directory @code{gprofng} in the
-top level directory.
-
-This file can be used to generate the document in the @code{info}, @code{html},
-and @code{pdf} formats.
-The default installation procedure creates a file in the @code{info} format and
-stores it in the documentation section of binutils.
-
-Probably the easiest way to generate a different format from this Texinfo
-document is to go to the distribution directory that was created when the
-tools were built.
-This is either the default distribution directory, or the one that has been set
-with the @code{--prefix} option as part of the @code{configure} command.
-In this example we symbolize this location with @code{<dist>}.
-
-The make file called @code{Makefile} in directory @code{<dist>/gprofng/doc}
-supports several commands to generate this document in different formats.
-We recommend using these commands.
-
-They create the file(s) and install them in the documentation directory of binutils,
-which is @code{<dist>/share/doc} in case @code{html} or @code{pdf} is selected and
-@code{<dist>/share/info} for the file in the @code{info} format.
-
-To generate this document in the requested format and install it in the documentation
-directory, the commands below should be executed. In this notation, @code{<format>}
-is one of @code{info}, @code{html}, or @code{pdf}:
-
-@smallexample
-@verbatim
-$ cd <dist>/gprofng/doc
-$ make install-<format>
-@end verbatim
-@end smallexample
-
-@noindent
-Some things to note:
-
-@itemize
-
-@item
-For the @code{pdf} file to be generated, the
-@cindex TeX
-TeX document formatting software is required and the relevant commands need
-to be included in the search path. An example of a popular TeX implementation
-is @emph{TeX Live}. It is beyond the scope of this document to go into the
-details of installing and using TeX, but it is well documented elsewhere.
-
-@item
-Instead of generating a single file in the @code{html} format, it is also
-possible to create a directory with individual files for the various chapters.
-To do so, remove the use of @code{--no-split} in variable @code{MAKEINFOHTML}
-in the make file in the @code{doc} directory.
-
-@item
-The make file also supports commands to only generate the files in the desired
-format without moving them to the documentation directory. This is
-accomplished through the @code{make <format>} command, as shown in the example below.
-
-@end itemize
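-
-For example, these are the commands to generate the file in the @code{pdf}
-format in the @code{doc} directory, without installing it:
-
-@cartouche
-@smallexample
-$ cd <dist>/gprofng/doc
-$ make pdf
-@end smallexample
-@end cartouche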
-
-@ifnothtml
-@node Index
-@unnumbered Index
-@printindex cp
-@end ifnothtml
-
+@ifset man
@bye
+@end ifset
--- /dev/null
+\input texinfo @c -*-texinfo-*-
+
+@c ----------------------------------------------------------------------------
+@c This is the Texinfo source file for the GPROFNG manual. This manual
+@c includes the man pages for the various tools.
+@c
+@c Author: Ruud van der Pas
+@c ----------------------------------------------------------------------------
+
+@c %**start of header
+
+@setfilename gprofng.info
+@settitle GNU gprofng
+
+@c -- Set the indent for the @example command to 1 space, not 5 ---------------
+@exampleindent 1
+
+@paragraphindent 3
+
+@c %**end of header
+
+@c -- Start a new chapter on a new, odd numbered, page ------------------------
+@setchapternewpage odd
+
+@c -- Merge all index entries into the Concepts Index -------------------------
+@syncodeindex fn cp
+@syncodeindex ky cp
+@syncodeindex pg cp
+@syncodeindex vr cp
+
+@c -- Macros specific to gprofng ----------------------------------------------
+@include gp-macros.texi
+
+@c -- Get the version information ---------------------------------------------
+@include version.texi
+
+@c -- Entry for the Info dir structure ----------------------------------------
+@ifnottex
+@dircategory Software development
+@direntry
+* gprofng: (gprofng). The next generation profiling tool for Linux
+@end direntry
+@end ifnottex
+
+@c -- Copyright stuff ---------------------------------------------------------
+@copying
+This document is the manual for @ProductName{}, last updated @value{UPDATED}.
+
+Copyright @copyright{} 2022-2023 Free Software Foundation, Inc.
+
+@c -- @quotation
+Permission is granted to copy, distribute and/or modify this document
+under the terms of the GNU Free Documentation License,
+Version 1.3 or any later version published by the Free Software
+Foundation; with no Invariant Sections, with no Front-Cover texts,
+and with no Back-Cover Texts. A copy of the license is included in the
+section entitled ``GNU Free Documentation License.''
+
+@c -- @end quotation
+@end copying
+
+@finalout
+@smallbook
+
+@c -- Define the title page ---------------------------------------------------
+@titlepage
+@title GNU gprofng
+@subtitle The next generation profiling tool for Linux
+@subtitle version @value{VERSION} (last updated @value{UPDATED})
+@author Ruud van der Pas
+@page
+@vskip 0pt plus 1filll
+@insertcopying
+@end titlepage
+
+@c -- Generate the Table of Contents ------------------------------------------
+@contents
+
+@c -- The Top node ------------------------------------------------------------
+@c Should contain a short summary, copying permissions and a master menu.
+@c ----------------------------------------------------------------------------
+@ifnottex
+@node Top
+@top GNU Gprofng
+
+@insertcopying
+@end ifnottex
+
+@ifinfo
+@c -- The menu entries --------------------------------------------------------
+
+@c * Display Source Code:: Display the source code and disassembly.
+@c * Archive Experiment Data:: Archive an experiment.
+
+@menu
+* Introduction:: About this manual.
+* Overview:: A brief overview of @ProductName{}.
+* A Mini Tutorial:: A short tutorial covering the key features.
+* The gprofng Tools:: An overview of the tools supported.
+* Performance Data Collection:: Record the performance information.
+* View the Performance Information:: Different ways to view the data.
+* Terminology:: Concepts and terminology explained.
+* Other Document Formats:: Create this document in other formats.
+* The gprofng Man Pages:: The gprofng man pages.
+* Index:: The index.
+
+@detailmenu
+
+--- The Detailed Node Listing ---
+
+Introduction
+
+Overview
+
+* Main Features:: A high level overview.
+* Sampling versus Tracing:: The pros and cons of sampling versus tracing.
+* Steps Needed to Create a Profile:: How to create a profile.
+
+A Mini Tutorial
+
+* Getting Started:: The basics of profiling with @ProductName{}.
+* Support for Multithreading:: Commands specific to multithreaded applications.
+* View Multiple Experiments:: Analyze multiple experiments simultaneously.
+* Profile Hardware Event Counters:: How to use hardware event counters.
+* Java Profiling:: How to profile a Java application.
+
+The gprofng Tools
+
+* Tools Overview:: A brief description of the tools.
+* The gprofng.rc file with default settings:: Customize the settings.
+* Filters:: Filters.
+* Supported Environment Variables:: The supported environment variables.
+
+Terminology
+
+* The Program Counter:: What is a Program Counter?
+* Inclusive and Exclusive Metrics:: An explanation of inclusive and exclusive metrics.
+* Metric Definitions:: Definitions associated with metrics.
+* The Viewmode:: Select the way call stacks are presented.
+* The Selection List:: How to define a selection.
+* Load Objects and Functions:: The components in an application.
+* The Concept of a CPU in @ProductName{}:: The definition of a CPU.
+* Hardware Event Counters Explained:: What are event counters?
+* apath:: Our generic definition of a path.
+
+The gprofng Man Pages
+
+* gprofng collect app:: The man page for gprofng collect app.
+* gprofng display text:: The man page for gprofng display text.
+* gprofng display src:: The man page for gprofng display src.
+* gprofng display html:: The man page for gprofng display html.
+* gprofng archive:: The man page for gprofng archive.
+
+@c -- Index
+
+@end detailmenu
+@end menu
+@end ifinfo
+
+@c -- A new node --------------------------------------------------------------
+@node Introduction
+@chapter Introduction
+@c ----------------------------------------------------------------------------
+The @ProductName{} tool is the next generation profiler for Linux. It consists
+of various commands to generate and display profile information.
+
+This manual starts with a tutorial on how to create and interpret a profile.
+This part is highly practical and has the goal to get users up to speed as
+quickly as possible, so that you see your first profile on the screen early on.
+
+This is followed by more examples, covering many of the features. At the
+end of this tutorial, you should feel confident enough to tackle the more
+complex tasks.
+
+In a future update a more formal reference manual will be included as well.
+Since even in this tutorial we use certain terminology, we have included a
+chapter with descriptions at the end. In case you encounter unfamiliar
+terminology, please check this chapter.
+
+One word of caution. In several cases we had to somewhat tweak the screen
+output in order to make it fit. This is why the output may look somewhat
+different when you try things yourself.
+
+For now, we wish you a smooth profiling experience with @ProductName{} and
+good luck tackling performance bottlenecks.
+
+@c -- A new node --------------------------------------------------------------
+@c cccccc @node A Brief Overview of @ProductName{}
+@node Overview
+@chapter A Brief Overview of @ProductName{}
+@c ----------------------------------------------------------------------------
+
+@menu
+* Main Features:: A high level overview.
+* Sampling versus Tracing:: The pros and cons of sampling versus tracing.
+* Steps Needed to Create a Profile:: How to create a profile.
+@end menu
+
+Before we cover this tool in more detail, we start with a brief overview
+of what it is and what its main features are. Since we know that many of you
+would like to get started right away, we already explain the basics of
+profiling with @ToolName{} in this first chapter.
+
+@c ----------------------------------------------------------------------------
+@c TBD Review this text. Probably be more specific on the gcc releases and
+@c processor specifics.
+@c ----------------------------------------------------------------------------
+
+@c -- A new node --------------------------------------------------------------
+@node Main Features
+@section Main Features
+@c ----------------------------------------------------------------------------
+
+@noindent
+These are the main features of the @ProductName{} tool:
+
+@itemize @bullet
+
+@item
+Profiling is supported for applications written in C, C++, Java, or Scala.
+
+@c TBD Java: up to 1.8 full support, support other than for modules
+
+@item
+Shared libraries are supported. The information is presented at the instruction
+level.
+
+@item
+The following multithreading programming models are supported: Pthreads,
+OpenMP, and Java threads.
+
+@item
+This tool works with unmodified production level executables. There is no need to
+recompile the code, but if the @samp{-g} option has been used when building
+the application, source line level information is available.
+
+@item
+The focus is on support for code generated with the @command{gcc} compiler, but
+there is some limited support for the @command{icc} compiler as well. Future
+improvements and enhancements will focus on @command{gcc} though.
+
+@item
+Processors from Intel, AMD, and Arm are supported, but the level of support
+depends on the architectural details. In particular, hardware event counters
+may not be supported. If this is the case, all views not related to these
+counters still ought to work though.
+
+@item
+Several views into the data are supported. For example, a function overview
+shows where the time is spent, but source line, disassembly, call tree, and
+caller-callees overviews are available as well.
+
+@item
+Through filters, the user can zoom in on an area of interest.
+
+@item
+Two or more profiles can be aggregated, or used in a comparison. This comparison
+can be obtained at the function, source line, and disassembly level.
+
+@item
+Through a simple scripting language, and customization of the metrics shown,
+the generation and creation of a profile can be fully automated and provide
+tailored output.
+
+@end itemize
+
+@c -- A new node --------------------------------------------------------------
+@node Sampling versus Tracing
+@section Sampling versus Tracing
+@c ----------------------------------------------------------------------------
+
+A key difference with some other profiling tools is that the main data
+collection command @CollectApp{} mostly uses
+@cindex Program Counter sampling
+@cindex PC sampling
+Program Counter (PC) sampling
+under the hood.
+
+With @emph{sampling}, the executable is interrupted at regular intervals. Each
+time it is halted, key information is gathered and stored. This includes the
+Program Counter that keeps track of where the execution is. Hence the name.
+
+Together with operational data, this information is stored in the experiment
+directory and can be viewed in the second phase.
+
+For example, the PC information is used to derive where the program was when
+it was halted. Since the sampling interval is known, it is relatively easy to
+derive how much time was spent in the various parts of the program.
+
+The opposite technique is generally referred to as @emph{tracing}. With
+tracing, the target is instrumented with specific calls that collect the
+requested information.
+
+These are some of the pros and cons of PC sampling versus tracing:
+
+@itemize
+
+@item
+Since there is no need to recompile, existing executables can be used
+and the profile measures the behaviour of exactly the same executable that is
+used in production runs.
+
+With tracing, one inherently profiles a different executable, because
+the calls to the instrumentation library may affect the compiler optimizations
+and run time behaviour.
+
+@item
+With sampling, there are very few restrictions on what can be profiled and even without
+access to the source code, a basic profile can be made.
+
+@item
+A downside of sampling is that, depending on the sampling frequency, small
+functions may be missed or not captured accurately. Although this is rare,
+this may happen and is the reason why the user has control over the sampling rate.
+
+@item
+While tracing produces precise information, sampling is statistical in nature.
+As a result, small variations may occur across seemingly identical runs. We
+have not observed more than a few percent deviation though, especially if
+the target job executed for a sufficiently long time.
+
+@item
+With sampling, it is not possible to get an accurate count of how often
+functions are called.
+
+@end itemize
+
+@c -- A new node --------------------------------------------------------------
+@node Steps Needed to Create a Profile
+@section Steps Needed to Create a Profile
+@c ----------------------------------------------------------------------------
+
+Creating a profile takes two steps. First the profile data needs to be
+generated. This is followed by a viewing step to create a report from the
+information that has been gathered.
+
+Every @ProductName{} command starts with @ToolName{}, the name of the driver.
+This is followed by a keyword to define the high level functionality. Depending
+on this keyword, a third qualifier may be needed to further narrow down the request.
+This combination is then followed by options that are specific to the functionality
+desired.
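+
+For example, in the command below (taken from later in this guide),
+@command{gprofng} is the driver, @command{display} is the keyword,
+@command{text} is the qualifier, and @code{-functions} is one of the commands
+specific to this functionality:
+
+@cartouche
+@smallexample
+$ gprofng display text -functions test.1.er
+@end smallexample
+@end cartouche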
+
+The command to gather, or ``collect'', the performance data is called
+@CollectApp{}. Aside from numerous options, this command takes the name
+of the target executable as an input parameter.
+
+Upon completion of the run, the performance data can be
+found in the newly created
+@cindex Experiment directory
+experiment directory.
+
+Unless explicitly specified otherwise, a default
+name for this directory is chosen. The name is @file{test.<n>.er} where
+@var{<n>} is the first integer number not in use yet for such a name.
+
+For example, the first time @CollectApp{} is invoked, an experiment
+directory with the name @file{test.1.er} is created.
+Upon a subsequent invocation of @CollectApp{} in the same directory,
+an experiment directory with the name @file{test.2.er} will be created,
+and so forth.
+
+Note that @CollectApp{} supports an option to explicitly name the experiment
+directory.
+Aside from the restriction that the name of this directory has to end
+with @samp{.er}, any valid directory name can be used for this.
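+
+For example, using the @code{-O} option (@pxref{Name the Experiment Directory}),
+the sketch below stores the performance data in an experiment directory with the
+purely illustrative name @file{mxv.1.thr.er}:
+
+@cartouche
+@smallexample
+$ gprofng collect app -O mxv.1.thr.er ./mxv-pthreads -m 8000 -n 4000 -t 1
+@end smallexample
+@end cartouche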
+
+Now that we have the performance data, the next step is to display it.
+
+@IndexSubentry{@code{gprofng}, @code{display text}}
+The most commonly used command to view the performance information is
+@DisplayText{}. This is a very extensive and customizable tool that
+produces the information in ASCII format.
+
+@IndexSubentry{@code{gprofng}, @code{display html}}
+Another option is to use @DisplayHTML{}. This tool generates a directory with
+files in html format. These can be viewed in a browser, allowing for easy
+navigation through the profile data.
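+
+As a sketch, and assuming that, like @DisplayText{}, the name of the experiment
+directory is accepted as an argument, the html files could be generated like
+this:
+
+@cartouche
+@smallexample
+$ gprofng display html test.1.er
+@end smallexample
+@end cartouche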
+
+@c -- A new node --------------------------------------------------------------
+@node A Mini Tutorial
+@chapter A Mini Tutorial
+@c ----------------------------------------------------------------------------
+
+In this chapter we present and discuss the main functionality of @ToolName{}.
+This will be a practical approach, using example code to generate profile
+data and showing how to get various performance reports.
+
+@menu
+* Getting Started:: The basics of profiling with @ProductName{}.
+* Support for Multithreading:: Commands specific to multithreaded applications.
+* View Multiple Experiments:: Analyze multiple experiments simultaneously.
+* Profile Hardware Event Counters:: How to use hardware event counters.
+* Java Profiling:: How to profile a Java application.
+@end menu
+
+@c -- A new node --------------------------------------------------------------
+@node Getting Started
+@section Getting Started
+@c ----------------------------------------------------------------------------
+
+The information presented here provides a good and common basis for many
+profiling tasks, but there are more features that you may want to leverage.
+
+These are covered in subsequent sections in this chapter.
+
+@menu
+* The Example Program:: A description of the example program used.
+* A First Profile:: How to get the first profile.
+* The Source Code View:: Display the metrics in the source code.
+* The Disassembly View:: Display the metrics at the instruction level.
+* Display and Define the Metrics:: An example how to customize the metrics.
+* Customization of the Output:: An example how to customize the output.
+* Name the Experiment Directory:: Change the name of the experiment directory.
+* Control the Number of Lines in the Output:: Change the number of lines in the tables.
+* Sorting the Performance Data:: How to set the metric to sort by.
+* Scripting:: Use a script to execute the commands.
+* A More Elaborate Example:: An example of customization.
+* The Call Tree:: Display the dynamic call tree.
+* More Information on the Experiment:: How to get additional statistics.
+* Control the Sampling Frequency:: How to control the sampling granularity.
+* Information on Load Objects:: How to get more information on load objects.
+@end menu
+
+@c -- A new node --------------------------------------------------------------
+@node The Example Program
+@subsection The Example Program
+@c ----------------------------------------------------------------------------
+
+Throughout this guide we use the same example C code that implements the
+multiplication of a vector of length @math{n} by an @math{m} by @math{n}
+matrix. The result is stored in a vector of length @math{m}.
+@cindex Pthreads
+@cindex Posix Threads
+The algorithm has been parallelized using Posix Threads, or Pthreads for short.
+
+The code was built using the @code{gcc} compiler and the name of the executable
+is
+@cindex mxv-pthreads
+@command{mxv-pthreads}.
+
+The matrix sizes can be set through the @code{-m} and @code{-n} options. The
+number of threads is set with the @code{-t} option. These are additional threads
+that are used in the multiplication. To increase the duration of the run, the
+computations are executed repeatedly.
+
+This is an example that multiplies an @math{8000} by @math{4000} matrix with
+a vector of length @math{4000}. Although this is a multithreaded application,
+initially we will be using a single thread. Later on, we will show examples
+using multiple threads.
+
+@smallexample
+@verbatim
+$ ./mxv-pthreads -m 8000 -n 4000 -t 1
+mxv: error check passed - rows = 8000 columns = 4000 threads = 1
+$
+@end verbatim
+@end smallexample
+
+The program performs an internal check to verify that the computed results
+are correct. The result of this check is printed, as well as the matrix
+sizes and the number of threads used.
+
+@c -- A new node --------------------------------------------------------------
+@node A First Profile
+@subsection A First Profile
+@c ----------------------------------------------------------------------------
+
+The first step is to collect the performance data. It is important to remember
+that much more information is gathered than may be shown by default. Often a
+single data collection run is sufficient to get a lot of insight.
+
+The @CollectApp{} command is used for the data collection. Nothing needs to be
+changed in the way the application is executed. The only difference is that it
+is now run under control of the tool, as shown below:
+
+@cartouche
+@smallexample
+$ gprofng collect app ./mxv-pthreads -m 8000 -n 4000 -t 1
+@end smallexample
+@end cartouche
+
+@noindent
+This produces the following output:
+
+@smallexample
+@verbatim
+Creating experiment directory test.1.er (Process ID: 2749878) ...
+mxv: error check passed - rows = 8000 columns = 4000 threads = 1
+@end verbatim
+@end smallexample
+
+We see a message that an experiment directory with the name @file{test.1.er}
+has been created. The process id is also echoed. The application completes
+as usual and we have our first experiment directory that can be analyzed.
+
+The tool we use for this is called @DisplayText{}. It takes the name of
+the experiment directory as an argument.
+
+@cindex Interpreter mode
+If invoked this way, the tool starts in the interactive @emph{interpreter} mode.
+While in this environment, commands can be given and the tool responds. This is
+illustrated below:
+
+@smallexample
+@verbatim
+$ gprofng display text test.1.er
+Warning: History and command editing is not supported on this system.
+(gp-display-text) quit
+$
+@end verbatim
+@end smallexample
+
+@cindex Command line mode
+While useful in certain cases, we prefer to use this tool in command line mode
+by specifying the commands to be issued when invoking the tool. The way to do
+this is to prepend the command(s) with a hyphen (@samp{-}) if used on the
+command line.
+
+Since this makes the commands appear to be options, they are also sometimes
+referred to as such, but technically they are commands. This is the
+terminology we will use in this user guide, but for convenience the commands
+are also listed as options in the index.
+
+For example,
+@IndexSubentry{Options, @code{-functions}}
+@IndexSubentry{Commands, @code{functions}}
+below we use the @command{functions} command to request a list of the functions
+that have been executed, plus their respective CPU times:
+
+@cartouche
+@smallexample
+$ gprofng display text -functions test.1.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+$ gprofng display text -functions test.1.er
+
+Functions sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Incl. Total Name
+CPU CPU
+ sec. % sec. %
+9.367 100.00 9.367 100.00 <Total>
+8.926 95.30 8.926 95.30 mxv_core
+0.210 2.24 0.420 4.49 init_data
+0.080 0.85 0.210 2.24 drand48
+0.070 0.75 0.130 1.39 erand48_r
+0.060 0.64 0.060 0.64 __drand48_iterate
+0.010 0.11 0.020 0.21 _int_malloc
+0.010 0.11 0.010 0.11 sysmalloc
+0. 0. 8.926 95.30 <static>@0x47960 (<libgp-collector.so>)
+0. 0. 0.440 4.70 __libc_start_main
+0. 0. 0.020 0.21 allocate_data
+0. 0. 8.926 95.30 driver_mxv
+0. 0. 0.440 4.70 main
+0. 0. 0.020 0.21 malloc
+0. 0. 8.926 95.30 start_thread
+@end verbatim
+@end smallexample
+
+As easy and simple as these steps are, we do have a first profile of our program!
+
+There are five columns. The first four contain the
+@cindex Total CPU time
+``Total CPU Time'', which
+is the sum of the user and system time. @xref{Inclusive and Exclusive Metrics}
+for an explanation of ``exclusive'' and ``inclusive'' times.
+
+The first line echoes the metric that is used to sort the output. By default,
+this is the exclusive CPU time, but through the @command{sort} command, the sort
+metric can be changed by the user.
+
+Next, there are four columns with the exclusive and inclusive CPU times and the
+respective percentages. This is followed by the name of the function.
+
+@IndexSubentry{Miscellaneous, @code{<Total>}}
+The function with the name @code{<Total>} is not a user function. It is a
+pseudo function introduced by @ToolName{}. It is used to display the
+accumulated measured metric values. In this example, we see that the total
+CPU time of this job was 9.367 seconds and it is scaled to 100%. All
+other percentages in the same column are relative to this number.
+
+@c -- If the metric is derived, for example the @code{IPC}, the value shown under
+@c -- @code{<Total>} is based upon the total values of the that are metrics used to
+@c -- compute the derived metric.
+@c -- @IndexSubentry{Hardware event counters, IPC}
+
+With 8.926 seconds, function @code{mxv_core} takes 95.30% of the
+total time and is by far the most time consuming function.
+The exclusive and inclusive metrics are identical, which means that it is a
+leaf function that does not call any other functions.
+
+The next function in the list is @code{init_data}. Although the CPU time
+spent in this part is modest at 4.49%, this is an interesting entry because
+the inclusive CPU time of 0.420 seconds is twice the exclusive CPU time
+of 0.210 seconds. Clearly this function calls one or more other functions,
+and collectively these calls take 0.210 seconds.
+Below we show the call tree feature that provides more details on the call
+structure of the application.
+
+The function @code{<static>@@0x47960 (<libgp-collector.so>)} looks odd and
+is certainly not familiar. It is one of the internal functions used by
+@CollectApp{} and can be ignored. Also, while the inclusive time is high,
+the exclusive time is zero. This means it does not really contribute to the
+time spent.
+
+The question is how we can find out where this function originates from. There
+are several commands to dig deeper and get more details on a function.
+@xref{Information on Load Objects}.
+
+@c -- A new node --------------------------------------------------------------
+@node The Source Code View
+@subsection The Source Code View
+@c ----------------------------------------------------------------------------
+
+In general, the tuning efforts are best focused on the most time consuming
+part(s) of an application. In this case that is easy, since over 95% of
+the total CPU time is spent in function @code{mxv_core}.
+It is now time to dig deeper and look
+@cindex Source level metrics
+at the metrics distribution at the source code level. Since we measured
+CPU times, these are the metrics shown.
+
+@IndexSubentry{Options, @code{-source}}
+@IndexSubentry{Commands, @code{source}}
+The @code{source} command is used to accomplish this. It takes the name of the
+function, not the source filename, as an argument. This is demonstrated
+below, where the @DisplayText{} command is used to show the annotated
+source listing of function @code{mxv_core}.
+
+Be aware that when using the @command{gcc} compiler, the source code has to
+be compiled with the @code{-g} option in order for the source code feature
+to work. Otherwise the location(s) cannot be determined. For other compilers,
+we recommend checking the documentation for such an option.
+
+Below, the command to display the source code of a function is shown. Since at
+this point we are only interested in the timings, we use the
+@code{metrics} command to request the exclusive and inclusive total CPU
+times only. @xref{Display and Define the Metrics} for more information
+on how to define the metrics to be displayed.
+
+@cartouche
+@smallexample
+$ gprofng display text -metrics ei.totalcpu -source mxv_core test.1.er
+@end smallexample
+@end cartouche
+
+The output is shown below. It has been somewhat modified to fit the formatting
+constraints and reduce the number of lines.
+
+@smallexample
+@verbatim
+Current metrics: e.totalcpu:i.totalcpu:name
+Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu )
+Source file: <apath>/mxv.c
+Object file: mxv-pthreads (found as test.1.er/archives/...)
+Load Object: mxv-pthreads (found as test.1.er/archives/...)
+
+ Excl. Incl.
+ Total Total
+ CPU sec. CPU sec.
+
+ <lines deleted>
+ <Function: mxv_core>
+ 43. void __attribute__ ((noinline))
+ mxv_core (int64_t row_index_start,
+ 44. int64_t row_index_end,
+ 45. int64_t m,
+ 46. int64_t n,
+ 47. double **restrict A,
+ 48. double *restrict b,
+ 49. double *restrict c)
+ 50. {
+ 0. 0. 50. {
+ 0. 0. 51. for (int64_t i=row_index_start;
+ i<=row_index_end; i++)
+ 52. {
+ 0. 0. 53. double row_sum = 0.0;
+## 4.613 4.613 54. for (int64_t j=0; j<n; j++)
+## 4.313 4.313 55. row_sum += A[i][j] * b[j];
+ 0. 0. 56. c[i] = row_sum;
+ 57. }
+ 0. 0. 58. }
+@end verbatim
+@end smallexample
+
+The first line echoes the metrics that have been selected. The second line
+is not very meaningful when looking at the source code listing, but it shows
+the metric that is used to sort the data.
+
+The next three lines provide information on the location of the source file,
+the object file and the load object (@pxref{Load Objects and Functions}).
+
+Function @code{mxv_core} is part of a source file that has other functions
+as well. These functions will be shown with the values for the metrics, but
+for lay-out purposes they have been removed in the output shown above.
+
+The header is followed by the annotated source code listing. The selected
+metrics are shown first, followed by a source line number, and the source code.
+@IndexSubentry{Miscellaneous ,@code{##}}
+The most time consuming line(s) are marked with the @code{##} symbol. In
+this way they are easier to identify and find with a search.
+
+What we see is that all of the time is spent in lines 54-55.
+
+@IndexSubentry{Options, @code{-lines}}
+@IndexSubentry{Commands, @code{lines}}
+A related command sometimes comes in handy as well. It is called @code{lines}
+and displays a list of the source lines and their metrics, ordered according
+to the current sort metric (@pxref{Sorting the Performance Data}).
+
+Below are the command and the output. For lay-out reasons, only the top 10 is
+shown here and the last part of the text on some lines has been replaced
+by dots. The full text is @samp{instructions without line numbers} and
+means that the line number information for that function was not found.
+
+@cartouche
+@smallexample
+$ gprofng display text -lines test.1.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Lines sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Incl. Total Name
+CPU CPU
+ sec. % sec. %
+9.367 100.00 9.367 100.00 <Total>
+4.613 49.25 4.613 49.25 mxv_core, line 54 in "mxv.c"
+4.313 46.05 4.313 46.05 mxv_core, line 55 in "mxv.c"
+0.160 1.71 0.370 3.95 init_data, line 118 in "manage_data.c"
+0.080 0.85 0.210 2.24 <Function: drand48, instructions ...>
+0.070 0.75 0.130 1.39 <Function: erand48_r, instructions ...>
+0.060 0.64 0.060 0.64 <Function: __drand48_iterate, ...>
+0.040 0.43 0.040 0.43 init_data, line 124 in "manage_data.c"
+0.010 0.11 0.020 0.21 <Function: _int_malloc, instructions ...>
+0.010 0.11 0.010 0.11 <Function: sysmalloc, instructions ...>
+@end verbatim
+@end smallexample
+
+What this overview immediately highlights is that the third most time consuming
+source line takes only 0.370 seconds. This means that the inclusive time is
+only 3.95% and clearly this branch of the code hardly impacts the performance.
+
+@c -- A new node --------------------------------------------------------------
+@node The Disassembly View
+@subsection The Disassembly View
+@c ----------------------------------------------------------------------------
+
+The source view is very useful to obtain more insight where the time is spent,
+but sometimes this is not sufficient. The disassembly view provides more
+details since it shows the metrics at the instruction level.
+
+This view is displayed with the
+@IndexSubentry{Options, @code{-disasm}}
+@IndexSubentry{Commands, @code{disasm}}
+@command{disasm}
+command and as with the source view, it displays an annotated listing. In this
+@cindex Instruction level metrics
+case it shows the instructions with the metrics, interleaved with the
+source lines. The
+instructions have a reference in square brackets (@code{[} and @code{]})
+to the source line they correspond to.
+
+@noindent
+We again focus on the timings only and set the metrics accordingly:
+
+@cartouche
+@smallexample
+$ gprofng display text -metrics ei.totalcpu -disasm mxv_core test.1.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Current metrics: e.totalcpu:i.totalcpu:name
+Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu )
+Source file: <apath>/src/mxv.c
+Object file: mxv-pthreads (found as test.1.er/archives/...)
+Load Object: mxv-pthreads (found as test.1.er/archives/...)
+
+ Excl. Incl.
+ Total Total
+ CPU sec. CPU sec.
+
+ <lines deleted>
+ 43. void __attribute__ ((noinline))
+ mxv_core (int64_t row_index_start,
+ 44. int64_t row_index_end,
+ 45. int64_t m,
+ 46. int64_t n,
+ 47. double **restrict A,
+ 48. double *restrict b,
+ 49. double *restrict c)
+ 50. {
+ <Function: mxv_core>
+ 0. 0. [50] 401d56: mov 0x8(%rsp),%r10
+ 51. for (int64_t i=row_index_start;
+ i<=row_index_end; i++)
+ 0. 0. [51] 401d5b: cmp %rsi,%rdi
+ 0. 0. [51] 401d5e: jg 0x47
+ 0. 0. [51] 401d60: add $0x1,%rsi
+ 0. 0. [51] 401d64: jmp 0x36
+ 52. {
+ 53. double row_sum = 0.0;
+ 54. for (int64_t j=0; j<n; j++)
+ 55 row_sum += A[i][j] * b[j];
+ 0. 0. [55] 401d66: mov (%r8,%rdi,8),%rdx
+ 0. 0. [54] 401d6a: mov $0x0,%eax
+ 0. 0. [53] 401d6f: pxor %xmm1,%xmm1
+ 0.110 0.110 [55] 401d73: movsd (%rdx,%rax,8),%xmm0
+ 1.921 1.921 [55] 401d78: mulsd (%r9,%rax,8),%xmm0
+ 2.282 2.282 [55] 401d7e: addsd %xmm0,%xmm1
+## 4.613 4.613 [54] 401d82: add $0x1,%rax
+ 0. 0. [54] 401d86: cmp %rax,%rcx
+ 0. 0. [54] 401d89: jne 0xffffffffffffffea
+ 56. c[i] = row_sum;
+ 0. 0. [56] 401d8b: movsd %xmm1,(%r10,%rdi,8)
+ 0. 0. [51] 401d91: add $0x1,%rdi
+ 0. 0. [51] 401d95: cmp %rsi,%rdi
+ 0. 0. [51] 401d98: je 0xd
+ 0. 0. [53] 401d9a: pxor %xmm1,%xmm1
+ 0. 0. [54] 401d9e: test %rcx,%rcx
+ 0. 0. [54] 401da1: jg 0xffffffffffffffc5
+ 0. 0. [54] 401da3: jmp 0xffffffffffffffe8
+ 57. }
+ 58. }
+ 0. 0. [58] 401da5: ret
+@end verbatim
+@end smallexample
+
+For each instruction, the timing values are given and we can immediately
+identify the most expensive instructions. As with the source level view,
+these are marked with the @code{##} symbol.
+
+It comes as no surprise that the time consuming instructions originate from
+the source code at lines 54-55.
+One thing to note is that the source line numbers no longer appear in
+sequential order.
+This is because the compiler has re-ordered the instructions as part of
+the code optimizations it has performed.
+
+As illustrated below and similar to the @command{lines} command, we can get
+an overview of the instructions executed by using the
+@IndexSubentry{Options, @code{-pcs}}
+@IndexSubentry{Commands, @code{pcs}}
+@command{pcs}
+command.
+
+@noindent
+Below are the command and the output, which again has been restricted
+to 10 lines. As before, some lines have been shortened for lay-out
+purposes.
+
+@cartouche
+@smallexample
+$ gprofng display text -pcs test.1.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+PCs sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Incl. Total Name
+CPU CPU
+ sec. % sec. %
+9.367 100.00 9.367 100.00 <Total>
+4.613 49.25 4.613 49.25 mxv_core + 0x0000002C, line 54 in "mxv.c"
+2.282 24.36 2.282 24.36 mxv_core + 0x00000028, line 55 in "mxv.c"
+1.921 20.51 1.921 20.51 mxv_core + 0x00000022, line 55 in "mxv.c"
+0.150 1.60 0.150 1.60 init_data + 0x000000AC, line 118 in ...
+0.110 1.18 0.110 1.18 mxv_core + 0x0000001D, line 55 in "mxv.c"
+0.040 0.43 0.040 0.43 drand48 + 0x00000022
+0.040 0.43 0.040 0.43 init_data + 0x000000F1, line 124 in ...
+0.030 0.32 0.030 0.32 __drand48_iterate + 0x0000001E
+0.020 0.21 0.020 0.21 __drand48_iterate + 0x00000038
+@end verbatim
+@end smallexample
+
+@noindent
+What we see is that the top three instructions take 94% of the total CPU time
+and any optimizations should focus on this part of the code.
+
+@c -- A new node --------------------------------------------------------------
+@node Display and Define the Metrics
+@subsection Display and Define the Metrics
+@c ----------------------------------------------------------------------------
+
+The metrics shown by @DisplayText{} are useful, but there is more recorded
+than displayed by default. We can customize the values shown by defining the
+metrics ourselves.
+
+There are two commands related to changing the metrics shown:
+@IndexSubentry{Options, @code{-metric_list}}
+@IndexSubentry{Commands, @code{metric_list}}
+@command{metric_list} and
+@IndexSubentry{Options, @code{-metrics}}
+@IndexSubentry{Commands, @code{metrics}}
+@command{metrics}.
+
+The first command shows the currently selected metrics, plus all the metrics
+that have been stored as part of the experiment. The second command may be
+used to define the metric list.
+
+@noindent
+This is the way to get the information about the metrics:
+
+@IndexSubentry{Options, @code{-metric_list}}
+@IndexSubentry{Commands, @code{metric_list}}
+@cartouche
+@smallexample
+$ gprofng display text -metric_list test.1.er
+@end smallexample
+@end cartouche
+
+@noindent
+This is the output:
+
+@smallexample
+@verbatim
+Current metrics: e.%totalcpu:i.%totalcpu:name
+Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
+Available metrics:
+Exclusive Total CPU Time: e.%totalcpu
+Inclusive Total CPU Time: i.%totalcpu
+ Size: size
+ PC Address: address
+ Name: name
+@end verbatim
+@end smallexample
+
+This shows the metrics that are currently used, the metric that is used to
+sort the data, and all the metrics that have been recorded but are not
+necessarily shown.
+
+@cindex Default metrics
+In this case, the current metrics are set to the exclusive and inclusive
+total CPU times, the respective percentages, and the name of the function
+or load object.
+
+@IndexSubentry{Options, @code{-metrics}}
+@IndexSubentry{Commands, @code{metrics}}
+The @code{metrics} command is used to define the metrics that need to be
+displayed.
+
+For example, to swap the exclusive and inclusive metrics, use the following
+metric definition: @code{i.%totalcpu:e.%totalcpu}.
+
+Since the metrics can be tailored for different views, there is also a way
+to reset them to the default. This is done through the special keyword
+@code{default} for the metrics definition (@command{-metrics default}).
+@IndexSubentry{Metrics, Reset to default}
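+
+As a brief illustration, and assuming the experiment directory
+@file{test.1.er} used earlier, the first command below swaps the inclusive
+and exclusive metrics in the function overview, while the second command
+restores the default metrics again. This is a sketch only; the output
+depends on the experiment.
+
+@cartouche
+@smallexample
+$ gprofng display text -metrics i.%totalcpu:e.%totalcpu -functions test.1.er
+$ gprofng display text -metrics default -functions test.1.er
+@end smallexample
+@end cartouche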
+
+@c -- A new node --------------------------------------------------------------
+@node Customization of the Output
+@subsection Customization of the Output
+@c ----------------------------------------------------------------------------
+
+With the information just given, the function overview can be customized.
+For the sake of the example, we would like to display the name of the function
+first, followed only by the exclusive CPU time, given as an absolute number
+and a percentage.
+
+Note that the commands are parsed in order of appearance. This is why we
+need to define the metrics @emph{before} requesting the function overview:
+
+@cartouche
+@smallexample
+$ gprofng display text -metrics name:e.%totalcpu -functions test.1.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Current metrics: name:e.%totalcpu
+Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
+Functions sorted by metric: Exclusive Total CPU Time
+
+Name Excl. Total
+ CPU
+ sec. %
+ <Total> 9.367 100.00
+ mxv_core 8.926 95.30
+ init_data 0.210 2.24
+ drand48 0.080 0.85
+ erand48_r 0.070 0.75
+ __drand48_iterate 0.060 0.64
+ _int_malloc 0.010 0.11
+ sysmalloc 0.010 0.11
+ <static>@0x47960 (<libgp-collector.so>) 0. 0.
+ __libc_start_main 0. 0.
+ allocate_data 0. 0.
+ driver_mxv 0. 0.
+ main 0. 0.
+ malloc 0. 0.
+ start_thread 0. 0.
+@end verbatim
+@end smallexample
+
+This was a first and simple example of how to customize the output. Note that
+we did not rerun our profiling job; we merely modified the display settings.
+Below, we show other and more advanced examples of customization.
+
+@c -- A new node --------------------------------------------------------------
+@node Name the Experiment Directory
+@subsection Name the Experiment Directory
+@c ----------------------------------------------------------------------------
+
+When using @CollectApp{}, the default names for experiments work fine, but
+they are quite generic. It is often more convenient to select a more
+descriptive name, for example, one that reflects the conditions of the
+experiment conducted, such as the number of threads used.
+
+For this, the mutually exclusive @code{-o} and @code{-O} options come in handy.
+Both may be used to provide a name for the experiment directory, but the
+behaviour of @CollectApp{} differs between the two.
+
+With the
+@IndexSubentry{Options, @code{-o}}
+@samp{-o}
+option, an existing experiment directory is not overwritten. Any directory
+with the same name either needs to be renamed, moved, or removed, before the
+experiment can be conducted.
+
+This is in contrast with the behaviour for the
+@IndexSubentry{Options, @code{-O}}
+@samp{-O}
+option. Any existing directory with the same name is silently overwritten.
+
+Be aware that the name of the experiment directory has to end with @file{.er}.
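+
+For example, the sketch below uses the @samp{-o} option to give the
+experiment a descriptive name; the directory name is just an illustration.
+If a directory with this name already exists, @CollectApp{} will not
+overwrite it. With @samp{-O} instead of @samp{-o}, an existing directory
+with this name would be silently overwritten.
+
+@cartouche
+@smallexample
+$ gprofng collect app -o mxv.1.thr.er ./mxv-pthreads -m 8000 -n 4000 -t 1
+@end smallexample
+@end cartouche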
+
+@c -- A new node --------------------------------------------------------------
+@node Control the Number of Lines in the Output
+@subsection Control the Number of Lines in the Output
+@c ----------------------------------------------------------------------------
+
+@IndexSubentry{Options, @code{-limit}}
+@IndexSubentry{Commands, @code{limit}}
+The @command{limit} @var{<n>} command can be used to control the number of
+lines printed in various views. For example, it affects the function view,
+but it also takes effect for other display commands, such as @command{lines}.
+
+The argument @var{<n>} should be a positive integer. It sets the number
+of lines shown in the (function) view. A value of zero resets the limit to
+the default.
+
+Be aware that the pseudo-function @code{<Total>} counts as a regular function.
+For example, @command{limit 10} displays nine user-level functions.
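+
+As a quick illustration, and again using the earlier experiment directory
+@file{test.1.er}, the sketch below prints @code{<Total>} plus the top nine
+functions only; the exact output depends on the experiment:
+
+@cartouche
+@smallexample
+$ gprofng display text -limit 10 -functions test.1.er
+@end smallexample
+@end cartouche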
+
+@c -- A new node --------------------------------------------------------------
+@node Sorting the Performance Data
+@subsection Sorting the Performance Data
+@c ----------------------------------------------------------------------------
+
+@IndexSubentry{Options, @code{-sort}}
+@IndexSubentry{Commands, @code{sort}}
+The @command{sort} @var{<key>} command sets the key to be used when sorting the
+performance data.
+
+The key is a valid metric definition, but the
+@IndexSubentry{Metrics, Visibility field}
+visibility field
+(@xref{Metric Definitions})
+in the metric
+definition is ignored, since this does not affect the outcome of the sorting
+operation.
+For example if the sort key is set to @code{e.totalcpu}, the values
+will be sorted in descending order with respect to the exclusive total
+CPU time.
+
+@IndexSubentry{Sort, Reverse order}
+The data can be sorted in reverse order by prefixing the metric definition
+with a minus (@samp{-}) sign. For example, @command{sort -e.totalcpu}.
+
+@IndexSubentry{Sort, Reset to default}
+A default metric for the sort operation has been defined, and since this is
+a persistent command, this default can be restored by using @code{default} as
+the key (@command{sort default}).
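+
+For example, the sketch below, which again uses the earlier experiment
+directory @file{test.1.er}, explicitly sets the sort key to the exclusive
+total CPU time. This happens to be the default, and prefixing the key with
+a minus sign would reverse the order, as described above:
+
+@cartouche
+@smallexample
+$ gprofng display text -sort e.totalcpu -functions test.1.er
+@end smallexample
+@end cartouche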
+
+@c -- A new node --------------------------------------------------------------
+@node Scripting
+@subsection Scripting
+@c ----------------------------------------------------------------------------
+
+@cindex Script files
+The list of commands for @DisplayText{} can become very long. This is tedious
+and also error-prone. Luckily, there is an easier and more elegant way to
+control the output of this tool.
+
+@IndexSubentry{Options, @code{-script}}
+@IndexSubentry{Commands, @code{script}}
+Through the @command{script} command, the name of a file with commands can be
+passed in. These commands are parsed and executed as if they appeared on
+the command line, in the same order as they are encountered in the file. The
+commands in a script file can be mixed with commands on the command line,
+and multiple script files may be used.
+The only difference between the commands in a script file and those used on
+the command line is that the latter require a leading dash (@samp{-}) symbol.
+
+Comment lines in a script file are supported. They need to start with the
+@samp{#} symbol.
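+
+For instance, a hypothetical script file @file{my-settings} could contain
+display commands such as @command{metrics} and @command{limit}, and be
+combined with additional commands on the command line. The sketch below
+only illustrates the syntax:
+
+@cartouche
+@smallexample
+$ gprofng display text -script my-settings -functions test.1.er
+@end smallexample
+@end cartouche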
+
+@c -- A new node --------------------------------------------------------------
+@node A More Elaborate Example
+@subsection A More Elaborate Example
+@c ----------------------------------------------------------------------------
+
+With the information presented so far, we can customize our data
+gathering and display commands.
+
+As an example, we would like to use @file{mxv.1.thr.er} as the name for the
+experiment directory. In this way, the name of the algorithm and the
+number of threads that were used are included in the name.
+We also do not mind overwriting an existing
+experiment directory with the same name.
+
+All that needs to be done is to use the
+@IndexSubentry{Options, @code{-O}}
+@samp{-O}
+option, followed by the directory name of choice when running @CollectApp{}:
+
+@cartouche
+@smallexample
+$ exe=mxv-pthreads
+$ m=8000
+$ n=4000
+$ gprofng collect app -O mxv.1.thr.er ./$exe -m $m -n $n -t 1
+@end smallexample
+@end cartouche
+
+Since we want to customize the profile and prefer to keep the command line
+short, the commands to generate the profile are put into a file with the
+name @file{my-script}:
+
+@smallexample
+@verbatim
+$ cat my-script
+# This is my first gprofng script
+# Set the metrics
+metrics i.%totalcpu:e.%totalcpu:name
+# Use the exclusive time to sort
+sort e.totalcpu
+# Limit the function list to 5 lines
+limit 5
+# Show the function list
+functions
+@end verbatim
+@end smallexample
+
+This script file is specified as input to the @DisplayText{} command
+that is used to display the performance information stored in experiment
+directory @file{mxv.1.thr.er}:
+
+@cartouche
+@smallexample
+$ gprofng display text -script my-script mxv.1.thr.er
+@end smallexample
+@end cartouche
+
+This command produces the following output:
+
+@smallexample
+@verbatim
+# This is my first gprofng script
+# Set the metrics
+Current metrics: i.%totalcpu:e.%totalcpu:name
+Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
+# Use the exclusive time to sort
+Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
+# Limit the function list to 5 lines
+Print limit set to 5
+# Show the function list
+Functions sorted by metric: Exclusive Total CPU Time
+
+Incl. Total Excl. Total Name
+CPU CPU
+ sec. % sec. %
+9.703 100.00 9.703 100.00 <Total>
+9.226 95.09 9.226 95.09 mxv_core
+0.455 4.69 0.210 2.17 init_data
+0.169 1.75 0.123 1.26 erand48_r
+0.244 2.52 0.075 0.77 drand48
+@end verbatim
+@end smallexample
+
+In the first part of the output, the comment lines in the script file are
+echoed. These are interleaved with acknowledgement messages for the commands.
+
+This is followed by a profile consisting of 5 lines only. For both metrics,
+the percentages plus the timings are given. The numbers are sorted with respect
+to the exclusive total CPU time. Although this is the default, for
+demonstration purposes we use the @command{sort} command to explicitly define
+the metric for the sort.
+
+Although we executed the same job as before and only changed the name of the
+experiment directory, the results are somewhat different. This is sampling
+in action. The numbers are not all that different though.
+It is seen that function @code{mxv_core} is responsible for
+95% of the CPU time and @code{init_data} takes only 4.5%.
+
+@c -- A new node --------------------------------------------------------------
+@node The Call Tree
+@subsection The Call Tree
+@c ----------------------------------------------------------------------------
+
+The call tree shows the dynamic structure of the application by displaying the
+functions executed and their parents. The CPU time attributed to each function
+is shown as well. This view helps to find the most expensive
+execution path in the program.
+
+@IndexSubentry{Options, @code{-calltree}}
+@IndexSubentry{Commands, @code{calltree}}
+This feature is enabled through the @command{calltree} command. For example,
+this is how to get the call tree for our current experiment:
+
+@cartouche
+@smallexample
+$ gprofng display text -calltree mxv.1.thr.er
+@end smallexample
+@end cartouche
+
+This displays the following structure:
+
+@smallexample
+@verbatim
+Functions Call Tree. Metric: Attributed Total CPU Time
+
+Attr. Total Name
+CPU
+ sec. %
+9.703 100.00 +-<Total>
+9.226 95.09 +-start_thread
+9.226 95.09 | +-<static>@0x47960 (<libgp-collector.so>)
+9.226 95.09 | +-driver_mxv
+9.226 95.09 | +-mxv_core
+0.477 4.91 +-__libc_start_main
+0.477 4.91 +-main
+0.455 4.69 +-init_data
+0.244 2.52 | +-drand48
+0.169 1.75 | +-erand48_r
+0.047 0.48 | +-__drand48_iterate
+0.021 0.22 +-allocate_data
+0.021 0.22 | +-malloc
+0.021 0.22 | +-_int_malloc
+0.006 0.06 | +-sysmalloc
+0.003 0.03 | +-__default_morecore
+0.003 0.03 | +-sbrk
+0.003 0.03 | +-brk
+0.001 0.01 +-pthread_create
+0.001 0.01 +-__pthread_create_2_1
+@end verbatim
+@end smallexample
+
+At first sight, this may not be what is expected, and some explanation is
+in order.
+
+@c ----------------------------------------------------------------------------
+@c TBD: Revise this text when we have user and machine mode.
+@c ----------------------------------------------------------------------------
+The top function is the pseudo-function @code{<Total>} that we have seen
+before. It is introduced and shown here to provide the total value of the
+metric(s).
+
+We also see function @code{<static>@@0x47960} in the call tree. Apparently
+it comes from @code{libgp-collector.so}, a library that is internal to
+@ToolName{}.
+The @code{<static>} marker, followed by the program counter, is shown if the
+name of the function cannot be found. This function is part of the
+implementation of the data collection process and should be hidden from the
+user; this is a planned future enhancement.
+
+In general, if a view shows a function that does not appear to be part of the
+user code, or that otherwise seems odd, the @command{objects} and @command{fsingle}
+@IndexSubentry{Options, @code{-objects}}
+@IndexSubentry{Commands, @code{objects}}
+@IndexSubentry{Options, @code{-fsingle}}
+@IndexSubentry{Commands, @code{fsingle}}
+commands are very useful,
+both to find out more about load objects in general and to help identify
+an unknown entry in the function overview. @xref{Load Objects and Functions}.
+
+Another thing to note is that there are two main branches: one under
+@code{<static>@@0x47960} and a second one under @code{__libc_start_main}.
+This reflects the fact that this is a multithreaded program, and the
+threaded part shows up as a separate branch in the call tree.
+
+The way to interpret this structure is as follows. The program starts
+under control of @code{__libc_start_main}. This executes the main program
+called @code{main}, which at the top level executes functions
+@code{init_data}, @code{allocate_data}, and @code{pthread_create}.
+The latter function creates and executes the additional thread(s).
+
+For this multithreaded part of the code, we need to look at the branch
+under function @code{start_thread} that calls the driver code for the
+matrix-vector multiplication (@code{driver_mxv}), which executes the function
+that performs the actual multiplication (@code{mxv_core}).
+
+There are two things worth noting for the call tree feature:
+
+@itemize
+
+@item
+This is a dynamic tree and, since sampling is used, it most likely looks
+slightly different across seemingly identical profile runs. In case the
+run times are short, it is worth considering a higher sampling resolution
+through the
+@IndexSubentry{Options, @code{-p}}
+@samp{-p}
+option. For example, use @samp{-p hi} to increase the sampling rate.
+
+@item
+In case hardware event counters have been enabled
+(@xref{Profile Hardware Event Counters}), these values are also displayed
+in the call tree view.
+
+@end itemize
+
+@c -- A new node --------------------------------------------------------------
+@node More Information on the Experiment
+@subsection More Information on the Experiment
+@c ----------------------------------------------------------------------------
+
+The experiment directory contains more than just performance-related data.
+Several system characteristics, the profiling command executed, and some
+global performance statistics are stored as well, and they can be displayed.
+
+@IndexSubentry{Options, @code{-header}}
+@IndexSubentry{Commands, @code{header}}
+The @command{header} command displays information about the experiment(s).
+For example, this is how the command is used to extract this data from our
+experiment directory:
+
+@cartouche
+@smallexample
+$ gprofng display text -header mxv.1.thr.er
+@end smallexample
+@end cartouche
+
+The above command prints the following information. Note that some of the
+lay-out and the information has been modified. Directory paths have been
+replaced by @code{<apath>}, for example. Textual changes are
+marked with the @samp{<} and @samp{>} symbols.
+
+@smallexample
+@verbatim
+Experiment: mxv.1.thr.er
+No errors
+No warnings
+Archive command ` /usr/bin/gp-archive -n -a on --outfile
+ <apath>/archive.log <apath>/mxv.1.thr.er'
+
+Target command (64-bit): './mxv-pthreads -m 8000 -n 4000 -t 1'
+Process pid 2750071, ppid 2750069, pgrp 2749860, sid 2742080
+Current working directory: <apath>
+Collector version: `2.40.00'; experiment version 12.4 (64-bit)
+Host `<the-host-name>', OS `Linux <version>', page size 4096,
+ architecture `x86_64'
+ 4 CPUs, clock speed 2294 MHz.
+ Memory: 3506491 pages @ 4096 = 13697 MB.
+Data collection parameters:
+ Clock-profiling, interval = 997 microsecs.
+ Periodic sampling, 1 secs.
+ Follow descendant processes from: fork|exec|combo
+
+Experiment started <date and time>
+
+Experiment Ended: 9.801216173
+Data Collection Duration: 9.801216173
+@end verbatim
+@end smallexample
+
+The output above may assist in troubleshooting, or in verifying some of the
+operational conditions, and we recommend including this command when
+generating a profile.
+
+@IndexSubentry{Options, @code{-C}}
+Related to this command, there is a useful option to record comment(s) in
+an experiment.
+To this end, use the @samp{-C} option on the @CollectApp{} tool to
+specify a comment string. Up to ten comment lines can be included.
+These comments are displayed with the @command{header} command on
+the @DisplayText{} tool.
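+
+As an illustration, the sketch below stores two comment lines in a new
+experiment; the experiment name and the comment strings are arbitrary and
+only serve as an example:
+
+@cartouche
+@smallexample
+$ gprofng collect app -C "baseline run" -C "1 thread" \
+    -O mxv.comments.er ./mxv-pthreads -m 8000 -n 4000 -t 1
+@end smallexample
+@end cartouche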
+
+@IndexSubentry{Options, @code{-overview}}
+@IndexSubentry{Commands, @code{overview}}
+The @command{overview} command displays information on the experiment(s) and
+also shows a summary of the values for the metric(s) used. This is an example
+of how to use it on the newly created experiment directory:
+
+@cartouche
+@smallexample
+$ gprofng display text -overview mxv.1.thr.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Experiment(s):
+
+Experiment :mxv.1.thr.er
+ Target : './mxv-pthreads -m 8000 -n 4000 -t 1'
+ Host : <hostname> (<ISA>, Linux <version>)
+ Start Time : <date and time>
+ Duration : 9.801 Seconds
+
+Metrics:
+
+ Experiment Duration (Seconds): [9.801]
+ Clock Profiling
+ [X]Total CPU Time - totalcpu (Seconds): [*9.703]
+
+Notes: '*' indicates hot metrics, '[X]' indicates currently enabled
+ metrics.
+ The metrics command can be used to change selections. The
+ metric_list command lists all available metrics.
+@end verbatim
+@end smallexample
+
+This command provides a dashboard overview that makes it easy to identify
+where the time is spent. In case hardware event counters are used, their
+total values are shown as well.
+
+@c -- A new node --------------------------------------------------------------
+@node Control the Sampling Frequency
+@subsection Control the Sampling Frequency
+@c ----------------------------------------------------------------------------
+
+@cindex Sampling frequency
+So far, we have not gone into detail on the frequency of the sampling process,
+but in some cases it is useful to change the default sampling interval of
+10 milliseconds.
+
+The advantage of increasing the sampling frequency is that functions that
+do not take much time per invocation are more accurately captured. The
+downside is that more data is gathered. This has an impact on the overhead
+of the collection process and more disk space is required.
+
+In general this is not an immediate concern, but with heavily threaded
+applications that run for an extended period of time, increasing the
+frequency may have a more noticeable impact.
+
+@IndexSubentry{Options, @code{-p}}
+The @code{-p} option on the @CollectApp{} tool is used to enable or disable
+clock based profiling, or to explicitly set the sampling rate.
+@cindex Sampling interval
+This option takes one of the following keywords:
+
+@table @code
+
+@item off
+Disable clock based profiling.
+
+@item on
+Enable clock based profiling with a per thread sampling interval of 10 ms.
+This is the default.
+
+@item lo
+Enable clock based profiling with a per thread sampling interval of 100 ms.
+
+@item hi
+Enable clock based profiling with a per thread sampling interval of 1 ms.
+
+@item @var{value}
+@cindex Sampling interval
+Enable clock based profiling with a per thread sampling interval of
+@var{value}.
+
+@end table
+
+It may seem unnecessary to have an option to disable clock based profiling,
+but there is a good reason to support this.
+By default, clock profiling is enabled when conducting hardware event counter
+experiments (@xref{Profile Hardware Event Counters}).
+With the @code{-p off} option, this can be disabled.
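+
+A sketch of such a combination is shown below; the @code{-h auto} option,
+which enables the default set of hardware event counters, is discussed in
+a later section (@xref{Profile Hardware Event Counters}):
+
+@cartouche
+@smallexample
+$ gprofng collect app -p off -h auto ./mxv-pthreads -m 8000 -n 4000 -t 2
+@end smallexample
+@end cartouche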
+
+If an explicit value is set for the sampling interval, the number can be an
+integer or a floating-point number.
+A suffix of @samp{u} for microseconds, or @samp{m} for milliseconds, is
+supported. If no suffix is used, the value is assumed to be in milliseconds.
+
+For example, the following command sets the sampling interval to
+5123.4 microseconds:
+
+@cartouche
+@smallexample
+$ gprofng collect app -p 5123.4u ./mxv-pthreads -m 8000 -n 4000 -t 1
+@end smallexample
+@end cartouche
+
+If the value is smaller than the clock profiling minimum, a warning message
+is issued and the value is set to the minimum.
+In case it is not a multiple of the clock profiling resolution, it is
+silently rounded down to the nearest multiple of this resolution.
+If the value exceeds the clock profiling maximum, or is zero or negative, an
+error is reported.
+
+@IndexSubentry{Options, @code{-header}}
+@IndexSubentry{Commands, @code{header}}
+@noindent
+Note that the @code{header} command echoes the sampling rate used.
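+
+For instance, assuming a POSIX shell and the @command{grep} utility, a
+quick way to verify the interval that was actually used is sketched below:
+
+@cartouche
+@smallexample
+$ gprofng display text -header mxv.1.thr.er | grep -i interval
+@end smallexample
+@end cartouche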
+
+@c -- A new node --------------------------------------------------------------
+@node Information on Load Objects
+@subsection Information on Load Objects
+@c ----------------------------------------------------------------------------
+
+It may happen that the function view shows a function that is not known to
+the user. This can easily happen with library functions, for example.
+Luckily, there are three commands that come in handy in such cases.
+
+@IndexSubentry{Options, @code{-objects}}
+@IndexSubentry{Commands, @code{objects}}
+@IndexSubentry{Options, @code{-fsingle}}
+@IndexSubentry{Commands, @code{fsingle}}
+@IndexSubentry{Options, @code{-fsummary}}
+@IndexSubentry{Commands, @code{fsummary}}
+These commands are @command{objects}, @command{fsingle}, and @command{fsummary}.
+They provide details on
+@cindex Load objects
+load objects (@xref{Load Objects and Functions}).
+
+The @command{objects} command lists all load objects that have been referenced
+during the performance experiment.
+Below we show the command and the result for our profile job. As before,
+some path names in the output have been shortened and replaced by the
+@IndexSubentry{Miscellaneous, @code{<apath>}}
+@code{<apath>} symbol that represents an absolute directory path.
+
+@cartouche
+@smallexample
+$ gprofng display text -objects mxv.1.thr.er
+@end smallexample
+@end cartouche
+
+The output includes the name and path of the target executable:
+
+@smallexample
+@verbatim
+<Unknown> (<Unknown>)
+<mxv-pthreads> (<apath>/mxv-pthreads)
+<libdl-2.28.so> (/usr/lib64/libdl-2.28.so)
+<librt-2.28.so> (/usr/lib64/librt-2.28.so)
+<libc-2.28.so> (/usr/lib64/libc-2.28.so)
+<libpthread-2.28.so> (/usr/lib64/libpthread-2.28.so)
+<libm-2.28.so> (/usr/lib64/libm-2.28.so)
+<libgp-collector.so> (/usr/lib64/gprofng/libgp-collector.so)
+<ld-2.28.so> (/usr/lib64/ld-2.28.so)
+<DYNAMIC_FUNCTIONS> (DYNAMIC_FUNCTIONS)
+@end verbatim
+@end smallexample
+
+@IndexSubentry{Options, @code{-fsingle}}
+@IndexSubentry{Commands, @code{fsingle}}
+The @command{fsingle} command may be used to get more details on a specific
+entry in, say, the function view. For example, the command below provides
+additional information on the @code{pthread_create} function shown in the
+function overview.
+
+@cartouche
+@smallexample
+$ gprofng display text -fsingle pthread_create mxv.1.thr.er
+@end smallexample
+@end cartouche
+
+Below is the output from this command. It has been somewhat modified to match
+the display requirements.
+
+@smallexample
+@verbatim
+pthread_create
+ Exclusive Total CPU Time: 0. ( 0. %)
+ Inclusive Total CPU Time: 0.001 ( 0.0%)
+ Size: 258
+ PC Address: 8:0x00049f60
+ Source File: (unknown)
+ Object File: (unknown)
+ Load Object: /usr/lib64/gprofng/libgp-collector.so
+ Mangled Name:
+ Aliases:
+@end verbatim
+@end smallexample
+
+In this table, we not only see how much time was spent in this function, but
+also where it originates from. In addition, the size and start
+address are given. If the source code location is known, it is shown here
+as well.
+
+@IndexSubentry{Options, @code{-fsummary}}
+@IndexSubentry{Commands, @code{fsummary}}
+The related @code{fsummary} command displays the same information as
+@code{fsingle}, but for all functions in the function overview,
+including @code{<Total>}:
+
+@cartouche
+@smallexample
+$ gprofng display text -fsummary mxv.1.thr.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Functions sorted by metric: Exclusive Total CPU Time
+
+<Total>
+ Exclusive Total CPU Time: 9.703 (100.0%)
+ Inclusive Total CPU Time: 9.703 (100.0%)
+ Size: 0
+ PC Address: 1:0x00000000
+ Source File: (unknown)
+ Object File: (unknown)
+ Load Object: <Total>
+ Mangled Name:
+ Aliases:
+
+mxv_core
+ Exclusive Total CPU Time: 9.226 ( 95.1%)
+ Inclusive Total CPU Time: 9.226 ( 95.1%)
+ Size: 80
+ PC Address: 2:0x00001d56
+ Source File: <apath>/src/mxv.c
+ Object File: mxv.1.thr.er/archives/mxv-pthreads_ss_pf53V__5
+ Load Object: <apath>/mxv-pthreads
+ Mangled Name:
+ Aliases:
+
+ ... etc ...
+@end verbatim
+@end smallexample
+
+@c -- A new node --------------------------------------------------------------
+@node Support for Multithreading
+@section Support for Multithreading
+@c ----------------------------------------------------------------------------
+
+In this section, the support for multithreading is introduced and discussed.
+As is shown below, nothing needs to be changed when collecting the performance
+data.
+
+The difference is that additional commands are available to get more
+information on the multithreading details, and that several filters allow
+the user to zoom in on specific threads.
+
+@c -- A new node --------------------------------------------------------------
+@node Creating a Multithreading Experiment
+@subsection Creating a Multithreading Experiment
+@c ----------------------------------------------------------------------------
+
+We demonstrate the support for multithreading using the same code and settings
+as before, but this time 2 threads are used:
+
+@cartouche
+@smallexample
+$ exe=mxv-pthreads
+$ m=8000
+$ n=4000
+$ gprofng collect app -O mxv.2.thr.er ./$exe -m $m -n $n -t 2
+@end smallexample
+@end cartouche
+
+First of all, as far as @ProductName{} is concerned, no changes are needed.
+Nothing special is required to profile a multithreaded job when using @ToolName{}.
+
+The same is true when displaying the performance results. The same commands
+that were used before work unmodified. For example, this is all that is needed to
+get a function overview:
+
+@cartouche
+@smallexample
+$ gprofng display text -limit 5 -functions mxv.2.thr.er
+@end smallexample
+@end cartouche
+
+This produces the following familiar looking output:
+
+@smallexample
+@verbatim
+Print limit set to 5
+Functions sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Incl. Total Name
+CPU CPU
+ sec. % sec. %
+9.464 100.00 9.464 100.00 <Total>
+8.961 94.69 8.961 94.69 mxv_core
+0.224 2.37 0.469 4.95 init_data
+0.105 1.11 0.177 1.88 erand48_r
+0.073 0.77 0.073 0.77 __drand48_iterate
+@end verbatim
+@end smallexample
+
+@c -- A new node --------------------------------------------------------------
+@node Commands Specific to Multithreading
+@subsection Commands Specific to Multithreading
+@c ----------------------------------------------------------------------------
+
+The function overview above shows the results aggregated over all the
+threads. The interesting new element is that we can also look at the
+performance data for the individual threads.
+
+@IndexSubentry{Options, @code{-thread_list}}
+@IndexSubentry{Commands, @code{thread_list}}
+The @command{thread_list} command displays how many threads have been used:
+
+@cartouche
+@smallexample
+$ gprofng display text -thread_list mxv.2.thr.er
+@end smallexample
+@end cartouche
+
+This produces the following output, showing that three threads have
+been used:
+
+@smallexample
+@verbatim
+Exp Sel Total
+=== === =====
+ 1 all 3
+@end verbatim
+@end smallexample
+
+The output confirms there is one experiment and that by default all
+threads are selected.
+
+It may seem surprising to see three threads here, since we used the
+@code{-t 2} option, but it is common for a Pthreads program to use one
+additional thread.
+Typically, there is one main thread that runs from start to finish.
+It handles the sequential portions of the code, as well as tasks related
+to thread management.
+It is no different in the example code. At some point, the main thread
+creates and activates the two threads that perform the multiplication
+of the matrix with the vector. Upon completion of this computation,
+the main thread continues.
+
+@IndexSubentry{Options, @code{-threads}}
+@IndexSubentry{Commands, @code{threads}}
+The @command{threads} command is simple, yet very powerful. It shows the
+total value of the metrics for each thread.
+
+@cartouche
+@smallexample
+$ gprofng display text -threads mxv.2.thr.er
+@end smallexample
+@end cartouche
+
+@noindent
+The command above produces the following overview:
+
+@smallexample
+@verbatim
+Objects sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+9.464 100.00 <Total>
+4.547 48.05 Process 1, Thread 3
+4.414 46.64 Process 1, Thread 2
+0.502 5.31 Process 1, Thread 1
+@end verbatim
+@end smallexample
+
+The first line gives the total CPU time accumulated over the threads
+selected. This is followed by the metric value(s) for each thread.
+
+From this it is clear that the main thread is responsible for a
+little over 5% of the total CPU time, while the other two threads
+take 47-48% each.
+
+This view is ideally suited to verify whether there are any load balancing
+issues and also to find the most time-consuming thread(s).
+
+@IndexSubentry{Filters, Thread selection}
+While useful, more information than this is often needed. This is
+@IndexSubentry{Options, @code{-thread_select}}
+@IndexSubentry{Commands, @code{thread_select}}
+where the thread selection filter comes in. Through the
+@command{thread_select}
+command, one or more threads may be selected.
+@xref{The Selection List}, for how to define the selection list.
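+
+For example, the sketch below restricts the function overview to thread 2
+directly from the command line; selecting @samp{all} instead would include
+all threads again:
+
+@cartouche
+@smallexample
+$ gprofng display text -thread_select 2 -functions mxv.2.thr.er
+@end smallexample
+@end cartouche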
+
+Since it is most common to use this command in a script, we do so as
+well here. Below is the script we are using:
+
+@cartouche
+@smallexample
+# Define the metrics
+metrics e.%totalcpu
+# Limit the output to 5 lines
+limit 5
+# Get the function overview for thread 1
+thread_select 1
+functions
+# Get the function overview for thread 2
+thread_select 2
+functions
+# Get the function overview for thread 3
+thread_select 3
+functions
+@end smallexample
+@end cartouche
+
+The definition of the metrics and the output limit have been shown and
+explained earlier. The new command to focus on is @command{thread_select}.
+
+This command takes a list (@xref{The Selection List}) to select specific
+threads. In this case, the individual thread numbers that were
+obtained earlier with the @command{thread_list} command are selected.
+
+This restricts the output of the @command{functions} command to the thread
+number(s) specified. This means that the script above shows which
+function(s) each thread executes and how much CPU time they consumed.
+Both the exclusive timings and their percentages are given.
+
+Note that technically this command is a filter, and that it is persistent.
+The selection remains active until it is changed through another thread
+selection command, or until it is reset with the @samp{all} selection list.
+
+@noindent
+This is the relevant part of the output for the first thread:
+
+@smallexample
+@verbatim
+Exp Sel Total
+=== === =====
+ 1 1 3
+Functions sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+0.502 100.00 <Total>
+0.224 44.64 init_data
+0.105 20.83 erand48_r
+0.073 14.48 __drand48_iterate
+0.067 13.29 drand48
+@end verbatim
+@end smallexample
+
+As usual, the comment lines are echoed. This is followed by a confirmation
+of the selection. The first table shows that one experiment is loaded and
+that thread 1 out of the three threads has been selected. What is
+displayed next is the function overview for this particular thread. Due to
+the @code{limit 5} command, there are only five functions in this list.
+
+Clearly, this thread handles the data initialization part and, as we know
+from the call tree output, function @code{init_data} executes the three other
+functions shown in this profile.
+
+Below are the overviews for threads 2 and 3 respectively. It is seen that all
+of the CPU time is spent in function @code{mxv_core} and that this time
+is approximately the same for both threads.
+
+@smallexample
+@verbatim
+# Get the function overview for thread 2
+Exp Sel Total
+=== === =====
+ 1 2 3
+Functions sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+4.414 100.00 <Total>
+4.414 100.00 mxv_core
+0. 0. <static>@0x48630 (<libgp-collector.so>)
+0. 0. driver_mxv
+0. 0. start_thread
+
+# Get the function overview for thread 3
+Exp Sel Total
+=== === =====
+ 1 3 3
+Functions sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+4.547 100.00 <Total>
+4.547 100.00 mxv_core
+0. 0. <static>@0x48630 (<libgp-collector.so>)
+0. 0. driver_mxv
+0. 0. start_thread
+@end verbatim
+@end smallexample
+
+When analyzing the performance of a multithreaded application, it is sometimes
+useful to know whether threads have mostly executed on the same core, or
+whether they have wandered across multiple cores. This sort of stickiness is
+usually referred to as
+@cindex Thread affinity
+@emph{thread affinity}.
+
+Similar to the commands for the threads, there are several commands related
+to the usage of the cores, or @emph{CPUs} as they are called in @ToolName{}
+(@xref{The Concept of a CPU in @ProductName{}}).
+
+@IndexSubentry{Options, @code{-cpu_list}}
+@IndexSubentry{Commands, @code{cpu_list}}
+Similar to the @command{thread_list} command, the @command{cpu_list} command
+displays how many CPUs have been used.
+@IndexSubentry{Options, @code{-cpus}}
+@IndexSubentry{Commands, @code{cpus}}
+The equivalent of the @command{threads} command is the @command{cpus}
+command, which shows the numbers of the CPUs that were used and the metric
+values for each one of them. Both commands are demonstrated below.
+
+@cartouche
+@smallexample
+$ gprofng display text -cpu_list -cpus mxv.2.thr.er
+@end smallexample
+@end cartouche
+
+@noindent
+This command produces the following output:
+
+@smallexample
+@verbatim
+Exp Sel Total
+=== === =====
+ 1 all 4
+Objects sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+9.464 100.00 <Total>
+4.414 46.64 CPU 2
+2.696 28.49 CPU 0
+1.851 19.56 CPU 1
+0.502 5.31 CPU 3
+@end verbatim
+@end smallexample
+
+The first table shows that there is only one experiment and that all of the
+four CPUs have been selected. The second table shows the exclusive metrics
+for each of the CPUs that have been used.
+
+As also echoed in the output, the data is sorted with respect to the
+exclusive CPU time, but it is very easy to sort the data by the CPU id
+@IndexSubentry{Options, @code{-sort}}
+@IndexSubentry{Commands, @code{sort}}
+by using the @command{sort} command:
+
+@cartouche
+@smallexample
+$ gprofng display text -cpu_list -sort name -cpus mxv.2.thr.er
+@end smallexample
+@end cartouche
+
+@noindent
+With the @command{sort} added, the output is as follows:
+
+@smallexample
+@verbatim
+Exp Sel Total
+=== === =====
+ 1 all 4
+Current Sort Metric: Name ( name )
+Objects sorted by metric: Name
+
+Excl. Total Name
+CPU
+ sec. %
+9.464 100.00 <Total>
+2.696 28.49 CPU 0
+1.851 19.56 CPU 1
+4.414 46.64 CPU 2
+0.502 5.31 CPU 3
+@end verbatim
+@end smallexample
+
+While the table with thread times shown earlier may point at a load imbalance
+in the application, this overview has a different purpose.
+
+For example, we see that 4 CPUs have been used, but we know that the
+application uses only 3 threads. This means that at least one thread has
+executed on more than one CPU.
+We will now demonstrate how filters can be used to help find out which
+thread(s) this applies to.
+
+Recall the thread level timings:
+
+@smallexample
+@verbatim
+Excl. Total Name
+CPU
+ sec. %
+9.464 100.00 <Total>
+4.547 48.05 Process 1, Thread 3
+4.414 46.64 Process 1, Thread 2
+0.502 5.31 Process 1, Thread 1
+@end verbatim
+@end smallexample
+
+Compared to the CPU timings above, it seems very likely that thread 3 has
+used more than one CPU, because the thread and CPU timings are identical
+for the other two threads.
+
+The command below selects thread number 3 and then requests the CPU
+utilization for this thread:
+
+@cartouche
+@smallexample
+$ gprofng display text -thread_select 3 -sort name -cpus mxv.2.thr.er
+@end smallexample
+@end cartouche
+
+The output shown below confirms that thread 3 is selected and then displays
+the CPU(s) that have been used by this thread:
+
+@smallexample
+@verbatim
+Exp Sel Total
+=== === =====
+ 1 3 3
+
+Objects sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+4.547 100.00 <Total>
+2.696 59.29 CPU 0
+1.851 40.71 CPU 1
+@end verbatim
+@end smallexample
+
+The results show that this thread has used CPU 0 nearly 60% of the time
+and CPU 1 for the remaining 40%.
+
+To confirm that this is the only thread that has used more than one CPU, the
+same approach can be used for threads 1 and 2:
+
+@smallexample
+@verbatim
+$ gprofng display text -thread_select 1 -cpus mxv.2.thr.er
+Exp Sel Total
+=== === =====
+ 1 1 3
+Objects sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+0.502 100.00 <Total>
+0.502 100.00 CPU 3
+@end verbatim
+@end smallexample
+
+@smallexample
+@verbatim
+$ gprofng display text -thread_select 2 -cpus mxv.2.thr.er
+Exp Sel Total
+=== === =====
+ 1 2 3
+Objects sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+4.414 100.00 <Total>
+4.414 100.00 CPU 2
+@end verbatim
+@end smallexample
+
+@noindent
+The output above shows that threads 1 and 2 have indeed each used only a
+single CPU.
+
+@c -- A new node --------------------------------------------------------------
+@node View Multiple Experiments
+@section View Multiple Experiments
+@c ----------------------------------------------------------------------------
+
+One thing we have not covered so far is that @ToolName{} fully supports the
+analysis of multiple experiments. The @DisplayText{} tool accepts a list of
+experiments. The data can either be aggregated across the experiments, or
+used in a comparison.
+
+The default is to aggregate the metric values across the experiments that have
+been loaded. The @command{compare} command can be used to enable the
+@IndexSubentry{Options, @code{-compare}}
+@IndexSubentry{Commands, @code{compare}}
+comparison of results.
+
+In this section both modes are illustrated with an example.
+
+@c -- A new node --------------------------------------------------------------
+@node Aggregation of Experiments
+@subsection Aggregation of Experiments
+@c ----------------------------------------------------------------------------
+
+If the data for multiple experiments is aggregated, the @DisplayText{} tool
+shows the combined results.
+For example, below is a script that shows the function view for the data
+aggregated over two experiments, drops the first experiment, and then shows
+the function view for the second experiment only.
+We will call this script @file{my-script-agg}.
+
+@cartouche
+@smallexample
+# Define the metrics
+metrics e.%totalcpu
+# Limit the output to 5 lines
+limit 5
+# Get the list with experiments
+experiment_list
+# Get the function overview for all
+functions
+# Drop the first experiment
+drop_exp mxv.2.thr.er
+# Get the function overview for exp #2
+functions
+@end smallexample
+@end cartouche
+
+@IndexSubentry{Options, @code{-experiment_list}}
+@IndexSubentry{Commands, @code{experiment_list}}
+With the exception of the @command{experiment_list} command, all commands
+used have been discussed earlier.
+
+The @command{experiment_list} command provides a list of the experiments
+that have been loaded. This may be used to get the experiment IDs and
+to verify that the correct experiments have been loaded for the aggregation.
+
+@noindent
+Below is an example that loads two experiments and uses the above
+script to display different function views.
+
+@cartouche
+@smallexample
+$ gprofng display text -script my-script-agg mxv.2.thr.er mxv.4.thr.er
+@end smallexample
+@end cartouche
+
+@noindent
+This produces the following output:
+
+@smallexample
+@verbatim
+# Define the metrics
+Current metrics: e.%totalcpu:name
+Current Sort Metric: Exclusive Total CPU Time ( e.%totalcpu )
+# Limit the output to 5 lines
+Print limit set to 5
+# Get the list with experiments
+ID Sel PID Experiment
+== === ======= ============
+ 1 yes 1339450 mxv.2.thr.er
+ 2 yes 3579561 mxv.4.thr.er
+# Get the function overview for all
+Functions sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+20.567 100.00 <Total>
+19.553 95.07 mxv_core
+ 0.474 2.30 init_data
+ 0.198 0.96 erand48_r
+ 0.149 0.72 drand48
+
+# Drop the first experiment
+Experiment mxv.2.thr.er has been dropped
+# Get the function overview for exp #2
+Functions sorted by metric: Exclusive Total CPU Time
+
+Excl. Total Name
+CPU
+ sec. %
+11.104 100.00 <Total>
+10.592 95.39 mxv_core
+ 0.249 2.24 init_data
+ 0.094 0.84 erand48_r
+ 0.082 0.74 drand48
+@end verbatim
+@end smallexample
+
+The first five lines should look familiar. The following five lines echo
+the comment line in the script and show the overview of the experiments.
+This confirms that two experiments have been loaded and that both are active.
+This is followed by the function overview. The timings have been summed
+up and the percentages have been adjusted accordingly.
+
+@c -- A new node --------------------------------------------------------------
+@node Comparison of Experiments
+@subsection Comparison of Experiments
+@c ----------------------------------------------------------------------------
+
+The support for multiple experiments really shines in comparison mode.
+@cindex Compare experiments
+In comparison mode, the data for the various experiments is shown side by
+side, as illustrated below where we compare the results for the multithreaded
+experiments using two and four threads respectively.
+
+This
+feature is controlled through the
+@IndexSubentry{Options, @code{-compare}}
+@IndexSubentry{Commands, @code{compare}}
+@code{compare} command.
+
+The comparison mode is enabled through @command{compare on} and with
+@command{compare off} it is disabled again.
+In addition to @samp{on}, or @samp{off}, this command also supports
+the @samp{delta} and @samp{ratio} keywords.
+
+This is the script that will be used in our example. It sets the comparison
+mode to @samp{on}:
+
+@smallexample
+@verbatim
+# Define the metrics
+metrics e.%totalcpu
+# Limit the output to 5 lines
+limit 5
+# Enable the comparison mode
+compare on
+# Get the function overview
+functions
+@end verbatim
+@end smallexample
+
+Assuming this script file is called @file{my-script-comp}, this is how
+it is used to display the comparison:
+
+@cartouche
+@smallexample
+$ gprofng display text -script my-script-comp mxv.2.thr.er mxv.4.thr.er
+@end smallexample
+@end cartouche
+
+@noindent
+This produces the output shown below. In this mode, the measured values for
+both experiments are shown side by side as absolute numbers and percentages:
+
+@smallexample
+@verbatim
+
+mxv.2.thr.er mxv.4.thr.er
+Excl. Total Excl. Total Name
+CPU CPU
+ sec. % sec. %
+9.464 100.00 11.104 100.00 <Total>
+8.961 94.69 10.592 95.39 mxv_core
+0.224 2.37 0.249 2.24 init_data
+0.105 1.11 0.094 0.84 erand48_r
+0.073 0.77 0.060 0.54 __drand48_iterate
+@end verbatim
+@end smallexample
+
+This table is already helpful to compare (two) profiles more easily, but
+there is more that we can do here.
+
+By default, in comparison mode, all measured values are shown. Since
+profiling is often about comparing performance data, it is sometimes
+more useful to look at differences or ratios, using one
+experiment as the reference.
+
+The values shown are then relative to this reference. For example, if a
+ratio is below one, it means the reference value was higher.
+
+In the example below, we use the same two experiments used in the comparison
+above. The script is also nearly identical. The only change is that we now
+use the @samp{delta} keyword.
+
+As before, the number of lines is restricted to 5 and we focus on
+the exclusive timings plus percentages. For the comparison part we are
+interested in the differences.
+
+This is the script that produces such an overview:
+
+@smallexample
+@verbatim
+# Define the metrics
+metrics e.%totalcpu
+# Limit the output to 5 lines
+limit 5
+# Set the comparison mode to differences
+compare delta
+# Get the function overview
+functions
+@end verbatim
+@end smallexample
+
+Assuming this script file is called @file{my-script-comp2}, this is how we
+get the table displayed on our screen:
+
+@cartouche
+@smallexample
+$ gprofng display text -script my-script-comp2 mxv.2.thr.er mxv.4.thr.er
+@end smallexample
+@end cartouche
+
+Leaving out some of the lines printed, which we have seen before, we get
+the following table:
+
+@smallexample
+@verbatim
+mxv.2.thr.er mxv.4.thr.er
+Excl. Total Excl. Total Name
+CPU CPU
+ sec. % delta %
+9.464 100.00 +1.640 100.00 <Total>
+8.961 94.69 +1.631 95.39 mxv_core
+0.224 2.37 +0.025 2.24 init_data
+0.105 1.11 -0.011 0.84 erand48_r
+0.073 0.77 -0.013 0.54 __drand48_iterate
+@end verbatim
+@end smallexample
+
+It is now easier to see that the CPU times for the most time consuming
+functions in this code are practically the same.
+
+It is also possible to show ratios through the @command{compare ratio}
+@IndexSubentry{Options, @code{-compare}}
+@IndexSubentry{Commands, @code{compare}}
+command. The first column is used as the reference, and the values in
+the other metric columns are derived by dividing each value by
+the reference. The result of such a comparison is shown below:
+
+@smallexample
+@verbatim
+mxv.2.thr.er mxv.4.thr.er
+Excl. Total Excl. Total CPU Name
+CPU
+ sec. % ratio %
+9.464 100.00 x 1.173 100.00 <Total>
+8.961 94.69 x 1.182 95.39 mxv_core
+0.224 2.37 x 1.111 2.24 init_data
+0.105 1.11 x 0.895 0.84 erand48_r
+0.073 0.77 x 0.822 0.54 __drand48_iterate
+@end verbatim
+@end smallexample
+
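+A table like the one above could, for example, also be produced directly
+from the command line. The sketch below assumes the same two experiments
+and display settings as before:
+
+@cartouche
+@smallexample
+$ gprofng display text -metrics e.%totalcpu -limit 5 -compare ratio \
+    -functions mxv.2.thr.er mxv.4.thr.er
+@end smallexample
+@end cartouche
+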
+Note that the comparison feature is supported at the function, source, and
+disassembly level. There is no practical limit on the number of experiments
+that can be used in a comparison.
+
+@c -- A new node --------------------------------------------------------------
+@node Profile Hardware Event Counters
+@section Profile Hardware Event Counters
+@c ----------------------------------------------------------------------------
+
+Many processors provide a set of hardware event counters and @ToolName{}
+provides support for this feature.
+@xref{Hardware Event Counters Explained}, for readers who are not
+familiar with such counters and would like to learn more.
+
+In this section we explain how to get the details on the event counter
+support for the processor used in the experiment(s), and show several
+examples.
+
+@c -- A new node --------------------------------------------------------------
+@node Getting Information on the Counters Supported
+@subsection Getting Information on the Counters Supported
+@c ----------------------------------------------------------------------------
+
+The first step is to check if the processor used for the experiments is
+supported by @ToolName{}.
+@IndexSubentry{Options, @code{-h}}
+The @code{-h} option on @CollectApp{} will show the event counter
+information:
+
+@cartouche
+@smallexample
+$ gprofng collect app -h
+@end smallexample
+@end cartouche
+
+If the counters are supported, a list with the events is printed.
+Otherwise, a warning message is issued.
+
+For example, below we show this command and the output on an Intel Xeon
+Platinum 8167M (aka ``Skylake'') processor. The output has been split
+into several sections and each section is commented upon separately.
+
+@smallexample
+@verbatim
+Run "gprofng collect app --help" for a usage message.
+
+Specifying HW counters on `Intel Arch PerfMon v2 on Family 6 Model 85'
+(cpuver=2499):
+
+ -h {auto|lo|on|hi}
+ turn on default set of HW counters at the specified rate
+ -h <ctr_def> [-h <ctr_def>]...
+ -h <ctr_def>[,<ctr_def>]...
+ specify HW counter profiling for up to 4 HW counters
+@end verbatim
+@end smallexample
+
+The first line shows how to get a usage overview. This is followed by
+some information on the target processor.
+The next five lines explain in what ways the @code{-h} option can be
+used to define the events to be monitored.
+
+The first version shown above enables a default set of counters. This
+default depends on the processor this command is executed on. The
+keyword following the @code{-h} option defines the sampling rate:
+
+@table @code
+
+@item auto
+Match the sample rate used by clock profiling. If the latter is disabled,
+use a per thread sampling rate of approximately 100 samples per second.
+This is the default and preferred setting.
+
+@item on
+Use a per thread sampling rate of approximately 100 samples per second.
+
+@item lo
+Use a per thread sampling rate of approximately 10 samples per second.
+
+@item hi
+Use a per thread sampling rate of approximately 1000 samples per second.
+
+@end table
+
+The second and third variants define the events to be monitored. Note
+that the number of simultaneous events supported is printed. In this
+case, we can monitor four events in a single profiling job.
+
+It is a matter of preference whether you use the @code{-h}
+option for each event, or use it once, followed by a comma-separated
+list.
+
+There is one slight catch though. The counter definition below has a
+mandatory comma (@code{,}) between the event and the rate. While a
+default can be used for the rate, the comma cannot be omitted.
+This may result in a somewhat awkward counter definition in case
+the default sampling rate is used.
+
+For example, the following two commands are equivalent. Note
+the double comma in the second command. This is not a typo.
+
+@cartouche
+@smallexample
+$ gprofng collect app -h cycles -h insts ...
+$ gprofng collect app -h cycles,,insts ...
+@end smallexample
+@end cartouche
+
+In the first command this comma is not needed, because a
+comma (``@code{,}'') immediately followed by white space may
+be omitted.
+
+This is why we prefer this syntax, and in the remainder we will
+use the first version of this command.
+
+@IndexSubentry{Hardware event counters, counter definition}
+The counter definition takes an event name, plus optionally one or
+more attributes, followed by a comma, and optionally the sampling rate.
+The output section below shows the formal definition.
+
+@cartouche
+@smallexample
+ <ctr_def> == <ctr>[[~<attr>=<val>]...],[<rate>]
+@end smallexample
+@end cartouche
+
+The printed help then explains this syntax. Below we have summarized
+and expanded this output:
+
+@table @code
+
+@item @var{<ctr>}
+The counter name must be selected from the available counters listed
+as part of the output printed with the @code{-h} option.
+On most systems, if a counter is not listed, it may still be specified
+by its numeric value.
+
+@item @var{~<attr>=<val>}
+This is an optional attribute that depends on the processor. The list
+of supported attributes is printed in the output. Examples of
+attributes are ``user'', or ``system''. The value can be given in decimal
+or hexadecimal format.
+Multiple attributes may be specified, and each must be preceded
+by a tilde (@code{~}). An example with an attribute is given at the end of
+this section.
+
+@item @var{<rate>}
+
+The sampling rate is one of the following:
+
+@table @code
+
+@item auto
+This is the default and matches the rate used by clock profiling.
+If clock profiling is disabled, use @samp{on}.
+
+@item on
+Set the per thread maximum sampling rate to ~100 samples/second
+
+@item lo
+Set the per thread maximum sampling rate to ~10 samples/second
+
+@item hi
+Set the per thread maximum sampling rate to ~1000 samples/second
+
+@item @var{<interval>}
+Define the sampling interval.
+@xref{Control the Sampling Frequency}, for how to define this.
+
+@end table
+
+@end table
+
+After the section with the formal definition of events and counters, a
+processor-specific list is displayed. This part starts with an overview
+of the default set of counters and the aliased names supported
+@emph{on this specific processor}.
+
+@smallexample
+@verbatim
+Default set of HW counters:
+
+ -h cycles,,insts,,llm
+
+Aliases for most useful HW counters:
+
+ alias raw name type units regs description
+
+ cycles unhalted-core-cycles CPU-cycles 0123 CPU Cycles
+ insts instruction-retired events 0123 Instructions Executed
+ llm llc-misses events 0123 Last-Level Cache Misses
+ br_msp branch-misses-retired events 0123 Branch Mispredict
+ br_ins branch-instruction-retired events 0123 Branch Instructions
+@end verbatim
+@end smallexample
+
+@noindent
+The definitions given above may or may not be available on other processors.
+
+The table above shows the default set of counters defined for this processor,
+and the aliases. For each alias the full ``raw'' name is given, plus the
+unit of the number returned by the counter (CPU cycles, or a raw count),
+the hardware counter the event is allowed to be mapped onto, and a short
+description.
+
+The last part of the output contains all the events that can be monitored:
+
+@smallexample
+@verbatim
+Raw HW counters:
+
+ name type units regs description
+
+ unhalted-core-cycles CPU-cycles 0123
+ unhalted-reference-cycles events 0123
+ instruction-retired events 0123
+ llc-reference events 0123
+ llc-misses events 0123
+ branch-instruction-retired events 0123
+ branch-misses-retired events 0123
+ ld_blocks.store_forward events 0123
+ ld_blocks.no_sr events 0123
+ ld_blocks_partial.address_alias events 0123
+ dtlb_load_misses.miss_causes_a_walk events 0123
+ dtlb_load_misses.walk_completed_4k events 0123
+
+ <many lines deleted>
+
+ l2_lines_out.silent events 0123
+ l2_lines_out.non_silent events 0123
+ l2_lines_out.useless_hwpf events 0123
+ sq_misc.split_lock events 0123
+@end verbatim
+@end smallexample
+
+As can be seen, these names are not always easy to correlate to a specific
+event of interest. The processor manual should provide more clarity on this.
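+
+To close this section, below is a sketch of a counter definition that
+combines an alias, an attribute, and an explicit rate. Whether the
+@code{user} attribute is available, and what it means exactly, depends on
+the processor, so this merely illustrates the syntax:
+
+@cartouche
+@smallexample
+$ gprofng collect app -h cycles~user=1,hi ./mxv-pthreads -m 8000 -n 4000 -t 2
+@end smallexample
+@end cartouche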
+
+@c -- A new node --------------------------------------------------------------
+@node Examples Using Hardware Event Counters
+@subsection Examples Using Hardware Event Counters
+@c ----------------------------------------------------------------------------
+
+The previous section may give the impression that these counters are hard to
+use, but as we will show now, in practice it is quite simple.
+
+With the information from the @code{-h} option, we can easily set up our first
+event counter experiment.
+
+We start by using the default set of counters defined for our processor and we
+use 2 threads:
+
+@cartouche
+@smallexample
+$ exe=mxv-pthreads
+$ m=8000
+$ n=4000
+$ exp=mxv.hwc.def.2.thr.er
+$ gprofng collect app -O $exp -h auto ./$exe -m $m -n $n -t 2
+@end smallexample
+@end cartouche
+
+@IndexSubentry{Options, @code{-h}}
+@IndexSubentry{Hardware event counters, @code{auto} option}
+The new option here is @code{-h auto}. The @code{auto} keyword enables
+hardware event counter profiling and selects the default set of counters
+defined for this processor.
+
+As before, we can display the information, but there is one practical hurdle
+to take. Unless we want to view all the metrics recorded, we would need to
+know the names of the events that have been enabled. This is tedious and also
+not portable in case we would like to repeat this experiment on another
+processor.
+
+@IndexSubentry{Hardware event counters, @code{hwc} metric}
+This is where the special @code{hwc} metric comes in very handy. It
+automatically expands to the active set of events used.
+
+With this, it is very easy to display the event counter values. Note that
+although the regular clock based profiling was enabled, we only want to see
+the counter values. We also request to see the percentages and limit the
+output to the first 5 lines:
+
+@cartouche
+@smallexample
+$ exp=mxv.hwc.def.2.thr.er
+$ gprofng display text -metrics e.%hwc -limit 5 -functions $exp
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Current metrics: e.%cycles:e+%insts:e+%llm:name
+Current Sort Metric: Exclusive CPU Cycles ( e.%cycles )
+Print limit set to 5
+Functions sorted by metric: Exclusive CPU Cycles
+
+Excl. CPU Excl. Instructions Excl. Last-Level Name
+Cycles Executed Cache Misses
+ sec. % % %
+2.691 100.00 7906475309 100.00 122658983 100.00 <Total>
+2.598 96.54 7432724378 94.01 121745696 99.26 mxv_core
+0.035 1.31 188860269 2.39 70084 0.06 erand48_r
+0.026 0.95 73623396 0.93 763116 0.62 init_data
+0.018 0.66 76824434 0.97 40040 0.03 drand48
+@end verbatim
+@end smallexample
+
+As we have seen before, the first few lines echo the settings.
+This includes a list with the hardware event counters used by
+default.
+
+The table that follows makes it very easy to get an overview of where the
+time is spent and how many of the target events have occurred.
+
+As before, we can drill down deeper and see the same metrics at the source
+line and instruction level. Other than using @code{hwc} in the metrics
+definitions, nothing has changed compared to the previous examples:
+
+@cartouche
+@smallexample
+$ exp=mxv.hwc.def.2.thr.er
+$ gprofng display text -metrics e.hwc -source mxv_core $exp
+@end smallexample
+@end cartouche
+
+This is the relevant part of the output. Since the lines get very long,
+we have somewhat modified the lay-out:
+
+@smallexample
+@verbatim
+ Excl. CPU Excl. Excl.
+ Cycles Instructions Last-Level
+ sec. Executed Cache Misses
+ <Function: mxv_core>
+ 0. 0 0 32. void __attribute__ ((noinline))
+ mxv_core(...)
+ 0. 0 0 33. {
+ 0. 0 0 34. for (uint64_t i=...) {
+ 0. 0 0 35. double row_sum = 0.0;
+## 1.872 7291879319 88150571 36. for (int64_t j=0; j<n; j++)
+ 0.725 140845059 33595125 37. row_sum += A[i][j]*b[j];
+ 0. 0 0 38. c[i] = row_sum;
+ 39. }
+ 0. 0 0 40. }
+@end verbatim
+@end smallexample
+
+In a similar way we can display the event counter values at the instruction
+level. Again we have modified the lay-out due to page width limitations:
+
+@cartouche
+@smallexample
+$ exp=mxv.hwc.def.2.thr.er
+$ gprofng display text -metrics e.hwc -disasm mxv_core $exp
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+ Excl. CPU Excl. Excl.
+ Cycles Instructions Last-Level
+ sec. Executed Cache Misses
+ <Function: mxv_core>
+ 0. 0 0 [33] 4021ba: mov 0x8(%rsp),%r10
+ 34. for (uint64_t i=...) {
+ 0. 0 0 [34] 4021bf: cmp %rsi,%rdi
+ 0. 0 0 [34] 4021c2: jbe 0x37
+ 0. 0 0 [34] 4021c4: ret
+ 35. double row_sum = 0.0;
+ 36. for (int64_t j=0; j<n; j++)
+ 37. row_sum += A[i][j]*b[j];
+ 0. 0 0 [37] 4021c5: mov (%r8,%rdi,8),%rdx
+ 0. 0 0 [36] 4021c9: mov $0x0,%eax
+ 0. 0 0 [35] 4021ce: pxor %xmm1,%xmm1
+ 0.002 12804230 321394 [37] 4021d2: movsd (%rdx,%rax,8),%xmm0
+ 0.141 60819025 3866677 [37] 4021d7: mulsd (%r9,%rax,8),%xmm0
+ 0.582 67221804 29407054 [37] 4021dd: addsd %xmm0,%xmm1
+## 1.871 7279075109 87989870 [36] 4021e1: add $0x1,%rax
+ 0.002 12804210 80351 [36] 4021e5: cmp %rax,%rcx
+ 0. 0 0 [36] 4021e8: jne 0xffffffffffffffea
+ 38. c[i] = row_sum;
+ 0. 0 0 [38] 4021ea: movsd %xmm1,(%r10,%rdi,8)
+ 0. 0 0 [34] 4021f0: add $0x1,%rdi
+ 0. 0 0 [34] 4021f4: cmp %rdi,%rsi
+ 0. 0 0 [34] 4021f7: jb 0xd
+ 0. 0 0 [35] 4021f9: pxor %xmm1,%xmm1
+ 0. 0 0 [36] 4021fd: test %rcx,%rcx
+ 0. 0 80350 [36] 402200: jne 0xffffffffffffffc5
+ 0. 0 0 [36] 402202: jmp 0xffffffffffffffe8
+ 39. }
+ 40. }
+ 0. 0 0 [40] 402204: ret
+@end verbatim
+@end smallexample
+
+So far we have used the default settings for the event counters. It is
+quite straightforward to select specific counters. For the sake of the
+example, let's assume we would like to count the number of branch
+instructions executed, as well as the number of retired memory load
+instructions that missed in the L1 cache. We also want to count these
+events with a high resolution.
+
+This is the command to do so:
+
+@cartouche
+@smallexample
+$ exe=mxv-pthreads
+$ m=8000
+$ n=4000
+$ exp=mxv.hwc.sel.2.thr.er
+$ hwc1=br_ins,hi
+$ hwc2=mem_load_retired.l1_miss,hi
+$ gprofng collect app -O $exp -h $hwc1 -h $hwc2 $exe -m $m -n $n -t 2
+@end smallexample
+@end cartouche
+
+As before, we get a table with the event counts. Due to the very
+long name for the second counter, we have somewhat modified the
+output.
+
+@cartouche
+@smallexample
+$ gprofng display text -limit 10 -functions mxv.hwc.sel.2.thr.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Functions sorted by metric: Exclusive Total CPU Time
+Excl. Incl. Excl. Branch Excl. Name
+Total Total Instructions mem_load_retired.l1_miss
+CPU sec. CPU sec. Events
+2.597 2.597 1305305319 4021340 <Total>
+2.481 2.481 1233233242 3982327 mxv_core
+0.040 0.107 19019012 9003 init_data
+0.028 0.052 23023048 15006 erand48_r
+0.024 0.024 19019008 9004 __drand48_iterate
+0.015 0.067 11011009 2998 drand48
+0.008 0.010 0 3002 _int_malloc
+0.001 0.001 0 0 brk
+0.001 0.002 0 0 sysmalloc
+0. 0.001 0 0 __default_morecore
+@end verbatim
+@end smallexample
+
+@IndexSubentry{Options, @code{-compare}}
+@IndexSubentry{Commands, @code{compare}}
+When using event counters, the values can be very large and it is not easy
+to compare the numbers. As we will show next, the @code{ratio} feature is
+very useful when comparing such profiles.
+
+To demonstrate this, we have set up another event counter experiment where
+we would like to compare the number of last-level cache misses and the number
+of branch instructions executed when using one thread, or two threads.
+
+These are the commands used to generate the experiment directories:
+
+@cartouche
+@smallexample
+$ exe=./mxv-pthreads
+$ m=8000
+$ n=4000
+$ exp1=mxv.hwc.comp.1.thr.er
+$ exp2=mxv.hwc.comp.2.thr.er
+$ gprofng collect app -O $exp1 -h llm -h br_ins $exe -m $m -n $n -t 1
+$ gprofng collect app -O $exp2 -h llm -h br_ins $exe -m $m -n $n -t 2
+@end smallexample
+@end cartouche
+
+The following script has been used to get the tables. Due to lay-out
+restrictions, we have to create two tables, one for each counter.
+
+@cartouche
+@smallexample
+# Limit the output to 5 lines
+limit 5
+# Define the metrics
+metrics name:e.llm
+# Set the comparison to ratio
+compare ratio
+functions
+# Define the metrics
+metrics name:e.br_ins
+# Set the comparison to ratio
+compare ratio
+functions
+@end smallexample
+@end cartouche
+
+Note that we print the name of the function first, followed by the counter
+data.
+The new element is that we set the comparison mode to @code{ratio}. This
+divides the data in a column by its counterpart in the reference experiment.
+
+This is the command using this script and the two experiment directories as
+input:
+
+@cartouche
+@smallexample
+$ gprofng display text -script my-script-comp-counters \
+ mxv.hwc.comp.1.thr.er \
+ mxv.hwc.comp.2.thr.er
+@end smallexample
+@end cartouche
+
+By design, we get two tables, one for each counter:
+
+@smallexample
+@verbatim
+Functions sorted by metric: Exclusive Last-Level Cache Misses
+
+ mxv.hwc.comp.1.thr.er mxv.hwc.comp.2.thr.er
+Name Excl. Last-Level Excl. Last-Level
+ Cache Misses Cache Misses
+ ratio
+ <Total> 122709276 x 0.788
+ mxv_core 121796001 x 0.787
+ init_data 723064 x 1.055
+ erand48_r 100111 x 0.500
+ drand48 60065 x 1.167
+
+Functions sorted by metric: Exclusive Branch Instructions
+
+ mxv.hwc.comp.1.thr.er mxv.hwc.comp.2.thr.er
+Name Excl. Branch Excl. Branch
+ Instructions Instructions
+ ratio
+ <Total> 1307307316 x 0.997
+ mxv_core 1235235239 x 0.997
+ erand48_r 23023033 x 0.957
+ drand48 20020009 x 0.600
+ __drand48_iterate 17017028 x 0.882
+@end verbatim
+@end smallexample
+
+A ratio less than one in the second column means that this counter
+value was smaller than the value from the reference experiment shown
+in the first column.
+
+This kind of presentation of the results makes it much easier to
+quickly interpret the data.
+
+We conclude this section with thread-level event counter overviews,
+but before we go into this, there is an important metric we need to
+mention.
+
+@c -- TBD Explain <Total> for IPC
+
+@IndexSubentry{Hardware event counters, IPC}
+In case it is known how many instructions and CPU cycles have been executed,
+the value for the IPC (``Instructions Per Clockcycle'') can be computed.
+@xref{Hardware Event Counters Explained}.
+This is a derived metric that gives an indication of how well the processor
+is utilized. The inverse of the IPC is called CPI.
+@IndexSubentry{Hardware event counters, CPI}
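+
+As a small numerical sketch of this definition, we can use the numbers for
+@code{mxv_core} from the function overview shown earlier. Assuming, purely
+for illustration, a constant clock rate of 2 GHz, the computation looks as
+follows:
+
+@smallexample
+@verbatim
+cycles = 2.598 s * 2*10^9 cycles/s ~ 5.2*10^9 cycles
+IPC    = 7432724378 instructions / 5.2*10^9 cycles ~ 1.43
+CPI    = 1 / IPC ~ 0.70
+@end verbatim
+@end smallexample
+
+@noindent
+These values are indeed close to the IPC and CPI numbers reported by
+@DisplayText{} for this function in the tables shown below.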
+
+The @DisplayText{} command automatically computes the IPC and CPI values
+if an experiment contains the event counter values for the instructions
+and CPU cycles executed. These are part of the metric list and can be
+displayed, just like any other metric.
+
+@IndexSubentry{Options, @code{-metric_list}}
+@IndexSubentry{Commands, @code{metric_list}}
+This can be verified through the @command{metric_list} command. If we go
+back to our earlier experiment with the default event counters, we get
+the following result.
+
+@cartouche
+@smallexample
+$ gprofng display text -metric_list mxv.hwc.def.2.thr.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Current metrics: e.totalcpu:i.totalcpu:e.cycles:e+insts:e+llm:name
+Current Sort Metric: Exclusive Total CPU Time ( e.totalcpu )
+Available metrics:
+ Exclusive Total CPU Time: e.%totalcpu
+ Inclusive Total CPU Time: i.%totalcpu
+ Exclusive CPU Cycles: e.+%cycles
+ Inclusive CPU Cycles: i.+%cycles
+ Exclusive Instructions Executed: e+%insts
+ Inclusive Instructions Executed: i+%insts
+Exclusive Last-Level Cache Misses: e+%llm
+Inclusive Last-Level Cache Misses: i+%llm
+ Exclusive Instructions Per Cycle: e+IPC
+ Inclusive Instructions Per Cycle: i+IPC
+ Exclusive Cycles Per Instruction: e+CPI
+ Inclusive Cycles Per Instruction: i+CPI
+ Size: size
+ PC Address: address
+ Name: name
+@end verbatim
+@end smallexample
+
+Among the other metrics, we see the new metrics for the IPC and CPI
+listed.
+
+In the script below, we use this information and add the IPC and CPI
+to the metrics to be displayed. We also use the thread filter to
+display these values for the individual threads.
+
+This is the complete script we have used. Other than a different selection
+of the metrics, there are no new features.
+
+@cartouche
+@smallexample
+# Define the metrics
+metrics e.insts:e.%cycles:e.IPC:e.CPI
+# Sort with respect to cycles
+sort e.cycles
+# Limit the output to 5 lines
+limit 5
+# Get the function overview for all threads
+functions
+# Get the function overview for thread 1
+thread_select 1
+functions
+# Get the function overview for thread 2
+thread_select 2
+functions
+# Get the function overview for thread 3
+thread_select 3
+functions
+@end smallexample
+@end cartouche
+
+In the metrics definition on the second line, we explicitly request the
+counter values for the instructions (@code{e.insts}) and CPU cycles
+(@code{e.cycles}) executed. These names can be found in output from the
+@IndexSubentry{Options, @code{-metric_list}}
+@IndexSubentry{Commands, @code{metric_list}}
+@command{metric_list} command above.
+In addition to these metrics, we also request the IPC and CPI to be shown.
+
+@IndexSubentry{Options, @code{-limit}}
+@IndexSubentry{Commands, @code{limit}}
+As before, we use the @command{limit} command to control the number of
+functions displayed. We then request an overview for all the threads,
+followed by three pairs of commands that select a thread and display the
+function overview for it.
+
+The script above is used as follows:
+
+@cartouche
+@smallexample
+$ gprofng display text -script my-script-ipc mxv.hwc.def.2.thr.er
+@end smallexample
+@end cartouche
+
+@noindent
+This script produces four tables. We list them separately below,
+and have left out the additional output.
+
+@noindent
+The first table shows the accumulated values across the three
+threads that have been active.
+
+@smallexample
+@verbatim
+Functions sorted by metric: Exclusive CPU Cycles
+
+Excl. Excl. CPU Excl. Excl. Name
+Instructions Cycles IPC CPI
+Executed sec. %
+7906475309 2.691 100.00 1.473 0.679 <Total>
+7432724378 2.598 96.54 1.434 0.697 mxv_core
+ 188860269 0.035 1.31 2.682 0.373 erand48_r
+ 73623396 0.026 0.95 1.438 0.696 init_data
+ 76824434 0.018 0.66 2.182 0.458 drand48
+@end verbatim
+@end smallexample
+
+@noindent
+This shows that the IPC of this program is completely dominated
+by function @code{mxv_core}. It has a fairly low IPC value
+of 1.43.
+
+@noindent
+The next table is for thread 1 and shows the values for the
+main thread.
+
+@smallexample
+@verbatim
+Exp Sel Total
+=== === =====
+ 1 1 3
+Functions sorted by metric: Exclusive CPU Cycles
+
+Excl. Excl. CPU Excl. Excl. Name
+Instructions Cycles IPC CPI
+Executed sec. %
+473750931 0.093 100.00 2.552 0.392 <Total>
+188860269 0.035 37.93 2.682 0.373 erand48_r
+ 73623396 0.026 27.59 1.438 0.696 init_data
+ 76824434 0.018 18.97 2.182 0.458 drand48
+134442832 0.013 13.79 5.250 0.190 __drand48_iterate
+@end verbatim
+@end smallexample
+
+@noindent
+Although this thread hardly uses any CPU cycles, the overall IPC
+of 2.55 is not all that bad.
+
+@noindent
+Last, we show the tables for threads 2 and 3:
+
+@smallexample
+@verbatim
+Exp Sel Total
+=== === =====
+ 1 2 3
+Functions sorted by metric: Exclusive CPU Cycles
+
+Excl. Excl. CPU Excl. Excl. Name
+Instructions Cycles IPC CPI
+Executed sec. %
+3716362189 1.298 100.00 1.435 0.697 <Total>
+3716362189 1.298 100.00 1.435 0.697 mxv_core
+ 0 0. 0. 0. 0. collector_root
+ 0 0. 0. 0. 0. driver_mxv
+
+Exp Sel Total
+=== === =====
+ 1 3 3
+Functions sorted by metric: Exclusive CPU Cycles
+
+Excl. Excl. CPU Excl. Excl. Name
+Instructions Cycles IPC CPI
+Executed sec. %
+3716362189 1.300 100.00 1.433 0.698 <Total>
+3716362189 1.300 100.00 1.433 0.698 mxv_core
+ 0 0. 0. 0. 0. collector_root
+ 0 0. 0. 0. 0. driver_mxv
+@end verbatim
+@end smallexample
+
+Both threads execute the same number of instructions and take about
+the same number of CPU cycles. As a result, the IPC is practically the
+same for both threads.
+
+@c -- A new node --------------------------------------------------------------
+@c TBD @node Additional Features
+@c TBD @section Additional Features
+@c ----------------------------------------------------------------------------
+
+@c -- A new node --------------------------------------------------------------
+@c TBD @node More Filtering Capabilities
+@c TBD @subsection More Filtering Capabilities
+@c ----------------------------------------------------------------------------
+
+@c TBD Cover @code{samples} and @code{seconds}
+
+@c -- A new node --------------------------------------------------------------
+@node Java Profiling
+@section Java Profiling
+@c ----------------------------------------------------------------------------
+
+@IndexSubentry{Options, @code{-j}}
+@IndexSubentry{Java profiling, @code{-j on/off}}
+The @CollectApp{} command supports Java profiling. The @code{-j on} option
+can be used for this, but since this feature is enabled by default, there is
+no need to set this explicitly. Java profiling may be disabled through the
+@code{-j off} option.
+
+The program is compiled as usual and the experiment directory is created
+in the same way as we have seen before. The only difference with a C/C++
+application is that the program has to be explicitly executed by
+@command{java}.
+
+For example, this is how to generate the experiment data for a Java
+program that has the source code stored in file @code{Pi.java}:
+
+@cartouche
+@smallexample
+$ javac Pi.java
+$ gprofng collect app -j on -O pi.demo.er java Pi < pi.in
+@end smallexample
+@end cartouche
+
+Regarding which java is selected to generate the data, @ToolName{}
+first looks for the JDK in the path set in either the
+@IndexSubentry{Java profiling, @code{JDK_HOME}}
+@code{JDK_HOME} environment variable, or in the
+@IndexSubentry{Java profiling, @code{JAVA_PATH}}
+@code{JAVA_PATH} environment variable. If neither of these variables is
+set, it checks for a JDK in the search path (set in the @code{PATH}
+environment variable). If there is no JDK in this path, it checks for
+the @command{java} executable in @code{/usr/java/bin/java}.
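+
+For example, to make sure that a specific JDK is used for the data
+collection, the @code{JDK_HOME} environment variable may be set first.
+The installation path used below is hypothetical:
+
+@cartouche
+@smallexample
+$ export JDK_HOME=/home/demo/jdk      # A hypothetical JDK installation
+$ gprofng collect app -j on -O pi.demo.er java Pi < pi.in
+@end smallexample
+@end cartouche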
+
+In case additional options need to be passed on to the JVM, the
+@IndexSubentry{Options, @code{-J}}
+@IndexSubentry{Java profiling, @code{-J <string>}}
+@code{-J <string>} option can be used. The string with the
+option(s) has to be delimited by quotation marks in case
+there is more than one argument.
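+
+For example, the following hypothetical command passes two standard JVM
+options to increase the Java heap size while the data is collected. Since
+there is more than one argument, the string is quoted:
+
+@cartouche
+@smallexample
+$ gprofng collect app -O pi.demo.er -J "-Xms512m -Xmx2g" java Pi < pi.in
+@end smallexample
+@end cartouche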
+
+The @DisplayText{} command may be used to view the performance data. There is
+no need for any special options and the same commands as previously discussed
+are supported.
+
+@IndexSubentry{Options, @code{-viewmode}}
+@IndexSubentry{Commands, @code{viewmode}}
+@IndexSubentry{Java profiling, different view modes}
+The @code{viewmode} command (@xref{The Viewmode}) is very useful to examine
+the call stacks.
+
+For example, this is how one can see the native call stacks. For
+lay-out purposes we have restricted the list to the first five entries:
+
+@cartouche
+@smallexample
+$ gprofng display text -limit 5 -viewmode machine -calltree pi.demo.er
+@end smallexample
+@end cartouche
+
+@smallexample
+@verbatim
+Print limit set to 5
+Viewmode set to machine
+Functions Call Tree. Metric: Attributed Total CPU Time
+
+Attr. Name
+Total
+CPU sec.
+1.381 +-<Total>
+1.171 +-Pi.calculatePi(double)
+0.110 +-collector_root
+0.110 | +-JavaMain
+0.070 | +-jni_CallStaticVoidMethod
+@end verbatim
+@end smallexample
+
+@noindent
+Note that the selection of the viewmode is echoed in the output.
+
+@c -- A new node --------------------------------------------------------------
+@node The gprofng Tools
+@chapter The gprofng Tools
+@c ----------------------------------------------------------------------------
+
+Several tools are included in @ProductName{}. In subsequent chapters these
+are discussed in detail. Below a brief description is given, followed by an
+overview of the environment variables that are supported.
+
+@c -- A new node --------------------------------------------------------------
+@node Tools Overview
+@section Tools Overview
+@c ----------------------------------------------------------------------------
+
+The following tools are supported by @ProductName{}:
+
+@table @code
+
+@item @CollectApp{}
+@IndexSubentry{@code{gprofng}, @code{collect app}}
+
+Collects the performance data and stores the results in an experiment
+directory. There are many options on this tool, but quite often the
+defaults are sufficient.
+An experiment directory is required for the subsequent analysis of
+the results.
+
+@item @DisplayText{}
+@IndexSubentry{@code{gprofng}, @code{display text}}
+Generates performance reports in ASCII format. Command-line options and/or
+commands in a script file are used to control the contents and lay-out of
+the generated report(s).
+
+@item @DisplayHTML{}
+@IndexSubentry{@code{gprofng}, @code{display html}}
+Takes one or more experiment directories and generates a directory with
+HTML files. Starting from the @file{index.html} file, the performance data
+may be examined in a browser.
+
+@item @DisplaySRC{}
+@IndexSubentry{@code{gprofng}, @code{display src}}
+Displays the source code, interleaved with the disassembled instructions.
+
+@item @Archive{}
+@IndexSubentry{@code{gprofng}, @code{archive}}
+Archives an experiment directory by (optionally) including source code and
+object files, as well as the shared libraries that have been used.
+
+@end table
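+
+As a sketch of how these tools typically work together, the hypothetical
+session below collects an experiment, prints the function overview, and
+archives the result. The executable and experiment names are placeholders:
+
+@cartouche
+@smallexample
+$ gprofng collect app -O example.er ./a.out
+$ gprofng display text -functions example.er
+$ gprofng archive example.er
+@end smallexample
+@end cartouche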
+
+@c -- A new section -----------------------------------------------------------
+@node The gprofng.rc file with default settings
+@section The gprofng.rc file with default settings
+@c ----------------------------------------------------------------------------
+The @file{gprofng.rc}
+@cindex gprofng.rc
+file is used to define default settings for the @DisplayText{} and
+@DisplaySRC{} tools, but the user can override these defaults through local
+configuration files.
+
+There are three files that are checked when the tool starts up. The first
+file has pre-defined settings and comes with the installation, but through
+a hidden file called @file{.gprofng.rc}, the user can (re)define the defaults.
+
+These are the locations and files that are checked upon starting the
+above-mentioned tools:
+
+@enumerate
+
+@item
+The system-wide filename is called @file{gprofng.rc} and is located in
+the top level @file{/etc} directory.
+
+If @ProductName{} has been built from the source, this file is in
+subdirectory @file{etc} in the top level installation directory.
+
+@item
+The user's home directory may have a hidden file called @file{.gprofng.rc}.
+
+@item
+The directory where @DisplayText{} (or @DisplaySRC{}) is invoked from may
+have a hidden file called @file{.gprofng.rc}.
+
+@end enumerate
+
+The settings of each file override the settings of the file(s) read before it.
+Defaults in the system-wide file are overruled by the file in the user home
+directory (if any) and any settings in the @file{.gprofng.rc} file in the
+current directory override those.
+
+Note that the settings in these files only affect the defaults. Unlike
+the commands used in a script file, they are not commands for the tools.
+
+@c -- TBD indxobj_define,
+@c -- TBD object_show,
+@c -- TBD object_hide,
+@c -- TBD object_api,
+
+The @file{.gprofng.rc} configuration files can contain the
+@command{addpath},
+@command{compare},
+@command{dthresh},
+@command{name},
+@command{pathmap},
+@command{printmode},
+@command{sthresh},
+and
+@command{viewmode}
+commands as described in this user guide.
+
+They can also contain the following commands, @emph{which cannot be used on
+either the command line, or in a script file}:
+
+@table @code
+
+@item dmetrics @var{metric-spec}
+@IndexSubentry{Commands, @code{dmetrics}}
+
+Specify the default metrics to be displayed or printed in the function list.
+The syntax and use of the metric list is described in section
+@ref{Metric Definitions}.
+The order of the metric keywords in the list determines the order in which
+the metrics are presented.
+
+Default metrics for the @code{callers-callees} list are derived from the
+function list default metrics by adding the corresponding attributed metric
+before the first occurrence of each metric name in the list.
+
+@item dsort @var{metric-spec}
+@IndexSubentry{Commands, @code{dsort}}
+
+Specify the default metric by which the function list is sorted. The sort
+metric is the first metric in this list that matches a metric in any loaded
+experiment, subject to the following conditions:
+
+@itemize @bullet
+
+@item
+If the entry in @var{metric-spec} has a visibility string of an exclamation
+point (@samp{!}), the first metric whose name matches is used, regardless of
+whether it is visible.
+
+@item
+If the entry in @var{metric-spec} has any other visibility string, the first
+visible metric whose name matches is used.
+
+@end itemize
+
+The syntax and use of the metric list is described in section
+@ref{Metric Definitions}.
+The default sort metric for the @code{callers-callees} list is the attributed
+metric corresponding to the default sort metric for the function list.
+
+@item en_desc @{on | off | =@var{regex}@}
+@IndexSubentry{Commands, @code{en_desc}}
+
+Set the mode for reading descendant experiments to @samp{on} (enable all
+descendants) or @samp{off} to disable all descendants. If
+@samp{=}@var{regex} is used, enable data from those experiments whose
+executable name matches the regular expression.
+
+The default setting is @samp{on} to follow all descendants. In reading
+experiments with descendants, any sub-experiments that contain little or
+no performance data are ignored by @DisplayText{}.
+
+@end table
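+
+As a hypothetical illustration of these settings, a @file{.gprofng.rc} file
+could look as follows. The metric selection and the commands used are just
+examples:
+
+@cartouche
+@smallexample
+# Show the exclusive and inclusive total CPU time, plus the function name
+dmetrics e.%totalcpu:i.%totalcpu:name
+# Sort the function list by the exclusive total CPU time
+dsort e.totalcpu
+# Use the machine view for Java call stacks
+viewmode machine
+@end smallexample
+@end cartouche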
+
+@c -- A new section -----------------------------------------------------------
+@node Filters
+@section Filters
+@c ----------------------------------------------------------------------------
+
+Various filter commands are supported by @DisplayText{}.
+Thanks to the use of filters, the user can zoom in on a certain area of
+interest. With filters, it is possible to select one or more threads to
+focus on, define a window in time, select specific call stacks, etc.
+@IndexSubentry{Filters, Intro}
+
+While already powerful by themselves, filters may be combined to further
+narrow down the view into the data.
+
+@IndexSubentry{Filters, Persistence}
+It is important to note that filters are @emph{persistent}. A filter is
+active until it is reset. This means that successive filter commands
+increasingly narrow down the view until one or more are reset.
+
+@noindent
+An example is the following:
+
+@cartouche
+@smallexample
+$ gprofng display text -thread_select 1 -functions \
+ -cpu_select 2 -functions @dots{}
+@end smallexample
+@end cartouche
+
+This command selects thread 1 and requests the function view for this thread.
+The third (@command{cpu_select 2}) command @emph{adds} the
+constraint that only the events on CPU 2 are to be selected. This means
+that the next function view selects events that were executed by thread 1 and
+have been running on CPU 2.
+
+@noindent
+In contrast with this single command line, the two commands below look similar,
+but behave very differently:
+
+@cartouche
+@smallexample
+$ gprofng display text -thread_select 1 -functions @dots{}
+$ gprofng display text -cpu_select 2 -functions @dots{}
+@end smallexample
+@end cartouche
+
+The first command displays the function view for thread 1. The second
+command shows the function view for CPU 2 for @emph{all} threads that have
+been running on this CPU.
+
+As the following example demonstrates, things get a little more tricky in
+case a script file is used. Consider the following script file:
+
+@smallexample
+@verbatim
+thread_select 1
+functions
+cpu_select 2
+functions
+@end verbatim
+@end smallexample
+
+This script file displays the function view for thread 1 first. This is
+followed by those functions that were executed by thread 1 @emph{and} have
+been run on CPU 2.
+
+If however, the script should behave like the two command line invocations
+shown above, the thread selection filter needs to be reset before CPU 2 is
+selected:
+
+@smallexample
+@verbatim
+thread_select 1
+functions
+# Reset the thread selection filter:
+thread_select all
+cpu_select 2
+functions
+@end verbatim
+@end smallexample
+
+In general, filters behave differently than commands or options. In
+particular there may be an interaction between different filter definitions.
+
+For example, as explained above, in the first script file the
+@command{thread_select} and @command{cpu_select} commands interact.
+
+For a list of all the predefined filters see @ref{Predefined Filters}.
+
+@c -- A new section -----------------------------------------------------------
+@node Supported Environment Variables
+@section Supported Environment Variables
+@c ----------------------------------------------------------------------------
+
+Various environment variables are supported. We refer to the man page for
+gprofng(1) for an overview and description
+(@xref{Man page for gprofng}).
+
+@c -- A new chapter -----------------------------------------------------------
+@node Performance Data Collection
+@chapter Performance Data Collection
+@c ----------------------------------------------------------------------------
+
+The @CollectApp{} command is used to gather the application performance data
+while the application executes.
+At regular intervals, program execution is halted and the required data is
+recorded.
+@cindex Experiment directory
+An experiment directory is created when the tool starts. This directory is
+used to store the relevant information and forms the basis for a subsequent
+analysis with one of the viewing tools.
+
+@c -- A new section -----------------------------------------------------------
+@node The @CollectApp{} command
+@section The @CollectApp{} command
+@c ----------------------------------------------------------------------------
+
+This is the command to collect the performance information for the target
+application. The usage is as follows:
+
+@cartouche
+@smallexample
+$ gprofng collect app [OPTION(S)] TARGET [TARGET_ARGUMENTS]
+@end smallexample
+@end cartouche
+
+@noindent
+Options to the command are passed in first. This is followed by the name of
+the target, which is typically a binary executable or a script, followed by
+any options that may be required by the target.
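+
+As a concrete instance of this synopsis, and re-using the matrix-vector
+multiplication program from the earlier examples, a minimal invocation
+could look like this:
+
+@cartouche
+@smallexample
+$ gprofng collect app -O mxv.1.thr.er ./mxv-pthreads -m 8000 -n 4000 -t 1
+@end smallexample
+@end cartouche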
+
+@c -- A new section -----------------------------------------------------------
+@node View the Performance Information
+@chapter View the Performance Information
+@c ----------------------------------------------------------------------------
+Various tools to view the performance data stored in one or more experiment
+directories are available. In this chapter, these will all be covered in
+detail.
+
+@c -- A new section -----------------------------------------------------------
+@node The @code{gprofng display text} Tool
+@section The @code{gprofng display text} Tool
+@c ----------------------------------------------------------------------------
+
+This tool displays the performance information in ASCII format. It supports
+a variety of views into the data recorded.
+These views can be specified in two ways and both may be used simultaneously:
+
+@itemize @bullet
+
+@item
+Command line options start with a dash (@samp{-}) symbol and may take an
+argument.
+
+@item
+Options may also be included in a file, the ``script file''. In this case,
+the dash symbol should @emph{not} be included. Multiple script files can
+be used on the same command line.
+
+@end itemize
+
+While they may appear to be options, they are really commands, and this is
+why they will be referred to as @emph{commands}
+@cindex Commands
+in the documentation.
+
+As a general rule, @emph{the order of options matters} and if the same option,
+or command, occurs multiple times, the rightmost setting is selected.
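+
+As a hypothetical illustration of this rule, the command below limits the
+output to 5 lines, because the rightmost @command{limit} setting overrides
+the earlier one:
+
+@cartouche
+@smallexample
+$ gprofng display text -limit 10 -limit 5 -functions mxv.1.thr.er
+@end smallexample
+@end cartouche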
+
+@c -- A new sub section -------------------------------------------------------
+@node The @code{gprofng display text} Commands
+@subsection The @code{gprofng display text} Commands
+@c ----------------------------------------------------------------------------
+
+The most commonly used commands are documented in the man page for this tool
+(@xref{gprofng display text}). In this section we list and describe all other
+commands that are supported.
+
+@c -- A new sub subsection ----------------------------------------------------
+@node Commands that List Experiment Details
+@unnumberedsubsubsec Commands that List Experiment Details
+@c ----------------------------------------------------------------------------
+
+@table @code
+
+@item experiment_ids
+@ifclear man
+@IndexSubentry{Options, @code{-experiment_ids}}
+@IndexSubentry{Commands, @code{experiment_ids}}
+@end ifclear
+
+For each experiment that has been loaded, show the totals of the metrics
+recorded, plus some other operational characteristics like the name of
+the executable, PID, etc. The top line contains the accumulated totals
+for the metrics.
+
+@item experiment_list
+@ifclear man
+@IndexSubentry{Options, @code{-experiment_list}}
+@IndexSubentry{Commands, @code{experiment_list}}
+@end ifclear
+
+Display the list of experiments that are loaded. Each experiment is listed
+with an index, which is used when selecting samples, threads, or LWPs, and
+a process id (PID), which can be used for advanced filtering.
+
+@item cpu_list
+@IndexSubentry{Options, @code{-cpu_list}}
+@IndexSubentry{Commands, @code{cpu_list}}
+
+Display the total number of CPUs that have been used during the experiment(s).
+
+@item cpus
+@IndexSubentry{Options, @code{-cpus}}
+@IndexSubentry{Commands, @code{cpus}}
+
+Show a list of CPUs that were used by the application, along with the metrics
+that have been recorded. The CPUs are represented by a CPU number and show the
+Total CPU time by default.
+
+Note that since the data is sorted with respect to the default metric, it may
+be useful to use the @command{sort name} command to show the list sorted with
+respect to the CPU id.
+
+@item GCEvents
+@IndexSubentry{Options, @code{-GCEvents}}
+@IndexSubentry{Commands, @code{GCEvents}}
+
+This command is for Java applications only. It shows any Garbage Collection
+(GC) events that have occurred while the application was executing.
+
+@item lwp_list
+@IndexSubentry{Options, @code{-lwp_list}}
+@IndexSubentry{Commands, @code{lwp_list}}
+
+Display the list of LWPs processed during the experiment(s).
+
+@item processes
+@IndexSubentry{Options, @code{-processes}}
+@IndexSubentry{Commands, @code{processes}}
+
+For each experiment that has been loaded, this command displays a list of
+processes that were created by the application, along with their metrics.
+The processes are represented by process ID (PID) numbers and show the
+Total CPU time metric by default. If additional metrics are recorded in
+an experiment, these are shown as well.
+
+@item samples
+@IndexSubentry{Options, @code{-samples}}
+@IndexSubentry{Commands, @code{samples}}
+
+Display a list of sample points and their metrics, which reflect the
+microstates recorded at each sample point in the loaded experiment.
+The samples are represented by sample numbers and show the Total CPU time
+by default. Other metrics might also be displayed if enabled.
+
+@item sample_list
+@IndexSubentry{Options, @code{-sample_list}}
+@IndexSubentry{Commands, @code{sample_list}}
+
+For each experiment loaded, display the list of samples currently selected.
+
+@item seconds
+@IndexSubentry{Options, @code{-seconds}}
+@IndexSubentry{Commands, @code{seconds}}
+
+Show each second of the profiling run that was captured in the experiment,
+along with the metrics collected in that second. The seconds view differs
+from the samples view in that it shows periodic samples that occur every
+second beginning at 0 and the interval cannot be changed.
+
+The seconds view lists the seconds of execution with the Total CPU time by
+default. Other metrics might also be displayed if the metrics are present
+in the loaded experiments.
+
+@item threads
+@IndexSubentry{Options, @code{-threads}}
+@IndexSubentry{Commands, @code{threads}}
+
+Show a list of threads and their metrics. The threads are represented
+by a process and thread pair and show the Total CPU time by default.
+Other metrics might also be displayed by default if the metrics are
+present in the loaded experiment.
+
+@item thread_list
+@IndexSubentry{Options, @code{-thread_list}}
+@IndexSubentry{Commands, @code{thread_list}}
+
+Display the list of threads currently selected for the analysis.
+
+@end table
+
+@noindent
+@emph{The commands below are for use in scripts and interactive mode only.
+They are not allowed on the command line.}
+
+@table @code
+
+@item add_exp @var{exp-name}
+@IndexSubentry{Commands, @code{add_exp}}
+
+Add the named experiment to the current session.
+
+@item drop_exp @var{exp-name}
+@IndexSubentry{Commands, @code{drop_exp}}
+
+Drop the named experiment from the current session.
+
+@item open_exp @var{exp-name}
+@IndexSubentry{Commands, @code{open_exp}}
+
+Drop all loaded experiments from the session, and then load the named
+experiment.
+
+@end table
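+
+As a sketch of how these commands might be used, the hypothetical script
+fragment below adds a second experiment to the session, prints the function
+overview for the combined data, and then drops that experiment again:
+
+@cartouche
+@smallexample
+# Add a second experiment to the session
+add_exp mxv.2.thr.er
+functions
+# Remove it again and show the original view
+drop_exp mxv.2.thr.er
+functions
+@end smallexample
+@end cartouche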
+
+@c -- A new sub subsection ----------------------------------------------------
+@node Commands that Affect Listings and Output
+@unnumberedsubsubsec Commands that Affect Listings and Output
+@c ----------------------------------------------------------------------------
+
+@table @code
+
+@item dthresh @var{value}
+@IndexSubentry{Options, @code{-dthresh}}
+@IndexSubentry{Commands, @code{dthresh}}
+
+Specify the threshold percentage for highlighting metrics in the annotated
+disassembly code. If the value of any metric is equal to or greater than
+@var{value} as a percentage of the maximum value of that metric for any
+instruction line in the file, the line on which the metrics occur has a
+@samp{##} marker inserted at the beginning of the line. The default is 75.
+
+@item printmode @{text | html | @var{single-char}@}
+@IndexSubentry{Options, @code{-printmode}}
+@IndexSubentry{Commands, @code{printmode}}
+
+Set the print mode. If the keyword is @code{text}, printing will be done in
+tabular form using plain text. In case the @code{html} keyword is selected,
+the output is formatted as an HTML table.
+
+Alternatively, @var{single-char} may be used in a delimiter separated list,
+with the single character @var{single-char} as the delimiter.
+
+The printmode setting is used only for those commands that generate tables,
+such as @command{functions}. The setting is ignored for other printing
+commands, including those showing source and disassembly listings.
+
+@item sthresh @var{value}
+@IndexSubentry{Options, @code{-sthresh}}
+@IndexSubentry{Commands, @code{sthresh}}
+
+Specify the threshold percentage for highlighting metrics in the annotated
+source code. If the value of any metric is equal to or greater than
+@var{value} (as a percentage) of the maximum value of that metric for any
+source line in the file, the line on which the metrics occur has a @samp{##}
+marker inserted at the beginning of the line. The default is 75.
+
+@end table
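+
+As a hypothetical sketch of how these commands may be used in a script file,
+the fragment below prints the function overview as a comma separated list
+and lowers the highlighting threshold for the annotated source listing:
+
+@cartouche
+@smallexample
+# Use a comma as the delimiter in generated tables
+printmode ,
+# Highlight source lines at or above 50% of the maximum metric value
+sthresh 50
+functions
+source mxv_core
+@end smallexample
+@end cartouche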
+
+@c -- A new sub subsection ----------------------------------------------------
+@node Predefined Filters
+@unnumberedsubsubsec Predefined Filters
+@c ----------------------------------------------------------------------------
+
+The filters below use a list, the selection list, to define a sequence of
+numbers. @xref{The Selection List}.
+Note that this selection is persistent, but the filter can be reset by using
+@samp{all} as the @var{selection-list}.
+
+@table @code
+
+@item cpu_select @var{selection-list}
+@IndexSubentry{Options, @code{-cpu_select}}
+@IndexSubentry{Commands, @code{cpu_select}}
+
+Select the CPU ids specified in the @var{selection-list}.
+
+@item lwp_select @var{selection-list}
+@IndexSubentry{Options, @code{-lwp_select}}
+@IndexSubentry{Commands, @code{lwp_select}}
+
+Select the LWPs specified in the @var{selection-list}.
+
+@item sample_select @var{selection-list}
+@IndexSubentry{Options, @code{-sample_select}}
+@IndexSubentry{Commands, @code{sample_select}}
+
+Select the samples specified in the @var{selection-list}.
+
+@item thread_select @var{selection-list}
+@IndexSubentry{Options, @code{-thread_select}}
+@IndexSubentry{Commands, @code{thread_select}}
+
+Select a series of threads, or just one, to be used in subsequent views.
+The @var{selection-list} consists of a sequence of comma separated numbers.
+This may include a range of the form @samp{n-m}.
+
+@end table
+
+
+@c -- A new sub subsection ----------------------------------------------------
+@node Commands to Set and Change Search Paths
+@unnumberedsubsubsec Commands to Set and Change Search Paths
+@c ----------------------------------------------------------------------------
+
+@table @code
+
+@item addpath @var{path-list}
+@IndexSubentry{Options, @code{-addpath}}
+@IndexSubentry{Commands, @code{addpath}}
+
+Append @var{path-list} to the current setpath settings. Note that multiple
+@command{addpath} commands can be used in @file{.gprofng.rc} files, and will
+be concatenated.
+
+@item pathmap @var{old-prefix} @var{new-prefix}
+@IndexSubentry{Options, @code{-pathmap}}
+@IndexSubentry{Commands, @code{pathmap}}
+
+If a file cannot be found using the path list set by @command{addpath}, or
+the @command{setpath} command, one or more path remappings may be set with the
+@command{pathmap} command.
+
+With path mapping, the user can specify how to replace the leading component
+in a full path by a different string.
+
+With this command, if a path name for a source file, object file, or shared
+object begins with the prefix specified with @var{old-prefix}, that prefix
+is replaced by the prefix specified with @var{new-prefix}.
+The resulting path is used to find the file.
+
+For example, if a source file located in directory @file{/tmp}
+is shown in the @DisplayText{} output, but should instead be taken from
+@file{/home/demo}, the following @command{pathmap} command redefines the
+path:
+
+@smallexample
+$ gprofng display text -pathmap /tmp /home/demo -source ...
+@end smallexample
+
+Note that multiple @command{pathmap} commands can be supplied, and each is
+tried until the file is found.
+
+@item setpath @var{path-list}
+@IndexSubentry{Options, @code{-setpath}}
+@IndexSubentry{Commands, @code{setpath}}
+
+Set the path used to find source and object files. The path is defined
+through the @var{path-list} keyword. It is a colon separated list of
+directories, jar files, or zip files.
+If any directory has a colon character in it, escape it with a
+backslash (@samp{\}).
+
+The special directory name @code{$expts} refers
+to the set of current experiments in the order in which they were loaded.
+You can abbreviate it with a single @samp{$} character.
+
+The default path is @samp{$expts:..} which is the directories of the
+loaded experiments and the current working directory.
+
+Use @command{setpath} with no argument to display the current path.
+
+Note that @command{setpath} commands @emph{are not allowed in .gprofng.rc
+configuration files}.
+
+@end table
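+
+As a hypothetical example, the script fragment below first sets the search
+path and then appends an additional directory to it. The directory names
+are placeholders:
+
+@cartouche
+@smallexample
+# Search the experiments first, then a project specific source directory
+setpath $expts:/home/demo/src
+# Append another directory to the search path
+addpath /home/demo/lib/src
+functions
+@end smallexample
+@end cartouche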
+
+@c -- A new subsection --------------------------------------------------------
+@c -- TBD @node Usage examples for @code{gprofng display text}
+@c -- TBD @subsection Usage examples for @code{gprofng display text}
+@c ----------------------------------------------------------------------------
+
+@c -- TBD In this section we present usage examples.
+
+@c -- A new chapter -----------------------------------------------------------
+@c TBD @node The @code{gprofng display html} Tool
+@c TBD @section The @code{gprofng display html} Tool
+@c ----------------------------------------------------------------------------
+@c TBD The options are documented in the man page for this tool. In this section we
+@c TBD present usage examples.
+
+@c -- A new chapter -----------------------------------------------------------
+@c TBD @node Display Source Code
+@c TBD @chapter Display Source Code
+@c ----------------------------------------------------------------------------
+@c TBD The options are documented in the man page for this tool. In this section we
+@c TBD present usage examples.
+
+
+@c -- A new chapter -----------------------------------------------------------
+@c TBD @node Archive Experiment Data
+@c TBD @chapter Archive Experiment Data
+@c ----------------------------------------------------------------------------
+@c TBD The options are documented in the man page for this tool. In this section we
+@c TBD present usage examples.
+
+@c -- A new chapter -----------------------------------------------------------
+@node Terminology
+@chapter Terminology
+@c ----------------------------------------------------------------------------
+
+Throughout this manual, certain terminology specific to profiling tools,
+or @ToolName{}, or even to this document only, is used. In this chapter
+this terminology is explained in detail.
+
+@menu
+* The Program Counter:: What is a Program Counter?
+* Inclusive and Exclusive Metrics:: An explanation of inclusive and exclusive metrics.
+* Metric Definitions:: Definitions associated with metrics.
+* The Viewmode:: Select the way call stacks are presented.
+* The Selection List:: How to define a selection.
+* Load Objects and Functions:: The components in an application.
+* The Concept of a CPU in gprofng:: The definition of a CPU.
+* Hardware Event Counters Explained:: What are event counters?
+* apath:: Our generic definition of a path.
+@end menu
+
+@c ----------------------------------------------------------------------------
+@node The Program Counter
+@section The Program Counter
+@c ----------------------------------------------------------------------------
+
+@cindex PC
+@cindex Program Counter
+The @emph{Program Counter}, or PC for short, keeps track of where program
+execution is.
+The address of the next instruction to be executed is stored in a special
+purpose register in the processor, or core.
+
+@cindex Instruction pointer
+The PC is sometimes also referred to as the @emph{instruction pointer}, but
+we will use Program Counter or PC throughout this document.
+
+@c ----------------------------------------------------------------------------
+@node Inclusive and Exclusive Metrics
+@section Inclusive and Exclusive Metrics
+@c ----------------------------------------------------------------------------
+
+In the remainder, these two concepts occur quite often and for lack of a better
+place, they are explained here.
+
+@cindex Inclusive metric
+The @emph{inclusive} value for a metric includes all values that are part of
+the dynamic extent of the target function. For example if function @code{A}
+calls functions @code{B} and @code{C}, the inclusive CPU time for @code{A}
+includes the CPU time spent in @code{B} and @code{C}.
+
+@cindex Exclusive metric
+In contrast with this, the @emph{exclusive} value for a metric is computed
+by excluding the metric values used by other functions called. In our imaginary
+example, the exclusive CPU time for function @code{A} is the time spent outside
+calling functions @code{B} and @code{C}.
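+
+As a small numerical sketch of these definitions, with made-up numbers,
+assume that function @code{A} itself consumes 2 seconds of CPU time, while
+the calls to @code{B} and @code{C} consume 3 and 1 seconds respectively:
+
+@smallexample
+@verbatim
+Exclusive CPU time for A = 2 seconds
+Inclusive CPU time for A = 2 + 3 + 1 = 6 seconds
+@end verbatim
+@end smallexample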
+
+@cindex Leaf function
+In case of a @emph{leaf function}, the inclusive and exclusive values for the
+metric are the same since by definition, it is not calling any other
+function(s).
+
+Why do we use these two different values? The inclusive metric shows the most
+expensive path, in terms of this metric, in the application. For example, if
+the metric is cache misses, the function with the highest inclusive metric
+tells you where most of the cache misses come from.
+
+Within this branch of the application, the exclusive metric points to the
+functions that contribute and help to identify which part(s) to consider
+for further analysis.
+
+@c ----------------------------------------------------------------------------
+@node Metric Definitions
+@section Metric Definitions
+@c ----------------------------------------------------------------------------
+The metrics displayed in the various views are highly customizable. In this
+section it is explained how to construct the metrics definition(s).
+
+@IndexSubentry{Options, @code{-metrics}}
+@IndexSubentry{Commands, @code{metrics}}
+The @command{metrics} command takes a colon (@samp{:}) separated list, where
+each item in the list consists of the following three fields:
+@var{<flavor>}@var{<visibility>}@var{<metric-name>}.
+
+@cindex Flavor field
+@cindex Visibility field
+@cindex Metric name field
+@IndexSubentry{Metrics, Flavor field}
+@IndexSubentry{Metrics, Visibility field}
+@IndexSubentry{Metrics, Metric name field}
+The @var{<flavor>} field is @samp{e} for ``exclusive'', @samp{i} for
+``inclusive'', or a combination of the two. The @var{<metric-name>} field
+is the name of the metric and the @var{<visibility>} field consists of one
+or more characters from the following table:
+
+@table @code
+
+@item .
+Show the metric as time. This applies to timing metrics and hardware event
+counters that measure cycles. Interpret as @samp{+} for other metrics.
+
+@item %
+Show the metric as a percentage of the total value for this metric.
+
+@item +
+Show the metric as an absolute value. For hardware event counters this is
+the event count. Interpret as @samp{.} for timing metrics.
+
+@item !
+Do not show any metric value. Cannot be used with other visibility characters.
+This visibility is meant to be used in a @command{dmetrics} command to set
+default metrics that override the built-in visibility defaults
+for each type of metric.
+
+@end table
+
+Both the @var{<flavor>} and @var{<visibility>} strings may have more than one
+character. If both strings have more than one character, the @var{<flavor>}
+string is expanded first. For example, @code{ie.%user} is first expanded to
+@code{i.%user:e.%user}, which is then expanded into
+@code{i.user:i%user:e.user:e%user}.
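+
+As a hypothetical example, the following command uses such a combined
+definition to show the exclusive and inclusive total CPU times, both as
+a time and as a percentage:
+
+@cartouche
+@smallexample
+$ gprofng display text -metrics ei.%totalcpu -functions mxv.1.thr.er
+@end smallexample
+@end cartouche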
+
+@c ----------------------------------------------------------------------------
+@node The Viewmode
+@section The Viewmode
+
+@cindex Viewmode
+@IndexSubentry{Options, @code{-viewmode}}
+@IndexSubentry{Commands, @code{viewmode}}
+
+There are different ways to view a call stack in Java. In @ToolName{}, this
+is called the @emph{viewmode} and the setting is controlled through a command
+with the same name.
+
+The @code{viewmode} command takes one of the following keywords:
+
+@table @code
+
+@item user
+This is the default and shows the Java call stacks for Java threads.
+No call stacks for any housekeeping threads are shown. The function
+list contains a function
+@IndexSubentry{Java profiling, @code{<JVM-System>}}
+@code{<JVM-System>} that represents the aggregated time from non-Java
+threads.
+When the JVM software does not report a Java call stack, time is reported
+against the function
+@IndexSubentry{Java profiling, @code{<no Java callstack recorded>}}
+@code{<no Java callstack recorded>}.
+
+@item expert
+Show the Java call stacks for Java threads when the Java code from the
+user is executed and machine call stacks when JVM code is executed, or
+when the JVM software does not report a Java call stack.
+Show the machine call stacks for housekeeping threads.
+
+@item machine
+Show the actual native call stacks for all threads.
+
+@end table
+
+@c ----------------------------------------------------------------------------
+@node The Selection List
+@section The Selection List
+@c ----------------------------------------------------------------------------
+
+@cindex Selection list
+@cindex List specification
+Several commands allow the user to specify a sequence of numbers called the
+@emph{selection list}. Such a list may for example be used to select specific
+threads from all the threads that have been used when conducting the
+experiment(s).
+
+A selection list (or ``list'' in the remainder of this section) can be a
+single number, a contiguous range of numbers with the start and end numbers
+separated by a hyphen (@samp{-}), a comma-separated list of numbers and
+ranges, or the @code{all} keyword that resets the filter.
+@IndexSubentry{Filters, Reset to default}
+Lists must not contain spaces.
+
+Each list can optionally be preceded by an experiment list with a similar
+format, separated from the list by a colon (:).
+If no experiment list is included, the list applies to all experiments.
+
+Multiple lists can be concatenated by separating the individual lists
+by a plus sign.
+
+These are some examples of various filters using a list:
+
+@table @code
+
+@item thread_select 1
+Select thread 1 from all experiments.
+
+@item thread_select all:1
+Select thread 1 from all experiments.
+
+@item thread_select 1:all
+Select all the threads from the first experiment loaded.
+
+@item thread_select 1:2+3:4
+Select thread 2 from experiment 1 and thread 4 from experiment 3.
+
+@item cpu_select all:1,3,5
+Select cores 1, 3, and 5 from all experiments.
+
+@item cpu_select 1,2:all
+Select all cores from experiments 1 and 2.
+
+@end table
+
+Recall that there are several list commands that show the mapping between the
+numbers and the targets.
+
+@IndexSubentry{Options, @code{-experiment_list}}
+@IndexSubentry{Commands, @code{experiment_list}}
+For example, the @command{experiment_list} command shows the name(s) of the
+experiment(s) loaded and the associated number. In this example it is used
+to get this information for a range of experiments:
+
+@cartouche
+@smallexample
+$ gprofng display text -experiment_list mxv.?.thr.er
+@end smallexample
+@end cartouche
+
+@noindent
+This is the output, showing for each experiment the ID, the PID, and the name:
+
+@smallexample
+@verbatim
+ID Sel PID Experiment
+== === ======= ============
+ 1 yes 2750071 mxv.1.thr.er
+ 2 yes 1339450 mxv.2.thr.er
+ 3 yes 3579561 mxv.4.thr.er
+@end verbatim
+@end smallexample
+
+@c ----------------------------------------------------------------------------
+@node Load Objects and Functions
+@section Load Objects and Functions
+@c ----------------------------------------------------------------------------
+
+An application consists of various components. The source code files are
+compiled into object files. These are then glued together at link time to form
+the executable.
+During execution, the program may also dynamically load objects.
+
+@cindex Load object
+A @emph{load object} is defined to be an executable, or shared object. A shared
+library is an example of a load object in @ToolName{}.
+
+Each load object contains a text section with the instructions generated by the
+compiler, a data section for data, and various symbol tables.
+All load objects must contain an
+@cindex ELF
+ELF
+symbol table, which gives the names and addresses of all the globally known
+functions in that object.
+
+Load objects compiled with the @code{-g} option contain additional symbolic information
+that can augment the ELF symbol table and provide information about functions that
+are not global, additional information about object modules from which the functions
+came, and line number information relating addresses to source lines.
+
+The term
+@cindex Function
+@emph{function}
+is used to describe a set of instructions that represent a high-level operation
+described in the source code. The term also covers methods as used in C++ and in
+the Java programming language.
+
+In the @ToolName{} context, functions are provided in source code format.
+Normally their names appear in the symbol table representing a set of addresses.
+@cindex Program Counter
+@cindex PC
+If the Program Counter (PC) is within that set, the program is executing within that function.
+
+In principle, any address within the text segment of a load object can be mapped to a
+function. Exactly the same mapping is used for the leaf PC and all the other PCs on the
+call stack.
+
+Most of the functions correspond directly to the source model of the program, but
+there are exceptions. This topic is however outside of the scope of this guide.
+
+@c ----------------------------------------------------------------------------
+@node The Concept of a CPU in @ProductName{}
+@section The Concept of a CPU in @ProductName{}
+@c ----------------------------------------------------------------------------
+
+@cindex CPU
+In @ProductName{}, there is the concept of a CPU. Admittedly, this is not the
+best word to describe what is meant here and may be replaced in the future.
+
+The word CPU is used in many of the displays.
+In the context of @ProductName{}, it is meant to denote a part of the
+processor that is capable of executing instructions and with its own state,
+like the program counter.
+
+For example, on a contemporary processor, a CPU could be a core. In case
+hardware threads are supported within a core, a CPU is one of those
+hardware threads.
+
+To see which CPUs have been used in the experiment, use the @command{cpus}
+command in @DisplayText{}.
+
+@c ----------------------------------------------------------------------------
+@node Hardware Event Counters Explained
+@section Hardware Event Counters Explained
+@c ----------------------------------------------------------------------------
+
+@IndexSubentry{Hardware event counters, description}
+For quite a number of years now, many microprocessors have supported hardware
+event counters.
+
+On the hardware side, this means that in the processor there are one or more
+registers dedicated to count certain activities, or ``events''.
+Examples of such events are the number of instructions executed, or the number
+of cache misses at level 2 in the memory hierarchy.
+
+While there is a limited set of such registers, the user can map events onto
+them. In case more than one register is available, this allows for the
+simultaneous measurement of various events.
+
+A simple, yet powerful, example is to simultaneously count the number of CPU
+cycles and the number of instructions executed. These two numbers can then be
+used to compute the
+@cindex IPC
+@emph{IPC} value. IPC stands for ``Instructions Per Clockcycle'' and each processor
+has a maximum. For example, if this maximum number is 2, it means the
+processor is capable of executing two instructions every clock cycle.
+
+Whether this is actually achieved depends on several factors, including the
+instruction characteristics.
+However, in case the IPC value is well below this maximum in a time critical
+part of the application and this cannot be easily explained, further
+investigation is probably warranted.
+
+@cindex CPI
+A related metric is called @emph{CPI}, or ``Clockcycles Per Instruction''.
+It is the inverse of the IPC and can be compared against the theoretical
+value(s) of the target instruction(s). A significant difference may point
+at a bottleneck.
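+
+As a purely illustrative example, assume these two counters returned the
+following (hypothetical) values for the code region of interest:
+
+@smallexample
+@verbatim
+CPU cycles            : 2,000,000,000   (illustrative values only)
+instructions executed : 1,200,000,000
+
+IPC = 1.2e9 / 2.0e9 = 0.6
+CPI = 2.0e9 / 1.2e9 ~ 1.67
+@end verbatim
+@end smallexample
+
+If the processor has a maximum IPC of 2, such a value suggests there may be
+room for improvement, although only further analysis can tell whether it can
+be realized.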
+
+One thing to keep in mind is that the value returned by a counter can either
+be the number of times the event occurred, or a CPU cycle count. In case of
+the latter, it is possible to convert this number to time.
+
+@IndexSubentry{Hardware event counters, variable CPU frequency}
+This is often easier to interpret than a simple count, but there is one
+caveat to keep in mind. The CPU frequency may not have been constant while
+the experiment was recorded, and this impacts the time reported.
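+
+As a sketch, with illustrative numbers and assuming the clock frequency was
+constant during the measurement:
+
+@smallexample
+@verbatim
+CPU cycles counted : 3,000,000,000   (illustrative values only)
+CPU frequency      : 2.5 GHz
+
+time = 3.0e9 / 2.5e9 = 1.2 seconds
+@end verbatim
+@end smallexample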
+
+These event counters, or ``counters'' for short, provide great insight into
+what happens deep inside the processor. In case higher level information does
+not provide the insight needed, the counters provide the information to get
+to the bottom of a performance problem.
+
+There are some things to consider though.
+
+@itemize @bullet
+
+@item
+The event definitions and names vary across processors, and it may even happen
+that some events change with an update.
+Unfortunately, although this is luckily rare, there are sometimes bugs that
+cause the wrong count to be returned.
+
+@IndexSubentry{Hardware event counters, alias name}
+In @ToolName{}, some of the processor-specific event names have an alias
+name. For example, @code{insts} measures the instructions executed.
+These aliases not only make it easier to identify the functionality, but also
+provide portability of certain events across processors.
+
+@item
+Another complexity is that there are typically many events one can monitor.
+There may be up to hundreds of events available, and it could require several
+experiments to zoom in on the root cause of a performance problem.
+
+@item
+There may be restrictions regarding the mapping of event(s) onto the
+counters. For example, certain events may be restricted to specific
+counters only. As a result, one may have to conduct additional experiments
+to cover all the events of interest.
+
+@item
+The names of the events may also not be easy to interpret. In such cases,
+the description can be found in the architecture manual for the processor.
+
+@end itemize
+
+Despite these drawbacks, hardware event counters are extremely useful and
+may even turn out to be indispensable.
+
+@c ----------------------------------------------------------------------------
+@node apath
+@section What is <apath>?
+@c ----------------------------------------------------------------------------
+
+In most cases, @ToolName{} shows the absolute pathnames of directories. These
+tend to be rather long, causing display issues in this document.
+
+Instead of wrapping these long pathnames over multiple lines, we decided to
+represent them by the @code{<apath>} symbol, which stands for ``an absolute
+pathname''.
+
+Note that different occurrences of @code{<apath>} may represent different
+absolute pathnames.
+
+@c -- A new node --------------------------------------------------------------
+@node Other Document Formats
+@chapter Other Document Formats
+@c ----------------------------------------------------------------------------
+
+@emph{This chapter is applicable when building gprofng from the
+binutils source.}
+
+This document is written in Texinfo and the source text is made available as
+part of the binutils distribution. The file name is @code{gprofng_ug.texi} and
+can be found in subdirectory @code{gprofng/doc} of the top level binutils
+directory.
+
+The default installation procedure creates a file in the @code{info} format and
+stores it in the documentation section of binutils.
+This source file can, however, also be used to generate the document in the
+@code{html} and @code{pdf} formats. These may be easier to read and search.
+
+To generate this documentation file in a different format, go to the directory
+that was used to build the tools. The make file to build the other formats is
+in the @code{gprofng/doc} subdirectory.
+
+For example, if you have set the build directory to be @var{<my-build-dir>},
+go to subdirectory @var{<my-build-dir>/gprofng/doc}.
+
+This subdirectory has a single file called @file{Makefile} that can be used to
+build the documentation in various formats. We recommend using the commands
+described below.
+
+There are four commands to generate the documentation in the @code{html} or
+@code{pdf} format. It is assumed that you are in directory @code{gprofng/doc}
+under the main directory @var{<my-build-dir>}.
+
+@table @code
+
+@item make html
+Create the html file in the current directory.
+
+@item make pdf
+Create the pdf file in the current directory.
+
+@item make install-html
+Create and install the html file in the binutils documentation directory.
+
+@item make install-pdf
+Create and install the pdf file in the binutils documentation directory.
+
+@end table
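+
+The first two commands only build the file; they do not install it. To create
+the standalone @code{pdf} version of this document in the current directory,
+one might run:
+
+@smallexample
+@verbatim
+$ cd <my-build-dir>/gprofng/doc
+$ make pdf
+@end verbatim
+@end smallexample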
+
+For example, to install this document in the binutils documentation directory, the
+commands below may be executed. In this notation, @var{<format>}
+is one of @code{html}, or @code{pdf}:
+
+@smallexample
+@verbatim
+$ cd <my-build-dir>/gprofng/doc
+$ make install-<format>
+@end verbatim
+@end smallexample
+
+The binutils installation directory is either the default @code{/usr/local} or the one
+that has been set with the @code{--prefix} option as part of the @code{configure}
+command. In this example we symbolize this location with @code{<install>}.
+
+The documentation directory is @code{<install>/share/doc/gprofng} in case
+@code{html} or @code{pdf} is selected and @code{<install>/share/info} for the
+file in the @code{info} format.
+
+@noindent
+Some things to note:
+
+@itemize
+
+@item
+For the @code{pdf} file to be generated, the @code{texi2dvi} tool is required.
+@cindex texi2dvi
+It is for example available as part of the @code{texinfo-tex} package.
+
+@item
+Instead of generating a single file in the @code{html} format, it is also
+possible to create a directory with individual files for the various chapters.
+To do so, remove the use of @code{--no-split} in variable @code{MAKEINFOHTML}
+in the make file in the @code{<my-build-dir>/gprofng/doc} directory.
+
+@end itemize
+
+@c -- An appendix -------------------------------------------------------------
+@node The @ProductName{} Man Pages
+@appendix The @ProductName{} Man Pages
+@c ----------------------------------------------------------------------------
+
+In this appendix the man pages for the various @ProductName{} tools are listed.
+
+@c -- A new node --------------------------------------------------------------
+@c @node gprofng driver
+@node Man page for gprofng
+@section Man page for @command{gprofng}
+@c ----------------------------------------------------------------------------
+
+@include gprofng.texi
+
+@c -- A new node --------------------------------------------------------------
+@page
+@node gprofng collect app
+@section Man page for @command{gprofng collect app}
+@c ----------------------------------------------------------------------------
+
+@include gp-collect-app.texi
+
+@c -- A new node --------------------------------------------------------------
+@page
+@node gprofng display text
+@section Man page for @command{gprofng display text}
+@c ----------------------------------------------------------------------------
+
+@include gp-display-text.texi
+
+@c -- A new node --------------------------------------------------------------
+@page
+@node gprofng display html
+@section Man page for @command{gprofng display html}
+@c ----------------------------------------------------------------------------
+
+@include gp-display-html.texi
+
+@c -- A new node --------------------------------------------------------------
+@page
+@node gprofng display src
+@section Man page for @command{gprofng display src}
+@c ----------------------------------------------------------------------------
+
+@include gp-display-src.texi
+
+@c -- A new node --------------------------------------------------------------
+@page
+@node gprofng archive
+@section Man page for @command{gprofng archive}
+@c ----------------------------------------------------------------------------
+
+@include gp-archive.texi
+
+@ifnothtml
+@node Index
+@unnumbered Index
+@printindex cp
+@end ifnothtml
+
+@bye
-@set UPDATED 28 September 2022
-@set UPDATED-MONTH September 2022
+@set UPDATED 14 April 2022
+@set UPDATED-MONTH April 2022
@set EDITION 2.40.50
-@set VERSION 2.40.50
+@set VERSION 2.0
gp-display-html: gp-display-html.in Makefile
$(do_subst) < $(srcdir)/gp-display-html.in > $@
chmod +x $@
-
-if BUILD_MAN
-
-man_MANS = gp-display-html.1
-MAINTAINERCLEANFILES = $(man_MANS)
-
-# Use this if the man pages depend on the version number.
-# common_mandeps = $(top_srcdir)/../bfd/version.m4
-#
-# Also change the dependence line below to this:
-# gp-display-html.1: $(common_mandeps) gp-display-html
-#
-# Currently, the version number shown in the man page is derived from
-# the output printed with --version.
-
-# These variables are used by help2man to generate the man pages.
-
-INFO_PAGE = "gprofng"
-MANUAL = "User Commands"
-TEXT_GP_DISPLAY_HTML = "generate an HTML based directory structure to browse the profiles"
-
-HELP2MAN_OPT = --libtool --no-info --info-page=$(INFO_PAGE) --manual=$(MANUAL)
-H2M_FILTER = | sed 's/\.TP/\.TP\n.B/' | sed 's/Commands:/\.SH COMMANDS/' \
- | sed 's/See also:/\.SH SEE ALSO/' | sed 's/Documentation:/.SH DOCUMENTATION/' \
- | sed 's/Limitations:/.SH LIMITATIONS/'
-
-gp-display-html.1: gp-display-html
- $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
- --name=$(TEXT_GP_DISPLAY_HTML) ./gp-display-html $(H2M_FILTER) > $@
-
-endif
-
|| { echo " ( cd '$$dir' && rm -f" $$files ")"; \
$(am__cd) "$$dir" && rm -f $$files; }; \
}
-am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)"
+am__installdirs = "$(DESTDIR)$(bindir)"
SCRIPTS = $(bin_SCRIPTS)
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
-man1dir = $(mandir)/man1
-NROFF = nroff
-MANS = $(man_MANS)
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/../mkinstalldirs
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
bin_SCRIPTS = gp-display-html
CLEANFILES = $(bin_SCRIPTS)
do_subst = sed -e 's/BINUTILS_VERSION/$(VERSION)/'
-@BUILD_MAN_TRUE@man_MANS = gp-display-html.1
-@BUILD_MAN_TRUE@MAINTAINERCLEANFILES = $(man_MANS)
-
-# Use this if the man pages depend on the version number.
-# common_mandeps = $(top_srcdir)/../bfd/version.m4
-#
-# Also change the dependence line below to this:
-# gp-display-html.1: $(common_mandeps) gp-display-html
-#
-# Currently, the version number shown in the man page is derived from
-# the output printed with --version.
-
-# These variables are used by help2man to generate the man pages.
-@BUILD_MAN_TRUE@INFO_PAGE = "gprofng"
-@BUILD_MAN_TRUE@MANUAL = "User Commands"
-@BUILD_MAN_TRUE@TEXT_GP_DISPLAY_HTML = "generate an HTML based directory structure to browse the profiles"
-@BUILD_MAN_TRUE@HELP2MAN_OPT = --libtool --no-info --info-page=$(INFO_PAGE) --manual=$(MANUAL)
-@BUILD_MAN_TRUE@H2M_FILTER = | sed 's/\.TP/\.TP\n.B/' | sed 's/Commands:/\.SH COMMANDS/' \
-@BUILD_MAN_TRUE@ | sed 's/See also:/\.SH SEE ALSO/' | sed 's/Documentation:/.SH DOCUMENTATION/' \
-@BUILD_MAN_TRUE@ | sed 's/Limitations:/.SH LIMITATIONS/'
-
all: all-am
.SUFFIXES:
clean-libtool:
-rm -rf .libs _libs
-install-man1: $(man_MANS)
- @$(NORMAL_INSTALL)
- @list1=''; \
- list2='$(man_MANS)'; \
- test -n "$(man1dir)" \
- && test -n "`echo $$list1$$list2`" \
- || exit 0; \
- echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \
- $(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \
- { for i in $$list1; do echo "$$i"; done; \
- if test -n "$$list2"; then \
- for i in $$list2; do echo "$$i"; done \
- | sed -n '/\.1[a-z]*$$/p'; \
- fi; \
- } | while read p; do \
- if test -f $$p; then d=; else d="$(srcdir)/"; fi; \
- echo "$$d$$p"; echo "$$p"; \
- done | \
- sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
- -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \
- sed 'N;N;s,\n, ,g' | { \
- list=; while read file base inst; do \
- if test "$$base" = "$$inst"; then list="$$list $$file"; else \
- echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \
- $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \
- fi; \
- done; \
- for i in $$list; do echo "$$i"; done | $(am__base_list) | \
- while read files; do \
- test -z "$$files" || { \
- echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \
- $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \
- done; }
-
-uninstall-man1:
- @$(NORMAL_UNINSTALL)
- @list=''; test -n "$(man1dir)" || exit 0; \
- files=`{ for i in $$list; do echo "$$i"; done; \
- l2='$(man_MANS)'; for i in $$l2; do echo "$$i"; done | \
- sed -n '/\.1[a-z]*$$/p'; \
- } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
- -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \
- dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir)
tags TAGS:
ctags CTAGS:
done
check-am: all-am
check: check-am
-all-am: Makefile $(SCRIPTS) $(MANS)
+all-am: Makefile $(SCRIPTS)
installdirs:
- for dir in "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)"; do \
+ for dir in "$(DESTDIR)$(bindir)"; do \
test -z "$$dir" || $(MKDIR_P) "$$dir"; \
done
install: install-am
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
- -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES)
clean: clean-am
clean-am: clean-generic clean-libtool mostlyclean-am
info-am:
-install-data-am: install-man
+install-data-am:
install-dvi: install-dvi-am
install-info-am:
-install-man: install-man1
+install-man:
install-pdf: install-pdf-am
ps-am:
-uninstall-am: uninstall-binSCRIPTS uninstall-man
-
-uninstall-man: uninstall-man1
+uninstall-am: uninstall-binSCRIPTS
.MAKE: install-am install-strip
install install-am install-binSCRIPTS install-data \
install-data-am install-dvi install-dvi-am install-exec \
install-exec-am install-html install-html-am install-info \
- install-info-am install-man install-man1 install-pdf \
- install-pdf-am install-ps install-ps-am install-strip \
- installcheck installcheck-am installdirs maintainer-clean \
+ install-info-am install-man install-pdf install-pdf-am \
+ install-ps install-ps-am install-strip installcheck \
+ installcheck-am installdirs maintainer-clean \
maintainer-clean-generic mostlyclean mostlyclean-generic \
mostlyclean-libtool pdf pdf-am ps ps-am tags-am uninstall \
- uninstall-am uninstall-binSCRIPTS uninstall-man uninstall-man1
+ uninstall-am uninstall-binSCRIPTS
.PRECIOUS: Makefile
$(do_subst) < $(srcdir)/gp-display-html.in > $@
chmod +x $@
-@BUILD_MAN_TRUE@gp-display-html.1: gp-display-html
-@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
-@BUILD_MAN_TRUE@ --name=$(TEXT_GP_DISPLAY_HTML) ./gp-display-html $(H2M_FILTER) > $@
-
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:
$(BISON) $^
BUILT_SOURCES = QLParser.tab.hh
-EXTRA_DIST = QLParser.yy $(man_MANS)
+EXTRA_DIST = QLParser.yy
lib_LTLIBRARIES = $(LIBGPROFNG)
gp_display_text_SOURCES = gp-display-text.cc ipc.cc ipcio.cc
gp_display_text_LDADD = $(LIBGPROFNG) $(CLOCK_GETTIME_LINK) $(ZLIB)
-
-if BUILD_MAN
-
-man_MANS = \
- gp-archive.1 \
- gp-collect-app.1 \
- gp-display-src.1 \
- gp-display-text.1
-
-MAINTAINERCLEANFILES = $(man_MANS)
-
-# The man pages depend on the version number and on a help2man include file.
-common_mandeps = $(top_srcdir)/../bfd/version.m4
-
-# Use -o so that the `missing' program can infer the output file.
-# Embolden subcommand names in the output, and include a SEE ALSO.
-# Arrange to regenerate the output if we have help2man, but leave the
-# disted output there otherwise.
-# Some extra annoying complexity is in place so that people without
-# help2man dno't accidentally overwrite the manpage.
-
-INFO_PAGE = "gprofng"
-MANUAL = "User Commands"
-TEXT_GPROFNG = "the driver for the gprofng tool suite"
-TEXT_GP_ARCHIVE = "archive gprofng experiment data"
-TEXT_GP_COLLECT_APP = "collect performance data for the target application"
-TEXT_GP_DISPLAY_SRC = "display the source code, optionally interleaved with the disassembly of the target object"
-TEXT_GP_DISPLAY_TEXT = "display the performance data in plain text format"
-
-HELP2MAN_OPT = --libtool --no-info --info-page=$(INFO_PAGE) --manual=$(MANUAL)
-H2M_FILTER = | sed 's/\.TP/\.TP\n.B/' | sed 's/Commands:/\.SH COMMANDS/' \
- | sed 's/See also:/\.SH SEE ALSO/' | sed 's/Documentation:/.SH DOCUMENTATION/' \
- | sed 's/Limitations:/.SH LIMITATIONS/'
-
-gp-archive.1: $(srcdir)/gp-archive.cc $(common_mandeps) | ./gp-archive$(EXEEXT)
- $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
- --name=$(TEXT_GP_ARCHIVE) ./gp-archive$(EXEEXT) $(H2M_FILTER) > $@
-
-gp-collect-app.1: $(srcdir)/gp-collect-app.cc $(common_mandeps) | ./gp-collect-app$(EXEEXT)
- $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
- --name=$(TEXT_GP_COLLECT_APP) ./gp-collect-app$(EXEEXT) $(H2M_FILTER) > $@
-
-gp-display-src.1: $(srcdir)/gp-display-src.cc $(srcdir)/Command.cc \
- $(common_mandeps) | ./gp-display-src$(EXEEXT)
- $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
- --name=$(TEXT_GP_DISPLAY_SRC) ./gp-display-src$(EXEEXT) $(H2M_FILTER) > $@
-
-gp-display-text.1: $(srcdir)/gp-display-text.cc $(srcdir)/Command.cc \
- $(common_mandeps) | ./gp-display-text$(EXEEXT)
- $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
- --name=$(TEXT_GP_DISPLAY_TEXT) ./gp-display-text$(EXEEXT) $(H2M_FILTER) > $@
-
-endif
-
# Distribution involves building the binaries to generate the manpage,
# so ensure that the necessary libraries are built at dist time.
dist-hook: $(LIBGPROFNG)
$(am__cd) "$$dir" && rm -f $$files; }; \
}
am__installdirs = "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" \
- "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(dbedir)"
+ "$(DESTDIR)$(dbedir)"
LTLIBRARIES = $(lib_LTLIBRARIES)
am__DEPENDENCIES_1 =
libgprofng_la_DEPENDENCIES = $(top_builddir)/../opcodes/libopcodes.la \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
-man1dir = $(mandir)/man1
-NROFF = nroff
-MANS = $(man_MANS)
DATA = $(dbe_DATA)
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
AM_CXXFLAGS = $(AM_CFLAGS)
BUILT_SOURCES = QLParser.tab.hh
-EXTRA_DIST = QLParser.yy $(man_MANS)
+EXTRA_DIST = QLParser.yy
lib_LTLIBRARIES = $(LIBGPROFNG)
libgprofng_la_SOURCES = $(CCSOURCES) $(CSOURCES)
libgprofng_la_LDFLAGS = -version-info 0:0:0
gp_display_src_LDADD = $(LIBGPROFNG) $(CLOCK_GETTIME_LINK) $(ZLIB)
gp_display_text_SOURCES = gp-display-text.cc ipc.cc ipcio.cc
gp_display_text_LDADD = $(LIBGPROFNG) $(CLOCK_GETTIME_LINK) $(ZLIB)
-@BUILD_MAN_TRUE@man_MANS = \
-@BUILD_MAN_TRUE@ gp-archive.1 \
-@BUILD_MAN_TRUE@ gp-collect-app.1 \
-@BUILD_MAN_TRUE@ gp-display-src.1 \
-@BUILD_MAN_TRUE@ gp-display-text.1
-
-@BUILD_MAN_TRUE@MAINTAINERCLEANFILES = $(man_MANS)
-
-# The man pages depend on the version number and on a help2man include file.
-@BUILD_MAN_TRUE@common_mandeps = $(top_srcdir)/../bfd/version.m4
-
-# Use -o so that the `missing' program can infer the output file.
-# Embolden subcommand names in the output, and include a SEE ALSO.
-# Arrange to regenerate the output if we have help2man, but leave the
-# disted output there otherwise.
-# Some extra annoying complexity is in place so that people without
-# help2man dno't accidentally overwrite the manpage.
-@BUILD_MAN_TRUE@INFO_PAGE = "gprofng"
-@BUILD_MAN_TRUE@MANUAL = "User Commands"
-@BUILD_MAN_TRUE@TEXT_GPROFNG = "the driver for the gprofng tool suite"
-@BUILD_MAN_TRUE@TEXT_GP_ARCHIVE = "archive gprofng experiment data"
-@BUILD_MAN_TRUE@TEXT_GP_COLLECT_APP = "collect performance data for the target application"
-@BUILD_MAN_TRUE@TEXT_GP_DISPLAY_SRC = "display the source code, optionally interleaved with the disassembly of the target object"
-@BUILD_MAN_TRUE@TEXT_GP_DISPLAY_TEXT = "display the performance data in plain text format"
-@BUILD_MAN_TRUE@HELP2MAN_OPT = --libtool --no-info --info-page=$(INFO_PAGE) --manual=$(MANUAL)
-@BUILD_MAN_TRUE@H2M_FILTER = | sed 's/\.TP/\.TP\n.B/' | sed 's/Commands:/\.SH COMMANDS/' \
-@BUILD_MAN_TRUE@ | sed 's/See also:/\.SH SEE ALSO/' | sed 's/Documentation:/.SH DOCUMENTATION/' \
-@BUILD_MAN_TRUE@ | sed 's/Limitations:/.SH LIMITATIONS/'
-
all: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) all-am
clean-libtool:
-rm -rf .libs _libs
-install-man1: $(man_MANS)
- @$(NORMAL_INSTALL)
- @list1=''; \
- list2='$(man_MANS)'; \
- test -n "$(man1dir)" \
- && test -n "`echo $$list1$$list2`" \
- || exit 0; \
- echo " $(MKDIR_P) '$(DESTDIR)$(man1dir)'"; \
- $(MKDIR_P) "$(DESTDIR)$(man1dir)" || exit 1; \
- { for i in $$list1; do echo "$$i"; done; \
- if test -n "$$list2"; then \
- for i in $$list2; do echo "$$i"; done \
- | sed -n '/\.1[a-z]*$$/p'; \
- fi; \
- } | while read p; do \
- if test -f $$p; then d=; else d="$(srcdir)/"; fi; \
- echo "$$d$$p"; echo "$$p"; \
- done | \
- sed -e 'n;s,.*/,,;p;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
- -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,' | \
- sed 'N;N;s,\n, ,g' | { \
- list=; while read file base inst; do \
- if test "$$base" = "$$inst"; then list="$$list $$file"; else \
- echo " $(INSTALL_DATA) '$$file' '$(DESTDIR)$(man1dir)/$$inst'"; \
- $(INSTALL_DATA) "$$file" "$(DESTDIR)$(man1dir)/$$inst" || exit $$?; \
- fi; \
- done; \
- for i in $$list; do echo "$$i"; done | $(am__base_list) | \
- while read files; do \
- test -z "$$files" || { \
- echo " $(INSTALL_DATA) $$files '$(DESTDIR)$(man1dir)'"; \
- $(INSTALL_DATA) $$files "$(DESTDIR)$(man1dir)" || exit $$?; }; \
- done; }
-
-uninstall-man1:
- @$(NORMAL_UNINSTALL)
- @list=''; test -n "$(man1dir)" || exit 0; \
- files=`{ for i in $$list; do echo "$$i"; done; \
- l2='$(man_MANS)'; for i in $$l2; do echo "$$i"; done | \
- sed -n '/\.1[a-z]*$$/p'; \
- } | sed -e 's,.*/,,;h;s,.*\.,,;s,^[^1][0-9a-z]*$$,1,;x' \
- -e 's,\.[0-9a-z]*$$,,;$(transform);G;s,\n,.,'`; \
- dir='$(DESTDIR)$(man1dir)'; $(am__uninstall_files_from_dir)
install-dbeDATA: $(dbe_DATA)
@$(NORMAL_INSTALL)
@list='$(dbe_DATA)'; test -n "$(dbedir)" || list=; \
check-am: all-am
check: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) check-am
-all-am: Makefile $(LTLIBRARIES) $(PROGRAMS) $(MANS) $(DATA)
+all-am: Makefile $(LTLIBRARIES) $(PROGRAMS) $(DATA)
install-binPROGRAMS: install-libLTLIBRARIES
installdirs:
- for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(man1dir)" "$(DESTDIR)$(dbedir)"; do \
+ for dir in "$(DESTDIR)$(libdir)" "$(DESTDIR)$(bindir)" "$(DESTDIR)$(dbedir)"; do \
test -z "$$dir" || $(MKDIR_P) "$$dir"; \
done
install: $(BUILT_SOURCES)
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
- -test -z "$(MAINTAINERCLEANFILES)" || rm -f $(MAINTAINERCLEANFILES)
clean: clean-am
clean-am: clean-binPROGRAMS clean-generic clean-libLTLIBRARIES \
info-am:
-install-data-am: install-dbeDATA install-man
+install-data-am: install-dbeDATA
install-dvi: install-dvi-am
install-info-am:
-install-man: install-man1
+install-man:
install-pdf: install-pdf-am
ps-am:
uninstall-am: uninstall-binPROGRAMS uninstall-dbeDATA \
- uninstall-libLTLIBRARIES uninstall-man
-
-uninstall-man: uninstall-man1
+ uninstall-libLTLIBRARIES
.MAKE: all check install install-am install-strip
install-data-am install-dbeDATA install-dvi install-dvi-am \
install-exec install-exec-am install-html install-html-am \
install-info install-info-am install-libLTLIBRARIES \
- install-man install-man1 install-pdf install-pdf-am install-ps \
+ install-man install-pdf install-pdf-am install-ps \
install-ps-am install-strip installcheck installcheck-am \
installdirs maintainer-clean maintainer-clean-generic \
mostlyclean mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
uninstall-am uninstall-binPROGRAMS uninstall-dbeDATA \
- uninstall-libLTLIBRARIES uninstall-man uninstall-man1
+ uninstall-libLTLIBRARIES
.PRECIOUS: Makefile
QLParser.tab.cc QLParser.tab.hh: QLParser.yy
$(BISON) $^
-@BUILD_MAN_TRUE@gp-archive.1: $(srcdir)/gp-archive.cc $(common_mandeps) | ./gp-archive$(EXEEXT)
-@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
-@BUILD_MAN_TRUE@ --name=$(TEXT_GP_ARCHIVE) ./gp-archive$(EXEEXT) $(H2M_FILTER) > $@
-
-@BUILD_MAN_TRUE@gp-collect-app.1: $(srcdir)/gp-collect-app.cc $(common_mandeps) | ./gp-collect-app$(EXEEXT)
-@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
-@BUILD_MAN_TRUE@ --name=$(TEXT_GP_COLLECT_APP) ./gp-collect-app$(EXEEXT) $(H2M_FILTER) > $@
-
-@BUILD_MAN_TRUE@gp-display-src.1: $(srcdir)/gp-display-src.cc $(srcdir)/Command.cc \
-@BUILD_MAN_TRUE@ $(common_mandeps) | ./gp-display-src$(EXEEXT)
-@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
-@BUILD_MAN_TRUE@ --name=$(TEXT_GP_DISPLAY_SRC) ./gp-display-src$(EXEEXT) $(H2M_FILTER) > $@
-
-@BUILD_MAN_TRUE@gp-display-text.1: $(srcdir)/gp-display-text.cc $(srcdir)/Command.cc \
-@BUILD_MAN_TRUE@ $(common_mandeps) | ./gp-display-text$(EXEEXT)
-@BUILD_MAN_TRUE@ $(AM_V_GEN)_BUILDING_MANPAGE=1 $(HELP2MAN) $(HELP2MAN_OPT) \
-@BUILD_MAN_TRUE@ --name=$(TEXT_GP_DISPLAY_TEXT) ./gp-display-text$(EXEEXT) $(H2M_FILTER) > $@
-
# Distribution involves building the binaries to generate the manpage,
# so ensure that the necessary libraries are built at dist time.
dist-hook: $(LIBGPROFNG)