Discussion:
[PATCH 2/2] perf bench: add x86-64 specific benchmarks to perf bench mem memcpy
(too old to reply)
Hitoshi Mitake
2010-10-29 16:01:38 UTC
Permalink
This patch adds new file: mem-memcpy-x86-64-asm.S
for x86-64 specific memcpy() benchmarking.
Added new benchmarks are,
x86-64-rep: memcpy() implemented with rep instruction
x86-64-unrolled: unrolled memcpy()

Original idea of including the source files of kernel
for benchmarking is suggested by Ingo Molnar.
This is more effective than write-once programs for quantitative
evaluation of small, in-kernel leaf functions that are called with high frequency,
because perf bench is in the kernel source tree and executing it
on various hardware, especially new CPU models, is easy.

This way can also be used for other functions of kernel e.g. checksum functions.

Example of usage on Core i3 M330:

| % ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f911f94c010 to 0x7f913ed4d010 ...
|
| 578.732506 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-rep
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7fb4b6fe4010 to 0x7fb4d63e5010 ...
|
| 738.184980 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f6f2e668010 to 0x7f6f4da69010 ...
|
| 767.483269 MB/Sec

This shows clearly that the unrolled memcpy() is more efficient
than the rep version and glibc's one :)

# checkpatch.pl warns about two externs in bench/mem-memcpy.c
# added by this patch. But I think it is no problem.

Signed-off-by: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
Cc: Ma Ling: <***@intel.com>
Cc: Zhao Yakui <***@intel.com>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: Paul Mackerras <***@samba.org>
Cc: Frederic Weisbecker <***@gmail.com>
Cc: Steven Rostedt <***@goodmis.org>
Cc: Thomas Gleixner <***@linutronix.de>
Cc: H. Peter Anvin <***@zytor.com>
---
tools/perf/Makefile | 8 ++++++++
tools/perf/bench/mem-memcpy-x86-64-asm.S | 4 ++++
tools/perf/bench/mem-memcpy.c | 14 ++++++++++++++
3 files changed, 26 insertions(+), 0 deletions(-)
create mode 100644 tools/perf/bench/mem-memcpy-x86-64-asm.S

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index d1db0f6..540020e 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -183,9 +183,12 @@ ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
# Additional ARCH settings for x86
ifeq ($(ARCH),i386)
ARCH := x86
+ ARCH_CFLAGS = -DARCH_X86_64
endif
ifeq ($(ARCH),x86_64)
ARCH := x86
+ ARCH_CFLAGS = -DARCH_X86_64
+ ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
endif

# CFLAGS and LDFLAGS are for the users to override from the command line.
@@ -417,6 +420,7 @@ LIB_H += util/probe-finder.h
LIB_H += util/probe-event.h
LIB_H += util/pstack.h
LIB_H += util/cpumap.h
+LIB_H += $(ARCH_INCLUDE)

LIB_OBJS += $(OUTPUT)util/abspath.o
LIB_OBJS += $(OUTPUT)util/alias.o
@@ -472,6 +476,9 @@ BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
# Benchmark modules
BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(ARCH),x86)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+endif
BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o

BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
@@ -898,6 +905,7 @@ BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
LIB_OBJS += $(COMPAT_OBJS)

ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_CFLAGS += $(ARCH_CFLAGS)
ALL_LDFLAGS += $(BASIC_LDFLAGS)

export TAR INSTALL DESTDIR SHELL_PATH
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
new file mode 100644
index 0000000..6246d94
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -0,0 +1,4 @@
+
+#define PERF_BENCH
+
+#include "../../../arch/x86/lib/memcpy_64.S"
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 38dae74..ba73f39 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -19,6 +19,11 @@
#include <sys/time.h>
#include <errno.h>

+#ifdef ARCH_X86_64
+extern void *memcpy_x86_64_unrolled(void *to, const void *from, size_t len);
+extern void *memcpy_x86_64_rep(void *to, const void *from, size_t len);
+#endif
+
#define K 1024

static const char *length_str = "1MB";
@@ -47,6 +52,15 @@ struct routine routines[] = {
{ "default",
"Default memcpy() provided by glibc",
memcpy },
+#ifdef ARCH_X86_64
+ { "x86-64-unrolled",
+ "unrolled memcpy() in arch/x86/lib/memcpy_64.S",
+ memcpy_x86_64_unrolled },
+ { "x86-64-rep",
+ "memcpy() implemented with rep instruction"
+ " in arch/x86/lib/memcpy_64.S",
+ memcpy_x86_64_rep },
+#endif
{ NULL,
NULL,
NULL }
--
1.7.1.1
Peter Zijlstra
2010-10-29 19:49:11 UTC
Permalink
This patch ports arch/x86/lib/memcpy_64.S to "perf bench mem".
When PERF_BENCH is defined at preprocessor level,
memcpy_64.S is preprocessed to includable form from the sources
under tools/perf for benchmarking programs.
---
arch/x86/lib/memcpy_64.S | 30 ++++++++++++++++++++++++++++++
1 files changed, 30 insertions(+), 0 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..72c6dfe 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,10 +1,23 @@
/* Copyright 2002 Andi Kleen */
+/*
+ * perf bench adoption by Hitoshi Mitake
+ * PERF_BENCH means that this file is included from
+ * the source files under tools/perf/ for benchmark programs.
+ *
+ * You don't have to care about PERF_BENCH when
+ * you are working on the kernel.
+ */
+
+#ifndef PERF_BENCH
I don't like littering the actual kernel code with tools/perf/
ifdeffery..
Ingo Molnar
2010-10-30 19:21:31 UTC
Permalink
Post by Peter Zijlstra
This patch ports arch/x86/lib/memcpy_64.S to "perf bench mem".
When PERF_BENCH is defined at preprocessor level,
memcpy_64.S is preprocessed to includable form from the sources
under tools/perf for benchmarking programs.
---
arch/x86/lib/memcpy_64.S | 30 ++++++++++++++++++++++++++++++
1 files changed, 30 insertions(+), 0 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..72c6dfe 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,10 +1,23 @@
/* Copyright 2002 Andi Kleen */
+/*
+ * perf bench adoption by Hitoshi Mitake
+ * PERF_BENCH means that this file is included from
+ * the source files under tools/perf/ for benchmark programs.
+ *
+ * You don't have to care about PERF_BENCH when
+ * you are working on the kernel.
+ */
+
+#ifndef PERF_BENCH
I don't like littering the actual kernel code with tools/perf/
ifdeffery..
Yeah - could we somehow accept that file into a perf build as-is?

Thanks,

Ingo
Miao Xie
2010-12-20 06:30:47 UTC
Permalink
Post by Ingo Molnar
Post by Peter Zijlstra
This patch ports arch/x86/lib/memcpy_64.S to "perf bench mem".
When PERF_BENCH is defined at preprocessor level,
memcpy_64.S is preprocessed to includable form from the sources
under tools/perf for benchmarking programs.
---
arch/x86/lib/memcpy_64.S | 30 ++++++++++++++++++++++++++++++
1 files changed, 30 insertions(+), 0 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..72c6dfe 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,10 +1,23 @@
/* Copyright 2002 Andi Kleen */
+/*
+ * perf bench adoption by Hitoshi Mitake
+ * PERF_BENCH means that this file is included from
+ * the source files under tools/perf/ for benchmark programs.
+ *
+ * You don't have to care about PERF_BENCH when
+ * you are working on the kernel.
+ */
+
+#ifndef PERF_BENCH
I don't like littering the actual kernel code with tools/perf/
ifdeffery..
Yeah - could we somehow accept that file into a perf build as-is?
Thanks,
Ingo
Really sorry for my slow work...
BTW, I have a question for Miao and Ingo.
We are planning to implement new memcpy() of Miao,
and the important point is not removing previous memcpy()
for future architectures and benchmarkings.
I feel that adding new CPU feature flag (like X86_FEATURE_REP_GOOD)
and switching memcpy() with alternative mechanism is good way.
(So we will have three memcpy()s: rep based, unrolled, and new
unaligned oriented one)
But there is another way: #ifdef. Which do you prefer?
I agree with your idea, but Ma Ling said this way may cause the i-cache
miss problem.
http://marc.info/?l=linux-kernel&m=128746120107953&w=2
(The size of the i-cache is 32K, the size of memcpy() in my patch is 560Byte,
and the size of the last version in tip tree is 400Byte).

But I have not tested it, so I don't know the real result. Maybe we should
try to implement the new memcpy() first.
And could you tell me the detail of CPU family information
you are targeting, Miao?
They are Core2 Duo E7300 (Core name: Wolfdale) and Xeon X5260 (Core name: Wolfdale-DP).

The following is the detailed information of these two CPU:
Core2 Duo E7300:
vendor_id : GenuineIntel
cpu family : 6
model : 23
model name : Intel(R) Core(TM)2 Duo CPU E7300 @ 2.66GHz
stepping : 6
cpu MHz : 1603.000
cache size : 3072 KB
physical id : 0
siblings : 2
core id : 1
cpu cores : 2
apicid : 1
initial apicid : 1
fpu : yes
fpu_exception : yes
cpuid level : 10
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm dts
bogomips : 5319.70
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
power management:

Xeon X5260:
vendor_id : GenuineIntel
cpu family : 6
model : 23
model name : Intel(R) Xeon(R) CPU X5260 @ 3.33GHz
stepping : 6
cpu MHz : 1999.000
cache size : 6144 KB
physical id : 3
siblings : 2
core id : 1
cpu cores : 2
apicid : 7
initial apicid : 7
fpu : yes
fpu_exception : yes
cpuid level : 10
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 lahf_lm dts tpr_shadow vnmi flexpriority
bogomips : 6649.07
clflush size : 64
cache_alignment : 64
address sizes : 38 bits physical, 48 bits virtual
power management:

Thanks
Miao
Hitoshi Mitake
2010-12-20 15:34:39 UTC
Permalink
Post by Ingo Molnar
Post by Peter Zijlstra
This patch ports arch/x86/lib/memcpy_64.S to "perf bench mem".
When PERF_BENCH is defined at preprocessor level,
memcpy_64.S is preprocessed to includable form from the sources
under tools/perf for benchmarking programs.
---
arch/x86/lib/memcpy_64.S | 30 ++++++++++++++++++++++++++++++
1 files changed, 30 insertions(+), 0 deletions(-)
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e..72c6dfe 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,10 +1,23 @@
/* Copyright 2002 Andi Kleen */
+/*
+ * perf bench adoption by Hitoshi Mitake
+ * PERF_BENCH means that this file is included from
+ * the source files under tools/perf/ for benchmark programs.
+ *
+ * You don't have to care about PERF_BENCH when
+ * you are working on the kernel.
+ */
+
+#ifndef PERF_BENCH
I don't like littering the actual kernel code with tools/perf/
ifdeffery..
Yeah - could we somehow accept that file into a perf build as-is?
Thanks,
Ingo
Really sorry for my slow work...
BTW, I have a question for Miao and Ingo.
We are planning to implement new memcpy() of Miao,
and the important point is not removing previous memcpy()
for future architectures and benchmarkings.
I feel that adding new CPU feature flag (like X86_FEATURE_REP_GOOD)
and switching memcpy() with alternative mechanism is good way.
(So we will have three memcpy()s: rep based, unrolled, and new
unaligned oriented one)
But there is another way: #ifdef. Which do you prefer?
I agree with your idea, but Ma Ling said this way may cause the i-cache
miss problem.
http://marc.info/?l=linux-kernel&m=128746120107953&w=2
(The size of the i-cache is 32K, the size of memcpy() in my patch is
560Byte,
and the size of the last version in tip tree is 400Byte).
But I have not tested it, so I don't know the real result. Maybe we should
try to implement the new memcpy() first.
I compared memcpy()'s icache miss behaviour with my new
--wait-on patch ( https://patchwork.kernel.org/patch/408801/ ).
And the result is,

default of tip tree

% sudo ./perf stat -w /tmp/perf-stat-wait -e L1-icache-load-misses

Performance counter stats for process id '12559':

64,328 L1-icache-load-misses

0.106513157 seconds time elapsed

Miao Xie's memcpy()

% sudo ./perf stat -w /tmp/perf-stat-wait -e L1-icache-misses

Performance counter stats for process id '13159':

64,559 L1-icache-load-misses

0.107057925 seconds time elapsed

It seems that there is no fatal icache miss.
# I tested perf bench mem memcpy with Core i3 M 330 processor.

But I don't understand well the cache characteristics of Intel processors.
I have to look at this problem more deeply.
And could you tell me the detail of CPU family information
you are targeting, Miao?
They are Core2 Duo E7300 (Core name: Wolfdale) and Xeon X5260 (Core name: Wolfdale-DP).
vendor_id : GenuineIntel
cpu family : 6
model : 23
model name : Intel(R) Core(TM)2 Duo CPU E7300 @ 2.66GHz
stepping : 6
cpu MHz : 1603.000
cache size : 3072 KB
physical id : 0
siblings : 2
core id : 1
cpu cores : 2
apicid : 1
initial apicid : 1
fpu : yes
fpu_exception : yes
cpuid level : 10
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl est tm2 ssse3 cx16 xtpr pdcm sse4_1 lahf_lm dts
bogomips : 5319.70
clflush size : 64
cache_alignment : 64
address sizes : 36 bits physical, 48 bits virtual
vendor_id : GenuineIntel
cpu family : 6
model : 23
model name : Intel(R) Xeon(R) CPU X5260 @ 3.33GHz
stepping : 6
cpu MHz : 1999.000
cache size : 6144 KB
physical id : 3
siblings : 2
core id : 1
cpu cores : 2
apicid : 7
initial apicid : 7
fpu : yes
fpu_exception : yes
cpuid level : 10
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall lm constant_tsc arch_perfmon pebs bts rep_good aperfmperf pni dtes64 monitor ds_cpl vmx est tm2 ssse3 cx16 xtpr pdcm dca sse4_1 lahf_lm dts tpr_shadow vnmi flexpriority
bogomips : 6649.07
clflush size : 64
cache_alignment : 64
address sizes : 38 bits physical, 48 bits virtual
Thanks for your information!

Thanks,
Hitoshi
Hitoshi Mitake
2010-11-05 17:10:51 UTC
Permalink
On Sat, Oct 30, 2010 at 06:08, Arnaldo Carvalho de Melo
Post by Peter Zijlstra
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,10 +1,23 @@
=A0/* Copyright 2002 Andi Kleen */
+/*
+ * perf bench adoption by Hitoshi Mitake
+ * PERF_BENCH means that this file is included from
+ * the source files under tools/perf/ for benchmark programs.
+ *
+ * You don't have to care about PERF_BENCH when
+ * you are working on the kernel.
+ */
+
+#ifndef PERF_BENCH
I don't like littering the actual kernel code with tools/perf/
ifdeffery..
Yeah, this kind of problem appeared in the past, we can't use things
that weren't specifically designed to be shared, the discussion about
how to properly share things between the kernel and things in tools
still has to happen.
OK, it seems that I have to consider better solution.
Could you tell me about the past problem for reference?
Your experience must be useful for this case.

--=20
Hitoshi Mitake
***@gmail.com
Ingo Molnar
2010-10-30 19:23:57 UTC
Permalink
Post by Hitoshi Mitake
This patch adds new file: mem-memcpy-x86-64-asm.S
for x86-64 specific memcpy() benchmarking.
Added new benchmarks are,
x86-64-rep: memcpy() implemented with rep instruction
x86-64-unrolled: unrolled memcpy()
Original idea of including the source files of kernel
for benchmarking is suggested by Ingo Molnar.
This is more effective than write-once programs for quantitative
evaluation of in-kernel, little and leaf functions called high frequently.
Because perf bench is in kernel source tree and executing it
on various hardwares, especially new model CPUs, is easy.
This way can also be used for other functions of kernel e.g. checksum functions.
| % ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f911f94c010 to 0x7f913ed4d010 ...
|
| 578.732506 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-rep
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7fb4b6fe4010 to 0x7fb4d63e5010 ...
|
| 738.184980 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f6f2e668010 to 0x7f6f4da69010 ...
|
| 767.483269 MB/Sec
This shows clearly that the unrolled memcpy() is more efficient
than the rep version and glibc's one :)
Hey, really cool output :-)

Might also make sense to measure Ma Ling's patched version?
Post by Hitoshi Mitake
# checkpatch.pl warns about two externs in bench/mem-memcpy.c
# added by this patch. But I think it is no problem.
You should put these:

+#ifdef ARCH_X86_64
+extern void *memcpy_x86_64_unrolled(void *to, const void *from, size_t len);
+extern void *memcpy_x86_64_rep(void *to, const void *from, size_t len);
+#endif

into a .h file - a new one if needed.

That will make both checkpatch and me happier ;-)

Thanks,

Ingo
Hitoshi Mitake
2010-11-01 05:36:38 UTC
Permalink
Post by Ingo Molnar
Post by Hitoshi Mitake
This patch adds new file: mem-memcpy-x86-64-asm.S
for x86-64 specific memcpy() benchmarking.
Added new benchmarks are,
x86-64-rep: memcpy() implemented with rep instruction
x86-64-unrolled: unrolled memcpy()
Original idea of including the source files of kernel
for benchmarking is suggested by Ingo Molnar.
This is more effective than write-once programs for quantitative
evaluation of in-kernel, little and leaf functions called high frequ=
ently.
Post by Ingo Molnar
Post by Hitoshi Mitake
Because perf bench is in kernel source tree and executing it
on various hardwares, especially new model CPUs, is easy.
This way can also be used for other functions of kernel e.g. checksu=
m functions.
Post by Ingo Molnar
Post by Hitoshi Mitake
| % ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f911f94c010 to 0x7f913ed4d010 ...
|
| 578.732506 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-rep
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7fb4b6fe4010 to 0x7fb4d63e5010 ...
|
| 738.184980 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f6f2e668010 to 0x7f6f4da69010 ...
|
| 767.483269 MB/Sec
This shows clearly that unrolled memcpy() is efficient
than rep version and glibc's one :)
Hey, really cool output :-)
Might also make sense to measure Ma Ling's patched version?
Does Ma Ling's patched version mean,

http://marc.info/?l=3Dlinux-kernel&m=3D128652296500989&w=3D2

the memcpy applied the patch of the URL?
(It seems that this patch was written by Miao Xie.)

I'll include the result of patched version in the next post.
Post by Ingo Molnar
Post by Hitoshi Mitake
# checkpatch.pl warns about two externs in bench/mem-memcpy.c
# added by this patch. But I think it is no problem.
+#ifdef ARCH_X86_64
+extern void *memcpy_x86_64_unrolled(void *to, const void *from, si=
ze_t len);
Post by Ingo Molnar
+extern void *memcpy_x86_64_rep(void *to, const void *from, size_t =
len);
Post by Ingo Molnar
+#endif
into a .h file - a new one if needed.
That will make both checkpatch and me happier ;-)
OK, I'll separate these files.

BTW, I found really interesting evaluation result.
Current results of "perf bench mem memcpy" include
the overhead of page faults because the measured memcpy()
is the first access to allocated memory area.

I tested the another version of perf bench mem memcpy,
which does memcpy() before measured memcpy() for removing
the overhead come from page faults.

And this is the result:

% ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f19d488f010 to 0x7f19f3c90010 ...

4.608340 GB/Sec

% ./perf bench mem memcpy -l 500MB
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f696c3cc010 to 0x7f698b7cd010 ...

4.856442 GB/Sec

% ./perf bench mem memcpy -l 500MB -r x86-64-rep
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f45d6cff010 to 0x7f45f6100010 ...

6.024445 GB/Sec

The relation of scores reversed!
I cannot explain the cause of this result, and
this is really interesting phenomenon.

So I'd like to add new command line option,
like "--pre-page-faults" to perf bench mem memcpy,
for doing memcpy() before measured memcpy().

How do you think about this idea?

Thanks,
Ingo Molnar
2010-11-01 09:02:51 UTC
Permalink
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
This patch adds new file: mem-memcpy-x86-64-asm.S
for x86-64 specific memcpy() benchmarking.
Added new benchmarks are,
x86-64-rep: memcpy() implemented with rep instruction
x86-64-unrolled: unrolled memcpy()
Original idea of including the source files of kernel
for benchmarking is suggested by Ingo Molnar.
This is more effective than write-once programs for quantitative
evaluation of in-kernel, little and leaf functions called high freq=
uently.
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
Because perf bench is in kernel source tree and executing it
on various hardwares, especially new model CPUs, is easy.
This way can also be used for other functions of kernel e.g. checks=
um functions.
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
| % ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f911f94c010 to 0x7f913ed4d010 ...
|
| 578.732506 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-rep
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7fb4b6fe4010 to 0x7fb4d63e5010 ...
|
| 738.184980 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f6f2e668010 to 0x7f6f4da69010 ...
|
| 767.483269 MB/Sec
This shows clearly that unrolled memcpy() is efficient
than rep version and glibc's one :)
Hey, really cool output :-)
Might also make sense to measure Ma Ling's patched version?
=20
Does Ma Ling's patched version mean,
=20
http://marc.info/?l=3Dlinux-kernel&m=3D128652296500989&w=3D2
=20
the memcpy applied the patch of the URL?
(It seems that this patch was written by Miao Xie.)
=20
I'll include the result of patched version in the next post.
(Indeed it is Miao Xie - sorry!)
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
# checkpatch.pl warns about two externs in bench/mem-memcpy.c
# added by this patch. But I think it is no problem.
+#ifdef ARCH_X86_64
+extern void *memcpy_x86_64_unrolled(void *to, const void *from, s=
ize_t len);
Post by Hitoshi Mitake
Post by Ingo Molnar
+extern void *memcpy_x86_64_rep(void *to, const void *from, size_t=
len);
Post by Hitoshi Mitake
Post by Ingo Molnar
+#endif
into a .h file - a new one if needed.
That will make both checkpatch and me happier ;-)
=20
OK, I'll separate these files.
=20
BTW, I found really interesting evaluation result.
Current results of "perf bench mem memcpy" include
the overhead of page faults because the measured memcpy()
is the first access to allocated memory area.
=20
I tested the another version of perf bench mem memcpy,
which does memcpy() before measured memcpy() for removing
the overhead come from page faults.
=20
=20
% ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f19d488f010 to 0x7f19f3c90010 ...
=20
4.608340 GB/Sec
=20
% ./perf bench mem memcpy -l 500MB
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f696c3cc010 to 0x7f698b7cd010 ...
=20
4.856442 GB/Sec
=20
% ./perf bench mem memcpy -l 500MB -r x86-64-rep
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f45d6cff010 to 0x7f45f6100010 ...
=20
6.024445 GB/Sec
=20
The relation of scores reversed!
I cannot explain the cause of this result, and
this is really interesting phenomenon.
Interesting indeed, and it would be nice to analyse that! (It should be possible,
using various PMU metrics in a clever way, to figure out what's happening inside the
CPU, right?)
Post by Hitoshi Mitake
So I'd like to add new command line option,
like "--pre-page-faults" to perf bench mem memcpy,
for doing memcpy() before measured memcpy().
=20
How do you think about this idea?
Agreed. (Maybe name it --prefault, as 'prefaulting' is the term we generally use for
things like this.)

An even better solution would be to output _both_ results by default, so that people
can see both characteristics at a glance?

Thanks,

Ingo
Hitoshi Mitake
2010-11-05 17:05:57 UTC
Permalink
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
This patch adds new file: mem-memcpy-x86-64-asm.S
for x86-64 specific memcpy() benchmarking.
Added new benchmarks are,
x86-64-rep: memcpy() implemented with rep instruction
x86-64-unrolled: unrolled memcpy()
Original idea of including the source files of kernel
for benchmarking is suggested by Ingo Molnar.
This is more effective than write-once programs for quantitative
evaluation of in-kernel, little and leaf functions called high fre=
quently.
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
Because perf bench is in kernel source tree and executing it
on various hardwares, especially new model CPUs, is easy.
This way can also be used for other functions of kernel e.g. check=
sum functions.
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
| % ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f911f94c010 to 0x7f913ed4d010 ...
|
| 578.732506 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-rep
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7fb4b6fe4010 to 0x7fb4d63e5010 ...
|
| 738.184980 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f6f2e668010 to 0x7f6f4da69010 ...
|
| 767.483269 MB/Sec
This shows clearly that unrolled memcpy() is efficient
than rep version and glibc's one :)
Hey, really cool output :-)
Might also make sense to measure Ma Ling's patched version?
Does Ma Ling's patched version mean,
http://marc.info/?l=3Dlinux-kernel&m=3D128652296500989&w=3D2
the memcpy applied the patch of the URL?
(It seems that this patch was written by Miao Xie.)
I'll include the result of patched version in the next post.
(Indeed it is Miao Xie - sorry!)
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
# checkpatch.pl warns about two externs in bench/mem-memcpy.c
# added by this patch. But I think it is no problem.
+#ifdef ARCH_X86_64
+extern void *memcpy_x86_64_unrolled(void *to, const void *from, =
size_t len);
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
+extern void *memcpy_x86_64_rep(void *to, const void *from, size_=
t len);
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
+#endif
into a .h file - a new one if needed.
That will make both checkpatch and me happier ;-)
OK, I'll separate these files.
BTW, I found really interesting evaluation result.
Current results of "perf bench mem memcpy" include
the overhead of page faults because the measured memcpy()
is the first access to allocated memory area.
I tested the another version of perf bench mem memcpy,
which does memcpy() before measured memcpy() for removing
the overhead come from page faults.
% ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f19d488f010 to 0x7f19f3c90010 ...
4.608340 GB/Sec
% ./perf bench mem memcpy -l 500MB
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f696c3cc010 to 0x7f698b7cd010 ...
4.856442 GB/Sec
% ./perf bench mem memcpy -l 500MB -r x86-64-rep
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f45d6cff010 to 0x7f45f6100010 ...
6.024445 GB/Sec
The relation of scores reversed!
I cannot explain the cause of this result, and
this is really interesting phenomenon.
Interesting indeed, and it would be nice to analyse that! (It should =
be possible,
Post by Ingo Molnar
using various PMU metrics in a clever way, to figure out what's happe=
ning inside the
Post by Ingo Molnar
CPU, right?)
Post by Hitoshi Mitake
So I'd like to add new command line option,
like "--pre-page-faults" to perf bench mem memcpy,
for doing memcpy() before measured memcpy().
How do you think about this idea?
Agreed. (Maybe name it --prefault, as 'prefaulting' is the term we ge=
nerally use for
Post by Ingo Molnar
things like this.)
An even better solution would be to output _both_ results by default,=
so that people
Post by Ingo Molnar
can see both characteristics at a glance?
Outputting both result of prefaulted and non prefaulted will be useful,
but this might be not good for using from scripts.
So I'll implement --prefault option first. If there is request
for outputting both, I'll consider to modify default output.

# Please wait about the result of Miao Xie's patch,
# benchmarking memcpy() of unaligned memory area is
# a little difficult

Thanks,
Hitoshi
Ingo Molnar
2010-11-10 09:12:17 UTC
Permalink
An even better solution would be to output _both_ results by default, so that
people can see both characteristics at a glance?
Outputting both result of prefaulted and non prefaulted will be useful, but this
might be not good for using from scripts. So I'll implement --prefault option
first. If there is request for outputting both, I'll consider to modify default
output.
Ok - it should definitely be easily scriptable. The default can have both flags
enabled and both results written to the output.

People will try 'perf bench x86' to see performance at a glance - so printing all
the tests we have is a good idea.

Thanks,

Ingo
Hitoshi Mitake
2010-11-12 15:01:55 UTC
Permalink
An even better solution would be to output _both_ results by default, so that
people can see both characteristics at a glance?
Outputting both results of prefaulted and non prefaulted will be useful, but this
might be not good for using from scripts. So I'll implement --prefault option
first. If there is request for outputting both, I'll consider to modify default
output.
Ok - it should definitely be easily scriptable. The default can have both flags
enabled and both results written to the output.
People will try 'perf bench x86' to see performance at a glance - so printing all
the tests we have is a good idea.
OK, I added --no-prefault and --only-prefault to perf bench mem memcpy.
As you told, printing both of them is convenient.

I send the updated patch later.

Thanks,
Hitoshi Mitake
2010-11-12 15:02:38 UTC
Permalink
After applying this patch, perf bench mem memcpy prints
both the prefaulted and the non-prefaulted score of memcpy().

New options --no-prefault and --only-prefault are added
for printing single result, mainly for scripting usage.

Example of usage:
| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 634.969014 MB/Sec
| 4.828062 GB/Sec (with prefault)
| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB --only-prefault
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 4.705192 GB/Sec (with prefault)
| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB --no-prefault
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 642.725568 MB/Sec

Signed-off-by: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
Cc: Ma Ling <***@intel.com>
Cc: Zhao Yakui <***@intel.com>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: Paul Mackerras <***@samba.org>
Cc: Frederic Weisbecker <***@gmail.com>
Cc: Steven Rostedt <***@goodmis.org>
Cc: Thomas Gleixner <***@linutronix.de>
Cc: H. Peter Anvin <***@zytor.com>
---
tools/perf/bench/mem-memcpy.c | 215 +++++++++++++++++++++++++++++------------
1 files changed, 152 insertions(+), 63 deletions(-)

diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index be31ddb..61b6ead 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -25,7 +25,8 @@ static const char *length_str = "1MB";
static const char *routine = "default";
static bool use_clock;
static int clock_fd;
-static bool prefault;
+static bool only_prefault;
+static bool no_prefault;

static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
@@ -35,15 +36,19 @@ static const struct option options[] = {
"Specify routine to copy"),
OPT_BOOLEAN('c', "clock", &use_clock,
"Use CPU clock for measuring"),
- OPT_BOOLEAN('p', "prefault", &prefault,
- "Cause page faults before memcpy()"),
+ OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+ "Show only the result with page faults before memcpy()"),
+ OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+ "Show only the result without page faults before memcpy()"),
OPT_END()
};

+typedef void *(*memcpy_t)(void *, const void *, size_t);
+
struct routine {
const char *name;
const char *desc;
- void * (*fn)(void *dst, const void *src, size_t len);
+ memcpy_t fn;
};

struct routine routines[] = {
@@ -92,29 +97,98 @@ static double timeval2double(struct timeval *ts)
(double)ts->tv_usec / (double)1000000;
}

+static void alloc_mem(void **dst, void **src, size_t length)
+{
+ *dst = zalloc(length);
+ if (!dst)
+ die("memory allocation failed - maybe length is too large?\n");
+
+ *src = zalloc(length);
+ if (!src)
+ die("memory allocation failed - maybe length is too large?\n");
+}
+
+static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
+{
+ u64 clock_start = 0ULL, clock_end = 0ULL;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ clock_start = get_clock();
+ fn(dst, src, len);
+ clock_end = get_clock();
+
+ free(src);
+ free(dst);
+ return clock_end - clock_start;
+}
+
+static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
+{
+ struct timeval tv_start, tv_end, tv_diff;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ BUG_ON(gettimeofday(&tv_start, NULL));
+ fn(dst, src, len);
+ BUG_ON(gettimeofday(&tv_end, NULL));
+
+ timersub(&tv_end, &tv_start, &tv_diff);
+
+ free(src);
+ free(dst);
+ return (double)((double)len / timeval2double(&tv_diff));
+}
+
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do { \
+ if (x < K) \
+ printf(" %14lf B/Sec", x); \
+ else if (x < K * K) \
+ printf(" %14lfd KB/Sec", x / K); \
+ else if (x < K * K * K) \
+ printf(" %14lf MB/Sec", x / K / K); \
+ else \
+ printf(" %14lf GB/Sec", x / K / K / K); \
+ } while (0)
+
int bench_mem_memcpy(int argc, const char **argv,
const char *prefix __used)
{
int i;
- void *dst, *src;
- size_t length;
- double bps = 0.0;
- struct timeval tv_start, tv_end, tv_diff;
- u64 clock_start, clock_end, clock_diff;
+ size_t len;
+ double result_bps[2];
+ u64 result_clock[2];

- clock_start = clock_end = clock_diff = 0ULL;
argc = parse_options(argc, argv, options,
bench_mem_memcpy_usage, 0);

- tv_diff.tv_sec = 0;
- tv_diff.tv_usec = 0;
- length = (size_t)perf_atoll((char *)length_str);
+ if (use_clock)
+ init_clock();
+
+ len = (size_t)perf_atoll((char *)length_str);

- if ((s64)length <= 0) {
+ result_clock[0] = result_clock[1] = 0ULL;
+ result_bps[0] = result_bps[1] = 0.0;
+
+ if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}

+ /* same to without specifying either of prefault and no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
@@ -129,65 +203,80 @@ int bench_mem_memcpy(int argc, const char **argv,
return 1;
}

- dst = zalloc(length);
- if (!dst)
- die("memory allocation failed - maybe length is too large?\n");
-
- src = zalloc(length);
- if (!src)
- die("memory allocation failed - maybe length is too large?\n");
-
- if (bench_format == BENCH_FORMAT_DEFAULT) {
- printf("# Copying %s Bytes from %p to %p ...\n\n",
- length_str, src, dst);
- }
-
-
- if (prefault)
- routines[i].fn(dst, src, length);
-
- if (use_clock) {
- init_clock();
- clock_start = get_clock();
- } else {
- BUG_ON(gettimeofday(&tv_start, NULL));
- }
+ if (bench_format == BENCH_FORMAT_DEFAULT)
+ printf("# Copying %s Bytes ...\n\n", length_str);

- routines[i].fn(dst, src, length);
-
- if (use_clock) {
- clock_end = get_clock();
- clock_diff = clock_end - clock_start;
+ if (!only_prefault && !no_prefault) {
+ /* show both of results */
+ if (use_clock) {
+ result_clock[0] =
+ do_memcpy_clock(routines[i].fn, len, false);
+ result_clock[1] =
+ do_memcpy_clock(routines[i].fn, len, true);
+ } else {
+ result_bps[0] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, false);
+ result_bps[1] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, true);
+ }
} else {
- BUG_ON(gettimeofday(&tv_end, NULL));
- timersub(&tv_end, &tv_start, &tv_diff);
- bps = (double)((double)length / timeval2double(&tv_diff));
+ if (use_clock) {
+ result_clock[pf] =
+ do_memcpy_clock(routines[i].fn,
+ len, only_prefault);
+ } else {
+ result_bps[pf] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, only_prefault);
+ }
}

switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
- if (use_clock) {
- printf(" %14lf Clock/Byte\n",
- (double)clock_diff / (double)length);
- } else {
- if (bps < K)
- printf(" %14lf B/Sec\n", bps);
- else if (bps < K * K)
- printf(" %14lfd KB/Sec\n", bps / 1024);
- else if (bps < K * K * K)
- printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
- else {
- printf(" %14lf GB/Sec\n",
- bps / 1024 / 1024 / 1024);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte\n",
+ (double)result_clock[0]
+ / (double)len);
+ printf(" %14lf Clock/Byte (with prefault)\n",
+ (double)result_clock[1]
+ / (double)len);
+ } else {
+ print_bps(result_bps[0]);
+ printf("\n");
+ print_bps(result_bps[1]);
+ printf(" (with prefault)\n");
}
+ } else {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte",
+ (double)result_clock[pf]
+ / (double)len);
+ } else
+ print_bps(result_bps[pf]);
+
+ printf("%s\n", only_prefault ? " (with prefault)" : "");
}
break;
case BENCH_FORMAT_SIMPLE:
- if (use_clock) {
- printf("%14lf\n",
- (double)clock_diff / (double)length);
- } else
- printf("%lf\n", bps);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf("%lf %lf\n",
+ (double)result_clock[0] / (double)len,
+ (double)result_clock[1] / (double)len);
+ } else {
+ printf("%lf %lf\n",
+ result_bps[0], result_bps[1]);
+ }
+ } else {
+ if (use_clock) {
+ printf("%lf\n", (double)result_clock[pf]
+ / (double)len);
+ } else
+ printf("%lf\n", result_bps[pf]);
+ }
break;
default:
/* reaching this means there's some disaster: */
--
1.7.1.1
Ingo Molnar
2010-11-18 07:58:16 UTC
Permalink
Post by Hitoshi Mitake
After applying this patch, perf bench mem memcpy prints
both the prefaulted and non-prefaulted scores of memcpy().
New options --no-prefault and --only-prefault are added
for printing single result, mainly for scripting usage.
Ok. Mind resending the whole series once all review feedback has been incorporated?

Thanks,

Ingo
Hitoshi Mitake
2010-11-25 07:04:06 UTC
Permalink
Really sorry for my late reply..
Post by Ingo Molnar
Post by Hitoshi Mitake
After applying this patch, perf bench mem memcpy prints
both the prefaulted and non-prefaulted scores of memcpy().
New options --no-prefault and --only-prefault are added
for printing single result, mainly for scripting usage.
Ok. Mind resending the whole series once all review feedback has been incorporated?
OK, I'll send the patch series for prefaulting and
porting memcpy_64.S to perf bench later.
This series do some dirty things especially in Makefile
of perf and defining ENTRY(). So I'd like to hear your comment.
Could you review these?

And I have another problem. I cannot see the name of
the rep-prefix-based memcpy because its symbol is ".Lmemcpy_c".
It seems that a symbol name starting with "." cannot be seen
from other object files. So I have to find a way to
get the name of the rep memcpy...

Thanks,
Hitoshi
Hitoshi Mitake
2010-11-25 07:04:53 UTC
Permalink
This patch ports arch/x86/lib/memcpy_64.S to perf bench mem memcpy
for benchmarking memcpy() in userland with tricky and dirty way.

util/include/asm/cpufeature.h, util/include/asm/dwarf2.h, and
util/include/linux/linkage.h are dummy (but do a little work) for
including memcpy_64.S without modification to it (e.g. defining ENTRY()).

This makes checkpatch.pl angry like this:
\#177: FILE: tools/perf/util/include/linux/linkage.h:7:
+#define ENTRY(name) \
+ .globl name; \
+ name:

WARNING: labels should not be indented
\#179: FILE: tools/perf/util/include/linux/linkage.h:9:
+ name:

because checkpatch.pl treats this file as a file written in C.
But I think this can be forgiven because the original include/linux/linkage.h
does a similar thing.

Signed-off-by: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
Cc: Miao Xie <***@cn.fujitsu.com>
Cc: Ma Ling <***@intel.com>
Cc: Zhao Yakui <***@intel.com>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: Paul Mackerras <***@samba.org>
Cc: Frederic Weisbecker <***@gmail.com>
Cc: Steven Rostedt <***@goodmis.org>
Cc: Thomas Gleixner <***@linutronix.de>
Cc: H. Peter Anvin <***@zytor.com>
Cc: Andi Kleen <***@firstfloor.org>
---
tools/perf/Makefile | 11 +++++++++++
tools/perf/bench/mem-memcpy-arch.h | 12 ++++++++++++
tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 4 ++++
tools/perf/bench/mem-memcpy-x86-64-asm.S | 2 ++
tools/perf/util/include/asm/cpufeature.h | 9 +++++++++
tools/perf/util/include/asm/dwarf2.h | 11 +++++++++++
tools/perf/util/include/linux/linkage.h | 13 +++++++++++++
7 files changed, 62 insertions(+), 0 deletions(-)
create mode 100644 tools/perf/bench/mem-memcpy-arch.h
create mode 100644 tools/perf/bench/mem-memcpy-x86-64-asm-def.h
create mode 100644 tools/perf/bench/mem-memcpy-x86-64-asm.S
create mode 100644 tools/perf/util/include/asm/cpufeature.h
create mode 100644 tools/perf/util/include/asm/dwarf2.h
create mode 100644 tools/perf/util/include/linux/linkage.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 2d414b3..b3e6bc6 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -185,7 +185,10 @@ ifeq ($(ARCH),i386)
ARCH := x86
endif
ifeq ($(ARCH),x86_64)
+ RAW_ARCH := x86_64
ARCH := x86
+ ARCH_CFLAGS := -DARCH_X86_64
+ ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
endif

# CFLAGS and LDFLAGS are for the users to override from the command line.
@@ -375,6 +378,7 @@ LIB_H += util/include/linux/prefetch.h
LIB_H += util/include/linux/rbtree.h
LIB_H += util/include/linux/string.h
LIB_H += util/include/linux/types.h
+LIB_H += util/include/linux/linkage.h
LIB_H += util/include/asm/asm-offsets.h
LIB_H += util/include/asm/bug.h
LIB_H += util/include/asm/byteorder.h
@@ -383,6 +387,8 @@ LIB_H += util/include/asm/swab.h
LIB_H += util/include/asm/system.h
LIB_H += util/include/asm/uaccess.h
LIB_H += util/include/dwarf-regs.h
+LIB_H += util/include/asm/dwarf2.h
+LIB_H += util/include/asm/cpufeature.h
LIB_H += perf.h
LIB_H += util/cache.h
LIB_H += util/callchain.h
@@ -417,6 +423,7 @@ LIB_H += util/probe-finder.h
LIB_H += util/probe-event.h
LIB_H += util/pstack.h
LIB_H += util/cpumap.h
+LIB_H += $(ARCH_INCLUDE)

LIB_OBJS += $(OUTPUT)util/abspath.o
LIB_OBJS += $(OUTPUT)util/alias.o
@@ -472,6 +479,9 @@ BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
# Benchmark modules
BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+endif
BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o

BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
@@ -909,6 +919,7 @@ BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
LIB_OBJS += $(COMPAT_OBJS)

ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_CFLAGS += $(ARCH_CFLAGS)
ALL_LDFLAGS += $(BASIC_LDFLAGS)

export TAR INSTALL DESTDIR SHELL_PATH
diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h
new file mode 100644
index 0000000..a72e36c
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-arch.h
@@ -0,0 +1,12 @@
+
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) \
+ extern void *fn(void *, const void *, size_t);
+
+#include "mem-memcpy-x86-64-asm-def.h"
+
+#undef MEMCPY_FN
+
+#endif
+
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
new file mode 100644
index 0000000..d588b87
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -0,0 +1,4 @@
+
+MEMCPY_FN(__memcpy,
+ "x86-64-unrolled",
+ "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
new file mode 100644
index 0000000..a57b66e
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -0,0 +1,2 @@
+
+#include "../../../arch/x86/lib/memcpy_64.S"
diff --git a/tools/perf/util/include/asm/cpufeature.h b/tools/perf/util/include/asm/cpufeature.h
new file mode 100644
index 0000000..acffd5e
--- /dev/null
+++ b/tools/perf/util/include/asm/cpufeature.h
@@ -0,0 +1,9 @@
+
+#ifndef PERF_CPUFEATURE_H
+#define PERF_CPUFEATURE_H
+
+/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define X86_FEATURE_REP_GOOD 0
+
+#endif /* PERF_CPUFEATURE_H */
diff --git a/tools/perf/util/include/asm/dwarf2.h b/tools/perf/util/include/asm/dwarf2.h
new file mode 100644
index 0000000..bb4198e
--- /dev/null
+++ b/tools/perf/util/include/asm/dwarf2.h
@@ -0,0 +1,11 @@
+
+#ifndef PERF_DWARF2_H
+#define PERF_DWARF2_H
+
+/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define CFI_STARTPROC
+#define CFI_ENDPROC
+
+#endif /* PERF_DWARF2_H */
+
diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h
new file mode 100644
index 0000000..06387cf
--- /dev/null
+++ b/tools/perf/util/include/linux/linkage.h
@@ -0,0 +1,13 @@
+
+#ifndef PERF_LINUX_LINKAGE_H_
+#define PERF_LINUX_LINKAGE_H_
+
+/* linkage.h ... for including arch/x86/lib/memcpy_64.S */
+
+#define ENTRY(name) \
+ .globl name; \
+ name:
+
+#define ENDPROC(name)
+
+#endif /* PERF_LINUX_LINKAGE_H_ */
--
1.6.5.2
tip-bot for Hitoshi Mitake
2010-11-26 10:31:19 UTC
Permalink
Commit-ID: ea7872b9d6a81101f6ba0ec141544a62fea35876
Gitweb: http://git.kernel.org/tip/ea7872b9d6a81101f6ba0ec141544a62fea35876
Author: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
AuthorDate: Thu, 25 Nov 2010 16:04:53 +0900
Committer: Ingo Molnar <***@elte.hu>
CommitDate: Fri, 26 Nov 2010 08:15:57 +0100

perf bench: Add feature that measures the performance of the arch/x86/lib/memcpy_64.S memcpy routines via 'perf bench mem'

This patch ports arch/x86/lib/memcpy_64.S to perf bench mem
memcpy for benchmarking memcpy() in userland with tricky and
dirty way.

util/include/asm/cpufeature.h, util/include/asm/dwarf2.h, and
util/include/linux/linkage.h are mostly dummy files with small
wrappers, so that we are able to include memcpy_64.S
unmodified.

Signed-off-by: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
Cc: ***@gmail.com
Cc: Miao Xie <***@cn.fujitsu.com>
Cc: Ma Ling <***@intel.com>
Cc: Zhao Yakui <***@intel.com>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: Paul Mackerras <***@samba.org>
Cc: Frederic Weisbecker <***@gmail.com>
Cc: Steven Rostedt <***@goodmis.org>
Cc: Andi Kleen <***@firstfloor.org>
LKML-Reference: <1290668693-27068-2-git-send-email-***@dcl.info.waseda.ac.jp>
Signed-off-by: Ingo Molnar <***@elte.hu>
---
tools/perf/Makefile | 11 +++++++++++
tools/perf/bench/mem-memcpy-arch.h | 12 ++++++++++++
tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 4 ++++
tools/perf/bench/mem-memcpy-x86-64-asm.S | 2 ++
tools/perf/util/include/asm/cpufeature.h | 9 +++++++++
tools/perf/util/include/asm/dwarf2.h | 11 +++++++++++
tools/perf/util/include/linux/linkage.h | 13 +++++++++++++
7 files changed, 62 insertions(+), 0 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 74b684d..e0db197 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -185,7 +185,10 @@ ifeq ($(ARCH),i386)
ARCH := x86
endif
ifeq ($(ARCH),x86_64)
+ RAW_ARCH := x86_64
ARCH := x86
+ ARCH_CFLAGS := -DARCH_X86_64
+ ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S
endif

# CFLAGS and LDFLAGS are for the users to override from the command line.
@@ -375,6 +378,7 @@ LIB_H += util/include/linux/prefetch.h
LIB_H += util/include/linux/rbtree.h
LIB_H += util/include/linux/string.h
LIB_H += util/include/linux/types.h
+LIB_H += util/include/linux/linkage.h
LIB_H += util/include/asm/asm-offsets.h
LIB_H += util/include/asm/bug.h
LIB_H += util/include/asm/byteorder.h
@@ -383,6 +387,8 @@ LIB_H += util/include/asm/swab.h
LIB_H += util/include/asm/system.h
LIB_H += util/include/asm/uaccess.h
LIB_H += util/include/dwarf-regs.h
+LIB_H += util/include/asm/dwarf2.h
+LIB_H += util/include/asm/cpufeature.h
LIB_H += perf.h
LIB_H += util/cache.h
LIB_H += util/callchain.h
@@ -417,6 +423,7 @@ LIB_H += util/probe-finder.h
LIB_H += util/probe-event.h
LIB_H += util/pstack.h
LIB_H += util/cpumap.h
+LIB_H += $(ARCH_INCLUDE)

LIB_OBJS += $(OUTPUT)util/abspath.o
LIB_OBJS += $(OUTPUT)util/alias.o
@@ -472,6 +479,9 @@ BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
# Benchmark modules
BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+endif
BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o

BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
@@ -898,6 +908,7 @@ BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
LIB_OBJS += $(COMPAT_OBJS)

ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_CFLAGS += $(ARCH_CFLAGS)
ALL_LDFLAGS += $(BASIC_LDFLAGS)

export TAR INSTALL DESTDIR SHELL_PATH
diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h
new file mode 100644
index 0000000..a72e36c
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-arch.h
@@ -0,0 +1,12 @@
+
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) \
+ extern void *fn(void *, const void *, size_t);
+
+#include "mem-memcpy-x86-64-asm-def.h"
+
+#undef MEMCPY_FN
+
+#endif
+
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
new file mode 100644
index 0000000..d588b87
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -0,0 +1,4 @@
+
+MEMCPY_FN(__memcpy,
+ "x86-64-unrolled",
+ "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
new file mode 100644
index 0000000..a57b66e
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -0,0 +1,2 @@
+
+#include "../../../arch/x86/lib/memcpy_64.S"
diff --git a/tools/perf/util/include/asm/cpufeature.h b/tools/perf/util/include/asm/cpufeature.h
new file mode 100644
index 0000000..acffd5e
--- /dev/null
+++ b/tools/perf/util/include/asm/cpufeature.h
@@ -0,0 +1,9 @@
+
+#ifndef PERF_CPUFEATURE_H
+#define PERF_CPUFEATURE_H
+
+/* cpufeature.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define X86_FEATURE_REP_GOOD 0
+
+#endif /* PERF_CPUFEATURE_H */
diff --git a/tools/perf/util/include/asm/dwarf2.h b/tools/perf/util/include/asm/dwarf2.h
new file mode 100644
index 0000000..bb4198e
--- /dev/null
+++ b/tools/perf/util/include/asm/dwarf2.h
@@ -0,0 +1,11 @@
+
+#ifndef PERF_DWARF2_H
+#define PERF_DWARF2_H
+
+/* dwarf2.h ... dummy header file for including arch/x86/lib/memcpy_64.S */
+
+#define CFI_STARTPROC
+#define CFI_ENDPROC
+
+#endif /* PERF_DWARF2_H */
+
diff --git a/tools/perf/util/include/linux/linkage.h b/tools/perf/util/include/linux/linkage.h
new file mode 100644
index 0000000..06387cf
--- /dev/null
+++ b/tools/perf/util/include/linux/linkage.h
@@ -0,0 +1,13 @@
+
+#ifndef PERF_LINUX_LINKAGE_H_
+#define PERF_LINUX_LINKAGE_H_
+
+/* linkage.h ... for including arch/x86/lib/memcpy_64.S */
+
+#define ENTRY(name) \
+ .globl name; \
+ name:
+
+#define ENDPROC(name)
+
+#endif /* PERF_LINUX_LINKAGE_H_ */
Hitoshi Mitake
2010-11-29 13:26:47 UTC
Permalink
On 2010-11-26 19:31, tip-bot for Hitoshi Mitake wrote:
Post by tip-bot for Hitoshi Mitake
Commit-ID: ea7872b9d6a81101f6ba0ec141544a62fea35876
Gitweb:=20
http://git.kernel.org/tip/ea7872b9d6a81101f6ba0ec141544a62fea35876
Post by tip-bot for Hitoshi Mitake
AuthorDate: Thu, 25 Nov 2010 16:04:53 +0900
CommitDate: Fri, 26 Nov 2010 08:15:57 +0100
perf bench: Add feature that measures the performance of the=20
arch/x86/lib/memcpy_64.S memcpy routines via 'perf bench mem'
Post by tip-bot for Hitoshi Mitake
This patch ports arch/x86/lib/memcpy_64.S to perf bench mem
memcpy for benchmarking memcpy() in userland with tricky and
dirty way.
util/include/asm/cpufeature.h, util/include/asm/dwarf2.h, and
util/include/linux/linkage.h are mostly dummy files with small
wrappers, so that we are able to include memcpy_64.S
unmodified.
=20
LKML-Reference:<1290668693-27068-2-git-send-email-***@dcl.info.wased=
a.ac.jp>
Post by tip-bot for Hitoshi Mitake
---
tools/perf/Makefile | 11 +++++++++++
tools/perf/bench/mem-memcpy-arch.h | 12 ++++++++++++
tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 4 ++++
tools/perf/bench/mem-memcpy-x86-64-asm.S | 2 ++
tools/perf/util/include/asm/cpufeature.h | 9 +++++++++
tools/perf/util/include/asm/dwarf2.h | 11 +++++++++++
tools/perf/util/include/linux/linkage.h | 13 +++++++++++++
7 files changed, 62 insertions(+), 0 deletions(-)
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 74b684d..e0db197 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -185,7 +185,10 @@ ifeq ($(ARCH),i386)
ARCH :=3D x86
endif
ifeq ($(ARCH),x86_64)
+ RAW_ARCH :=3D x86_64
ARCH :=3D x86
+ ARCH_CFLAGS :=3D -DARCH_X86_64
+ ARCH_INCLUDE =3D ../../arch/x86/lib/memcpy_64.S
endif
# CFLAGS and LDFLAGS are for the users to override from the comman=
d=20
line.
Post by tip-bot for Hitoshi Mitake
@@ -375,6 +378,7 @@ LIB_H +=3D util/include/linux/prefetch.h
LIB_H +=3D util/include/linux/rbtree.h
LIB_H +=3D util/include/linux/string.h
LIB_H +=3D util/include/linux/types.h
+LIB_H +=3D util/include/linux/linkage.h
LIB_H +=3D util/include/asm/asm-offsets.h
LIB_H +=3D util/include/asm/bug.h
LIB_H +=3D util/include/asm/byteorder.h
@@ -383,6 +387,8 @@ LIB_H +=3D util/include/asm/swab.h
LIB_H +=3D util/include/asm/system.h
LIB_H +=3D util/include/asm/uaccess.h
LIB_H +=3D util/include/dwarf-regs.h
+LIB_H +=3D util/include/asm/dwarf2.h
+LIB_H +=3D util/include/asm/cpufeature.h
LIB_H +=3D perf.h
LIB_H +=3D util/cache.h
LIB_H +=3D util/callchain.h
@@ -417,6 +423,7 @@ LIB_H +=3D util/probe-finder.h
LIB_H +=3D util/probe-event.h
LIB_H +=3D util/pstack.h
LIB_H +=3D util/cpumap.h
+LIB_H +=3D $(ARCH_INCLUDE)
LIB_OBJS +=3D $(OUTPUT)util/abspath.o
LIB_OBJS +=3D $(OUTPUT)util/alias.o
@@ -472,6 +479,9 @@ BUILTIN_OBJS +=3D $(OUTPUT)builtin-bench.o
# Benchmark modules
BUILTIN_OBJS +=3D $(OUTPUT)bench/sched-messaging.o
BUILTIN_OBJS +=3D $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS +=3D $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+endif
BUILTIN_OBJS +=3D $(OUTPUT)bench/mem-memcpy.o
BUILTIN_OBJS +=3D $(OUTPUT)builtin-diff.o
@@ -898,6 +908,7 @@ BASIC_CFLAGS +=3D -DSHA1_HEADER=3D'$(SHA1_HEADER=
_SQ)' \
Post by tip-bot for Hitoshi Mitake
LIB_OBJS +=3D $(COMPAT_OBJS)
ALL_CFLAGS +=3D $(BASIC_CFLAGS)
+ALL_CFLAGS +=3D $(ARCH_CFLAGS)
ALL_LDFLAGS +=3D $(BASIC_LDFLAGS)
export TAR INSTALL DESTDIR SHELL_PATH
diff --git a/tools/perf/bench/mem-memcpy-arch.h=20
b/tools/perf/bench/mem-memcpy-arch.h
Post by tip-bot for Hitoshi Mitake
new file mode 100644
index 0000000..a72e36c
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-arch.h
@@ -0,0 +1,12 @@
+
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) \
+ extern void *fn(void *, const void *, size_t);
+
+#include "mem-memcpy-x86-64-asm-def.h"
+
+#undef MEMCPY_FN
+
+#endif
+
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h=20
b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
Post by tip-bot for Hitoshi Mitake
new file mode 100644
index 0000000..d588b87
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -0,0 +1,4 @@
+
+MEMCPY_FN(__memcpy,
+ "x86-64-unrolled",
+ "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S=20
b/tools/perf/bench/mem-memcpy-x86-64-asm.S
Post by tip-bot for Hitoshi Mitake
new file mode 100644
index 0000000..a57b66e
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -0,0 +1,2 @@
+
+#include "../../../arch/x86/lib/memcpy_64.S"
diff --git a/tools/perf/util/include/asm/cpufeature.h=20
b/tools/perf/util/include/asm/cpufeature.h
Post by tip-bot for Hitoshi Mitake
new file mode 100644
index 0000000..acffd5e
--- /dev/null
+++ b/tools/perf/util/include/asm/cpufeature.h
@@ -0,0 +1,9 @@
+
+#ifndef PERF_CPUFEATURE_H
+#define PERF_CPUFEATURE_H
+
+/* cpufeature.h ... dummy header file for including=20
arch/x86/lib/memcpy_64.S */
Post by tip-bot for Hitoshi Mitake
+
+#define X86_FEATURE_REP_GOOD 0
+
+#endif /* PERF_CPUFEATURE_H */
diff --git a/tools/perf/util/include/asm/dwarf2.h=20
b/tools/perf/util/include/asm/dwarf2.h
Post by tip-bot for Hitoshi Mitake
new file mode 100644
index 0000000..bb4198e
--- /dev/null
+++ b/tools/perf/util/include/asm/dwarf2.h
@@ -0,0 +1,11 @@
+
+#ifndef PERF_DWARF2_H
+#define PERF_DWARF2_H
+
+/* dwarf2.h ... dummy header file for including=20
arch/x86/lib/memcpy_64.S */
Post by tip-bot for Hitoshi Mitake
+
+#define CFI_STARTPROC
+#define CFI_ENDPROC
+
+#endif /* PERF_DWARF2_H */
+
diff --git a/tools/perf/util/include/linux/linkage.h=20
b/tools/perf/util/include/linux/linkage.h
Post by tip-bot for Hitoshi Mitake
new file mode 100644
index 0000000..06387cf
--- /dev/null
+++ b/tools/perf/util/include/linux/linkage.h
@@ -0,0 +1,13 @@
+
+#ifndef PERF_LINUX_LINKAGE_H_
+#define PERF_LINUX_LINKAGE_H_
+
+/* linkage.h ... for including arch/x86/lib/memcpy_64.S */
+
+#define ENTRY(name) \
+ .globl name; \
+
+#define ENDPROC(name)
+
+#endif /* PERF_LINUX_LINKAGE_H_ */
Thanks for your applying, Ingo!

BTW, I have a question.
Why does the symbol name of the rep-prefix memcpy() start with '.'?
A symbol name starting with '.', like ".Lmemcpy_c", cannot be seen
as a symbol name after compilation.

I couldn't find the reason why .Lmemcpy_c has to start with '.'.
For example, clear_page in arch/x86/lib/clear_page_64.S
doesn't start with '.' but it is an alternative function.

If there is no special reason, I'd like to rename it.

Thanks,
Hitoshi
Hitoshi Mitake
2010-11-25 07:04:52 UTC
Permalink
After applying this patch, perf bench mem memcpy prints
both the prefaulted and non-prefaulted scores of memcpy().

New options --no-prefault and --only-prefault are added
to print a single result, mainly for scripting usage.

Example of usage:
| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 634.969014 MB/Sec
| 4.828062 GB/Sec (with prefault)
| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB --only-prefault
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 4.705192 GB/Sec (with prefault)
| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB --no-prefault
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 642.725568 MB/Sec

Signed-off-by: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
Cc: Miao Xie <***@cn.fujitsu.com>
Cc: Ma Ling <***@intel.com>
Cc: Zhao Yakui <***@intel.com>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: Paul Mackerras <***@samba.org>
Cc: Frederic Weisbecker <***@gmail.com>
Cc: Steven Rostedt <***@goodmis.org>
Cc: Thomas Gleixner <***@linutronix.de>
Cc: H. Peter Anvin <***@zytor.com>
Cc: Andi Kleen <***@firstfloor.org>
---
tools/perf/bench/mem-memcpy.c | 219 ++++++++++++++++++++++++++++++-----------
1 files changed, 162 insertions(+), 57 deletions(-)

diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 38dae74..db82021 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -12,6 +12,7 @@
#include "../util/parse-options.h"
#include "../util/header.h"
#include "bench.h"
+#include "mem-memcpy-arch.h"

#include <stdio.h>
#include <stdlib.h>
@@ -23,8 +24,10 @@

static const char *length_str = "1MB";
static const char *routine = "default";
-static bool use_clock = false;
+static bool use_clock;
static int clock_fd;
+static bool only_prefault;
+static bool no_prefault;

static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
@@ -34,19 +37,33 @@ static const struct option options[] = {
"Specify routine to copy"),
OPT_BOOLEAN('c', "clock", &use_clock,
"Use CPU clock for measuring"),
+ OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+ "Show only the result with page faults before memcpy()"),
+ OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+ "Show only the result without page faults before memcpy()"),
OPT_END()
};

+typedef void *(*memcpy_t)(void *, const void *, size_t);
+
struct routine {
const char *name;
const char *desc;
- void * (*fn)(void *dst, const void *src, size_t len);
+ memcpy_t fn;
};

struct routine routines[] = {
{ "default",
"Default memcpy() provided by glibc",
memcpy },
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
+#include "mem-memcpy-x86-64-asm-def.h"
+#undef MEMCPY_FN
+
+#endif
+
{ NULL,
NULL,
NULL }
@@ -89,29 +106,98 @@ static double timeval2double(struct timeval *ts)
(double)ts->tv_usec / (double)1000000;
}

+static void alloc_mem(void **dst, void **src, size_t length)
+{
+ *dst = zalloc(length);
+ if (!dst)
+ die("memory allocation failed - maybe length is too large?\n");
+
+ *src = zalloc(length);
+ if (!src)
+ die("memory allocation failed - maybe length is too large?\n");
+}
+
+static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
+{
+ u64 clock_start = 0ULL, clock_end = 0ULL;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ clock_start = get_clock();
+ fn(dst, src, len);
+ clock_end = get_clock();
+
+ free(src);
+ free(dst);
+ return clock_end - clock_start;
+}
+
+static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
+{
+ struct timeval tv_start, tv_end, tv_diff;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ BUG_ON(gettimeofday(&tv_start, NULL));
+ fn(dst, src, len);
+ BUG_ON(gettimeofday(&tv_end, NULL));
+
+ timersub(&tv_end, &tv_start, &tv_diff);
+
+ free(src);
+ free(dst);
+ return (double)((double)len / timeval2double(&tv_diff));
+}
+
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do { \
+ if (x < K) \
+ printf(" %14lf B/Sec", x); \
+ else if (x < K * K) \
+ printf(" %14lfd KB/Sec", x / K); \
+ else if (x < K * K * K) \
+ printf(" %14lf MB/Sec", x / K / K); \
+ else \
+ printf(" %14lf GB/Sec", x / K / K / K); \
+ } while (0)
+
int bench_mem_memcpy(int argc, const char **argv,
const char *prefix __used)
{
int i;
- void *dst, *src;
- size_t length;
- double bps = 0.0;
- struct timeval tv_start, tv_end, tv_diff;
- u64 clock_start, clock_end, clock_diff;
+ size_t len;
+ double result_bps[2];
+ u64 result_clock[2];

- clock_start = clock_end = clock_diff = 0ULL;
argc = parse_options(argc, argv, options,
bench_mem_memcpy_usage, 0);

- tv_diff.tv_sec = 0;
- tv_diff.tv_usec = 0;
- length = (size_t)perf_atoll((char *)length_str);
+ if (use_clock)
+ init_clock();
+
+ len = (size_t)perf_atoll((char *)length_str);

- if ((s64)length <= 0) {
+ result_clock[0] = result_clock[1] = 0ULL;
+ result_bps[0] = result_bps[1] = 0.0;
+
+ if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}

+ /* same to without specifying either of prefault and no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
@@ -126,61 +212,80 @@ int bench_mem_memcpy(int argc, const char **argv,
return 1;
}

- dst = zalloc(length);
- if (!dst)
- die("memory allocation failed - maybe length is too large?\n");
-
- src = zalloc(length);
- if (!src)
- die("memory allocation failed - maybe length is too large?\n");
-
- if (bench_format == BENCH_FORMAT_DEFAULT) {
- printf("# Copying %s Bytes from %p to %p ...\n\n",
- length_str, src, dst);
- }
-
- if (use_clock) {
- init_clock();
- clock_start = get_clock();
- } else {
- BUG_ON(gettimeofday(&tv_start, NULL));
- }
-
- routines[i].fn(dst, src, length);
+ if (bench_format == BENCH_FORMAT_DEFAULT)
+ printf("# Copying %s Bytes ...\n\n", length_str);

- if (use_clock) {
- clock_end = get_clock();
- clock_diff = clock_end - clock_start;
+ if (!only_prefault && !no_prefault) {
+ /* show both of results */
+ if (use_clock) {
+ result_clock[0] =
+ do_memcpy_clock(routines[i].fn, len, false);
+ result_clock[1] =
+ do_memcpy_clock(routines[i].fn, len, true);
+ } else {
+ result_bps[0] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, false);
+ result_bps[1] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, true);
+ }
} else {
- BUG_ON(gettimeofday(&tv_end, NULL));
- timersub(&tv_end, &tv_start, &tv_diff);
- bps = (double)((double)length / timeval2double(&tv_diff));
+ if (use_clock) {
+ result_clock[pf] =
+ do_memcpy_clock(routines[i].fn,
+ len, only_prefault);
+ } else {
+ result_bps[pf] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, only_prefault);
+ }
}

switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
- if (use_clock) {
- printf(" %14lf Clock/Byte\n",
- (double)clock_diff / (double)length);
- } else {
- if (bps < K)
- printf(" %14lf B/Sec\n", bps);
- else if (bps < K * K)
- printf(" %14lfd KB/Sec\n", bps / 1024);
- else if (bps < K * K * K)
- printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
- else {
- printf(" %14lf GB/Sec\n",
- bps / 1024 / 1024 / 1024);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte\n",
+ (double)result_clock[0]
+ / (double)len);
+ printf(" %14lf Clock/Byte (with prefault)\n",
+ (double)result_clock[1]
+ / (double)len);
+ } else {
+ print_bps(result_bps[0]);
+ printf("\n");
+ print_bps(result_bps[1]);
+ printf(" (with prefault)\n");
}
+ } else {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte",
+ (double)result_clock[pf]
+ / (double)len);
+ } else
+ print_bps(result_bps[pf]);
+
+ printf("%s\n", only_prefault ? " (with prefault)" : "");
}
break;
case BENCH_FORMAT_SIMPLE:
- if (use_clock) {
- printf("%14lf\n",
- (double)clock_diff / (double)length);
- } else
- printf("%lf\n", bps);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf("%lf %lf\n",
+ (double)result_clock[0] / (double)len,
+ (double)result_clock[1] / (double)len);
+ } else {
+ printf("%lf %lf\n",
+ result_bps[0], result_bps[1]);
+ }
+ } else {
+ if (use_clock) {
+ printf("%lf\n", (double)result_clock[pf]
+ / (double)len);
+ } else
+ printf("%lf\n", result_bps[pf]);
+ }
break;
default:
/* reaching this means there's some disaster: */
--
1.6.5.2
tip-bot for Hitoshi Mitake
2010-11-26 10:30:53 UTC
Permalink
Commit-ID: 49ce8fc651794878189fd5f273228832cdfb5be9
Gitweb: http://git.kernel.org/tip/49ce8fc651794878189fd5f273228832cdfb5be9
Author: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
AuthorDate: Thu, 25 Nov 2010 16:04:52 +0900
Committer: Ingo Molnar <***@elte.hu>
CommitDate: Fri, 26 Nov 2010 08:15:57 +0100

perf bench: Print both of prefaulted and no prefaulted results by default

After applying this patch, perf bench mem memcpy prints
both the prefaulted and non-prefaulted scores of memcpy().

New options --no-prefault and --only-prefault are added
to print single result, mainly for scripting usage.

Usage example:

| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 634.969014 MB/Sec
| 4.828062 GB/Sec (with prefault)
| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB --only-prefault
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 4.705192 GB/Sec (with prefault)
| ***@X201i:~/linux/.../tools/perf% ./perf bench mem memcpy -l 500MB --no-prefault
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 642.725568 MB/Sec

Signed-off-by: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
Cc: ***@gmail.com
Cc: Miao Xie <***@cn.fujitsu.com>
Cc: Ma Ling <***@intel.com>
Cc: Zhao Yakui <***@intel.com>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: Paul Mackerras <***@samba.org>
Cc: Frederic Weisbecker <***@gmail.com>
Cc: Steven Rostedt <***@goodmis.org>
Cc: Andi Kleen <***@firstfloor.org>
LKML-Reference: <1290668693-27068-1-git-send-email-***@dcl.info.waseda.ac.jp>
Signed-off-by: Ingo Molnar <***@elte.hu>
---
tools/perf/bench/mem-memcpy.c | 219 ++++++++++++++++++++++++++++++-----------
1 files changed, 162 insertions(+), 57 deletions(-)

diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 38dae74..db82021 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -12,6 +12,7 @@
#include "../util/parse-options.h"
#include "../util/header.h"
#include "bench.h"
+#include "mem-memcpy-arch.h"

#include <stdio.h>
#include <stdlib.h>
@@ -23,8 +24,10 @@

static const char *length_str = "1MB";
static const char *routine = "default";
-static bool use_clock = false;
+static bool use_clock;
static int clock_fd;
+static bool only_prefault;
+static bool no_prefault;

static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
@@ -34,19 +37,33 @@ static const struct option options[] = {
"Specify routine to copy"),
OPT_BOOLEAN('c', "clock", &use_clock,
"Use CPU clock for measuring"),
+ OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+ "Show only the result with page faults before memcpy()"),
+ OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+ "Show only the result without page faults before memcpy()"),
OPT_END()
};

+typedef void *(*memcpy_t)(void *, const void *, size_t);
+
struct routine {
const char *name;
const char *desc;
- void * (*fn)(void *dst, const void *src, size_t len);
+ memcpy_t fn;
};

struct routine routines[] = {
{ "default",
"Default memcpy() provided by glibc",
memcpy },
+#ifdef ARCH_X86_64
+
+#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
+#include "mem-memcpy-x86-64-asm-def.h"
+#undef MEMCPY_FN
+
+#endif
+
{ NULL,
NULL,
NULL }
@@ -89,29 +106,98 @@ static double timeval2double(struct timeval *ts)
(double)ts->tv_usec / (double)1000000;
}

+static void alloc_mem(void **dst, void **src, size_t length)
+{
+ *dst = zalloc(length);
+ if (!dst)
+ die("memory allocation failed - maybe length is too large?\n");
+
+ *src = zalloc(length);
+ if (!src)
+ die("memory allocation failed - maybe length is too large?\n");
+}
+
+static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
+{
+ u64 clock_start = 0ULL, clock_end = 0ULL;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ clock_start = get_clock();
+ fn(dst, src, len);
+ clock_end = get_clock();
+
+ free(src);
+ free(dst);
+ return clock_end - clock_start;
+}
+
+static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
+{
+ struct timeval tv_start, tv_end, tv_diff;
+ void *src = NULL, *dst = NULL;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ BUG_ON(gettimeofday(&tv_start, NULL));
+ fn(dst, src, len);
+ BUG_ON(gettimeofday(&tv_end, NULL));
+
+ timersub(&tv_end, &tv_start, &tv_diff);
+
+ free(src);
+ free(dst);
+ return (double)((double)len / timeval2double(&tv_diff));
+}
+
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do { \
+ if (x < K) \
+ printf(" %14lf B/Sec", x); \
+ else if (x < K * K) \
+ printf(" %14lfd KB/Sec", x / K); \
+ else if (x < K * K * K) \
+ printf(" %14lf MB/Sec", x / K / K); \
+ else \
+ printf(" %14lf GB/Sec", x / K / K / K); \
+ } while (0)
+
int bench_mem_memcpy(int argc, const char **argv,
const char *prefix __used)
{
int i;
- void *dst, *src;
- size_t length;
- double bps = 0.0;
- struct timeval tv_start, tv_end, tv_diff;
- u64 clock_start, clock_end, clock_diff;
+ size_t len;
+ double result_bps[2];
+ u64 result_clock[2];

- clock_start = clock_end = clock_diff = 0ULL;
argc = parse_options(argc, argv, options,
bench_mem_memcpy_usage, 0);

- tv_diff.tv_sec = 0;
- tv_diff.tv_usec = 0;
- length = (size_t)perf_atoll((char *)length_str);
+ if (use_clock)
+ init_clock();
+
+ len = (size_t)perf_atoll((char *)length_str);

- if ((s64)length <= 0) {
+ result_clock[0] = result_clock[1] = 0ULL;
+ result_bps[0] = result_bps[1] = 0.0;
+
+ if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}

+ /* same to without specifying either of prefault and no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
@@ -126,61 +212,80 @@ int bench_mem_memcpy(int argc, const char **argv,
return 1;
}

- dst = zalloc(length);
- if (!dst)
- die("memory allocation failed - maybe length is too large?\n");
-
- src = zalloc(length);
- if (!src)
- die("memory allocation failed - maybe length is too large?\n");
-
- if (bench_format == BENCH_FORMAT_DEFAULT) {
- printf("# Copying %s Bytes from %p to %p ...\n\n",
- length_str, src, dst);
- }
-
- if (use_clock) {
- init_clock();
- clock_start = get_clock();
- } else {
- BUG_ON(gettimeofday(&tv_start, NULL));
- }
-
- routines[i].fn(dst, src, length);
+ if (bench_format == BENCH_FORMAT_DEFAULT)
+ printf("# Copying %s Bytes ...\n\n", length_str);

- if (use_clock) {
- clock_end = get_clock();
- clock_diff = clock_end - clock_start;
+ if (!only_prefault && !no_prefault) {
+ /* show both of results */
+ if (use_clock) {
+ result_clock[0] =
+ do_memcpy_clock(routines[i].fn, len, false);
+ result_clock[1] =
+ do_memcpy_clock(routines[i].fn, len, true);
+ } else {
+ result_bps[0] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, false);
+ result_bps[1] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, true);
+ }
} else {
- BUG_ON(gettimeofday(&tv_end, NULL));
- timersub(&tv_end, &tv_start, &tv_diff);
- bps = (double)((double)length / timeval2double(&tv_diff));
+ if (use_clock) {
+ result_clock[pf] =
+ do_memcpy_clock(routines[i].fn,
+ len, only_prefault);
+ } else {
+ result_bps[pf] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, only_prefault);
+ }
}

switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
- if (use_clock) {
- printf(" %14lf Clock/Byte\n",
- (double)clock_diff / (double)length);
- } else {
- if (bps < K)
- printf(" %14lf B/Sec\n", bps);
- else if (bps < K * K)
- printf(" %14lfd KB/Sec\n", bps / 1024);
- else if (bps < K * K * K)
- printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
- else {
- printf(" %14lf GB/Sec\n",
- bps / 1024 / 1024 / 1024);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte\n",
+ (double)result_clock[0]
+ / (double)len);
+ printf(" %14lf Clock/Byte (with prefault)\n",
+ (double)result_clock[1]
+ / (double)len);
+ } else {
+ print_bps(result_bps[0]);
+ printf("\n");
+ print_bps(result_bps[1]);
+ printf(" (with prefault)\n");
}
+ } else {
+ if (use_clock) {
+ printf(" %14lf Clock/Byte",
+ (double)result_clock[pf]
+ / (double)len);
+ } else
+ print_bps(result_bps[pf]);
+
+ printf("%s\n", only_prefault ? " (with prefault)" : "");
}
break;
case BENCH_FORMAT_SIMPLE:
- if (use_clock) {
- printf("%14lf\n",
- (double)clock_diff / (double)length);
- } else
- printf("%lf\n", bps);
+ if (!only_prefault && !no_prefault) {
+ if (use_clock) {
+ printf("%lf %lf\n",
+ (double)result_clock[0] / (double)len,
+ (double)result_clock[1] / (double)len);
+ } else {
+ printf("%lf %lf\n",
+ result_bps[0], result_bps[1]);
+ }
+ } else {
+ if (use_clock) {
+ printf("%lf\n", (double)result_clock[pf]
+ / (double)len);
+ } else
+ printf("%lf\n", result_bps[pf]);
+ }
break;
default:
/* reaching this means there's some disaster: */
Arnaldo Carvalho de Melo
2010-12-12 13:46:57 UTC
Permalink
BTW, I found that measuring performance of prefaulted memcpy()
with perf stat is difficult. Because current perf stat monitors
whole execution of program or range of perf stat lifetime.
If perf stat and monitored program can interact and work
synchronously, it will be better.
For example, if perf stat waits on the unix domain socket
before create_perf_stat_counter() and monitored program wakes perf stat
up through the socket, more fine grain monitoring will be possible.
perf stat --wait-on /tmp/perf_wait perf bench mem memcpy --wake-up
/tmp/perf_wait
--wait-on is an imaginary option of perf stat, and the way of waking up
perf stat is left to monitored program (in this case, --wake-up is
used for specifying the name of the socket).
I'd like to implement such a option to perf stat, how do you think?
Looks interesting, and also interesting would be to be able to place
probes that would wake up it too, for unmodified binaries to have
something similar.

Other kinds of triggers may be to hook on syscalls and when some
expression matches, like connecting to host 1.2.3.4, start monitoring,
stop when the socket is closed, i.e. monitor a connection lifetime, etc.

I think it is worth pursuing and encourage you to work on it :-)

- Arnaldo
Peter Zijlstra
2010-12-13 11:14:33 UTC
Permalink
Post by Arnaldo Carvalho de Melo
BTW, I found that measuring performance of prefaulted memcpy()
with perf stat is difficult. Because current perf stat monitors
whole execution of program or range of perf stat lifetime.
If perf stat and monitored program can interact and work
synchronously, it will be better.
For example, if perf stat waits on the unix domain socket
before create_perf_stat_counter() and monitored program wakes perf stat
up through the socket, more fine grain monitoring will be possible.
perf stat --wait-on /tmp/perf_wait perf bench mem memcpy --wake-up
/tmp/perf_wait
--wait-on is an imaginary option of perf stat, and the way of waking up
perf stat is left to monitored program (in this case, --wake-up is
used for specifying the name of the socket).
I'd like to implement such a option to perf stat, how do you think?
Looks interesting, and also interesting would be to be able to place
probes that would wake up it too, for unmodified binaries to have
something similar.
Other kinds of triggers may be to hook on syscalls and when some
expression matches, like connecting to host 1.2.3.4, start monitoring,
stop when the socket is closed, i.e. monitor a connection lifetime, etc.
I think it is worth pursuing and encourage you to work on it :-)
Sounds to me like you want something like a library with self-monitoring
stuff.
Arnaldo Carvalho de Melo
2010-12-13 12:38:10 UTC
Permalink
Post by Peter Zijlstra
Post by Arnaldo Carvalho de Melo
Looks interesting, and also interesting would be to be able to place
probes that would wake up it too, for unmodified binaries to have
something similar.
Other kinds of triggers may be to hook on syscalls and when some
expression matches, like connecting to host 1.2.3.4, start monitoring,
stop when the socket is closed, i.e. monitor a connection lifetime, etc.
Sounds to me like you want something like a library with self-monitoring
stuff.
Yeah, that could be a way, an LD_PRELOAD thingy that would intercept
library calls, setup counters, start a monitoring thread, etc.

Along the lines of:

http://git.kernel.org/?p=linux/kernel/git/acme/libautocork.git;a=blob;f=libautocork.c

This one just intercepts calls, but the __init function could do the
rest.

To make it easier we could move the counter setup we have in record/top
to a library, etc.

- Arnaldo
Peter Zijlstra
2010-12-13 12:40:59 UTC
Permalink
Post by Arnaldo Carvalho de Melo
Post by Peter Zijlstra
Post by Arnaldo Carvalho de Melo
Looks interesting, and also interesting would be to be able to place
probes that would wake up it too, for unmodified binaries to have
something similar.
Other kinds of triggers may be to hook on syscalls and when some
expression matches, like connecting to host 1.2.3.4, start monitoring,
stop when the socket is closed, i.e. monitor a connection lifetime, etc.
Sounds to me like you want something like a library with self-monitoring
stuff.
Yeah, that could be a way, an LD_PRELOAD thingy that would intercept
library calls, setup counters, start a monitoring thread, etc.
http://git.kernel.org/?p=linux/kernel/git/acme/libautocork.git;a=blob;f=libautocork.c
This one just intercepts calls, but the __init function could do the
rest.
To make it easier we could move the counter setup we have in record/top
to a library, etc.
Nah, I was more thinking of something along the lines of libPAPI and
libpfmon. A library that contains the needed building blocks for apps to
profile themselves.
Arnaldo Carvalho de Melo
2010-12-13 13:12:18 UTC
Permalink
Post by Peter Zijlstra
Post by Arnaldo Carvalho de Melo
Post by Peter Zijlstra
Sounds to me like you want something like a library with self-monitoring
stuff.
Yeah, that could be a way, an LD_PRELOAD thingy that would intercept
library calls, setup counters, start a monitoring thread, etc.
To make it easier we could move the counter setup we have in record/top
to a library, etc.
Nah, I was more thinking of something along the lines of libPAPI and
libpfmon. A library that contains the needed building blocks for apps to
profile themselves.
Ok, you mean for the case where you can modify the app, I was thinking
about when you can't.

In both cases its good to move the counter creation, etc routines from
record/top to a lib, that then could be used in the way you mention, and
in the way I mention too. Two different usecases :-)

- Arnaldo
Hitoshi Mitake
2010-12-13 17:37:26 UTC
Permalink
On 2010年12月13日 22:12, Arnaldo Carvalho de Melo wrote:
Post by Peter Zijlstra
Sounds to me like you want something like a library with self-monitoring
stuff.
Yeah, that could be a way, an LD_PRELOAD thingy that would intercept
library calls, setup counters, start a monitoring thread, etc.
To make it easier we could move the counter setup we have in record/top
to a library, etc.
Post by Peter Zijlstra
Nah, I was more thinking of something along the lines of libPAPI and
libpfmon. A library that contains the needed building blocks for apps to
profile themselves.
Ok, you mean for the case where you can modify the app, I was thinking
about when you can't.
In both cases its good to move the counter creation, etc routines from
record/top to a lib, that then could be used in the way you mention, and
in the way I mention too. Two different usecases :-)
Thanks for your comments, Arnaldo, Peter.

I implement basic feature of my proposal,
and found that communicating perf stat and benchmarking programs
via socket is really dirty. As you said, unified form,
interception for unmodified binary and library for modifiable binary,
will be ideal for fine grain monitoring.

But I believe that measuring performance of some sort of programs
like in kernel routines requires more fine grain perf stating,
so I'll seek the unified way.

Anyway, I'll send my proof of concept patch later.

Thanks,
Hitoshi
Hitoshi Mitake
2010-12-14 05:46:59 UTC
Permalink
This patch makes perf bench mem memcpy to use the new feature of perf stat.

New option --wake-up requires path name of unix domain socket.
If --only-prefault or --no-prefault is specified, the pid of itself is written
to this socket before actual memcpy() to be monitored. And the pid of perf stat
is read from it. The pid of perf stat is used for signaling perf stat
to terminate monitoring.

With this feature, the detailed performance monitoring of prefaulted
(or non prefaulted only) memcpy() will be possible.

Example of use, non prefaulted version:
| ***@x201i:~/linux/.../tools/perf% sudo ./perf stat -w /tmp/perf-stat-wait
|

After execution, perf stat waits the pid...

| Performance counter stats for process id '27109':
|
| 440.534943 task-clock-msecs # 0.997 CPUs
| 44 context-switches # 0.000 M/sec
| 5 CPU-migrations # 0.000 M/sec
| 256,002 page-faults # 0.581 M/sec
| 934,443,072 cycles # 2121.155 M/sec
| 780,408,435 instructions # 0.835 IPC
| 111,756,558 branches # 253.684 M/sec
| 392,170 branch-misses # 0.351 %
| 8,611,308 cache-references # 19.547 M/sec
| 8,533,588 cache-misses # 19.371 M/sec
|
| 0.441803031 seconds time elapsed

in another shell,

| ***@x201i:~/linux/.../tools/perf% sudo ./perf bench mem memcpy -l 500MB --no-prefault -w /tmp/perf-stat-wait
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 1.105722 GB/Sec

Example of use, prefaulted version:

| ***@x201i:~/linux/.../tools/perf% sudo ./perf stat -w /tmp/perf-stat-wait
| Performance counter stats for process id '27112':
|
| 105.001542 task-clock-msecs # 0.997 CPUs
| 11 context-switches # 0.000 M/sec
| 0 CPU-migrations # 0.000 M/sec
| 2 page-faults # 0.000 M/sec
| 223,273,425 cycles # 2126.382 M/sec
| 197,992,585 instructions # 0.887 IPC
| 16,657,288 branches # 158.639 M/sec
| 1,942 branch-misses # 0.012 %
| 3,105,619 cache-references # 29.577 M/sec
| 3,082,390 cache-misses # 29.356 M/sec
|
| 0.105316101 seconds time elapsed

in another shell,

| ***@x201i:~/linux/.../tools/perf% sudo ./perf bench mem memcpy -l 500MB --only-prefault -w /tmp/perf-stat-wait
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes ...
|
| 4.640927 GB/Sec (with prefault)

The result shows that the difference between non-prefaulted memcpy() and prefaulted one.
And this will be useful for detailed performance analysis of various memcpy()s
like Miao Xie's one and rep prefix version.

But this is too adhoc and dirty... :(

Cc: Miao Xie <***@cn.fujitsu.com>
Cc: Ma Ling <***@intel.com>
Cc: Zhao Yakui <***@intel.com>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: Paul Mackerras <***@samba.org>
Cc: Frederic Weisbecker <***@gmail.com>
Cc: Steven Rostedt <***@goodmis.org>
Cc: Andi Kleen <***@firstfloor.org>
Signed-off-by: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
---
tools/perf/bench/mem-memcpy.c | 56 +++++++++++++++++++++++++++++++++++++++++
1 files changed, 56 insertions(+), 0 deletions(-)

diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index ac88f52..7d0bcea 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -21,6 +21,10 @@
#include <errno.h>
#include <unistd.h>

+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
#define K 1024

static const char *length_str = "1MB";
@@ -31,6 +35,7 @@ static bool only_prefault;
static bool no_prefault;
static int src_align;
static int dst_align;
+static const char *wake_path;

static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
@@ -48,6 +53,9 @@ static const struct option options[] = {
"Alignment of source memory region (in byte)"),
OPT_INTEGER('d', "dst-alignment", &dst_align,
"Alignment of destination memory region (in byte)"),
+ OPT_STRING('w', "wake-up", &wake_path, "default",
+ "Path of unix domain socket for waking up perf stat"
+ " (use with only_prefault option)"),
OPT_END()
};

@@ -116,6 +124,33 @@ static double timeval2double(struct timeval *ts)
(double)ts->tv_usec / (double)1000000;
}

+static pid_t perf_stat_pid;
+
+static void wake_up_perf_stat(void)
+{
+ int wake_fd;
+ struct sockaddr_un wake_addr;
+ pid_t myself = getpid();
+
+ wake_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+ if (wake_fd < 0)
+ die("unable to create socket for sync\n");
+
+ memset(&wake_addr, 0, sizeof(wake_addr));
+ wake_addr.sun_family = PF_UNIX;
+ strncpy(wake_addr.sun_path, wake_path, sizeof(wake_addr.sun_path));
+
+ if (connect(wake_fd, (struct sockaddr *)&wake_addr, sizeof(wake_addr)))
+ die("connect() failed\n");
+
+ if (write(wake_fd, &myself, sizeof(myself)) != sizeof(myself))
+ die("write() my pid to socket failed\n");
+
+ if (read(wake_fd, &perf_stat_pid, sizeof(perf_stat_pid))
+ != sizeof(perf_stat_pid))
+ die("read() pid of perf stat from socket\n");
+}
+
static void alloc_mem(void **dst, void **src, size_t length)
{
int ret;
@@ -139,10 +174,16 @@ static u64 do_memcpy_clock(memcpy_t fn, size_t len, bool prefault)
if (prefault)
fn(dst + dst_align, src + src_align, len);

+ if (wake_path)
+ wake_up_perf_stat();
+
clock_start = get_clock();
fn(dst + dst_align, src + src_align, len);
clock_end = get_clock();

+ if (wake_path) /* kill perf stat */
+ kill(perf_stat_pid, SIGINT);
+
free(src);
free(dst);
return clock_end - clock_start;
@@ -158,12 +199,18 @@ static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
if (prefault)
fn(dst + dst_align, src + src_align, len);

+ if (wake_path)
+ wake_up_perf_stat();
+
BUG_ON(gettimeofday(&tv_start, NULL));
fn(dst + dst_align, src + src_align, len);
BUG_ON(gettimeofday(&tv_end, NULL));

timersub(&tv_end, &tv_start, &tv_diff);

+ if (wake_path) /* kill perf stat */
+ kill(perf_stat_pid, SIGINT);
+
free(src);
free(dst);
return (double)((double)len / timeval2double(&tv_diff));
@@ -235,6 +282,15 @@ int bench_mem_memcpy(int argc, const char **argv,

if (!only_prefault && !no_prefault) {
/* show both of results */
+ if (wake_path) {
+ fprintf(stderr, "Meaningless combination of option, "
+ "you should not use wake_path alone.\n"
+ "Use it with --only-prefault"
+ " or --no-prefault\n");
+ return 1;
+ }
+
+
if (use_clock) {
result_clock[0] =
do_memcpy_clock(routines[i].fn, len, false);
--
1.7.3.3
Hitoshi Mitake
2010-12-14 05:46:58 UTC
Permalink
This patch adds new option "--wait-on" option to perf stat.

Current perf stat can monitor
1) lifetime of program specified as command line argument, or
2) lifetime of perf stat. Target process is specified with pid,
and end of monitoring is triggered with signal.
1) is too coarse grain. And 2) is difficult to distinguish the range to monitor.

This patch makes it possible to wait before sys_perf_event_open().
Monitored process can wake up perf stat via unix domain socket,
and terminate monitoring via signal.

New option --wait-on requires the string as the path of unix domain socket.
perf stat read the pid from the socket for target_pid. Monitored program
should write the pid of itself to it.
perf stat replies the pid of itself to monitored program. The monitored program
should send signal SIGINT to perf stat with this pid. Then monitoring is terminated.

I feel current implementation is really dirty. As Arnaldo and Peter suggested,
more unified way like interception or self monitoring library is ideal.
This is the proof of concept version. I'd like to hear your comments.

Cc: Miao Xie <***@cn.fujitsu.com>
Cc: Ma Ling <***@intel.com>
Cc: Zhao Yakui <***@intel.com>
Cc: Peter Zijlstra <***@chello.nl>
Cc: Arnaldo Carvalho de Melo <***@redhat.com>
Cc: Paul Mackerras <***@samba.org>
Cc: Frederic Weisbecker <***@gmail.com>
Cc: Steven Rostedt <***@goodmis.org>
Cc: Andi Kleen <***@firstfloor.org>
Signed-off-by: Hitoshi Mitake <***@dcl.info.waseda.ac.jp>
---
tools/perf/builtin-stat.c | 63 ++++++++++++++++++++++++++++++++++++++++++--
1 files changed, 60 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 7ff746d..4cc10a1 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -51,6 +51,8 @@
#include <sys/prctl.h>
#include <math.h>
#include <locale.h>
+#include <sys/socket.h>
+#include <sys/un.h>

#define DEFAULT_SEPARATOR " "

@@ -90,11 +92,15 @@ static const char *cpu_list;
static const char *csv_sep = NULL;
static bool csv_output = false;

+static const char *wait_path;

static int *fd[MAX_NR_CPUS][MAX_COUNTERS];

static int event_scaled[MAX_COUNTERS];

+static int wait_fd = -1;
+static struct sockaddr_un wait_addr;
+
static struct {
u64 val;
u64 ena;
@@ -342,7 +348,7 @@ static int run_perf_stat(int argc __used, const char **argv)
unsigned long long t0, t1;
int status = 0;
int counter, ncreated = 0;
- int child_ready_pipe[2], go_pipe[2];
+ int child_ready_pipe[2], go_pipe[2], accepted_fd;
bool perm_err = false;
const bool forks = (argc > 0);
char buf;
@@ -401,6 +407,43 @@ static int run_perf_stat(int argc __used, const char **argv)
close(child_ready_pipe[0]);
}

+ if (wait_path) {
+ int sock_err;
+ struct sockaddr accepted_addr;
+ socklen_t accepted_len = sizeof(accepted_addr);
+
+ wait_fd = socket(PF_UNIX, SOCK_STREAM, 0);
+ if (wait_fd < 0)
+ die("unable to create socket for sync\n");
+
+ memset(&wait_addr, 0, sizeof(wait_addr));
+ wait_addr.sun_family = PF_UNIX;
+ strncpy(wait_addr.sun_path, wait_path,
+ sizeof(wait_addr.sun_path));
+
+ sock_err = bind(wait_fd, (struct sockaddr *)&wait_addr,
+ sizeof(wait_addr));
+ if (sock_err < 0)
+ die("bind() failed\n");
+
+ sock_err = listen(wait_fd, 1);
+ if (sock_err < 0)
+ die("listen() failed\n");
+
+ accepted_fd = accept(wait_fd, &accepted_addr, &accepted_len);
+ if (accepted_fd < 0)
+ die("accept() failed\n");
+
+ if (read(accepted_fd, &target_pid, sizeof(target_pid))
+ != sizeof(target_pid))
+ die("read() pid from socket failed\n");
+
+ target_tid = target_pid;
+ thread_num = find_all_tid(target_pid, &all_tids);
+ if (thread_num <= 0)
+ die("couldn't find threads of %d\n", target_pid);
+ }
+
for (counter = 0; counter < nr_counters; counter++)
ncreated += create_perf_stat_counter(counter, &perm_err);

@@ -425,6 +468,14 @@ static int run_perf_stat(int argc __used, const char **argv)
close(go_pipe[1]);
wait(&status);
} else {
+ if (wait_path) {
+ pid_t myself = getpid();
+ if (write(accepted_fd, &myself, sizeof(myself))
+ != sizeof(myself))
+ die("write() my pid failed\n");
+ close(accepted_fd);
+ }
+
while(!done) sleep(1);
}

@@ -670,6 +721,9 @@ static void sig_atexit(void)
if (signr == -1)
return;

+ if (wait_path)
+ unlink(wait_path);
+
signal(signr, SIG_DFL);
kill(getpid(), signr);
}
@@ -715,6 +769,8 @@ static const struct option options[] = {
"disable CPU count aggregation"),
OPT_STRING('x', "field-separator", &csv_sep, "separator",
"print counts with custom separator"),
+ OPT_STRING('w', "wait-on", &wait_path, "path",
+ "path of unix domain socket to wait on"),
OPT_END()
};

@@ -746,7 +802,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
} else if (big_num_opt == 0) /* User passed --no-big-num */
big_num = false;

- if (!argc && target_pid == -1 && target_tid == -1)
+ if (!argc && target_pid == -1 && target_tid == -1 && !wait_path)
usage_with_options(stat_usage, options);
if (run_count <= 0)
usage_with_options(stat_usage, options);
@@ -769,7 +825,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
if (nr_cpus < 1)
usage_with_options(stat_usage, options);

- if (target_pid != -1) {
+ /* if wait_path is specified, we read pid to monitor from it later */
+ if (target_pid != -1 && !wait_path) {
target_tid = target_pid;
thread_num = find_all_tid(target_pid, &all_tids);
if (thread_num <= 0) {
--
1.7.3.3
Hitoshi Mitake
2011-01-11 16:27:47 UTC
Permalink
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
This patch adds new file: mem-memcpy-x86-64-asm.S
for x86-64 specific memcpy() benchmarking.
Added new benchmarks are,
x86-64-rep: memcpy() implemented with rep instruction
x86-64-unrolled: unrolled memcpy()
Original idea of including the source files of kernel
for benchmarking is suggested by Ingo Molnar.
This is more effective than write-once programs for quantitative
evaluation of in-kernel, little, leaf functions that are called frequently.
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
Because perf bench is in kernel source tree and executing it
on various hardwares, especially new model CPUs, is easy.
This way can also be used for other functions of the kernel, e.g. checksum functions.
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
| % ./perf bench mem memcpy -l 500MB
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f911f94c010 to 0x7f913ed4d010 ...
|
| 578.732506 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-rep
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7fb4b6fe4010 to 0x7fb4d63e5010 ...
|
| 738.184980 MB/Sec
| % ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
| # Running mem/memcpy benchmark...
| # Copying 500MB Bytes from 0x7f6f2e668010 to 0x7f6f4da69010 ...
|
| 767.483269 MB/Sec
This shows clearly that the unrolled memcpy() is more efficient
than the rep version and glibc's one :)
Hey, really cool output :-)
Might also make sense to measure Ma Ling's patched version?
Does Ma Ling's patched version mean,
http://marc.info/?l=linux-kernel&m=128652296500989&w=2
the memcpy applied the patch of the URL?
(It seems that this patch was written by Miao Xie.)
I'll include the result of patched version in the next post.
(Indeed it is Miao Xie - sorry!)
Post by Hitoshi Mitake
Post by Ingo Molnar
Post by Hitoshi Mitake
# checkpatch.pl warns about two externs in bench/mem-memcpy.c
# added by this patch. But I think it is no problem.
+#ifdef ARCH_X86_64
+extern void *memcpy_x86_64_unrolled(void *to, const void *from, size_t len);
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
+extern void *memcpy_x86_64_rep(void *to, const void *from, size_t len);
Post by Ingo Molnar
Post by Hitoshi Mitake
Post by Ingo Molnar
+#endif
into a .h file - a new one if needed.
That will make both checkpatch and me happier ;-)
OK, I'll separate these files.
BTW, I found really interesting evaluation result.
Current results of "perf bench mem memcpy" include
the overhead of page faults because the measured memcpy()
is the first access to the allocated memory area.
I tested another version of perf bench mem memcpy,
which does a memcpy() before the measured memcpy() to remove
the overhead coming from page faults.
% ./perf bench mem memcpy -l 500MB -r x86-64-unrolled
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f19d488f010 to 0x7f19f3c90010 ...
4.608340 GB/Sec
% ./perf bench mem memcpy -l 500MB
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f696c3cc010 to 0x7f698b7cd010 ...
4.856442 GB/Sec
% ./perf bench mem memcpy -l 500MB -r x86-64-rep
# Running mem/memcpy benchmark...
# Copying 500MB Bytes from 0x7f45d6cff010 to 0x7f45f6100010 ...
6.024445 GB/Sec
The relation of scores reversed!
I cannot explain the cause of this result, and
this is really interesting phenomenon.
Post by Ingo Molnar
Interesting indeed, and it would be nice to analyse that! (It should be possible,
using various PMU metrics in a clever way, to figure out what's happening inside the
CPU, right?)
I collected the PMU information for each case of memcpy;
below is the result:

(I used the partial monitoring patch I posted before:
https://patchwork.kernel.org/patch/408801/,
and my local modification for testing rep based memcpy)

no prefault benchmarking

unrolled

Score: 685.812729 MB/Sec
Stat:
Performance counter stats for process id '4139':

725.939831 task-clock-msecs # 0.995 CPUs
74 context-switches # 0.000 M/sec
2 CPU-migrations # 0.000 M/sec
256,002 page-faults # 0.353 M/sec
1,535,468,702 cycles # 2115.146 M/sec
1,691,516,817 instructions # 1.102 IPC
291,260,006 branches # 401.218 M/sec
1,487,762 branch-misses # 0.511 %
8,470,560 cache-references # 11.668 M/sec
8,364,176 cache-misses # 11.522 M/sec

0.729488573 seconds time elapsed

rep based

Score: 670.172114 MB/Sec
Stat:
Performance counter stats for process id '5539':

742.943772 task-clock-msecs # 0.995 CPUs
77 context-switches # 0.000 M/sec
2 CPU-migrations # 0.000 M/sec
256,002 page-faults # 0.345 M/sec
1,578,787,149 cycles # 2125.043 M/sec
1,499,144,628 instructions # 0.950 IPC
275,684,806 branches # 371.071 M/sec
1,522,326 branch-misses # 0.552 %
8,503,747 cache-references # 11.446 M/sec
8,386,673 cache-misses # 11.288 M/sec

0.746320411 seconds time elapsed

prefaulted benchmarking

unrolled

Score: 4.485941 GB/Sec
Stat:
Performance counter stats for process id '4279':

108.466761 task-clock-msecs # 0.994 CPUs
11 context-switches # 0.000 M/sec
2 CPU-migrations # 0.000 M/sec
2 page-faults # 0.000 M/sec
218,260,432 cycles # 2012.233 M/sec
199,520,023 instructions # 0.914 IPC
16,963,327 branches # 156.392 M/sec
8,169 branch-misses # 0.048 %
2,955,221 cache-references # 27.245 M/sec
2,916,018 cache-misses # 26.884 M/sec

0.109115820 seconds time elapsed

rep based

Score: 5.972859 GB/Sec
Stat:
Performance counter stats for process id '5535':

81.609445 task-clock-msecs # 0.995 CPUs
8 context-switches # 0.000 M/sec
0 CPU-migrations # 0.000 M/sec
2 page-faults # 0.000 M/sec
173,888,853 cycles # 2130.744 M/sec
3,034,096 instructions # 0.017 IPC
607,897 branches # 7.449 M/sec
5,874 branch-misses # 0.966 %
8,276,533 cache-references # 101.416 M/sec
8,274,865 cache-misses # 101.396 M/sec

0.082030877 seconds time

Again, the surprising point is the reversal of the score relation.
I cannot find the direct reason for this reversal,
but it seems that the branch-miss counts are reflecting it.

I have to look into this more deeply...

Continue reading on narkive:
Loading...