Akira Tsukamoto
2002-11-16 06:53:57 UTC
This is a faster copy_to/from_user function for Athlon/Duron
(roughly 300% faster in file read/write?).
The L2 cache size (256 KB for Athlon, 64 KB for Duron) is taken into account at run time.
I would appreciate it if anyone who owns an Athlon/Duron CPU could try this patch.
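For reviewers, here is a rough C-level outline of what aki_copy() in the patch does. This is only an illustration with made-up names (sketch_copy, l2_bytes are not in the patch); the real work is the inline assembly below, and l2_bytes stands in for current_cpu_data.x86_cache_size * 1024.

    #include <string.h>

    static void sketch_copy(char *to, const char *from,
                            unsigned long size, unsigned long l2_bytes)
    {
            unsigned long head = (8 - ((unsigned long)from & 7)) & 7;
            unsigned long pre, bulk, i;

            /* 1. Copy a few bytes so the source becomes 8-byte aligned
             *    (the first "rep; movsb" in the assembly). */
            memcpy(to, from, head);
            to += head; from += head; size -= head;

            /* 2. Prefetch the source with plain loads, one per cache line,
             *    capped at the L2 cache size found at run time. */
            pre = size < l2_bytes ? size : l2_bytes;
            for (i = 0; i < pre; i += 64)
                    (void)*(volatile const char *)(from + i);

            /* 3. Bulk copy in 64-byte chunks; the assembly does this with
             *    movq loads and movntq non-temporal stores. */
            bulk = size & ~0x3fUL;
            memcpy(to, from, bulk);

            /* 4. Copy the tail bytes; the assembly then re-reads the written
             *    area so it ends up back in the L2 cache (the "postfetch"). */
            memcpy(to + bulk, from + bulk, size - bulk);
    }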
1)
I have been using Taka's file and socket benchmark programs for
this purpose; are there any better testing tools?
In these benchmarks, the Athlon copy function is extremely fast.
read/write file test
http://dns.suna-asobi.com/~akira-t/linux/taka-bench/fileio2.c
org-copy: file read/write
buf(0x886e000) copied 24.0 Mbytes in 0.080 seconds at 299.4 Mbytes/sec
buf(0x886e001) copied 24.0 Mbytes in 0.129 seconds at 186.7 Mbytes/sec
buf(0x886e002) copied 24.0 Mbytes in 0.129 seconds at 186.4 Mbytes/sec
buf(0x886e003) copied 24.0 Mbytes in 0.129 seconds at 185.7 Mbytes/sec
(Entire log is here, http://dns.suna-asobi.com/~akira-t/linux/taka-bench/org-copy-file.log)
athlon-fast_copy: file read/write
buf(0x886e000) copied 24.0 Mbytes in 0.025 seconds at 959.2 Mbytes/sec
buf(0x886e001) copied 24.0 Mbytes in 0.032 seconds at 745.8 Mbytes/sec
buf(0x886e002) copied 24.0 Mbytes in 0.033 seconds at 731.4 Mbytes/sec
buf(0x886e003) copied 24.0 Mbytes in 0.032 seconds at 742.7 Mbytes/sec
(Entire log is here, http://dns.suna-asobi.com/~akira-t/linux/taka-bench/aki-copy-file.log)
network test
http://dns.suna-asobi.com/~akira-t/linux/taka-bench/netio2.c
org-copy: socket
0x846e000+0 -> 0x804e000+0 send/recv: 0.034387 seconds at 116.3 Mbytes/sec
0x846e000+1 -> 0x804e000+1 send/recv: 0.043644 seconds at 91.7 Mbytes/sec
0x846e000+2 -> 0x804e000+2 send/recv: 0.044038 seconds at 90.8 Mbytes/sec
0x846e000+3 -> 0x804e000+3 send/recv: 0.043457 seconds at 92.0 Mbytes/sec
(Entire log is here, http://dns.suna-asobi.com/~akira-t/linux/taka-bench/org-copy-net.log)
athlon-fast_copy: socket
0x846e000+0 -> 0x804e000+0 send/recv: 0.019374 seconds at 206.5 Mbytes/sec
0x846e000+1 -> 0x804e000+1 send/recv: 0.036772 seconds at 108.8 Mbytes/sec
0x846e000+2 -> 0x804e000+2 send/recv: 0.037353 seconds at 107.1 Mbytes/sec
0x846e000+3 -> 0x804e000+3 send/recv: 0.040598 seconds at 98.5 Mbytes/sec
(Entire log is here, http://dns.suna-asobi.com/~akira-t/linux/taka-bench/aki-copy-net.log)
2) Last Friday was my first time touching gcc inline assembler, and I would really
appreciate any suggestions for aki_copy().
Should I save all the MMX registers? I didn't, but it seems to work OK.
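If the MMX state does need saving, one option would be the kernel_fpu_begin()/kernel_fpu_end() pair, which I believe the MMX helpers in arch/i386/lib/mmx.c already use. A minimal sketch of that pattern (aki_copy_checked() is just a hypothetical wrapper name, not part of this patch):

    #include <asm/i387.h>

    static unsigned long aki_copy_checked(void *to, const void *from,
                                          unsigned long n)
    {
            unsigned long left;

            kernel_fpu_begin();             /* take ownership of the FPU/MMX state */
            left = aki_copy(to, from, n);   /* the MMX copy from this patch */
            kernel_fpu_end();               /* release it again */
            return left;
    }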
3) From Andi's and Andrew's comments last time, compiling statically
for each CPU is not really practical for the Linux distributors.
I saw Denis Vlasenko's csum_copy routines with boot-time selection, and
they looked good. I think it would be a good idea to do the same for the copy_user
functions.
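A rough sketch of what that boot-time selection could look like, sitting next to the copy routines in usercopy.c; the names copy_user_fn and select_copy_user() are hypothetical and not from any existing patch:

    #include <linux/init.h>
    #include <asm/processor.h>

    /* Hypothetical boot-time selection; names are illustrative only. */
    static unsigned long (*copy_user_fn)(void *to, const void *from,
                                         unsigned long n);

    void __init select_copy_user(void)
    {
            if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && cpu_has_mmx)
                    copy_user_fn = aki_copy;          /* Athlon/Duron path */
            else if (cpu_has_xmm)
                    copy_user_fn = __copy_user_intel; /* P-III/P4 path */
            else
                    copy_user_fn = NULL;              /* fall back to __copy_user */
    }

__copy_to_user()/__copy_from_user() would then just call through the pointer when it is set.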
4) I think this is just a mistake in the current kernel:
config X86_INTEL_USERCOPY
bool
- depends on MPENTIUM4 || MPENTIUMIII || M586MMX
+ depends on MPENTIUM4 || MPENTIUMIII || M686
default y
The Intel fast copy (__copy_user_intel) is slower on a Pentium MMX and faster on a Pentium II.
Akira
--- linux-2.5.47/arch/i386/lib/usercopy.c Thu Oct 31 22:40:01 2002
+++ linux-2.5.47-aki/arch/i386/lib/usercopy.c Sat Nov 16 00:42:46 2002
@@ -337,6 +337,354 @@
__copy_user_intel(void *to, const void *from,unsigned long size);
#endif /* CONFIG_X86_INTEL_USERCOPY */
+#ifdef CONFIG_MK7
+/* Athlon version */
+/* akira version, specific to Athlon/Duron CPUs.
+ * 'size' must be larger than 256. */
+static unsigned long
+aki_copy(void *to, const void *from, unsigned long size) {
+ __asm__ __volatile(
+ /* Save the incoming arguments for later use. */
+ " movl %4, %%edx\n"
+ " push %%edi\n"
+ " movl %%ecx, %%ebx\n"
+ /* The Athlon speeds up a lot when the read address is
+ * aligned on an 8-byte (64-bit) boundary. */
+ " movl %%esi, %%ecx\n"
+ " negl %%ecx\n"
+ " andl $7, %%ecx\n"
+ " subl %%ecx, %%ebx\n"
+ "80: rep; movsb\n"
+ /* Here is one trick to speed things up: prefetch the
+ * entire 'from' buffer, or up to the L2 cache size,
+ * whichever is smaller.
+ * I used movl instead of a special instruction such as
+ * prefetch or prefetchnta because movl was faster when
+ * combined with the movq used later. */
+ " movl %%ebx, %%eax\n"
+ " shrl $8, %%eax\n"
+ " cmpl %%edx, %%eax\n"
+ " jbe 10f\n"
+ " movl %%edx, %%eax\n"
+ " .align 2\n"
+ "10: movl 0(%%esi, %%ecx), %%edx\n"
+ "11: movl 64(%%esi, %%ecx), %%edx\n"
+ "12: movl 128(%%esi, %%ecx), %%edx\n"
+ "13: movl 192(%%esi, %%ecx), %%edx\n"
+ " addl $256, %%ecx\n"
+ " decl %%eax\n"
+ " jnz 10b\n"
+ // " femms\n"
+ /* Bulk transfer using movq and movntq, 64 bytes per
+ * iteration. The movl in the first line is not a
+ * mistake; it is a second prefetch for the next read
+ * iteration and speeds things up by about 4-5%. */
+ "21: movl %%ebx, %%ecx\n"
+ " shrl $6, %%ecx\n"
+ " .align 2\n"
+ "20:\n"
+ " movl 64(%%esi), %%eax\n"
+ "30: movq (%%esi), %%mm0\n"
+ "31: movq 8(%%esi), %%mm1\n"
+ "32: movq 16(%%esi), %%mm2\n"
+ "33: movq 24(%%esi), %%mm3\n"
+ "34: movq 32(%%esi), %%mm4\n"
+ "35: movq 40(%%esi), %%mm5\n"
+ "36: movq 48(%%esi), %%mm6\n"
+ "37: movq 56(%%esi), %%mm7\n"
+ "40: movntq %%mm0, (%%edi)\n"
+ "41: movntq %%mm1, 8(%%edi)\n"
+ "42: movntq %%mm2, 16(%%edi)\n"
+ "43: movntq %%mm3, 24(%%edi)\n"
+ "44: movntq %%mm4, 32(%%edi)\n"
+ "45: movntq %%mm5, 40(%%edi)\n"
+ "46: movntq %%mm6, 48(%%edi)\n"
+ "47: movntq %%mm7, 56(%%edi)\n"
+ " addl $64, %%esi\n"
+ " addl $64, %%edi\n"
+ " decl %%ecx\n"
+ " jnz 20b\n"
+ " sfence\n"
+ " femms\n"
+ /* Finish the remaining data that did not fit into a
+ * 64-byte chunk. */
+ " movl %%ebx, %%ecx\n"
+ " andl $0x3f, %%ecx\n"
+ "90: rep; movsb\n"
+ /* This is a postfetch of the written data: the copied
+ * data is likely to be used right after this function,
+ * but movntq does not leave it in the L2 cache, so we
+ * pull it back in here. */
+#if 0
+ " pop %%edi\n"
+ " prefetchnta (%%edi, %%ecx)\n"
+ " prefetchnta 64(%%edi, %%ecx)\n"
+ " prefetchnta 128(%%edi, %%ecx)\n"
+ " prefetchnta 192(%%edi, %%ecx)\n"
+#endif
+#if 1 /* 0 turn off postfetch */
+ " movl %%edi, %%eax\n"
+ " popl %%edi\n"
+ " subl %%edi, %%eax\n"
+ // " movl %%edx, %%edi\n"
+ " shrl $8, %%eax\n"
+ " jz 100f\n"
+ " cmpl $256, %%eax\n"
+ " jbe 50f\n"
+ " movl $256, %%eax\n"
+ " .align 2\n"
+ "50:\n"
+#if 0 /* 1 use 'prefetch' or 0 use movl for postfetch */
+ " prefetcht0 (%%edi, %%ecx)\n"
+ " prefetcht0 64(%%edi, %%ecx)\n"
+ " prefetcht0 128(%%edi, %%ecx)\n"
+ " prefetcht0 192(%%edi, %%ecx)\n"
+#else
+ // " prefetchnta 256(%%edi, %%ecx)\n"
+ " movl 0(%%edi, %%ecx), %%edx\n"
+ " movl 64(%%edi, %%ecx), %%edx\n"
+ " movl 128(%%edi, %%ecx), %%edx\n"
+ " movl 192(%%edi, %%ecx), %%edx\n"
+#endif
+ " addl $256, %%ecx\n"
+ " decl %%eax\n"
+ " jnz 50b\n"
+ " xorl %%ecx, %%ecx\n"
+#endif
+ "100:\n"
+
+ ".section .fixup,\"ax\"\n"
+ /* A page fault occurred during the prefetch; go back
+ * and start the data transfer. */
+ "1:\n"
+ " jmp 21b\n"
+ "2:\n"
+ "3:\n"
+ " sfence\n"
+ " femms\n"
+ " shll $6, %%ecx\n"
+ " andl $0x3f, %%ebx\n"
+ " addl %%ebx, %%ecx\n"
+ " jmp 90b\n"
+ "8:\n"
+ " addl %%ebx, %%ecx\n"
+ "99:\n"
+ " popl %%edi\n"
+ " jmp 100b\n"
+
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 10b, 1b\n"
+ " .long 11b, 1b\n"
+ " .long 12b, 1b\n"
+ " .long 13b, 1b\n"
+#if 0
+ " .long 20b, 2b\n"
+ " .long 30b, 3b\n"
+ " .long 31b, 3b\n"
+ " .long 32b, 3b\n"
+ " .long 33b, 3b\n"
+ " .long 34b, 3b\n"
+ " .long 35b, 3b\n"
+ " .long 36b, 3b\n"
+ " .long 37b, 3b\n"
+#endif
+ " .long 40b, 3b\n"
+ " .long 41b, 3b\n"
+ " .long 42b, 3b\n"
+ " .long 43b, 3b\n"
+ " .long 44b, 3b\n"
+ " .long 45b, 3b\n"
+ " .long 46b, 3b\n"
+ " .long 47b, 3b\n"
+ " .long 80b, 8b\n"
+ " .long 90b, 99b\n"
+ ".previous"
+
+ : "=&c"(size)
+ : "0"(size), "D"(to), "S"(from),
+ "r"(current_cpu_data.x86_cache_size * 1024 / 256)
+ : "eax", "ebx", "edx", "memory");
+ return size;
+}
+
+static unsigned long
+aki_copy_zeroing(void *to, const void *from, unsigned long size) {
+ __asm__ __volatile(
+ /* Save the incoming arguments for later use. */
+ " movl %4, %%edx\n"
+ " push %%edi\n"
+ " movl %%ecx, %%ebx\n"
+ /* The Athlon speeds up a lot when the read address is
+ * aligned on an 8-byte (64-bit) boundary. */
+ " movl %%esi, %%ecx\n"
+ " negl %%ecx\n"
+ " andl $7, %%ecx\n"
+ " subl %%ecx, %%ebx\n"
+ "80: rep; movsb\n"
+ /* Here is one trick to speed things up: prefetch the
+ * entire 'from' buffer, or up to the L2 cache size,
+ * whichever is smaller.
+ * I used movl instead of a special instruction such as
+ * prefetch or prefetchnta because movl was faster when
+ * combined with the movq used later. */
+ " movl %%ebx, %%eax\n"
+ " shrl $8, %%eax\n"
+ " jz 11f\n"
+ " cmpl %%edx, %%eax\n"
+ " jbe 10f\n"
+ " movl %%edx, %%eax\n"
+ " .align 2\n"
+ "10: movl 192(%%esi, %%ecx), %%edx\n"
+ "11: movl 0(%%esi, %%ecx), %%edx\n"
+ "12: movl 64(%%esi, %%ecx), %%edx\n"
+ "13: movl 128(%%esi, %%ecx), %%edx\n"
+ " addl $256, %%ecx\n"
+ " decl %%eax\n"
+ " jnz 10b\n"
+ // " femms\n"
+ /* Bulk transfer using movq and movntq, 64 bytes per
+ * iteration. The movl in the first line is not a
+ * mistake; it is a second prefetch for the next read
+ * iteration and speeds things up by about 4-5%. */
+ "21: movl %%ebx, %%ecx\n"
+ " shrl $6, %%ecx\n"
+ " .align 2\n"
+ "20:\n"
+ " movl 64(%%esi), %%eax\n"
+ "30: movq (%%esi), %%mm0\n"
+ "31: movq 8(%%esi), %%mm1\n"
+ "32: movq 16(%%esi), %%mm2\n"
+ "33: movq 24(%%esi), %%mm3\n"
+ "34: movq 32(%%esi), %%mm4\n"
+ "35: movq 40(%%esi), %%mm5\n"
+ "36: movq 48(%%esi), %%mm6\n"
+ "37: movq 56(%%esi), %%mm7\n"
+ "40: movntq %%mm0, (%%edi)\n"
+ "41: movntq %%mm1, 8(%%edi)\n"
+ "42: movntq %%mm2, 16(%%edi)\n"
+ "43: movntq %%mm3, 24(%%edi)\n"
+ "44: movntq %%mm4, 32(%%edi)\n"
+ "45: movntq %%mm5, 40(%%edi)\n"
+ "46: movntq %%mm6, 48(%%edi)\n"
+ "47: movntq %%mm7, 56(%%edi)\n"
+ " addl $64, %%esi\n"
+ " addl $64, %%edi\n"
+ " decl %%ecx\n"
+ " jnz 20b\n"
+ " sfence\n"
+ " femms\n"
+ /* Finish the remaining data that did not fit into a
+ * 64-byte chunk. */
+ " movl %%ebx, %%ecx\n"
+ " andl $0x3f, %%ecx\n"
+ "90: rep; movsb\n"
+ /* This is a postfetch of the written data: the copied
+ * data is likely to be used right after this function,
+ * but movntq does not leave it in the L2 cache, so we
+ * pull it back in here. */
+#if 0
+ " popl %%edi\n"
+ " prefetchnta (%%edi, %%ecx)\n"
+ " prefetchnta 64(%%edi, %%ecx)\n"
+ " prefetchnta 128(%%edi, %%ecx)\n"
+ " prefetchnta 192(%%edi, %%ecx)\n"
+#endif
+#if 1 /* 0 turn off postfetch */
+ " movl %%edi, %%eax\n"
+ " popl %%edi\n"
+ " subl %%edi, %%eax\n"
+ // " movl %%edx, %%edi\n"
+ " shrl $8, %%eax\n"
+ " jz 100f\n"
+ " cmpl $256, %%eax\n"
+ " jbe 50f\n"
+ " movl $256, %%eax\n"
+ " .align 2\n"
+ "50:\n"
+#if 1 /* 1 use 'prefetch' or 0 use movl for postfetch */
+ " prefetcht0 (%%edi, %%ecx)\n"
+ " prefetcht0 64(%%edi, %%ecx)\n"
+ " prefetcht0 128(%%edi, %%ecx)\n"
+ " prefetcht0 192(%%edi, %%ecx)\n"
+#else
+ // " prefetchnta 256(%%edi, %%ecx)\n"
+ " movl 0(%%edi, %%ecx), %%edx\n"
+ " movl 64(%%edi, %%ecx), %%edx\n"
+ " movl 128(%%edi, %%ecx), %%edx\n"
+ " movl 192(%%edi, %%ecx), %%edx\n"
+#endif
+ " addl $256, %%ecx\n"
+ " decl %%eax\n"
+ " jnz 50b\n"
+ " xorl %%ecx, %%ecx\n"
+#endif
+ "100:\n"
+
+ ".section .fixup,\"ax\"\n"
+ /* A page fault occurred during the prefetch; go back
+ * and start the data transfer. */
+ "1:\n"
+ " jmp 21b\n"
+ "2:\n"
+ "3:\n"
+ " sfence\n"
+ " femms\n"
+ " shll $6, %%ecx\n"
+ " andl $0x3f, %%ebx\n"
+ " addl %%ebx, %%ecx\n"
+ " jmp 90b\n"
+ "8:\n"
+ " addl %%ebx, %%ecx\n"
+ "99:\n"
+ " movl %%ecx, %%ebx\n"
+ " xorl %%eax,%%eax\n"
+ " rep; stosb\n"
+ " movl %%ebx, %%ecx\n"
+ " popl %%edi\n"
+ " jmp 100b\n"
+
+ ".previous\n"
+ ".section __ex_table,\"a\"\n"
+ " .align 4\n"
+ " .long 10b, 1b\n"
+ " .long 11b, 1b\n"
+ " .long 12b, 1b\n"
+ " .long 13b, 1b\n"
+ " .long 20b, 2b\n"
+ " .long 30b, 3b\n"
+ " .long 31b, 3b\n"
+ " .long 32b, 3b\n"
+ " .long 33b, 3b\n"
+ " .long 34b, 3b\n"
+ " .long 35b, 3b\n"
+ " .long 36b, 3b\n"
+ " .long 37b, 3b\n"
+#if 0
+ " .long 40b, 3b\n"
+ " .long 41b, 3b\n"
+ " .long 42b, 3b\n"
+ " .long 43b, 3b\n"
+ " .long 44b, 3b\n"
+ " .long 45b, 3b\n"
+ " .long 46b, 3b\n"
+ " .long 47b, 3b\n"
+#endif
+ " .long 80b, 8b\n"
+ " .long 90b, 99b\n"
+ ".previous"
+
+ : "=&c"(size)
+ : "0"(size), "D"(to), "S"(from),
+ "r"(current_cpu_data.x86_cache_size * 1024 / 256)
+ : "eax", "ebx", "edx", "memory");
+ return size;
+}
+#endif /* CONFIG_MK7 */
+
/* Generic arbitrary sized copy. */
#define __copy_user(to,from,size) \
do { \
@@ -416,7 +764,25 @@
: "memory"); \
} while (0)
+#ifdef CONFIG_MK7
+unsigned long __copy_to_user(void *to, const void *from, unsigned long n)
+{
+ if (n < 256)
+ __copy_user(to, from, n);
+ else
+ n = aki_copy(to, from, n);
+ return n;
+}
+unsigned long __copy_from_user(void *to, const void *from, unsigned long n)
+{
+ if (n < 256)
+ __copy_user_zeroing(to, from, n);
+ else
+ n = aki_copy_zeroing(to, from, n);
+ return n;
+}
+#else
unsigned long __copy_to_user(void *to, const void *from, unsigned long n)
{
if (movsl_is_ok(to, from, n))
@@ -434,6 +800,7 @@
n = __copy_user_zeroing_intel(to, from, n);
return n;
}
+#endif /* CONFIG_MK7 */
unsigned long copy_to_user(void *to, const void *from, unsigned long n)
{