aboutsummaryrefslogtreecommitdiffstats
path: root/xen/arch/x86/copy_page.S
diff options
context:
space:
mode:
authorKeir Fraser <keir.fraser@citrix.com>2008-11-12 12:04:15 +0000
committerKeir Fraser <keir.fraser@citrix.com>2008-11-12 12:04:15 +0000
commitbe3d9af19ce74ac274d9c510f335f319f25ea701 (patch)
treec8ab8a3a45a9a16ec1285c601f5590b6c090b969 /xen/arch/x86/copy_page.S
parent71e65dbc3e12fdf39299a9abe37253bdc7e84bae (diff)
downloadxen-be3d9af19ce74ac274d9c510f335f319f25ea701.tar.gz
xen-be3d9af19ce74ac274d9c510f335f319f25ea701.tar.bz2
xen-be3d9af19ce74ac274d9c510f335f319f25ea701.zip
x86: add SSE-based copy_page()
On top of the highmem assistance hypercalls added earlier, this provides a performance improvement of another 12% (measured on Xeon E5345) for the page copying case. Signed-off-by: Jan Beulich <jbeulich@novell.com>
Diffstat (limited to 'xen/arch/x86/copy_page.S')
-rw-r--r--xen/arch/x86/copy_page.S66
1 files changed, 66 insertions, 0 deletions
diff --git a/xen/arch/x86/copy_page.S b/xen/arch/x86/copy_page.S
new file mode 100644
index 0000000000..2fd3e533c6
--- /dev/null
+++ b/xen/arch/x86/copy_page.S
@@ -0,0 +1,66 @@
+#include <xen/config.h>
+#include <asm/page.h>
+
+#ifdef __i386__ /* 32-bit build: 4-byte words and 32-bit registers */
+#define src_reg %esi /* source pointer; copy_page_sse2 loads it from the stack and advances it */
+#define dst_reg %edi /* destination pointer; copy_page_sse2 loads it from the stack and advances it */
+#define WORD_SIZE 4 /* bytes moved by one mov/movnti of a tmpN_reg */
+#define tmp1_reg %eax /* tmp1..tmp4 hold the group of four words currently in flight */
+#define tmp2_reg %edx
+#define tmp3_reg %ebx /* pushed on entry and popped on exit by copy_page_sse2 */
+#define tmp4_reg %ebp /* pushed on entry and popped on exit by copy_page_sse2 */
+#else /* not __i386__: 8-byte words and 64-bit registers */
+#define src_reg %rsi /* source pointer; used as passed in (presumably the 2nd argument register - confirm against the ABI in use) */
+#define dst_reg %rdi /* destination pointer; used as passed in (presumably the 1st argument register - confirm against the ABI in use) */
+#define WORD_SIZE 8 /* bytes moved by one mov/movnti of a tmpN_reg */
+#define tmp1_reg %r8 /* tmp1..tmp4 hold the group of four words currently in flight */
+#define tmp2_reg %r9
+#define tmp3_reg %r10
+#define tmp4_reg %r11 /* none of the 64-bit temporaries is saved or restored by copy_page_sse2 */
+#endif
+
+ENTRY(copy_page_sse2) /* copy PAGE_SIZE bytes from src_reg to dst_reg in groups of four words: ordinary loads, non-temporal (movnti) stores */
+#ifdef __i386__ /* i386 only: save the registers used below and fetch both pointers from the stack */
+ push %ebx
+ push %ebp
+ push %esi
+ push %edi
+ mov 6*4(%esp), src_reg /* four pushes plus the return address sit below the arguments: offset 6*4 is the 2nd one (source) */
+ mov 5*4(%esp), dst_reg /* offset 5*4 is the 1st argument (destination) */
+#endif
+ mov $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx /* countdown N; the loop body runs N+2 times, i.e. once per group except the last (stored after the loop) */
+
+ prefetchnta 2*4*WORD_SIZE(src_reg) /* prefetch group 2; the loop prefetch starts at group 3 */
+ mov (src_reg), tmp1_reg /* preload group 0 into tmp1..tmp4 */
+ mov WORD_SIZE(src_reg), tmp2_reg
+ mov 2*WORD_SIZE(src_reg), tmp3_reg
+ mov 3*WORD_SIZE(src_reg), tmp4_reg
+
+0: prefetchnta 3*4*WORD_SIZE(src_reg) /* prefetch three groups ahead; reached only by fall-through or jg, so the last one issued targets the final group of the page */
+1: add $4*WORD_SIZE, src_reg /* step src to the next group; jpe re-enters here to skip the prefetch */
+ movnti tmp1_reg, (dst_reg) /* store word 0 of the previously loaded group ... */
+ mov (src_reg), tmp1_reg /* ... then reload tmp1 from the new group (same store/load pairing for tmp2..tmp4) */
+ dec %ecx /* sets the flags tested by jg/jpe; the mov, movnti and lea that follow do not modify flags */
+ movnti tmp2_reg, WORD_SIZE(dst_reg)
+ mov WORD_SIZE(src_reg), tmp2_reg
+ movnti tmp3_reg, 2*WORD_SIZE(dst_reg)
+ mov 2*WORD_SIZE(src_reg), tmp3_reg
+ movnti tmp4_reg, 3*WORD_SIZE(dst_reg)
+ lea 4*WORD_SIZE(dst_reg), dst_reg /* advance dst with lea, not add, so the flags from dec survive */
+ mov 3*WORD_SIZE(src_reg), tmp4_reg
+ jg 0b /* counter still positive: iterate with a prefetch */
+ jpe 1b /* counter 0 or -1 (low byte 0x00 / 0xff, even parity): iterate without prefetch; -2 (0xfe, odd parity) falls through */
+
+ movnti tmp1_reg, (dst_reg) /* store the final group still held in tmp1..tmp4 */
+ movnti tmp2_reg, WORD_SIZE(dst_reg)
+ movnti tmp3_reg, 2*WORD_SIZE(dst_reg)
+ movnti tmp4_reg, 3*WORD_SIZE(dst_reg)
+
+#ifdef __i386__ /* i386 only: restore the saved registers in reverse push order */
+ pop %edi
+ pop %esi
+ pop %ebp
+ pop %ebx
+#endif
+ sfence /* fence the preceding stores (movnti is weakly ordered) before returning */
+ ret