diff options
author | Keir Fraser <keir.fraser@citrix.com> | 2008-11-12 12:04:15 +0000 |
---|---|---|
committer | Keir Fraser <keir.fraser@citrix.com> | 2008-11-12 12:04:15 +0000 |
commit | be3d9af19ce74ac274d9c510f335f319f25ea701 (patch) | |
tree | c8ab8a3a45a9a16ec1285c601f5590b6c090b969 /xen/arch/x86/copy_page.S | |
parent | 71e65dbc3e12fdf39299a9abe37253bdc7e84bae (diff) | |
download | xen-be3d9af19ce74ac274d9c510f335f319f25ea701.tar.gz xen-be3d9af19ce74ac274d9c510f335f319f25ea701.tar.bz2 xen-be3d9af19ce74ac274d9c510f335f319f25ea701.zip |
x86: add SSE-based copy_page()
On top of the highmem assistance hypercalls added earlier, this provides
a performance improvement of another 12% (measured on Xeon E5345) for
the page copying case.
Signed-off-by: Jan Beulich <jbeulich@novell.com>
Diffstat (limited to 'xen/arch/x86/copy_page.S')
-rw-r--r-- | xen/arch/x86/copy_page.S | 66 |
1 files changed, 66 insertions, 0 deletions
diff --git a/xen/arch/x86/copy_page.S b/xen/arch/x86/copy_page.S
new file mode 100644
index 0000000000..2fd3e533c6
--- /dev/null
+++ b/xen/arch/x86/copy_page.S
@@ -0,0 +1,66 @@
+#include <xen/config.h>
+#include <asm/page.h>
+
+#ifdef __i386__ /* 32-bit build: 32-bit registers, 4-byte words */
+#define src_reg %esi /* pointer the copy reads from */
+#define dst_reg %edi /* pointer the copy writes to */
+#define WORD_SIZE 4 /* bytes moved by one register-sized mov/movnti */
+#define tmp1_reg %eax /* tmp1..tmp4 together hold the 4-word chunk in flight */
+#define tmp2_reg %edx
+#define tmp3_reg %ebx
+#define tmp4_reg %ebp
+#else /* non-i386 build: 64-bit registers, 8-byte words */
+#define src_reg %rsi /* pointer the copy reads from */
+#define dst_reg %rdi /* pointer the copy writes to */
+#define WORD_SIZE 8 /* bytes moved by one register-sized mov/movnti */
+#define tmp1_reg %r8 /* tmp1..tmp4 together hold the 4-word chunk in flight */
+#define tmp2_reg %r9
+#define tmp3_reg %r10
+#define tmp4_reg %r11
+#endif
+
+ENTRY(copy_page_sse2) /* copies PAGE_SIZE bytes from src_reg to dst_reg, 4 words per step, with movnti stores */
+#ifdef __i386__ /* i386 only: save registers and fetch arguments from the stack; the other build uses %rdi/%rsi as found on entry */
+        push    %ebx /* used as tmp3_reg; restored before ret */
+        push    %ebp /* used as tmp4_reg; restored before ret */
+        push    %esi /* used as src_reg; restored before ret */
+        push    %edi /* used as dst_reg; restored before ret */
+        mov     6*4(%esp), src_reg /* second stack argument: 4 saved registers + return address + first argument lie below it */
+        mov     5*4(%esp), dst_reg /* first stack argument: 4 saved registers + return address lie below it */
+#endif /* NOTE(review): argument order looks like (dst, src) -- confirm against the C prototype */
+        mov     $PAGE_SIZE/(4*WORD_SIZE)-3, %ecx /* chunks per page minus 3: the loop body runs %ecx+2 times, the tail stores the last chunk */
+
+        prefetchnta 2*4*WORD_SIZE(src_reg) /* hint for the chunk two ahead of the first one */
+        mov     (src_reg), tmp1_reg /* preload chunk 0 into tmp1..tmp4 */
+        mov     WORD_SIZE(src_reg), tmp2_reg
+        mov     2*WORD_SIZE(src_reg), tmp3_reg
+        mov     3*WORD_SIZE(src_reg), tmp4_reg
+
+0:      prefetchnta 3*4*WORD_SIZE(src_reg) /* hint for the chunk three ahead of the one held in tmp1..tmp4 */
+1:      add     $4*WORD_SIZE, src_reg /* advance src to the next chunk; entry point for iterations that skip the prefetch */
+        movnti  tmp1_reg, (dst_reg) /* store word 0 of the held chunk ... */
+        mov     (src_reg), tmp1_reg /* ... and refill it from the next chunk */
+        dec     %ecx /* sets the flags tested by jg/jpe below; no instruction in between modifies them */
+        movnti  tmp2_reg, WORD_SIZE(dst_reg)
+        mov     WORD_SIZE(src_reg), tmp2_reg
+        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
+        mov     2*WORD_SIZE(src_reg), tmp3_reg
+        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
+        lea     4*WORD_SIZE(dst_reg), dst_reg /* advance dst; lea leaves the flags from dec untouched */
+        mov     3*WORD_SIZE(src_reg), tmp4_reg
+        jg      0b /* counter still > 0: loop again, with prefetch */
+        jpe     1b /* counter 0 or -1 (low byte has even parity): loop again without prefetch, whose target would be past the page; -2 (0xFE, odd parity) falls through */
+
+        movnti  tmp1_reg, (dst_reg) /* tail: store the final chunk loaded by the last iteration */
+        movnti  tmp2_reg, WORD_SIZE(dst_reg)
+        movnti  tmp3_reg, 2*WORD_SIZE(dst_reg)
+        movnti  tmp4_reg, 3*WORD_SIZE(dst_reg)
+
+#ifdef __i386__ /* i386 only: restore the registers pushed on entry, in reverse order */
+        pop     %edi
+        pop     %esi
+        pop     %ebp
+        pop     %ebx
+#endif
+        sfence /* store fence issued after the movnti (non-temporal) stores, before returning */
+        ret |