/*
 * Page clear / copy micro-benchmarks — userspace port of the 2.4/2.6
 * kernel clear_page()/copy_page() implementations, timed with rdtsc.
 *
 * (C) 2000 Arjan van de Ven and others, licensed under the terms of the GPL.
 * Copyright (C) 2005 Intel Corporation. Licensed under the terms of the GPL.
 * $Revision: 1.6 $
 *
 * NOTE(review): this copy of the file is damaged.  The "#include <...>"
 * header names and a large span in the middle (the tail of my_memset(),
 * the rdtsc macro, the clear_func/copy_func typedefs, the head of
 * test_one_clearpage() and several clear/copy implementations) were eaten,
 * apparently by HTML-tag stripping.  The includes, the rdtsc macro, the
 * typedefs and test_one_clearpage() are reconstructed below and marked
 * NOTE(review); the remaining lost functions are declared extern and must
 * be restored from a pristine fast.c before this will link.
 */
static char cvsid[] = "$Id: fast.c,v 1.6 2000/09/23 09:05:45 arjan Exp $";

#include <stdio.h>   /* NOTE(review): restored — printf() */
#include <stdlib.h>  /* NOTE(review): restored — malloc()/free(), size_t */

/*
 * The 2.4 kernel one, adapted for userspace: clear one 4K page with
 * plain 64-bit integer stores, 128 bytes (16 movq) per asm block.
 */
static void kernel_clear_page(void *page)
{
	int i;
	long zero = 0;

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
			" movq %1, (%0)\n"
			" movq %1, 8(%0)\n"
			" movq %1, 16(%0)\n"
			" movq %1, 24(%0)\n"
			" movq %1, 32(%0)\n"
			" movq %1, 40(%0)\n"
			" movq %1, 48(%0)\n"
			" movq %1, 56(%0)\n"
			" movq %1, 64(%0)\n"
			" movq %1, 72(%0)\n"
			" movq %1, 80(%0)\n"
			" movq %1, 88(%0)\n"
			" movq %1, 96(%0)\n"
			" movq %1, 104(%0)\n"
			" movq %1, 112(%0)\n"
			" movq %1, 120(%0)\n"
			: : "r" (page), "r" (zero) : "memory");
		page += 128;	/* void* arithmetic is a GCC extension */
	}
}

/*
 * Same as kernel_clear_page() but with movnti non-temporal stores
 * (bypass the cache), followed by an sfence to drain the WC buffers.
 */
static void nt_clear_page(void *page)
{
	int i;
	long zero = 0;

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
			" movnti %1, (%0)\n"
			" movnti %1, 8(%0)\n"
			" movnti %1, 16(%0)\n"
			" movnti %1, 24(%0)\n"
			" movnti %1, 32(%0)\n"
			" movnti %1, 40(%0)\n"
			" movnti %1, 48(%0)\n"
			" movnti %1, 56(%0)\n"
			" movnti %1, 64(%0)\n"
			" movnti %1, 72(%0)\n"
			" movnti %1, 80(%0)\n"
			" movnti %1, 88(%0)\n"
			" movnti %1, 96(%0)\n"
			" movnti %1, 104(%0)\n"
			" movnti %1, 112(%0)\n"
			" movnti %1, 120(%0)\n"
			: : "r" (page), "r" (zero) : "memory");
		page += 128;
	}
	__asm__ volatile("sfence");
}

/*
 * nt_clear_page() plus prefetcht1 of the just-written lines, pulling the
 * cleared page back toward the cache for the next consumer.
 */
static void nt_clear_page_pre(void *page)
{
	int i;
	long zero = 0;

	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
			" movnti %1, (%0)\n"
			" movnti %1, 8(%0)\n"
			" movnti %1, 16(%0)\n"
			" movnti %1, 24(%0)\n"
			" movnti %1, 32(%0)\n"
			" movnti %1, 40(%0)\n"
			" movnti %1, 48(%0)\n"
			" movnti %1, 56(%0)\n"
			" movnti %1, 64(%0)\n"
			" movnti %1, 72(%0)\n"
			" movnti %1, 80(%0)\n"
			" movnti %1, 88(%0)\n"
			" movnti %1, 96(%0)\n"
			" movnti %1, 104(%0)\n"
			" movnti %1, 112(%0)\n"
			" movnti %1, 120(%0)\n"
			" prefetcht1 (%0)\n"
			" prefetcht1 64(%0)\n"
			: : "r" (page), "r" (zero) : "memory");
		page += 128;
	}
	__asm__ volatile("sfence");
}

/*
 * 2.4 kernel MMX clear_page: save the FPU state (fsave area is 108
 * bytes), zero %mm0, stream it out with movq, then restore the FPU.
 */
static void fast_clear_page(void *page)
{
	int i;
	char fpu_save[108];

	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
	__asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : );
	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
			" movq %%mm0, (%0)\n"
			" movq %%mm0, 8(%0)\n"
			" movq %%mm0, 16(%0)\n"
			" movq %%mm0, 24(%0)\n"
			" movq %%mm0, 32(%0)\n"
			" movq %%mm0, 40(%0)\n"
			" movq %%mm0, 48(%0)\n"
			" movq %%mm0, 56(%0)\n"
			" movq %%mm0, 64(%0)\n"
			" movq %%mm0, 72(%0)\n"
			" movq %%mm0, 80(%0)\n"
			" movq %%mm0, 88(%0)\n"
			" movq %%mm0, 96(%0)\n"
			" movq %%mm0, 104(%0)\n"
			" movq %%mm0, 112(%0)\n"
			" movq %%mm0, 120(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
}

/* modified version for Athlon-family processors: movntq non-temporal
 * MMX stores, 64 bytes per asm block, sfence before restoring the FPU. */
static void faster_clear_page(void *page)
{
	int i;
	char fpu_save[108];

	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
	__asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : );
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			" movntq %%mm0, (%0)\n"
			" movntq %%mm0, 8(%0)\n"
			" movntq %%mm0, 16(%0)\n"
			" movntq %%mm0, 24(%0)\n"
			" movntq %%mm0, 32(%0)\n"
			" movntq %%mm0, 40(%0)\n"
			" movntq %%mm0, 48(%0)\n"
			" movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}
	__asm__ __volatile__ ( " sfence \n " : : );
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
}

/* test version to go even faster... this might be the same as faster_
 * but serves as my playground.  (Currently identical to
 * faster_clear_page.) */
static void even_faster_clear_page(void *page)
{
	int i;
	char fpu_save[108];

	__asm__ __volatile__ ( " fsave %0; fwait\n"::"m"(fpu_save[0]) );
	__asm__ __volatile__ ( " pxor %%mm0, %%mm0\n" : : );
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			" movntq %%mm0, (%0)\n"
			" movntq %%mm0, 8(%0)\n"
			" movntq %%mm0, 16(%0)\n"
			" movntq %%mm0, 24(%0)\n"
			" movntq %%mm0, 32(%0)\n"
			" movntq %%mm0, 40(%0)\n"
			" movntq %%mm0, 48(%0)\n"
			" movntq %%mm0, 56(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}
	__asm__ __volatile__ ( " sfence \n " : : );
	__asm__ __volatile__ ( " frstor %0;\n"::"m"(fpu_save[0]) );
}

/* 16-byte-aligned 128-bit slot used to spill %xmm0 around the SSE loops. */
typedef struct {
	unsigned long a, b;
} __attribute__((aligned(16))) xmm_store_t;

/*
 * SSE2 clear: spill %xmm0, zero it, stream it out with movntdq
 * (non-temporal 16-byte stores), sfence, restore %xmm0.
 */
static void xmm_clear_page(void *page)
{
	int i;
	xmm_store_t xmm_save[1];

	__asm__ __volatile__ (
		" movdqu %%xmm0,(%0)\n"
		" pxor %%xmm0, %%xmm0\n"
		:: "r" (xmm_save) : "memory" );
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			" movntdq %%xmm0, (%0)\n"
			" movntdq %%xmm0, 16(%0)\n"
			" movntdq %%xmm0, 32(%0)\n"
			" movntdq %%xmm0, 48(%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}
	__asm__ __volatile__ (
		" sfence \n "
		" movdqu (%0),%%xmm0\n"
		:: "r" (xmm_save) );
}

/* xmm_clear_page() plus a prefetcht1 of each just-written line. */
static void xmmp_clear_page(void *page)
{
	int i;
	xmm_store_t xmm_save[1];

	__asm__ __volatile__ (
		" movdqu %%xmm0,(%0)\n"
		" pxor %%xmm0, %%xmm0\n"
		:: "r" (xmm_save) : "memory" );
	for (i = 0; i < 4096/64; i++) {
		__asm__ __volatile__ (
			" movntdq %%xmm0, (%0)\n"
			" movntdq %%xmm0, 16(%0)\n"
			" movntdq %%xmm0, 32(%0)\n"
			" movntdq %%xmm0, 48(%0)\n"
			"prefetcht1 (%0)\n"
			: : "r" (page) : "memory");
		page += 64;
	}
	__asm__ __volatile__ (
		" sfence \n "
		" movdqu (%0),%%xmm0\n"
		:: "r" (xmm_save) );
}

/*
 * SSE2 clear with cached (movdqa) stores and prefetcht1, 128 bytes per
 * asm block.  The trailing sfence is kept for symmetry with the
 * non-temporal variants.
 */
static void xmm3_clear_page(void *page)
{
	int i;
	xmm_store_t xmm_save[1];

	__asm__ __volatile__ (
		" movdqu %%xmm0,(%0)\n"
		" pxor %%xmm0, %%xmm0\n"
		:: "r" (xmm_save) : "memory" );
	for (i = 0; i < 4096/128; i++) {
		__asm__ __volatile__ (
			" movdqa %%xmm0, (%0)\n"
			" movdqa %%xmm0, 16(%0)\n"
			" movdqa %%xmm0, 32(%0)\n"
			" movdqa %%xmm0, 48(%0)\n"
			"prefetcht1 (%0)\n"
			" movdqa %%xmm0, 64(%0)\n"
			" movdqa %%xmm0, 80(%0)\n"
			" movdqa %%xmm0, 96(%0)\n"
			" movdqa %%xmm0, 112(%0)\n"
			"prefetcht1 64(%0)\n"
			: : "r" (page) : "memory");
		page += 128;
	}
	__asm__ __volatile__ (
		" sfence \n "
		" movdqu (%0),%%xmm0\n"
		:: "r" (xmm_save) );
}

/*
 * memset() replacement: byte-fill up to 16-byte alignment, then fill the
 * rest.  BUG FIX(review): the original alignment loop tested
 * ((long)page & 15) but only ever advanced `dst`, so a misaligned buffer
 * looped byte-by-byte over the whole count; test `dst` instead.
 */
static void my_memset(void *page, int fill, size_t count)
{
	unsigned char *dst = page;
	int i;
	xmm_store_t xmm_save[1];

	__asm__ __volatile__ (
		" movdqu %%xmm0,(%0)\n"
		" pxor %%xmm0, %%xmm0\n"
		:: "r" (xmm_save) : "memory" );
	while (((long)dst & 15) && (count > 0)) {
		*dst++ = fill;
		count--;
	}
	/*
	 * NOTE(review): the remainder of this function was destroyed by the
	 * text corruption.  Reconstructed as a plain byte fill so the
	 * function keeps memset semantics; the original presumably streamed
	 * a replicated fill pattern with movntdq.  Restore from pristine
	 * fast.c to benchmark the SSE path.
	 */
	for (i = 0; i < (int)count; i++)
		dst[i] = (unsigned char)fill;
	__asm__ __volatile__ (
		" movdqu (%0),%%xmm0\n"
		:: "r" (xmm_save) );
}

/*
 * NOTE(review): everything from here to test_one_clearpage() was lost in
 * the corrupted span and is reconstructed.  The rdtsc macro is the
 * classic <asm/msr.h> form; the typedefs match the visible uses
 * (test_one_copypage takes `copy_func *`); test_one_clearpage() is
 * rebuilt by symmetry with the fully-visible test_one_copypage() and its
 * surviving tail (overflow check + per-page printf dividing by 4*1024).
 */
#define rdtsc(low, high) \
	__asm__ __volatile__("rdtsc" : "=a" (low), "=d" (high))

typedef void (clear_func)(void *);
typedef void (copy_func)(void *, void *);

/* Definitions lost to corruption — restore from pristine fast.c. */
extern void normal_clear_page(void *page);
extern void slow_zero_page(void *page);
extern void xmma_clear_page(void *page);
extern void xmm2_clear_page(void *page);
extern void xmma2_clear_page(void *page);
extern void normal_copy_page(void *to, void *from);
extern void slow_copy_page(void *to, void *from);
extern void fast_copy_page(void *to, void *from);
extern void faster_copy_page(void *to, void *from);
extern void even_faster_copy_page(void *to, void *from);
extern void xmm_copy_page_no_prefetch(void *to, void *from);
extern void xmm_copy_page(void *to, void *from);
extern void xmma_copy_page(void *to, void *from);
extern void xmm3_copy_page(void *to, void *from);

/*
 * Time one clear_page implementation over the whole 16MB buffer
 * (4*1024 pages) and print the average cycles per page.
 * NOTE(review): head of this function reconstructed (see above).
 */
void test_one_clearpage(clear_func *func, char *name, char *Buffer)
{
	char *temp;
	int i;
	unsigned int blow, bhigh, alow, ahigh;
	unsigned long long before, after;

	rdtsc(blow, bhigh);
	temp = Buffer;
	for (i = 0; i < 4*1024; i++) {
		func(temp);
		temp += 4096;
	}
	rdtsc(alow, ahigh);
	before = blow + (((long long)bhigh)<<32);
	after = alow + (((long long)ahigh)<<32);
	if (before > after) {
		printf("test invalid; timer overflow \n");
		return;
	}
	printf("clear_page function '%s'\t took %4lli cycles per page\n", name, (after-before)/(4*1024));
}

/*
 * 2.6 kernel x86-64 copy_page: unrolled 64-byte integer copy with
 * prefetcht0 five lines ahead, last 5 iterations without prefetch.
 * ABI: %rdi = dest, %rsi = src.  (%r13 is saved/restored but unused —
 * kept from the kernel original.)
 */
__asm__(
"	.globl v26_copy_page\n"
"	.p2align 4\n"
"v26_copy_page:\n"
"	subq $3*8,%rsp\n"
"	movq %rbx,(%rsp)\n"
"	movq %r12,1*8(%rsp)\n"
"	movq %r13,2*8(%rsp)\n"
"	movl $(4096/64)-5,%ecx\n"
"	.p2align 4\n"
".Loop64: \n"
"	dec %rcx\n"
"	movq (%rsi), %rax\n"
"	movq 8 (%rsi), %rbx\n"
"	movq 16 (%rsi), %rdx\n"
"	movq 24 (%rsi), %r8\n"
"	movq 32 (%rsi), %r9\n"
"	movq 40 (%rsi), %r10\n"
"	movq 48 (%rsi), %r11\n"
"	movq 56 (%rsi), %r12\n"
"	prefetcht0 5*64(%rsi)\n"
"	movq %rax, (%rdi)\n"
"	movq %rbx, 8 (%rdi)\n"
"	movq %rdx, 16 (%rdi)\n"
"	movq %r8, 24 (%rdi)\n"
"	movq %r9, 32 (%rdi)\n"
"	movq %r10, 40 (%rdi)\n"
"	movq %r11, 48 (%rdi)\n"
"	movq %r12, 56 (%rdi)\n"
"	leaq 64 (%rsi), %rsi\n"
"	leaq 64 (%rdi), %rdi\n"
"	jnz .Loop64\n"
"	movl $5,%ecx\n"
"	.p2align 4\n"
".Loop2: \n"
"	decl %ecx\n"
"	movq (%rsi), %rax\n"
"	movq 8 (%rsi), %rbx\n"
"	movq 16 (%rsi), %rdx\n"
"	movq 24 (%rsi), %r8\n"
"	movq 32 (%rsi), %r9\n"
"	movq 40 (%rsi), %r10\n"
"	movq 48 (%rsi), %r11\n"
"	movq 56 (%rsi), %r12\n"
"	movq %rax, (%rdi)\n"
"	movq %rbx, 8 (%rdi)\n"
"	movq %rdx, 16 (%rdi)\n"
"	movq %r8, 24 (%rdi)\n"
"	movq %r9, 32 (%rdi)\n"
"	movq %r10, 40 (%rdi)\n"
"	movq %r11, 48 (%rdi)\n"
"	movq %r12, 56 (%rdi)\n"
"	leaq 64(%rdi),%rdi \n"
"	leaq 64(%rsi),%rsi \n"
"	jnz .Loop2 \n"
"	movq (%rsp),%rbx\n"
"	movq 1*8(%rsp),%r12\n"
"	movq 2*8(%rsp),%r13\n"
"	addq $3*8,%rsp\n"
"	ret\n"
);

/*
 * Same copy loop but with movnti non-temporal stores.
 * NOTE(review): no sfence before ret — callers that rely on ordering of
 * the copied data would need one; harmless for this benchmark.
 */
__asm__(
"	.globl nt_copy_page\n"
"	.p2align 4\n"
"nt_copy_page:\n"
"	subq $3*8,%rsp\n"
"	movq %rbx,(%rsp)\n"
"	movq %r12,1*8(%rsp)\n"
"	movq %r13,2*8(%rsp)\n"
"	movl $(4096/64)-5,%ecx\n"
"	.p2align 4\n"
"1: \n"
"	dec %rcx\n"
"	movq (%rsi), %rax\n"
"	movq 8 (%rsi), %rbx\n"
"	movq 16 (%rsi), %rdx\n"
"	movq 24 (%rsi), %r8\n"
"	movq 32 (%rsi), %r9\n"
"	movq 40 (%rsi), %r10\n"
"	movq 48 (%rsi), %r11\n"
"	movq 56 (%rsi), %r12\n"
"	prefetcht0 5*64(%rsi)\n"
"	movnti %rax, (%rdi)\n"
"	movnti %rbx, 8 (%rdi)\n"
"	movnti %rdx, 16 (%rdi)\n"
"	movnti %r8, 24 (%rdi)\n"
"	movnti %r9, 32 (%rdi)\n"
"	movnti %r10, 40 (%rdi)\n"
"	movnti %r11, 48 (%rdi)\n"
"	movnti %r12, 56 (%rdi)\n"
"	leaq 64 (%rsi), %rsi\n"
"	leaq 64 (%rdi), %rdi\n"
"	jnz 1b\n"
"	movl $5,%ecx\n"
"	.p2align 4\n"
"2: \n"
"	decl %ecx\n"
"	movq (%rsi), %rax\n"
"	movq 8 (%rsi), %rbx\n"
"	movq 16 (%rsi), %rdx\n"
"	movq 24 (%rsi), %r8\n"
"	movq 32 (%rsi), %r9\n"
"	movq 40 (%rsi), %r10\n"
"	movq 48 (%rsi), %r11\n"
"	movq 56 (%rsi), %r12\n"
"	movnti %rax, (%rdi)\n"
"	movnti %rbx, 8 (%rdi)\n"
"	movnti %rdx, 16 (%rdi)\n"
"	movnti %r8, 24 (%rdi)\n"
"	movnti %r9, 32 (%rdi)\n"
"	movnti %r10, 40 (%rdi)\n"
"	movnti %r11, 48 (%rdi)\n"
"	movnti %r12, 56 (%rdi)\n"
"	leaq 64(%rdi),%rdi \n"
"	leaq 64(%rsi),%rsi \n"
"	jnz 2b \n"
"	movq (%rsp),%rbx\n"
"	movq 1*8(%rsp),%r12\n"
"	movq 2*8(%rsp),%r13\n"
"	addq $3*8,%rsp\n"
"	ret\n"
);

/*
 * Time one copy_page implementation: copy 2*1024 pages (8MB) from the
 * upper half of the buffer into the lower half, print cycles per page.
 */
void test_one_copypage(copy_func *func, char *name, char *Buffer)
{
	char *temp;
	int i;
	unsigned int blow, bhigh, alow, ahigh;
	unsigned long long before, after;

	rdtsc(blow, bhigh);
	temp = Buffer;
	for (i = 0; i < 2*1024; i++) {
		func(temp, temp+8*1024*1024);
		//func(temp,temp+256*1024);
		temp += 4096;
	}
	rdtsc(alow, ahigh);
	before = blow + (((long long)bhigh)<<32);
	after = alow + (((long long)ahigh)<<32);
	if (before > after) {
		printf("test invalid; timer overflow \n");
		return;
	}
	printf("copy_page function '%s'\t took %4lli cycles per page\n", name, (after-before)/(2*1024));
}

/* Run every clear_page variant; first call is a cache warm-up. */
void test_clearpage(char *Buffer)
{
	printf("clear_page() tests \n");
	test_one_clearpage(fast_clear_page, "warm up run", Buffer);
	test_one_clearpage(kernel_clear_page, "kernel clear", Buffer);
	test_one_clearpage(normal_clear_page, "2.4 non MMX", Buffer);
	test_one_clearpage(slow_zero_page, "2.4 MMX fallback", Buffer);
	test_one_clearpage(fast_clear_page, "2.4 MMX version", Buffer);
	test_one_clearpage(faster_clear_page, "faster_clear_page", Buffer);
	test_one_clearpage(even_faster_clear_page, "even_faster_clear", Buffer);
	test_one_clearpage(xmm_clear_page, "xmm_clear ", Buffer);
	test_one_clearpage(xmmp_clear_page, "xmmp_clear", Buffer);
	test_one_clearpage(xmma_clear_page, "xmma_clear", Buffer);
	test_one_clearpage(xmm2_clear_page, "xmm2_clear", Buffer);
	test_one_clearpage(xmma2_clear_page, "xmma2_clear", Buffer);
	test_one_clearpage(xmm3_clear_page, "xmm3_clear", Buffer);
	test_one_clearpage(nt_clear_page, "nt clear ", Buffer);
	test_one_clearpage(nt_clear_page_pre, "nt clear pre", Buffer);
	test_one_clearpage(kernel_clear_page, "kernel clear", Buffer);
}

/* Run every copy_page variant; first call is a cache warm-up. */
void test_copypage(char *Buffer)
{
	extern void v26_copy_page(void *, void *);
	extern void nt_copy_page(void *, void *);

	printf("copy_page() tests \n");
	test_one_copypage(fast_copy_page, "warm up run", Buffer);
	test_one_copypage(normal_copy_page, "2.4 non MMX", Buffer);
	test_one_copypage(slow_copy_page, "2.4 MMX fallback", Buffer);
	test_one_copypage(fast_copy_page, "2.4 MMX version", Buffer);
	test_one_copypage(faster_copy_page, "faster_copy", Buffer);
	test_one_copypage(even_faster_copy_page, "even_faster", Buffer);
	test_one_copypage(xmm_copy_page_no_prefetch, "xmm_copy_page_no", Buffer);
	test_one_copypage(xmm_copy_page, "xmm_copy_page", Buffer);
	test_one_copypage(xmma_copy_page, "xmma_copy_page", Buffer);
	test_one_copypage(xmm3_copy_page, "xmm3_copy_page", Buffer);
	test_one_copypage(v26_copy_page, "v26_copy_page", Buffer);
	test_one_copypage(nt_copy_page, "nt_copy_page", Buffer);
}

int main()
{
	char *Buffer, *orig;

	/* 16MB of pages plus slack so we can round up to a page boundary */
	orig = Buffer = malloc(1024*1024*16 + 4096);
	if (!orig) {	/* FIX(review): original never checked malloc */
		printf("out of memory\n");
		return 1;
	}
	Buffer += (4096 - ((long)Buffer & 4095));
	printf("SSE test program %s buffer = %p\n", cvsid, Buffer);
	test_clearpage(Buffer);
	printf("\n");
	test_copypage(Buffer);
	free(orig);
	return 0;
}