/* compiled with -msse2, etc */

/*
 * Micro-benchmark: adds two 8-element float arrays 100 times with a plain
 * scalar loop and again with SSE packed adds, prints the time spent in each
 * variant, then prints both result arrays and checks that they agree.
 */

/*
 * Ask for the POSIX.1-2008 API so that <time.h> declares clock_gettime()
 * and CLOCK_THREAD_CPUTIME_ID even in a strict ISO C build (-std=c11).
 * This has to come before the first #include.
 */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

//#include <mmintrin.h>  /* MMX */
//#include <xmmintrin.h> /* SSE */
#include <emmintrin.h>   /* SSE2 */
//#include <pmmintrin.h> /* SSE3 */
//#include <tmmintrin.h> /* SSSE3 */
//#include <nmmintrin.h> /* SSE4.2 SSE4.1 */
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

enum {
    NUM_FLOATS = 8,   /* elements per array; must be a multiple of SSE_LANES */
    SSE_LANES = 4,    /* floats held by one __m128 */
    NUM_REPEATS = 100 /* how often each variant is repeated while timed */
};

typedef unsigned long long ticks;

/*
 * Read the CPU time-stamp counter (x86 only).
 *
 * CPUID is executed first and then RDTSC; the 64-bit counter value is
 * assembled from EDX:EAX. CPUID overwrites EAX, EBX, ECX and EDX, so all
 * four are listed as operands -- otherwise the compiler may keep live
 * values in them across the instruction. EAX/ECX are preset to 0 so the
 * requested CPUID leaf is well defined.
 *
 * Currently unused by main(); kept as an alternative to clock_gettime().
 */
static __inline__ ticks getticks(void)
{
    unsigned a, d;
    unsigned leaf = 0, rb, rc = 0, rd;

    __asm__ __volatile__("cpuid"
                         : "+a"(leaf), "=b"(rb), "+c"(rc), "=d"(rd)
                         :
                         : "memory");
    (void)leaf;
    (void)rb;
    (void)rc;
    (void)rd;

    __asm__ __volatile__("rdtsc" : "=a"(a), "=d"(d));
    return ((ticks)a) | (((ticks)d) << 32);
}

/*
 * Print the time elapsed between two timestamps.
 *
 * b: start timestamp
 * c: end timestamp
 *
 * Writes "duration = <n> nanoseconds" to stdout, where n is c - b.
 */
void print_duration(const struct timespec *b, const struct timespec *c)
{
    long long r = (long long)c->tv_nsec - (long long)b->tv_nsec;

    r += ((long long)(c->tv_sec - b->tv_sec)) * 1000000000LL;
    printf("duration = %lld nanoseconds\n", r);
}

/*
 * Store the current timestamp in *ts, terminating the program if the clock
 * cannot be read.
 *
 * Uses the calling thread's CPU-time clock when <time.h> exposes it; if the
 * POSIX clock is not visible, falls back to C11 timespec_get(), which
 * reports calendar time instead of CPU time.
 */
static void read_clock(struct timespec *ts)
{
#ifdef CLOCK_THREAD_CPUTIME_ID
    if (clock_gettime(CLOCK_THREAD_CPUTIME_ID, ts) != 0) {
        perror("clock_gettime");
        exit(EXIT_FAILURE);
    }
#else
    if (timespec_get(ts, TIME_UTC) != TIME_UTC) {
        fputs("timespec_get failed\n", stderr);
        exit(EXIT_FAILURE);
    }
#endif
}

/*
 * Run and time both variants, print the results, and verify them.
 *
 * Returns 0 when the scalar and SSE results are identical, EXIT_FAILURE
 * otherwise.
 */
int main(void)
{
    /*
     * _mm_load_ps/_mm_store_ps require 16-byte aligned addresses, which a
     * plain float array does not guarantee, so request it explicitly.
     */
    float z1[NUM_FLOATS] __attribute__((aligned(16))) =
        {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
    float z2[NUM_FLOATS] __attribute__((aligned(16))) =
        {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f};
    float z3[NUM_FLOATS];                               /* scalar result */
    float z4[NUM_FLOATS] __attribute__((aligned(16)));  /* SSE result */
    struct timespec b, e;
    int mismatches = 0;
    int i, j;

    /* Variant 1: one float per iteration. */
    read_clock(&b);
    for (j = 0; j < NUM_REPEATS; j++) {
        for (i = 0; i < NUM_FLOATS; i++) {
            z3[i] = z1[i] + z2[i];
        }
    }
    read_clock(&e);
    print_duration(&b, &e);

    /* Variant 2: four floats per iteration via a packed SSE add. */
    read_clock(&b);
    for (j = 0; j < NUM_REPEATS; j++) {
        for (i = 0; i < NUM_FLOATS; i += SSE_LANES) {
            __m128 sum = _mm_add_ps(_mm_load_ps(&z1[i]), _mm_load_ps(&z2[i]));

            _mm_store_ps(&z4[i], sum);
        }
    }
    read_clock(&e);
    print_duration(&b, &e);

    /* Show every element pair and count the ones that differ. */
    for (i = 0; i < NUM_FLOATS; i++) {
        printf("%d %f %f\n", i, z3[i], z4[i]);
        if (z3[i] != z4[i]) {
            mismatches++;
        }
    }

    if (mismatches != 0) {
        printf("Wrong.\n");
        return EXIT_FAILURE;
    }

    printf("Correct, results matched.\n");
    return 0;
}