diff -N -r -c HLib-1.3orig/Library/laplacebem.c HLib-1.3/Library/laplacebem.c *** HLib-1.3orig/Library/laplacebem.c 2004-12-12 17:42:33.000000000 +0100 --- HLib-1.3/Library/laplacebem.c 2006-02-27 10:25:59.643972448 +0100 *************** *** 43,49 **** --- 43,53 ---- } #ifdef HAVE_SSE2 + #if __GNUC__ >= 4 + typedef double v2df __attribute__((vector_size(16))); + #else typedef double v2df __attribute__((mode(V2DF))); + #endif static const v2df sse_slp_factor = { 1.0 / 4.0 / M_PI, 1.0 / 4.0 / M_PI}; static const v2df sse_one = { 1.0, 1.0 }; *************** *** 53,61 **** static v2df sse_loadsingle(double x) { ! v2df xx; ! xx = __builtin_ia32_loadsd(&x); ! return __builtin_ia32_shufpd(xx, xx, 0); } static v2df --- 57,73 ---- static v2df sse_loadsingle(double x) { ! double xx[2]; ! xx[0] = xx[1] = x; ! return __builtin_ia32_loadupd(xx); ! } ! ! static void ! sse_storesingle(double *buf, v2df x) ! { ! double xx[2]; ! __builtin_ia32_storeupd(xx, x); ! buf[0] = xx[0]; } static v2df *************** *** 465,473 **** yy[1] = b0 * xj[0][1] + b1 * xj[1][1] + b2 * xj[2][1]; yy[2] = b0 * xj[0][2] + b1 * xj[1][2] + b2 * xj[2][2]; ! __builtin_ia32_storesd(qbuf+q, ! sse_slp_kernel(xx[0], xx[1], xx[2], ! yy[0], yy[1], yy[2])); q++; } --- 477,485 ---- yy[1] = b0 * xj[0][1] + b1 * xj[1][1] + b2 * xj[2][1]; yy[2] = b0 * xj[0][2] + b1 * xj[1][2] + b2 * xj[2][2]; ! sse_storesingle(qbuf+q, ! sse_slp_kernel(xx[0], xx[1], xx[2], ! yy[0], yy[1], yy[2])); q++; } *************** *** 895,904 **** yy[1] = b0 * xj[0][1] + b1 * xj[1][1] + b2 * xj[2][1]; yy[2] = b0 * xj[0][2] + b1 * xj[1][2] + b2 * xj[2][2]; ! __builtin_ia32_storesd(qbuf+q, ! sse_dlp_kernel(xx[0], xx[1], xx[2], ! yy[0], yy[1], yy[2], ! nn[0], nn[1], nn[2])); q++; } --- 907,916 ---- yy[1] = b0 * xj[0][1] + b1 * xj[1][1] + b2 * xj[2][1]; yy[2] = b0 * xj[0][2] + b1 * xj[1][2] + b2 * xj[2][2]; ! sse_storesingle(qbuf+q, ! sse_dlp_kernel(xx[0], xx[1], xx[2], ! yy[0], yy[1], yy[2], ! nn[0], nn[1], nn[2])); q++; } *************** *** 1910,1916 **** xs[2] = sse_loadsingle(x2[2][0]); kslp = sse_slp_kernel(xs[0], xs[1], xs[2], ys[0], ys[1], ys[2]); ! __builtin_ia32_storesd(S+(i-1)+j*ldS, kslp); } j++; --- 1922,1928 ---- xs[2] = sse_loadsingle(x2[2][0]); kslp = sse_slp_kernel(xs[0], xs[1], xs[2], ys[0], ys[1], ys[2]); ! sse_storesingle(S+(i-1)+j*ldS, kslp); } j++;