From bd5657a5bcb73ee0dfcdc4ca886e96eba12cd5dd Mon Sep 17 00:00:00 2001 From: adash Date: Wed, 20 Jan 2010 23:53:42 +0000 Subject: [PATCH] final manual prefetch changes for MatrixMultiply --- .../MatrixMultiply/MatrixMultiplyD3.java | 81 ++++++++++++++----- 1 file changed, 60 insertions(+), 21 deletions(-) diff --git a/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/MatrixMultiplyD3.java b/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/MatrixMultiplyD3.java index 5c956b71..2a8239d8 100644 --- a/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/MatrixMultiplyD3.java +++ b/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/MatrixMultiplyD3.java @@ -15,62 +15,101 @@ public class MatrixMultiply extends Thread{ public void run() { Barrier barr=new Barrier("128.195.136.162"); + int mynumthreads, mytid, P, myx0, myx1, myy0, myy1; atomic { mmul.setValues(tid, numthreads); + myx0=x0; + myx1=x1; + myy0=y0; + myy1=y1; + mynumthreads=numthreads; + mytid=tid; + P=mmul.P; } Barrier.enterBarrier(barr); - + atomic { - short[] offsets = new short[4]; - // Prefetch mmul.btranspose[][] matrix + short[] offsets = new short[6]; + // Prefetch mmul.btranspose[][][] matrix //Get all of B first...we need them first offsets[0] = getoffset{MMul, btranspose}; offsets[1] = (short) 0; - offsets[2] = (short) y0; - offsets[3] = (short) (y1 - y0 -1); + offsets[2] = (short) 0; + offsets[3] = (short) 15; + offsets[4] = (short) y0; + offsets[5] = (short) (y1 - y0 -1); System.rangePrefetch(mmul, offsets); //Get first part of A offsets[0] = getoffset{MMul, a}; offsets[1] = (short) 0; - offsets[2] = (short) x0; + offsets[2] = (short) 0; offsets[3] = (short) 15; + offsets[4] = (short) x0; + offsets[5] = (short) 15; System.rangePrefetch(mmul, offsets); //Get first part of C offsets[0] = getoffset{MMul, c}; offsets[1] = (short) 0; System.rangePrefetch(mmul, offsets); - short[] offsets2=new short[2]; + short[] offsets2=new short[4]; double la[][][]=mmul.a; double lc[][][]=mmul.c; double lb[][][]=mmul.btranspose; int M=mmul.M; - int P=mmul.P; //Use btranspose for cache performance - for(int q=0;qP) { + int lx=P-ll-1; + if(lx>0) { + offsets2[1]=(short) lx; + offsets2[2] = (short) y0; + offsets2[3] = (short) (y1 - y0 -1); + System.rangePrefetch(lb, offsets2); + offsets2[2] = (short) x0; + offsets2[3] = (short) 15; + System.rangePrefetch(la, offsets2); + System.rangePrefetch(lc, offsets2); + } + } else { + offsets2[1]=(short) 15; + offsets2[2] = (short) y0; + offsets2[3] = (short) (y1 - y0 -1); + System.rangePrefetch(lb, offsets2); + offsets2[2] = (short) x0; + offsets2[3] = (short) 15; + System.rangePrefetch(la, offsets2); + System.rangePrefetch(lc, offsets2); + } + } + + short[] offsets3=new short[2]; int l=8; for(int i = x0; i< x1; i++,l++){ double a[]=ra[i]; double c[]=rc[i]; - if ((l&15)==0) { - offsets2[0] = (short) (x0+l); + if((l&15)==0) { + offsets3[0]=(short) (x0+l); if ((x0+l+16)>x1) { int x=x1-x0-l-1; if (x>0) { - offsets2[1]=(short) x; - System.rangePrefetch(la, offsets2); - System.rangePrefetch(lc, offsets2); + offsets3[1]=(short) x; + System.rangePrefetch(ra, offsets3); + System.rangePrefetch(rc, offsets3); } } else { - offsets2[1] = (short) 15; - System.rangePrefetch(la, offsets2); - System.rangePrefetch(lc, offsets2); + offsets3[1] = (short) 15; + System.rangePrefetch(ra, offsets3); + System.rangePrefetch(rc, offsets3); } } for (int j = y0; j < y1; j++) { @@ -81,10 +120,10 @@ public class MatrixMultiply extends Thread{ } c[j]=innerProduct; } - } - } - } - } + } //end of inner for + }//end of outer for + }//end of atomic + }//end of run public static void main(String[] args) { int NUM_THREADS = 4; -- 2.34.1