From: adash <adash>
Date: Mon, 11 Jan 2010 07:35:47 +0000 (+0000)
Subject: changes for manual prefetch in matrix multiply
X-Git-Url: http://demsky.eecs.uci.edu/git/?a=commitdiff_plain;h=e2e19ce636e3773c13370796b8ed9123735ef3b9;p=IRC.git

changes for manual prefetch in matrix multiply
---

diff --git a/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/MatrixMultiplyD3.java b/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/MatrixMultiplyD3.java
new file mode 100644
index 00000000..5c956b71
--- /dev/null
+++ b/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/MatrixMultiplyD3.java
@@ -0,0 +1,210 @@
+public class MatrixMultiply extends Thread{ // worker: for each q, fills c[q][i][j] for i in [x0,x1), j in [y0,y1) with the dot product of a[q][i] and btranspose[q][j]
+    MMul mmul; // matrix holder this worker reads (a, btranspose) and writes (c)
+    public int x0, y0, x1, y1; // half-open row range [x0,x1) and column range [y0,y1) handled by this worker
+    public int tid, numthreads; // this worker's index and the worker count, both forwarded to MMul.setValues
+    // Stores the arguments as-is. Note the parameter order is (x0, x1, y0, y1), which differs from the field declaration order.
+    public MatrixMultiply(MMul mmul, int x0, int x1, int y0, int y1, int tid, int numthreads) {
+	this.mmul = mmul;
+	this.x0 = x0;
+	this.y0 = y0;
+	this.x1 = x1;
+	this.y1 = y1;
+	this.tid=tid;
+	this.numthreads=numthreads;
+    }
+    // Thread body: fill this worker's share of the inputs, wait on the barrier, then compute its block of c while issuing manual range prefetches.
+    public void run() {
+      Barrier barr=new Barrier("128.195.136.162"); // same address as mid[0], where main() starts the BarrierServer
+      atomic {
+        mmul.setValues(tid, numthreads);
+      }
+      // Barrier between initialization and the multiply (presumably so every worker's rows exist before any are read; confirm against Barrier).
+      Barrier.enterBarrier(barr);
+
+      atomic {
+        short[] offsets = new short[4]; // argument array for System.rangePrefetch; NOTE(review): layout looks like (field offset, 0, start index, count-1), confirm against the runtime
+        // Prefetch from mmul.btranspose
+        //Get all of B first...we need them first
+        offsets[0] = getoffset{MMul, btranspose};
+        offsets[1] = (short) 0;
+        offsets[2] = (short) y0;
+        offsets[3] = (short) (y1 - y0 -1); // narrowing cast; assumes the range fits in a short, TODO confirm for large SIZE
+        System.rangePrefetch(mmul, offsets);
+
+        //Get first part of A
+        offsets[0] = getoffset{MMul, a};
+        offsets[1] = (short) 0;
+        offsets[2] = (short) x0;
+        offsets[3] = (short) 15; // NOTE(review): presumably 16 entries (count-1), matching the (l&15) cadence below; not clamped to x1, confirm
+        System.rangePrefetch(mmul, offsets);
+
+        //Get first part of C
+        offsets[0] = getoffset{MMul, c};
+        offsets[1] = (short) 0;
+        System.rangePrefetch(mmul, offsets); // offsets[2] and offsets[3] still hold x0 and 15 from the A prefetch
+        short[] offsets2=new short[2]; // (start, extent) pair reused by the in-loop prefetches
+        // Local aliases for the three matrix stacks and the dimensions used by the loops.
+        double la[][][]=mmul.a;
+        double lc[][][]=mmul.c;
+        double lb[][][]=mmul.btranspose;
+        int M=mmul.M;
+        int P=mmul.P;
+        //Use btranspose for cache performance
+        for(int q=0;q<P;q++) { // one pass per matrix in the stack
+          double ra[][]=la[q]; 
+          double rb[][]=lb[q];
+          double rc[][]=lc[q];
+          int l=8; // counter starts at 8, so (l&15)==0 first fires at i == x0+8
+          for(int i = x0; i< x1; i++,l++){
+            double a[]=ra[i]; 
+            double c[]=rc[i];
+            if ((l&15)==0) { // every 16th iteration: request the next chunk
+              offsets2[0] = (short) (x0+l);
+              if ((x0+l+16)>x1) { // fewer than 16 indices remain before x1
+                int x=x1-x0-l-1; // remaining count minus one
+                if (x>0) { // NOTE(review): x==0 (exactly one index left) issues no prefetch; confirm this is intended
+                  offsets2[1]=(short) x;
+                  System.rangePrefetch(la, offsets2); // NOTE(review): targets the 3-D la/lc, not the per-q ra/rc; confirm which dimension is indexed
+                  System.rangePrefetch(lc, offsets2);
+                }
+              } else {
+                offsets2[1] = (short) 15;
+                System.rangePrefetch(la, offsets2);
+                System.rangePrefetch(lc, offsets2);
+              }
+            }
+            for (int j = y0; j < y1; j++) {
+              double innerProduct=0;
+              double b[] = rb[j];
+              for(int k = 0; k < M; k++) { // dot product of a[q][i] and btranspose[q][j] over M terms
+                innerProduct += a[k] * b[k];
+              }
+              c[j]=innerProduct;
+            }
+          }
+        }
+      }
+    }
+    // Entry point. args[0] = thread count (default 4), args[1] = SIZE used for all three matrix dimensions (default 150), args[2] = number of matrices (default 1).
+    public static void main(String[] args) {
+	int NUM_THREADS = 4;
+	int SIZE=150;
+	int NUM_MATRIX = 1;
+	if (args.length>0) {
+	    NUM_THREADS=Integer.parseInt(args[0]);
+	    if (args.length>1) {
+		SIZE=Integer.parseInt(args[1]);
+		if (args.length>2)
+		    NUM_MATRIX=Integer.parseInt(args[2]);
+	    }
+	}
+	// Machine ids: the addresses 128.195.136.162 through .169, each packed into one int (one octet per byte).
+	int[] mid = new int[8]; // NOTE(review): indexed below by i < NUM_THREADS, so more than 8 threads would read past the 8 entries
+	mid[0] = (128<<24)|(195<<16)|(136<<8)|162; 
+	mid[1] = (128<<24)|(195<<16)|(136<<8)|163;
+	mid[2] = (128<<24)|(195<<16)|(136<<8)|164;
+	mid[3] = (128<<24)|(195<<16)|(136<<8)|165;
+	mid[4] = (128<<24)|(195<<16)|(136<<8)|166;
+	mid[5] = (128<<24)|(195<<16)|(136<<8)|167;
+	mid[6] = (128<<24)|(195<<16)|(136<<8)|168;
+	mid[7] = (128<<24)|(195<<16)|(136<<8)|169;
+
+	int p, q, r;
+	MatrixMultiply[] mm;
+	MatrixMultiply tmp;
+	MMul matrix;
+	BarrierServer mybarr;
+	// Barrier server constructed with NUM_THREADS and started on mid[0].
+	atomic {
+	    mybarr = global new BarrierServer(NUM_THREADS);
+	}
+	mybarr.start(mid[0]);
+
+
+    System.out.println("NUM_MATRIX= "+NUM_MATRIX+" SIZE= "+SIZE);
+	atomic {
+	    matrix = global new MMul(NUM_MATRIX, SIZE, SIZE, SIZE);
+	    mm = global new MatrixMultiply[NUM_THREADS];
+	    int increment=SIZE/NUM_THREADS; // rows per worker (integer division); the last worker runs up to SIZE and so absorbs the remainder
+	    int base=0;
+	    for(int i=0;i<NUM_THREADS;i++) {
+		if ((i+1)==NUM_THREADS)
+		    mm[i]=global new MatrixMultiply(matrix,base, SIZE, 0, SIZE, i, NUM_THREADS);
+		else
+		    mm[i]=global new MatrixMultiply(matrix,base, base+increment, 0, SIZE, i, NUM_THREADS);
+		base+=increment;
+	    }
+	    p = matrix.L; // p, q and r are assigned here but not read again in this method
+	    q = matrix.M;
+	    r = matrix.N;
+	}
+	boolean waitfordone=true; // loop below spins until mybarr.done is observed true
+	while(waitfordone) {
+	    atomic { //Master aborts come from here
+		if (mybarr.done)
+		    waitfordone=false;
+	    }
+	}
+	
+	// start one worker per thread, worker i on machine mid[i]
+	for (int i = 0; i < NUM_THREADS; i++) {
+	    atomic {
+		tmp = mm[i];
+	    }
+	    tmp.start(mid[i]);
+	}
+
+      // wait for them to finish
+      for (int i = 0; i < NUM_THREADS; i++) {
+        atomic {
+          tmp = mm[i];
+        }
+        tmp.join();
+      }
+    
+	// report completion (the product itself is not printed)
+	System.printString("Finished\n");
+    }
+}
+
+public class MMul{ // holds P problems: a[q] is L rows of length M, btranspose[q] is N rows of length M, c[q] is L rows of length N
+    public int L, M, N, P; // row/column dimensions and the number of matrices
+    public double[][][] a;
+    public double[][][] c;
+    public double[][][] btranspose;
+    // Records the dimensions and allocates only the outer two levels of each array; the rows themselves are allocated in setValues. Parameter order is (P, L, M, N).
+    public MMul(int P, int L, int M, int N) {
+	this.L = L;
+	this.M = M;
+	this.N = N;
+	this.P = P;
+	a = global new double[P][L][];
+	c = global new double[P][L][];
+	btranspose = global new double[P][N][];
+    }
+    // Allocates the rows of matrices q = tid, tid+numthreads, ... below P: a and btranspose rows are filled with 1..M, c rows are allocated but not written here.
+    public void setValues(int tid, int numthreads) {
+      int delta=numthreads;
+      int start=tid;
+
+      for(int q = start; q < P; q+=delta) { // stride by the thread count so each q is handled by exactly one tid in [0,numthreads)
+        for(int i = 0; i < L; i++) {
+          double ai[] = global new double[M];
+          for(int j = 0; j < M; j++) {
+            ai[j] = j+1; // every row of a holds 1, 2, ..., M
+          }
+          a[q][i]=ai; 
+        }
+        for(int i = 0; i < L; i++) {
+          c[q][i]=global new double[N];
+        }
+        for(int i = 0; i < N; i++) {
+          double bi[] = global new double[M];
+          for(int j = 0; j < M; j++) {
+            bi[j] = j+1; // every row of btranspose holds 1, 2, ..., M
+          }
+          btranspose[q][i]=bi;
+        }
+      }
+    }
+}
diff --git a/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/makefile b/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/makefile
index a482a4f0..8defc674 100644
--- a/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/makefile
+++ b/Robust/src/Benchmarks/Prefetch/ManualPrefetch/MatrixMultiply/makefile
@@ -1,8 +1,12 @@
 MAINCLASS=MatrixMultiply
-SRC1=${MAINCLASS}N.java
-FLAGS1=-dsm -dsmcaching -rangeprefetch -optimize -mainclass ${MAINCLASS} -trueprob 0.98
+SRC1=${MAINCLASS}D3.java
+FLAGS1=-dsm -transstats -dsmcaching -rangeprefetch -optimize -mainclass ${MAINCLASS} -trueprob 0.98
+FLAGS2=-dsm -transstats -optimize -mainclass ${MAINCLASS}
+FLAGS3=-dsm -optimize -transstats -dsmcaching -mainclass ${MAINCLASS}
 default:
 	../../../../buildscript ${FLAGS1} -o ${MAINCLASS}RangeN ${SRC1}
+	../../../../buildscript ${FLAGS2} -o ${MAINCLASS}NPNC ${SRC1}
+	../../../../buildscript ${FLAGS3} -o ${MAINCLASS}NPC ${SRC1}
 
 clean:
 	rm -rf tmpbuilddirectory