public void run() {
atomic {
- // Prefetch mmul.a[][] matrix
- Object o = mmul;
short[] offsets = new short[4];
+ // Prefetch mmul.btranspose[][] matrix
+ //Get all of the needed B rows first: every iteration of the outer loop reads them
+ offsets[0] = getoffset{MMul, btranspose};
+ offsets[1] = (short) 0;
+ offsets[2] = (short) y0;
+ offsets[3] = (short) (y1 - y0 -1);
+ System.rangePrefetch(mmul, offsets);
+
+ //Get first part of A
offsets[0] = getoffset{MMul, a};
offsets[1] = (short) 0;
offsets[2] = (short) x0;
- offsets[3] = (short) (x1 - x0 -1);
- System.rangePrefetch(o, offsets);
-
- // Prefetch mmul.btranspose[][] matrix
- Object o1 = mmul;
- short[] offsets1 = new short[4];
- offsets1[0] = getoffset{MMul, btranspose};
- offsets1[1] = (short) 0;
- offsets1[2] = (short) x0;
- offsets1[3] = (short) (x1 - x0 -1);
- System.rangePrefetch(o1, offsets1);
+ offsets[3] = (short) 63;
+ System.rangePrefetch(mmul, offsets);
- // Prefetch mmul.c[][] matrix
- Object o2 = mmul;
- short[] offsets2 = new short[4];
- offsets2[0] = getoffset{MMul, c};
- offsets2[1] = (short) 0;
- offsets2[2] = (short) x0;
- offsets2[3] = (short) (x1 - x0 -1);
- System.rangePrefetch(o2, offsets2);
+ //Get first part of C
+ offsets[0] = getoffset{MMul, c};
+ offsets[1] = (short) 0;
+ System.rangePrefetch(mmul, offsets);
double la[][]=mmul.a;
double lc[][]=mmul.c;
double lb[][]=mmul.btranspose;
int M=mmul.M;
+ int l=0;
//Use btranspose for cache performance
- for(int i = x0; i< x1; i++){
+ for(int i = x0; i< x1; i++,l++){
double a[]=la[i];
double c[]=lc[i];
+ if (((l+32)&63)==0) {
+ offsets[0] = getoffset{MMul, a};
+ offsets[1] = (short) 0;
+ offsets[2] = (short) x0+l;
+ if ((x0+l+64)>x1)
+ offsets[3]=x1-x0-l-1;
+ else
+ offsets[3] = (short) 63;
+ System.rangePrefetch(mmul, offsets);
+
+ //Get the matching part of C (same row range as the A prefetch just above)
+ offsets[0] = getoffset{MMul, c};
+ offsets[1] = (short) 0;
+ System.rangePrefetch(mmul, offsets);
+ }
for (int j = y0; j < y1; j++) {
double innerProduct=0;
double b[] = lb[j];