Robust/src/Benchmarks/SingleTM/KMeans/KMeans.java

   1 /* =============================================================================
   2  *
   3  * kmeans.java
   4  *
   5  * =============================================================================
   6  *
   7  * Description:
   8  *
   9  * Takes as input a file:
  10  *   ascii  file: containing 1 data point per line
  11  *   binary file: first int is the number of objects
  12  *                2nd int is the no. of features of each object
  13  *
  14  * This example performs a fuzzy c-means clustering on the data. Fuzzy clustering
  15  * is performed using min to max clusters and the clustering that gets the best
  16  * score according to a compactness and separation criterion are returned.
  17  *
  18  *
  19  * Author:
  20  *
  21  * Wei-keng Liao
  22  * ECE Department Northwestern University
  23  * email: wkliao@ece.northwestern.edu
  24  *
  25  *
  26  * Edited by:
  27  *
  28  * Jay Pisharath
  29  * Northwestern University
  30  *
  31  * Chi Cao Minh
  32  * Stanford University
  33  *
  34  * Port to Java version
  35  * Alokika Dash
  36  * University of California, Irvine
  37  *
  38  * =============================================================================
  39  *
  40  * ------------------------------------------------------------------------
  41  *
  42  * For the license of kmeans, please see kmeans/LICENSE.kmeans
  43  *
  44  * ------------------------------------------------------------------------
  45  *
  46  * Unless otherwise noted, the following license applies to STAMP files:
  47  *
  48  * Copyright (c) 2007, Stanford University
  49  * All rights reserved.
  50  *
  51  * Redistribution and use in source and binary forms, with or without
  52  * modification, are permitted provided that the following conditions are
  53  * met:
  54  *
  55  *     * Redistributions of source code must retain the above copyright
  56  *       notice, this list of conditions and the following disclaimer.
  57  *
  58  *     * Redistributions in binary form must reproduce the above copyright
  59  *       notice, this list of conditions and the following disclaimer in
  60  *       the documentation and/or other materials provided with the
  61  *       distribution.
  62  *
  63  *     * Neither the name of Stanford University nor the names of its
  64  *       contributors may be used to endorse or promote products derived
  65  *       from this software without specific prior written permission.
  66  *
  67  * THIS SOFTWARE IS PROVIDED BY STANFORD UNIVERSITY ``AS IS'' AND ANY
  68  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  69  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  70  * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL STANFORD UNIVERSITY BE LIABLE
  71  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  72  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  73  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  74  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  75  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  76  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  77  * THE POSSIBILITY OF SUCH DAMAGE.
  78  *
  79  * =============================================================================
  80  */
  81
  82 public class KMeans extends Thread {
  83   int max_nclusters;
  84   int min_nclusters;
  85   int isBinaryFile;
  86   int use_zscore_transform;
  87   String filename;
  88   int nthreads;
  89   double threshold;
  90   int threadid; /* my thread id */
  91
  92   /* Global arguments for threads */
  93   GlobalArgs g_args;
  94
  95   /**
  96    * Output:  Number of best clusters
  97    **/
  98   int best_nclusters;
  99
 100   /**
 101    * Output: Cluster centers
 102    **/
 103   double[][] cluster_centres;
 104
 105   public KMeans() {
 106     max_nclusters = 13;
 107     min_nclusters = 4;
 108     isBinaryFile = 0;
 109     use_zscore_transform = 1;
 110     threshold = 0.001;
 111     best_nclusters = 0;
 112   }
 113
 114   public KMeans(int threadid, GlobalArgs g_args) {
 115     this.threadid = threadid;
 116     this.g_args = g_args;
 117   }
 118
 119   public void run() {
 120     Barrier barr;
 121     barr = new Barrier("128.195.136.162");
 122     while(true) {
 123       Barrier.enterBarrier(barr);
 124
 125       Normal.work(threadid, g_args);
 126
 127       Barrier.enterBarrier(barr);
 128     }
 129   }
 130
 131   /* =============================================================================
 132    * main
 133    * =============================================================================
 134    */
 135   public static void main(String[] args) {
 136     int nthreads;
 137
 138     /**
 139      * Read options fron the command prompt
 140      **/
 141     KMeans kms = new KMeans();
 142     KMeans.parseCmdLine(args, kms);
 143     nthreads = kms.nthreads;
 144
 145     if (kms.max_nclusters < kms.min_nclusters) {
 146       System.out.println("Error: max_clusters must be >= min_clusters\n");
 147       System.exit(0);
 148     }
 149
 150     double[][] buf;
 151     double[][] attributes;
 152     int numAttributes = 0;
 153     int numObjects = 0;
 154
 155     /*
 156      * From the input file, get the numAttributes and numObjects
 157      */
 158     if (kms.isBinaryFile == 1) {
 159       System.out.println("TODO: Unimplemented Binary file option\n");
 160       System.exit(0);
 161     }
 162     System.out.println("filename= " + kms.filename);
 163     FileInputStream inputFile = new FileInputStream(kms.filename);
 164     String line = null;
 165     while((line = inputFile.readLine()) != null) {
 166       numObjects++;
 167     }
 168     inputFile = new FileInputStream(kms.filename);
 169     if((line = inputFile.readLine()) != null) {
 170       int index = 0;
 171       boolean prevWhiteSpace = true;
 172       while(index < line.length()) {
 173         char c = line.charAt(index++);
 174         boolean currWhiteSpace = Character.isWhitespace(c);
 175         if(prevWhiteSpace && !currWhiteSpace){
 176           numAttributes++;
 177         }
 178         prevWhiteSpace = currWhiteSpace;
 179       }
 180     }
 181
 182     /* Ignore the id (first attribute): numAttributes = 1; */
 183     numAttributes = numAttributes - 1; //
 184     System.out.println("numObjects= " + numObjects + "numAttributes= " + numAttributes);
 185
 186     /* Allocate new shared objects and read attributes of all objects */
 187     buf = new double[numObjects][numAttributes];
 188     attributes = new double[numObjects][numAttributes];
 189     KMeans.readFromFile(inputFile, kms.filename, buf);
 190
 191     /*
 192      * The core of the clustering
 193      */
 194
 195     int[] cluster_assign = new int[numObjects];
 196     int nloops = 1;
 197     int len = kms.max_nclusters - kms.min_nclusters + 1;
 198
 199     KMeans[] km = new KMeans[nthreads];
 200     GlobalArgs g_args = new GlobalArgs();
 201     g_args.nthreads = nthreads;
 202     //args.nfeatures = numAttributes;
 203     //args.npoints = numObjects;
 204
 205     /* Create and Start Threads */
 206     for(int i = 1; i<nthreads; i++) {
 207       km[i] = new KMeans(i, g_args);
 208     }
 209
 210     for(int i = 1; i<nthreads; i++) {
 211       km[i].start();
 212     }
 213
 214     for (int i = 0; i < nloops; i++) {
 215       /*
 216        * Since zscore transform may perform in cluster() which modifies the
 217        * contents of attributes[][], we need to re-store the originals
 218        */
 219       //memcpy(attributes[0], buf, (numObjects * numAttributes * sizeof(double)));
 220       for(int x = 0; x < numObjects; x++) {
 221         for(int y = 0; y < numAttributes; y++) {
 222           attributes[x][y] = buf[x][y];
 223         }
 224       }
 225
 226       Cluster clus = new Cluster();
 227       clus.cluster_exec(nthreads,
 228           numObjects,
 229           numAttributes,
 230           attributes,             // [numObjects][numAttributes]
 231           kms.use_zscore_transform, // 0 or 1
 232           kms.min_nclusters,      // pre-define range from min to max
 233           kms.max_nclusters,
 234           kms.threshold,
 235           kms.best_nclusters,     // return: number between min and max
 236           kms.cluster_centres,    // return: [best_nclusters][numAttributes]
 237           cluster_assign,         // return: [numObjects] cluster id for each object
 238           g_args);                // Global arguments common to all threads
 239     }
 240
 241     /* Output: the coordinates of the cluster centres */
 242     {
 243       for (int i = 0; i < kms.best_nclusters; i++) {
 244         System.out.println(i);
 245         for (int j = 0; j < numAttributes; j++) {
 246           System.out.println(kms.cluster_centres[i][j]);
 247         }
 248         System.out.println("\n");
 249       }
 250     }
 251
 252     System.exit(0);
 253   }
 254
 255   public static void parseCmdLine(String args[], KMeans km) {
 256     int i = 0;
 257     String arg;
 258     while (i < args.length && args[i].startsWith("-")) {
 259       arg = args[i++];
 260       //check options
 261       if(arg.equals("-m")) {
 262         if(i < args.length) {
 263           km.max_nclusters = new Integer(args[i++]).intValue();
 264         }
 265       } else if(arg.equals("-n")) {
 266         if(i < args.length) {
 267           km.min_nclusters = new Integer(args[i++]).intValue();
 268         }
 269       } else if(arg.equals("-t")) {
 270         if(i < args.length) {
 271           km.threshold = new Integer(args[i++]).intValue();
 272         }
 273       } else if(args.equals("-i")) {
 274         if(i < args.length) {
 275           km.filename = new String(args[i++]);
 276         }
 277       } else if(args.equals("-b")) {
 278         if(i < args.length) {
 279           km.isBinaryFile = new Integer(args[i++]).intValue();
 280         }
 281       } else if(args.equals("-z")) {
 282         if(i < args.length) {
 283
 284         }
 285       } else if(args.equals("-nthreads")) {
 286         if(i < args.length) {
 287           km.nthreads = new Integer(args[i++]).intValue();
 288         }
 289       } else if(args.equals("-h")) {
 290         km.usage();
 291       }
 292     }
 293   }
 294
 295   /**
 296    * The usage routine which describes the program options.
 297    **/
 298   public void usage() {
 299     System.out.println("usage: ./kmeans -m <max_clusters> -n <min_clusters> -t <threshold> -i <filename> -nthreads <threads>\n");
 300     System.out.println(                   "  -i filename:     file containing data to be clustered\n");
 301     System.out.println(                   "  -b               input file is in binary format\n");
 302     System.out.println(                   "  -m max_clusters: maximum number of clusters allowed\n");
 303     System.out.println(                   "  -n min_clusters: minimum number of clusters allowed\n");
 304     System.out.println(                   "  -z             : don't zscore transform data\n");
 305     System.out.println(                   "  -t threshold   : threshold value\n");
 306     System.out.println(                   "  -nthreads      : number of threads\n");
 307   }
 308
 309   /**
 310    * readFromFile()
 311    * Read attributes into an array
 312    **/
 313   public static void readFromFile(FileInputStream inputFile, String filename, double[][] buf) {
 314     inputFile = new FileInputStream(filename);
 315     int i = 0;
 316     int j;
 317     String line = null;
 318     while((line = inputFile.readLine()) != null) {
 319       System.out.println("line= " + line);
 320       int index=0;
 321       StringBuffer buffer = new StringBuffer();
 322       j = 0;
 323       boolean skipFirstVar = true;
 324       while(index < line.length()) {
 325         char c = line.charAt(index++);
 326         if(c != ' ') {
 327           buffer.append(c);
 328         } else {
 329           if(skipFirstVar) {
 330             skipFirstVar = false;
 331             buffer = new StringBuffer();
 332             continue;
 333           }
 334           //System.out.println("buffer.toString()= " + buffer.toString());
 335           double f = KMeans.StringToFloat(buffer.toString());
 336           buf[i][j] = f;
 337           System.out.println("f= " + f);
 338           buffer = new StringBuffer();
 339           j++;
 340         }
 341       }
 342       i++;
 343     }
 344   }
 345
 346   /**
 347    * Convert a string into float
 348    **/
 349   public static double StringToFloat (String str) {
 350     double total = 0; // the total to return
 351     int length = str.length(); // the length of the string
 352     int prefixLength=0; // the length of the number BEFORE the decimal
 353     int suffixLength=0; // the length of the number AFTER the decimal
 354     boolean decimalFound = false; // use this to decide whether to increment prefix or suffix
 355     for (int i = 0; i < str.length(); i++)
 356     { // loop through the string
 357       if (str.charAt(i) == '.')
 358       { // if we found the '.' then we are now counting how long the decimal place is
 359         length --; // subtract one from the length (. isn't an integer!)
 360         decimalFound = true; // we found the decimal!
 361       }
 362       else if (!decimalFound)
 363         prefixLength++;// if the decimal still hasn't been found, we should count the main number
 364       else if (decimalFound) // otherwise, we should count how long the decimal is!
 365         suffixLength++;
 366     }
 367
 368     //System.out.println("str.length()= " + str.length() + " prefixLength= " + prefixLength + " suffixLength= " + suffixLength);
 369     long x = 1; // our multiplier, used for thousands, hundreds, tens, units, etc
 370     for (int i = 1; i < prefixLength; i++)
 371       x *= 10;
 372
 373     //System.out.println("x= " + x);
 374
 375     for (int i = 0; i < prefixLength; i++)
 376     { // get the integer value
 377       // 48 is the base value (ASCII)
 378       // multiply it by x for tens, units, etc
 379       total += ((int)(str.charAt(i)) - 48) * x;
 380       x /= 10; // divide to decide which is the next unit
 381     }
 382
 383     double decimal=0; // our value of the decimal only (we'll add it to total later)
 384     x = 1; // again, but this time we'll go the other way to make it all below 0
 385     for (int i = 1; i < suffixLength; i++) {
 386       x *= 10;
 387       //System.out.println("x= " + x);
 388     }
 389
 390     //System.out.println("str.length()= " + str.length() + " prefixLength= " + prefixLength + " suffixLength= " + suffixLength);
 391     for (int i = 0; i < suffixLength; i++)
 392     { // same again, but this time it's for the decimal value
 393       //decimal += (static_cast <int> (str[i+suffixLength+1]) - 48) * x;
 394       //decimal += ((int)(str.charAt(i+suffixLength+1)) - 48) * x;
 395       decimal += ((int)(str.charAt(i+prefixLength+1)) - 48) * x;
 396       //System.out.println("i+prefixLength+1= " + (i+prefixLength+1) + " char= " + str.charAt(i+prefixLength+1));
 397       x /= 10;
 398       //System.out.println("x= " + x);
 399     }
 400
 401     for (int i = 0; i < suffixLength; i++)
 402       decimal /= 10; // make the decimal so that it is 0.whatever
 403
 404     total += decimal; // add them together
 405     return total;
 406   }
 407 }
 408
 409 /* =============================================================================
 410  *
 411  * End of kmeans.java
 412  *
 413  * =============================================================================
 414  */