1 /* =============================================================================
5 * =============================================================================
9 * Takes as input a file:
10 * ascii file: containing 1 data point per line
11 * binary file: first int is the number of objects
12 * 2nd int is the no. of features of each object
14 * This example performs a fuzzy c-means clustering on the data. Fuzzy clustering
15 * is performed using min to max clusters and the clustering that gets the best
16 * score according to a compactness and separation criterion are returned.
22 * ECE Department Northwestern University
23 * email: wkliao@ece.northwestern.edu
29 * Northwestern University
34 * Port to Java version
36 * University of California, Irvine
38 * =============================================================================
40 * ------------------------------------------------------------------------
42 * For the license of kmeans, please see kmeans/LICENSE.kmeans
44 * ------------------------------------------------------------------------
46 * Unless otherwise noted, the following license applies to STAMP files:
48 * Copyright (c) 2007, Stanford University
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are
55 * * Redistributions of source code must retain the above copyright
56 * notice, this list of conditions and the following disclaimer.
58 * * Redistributions in binary form must reproduce the above copyright
59 * notice, this list of conditions and the following disclaimer in
60 * the documentation and/or other materials provided with the
63 * * Neither the name of Stanford University nor the names of its
64 * contributors may be used to endorse or promote products derived
65 * from this software without specific prior written permission.
67 * THIS SOFTWARE IS PROVIDED BY STANFORD UNIVERSITY ``AS IS'' AND ANY
68 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
69 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
70 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL STANFORD UNIVERSITY BE LIABLE
71 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
72 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
73 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
74 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
75 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
76 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
77 * THE POSSIBILITY OF SUCH DAMAGE.
79 * =============================================================================
82 public class KMeans extends Thread {
86 int use_zscore_transform;
90 int threadid; /* my thread id */
92 /* Global arguments for threads */
96 * Output: Number of best clusters
101 * Output: Cluster centers
103 double[][] cluster_centres;
109 use_zscore_transform = 1;
114 public KMeans(int threadid, GlobalArgs g_args) {
115 this.threadid = threadid;
116 this.g_args = g_args;
121 barr = new Barrier("128.195.136.162");
123 Barrier.enterBarrier(barr);
125 Normal.work(threadid, g_args);
127 Barrier.enterBarrier(barr);
131 /* =============================================================================
133 * =============================================================================
135 public static void main(String[] args) {
139 * Read options fron the command prompt
141 KMeans kms = new KMeans();
142 KMeans.parseCmdLine(args, kms);
143 nthreads = kms.nthreads;
145 if (kms.max_nclusters < kms.min_nclusters) {
146 System.out.println("Error: max_clusters must be >= min_clusters\n");
151 double[][] attributes;
152 int numAttributes = 0;
156 * From the input file, get the numAttributes and numObjects
158 if (kms.isBinaryFile == 1) {
159 System.out.println("TODO: Unimplemented Binary file option\n");
162 System.out.println("filename= " + kms.filename);
163 FileInputStream inputFile = new FileInputStream(kms.filename);
165 while((line = inputFile.readLine()) != null) {
168 inputFile = new FileInputStream(kms.filename);
169 if((line = inputFile.readLine()) != null) {
171 boolean prevWhiteSpace = true;
172 while(index < line.length()) {
173 char c = line.charAt(index++);
174 boolean currWhiteSpace = Character.isWhitespace(c);
175 if(prevWhiteSpace && !currWhiteSpace){
178 prevWhiteSpace = currWhiteSpace;
182 /* Ignore the id (first attribute): numAttributes = 1; */
183 numAttributes = numAttributes - 1; //
184 System.out.println("numObjects= " + numObjects + "numAttributes= " + numAttributes);
186 /* Allocate new shared objects and read attributes of all objects */
187 buf = new double[numObjects][numAttributes];
188 attributes = new double[numObjects][numAttributes];
189 KMeans.readFromFile(inputFile, kms.filename, buf);
192 * The core of the clustering
195 int[] cluster_assign = new int[numObjects];
197 int len = kms.max_nclusters - kms.min_nclusters + 1;
199 KMeans[] km = new KMeans[nthreads];
200 GlobalArgs g_args = new GlobalArgs();
201 g_args.nthreads = nthreads;
202 //args.nfeatures = numAttributes;
203 //args.npoints = numObjects;
205 /* Create and Start Threads */
206 for(int i = 1; i<nthreads; i++) {
207 km[i] = new KMeans(i, g_args);
210 for(int i = 1; i<nthreads; i++) {
214 for (int i = 0; i < nloops; i++) {
216 * Since zscore transform may perform in cluster() which modifies the
217 * contents of attributes[][], we need to re-store the originals
219 //memcpy(attributes[0], buf, (numObjects * numAttributes * sizeof(double)));
220 for(int x = 0; x < numObjects; x++) {
221 for(int y = 0; y < numAttributes; y++) {
222 attributes[x][y] = buf[x][y];
226 Cluster clus = new Cluster();
227 clus.cluster_exec(nthreads,
230 attributes, // [numObjects][numAttributes]
231 kms.use_zscore_transform, // 0 or 1
232 kms.min_nclusters, // pre-define range from min to max
235 kms.best_nclusters, // return: number between min and max
236 kms.cluster_centres, // return: [best_nclusters][numAttributes]
237 cluster_assign, // return: [numObjects] cluster id for each object
238 g_args); // Global arguments common to all threads
241 /* Output: the coordinates of the cluster centres */
243 for (int i = 0; i < kms.best_nclusters; i++) {
244 System.out.println(i);
245 for (int j = 0; j < numAttributes; j++) {
246 System.out.println(kms.cluster_centres[i][j]);
248 System.out.println("\n");
255 public static void parseCmdLine(String args[], KMeans km) {
258 while (i < args.length && args[i].startsWith("-")) {
261 if(arg.equals("-m")) {
262 if(i < args.length) {
263 km.max_nclusters = new Integer(args[i++]).intValue();
265 } else if(arg.equals("-n")) {
266 if(i < args.length) {
267 km.min_nclusters = new Integer(args[i++]).intValue();
269 } else if(arg.equals("-t")) {
270 if(i < args.length) {
271 km.threshold = new Integer(args[i++]).intValue();
273 } else if(args.equals("-i")) {
274 if(i < args.length) {
275 km.filename = new String(args[i++]);
277 } else if(args.equals("-b")) {
278 if(i < args.length) {
279 km.isBinaryFile = new Integer(args[i++]).intValue();
281 } else if(args.equals("-z")) {
282 if(i < args.length) {
285 } else if(args.equals("-nthreads")) {
286 if(i < args.length) {
287 km.nthreads = new Integer(args[i++]).intValue();
289 } else if(args.equals("-h")) {
296 * The usage routine which describes the program options.
298 public void usage() {
299 System.out.println("usage: ./kmeans -m <max_clusters> -n <min_clusters> -t <threshold> -i <filename> -nthreads <threads>\n");
300 System.out.println( " -i filename: file containing data to be clustered\n");
301 System.out.println( " -b input file is in binary format\n");
302 System.out.println( " -m max_clusters: maximum number of clusters allowed\n");
303 System.out.println( " -n min_clusters: minimum number of clusters allowed\n");
304 System.out.println( " -z : don't zscore transform data\n");
305 System.out.println( " -t threshold : threshold value\n");
306 System.out.println( " -nthreads : number of threads\n");
311 * Read attributes into an array
313 public static void readFromFile(FileInputStream inputFile, String filename, double[][] buf) {
314 inputFile = new FileInputStream(filename);
318 while((line = inputFile.readLine()) != null) {
319 System.out.println("line= " + line);
321 StringBuffer buffer = new StringBuffer();
323 boolean skipFirstVar = true;
324 while(index < line.length()) {
325 char c = line.charAt(index++);
330 skipFirstVar = false;
331 buffer = new StringBuffer();
334 //System.out.println("buffer.toString()= " + buffer.toString());
335 double f = KMeans.StringToFloat(buffer.toString());
337 System.out.println("f= " + f);
338 buffer = new StringBuffer();
347 * Convert a string into float
349 public static double StringToFloat (String str) {
350 double total = 0; // the total to return
351 int length = str.length(); // the length of the string
352 int prefixLength=0; // the length of the number BEFORE the decimal
353 int suffixLength=0; // the length of the number AFTER the decimal
354 boolean decimalFound = false; // use this to decide whether to increment prefix or suffix
355 for (int i = 0; i < str.length(); i++)
356 { // loop through the string
357 if (str.charAt(i) == '.')
358 { // if we found the '.' then we are now counting how long the decimal place is
359 length --; // subtract one from the length (. isn't an integer!)
360 decimalFound = true; // we found the decimal!
362 else if (!decimalFound)
363 prefixLength++;// if the decimal still hasn't been found, we should count the main number
364 else if (decimalFound) // otherwise, we should count how long the decimal is!
368 //System.out.println("str.length()= " + str.length() + " prefixLength= " + prefixLength + " suffixLength= " + suffixLength);
369 long x = 1; // our multiplier, used for thousands, hundreds, tens, units, etc
370 for (int i = 1; i < prefixLength; i++)
373 //System.out.println("x= " + x);
375 for (int i = 0; i < prefixLength; i++)
376 { // get the integer value
377 // 48 is the base value (ASCII)
378 // multiply it by x for tens, units, etc
379 total += ((int)(str.charAt(i)) - 48) * x;
380 x /= 10; // divide to decide which is the next unit
383 double decimal=0; // our value of the decimal only (we'll add it to total later)
384 x = 1; // again, but this time we'll go the other way to make it all below 0
385 for (int i = 1; i < suffixLength; i++) {
387 //System.out.println("x= " + x);
390 //System.out.println("str.length()= " + str.length() + " prefixLength= " + prefixLength + " suffixLength= " + suffixLength);
391 for (int i = 0; i < suffixLength; i++)
392 { // same again, but this time it's for the decimal value
393 //decimal += (static_cast <int> (str[i+suffixLength+1]) - 48) * x;
394 //decimal += ((int)(str.charAt(i+suffixLength+1)) - 48) * x;
395 decimal += ((int)(str.charAt(i+prefixLength+1)) - 48) * x;
396 //System.out.println("i+prefixLength+1= " + (i+prefixLength+1) + " char= " + str.charAt(i+prefixLength+1));
398 //System.out.println("x= " + x);
401 for (int i = 0; i < suffixLength; i++)
402 decimal /= 10; // make the decimal so that it is 0.whatever
404 total += decimal; // add them together
409 /* =============================================================================
413 * =============================================================================