# python_ml/plotting-dbscan-diff-metric.py

import math
import sys

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.cluster import DBSCAN

   7 # metric function for clustering
   8 def metric(x, y):
   9         # Compare 2 datapoints in array element 2 and 3 that contains C or S
  10         if x[2] != y[2] or x[3] != y[3]:
  11                 # We are not going to cluster these together since they have different directions
  12                 return sys.maxsize;
  13         else:
  14                 # Compute Euclidian distance here
  15                 return math.sqrt((x[0] - y[0])**2 + (x[1] - y[1])**2)
  16
  17 # Create a subplot with 1 row and 2 columns
  18 fig, (ax2) = plt.subplots(1, 1)
  19 fig.set_size_inches(20, 20)
  20
  21 # Read from file
  22 # TODO: Just change the following path and filename
  23 #       when needed to read from a different file
  24 path = "/scratch/July-2018/Pairs3/"
  25 # TODO: Change the order of the files below to generate
  26 #               the diff plot reversedly
  27 device1 = "kwikset-off-phone-side"
  28 device2 = "kwikset-on-phone-side"
  29 filename1 = device1 + ".txt"
  30 filename2 = device2 + ".txt"
  31 plt.ylim(0, 2000)
  32 plt.xlim(0, 2000)
  33
  34 # Number of triggers
  35 trig = 50
  36
  37 # PLOTTING FOR DEVICE ON EVENT
  38 # Read and create an array of pairs
  39 with open(path + filename1, "r") as pairs:
  40         pairsArr1 = list()
  41         pairsSrcLabels1 = list()
  42         for line in pairs:
  43                 # We will see a pair and we need to split it into xpoint and ypoint
  44                 xpoint, ypoint, srcHost1, srcHost2, src1, src2 = line.split(", ")
  45                 # Assign 1000 for client and 0 for server to create distance
  46                 src1Val = 1000 if src1 == 'C' else 0
  47                 src2Val = 1000 if src2 == 'C' else 0
  48                 pair = [int(xpoint), int(ypoint), int(src1Val), int(src2Val)]
  49                 pairSrc = [int(xpoint), int(ypoint), srcHost1, srcHost2, src1, src2]
  50                 # Array of actual points
  51                 pairsArr1.append(pair)
  52                 # Array of source labels
  53                 pairsSrcLabels1.append(pairSrc)
  54
  55 # PLOTTING FOR DEVICE ON EVENT
  56 # Read and create an array of pairs
  57 with open(path + filename2, "r") as pairs:
  58         pairsArr2 = list()
  59         pairsSrcLabels2 = list()
  60         for line in pairs:
  61                 # We will see a pair and we need to split it into xpoint and ypoint
  62                 xpoint, ypoint, srcHost1, srcHost2, src1, src2 = line.split(", ")
  63                 # Assign 1000 for client and 0 for server to create distance
  64                 src1Val = 1000 if src1 == 'C' else 0
  65                 src2Val = 1000 if src2 == 'C' else 0
  66                 pair = [int(xpoint), int(ypoint), int(src1Val), int(src2Val)]
  67                 pairSrc = [int(xpoint), int(ypoint), srcHost1, srcHost2, src1, src2]
  68                 # Array of actual points
  69                 pairsArr2.append(pair)
  70                 # Array of source labels
  71                 pairsSrcLabels2.append(pairSrc)
  72
  73 diff12 = [i for i in pairsArr1 if i not in pairsArr2]
  74 diff12SrcLabels = [i for i in pairsSrcLabels1 if i not in pairsSrcLabels2]
  75
  76 X = np.array(diff12);
  77
  78 # Compute DBSCAN
  79 # eps = distances
  80 # min_samples = minimum number of members of a cluster
  81 db = DBSCAN(eps=10, min_samples=trig - 45).fit(X)
  82 core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
  83 core_samples_mask[db.core_sample_indices_] = True
  84 labels = db.labels_
  85
  86 # Number of clusters in labels, ignoring noise if present.
  87 n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
  88
  89 # Black removed and is used for noise instead.
  90 unique_labels = set(labels)
  91
  92 colors = [plt.cm.Spectral(each)
  93               for each in np.linspace(0, 1, len(unique_labels))]
  94 for k, col in zip(unique_labels, colors):
  95         cluster_col = [1, 0, 0, 1]
  96         if k == -1:
  97             # Black used for noise.
  98             col = [0, 0, 0, 1]
  99
 100         class_member_mask = (labels == k)
 101
 102         # print("Unique label: " + str(k) + " with freq: " + str(labels.tolist().count(k)))
 103         xy = X[class_member_mask & core_samples_mask]
 104         plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(cluster_col),
 105                  markeredgecolor='k', markersize=10)
 106
 107         xy = X[class_member_mask & ~core_samples_mask]
 108         plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
 109                  markeredgecolor='k', markersize=6)
 110
 111 # Print lengths
 112 count = 0
 113 for pair in diff12:
 114         if labels[count] == -1:
 115                 plt.text(pair[0], pair[1], str(pair[0]) + ", " + str(pair[1]), fontsize=10)
 116         else:
 117         # Only print the frequency when this is a real cluster
 118                 plt.text(pair[0], pair[1], str(pair[0]) + ", " + str(pair[1]) +
 119                         " - Freq:" + str(labels.tolist().count(labels[count])), fontsize=10)
 120         count = count + 1
 121
 122 # Print source-destination labels
 123 count = 0
 124 for pair in diff12SrcLabels:
 125         # Only print the frequency when this is a real cluster
 126         plt.text(pair[0], pair[1], str(pair[4]) + "->" + str(pair[5]))
 127         count = count + 1
 128
 129 plt.title(device1 + ' - diff - ' + device2)
 130 plt.show()
 131
 132